datamarket 0.9.50__tar.gz → 0.9.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of datamarket might be problematic.

Files changed (34)
  1. {datamarket-0.9.50 → datamarket-0.9.52}/PKG-INFO +3 -2
  2. {datamarket-0.9.50 → datamarket-0.9.52}/pyproject.toml +1 -1
  3. datamarket-0.9.52/src/datamarket/interfaces/azure.py +127 -0
  4. datamarket-0.9.52/src/datamarket/interfaces/ftp.py +76 -0
  5. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/strings/normalization.py +11 -4
  6. datamarket-0.9.50/src/datamarket/interfaces/ftp.py +0 -62
  7. {datamarket-0.9.50 → datamarket-0.9.52}/LICENSE +0 -0
  8. {datamarket-0.9.50 → datamarket-0.9.52}/README.md +0 -0
  9. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/__init__.py +0 -0
  10. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/exceptions/__init__.py +0 -0
  11. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/exceptions/main.py +0 -0
  12. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/interfaces/__init__.py +0 -0
  13. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/interfaces/alchemy.py +0 -0
  14. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/interfaces/aws.py +0 -0
  15. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/interfaces/drive.py +0 -0
  16. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/interfaces/nominatim.py +0 -0
  17. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/interfaces/peerdb.py +0 -0
  18. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/interfaces/proxy.py +0 -0
  19. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/interfaces/tinybird.py +0 -0
  20. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/params/__init__.py +0 -0
  21. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/params/nominatim.py +0 -0
  22. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/__init__.py +0 -0
  23. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/airflow.py +0 -0
  24. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/alchemy.py +0 -0
  25. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/main.py +0 -0
  26. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/playwright/__init__.py +0 -0
  27. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/playwright/async_api.py +0 -0
  28. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/playwright/sync_api.py +0 -0
  29. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/selenium.py +0 -0
  30. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/soda.py +0 -0
  31. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/strings/__init__.py +0 -0
  32. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/strings/obfuscation.py +0 -0
  33. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/typer.py +0 -0
  34. {datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/types.py +0 -0
{datamarket-0.9.50 → datamarket-0.9.52}/PKG-INFO
@@ -1,8 +1,9 @@
- Metadata-Version: 2.3
+ Metadata-Version: 2.4
  Name: datamarket
- Version: 0.9.50
+ Version: 0.9.52
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
  License: GPL-3.0-or-later
+ License-File: LICENSE
  Author: DataMarket
  Author-email: techsupport@datamarket.es
  Requires-Python: >=3.12,<3.13
{datamarket-0.9.50 → datamarket-0.9.52}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "datamarket"
- version = "0.9.50"
+ version = "0.9.52"
  description = "Utilities that integrate advanced scraping knowledge into just one library."
  authors = ["DataMarket <techsupport@datamarket.es>"]
  license = "GPL-3.0-or-later"
datamarket-0.9.52/src/datamarket/interfaces/azure.py
@@ -0,0 +1,127 @@
+ ########################################################################################################################
+ # IMPORTS
+
+ import logging
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from azure.storage.blob import BlobServiceClient
+ from pendulum import now
+
+ ########################################################################################################################
+ # CLASSES
+
+ logger = logging.getLogger(__name__)
+
+
+ class AzureBlobInterface:
+     def __init__(self, config):
+         self.profiles: List[Dict[str, Any]] = []
+         self.config = config
+
+         for section in getattr(self.config, "sections", lambda: [])():
+             if section.startswith("azure:"):
+                 profile_name = section.split(":", 1)[1]
+                 connection_string = self.config[section].get("connection_string")
+                 container_name = self.config[section].get("container_name")
+
+                 self.profiles.append(
+                     {
+                         "profile": profile_name,
+                         "container_name": container_name,
+                         "session": BlobServiceClient.from_connection_string(
+                             connection_string
+                         ).get_container_client(container_name),
+                     }
+                 )
+
+         if not self.profiles:
+             logger.warning("No Azure profiles found in config file")
+         self.current_profile: Optional[Dict[str, Any]] = (
+             self.profiles[0] if self.profiles else None
+         )
+
+     def switch_profile(self, profile_name: str) -> None:
+         for profile in self.profiles:
+             if profile["profile"] == profile_name:
+                 self.current_profile = profile
+                 return
+         logger.warning(f"Profile {profile_name} not found")
+
+     def upload_file(
+         self,
+         local_file,
+         remote_folder,
+         remote_file=None,
+         upload_file_info=False,
+         **kwargs,
+     ):
+         if not remote_file:
+             remote_file = Path(local_file).name
+
+         remote_path = f"{remote_folder}/{remote_file}" if remote_folder else remote_file
+
+         blob_client = self.current_profile["session"].get_blob_client(remote_path)
+         with open(local_file, "rb") as data:
+             blob_client.upload_blob(data, overwrite=True)
+
+         if upload_file_info:
+             self.upload_file_info(remote_path, **kwargs)
+
+     def upload_file_info(self, remote_path, **kwargs):
+         summary_file = remote_path.split(".")[0] + "_resumen.csv"
+         blob_client = self.current_profile["session"].get_blob_client(summary_file)
+
+         new_record = {
+             "file": remote_path,
+             "num_rows": kwargs["num_rows"],
+             "schema_version": kwargs["schema_version"],
+             "upload_date": now(tz="Europe/Madrid").to_datetime_string(),
+         }
+
+         new_record_str = "file,num_rows,schema_version,upload_date\n"
+         new_record_str += ",".join([str(v) for v in new_record.values()]) + "\n"
+
+         blob_client.upload_blob(new_record_str, overwrite=True)
+
+     def download_file(self, local_file, remote_path):
+         blob_client = self.current_profile["session"].get_blob_client(remote_path)
+         blob_data = blob_client.download_blob()
+         with open(local_file, "wb") as f:
+             blob_data.readinto(f)
+
+     def check_file_exists_and_not_empty(self, remote_file, remote_folder):
+         """
+         Checks if a blob exists in the specified folder and has a size greater than 100 bytes.
+
+         Args:
+             remote_file (str): The name of the file (blob) to check.
+             remote_folder (str): The folder (prefix) where the file is located.
+
+         Returns:
+             bool: True if the blob exists and has a size greater than 100, False otherwise.
+         """
+
+         remote_path = f"{remote_folder}/{remote_file}" if remote_folder else remote_file
+
+         try:
+             blob_client = self.current_profile["session"].get_blob_client(remote_path)
+             if blob_client.exists():
+                 properties = blob_client.get_blob_properties()
+                 if properties.size > 100:  # Check if size is greater than 100 bytes
+                     logger.debug(
+                         f"Blob '{remote_path}' exists and is not empty (size: {properties.size})."
+                     )
+                     return True
+                 else:
+                     logger.debug(
+                         f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes."
+                     )
+                     return False
+             else:
+                 logger.debug(f"Blob '{remote_path}' does not exist.")
+                 return False
+         except Exception as e:
+             logger.error(f"Error checking blob '{remote_path}': {e}")
+             # In case of error, assume it doesn't exist or is empty to allow upload attempt
+             return False
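
A minimal usage sketch for the new AzureBlobInterface, assuming a configparser-style config object with [azure:<profile>] sections; the connection string, container name, and file paths below are placeholders, not values from the package:

from configparser import ConfigParser

from datamarket.interfaces.azure import AzureBlobInterface

# Hypothetical config; the interface collects every "azure:<profile>" section.
config = ConfigParser()
config.read_string("""
[azure:prod]
connection_string = DefaultEndpointsProtocol=https;AccountName=example;AccountKey=placeholder;EndpointSuffix=core.windows.net
container_name = exports
""")

azure = AzureBlobInterface(config)
azure.switch_profile("prod")  # optional; the first profile found is active by default

# Upload a file and write a "<name>_resumen.csv" summary blob next to it.
azure.upload_file(
    "data.parquet",
    remote_folder="2024/06",
    upload_file_info=True,
    num_rows=1000,
    schema_version="v1",
)

if azure.check_file_exists_and_not_empty("data.parquet", "2024/06"):
    azure.download_file("data_copy.parquet", "2024/06/data.parquet")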
datamarket-0.9.52/src/datamarket/interfaces/ftp.py
@@ -0,0 +1,76 @@
+ ########################################################################################################################
+ # IMPORTS
+
+ import logging
+ from ftplib import FTP, FTP_TLS
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ ########################################################################################################################
+ # CLASSES
+
+ logger = logging.getLogger(__name__)
+
+
+ class FTPInterface:
+     def __init__(self, config):
+         self.profiles: List[Dict[str, Any]] = []
+         self.config = config
+         for section in getattr(self.config, "sections", lambda: [])():
+             if section.startswith("ftp:"):
+                 profile_name = section.split(":", 1)[1]
+                 ftps = self.config[section]["ftps"].lower() == "true"
+                 ftp_conn = (
+                     FTP_TLS(self.config[section]["server"])
+                     if ftps
+                     else FTP(self.config[section]["server"])
+                 )  # noqa: S321
+                 ftp_conn.login(
+                     self.config[section]["username"], self.config[section]["password"]
+                 )
+                 self.profiles.append({"profile": profile_name, "session": ftp_conn})
+
+         if not self.profiles:
+             logger.warning("no ftp section in config")
+
+         self.current_profile: Optional[Dict[str, Any]] = (
+             self.profiles[0] if self.profiles else None
+         )
+         self.ftp = self.current_profile["session"]
+
+     def switch_profile(self, profile_name: str) -> None:
+         for profile in self.profiles:
+             if profile["profile"] == profile_name:
+                 self.current_profile = profile
+                 self.ftp = profile["session"]
+                 return
+         logger.warning(f"Profile {profile_name} not found")
+
+     def upload_file(self, local_file, remote_folder, remote_file=None):
+         if not remote_file:
+             remote_file = Path(local_file).name
+
+         self._create_remote_dir_tree(full_path=f"/{remote_folder}{remote_file}")
+
+         self.ftp.cwd(remote_folder)
+
+         with open(local_file, "rb") as f:
+             self.ftp.storbinary(f"STOR {remote_file}", f)
+
+     def download_file(self, local_file, remote_file):
+         with open(local_file, "wb") as f:
+             self.ftp.retrbinary(f"RETR {remote_file}", f.write)
+
+     def _create_remote_dir_tree(self, full_path):
+         dir_tree = full_path.split("/")[0:-1]  # Exclude filename
+
+         for part in dir_tree:
+             if not part:
+                 continue
+
+             try:
+                 self.ftp.cwd(part)
+             except Exception as e:
+                 logger.warning(f"Error while creating remote directory: {e}")
+                 self.ftp.mkd(part)
+                 self.ftp.cwd(part)
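
A similar hedged sketch for the rewritten FTPInterface, which reads [ftp:<profile>] sections rather than a single [ftp] block; host, credentials, and paths are placeholders, and exact remote-path handling depends on the server's directory layout:

from configparser import ConfigParser

from datamarket.interfaces.ftp import FTPInterface

# Hypothetical config; "ftps = true" selects FTP_TLS, anything else plain FTP.
config = ConfigParser()
config.read_string("""
[ftp:backup]
server = ftp.example.com
username = user
password = secret
ftps = true
""")

ftp = FTPInterface(config)
ftp.switch_profile("backup")  # optional; the first profile found is active by default

ftp.upload_file("report.csv", remote_folder="reports/")
ftp.download_file("report_copy.csv", "reports/report.csv")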
{datamarket-0.9.50 → datamarket-0.9.52}/src/datamarket/utils/strings/normalization.py
@@ -137,8 +137,13 @@ def normalize(
      # Parameter mapping
      if isinstance(mode, str):
          mode = NormalizationMode[mode.upper()]
+     if not isinstance(mode, NormalizationMode):
+         raise TypeError("mode must be NormalizationMode or str")
+
      if isinstance(naming, str):
          naming = NamingConvention[naming.upper()]
+     if not isinstance(naming, NamingConvention):
+         raise TypeError("naming must be NamingConvention or str")

      _allowed_symbols_set: Set[str] = set(allowed_symbols) if allowed_symbols else set()

@@ -148,7 +153,11 @@ def normalize(
      elif not isinstance(s, str):
          return str(s)
      else:
-         text = prettify(strip_html(str(s), True))
+         raw_text = str(s)
+         if naming is NamingConvention.NONE:
+             text = raw_text
+         else:
+             text = prettify(strip_html(raw_text, True))

      if mode is NormalizationMode.NONE:
          normalized = text
@@ -170,9 +179,7 @@ def normalize(

      for c in intermediate_text:
          cat = unicodedata.category(c)
-         if c in _allowed_symbols_set:  # Allowed symbols are part of tokens
-             current_token_chars.append(c)
-         elif c.isalnum():
+         if c in _allowed_symbols_set or c.isalnum():  # Allowed symbols are part of tokens
              current_token_chars.append(c)
          elif mode is NormalizationMode.FULL and cat.startswith("S"):
              # Transliterate S* category symbols not in allowed_symbols
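
The normalization change above makes parameter coercion strict: string values for mode and naming are mapped to their enum members, and any other type now raises TypeError. A standalone sketch of that coercion pattern, using a hypothetical enum rather than datamarket's actual definitions:

from enum import Enum, auto
from typing import Union


class Mode(Enum):  # hypothetical stand-in for NormalizationMode
    NONE = auto()
    FULL = auto()


def coerce_mode(mode: Union[str, Mode]) -> Mode:
    # Accept either the enum member itself or its name as a string.
    if isinstance(mode, str):
        mode = Mode[mode.upper()]  # "full" -> Mode.FULL
    if not isinstance(mode, Mode):
        raise TypeError("mode must be Mode or str")
    return mode


assert coerce_mode("full") is Mode.FULL
assert coerce_mode(Mode.NONE) is Mode.NONE
# coerce_mode(42) would raise TypeError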
datamarket-0.9.50/src/datamarket/interfaces/ftp.py
@@ -1,62 +0,0 @@
- ########################################################################################################################
- # IMPORTS
-
- import logging
- from ftplib import FTP, FTP_TLS
- from pathlib import Path
-
- from dynaconf import Dynaconf
-
- ########################################################################################################################
- # CLASSES
-
- logger = logging.getLogger(__name__)
-
- class FTPInterface:
-     def __init__(self, config: Dynaconf):
-         if "ftp" in config:
-             self.config = config["ftp"]
-
-             self.ftp = self.get_ftp()
-         else:
-             logger.warning("no ftp section in config")
-
-     def get_ftp(self):
-         if self.config["ftps"]:
-             ftp_conn = FTP_TLS(self.config["server"])
-
-         else:
-             ftp_conn = FTP(self.config["server"])
-
-         ftp_conn.login(self.config["username"], self.config["password"])
-
-         return ftp_conn
-
-     def upload_file(self, local_file, remote_folder, remote_file=None):
-         if not remote_file:
-             remote_file = Path(local_file).name
-
-         self._create_remote_dir_tree(full_path=f"{remote_folder}{remote_file}")
-
-         self.ftp.cwd(remote_folder)
-
-         with open(local_file, "rb") as f:
-             self.ftp.storbinary(f"STOR {remote_file}", f)
-
-     def download_file(self, local_file, remote_file):
-         with open(local_file, "wb") as f:
-             self.ftp.retrbinary(f"RETR {remote_file}", f.write)
-
-     def _create_remote_dir_tree(self, full_path):
-         dir_tree = full_path.split("/")[0:-1]  # Exclude filename
-
-         for part in dir_tree:
-             if not part:
-                 continue
-
-             try:
-                 self.ftp.cwd(part)
-             except Exception as e:
-                 logger.warning(f"Error while creating remote directory: {e}")
-                 self.ftp.mkd(part)
-                 self.ftp.cwd(part)