datamarket 0.9.50__py3-none-any.whl → 0.9.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datamarket has been flagged as possibly problematic; consult the registry's advisory page for details.
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/ftp.py +33 -19
- datamarket/utils/strings/normalization.py +11 -4
- {datamarket-0.9.50.dist-info → datamarket-0.9.52.dist-info}/METADATA +3 -2
- {datamarket-0.9.50.dist-info → datamarket-0.9.52.dist-info}/RECORD +7 -6
- {datamarket-0.9.50.dist-info → datamarket-0.9.52.dist-info}/WHEEL +1 -1
- {datamarket-0.9.50.dist-info → datamarket-0.9.52.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# IMPORTS
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
from azure.storage.blob import BlobServiceClient
|
|
9
|
+
from pendulum import now
|
|
10
|
+
|
|
11
|
+
########################################################################################################################
# CLASSES

logger = logging.getLogger(__name__)


class AzureBlobInterface:
    """Interface to Azure Blob Storage with support for multiple profiles.

    Profiles are read from config sections named ``azure:<profile>``; each
    section must provide ``connection_string`` and ``container_name``. The
    first profile found becomes the active one.
    """

    def __init__(self, config):
        """Build one blob-container session per ``azure:*`` section of *config*.

        Args:
            config: Mapping-like config object (e.g. configparser/Dynaconf)
                exposing ``sections()`` and per-section item access.
        """
        self.profiles: List[Dict[str, Any]] = []
        self.config = config

        # ``sections`` may be absent on some config objects; default to none.
        for section in getattr(self.config, "sections", lambda: [])():
            if section.startswith("azure:"):
                profile_name = section.split(":", 1)[1]
                connection_string = self.config[section].get("connection_string")
                container_name = self.config[section].get("container_name")

                self.profiles.append(
                    {
                        "profile": profile_name,
                        "container_name": container_name,
                        # A ContainerClient bound to the configured container.
                        "session": BlobServiceClient.from_connection_string(
                            connection_string
                        ).get_container_client(container_name),
                    }
                )

        if not self.profiles:
            logger.warning("No Azure profiles found in config file")
        self.current_profile: Optional[Dict[str, Any]] = (
            self.profiles[0] if self.profiles else None
        )

    def switch_profile(self, profile_name: str) -> None:
        """Make *profile_name* the active profile; warn if it is unknown."""
        for profile in self.profiles:
            if profile["profile"] == profile_name:
                self.current_profile = profile
                return
        logger.warning(f"Profile {profile_name} not found")

    @staticmethod
    def _summary_path(remote_path: str) -> str:
        """Return the path of the ``*_resumen.csv`` summary blob for *remote_path*.

        Replaces only the final extension. (The previous implementation used
        ``remote_path.split(".")[0]``, which truncated at the FIRST dot and
        mangled any path whose folder names contain dots.)
        """
        from pathlib import PurePosixPath  # blob keys always use "/" separators

        return str(PurePosixPath(remote_path).with_suffix("")) + "_resumen.csv"

    def upload_file(
        self,
        local_file,
        remote_folder,
        remote_file=None,
        upload_file_info=False,
        **kwargs,
    ):
        """Upload *local_file* to ``remote_folder/remote_file``, overwriting.

        Args:
            local_file: Path of the local file to upload.
            remote_folder: Destination folder (blob prefix); may be empty.
            remote_file: Destination blob name; defaults to the local filename.
            upload_file_info: When True, also upload a one-row CSV summary.
            **kwargs: Forwarded to :meth:`upload_file_info`
                (expects ``num_rows`` and ``schema_version``).
        """
        if not remote_file:
            remote_file = Path(local_file).name

        remote_path = f"{remote_folder}/{remote_file}" if remote_folder else remote_file

        blob_client = self.current_profile["session"].get_blob_client(remote_path)
        with open(local_file, "rb") as data:
            blob_client.upload_blob(data, overwrite=True)

        if upload_file_info:
            self.upload_file_info(remote_path, **kwargs)

    def upload_file_info(self, remote_path, **kwargs):
        """Upload a one-row CSV summary describing the blob at *remote_path*.

        Args:
            remote_path: Blob path the summary refers to.
            **kwargs: Must contain ``num_rows`` and ``schema_version``.

        Raises:
            KeyError: If ``num_rows`` or ``schema_version`` is missing.
        """
        summary_file = self._summary_path(remote_path)
        blob_client = self.current_profile["session"].get_blob_client(summary_file)

        new_record = {
            "file": remote_path,
            "num_rows": kwargs["num_rows"],
            "schema_version": kwargs["schema_version"],
            "upload_date": now(tz="Europe/Madrid").to_datetime_string(),
        }

        new_record_str = "file,num_rows,schema_version,upload_date\n"
        new_record_str += ",".join([str(v) for v in new_record.values()]) + "\n"

        blob_client.upload_blob(new_record_str, overwrite=True)

    def download_file(self, local_file, remote_path):
        """Download the blob at *remote_path* into the local file *local_file*."""
        blob_client = self.current_profile["session"].get_blob_client(remote_path)
        blob_data = blob_client.download_blob()
        with open(local_file, "wb") as f:
            blob_data.readinto(f)

    def check_file_exists_and_not_empty(self, remote_file, remote_folder):
        """
        Checks if a blob exists in the specified folder and has a size greater than 100 bytes.

        Args:
            remote_file (str): The name of the file (blob) to check.
            remote_folder (str): The folder (prefix) where the file is located.

        Returns:
            bool: True if the blob exists and has a size greater than 100, False otherwise.
        """

        remote_path = f"{remote_folder}/{remote_file}" if remote_folder else remote_file

        try:
            blob_client = self.current_profile["session"].get_blob_client(remote_path)
            if blob_client.exists():
                properties = blob_client.get_blob_properties()
                if properties.size > 100:  # Check if size is greater than 100 bytes
                    logger.debug(
                        f"Blob '{remote_path}' exists and is not empty (size: {properties.size})."
                    )
                    return True
                else:
                    logger.debug(
                        f"Blob '{remote_path}' exists but size ({properties.size}) is not > 100 bytes."
                    )
                    return False
            else:
                logger.debug(f"Blob '{remote_path}' does not exist.")
                return False
        except Exception as e:
            logger.error(f"Error checking blob '{remote_path}': {e}")
            # In case of error, assume it doesn't exist or is empty to allow upload attempt
            return False
|
datamarket/interfaces/ftp.py
CHANGED
|
@@ -4,39 +4,53 @@
|
|
|
4
4
|
import logging
|
|
5
5
|
from ftplib import FTP, FTP_TLS
|
|
6
6
|
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
from dynaconf import Dynaconf
|
|
7
|
+
from typing import Any, Dict, List, Optional
|
|
9
8
|
|
|
10
9
|
########################################################################################################################
|
|
11
10
|
# CLASSES
|
|
12
11
|
|
|
13
12
|
logger = logging.getLogger(__name__)
|
|
14
13
|
|
|
15
|
-
class FTPInterface:
|
|
16
|
-
def __init__(self, config: Dynaconf):
|
|
17
|
-
if "ftp" in config:
|
|
18
|
-
self.config = config["ftp"]
|
|
19
14
|
|
|
20
|
-
|
|
21
|
-
|
|
15
|
+
class FTPInterface:
|
|
16
|
+
def __init__(self, config):
    """Open an FTP(S) session for every ``ftp:<profile>`` section of *config*.

    Each section must provide ``server``, ``username``, ``password`` and an
    ``ftps`` flag ("true"/"false") selecting FTP_TLS over plain FTP. The
    first profile found becomes the active one.

    Args:
        config: Mapping-like config object (e.g. configparser/Dynaconf)
            exposing ``sections()`` and per-section item access.
    """
    self.profiles: List[Dict[str, Any]] = []
    self.config = config
    # ``sections`` may be absent on some config objects; default to none.
    for section in getattr(self.config, "sections", lambda: [])():
        if section.startswith("ftp:"):
            profile_name = section.split(":", 1)[1]
            ftps = self.config[section]["ftps"].lower() == "true"
            ftp_conn = (
                FTP_TLS(self.config[section]["server"])
                if ftps
                else FTP(self.config[section]["server"])
            )  # noqa: S321
            ftp_conn.login(
                self.config[section]["username"], self.config[section]["password"]
            )
            self.profiles.append({"profile": profile_name, "session": ftp_conn})

    if not self.profiles:
        logger.warning("no ftp section in config")

    self.current_profile: Optional[Dict[str, Any]] = (
        self.profiles[0] if self.profiles else None
    )
    # Bug fix: ``self.current_profile["session"]`` raised TypeError when no
    # ftp profile was configured (current_profile is None in that case).
    self.ftp = self.current_profile["session"] if self.current_profile else None
|
|
27
40
|
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
41
|
+
def switch_profile(self, profile_name: str) -> None:
    """Activate the stored FTP profile named *profile_name*.

    Updates ``current_profile`` and the active ``ftp`` session; logs a
    warning and leaves the current selection untouched when no profile
    with that name exists.
    """
    match = next(
        (entry for entry in self.profiles if entry["profile"] == profile_name),
        None,
    )
    if match is None:
        logger.warning(f"Profile {profile_name} not found")
        return
    self.current_profile = match
    self.ftp = match["session"]
|
|
34
48
|
|
|
35
49
|
def upload_file(self, local_file, remote_folder, remote_file=None):
|
|
36
50
|
if not remote_file:
|
|
37
51
|
remote_file = Path(local_file).name
|
|
38
52
|
|
|
39
|
-
self._create_remote_dir_tree(full_path=f"{remote_folder}{remote_file}")
|
|
53
|
+
self._create_remote_dir_tree(full_path=f"/{remote_folder}{remote_file}")
|
|
40
54
|
|
|
41
55
|
self.ftp.cwd(remote_folder)
|
|
42
56
|
|
|
@@ -59,4 +73,4 @@ class FTPInterface:
|
|
|
59
73
|
except Exception as e:
|
|
60
74
|
logger.warning(f"Error while creating remote directory: {e}")
|
|
61
75
|
self.ftp.mkd(part)
|
|
62
|
-
self.ftp.cwd(part)
|
|
76
|
+
self.ftp.cwd(part)
|
|
@@ -137,8 +137,13 @@ def normalize(
|
|
|
137
137
|
# Parameter mapping
|
|
138
138
|
if isinstance(mode, str):
|
|
139
139
|
mode = NormalizationMode[mode.upper()]
|
|
140
|
+
if not isinstance(mode, NormalizationMode):
|
|
141
|
+
raise TypeError("mode must be NormalizationMode or str")
|
|
142
|
+
|
|
140
143
|
if isinstance(naming, str):
|
|
141
144
|
naming = NamingConvention[naming.upper()]
|
|
145
|
+
if not isinstance(naming, NamingConvention):
|
|
146
|
+
raise TypeError("naming must be NamingConvention or str")
|
|
142
147
|
|
|
143
148
|
_allowed_symbols_set: Set[str] = set(allowed_symbols) if allowed_symbols else set()
|
|
144
149
|
|
|
@@ -148,7 +153,11 @@ def normalize(
|
|
|
148
153
|
elif not isinstance(s, str):
|
|
149
154
|
return str(s)
|
|
150
155
|
else:
|
|
151
|
-
|
|
156
|
+
raw_text = str(s)
|
|
157
|
+
if naming is NamingConvention.NONE:
|
|
158
|
+
text = raw_text
|
|
159
|
+
else:
|
|
160
|
+
text = prettify(strip_html(raw_text, True))
|
|
152
161
|
|
|
153
162
|
if mode is NormalizationMode.NONE:
|
|
154
163
|
normalized = text
|
|
@@ -170,9 +179,7 @@ def normalize(
|
|
|
170
179
|
|
|
171
180
|
for c in intermediate_text:
|
|
172
181
|
cat = unicodedata.category(c)
|
|
173
|
-
if c in _allowed_symbols_set: # Allowed symbols are part of tokens
|
|
174
|
-
current_token_chars.append(c)
|
|
175
|
-
elif c.isalnum():
|
|
182
|
+
if c in _allowed_symbols_set or c.isalnum(): # Allowed symbols are part of tokens
|
|
176
183
|
current_token_chars.append(c)
|
|
177
184
|
elif mode is NormalizationMode.FULL and cat.startswith("S"):
|
|
178
185
|
# Transliterate S* category symbols not in allowed_symbols
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.
|
|
3
|
+
Version: 0.9.52
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
|
+
License-File: LICENSE
|
|
6
7
|
Author: DataMarket
|
|
7
8
|
Author-email: techsupport@datamarket.es
|
|
8
9
|
Requires-Python: >=3.12,<3.13
|
|
@@ -4,8 +4,9 @@ datamarket/exceptions/main.py,sha256=MP5ql6M7DoMbBf-Dg_2ohcUFdWXgzv-dXHntPPit31s
|
|
|
4
4
|
datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
datamarket/interfaces/alchemy.py,sha256=mQwjDqBpz1QHRV2JTCALvn5iK_ky69oE2Gw-EtRXsqQ,14664
|
|
6
6
|
datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
|
|
7
|
+
datamarket/interfaces/azure.py,sha256=4lxjL4O4nGO8aDmzY8m9x6vzgYpzQaBWwhERrTuwtqA,4936
|
|
7
8
|
datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjHk,4840
|
|
8
|
-
datamarket/interfaces/ftp.py,sha256=
|
|
9
|
+
datamarket/interfaces/ftp.py,sha256=t4zU3ccKo2-8R3nZvM0gqgWbLMRr21jYS5bYpX0jjWk,2772
|
|
9
10
|
datamarket/interfaces/nominatim.py,sha256=HLk0FcdfbOVCF_i71l-Hlb17swL0W1a3Gg2n5OLD0tM,15507
|
|
10
11
|
datamarket/interfaces/peerdb.py,sha256=cwYwvO740GyaPo9zLAwJsf3UeJDGDiYzjQVM9Q6s-_g,23652
|
|
11
12
|
datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
|
|
@@ -22,11 +23,11 @@ datamarket/utils/playwright/sync_api.py,sha256=Tw_-KLB3vipFuEQwcX8iCbj7giCzcwXB-
|
|
|
22
23
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
23
24
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
24
25
|
datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
|
|
25
|
-
datamarket/utils/strings/normalization.py,sha256=
|
|
26
|
+
datamarket/utils/strings/normalization.py,sha256=rj0wfJSjqcCRp-ruHqc5pylO3_TOmY5_V1lKzkyWoAA,8991
|
|
26
27
|
datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
|
|
27
28
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
28
29
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
29
|
-
datamarket-0.9.
|
|
30
|
-
datamarket-0.9.
|
|
31
|
-
datamarket-0.9.
|
|
32
|
-
datamarket-0.9.
|
|
30
|
+
datamarket-0.9.52.dist-info/METADATA,sha256=80X3F8UhM8-vlWqkp3_72Tmh0TzGJudtngDOt5x7Kcc,7348
|
|
31
|
+
datamarket-0.9.52.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
|
32
|
+
datamarket-0.9.52.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
33
|
+
datamarket-0.9.52.dist-info/RECORD,,
|
|
File without changes
|