datamarket 0.6.0__py3-none-any.whl → 0.10.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic.
- datamarket/__init__.py +0 -1
- datamarket/exceptions/__init__.py +1 -0
- datamarket/exceptions/main.py +118 -0
- datamarket/interfaces/alchemy.py +1934 -25
- datamarket/interfaces/aws.py +81 -14
- datamarket/interfaces/azure.py +127 -0
- datamarket/interfaces/drive.py +60 -10
- datamarket/interfaces/ftp.py +37 -14
- datamarket/interfaces/llm.py +1220 -0
- datamarket/interfaces/nominatim.py +314 -42
- datamarket/interfaces/peerdb.py +272 -104
- datamarket/interfaces/proxy.py +354 -50
- datamarket/interfaces/tinybird.py +7 -15
- datamarket/params/nominatim.py +439 -0
- datamarket/utils/__init__.py +1 -1
- datamarket/utils/airflow.py +10 -7
- datamarket/utils/alchemy.py +2 -1
- datamarket/utils/logs.py +88 -0
- datamarket/utils/main.py +138 -10
- datamarket/utils/nominatim.py +201 -0
- datamarket/utils/playwright/__init__.py +0 -0
- datamarket/utils/playwright/async_api.py +274 -0
- datamarket/utils/playwright/sync_api.py +281 -0
- datamarket/utils/requests.py +655 -0
- datamarket/utils/selenium.py +6 -12
- datamarket/utils/strings/__init__.py +1 -0
- datamarket/utils/strings/normalization.py +217 -0
- datamarket/utils/strings/obfuscation.py +153 -0
- datamarket/utils/strings/standardization.py +40 -0
- datamarket/utils/typer.py +2 -1
- datamarket/utils/types.py +1 -0
- datamarket-0.10.3.dist-info/METADATA +172 -0
- datamarket-0.10.3.dist-info/RECORD +38 -0
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info}/WHEEL +1 -2
- datamarket-0.6.0.dist-info/METADATA +0 -49
- datamarket-0.6.0.dist-info/RECORD +0 -24
- datamarket-0.6.0.dist-info/top_level.txt +0 -1
- {datamarket-0.6.0.dist-info → datamarket-0.10.3.dist-info/licenses}/LICENSE +0 -0
datamarket/utils/strings/normalization.py
ADDED
@@ -0,0 +1,217 @@
+########################################################################################################################
+# IMPORTS
+import unicodedata
+from enum import Enum, auto
+from typing import Any, Optional, Set, Union
+
+import numpy as np
+from inflection import camelize, parameterize, titleize, underscore
+from string_utils import prettify, strip_html
+from unidecode import unidecode
+
+########################################################################################################################
+# CLASSES
+
+
+class NormalizationMode(Enum):
+    NONE = auto()
+    BASIC = auto()  # removes accents and converts punctuation to spaces
+    SYMBOLS = auto()  # translates only symbols to Unicode name
+    FULL = auto()  # BASIC + SYMBOLS
+
+
+class NamingConvention(Enum):
+    NONE = auto()  # no style change
+    LOWER = auto()  # lowercase
+    UPPER = auto()  # UPPERCASE
+    CONSTANT = auto()  # CONSTANT_CASE (uppercase, underscores)
+    SNAKE = auto()  # snake_case (lowercase, underscores)
+    CAMEL = auto()  # camelCase (capitalize words except first one, no spaces)
+    PASCAL = auto()  # PascalCase (capitalize words including first one, no spaces)
+    PARAM = auto()  # parameterize (hyphens)
+    TITLE = auto()  # titleize (capitalize words)
+
+
+########################################################################################################################
+# FUNCTIONS
+
+
+def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False) -> str:
+    """
+    Processes a string by unidecoding characters, optionally lowercasing them,
+    while preserving a specified set of allowed characters.
+
+    Args:
+        input_text: The string to process.
+        allowed_chars: A set of characters to preserve in their original form.
+        apply_lowercase: Whether to convert unidecoded characters to lowercase. Defaults to False.
+
+    Returns:
+        The processed string.
+    """
+    chars_list: list[str] = []
+    for char_original in input_text:
+        if char_original in allowed_chars:
+            chars_list.append(char_original)
+        else:
+            decoded_segment = unidecode(char_original)
+            for dc in decoded_segment:  # unidecode can return multiple chars
+                if apply_lowercase:
+                    chars_list.append(dc.lower())
+                else:
+                    chars_list.append(dc)
+    return "".join(chars_list)
+
+
+def transliterate_symbols(s: str, allowed_symbols_set: Optional[Set[str]] = None) -> str:
+    """
+    Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
+    with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
+
+    Args:
+        s: The input string.
+        allowed_symbols_set: A set of characters to preserve without transliteration.
+
+    Returns:
+        The string with symbols transliterated or preserved.
+    """
+    if allowed_symbols_set is None:
+        allowed_symbols_set = set()
+    out: list[str] = []
+    for c in s:
+        if c in allowed_symbols_set:
+            out.append(c)
+        elif unicodedata.category(c).startswith("S"):
+            name = unicodedata.name(c, "")
+            if name:
+                out.append(name.lower().replace(" ", "_"))
+        else:
+            out.append(c)
+    return "".join(out)
+
+
+def normalize(
+    s: Any,
+    mode: Union[NormalizationMode, str] = NormalizationMode.BASIC,
+    naming: Union[NamingConvention, str] = NamingConvention.LOWER,
+    allowed_symbols: Optional[str] = None,
+) -> str:
+    """
+    Normalizes and applies a naming convention to the input.
+
+    Handles None and NaN values by returning an empty string. Converts non-string inputs to strings.
+
+    Normalization (controlled by `mode`) occurs first, followed by naming convention application.
+    - NONE: Returns the input as a string without any normalization. Case is preserved.
+    - BASIC: Removes accents (via unidecode). Punctuation and spaces typically become single spaces between tokens.
+      Case is preserved from the unidecode step by default.
+    - SYMBOLS: Translates only Unicode symbols (category S*) to their lowercase Unicode names with underscores.
+      Other characters are preserved, including their case.
+    - FULL: Applies unidecode (case-preserved by default) and then SYMBOLS-like transliteration for S* category
+      characters not otherwise handled.
+
+    The `allowed_symbols` parameter can be used to specify characters that should be preserved in their original form
+    throughout the normalization process. These characters will not be unidecoded or transliterated by the symbol logic.
+
+    After normalization, a naming convention (controlled by `naming`) is applied:
+    - NONE: Returns the normalized text, preserving its case from the normalization step.
+    - LOWER: Converts the normalized text to lowercase. (Default)
+    - UPPER: Converts the normalized text to UPPERCASE.
+    - CONSTANT: Converts to CONSTANT_CASE (uppercase with underscores).
+    - SNAKE: Converts to snake_case (lowercase with underscores).
+    - CAMEL: Converts to camelCase (lowercase first word, capitalize subsequent words, no spaces).
+    - PASCAL: Converts to PascalCase (capitalize all words, no spaces).
+    - PARAM: Converts to parameterize (lowercase with hyphens).
+    - TITLE: Converts to Title Case (capitalize each word).
+
+    Args:
+        s: The input value to normalize and format. Can be any type.
+        mode: The normalization mode to apply. Defaults to NormalizationMode.BASIC.
+        naming: The naming convention to apply. Defaults to NamingConvention.LOWER.
+        allowed_symbols: A string of characters to preserve during normalization.
+
+    Returns:
+        The normalized and formatted string.
+    """
+    # Parameter mapping
+    if isinstance(mode, str):
+        mode = NormalizationMode[mode.upper()]
+    if not isinstance(mode, NormalizationMode):
+        raise TypeError("mode must be NormalizationMode or str")
+
+    if isinstance(naming, str):
+        naming = NamingConvention[naming.upper()]
+    if not isinstance(naming, NamingConvention):
+        raise TypeError("naming must be NamingConvention or str")
+
+    _allowed_symbols_set: Set[str] = set(allowed_symbols) if allowed_symbols else set()
+
+    # Handling null values
+    if s is None or (isinstance(s, float) and np.isnan(s)):
+        normalized = ""
+    elif not isinstance(s, str):
+        return str(s)
+    else:
+        raw_text = str(s)
+        if naming is NamingConvention.NONE:
+            text = raw_text
+        else:
+            text = prettify(strip_html(raw_text, True))
+
+        if mode is NormalizationMode.NONE:
+            normalized = text
+        elif mode is NormalizationMode.SYMBOLS:
+            normalized = transliterate_symbols(text, _allowed_symbols_set)
+        else:
+            # BASIC and FULL modes
+            intermediate_text = get_unidecoded_text(text, _allowed_symbols_set)
+
+            # Now, tokenize the intermediate_text for BASIC and FULL
+            tokens: list[str] = []
+            current_token_chars: list[str] = []
+
+            def flush_current_token():
+                nonlocal current_token_chars
+                if current_token_chars:
+                    tokens.append("".join(current_token_chars))
+                    current_token_chars.clear()
+
+            for c in intermediate_text:
+                cat = unicodedata.category(c)
+                if c in _allowed_symbols_set or c.isalnum():  # Allowed symbols are part of tokens
+                    current_token_chars.append(c)
+                elif mode is NormalizationMode.FULL and cat.startswith("S"):
+                    # Transliterate S* category symbols not in allowed_symbols
+                    flush_current_token()
+                    name = unicodedata.name(c, "")
+                    if name:
+                        tokens.append(name.lower().replace(" ", "_"))
+                elif cat.startswith("P") or c.isspace():
+                    # Punctuation (not allowed) or space acts as a separator
+                    flush_current_token()
+                # Other characters are ignored
+
+            flush_current_token()
+            normalized = " ".join(tokens)
+
+    # Apply naming convention
+    if naming is NamingConvention.NONE:
+        return normalized
+    if naming is NamingConvention.LOWER:
+        return normalized.lower()
+    if naming is NamingConvention.UPPER:
+        return normalized.upper()
+    if naming is NamingConvention.PARAM:
+        return parameterize(normalized)
+    if naming is NamingConvention.TITLE:
+        return titleize(normalized)
+
+    underscored = underscore(parameterize(normalized))
+    if naming is NamingConvention.CONSTANT:
+        return underscored.upper()
+    if naming is NamingConvention.CAMEL:
+        return camelize(underscored, False)
+    if naming is NamingConvention.PASCAL:
+        return camelize(underscored)
+
+    return underscored
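For a sense of the new API, a minimal usage sketch; the expected outputs are inferred from reading the implementation above (including its prettify/unidecode steps), not taken from the package's tests:

```python
from datamarket.utils.strings.normalization import (
    NamingConvention,
    NormalizationMode,
    normalize,
    transliterate_symbols,
)

# BASIC (default): accents stripped, punctuation/whitespace become token breaks,
# then the default LOWER naming convention lowercases the result.
normalize("Café con Leche!")                                  # -> "cafe con leche"

# Naming conventions accept enum members or their names as strings.
normalize("Café con Leche!", naming="SNAKE")                  # -> "cafe_con_leche"
normalize("Café con Leche!", naming=NamingConvention.PASCAL)  # -> "CafeConLeche"

# SYMBOLS leaves ordinary text alone and spells out Unicode S* symbols by name.
transliterate_symbols("5 €")                                  # -> "5 euro_sign"

# Null-ish and non-string inputs short-circuit.
normalize(None)  # -> ""
normalize(3.14)  # -> "3.14" (non-strings are returned via str(), no naming applied)
```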
datamarket/utils/strings/obfuscation.py
ADDED
@@ -0,0 +1,153 @@
+########################################################################################################################
+# IMPORTS
+
+import logging
+import warnings
+from typing import List, Optional
+
+
+class PiiDependenciesMissingError(ImportError):
+    pass
+
+
+class SpacyModelNotFoundError(ImportError):
+    pass
+
+
+try:
+    import phonenumbers  # type: ignore
+    import spacy  # type: ignore
+    from presidio_analyzer import AnalyzerEngine, RecognizerRegistry  # type: ignore
+    from presidio_analyzer.nlp_engine import NlpEngineProvider  # type: ignore
+    from presidio_analyzer.predefined_recognizers import PhoneRecognizer  # type: ignore
+    from presidio_anonymizer import AnonymizerEngine  # type: ignore
+    from spacy.language import Language  # type: ignore
+    from spacy_langdetect import LanguageDetector  # type: ignore
+except ImportError as e:
+    raise PiiDependenciesMissingError(
+        "One or more PII anonymization dependencies are missing. "
+        "Please install them by running: pip install datamarket[pii]\n"
+        f"Original error: {e}"
+    ) from e
+
+
+########################################################################################################################
+# SETTINGS
+
+logger = logging.getLogger()
+logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)
+
+warnings.filterwarnings(
+    "ignore",
+    message=r"\[W108\]",
+    category=UserWarning,
+    module="spacy.pipeline.lemmatizer",
+)
+
+
+@Language.factory("language_detector")
+def get_lang_detector(nlp, name):
+    return LanguageDetector()
+
+
+########################################################################################################################
+# CLASSES
+
+
+class PiiAnonymizer:
+    SUPPORTED_LANG = ["es", "en"]
+
+    def __init__(self):
+        # Check for required spaCy models
+        required_models = {
+            "es_core_news_md": "python -m spacy download es_core_news_md",
+            "en_core_web_md": "python -m spacy download en_core_web_md",
+        }
+        missing_models_instructions = []
+        for model_name, install_command in required_models.items():
+            if not spacy.util.is_package(model_name):
+                missing_models_instructions.append(
+                    f"Model '{model_name}' not found. Please install it by running: {install_command}"
+                )
+
+        if missing_models_instructions:
+            raise SpacyModelNotFoundError("\n".join(missing_models_instructions))
+
+        self.anonymizer = AnonymizerEngine()
+        self.analyzer = self._load_analyzer_engine()
+
+        self.nlp = self._nlp()
+
+    def _nlp(self) -> Language:
+        analyzer_en_model = self.analyzer.nlp_engine.nlp.get("en")
+        shared_vocab = analyzer_en_model.vocab
+        nlp = spacy.blank("en", vocab=shared_vocab)
+
+        if nlp.has_factory("sentencizer"):
+            nlp.add_pipe("sentencizer")
+
+        if nlp.has_factory("language_detector"):
+            nlp.add_pipe("language_detector", last=True)
+
+        return nlp
+
+    @staticmethod
+    def _nlp_config():
+        return {
+            "nlp_engine_name": "spacy",
+            "models": [
+                {"lang_code": "es", "model_name": "es_core_news_md"},
+                {"lang_code": "en", "model_name": "en_core_web_md"},
+            ],
+        }
+
+    def _load_analyzer_engine(self) -> AnalyzerEngine:
+        provider = NlpEngineProvider(nlp_configuration=PiiAnonymizer._nlp_config())
+        nlp_engine = provider.create_engine()
+        phone_recognizer_es = PhoneRecognizer(
+            supported_language="es",
+            supported_regions=phonenumbers.SUPPORTED_REGIONS,
+            context=["teléfono", "móvil", "número"],
+        )
+        registry = RecognizerRegistry(supported_languages=self.SUPPORTED_LANG)
+        registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=self.SUPPORTED_LANG)
+        registry.add_recognizer(phone_recognizer_es)
+
+        analyzer = AnalyzerEngine(
+            registry=registry,
+            nlp_engine=nlp_engine,
+            supported_languages=self.SUPPORTED_LANG,
+        )
+        return analyzer
+
+    def detect_lang(self, text: str) -> str:
+        if hasattr(self, "nlp") and self.nlp:
+            with self.nlp.select_pipes(enable=["tokenizer", "sentencizer", "language_detector"]):
+                doc = self.nlp(text)
+                return doc._.language["language"]
+        else:
+            logger.error("Language detection NLP model not initialized. Cannot detect language.")
+            return "unknown"
+
+    def anonymize_text(
+        self,
+        text: str,
+        entities: Optional[List[str]] = None,
+        lang: str = "unknown",
+    ) -> str:
+        if lang == "unknown":
+            lang = self.detect_lang(text)
+            if lang not in self.SUPPORTED_LANG:
+                logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
+                return ""
+        elif lang not in self.SUPPORTED_LANG:
+            logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
+            return ""
+
+        analyzer_result = self.analyzer.analyze(
+            text=text,
+            entities=entities,
+            language=lang,
+        )
+        anonymizer_result = self.anonymizer.anonymize(text=text, analyzer_results=analyzer_result)
+        return anonymizer_result.text
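A minimal usage sketch, assuming `pip install datamarket[pii]` plus both spaCy models are installed; the sample sentence and the redacted output are illustrative of Presidio's default `<ENTITY_TYPE>` replacement, not taken from the package docs:

```python
from datamarket.utils.strings.obfuscation import PiiAnonymizer

anonymizer = PiiAnonymizer()  # raises SpacyModelNotFoundError if a model is missing

text = "Me llamo Ana García y mi teléfono es 612345678."

anonymizer.detect_lang(text)  # -> "es" (via spacy_langdetect)

# Language is auto-detected when lang="unknown"; unsupported languages return "".
anonymizer.anonymize_text(text)
# e.g. -> "Me llamo <PERSON> y mi teléfono es <PHONE_NUMBER>."

# Restrict redaction to specific Presidio entities and skip detection:
anonymizer.anonymize_text(text, entities=["PHONE_NUMBER"], lang="es")
```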
datamarket/utils/strings/standardization.py
ADDED
@@ -0,0 +1,40 @@
+########################################################################################################################
+# IMPORTS
+
+import re
+from typing import Literal
+
+from ...params.nominatim import COUNTRY_PARSING_RULES
+
+########################################################################################################################
+# FUNCTIONS
+
+
+def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
+    """Clean and standardize phone number from a certain country_code
+
+    Args:
+        number (str): phone number
+        country_code (Literal["es", "pt"]): country code of the phone number to parse
+
+    Raises:
+        ValueError: when parsing is not supported for a certain country
+
+    Returns:
+        str | None: standardized phone number
+    """
+    clean_number = re.sub(r"\D", "", number)
+    if country_code in {"es", "pt"}:
+        # Get the validation regex from params
+        pattern = COUNTRY_PARSING_RULES[country_code]["phone_validate_pattern"]
+
+        # Validate and extract in one step
+        if len(clean_number) >= 9:  # Check if the cleaned number has at least 9 digits
+            match = pattern.match(clean_number)
+
+            # Return the captured group (the 9-digit number)
+            return match.group(0)[-9:] if match else None
+        else:
+            return None  # Or handle the case where the number is too short
+    else:
+        raise ValueError(f"Country code ({country_code}) is not currently supported")
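A quick sketch of the call pattern (the phone numbers are made up; the actual validation regexes live in `datamarket/params/nominatim.py` as `phone_validate_pattern` entries):

```python
from datamarket.utils.strings.standardization import parse_phone_number

# Non-digits are stripped first; a valid number comes back as its last 9 digits.
parse_phone_number("+34 612-34-56-78", "es")  # -> "612345678" (if the regex matches)

# Too short, or failing the country regex, yields None.
parse_phone_number("12345", "es")             # -> None

# Unsupported country codes raise.
parse_phone_number("912345678", "fr")         # raises ValueError
```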
datamarket/utils/typer.py
CHANGED
@@ -9,6 +9,7 @@ from typing_extensions import Annotated
 ########################################################################################################################
 # TYPES
 
+
 class Dict(dict):
     def __init__(self, value: str):
         super().__init__(json.loads(value))
@@ -25,4 +26,4 @@ def parse_json_dict(value: str) -> Dict:
 
 
 DictArg = Annotated[Dict, typer.Argument(parser=parse_json_dict)]
-DictOpt = Annotated[Dict, typer.Option(parser=parse_json_dict)]
\ No newline at end of file
+DictOpt = Annotated[Dict, typer.Option(parser=parse_json_dict)]
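The `DictArg`/`DictOpt` aliases here let a Typer CLI accept JSON objects directly; a hypothetical sketch (the command and option names are made up):

```python
import typer

from datamarket.utils.typer import DictOpt

app = typer.Typer()


@app.command()
def show(config: DictOpt):
    # Invoked as:  python cli.py --config '{"retries": 3}'
    # parse_json_dict runs json.loads on the raw string and wraps it in Dict.
    typer.echo(config.get("retries"))


if __name__ == "__main__":
    app()
```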
datamarket/utils/types.py
ADDED
@@ -0,0 +1 @@
+Json = str | int | float | bool | None | dict[str, "Json"] | list["Json"]
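The recursive alias is usable as an ordinary annotation for any JSON-shaped value, e.g. (illustrative):

```python
from datamarket.utils.types import Json

payload: Json = {"ok": True, "items": [1, "two", None, {"depth": 2.0}]}
```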
datamarket-0.10.3.dist-info/METADATA
ADDED
@@ -0,0 +1,172 @@
+Metadata-Version: 2.4
+Name: datamarket
+Version: 0.10.3
+Summary: Utilities that integrate advanced scraping knowledge into just one library.
+License: GPL-3.0-or-later
+License-File: LICENSE
+Author: DataMarket
+Author-email: techsupport@datamarket.es
+Requires-Python: >=3.12,<4.0
+Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
+Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Provides-Extra: aws
+Provides-Extra: azure-storage-blob
+Provides-Extra: boto3
+Provides-Extra: camoufox
+Provides-Extra: chompjs
+Provides-Extra: click
+Provides-Extra: clickhouse-driver
+Provides-Extra: datetime
+Provides-Extra: ddgs
+Provides-Extra: demjson3
+Provides-Extra: dnspython
+Provides-Extra: drive
+Provides-Extra: fake-useragent
+Provides-Extra: geoalchemy2
+Provides-Extra: geopandas
+Provides-Extra: google-api-python-client
+Provides-Extra: google-auth-httplib2
+Provides-Extra: google-auth-oauthlib
+Provides-Extra: html2text
+Provides-Extra: httpx
+Provides-Extra: json5
+Provides-Extra: llm
+Provides-Extra: lxml
+Provides-Extra: matplotlib
+Provides-Extra: nodriver
+Provides-Extra: openai
+Provides-Extra: openpyxl
+Provides-Extra: pandarallel
+Provides-Extra: pandas
+Provides-Extra: pandera
+Provides-Extra: peerdb
+Provides-Extra: pii
+Provides-Extra: pillow
+Provides-Extra: playwright
+Provides-Extra: playwright-stealth
+Provides-Extra: plotly
+Provides-Extra: pyarrow
+Provides-Extra: pydantic
+Provides-Extra: pydrive2
+Provides-Extra: pymupdf
+Provides-Extra: pyproj
+Provides-Extra: pyrate-limiter
+Provides-Extra: pysocks
+Provides-Extra: pyspark
+Provides-Extra: pytest
+Provides-Extra: retry
+Provides-Extra: shapely
+Provides-Extra: soda-core-mysql
+Provides-Extra: soda-core-postgres
+Provides-Extra: sqlparse
+Provides-Extra: tqdm
+Provides-Extra: undetected-chromedriver
+Provides-Extra: xmltodict
+Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0)
+Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
+Requires-Dist: babel (>=2.0.0,<3.0.0)
+Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
+Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
+Requires-Dist: browserforge (>=1.2.0,<2.0.0) ; extra == "camoufox"
+Requires-Dist: camoufox[geoip] (>=0.4.11,<0.5.0) ; extra == "camoufox"
+Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
+Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
+Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
+Requires-Dist: croniter (>=3.0.0,<4.0.0)
+Requires-Dist: datetime (>=5.0,<6.0) ; extra == "datetime"
+Requires-Dist: ddgs (>=9.0.0,<10.0.0) ; extra == "ddgs"
+Requires-Dist: demjson3 (>=3.0.0,<4.0.0) ; extra == "demjson3"
+Requires-Dist: dnspython (>=2.0.0,<3.0.0) ; extra == "dnspython"
+Requires-Dist: dynaconf (>=3.0.0,<4.0.0)
+Requires-Dist: fake-useragent (>=2.0.0,<3.0.0) ; extra == "fake-useragent"
+Requires-Dist: geoalchemy2 (>=0.17.0,<0.18.0) ; extra == "geoalchemy2"
+Requires-Dist: geopandas (>=1.0.0,<2.0.0) ; extra == "geopandas"
+Requires-Dist: geopy (>=2.0.0,<3.0.0)
+Requires-Dist: google-api-python-client (>=2.0.0,<3.0.0) ; extra == "google-api-python-client"
+Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-httplib2"
+Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
+Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
+Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
+Requires-Dist: inflection (>=0.5.0,<0.6.0)
+Requires-Dist: jellyfish (>=1.0.0,<2.0.0)
+Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
+Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
+Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
+Requires-Dist: matplotlib (>=3.0.0,<4.0.0) ; extra == "matplotlib"
+Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
+Requires-Dist: numpy (>=2.0.0,<3.0.0)
+Requires-Dist: openai (>=2.0.0,<3.0.0) ; extra == "openai" or extra == "llm"
+Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
+Requires-Dist: pandarallel (>=1.0.0,<2.0.0) ; extra == "pandarallel"
+Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
+Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
+Requires-Dist: pendulum (>=3.0.0,<4.0.0)
+Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
+Requires-Dist: playwright (==1.47.0) ; extra == "playwright" or extra == "camoufox"
+Requires-Dist: plotly (>=6.0.0,<7.0.0) ; extra == "plotly"
+Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
+Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
+Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
+Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
+Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
+Requires-Dist: pycountry (>=24.0.0,<25.0.0)
+Requires-Dist: pydantic (>=2.0.0,<3.0.0) ; extra == "pydantic" or extra == "llm"
+Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
+Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
+Requires-Dist: pyproj (>=3.0.0,<4.0.0) ; extra == "pyproj"
+Requires-Dist: pyrate-limiter (>=3.0.0,<4.0.0) ; extra == "pyrate-limiter"
+Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
+Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
+Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
+Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
+Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0)
+Requires-Dist: requests (>=2.0.0,<3.0.0)
+Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
+Requires-Dist: rnet (>=3.0.0rc10,<4.0.0)
+Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
+Requires-Dist: soda-core-mysql-utf8-hotfix (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
+Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
+Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
+Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
+Requires-Dist: sqlparse (>=0.5.0,<0.6.0) ; extra == "sqlparse"
+Requires-Dist: stem (>=1.0.0,<2.0.0)
+Requires-Dist: tenacity (>=9.0.0,<10.0.0)
+Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
+Requires-Dist: typer (>=0.15.0,<0.16.0)
+Requires-Dist: unidecode (>=1.0.0,<2.0.0)
+Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
+Project-URL: Documentation, https://github.com/Data-Market/datamarket
+Project-URL: Homepage, https://datamarket.es
+Project-URL: Repository, https://github.com/Data-Market/datamarket
+Description-Content-Type: text/markdown
+
+# DataMarket scraping core
+
+------------------------------------------------------
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+
+
+Utilities that integrate advanced scraping knowledge into just one library.
+
+## Installation
+
+To install this library in your Python environment:
+
+`pip install datamarket`
+
+## Documentation
+
+This library has built functionalities for the following topics:
+
+- **Databases**: through sqlalchemy it allows you to insert records and perform queries in any database.
+- **Proxies**: wide range of functions to perform HTTP requests through custom proxies or the Tor network.
+- **Tinybird**: a Python client for this popular API.
+- **Drive**: functions to upload, delete or authenticate to Google Drive.
+- **FTP**: functions to upload, delete or authenticate to an FTP, SFTP or FTPS server.
+- **Selenium**: wrapper for the main Selenium functions.
+
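Note that most heavy dependencies are gated behind extras, so a plain `pip install datamarket` stays lean; pulling in an optional stack looks like `pip install "datamarket[pii]"`, which per the requirements above brings in presidio-analyzer, presidio-anonymizer, spacy and spacy-langdetect.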
datamarket-0.10.3.dist-info/RECORD
ADDED
@@ -0,0 +1,38 @@
+datamarket/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamarket/exceptions/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
+datamarket/exceptions/main.py,sha256=S5EksLt_pmmX5OY-_keB12K3r5R-lTBnqdJ9VBPy8D8,3674
+datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamarket/interfaces/alchemy.py,sha256=KoZrXnfYAT9Gv9voKRlOgXVML5PdMqJr2sJbHSnWpvY,76516
+datamarket/interfaces/aws.py,sha256=4HEN_VfQuEEvDnksRYlcMBUdKbgJXWBkLnymKpyRtrs,4781
+datamarket/interfaces/azure.py,sha256=0pqd6LmQzRGjOUu85YKlPeQnlwsq0q5laNUw_iI3XPw,5180
+datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjHk,4840
+datamarket/interfaces/ftp.py,sha256=LH3Oz19k_xUNhzDXcrq5Ofb4c3uiph5pWUqpgiaDvHI,2671
+datamarket/interfaces/llm.py,sha256=rRD_Rghm8uSDNLpflwTxGVkzQAH2Or-FTXOcJHSstFg,43829
+datamarket/interfaces/nominatim.py,sha256=57hlW0w6XHBWEmyLyMn6eq1o_T5caYcLNBSNI1qLWCQ,16145
+datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspXzA,23640
+datamarket/interfaces/proxy.py,sha256=fke9THv2h1jpr5cxJ25w0bYchuboErokQlSiq50FWVE,14632
+datamarket/interfaces/tinybird.py,sha256=cNG-kAPTdQn2inlNX9LPf-VVdtnLud947ApLVO40Now,2594
+datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamarket/params/nominatim.py,sha256=yWOBZ0CQ7YfsBbhpbOZZEgFQTTc6vATPOPDQ7EmWGBk,14648
+datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
+datamarket/utils/airflow.py,sha256=Tc8vFB85NGJn0vgEkvT_yGMbn_NmW0OAJa9fy1qKocQ,804
+datamarket/utils/alchemy.py,sha256=B-6cdMiEStzD4JKhi7Xpk7pVs7eUcdT_fHqpfm2ToNc,637
+datamarket/utils/logs.py,sha256=EX7rI7c-QmQbTTDutyxeCbprUilzV58pypZUwA_vPK0,2543
+datamarket/utils/main.py,sha256=MDCR-EWKgWMXo2XmLR_K7YEp26vSTcuwuijzNcMt5EQ,7271
+datamarket/utils/nominatim.py,sha256=HUJfR86lw68PzaLfhZOCIT5YlopDvRSbwEY2JCf0WyI,5704
+datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+datamarket/utils/playwright/async_api.py,sha256=Wl2LFHiOTZDg4Jic5nJDPSk1g_AWsY04XKvs3m_ZTrQ,10838
+datamarket/utils/playwright/sync_api.py,sha256=JrVZkphQfCxOtl0oxQZd3W0LALdT7qVV6kwph7FfA94,10729
+datamarket/utils/requests.py,sha256=YJ97ziTe6z8lq3Wu0_ivNbYWWXkyjaaB_pnUShWhFRU,24368
+datamarket/utils/selenium.py,sha256=Fc2BJzTH7_xIqjBP9LbZODF69RSH4fF8LhD5WuGdlZ0,2457
+datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
+datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
+datamarket/utils/strings/normalization.py,sha256=tlZHq8h9AtcANkaJ2AOrR6UD5yKShn1cLldfFfFQgTA,8990
+datamarket/utils/strings/obfuscation.py,sha256=lQL1TiDMpY1veYGKHqYgsHFgFJhuD9GLcJ1uwglOMy8,5374
+datamarket/utils/strings/standardization.py,sha256=j_NbT-O1XnxDvDhct8panfkrfAC8R5OX6XM5fYBZ4RU,1496
+datamarket/utils/typer.py,sha256=geWuwMwGQjBQhxo27hX0vEAeRl1j1TS0u2oFVfpAs5I,816
+datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
+datamarket-0.10.3.dist-info/METADATA,sha256=dhTFRAUsUG2EhIBbujgSk81cWbmegxEAZRSplayQd4M,7695
+datamarket-0.10.3.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+datamarket-0.10.3.dist-info/licenses/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+datamarket-0.10.3.dist-info/RECORD,,