datamarket 0.7.41__py3-none-any.whl → 0.7.125__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
datamarket/utils/strings/normalization.py ADDED
@@ -0,0 +1,217 @@
+ ########################################################################################################################
+ # IMPORTS
+ import unicodedata
+ from enum import Enum, auto
+ from typing import Any, Optional, Set, Union
+
+ import numpy as np
+ from inflection import camelize, parameterize, titleize, underscore
+ from string_utils import prettify, strip_html
+ from unidecode import unidecode
+
+ ########################################################################################################################
+ # CLASSES
+
+
+ class NormalizationMode(Enum):
+     NONE = auto()
+     BASIC = auto()  # removes accents and converts punctuation to spaces
+     SYMBOLS = auto()  # translates only symbols to Unicode name
+     FULL = auto()  # BASIC + SYMBOLS
+
+
+ class NamingConvention(Enum):
+     NONE = auto()  # no style change
+     LOWER = auto()  # lowercase
+     UPPER = auto()  # UPPERCASE
+     CONSTANT = auto()  # CONSTANT_CASE (uppercase, underscores)
+     SNAKE = auto()  # snake_case (lowercase, underscores)
+     CAMEL = auto()  # camelCase (capitalize words except first one, no spaces)
+     PASCAL = auto()  # PascalCase (capitalize words including first one, no spaces)
+     PARAM = auto()  # parameterize (hyphens)
+     TITLE = auto()  # titleize (capitalize words)
+
+
+ ########################################################################################################################
+ # FUNCTIONS
+
+
+ def get_unidecoded_text(input_text: str, allowed_chars: Set[str], apply_lowercase: bool = False) -> str:
+     """
+     Processes a string by unidecoding characters, optionally lowercasing them,
+     while preserving a specified set of allowed characters.
+
+     Args:
+         input_text: The string to process.
+         allowed_chars: A set of characters to preserve in their original form.
+         apply_lowercase: Whether to convert unidecoded characters to lowercase. Defaults to False.
+
+     Returns:
+         The processed string.
+     """
+     chars_list: list[str] = []
+     for char_original in input_text:
+         if char_original in allowed_chars:
+             chars_list.append(char_original)
+         else:
+             decoded_segment = unidecode(char_original)
+             for dc in decoded_segment:  # unidecode can return multiple chars
+                 if apply_lowercase:
+                     chars_list.append(dc.lower())
+                 else:
+                     chars_list.append(dc)
+     return "".join(chars_list)
+
+
+ def transliterate_symbols(s: str, allowed_symbols_set: Optional[Set[str]] = None) -> str:
+     """
+     Translates Unicode symbols (category S*) in the input string to their lowercase Unicode names,
+     with spaces replaced by underscores. Other characters, or characters in allowed_symbols_set, remain unchanged.
+
+     Args:
+         s: The input string.
+         allowed_symbols_set: A set of characters to preserve without transliteration.
+
+     Returns:
+         The string with symbols transliterated or preserved.
+     """
+     if allowed_symbols_set is None:
+         allowed_symbols_set = set()
+     out: list[str] = []
+     for c in s:
+         if c in allowed_symbols_set:
+             out.append(c)
+         elif unicodedata.category(c).startswith("S"):
+             name = unicodedata.name(c, "")
+             if name:
+                 out.append(name.lower().replace(" ", "_"))
+         else:
+             out.append(c)
+     return "".join(out)
+
+
+ def normalize(
+     s: Any,
+     mode: Union[NormalizationMode, str] = NormalizationMode.BASIC,
+     naming: Union[NamingConvention, str] = NamingConvention.LOWER,
+     allowed_symbols: Optional[str] = None,
+ ) -> str:
+     """
+     Normalizes and applies a naming convention to the input.
+
+     Handles None and NaN values by returning an empty string. Converts non-string inputs to strings.
+
+     Normalization (controlled by `mode`) occurs first, followed by naming convention application.
+     - NONE: Returns the input as a string without any normalization. Case is preserved.
+     - BASIC: Removes accents (via unidecode). Punctuation and spaces typically become single spaces between tokens.
+       Case is preserved from the unidecode step by default.
+     - SYMBOLS: Translates only Unicode symbols (category S*) to their lowercase Unicode names with underscores.
+       Other characters are preserved, including their case.
+     - FULL: Applies unidecode (case-preserved by default) and then SYMBOLS-like transliteration for S* category
+       characters not otherwise handled.
+
+     The `allowed_symbols` parameter can be used to specify characters that should be preserved in their original form
+     throughout the normalization process. These characters will not be unidecoded or transliterated by the symbol logic.
+
+     After normalization, a naming convention (controlled by `naming`) is applied:
+     - NONE: Returns the normalized text, preserving its case from the normalization step.
+     - LOWER: Converts the normalized text to lowercase. (Default)
+     - UPPER: Converts the normalized text to UPPERCASE.
+     - CONSTANT: Converts to CONSTANT_CASE (uppercase with underscores).
+     - SNAKE: Converts to snake_case (lowercase with underscores).
+     - CAMEL: Converts to camelCase (lowercase first word, capitalize subsequent words, no spaces).
+     - PASCAL: Converts to PascalCase (capitalize all words, no spaces).
+     - PARAM: Converts to parameterize (lowercase with hyphens).
+     - TITLE: Converts to Title Case (capitalize each word).
+
+     Args:
+         s: The input value to normalize and format. Can be any type.
+         mode: The normalization mode to apply. Defaults to NormalizationMode.BASIC.
+         naming: The naming convention to apply. Defaults to NamingConvention.LOWER.
+         allowed_symbols: A string of characters to preserve during normalization.
+
+     Returns:
+         The normalized and formatted string.
+     """
+     # Parameter mapping
+     if isinstance(mode, str):
+         mode = NormalizationMode[mode.upper()]
+     if not isinstance(mode, NormalizationMode):
+         raise TypeError("mode must be NormalizationMode or str")
+
+     if isinstance(naming, str):
+         naming = NamingConvention[naming.upper()]
+     if not isinstance(naming, NamingConvention):
+         raise TypeError("naming must be NamingConvention or str")
+
+     _allowed_symbols_set: Set[str] = set(allowed_symbols) if allowed_symbols else set()
+
+     # Handling null values
+     if s is None or (isinstance(s, float) and np.isnan(s)):
+         normalized = ""
+     elif not isinstance(s, str):
+         return str(s)
+     else:
+         raw_text = str(s)
+         if naming is NamingConvention.NONE:
+             text = raw_text
+         else:
+             text = prettify(strip_html(raw_text, True))
+
+         if mode is NormalizationMode.NONE:
+             normalized = text
+         elif mode is NormalizationMode.SYMBOLS:
+             normalized = transliterate_symbols(text, _allowed_symbols_set)
+         else:
+             # BASIC and FULL modes
+             intermediate_text = get_unidecoded_text(text, _allowed_symbols_set)
+
+             # Now, tokenize the intermediate_text for BASIC and FULL
+             tokens: list[str] = []
+             current_token_chars: list[str] = []
+
+             def flush_current_token():
+                 nonlocal current_token_chars
+                 if current_token_chars:
+                     tokens.append("".join(current_token_chars))
+                     current_token_chars.clear()
+
+             for c in intermediate_text:
+                 cat = unicodedata.category(c)
+                 if c in _allowed_symbols_set or c.isalnum():  # Allowed symbols are part of tokens
+                     current_token_chars.append(c)
+                 elif mode is NormalizationMode.FULL and cat.startswith("S"):
+                     # Transliterate S* category symbols not in allowed_symbols
+                     flush_current_token()
+                     name = unicodedata.name(c, "")
+                     if name:
+                         tokens.append(name.lower().replace(" ", "_"))
+                 elif cat.startswith("P") or c.isspace():
+                     # Punctuation (not allowed) or space acts as a separator
+                     flush_current_token()
+                 # Other characters are ignored
+
+             flush_current_token()
+             normalized = " ".join(tokens)
+
+     # Apply naming convention
+     if naming is NamingConvention.NONE:
+         return normalized
+     if naming is NamingConvention.LOWER:
+         return normalized.lower()
+     if naming is NamingConvention.UPPER:
+         return normalized.upper()
+     if naming is NamingConvention.PARAM:
+         return parameterize(normalized)
+     if naming is NamingConvention.TITLE:
+         return titleize(normalized)
+
+     underscored = underscore(parameterize(normalized))
+     if naming is NamingConvention.CONSTANT:
+         return underscored.upper()
+     if naming is NamingConvention.CAMEL:
+         return camelize(underscored, False)
+     if naming is NamingConvention.PASCAL:
+         return camelize(underscored)
+
+     return underscored
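
The two helpers above drive the BASIC/SYMBOLS behaviour of `normalize`. A minimal sketch of what they do, assuming the module path shown in the RECORD below and the standard `unidecode`/`unicodedata` tables:

```python
# Sketch only: expected helper behaviour (not part of the package).
from datamarket.utils.strings.normalization import (
    get_unidecoded_text,
    transliterate_symbols,
)

# Accents are transliterated; allowed characters keep their original form.
assert get_unidecoded_text("Málaga", allowed_chars=set()) == "Malaga"
assert get_unidecoded_text("Málaga", allowed_chars={"á"}) == "Málaga"
# unidecode may expand one character into several ("ß" -> "ss").
assert get_unidecoded_text("Straße", set(), apply_lowercase=True) == "strasse"

# Unicode S* symbols become their lowercase Unicode names ("€" is category Sc).
assert transliterate_symbols("5€") == "5euro_sign"
assert transliterate_symbols("5€", {"€"}) == "5€"  # preserved when allowed
```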
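And a sketch of how `mode` and `naming` combine in `normalize`; outputs are indicative, assuming `string_utils.prettify` leaves these inputs unchanged apart from capitalisation:

```python
# Sketch only: indicative outputs for normalize().
from datamarket.utils.strings.normalization import normalize

normalize("Año de construcción")                   # "ano de construccion"
normalize("Año de construcción", naming="snake")   # "ano_de_construccion"
normalize("Año de construcción", naming="pascal")  # "AnoDeConstruccion"
normalize("Año de construcción", naming="param")   # "ano-de-construccion"

# SYMBOLS mode rewrites only S* characters; naming NONE preserves case.
normalize("5€", mode="symbols", naming="none")     # "5euro_sign"

# None/NaN collapse to ""; other non-strings are returned via str() as-is.
normalize(None)   # ""
normalize(3.14)   # "3.14"
```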
datamarket/utils/strings/obfuscation.py ADDED
@@ -0,0 +1,153 @@
+ ########################################################################################################################
+ # IMPORTS
+
+ import logging
+ import warnings
+ from typing import List, Optional
+
+
+ class PiiDependenciesMissingError(ImportError):
+     pass
+
+
+ class SpacyModelNotFoundError(ImportError):
+     pass
+
+
+ try:
+     import phonenumbers
+     import spacy
+     from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+     from presidio_analyzer.nlp_engine import NlpEngineProvider
+     from presidio_analyzer.predefined_recognizers import PhoneRecognizer
+     from presidio_anonymizer import AnonymizerEngine
+     from spacy.language import Language
+     from spacy_langdetect import LanguageDetector
+ except ImportError as e:
+     raise PiiDependenciesMissingError(
+         "One or more PII anonymization dependencies are missing. "
+         "Please install them by running: pip install datamarket[pii]\n"
+         f"Original error: {e}"
+     ) from e
+
+
+ ########################################################################################################################
+ # SETTINGS
+
+ logger = logging.getLogger()
+ logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)
+
+ warnings.filterwarnings(
+     "ignore",
+     message=r"\[W108\]",
+     category=UserWarning,
+     module="spacy.pipeline.lemmatizer",
+ )
+
+
+ @Language.factory("language_detector")
+ def get_lang_detector(nlp, name):
+     return LanguageDetector()
+
+
+ ########################################################################################################################
+ # CLASSES
+
+
+ class PiiAnonymizer:
+     SUPPORTED_LANG = ["es", "en"]
+
+     def __init__(self):
+         # Check for required spaCy models
+         required_models = {
+             "es_core_news_md": "python -m spacy download es_core_news_md",
+             "en_core_web_md": "python -m spacy download en_core_web_md",
+         }
+         missing_models_instructions = []
+         for model_name, install_command in required_models.items():
+             if not spacy.util.is_package(model_name):
+                 missing_models_instructions.append(
+                     f"Model '{model_name}' not found. Please install it by running: {install_command}"
+                 )
+
+         if missing_models_instructions:
+             raise SpacyModelNotFoundError("\n".join(missing_models_instructions))
+
+         self.anonymizer = AnonymizerEngine()
+         self.analyzer = self._load_analyzer_engine()
+
+         self.nlp = self._nlp()
+
+     def _nlp(self) -> Language:
+         analyzer_en_model = self.analyzer.nlp_engine.nlp.get("en")
+         shared_vocab = analyzer_en_model.vocab
+         nlp = spacy.blank("en", vocab=shared_vocab)
+
+         if nlp.has_factory("sentencizer"):
+             nlp.add_pipe("sentencizer")
+
+         if nlp.has_factory("language_detector"):
+             nlp.add_pipe("language_detector", last=True)
+
+         return nlp
+
+     @staticmethod
+     def _nlp_config():
+         return {
+             "nlp_engine_name": "spacy",
+             "models": [
+                 {"lang_code": "es", "model_name": "es_core_news_md"},
+                 {"lang_code": "en", "model_name": "en_core_web_md"},
+             ],
+         }
+
+     def _load_analyzer_engine(self) -> AnalyzerEngine:
+         provider = NlpEngineProvider(nlp_configuration=PiiAnonymizer._nlp_config())
+         nlp_engine = provider.create_engine()
+         phone_recognizer_es = PhoneRecognizer(
+             supported_language="es",
+             supported_regions=phonenumbers.SUPPORTED_REGIONS,
+             context=["teléfono", "móvil", "número"],
+         )
+         registry = RecognizerRegistry(supported_languages=self.SUPPORTED_LANG)
+         registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=self.SUPPORTED_LANG)
+         registry.add_recognizer(phone_recognizer_es)
+
+         analyzer = AnalyzerEngine(
+             registry=registry,
+             nlp_engine=nlp_engine,
+             supported_languages=self.SUPPORTED_LANG,
+         )
+         return analyzer
+
+     def detect_lang(self, text: str) -> str:
+         if hasattr(self, "nlp") and self.nlp:
+             with self.nlp.select_pipes(enable=["tokenizer", "sentencizer", "language_detector"]):
+                 doc = self.nlp(text)
+                 return doc._.language["language"]
+         else:
+             logger.error("Language detection NLP model not initialized. Cannot detect language.")
+             return "unknown"
+
+     def anonymize_text(
+         self,
+         text: str,
+         entities: Optional[List[str]] = None,
+         lang: str = "unknown",
+     ) -> str:
+         if lang == "unknown":
+             lang = self.detect_lang(text)
+             if lang not in self.SUPPORTED_LANG:
+                 logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
+                 return ""
+         elif lang not in self.SUPPORTED_LANG:
+             logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
+             return ""
+
+         analyzer_result = self.analyzer.analyze(
+             text=text,
+             entities=entities,
+             language=lang,
+         )
+         anonymizer_result = self.anonymizer.anonymize(text=text, analyzer_results=analyzer_result)
+         return anonymizer_result.text
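
A sketch of the intended workflow for the class above. It assumes the `pii` extra is installed along with the `es_core_news_md`/`en_core_web_md` spaCy models, and that Presidio's default replacement operators (placeholders like `<PERSON>`) are in effect:

```python
# Sketch only: requires `pip install datamarket[pii]` plus both spaCy models.
anonymizer = PiiAnonymizer()

text = "Me llamo Ana y mi teléfono es 612345678."
anonymizer.detect_lang(text)  # "es" (via the spacy_langdetect pipe)

# Language is detected automatically when lang="unknown" (the default);
# unsupported languages fail safe to "".
anonymizer.anonymize_text(text)
# e.g. "Me llamo <PERSON> y mi teléfono es <PHONE_NUMBER>."

# Restrict analysis to specific entity types and skip detection:
anonymizer.anonymize_text(text, entities=["PHONE_NUMBER"], lang="es")
```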
datamarket/utils/strings/standardization.py ADDED
@@ -0,0 +1,40 @@
+ ########################################################################################################################
+ # IMPORTS
+
+ import re
+ from typing import Literal
+
+ from ...params.nominatim import COUNTRY_PARSING_RULES
+
+ ########################################################################################################################
+ # FUNCTIONS
+
+
+ def parse_phone_number(number: str, country_code: Literal["es", "pt"]) -> str | None:
+     """Clean and standardize phone number from a certain country_code
+
+     Args:
+         number (str): phone number
+         country_code (Literal["es", "pt"]): country code of the phone number to parse
+
+     Raises:
+         ValueError: when parsing is not supported for a certain country
+
+     Returns:
+         str | None: standardized phone number
+     """
+     clean_number = re.sub(r"\D", "", number)
+     if country_code in {"es", "pt"}:
+         # Get the validation regex from params
+         pattern = COUNTRY_PARSING_RULES[country_code]["phone_validate_pattern"]
+
+         # Validate and extract in one step
+         if len(clean_number) >= 9:  # Check if the cleaned number has at least 9 digits
+             match = pattern.match(clean_number)
+
+             # Return the captured group (the 9-digit number)
+             return match.group(0)[-9:] if match else None
+         else:
+             return None  # Or handle the case where the number is too short
+     else:
+         raise ValueError(f"Country code ({country_code}) is not currently supported")
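
A sketch of the expected behaviour, assuming the compiled `phone_validate_pattern` in `COUNTRY_PARSING_RULES` matches an optional country prefix followed by the nine significant digits:

```python
# Sketch only: indicative outputs, dependent on COUNTRY_PARSING_RULES.
parse_phone_number("+34 612 34 56 78", "es")  # "612345678"
parse_phone_number("612-345-678", "es")       # "612345678"
parse_phone_number("1234", "es")              # None (fewer than 9 digits)
parse_phone_number("912345678", "fr")         # raises ValueError
```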
datamarket/utils/typer.py CHANGED
@@ -9,6 +9,7 @@ from typing_extensions import Annotated
  ########################################################################################################################
  # TYPES

+
  class Dict(dict):
      def __init__(self, value: str):
          super().__init__(json.loads(value))
@@ -25,4 +26,4 @@ def parse_json_dict(value: str) -> Dict:


  DictArg = Annotated[Dict, typer.Argument(parser=parse_json_dict)]
- DictOpt = Annotated[Dict, typer.Option(parser=parse_json_dict)]
+ DictOpt = Annotated[Dict, typer.Option(parser=parse_json_dict)]
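
For context, `DictArg`/`DictOpt` let a Typer command accept inline JSON. A hypothetical sketch (the CLI below is illustrative, not part of the package):

```python
# Sketch only: hypothetical CLI using datamarket.utils.typer.DictOpt.
import typer

from datamarket.utils.typer import DictOpt

app = typer.Typer()


@app.command()
def run(params: DictOpt = "{}"):
    # `--params '{"retries": 3}'` is parsed by parse_json_dict into Dict.
    typer.echo(params.get("retries"))


if __name__ == "__main__":
    app()
```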
datamarket/utils/types.py ADDED
@@ -0,0 +1 @@
+ Json = str | int | float | bool | None | dict[str, "Json"] | list["Json"]
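
The new `Json` alias types arbitrary JSON-decodable values recursively (the string literals are forward references). A usage sketch:

```python
# Sketch only: annotating a JSON payload with the recursive alias.
from datamarket.utils.types import Json

payload: Json = {"ok": True, "items": [1, "two", None], "meta": {"page": 1}}
```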
datamarket-0.7.41.dist-info/METADATA → datamarket-0.7.125.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: datamarket
- Version: 0.7.41
+ Version: 0.7.125
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
  License: GPL-3.0-or-later
  Author: DataMarket
@@ -15,19 +15,18 @@ Classifier: Programming Language :: Python :: 3.13
  Provides-Extra: aws
  Provides-Extra: azure-storage-blob
  Provides-Extra: boto3
+ Provides-Extra: camoufox
  Provides-Extra: chompjs
  Provides-Extra: click
  Provides-Extra: clickhouse-driver
- Provides-Extra: croniter
  Provides-Extra: datetime
+ Provides-Extra: ddgs
  Provides-Extra: demjson3
  Provides-Extra: dnspython
  Provides-Extra: drive
- Provides-Extra: duckduckgo-search
  Provides-Extra: fake-useragent
  Provides-Extra: geoalchemy2
  Provides-Extra: geopandas
- Provides-Extra: geopy
  Provides-Extra: google-api-python-client
  Provides-Extra: google-auth-httplib2
  Provides-Extra: google-auth-oauthlib
@@ -35,83 +34,104 @@ Provides-Extra: html2text
  Provides-Extra: httpx
  Provides-Extra: json5
  Provides-Extra: lxml
+ Provides-Extra: matplotlib
  Provides-Extra: nodriver
  Provides-Extra: openpyxl
+ Provides-Extra: pandarallel
  Provides-Extra: pandas
  Provides-Extra: pandera
  Provides-Extra: peerdb
- Provides-Extra: pendulum
+ Provides-Extra: pii
  Provides-Extra: pillow
  Provides-Extra: playwright
  Provides-Extra: playwright-stealth
- Provides-Extra: proxy
+ Provides-Extra: plotly
  Provides-Extra: pyarrow
  Provides-Extra: pydrive2
  Provides-Extra: pymupdf
+ Provides-Extra: pyproj
+ Provides-Extra: pyrate-limiter
  Provides-Extra: pysocks
  Provides-Extra: pyspark
  Provides-Extra: pytest
- Provides-Extra: rapidfuzz
  Provides-Extra: retry
  Provides-Extra: shapely
  Provides-Extra: soda-core-mysql
  Provides-Extra: soda-core-postgres
- Provides-Extra: stem
+ Provides-Extra: sqlparse
  Provides-Extra: tqdm
  Provides-Extra: undetected-chromedriver
- Provides-Extra: unidecode
  Provides-Extra: xmltodict
  Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0)
  Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
+ Requires-Dist: babel (>=2.0.0,<3.0.0)
  Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
  Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
+ Requires-Dist: browserforge (>=1.2.0,<2.0.0) ; extra == "camoufox"
+ Requires-Dist: camoufox[geoip] (>=0.4.11,<0.5.0) ; extra == "camoufox"
  Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
  Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
  Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
- Requires-Dist: croniter (>=3.0.0,<4.0.0) ; extra == "croniter"
+ Requires-Dist: croniter (>=3.0.0,<4.0.0)
  Requires-Dist: datetime (>=5.0,<6.0) ; extra == "datetime"
+ Requires-Dist: ddgs (>=9.0.0,<10.0.0) ; extra == "ddgs"
  Requires-Dist: demjson3 (>=3.0.0,<4.0.0) ; extra == "demjson3"
  Requires-Dist: dnspython (>=2.0.0,<3.0.0) ; extra == "dnspython"
- Requires-Dist: duckduckgo-search (>=7.0.0,<8.0.0) ; extra == "duckduckgo-search"
  Requires-Dist: dynaconf (>=3.0.0,<4.0.0)
  Requires-Dist: fake-useragent (>=2.0.0,<3.0.0) ; extra == "fake-useragent"
  Requires-Dist: geoalchemy2 (>=0.17.0,<0.18.0) ; extra == "geoalchemy2"
  Requires-Dist: geopandas (>=1.0.0,<2.0.0) ; extra == "geopandas"
- Requires-Dist: geopy (>=2.0.0,<3.0.0) ; extra == "geopy"
+ Requires-Dist: geopy (>=2.0.0,<3.0.0)
  Requires-Dist: google-api-python-client (>=2.0.0,<3.0.0) ; extra == "google-api-python-client"
  Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-httplib2"
  Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
  Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
  Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
+ Requires-Dist: inflection (>=0.5.0,<0.6.0)
+ Requires-Dist: jellyfish (>=1.0.0,<2.0.0)
  Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
  Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
  Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
+ Requires-Dist: matplotlib (>=3.0.0,<4.0.0) ; extra == "matplotlib"
  Requires-Dist: nodriver (>=0.44,<0.45) ; extra == "nodriver"
+ Requires-Dist: numpy (>=2.0.0,<3.0.0)
  Requires-Dist: openpyxl (>=3.0.0,<4.0.0) ; extra == "openpyxl"
+ Requires-Dist: pandarallel (>=1.0.0,<2.0.0) ; extra == "pandarallel"
  Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
  Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
- Requires-Dist: pendulum (>=3.0.0,<4.0.0) ; extra == "pendulum"
+ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
  Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
- Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
+ Requires-Dist: playwright (==1.47.0) ; extra == "playwright" or extra == "camoufox"
+ Requires-Dist: plotly (>=6.0.0,<7.0.0) ; extra == "plotly"
  Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
+ Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
+ Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
  Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
  Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
+ Requires-Dist: pycountry (>=24.0.0,<25.0.0)
  Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
  Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
+ Requires-Dist: pyproj (>=3.0.0,<4.0.0) ; extra == "pyproj"
+ Requires-Dist: pyrate-limiter (>=3.0.0,<4.0.0) ; extra == "pyrate-limiter"
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
  Requires-Dist: pyspark (>=3.0.0,<4.0.0) ; extra == "pyspark"
  Requires-Dist: pytest (>=8.0.0,<9.0.0) ; extra == "pytest"
- Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
+ Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
+ Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0)
  Requires-Dist: requests (>=2.0.0,<3.0.0)
  Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
+ Requires-Dist: rnet (>=3.0.0rc10,<4.0.0)
  Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
- Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
+ Requires-Dist: soda-core-mysql-utf8-hotfix (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
  Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
- Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
+ Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
+ Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
+ Requires-Dist: sqlparse (>=0.5.0,<0.6.0) ; extra == "sqlparse"
+ Requires-Dist: stem (>=1.0.0,<2.0.0)
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
  Requires-Dist: typer (>=0.15.0,<0.16.0)
- Requires-Dist: unidecode (>=1.0.0,<2.0.0) ; extra == "unidecode"
+ Requires-Dist: unidecode (>=1.0.0,<2.0.0)
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
  Project-URL: Homepage, https://datamarket.es
datamarket-0.7.125.dist-info/RECORD ADDED
@@ -0,0 +1,36 @@
+ datamarket/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamarket/exceptions/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
+ datamarket/exceptions/main.py,sha256=S5EksLt_pmmX5OY-_keB12K3r5R-lTBnqdJ9VBPy8D8,3674
+ datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamarket/interfaces/alchemy.py,sha256=2EZp7fn7-o8yL918dnqIYZ-gO7FUXGH8d8TzQFa7XRI,15769
+ datamarket/interfaces/aws.py,sha256=4HEN_VfQuEEvDnksRYlcMBUdKbgJXWBkLnymKpyRtrs,4781
+ datamarket/interfaces/azure.py,sha256=0pqd6LmQzRGjOUu85YKlPeQnlwsq0q5laNUw_iI3XPw,5180
+ datamarket/interfaces/drive.py,sha256=3nhx3THr2SHNWKYwme9F2nPpvsqyEMFIxz0whF2FjHk,4840
+ datamarket/interfaces/ftp.py,sha256=LH3Oz19k_xUNhzDXcrq5Ofb4c3uiph5pWUqpgiaDvHI,2671
+ datamarket/interfaces/nominatim.py,sha256=57hlW0w6XHBWEmyLyMn6eq1o_T5caYcLNBSNI1qLWCQ,16145
+ datamarket/interfaces/peerdb.py,sha256=sO451wEGNb_0DDwchZ6eBVYKltqHM5XKau-WsfspXzA,23640
+ datamarket/interfaces/proxy.py,sha256=fke9THv2h1jpr5cxJ25w0bYchuboErokQlSiq50FWVE,14632
+ datamarket/interfaces/tinybird.py,sha256=cNG-kAPTdQn2inlNX9LPf-VVdtnLud947ApLVO40Now,2594
+ datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamarket/params/nominatim.py,sha256=yWOBZ0CQ7YfsBbhpbOZZEgFQTTc6vATPOPDQ7EmWGBk,14648
+ datamarket/utils/__init__.py,sha256=FHLh-Qp9XpM4LkAocppCf_llW2CWVVghGorkqxqt1wk,34
+ datamarket/utils/airflow.py,sha256=Tc8vFB85NGJn0vgEkvT_yGMbn_NmW0OAJa9fy1qKocQ,804
+ datamarket/utils/alchemy.py,sha256=B-6cdMiEStzD4JKhi7Xpk7pVs7eUcdT_fHqpfm2ToNc,637
+ datamarket/utils/main.py,sha256=MDCR-EWKgWMXo2XmLR_K7YEp26vSTcuwuijzNcMt5EQ,7271
+ datamarket/utils/nominatim.py,sha256=HUJfR86lw68PzaLfhZOCIT5YlopDvRSbwEY2JCf0WyI,5704
+ datamarket/utils/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ datamarket/utils/playwright/async_api.py,sha256=Wl2LFHiOTZDg4Jic5nJDPSk1g_AWsY04XKvs3m_ZTrQ,10838
+ datamarket/utils/playwright/sync_api.py,sha256=JrVZkphQfCxOtl0oxQZd3W0LALdT7qVV6kwph7FfA94,10729
+ datamarket/utils/requests.py,sha256=-FErEhB5f4oQEVeSrQjquU7ulDwxiwx93lTnnKy3Ft0,24274
+ datamarket/utils/selenium.py,sha256=Fc2BJzTH7_xIqjBP9LbZODF69RSH4fF8LhD5WuGdlZ0,2457
+ datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
+ datamarket/utils/strings/__init__.py,sha256=b6TYOT9v7y9ID-lDyZk4E8BH2uIPbsF2ZSLGjCQ1MCQ,43
+ datamarket/utils/strings/normalization.py,sha256=tlZHq8h9AtcANkaJ2AOrR6UD5yKShn1cLldfFfFQgTA,8990
+ datamarket/utils/strings/obfuscation.py,sha256=Jo-x3f2Cb75983smmpcdPqUlBrLCTyrnmH3FPlgUUjM,5246
+ datamarket/utils/strings/standardization.py,sha256=j_NbT-O1XnxDvDhct8panfkrfAC8R5OX6XM5fYBZ4RU,1496
+ datamarket/utils/typer.py,sha256=geWuwMwGQjBQhxo27hX0vEAeRl1j1TS0u2oFVfpAs5I,816
+ datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
+ datamarket-0.7.125.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
+ datamarket-0.7.125.dist-info/METADATA,sha256=Ln7uyk5CVKifcSeXSFENsSfi8gMR03h8jpiRRK0aGA8,7397
+ datamarket-0.7.125.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ datamarket-0.7.125.dist-info/RECORD,,
datamarket-0.7.41.dist-info/WHEEL → datamarket-0.7.125.dist-info/WHEEL RENAMED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.1.2
+ Generator: poetry-core 2.1.3
  Root-Is-Purelib: true
  Tag: py3-none-any
datamarket-0.7.41.dist-info/RECORD REMOVED
@@ -1,23 +0,0 @@
- datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
- datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamarket/interfaces/alchemy.py,sha256=z7VarlKZ-JfsXWtuDXCYnNp_pSzEYo5IvrD7wqoRpbI,8891
- datamarket/interfaces/aws.py,sha256=Mk9h-UcdbyNPUaYaG9wlKdd0R95xzTwtX1-_PgsBkjo,2084
- datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
- datamarket/interfaces/ftp.py,sha256=9GQgiNBBK7njkv8ytHQaP9YLB9kI5vnUFA5gtz9J7As,1859
- datamarket/interfaces/nominatim.py,sha256=ysIA2J1GhsZ0TJxD6B8N1_a7dkMEqtZQV6mT4Hayecg,3672
- datamarket/interfaces/peerdb.py,sha256=5zoncgNiy24ZZQJl-YORtjE0Bz1GOk4YAW75gTfkFvs,22220
- datamarket/interfaces/proxy.py,sha256=vZ42V4zXBWkW7dTGpDvBMNL45MCHe1ZrYrF8xIY25GU,3367
- datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
- datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datamarket/params/nominatim.py,sha256=pBYRfoBkkLBg2INbFymefmYSzaAVujQSpEro5c1hD_I,1143
- datamarket/utils/__init__.py,sha256=8D5a8oKgqd6WA1RUkiKCn4l_PVemtyuckxQut0vDHXM,20
- datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
- datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
- datamarket/utils/main.py,sha256=TyX8ZrcIitvBeyfEizLtYIIc0vSGd19CN5uWUr_ahAo,2926
- datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
- datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
- datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
- datamarket-0.7.41.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
- datamarket-0.7.41.dist-info/METADATA,sha256=jg3OpyqE0FKTR2rmDAtjAVifA_D9aw8mFnrtwET3KaA,6412
- datamarket-0.7.41.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
- datamarket-0.7.41.dist-info/RECORD,,