datamarket: datamarket-0.9.28-py3-none-any.whl → datamarket-0.9.30-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. (More details were linked from the original page.)

@@ -3,12 +3,20 @@
3
3
 
4
4
  import gettext
5
5
  import logging
6
- import pycountry
7
- from geopy.distance import geodesic
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
8
7
 
8
+ import pycountry
9
9
  import requests
10
+ from geopy.distance import geodesic
11
+ from jellyfish import jaro_winkler_similarity
10
12
 
11
13
  from ..params.nominatim import POSTCODES
14
+ from ..utils.strings import normalize
15
+
16
+ ########################################################################################################################
17
+ # PARAMETERS
18
+
19
+ JARO_WINKLER_THRESHOLD = 0.85
12
20
 
13
21
  ########################################################################################################################
14
22
  # CLASSES
@@ -19,11 +27,11 @@ spanish.install()
19
27
 
20
28
 
21
29
  class GeoNames:
22
- def __init__(self, endpoint):
30
+ def __init__(self, endpoint: str) -> None:
23
31
  self.endpoint = endpoint
24
32
 
25
33
  @staticmethod
26
- def validate_postcode(postcode):
34
+ def validate_postcode(postcode: Union[int, str]) -> Optional[str]:
27
35
  if isinstance(postcode, int):
28
36
  postcode = str(postcode)
29
37
 
@@ -36,148 +44,245 @@ class GeoNames:
36
44
  return postcode
37
45
 
38
46
  @staticmethod
39
- def get_province_from_postcode(postcode):
47
+ def get_province_from_postcode(postcode: Optional[str]) -> Optional[str]:
40
48
  if postcode:
41
49
  return POSTCODES[postcode[:2]]
42
50
 
43
- def reverse(self, lat, lon):
44
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}").json()
51
+ def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
52
+ return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}", timeout=30).json()
45
53
 
46
54
 
47
55
  class Nominatim:
48
- def __init__(self, nominatim_endpoint, geonames_endpoint):
56
+ def __init__(self, nominatim_endpoint: str, geonames_endpoint: str) -> None:
49
57
  self.endpoint = nominatim_endpoint
50
58
  self.geonames = GeoNames(geonames_endpoint)
51
59
 
52
60
  @staticmethod
53
- def get_attribute(raw_json, keys):
61
+ def _get_attribute(raw_json: Dict[str, Any], keys: List[str]) -> Any:
54
62
  for key in keys:
55
63
  if key in raw_json:
56
64
  return raw_json[key]
57
65
 
58
- def geocode(self, address):
59
- return requests.get(f"{self.endpoint}/search?q={address}&format=json").json()
66
+ def _calculate_distance(self, lat_str: Optional[str], lon_str: Optional[str], input_coords: Tuple[float, float]) -> float:
67
+ dist = float("inf")
68
+ if lat_str and lon_str:
69
+ try:
70
+ coords = (float(lat_str), float(lon_str))
71
+ dist = geodesic(input_coords, coords).km
72
+ except (ValueError, TypeError):
73
+ logger.warning("Invalid coordinates for distance calculation.")
74
+ return dist
75
+
76
+ def _parse_nominatim_result(self, nominatim_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
77
+ raw_address = nominatim_raw_json.get("address", {})
78
+
79
+ postcode_str = str(raw_address.get("postcode", ""))
80
+ postcode = self.geonames.validate_postcode(postcode_str)
81
+
82
+ city = self._get_attribute(raw_address, ["city", "town", "village"])
83
+ district, quarter = self._get_district_quarter(raw_address)
84
+
85
+ return {
86
+ "country": raw_address.get("country"),
87
+ "country_code": (raw_address.get("country_code") or "").lower(),
88
+ "state": raw_address.get("state"),
89
+ "province": raw_address.get("province"),
90
+ "city": city,
91
+ "postcode": postcode,
92
+ "district": district,
93
+ "quarter": quarter,
94
+ "street": raw_address.get("road"),
95
+ "number": raw_address.get("house_number"),
96
+ }
97
+
98
+ def _parse_geonames_result(self, geonames_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
99
+ geonames_country_code_str = geonames_raw_json.get("country_code")
100
+ country_name = None
101
+ if geonames_country_code_str:
102
+ try:
103
+ country_obj = pycountry.countries.get(alpha_2=geonames_country_code_str.upper())
104
+ if country_obj:
105
+ country_name = spanish.gettext(country_obj.name)
106
+ except LookupError:
107
+ logger.warning(f"Country name not found for code: {geonames_country_code_str} using pycountry.")
108
+
109
+ postcode_str = str(geonames_raw_json.get("postal_code", ""))
110
+ postcode = self.geonames.validate_postcode(postcode_str)
111
+ province = self.geonames.get_province_from_postcode(postcode) if postcode else None
112
+ city = geonames_raw_json.get("place_name")
113
+
114
+ return {
115
+ "country": country_name,
116
+ "country_code": (geonames_country_code_str or "").lower(),
117
+ "state": geonames_raw_json.get("community"),
118
+ "province": province,
119
+ "city": city,
120
+ "postcode": postcode,
121
+ "district": None,
122
+ "quarter": None,
123
+ "street": None,
124
+ "number": None,
125
+ }
126
+
127
+ def _get_empty_address_result(self) -> Dict[str, None]:
128
+ return {
129
+ "country": None,
130
+ "country_code": None,
131
+ "state": None,
132
+ "province": None,
133
+ "city": None,
134
+ "postcode": None,
135
+ "district": None,
136
+ "quarter": None,
137
+ "street": None,
138
+ "number": None,
139
+ }
140
+
141
+ def _select_postcode_and_derived_province(
142
+ self, parsed_nominatim_result: Dict[str, Optional[str]], parsed_geonames_result: Dict[str, Optional[str]], nominatim_address_province_raw: Optional[str]
143
+ ) -> Tuple[Optional[str], Optional[str]]:
144
+ """
145
+ Determines the postcode and its derived province based on comparisons
146
+ between Nominatim and GeoNames data, and Nominatim's raw address province.
147
+ """
148
+ nominatim_postcode = parsed_nominatim_result.get("postcode")
149
+ geonames_postcode = parsed_geonames_result.get("postcode")
150
+
151
+ province_from_nominatim_postcode = self.geonames.get_province_from_postcode(nominatim_postcode)
152
+ province_from_geonames_postcode = self.geonames.get_province_from_postcode(geonames_postcode)
153
+
154
+ norm_raw_nominatim_province = (
155
+ normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
156
+ )
157
+ norm_province_from_nominatim_postcode = (
158
+ normalize(province_from_nominatim_postcode) if province_from_nominatim_postcode else ""
159
+ )
160
+ norm_province_from_geonames_postcode = (
161
+ normalize(province_from_geonames_postcode) if province_from_geonames_postcode else ""
162
+ )
163
+
164
+ selected_postcode = None
165
+ selected_province_from_postcode = None
166
+
167
+ # If provinces derived from Nominatim and GeoNames postcodes differ
168
+ nominatim_postcode_province_matches = False
169
+ if norm_province_from_nominatim_postcode and norm_raw_nominatim_province:
170
+ nominatim_postcode_province_matches = (
171
+ jaro_winkler_similarity(norm_province_from_nominatim_postcode, norm_raw_nominatim_province)
172
+ > JARO_WINKLER_THRESHOLD
173
+ )
174
+
175
+ geonames_postcode_province_matches = False
176
+ if norm_province_from_geonames_postcode and norm_raw_nominatim_province:
177
+ geonames_postcode_province_matches = (
178
+ jaro_winkler_similarity(norm_province_from_geonames_postcode, norm_raw_nominatim_province)
179
+ > JARO_WINKLER_THRESHOLD
180
+ )
181
+
182
+ # Prefer GeoNames postcode if its province matches Nominatim's raw address province,
183
+ # and Nominatim's own postcode-derived province does not.
184
+ if nominatim_postcode_province_matches:
185
+ selected_postcode = nominatim_postcode
186
+ selected_province_from_postcode = province_from_nominatim_postcode
187
+ if geonames_postcode_province_matches and not nominatim_postcode_province_matches:
188
+ selected_postcode = geonames_postcode
189
+ selected_province_from_postcode = province_from_geonames_postcode
190
+
191
+ return selected_postcode, selected_province_from_postcode
192
+
193
+ def _select_final_result(
194
+ self,
195
+ parsed_nominatim_result: Dict[str, Optional[str]],
196
+ parsed_geonames_result: Dict[str, Optional[str]],
197
+ dist_nominatim: float,
198
+ dist_geonames: float,
199
+ authoritative_postcode: Optional[str],
200
+ authoritative_province_from_postcode: Optional[str],
201
+ nominatim_address_province_raw: Optional[str],
202
+ ) -> Dict[str, Optional[str]]:
203
+ """
204
+ Selects the final address result based on distances and applies the authoritative postcode/province.
205
+ """
206
+ if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
207
+ final_result = parsed_nominatim_result
208
+ final_result["postcode"] = authoritative_postcode
209
+ final_result["province"] = nominatim_address_province_raw
210
+ elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
211
+ final_result = parsed_geonames_result
212
+ final_result["postcode"] = authoritative_postcode
213
+ final_result["province"] = authoritative_province_from_postcode
214
+ else:
215
+ final_result = self._get_empty_address_result()
216
+ return final_result
217
+
218
+ def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
219
+ district = self._get_attribute(raw_json, ["city_district", "suburb", "borough"])
220
+ quarter = self._get_attribute(raw_json, ["quarter", "neighbourhood"])
60
221
 
61
- def geocode_parsed(self, address):
222
+ if not district and quarter:
223
+ district = quarter
224
+ quarter = None
225
+
226
+ return district, quarter
227
+
228
+ def geocode(self, address: str) -> List[Dict[str, Any]]:
229
+ return requests.get(f"{self.endpoint}/search?q={address}&format=json", timeout=30).json()
230
+
231
+ def geocode_parsed(self, address: str) -> Optional[Dict[str, Optional[str]]]:
62
232
  results = self.geocode(address)
63
233
 
64
234
  if results:
65
235
  return self.reverse_parsed(results[0]["lat"], results[0]["lon"])
66
236
 
67
- def reverse(self, lat, lon):
68
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json").json()
237
+ def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
238
+ return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json", timeout=30).json()
239
+
240
+ def reverse_parsed(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Optional[str]]:
241
+ nominatim_response = self.reverse(lat, lon)
242
+ geonames_response = self.geonames.reverse(lat, lon)
69
243
 
70
- def reverse_parsed(self, lat, lon):
71
- nominatim_raw_json = self.reverse(lat, lon)
72
- geonames_raw_json = self.geonames.reverse(lat, lon)
244
+ # Initial parsing
245
+ parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
246
+ parsed_geonames_result = self._parse_geonames_result(geonames_response)
73
247
 
74
- nominatim_res_lat_str = nominatim_raw_json.get("lat")
75
- nominatim_res_lon_str = nominatim_raw_json.get("lon")
76
- geonames_res_lat_str = geonames_raw_json.get("lat")
77
- geonames_res_lon_str = geonames_raw_json.get("lon")
248
+ # Determine authoritative postcode
249
+ raw_nominatim_province = nominatim_response.get("address", {}).get("province")
250
+ selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
251
+ parsed_nominatim_result, parsed_geonames_result, raw_nominatim_province
252
+ )
78
253
 
79
- dist_nominatim = float("inf")
80
- dist_geonames = float("inf")
254
+ # Calculate distances
255
+ nominatim_response_lat = nominatim_response.get("lat")
256
+ nominatim_response_lon = nominatim_response.get("lon")
257
+ geonames_response_lat = geonames_response.get("lat")
258
+ geonames_response_lon = geonames_response.get("lon")
81
259
 
260
+ input_coords = None
82
261
  try:
83
262
  input_coords = (float(lat), float(lon))
84
263
  except (ValueError, TypeError):
85
264
  logger.error(f"Invalid input coordinates for distance calculation: lat={lat}, lon={lon}")
86
- else:
87
- if nominatim_res_lat_str and nominatim_res_lon_str:
88
- try:
89
- nominatim_coords = (float(nominatim_res_lat_str), float(nominatim_res_lon_str))
90
- dist_nominatim = geodesic(input_coords, nominatim_coords).km
91
- except (ValueError, TypeError):
92
- logger.warning("Invalid Nominatim coordinates for distance calculation.")
93
-
94
- if geonames_res_lat_str and geonames_res_lon_str:
95
- try:
96
- geonames_coords = (float(geonames_res_lat_str), float(geonames_res_lon_str))
97
- dist_geonames = geodesic(input_coords, geonames_coords).km
98
- except (ValueError, TypeError):
99
- logger.warning("Invalid GeoNames coordinates for distance calculation.")
100
-
101
- if dist_nominatim <= dist_geonames and nominatim_res_lat_str is not None and nominatim_res_lon_str is not None:
102
- # Use Nominatim data
103
- raw_address = nominatim_raw_json.get("address", {})
104
- postcode_str = str(raw_address.get("postcode", ""))
105
- postcode = self.geonames.validate_postcode(postcode_str)
106
- province = self.geonames.get_province_from_postcode(postcode) if postcode else None
107
- city = self.get_attribute(raw_address, ["city", "town", "village"])
108
- district, quarter = self.get_district_quarter(raw_address)
109
-
110
- return {
111
- "country": raw_address.get("country"),
112
- "country_code": (raw_address.get("country_code") or "").lower(),
113
- "state": raw_address.get("state"),
114
- "province": province,
115
- "city": city,
116
- "postcode": postcode,
117
- "district": district,
118
- "quarter": quarter,
119
- "street": raw_address.get("road"),
120
- "number": raw_address.get("house_number"),
121
- }
122
-
123
- elif dist_geonames < dist_nominatim and geonames_res_lat_str is not None and geonames_res_lon_str is not None:
124
- # Use GeoNames data
125
- geonames_country_code_str = geonames_raw_json.get("country_code")
126
- country_name = None
127
- if geonames_country_code_str:
128
- try:
129
- country_obj = pycountry.countries.get(alpha_2=geonames_country_code_str.upper())
130
- if country_obj:
131
- country_name = spanish.gettext(country_obj.name)
132
- except LookupError:
133
- logger.warning(f"Country name not found for code: {geonames_country_code_str} using pycountry.")
134
-
135
- postcode_str = str(geonames_raw_json.get("postal_code", ""))
136
- postcode = self.geonames.validate_postcode(postcode_str)
137
- province = self.geonames.get_province_from_postcode(postcode) if postcode else None
138
- city = geonames_raw_json.get("place_name")
139
-
140
- return {
141
- "country": country_name,
142
- "country_code": (geonames_country_code_str or "").lower(),
143
- "state": geonames_raw_json.get("community"),
144
- "province": province,
145
- "city": city,
146
- "postcode": postcode,
147
- "district": None,
148
- "quarter": None,
149
- "street": None,
150
- "number": None,
151
- }
265
+ return self._get_empty_address_result()
152
266
 
153
- else:
154
- # Neither source provided valid coordinates
155
- return {
156
- "country": None,
157
- "country_code": None,
158
- "state": None,
159
- "province": None,
160
- "city": None,
161
- "postcode": None,
162
- "district": None,
163
- "quarter": None,
164
- "street": None,
165
- "number": None,
166
- }
167
-
168
- def get_district_quarter(self, raw_json):
169
- district = self.get_attribute(raw_json, ["city_district", "suburb", "borough"])
170
- quarter = self.get_attribute(raw_json, ["quarter", "neighbourhood"])
267
+ dist_nominatim = self._calculate_distance(nominatim_response_lat, nominatim_response_lon, input_coords)
268
+ dist_geonames = self._calculate_distance(geonames_response_lat, geonames_response_lon, input_coords)
171
269
 
172
- if not district and quarter:
173
- district = quarter
174
- quarter = None
270
+ # Select final result
271
+ final_result = self._select_final_result(
272
+ parsed_nominatim_result,
273
+ parsed_geonames_result,
274
+ dist_nominatim,
275
+ dist_geonames,
276
+ selected_postcode,
277
+ selected_province_from_postcode,
278
+ raw_nominatim_province,
279
+ )
175
280
 
176
- return district, quarter
281
+ return final_result
177
282
 
178
283
 
179
284
  class NominatimInterface(Nominatim):
180
- def __init__(self, config):
285
+ def __init__(self, config: Dict[str, Any]) -> None:
181
286
  if "osm" in config:
182
287
  self.config = config["osm"]
183
288
 
@@ -0,0 +1,2 @@
1
+ from .normalization import * # noqa: F403
2
+ from .obfuscation import * # noqa: F403
@@ -1,14 +1,14 @@
1
1
  ########################################################################################################################
2
2
  # IMPORTS
3
3
 
4
+ import unicodedata
4
5
  from enum import Enum, auto
5
6
  from typing import Any
6
- import unicodedata
7
7
 
8
8
  import numpy as np
9
- from unidecode import unidecode
10
- from inflection import parameterize, underscore, titleize, camelize
9
+ from inflection import camelize, parameterize, titleize, underscore
11
10
  from string_utils import prettify, strip_html
11
+ from unidecode import unidecode
12
12
 
13
13
  ########################################################################################################################
14
14
  # CLASSES
@@ -0,0 +1,153 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import logging
5
+ import warnings
6
+ from typing import List, Optional
7
+
8
+
9
+ class PiiDependenciesMissingError(ImportError):
10
+ pass
11
+
12
+
13
+ class SpacyModelNotFoundError(ImportError):
14
+ pass
15
+
16
+
17
+ try:
18
+ import phonenumbers
19
+ import spacy
20
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
21
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
22
+ from presidio_analyzer.predefined_recognizers import PhoneRecognizer
23
+ from presidio_anonymizer import AnonymizerEngine
24
+ from spacy.language import Language
25
+ from spacy_langdetect import LanguageDetector
26
+ except ImportError as e:
27
+ raise PiiDependenciesMissingError(
28
+ "One or more PII anonymization dependencies are missing. "
29
+ "Please install them by running: pip install datamarket[pii]\n"
30
+ f"Original error: {e}"
31
+ ) from e
32
+
33
+
34
+ ########################################################################################################################
35
+ # SETTINGS
36
+
37
+ logger = logging.getLogger()
38
+ logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)
39
+
40
+ warnings.filterwarnings(
41
+ "ignore",
42
+ message=r"\[W108\]",
43
+ category=UserWarning,
44
+ module="spacy.pipeline.lemmatizer",
45
+ )
46
+
47
+
48
+ @Language.factory("language_detector")
49
+ def get_lang_detector(nlp, name):
50
+ return LanguageDetector()
51
+
52
+
53
+ ########################################################################################################################
54
+ # CLASSES
55
+
56
+
57
+ class PiiAnonymizer:
58
+ SUPPORTED_LANG = ["es", "en"]
59
+
60
+ def __init__(self):
61
+ # Check for required spaCy models
62
+ required_models = {
63
+ "en_core_web_md": "python -m spacy download en_core_web_md",
64
+ "es_core_news_md": "python -m spacy download es_core_news_md",
65
+ }
66
+ missing_models_instructions = []
67
+ for model_name, install_command in required_models.items():
68
+ if not spacy.util.is_package(model_name):
69
+ missing_models_instructions.append(
70
+ f"Model '{model_name}' not found. Please install it by running: {install_command}"
71
+ )
72
+
73
+ if missing_models_instructions:
74
+ raise SpacyModelNotFoundError("\n".join(missing_models_instructions))
75
+
76
+ self.anonymizer = AnonymizerEngine()
77
+ self.analyzer = self._load_analyzer_engine()
78
+
79
+ self.nlp = self._nlp()
80
+
81
+ def _nlp(self) -> Language:
82
+ analyzer_en_model = self.analyzer.nlp_engine.nlp.get("en")
83
+ shared_vocab = analyzer_en_model.vocab
84
+ nlp = spacy.blank("en", vocab=shared_vocab)
85
+
86
+ if nlp.has_factory("sentencizer"):
87
+ nlp.add_pipe("sentencizer")
88
+
89
+ if nlp.has_factory("language_detector"):
90
+ nlp.add_pipe("language_detector", last=True)
91
+
92
+ return nlp
93
+
94
+ @staticmethod
95
+ def _nlp_config():
96
+ return {
97
+ "nlp_engine_name": "spacy",
98
+ "models": [
99
+ {"lang_code": "es", "model_name": "es_core_news_md"},
100
+ {"lang_code": "en", "model_name": "en_core_web_md"},
101
+ ],
102
+ }
103
+
104
+ def _load_analyzer_engine(self) -> AnalyzerEngine:
105
+ provider = NlpEngineProvider(nlp_configuration=PiiAnonymizer._nlp_config())
106
+ nlp_engine = provider.create_engine()
107
+ phone_recognizer_es = PhoneRecognizer(
108
+ supported_language="es",
109
+ supported_regions=phonenumbers.SUPPORTED_REGIONS,
110
+ context=["teléfono", "móvil", "número"],
111
+ )
112
+ registry = RecognizerRegistry(supported_languages=self.SUPPORTED_LANG)
113
+ registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=self.SUPPORTED_LANG)
114
+ registry.add_recognizer(phone_recognizer_es)
115
+
116
+ analyzer = AnalyzerEngine(
117
+ registry=registry,
118
+ nlp_engine=nlp_engine,
119
+ supported_languages=self.SUPPORTED_LANG,
120
+ )
121
+ return analyzer
122
+
123
+ def detect_lang(self, text: str) -> str:
124
+ if hasattr(self, "nlp") and self.nlp:
125
+ with self.nlp.select_pipes(enable=["tokenizer", "sentencizer", "language_detector"]):
126
+ doc = self.nlp(text)
127
+ return doc._.language["language"]
128
+ else:
129
+ logger.error("Language detection NLP model not initialized. Cannot detect language.")
130
+ return "unknown"
131
+
132
+ def anonymize_text(
133
+ self,
134
+ text: str,
135
+ entities: Optional[List[str]] = None,
136
+ lang: str = "unknown",
137
+ ) -> str:
138
+ if lang == "unknown":
139
+ lang = self.detect_lang(text)
140
+ if lang not in self.SUPPORTED_LANG:
141
+ logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
142
+ return ""
143
+ elif lang not in self.SUPPORTED_LANG:
144
+ logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
145
+ return ""
146
+
147
+ analyzer_result = self.analyzer.analyze(
148
+ text=text,
149
+ entities=entities,
150
+ language=lang,
151
+ )
152
+ anonymizer_result = self.anonymizer.anonymize(text=text, analyzer_results=analyzer_result)
153
+ return anonymizer_result.text
@@ -1,17 +1,17 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.1
2
2
  Name: datamarket
3
- Version: 0.9.28
3
+ Version: 0.9.30
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
+ Home-page: https://datamarket.es
5
6
  License: GPL-3.0-or-later
6
7
  Author: DataMarket
7
8
  Author-email: techsupport@datamarket.es
8
- Requires-Python: >=3.12,<4.0
9
+ Requires-Python: >=3.12,<3.13
9
10
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
10
11
  Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
12
  Classifier: Operating System :: OS Independent
12
13
  Classifier: Programming Language :: Python :: 3
13
14
  Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
15
15
  Provides-Extra: alchemy
16
16
  Provides-Extra: aws
17
17
  Provides-Extra: azure-storage-blob
@@ -40,6 +40,7 @@ Provides-Extra: openpyxl
40
40
  Provides-Extra: pandas
41
41
  Provides-Extra: pandera
42
42
  Provides-Extra: peerdb
43
+ Provides-Extra: pii
43
44
  Provides-Extra: pillow
44
45
  Provides-Extra: playwright
45
46
  Provides-Extra: playwright-stealth
@@ -83,6 +84,7 @@ Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oau
83
84
  Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
84
85
  Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
85
86
  Requires-Dist: inflection (>=0.5.0,<0.6.0)
87
+ Requires-Dist: jellyfish (>=1.0.0,<2.0.0)
86
88
  Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
87
89
  Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
88
90
  Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
@@ -95,6 +97,8 @@ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
95
97
  Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
96
98
  Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
97
99
  Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
100
+ Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
101
+ Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
98
102
  Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
99
103
  Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
100
104
  Requires-Dist: pycountry (>=24.0.0,<25.0.0)
@@ -110,14 +114,16 @@ Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
110
114
  Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
111
115
  Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
112
116
  Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
117
+ Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
118
+ Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
113
119
  Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
114
120
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
121
+ Requires-Dist: tf-playwright-stealth (>=1.0.0,<2.0.0)
115
122
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
116
123
  Requires-Dist: typer (>=0.15.0,<0.16.0)
117
124
  Requires-Dist: unidecode (>=1.0.0,<2.0.0)
118
125
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
119
126
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
120
- Project-URL: Homepage, https://datamarket.es
121
127
  Project-URL: Repository, https://github.com/Data-Market/datamarket
122
128
  Description-Content-Type: text/markdown
123
129
 
@@ -4,7 +4,7 @@ datamarket/interfaces/alchemy.py,sha256=4q_gLKCKPK437VKOpdBKSrCyy42P_yWxIhE7KuvH
4
4
  datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
7
- datamarket/interfaces/nominatim.py,sha256=TD4OhWdIwn53Va41BS1ugogHEZw0ANKxWIfc9G2JWLU,7280
7
+ datamarket/interfaces/nominatim.py,sha256=rUnodcRKyZ_reBtyfFFjXNqP1TN0NMScW7zSGiJQ10I,12380
8
8
  datamarket/interfaces/peerdb.py,sha256=cwYwvO740GyaPo9zLAwJsf3UeJDGDiYzjQVM9Q6s-_g,23652
9
9
  datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
10
10
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
@@ -16,10 +16,12 @@ datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,6
16
16
  datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
- datamarket/utils/strings.py,sha256=rEX9NeBG4C7RECgT0EQebgoFoxgZMy9-7EcBSxgBANU,5654
19
+ datamarket/utils/strings/__init__.py,sha256=RmyN3hKGXmUym8w5tn28yWkw2uM-b5OvntB4D0lU1eo,84
20
+ datamarket/utils/strings/normalization.py,sha256=337M2UPwEETvhVTOnP4w_igTXpHUHoaD8e7x_-L-Bpk,5654
21
+ datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
20
22
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
21
23
  datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
22
- datamarket-0.9.28.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
23
- datamarket-0.9.28.dist-info/METADATA,sha256=5V61eLQ9HR_8Tl7J0sRKwR_mzKbJctIjvEebNtPtqug,6546
24
- datamarket-0.9.28.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
25
- datamarket-0.9.28.dist-info/RECORD,,
24
+ datamarket-0.9.30.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
25
+ datamarket-0.9.30.dist-info/METADATA,sha256=zzhHMrHhBf_CfBLwjj4melul8sCkcO8np-nmay0jKOQ,6871
26
+ datamarket-0.9.30.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
27
+ datamarket-0.9.30.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 2.1.3
2
+ Generator: poetry-core 1.9.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any