datamarket 0.9.28__tar.gz → 0.9.30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (28)
  1. {datamarket-0.9.28 → datamarket-0.9.30}/PKG-INFO +11 -5
  2. {datamarket-0.9.28 → datamarket-0.9.30}/pyproject.toml +12 -2
  3. datamarket-0.9.30/src/datamarket/interfaces/nominatim.py +294 -0
  4. datamarket-0.9.30/src/datamarket/utils/strings/__init__.py +2 -0
  5. datamarket-0.9.28/src/datamarket/utils/strings.py → datamarket-0.9.30/src/datamarket/utils/strings/normalization.py +3 -3
  6. datamarket-0.9.30/src/datamarket/utils/strings/obfuscation.py +153 -0
  7. datamarket-0.9.28/src/datamarket/interfaces/nominatim.py +0 -189
  8. {datamarket-0.9.28 → datamarket-0.9.30}/LICENSE +0 -0
  9. {datamarket-0.9.28 → datamarket-0.9.30}/README.md +0 -0
  10. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/__init__.py +0 -0
  11. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/interfaces/__init__.py +0 -0
  12. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/interfaces/alchemy.py +0 -0
  13. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/interfaces/aws.py +0 -0
  14. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/interfaces/drive.py +0 -0
  15. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/interfaces/ftp.py +0 -0
  16. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/interfaces/peerdb.py +0 -0
  17. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/interfaces/proxy.py +0 -0
  18. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/interfaces/tinybird.py +0 -0
  19. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/params/__init__.py +0 -0
  20. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/params/nominatim.py +0 -0
  21. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/utils/__init__.py +0 -0
  22. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/utils/airflow.py +0 -0
  23. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/utils/alchemy.py +0 -0
  24. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/utils/main.py +0 -0
  25. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/utils/selenium.py +0 -0
  26. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/utils/soda.py +0 -0
  27. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/utils/typer.py +0 -0
  28. {datamarket-0.9.28 → datamarket-0.9.30}/src/datamarket/utils/types.py +0 -0
@@ -1,17 +1,17 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.1
2
2
  Name: datamarket
3
- Version: 0.9.28
3
+ Version: 0.9.30
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
+ Home-page: https://datamarket.es
5
6
  License: GPL-3.0-or-later
6
7
  Author: DataMarket
7
8
  Author-email: techsupport@datamarket.es
8
- Requires-Python: >=3.12,<4.0
9
+ Requires-Python: >=3.12,<3.13
9
10
  Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
10
11
  Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
11
12
  Classifier: Operating System :: OS Independent
12
13
  Classifier: Programming Language :: Python :: 3
13
14
  Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
15
15
  Provides-Extra: alchemy
16
16
  Provides-Extra: aws
17
17
  Provides-Extra: azure-storage-blob
@@ -40,6 +40,7 @@ Provides-Extra: openpyxl
40
40
  Provides-Extra: pandas
41
41
  Provides-Extra: pandera
42
42
  Provides-Extra: peerdb
43
+ Provides-Extra: pii
43
44
  Provides-Extra: pillow
44
45
  Provides-Extra: playwright
45
46
  Provides-Extra: playwright-stealth
@@ -83,6 +84,7 @@ Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oau
83
84
  Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
84
85
  Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
85
86
  Requires-Dist: inflection (>=0.5.0,<0.6.0)
87
+ Requires-Dist: jellyfish (>=1.0.0,<2.0.0)
86
88
  Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
87
89
  Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
88
90
  Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
@@ -95,6 +97,8 @@ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
95
97
  Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
96
98
  Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
97
99
  Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
100
+ Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
101
+ Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
98
102
  Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
99
103
  Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
100
104
  Requires-Dist: pycountry (>=24.0.0,<25.0.0)
@@ -110,14 +114,16 @@ Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
110
114
  Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
111
115
  Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
112
116
  Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
117
+ Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
118
+ Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
113
119
  Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
114
120
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
121
+ Requires-Dist: tf-playwright-stealth (>=1.0.0,<2.0.0)
115
122
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
116
123
  Requires-Dist: typer (>=0.15.0,<0.16.0)
117
124
  Requires-Dist: unidecode (>=1.0.0,<2.0.0)
118
125
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
119
126
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
120
- Project-URL: Homepage, https://datamarket.es
121
127
  Project-URL: Repository, https://github.com/Data-Market/datamarket
122
128
  Description-Content-Type: text/markdown
123
129
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.28"
3
+ version = "0.9.30"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -15,7 +15,7 @@ classifiers = [
15
15
  ]
16
16
 
17
17
  [tool.poetry.dependencies]
18
- python = "^3.12"
18
+ python = ">=3.12,<3.13"
19
19
  typer = "~0.15.0"
20
20
  psycopg2-binary = "^2.0.0"
21
21
  requests = "^2.0.0"
@@ -32,6 +32,7 @@ unidecode = "^1.0.0"
32
32
  numpy = "^2.0.0"
33
33
  pycountry = "^24.0.0"
34
34
  geopy = "^2.0.0"
35
+ jellyfish = "^1.0.0"
35
36
 
36
37
  boto3 = { version = "~1.35.0", optional = true }
37
38
  lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
@@ -75,6 +76,12 @@ openpyxl = { version = "^3.0.0", optional = true }
75
76
  httpx = { extras = ["http2"], version = "~0.28.0", optional = true }
76
77
  SQLAlchemy = { version = "^2.0.0", optional = true }
77
78
  camoufox = { extras = ["geoip"], version = "~0.4.11", optional = true }
79
+ presidio-analyzer = { version = "^2.0.0", optional = true, extras = [
80
+ "phonenumbers",
81
+ ] }
82
+ presidio-anonymizer = { version = "^2.0.0", optional = true }
83
+ spacy = { version = "^3.0.0", optional = true }
84
+ spacy-langdetect = { version = "~0.1.0", optional = true }
78
85
 
79
86
  [tool.poetry.extras]
80
87
  boto3 = ["boto3"]
@@ -127,6 +134,9 @@ peerdb = ["boto3", "clickhouse-driver"]
127
134
  proxy = ["stem"]
128
135
  alchemy = ["SQLAlchemy"]
129
136
 
137
+ # Other groups
138
+ pii = ["presidio-analyzer", "presidio-anonymizer", "spacy", "spacy-langdetect"]
139
+
130
140
 
131
141
  [build-system]
132
142
  requires = ["poetry-core>=1.0.0"]
@@ -0,0 +1,294 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import gettext
5
+ import logging
6
+ from typing import Any, Dict, List, Optional, Tuple, Union
7
+
8
+ import pycountry
9
+ import requests
10
+ from geopy.distance import geodesic
11
+ from jellyfish import jaro_winkler_similarity
12
+
13
+ from ..params.nominatim import POSTCODES
14
+ from ..utils.strings import normalize
15
+
16
+ ########################################################################################################################
17
+ # PARAMETERS
18
+
19
+ JARO_WINKLER_THRESHOLD = 0.85
20
+
21
+ ########################################################################################################################
22
+ # CLASSES
23
+
24
+ logger = logging.getLogger(__name__)
25
+ spanish = gettext.translation("iso3166-1", pycountry.LOCALES_DIR, languages=["es"])
26
+ spanish.install()
27
+
28
+
29
class GeoNames:
    """Client for a GeoNames-style reverse-geocoding endpoint plus postcode helpers.

    The postcode helpers rely on the module-level ``POSTCODES`` mapping, which is
    keyed by the first two characters of a postcode and yields a province name.
    """

    def __init__(self, endpoint: str) -> None:
        """Remember the base URL of the service (``/reverse`` is appended per request)."""
        self.endpoint = endpoint

    @staticmethod
    def validate_postcode(postcode: Union[int, str]) -> Optional[str]:
        """Return a five-character postcode whose prefix is known, otherwise ``None``.

        Integers are converted to strings first. A four-character value is
        left-padded with one ``0`` before its prefix is checked.
        """
        if isinstance(postcode, int):
            postcode = str(postcode)

        if not postcode:
            return None

        # A four-character code is treated as a five-character one that lost its leading zero.
        if len(postcode) == 4:
            postcode = f"0{postcode}"

        if len(postcode) == 5 and postcode[:2] in POSTCODES:
            return postcode
        return None

    @staticmethod
    def get_province_from_postcode(postcode: Optional[str]) -> Optional[str]:
        """Return the province mapped to the postcode's two-character prefix.

        Returns ``None`` for an empty postcode and also for a prefix that is not
        in ``POSTCODES`` (previously an unvalidated postcode raised ``KeyError``).
        """
        if not postcode:
            return None
        return POSTCODES.get(postcode[:2])

    def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
        """Query ``<endpoint>/reverse`` for the given coordinates and return the decoded JSON."""
        # Let requests build and escape the query string instead of interpolating raw values.
        response = requests.get(f"{self.endpoint}/reverse", params={"lat": lat, "lon": lon}, timeout=30)
        return response.json()
53
+
54
+
55
class Nominatim:
    """Geocoding client that reconciles a Nominatim answer with a GeoNames answer.

    For a coordinate pair both services are queried; the answer whose own
    coordinates lie closest to the input wins, and the postcode is chosen by
    checking which service's postcode-derived province agrees with the province
    reported in Nominatim's address.
    """

    # Keys of every parsed address dictionary returned by this class.
    _ADDRESS_FIELDS = (
        "country",
        "country_code",
        "state",
        "province",
        "city",
        "postcode",
        "district",
        "quarter",
        "street",
        "number",
    )

    def __init__(self, nominatim_endpoint: str, geonames_endpoint: str) -> None:
        """Store the Nominatim base URL and create the companion ``GeoNames`` client."""
        self.endpoint = nominatim_endpoint
        self.geonames = GeoNames(geonames_endpoint)

    @staticmethod
    def _get_attribute(raw_json: Dict[str, Any], keys: List[str]) -> Any:
        """Return the value of the first key of ``keys`` present in ``raw_json``, else ``None``."""
        for key in keys:
            if key in raw_json:
                return raw_json[key]
        return None

    def _calculate_distance(
        self, lat_str: Optional[str], lon_str: Optional[str], input_coords: Tuple[float, float]
    ) -> float:
        """Return the geodesic distance in km between ``input_coords`` and the given point.

        Returns ``inf`` when either coordinate is missing or cannot be converted.
        """
        if not (lat_str and lon_str):
            return float("inf")
        try:
            return geodesic(input_coords, (float(lat_str), float(lon_str))).km
        except (ValueError, TypeError):
            logger.warning("Invalid coordinates for distance calculation.")
            return float("inf")

    def _parse_nominatim_result(self, nominatim_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
        """Flatten the ``address`` section of a Nominatim response into the common address dict."""
        raw_address = nominatim_raw_json.get("address", {})

        postcode = self.geonames.validate_postcode(str(raw_address.get("postcode", "")))
        city = self._get_attribute(raw_address, ["city", "town", "village"])
        district, quarter = self._get_district_quarter(raw_address)

        return {
            "country": raw_address.get("country"),
            "country_code": (raw_address.get("country_code") or "").lower(),
            "state": raw_address.get("state"),
            "province": raw_address.get("province"),
            "city": city,
            "postcode": postcode,
            "district": district,
            "quarter": quarter,
            "street": raw_address.get("road"),
            "number": raw_address.get("house_number"),
        }

    def _parse_geonames_result(self, geonames_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
        """Convert a GeoNames response into the common address dict.

        The country name is looked up from the country code with pycountry and
        passed through the module's Spanish translation; the province is derived
        from the validated postcode. District, quarter, street and number are
        always ``None`` for this source.
        """
        country_code = geonames_raw_json.get("country_code")
        country_name = None
        if country_code:
            try:
                country_obj = pycountry.countries.get(alpha_2=country_code.upper())
                if country_obj:
                    country_name = spanish.gettext(country_obj.name)
            except LookupError:
                logger.warning(f"Country name not found for code: {country_code} using pycountry.")

        postcode = self.geonames.validate_postcode(str(geonames_raw_json.get("postal_code", "")))
        province = self.geonames.get_province_from_postcode(postcode) if postcode else None

        return {
            "country": country_name,
            "country_code": (country_code or "").lower(),
            "state": geonames_raw_json.get("community"),
            "province": province,
            "city": geonames_raw_json.get("place_name"),
            "postcode": postcode,
            "district": None,
            "quarter": None,
            "street": None,
            "number": None,
        }

    def _get_empty_address_result(self) -> Dict[str, None]:
        """Return a fresh address dict with every field set to ``None``."""
        return dict.fromkeys(self._ADDRESS_FIELDS)

    @staticmethod
    def _province_matches(province: Optional[str], normalized_reference: str) -> bool:
        """Tell whether ``province`` fuzzy-matches an already normalized reference name.

        Both sides must be non-empty; the Jaro-Winkler similarity of the
        normalized strings must exceed ``JARO_WINKLER_THRESHOLD``.
        """
        normalized_province = normalize(province) if province else ""
        if not (normalized_province and normalized_reference):
            return False
        return jaro_winkler_similarity(normalized_province, normalized_reference) > JARO_WINKLER_THRESHOLD

    def _select_postcode_and_derived_province(
        self,
        parsed_nominatim_result: Dict[str, Optional[str]],
        parsed_geonames_result: Dict[str, Optional[str]],
        nominatim_address_province_raw: Optional[str],
    ) -> Tuple[Optional[str], Optional[str]]:
        """
        Determines the postcode and its derived province based on comparisons
        between Nominatim and GeoNames data, and Nominatim's raw address province.

        Returns ``(None, None)`` when neither service's postcode-derived province
        matches the raw province (including when the raw province is missing).
        """
        nominatim_postcode = parsed_nominatim_result.get("postcode")
        geonames_postcode = parsed_geonames_result.get("postcode")

        province_from_nominatim_postcode = self.geonames.get_province_from_postcode(nominatim_postcode)
        province_from_geonames_postcode = self.geonames.get_province_from_postcode(geonames_postcode)

        norm_raw_nominatim_province = (
            normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
        )

        # Nominatim's own postcode is kept whenever its province agrees with the raw province.
        if self._province_matches(province_from_nominatim_postcode, norm_raw_nominatim_province):
            return nominatim_postcode, province_from_nominatim_postcode

        # Otherwise fall back to the GeoNames postcode, but only if its province agrees.
        if self._province_matches(province_from_geonames_postcode, norm_raw_nominatim_province):
            return geonames_postcode, province_from_geonames_postcode

        return None, None

    def _select_final_result(
        self,
        parsed_nominatim_result: Dict[str, Optional[str]],
        parsed_geonames_result: Dict[str, Optional[str]],
        dist_nominatim: float,
        dist_geonames: float,
        authoritative_postcode: Optional[str],
        authoritative_province_from_postcode: Optional[str],
        nominatim_address_province_raw: Optional[str],
    ) -> Dict[str, Optional[str]]:
        """
        Selects the final address result based on distances and applies the authoritative postcode/province.

        Ties go to Nominatim. When neither distance is finite, an all-``None``
        address is returned. The parsed inputs are copied, never modified.
        """
        if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
            # Copy so the caller's parsed dict is not altered through aliasing.
            final_result = dict(parsed_nominatim_result)
            final_result["postcode"] = authoritative_postcode
            final_result["province"] = nominatim_address_province_raw
        elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
            final_result = dict(parsed_geonames_result)
            final_result["postcode"] = authoritative_postcode
            final_result["province"] = authoritative_province_from_postcode
        else:
            final_result = self._get_empty_address_result()
        return final_result

    def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
        """Extract ``(district, quarter)``; a lone quarter is promoted to district."""
        district = self._get_attribute(raw_json, ["city_district", "suburb", "borough"])
        quarter = self._get_attribute(raw_json, ["quarter", "neighbourhood"])

        if not district and quarter:
            return quarter, None
        return district, quarter

    def geocode(self, address: str) -> List[Dict[str, Any]]:
        """Search Nominatim for a free-text address and return the decoded JSON result list."""
        # The address is free text: passing it via ``params`` makes requests escape
        # characters such as ``&``, ``#`` or ``=`` that would otherwise corrupt the query.
        response = requests.get(f"{self.endpoint}/search", params={"q": address, "format": "json"}, timeout=30)
        return response.json()

    def geocode_parsed(self, address: str) -> Optional[Dict[str, Optional[str]]]:
        """Geocode ``address`` and reverse-parse its first hit; ``None`` when there are no hits."""
        results = self.geocode(address)
        if not results:
            return None
        return self.reverse_parsed(results[0]["lat"], results[0]["lon"])

    def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
        """Query Nominatim's ``/reverse`` for the coordinates and return the decoded JSON."""
        response = requests.get(
            f"{self.endpoint}/reverse", params={"lat": lat, "lon": lon, "format": "json"}, timeout=30
        )
        return response.json()

    def reverse_parsed(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Optional[str]]:
        """Reverse-geocode the coordinates with both services and return one merged address dict.

        Returns an all-``None`` address when the input coordinates are not numeric.
        """
        # Reject unusable input before spending two HTTP round trips on it.
        try:
            input_coords = (float(lat), float(lon))
        except (ValueError, TypeError):
            logger.error(f"Invalid input coordinates for distance calculation: lat={lat}, lon={lon}")
            return self._get_empty_address_result()

        nominatim_response = self.reverse(lat, lon)
        geonames_response = self.geonames.reverse(lat, lon)

        # Initial parsing
        parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
        parsed_geonames_result = self._parse_geonames_result(geonames_response)

        # Determine authoritative postcode
        raw_nominatim_province = nominatim_response.get("address", {}).get("province")
        selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
            parsed_nominatim_result, parsed_geonames_result, raw_nominatim_province
        )

        # Calculate distances from the input point to each service's reported location
        dist_nominatim = self._calculate_distance(
            nominatim_response.get("lat"), nominatim_response.get("lon"), input_coords
        )
        dist_geonames = self._calculate_distance(
            geonames_response.get("lat"), geonames_response.get("lon"), input_coords
        )

        # Select final result
        return self._select_final_result(
            parsed_nominatim_result,
            parsed_geonames_result,
            dist_nominatim,
            dist_geonames,
            selected_postcode,
            selected_province_from_postcode,
            raw_nominatim_province,
        )
282
+
283
+
284
class NominatimInterface(Nominatim):
    """``Nominatim`` client whose endpoints are read from the ``osm`` section of a config mapping."""

    def __init__(self, config: Dict[str, Any]) -> None:
        """Configure the client from ``config["osm"]``.

        When there is no ``osm`` section only a warning is logged and the parent
        initializer is not run, so no endpoint attributes are set.
        """
        if "osm" not in config:
            logger.warning("no osm section in config")
            return

        self.config = config["osm"]

        self.nominatim_endpoint = self.config["nominatim_endpoint"]
        self.geonames_endpoint = self.config["geonames_endpoint"]

        super().__init__(self.nominatim_endpoint, self.geonames_endpoint)
@@ -0,0 +1,2 @@
1
+ from .normalization import * # noqa: F403
2
+ from .obfuscation import * # noqa: F403
@@ -1,14 +1,14 @@
1
1
  ########################################################################################################################
2
2
  # IMPORTS
3
3
 
4
+ import unicodedata
4
5
  from enum import Enum, auto
5
6
  from typing import Any
6
- import unicodedata
7
7
 
8
8
  import numpy as np
9
- from unidecode import unidecode
10
- from inflection import parameterize, underscore, titleize, camelize
9
+ from inflection import camelize, parameterize, titleize, underscore
11
10
  from string_utils import prettify, strip_html
11
+ from unidecode import unidecode
12
12
 
13
13
  ########################################################################################################################
14
14
  # CLASSES
@@ -0,0 +1,153 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import logging
5
+ import warnings
6
+ from typing import List, Optional
7
+
8
+
9
class PiiDependenciesMissingError(ImportError):
    """Raised at import time when one of the optional PII packages cannot be imported."""
11
+
12
+
13
class SpacyModelNotFoundError(ImportError):
    """Raised when a required spaCy model package is not installed."""
15
+
16
+
17
+ try:
18
+ import phonenumbers
19
+ import spacy
20
+ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
21
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
22
+ from presidio_analyzer.predefined_recognizers import PhoneRecognizer
23
+ from presidio_anonymizer import AnonymizerEngine
24
+ from spacy.language import Language
25
+ from spacy_langdetect import LanguageDetector
26
+ except ImportError as e:
27
+ raise PiiDependenciesMissingError(
28
+ "One or more PII anonymization dependencies are missing. "
29
+ "Please install them by running: pip install datamarket[pii]\n"
30
+ f"Original error: {e}"
31
+ ) from e
32
+
33
+
34
+ ########################################################################################################################
35
+ # SETTINGS
36
+
37
# Use a module-scoped logger rather than the root logger, so this module's
# messages can be filtered by name and its use does not touch global logging.
logger = logging.getLogger(__name__)
# Only ERROR and above are let through from the "presidio-analyzer" logger.
logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)

# Suppress the UserWarning whose message matches "[W108]" raised from
# spaCy's lemmatizer module.
warnings.filterwarnings(
    "ignore",
    message=r"\[W108\]",
    category=UserWarning,
    module="spacy.pipeline.lemmatizer",
)
46
+
47
+
48
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """Factory registered as ``language_detector``; builds a new ``LanguageDetector``.

    ``nlp`` and ``name`` are accepted but not used by this factory.
    """
    detector = LanguageDetector()
    return detector
51
+
52
+
53
+ ########################################################################################################################
54
+ # CLASSES
55
+
56
+
57
class PiiAnonymizer:
    """Detects and anonymizes PII in text using a Presidio analyzer/anonymizer pair.

    Only the languages listed in ``SUPPORTED_LANG`` are processed; text in any
    other language is replaced by an empty string.
    """

    SUPPORTED_LANG = ["es", "en"]

    def __init__(self):
        """Verify that the spaCy models are installed, then build the engines.

        Raises:
            SpacyModelNotFoundError: if either required model package is missing;
                the message lists the install command for each missing model.
        """
        # Check for required spaCy models
        install_commands = {
            "en_core_web_md": "python -m spacy download en_core_web_md",
            "es_core_news_md": "python -m spacy download es_core_news_md",
        }
        problems = [
            f"Model '{model}' not found. Please install it by running: {command}"
            for model, command in install_commands.items()
            if not spacy.util.is_package(model)
        ]
        if problems:
            raise SpacyModelNotFoundError("\n".join(problems))

        self.anonymizer = AnonymizerEngine()
        self.analyzer = self._load_analyzer_engine()

        # Built last because it borrows the vocabulary of the analyzer's English model.
        self.nlp = self._nlp()

    def _nlp(self) -> Language:
        """Build a blank English pipeline used only for language detection.

        It shares the vocabulary of the analyzer's English model and receives
        the ``sentencizer`` and ``language_detector`` pipes when those factories exist.
        """
        english_model = self.analyzer.nlp_engine.nlp.get("en")
        pipeline = spacy.blank("en", vocab=english_model.vocab)

        if pipeline.has_factory("sentencizer"):
            pipeline.add_pipe("sentencizer")

        if pipeline.has_factory("language_detector"):
            pipeline.add_pipe("language_detector", last=True)

        return pipeline

    @staticmethod
    def _nlp_config():
        """Return the NLP engine configuration: spaCy with one model per supported language."""
        models = [
            {"lang_code": "es", "model_name": "es_core_news_md"},
            {"lang_code": "en", "model_name": "en_core_web_md"},
        ]
        return {"nlp_engine_name": "spacy", "models": models}

    def _load_analyzer_engine(self) -> AnalyzerEngine:
        """Create the analyzer with the predefined recognizers plus a Spanish phone recognizer."""
        nlp_engine = NlpEngineProvider(nlp_configuration=PiiAnonymizer._nlp_config()).create_engine()

        registry = RecognizerRegistry(supported_languages=self.SUPPORTED_LANG)
        registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=self.SUPPORTED_LANG)
        # Extra phone recognizer registered for Spanish, with Spanish context words.
        registry.add_recognizer(
            PhoneRecognizer(
                supported_language="es",
                supported_regions=phonenumbers.SUPPORTED_REGIONS,
                context=["teléfono", "móvil", "número"],
            )
        )

        return AnalyzerEngine(
            registry=registry,
            nlp_engine=nlp_engine,
            supported_languages=self.SUPPORTED_LANG,
        )

    def detect_lang(self, text: str) -> str:
        """Return the language detected for ``text``, or ``"unknown"`` if no pipeline is available."""
        pipeline = getattr(self, "nlp", None)
        if not pipeline:
            logger.error("Language detection NLP model not initialized. Cannot detect language.")
            return "unknown"

        with pipeline.select_pipes(enable=["tokenizer", "sentencizer", "language_detector"]):
            doc = pipeline(text)
            return doc._.language["language"]

    def anonymize_text(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        lang: str = "unknown",
    ) -> str:
        """Return ``text`` with the detected entities anonymized.

        Args:
            text: The text to process.
            entities: Entity names forwarded to the analyzer (``None`` is passed through).
            lang: Language code; ``"unknown"`` triggers automatic detection.

        Returns:
            The anonymized text, or ``""`` when the language is not in ``SUPPORTED_LANG``.
        """
        if lang == "unknown":
            lang = self.detect_lang(text)

        # Applies equally to a caller-supplied and a detected language.
        if lang not in self.SUPPORTED_LANG:
            logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
            return ""

        findings = self.analyzer.analyze(
            text=text,
            entities=entities,
            language=lang,
        )
        return self.anonymizer.anonymize(text=text, analyzer_results=findings).text
@@ -1,189 +0,0 @@
1
- ########################################################################################################################
2
- # IMPORTS
3
-
4
- import gettext
5
- import logging
6
- import pycountry
7
- from geopy.distance import geodesic
8
-
9
- import requests
10
-
11
- from ..params.nominatim import POSTCODES
12
-
13
- ########################################################################################################################
14
- # CLASSES
15
-
16
- logger = logging.getLogger(__name__)
17
- spanish = gettext.translation("iso3166-1", pycountry.LOCALES_DIR, languages=["es"])
18
- spanish.install()
19
-
20
-
21
- class GeoNames:
22
- def __init__(self, endpoint):
23
- self.endpoint = endpoint
24
-
25
- @staticmethod
26
- def validate_postcode(postcode):
27
- if isinstance(postcode, int):
28
- postcode = str(postcode)
29
-
30
- if postcode and len(postcode) == 5 and postcode[:2] in POSTCODES:
31
- return postcode
32
-
33
- if postcode and len(postcode) == 4:
34
- postcode = f"0{postcode}"
35
- if postcode[:2] in POSTCODES:
36
- return postcode
37
-
38
- @staticmethod
39
- def get_province_from_postcode(postcode):
40
- if postcode:
41
- return POSTCODES[postcode[:2]]
42
-
43
- def reverse(self, lat, lon):
44
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}").json()
45
-
46
-
47
- class Nominatim:
48
- def __init__(self, nominatim_endpoint, geonames_endpoint):
49
- self.endpoint = nominatim_endpoint
50
- self.geonames = GeoNames(geonames_endpoint)
51
-
52
- @staticmethod
53
- def get_attribute(raw_json, keys):
54
- for key in keys:
55
- if key in raw_json:
56
- return raw_json[key]
57
-
58
- def geocode(self, address):
59
- return requests.get(f"{self.endpoint}/search?q={address}&format=json").json()
60
-
61
- def geocode_parsed(self, address):
62
- results = self.geocode(address)
63
-
64
- if results:
65
- return self.reverse_parsed(results[0]["lat"], results[0]["lon"])
66
-
67
- def reverse(self, lat, lon):
68
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json").json()
69
-
70
- def reverse_parsed(self, lat, lon):
71
- nominatim_raw_json = self.reverse(lat, lon)
72
- geonames_raw_json = self.geonames.reverse(lat, lon)
73
-
74
- nominatim_res_lat_str = nominatim_raw_json.get("lat")
75
- nominatim_res_lon_str = nominatim_raw_json.get("lon")
76
- geonames_res_lat_str = geonames_raw_json.get("lat")
77
- geonames_res_lon_str = geonames_raw_json.get("lon")
78
-
79
- dist_nominatim = float("inf")
80
- dist_geonames = float("inf")
81
-
82
- try:
83
- input_coords = (float(lat), float(lon))
84
- except (ValueError, TypeError):
85
- logger.error(f"Invalid input coordinates for distance calculation: lat={lat}, lon={lon}")
86
- else:
87
- if nominatim_res_lat_str and nominatim_res_lon_str:
88
- try:
89
- nominatim_coords = (float(nominatim_res_lat_str), float(nominatim_res_lon_str))
90
- dist_nominatim = geodesic(input_coords, nominatim_coords).km
91
- except (ValueError, TypeError):
92
- logger.warning("Invalid Nominatim coordinates for distance calculation.")
93
-
94
- if geonames_res_lat_str and geonames_res_lon_str:
95
- try:
96
- geonames_coords = (float(geonames_res_lat_str), float(geonames_res_lon_str))
97
- dist_geonames = geodesic(input_coords, geonames_coords).km
98
- except (ValueError, TypeError):
99
- logger.warning("Invalid GeoNames coordinates for distance calculation.")
100
-
101
- if dist_nominatim <= dist_geonames and nominatim_res_lat_str is not None and nominatim_res_lon_str is not None:
102
- # Use Nominatim data
103
- raw_address = nominatim_raw_json.get("address", {})
104
- postcode_str = str(raw_address.get("postcode", ""))
105
- postcode = self.geonames.validate_postcode(postcode_str)
106
- province = self.geonames.get_province_from_postcode(postcode) if postcode else None
107
- city = self.get_attribute(raw_address, ["city", "town", "village"])
108
- district, quarter = self.get_district_quarter(raw_address)
109
-
110
- return {
111
- "country": raw_address.get("country"),
112
- "country_code": (raw_address.get("country_code") or "").lower(),
113
- "state": raw_address.get("state"),
114
- "province": province,
115
- "city": city,
116
- "postcode": postcode,
117
- "district": district,
118
- "quarter": quarter,
119
- "street": raw_address.get("road"),
120
- "number": raw_address.get("house_number"),
121
- }
122
-
123
- elif dist_geonames < dist_nominatim and geonames_res_lat_str is not None and geonames_res_lon_str is not None:
124
- # Use GeoNames data
125
- geonames_country_code_str = geonames_raw_json.get("country_code")
126
- country_name = None
127
- if geonames_country_code_str:
128
- try:
129
- country_obj = pycountry.countries.get(alpha_2=geonames_country_code_str.upper())
130
- if country_obj:
131
- country_name = spanish.gettext(country_obj.name)
132
- except LookupError:
133
- logger.warning(f"Country name not found for code: {geonames_country_code_str} using pycountry.")
134
-
135
- postcode_str = str(geonames_raw_json.get("postal_code", ""))
136
- postcode = self.geonames.validate_postcode(postcode_str)
137
- province = self.geonames.get_province_from_postcode(postcode) if postcode else None
138
- city = geonames_raw_json.get("place_name")
139
-
140
- return {
141
- "country": country_name,
142
- "country_code": (geonames_country_code_str or "").lower(),
143
- "state": geonames_raw_json.get("community"),
144
- "province": province,
145
- "city": city,
146
- "postcode": postcode,
147
- "district": None,
148
- "quarter": None,
149
- "street": None,
150
- "number": None,
151
- }
152
-
153
- else:
154
- # Neither source provided valid coordinates
155
- return {
156
- "country": None,
157
- "country_code": None,
158
- "state": None,
159
- "province": None,
160
- "city": None,
161
- "postcode": None,
162
- "district": None,
163
- "quarter": None,
164
- "street": None,
165
- "number": None,
166
- }
167
-
168
- def get_district_quarter(self, raw_json):
169
- district = self.get_attribute(raw_json, ["city_district", "suburb", "borough"])
170
- quarter = self.get_attribute(raw_json, ["quarter", "neighbourhood"])
171
-
172
- if not district and quarter:
173
- district = quarter
174
- quarter = None
175
-
176
- return district, quarter
177
-
178
-
179
- class NominatimInterface(Nominatim):
180
- def __init__(self, config):
181
- if "osm" in config:
182
- self.config = config["osm"]
183
-
184
- self.nominatim_endpoint = self.config["nominatim_endpoint"]
185
- self.geonames_endpoint = self.config["geonames_endpoint"]
186
-
187
- super().__init__(self.nominatim_endpoint, self.geonames_endpoint)
188
- else:
189
- logger.warning("no osm section in config")
File without changes
File without changes