datamarket 0.9.28__py3-none-any.whl → 0.9.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/interfaces/nominatim.py +217 -112
- datamarket/utils/strings/__init__.py +2 -0
- datamarket/utils/{strings.py → strings/normalization.py} +3 -3
- datamarket/utils/strings/obfuscation.py +153 -0
- {datamarket-0.9.28.dist-info → datamarket-0.9.30.dist-info}/METADATA +11 -5
- {datamarket-0.9.28.dist-info → datamarket-0.9.30.dist-info}/RECORD +8 -6
- {datamarket-0.9.28.dist-info → datamarket-0.9.30.dist-info}/WHEEL +1 -1
- {datamarket-0.9.28.dist-info → datamarket-0.9.30.dist-info}/LICENSE +0 -0
|
@@ -3,12 +3,20 @@
|
|
|
3
3
|
|
|
4
4
|
import gettext
|
|
5
5
|
import logging
|
|
6
|
-
import
|
|
7
|
-
from geopy.distance import geodesic
|
|
6
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
8
7
|
|
|
8
|
+
import pycountry
|
|
9
9
|
import requests
|
|
10
|
+
from geopy.distance import geodesic
|
|
11
|
+
from jellyfish import jaro_winkler_similarity
|
|
10
12
|
|
|
11
13
|
from ..params.nominatim import POSTCODES
|
|
14
|
+
from ..utils.strings import normalize
|
|
15
|
+
|
|
16
|
+
########################################################################################################################
|
|
17
|
+
# PARAMETERS
|
|
18
|
+
|
|
19
|
+
JARO_WINKLER_THRESHOLD = 0.85
|
|
12
20
|
|
|
13
21
|
########################################################################################################################
|
|
14
22
|
# CLASSES
|
|
@@ -19,11 +27,11 @@ spanish.install()
|
|
|
19
27
|
|
|
20
28
|
|
|
21
29
|
class GeoNames:
|
|
22
|
-
def __init__(self, endpoint):
|
|
30
|
+
def __init__(self, endpoint: str) -> None:
|
|
23
31
|
self.endpoint = endpoint
|
|
24
32
|
|
|
25
33
|
@staticmethod
|
|
26
|
-
def validate_postcode(postcode):
|
|
34
|
+
def validate_postcode(postcode: Union[int, str]) -> Optional[str]:
|
|
27
35
|
if isinstance(postcode, int):
|
|
28
36
|
postcode = str(postcode)
|
|
29
37
|
|
|
@@ -36,148 +44,245 @@ class GeoNames:
|
|
|
36
44
|
return postcode
|
|
37
45
|
|
|
38
46
|
@staticmethod
|
|
39
|
-
def get_province_from_postcode(postcode):
|
|
47
|
+
def get_province_from_postcode(postcode: Optional[str]) -> Optional[str]:
    """Return the province mapped to the first two characters of a postcode.

    Args:
        postcode: Postcode string, or None/empty when no postcode is known.

    Returns:
        The ``POSTCODES`` entry for the two-character prefix, or None when
        the postcode is missing or its prefix has no entry.
    """
    if not postcode:
        return None
    try:
        return POSTCODES[postcode[:2]]
    except KeyError:
        # An unknown prefix means "no province" (as the Optional return type
        # promises) instead of aborting the whole address lookup.
        return None
|
|
42
50
|
|
|
43
|
-
def reverse(self, lat, lon):
|
|
44
|
-
return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}").json()
|
|
51
|
+
def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
    """Call the GeoNames ``/reverse`` endpoint for a coordinate pair.

    Args:
        lat: Latitude, interpolated verbatim into the query string.
        lon: Longitude, interpolated verbatim into the query string.

    Returns:
        The decoded JSON body of the response.
    """
    url = f"{self.endpoint}/reverse?lat={lat}&lon={lon}"
    response = requests.get(url, timeout=30)
    return response.json()
|
|
45
53
|
|
|
46
54
|
|
|
47
55
|
class Nominatim:
|
|
48
|
-
def __init__(self, nominatim_endpoint, geonames_endpoint):
|
|
56
|
+
def __init__(self, nominatim_endpoint: str, geonames_endpoint: str) -> None:
    """Configure the Nominatim base URL and the companion GeoNames client.

    Args:
        nominatim_endpoint: Base URL used by ``geocode`` and ``reverse``.
        geonames_endpoint: Base URL handed to the ``GeoNames`` helper.
    """
    self.geonames = GeoNames(geonames_endpoint)
    self.endpoint = nominatim_endpoint
|
|
51
59
|
|
|
52
60
|
@staticmethod
|
|
53
|
-
def
|
|
61
|
+
def _get_attribute(raw_json: Dict[str, Any], keys: List[str]) -> Any:
|
|
54
62
|
for key in keys:
|
|
55
63
|
if key in raw_json:
|
|
56
64
|
return raw_json[key]
|
|
57
65
|
|
|
58
|
-
def
|
|
59
|
-
|
|
66
|
+
def _calculate_distance(self, lat_str: Optional[str], lon_str: Optional[str], input_coords: Tuple[float, float]) -> float:
|
|
67
|
+
dist = float("inf")
|
|
68
|
+
if lat_str and lon_str:
|
|
69
|
+
try:
|
|
70
|
+
coords = (float(lat_str), float(lon_str))
|
|
71
|
+
dist = geodesic(input_coords, coords).km
|
|
72
|
+
except (ValueError, TypeError):
|
|
73
|
+
logger.warning("Invalid coordinates for distance calculation.")
|
|
74
|
+
return dist
|
|
75
|
+
|
|
76
|
+
def _parse_nominatim_result(self, nominatim_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Flatten a Nominatim reverse response into the common address dictionary.

    Only the ``address`` object of the response is read (an empty dict is
    used when it is absent). The postcode is passed through
    ``GeoNames.validate_postcode``; the province is taken as-is from the
    address.

    Returns:
        Dict with keys country, country_code, state, province, city,
        postcode, district, quarter, street and number.
    """
    address = nominatim_raw_json.get("address", {})
    district, quarter = self._get_district_quarter(address)
    parsed = {
        "country": address.get("country"),
        "country_code": (address.get("country_code") or "").lower(),
        "state": address.get("state"),
        "province": address.get("province"),
        "city": self._get_attribute(address, ["city", "town", "village"]),
        "postcode": self.geonames.validate_postcode(str(address.get("postcode", ""))),
        "district": district,
        "quarter": quarter,
        "street": address.get("road"),
        "number": address.get("house_number"),
    }
    return parsed
|
|
97
|
+
|
|
98
|
+
def _parse_geonames_result(self, geonames_raw_json: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Flatten a GeoNames reverse response into the common address dictionary.

    The country name is looked up with pycountry from the two-letter code and
    passed through the ``spanish`` gettext translation. The province is
    derived from the validated postcode. District, quarter, street and number
    are always None for GeoNames results.
    """
    geonames_country_code_str = geonames_raw_json.get("country_code")

    country_name = None
    if geonames_country_code_str:
        try:
            match = pycountry.countries.get(alpha_2=geonames_country_code_str.upper())
            if match:
                country_name = spanish.gettext(match.name)
        except LookupError:
            logger.warning(f"Country name not found for code: {geonames_country_code_str} using pycountry.")

    postcode = self.geonames.validate_postcode(str(geonames_raw_json.get("postal_code", "")))
    province = None
    if postcode:
        province = self.geonames.get_province_from_postcode(postcode)

    return {
        "country": country_name,
        "country_code": (geonames_country_code_str or "").lower(),
        "state": geonames_raw_json.get("community"),
        "province": province,
        "city": geonames_raw_json.get("place_name"),
        "postcode": postcode,
        # GeoNames offers nothing below city level.
        **dict.fromkeys(("district", "quarter", "street", "number")),
    }
|
|
126
|
+
|
|
127
|
+
def _get_empty_address_result(self) -> Dict[str, None]:
|
|
128
|
+
return {
|
|
129
|
+
"country": None,
|
|
130
|
+
"country_code": None,
|
|
131
|
+
"state": None,
|
|
132
|
+
"province": None,
|
|
133
|
+
"city": None,
|
|
134
|
+
"postcode": None,
|
|
135
|
+
"district": None,
|
|
136
|
+
"quarter": None,
|
|
137
|
+
"street": None,
|
|
138
|
+
"number": None,
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
def _select_postcode_and_derived_province(
    self, parsed_nominatim_result: Dict[str, Optional[str]], parsed_geonames_result: Dict[str, Optional[str]], nominatim_address_province_raw: Optional[str]
) -> Tuple[Optional[str], Optional[str]]:
    """
    Pick the postcode whose derived province agrees with Nominatim's raw province.

    Each service's postcode is mapped to a province; that province is
    normalized and compared with the normalized raw Nominatim province using
    Jaro-Winkler similarity. A score strictly above JARO_WINKLER_THRESHOLD
    counts as agreement. Nominatim's postcode wins when it agrees; otherwise
    GeoNames' postcode is used if it agrees.

    Returns:
        ``(postcode, province_derived_from_that_postcode)``, or
        ``(None, None)`` when neither candidate agrees. That includes every
        case where the raw Nominatim province is missing or normalizes to an
        empty string.
    """
    nominatim_postcode = parsed_nominatim_result.get("postcode")
    geonames_postcode = parsed_geonames_result.get("postcode")
    nominatim_province = self.geonames.get_province_from_postcode(nominatim_postcode)
    geonames_province = self.geonames.get_province_from_postcode(geonames_postcode)

    reference = normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""

    def agrees_with_reference(province: Optional[str]) -> bool:
        # An empty side can never agree; this also keeps the similarity
        # function from being called with empty strings.
        candidate = normalize(province) if province else ""
        if not (candidate and reference):
            return False
        return jaro_winkler_similarity(candidate, reference) > JARO_WINKLER_THRESHOLD

    nominatim_agrees = agrees_with_reference(nominatim_province)
    geonames_agrees = agrees_with_reference(geonames_province)

    if nominatim_agrees:
        return nominatim_postcode, nominatim_province
    if geonames_agrees:
        return geonames_postcode, geonames_province
    return None, None
|
|
192
|
+
|
|
193
|
+
def _select_final_result(
|
|
194
|
+
self,
|
|
195
|
+
parsed_nominatim_result: Dict[str, Optional[str]],
|
|
196
|
+
parsed_geonames_result: Dict[str, Optional[str]],
|
|
197
|
+
dist_nominatim: float,
|
|
198
|
+
dist_geonames: float,
|
|
199
|
+
authoritative_postcode: Optional[str],
|
|
200
|
+
authoritative_province_from_postcode: Optional[str],
|
|
201
|
+
nominatim_address_province_raw: Optional[str],
|
|
202
|
+
) -> Dict[str, Optional[str]]:
|
|
203
|
+
"""
|
|
204
|
+
Selects the final address result based on distances and applies the authoritative postcode/province.
|
|
205
|
+
"""
|
|
206
|
+
if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
|
|
207
|
+
final_result = parsed_nominatim_result
|
|
208
|
+
final_result["postcode"] = authoritative_postcode
|
|
209
|
+
final_result["province"] = nominatim_address_province_raw
|
|
210
|
+
elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
|
|
211
|
+
final_result = parsed_geonames_result
|
|
212
|
+
final_result["postcode"] = authoritative_postcode
|
|
213
|
+
final_result["province"] = authoritative_province_from_postcode
|
|
214
|
+
else:
|
|
215
|
+
final_result = self._get_empty_address_result()
|
|
216
|
+
return final_result
|
|
217
|
+
|
|
218
|
+
def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
|
|
219
|
+
district = self._get_attribute(raw_json, ["city_district", "suburb", "borough"])
|
|
220
|
+
quarter = self._get_attribute(raw_json, ["quarter", "neighbourhood"])
|
|
60
221
|
|
|
61
|
-
|
|
222
|
+
if not district and quarter:
|
|
223
|
+
district = quarter
|
|
224
|
+
quarter = None
|
|
225
|
+
|
|
226
|
+
return district, quarter
|
|
227
|
+
|
|
228
|
+
def geocode(self, address: str) -> List[Dict[str, Any]]:
    """Search Nominatim for a free-text address.

    Args:
        address: Raw, un-encoded address text.

    Returns:
        The decoded JSON body of the ``/search`` response.
    """
    # Let requests build the query string: characters such as "&", "#" or "+"
    # inside the address would otherwise be read as URL syntax and truncate
    # or alter the query.
    # NOTE(review): callers that pre-encoded the address themselves would now
    # be encoded twice - confirm none do.
    response = requests.get(
        f"{self.endpoint}/search",
        params={"q": address, "format": "json"},
        timeout=30,
    )
    return response.json()
|
|
230
|
+
|
|
231
|
+
def geocode_parsed(self, address: str) -> Optional[Dict[str, Optional[str]]]:
    """Geocode *address* and return the parsed address of its first match.

    Returns:
        The result of ``reverse_parsed`` for the first match's ``lat``/``lon``,
        or None when the search yields nothing.
    """
    matches = self.geocode(address)
    if not matches:
        return None
    first = matches[0]
    return self.reverse_parsed(first["lat"], first["lon"])
|
|
66
236
|
|
|
67
|
-
def reverse(self, lat, lon):
|
|
68
|
-
return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json").json()
|
|
237
|
+
def reverse(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Any]:
    """Call the Nominatim ``/reverse`` endpoint (JSON format) for a coordinate pair.

    Args:
        lat: Latitude, interpolated verbatim into the query string.
        lon: Longitude, interpolated verbatim into the query string.

    Returns:
        The decoded JSON body of the response.
    """
    url = f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json"
    response = requests.get(url, timeout=30)
    return response.json()
|
|
239
|
+
|
|
240
|
+
def reverse_parsed(self, lat: Union[float, str], lon: Union[float, str]) -> Dict[str, Optional[str]]:
    """Reverse-geocode a coordinate pair into the common address dictionary.

    Nominatim and GeoNames are both queried. The response whose own
    coordinates lie closest to the input supplies the address fields, and the
    postcode is the one chosen by ``_select_postcode_and_derived_province``.

    Args:
        lat: Latitude as a float or a string convertible to float.
        lon: Longitude as a float or a string convertible to float.

    Returns:
        Dict with keys country, country_code, state, province, city,
        postcode, district, quarter, street and number. Every value is None
        when the input cannot be converted to floats or when neither service
        returns usable coordinates.
    """
    # Validate the input before any network traffic: invalid coordinates
    # always end in the empty result, so querying first only wastes two
    # HTTP round-trips.
    try:
        input_coords = (float(lat), float(lon))
    except (ValueError, TypeError):
        logger.error(f"Invalid input coordinates for distance calculation: lat={lat}, lon={lon}")
        return self._get_empty_address_result()

    nominatim_response = self.reverse(lat, lon)
    geonames_response = self.geonames.reverse(lat, lon)

    # Initial parsing
    parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
    parsed_geonames_result = self._parse_geonames_result(geonames_response)

    # Determine authoritative postcode
    raw_nominatim_province = nominatim_response.get("address", {}).get("province")
    selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
        parsed_nominatim_result, parsed_geonames_result, raw_nominatim_province
    )

    # Calculate distances from the input point to each service's own coordinates
    dist_nominatim = self._calculate_distance(
        nominatim_response.get("lat"), nominatim_response.get("lon"), input_coords
    )
    dist_geonames = self._calculate_distance(
        geonames_response.get("lat"), geonames_response.get("lon"), input_coords
    )

    # Select final result
    return self._select_final_result(
        parsed_nominatim_result,
        parsed_geonames_result,
        dist_nominatim,
        dist_geonames,
        selected_postcode,
        selected_province_from_postcode,
        raw_nominatim_province,
    )
|
|
177
282
|
|
|
178
283
|
|
|
179
284
|
class NominatimInterface(Nominatim):
|
|
180
|
-
def __init__(self, config):
|
|
285
|
+
def __init__(self, config: Dict[str, Any]) -> None:
|
|
181
286
|
if "osm" in config:
|
|
182
287
|
self.config = config["osm"]
|
|
183
288
|
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
########################################################################################################################
|
|
2
2
|
# IMPORTS
|
|
3
3
|
|
|
4
|
+
import unicodedata
|
|
4
5
|
from enum import Enum, auto
|
|
5
6
|
from typing import Any
|
|
6
|
-
import unicodedata
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
|
-
from
|
|
10
|
-
from inflection import parameterize, underscore, titleize, camelize
|
|
9
|
+
from inflection import camelize, parameterize, titleize, underscore
|
|
11
10
|
from string_utils import prettify, strip_html
|
|
11
|
+
from unidecode import unidecode
|
|
12
12
|
|
|
13
13
|
########################################################################################################################
|
|
14
14
|
# CLASSES
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
########################################################################################################################
|
|
2
|
+
# IMPORTS
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import warnings
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PiiDependenciesMissingError(ImportError):
    """Raised when the optional PII anonymization dependencies cannot be imported."""
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SpacyModelNotFoundError(ImportError):
    """Raised when a spaCy model required by ``PiiAnonymizer`` is not installed."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
import phonenumbers
|
|
19
|
+
import spacy
|
|
20
|
+
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
|
|
21
|
+
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
|
22
|
+
from presidio_analyzer.predefined_recognizers import PhoneRecognizer
|
|
23
|
+
from presidio_anonymizer import AnonymizerEngine
|
|
24
|
+
from spacy.language import Language
|
|
25
|
+
from spacy_langdetect import LanguageDetector
|
|
26
|
+
except ImportError as e:
|
|
27
|
+
raise PiiDependenciesMissingError(
|
|
28
|
+
"One or more PII anonymization dependencies are missing. "
|
|
29
|
+
"Please install them by running: pip install datamarket[pii]\n"
|
|
30
|
+
f"Original error: {e}"
|
|
31
|
+
) from e
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
########################################################################################################################
|
|
35
|
+
# SETTINGS
|
|
36
|
+
|
|
37
|
+
# Module-scoped logger: records still propagate to the root logger's handlers,
# but carry this module's name so applications can filter or silence them.
logger = logging.getLogger(__name__)
# Only errors from Presidio's analyzer logger are let through.
logging.getLogger("presidio-analyzer").setLevel(logging.ERROR)

# Ignore UserWarnings whose message starts with "[W108]" when they are raised
# from spaCy's lemmatizer module.
warnings.filterwarnings(
    "ignore",
    message=r"\[W108\]",
    category=UserWarning,
    module="spacy.pipeline.lemmatizer",
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@Language.factory("language_detector")
def get_lang_detector(nlp: Language, name: str) -> LanguageDetector:
    """Build the component registered with spaCy as ``"language_detector"``.

    Neither the *nlp* pipeline nor the component *name* is used here.
    """
    detector = LanguageDetector()
    return detector
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
########################################################################################################################
|
|
54
|
+
# CLASSES
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class PiiAnonymizer:
    """Detect and anonymize PII in Spanish or English text with Presidio and spaCy."""

    # Language codes the analyzer is configured for.
    SUPPORTED_LANG = ["es", "en"]

    def __init__(self):
        """Verify the required spaCy models and build the Presidio engines.

        Raises:
            SpacyModelNotFoundError: If ``en_core_web_md`` or
                ``es_core_news_md`` is not installed; the message lists the
                install command for every missing model.
        """
        # Check for required spaCy models
        required_models = {
            "en_core_web_md": "python -m spacy download en_core_web_md",
            "es_core_news_md": "python -m spacy download es_core_news_md",
        }
        missing_models_instructions = [
            f"Model '{model_name}' not found. Please install it by running: {install_command}"
            for model_name, install_command in required_models.items()
            if not spacy.util.is_package(model_name)
        ]
        if missing_models_instructions:
            raise SpacyModelNotFoundError("\n".join(missing_models_instructions))

        self.anonymizer = AnonymizerEngine()
        self.analyzer = self._load_analyzer_engine()

        self.nlp = self._nlp()

    def _nlp(self) -> Language:
        """Build the blank English pipeline used only for language detection.

        The pipeline reuses the vocab of the analyzer's English model and
        gets a sentencizer and the ``language_detector`` component when those
        factories are available.
        """
        english_model = self.analyzer.nlp_engine.nlp.get("en")
        pipeline = spacy.blank("en", vocab=english_model.vocab)

        for component, options in (("sentencizer", {}), ("language_detector", {"last": True})):
            if pipeline.has_factory(component):
                pipeline.add_pipe(component, **options)

        return pipeline

    @staticmethod
    def _nlp_config():
        """Return the Presidio NLP-engine configuration (spaCy, es + en models)."""
        models = [
            {"lang_code": "es", "model_name": "es_core_news_md"},
            {"lang_code": "en", "model_name": "en_core_web_md"},
        ]
        return {"nlp_engine_name": "spacy", "models": models}

    def _load_analyzer_engine(self) -> AnalyzerEngine:
        """Create the Presidio analyzer with the predefined recognizers plus a Spanish phone recognizer."""
        nlp_engine = NlpEngineProvider(nlp_configuration=PiiAnonymizer._nlp_config()).create_engine()

        registry = RecognizerRegistry(supported_languages=self.SUPPORTED_LANG)
        registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=self.SUPPORTED_LANG)
        registry.add_recognizer(
            PhoneRecognizer(
                supported_language="es",
                supported_regions=phonenumbers.SUPPORTED_REGIONS,
                context=["teléfono", "móvil", "número"],
            )
        )

        return AnalyzerEngine(
            registry=registry,
            nlp_engine=nlp_engine,
            supported_languages=self.SUPPORTED_LANG,
        )

    def detect_lang(self, text: str) -> str:
        """Return the language code detected for *text*.

        Returns ``"unknown"`` (and logs an error) when the detection pipeline
        is not available on this instance.
        """
        if not (hasattr(self, "nlp") and self.nlp):
            logger.error("Language detection NLP model not initialized. Cannot detect language.")
            return "unknown"

        with self.nlp.select_pipes(enable=["tokenizer", "sentencizer", "language_detector"]):
            doc = self.nlp(text)
            return doc._.language["language"]

    def anonymize_text(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        lang: str = "unknown",
    ) -> str:
        """Return *text* with the detected PII entities anonymized.

        Args:
            text: Text to anonymize.
            entities: Entity types passed to the analyzer; None is forwarded
                unchanged.
            lang: Language code; ``"unknown"`` triggers automatic detection.

        Returns:
            The anonymized text, or an empty string when the (given or
            detected) language is not in ``SUPPORTED_LANG``.
        """
        if lang == "unknown":
            lang = self.detect_lang(text)

        # Fail safe: better to return nothing than text that was never analyzed.
        if lang not in self.SUPPORTED_LANG:
            logger.warning(f"Support for language {lang} is not implemented yet! Fail safe to empty string.")
            return ""

        findings = self.analyzer.analyze(
            text=text,
            entities=entities,
            language=lang,
        )
        return self.anonymizer.anonymize(text=text, analyzer_results=findings).text
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.
|
|
3
|
+
Version: 0.9.30
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
|
+
Home-page: https://datamarket.es
|
|
5
6
|
License: GPL-3.0-or-later
|
|
6
7
|
Author: DataMarket
|
|
7
8
|
Author-email: techsupport@datamarket.es
|
|
8
|
-
Requires-Python: >=3.12,<
|
|
9
|
+
Requires-Python: >=3.12,<3.13
|
|
9
10
|
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
|
|
10
11
|
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
11
12
|
Classifier: Operating System :: OS Independent
|
|
12
13
|
Classifier: Programming Language :: Python :: 3
|
|
13
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
15
15
|
Provides-Extra: alchemy
|
|
16
16
|
Provides-Extra: aws
|
|
17
17
|
Provides-Extra: azure-storage-blob
|
|
@@ -40,6 +40,7 @@ Provides-Extra: openpyxl
|
|
|
40
40
|
Provides-Extra: pandas
|
|
41
41
|
Provides-Extra: pandera
|
|
42
42
|
Provides-Extra: peerdb
|
|
43
|
+
Provides-Extra: pii
|
|
43
44
|
Provides-Extra: pillow
|
|
44
45
|
Provides-Extra: playwright
|
|
45
46
|
Provides-Extra: playwright-stealth
|
|
@@ -83,6 +84,7 @@ Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oau
|
|
|
83
84
|
Requires-Dist: html2text (>=2024.0.0,<2025.0.0) ; extra == "html2text"
|
|
84
85
|
Requires-Dist: httpx[http2] (>=0.28.0,<0.29.0) ; extra == "httpx"
|
|
85
86
|
Requires-Dist: inflection (>=0.5.0,<0.6.0)
|
|
87
|
+
Requires-Dist: jellyfish (>=1.0.0,<2.0.0)
|
|
86
88
|
Requires-Dist: jinja2 (>=3.0.0,<4.0.0)
|
|
87
89
|
Requires-Dist: json5 (>=0.10.0,<0.11.0) ; extra == "json5"
|
|
88
90
|
Requires-Dist: lxml[html-clean] (>=5.0.0,<6.0.0) ; extra == "lxml"
|
|
@@ -95,6 +97,8 @@ Requires-Dist: pendulum (>=3.0.0,<4.0.0)
|
|
|
95
97
|
Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
|
|
96
98
|
Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
|
|
97
99
|
Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
|
|
100
|
+
Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
|
|
101
|
+
Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
|
|
98
102
|
Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
|
|
99
103
|
Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
|
|
100
104
|
Requires-Dist: pycountry (>=24.0.0,<25.0.0)
|
|
@@ -110,14 +114,16 @@ Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
|
|
|
110
114
|
Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
|
|
111
115
|
Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
|
|
112
116
|
Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
|
|
117
|
+
Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
|
|
118
|
+
Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
|
|
113
119
|
Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
|
|
114
120
|
Requires-Dist: tenacity (>=9.0.0,<10.0.0)
|
|
121
|
+
Requires-Dist: tf-playwright-stealth (>=1.0.0,<2.0.0)
|
|
115
122
|
Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
|
|
116
123
|
Requires-Dist: typer (>=0.15.0,<0.16.0)
|
|
117
124
|
Requires-Dist: unidecode (>=1.0.0,<2.0.0)
|
|
118
125
|
Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
|
|
119
126
|
Project-URL: Documentation, https://github.com/Data-Market/datamarket
|
|
120
|
-
Project-URL: Homepage, https://datamarket.es
|
|
121
127
|
Project-URL: Repository, https://github.com/Data-Market/datamarket
|
|
122
128
|
Description-Content-Type: text/markdown
|
|
123
129
|
|
|
@@ -4,7 +4,7 @@ datamarket/interfaces/alchemy.py,sha256=4q_gLKCKPK437VKOpdBKSrCyy42P_yWxIhE7KuvH
|
|
|
4
4
|
datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
|
|
5
5
|
datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
|
|
6
6
|
datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
|
|
7
|
-
datamarket/interfaces/nominatim.py,sha256=
|
|
7
|
+
datamarket/interfaces/nominatim.py,sha256=rUnodcRKyZ_reBtyfFFjXNqP1TN0NMScW7zSGiJQ10I,12380
|
|
8
8
|
datamarket/interfaces/peerdb.py,sha256=cwYwvO740GyaPo9zLAwJsf3UeJDGDiYzjQVM9Q6s-_g,23652
|
|
9
9
|
datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
|
|
10
10
|
datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
|
|
@@ -16,10 +16,12 @@ datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,6
|
|
|
16
16
|
datamarket/utils/main.py,sha256=j8wnAxeLvijdRU9M4V6HunWH7vgWWHP4u4xamzkWcUU,7009
|
|
17
17
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
18
18
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
19
|
-
datamarket/utils/strings.py,sha256=
|
|
19
|
+
datamarket/utils/strings/__init__.py,sha256=RmyN3hKGXmUym8w5tn28yWkw2uM-b5OvntB4D0lU1eo,84
|
|
20
|
+
datamarket/utils/strings/normalization.py,sha256=337M2UPwEETvhVTOnP4w_igTXpHUHoaD8e7x_-L-Bpk,5654
|
|
21
|
+
datamarket/utils/strings/obfuscation.py,sha256=8gMepfjPq0N4_IpKR6i2dy_9VJugQ3qJiRiRvKavB3s,5246
|
|
20
22
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
21
23
|
datamarket/utils/types.py,sha256=vxdQZdwdXrfPR4Es52gBgol-tMRIOD6oK9cBo3rB0JQ,74
|
|
22
|
-
datamarket-0.9.
|
|
23
|
-
datamarket-0.9.
|
|
24
|
-
datamarket-0.9.
|
|
25
|
-
datamarket-0.9.
|
|
24
|
+
datamarket-0.9.30.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
25
|
+
datamarket-0.9.30.dist-info/METADATA,sha256=zzhHMrHhBf_CfBLwjj4melul8sCkcO8np-nmay0jKOQ,6871
|
|
26
|
+
datamarket-0.9.30.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
|
27
|
+
datamarket-0.9.30.dist-info/RECORD,,
|
|
File without changes
|