datamarket 0.9.38__tar.gz → 0.9.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (30)
  1. {datamarket-0.9.38 → datamarket-0.9.40}/PKG-INFO +5 -6
  2. {datamarket-0.9.38 → datamarket-0.9.40}/pyproject.toml +4 -4
  3. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/nominatim.py +102 -50
  4. datamarket-0.9.40/src/datamarket/utils/playwright/__init__.py +0 -0
  5. datamarket-0.9.40/src/datamarket/utils/playwright/async_api.py +23 -0
  6. datamarket-0.9.40/src/datamarket/utils/playwright/sync_api.py +23 -0
  7. {datamarket-0.9.38 → datamarket-0.9.40}/LICENSE +0 -0
  8. {datamarket-0.9.38 → datamarket-0.9.40}/README.md +0 -0
  9. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/__init__.py +0 -0
  10. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/__init__.py +0 -0
  11. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/alchemy.py +0 -0
  12. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/aws.py +0 -0
  13. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/drive.py +0 -0
  14. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/ftp.py +0 -0
  15. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/peerdb.py +0 -0
  16. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/proxy.py +0 -0
  17. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/interfaces/tinybird.py +0 -0
  18. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/params/__init__.py +0 -0
  19. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/params/nominatim.py +0 -0
  20. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/__init__.py +0 -0
  21. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/airflow.py +0 -0
  22. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/alchemy.py +0 -0
  23. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/main.py +0 -0
  24. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/selenium.py +0 -0
  25. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/soda.py +0 -0
  26. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/strings/__init__.py +0 -0
  27. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/strings/normalization.py +0 -0
  28. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/strings/obfuscation.py +0 -0
  29. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/typer.py +0 -0
  30. {datamarket-0.9.38 → datamarket-0.9.40}/src/datamarket/utils/types.py +0 -0
@@ -1,8 +1,7 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.38
3
+ Version: 0.9.40
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
- Home-page: https://datamarket.es
6
5
  License: GPL-3.0-or-later
7
6
  Author: DataMarket
8
7
  Author-email: techsupport@datamarket.es
@@ -97,7 +96,7 @@ Requires-Dist: pandas (>=2.0.0,<3.0.0) ; extra == "pandas"
97
96
  Requires-Dist: pandera (>=0.22.0,<0.23.0) ; extra == "pandera"
98
97
  Requires-Dist: pendulum (>=3.0.0,<4.0.0)
99
98
  Requires-Dist: pillow (>=11.0.0,<12.0.0) ; extra == "pillow"
100
- Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
99
+ Requires-Dist: playwright (==1.47.0) ; extra == "playwright" or extra == "camoufox"
101
100
  Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
102
101
  Requires-Dist: presidio-analyzer[phonenumbers] (>=2.0.0,<3.0.0) ; extra == "pii"
103
102
  Requires-Dist: presidio-anonymizer (>=2.0.0,<3.0.0) ; extra == "pii"
@@ -114,18 +113,18 @@ Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
114
113
  Requires-Dist: requests (>=2.0.0,<3.0.0)
115
114
  Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
116
115
  Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
117
- Requires-Dist: soda-core-mysql (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
116
+ Requires-Dist: soda-core-mysql-utf8-hotfix (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
118
117
  Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
119
118
  Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
120
119
  Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
121
120
  Requires-Dist: stem (>=1.0.0,<2.0.0) ; extra == "stem" or extra == "proxy"
122
121
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
123
- Requires-Dist: tf-playwright-stealth (>=1.0.0,<2.0.0)
124
122
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
125
123
  Requires-Dist: typer (>=0.15.0,<0.16.0)
126
124
  Requires-Dist: unidecode (>=1.0.0,<2.0.0)
127
125
  Requires-Dist: xmltodict (>=0.14.0,<0.15.0) ; extra == "xmltodict"
128
126
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
127
+ Project-URL: Homepage, https://datamarket.es
129
128
  Project-URL: Repository, https://github.com/Data-Market/datamarket
130
129
  Description-Content-Type: text/markdown
131
130
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.38"
3
+ version = "0.9.40"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -43,7 +43,7 @@ pytest = { version = "^8.0.0", optional = true }
43
43
  playwright = { version = "1.47.0", optional = true }
44
44
  tf-playwright-stealth = { version = "^1.0.0", optional = true }
45
45
  soda-core-postgres = { version = "^3.0.0", optional = true }
46
- soda-core-mysql = { version = "^3.0.0", optional = true }
46
+ soda-core-mysql-utf8-hotfix = { version = "^3.0.0", optional = true }
47
47
  fake-useragent = { version = "^2.0.0", optional = true }
48
48
  pydrive2 = { version = "^1.0.0", optional = true }
49
49
  clickhouse-driver = { version = "~0.2.0", optional = true }
@@ -94,7 +94,7 @@ pytest = ["pytest"]
94
94
  playwright = ["playwright"]
95
95
  playwright-stealth = ["playwright-stealth"]
96
96
  soda-core-postgres = ["soda-core-postgres"]
97
- soda-core-mysql = ["soda-core-mysql"]
97
+ soda-core-mysql = ["soda-core-mysql-utf8-hotfix"]
98
98
  fake-useragent = ["fake-useragent"]
99
99
  pydrive2 = ["pydrive2"]
100
100
  clickhouse-driver = ["clickhouse-driver"]
@@ -126,7 +126,7 @@ google-auth-oauthlib = ["google-auth-oauthlib"]
126
126
  dnspython = ["dnspython"]
127
127
  openpyxl = ["openpyxl"]
128
128
  httpx = ["httpx"]
129
- camoufox = ["camoufox"]
129
+ camoufox = ["camoufox", "browserforge", "playwright"]
130
130
  pandarallel = ["pandarallel"]
131
131
 
132
132
  # Interface groups
@@ -17,6 +17,7 @@ from ..utils.strings import normalize
17
17
  # PARAMETERS
18
18
 
19
19
  JARO_WINKLER_THRESHOLD = 0.85
20
+ CLOSE_KM = 2.0
20
21
 
21
22
  ########################################################################################################################
22
23
  # CLASSES
@@ -145,55 +146,86 @@ class Nominatim:
145
146
  parsed_nominatim_result: Dict[str, Optional[str]],
146
147
  parsed_geonames_result: Dict[str, Optional[str]],
147
148
  nominatim_address_province_raw: Optional[str],
148
- ) -> Tuple[Optional[str], Optional[str]]:
149
+ dist_nominatim: float, # distance Nominatim ↔ input (km)
150
+ dist_geonames: float, # distance GeoNames ↔ input (km)
151
+ ) -> Tuple[Optional[str], Optional[str], Optional[str]]:
149
152
  """
150
- Determines the postcode and its derived province based on comparisons
151
- between Nominatim and GeoNames data, and Nominatim's raw address province.
153
+ Decide the authoritative postcode, the province derived from it and the associated state.
154
+
155
+ Strategy:
156
+ 1. Derive province from each postcode.
157
+ 2. Validate each postcode–province pair:
158
+ • Nominatim: compare with raw province string (if present).
159
+ • GeoNames: multi-step validation (raw province, then Nominatim-derived
160
+ province when Nominatim coords are close, then distance fallback).
161
+ 3. Return the postcode/province that passes validation with precedence:
162
+ Nominatim > GeoNames. Returns (None, None, None) if neither passes.
152
163
  """
164
+
165
+ # --- Extract postcodes ---
153
166
  nominatim_postcode = parsed_nominatim_result.get("postcode")
154
167
  geonames_postcode = parsed_geonames_result.get("postcode")
155
168
 
156
- province_from_nominatim_postcode = self.geonames.get_province_from_postcode(nominatim_postcode)
157
- province_from_geonames_postcode = self.geonames.get_province_from_postcode(geonames_postcode)
169
+ # --- Province derived from each postcode ---
170
+ province_from_nominatim_pc = self.geonames.get_province_from_postcode(nominatim_postcode)
171
+ province_from_geonames_pc = self.geonames.get_province_from_postcode(geonames_postcode)
158
172
 
159
- norm_raw_nominatim_province = (
160
- normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
161
- )
162
- norm_province_from_nominatim_postcode = (
163
- normalize(province_from_nominatim_postcode) if province_from_nominatim_postcode else ""
164
- )
165
- norm_province_from_geonames_postcode = (
166
- normalize(province_from_geonames_postcode) if province_from_geonames_postcode else ""
167
- )
173
+ # --- Normalised strings for similarity comparisons ---
174
+ norm_raw_province = normalize(nominatim_address_province_raw) if nominatim_address_province_raw else ""
175
+ norm_province_from_nominatim_pc = normalize(province_from_nominatim_pc) if province_from_nominatim_pc else ""
176
+ norm_province_from_geonames_pc = normalize(province_from_geonames_pc) if province_from_geonames_pc else ""
168
177
 
169
- selected_postcode = None
170
- selected_province_from_postcode = None
178
+ # --- Distance heuristics ---
179
+ nominatim_is_close = dist_nominatim < CLOSE_KM
180
+ geonames_is_close = dist_geonames < CLOSE_KM
171
181
 
172
- # If provinces derived from Nominatim and GeoNames postcodes differ
173
- nominatim_postcode_province_matches = False
174
- if norm_province_from_nominatim_postcode and norm_raw_nominatim_province:
175
- nominatim_postcode_province_matches = (
176
- jaro_winkler_similarity(norm_province_from_nominatim_postcode, norm_raw_nominatim_province)
177
- > JARO_WINKLER_THRESHOLD
182
+ # --- Validate Nominatim postcode ---
183
+ nominatim_pc_valid = False
184
+ if norm_province_from_nominatim_pc and norm_raw_province:
185
+ nominatim_pc_valid = (
186
+ jaro_winkler_similarity(norm_province_from_nominatim_pc, norm_raw_province) > JARO_WINKLER_THRESHOLD
178
187
  )
179
188
 
180
- geonames_postcode_province_matches = False
181
- if norm_province_from_geonames_postcode and norm_raw_nominatim_province:
182
- geonames_postcode_province_matches = (
183
- jaro_winkler_similarity(norm_province_from_geonames_postcode, norm_raw_nominatim_province)
184
- > JARO_WINKLER_THRESHOLD
185
- )
189
+ # --- Validate GeoNames postcode ---
190
+ geonames_pc_valid = False
186
191
 
187
- # Prefer GeoNames postcode if its province matches Nominatim's raw address province,
188
- # and Nominatim's own postcode-derived province does not.
189
- if nominatim_postcode_province_matches:
190
- selected_postcode = nominatim_postcode
191
- selected_province_from_postcode = province_from_nominatim_postcode
192
- if geonames_postcode_province_matches and not nominatim_postcode_province_matches:
193
- selected_postcode = geonames_postcode
194
- selected_province_from_postcode = province_from_geonames_postcode
192
+ # 1) Compare with raw province string (if exists)
193
+ if norm_province_from_geonames_pc and norm_raw_province:
194
+ geonames_pc_valid = (
195
+ jaro_winkler_similarity(norm_province_from_geonames_pc, norm_raw_province) > JARO_WINKLER_THRESHOLD
196
+ )
195
197
 
196
- return selected_postcode, selected_province_from_postcode
198
+ # 2) If no raw province, compare with province from Nominatim PC **only when** Nominatim is close
199
+ if not geonames_pc_valid and not norm_raw_province and nominatim_is_close:
200
+ if norm_province_from_geonames_pc and norm_province_from_nominatim_pc:
201
+ geonames_pc_valid = (
202
+ jaro_winkler_similarity(norm_province_from_geonames_pc, norm_province_from_nominatim_pc)
203
+ > JARO_WINKLER_THRESHOLD
204
+ )
205
+
206
+ # 3) Fallback: accept GeoNames PC if its coordinates are very close
207
+ if not geonames_pc_valid and geonames_is_close and geonames_postcode:
208
+ geonames_pc_valid = True
209
+
210
+ # --- Select authoritative tuple ---
211
+ postcode = None
212
+ province = None
213
+ state = None
214
+
215
+ if nominatim_pc_valid:
216
+ postcode = nominatim_postcode
217
+ province = province_from_nominatim_pc
218
+ state = parsed_nominatim_result.get("state")
219
+ if not state and geonames_pc_valid:
220
+ state = parsed_geonames_result.get("state")
221
+ elif geonames_pc_valid:
222
+ postcode = geonames_postcode
223
+ province = province_from_geonames_pc
224
+ state = parsed_geonames_result.get("state")
225
+ if not state and nominatim_pc_valid:
226
+ state = parsed_nominatim_result.get("state")
227
+
228
+ return postcode, province, state
197
229
 
198
230
  def _select_final_result(
199
231
  self,
@@ -203,21 +235,41 @@ class Nominatim:
203
235
  dist_geonames: float,
204
236
  authoritative_postcode: Optional[str],
205
237
  authoritative_province_from_postcode: Optional[str],
206
- nominatim_province: Optional[str],
238
+ authoritative_state: Optional[str],
207
239
  ) -> Dict[str, Optional[str]]:
208
240
  """
209
- Selects the final address result based on distances and applies the authoritative postcode/province.
241
+ Choose the address block (Nominatim vs GeoNames) based on distance,
242
+ then apply the authoritative postcode/province.
243
+
244
+ Rules:
245
+ • Pick the source with the smaller finite distance.
246
+ • Always overwrite 'postcode' if authoritative_postcode is present.
247
+ • Overwrite 'province' only when authoritative_province_from_postcode is not None.
248
+ • If both distances are ∞, return an empty address.
210
249
  """
250
+
251
+ # ------------------------------------------------------------------ #
252
+ # 1. Decide the base address block #
253
+ # ------------------------------------------------------------------ #
211
254
  if dist_nominatim <= dist_geonames and dist_nominatim != float("inf"):
212
255
  final_result = parsed_nominatim_result
213
- final_result["postcode"] = authoritative_postcode
214
- final_result["province"] = nominatim_province
215
256
  elif dist_geonames < dist_nominatim and dist_geonames != float("inf"):
216
257
  final_result = parsed_geonames_result
258
+ else:
259
+ return self._get_empty_address_result()
260
+
261
+ # ------------------------------------------------------------------ #
262
+ # 2. Apply authoritative postcode / province #
263
+ # ------------------------------------------------------------------ #
264
+ if authoritative_postcode:
217
265
  final_result["postcode"] = authoritative_postcode
266
+
267
+ if authoritative_province_from_postcode:
218
268
  final_result["province"] = authoritative_province_from_postcode
219
- else:
220
- final_result = self._get_empty_address_result()
269
+
270
+ if authoritative_province_from_postcode:
271
+ final_result["state"] = authoritative_state
272
+
221
273
  return final_result
222
274
 
223
275
  def _get_district_quarter(self, raw_json: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
@@ -250,12 +302,6 @@ class Nominatim:
250
302
  parsed_nominatim_result = self._parse_nominatim_result(nominatim_response)
251
303
  parsed_geonames_result = self._parse_geonames_result(geonames_response)
252
304
 
253
- # Determine authoritative postcode
254
- nominatim_province = parsed_nominatim_result.get("province")
255
- selected_postcode, selected_province_from_postcode = self._select_postcode_and_derived_province(
256
- parsed_nominatim_result, parsed_geonames_result, nominatim_province
257
- )
258
-
259
305
  # Calculate distances
260
306
  nominatim_response_lat = nominatim_response.get("lat")
261
307
  nominatim_response_lon = nominatim_response.get("lon")
@@ -272,6 +318,12 @@ class Nominatim:
272
318
  dist_nominatim = self._calculate_distance(nominatim_response_lat, nominatim_response_lon, input_coords)
273
319
  dist_geonames = self._calculate_distance(geonames_response_lat, geonames_response_lon, input_coords)
274
320
 
321
+ # Determine authoritative postcode
322
+ nominatim_province = parsed_nominatim_result.get("province")
323
+ selected_postcode, selected_province_from_postcode, selected_state = self._select_postcode_and_derived_province(
324
+ parsed_nominatim_result, parsed_geonames_result, nominatim_province, dist_nominatim, dist_geonames
325
+ )
326
+
275
327
  # Select final result
276
328
  final_result = self._select_final_result(
277
329
  parsed_nominatim_result,
@@ -280,7 +332,7 @@ class Nominatim:
280
332
  dist_geonames,
281
333
  selected_postcode,
282
334
  selected_province_from_postcode,
283
- nominatim_province,
335
+ selected_state,
284
336
  )
285
337
 
286
338
  return final_result
@@ -0,0 +1,23 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import asyncio
5
+ from random import randint
6
+
7
+ from playwright.async_api import Page
8
+
9
+
10
+ ########################################################################################################################
11
+ # FUNCTIONS
12
+
13
+
14
+ async def human_type(page: Page, text: str, delay: int = 100):
15
+ for char in text:
16
+ await page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
17
+
18
+
19
+ async def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, sleep=True):
20
+ for _ in range(count):
21
+ await page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
22
+ if sleep:
23
+ await asyncio.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000) # noqa: S311
@@ -0,0 +1,23 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import time
5
+ from random import randint
6
+
7
+ from playwright.sync_api import Page
8
+
9
+
10
+ ########################################################################################################################
11
+ # FUNCTIONS
12
+
13
+
14
+ def human_type(page: Page, text: str, delay: int = 100):
15
+ for char in text:
16
+ page.keyboard.type(char, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
17
+
18
+
19
+ def human_press_key(page: Page, key: str, count: int = 1, delay: int = 100, sleep=True):
20
+ for _ in range(count):
21
+ page.keyboard.press(key, delay=randint(int(delay * 0.5), int(delay * 1.5))) # noqa: S311
22
+ if sleep:
23
+ time.sleep(randint(int(delay * 1.5), int(delay * 2.5)) / 1000) # noqa: S311
File without changes
File without changes