datamarket 0.9.44__tar.gz → 0.9.46__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. See the registry's page for this release for more details (the original link was not preserved in this text export).

Files changed (32)
  1. {datamarket-0.9.44 → datamarket-0.9.46}/PKG-INFO +3 -1
  2. {datamarket-0.9.44 → datamarket-0.9.46}/pyproject.toml +3 -1
  3. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/nominatim.py +22 -1
  4. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/params/nominatim.py +22 -0
  5. {datamarket-0.9.44 → datamarket-0.9.46}/LICENSE +0 -0
  6. {datamarket-0.9.44 → datamarket-0.9.46}/README.md +0 -0
  7. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/__init__.py +0 -0
  8. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/exceptions/__init__.py +0 -0
  9. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/exceptions/main.py +0 -0
  10. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/__init__.py +0 -0
  11. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/alchemy.py +0 -0
  12. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/aws.py +0 -0
  13. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/drive.py +0 -0
  14. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/ftp.py +0 -0
  15. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/peerdb.py +0 -0
  16. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/proxy.py +0 -0
  17. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/interfaces/tinybird.py +0 -0
  18. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/params/__init__.py +0 -0
  19. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/__init__.py +0 -0
  20. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/airflow.py +0 -0
  21. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/alchemy.py +0 -0
  22. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/main.py +0 -0
  23. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/playwright/__init__.py +0 -0
  24. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/playwright/async_api.py +0 -0
  25. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/playwright/sync_api.py +0 -0
  26. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/selenium.py +0 -0
  27. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/soda.py +0 -0
  28. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/strings/__init__.py +0 -0
  29. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/strings/normalization.py +0 -0
  30. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/strings/obfuscation.py +0 -0
  31. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/typer.py +0 -0
  32. {datamarket-0.9.44 → datamarket-0.9.46}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.44
3
+ Version: 0.9.46
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -57,6 +57,7 @@ Provides-Extra: retry
57
57
  Provides-Extra: shapely
58
58
  Provides-Extra: soda-core-mysql
59
59
  Provides-Extra: soda-core-postgres
60
+ Provides-Extra: sqlparse
60
61
  Provides-Extra: tqdm
61
62
  Provides-Extra: undetected-chromedriver
62
63
  Provides-Extra: xmltodict
@@ -119,6 +120,7 @@ Requires-Dist: soda-core-mysql-utf8-hotfix (>=3.0.0,<4.0.0) ; extra == "soda-cor
119
120
  Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
120
121
  Requires-Dist: spacy (>=3.0.0,<4.0.0) ; extra == "pii"
121
122
  Requires-Dist: spacy-langdetect (>=0.1.0,<0.2.0) ; extra == "pii"
123
+ Requires-Dist: sqlparse (>=0.5.0,<0.6.0) ; extra == "sqlparse"
122
124
  Requires-Dist: stem (>=1.0.0,<2.0.0)
123
125
  Requires-Dist: tenacity (>=9.0.0,<10.0.0)
124
126
  Requires-Dist: tqdm (>=4.0.0,<5.0.0) ; extra == "tqdm"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.44"
3
+ version = "0.9.46"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -85,6 +85,7 @@ spacy-langdetect = { version = "~0.1.0", optional = true }
85
85
  pandarallel = { version = "^1.0.0", optional = true }
86
86
  pyrate-limiter = { version = "^3.0.0", optional = true }
87
87
  pyproj = { version = "^3.0.0", optional = true }
88
+ sqlparse = { version = "~0.5.0", optional = true }
88
89
 
89
90
  [tool.poetry.extras]
90
91
  boto3 = ["boto3"]
@@ -131,6 +132,7 @@ camoufox = ["camoufox", "browserforge", "playwright"]
131
132
  pandarallel = ["pandarallel"]
132
133
  pyrate-limiter = ["pyrate-limiter"]
133
134
  pyproj = ["pyproj"]
135
+ sqlparse = ["sqlparse"]
134
136
 
135
137
  # Interface groups
136
138
  aws = ["boto3"]
@@ -10,7 +10,7 @@ import requests
10
10
  from geopy.distance import geodesic
11
11
  from jellyfish import jaro_winkler_similarity
12
12
 
13
- from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES
13
+ from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES, STATES
14
14
  from ..utils.strings import normalize
15
15
 
16
16
  ########################################################################################################################
@@ -141,6 +141,24 @@ class Nominatim:
141
141
  "number": None,
142
142
  }
143
143
 
144
+ @staticmethod
145
+ def _canonicalize_state(state: Optional[str]) -> Optional[str]:
146
+ """
147
+ Canonicalize the state name using similarity. The most similar canonical state name is
148
+ returned if the similarity score is above the threshold.
149
+ """
150
+ if not state:
151
+ return None
152
+ norm_state = normalize(state)
153
+ best_match = None
154
+ best_score = 0.0
155
+ for canonical in STATES:
156
+ score = jaro_winkler_similarity(norm_state, normalize(canonical))
157
+ if score > best_score:
158
+ best_score = score
159
+ best_match = canonical
160
+ return best_match if best_score > JARO_WINKLER_THRESHOLD else None
161
+
144
162
  def _select_postcode_and_derived_province(
145
163
  self,
146
164
  parsed_nominatim_result: Dict[str, Optional[str]],
@@ -225,6 +243,9 @@ class Nominatim:
225
243
  if not state and nominatim_pc_valid:
226
244
  state = parsed_nominatim_result.get("state")
227
245
 
246
+ # Canonicalize
247
+ state = self._canonicalize_state(state)
248
+
228
249
  return postcode, province, state
229
250
 
230
251
  def _select_final_result(
@@ -54,3 +54,25 @@ POSTCODES = {
54
54
  "51": "Ceuta",
55
55
  "52": "Melilla",
56
56
  }
57
+
58
+ STATES = {
59
+ "Andalucía",
60
+ "Aragón",
61
+ "Asturias",
62
+ "Baleares",
63
+ "Canarias",
64
+ "Cantabria",
65
+ "Castilla-La Mancha",
66
+ "Castilla y León",
67
+ "Cataluña",
68
+ "Ceuta",
69
+ "Comunidad Valenciana",
70
+ "Extremadura",
71
+ "Galicia",
72
+ "La Rioja",
73
+ "Madrid",
74
+ "Melilla",
75
+ "Murcia",
76
+ "Navarra",
77
+ "País Vasco"
78
+ }
File without changes
File without changes