datamarket 0.9.45__tar.gz → 0.9.47__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (32)
  1. {datamarket-0.9.45 → datamarket-0.9.47}/PKG-INFO +3 -1
  2. {datamarket-0.9.45 → datamarket-0.9.47}/pyproject.toml +3 -1
  3. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/nominatim.py +22 -1
  4. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/params/nominatim.py +22 -0
  5. {datamarket-0.9.45 → datamarket-0.9.47}/LICENSE +0 -0
  6. {datamarket-0.9.45 → datamarket-0.9.47}/README.md +0 -0
  7. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/__init__.py +0 -0
  8. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/exceptions/__init__.py +0 -0
  9. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/exceptions/main.py +0 -0
  10. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/__init__.py +0 -0
  11. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/alchemy.py +0 -0
  12. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/aws.py +0 -0
  13. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/drive.py +0 -0
  14. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/ftp.py +0 -0
  15. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/peerdb.py +0 -0
  16. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/proxy.py +0 -0
  17. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/tinybird.py +0 -0
  18. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/params/__init__.py +0 -0
  19. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/__init__.py +0 -0
  20. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/airflow.py +0 -0
  21. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/alchemy.py +0 -0
  22. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/main.py +0 -0
  23. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/playwright/__init__.py +0 -0
  24. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/playwright/async_api.py +0 -0
  25. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/playwright/sync_api.py +0 -0
  26. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/selenium.py +0 -0
  27. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/soda.py +0 -0
  28. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/strings/__init__.py +0 -0
  29. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/strings/normalization.py +0 -0
  30. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/strings/obfuscation.py +0 -0
  31. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/typer.py +0 -0
  32. {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.45
3
+ Version: 0.9.47
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -54,6 +54,7 @@ Provides-Extra: pyspark
54
54
  Provides-Extra: pytest
55
55
  Provides-Extra: rapidfuzz
56
56
  Provides-Extra: retry
57
+ Provides-Extra: rnet
57
58
  Provides-Extra: shapely
58
59
  Provides-Extra: soda-core-mysql
59
60
  Provides-Extra: soda-core-postgres
@@ -115,6 +116,7 @@ Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
115
116
  Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
116
117
  Requires-Dist: requests (>=2.0.0,<3.0.0)
117
118
  Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
119
+ Requires-Dist: rnet (>=2.0.0,<3.0.0) ; extra == "rnet"
118
120
  Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
119
121
  Requires-Dist: soda-core-mysql-utf8-hotfix (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
120
122
  Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.45"
3
+ version = "0.9.47"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -86,6 +86,7 @@ pandarallel = { version = "^1.0.0", optional = true }
86
86
  pyrate-limiter = { version = "^3.0.0", optional = true }
87
87
  pyproj = { version = "^3.0.0", optional = true }
88
88
  sqlparse = { version = "~0.5.0", optional = true }
89
+ rnet = { version = "^2.0.0", optional = true }
89
90
 
90
91
  [tool.poetry.extras]
91
92
  boto3 = ["boto3"]
@@ -133,6 +134,7 @@ pandarallel = ["pandarallel"]
133
134
  pyrate-limiter = ["pyrate-limiter"]
134
135
  pyproj = ["pyproj"]
135
136
  sqlparse = ["sqlparse"]
137
+ rnet = ["rnet"]
136
138
 
137
139
  # Interface groups
138
140
  aws = ["boto3"]
@@ -10,7 +10,7 @@ import requests
10
10
  from geopy.distance import geodesic
11
11
  from jellyfish import jaro_winkler_similarity
12
12
 
13
- from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES
13
+ from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES, STATES
14
14
  from ..utils.strings import normalize
15
15
 
16
16
  ########################################################################################################################
@@ -141,6 +141,24 @@ class Nominatim:
141
141
  "number": None,
142
142
  }
143
143
 
144
+ @staticmethod
145
+ def _canonicalize_state(state: Optional[str]) -> Optional[str]:
146
+ """
147
+ Canonicalize the state name using similarity. The most similar canonical state name is
148
+ returned if the similarity score is above the threshold.
149
+ """
150
+ if not state:
151
+ return None
152
+ norm_state = normalize(state)
153
+ best_match = None
154
+ best_score = 0.0
155
+ for canonical in STATES:
156
+ score = jaro_winkler_similarity(norm_state, normalize(canonical))
157
+ if score > best_score:
158
+ best_score = score
159
+ best_match = canonical
160
+ return best_match if best_score > JARO_WINKLER_THRESHOLD else None
161
+
144
162
  def _select_postcode_and_derived_province(
145
163
  self,
146
164
  parsed_nominatim_result: Dict[str, Optional[str]],
@@ -225,6 +243,9 @@ class Nominatim:
225
243
  if not state and nominatim_pc_valid:
226
244
  state = parsed_nominatim_result.get("state")
227
245
 
246
+ # Canonicalize
247
+ state = self._canonicalize_state(state)
248
+
228
249
  return postcode, province, state
229
250
 
230
251
  def _select_final_result(
@@ -54,3 +54,25 @@ POSTCODES = {
54
54
  "51": "Ceuta",
55
55
  "52": "Melilla",
56
56
  }
57
+
58
+ STATES = {
59
+ "Andalucía",
60
+ "Aragón",
61
+ "Asturias",
62
+ "Baleares",
63
+ "Canarias",
64
+ "Cantabria",
65
+ "Castilla-La Mancha",
66
+ "Castilla y León",
67
+ "Cataluña",
68
+ "Ceuta",
69
+ "Comunidad Valenciana",
70
+ "Extremadura",
71
+ "Galicia",
72
+ "La Rioja",
73
+ "Madrid",
74
+ "Melilla",
75
+ "Murcia",
76
+ "Navarra",
77
+ "País Vasco"
78
+ }
File without changes
File without changes