datamarket 0.9.45__tar.gz → 0.9.47__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- {datamarket-0.9.45 → datamarket-0.9.47}/PKG-INFO +3 -1
- {datamarket-0.9.45 → datamarket-0.9.47}/pyproject.toml +3 -1
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/nominatim.py +22 -1
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/params/nominatim.py +22 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/LICENSE +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/README.md +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/__init__.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/exceptions/__init__.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/exceptions/main.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/__init__.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/alchemy.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/aws.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/drive.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/ftp.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/peerdb.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/proxy.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/interfaces/tinybird.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/params/__init__.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/__init__.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/airflow.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/alchemy.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/main.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/playwright/__init__.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/playwright/async_api.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/playwright/sync_api.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/selenium.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/soda.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/strings/__init__.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/strings/normalization.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/strings/obfuscation.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/typer.py +0 -0
- {datamarket-0.9.45 → datamarket-0.9.47}/src/datamarket/utils/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.45
|
|
3
|
+
Version: 0.9.47
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
6
|
Author: DataMarket
|
|
@@ -54,6 +54,7 @@ Provides-Extra: pyspark
|
|
|
54
54
|
Provides-Extra: pytest
|
|
55
55
|
Provides-Extra: rapidfuzz
|
|
56
56
|
Provides-Extra: retry
|
|
57
|
+
Provides-Extra: rnet
|
|
57
58
|
Provides-Extra: shapely
|
|
58
59
|
Provides-Extra: soda-core-mysql
|
|
59
60
|
Provides-Extra: soda-core-postgres
|
|
@@ -115,6 +116,7 @@ Requires-Dist: python-string-utils (>=1.0.0,<2.0.0)
|
|
|
115
116
|
Requires-Dist: rapidfuzz (>=3.0.0,<4.0.0) ; extra == "rapidfuzz"
|
|
116
117
|
Requires-Dist: requests (>=2.0.0,<3.0.0)
|
|
117
118
|
Requires-Dist: retry (>=0.9.0,<0.10.0) ; extra == "retry"
|
|
119
|
+
Requires-Dist: rnet (>=2.0.0,<3.0.0) ; extra == "rnet"
|
|
118
120
|
Requires-Dist: shapely (>=2.0.0,<3.0.0) ; extra == "shapely"
|
|
119
121
|
Requires-Dist: soda-core-mysql-utf8-hotfix (>=3.0.0,<4.0.0) ; extra == "soda-core-mysql"
|
|
120
122
|
Requires-Dist: soda-core-postgres (>=3.0.0,<4.0.0) ; extra == "soda-core-postgres"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "datamarket"
|
|
3
|
-
version = "0.9.45"
|
|
3
|
+
version = "0.9.47"
|
|
4
4
|
description = "Utilities that integrate advanced scraping knowledge into just one library."
|
|
5
5
|
authors = ["DataMarket <techsupport@datamarket.es>"]
|
|
6
6
|
license = "GPL-3.0-or-later"
|
|
@@ -86,6 +86,7 @@ pandarallel = { version = "^1.0.0", optional = true }
|
|
|
86
86
|
pyrate-limiter = { version = "^3.0.0", optional = true }
|
|
87
87
|
pyproj = { version = "^3.0.0", optional = true }
|
|
88
88
|
sqlparse = { version = "~0.5.0", optional = true }
|
|
89
|
+
rnet = { version = "^2.0.0", optional = true }
|
|
89
90
|
|
|
90
91
|
[tool.poetry.extras]
|
|
91
92
|
boto3 = ["boto3"]
|
|
@@ -133,6 +134,7 @@ pandarallel = ["pandarallel"]
|
|
|
133
134
|
pyrate-limiter = ["pyrate-limiter"]
|
|
134
135
|
pyproj = ["pyproj"]
|
|
135
136
|
sqlparse = ["sqlparse"]
|
|
137
|
+
rnet = ["rnet"]
|
|
136
138
|
|
|
137
139
|
# Interface groups
|
|
138
140
|
aws = ["boto3"]
|
|
@@ -10,7 +10,7 @@ import requests
|
|
|
10
10
|
from geopy.distance import geodesic
|
|
11
11
|
from jellyfish import jaro_winkler_similarity
|
|
12
12
|
|
|
13
|
-
from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES
|
|
13
|
+
from ..params.nominatim import CITY_TO_PROVINCE, POSTCODES, STATES
|
|
14
14
|
from ..utils.strings import normalize
|
|
15
15
|
|
|
16
16
|
########################################################################################################################
|
|
@@ -141,6 +141,24 @@ class Nominatim:
|
|
|
141
141
|
"number": None,
|
|
142
142
|
}
|
|
143
143
|
|
|
144
|
+
@staticmethod
|
|
145
|
+
def _canonicalize_state(state: Optional[str]) -> Optional[str]:
|
|
146
|
+
"""
|
|
147
|
+
Canonicalize the state name using similarity. The most similar canonical state name is
|
|
148
|
+
returned if the similarity score is above the threshold.
|
|
149
|
+
"""
|
|
150
|
+
if not state:
|
|
151
|
+
return None
|
|
152
|
+
norm_state = normalize(state)
|
|
153
|
+
best_match = None
|
|
154
|
+
best_score = 0.0
|
|
155
|
+
for canonical in STATES:
|
|
156
|
+
score = jaro_winkler_similarity(norm_state, normalize(canonical))
|
|
157
|
+
if score > best_score:
|
|
158
|
+
best_score = score
|
|
159
|
+
best_match = canonical
|
|
160
|
+
return best_match if best_score > JARO_WINKLER_THRESHOLD else None
|
|
161
|
+
|
|
144
162
|
def _select_postcode_and_derived_province(
|
|
145
163
|
self,
|
|
146
164
|
parsed_nominatim_result: Dict[str, Optional[str]],
|
|
@@ -225,6 +243,9 @@ class Nominatim:
|
|
|
225
243
|
if not state and nominatim_pc_valid:
|
|
226
244
|
state = parsed_nominatim_result.get("state")
|
|
227
245
|
|
|
246
|
+
# Canonicalize
|
|
247
|
+
state = self._canonicalize_state(state)
|
|
248
|
+
|
|
228
249
|
return postcode, province, state
|
|
229
250
|
|
|
230
251
|
def _select_final_result(
|
|
@@ -54,3 +54,25 @@ POSTCODES = {
|
|
|
54
54
|
"51": "Ceuta",
|
|
55
55
|
"52": "Melilla",
|
|
56
56
|
}
|
|
57
|
+
|
|
58
|
+
STATES = {
|
|
59
|
+
"Andalucía",
|
|
60
|
+
"Aragón",
|
|
61
|
+
"Asturias",
|
|
62
|
+
"Baleares",
|
|
63
|
+
"Canarias",
|
|
64
|
+
"Cantabria",
|
|
65
|
+
"Castilla-La Mancha",
|
|
66
|
+
"Castilla y León",
|
|
67
|
+
"Cataluña",
|
|
68
|
+
"Ceuta",
|
|
69
|
+
"Comunidad Valenciana",
|
|
70
|
+
"Extremadura",
|
|
71
|
+
"Galicia",
|
|
72
|
+
"La Rioja",
|
|
73
|
+
"Madrid",
|
|
74
|
+
"Melilla",
|
|
75
|
+
"Murcia",
|
|
76
|
+
"Navarra",
|
|
77
|
+
"País Vasco"
|
|
78
|
+
}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|