datamarket 0.9.27__tar.gz → 0.9.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (26)
  1. {datamarket-0.9.27 → datamarket-0.9.28}/PKG-INFO +3 -3
  2. {datamarket-0.9.27 → datamarket-0.9.28}/pyproject.toml +3 -3
  3. datamarket-0.9.28/src/datamarket/interfaces/nominatim.py +189 -0
  4. datamarket-0.9.27/src/datamarket/interfaces/nominatim.py +0 -110
  5. {datamarket-0.9.27 → datamarket-0.9.28}/LICENSE +0 -0
  6. {datamarket-0.9.27 → datamarket-0.9.28}/README.md +0 -0
  7. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/__init__.py +0 -0
  8. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/interfaces/__init__.py +0 -0
  9. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/interfaces/alchemy.py +0 -0
  10. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/interfaces/aws.py +0 -0
  11. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/interfaces/drive.py +0 -0
  12. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/interfaces/ftp.py +0 -0
  13. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/interfaces/peerdb.py +0 -0
  14. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/interfaces/proxy.py +0 -0
  15. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/interfaces/tinybird.py +0 -0
  16. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/params/__init__.py +0 -0
  17. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/params/nominatim.py +0 -0
  18. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/__init__.py +0 -0
  19. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/airflow.py +0 -0
  20. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/alchemy.py +0 -0
  21. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/main.py +0 -0
  22. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/selenium.py +0 -0
  23. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/soda.py +0 -0
  24. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/strings.py +0 -0
  25. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/typer.py +0 -0
  26. {datamarket-0.9.27 → datamarket-0.9.28}/src/datamarket/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.27
3
+ Version: 0.9.28
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -28,7 +28,6 @@ Provides-Extra: duckduckgo-search
28
28
  Provides-Extra: fake-useragent
29
29
  Provides-Extra: geoalchemy2
30
30
  Provides-Extra: geopandas
31
- Provides-Extra: geopy
32
31
  Provides-Extra: google-api-python-client
33
32
  Provides-Extra: google-auth-httplib2
34
33
  Provides-Extra: google-auth-oauthlib
@@ -77,7 +76,7 @@ Requires-Dist: dynaconf (>=3.0.0,<4.0.0)
77
76
  Requires-Dist: fake-useragent (>=2.0.0,<3.0.0) ; extra == "fake-useragent"
78
77
  Requires-Dist: geoalchemy2 (>=0.17.0,<0.18.0) ; extra == "geoalchemy2"
79
78
  Requires-Dist: geopandas (>=1.0.0,<2.0.0) ; extra == "geopandas"
80
- Requires-Dist: geopy (>=2.0.0,<3.0.0) ; extra == "geopy"
79
+ Requires-Dist: geopy (>=2.0.0,<3.0.0)
81
80
  Requires-Dist: google-api-python-client (>=2.0.0,<3.0.0) ; extra == "google-api-python-client"
82
81
  Requires-Dist: google-auth-httplib2 (>=0.2.0,<0.3.0) ; extra == "google-auth-httplib2"
83
82
  Requires-Dist: google-auth-oauthlib (>=1.0.0,<2.0.0) ; extra == "google-auth-oauthlib"
@@ -98,6 +97,7 @@ Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
98
97
  Requires-Dist: pre-commit (>=4.0.0,<5.0.0)
99
98
  Requires-Dist: psycopg2-binary (>=2.0.0,<3.0.0)
100
99
  Requires-Dist: pyarrow (>=19.0.0,<20.0.0) ; extra == "pyarrow"
100
+ Requires-Dist: pycountry (>=24.0.0,<25.0.0)
101
101
  Requires-Dist: pydrive2 (>=1.0.0,<2.0.0) ; extra == "pydrive2" or extra == "drive"
102
102
  Requires-Dist: pymupdf (>=1.0.0,<2.0.0) ; extra == "pymupdf"
103
103
  Requires-Dist: pysocks (>=1.0.0,<2.0.0) ; extra == "pysocks"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.27"
3
+ version = "0.9.28"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -30,6 +30,8 @@ inflection = "~0.5.0"
30
30
  python-string-utils = "^1.0.0"
31
31
  unidecode = "^1.0.0"
32
32
  numpy = "^2.0.0"
33
+ pycountry = "^24.0.0"
34
+ geopy = "^2.0.0"
33
35
 
34
36
  boto3 = { version = "~1.35.0", optional = true }
35
37
  lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
@@ -48,7 +50,6 @@ stem = { version = "^1.0.0", optional = true }
48
50
  click = { version = "^8.0.0", optional = true }
49
51
  rapidfuzz = { version = "^3.0.0", optional = true }
50
52
  demjson3 = { version = "^3.0.0", optional = true }
51
- geopy = { version = "^2.0.0", optional = true }
52
53
  nodriver = { version = "~0.44", optional = true }
53
54
  retry = { version = "~0.9.0", optional = true }
54
55
  shapely = { version = "^2.0.0", optional = true }
@@ -93,7 +94,6 @@ stem = ["stem"]
93
94
  click = ["click"]
94
95
  rapidfuzz = ["rapidfuzz"]
95
96
  demjson3 = ["demjson3"]
96
- geopy = ["geopy"]
97
97
  nodriver = ["nodriver"]
98
98
  undetected-chromedriver = ["undetected-chromedriver"]
99
99
  retry = ["retry"]
@@ -0,0 +1,189 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import gettext
5
+ import logging
6
+ import pycountry
7
+ from geopy.distance import geodesic
8
+
9
+ import requests
10
+
11
+ from ..params.nominatim import POSTCODES
12
+
13
+ ########################################################################################################################
14
+ # CLASSES
15
+
16
+ logger = logging.getLogger(__name__)
17
+ spanish = gettext.translation("iso3166-1", pycountry.LOCALES_DIR, languages=["es"])
18
+ spanish.install()
19
+
20
+
21
+ class GeoNames:
22
+ def __init__(self, endpoint):
23
+ self.endpoint = endpoint
24
+
25
+ @staticmethod
26
+ def validate_postcode(postcode):
27
+ if isinstance(postcode, int):
28
+ postcode = str(postcode)
29
+
30
+ if postcode and len(postcode) == 5 and postcode[:2] in POSTCODES:
31
+ return postcode
32
+
33
+ if postcode and len(postcode) == 4:
34
+ postcode = f"0{postcode}"
35
+ if postcode[:2] in POSTCODES:
36
+ return postcode
37
+
38
+ @staticmethod
39
+ def get_province_from_postcode(postcode):
40
+ if postcode:
41
+ return POSTCODES[postcode[:2]]
42
+
43
+ def reverse(self, lat, lon):
44
+ return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}").json()
45
+
46
+
47
+ class Nominatim:
48
+ def __init__(self, nominatim_endpoint, geonames_endpoint):
49
+ self.endpoint = nominatim_endpoint
50
+ self.geonames = GeoNames(geonames_endpoint)
51
+
52
+ @staticmethod
53
+ def get_attribute(raw_json, keys):
54
+ for key in keys:
55
+ if key in raw_json:
56
+ return raw_json[key]
57
+
58
+ def geocode(self, address):
59
+ return requests.get(f"{self.endpoint}/search?q={address}&format=json").json()
60
+
61
+ def geocode_parsed(self, address):
62
+ results = self.geocode(address)
63
+
64
+ if results:
65
+ return self.reverse_parsed(results[0]["lat"], results[0]["lon"])
66
+
67
+ def reverse(self, lat, lon):
68
+ return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json").json()
69
+
70
+ def reverse_parsed(self, lat, lon):
71
+ nominatim_raw_json = self.reverse(lat, lon)
72
+ geonames_raw_json = self.geonames.reverse(lat, lon)
73
+
74
+ nominatim_res_lat_str = nominatim_raw_json.get("lat")
75
+ nominatim_res_lon_str = nominatim_raw_json.get("lon")
76
+ geonames_res_lat_str = geonames_raw_json.get("lat")
77
+ geonames_res_lon_str = geonames_raw_json.get("lon")
78
+
79
+ dist_nominatim = float("inf")
80
+ dist_geonames = float("inf")
81
+
82
+ try:
83
+ input_coords = (float(lat), float(lon))
84
+ except (ValueError, TypeError):
85
+ logger.error(f"Invalid input coordinates for distance calculation: lat={lat}, lon={lon}")
86
+ else:
87
+ if nominatim_res_lat_str and nominatim_res_lon_str:
88
+ try:
89
+ nominatim_coords = (float(nominatim_res_lat_str), float(nominatim_res_lon_str))
90
+ dist_nominatim = geodesic(input_coords, nominatim_coords).km
91
+ except (ValueError, TypeError):
92
+ logger.warning("Invalid Nominatim coordinates for distance calculation.")
93
+
94
+ if geonames_res_lat_str and geonames_res_lon_str:
95
+ try:
96
+ geonames_coords = (float(geonames_res_lat_str), float(geonames_res_lon_str))
97
+ dist_geonames = geodesic(input_coords, geonames_coords).km
98
+ except (ValueError, TypeError):
99
+ logger.warning("Invalid GeoNames coordinates for distance calculation.")
100
+
101
+ if dist_nominatim <= dist_geonames and nominatim_res_lat_str is not None and nominatim_res_lon_str is not None:
102
+ # Use Nominatim data
103
+ raw_address = nominatim_raw_json.get("address", {})
104
+ postcode_str = str(raw_address.get("postcode", ""))
105
+ postcode = self.geonames.validate_postcode(postcode_str)
106
+ province = self.geonames.get_province_from_postcode(postcode) if postcode else None
107
+ city = self.get_attribute(raw_address, ["city", "town", "village"])
108
+ district, quarter = self.get_district_quarter(raw_address)
109
+
110
+ return {
111
+ "country": raw_address.get("country"),
112
+ "country_code": (raw_address.get("country_code") or "").lower(),
113
+ "state": raw_address.get("state"),
114
+ "province": province,
115
+ "city": city,
116
+ "postcode": postcode,
117
+ "district": district,
118
+ "quarter": quarter,
119
+ "street": raw_address.get("road"),
120
+ "number": raw_address.get("house_number"),
121
+ }
122
+
123
+ elif dist_geonames < dist_nominatim and geonames_res_lat_str is not None and geonames_res_lon_str is not None:
124
+ # Use GeoNames data
125
+ geonames_country_code_str = geonames_raw_json.get("country_code")
126
+ country_name = None
127
+ if geonames_country_code_str:
128
+ try:
129
+ country_obj = pycountry.countries.get(alpha_2=geonames_country_code_str.upper())
130
+ if country_obj:
131
+ country_name = spanish.gettext(country_obj.name)
132
+ except LookupError:
133
+ logger.warning(f"Country name not found for code: {geonames_country_code_str} using pycountry.")
134
+
135
+ postcode_str = str(geonames_raw_json.get("postal_code", ""))
136
+ postcode = self.geonames.validate_postcode(postcode_str)
137
+ province = self.geonames.get_province_from_postcode(postcode) if postcode else None
138
+ city = geonames_raw_json.get("place_name")
139
+
140
+ return {
141
+ "country": country_name,
142
+ "country_code": (geonames_country_code_str or "").lower(),
143
+ "state": geonames_raw_json.get("community"),
144
+ "province": province,
145
+ "city": city,
146
+ "postcode": postcode,
147
+ "district": None,
148
+ "quarter": None,
149
+ "street": None,
150
+ "number": None,
151
+ }
152
+
153
+ else:
154
+ # Neither source provided valid coordinates
155
+ return {
156
+ "country": None,
157
+ "country_code": None,
158
+ "state": None,
159
+ "province": None,
160
+ "city": None,
161
+ "postcode": None,
162
+ "district": None,
163
+ "quarter": None,
164
+ "street": None,
165
+ "number": None,
166
+ }
167
+
168
+ def get_district_quarter(self, raw_json):
169
+ district = self.get_attribute(raw_json, ["city_district", "suburb", "borough"])
170
+ quarter = self.get_attribute(raw_json, ["quarter", "neighbourhood"])
171
+
172
+ if not district and quarter:
173
+ district = quarter
174
+ quarter = None
175
+
176
+ return district, quarter
177
+
178
+
179
+ class NominatimInterface(Nominatim):
180
+ def __init__(self, config):
181
+ if "osm" in config:
182
+ self.config = config["osm"]
183
+
184
+ self.nominatim_endpoint = self.config["nominatim_endpoint"]
185
+ self.geonames_endpoint = self.config["geonames_endpoint"]
186
+
187
+ super().__init__(self.nominatim_endpoint, self.geonames_endpoint)
188
+ else:
189
+ logger.warning("no osm section in config")
@@ -1,110 +0,0 @@
1
- ########################################################################################################################
2
- # IMPORTS
3
-
4
- import logging
5
-
6
- import requests
7
-
8
- from ..params.nominatim import POSTCODES
9
-
10
- ########################################################################################################################
11
- # CLASSES
12
-
13
- logger = logging.getLogger(__name__)
14
-
15
-
16
- class GeoNames:
17
- def __init__(self, endpoint):
18
- self.endpoint = endpoint
19
-
20
- @staticmethod
21
- def validate_postcode(postcode):
22
- if isinstance(postcode, int):
23
- postcode = str(postcode)
24
-
25
- if postcode and len(postcode) == 5 and postcode[:2] in POSTCODES:
26
- return postcode
27
-
28
- if postcode and len(postcode) == 4:
29
- postcode = f"0{postcode}"
30
- if postcode[:2] in POSTCODES:
31
- return postcode
32
-
33
- @staticmethod
34
- def get_province_from_postcode(postcode):
35
- if postcode:
36
- return POSTCODES[postcode[:2]]
37
-
38
- def reverse(self, lat, lon):
39
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}").json()
40
-
41
-
42
- class Nominatim:
43
- def __init__(self, nominatim_endpoint, geonames_endpoint):
44
- self.endpoint = nominatim_endpoint
45
- self.geonames = GeoNames(geonames_endpoint)
46
-
47
- @staticmethod
48
- def get_attribute(raw_json, keys):
49
- for key in keys:
50
- if key in raw_json:
51
- return raw_json[key]
52
-
53
- def geocode(self, address):
54
- return requests.get(f"{self.endpoint}/search?q={address}&format=json").json()
55
-
56
- def geocode_parsed(self, address):
57
- results = self.geocode(address)
58
-
59
- if results:
60
- return self.reverse_parsed(results[0]["lat"], results[0]["lon"])
61
-
62
- def reverse(self, lat, lon):
63
- return requests.get(f"{self.endpoint}/reverse?lat={lat}&lon={lon}&format=json").json()
64
-
65
- def reverse_parsed(self, lat, lon):
66
- raw_json = self.reverse(lat, lon).get("address", {})
67
- geoname = self.geonames.reverse(lat, lon)
68
-
69
- postcode = self.geonames.validate_postcode(
70
- str(geoname.get("postal_code", ""))
71
- ) or self.geonames.validate_postcode(str(raw_json.get("postcode")))
72
-
73
- city = self.get_attribute(raw_json, ["city", "town", "village"]) or geoname.get("place_name")
74
-
75
- district, quarter = self.get_district_quarter(raw_json)
76
- return {
77
- "country": raw_json.get("country"),
78
- "country_code": (raw_json.get("country_code") or geoname.get("country_code") or "").lower(),
79
- "state": raw_json.get("state") or geoname.get("community"),
80
- "province": self.geonames.get_province_from_postcode(postcode),
81
- "city": city,
82
- "postcode": postcode,
83
- "district": district,
84
- "quarter": quarter,
85
- "street": raw_json.get("road"),
86
- "number": raw_json.get("house_number"),
87
- }
88
-
89
- def get_district_quarter(self, raw_json):
90
- district = self.get_attribute(raw_json, ["city_district", "suburb", "borough"])
91
- quarter = self.get_attribute(raw_json, ["quarter", "neighbourhood"])
92
-
93
- if not district and quarter:
94
- district = quarter
95
- quarter = None
96
-
97
- return district, quarter
98
-
99
-
100
- class NominatimInterface(Nominatim):
101
- def __init__(self, config):
102
- if "osm" in config:
103
- self.config = config["osm"]
104
-
105
- self.nominatim_endpoint = self.config["nominatim_endpoint"]
106
- self.geonames_endpoint = self.config["geonames_endpoint"]
107
-
108
- super().__init__(self.nominatim_endpoint, self.geonames_endpoint)
109
- else:
110
- logger.warning("no osm section in config")
File without changes
File without changes