datamarket 0.8.5.tar.gz → 0.8.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datamarket might be problematic.
Files changed (23)
  1. {datamarket-0.8.5 → datamarket-0.8.7}/PKG-INFO +4 -2
  2. {datamarket-0.8.5 → datamarket-0.8.7}/pyproject.toml +4 -2
  3. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/peerdb.py +8 -34
  4. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/utils/main.py +7 -0
  5. {datamarket-0.8.5 → datamarket-0.8.7}/LICENSE +0 -0
  6. {datamarket-0.8.5 → datamarket-0.8.7}/README.md +0 -0
  7. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/__init__.py +0 -0
  8. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/__init__.py +0 -0
  9. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/alchemy.py +0 -0
  10. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/aws.py +0 -0
  11. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/drive.py +0 -0
  12. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/ftp.py +0 -0
  13. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/nominatim.py +0 -0
  14. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/proxy.py +0 -0
  15. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/tinybird.py +0 -0
  16. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/params/__init__.py +0 -0
  17. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/params/nominatim.py +0 -0
  18. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/utils/__init__.py +0 -0
  19. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/utils/airflow.py +0 -0
  20. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/utils/alchemy.py +0 -0
  21. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/utils/selenium.py +0 -0
  22. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/utils/soda.py +0 -0
  23. {datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/utils/typer.py +0 -0
{datamarket-0.8.5 → datamarket-0.8.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.8.5
+Version: 0.8.7
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket
@@ -55,6 +55,7 @@ Provides-Extra: pytest
 Provides-Extra: rapidfuzz
 Provides-Extra: retry
 Provides-Extra: shapely
+Provides-Extra: soda-core-mysql
 Provides-Extra: soda-core-postgres
 Provides-Extra: stem
 Provides-Extra: tqdm
@@ -86,7 +87,7 @@ Requires-Dist: httpx[http2] (==0.28.1) ; extra == "httpx"
 Requires-Dist: jinja2 (==3.1.5)
 Requires-Dist: json5 (==0.9.25) ; extra == "json5"
 Requires-Dist: lxml[html-clean] (==5.3.0) ; extra == "lxml"
-Requires-Dist: nodriver (==0.37) ; extra == "nodriver"
+Requires-Dist: nodriver (==0.38.post1) ; extra == "nodriver"
 Requires-Dist: openpyxl (==3.1.5) ; extra == "openpyxl"
 Requires-Dist: pandas (==2.2.3) ; extra == "pandas"
 Requires-Dist: pandera (==0.20.4) ; extra == "pandera"
@@ -106,6 +107,7 @@ Requires-Dist: rapidfuzz (==3.10.1) ; extra == "rapidfuzz"
 Requires-Dist: requests (==2.32.3)
 Requires-Dist: retry (==0.9.2) ; extra == "retry"
 Requires-Dist: shapely (==2.0.6) ; extra == "shapely"
+Requires-Dist: soda-core-mysql (==3.4.4) ; extra == "soda-core-mysql"
 Requires-Dist: soda-core-postgres (==3.4.1) ; extra == "soda-core-postgres"
 Requires-Dist: stem (==1.8.2) ; extra == "stem" or extra == "proxy"
 Requires-Dist: tenacity (==9.0.0)
{datamarket-0.8.5 → datamarket-0.8.7}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.8.5"
+version = "0.8.7"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"
@@ -38,6 +38,7 @@ pytest = { version = "8.3.3", optional = true }
 playwright = { version = "1.47.0", optional = true }
 playwright-stealth = { version = "1.0.6", optional = true }
 soda-core-postgres = { version = "3.4.1", optional = true }
+soda-core-mysql = { version = "3.4.4", optional = true }
 fake-useragent = { version = "1.5.1", optional = true }
 pydrive2 = { version = "1.20.0", optional = true }
 clickhouse-driver = { version = "0.2.9", optional = true }
@@ -46,7 +47,7 @@ click = { version = "8.1.7", optional = true }
 rapidfuzz = { version = "3.10.1", optional = true }
 demjson3 = { version = "3.0.6", optional = true }
 geopy = { version = "2.4.1", optional = true }
-nodriver = { version = "0.37", optional = true }
+nodriver = { version = "0.38.post1", optional = true }
 undetected-chromedriver = { version = "3.5.5", optional = true }
 retry = { version = "0.9.2", optional = true }
 shapely = { version = "2.0.6", optional = true }
@@ -82,6 +83,7 @@ pytest = ["pytest"]
 playwright = ["playwright"]
 playwright-stealth = ["playwright-stealth"]
 soda-core-postgres = ["soda-core-postgres"]
+soda-core-mysql = ["soda-core-mysql"]
 fake-useragent = ["fake-useragent"]
 pydrive2 = ["pydrive2"]
 clickhouse-driver = ["clickhouse-driver"]
{datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/interfaces/peerdb.py

@@ -216,52 +216,26 @@ class TransientS3:
             self.config = config["peerdb-s3"]
             self.bucket_name = self.config["bucket"]
             self.session = boto3.Session(profile_name=self.config["profile"])
-            self.s3_client = self.session.client("s3")
+            self.s3_resource = self.session.resource("s3")
             self.credentials = self.session.get_credentials()
             self.access_key = self.credentials.access_key
             self.secret_key = self.credentials.secret_key
             self.region_name = self.session.region_name
-            self.endpoint_url = self.s3_client.meta.endpoint_url
+            self.endpoint_url = self.s3_resource.meta.endpoint_url
         else:
             logger.warning("no peerdb-s3 section in config")

     def delete_paths_with_schema(self, schema_name):
         logger.info(f"Deleting paths containing '{schema_name}' from S3")

-        paginator = self.s3_client.get_paginator("list_objects_v2")
-        pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
+        bucket = self.s3_resource.Bucket(self.bucket_name)

-        for page in pages:
-            if "CommonPrefixes" in page:
-                for prefix in page["CommonPrefixes"]:
-                    folder = prefix["Prefix"]
-                    if schema_name in folder:
-                        self._delete_folder_contents(folder)
+        for prefix in [schema_name, f"clone_{schema_name}"]:
+            objects_to_delete = bucket.objects.filter(Prefix=prefix)
+            objects_to_delete.delete()

         logger.info(f"Deleted paths containing '{schema_name}' from S3")

-    def _delete_folder_contents(self, folder):
-        logger.info(f"Deleting contents of folder: {folder}")
-
-        paginator = self.s3_client.get_paginator("list_objects_v2")
-        pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
-
-        delete_us = dict(Objects=[])
-        for page in pages:
-            if "Contents" in page:
-                for obj in page["Contents"]:
-                    delete_us["Objects"].append(dict(Key=obj["Key"]))
-
-                    # AWS limits to deleting 1000 objects at a time
-                    if len(delete_us["Objects"]) >= 1000:
-                        self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
-                        delete_us = dict(Objects=[])
-
-        if len(delete_us["Objects"]):
-            self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
-
-        logger.info(f"Deleted contents of folder: {folder}")
-

 class PeerDBInterface:
     def __init__(self, config):

@@ -308,11 +282,11 @@ class PeerDBInterface:
         if not self.docker_host_mapping or not host:
             return host

-        if host in ['localhost', '127.0.0.1']:
+        if host in ["localhost", "127.0.0.1"]:
             logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
             return self.docker_host_mapping

-        url_pattern = r'(localhost|127\.0\.0\.1)'
+        url_pattern = r"(localhost|127\.0\.0\.1)"
         match = re.search(url_pattern, host)
         if match:
             original_host = match.group(1)
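The TransientS3 rewrite above drops the low-level S3 client and its manual pagination in favor of the boto3 resource API. A minimal sketch of the pattern the new delete_paths_with_schema relies on (the profile and bucket names here are placeholders, not values from the package):

import boto3

# Placeholder profile and bucket; the package reads both from its config.
session = boto3.Session(profile_name="example-profile")
bucket = session.resource("s3").Bucket("example-bucket")

# A filtered collection paginates transparently, and its delete() batch
# action issues DeleteObjects requests in chunks of up to 1000 keys;
# that is exactly what the removed _delete_folder_contents did by hand.
for prefix in ["my_schema", "clone_my_schema"]:
    bucket.objects.filter(Prefix=prefix).delete()

Note that the matching semantics also narrow: the old code deleted any top-level folder whose name contained schema_name, while the new code deletes only keys prefixed by schema_name or clone_{schema_name}.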
{datamarket-0.8.5 → datamarket-0.8.7}/src/datamarket/utils/main.py

@@ -1,6 +1,7 @@
 ########################################################################################################################
 # IMPORTS

+import asyncio
 import configparser
 import inspect
 import logging

@@ -124,6 +125,12 @@ def ban_sleep(max_time, min_time=0):
     time.sleep(sleep_time)


+async def ban_sleep_async(max_time, min_time=0):
+    sleep_time = int(random.uniform(min_time, max_time))  # noqa: S311
+    logger.info(f"sleeping for {sleep_time} seconds...")
+    await asyncio.sleep(sleep_time)
+
+
 def run_bash_command(command):
     p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
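The only change to utils/main.py is the new ban_sleep_async, an awaitable twin of the existing ban_sleep that yields to the event loop instead of blocking it. A hypothetical usage sketch (scrape_pages and the URL are illustrative; only ban_sleep_async and its module path come from the package):

import asyncio

from datamarket.utils.main import ban_sleep_async


async def scrape_pages(urls):
    for url in urls:
        ...  # fetch `url` with an async HTTP client
        # Pause 2-10 seconds without blocking other coroutines.
        await ban_sleep_async(10, min_time=2)


asyncio.run(scrape_pages(["https://example.com"]))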