datamarket 0.8.8__tar.gz → 0.9.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket has been flagged as potentially problematic; consult the package registry's advisory page for details.

Files changed (23)
  1. {datamarket-0.8.8 → datamarket-0.9.1}/PKG-INFO +3 -2
  2. {datamarket-0.8.8 → datamarket-0.9.1}/pyproject.toml +3 -2
  3. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/peerdb.py +37 -11
  4. {datamarket-0.8.8 → datamarket-0.9.1}/LICENSE +0 -0
  5. {datamarket-0.8.8 → datamarket-0.9.1}/README.md +0 -0
  6. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/__init__.py +0 -0
  7. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/__init__.py +0 -0
  8. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/alchemy.py +0 -0
  9. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/aws.py +0 -0
  10. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/drive.py +0 -0
  11. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/ftp.py +0 -0
  12. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/nominatim.py +0 -0
  13. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/proxy.py +0 -0
  14. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/interfaces/tinybird.py +0 -0
  15. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/params/__init__.py +0 -0
  16. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/params/nominatim.py +0 -0
  17. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/utils/__init__.py +0 -0
  18. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/utils/airflow.py +0 -0
  19. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/utils/alchemy.py +0 -0
  20. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/utils/main.py +0 -0
  21. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/utils/selenium.py +0 -0
  22. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/utils/soda.py +0 -0
  23. {datamarket-0.8.8 → datamarket-0.9.1}/src/datamarket/utils/typer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.8.8
3
+ Version: 0.9.1
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -15,6 +15,7 @@ Classifier: Programming Language :: Python :: 3.10
15
15
  Classifier: Programming Language :: Python :: 3.11
16
16
  Classifier: Programming Language :: Python :: 3.12
17
17
  Classifier: Programming Language :: Python :: 3.13
18
+ Provides-Extra: alchemy
18
19
  Provides-Extra: aws
19
20
  Provides-Extra: azure-storage-blob
20
21
  Provides-Extra: boto3
@@ -62,7 +63,7 @@ Provides-Extra: tqdm
62
63
  Provides-Extra: undetected-chromedriver
63
64
  Provides-Extra: unidecode
64
65
  Provides-Extra: xmltodict
65
- Requires-Dist: SQLAlchemy (==2.0.36)
66
+ Requires-Dist: SQLAlchemy (==2.0.36) ; extra == "alchemy"
66
67
  Requires-Dist: azure-storage-blob (==12.23.1) ; extra == "azure-storage-blob"
67
68
  Requires-Dist: beautifulsoup4 (==4.12.3)
68
69
  Requires-Dist: boto3 (==1.35.53) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.8.8"
3
+ version = "0.9.1"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -17,7 +17,6 @@ classifiers = [
17
17
  [tool.poetry.dependencies]
18
18
  python = "^3.9"
19
19
  typer = "0.12.5"
20
- SQLAlchemy = "2.0.36"
21
20
  psycopg2-binary = "2.9.10"
22
21
  requests = "2.32.3"
23
22
  tenacity = "9.0.0"
@@ -71,6 +70,7 @@ google-auth-oauthlib = { version = "1.2.1", optional = true }
71
70
  dnspython = { version = "2.7.0", optional = true }
72
71
  openpyxl = { version = "3.1.5", optional = true }
73
72
  httpx = { extras = ["http2"], version = "0.28.1", optional = true }
73
+ SQLAlchemy = { version = "2.0.36", optional = true }
74
74
 
75
75
  [tool.poetry.extras]
76
76
  boto3 = ["boto3"]
@@ -122,6 +122,7 @@ aws = ["boto3"]
122
122
  drive = ["pydrive2"]
123
123
  peerdb = ["boto3", "clickhouse-driver"]
124
124
  proxy = ["stem"]
125
+ alchemy = ["SQLAlchemy"]
125
126
 
126
127
 
127
128
  [build-system]
@@ -212,30 +212,56 @@ class ClickhousePeer:
212
212
 
213
213
  class TransientS3:
214
214
  def __init__(self, config):
215
- if "peerdb-s3" in config:
216
- self.config = config["peerdb-s3"]
215
+ if section := config.get("peerdb", {}).get("s3"):
216
+ self.config = section
217
217
  self.bucket_name = self.config["bucket"]
218
218
  self.session = boto3.Session(profile_name=self.config["profile"])
219
- self.s3_resource = self.session.resource("s3")
219
+ self.s3_client = self.session.client("s3")
220
220
  self.credentials = self.session.get_credentials()
221
221
  self.access_key = self.credentials.access_key
222
222
  self.secret_key = self.credentials.secret_key
223
223
  self.region_name = self.session.region_name
224
- self.endpoint_url = self.s3_resource.meta.endpoint_url
224
+ self.endpoint_url = self.s3_client.meta.endpoint_url
225
225
  else:
226
- logger.warning("no peerdb-s3 section in config")
226
+ logger.warning("no peerdb.s3 section in config")
227
227
 
228
228
  def delete_paths_with_schema(self, schema_name):
229
229
  logger.info(f"Deleting paths containing '{schema_name}' from S3")
230
230
 
231
- bucket = self.s3_resource.Bucket(self.bucket_name)
231
+ paginator = self.s3_client.get_paginator("list_objects_v2")
232
+ pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
232
233
 
233
- for prefix in [schema_name, f"clone_{schema_name}"]:
234
- objects_to_delete = bucket.objects.filter(Prefix=prefix)
235
- objects_to_delete.delete()
234
+ for page in pages:
235
+ if "CommonPrefixes" in page:
236
+ for prefix in page["CommonPrefixes"]:
237
+ folder = prefix["Prefix"]
238
+ if schema_name in folder:
239
+ self._delete_folder_contents(folder)
236
240
 
237
241
  logger.info(f"Deleted paths containing '{schema_name}' from S3")
238
242
 
243
+ def _delete_folder_contents(self, folder):
244
+ logger.info(f"Deleting contents of folder: {folder}")
245
+
246
+ paginator = self.s3_client.get_paginator("list_objects_v2")
247
+ pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
248
+
249
+ delete_us = dict(Objects=[])
250
+ for page in pages:
251
+ if "Contents" in page:
252
+ for obj in page["Contents"]:
253
+ delete_us["Objects"].append(dict(Key=obj["Key"]))
254
+
255
+ # AWS limits to deleting 1000 objects at a time
256
+ if len(delete_us["Objects"]) >= 1000:
257
+ self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
258
+ delete_us = dict(Objects=[])
259
+
260
+ if len(delete_us["Objects"]):
261
+ self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
262
+
263
+ logger.info(f"Deleted contents of folder: {folder}")
264
+
239
265
 
240
266
  class PeerDBInterface:
241
267
  def __init__(self, config):
@@ -282,11 +308,11 @@ class PeerDBInterface:
282
308
  if not self.docker_host_mapping or not host:
283
309
  return host
284
310
 
285
- if host in ["localhost", "127.0.0.1"]:
311
+ if host in ['localhost', '127.0.0.1']:
286
312
  logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
287
313
  return self.docker_host_mapping
288
314
 
289
- url_pattern = r"(localhost|127\.0\.0\.1)"
315
+ url_pattern = r'(localhost|127\.0\.0\.1)'
290
316
  match = re.search(url_pattern, host)
291
317
  if match:
292
318
  original_host = match.group(1)
File without changes
File without changes