datamarket 0.9.4__tar.gz → 0.9.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

Files changed (23)
  1. {datamarket-0.9.4 → datamarket-0.9.5}/PKG-INFO +2 -2
  2. {datamarket-0.9.4 → datamarket-0.9.5}/pyproject.toml +2 -2
  3. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/ftp.py +1 -2
  4. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/peerdb.py +34 -8
  5. {datamarket-0.9.4 → datamarket-0.9.5}/LICENSE +0 -0
  6. {datamarket-0.9.4 → datamarket-0.9.5}/README.md +0 -0
  7. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/__init__.py +0 -0
  8. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/__init__.py +0 -0
  9. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/alchemy.py +0 -0
  10. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/aws.py +0 -0
  11. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/drive.py +0 -0
  12. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/nominatim.py +0 -0
  13. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/proxy.py +0 -0
  14. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/interfaces/tinybird.py +0 -0
  15. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/params/__init__.py +0 -0
  16. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/params/nominatim.py +0 -0
  17. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/utils/__init__.py +0 -0
  18. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/utils/airflow.py +0 -0
  19. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/utils/alchemy.py +0 -0
  20. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/utils/main.py +0 -0
  21. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/utils/selenium.py +0 -0
  22. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/utils/soda.py +0 -0
  23. {datamarket-0.9.4 → datamarket-0.9.5}/src/datamarket/utils/typer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.4
3
+ Version: 0.9.5
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -63,7 +63,7 @@ Provides-Extra: xmltodict
63
63
  Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
64
64
  Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
65
65
  Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
66
- Requires-Dist: boto3 (>=1.0.0,<2.0.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
66
+ Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
67
67
  Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
68
68
  Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
69
69
  Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.9.4"
3
+ version = "0.9.5"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -27,7 +27,7 @@ croniter = "^3.0.0"
27
27
  dynaconf = "^3.0.0"
28
28
  jinja2 = "^3.0.0"
29
29
 
30
- boto3 = { version = "^1.0.0", optional = true }
30
+ boto3 = { version = "~1.35.0", optional = true }
31
31
  unidecode = { version = "^1.0.0", optional = true }
32
32
  lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
33
33
  tqdm = { version = "^4.0.0", optional = true }
@@ -12,7 +12,6 @@ from ..utils.main import Config
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
15
-
16
15
  class FTPInterface:
17
16
  def __init__(self, config: Config):
18
17
  if "ftp" in config:
@@ -23,7 +22,7 @@ class FTPInterface:
23
22
  logger.warning("no ftp section in config")
24
23
 
25
24
  def get_ftp(self):
26
- if self.config["ftps"].lower() == "true":
25
+ if self.config["ftps"]:
27
26
  ftp_conn = FTP_TLS(self.config["server"])
28
27
 
29
28
  else:
@@ -216,26 +216,52 @@ class TransientS3:
216
216
  self.config = section
217
217
  self.bucket_name = self.config["bucket"]
218
218
  self.session = boto3.Session(profile_name=self.config["profile"])
219
- self.s3_resource = self.session.resource("s3")
219
+ self.s3_client = self.session.client("s3")
220
220
  self.credentials = self.session.get_credentials()
221
221
  self.access_key = self.credentials.access_key
222
222
  self.secret_key = self.credentials.secret_key
223
223
  self.region_name = self.session.region_name
224
- self.endpoint_url = self.s3_resource.meta.endpoint_url
224
+ self.endpoint_url = self.s3_client.meta.endpoint_url
225
225
  else:
226
226
  logger.warning("no peerdb.s3 section in config")
227
227
 
228
228
  def delete_paths_with_schema(self, schema_name):
229
229
  logger.info(f"Deleting paths containing '{schema_name}' from S3")
230
230
 
231
- bucket = self.s3_resource.Bucket(self.bucket_name)
231
+ paginator = self.s3_client.get_paginator("list_objects_v2")
232
+ pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
232
233
 
233
- for prefix in [schema_name, f"clone_{schema_name}"]:
234
- objects_to_delete = bucket.objects.filter(Prefix=prefix)
235
- objects_to_delete.delete()
234
+ for page in pages:
235
+ if "CommonPrefixes" in page:
236
+ for prefix in page["CommonPrefixes"]:
237
+ folder = prefix["Prefix"]
238
+ if schema_name in folder:
239
+ self._delete_folder_contents(folder)
236
240
 
237
241
  logger.info(f"Deleted paths containing '{schema_name}' from S3")
238
242
 
243
+ def _delete_folder_contents(self, folder):
244
+ logger.info(f"Deleting contents of folder: {folder}")
245
+
246
+ paginator = self.s3_client.get_paginator("list_objects_v2")
247
+ pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
248
+
249
+ delete_us = dict(Objects=[])
250
+ for page in pages:
251
+ if "Contents" in page:
252
+ for obj in page["Contents"]:
253
+ delete_us["Objects"].append(dict(Key=obj["Key"]))
254
+
255
+ # AWS limits to deleting 1000 objects at a time
256
+ if len(delete_us["Objects"]) >= 1000:
257
+ self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
258
+ delete_us = dict(Objects=[])
259
+
260
+ if len(delete_us["Objects"]):
261
+ self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
262
+
263
+ logger.info(f"Deleted contents of folder: {folder}")
264
+
239
265
 
240
266
  class PeerDBInterface:
241
267
  def __init__(self, config):
@@ -282,11 +308,11 @@ class PeerDBInterface:
282
308
  if not self.docker_host_mapping or not host:
283
309
  return host
284
310
 
285
- if host in ["localhost", "127.0.0.1"]:
311
+ if host in ['localhost', '127.0.0.1']:
286
312
  logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
287
313
  return self.docker_host_mapping
288
314
 
289
- url_pattern = r"(localhost|127\.0\.0\.1)"
315
+ url_pattern = r'(localhost|127\.0\.0\.1)'
290
316
  match = re.search(url_pattern, host)
291
317
  if match:
292
318
  original_host = match.group(1)
File without changes
File without changes