datamarket 0.9.4__py3-none-any.whl → 0.9.5__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -12,7 +12,6 @@ from ..utils.main import Config
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
15
-
16
15
  class FTPInterface:
17
16
  def __init__(self, config: Config):
18
17
  if "ftp" in config:
@@ -23,7 +22,7 @@ class FTPInterface:
23
22
  logger.warning("no ftp section in config")
24
23
 
25
24
  def get_ftp(self):
26
- if self.config["ftps"].lower() == "true":
25
+ if self.config["ftps"]:
27
26
  ftp_conn = FTP_TLS(self.config["server"])
28
27
 
29
28
  else:
@@ -216,26 +216,52 @@ class TransientS3:
216
216
  self.config = section
217
217
  self.bucket_name = self.config["bucket"]
218
218
  self.session = boto3.Session(profile_name=self.config["profile"])
219
- self.s3_resource = self.session.resource("s3")
219
+ self.s3_client = self.session.client("s3")
220
220
  self.credentials = self.session.get_credentials()
221
221
  self.access_key = self.credentials.access_key
222
222
  self.secret_key = self.credentials.secret_key
223
223
  self.region_name = self.session.region_name
224
- self.endpoint_url = self.s3_resource.meta.endpoint_url
224
+ self.endpoint_url = self.s3_client.meta.endpoint_url
225
225
  else:
226
226
  logger.warning("no peerdb.s3 section in config")
227
227
 
228
228
  def delete_paths_with_schema(self, schema_name):
229
229
  logger.info(f"Deleting paths containing '{schema_name}' from S3")
230
230
 
231
- bucket = self.s3_resource.Bucket(self.bucket_name)
231
+ paginator = self.s3_client.get_paginator("list_objects_v2")
232
+ pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
232
233
 
233
- for prefix in [schema_name, f"clone_{schema_name}"]:
234
- objects_to_delete = bucket.objects.filter(Prefix=prefix)
235
- objects_to_delete.delete()
234
+ for page in pages:
235
+ if "CommonPrefixes" in page:
236
+ for prefix in page["CommonPrefixes"]:
237
+ folder = prefix["Prefix"]
238
+ if schema_name in folder:
239
+ self._delete_folder_contents(folder)
236
240
 
237
241
  logger.info(f"Deleted paths containing '{schema_name}' from S3")
238
242
 
243
+ def _delete_folder_contents(self, folder):
244
+ logger.info(f"Deleting contents of folder: {folder}")
245
+
246
+ paginator = self.s3_client.get_paginator("list_objects_v2")
247
+ pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
248
+
249
+ delete_us = dict(Objects=[])
250
+ for page in pages:
251
+ if "Contents" in page:
252
+ for obj in page["Contents"]:
253
+ delete_us["Objects"].append(dict(Key=obj["Key"]))
254
+
255
+ # AWS limits to deleting 1000 objects at a time
256
+ if len(delete_us["Objects"]) >= 1000:
257
+ self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
258
+ delete_us = dict(Objects=[])
259
+
260
+ if len(delete_us["Objects"]):
261
+ self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
262
+
263
+ logger.info(f"Deleted contents of folder: {folder}")
264
+
239
265
 
240
266
  class PeerDBInterface:
241
267
  def __init__(self, config):
@@ -282,11 +308,11 @@ class PeerDBInterface:
282
308
  if not self.docker_host_mapping or not host:
283
309
  return host
284
310
 
285
- if host in ["localhost", "127.0.0.1"]:
311
+ if host in ['localhost', '127.0.0.1']:
286
312
  logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
287
313
  return self.docker_host_mapping
288
314
 
289
- url_pattern = r"(localhost|127\.0\.0\.1)"
315
+ url_pattern = r'(localhost|127\.0\.0\.1)'
290
316
  match = re.search(url_pattern, host)
291
317
  if match:
292
318
  original_host = match.group(1)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.9.4
3
+ Version: 0.9.5
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -63,7 +63,7 @@ Provides-Extra: xmltodict
63
63
  Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
64
64
  Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
65
65
  Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
66
- Requires-Dist: boto3 (>=1.0.0,<2.0.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
66
+ Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
67
67
  Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
68
68
  Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
69
69
  Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
@@ -3,9 +3,9 @@ datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
3
3
  datamarket/interfaces/alchemy.py,sha256=V8E1GtokxUNmrUftKTFkIpNoXaqJME7ACES2BY0znQM,4214
4
4
  datamarket/interfaces/aws.py,sha256=R6lYdSCD6a4g9l6aFMtNDt_EX3kroe2untDhgy7XG1k,2384
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
- datamarket/interfaces/ftp.py,sha256=Owk3D7tiF47_ZFT3Dc9h4_BaPsWtcJUbhagjpQB19q8,1900
6
+ datamarket/interfaces/ftp.py,sha256=VZSxISKquMIVbt-Nvb1HgOB9pwkzYunoror-anZNiiQ,1881
7
7
  datamarket/interfaces/nominatim.py,sha256=_gFJ04D-ju5xn3wuaGT5Pj5jhf4F5eINpxOpuQL_dIQ,3664
8
- datamarket/interfaces/peerdb.py,sha256=FhBLJfR2EMT9Rsnj_OJXvC14E5OlXGsMrPUQ1AQlwPY,20717
8
+ datamarket/interfaces/peerdb.py,sha256=rNQ1-THcVvrej8BEPJs9zM4VfH5dlByafOIHYN9sB2A,21833
9
9
  datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
10
10
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
11
11
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -17,7 +17,7 @@ datamarket/utils/main.py,sha256=0Abt3ww1VSPnX4AVKDcYzqDLAOEV_54iUHMLJfre2bg,6129
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
19
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
20
- datamarket-0.9.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
- datamarket-0.9.4.dist-info/METADATA,sha256=YYuxN--M3y9MX62_hG5Y1piS2TBBo_fl6MfJeLyyOZA,6360
22
- datamarket-0.9.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
23
- datamarket-0.9.4.dist-info/RECORD,,
20
+ datamarket-0.9.5.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
+ datamarket-0.9.5.dist-info/METADATA,sha256=0OFyrz2YcKfH1HachMVzhD7C_kL9iODaxEFLTB4e2NI,6362
22
+ datamarket-0.9.5.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
23
+ datamarket-0.9.5.dist-info/RECORD,,