datamarket 0.8.7__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. Click here for more details.

@@ -212,30 +212,56 @@ class ClickhousePeer:
212
212
 
213
213
  class TransientS3:
214
214
  def __init__(self, config):
215
- if "peerdb-s3" in config:
216
- self.config = config["peerdb-s3"]
215
+ if section := config.get("peerdb", {}).get("s3"):
216
+ self.config = section
217
217
  self.bucket_name = self.config["bucket"]
218
218
  self.session = boto3.Session(profile_name=self.config["profile"])
219
- self.s3_resource = self.session.resource("s3")
219
+ self.s3_client = self.session.client("s3")
220
220
  self.credentials = self.session.get_credentials()
221
221
  self.access_key = self.credentials.access_key
222
222
  self.secret_key = self.credentials.secret_key
223
223
  self.region_name = self.session.region_name
224
- self.endpoint_url = self.s3_resource.meta.endpoint_url
224
+ self.endpoint_url = self.s3_client.meta.endpoint_url
225
225
  else:
226
- logger.warning("no peerdb-s3 section in config")
226
+ logger.warning("no peerdb.s3 section in config")
227
227
 
228
228
  def delete_paths_with_schema(self, schema_name):
229
229
  logger.info(f"Deleting paths containing '{schema_name}' from S3")
230
230
 
231
- bucket = self.s3_resource.Bucket(self.bucket_name)
231
+ paginator = self.s3_client.get_paginator("list_objects_v2")
232
+ pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
232
233
 
233
- for prefix in [schema_name, f"clone_{schema_name}"]:
234
- objects_to_delete = bucket.objects.filter(Prefix=prefix)
235
- objects_to_delete.delete()
234
+ for page in pages:
235
+ if "CommonPrefixes" in page:
236
+ for prefix in page["CommonPrefixes"]:
237
+ folder = prefix["Prefix"]
238
+ if schema_name in folder:
239
+ self._delete_folder_contents(folder)
236
240
 
237
241
  logger.info(f"Deleted paths containing '{schema_name}' from S3")
238
242
 
243
+ def _delete_folder_contents(self, folder):
244
+ logger.info(f"Deleting contents of folder: {folder}")
245
+
246
+ paginator = self.s3_client.get_paginator("list_objects_v2")
247
+ pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
248
+
249
+ delete_us = dict(Objects=[])
250
+ for page in pages:
251
+ if "Contents" in page:
252
+ for obj in page["Contents"]:
253
+ delete_us["Objects"].append(dict(Key=obj["Key"]))
254
+
255
+ # AWS limits to deleting 1000 objects at a time
256
+ if len(delete_us["Objects"]) >= 1000:
257
+ self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
258
+ delete_us = dict(Objects=[])
259
+
260
+ if len(delete_us["Objects"]):
261
+ self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
262
+
263
+ logger.info(f"Deleted contents of folder: {folder}")
264
+
239
265
 
240
266
  class PeerDBInterface:
241
267
  def __init__(self, config):
@@ -282,11 +308,11 @@ class PeerDBInterface:
282
308
  if not self.docker_host_mapping or not host:
283
309
  return host
284
310
 
285
- if host in ["localhost", "127.0.0.1"]:
311
+ if host in ['localhost', '127.0.0.1']:
286
312
  logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
287
313
  return self.docker_host_mapping
288
314
 
289
- url_pattern = r"(localhost|127\.0\.0\.1)"
315
+ url_pattern = r'(localhost|127\.0\.0\.1)'
290
316
  match = re.search(url_pattern, host)
291
317
  if match:
292
318
  original_host = match.group(1)
datamarket/utils/main.py CHANGED
@@ -68,7 +68,8 @@ def get_config(config_file: Path, tz: str = "Europe/Madrid"):
68
68
  if Path(config_file).suffix == ".ini":
69
69
  logger.warning("Using legacy INI config reader. Please migrate to TOML")
70
70
  cfg = configparser.RawConfigParser()
71
- return cfg.read(config_file)
71
+ cfg.read(config_file)
72
+ return cfg
72
73
 
73
74
  add_converter("read", read_converter)
74
75
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.8.7
3
+ Version: 0.9.0
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
5
  License: GPL-3.0-or-later
6
6
  Author: DataMarket
@@ -5,7 +5,7 @@ datamarket/interfaces/aws.py,sha256=UztVuBn561DnU1AcjyJ16UAIS1BUD5HUxiQ4gc9EhtM,
5
5
  datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
6
6
  datamarket/interfaces/ftp.py,sha256=9GQgiNBBK7njkv8ytHQaP9YLB9kI5vnUFA5gtz9J7As,1859
7
7
  datamarket/interfaces/nominatim.py,sha256=_gFJ04D-ju5xn3wuaGT5Pj5jhf4F5eINpxOpuQL_dIQ,3664
8
- datamarket/interfaces/peerdb.py,sha256=hGQ9TXKq9k2xEShz7n6iV-x66bhYylBaWoCc-I2VtN0,20705
8
+ datamarket/interfaces/peerdb.py,sha256=rNQ1-THcVvrej8BEPJs9zM4VfH5dlByafOIHYN9sB2A,21833
9
9
  datamarket/interfaces/proxy.py,sha256=8EJaW8zAMzUMIRLkdAcMkTO9qZXPIubE6vyB5ZXcRtU,3352
10
10
  datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
11
11
  datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -13,11 +13,11 @@ datamarket/params/nominatim.py,sha256=pBYRfoBkkLBg2INbFymefmYSzaAVujQSpEro5c1hD_
13
13
  datamarket/utils/__init__.py,sha256=8D5a8oKgqd6WA1RUkiKCn4l_PVemtyuckxQut0vDHXM,20
14
14
  datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
15
15
  datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
16
- datamarket/utils/main.py,sha256=6d94KrwLA48LY1my28dsjEhhPlJrcd5Q_TLPadGV98I,5763
16
+ datamarket/utils/main.py,sha256=agWVJ5ZFZjVrBNuMpnxN2F_edA3mMJop6dVHPBBkOqU,5775
17
17
  datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
18
18
  datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
19
19
  datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
20
- datamarket-0.8.7.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
- datamarket-0.8.7.dist-info/METADATA,sha256=CLkOI2fxObps_nWjUkUuIj32LnLOAl3xeSZLYYEeazc,6284
22
- datamarket-0.8.7.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
23
- datamarket-0.8.7.dist-info/RECORD,,
20
+ datamarket-0.9.0.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
21
+ datamarket-0.9.0.dist-info/METADATA,sha256=-MsG-FOmTXHlmzyVzNXbvNwWET4xYr401XgjC77uNP8,6284
22
+ datamarket-0.9.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
23
+ datamarket-0.9.0.dist-info/RECORD,,