datamarket 0.9.4__py3-none-any.whl → 0.9.5__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/interfaces/ftp.py +1 -2
- datamarket/interfaces/peerdb.py +34 -8
- {datamarket-0.9.4.dist-info → datamarket-0.9.5.dist-info}/METADATA +2 -2
- {datamarket-0.9.4.dist-info → datamarket-0.9.5.dist-info}/RECORD +6 -6
- {datamarket-0.9.4.dist-info → datamarket-0.9.5.dist-info}/LICENSE +0 -0
- {datamarket-0.9.4.dist-info → datamarket-0.9.5.dist-info}/WHEEL +0 -0
datamarket/interfaces/ftp.py
CHANGED
|
@@ -12,7 +12,6 @@ from ..utils.main import Config
|
|
|
12
12
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
14
14
|
|
|
15
|
-
|
|
16
15
|
class FTPInterface:
|
|
17
16
|
def __init__(self, config: Config):
|
|
18
17
|
if "ftp" in config:
|
|
@@ -23,7 +22,7 @@ class FTPInterface:
|
|
|
23
22
|
logger.warning("no ftp section in config")
|
|
24
23
|
|
|
25
24
|
def get_ftp(self):
|
|
26
|
-
if self.config["ftps"]
|
|
25
|
+
if self.config["ftps"]:
|
|
27
26
|
ftp_conn = FTP_TLS(self.config["server"])
|
|
28
27
|
|
|
29
28
|
else:
|
datamarket/interfaces/peerdb.py
CHANGED
|
@@ -216,26 +216,52 @@ class TransientS3:
|
|
|
216
216
|
self.config = section
|
|
217
217
|
self.bucket_name = self.config["bucket"]
|
|
218
218
|
self.session = boto3.Session(profile_name=self.config["profile"])
|
|
219
|
-
self.
|
|
219
|
+
self.s3_client = self.session.client("s3")
|
|
220
220
|
self.credentials = self.session.get_credentials()
|
|
221
221
|
self.access_key = self.credentials.access_key
|
|
222
222
|
self.secret_key = self.credentials.secret_key
|
|
223
223
|
self.region_name = self.session.region_name
|
|
224
|
-
self.endpoint_url = self.
|
|
224
|
+
self.endpoint_url = self.s3_client.meta.endpoint_url
|
|
225
225
|
else:
|
|
226
226
|
logger.warning("no peerdb.s3 section in config")
|
|
227
227
|
|
|
228
228
|
def delete_paths_with_schema(self, schema_name):
|
|
229
229
|
logger.info(f"Deleting paths containing '{schema_name}' from S3")
|
|
230
230
|
|
|
231
|
-
|
|
231
|
+
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
232
|
+
pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
|
|
232
233
|
|
|
233
|
-
for
|
|
234
|
-
|
|
235
|
-
|
|
234
|
+
for page in pages:
|
|
235
|
+
if "CommonPrefixes" in page:
|
|
236
|
+
for prefix in page["CommonPrefixes"]:
|
|
237
|
+
folder = prefix["Prefix"]
|
|
238
|
+
if schema_name in folder:
|
|
239
|
+
self._delete_folder_contents(folder)
|
|
236
240
|
|
|
237
241
|
logger.info(f"Deleted paths containing '{schema_name}' from S3")
|
|
238
242
|
|
|
243
|
+
def _delete_folder_contents(self, folder):
|
|
244
|
+
logger.info(f"Deleting contents of folder: {folder}")
|
|
245
|
+
|
|
246
|
+
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
247
|
+
pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
|
|
248
|
+
|
|
249
|
+
delete_us = dict(Objects=[])
|
|
250
|
+
for page in pages:
|
|
251
|
+
if "Contents" in page:
|
|
252
|
+
for obj in page["Contents"]:
|
|
253
|
+
delete_us["Objects"].append(dict(Key=obj["Key"]))
|
|
254
|
+
|
|
255
|
+
# AWS limits to deleting 1000 objects at a time
|
|
256
|
+
if len(delete_us["Objects"]) >= 1000:
|
|
257
|
+
self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
|
|
258
|
+
delete_us = dict(Objects=[])
|
|
259
|
+
|
|
260
|
+
if len(delete_us["Objects"]):
|
|
261
|
+
self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
|
|
262
|
+
|
|
263
|
+
logger.info(f"Deleted contents of folder: {folder}")
|
|
264
|
+
|
|
239
265
|
|
|
240
266
|
class PeerDBInterface:
|
|
241
267
|
def __init__(self, config):
|
|
@@ -282,11 +308,11 @@ class PeerDBInterface:
|
|
|
282
308
|
if not self.docker_host_mapping or not host:
|
|
283
309
|
return host
|
|
284
310
|
|
|
285
|
-
if host in [
|
|
311
|
+
if host in ['localhost', '127.0.0.1']:
|
|
286
312
|
logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
|
|
287
313
|
return self.docker_host_mapping
|
|
288
314
|
|
|
289
|
-
url_pattern = r
|
|
315
|
+
url_pattern = r'(localhost|127\.0\.0\.1)'
|
|
290
316
|
match = re.search(url_pattern, host)
|
|
291
317
|
if match:
|
|
292
318
|
original_host = match.group(1)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.
|
|
3
|
+
Version: 0.9.5
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
6
|
Author: DataMarket
|
|
@@ -63,7 +63,7 @@ Provides-Extra: xmltodict
|
|
|
63
63
|
Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
|
|
64
64
|
Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
|
|
65
65
|
Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
|
|
66
|
-
Requires-Dist: boto3 (>=1.
|
|
66
|
+
Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
|
|
67
67
|
Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
|
|
68
68
|
Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
|
|
69
69
|
Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
|
|
@@ -3,9 +3,9 @@ datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
|
3
3
|
datamarket/interfaces/alchemy.py,sha256=V8E1GtokxUNmrUftKTFkIpNoXaqJME7ACES2BY0znQM,4214
|
|
4
4
|
datamarket/interfaces/aws.py,sha256=R6lYdSCD6a4g9l6aFMtNDt_EX3kroe2untDhgy7XG1k,2384
|
|
5
5
|
datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
|
|
6
|
-
datamarket/interfaces/ftp.py,sha256=
|
|
6
|
+
datamarket/interfaces/ftp.py,sha256=VZSxISKquMIVbt-Nvb1HgOB9pwkzYunoror-anZNiiQ,1881
|
|
7
7
|
datamarket/interfaces/nominatim.py,sha256=_gFJ04D-ju5xn3wuaGT5Pj5jhf4F5eINpxOpuQL_dIQ,3664
|
|
8
|
-
datamarket/interfaces/peerdb.py,sha256=
|
|
8
|
+
datamarket/interfaces/peerdb.py,sha256=rNQ1-THcVvrej8BEPJs9zM4VfH5dlByafOIHYN9sB2A,21833
|
|
9
9
|
datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
|
|
10
10
|
datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
|
|
11
11
|
datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -17,7 +17,7 @@ datamarket/utils/main.py,sha256=0Abt3ww1VSPnX4AVKDcYzqDLAOEV_54iUHMLJfre2bg,6129
|
|
|
17
17
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
18
18
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
19
19
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
20
|
-
datamarket-0.9.
|
|
21
|
-
datamarket-0.9.
|
|
22
|
-
datamarket-0.9.
|
|
23
|
-
datamarket-0.9.
|
|
20
|
+
datamarket-0.9.5.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
21
|
+
datamarket-0.9.5.dist-info/METADATA,sha256=0OFyrz2YcKfH1HachMVzhD7C_kL9iODaxEFLTB4e2NI,6362
|
|
22
|
+
datamarket-0.9.5.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
23
|
+
datamarket-0.9.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|