datamarket 0.9.4__py3-none-any.whl → 0.9.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datamarket might be problematic. Click here for more details.
- datamarket/interfaces/aws.py +2 -3
- datamarket/interfaces/ftp.py +3 -4
- datamarket/interfaces/peerdb.py +34 -8
- datamarket/utils/main.py +30 -12
- {datamarket-0.9.4.dist-info → datamarket-0.9.6.dist-info}/METADATA +2 -2
- {datamarket-0.9.4.dist-info → datamarket-0.9.6.dist-info}/RECORD +8 -8
- {datamarket-0.9.4.dist-info → datamarket-0.9.6.dist-info}/LICENSE +0 -0
- {datamarket-0.9.4.dist-info → datamarket-0.9.6.dist-info}/WHEEL +0 -0
datamarket/interfaces/aws.py
CHANGED
|
@@ -4,8 +4,7 @@
|
|
|
4
4
|
import io
|
|
5
5
|
import logging
|
|
6
6
|
import boto3
|
|
7
|
-
|
|
8
|
-
from ..utils.main import Config
|
|
7
|
+
from dynaconf import Dynaconf
|
|
9
8
|
|
|
10
9
|
########################################################################################################################
|
|
11
10
|
# CLASSES
|
|
@@ -14,7 +13,7 @@ logger = logging.getLogger(__name__)
|
|
|
14
13
|
|
|
15
14
|
|
|
16
15
|
class AWSInterface:
|
|
17
|
-
def __init__(self, config:
|
|
16
|
+
def __init__(self, config: Dynaconf) -> None:
|
|
18
17
|
self.profiles = []
|
|
19
18
|
self.config = config
|
|
20
19
|
|
datamarket/interfaces/ftp.py
CHANGED
|
@@ -5,16 +5,15 @@ import logging
|
|
|
5
5
|
from ftplib import FTP, FTP_TLS
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
from
|
|
8
|
+
from dynaconf import Dynaconf
|
|
9
9
|
|
|
10
10
|
########################################################################################################################
|
|
11
11
|
# CLASSES
|
|
12
12
|
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
14
14
|
|
|
15
|
-
|
|
16
15
|
class FTPInterface:
|
|
17
|
-
def __init__(self, config:
|
|
16
|
+
def __init__(self, config: Dynaconf):
|
|
18
17
|
if "ftp" in config:
|
|
19
18
|
self.config = config["ftp"]
|
|
20
19
|
|
|
@@ -23,7 +22,7 @@ class FTPInterface:
|
|
|
23
22
|
logger.warning("no ftp section in config")
|
|
24
23
|
|
|
25
24
|
def get_ftp(self):
|
|
26
|
-
if self.config["ftps"]
|
|
25
|
+
if self.config["ftps"]:
|
|
27
26
|
ftp_conn = FTP_TLS(self.config["server"])
|
|
28
27
|
|
|
29
28
|
else:
|
datamarket/interfaces/peerdb.py
CHANGED
|
@@ -216,26 +216,52 @@ class TransientS3:
|
|
|
216
216
|
self.config = section
|
|
217
217
|
self.bucket_name = self.config["bucket"]
|
|
218
218
|
self.session = boto3.Session(profile_name=self.config["profile"])
|
|
219
|
-
self.
|
|
219
|
+
self.s3_client = self.session.client("s3")
|
|
220
220
|
self.credentials = self.session.get_credentials()
|
|
221
221
|
self.access_key = self.credentials.access_key
|
|
222
222
|
self.secret_key = self.credentials.secret_key
|
|
223
223
|
self.region_name = self.session.region_name
|
|
224
|
-
self.endpoint_url = self.
|
|
224
|
+
self.endpoint_url = self.s3_client.meta.endpoint_url
|
|
225
225
|
else:
|
|
226
226
|
logger.warning("no peerdb.s3 section in config")
|
|
227
227
|
|
|
228
228
|
def delete_paths_with_schema(self, schema_name):
|
|
229
229
|
logger.info(f"Deleting paths containing '{schema_name}' from S3")
|
|
230
230
|
|
|
231
|
-
|
|
231
|
+
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
232
|
+
pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
|
|
232
233
|
|
|
233
|
-
for
|
|
234
|
-
|
|
235
|
-
|
|
234
|
+
for page in pages:
|
|
235
|
+
if "CommonPrefixes" in page:
|
|
236
|
+
for prefix in page["CommonPrefixes"]:
|
|
237
|
+
folder = prefix["Prefix"]
|
|
238
|
+
if schema_name in folder:
|
|
239
|
+
self._delete_folder_contents(folder)
|
|
236
240
|
|
|
237
241
|
logger.info(f"Deleted paths containing '{schema_name}' from S3")
|
|
238
242
|
|
|
243
|
+
def _delete_folder_contents(self, folder):
|
|
244
|
+
logger.info(f"Deleting contents of folder: {folder}")
|
|
245
|
+
|
|
246
|
+
paginator = self.s3_client.get_paginator("list_objects_v2")
|
|
247
|
+
pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
|
|
248
|
+
|
|
249
|
+
delete_us = dict(Objects=[])
|
|
250
|
+
for page in pages:
|
|
251
|
+
if "Contents" in page:
|
|
252
|
+
for obj in page["Contents"]:
|
|
253
|
+
delete_us["Objects"].append(dict(Key=obj["Key"]))
|
|
254
|
+
|
|
255
|
+
# AWS limits to deleting 1000 objects at a time
|
|
256
|
+
if len(delete_us["Objects"]) >= 1000:
|
|
257
|
+
self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
|
|
258
|
+
delete_us = dict(Objects=[])
|
|
259
|
+
|
|
260
|
+
if len(delete_us["Objects"]):
|
|
261
|
+
self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
|
|
262
|
+
|
|
263
|
+
logger.info(f"Deleted contents of folder: {folder}")
|
|
264
|
+
|
|
239
265
|
|
|
240
266
|
class PeerDBInterface:
|
|
241
267
|
def __init__(self, config):
|
|
@@ -282,11 +308,11 @@ class PeerDBInterface:
|
|
|
282
308
|
if not self.docker_host_mapping or not host:
|
|
283
309
|
return host
|
|
284
310
|
|
|
285
|
-
if host in [
|
|
311
|
+
if host in ['localhost', '127.0.0.1']:
|
|
286
312
|
logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
|
|
287
313
|
return self.docker_host_mapping
|
|
288
314
|
|
|
289
|
-
url_pattern = r
|
|
315
|
+
url_pattern = r'(localhost|127\.0\.0\.1)'
|
|
290
316
|
match = re.search(url_pattern, host)
|
|
291
317
|
if match:
|
|
292
318
|
original_host = match.group(1)
|
datamarket/utils/main.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# IMPORTS
|
|
3
3
|
|
|
4
4
|
import asyncio
|
|
5
|
+
from dataclasses import dataclass
|
|
5
6
|
import inspect
|
|
6
7
|
import logging
|
|
7
8
|
import random
|
|
@@ -11,23 +12,35 @@ import shutil
|
|
|
11
12
|
import subprocess
|
|
12
13
|
import time
|
|
13
14
|
from pathlib import Path
|
|
14
|
-
from typing import Literal, Union
|
|
15
|
+
from typing import Literal, Optional, Union
|
|
15
16
|
|
|
16
17
|
import pendulum
|
|
17
18
|
from croniter import croniter
|
|
18
19
|
from configparser import RawConfigParser
|
|
19
20
|
from dynaconf import Dynaconf, add_converter
|
|
20
21
|
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
21
24
|
########################################################################################################################
|
|
22
|
-
#
|
|
25
|
+
# CLASSES
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
|
|
29
|
+
class ProjectMetadata:
|
|
30
|
+
cmd_prefix: str
|
|
31
|
+
package_name: str
|
|
32
|
+
env_name: str
|
|
33
|
+
path: Path
|
|
34
|
+
config_path: Path
|
|
23
35
|
|
|
24
|
-
logger = logging.getLogger(__name__)
|
|
25
36
|
|
|
26
|
-
|
|
37
|
+
########################################################################################################################
|
|
38
|
+
# FUNCTIONS
|
|
27
39
|
|
|
28
40
|
|
|
29
41
|
def get_granular_date(
|
|
30
|
-
granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str],
|
|
42
|
+
granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str],
|
|
43
|
+
tz: str = "Europe/Madrid",
|
|
31
44
|
) -> pendulum.DateTime:
|
|
32
45
|
"""
|
|
33
46
|
Returns the most recent date based on the given granularity or a custom cron expression.
|
|
@@ -67,8 +80,10 @@ def read_converter(path_str: str):
|
|
|
67
80
|
|
|
68
81
|
|
|
69
82
|
def get_config(
|
|
70
|
-
config_file: Path, tz: str = "Europe/Madrid"
|
|
71
|
-
) ->
|
|
83
|
+
config_file: Optional[Path] = None, tz: str = "Europe/Madrid"
|
|
84
|
+
) -> Dynaconf:
|
|
85
|
+
config_file = config_file or get_project_metadata().config_path
|
|
86
|
+
|
|
72
87
|
if Path(config_file).suffix == ".ini":
|
|
73
88
|
logger.warning("Using legacy INI config reader. Please migrate to TOML")
|
|
74
89
|
cfg = RawConfigParser()
|
|
@@ -106,17 +121,20 @@ def get_config(
|
|
|
106
121
|
|
|
107
122
|
return config
|
|
108
123
|
|
|
109
|
-
|
|
110
|
-
def get_project_metadata():
|
|
124
|
+
def get_project_metadata() -> ProjectMetadata:
|
|
111
125
|
caller_frame = inspect.stack()[1]
|
|
112
126
|
current_file_parts = Path(caller_frame.filename).resolve().parts
|
|
113
127
|
src_index = current_file_parts.index("src")
|
|
128
|
+
|
|
114
129
|
cmd_prefix = "dix vnc run --" if shutil.which("dix") else ""
|
|
115
|
-
|
|
116
|
-
env_name = f"{
|
|
130
|
+
package_name = current_file_parts[src_index + 1]
|
|
131
|
+
env_name = f"{package_name}_env"
|
|
117
132
|
project_path = Path(*current_file_parts[:src_index])
|
|
133
|
+
config_path = project_path / "config.toml"
|
|
118
134
|
|
|
119
|
-
return
|
|
135
|
+
return ProjectMetadata(
|
|
136
|
+
cmd_prefix, package_name, env_name, project_path, config_path
|
|
137
|
+
)
|
|
120
138
|
|
|
121
139
|
|
|
122
140
|
def set_logger(level):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: datamarket
|
|
3
|
-
Version: 0.9.4
|
|
3
|
+
Version: 0.9.6
|
|
4
4
|
Summary: Utilities that integrate advanced scraping knowledge into just one library.
|
|
5
5
|
License: GPL-3.0-or-later
|
|
6
6
|
Author: DataMarket
|
|
@@ -63,7 +63,7 @@ Provides-Extra: xmltodict
|
|
|
63
63
|
Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
|
|
64
64
|
Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
|
|
65
65
|
Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
|
|
66
|
-
Requires-Dist: boto3 (>=1.
|
|
66
|
+
Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
|
|
67
67
|
Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
|
|
68
68
|
Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
|
|
69
69
|
Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
datamarket/__init__.py,sha256=FHS77P9qNewKMoN-p0FLEUEC60oWIYup1QkbJZP4ays,12
|
|
2
2
|
datamarket/interfaces/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
3
|
datamarket/interfaces/alchemy.py,sha256=V8E1GtokxUNmrUftKTFkIpNoXaqJME7ACES2BY0znQM,4214
|
|
4
|
-
datamarket/interfaces/aws.py,sha256=
|
|
4
|
+
datamarket/interfaces/aws.py,sha256=7KLUeBxmPN7avEMPsu5HC_KHB1N7W6Anp2X8fo43mlw,2383
|
|
5
5
|
datamarket/interfaces/drive.py,sha256=shbV5jpQVe_KPE-8Idx6Z9te5Zu1SmVfrvSAyd9ZIgE,2915
|
|
6
|
-
datamarket/interfaces/ftp.py,sha256=
|
|
6
|
+
datamarket/interfaces/ftp.py,sha256=o0KlJxtksbop9OjCiQRzyAa2IeG_ExVXagS6apwrAQo,1881
|
|
7
7
|
datamarket/interfaces/nominatim.py,sha256=_gFJ04D-ju5xn3wuaGT5Pj5jhf4F5eINpxOpuQL_dIQ,3664
|
|
8
|
-
datamarket/interfaces/peerdb.py,sha256=
|
|
8
|
+
datamarket/interfaces/peerdb.py,sha256=rNQ1-THcVvrej8BEPJs9zM4VfH5dlByafOIHYN9sB2A,21833
|
|
9
9
|
datamarket/interfaces/proxy.py,sha256=updoOStKd8-nQBbxWbnD9eOt6HksnYi-5dQ0rEySf5M,3152
|
|
10
10
|
datamarket/interfaces/tinybird.py,sha256=AYrcRGNOCoCt7ojilkWa27POROee9sTCwZ61GGHEPeM,2698
|
|
11
11
|
datamarket/params/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -13,11 +13,11 @@ datamarket/params/nominatim.py,sha256=pBYRfoBkkLBg2INbFymefmYSzaAVujQSpEro5c1hD_
|
|
|
13
13
|
datamarket/utils/__init__.py,sha256=8D5a8oKgqd6WA1RUkiKCn4l_PVemtyuckxQut0vDHXM,20
|
|
14
14
|
datamarket/utils/airflow.py,sha256=al0vc0YUikNu3Oy51VSn52I7pMU40akFBOl_UlHa2E4,795
|
|
15
15
|
datamarket/utils/alchemy.py,sha256=SRq6kgh1aANXVShBPgAuglmNhZssPWwWEY503gKSia8,635
|
|
16
|
-
datamarket/utils/main.py,sha256=
|
|
16
|
+
datamarket/utils/main.py,sha256=O6rX-65h4h0j2zs9dofdTPlly5reKDnvgLtTwbLmbWg,6529
|
|
17
17
|
datamarket/utils/selenium.py,sha256=IMKlbLzXABFhACnWzhHmB0l2hhVzNwHGZwbo14nEewQ,2499
|
|
18
18
|
datamarket/utils/soda.py,sha256=eZTXFbI1P3WoMd1MM-YjoVTpdjTcDSWuvBb7ViBMhSQ,941
|
|
19
19
|
datamarket/utils/typer.py,sha256=FDF3l6gh3UlAFPsHCtesnekvct2rKz0oFn3uKARBQvE,814
|
|
20
|
-
datamarket-0.9.4.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
21
|
-
datamarket-0.9.
|
|
22
|
-
datamarket-0.9.4.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
23
|
-
datamarket-0.9.4.dist-info/RECORD,,
|
|
20
|
+
datamarket-0.9.6.dist-info/LICENSE,sha256=OXLcl0T2SZ8Pmy2_dmlvKuetivmyPd5m1q-Gyd-zaYY,35149
|
|
21
|
+
datamarket-0.9.6.dist-info/METADATA,sha256=8navfRiIA2UGaMQCWCsq0-LQBDzVpfYPlAH_RzLsams,6362
|
|
22
|
+
datamarket-0.9.6.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
|
|
23
|
+
datamarket-0.9.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|