datamarket 0.9.4__tar.gz → 0.9.6__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.

Potentially problematic release.

This version of datamarket might be problematic.

Files changed (23)
  1. {datamarket-0.9.4 → datamarket-0.9.6}/PKG-INFO +2 -2
  2. {datamarket-0.9.4 → datamarket-0.9.6}/pyproject.toml +2 -2
  3. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/aws.py +2 -3
  4. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/ftp.py +3 -4
  5. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/peerdb.py +34 -8
  6. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/utils/main.py +30 -12
  7. {datamarket-0.9.4 → datamarket-0.9.6}/LICENSE +0 -0
  8. {datamarket-0.9.4 → datamarket-0.9.6}/README.md +0 -0
  9. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/__init__.py +0 -0
  10. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/__init__.py +0 -0
  11. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/alchemy.py +0 -0
  12. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/drive.py +0 -0
  13. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/nominatim.py +0 -0
  14. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/proxy.py +0 -0
  15. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/tinybird.py +0 -0
  16. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/params/__init__.py +0 -0
  17. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/params/nominatim.py +0 -0
  18. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/utils/__init__.py +0 -0
  19. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/utils/airflow.py +0 -0
  20. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/utils/alchemy.py +0 -0
  21. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/utils/selenium.py +0 -0
  22. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/utils/soda.py +0 -0
  23. {datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/utils/typer.py +0 -0
{datamarket-0.9.4 → datamarket-0.9.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: datamarket
-Version: 0.9.4
+Version: 0.9.6
 Summary: Utilities that integrate advanced scraping knowledge into just one library.
 License: GPL-3.0-or-later
 Author: DataMarket
@@ -63,7 +63,7 @@ Provides-Extra: xmltodict
 Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0) ; extra == "alchemy"
 Requires-Dist: azure-storage-blob (>=12.0.0,<13.0.0) ; extra == "azure-storage-blob"
 Requires-Dist: beautifulsoup4 (>=4.0.0,<5.0.0)
-Requires-Dist: boto3 (>=1.0.0,<2.0.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
+Requires-Dist: boto3 (>=1.35.0,<1.36.0) ; extra == "boto3" or extra == "aws" or extra == "peerdb"
 Requires-Dist: chompjs (>=1.0.0,<2.0.0) ; extra == "chompjs"
 Requires-Dist: click (>=8.0.0,<9.0.0) ; extra == "click"
 Requires-Dist: clickhouse-driver (>=0.2.0,<0.3.0) ; extra == "clickhouse-driver" or extra == "peerdb"
{datamarket-0.9.4 → datamarket-0.9.6}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "datamarket"
-version = "0.9.4"
+version = "0.9.6"
 description = "Utilities that integrate advanced scraping knowledge into just one library."
 authors = ["DataMarket <techsupport@datamarket.es>"]
 license = "GPL-3.0-or-later"
@@ -27,7 +27,7 @@ croniter = "^3.0.0"
 dynaconf = "^3.0.0"
 jinja2 = "^3.0.0"
 
-boto3 = { version = "^1.0.0", optional = true }
+boto3 = { version = "~1.35.0", optional = true }
 unidecode = { version = "^1.0.0", optional = true }
 lxml = { extras = ["html-clean"], version = "^5.0.0", optional = true }
 tqdm = { version = "^4.0.0", optional = true }
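
Both metadata files pin boto3 to the 1.35 series. In Poetry, the old caret constraint `^1.0.0` means `>=1.0.0,<2.0.0`, while the new tilde constraint `~1.35.0` means `>=1.35.0,<1.36.0`; this is exactly the change mirrored in the PKG-INFO hunk above. A quick sanity check of the two ranges using the third-party packaging library (an illustration only, not part of this package):

    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    caret = SpecifierSet(">=1.0.0,<2.0.0")    # what ^1.0.0 expands to
    tilde = SpecifierSet(">=1.35.0,<1.36.0")  # what ~1.35.0 expands to

    assert Version("1.34.9") in caret          # allowed before the pin
    assert Version("1.34.9") not in tilde      # excluded by the new pin
    assert Version("1.35.42") in tilde         # any 1.35.x patch release still resolves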
{datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/aws.py

@@ -4,8 +4,7 @@
 import io
 import logging
 import boto3
-
-from ..utils.main import Config
+from dynaconf import Dynaconf
 
 ########################################################################################################################
 # CLASSES
@@ -14,7 +13,7 @@ logger = logging.getLogger(__name__)
 
 
 class AWSInterface:
-    def __init__(self, config: Config) -> None:
+    def __init__(self, config: Dynaconf) -> None:
         self.profiles = []
         self.config = config
 
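With the Config alias gone (it was `Union[RawConfigParser, Dynaconf]`, removed from main.py below), the interfaces now annotate their config parameter as Dynaconf directly, so INI-backed RawConfigParser objects are no longer part of the declared contract. A minimal construction sketch, assuming a config.toml that Dynaconf can load (the file name and section layout are assumptions, not shown in this diff):

    from dynaconf import Dynaconf

    from datamarket.interfaces.aws import AWSInterface

    # Hypothetical settings file; only the Dynaconf type is guaranteed by the diff.
    settings = Dynaconf(settings_files=["config.toml"])
    aws = AWSInterface(settings)  # stores the config and starts with an empty profile list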
{datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/ftp.py

@@ -5,16 +5,15 @@ import logging
 from ftplib import FTP, FTP_TLS
 from pathlib import Path
 
-from ..utils.main import Config
+from dynaconf import Dynaconf
 
 ########################################################################################################################
 # CLASSES
 
 logger = logging.getLogger(__name__)
 
-
 class FTPInterface:
-    def __init__(self, config: Config):
+    def __init__(self, config: Dynaconf):
         if "ftp" in config:
             self.config = config["ftp"]
 
@@ -23,7 +22,7 @@ class FTPInterface:
             logger.warning("no ftp section in config")
 
     def get_ftp(self):
-        if self.config["ftps"].lower() == "true":
+        if self.config["ftps"]:
             ftp_conn = FTP_TLS(self.config["server"])
 
         else:
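
Dropping `.lower() == "true"` is a consequence of the INI-to-TOML migration: configparser hands back every value as a string, whereas TOML parses `true` into a real boolean, so plain truthiness suffices. A minimal illustration of the difference (Python 3.11+ for tomllib; the section contents are hypothetical):

    import configparser
    import tomllib

    ini = configparser.RawConfigParser()
    ini.read_string("[ftp]\nftps = true\n")
    print(type(ini["ftp"]["ftps"]))   # <class 'str'>  -> needed .lower() == "true"

    toml = tomllib.loads("[ftp]\nftps = true\n")
    print(type(toml["ftp"]["ftps"]))  # <class 'bool'> -> truthiness just works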
{datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/interfaces/peerdb.py

@@ -216,26 +216,52 @@ class TransientS3:
             self.config = section
             self.bucket_name = self.config["bucket"]
             self.session = boto3.Session(profile_name=self.config["profile"])
-            self.s3_resource = self.session.resource("s3")
+            self.s3_client = self.session.client("s3")
             self.credentials = self.session.get_credentials()
             self.access_key = self.credentials.access_key
             self.secret_key = self.credentials.secret_key
             self.region_name = self.session.region_name
-            self.endpoint_url = self.s3_resource.meta.endpoint_url
+            self.endpoint_url = self.s3_client.meta.endpoint_url
         else:
             logger.warning("no peerdb.s3 section in config")
 
     def delete_paths_with_schema(self, schema_name):
         logger.info(f"Deleting paths containing '{schema_name}' from S3")
 
-        bucket = self.s3_resource.Bucket(self.bucket_name)
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        pages = paginator.paginate(Bucket=self.bucket_name, Delimiter="/")
 
-        for prefix in [schema_name, f"clone_{schema_name}"]:
-            objects_to_delete = bucket.objects.filter(Prefix=prefix)
-            objects_to_delete.delete()
+        for page in pages:
+            if "CommonPrefixes" in page:
+                for prefix in page["CommonPrefixes"]:
+                    folder = prefix["Prefix"]
+                    if schema_name in folder:
+                        self._delete_folder_contents(folder)
 
         logger.info(f"Deleted paths containing '{schema_name}' from S3")
 
+    def _delete_folder_contents(self, folder):
+        logger.info(f"Deleting contents of folder: {folder}")
+
+        paginator = self.s3_client.get_paginator("list_objects_v2")
+        pages = paginator.paginate(Bucket=self.bucket_name, Prefix=folder)
+
+        delete_us = dict(Objects=[])
+        for page in pages:
+            if "Contents" in page:
+                for obj in page["Contents"]:
+                    delete_us["Objects"].append(dict(Key=obj["Key"]))
+
+                    # AWS limits to deleting 1000 objects at a time
+                    if len(delete_us["Objects"]) >= 1000:
+                        self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
+                        delete_us = dict(Objects=[])
+
+        if len(delete_us["Objects"]):
+            self.s3_client.delete_objects(Bucket=self.bucket_name, Delete=delete_us)
+
+        logger.info(f"Deleted contents of folder: {folder}")
+
 
 class PeerDBInterface:
     def __init__(self, config):
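
The rewrite swaps the high-level boto3 resource API for the client API, and it also widens the match: 0.9.4 deleted the exact prefixes `schema_name` and `clone_{schema_name}`, while 0.9.6 deletes every top-level folder whose name merely contains `schema_name`. The manual flush exists because `delete_objects` accepts at most 1000 keys per call (the resource API's `objects.filter(...).delete()` handled that batching internally). A standalone sketch of the same batching pattern; the profile, bucket, and prefix here are placeholders, not values from the package config:

    import boto3

    s3 = boto3.Session(profile_name="default").client("s3")  # placeholder profile
    bucket = "example-bucket"                                # placeholder bucket

    keys = []
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix="clone_myschema/"):
        for obj in page.get("Contents", []):
            keys.append({"Key": obj["Key"]})
            if len(keys) == 1000:  # hard API limit per delete_objects call
                s3.delete_objects(Bucket=bucket, Delete={"Objects": keys})
                keys = []
    if keys:  # flush the final partial batch
        s3.delete_objects(Bucket=bucket, Delete={"Objects": keys})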
@@ -282,11 +308,11 @@ class PeerDBInterface:
         if not self.docker_host_mapping or not host:
             return host
 
-        if host in ["localhost", "127.0.0.1"]:
+        if host in ['localhost', '127.0.0.1']:
             logger.debug(f"Mapping host {host} to {self.docker_host_mapping} for Docker environment")
             return self.docker_host_mapping
 
-        url_pattern = r"(localhost|127\.0\.0\.1)"
+        url_pattern = r'(localhost|127\.0\.0\.1)'
         match = re.search(url_pattern, host)
         if match:
             original_host = match.group(1)
{datamarket-0.9.4 → datamarket-0.9.6}/src/datamarket/utils/main.py

@@ -2,6 +2,7 @@
 # IMPORTS
 
 import asyncio
+from dataclasses import dataclass
 import inspect
 import logging
 import random
@@ -11,23 +12,35 @@ import shutil
 import subprocess
 import time
 from pathlib import Path
-from typing import Literal, Union
+from typing import Literal, Optional, Union
 
 import pendulum
 from croniter import croniter
 from configparser import RawConfigParser
 from dynaconf import Dynaconf, add_converter
 
+logger = logging.getLogger(__name__)
+
 ########################################################################################################################
-# FUNCTIONS
+# CLASSES
+
+
+@dataclass
+class ProjectMetadata:
+    cmd_prefix: str
+    package_name: str
+    env_name: str
+    path: Path
+    config_path: Path
 
-logger = logging.getLogger(__name__)
 
-Config = Union[RawConfigParser, Dynaconf]
+########################################################################################################################
+# FUNCTIONS
 
 
 def get_granular_date(
-    granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str], tz: str = "Europe/Madrid"
+    granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str],
+    tz: str = "Europe/Madrid",
 ) -> pendulum.DateTime:
     """
     Returns the most recent date based on the given granularity or a custom cron expression.
@@ -67,8 +80,10 @@ def read_converter(path_str: str):
 
 
 def get_config(
-    config_file: Path, tz: str = "Europe/Madrid"
-) -> Union[RawConfigParser, Dynaconf]:
+    config_file: Optional[Path] = None, tz: str = "Europe/Madrid"
+) -> Dynaconf:
+    config_file = config_file or get_project_metadata().config_path
+
     if Path(config_file).suffix == ".ini":
         logger.warning("Using legacy INI config reader. Please migrate to TOML")
         cfg = RawConfigParser()
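
`config_file` is now optional: when omitted, get_config falls back to the config.toml at the project root discovered by get_project_metadata() (see the next hunk). Note that the discovery inspects the caller's file path for a src/ component, so the no-argument form assumes a src-layout project. A usage sketch:

    from pathlib import Path

    from datamarket.utils.main import get_config

    settings = get_config()                  # defaults to <project root>/config.toml, returns Dynaconf
    legacy = get_config(Path("config.ini"))  # explicit paths still work; INI logs a migration warning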
@@ -106,17 +121,20 @@ def get_config(
 
     return config
 
-
-def get_project_metadata():
+def get_project_metadata() -> ProjectMetadata:
     caller_frame = inspect.stack()[1]
     current_file_parts = Path(caller_frame.filename).resolve().parts
     src_index = current_file_parts.index("src")
+
     cmd_prefix = "dix vnc run --" if shutil.which("dix") else ""
-    pkg_name = current_file_parts[src_index + 1]
-    env_name = f"{pkg_name}_env"
+    package_name = current_file_parts[src_index + 1]
+    env_name = f"{package_name}_env"
     project_path = Path(*current_file_parts[:src_index])
+    config_path = project_path / "config.toml"
 
-    return {"cmd_prefix": cmd_prefix, "pkg_name": pkg_name, "env_name": env_name, "project_path": project_path}
+    return ProjectMetadata(
+        cmd_prefix, package_name, env_name, project_path, config_path
+    )
 
 
 def set_logger(level):
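
Besides the dict-to-dataclass change, two fields were renamed: `pkg_name` is now `package_name`, and `project_path` is now stored in the dataclass field `path`. Callers therefore migrate from dict indexing to attribute access; a quick before/after sketch:

    from datamarket.utils.main import get_project_metadata

    meta = get_project_metadata()

    # 0.9.4: meta["pkg_name"], meta["project_path"]
    # 0.9.6:
    print(meta.package_name)  # the directory name directly under src/
    print(meta.env_name)      # f"{package_name}_env"
    print(meta.path)          # project root (path components before src/)
    print(meta.config_path)   # meta.path / "config.toml"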