datamarket 0.7.20__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. See the registry advisory for more details.

Files changed (24)
  1. {datamarket-0.7.20 → datamarket-0.8.0}/PKG-INFO +9 -7
  2. {datamarket-0.7.20 → datamarket-0.8.0}/pyproject.toml +7 -5
  3. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/aws.py +8 -10
  4. datamarket-0.8.0/src/datamarket/utils/main.py +167 -0
  5. datamarket-0.7.20/src/datamarket/utils/main.py +0 -94
  6. {datamarket-0.7.20 → datamarket-0.8.0}/LICENSE +0 -0
  7. {datamarket-0.7.20 → datamarket-0.8.0}/README.md +0 -0
  8. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/__init__.py +0 -0
  9. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/__init__.py +0 -0
  10. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/alchemy.py +0 -0
  11. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/drive.py +0 -0
  12. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/ftp.py +0 -0
  13. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/nominatim.py +0 -0
  14. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/peerdb.py +0 -0
  15. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/proxy.py +0 -0
  16. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/tinybird.py +0 -0
  17. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/params/__init__.py +0 -0
  18. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/params/nominatim.py +0 -0
  19. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/__init__.py +0 -0
  20. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/airflow.py +0 -0
  21. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/alchemy.py +0 -0
  22. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/selenium.py +0 -0
  23. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/soda.py +0 -0
  24. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/typer.py +0 -0
@@ -1,8 +1,7 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.20
3
+ Version: 0.8.0
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
- Home-page: https://datamarket.es
6
5
  License: GPL-3.0-or-later
7
6
  Author: DataMarket
8
7
  Author-email: techsupport@datamarket.es
@@ -22,7 +21,6 @@ Provides-Extra: boto3
22
21
  Provides-Extra: chompjs
23
22
  Provides-Extra: click
24
23
  Provides-Extra: clickhouse-driver
25
- Provides-Extra: croniter
26
24
  Provides-Extra: datetime
27
25
  Provides-Extra: demjson3
28
26
  Provides-Extra: dnspython
@@ -36,6 +34,7 @@ Provides-Extra: google-api-python-client
36
34
  Provides-Extra: google-auth-httplib2
37
35
  Provides-Extra: google-auth-oauthlib
38
36
  Provides-Extra: html2text
37
+ Provides-Extra: httpx
39
38
  Provides-Extra: json5
40
39
  Provides-Extra: lxml
41
40
  Provides-Extra: nodriver
@@ -43,7 +42,6 @@ Provides-Extra: openpyxl
43
42
  Provides-Extra: pandas
44
43
  Provides-Extra: pandera
45
44
  Provides-Extra: peerdb
46
- Provides-Extra: pendulum
47
45
  Provides-Extra: pillow
48
46
  Provides-Extra: playwright
49
47
  Provides-Extra: playwright-stealth
@@ -70,11 +68,12 @@ Requires-Dist: boto3 (==1.35.53) ; extra == "boto3" or extra == "aws" or extra =
70
68
  Requires-Dist: chompjs (==1.3.0) ; extra == "chompjs"
71
69
  Requires-Dist: click (==8.1.7) ; extra == "click"
72
70
  Requires-Dist: clickhouse-driver (==0.2.9) ; extra == "clickhouse-driver" or extra == "peerdb"
73
- Requires-Dist: croniter (==3.0.4) ; extra == "croniter"
71
+ Requires-Dist: croniter (==3.0.4)
74
72
  Requires-Dist: datetime (==5.5) ; extra == "datetime"
75
73
  Requires-Dist: demjson3 (==3.0.6) ; extra == "demjson3"
76
74
  Requires-Dist: dnspython (==2.7.0) ; extra == "dnspython"
77
75
  Requires-Dist: duckduckgo-search (==6.2.11b1) ; extra == "duckduckgo-search"
76
+ Requires-Dist: dynaconf (==3.2.6)
78
77
  Requires-Dist: fake-useragent (==1.5.1) ; extra == "fake-useragent"
79
78
  Requires-Dist: geoalchemy2 (==0.15.2) ; extra == "geoalchemy2"
80
79
  Requires-Dist: geopandas (==1.0.1) ; extra == "geopandas"
@@ -83,13 +82,15 @@ Requires-Dist: google-api-python-client (==2.151.0) ; extra == "google-api-pytho
83
82
  Requires-Dist: google-auth-httplib2 (==0.2.0) ; extra == "google-auth-httplib2"
84
83
  Requires-Dist: google-auth-oauthlib (==1.2.1) ; extra == "google-auth-oauthlib"
85
84
  Requires-Dist: html2text (==2024.2.26) ; extra == "html2text"
85
+ Requires-Dist: httpx[http2] (==0.28.1) ; extra == "httpx"
86
+ Requires-Dist: jinja2 (==3.1.5)
86
87
  Requires-Dist: json5 (==0.9.25) ; extra == "json5"
87
88
  Requires-Dist: lxml[html-clean] (==5.3.0) ; extra == "lxml"
88
89
  Requires-Dist: nodriver (==0.37) ; extra == "nodriver"
89
90
  Requires-Dist: openpyxl (==3.1.5) ; extra == "openpyxl"
90
91
  Requires-Dist: pandas (==2.2.3) ; extra == "pandas"
91
92
  Requires-Dist: pandera (==0.20.4) ; extra == "pandera"
92
- Requires-Dist: pendulum (==3.0.0) ; extra == "pendulum"
93
+ Requires-Dist: pendulum (==3.0.0)
93
94
  Requires-Dist: pillow (==11.0.0) ; extra == "pillow"
94
95
  Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
95
96
  Requires-Dist: playwright-stealth (==1.0.6) ; extra == "playwright-stealth"
@@ -114,6 +115,7 @@ Requires-Dist: undetected-chromedriver (==3.5.5) ; extra == "undetected-chromedr
114
115
  Requires-Dist: unidecode (==1.3.8) ; extra == "unidecode"
115
116
  Requires-Dist: xmltodict (==0.14.2) ; extra == "xmltodict"
116
117
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
118
+ Project-URL: Homepage, https://datamarket.es
117
119
  Project-URL: Repository, https://github.com/Data-Market/datamarket
118
120
  Description-Content-Type: text/markdown
119
121
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.7.20"
3
+ version = "0.8.0"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -23,12 +23,15 @@ requests = "2.32.3"
23
23
  tenacity = "9.0.0"
24
24
  beautifulsoup4 = "4.12.3"
25
25
  pre-commit = "4.0.1"
26
+ pendulum = "3.0.0"
27
+ croniter = "3.0.4"
28
+ dynaconf = "3.2.6"
29
+ jinja2 = "3.1.5"
26
30
 
27
31
  boto3 = { version = "1.35.53", optional = true }
28
32
  unidecode = { version = "1.3.8", optional = true }
29
33
  lxml = { extras = ["html-clean"], version = "5.3.0", optional = true }
30
34
  tqdm = { version = "4.66.6", optional = true }
31
- pendulum = { version = "3.0.0", optional = true }
32
35
  pandas = { version = "2.2.3", optional = true }
33
36
  pyarrow = { version = "17.0.0", optional = true }
34
37
  pytest = { version = "8.3.3", optional = true }
@@ -36,7 +39,6 @@ playwright = { version = "1.47.0", optional = true }
36
39
  playwright-stealth = { version = "1.0.6", optional = true }
37
40
  soda-core-postgres = { version = "3.4.1", optional = true }
38
41
  fake-useragent = { version = "1.5.1", optional = true }
39
- croniter = { version = "3.0.4", optional = true }
40
42
  pydrive2 = { version = "1.20.0", optional = true }
41
43
  clickhouse-driver = { version = "0.2.9", optional = true }
42
44
  stem = { version = "1.8.2", optional = true }
@@ -67,13 +69,13 @@ google-auth-httplib2 = { version = "0.2.0", optional = true }
67
69
  google-auth-oauthlib = { version = "1.2.1", optional = true }
68
70
  dnspython = { version = "2.7.0", optional = true }
69
71
  openpyxl = { version = "3.1.5", optional = true }
72
+ httpx = { extras = ["http2"], version = "0.28.1", optional = true }
70
73
 
71
74
  [tool.poetry.extras]
72
75
  boto3 = ["boto3"]
73
76
  unidecode = ["unidecode"]
74
77
  lxml = ["lxml"]
75
78
  tqdm = ["tqdm"]
76
- pendulum = ["pendulum"]
77
79
  pandas = ["pandas"]
78
80
  pyarrow = ["pyarrow"]
79
81
  pytest = ["pytest"]
@@ -81,7 +83,6 @@ playwright = ["playwright"]
81
83
  playwright-stealth = ["playwright-stealth"]
82
84
  soda-core-postgres = ["soda-core-postgres"]
83
85
  fake-useragent = ["fake-useragent"]
84
- croniter = ["croniter"]
85
86
  pydrive2 = ["pydrive2"]
86
87
  clickhouse-driver = ["clickhouse-driver"]
87
88
  stem = ["stem"]
@@ -112,6 +113,7 @@ google-auth-httplib2 = ["google-auth-httplib2"]
112
113
  google-auth-oauthlib = ["google-auth-oauthlib"]
113
114
  dnspython = ["dnspython"]
114
115
  openpyxl = ["openpyxl"]
116
+ httpx = ["httpx"]
115
117
 
116
118
  # Interface groups
117
119
  aws = ["boto3"]
@@ -16,16 +16,14 @@ class AWSInterface:
16
16
  self.profiles = []
17
17
  self.config = config
18
18
 
19
- for section in self.config.sections():
20
- if section.startswith("aws:"):
21
- profile_name = section.split(":", 1)[1]
22
- self.profiles.append(
23
- {
24
- "profile": profile_name,
25
- "bucket": self.config[section]["bucket"],
26
- "session": boto3.Session(profile_name=profile_name),
27
- }
28
- )
19
+ for profile_name, values in self.config.get("aws", {}).items():
20
+ self.profiles.append(
21
+ {
22
+ "profile": profile_name,
23
+ "bucket": values["bucket"],
24
+ "session": boto3.Session(profile_name=profile_name),
25
+ }
26
+ )
29
27
 
30
28
  if not self.profiles:
31
29
  logger.warning("No AWS profiles found in config file")
@@ -0,0 +1,167 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import configparser
5
+ import inspect
6
+ import logging
7
+ import random
8
+ import re
9
+ import shlex
10
+ import shutil
11
+ import subprocess
12
+ import time
13
+ from pathlib import Path
14
+ from typing import Literal, Union
15
+
16
+ import pendulum
17
+ from croniter import croniter
18
+ from dynaconf import Dynaconf, add_converter
19
+
20
+ ########################################################################################################################
21
+ # FUNCTIONS
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ def get_granular_date(
27
+ granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str], tz: str = "Europe/Madrid"
28
+ ) -> str:
29
+ """
30
+ Returns the most recent date based on the given granularity or a custom cron expression.
31
+
32
+ Args:
33
+ granularity: Either a predefined value ("monthly", "biweekly", "weekly") or a custom cron expression.
34
+ tz: Timezone to use for date calculations (default: "Europe/Madrid").
35
+
36
+ Returns:
37
+ A string representing the most recent date in the format "YYYY-MM-DD".
38
+
39
+ Raises:
40
+ ValueError: If the provided granularity or cron expression is invalid.
41
+ """
42
+ now = pendulum.now(tz)
43
+
44
+ predefined_patterns = {
45
+ "monthly": "0 0 1 * *",
46
+ "biweekly": "0 0 1,15 * *",
47
+ "weekly": "0 0 * * MON",
48
+ "daily": "0 0 * * *",
49
+ }
50
+
51
+ cron_pattern = predefined_patterns.get(granularity, granularity)
52
+
53
+ try:
54
+ cron = croniter(cron_pattern, now)
55
+ return cron.get_prev(pendulum.DateTime).strftime("%Y-%m-%d")
56
+ except Exception as e:
57
+ raise ValueError("Invalid cron expression or granularity specified.") from e
58
+
59
+
60
+ def read_converter(path_str: str):
61
+ with open(path_str) as f:
62
+ return f.read()
63
+
64
+
65
+ def get_config(config_file: Path, tz: str = "Europe/Madrid"):
66
+ if Path(config_file).suffix == "ini":
67
+ logger.warning("Using legacy INI config reader. Please migrate to TOML")
68
+ cfg = configparser.RawConfigParser()
69
+ return cfg.read(config_file)
70
+
71
+ add_converter("read", read_converter)
72
+
73
+ config = Dynaconf(
74
+ environments=True,
75
+ env_switcher="SYSTYPE",
76
+ vars={
77
+ "today": get_granular_date("daily", tz),
78
+ "biweekly_date": get_granular_date("biweekly", tz),
79
+ },
80
+ )
81
+
82
+ config.load_file(path=config_file)
83
+ config.load_file(path=Path.home() / config_file.name)
84
+ return config
85
+
86
+
87
+ def get_project_metadata():
88
+ caller_frame = inspect.stack()[1]
89
+ current_file_parts = Path(caller_frame.filename).resolve().parts
90
+ src_index = current_file_parts.index("src")
91
+ cmd_prefix = "dix vnc run --" if shutil.which("dix") else ""
92
+ pkg_name = current_file_parts[src_index + 1]
93
+ env_name = f"{pkg_name}_env"
94
+ project_path = Path(*current_file_parts[:src_index])
95
+
96
+ return {"cmd_prefix": cmd_prefix, "pkg_name": pkg_name, "env_name": env_name, "project_path": project_path}
97
+
98
+
99
+ def set_logger(level):
100
+ log = logging.getLogger()
101
+ log.setLevel(logging.DEBUG)
102
+ ch = logging.StreamHandler()
103
+ ch.setLevel(level.upper())
104
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
105
+ ch.setFormatter(formatter)
106
+ log.addHandler(ch)
107
+
108
+
109
+ def ban_sleep(max_time, min_time=0):
110
+ sleep_time = int(random.uniform(min_time, max_time)) # noqa: S311
111
+ logger.info(f"sleeping for {sleep_time} seconds...")
112
+ time.sleep(sleep_time)
113
+
114
+
115
+ def run_bash_command(command):
116
+ p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
117
+
118
+ text_lines = []
119
+ for line_b in iter(p.stdout.readline, ""):
120
+ line_str = line_b.decode().strip()
121
+
122
+ if not line_str:
123
+ break
124
+
125
+ logger.info(line_str)
126
+ text_lines.append(line_str)
127
+
128
+ return "\n".join(text_lines)
129
+
130
+
131
+ def text_to_int(text):
132
+ max_int32 = 2147483647
133
+ parsed_str = re.sub(r"[^\d]", "", text)
134
+ if parsed_str:
135
+ num = int(parsed_str)
136
+ else:
137
+ return None
138
+
139
+ if -max_int32 < num < max_int32:
140
+ return num
141
+
142
+
143
+ def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
144
+ while pendulum.now(tz=tz).hour >= to_h or pendulum.now(tz=tz).hour < from_h:
145
+ logger.warning("time to sleep and not scrape anything...")
146
+ ban_sleep(seconds, seconds)
147
+
148
+
149
+ def sleep_in_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
150
+ while from_h <= pendulum.now(tz=tz).hour < to_h:
151
+ logger.warning("time to sleep and not scrape anything...")
152
+ ban_sleep(seconds, seconds)
153
+
154
+
155
+ def parse_field(dict_struct, field_path, format_method=None):
156
+ if not isinstance(field_path, list):
157
+ raise ValueError("Argument field_path must be of type list")
158
+
159
+ field_value = dict_struct
160
+ for field in field_path:
161
+ if isinstance(field_value, dict):
162
+ field_value = field_value.get(field)
163
+ elif isinstance(field_value, list):
164
+ field_value = field_value[field] if len(field_value) > field else None
165
+ if field_value is None:
166
+ return None
167
+ return format_method(field_value) if format_method else field_value
@@ -1,94 +0,0 @@
1
- ########################################################################################################################
2
- # IMPORTS
3
-
4
- import configparser
5
- import logging
6
- import random
7
- import re
8
- import shlex
9
- import subprocess
10
- import time
11
-
12
- import pendulum
13
-
14
- ########################################################################################################################
15
- # FUNCTIONS
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- def get_config(config_path):
21
- cfg = configparser.RawConfigParser()
22
- cfg.read(config_path)
23
- return cfg
24
-
25
-
26
- def set_logger(level):
27
- log = logging.getLogger()
28
- log.setLevel(logging.DEBUG)
29
- ch = logging.StreamHandler()
30
- ch.setLevel(level.upper())
31
- formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
32
- ch.setFormatter(formatter)
33
- log.addHandler(ch)
34
-
35
-
36
- def ban_sleep(max_time, min_time=0):
37
- sleep_time = int(random.uniform(min_time, max_time))
38
- logger.info(f"sleeping for {sleep_time} seconds...")
39
- time.sleep(sleep_time)
40
-
41
-
42
- def run_bash_command(command):
43
- p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
44
-
45
- text_lines = []
46
- for line_b in iter(p.stdout.readline, ""):
47
- line_str = line_b.decode().strip()
48
-
49
- if not line_str:
50
- break
51
-
52
- logger.info(line_str)
53
- text_lines.append(line_str)
54
-
55
- return "\n".join(text_lines)
56
-
57
-
58
- def text_to_int(text):
59
- max_int32 = 2147483647
60
- parsed_str = re.sub(r"[^\d]", "", text)
61
- if parsed_str:
62
- num = int(parsed_str)
63
- else:
64
- return None
65
-
66
- if -max_int32 < num < max_int32:
67
- return num
68
-
69
-
70
- def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
71
- while pendulum.now(tz=tz).hour >= to_h or pendulum.now(tz=tz).hour < from_h:
72
- logger.warning("time to sleep and not scrape anything...")
73
- ban_sleep(seconds, seconds)
74
-
75
-
76
- def sleep_in_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
77
- while from_h <= pendulum.now(tz=tz).hour < to_h:
78
- logger.warning("time to sleep and not scrape anything...")
79
- ban_sleep(seconds, seconds)
80
-
81
-
82
- def parse_field(dict_struct, field_path, format_method=None):
83
- if not isinstance(field_path, list):
84
- raise ValueError("Argument field_path must be of type list")
85
-
86
- field_value = dict_struct
87
- for field in field_path:
88
- if isinstance(field_value, dict):
89
- field_value = field_value.get(field)
90
- elif isinstance(field_value, list):
91
- field_value = field_value[field] if len(field_value) > field else None
92
- if field_value is None:
93
- return None
94
- return format_method(field_value) if format_method else field_value
File without changes
File without changes