datamarket 0.7.20__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datamarket might be problematic. See the registry advisory for more details.

Files changed (24)
  1. {datamarket-0.7.20 → datamarket-0.8.0}/PKG-INFO +9 -7
  2. {datamarket-0.7.20 → datamarket-0.8.0}/pyproject.toml +7 -5
  3. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/aws.py +8 -10
  4. datamarket-0.8.0/src/datamarket/utils/main.py +167 -0
  5. datamarket-0.7.20/src/datamarket/utils/main.py +0 -94
  6. {datamarket-0.7.20 → datamarket-0.8.0}/LICENSE +0 -0
  7. {datamarket-0.7.20 → datamarket-0.8.0}/README.md +0 -0
  8. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/__init__.py +0 -0
  9. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/__init__.py +0 -0
  10. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/alchemy.py +0 -0
  11. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/drive.py +0 -0
  12. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/ftp.py +0 -0
  13. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/nominatim.py +0 -0
  14. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/peerdb.py +0 -0
  15. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/proxy.py +0 -0
  16. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/interfaces/tinybird.py +0 -0
  17. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/params/__init__.py +0 -0
  18. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/params/nominatim.py +0 -0
  19. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/__init__.py +0 -0
  20. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/airflow.py +0 -0
  21. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/alchemy.py +0 -0
  22. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/selenium.py +0 -0
  23. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/soda.py +0 -0
  24. {datamarket-0.7.20 → datamarket-0.8.0}/src/datamarket/utils/typer.py +0 -0
@@ -1,8 +1,7 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: datamarket
3
- Version: 0.7.20
3
+ Version: 0.8.0
4
4
  Summary: Utilities that integrate advanced scraping knowledge into just one library.
5
- Home-page: https://datamarket.es
6
5
  License: GPL-3.0-or-later
7
6
  Author: DataMarket
8
7
  Author-email: techsupport@datamarket.es
@@ -22,7 +21,6 @@ Provides-Extra: boto3
22
21
  Provides-Extra: chompjs
23
22
  Provides-Extra: click
24
23
  Provides-Extra: clickhouse-driver
25
- Provides-Extra: croniter
26
24
  Provides-Extra: datetime
27
25
  Provides-Extra: demjson3
28
26
  Provides-Extra: dnspython
@@ -36,6 +34,7 @@ Provides-Extra: google-api-python-client
36
34
  Provides-Extra: google-auth-httplib2
37
35
  Provides-Extra: google-auth-oauthlib
38
36
  Provides-Extra: html2text
37
+ Provides-Extra: httpx
39
38
  Provides-Extra: json5
40
39
  Provides-Extra: lxml
41
40
  Provides-Extra: nodriver
@@ -43,7 +42,6 @@ Provides-Extra: openpyxl
43
42
  Provides-Extra: pandas
44
43
  Provides-Extra: pandera
45
44
  Provides-Extra: peerdb
46
- Provides-Extra: pendulum
47
45
  Provides-Extra: pillow
48
46
  Provides-Extra: playwright
49
47
  Provides-Extra: playwright-stealth
@@ -70,11 +68,12 @@ Requires-Dist: boto3 (==1.35.53) ; extra == "boto3" or extra == "aws" or extra =
70
68
  Requires-Dist: chompjs (==1.3.0) ; extra == "chompjs"
71
69
  Requires-Dist: click (==8.1.7) ; extra == "click"
72
70
  Requires-Dist: clickhouse-driver (==0.2.9) ; extra == "clickhouse-driver" or extra == "peerdb"
73
- Requires-Dist: croniter (==3.0.4) ; extra == "croniter"
71
+ Requires-Dist: croniter (==3.0.4)
74
72
  Requires-Dist: datetime (==5.5) ; extra == "datetime"
75
73
  Requires-Dist: demjson3 (==3.0.6) ; extra == "demjson3"
76
74
  Requires-Dist: dnspython (==2.7.0) ; extra == "dnspython"
77
75
  Requires-Dist: duckduckgo-search (==6.2.11b1) ; extra == "duckduckgo-search"
76
+ Requires-Dist: dynaconf (==3.2.6)
78
77
  Requires-Dist: fake-useragent (==1.5.1) ; extra == "fake-useragent"
79
78
  Requires-Dist: geoalchemy2 (==0.15.2) ; extra == "geoalchemy2"
80
79
  Requires-Dist: geopandas (==1.0.1) ; extra == "geopandas"
@@ -83,13 +82,15 @@ Requires-Dist: google-api-python-client (==2.151.0) ; extra == "google-api-pytho
83
82
  Requires-Dist: google-auth-httplib2 (==0.2.0) ; extra == "google-auth-httplib2"
84
83
  Requires-Dist: google-auth-oauthlib (==1.2.1) ; extra == "google-auth-oauthlib"
85
84
  Requires-Dist: html2text (==2024.2.26) ; extra == "html2text"
85
+ Requires-Dist: httpx[http2] (==0.28.1) ; extra == "httpx"
86
+ Requires-Dist: jinja2 (==3.1.5)
86
87
  Requires-Dist: json5 (==0.9.25) ; extra == "json5"
87
88
  Requires-Dist: lxml[html-clean] (==5.3.0) ; extra == "lxml"
88
89
  Requires-Dist: nodriver (==0.37) ; extra == "nodriver"
89
90
  Requires-Dist: openpyxl (==3.1.5) ; extra == "openpyxl"
90
91
  Requires-Dist: pandas (==2.2.3) ; extra == "pandas"
91
92
  Requires-Dist: pandera (==0.20.4) ; extra == "pandera"
92
- Requires-Dist: pendulum (==3.0.0) ; extra == "pendulum"
93
+ Requires-Dist: pendulum (==3.0.0)
93
94
  Requires-Dist: pillow (==11.0.0) ; extra == "pillow"
94
95
  Requires-Dist: playwright (==1.47.0) ; extra == "playwright"
95
96
  Requires-Dist: playwright-stealth (==1.0.6) ; extra == "playwright-stealth"
@@ -114,6 +115,7 @@ Requires-Dist: undetected-chromedriver (==3.5.5) ; extra == "undetected-chromedr
114
115
  Requires-Dist: unidecode (==1.3.8) ; extra == "unidecode"
115
116
  Requires-Dist: xmltodict (==0.14.2) ; extra == "xmltodict"
116
117
  Project-URL: Documentation, https://github.com/Data-Market/datamarket
118
+ Project-URL: Homepage, https://datamarket.es
117
119
  Project-URL: Repository, https://github.com/Data-Market/datamarket
118
120
  Description-Content-Type: text/markdown
119
121
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "datamarket"
3
- version = "0.7.20"
3
+ version = "0.8.0"
4
4
  description = "Utilities that integrate advanced scraping knowledge into just one library."
5
5
  authors = ["DataMarket <techsupport@datamarket.es>"]
6
6
  license = "GPL-3.0-or-later"
@@ -23,12 +23,15 @@ requests = "2.32.3"
23
23
  tenacity = "9.0.0"
24
24
  beautifulsoup4 = "4.12.3"
25
25
  pre-commit = "4.0.1"
26
+ pendulum = "3.0.0"
27
+ croniter = "3.0.4"
28
+ dynaconf = "3.2.6"
29
+ jinja2 = "3.1.5"
26
30
 
27
31
  boto3 = { version = "1.35.53", optional = true }
28
32
  unidecode = { version = "1.3.8", optional = true }
29
33
  lxml = { extras = ["html-clean"], version = "5.3.0", optional = true }
30
34
  tqdm = { version = "4.66.6", optional = true }
31
- pendulum = { version = "3.0.0", optional = true }
32
35
  pandas = { version = "2.2.3", optional = true }
33
36
  pyarrow = { version = "17.0.0", optional = true }
34
37
  pytest = { version = "8.3.3", optional = true }
@@ -36,7 +39,6 @@ playwright = { version = "1.47.0", optional = true }
36
39
  playwright-stealth = { version = "1.0.6", optional = true }
37
40
  soda-core-postgres = { version = "3.4.1", optional = true }
38
41
  fake-useragent = { version = "1.5.1", optional = true }
39
- croniter = { version = "3.0.4", optional = true }
40
42
  pydrive2 = { version = "1.20.0", optional = true }
41
43
  clickhouse-driver = { version = "0.2.9", optional = true }
42
44
  stem = { version = "1.8.2", optional = true }
@@ -67,13 +69,13 @@ google-auth-httplib2 = { version = "0.2.0", optional = true }
67
69
  google-auth-oauthlib = { version = "1.2.1", optional = true }
68
70
  dnspython = { version = "2.7.0", optional = true }
69
71
  openpyxl = { version = "3.1.5", optional = true }
72
+ httpx = { extras = ["http2"], version = "0.28.1", optional = true }
70
73
 
71
74
  [tool.poetry.extras]
72
75
  boto3 = ["boto3"]
73
76
  unidecode = ["unidecode"]
74
77
  lxml = ["lxml"]
75
78
  tqdm = ["tqdm"]
76
- pendulum = ["pendulum"]
77
79
  pandas = ["pandas"]
78
80
  pyarrow = ["pyarrow"]
79
81
  pytest = ["pytest"]
@@ -81,7 +83,6 @@ playwright = ["playwright"]
81
83
  playwright-stealth = ["playwright-stealth"]
82
84
  soda-core-postgres = ["soda-core-postgres"]
83
85
  fake-useragent = ["fake-useragent"]
84
- croniter = ["croniter"]
85
86
  pydrive2 = ["pydrive2"]
86
87
  clickhouse-driver = ["clickhouse-driver"]
87
88
  stem = ["stem"]
@@ -112,6 +113,7 @@ google-auth-httplib2 = ["google-auth-httplib2"]
112
113
  google-auth-oauthlib = ["google-auth-oauthlib"]
113
114
  dnspython = ["dnspython"]
114
115
  openpyxl = ["openpyxl"]
116
+ httpx = ["httpx"]
115
117
 
116
118
  # Interface groups
117
119
  aws = ["boto3"]
@@ -16,16 +16,14 @@ class AWSInterface:
16
16
  self.profiles = []
17
17
  self.config = config
18
18
 
19
- for section in self.config.sections():
20
- if section.startswith("aws:"):
21
- profile_name = section.split(":", 1)[1]
22
- self.profiles.append(
23
- {
24
- "profile": profile_name,
25
- "bucket": self.config[section]["bucket"],
26
- "session": boto3.Session(profile_name=profile_name),
27
- }
28
- )
19
+ for profile_name, values in self.config.get("aws", {}).items():
20
+ self.profiles.append(
21
+ {
22
+ "profile": profile_name,
23
+ "bucket": values["bucket"],
24
+ "session": boto3.Session(profile_name=profile_name),
25
+ }
26
+ )
29
27
 
30
28
  if not self.profiles:
31
29
  logger.warning("No AWS profiles found in config file")
@@ -0,0 +1,167 @@
1
+ ########################################################################################################################
2
+ # IMPORTS
3
+
4
+ import configparser
5
+ import inspect
6
+ import logging
7
+ import random
8
+ import re
9
+ import shlex
10
+ import shutil
11
+ import subprocess
12
+ import time
13
+ from pathlib import Path
14
+ from typing import Literal, Union
15
+
16
+ import pendulum
17
+ from croniter import croniter
18
+ from dynaconf import Dynaconf, add_converter
19
+
20
+ ########################################################################################################################
21
+ # FUNCTIONS
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
+ def get_granular_date(
27
+ granularity: Union[Literal["monthly", "biweekly", "weekly", "daily"], str], tz: str = "Europe/Madrid"
28
+ ) -> str:
29
+ """
30
+ Returns the most recent date based on the given granularity or a custom cron expression.
31
+
32
+ Args:
33
+ granularity: Either a predefined value ("monthly", "biweekly", "weekly") or a custom cron expression.
34
+ tz: Timezone to use for date calculations (default: "Europe/Madrid").
35
+
36
+ Returns:
37
+ A string representing the most recent date in the format "YYYY-MM-DD".
38
+
39
+ Raises:
40
+ ValueError: If the provided granularity or cron expression is invalid.
41
+ """
42
+ now = pendulum.now(tz)
43
+
44
+ predefined_patterns = {
45
+ "monthly": "0 0 1 * *",
46
+ "biweekly": "0 0 1,15 * *",
47
+ "weekly": "0 0 * * MON",
48
+ "daily": "0 0 * * *",
49
+ }
50
+
51
+ cron_pattern = predefined_patterns.get(granularity, granularity)
52
+
53
+ try:
54
+ cron = croniter(cron_pattern, now)
55
+ return cron.get_prev(pendulum.DateTime).strftime("%Y-%m-%d")
56
+ except Exception as e:
57
+ raise ValueError("Invalid cron expression or granularity specified.") from e
58
+
59
+
60
+ def read_converter(path_str: str):
61
+ with open(path_str) as f:
62
+ return f.read()
63
+
64
+
65
+ def get_config(config_file: Path, tz: str = "Europe/Madrid"):
66
+ if Path(config_file).suffix == "ini":
67
+ logger.warning("Using legacy INI config reader. Please migrate to TOML")
68
+ cfg = configparser.RawConfigParser()
69
+ return cfg.read(config_file)
70
+
71
+ add_converter("read", read_converter)
72
+
73
+ config = Dynaconf(
74
+ environments=True,
75
+ env_switcher="SYSTYPE",
76
+ vars={
77
+ "today": get_granular_date("daily", tz),
78
+ "biweekly_date": get_granular_date("biweekly", tz),
79
+ },
80
+ )
81
+
82
+ config.load_file(path=config_file)
83
+ config.load_file(path=Path.home() / config_file.name)
84
+ return config
85
+
86
+
87
+ def get_project_metadata():
88
+ caller_frame = inspect.stack()[1]
89
+ current_file_parts = Path(caller_frame.filename).resolve().parts
90
+ src_index = current_file_parts.index("src")
91
+ cmd_prefix = "dix vnc run --" if shutil.which("dix") else ""
92
+ pkg_name = current_file_parts[src_index + 1]
93
+ env_name = f"{pkg_name}_env"
94
+ project_path = Path(*current_file_parts[:src_index])
95
+
96
+ return {"cmd_prefix": cmd_prefix, "pkg_name": pkg_name, "env_name": env_name, "project_path": project_path}
97
+
98
+
99
+ def set_logger(level):
100
+ log = logging.getLogger()
101
+ log.setLevel(logging.DEBUG)
102
+ ch = logging.StreamHandler()
103
+ ch.setLevel(level.upper())
104
+ formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
105
+ ch.setFormatter(formatter)
106
+ log.addHandler(ch)
107
+
108
+
109
+ def ban_sleep(max_time, min_time=0):
110
+ sleep_time = int(random.uniform(min_time, max_time)) # noqa: S311
111
+ logger.info(f"sleeping for {sleep_time} seconds...")
112
+ time.sleep(sleep_time)
113
+
114
+
115
+ def run_bash_command(command):
116
+ p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
117
+
118
+ text_lines = []
119
+ for line_b in iter(p.stdout.readline, ""):
120
+ line_str = line_b.decode().strip()
121
+
122
+ if not line_str:
123
+ break
124
+
125
+ logger.info(line_str)
126
+ text_lines.append(line_str)
127
+
128
+ return "\n".join(text_lines)
129
+
130
+
131
+ def text_to_int(text):
132
+ max_int32 = 2147483647
133
+ parsed_str = re.sub(r"[^\d]", "", text)
134
+ if parsed_str:
135
+ num = int(parsed_str)
136
+ else:
137
+ return None
138
+
139
+ if -max_int32 < num < max_int32:
140
+ return num
141
+
142
+
143
+ def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
144
+ while pendulum.now(tz=tz).hour >= to_h or pendulum.now(tz=tz).hour < from_h:
145
+ logger.warning("time to sleep and not scrape anything...")
146
+ ban_sleep(seconds, seconds)
147
+
148
+
149
+ def sleep_in_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
150
+ while from_h <= pendulum.now(tz=tz).hour < to_h:
151
+ logger.warning("time to sleep and not scrape anything...")
152
+ ban_sleep(seconds, seconds)
153
+
154
+
155
+ def parse_field(dict_struct, field_path, format_method=None):
156
+ if not isinstance(field_path, list):
157
+ raise ValueError("Argument field_path must be of type list")
158
+
159
+ field_value = dict_struct
160
+ for field in field_path:
161
+ if isinstance(field_value, dict):
162
+ field_value = field_value.get(field)
163
+ elif isinstance(field_value, list):
164
+ field_value = field_value[field] if len(field_value) > field else None
165
+ if field_value is None:
166
+ return None
167
+ return format_method(field_value) if format_method else field_value
@@ -1,94 +0,0 @@
1
- ########################################################################################################################
2
- # IMPORTS
3
-
4
- import configparser
5
- import logging
6
- import random
7
- import re
8
- import shlex
9
- import subprocess
10
- import time
11
-
12
- import pendulum
13
-
14
- ########################################################################################################################
15
- # FUNCTIONS
16
-
17
- logger = logging.getLogger(__name__)
18
-
19
-
20
- def get_config(config_path):
21
- cfg = configparser.RawConfigParser()
22
- cfg.read(config_path)
23
- return cfg
24
-
25
-
26
- def set_logger(level):
27
- log = logging.getLogger()
28
- log.setLevel(logging.DEBUG)
29
- ch = logging.StreamHandler()
30
- ch.setLevel(level.upper())
31
- formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
32
- ch.setFormatter(formatter)
33
- log.addHandler(ch)
34
-
35
-
36
- def ban_sleep(max_time, min_time=0):
37
- sleep_time = int(random.uniform(min_time, max_time))
38
- logger.info(f"sleeping for {sleep_time} seconds...")
39
- time.sleep(sleep_time)
40
-
41
-
42
- def run_bash_command(command):
43
- p = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
44
-
45
- text_lines = []
46
- for line_b in iter(p.stdout.readline, ""):
47
- line_str = line_b.decode().strip()
48
-
49
- if not line_str:
50
- break
51
-
52
- logger.info(line_str)
53
- text_lines.append(line_str)
54
-
55
- return "\n".join(text_lines)
56
-
57
-
58
- def text_to_int(text):
59
- max_int32 = 2147483647
60
- parsed_str = re.sub(r"[^\d]", "", text)
61
- if parsed_str:
62
- num = int(parsed_str)
63
- else:
64
- return None
65
-
66
- if -max_int32 < num < max_int32:
67
- return num
68
-
69
-
70
- def sleep_out_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
71
- while pendulum.now(tz=tz).hour >= to_h or pendulum.now(tz=tz).hour < from_h:
72
- logger.warning("time to sleep and not scrape anything...")
73
- ban_sleep(seconds, seconds)
74
-
75
-
76
- def sleep_in_interval(from_h, to_h, tz="Europe/Madrid", seconds=1800):
77
- while from_h <= pendulum.now(tz=tz).hour < to_h:
78
- logger.warning("time to sleep and not scrape anything...")
79
- ban_sleep(seconds, seconds)
80
-
81
-
82
- def parse_field(dict_struct, field_path, format_method=None):
83
- if not isinstance(field_path, list):
84
- raise ValueError("Argument field_path must be of type list")
85
-
86
- field_value = dict_struct
87
- for field in field_path:
88
- if isinstance(field_value, dict):
89
- field_value = field_value.get(field)
90
- elif isinstance(field_value, list):
91
- field_value = field_value[field] if len(field_value) > field else None
92
- if field_value is None:
93
- return None
94
- return format_method(field_value) if format_method else field_value
File without changes
File without changes