sibi-dst 0.3.20__py3-none-any.whl → 0.3.22__py3-none-any.whl
This diff shows the content differences between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sibi_dst/__init__.py +1 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_df_helper.py +34 -33
- sibi_dst/df_helper/_parquet_artifact.py +4 -1
- sibi_dst/df_helper/_parquet_reader.py +2 -1
- sibi_dst/df_helper/backends/django/__init__.py +1 -2
- sibi_dst/df_helper/backends/django/_django_db_connection.py +1 -1
- sibi_dst/df_helper/backends/django/_django_load_from_db.py +6 -8
- sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +5 -5
- sibi_dst/df_helper/backends/django/_io_dask.py +0 -1
- sibi_dst/df_helper/backends/django/_io_dask_alt.py +5 -4
- sibi_dst/df_helper/backends/http/__init__.py +2 -2
- sibi_dst/df_helper/backends/http/_http_config.py +6 -3
- sibi_dst/df_helper/backends/parquet/__init__.py +3 -3
- sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +4 -2
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +12 -7
- sibi_dst/df_helper/backends/sql_alchemy/__init__.py +2 -2
- sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +3 -1
- sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +2 -3
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +3 -3
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +2 -2
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +5 -3
- sibi_dst/df_helper/backends/sql_model/__init__.py +1 -1
- sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +5 -4
- sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +13 -11
- sibi_dst/df_helper/core/_defaults.py +9 -6
- sibi_dst/df_helper/core/_filter_handler.py +7 -4
- sibi_dst/df_helper/core/_params_config.py +3 -2
- sibi_dst/df_helper/core/_query_config.py +0 -2
- sibi_dst/utils/__init__.py +6 -5
- sibi_dst/utils/_airflow_manager.py +4 -3
- sibi_dst/utils/_clickhouse_writer.py +16 -13
- sibi_dst/utils/_credentials.py +1 -1
- sibi_dst/utils/_data_wrapper.py +82 -16
- sibi_dst/utils/_date_utils.py +11 -5
- sibi_dst/utils/_df_utils.py +9 -5
- sibi_dst/utils/_file_utils.py +3 -1
- sibi_dst/utils/_filepath_generator.py +4 -2
- sibi_dst/utils/_log_utils.py +1 -1
- sibi_dst/utils/_parquet_saver.py +0 -2
- sibi_dst/utils/_storage_manager.py +1 -1
- {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.22.dist-info}/METADATA +1 -1
- sibi_dst-0.3.22.dist-info/RECORD +47 -0
- sibi_dst-0.3.20.dist-info/RECORD +0 -47
- {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.22.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py CHANGED

@@ -1,19 +1,21 @@
-import dask.dataframe as dd
-from sqlmodel import Session, select, text
-from typing import Any, Dict, Optional
 import logging
+from typing import Any, Dict, Optional
+
+import dask.dataframe as dd
 import pandas as pd
+from sqlmodel import Session, select, text
+
 
 class SQLModelLoadFromDb:
     df: dd.DataFrame
 
     def __init__(
-
-
-
-
-
-
+            self,
+            db_connection,
+            db_query: Optional[Dict[str, Any]] = None,
+            db_params: Optional[Dict[str, Any]] = None,
+            logger=None,
+            **kwargs,
    ):
        """
        Initialize the loader with database connection, query, and parameters.
@@ -74,7 +76,7 @@ class SQLModelLoadFromDb:
        results = session.exec(query).fetchall()

        # Convert query results to a Dask DataFrame
-        print("results:",results)
+        print("results:", results)
        if results:
            df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
        else:
@@ -96,4 +98,4 @@ class SQLModelLoadFromDb:
        if field_map:
            rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
            if rename_mapping:
-                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
+                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
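For context, here is a minimal runnable sketch of the row-to-Dask conversion pattern kept by this refactor (`dd.from_pandas` over a list of row dicts). The `Record` class and its values are illustrative stand-ins, not part of sibi_dst:

```python
import dask.dataframe as dd
import pandas as pd


class Record:
    """Illustrative stand-in for a SQLModel row object exposing .dict()."""

    def __init__(self, id, name):
        self.id = id
        self.name = name

    def dict(self):
        return {"id": self.id, "name": self.name}


results = [Record(1, "a"), Record(2, "b")]

# Same conversion as in the hunk above: pandas first, then one Dask partition.
df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
print(df.compute())
```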
sibi_dst/df_helper/core/_defaults.py CHANGED

@@ -54,8 +54,10 @@ django_field_conversion_map_dask: Dict[str, callable] = {
     "BooleanField": lambda x: x.astype(bool),
     "NullBooleanField": lambda x: x.astype(bool),
     "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
-    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+                                                                             meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+                                                                             meta=("time", "object")),
     "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
     "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
     "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
@@ -72,12 +74,15 @@ sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
     Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
     Boolean.__name__: lambda x: x.astype(bool),
     DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
-    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+                                                                               meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+                                                                               meta=("time", "object")),
     JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
     UUID.__name__: lambda x: x.astype(str),
 }
 
+
 # Conversion map with normalized SQLAlchemy field types
 # sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
 #     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
@@ -129,5 +134,3 @@ def normalize_sqlalchemy_type(field_type):
 
     # Fallback to raw class name
     return field_type.__class__.__name__
-
-
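The new DateField/TimeField (and Date/Time) converters rely on `map_partitions` with an explicit `meta` so Dask knows the resulting column holds plain Python date/time objects. A hedged sketch of that pattern, using `dd.to_datetime` here so the example is pure Dask:

```python
import dask.dataframe as dd
import pandas as pd

s = dd.from_pandas(
    pd.Series(["2024-01-01 10:30:00", "2024-01-02 08:00:00"]), npartitions=1
)

parsed = dd.to_datetime(s, errors="coerce")
# meta tells Dask the output name/dtype without computing the partition.
dates = parsed.map_partitions(lambda x: x.dt.date, meta=("date", "object"))
times = parsed.map_partitions(lambda x: x.dt.time, meta=("time", "object"))

print(dates.compute())
print(times.compute())
```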
sibi_dst/df_helper/core/_filter_handler.py CHANGED

@@ -1,10 +1,13 @@
 import datetime
+
 import dask.dataframe as dd
 import pandas as pd
 from sqlalchemy import func, cast
 from sqlalchemy.sql.sqltypes import Date, Time
+
 from sibi_dst.utils import Logger
 
+
 class FilterHandler:
     def __init__(self, backend, logger=None):
         """
@@ -15,7 +18,8 @@ class FilterHandler:
             logger: Optional logger for debugging purposes.
         """
         self.backend = backend
-        self.logger = logger or Logger.default_logger(
+        self.logger = logger or Logger.default_logger(
+            logger_name=self.__class__.__name__)  # No-op logger if none provided
         self.backend_methods = self._get_backend_methods(backend)
 
     def apply_filters(self, query_or_df, model=None, filters=None):
@@ -34,7 +38,7 @@ class FilterHandler:
         for key, value in filters.items():
             field_name, casting, operation = self._parse_filter_key(key)
             parsed_value = self._parse_filter_value(casting, value)
-            #print(field_name, casting, operation, parsed_value)
+            # print(field_name, casting, operation, parsed_value)
             # Get the column and apply backend-specific transformations
             if self.backend == "sqlalchemy":
                 column = self.backend_methods["get_column"](field_name, model, casting)
@@ -67,7 +71,6 @@ class FilterHandler:
 
         return field_name, casting, operation
 
-
     def _parse_filter_value(self, casting, value):
         """
         Convert filter value to appropriate type based on the casting (e.g., date).
@@ -213,4 +216,4 @@ class FilterHandler:
         return [
             "gte", "lte", "gt", "lt", "exact", "in", "range",
             "contains", "startswith", "endswith", "isnull",
-        ]
+        ]
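The filter keys handled above follow the Django double-underscore convention: a field name, an optional date/time casting, and one of the operations listed in the last hunk. An illustrative parser, not the package's implementation, showing how such a key can be decomposed:

```python
# Hypothetical helper; CASTS/OPS mirror the operation list shown above.
CASTS = {"date", "time"}
OPS = {"gte", "lte", "gt", "lt", "exact", "in", "range",
       "contains", "startswith", "endswith", "isnull"}


def parse_filter_key(key: str):
    parts = key.split("__")
    field_name, casting, operation = parts[0], None, "exact"
    for part in parts[1:]:
        if part in CASTS:
            casting = part
        elif part in OPS:
            operation = part
        else:
            field_name = f"{field_name}__{part}"  # nested relation segment
    return field_name, casting, operation


print(parse_filter_key("created_at__date__gte"))  # ('created_at', 'date', 'gte')
print(parse_filter_key("status__in"))             # ('status', None, 'in')
```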
sibi_dst/df_helper/core/_params_config.py CHANGED

@@ -1,7 +1,7 @@
+from typing import Optional, Dict, Union, List
 
 from pydantic import BaseModel, model_validator, Field
 
-from typing import Optional, Dict, Union, List
 dataframe_params: Dict[str, Union[None, str, bool, int, None]] = {
     "fieldnames": None,
     "index_col": None,
@@ -25,6 +25,7 @@ dataframe_options: Dict[str, Union[bool, str, int, None]] = {
 
 LOOKUP_SEP = "__"
 
+
 class ParamsConfig(BaseModel):
     field_map: Optional[Dict] = Field(default_factory=dict)
     legacy_filters: bool = False
@@ -76,4 +77,4 @@ class ParamsConfig(BaseModel):
             new_filter_field = LOOKUP_SEP.join(new_parts)
             new_filters[new_filter_field] = value
 
-        self.filters = new_filters
+        self.filters = new_filters
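The legacy-filter handling visible in the last hunk rebuilds each filter key segment by segment and rejoins it with `LOOKUP_SEP`. A small, hedged illustration of that idea with a hypothetical `field_map`:

```python
LOOKUP_SEP = "__"

# Hypothetical legacy-to-current field renames; not taken from sibi_dst.
field_map = {"client_id": "customer_id"}
filters = {"client_id__gte": 100, "status__exact": "open"}

new_filters = {}
for key, value in filters.items():
    parts = key.split(LOOKUP_SEP)
    # Remap only the leading field segment, keep lookups such as gte/exact.
    new_parts = [field_map.get(parts[0], parts[0])] + parts[1:]
    new_filters[LOOKUP_SEP.join(new_parts)] = value

print(new_filters)  # {'customer_id__gte': 100, 'status__exact': 'open'}
```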
sibi_dst/utils/__init__.py CHANGED

@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+
 from ._log_utils import Logger
 from ._date_utils import *
 from ._data_utils import DataUtils
@@ -9,13 +9,14 @@ from ._df_utils import DfUtils
 from ._storage_manager import StorageManager
 from ._parquet_saver import ParquetSaver
 from ._clickhouse_writer import ClickHouseWriter
-from ._data_wrapper import DataWrapper
 from ._airflow_manager import AirflowDAGManager
+from ._credentials import *
+from ._data_wrapper import DataWrapper
 
-__all__=[
+__all__ = [
+    "Logger",
     "ConfigManager",
     "ConfigLoader",
-    "Logger",
     "DateUtils",
     "BusinessDays",
     "FileUtils",
@@ -27,4 +28,4 @@ __all__=[
     "DfUtils",
     "ClickHouseWriter",
     "AirflowDAGManager",
-]
+]
sibi_dst/utils/_airflow_manager.py CHANGED

@@ -1,8 +1,9 @@
 import os
-from jinja2 import Template
 from datetime import datetime
+
 import fsspec
 import httpx
+from jinja2 import Template
 
 """
 A manager to dynamically generate, save, and upload Airflow DAGs via SSH using fsspec.
@@ -54,8 +55,8 @@ with DAG(
 {% endfor %}
 """
 
-class AirflowDAGManager:
 
+class AirflowDAGManager:
 
     def __init__(self, output_dir, remote_dags_path, ssh_host, ssh_user, ssh_password, url, auth, wrapper_module_path):
         """
@@ -208,4 +209,4 @@ class AirflowDAGManager:
             return response.json()
         except httpx.RequestError as e:
             print(f"Failed to trigger DAG {dag_id}: {e}")
-            raise
+            raise
sibi_dst/utils/_clickhouse_writer.py CHANGED

@@ -1,9 +1,12 @@
+from concurrent.futures import ThreadPoolExecutor
+
 import clickhouse_connect
+import pandas as pd
 from clickhouse_driver import Client
 from dask.dataframe import dd
-
+
 from sibi_dst.utils import Logger
-
+
 
 class ClickHouseWriter:
     dtype_to_clickhouse = {
@@ -19,20 +22,20 @@ class ClickHouseWriter:
     df: dd.DataFrame
 
     def __init__(self, logger=None, **kwargs):
-        self.clickhouse_host = kwargs.setdefault('host',"localhost")
-        self.clickhouse_port = kwargs.setdefault('port',8123)
-        self.clickhouse_dbname = kwargs.setdefault('database','sibi_data')
-        self.clickhouse_user = kwargs.setdefault('user','default')
-        self.clickhouse_password = kwargs.setdefault('password','')
-        self.clickhouse_table = kwargs.setdefault('table','test_sibi_table')
+        self.clickhouse_host = kwargs.setdefault('host', "localhost")
+        self.clickhouse_port = kwargs.setdefault('port', 8123)
+        self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
+        self.clickhouse_user = kwargs.setdefault('user', 'default')
+        self.clickhouse_password = kwargs.setdefault('password', '')
+        self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
 
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.client = None
-        self.order_by=kwargs.setdefault('order_by','id')
+        self.order_by = kwargs.setdefault('order_by', 'id')
 
     def save_to_clickhouse(self, df, **kwargs):
         self.df = df.copy()
-        self.order_by = kwargs.setdefault('order_by',self.order_by)
+        self.order_by = kwargs.setdefault('order_by', self.order_by)
         if len(self.df.head().index) == 0:
             self.logger.debug("Dataframe is empty")
             return
@@ -86,8 +89,8 @@ class ClickHouseWriter:
         if engine is None:
             engine = f"ENGINE = MergeTree() order by {self.order_by}"
         dtypes = self.df.dtypes
-        clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
-        create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
+        clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
+        create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
         self.logger.debug(f"Creating table SQL:{create_table_sql}")
         if self.client:
             self.client.command(create_table_sql)
@@ -200,4 +203,4 @@ class ClickHouseWriter:
             with ThreadPoolExecutor() as executor:
                 executor.map(write_partition, partitions, range(len(partitions)))
         except Exception as e:
-            self.logger.error(f"Error during multi-partition write: {e}")
+            self.logger.error(f"Error during multi-partition write: {e}")
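The table-creation hunk builds the CREATE TABLE statement from the DataFrame's dtypes and a dtype-to-ClickHouse map. A hedged sketch of that assembly; the mapping below is a partial illustration, not the writer's full table:

```python
import pandas as pd

# Partial, illustrative dtype mapping.
dtype_to_clickhouse = {
    "int64": "Int64",
    "float64": "Float64",
    "object": "String",
    "datetime64[ns]": "DateTime",
}

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "amount": [1.5, 2.5]})

# One "`column` Type" fragment per column, falling back to String.
clickhouse_schema = ", ".join(
    f"`{col}` {dtype_to_clickhouse.get(str(dtype), 'String')}"
    for col, dtype in df.dtypes.items()
)
order_by = "id"
engine = f"ENGINE = MergeTree() order by {order_by}"
create_table_sql = f"CREATE TABLE IF NOT EXISTS test_sibi_table ({clickhouse_schema}) {engine};"
print(create_table_sql)
```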
sibi_dst/utils/_credentials.py CHANGED
sibi_dst/utils/_data_wrapper.py CHANGED

@@ -1,12 +1,16 @@
 import datetime
+from concurrent.futures import ThreadPoolExecutor
 from typing import Type, Any, Dict, Optional
+
 import fsspec
 import pandas as pd
 from IPython.display import display
-from sibi_dst.utils import Logger
 from tqdm import tqdm
+
+from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver
 
+
 class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -29,7 +33,8 @@ class DataWrapper:
                  logger: Optional[Logger] = None,
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
-                 show_progress: bool = False
+                 show_progress: bool = False,
+                 timeout: Optional[int] = 300):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self.ensure_forward_slash(data_path)
@@ -47,6 +52,7 @@
         self.max_age_minutes = max_age_minutes
         self.history_days_threshold = history_days_threshold
         self.show_progress = show_progress
+        self.timeout = timeout
 
         self.start_date = self.convert_to_date(start_date)
         self.end_date = self.convert_to_date(end_date)
@@ -73,31 +79,79 @@ class DataWrapper:
             yield date.date()
 
     def process(self):
-        """Execute the update plan
+        """Execute the update plan using 'update_priority' to determine processing order."""
         update_plan_table = self.generate_update_plan_with_conditions()
 
-        # Display the update plan table to the user if
+        # Display the update plan table to the user if requested
         if self.show_progress:
             display(update_plan_table)
 
-        #
-
-        ("
-
-
-
-
+        # Filter out rows that do not require updates (priority 0 means skip)
+        update_plan_table = update_plan_table[
+            (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
+        ]
+
+        # Group by priority
+        priorities = sorted(update_plan_table["update_priority"].unique())
+
+        # We will process each priority level in its own thread.
+        # Each thread will handle all dates associated with that priority.
+        def process_priority(priority):
+            # Extract dates for the current priority
             dates_to_process = update_plan_table[
-
-
+                update_plan_table["update_priority"] == priority
+            ]["date"].tolist()
 
+            # If show_progress is True, wrap in a progress bar
             date_iterator = dates_to_process
             if self.show_progress:
-                date_iterator = tqdm(date_iterator, desc=f"{
+                date_iterator = tqdm(date_iterator, desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+                                     unit="date")
 
+            # Process each date for this priority
             for current_date in date_iterator:
                 self.process_date(current_date)
 
+        # Launch a separate thread for each priority
+        with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
+            futures = {executor.submit(process_priority, p): p for p in priorities}
+            for future in futures:
+                try:
+                    future.result(timeout=self.timeout)
+                except TimeoutError:
+                    self.logger.error(f"Thread for {self.dataclass.__name__} timed out. Thread cancelled.")
+                    future.cancel()
+                    priority = futures[future]
+                    new_future = executor.submit(process_priority, priority)
+                    futures[new_future] = priority
+                    self.logger.info(f"Resubmitted task for priority {priority} after timeout.")
+
+    # def process(self):
+    #     """Execute the update plan following the specified hierarchy."""
+    #     update_plan_table = self.generate_update_plan_with_conditions()
+    #
+    #     # Display the update plan table to the user if show_progress is True
+    #     if self.show_progress:
+    #         display(update_plan_table)
+    #
+    #     # Process files according to the hierarchy, considering only `update_required` dates
+    #     for category, description in [
+    #         ("overwrite", "Processing files due to overwrite=True"),
+    #         ("history_days", "Processing files within history_days_threshold"),
+    #         ("missing_files", "Processing missing files")
+    #     ]:
+    #         # Filter dates in the category where `update_required` is True
+    #         dates_to_process = update_plan_table[
+    #             (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
+    #         ]["date"].tolist()
+    #
+    #         date_iterator = dates_to_process
+    #         if self.show_progress:
+    #             date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
+    #
+    #         for current_date in date_iterator:
+    #             self.process_date(current_date)
+
     def is_file_older_than(self, file_path: str) -> bool:
         """
         Check if a file is older than the specified max_age_minutes.
@@ -130,7 +184,7 @@
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
-        if len(df.index)==0:
+        if len(df.index) == 0:
             self.logger.error("No data found for the specified date.")
             return
 
@@ -178,12 +232,14 @@
                     category = "history_days"
                     update_required = True
                 else:
+                    category = "file age is recent"
                     update_required = False
             # Hierarchy 3: Missing files
             elif missing_file and current_date <= today:
                 category = "missing_files"
                 update_required = True
             else:
+                category = "No Update Required"
                 update_required = False
 
             # Collect condition descriptions for the update plan table
@@ -194,8 +250,18 @@
                 "missing_file": missing_file,
                 "update_required": update_required,
                 "update_category": category,
-                "datawrapper class":self.dataclass.__name__
+                "datawrapper class": self.dataclass.__name__
             })
+        priority_map = {
+            "overwrite": 1,
+            "history_days": 2,
+            "missing_files": 3
+        }
+
+        for row in rows:
+            category = row.get("update_category")
+            # Default to None if no category assigned (no update required)
+            row["update_priority"] = priority_map.get(category, 0)
 
         update_plan_table = pd.DataFrame(rows)
         return update_plan_table
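The core of this change is the rewritten process(): rows are filtered to those requiring an update, grouped by update_priority, and each priority level is processed in its own thread with a per-future timeout and resubmission. A simplified, runnable sketch of that control flow; the plan rows and the process_date stub are illustrative, not the package's code:

```python
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout

# Illustrative stand-in for the update plan table.
plan = [
    {"date": "2024-01-01", "update_priority": 1},
    {"date": "2024-01-02", "update_priority": 2},
    {"date": "2024-01-03", "update_priority": 2},
]


def process_date(date):
    print(f"processing {date}")


def process_priority(priority):
    # Each thread handles every date that shares its priority level.
    for row in plan:
        if row["update_priority"] == priority:
            process_date(row["date"])


priorities = sorted({row["update_priority"] for row in plan})
with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
    futures = {executor.submit(process_priority, p): p for p in priorities}
    for future, priority in list(futures.items()):
        try:
            future.result(timeout=300)
        except FutureTimeout:
            # Mirror the resubmission-on-timeout behavior from the diff (once here).
            futures[executor.submit(process_priority, priority)] = priority
```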
sibi_dst/utils/_date_utils.py CHANGED

@@ -1,8 +1,9 @@
 import datetime
-from typing import Union, Tuple, Callable, Dict
+from typing import Union, Tuple, Callable, Dict
 
 import numpy as np
 import pandas as pd
+
 from sibi_dst.utils import Logger
 
 
@@ -32,7 +33,8 @@ class DateUtils:
         raise ValueError(f"Unsupported date format: {value}")
 
     @classmethod
-    def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+    def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+        datetime.date, datetime.date]:
         """
         Calculate the start and end of the week for a given reference date.
         """
@@ -49,7 +51,8 @@
         return datetime.date(year, 1, 1), datetime.date(year, 12, 31)
 
     @classmethod
-    def get_first_day_of_the_quarter(cls, reference_date: Union[
+    def get_first_day_of_the_quarter(cls, reference_date: Union[
+        str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
         """
         Get the first day of the quarter for a given date.
         """
@@ -58,7 +61,8 @@
         return datetime.date(reference_date.year, 3 * quarter - 2, 1)
 
     @classmethod
-    def get_last_day_of_the_quarter(cls, reference_date: Union[
+    def get_last_day_of_the_quarter(cls, reference_date: Union[
+        str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
         """
         Get the last day of the quarter for a given date.
         """
@@ -116,10 +120,12 @@
             'current_month': lambda: cls.get_month_range(n=0),
             'last_month': lambda: cls.get_month_range(n=-1),
             'current_year': lambda: cls.get_year_timerange(today().year),
-            'current_quarter': lambda: (
+            'current_quarter': lambda: (
+                cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
             'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
         }
 
+
 class BusinessDays:
     def __init__(self, holiday_list, logger):
         """
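The quarter helpers shown as context build on the arithmetic that the quarter's first month is 3 * quarter - 2. A short worked example, assuming the usual quarter = (month - 1) // 3 + 1; the last-day computation here is one possible way to finish the calculation, not necessarily the package's:

```python
import datetime

reference_date = datetime.date(2024, 8, 15)

# Month 8 falls in quarter 3; its first month is 3*3 - 2 = 7 (July).
quarter = (reference_date.month - 1) // 3 + 1
first_day = datetime.date(reference_date.year, 3 * quarter - 2, 1)

# Last day: one day before the start of the next quarter (roll over the year for Q4).
if quarter == 4:
    next_quarter_start = datetime.date(reference_date.year + 1, 1, 1)
else:
    next_quarter_start = datetime.date(reference_date.year, 3 * quarter + 1, 1)
last_day = next_quarter_start - datetime.timedelta(days=1)

print(first_day, last_day)  # 2024-07-01 2024-09-30
```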
sibi_dst/utils/_df_utils.py CHANGED

@@ -1,7 +1,9 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from ._log_utils import Logger
 
+
 class DfUtils:
     def __init__(self, logger=None):
         """
@@ -210,7 +212,7 @@
         df['Total'] = df.sum(axis=1, numeric_only=True)
         return df
 
-    def summarise_data(self,df, summary_column, values_column, rule='D', agg_func='count'):
+    def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
         """
         Summarizes data by creating a pivot table and resampling.
 
@@ -233,10 +235,12 @@
         df = df.set_index(dd.to_datetime(df.index))
 
         # Group by index and summary columns
-        df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+        df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+            agg_func).reset_index()
 
         # Pivot the table
-        df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+        df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+                                          aggfunc='sum').fillna(0)
 
         # Resample
         df_pivot.index = dd.to_datetime(df_pivot.index)
@@ -269,4 +273,4 @@
         Returns:
             DataFrame: Resampled pivot table.
         """
-        return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
+        return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
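summarise_data groups by the datetime index plus a summary column, pivots, and resamples. Since the real method works on Dask frames, here is a hedged pandas illustration of the same sequence with made-up data:

```python
import pandas as pd

idx = pd.DatetimeIndex(
    ["2024-01-01", "2024-01-01", "2024-01-02", "2024-01-03"], name="date"
)
df = pd.DataFrame({"status": ["open", "closed", "open", "open"],
                   "order_id": [1, 2, 3, 4]}, index=idx)

# Group by the date index and the summary column, counting the values column.
grouped = df.groupby(["date", "status"])["order_id"].count().reset_index()

# Pivot so each status becomes a column, then resample to daily totals.
pivot = grouped.pivot_table(index="date", columns="status",
                            values="order_id", aggfunc="sum").fillna(0)
print(pivot.resample("D").sum())
```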
sibi_dst/utils/_file_utils.py CHANGED

@@ -1,10 +1,12 @@
 import shutil
 from pathlib import Path
 from typing import Optional
+
 import fsspec
 
 from sibi_dst.utils import Logger
 
+
 class FileUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -27,7 +29,7 @@
             fs.mkdirs(path)
 
     @staticmethod
-    def construct_full_path(storage_path:str, parquet_filename: Optional[str]) -> Path:
+    def construct_full_path(storage_path: str, parquet_filename: Optional[str]) -> Path:
         """Construct and return the full path for the parquet file."""
         fs, base_path = fsspec.core.url_to_fs(storage_path)
         parquet_filename = parquet_filename or "default.parquet"
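construct_full_path relies on fsspec.core.url_to_fs to split a storage URL into a filesystem object and a base path. A brief sketch of that call; the path shown is illustrative:

```python
from pathlib import Path

import fsspec

# url_to_fs returns (filesystem, stripped path) for any supported protocol.
fs, base_path = fsspec.core.url_to_fs("file:///tmp/data")
full_path = Path(base_path) / "default.parquet"
print(type(fs).__name__, full_path)
```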
sibi_dst/utils/_filepath_generator.py CHANGED

@@ -1,7 +1,8 @@
 import datetime
-import fsspec
 import re
 
+import fsspec
+
 from sibi_dst.utils import Logger
 
 
@@ -150,6 +151,7 @@ class FilePathGenerator:
             return datetime.datetime.strptime(date, '%Y-%m-%d')
         return date
 
+
 """
 Usage:
 # Initialize the generator
@@ -182,4 +184,4 @@ for fp in file_paths:
 
 df_pandas = pd.concat(dataframes, ignore_index=True)
 print(df_pandas.head())
-"""
+"""
sibi_dst/utils/_log_utils.py CHANGED
sibi_dst/utils/_parquet_saver.py CHANGED

@@ -1,7 +1,6 @@
 from pathlib import Path
 from typing import Optional
 
-import dask_expr
 import fsspec
 import pyarrow as pa
 
@@ -103,4 +102,3 @@ class ParquetSaver:
         self.df_result.to_parquet(
             str(full_path), engine="pyarrow", schema=schema, write_index=False
         )
-
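For reference, writing a Dask DataFrame to Parquet with an explicit pyarrow schema, as the unchanged lines above do; the output path, columns, and schema here are illustrative:

```python
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa

df_result = dd.from_pandas(
    pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1
)
schema = pa.schema([("id", pa.int64()), ("name", pa.string())])

# Dask's pyarrow engine accepts an explicit schema and can skip the index.
df_result.to_parquet("/tmp/example_parquet", engine="pyarrow",
                     schema=schema, write_index=False)
```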