sibi-dst 0.3.20__py3-none-any.whl → 0.3.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. sibi_dst/__init__.py +1 -1
  2. sibi_dst/df_helper/__init__.py +2 -2
  3. sibi_dst/df_helper/_df_helper.py +34 -33
  4. sibi_dst/df_helper/_parquet_artifact.py +4 -1
  5. sibi_dst/df_helper/_parquet_reader.py +2 -1
  6. sibi_dst/df_helper/backends/django/__init__.py +1 -2
  7. sibi_dst/df_helper/backends/django/_django_db_connection.py +1 -1
  8. sibi_dst/df_helper/backends/django/_django_load_from_db.py +6 -8
  9. sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +5 -5
  10. sibi_dst/df_helper/backends/django/_io_dask.py +0 -1
  11. sibi_dst/df_helper/backends/django/_io_dask_alt.py +5 -4
  12. sibi_dst/df_helper/backends/http/__init__.py +2 -2
  13. sibi_dst/df_helper/backends/http/_http_config.py +6 -3
  14. sibi_dst/df_helper/backends/parquet/__init__.py +3 -3
  15. sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +4 -2
  16. sibi_dst/df_helper/backends/parquet/_parquet_options.py +12 -7
  17. sibi_dst/df_helper/backends/sql_alchemy/__init__.py +2 -2
  18. sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +3 -1
  19. sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +2 -3
  20. sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +3 -3
  21. sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +2 -2
  22. sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +5 -3
  23. sibi_dst/df_helper/backends/sql_model/__init__.py +1 -1
  24. sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +5 -4
  25. sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +13 -11
  26. sibi_dst/df_helper/core/_defaults.py +9 -6
  27. sibi_dst/df_helper/core/_filter_handler.py +7 -4
  28. sibi_dst/df_helper/core/_params_config.py +3 -2
  29. sibi_dst/df_helper/core/_query_config.py +0 -2
  30. sibi_dst/utils/__init__.py +6 -5
  31. sibi_dst/utils/_airflow_manager.py +4 -3
  32. sibi_dst/utils/_clickhouse_writer.py +16 -13
  33. sibi_dst/utils/_credentials.py +1 -1
  34. sibi_dst/utils/_data_wrapper.py +82 -16
  35. sibi_dst/utils/_date_utils.py +11 -5
  36. sibi_dst/utils/_df_utils.py +9 -5
  37. sibi_dst/utils/_file_utils.py +3 -1
  38. sibi_dst/utils/_filepath_generator.py +4 -2
  39. sibi_dst/utils/_log_utils.py +1 -1
  40. sibi_dst/utils/_parquet_saver.py +0 -2
  41. sibi_dst/utils/_storage_manager.py +1 -1
  42. {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.22.dist-info}/METADATA +1 -1
  43. sibi_dst-0.3.22.dist-info/RECORD +47 -0
  44. sibi_dst-0.3.20.dist-info/RECORD +0 -47
  45. {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.22.dist-info}/WHEEL +0 -0
@@ -1,19 +1,21 @@
- import dask.dataframe as dd
- from sqlmodel import Session, select, text
- from typing import Any, Dict, Optional
  import logging
+ from typing import Any, Dict, Optional
+
+ import dask.dataframe as dd
  import pandas as pd
+ from sqlmodel import Session, select, text
+

  class SQLModelLoadFromDb:
  df: dd.DataFrame

  def __init__(
- self,
- db_connection,
- db_query: Optional[Dict[str, Any]] = None,
- db_params: Optional[Dict[str, Any]] = None,
- logger=None,
- **kwargs,
+ self,
+ db_connection,
+ db_query: Optional[Dict[str, Any]] = None,
+ db_params: Optional[Dict[str, Any]] = None,
+ logger=None,
+ **kwargs,
  ):
  """
  Initialize the loader with database connection, query, and parameters.
@@ -74,7 +76,7 @@ class SQLModelLoadFromDb:
  results = session.exec(query).fetchall()

  # Convert query results to a Dask DataFrame
- print("results:",results)
+ print("results:", results)
  if results:
  df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
  else:
@@ -96,4 +98,4 @@ class SQLModelLoadFromDb:
  if field_map:
  rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
  if rename_mapping:
- self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
+ self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
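For reference, the load path touched above ends by materializing SQLModel rows into a single-partition Dask DataFrame. A minimal standalone sketch of that conversion, with an illustrative User model and an in-memory SQLite engine (neither is part of sibi_dst):

    from typing import Optional

    import dask.dataframe as dd
    import pandas as pd
    from sqlmodel import Field, Session, SQLModel, create_engine, select


    class User(SQLModel, table=True):
        # Illustrative model; not part of sibi_dst
        id: Optional[int] = Field(default=None, primary_key=True)
        name: str


    engine = create_engine("sqlite://")
    SQLModel.metadata.create_all(engine)

    with Session(engine) as session:
        session.add(User(name="alice"))
        session.commit()
        results = session.exec(select(User)).fetchall()
        # Same conversion seen in SQLModelLoadFromDb: rows -> pandas -> one Dask partition
        df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)

    print(df.compute())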
@@ -54,8 +54,10 @@ django_field_conversion_map_dask: Dict[str, callable] = {
  "BooleanField": lambda x: x.astype(bool),
  "NullBooleanField": lambda x: x.astype(bool),
  "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
- "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
- "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+ "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+ meta=("date", "object")),
+ "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+ meta=("time", "object")),
  "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
  "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
  "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
@@ -72,12 +74,15 @@ sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
  Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
  Boolean.__name__: lambda x: x.astype(bool),
  DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
- Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date, meta=("date", "object")),
- Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time, meta=("time", "object")),
+ Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+ meta=("date", "object")),
+ Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+ meta=("time", "object")),
  JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
  UUID.__name__: lambda x: x.astype(str),
  }

+
  # Conversion map with normalized SQLAlchemy field types
  # sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
  # "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
@@ -129,5 +134,3 @@ def normalize_sqlalchemy_type(field_type):

  # Fallback to raw class name
  return field_type.__class__.__name__
-
-
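The two conversion maps above associate field-type names with callables that coerce a Dask Series. A small illustrative use of that pattern, with assumed sample data and using dd.to_datetime so the sketch is self-contained:

    import dask.dataframe as dd
    import pandas as pd

    # Illustrative subset of a field-type conversion map; the keys and sample data
    # are assumptions for the demo, the callables mirror the style shown above.
    conversion_map = {
        "DateTimeField": lambda x: dd.to_datetime(x, errors="coerce"),
        "DateField": lambda x: dd.to_datetime(x, errors="coerce").map_partitions(
            lambda s: s.dt.date, meta=("date", "object")),
    }

    pdf = pd.DataFrame({"created": ["2024-01-01", "2024-02-15", "not a date"]})
    ddf = dd.from_pandas(pdf, npartitions=1)

    # Look up the converter registered for the field's declared type and apply it
    ddf["created"] = conversion_map["DateField"](ddf["created"])
    print(ddf.compute())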
@@ -1,10 +1,13 @@
  import datetime
+
  import dask.dataframe as dd
  import pandas as pd
  from sqlalchemy import func, cast
  from sqlalchemy.sql.sqltypes import Date, Time
+
  from sibi_dst.utils import Logger

+
  class FilterHandler:
  def __init__(self, backend, logger=None):
  """
@@ -15,7 +18,8 @@ class FilterHandler:
  logger: Optional logger for debugging purposes.
  """
  self.backend = backend
- self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__) # No-op logger if none provided
+ self.logger = logger or Logger.default_logger(
+ logger_name=self.__class__.__name__) # No-op logger if none provided
  self.backend_methods = self._get_backend_methods(backend)

  def apply_filters(self, query_or_df, model=None, filters=None):
@@ -34,7 +38,7 @@ class FilterHandler:
  for key, value in filters.items():
  field_name, casting, operation = self._parse_filter_key(key)
  parsed_value = self._parse_filter_value(casting, value)
- #print(field_name, casting, operation, parsed_value)
+ # print(field_name, casting, operation, parsed_value)
  # Get the column and apply backend-specific transformations
  if self.backend == "sqlalchemy":
  column = self.backend_methods["get_column"](field_name, model, casting)
@@ -67,7 +71,6 @@ class FilterHandler:

  return field_name, casting, operation

-
  def _parse_filter_value(self, casting, value):
  """
  Convert filter value to appropriate type based on the casting (e.g., date).
@@ -213,4 +216,4 @@ class FilterHandler:
  return [
  "gte", "lte", "gt", "lt", "exact", "in", "range",
  "contains", "startswith", "endswith", "isnull",
- ]
+ ]
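FilterHandler parses Django-style lookup keys of the form field, optional casting (e.g. date), and operation. A simplified, hypothetical parser illustrating that convention; the casting set here is an assumption, while the operation list matches the one visible at the end of the hunk, and this is not the class's actual _parse_filter_key implementation:

    # Hypothetical sketch of Django-style filter-key parsing
    LOOKUP_SEP = "__"
    OPERATIONS = {
        "gte", "lte", "gt", "lt", "exact", "in", "range",
        "contains", "startswith", "endswith", "isnull",
    }
    CASTINGS = {"date", "time"}  # assumed set of supported castings


    def parse_filter_key(key: str):
        parts = key.split(LOOKUP_SEP)
        field_name, casting, operation = parts[0], None, "exact"
        for part in parts[1:]:
            if part in CASTINGS:
                casting = part
            elif part in OPERATIONS:
                operation = part
            else:
                # treat unknown parts as a nested field path
                field_name = LOOKUP_SEP.join([field_name, part])
        return field_name, casting, operation


    print(parse_filter_key("created_at__date__gte"))  # ('created_at', 'date', 'gte')
    print(parse_filter_key("status"))                 # ('status', None, 'exact')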
@@ -1,7 +1,7 @@
+ from typing import Optional, Dict, Union, List

  from pydantic import BaseModel, model_validator, Field

- from typing import Optional, Dict, Union, List
  dataframe_params: Dict[str, Union[None, str, bool, int, None]] = {
  "fieldnames": None,
  "index_col": None,
@@ -25,6 +25,7 @@ dataframe_options: Dict[str, Union[bool, str, int, None]] = {

  LOOKUP_SEP = "__"

+
  class ParamsConfig(BaseModel):
  field_map: Optional[Dict] = Field(default_factory=dict)
  legacy_filters: bool = False
@@ -76,4 +77,4 @@ class ParamsConfig(BaseModel):
  new_filter_field = LOOKUP_SEP.join(new_parts)
  new_filters[new_filter_field] = value

- self.filters = new_filters
+ self.filters = new_filters
@@ -1,7 +1,5 @@
  from typing import Optional

- import dask.dataframe as dd
- import pandas as pd
  from pydantic import BaseModel, model_validator


@@ -1,5 +1,5 @@
  from __future__ import annotations
- from ._credentials import *
+
  from ._log_utils import Logger
  from ._date_utils import *
  from ._data_utils import DataUtils
@@ -9,13 +9,14 @@ from ._df_utils import DfUtils
  from ._storage_manager import StorageManager
  from ._parquet_saver import ParquetSaver
  from ._clickhouse_writer import ClickHouseWriter
- from ._data_wrapper import DataWrapper
  from ._airflow_manager import AirflowDAGManager
+ from ._credentials import *
+ from ._data_wrapper import DataWrapper

- __all__=[
+ __all__ = [
+ "Logger",
  "ConfigManager",
  "ConfigLoader",
- "Logger",
  "DateUtils",
  "BusinessDays",
  "FileUtils",
@@ -27,4 +28,4 @@ __all__=[
  "DfUtils",
  "ClickHouseWriter",
  "AirflowDAGManager",
- ]
+ ]
@@ -1,8 +1,9 @@
  import os
- from jinja2 import Template
  from datetime import datetime
+
  import fsspec
  import httpx
+ from jinja2 import Template

  """
  A manager to dynamically generate, save, and upload Airflow DAGs via SSH using fsspec.
@@ -54,8 +55,8 @@ with DAG(
  {% endfor %}
  """

- class AirflowDAGManager:

+ class AirflowDAGManager:

  def __init__(self, output_dir, remote_dags_path, ssh_host, ssh_user, ssh_password, url, auth, wrapper_module_path):
  """
@@ -208,4 +209,4 @@ class AirflowDAGManager:
  return response.json()
  except httpx.RequestError as e:
  print(f"Failed to trigger DAG {dag_id}: {e}")
- raise
+ raise
@@ -1,9 +1,12 @@
+ from concurrent.futures import ThreadPoolExecutor
+
  import clickhouse_connect
+ import pandas as pd
  from clickhouse_driver import Client
  from dask.dataframe import dd
- import pandas as pd
+
  from sibi_dst.utils import Logger
- from concurrent.futures import ThreadPoolExecutor
+

  class ClickHouseWriter:
  dtype_to_clickhouse = {
@@ -19,20 +22,20 @@ class ClickHouseWriter:
  df: dd.DataFrame

  def __init__(self, logger=None, **kwargs):
- self.clickhouse_host = kwargs.setdefault('host',"localhost")
- self.clickhouse_port = kwargs.setdefault('port',8123)
- self.clickhouse_dbname = kwargs.setdefault('database','sibi_data')
- self.clickhouse_user = kwargs.setdefault('user','default')
- self.clickhouse_password = kwargs.setdefault('password','')
- self.clickhouse_table = kwargs.setdefault('table','test_sibi_table')
+ self.clickhouse_host = kwargs.setdefault('host', "localhost")
+ self.clickhouse_port = kwargs.setdefault('port', 8123)
+ self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
+ self.clickhouse_user = kwargs.setdefault('user', 'default')
+ self.clickhouse_password = kwargs.setdefault('password', '')
+ self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')

  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
  self.client = None
- self.order_by=kwargs.setdefault('order_by','id')
+ self.order_by = kwargs.setdefault('order_by', 'id')

  def save_to_clickhouse(self, df, **kwargs):
  self.df = df.copy()
- self.order_by = kwargs.setdefault('order_by',self.order_by)
+ self.order_by = kwargs.setdefault('order_by', self.order_by)
  if len(self.df.head().index) == 0:
  self.logger.debug("Dataframe is empty")
  return
@@ -86,8 +89,8 @@ class ClickHouseWriter:
  if engine is None:
  engine = f"ENGINE = MergeTree() order by {self.order_by}"
  dtypes = self.df.dtypes
- clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
- create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
+ clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
+ create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
  self.logger.debug(f"Creating table SQL:{create_table_sql}")
  if self.client:
  self.client.command(create_table_sql)
@@ -200,4 +203,4 @@ class ClickHouseWriter:
  with ThreadPoolExecutor() as executor:
  executor.map(write_partition, partitions, range(len(partitions)))
  except Exception as e:
- self.logger.error(f"Error during multi-partition write: {e}")
+ self.logger.error(f"Error during multi-partition write: {e}")
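The schema-creation step visible above maps DataFrame dtypes to ClickHouse column types and assembles a CREATE TABLE statement. A rough standalone sketch of that idea; the dtype-to-type mapping and helper below are assumptions for illustration, not the ClickHouseWriter's own dtype_to_clickhouse map or _generate_clickhouse_schema:

    import pandas as pd

    # Assumed mapping for the sketch; the real class ships its own map.
    DTYPE_TO_CLICKHOUSE = {
        "int64": "Int64",
        "float64": "Float64",
        "bool": "UInt8",
        "datetime64[ns]": "DateTime",
        "object": "String",
    }


    def generate_clickhouse_schema(dtypes: pd.Series, type_map: dict) -> str:
        # Build "`col` Type" pairs, defaulting unknown dtypes to String
        cols = [f"`{name}` {type_map.get(str(dtype), 'String')}"
                for name, dtype in dtypes.items()]
        return ", ".join(cols)


    df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "amount": [1.5, 2.5]})
    schema = generate_clickhouse_schema(df.dtypes, DTYPE_TO_CLICKHOUSE)
    order_by = "id"
    engine = f"ENGINE = MergeTree() order by {order_by}"
    create_table_sql = f"CREATE TABLE IF NOT EXISTS test_sibi_table ({schema}) {engine};"
    print(create_table_sql)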
@@ -1,5 +1,6 @@
  import os

+
  class ConfigLoader:
  def __init__(self, prefix, keys, defaults=None):
  """
@@ -54,4 +55,3 @@ class ConfigManager:
  :return: The configuration dictionary.
  """
  return self.configurations.get(name, {})
-
@@ -1,12 +1,16 @@
  import datetime
+ from concurrent.futures import ThreadPoolExecutor
  from typing import Type, Any, Dict, Optional
+
  import fsspec
  import pandas as pd
  from IPython.display import display
- from sibi_dst.utils import Logger
  from tqdm import tqdm
+
+ from sibi_dst.utils import Logger
  from sibi_dst.utils import ParquetSaver

+
  class DataWrapper:
  DEFAULT_MAX_AGE_MINUTES = 1440
  DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -29,7 +33,8 @@ class DataWrapper:
  logger: Optional[Logger] = None,
  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
- show_progress: bool = False):
+ show_progress: bool = False,
+ timeout: Optional[int] = 300):
  self.dataclass = dataclass
  self.date_field = date_field
  self.data_path = self.ensure_forward_slash(data_path)
@@ -47,6 +52,7 @@ class DataWrapper:
  self.max_age_minutes = max_age_minutes
  self.history_days_threshold = history_days_threshold
  self.show_progress = show_progress
+ self.timeout = timeout

  self.start_date = self.convert_to_date(start_date)
  self.end_date = self.convert_to_date(end_date)
@@ -73,31 +79,79 @@ class DataWrapper:
  yield date.date()

  def process(self):
- """Execute the update plan following the specified hierarchy."""
+ """Execute the update plan using 'update_priority' to determine processing order."""
  update_plan_table = self.generate_update_plan_with_conditions()

- # Display the update plan table to the user if show_progress is True
+ # Display the update plan table to the user if requested
  if self.show_progress:
  display(update_plan_table)

- # Process files according to the hierarchy, considering only `update_required` dates
- for category, description in [
- ("overwrite", "Processing files due to overwrite=True"),
- ("history_days", "Processing files within history_days_threshold"),
- ("missing_files", "Processing missing files")
- ]:
- # Filter dates in the category where `update_required` is True
+ # Filter out rows that do not require updates (priority 0 means skip)
+ update_plan_table = update_plan_table[
+ (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
+ ]
+
+ # Group by priority
+ priorities = sorted(update_plan_table["update_priority"].unique())
+
+ # We will process each priority level in its own thread.
+ # Each thread will handle all dates associated with that priority.
+ def process_priority(priority):
+ # Extract dates for the current priority
  dates_to_process = update_plan_table[
- (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
- ]["date"].tolist()
+ update_plan_table["update_priority"] == priority
+ ]["date"].tolist()

+ # If show_progress is True, wrap in a progress bar
  date_iterator = dates_to_process
  if self.show_progress:
- date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
+ date_iterator = tqdm(date_iterator, desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+ unit="date")

+ # Process each date for this priority
  for current_date in date_iterator:
  self.process_date(current_date)

+ # Launch a separate thread for each priority
+ with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
+ futures = {executor.submit(process_priority, p): p for p in priorities}
+ for future in futures:
+ try:
+ future.result(timeout=self.timeout)
+ except TimeoutError:
+ self.logger.error(f"Thread for {self.dataclass.__name__} timed out. Thread cancelled.")
+ future.cancel()
+ priority = futures[future]
+ new_future = executor.submit(process_priority, priority)
+ futures[new_future] = priority
+ self.logger.info(f"Resubmitted task for priority {priority} after timeout.")
+
+ # def process(self):
+ # """Execute the update plan following the specified hierarchy."""
+ # update_plan_table = self.generate_update_plan_with_conditions()
+ #
+ # # Display the update plan table to the user if show_progress is True
+ # if self.show_progress:
+ # display(update_plan_table)
+ #
+ # # Process files according to the hierarchy, considering only `update_required` dates
+ # for category, description in [
+ # ("overwrite", "Processing files due to overwrite=True"),
+ # ("history_days", "Processing files within history_days_threshold"),
+ # ("missing_files", "Processing missing files")
+ # ]:
+ # # Filter dates in the category where `update_required` is True
+ # dates_to_process = update_plan_table[
+ # (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
+ # ]["date"].tolist()
+ #
+ # date_iterator = dates_to_process
+ # if self.show_progress:
+ # date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
+ #
+ # for current_date in date_iterator:
+ # self.process_date(current_date)
+
  def is_file_older_than(self, file_path: str) -> bool:
  """
  Check if a file is older than the specified max_age_minutes.
@@ -130,7 +184,7 @@ class DataWrapper:
  data_object = self.dataclass(**self.class_params)
  df = data_object.load_period(dt_field=self.date_field, start=date, end=date)

- if len(df.index)==0:
+ if len(df.index) == 0:
  self.logger.error("No data found for the specified date.")
  return

@@ -178,12 +232,14 @@ class DataWrapper:
  category = "history_days"
  update_required = True
  else:
+ category = "file age is recent"
  update_required = False
  # Hierarchy 3: Missing files
  elif missing_file and current_date <= today:
  category = "missing_files"
  update_required = True
  else:
+ category = "No Update Required"
  update_required = False

  # Collect condition descriptions for the update plan table
@@ -194,8 +250,18 @@ class DataWrapper:
  "missing_file": missing_file,
  "update_required": update_required,
  "update_category": category,
- "datawrapper class":self.dataclass.__name__
+ "datawrapper class": self.dataclass.__name__
  })
+ priority_map = {
+ "overwrite": 1,
+ "history_days": 2,
+ "missing_files": 3
+ }
+
+ for row in rows:
+ category = row.get("update_category")
+ # Default to None if no category assigned (no update required)
+ row["update_priority"] = priority_map.get(category, 0)

  update_plan_table = pd.DataFrame(rows)
  return update_plan_table
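The biggest behavioral change in this release is the new DataWrapper.process(): update-plan rows are tagged with an update_priority (overwrite=1, history_days=2, missing_files=3, 0 = skip) and each priority level is processed in its own worker thread with a result timeout. A standalone sketch of that pattern under assumed data; the sample plan, the 300-second timeout, and the print stand-in for process_date are illustrative only:

    from concurrent.futures import ThreadPoolExecutor, TimeoutError

    import pandas as pd

    priority_map = {"overwrite": 1, "history_days": 2, "missing_files": 3}

    # Illustrative update plan; the real table comes from
    # generate_update_plan_with_conditions()
    plan = pd.DataFrame({
        "date": ["2024-01-01", "2024-01-02", "2024-01-03"],
        "update_category": ["overwrite", "missing_files", "No Update Required"],
        "update_required": [True, True, False],
    })
    plan["update_priority"] = plan["update_category"].map(priority_map).fillna(0).astype(int)

    # Keep only rows that actually need work (priority 0 means skip)
    plan = plan[(plan["update_required"]) & (plan["update_priority"] != 0)]


    def process_priority(priority: int) -> None:
        dates = plan[plan["update_priority"] == priority]["date"].tolist()
        for current_date in dates:
            # stand-in for DataWrapper.process_date(current_date)
            print(f"processing priority {priority}: {current_date}")


    priorities = sorted(plan["update_priority"].unique())
    with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
        futures = {executor.submit(process_priority, p): p for p in priorities}
        for future in list(futures):
            try:
                future.result(timeout=300)
            except TimeoutError:
                print(f"priority {futures[future]} timed out")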
@@ -1,8 +1,9 @@
  import datetime
- from typing import Union, Tuple, Callable, Dict, Any
+ from typing import Union, Tuple, Callable, Dict

  import numpy as np
  import pandas as pd
+
  from sibi_dst.utils import Logger


@@ -32,7 +33,8 @@ class DateUtils:
  raise ValueError(f"Unsupported date format: {value}")

  @classmethod
- def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[datetime.date, datetime.date]:
+ def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+ datetime.date, datetime.date]:
  """
  Calculate the start and end of the week for a given reference date.
  """
@@ -49,7 +51,8 @@ class DateUtils:
  return datetime.date(year, 1, 1), datetime.date(year, 12, 31)

  @classmethod
- def get_first_day_of_the_quarter(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
+ def get_first_day_of_the_quarter(cls, reference_date: Union[
+ str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
  """
  Get the first day of the quarter for a given date.
  """
@@ -58,7 +61,8 @@ class DateUtils:
  return datetime.date(reference_date.year, 3 * quarter - 2, 1)

  @classmethod
- def get_last_day_of_the_quarter(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
+ def get_last_day_of_the_quarter(cls, reference_date: Union[
+ str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
  """
  Get the last day of the quarter for a given date.
  """
@@ -116,10 +120,12 @@ class DateUtils:
  'current_month': lambda: cls.get_month_range(n=0),
  'last_month': lambda: cls.get_month_range(n=-1),
  'current_year': lambda: cls.get_year_timerange(today().year),
- 'current_quarter': lambda: (cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
+ 'current_quarter': lambda: (
+ cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
  'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
  }

+
  class BusinessDays:
  def __init__(self, holiday_list, logger):
  """
@@ -1,7 +1,9 @@
- import pandas as pd
  import dask.dataframe as dd
+ import pandas as pd
+
  from ._log_utils import Logger

+
  class DfUtils:
  def __init__(self, logger=None):
  """
@@ -210,7 +212,7 @@ class DfUtils:
  df['Total'] = df.sum(axis=1, numeric_only=True)
  return df

- def summarise_data(self,df, summary_column, values_column, rule='D', agg_func='count'):
+ def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
  """
  Summarizes data by creating a pivot table and resampling.

@@ -233,10 +235,12 @@ class DfUtils:
  df = df.set_index(dd.to_datetime(df.index))

  # Group by index and summary columns
- df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(agg_func).reset_index()
+ df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+ agg_func).reset_index()

  # Pivot the table
- df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column, aggfunc='sum').fillna(0)
+ df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+ aggfunc='sum').fillna(0)

  # Resample
  df_pivot.index = dd.to_datetime(df_pivot.index)
@@ -269,4 +273,4 @@ class DfUtils:
  Returns:
  DataFrame: Resampled pivot table.
  """
- return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
+ return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
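The summarise_data method reformatted above groups a frame by its datetime index plus a category column, pivots, and resamples. A pandas-only sketch of the same flow with assumed sample data (DfUtils performs this over Dask frames):

    import pandas as pd

    # Illustrative data: order events indexed by date, summarized per status
    df = pd.DataFrame(
        {"status": ["open", "closed", "open", "open"], "order_id": [1, 2, 3, 4]},
        index=pd.to_datetime(["2024-01-01", "2024-01-01", "2024-01-02", "2024-01-03"]),
    )
    df.index.name = "index"

    # Group by the datetime index and the summary column, count the value column
    grouped = df.groupby([df.index, "status"])["order_id"].count().reset_index()

    # Pivot so each status becomes a column, then resample daily
    pivot = grouped.pivot_table(index="index", columns="status",
                                values="order_id", aggfunc="sum").fillna(0)
    pivot.index = pd.to_datetime(pivot.index)
    print(pivot.resample("D").sum())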
@@ -1,10 +1,12 @@
  import shutil
  from pathlib import Path
  from typing import Optional
+
  import fsspec

  from sibi_dst.utils import Logger

+
  class FileUtils:
  def __init__(self, logger=None):
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -27,7 +29,7 @@ class FileUtils:
  fs.mkdirs(path)

  @staticmethod
- def construct_full_path(storage_path:str, parquet_filename: Optional[str]) -> Path:
+ def construct_full_path(storage_path: str, parquet_filename: Optional[str]) -> Path:
  """Construct and return the full path for the parquet file."""
  fs, base_path = fsspec.core.url_to_fs(storage_path)
  parquet_filename = parquet_filename or "default.parquet"
@@ -1,7 +1,8 @@
  import datetime
- import fsspec
  import re

+ import fsspec
+
  from sibi_dst.utils import Logger


@@ -150,6 +151,7 @@ class FilePathGenerator:
  return datetime.datetime.strptime(date, '%Y-%m-%d')
  return date

+
  """
  Usage:
  # Initialize the generator
@@ -182,4 +184,4 @@ for fp in file_paths:

  df_pandas = pd.concat(dataframes, ignore_index=True)
  print(df_pandas.head())
- """
+ """
@@ -71,4 +71,4 @@ class Logger:
  self.logger.error(msg)

  def critical(self, msg):
- self.logger.critical(msg)
+ self.logger.critical(msg)
@@ -1,7 +1,6 @@
  from pathlib import Path
  from typing import Optional

- import dask_expr
  import fsspec
  import pyarrow as pa

@@ -103,4 +102,3 @@ class ParquetSaver:
  self.df_result.to_parquet(
  str(full_path), engine="pyarrow", schema=schema, write_index=False
  )
-
@@ -1,4 +1,5 @@
  from types import SimpleNamespace
+
  import fsspec


@@ -86,4 +87,3 @@ class StorageManager:
  print("Rebuilding depot structure...")
  self.rebuild_depot_paths(depots, clear_existing=clear_existing)
  print("Rebuild complete.")
-
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.20
+ Version: 0.3.22
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com