sibi-dst 0.3.20__py3-none-any.whl → 0.3.22__py3-none-any.whl
This diff shows the content differences between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sibi_dst/__init__.py +1 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_df_helper.py +34 -33
- sibi_dst/df_helper/_parquet_artifact.py +4 -1
- sibi_dst/df_helper/_parquet_reader.py +2 -1
- sibi_dst/df_helper/backends/django/__init__.py +1 -2
- sibi_dst/df_helper/backends/django/_django_db_connection.py +1 -1
- sibi_dst/df_helper/backends/django/_django_load_from_db.py +6 -8
- sibi_dst/df_helper/backends/django/_django_sql_model_builder.py +5 -5
- sibi_dst/df_helper/backends/django/_io_dask.py +0 -1
- sibi_dst/df_helper/backends/django/_io_dask_alt.py +5 -4
- sibi_dst/df_helper/backends/http/__init__.py +2 -2
- sibi_dst/df_helper/backends/http/_http_config.py +6 -3
- sibi_dst/df_helper/backends/parquet/__init__.py +3 -3
- sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py +4 -2
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +12 -7
- sibi_dst/df_helper/backends/sql_alchemy/__init__.py +2 -2
- sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py +3 -1
- sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py +2 -3
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py +3 -3
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py +2 -2
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +5 -3
- sibi_dst/df_helper/backends/sql_model/__init__.py +1 -1
- sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py +5 -4
- sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py +13 -11
- sibi_dst/df_helper/core/_defaults.py +9 -6
- sibi_dst/df_helper/core/_filter_handler.py +7 -4
- sibi_dst/df_helper/core/_params_config.py +3 -2
- sibi_dst/df_helper/core/_query_config.py +0 -2
- sibi_dst/utils/__init__.py +6 -5
- sibi_dst/utils/_airflow_manager.py +4 -3
- sibi_dst/utils/_clickhouse_writer.py +16 -13
- sibi_dst/utils/_credentials.py +1 -1
- sibi_dst/utils/_data_wrapper.py +82 -16
- sibi_dst/utils/_date_utils.py +11 -5
- sibi_dst/utils/_df_utils.py +9 -5
- sibi_dst/utils/_file_utils.py +3 -1
- sibi_dst/utils/_filepath_generator.py +4 -2
- sibi_dst/utils/_log_utils.py +1 -1
- sibi_dst/utils/_parquet_saver.py +0 -2
- sibi_dst/utils/_storage_manager.py +1 -1
- {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.22.dist-info}/METADATA +1 -1
- sibi_dst-0.3.22.dist-info/RECORD +47 -0
- sibi_dst-0.3.20.dist-info/RECORD +0 -47
- {sibi_dst-0.3.20.dist-info → sibi_dst-0.3.22.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py CHANGED

@@ -1,19 +1,21 @@
-import dask.dataframe as dd
-from sqlmodel import Session, select, text
-from typing import Any, Dict, Optional
 import logging
+from typing import Any, Dict, Optional
+
+import dask.dataframe as dd
 import pandas as pd
+from sqlmodel import Session, select, text
+
 
 class SQLModelLoadFromDb:
     df: dd.DataFrame
 
     def __init__(
-
-
-
-
-
-
+            self,
+            db_connection,
+            db_query: Optional[Dict[str, Any]] = None,
+            db_params: Optional[Dict[str, Any]] = None,
+            logger=None,
+            **kwargs,
    ):
        """
        Initialize the loader with database connection, query, and parameters.
@@ -74,7 +76,7 @@ class SQLModelLoadFromDb:
        results = session.exec(query).fetchall()

        # Convert query results to a Dask DataFrame
-        print("results:",results)
+        print("results:", results)
        if results:
            df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
        else:
@@ -96,4 +98,4 @@ class SQLModelLoadFromDb:
        if field_map:
            rename_mapping = {k: v for k, v in field_map.items() if k in self.df.columns}
            if rename_mapping:
-                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
+                self.df = self.df.rename(columns=rename_mapping, meta={v: "object" for v in rename_mapping.values()})
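For context, here is a minimal runnable sketch of the row-to-Dask conversion pattern kept by this refactor (`dd.from_pandas` over a list of row dicts). The `Record` class and its values are illustrative stand-ins, not part of sibi_dst:

```python
import dask.dataframe as dd
import pandas as pd


class Record:
    """Illustrative stand-in for a SQLModel row object exposing .dict()."""

    def __init__(self, id, name):
        self.id = id
        self.name = name

    def dict(self):
        return {"id": self.id, "name": self.name}


results = [Record(1, "a"), Record(2, "b")]

# Same conversion as in the hunk above: pandas first, then one Dask partition.
df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
print(df.compute())
```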
sibi_dst/df_helper/core/_defaults.py CHANGED

@@ -54,8 +54,10 @@ django_field_conversion_map_dask: Dict[str, callable] = {
     "BooleanField": lambda x: x.astype(bool),
     "NullBooleanField": lambda x: x.astype(bool),
     "DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
-    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-
+    "DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+                                                                             meta=("date", "object")),
+    "TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+                                                                             meta=("time", "object")),
     "DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
     "JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
     "ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
@@ -72,12 +74,15 @@ sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
     Numeric.__name__: lambda x: pd.to_numeric(x, errors="coerce"),
     Boolean.__name__: lambda x: x.astype(bool),
     DateTime.__name__: lambda x: pd.to_datetime(x, errors="coerce"),
-    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
-
+    Date.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
+                                                                               meta=("date", "object")),
+    Time.__name__: lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
+                                                                               meta=("time", "object")),
     JSON.__name__: lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
     UUID.__name__: lambda x: x.astype(str),
 }
 
+
 # Conversion map with normalized SQLAlchemy field types
 # sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
 #     "String": lambda x: x.map_partitions(lambda s: s.astype(str), meta=("string", "string")),
@@ -129,5 +134,3 @@ def normalize_sqlalchemy_type(field_type):
 
     # Fallback to raw class name
     return field_type.__class__.__name__
-
-
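The new DateField/TimeField (and Date/Time) converters rely on `map_partitions` with an explicit `meta` so Dask knows the resulting column holds plain Python date/time objects. A hedged sketch of that pattern, using `dd.to_datetime` here so the example is pure Dask:

```python
import dask.dataframe as dd
import pandas as pd

s = dd.from_pandas(
    pd.Series(["2024-01-01 10:30:00", "2024-01-02 08:00:00"]), npartitions=1
)

parsed = dd.to_datetime(s, errors="coerce")
# meta tells Dask the output name/dtype without computing the partition.
dates = parsed.map_partitions(lambda x: x.dt.date, meta=("date", "object"))
times = parsed.map_partitions(lambda x: x.dt.time, meta=("time", "object"))

print(dates.compute())
print(times.compute())
```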
sibi_dst/df_helper/core/_filter_handler.py CHANGED

@@ -1,10 +1,13 @@
 import datetime
+
 import dask.dataframe as dd
 import pandas as pd
 from sqlalchemy import func, cast
 from sqlalchemy.sql.sqltypes import Date, Time
+
 from sibi_dst.utils import Logger
 
+
 class FilterHandler:
     def __init__(self, backend, logger=None):
         """
@@ -15,7 +18,8 @@ class FilterHandler:
             logger: Optional logger for debugging purposes.
         """
         self.backend = backend
-        self.logger = logger or Logger.default_logger(
+        self.logger = logger or Logger.default_logger(
+            logger_name=self.__class__.__name__)  # No-op logger if none provided
         self.backend_methods = self._get_backend_methods(backend)
 
     def apply_filters(self, query_or_df, model=None, filters=None):
@@ -34,7 +38,7 @@ class FilterHandler:
         for key, value in filters.items():
             field_name, casting, operation = self._parse_filter_key(key)
             parsed_value = self._parse_filter_value(casting, value)
-            #print(field_name, casting, operation, parsed_value)
+            # print(field_name, casting, operation, parsed_value)
             # Get the column and apply backend-specific transformations
             if self.backend == "sqlalchemy":
                 column = self.backend_methods["get_column"](field_name, model, casting)
@@ -67,7 +71,6 @@ class FilterHandler:
 
         return field_name, casting, operation
 
-
     def _parse_filter_value(self, casting, value):
         """
         Convert filter value to appropriate type based on the casting (e.g., date).
@@ -213,4 +216,4 @@ class FilterHandler:
         return [
             "gte", "lte", "gt", "lt", "exact", "in", "range",
             "contains", "startswith", "endswith", "isnull",
-        ]
+        ]
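The filter keys handled above follow the Django double-underscore convention: a field name, an optional date/time casting, and one of the operations listed in the last hunk. An illustrative parser, not the package's implementation, showing how such a key can be decomposed:

```python
# Hypothetical helper; CASTS/OPS mirror the operation list shown above.
CASTS = {"date", "time"}
OPS = {"gte", "lte", "gt", "lt", "exact", "in", "range",
       "contains", "startswith", "endswith", "isnull"}


def parse_filter_key(key: str):
    parts = key.split("__")
    field_name, casting, operation = parts[0], None, "exact"
    for part in parts[1:]:
        if part in CASTS:
            casting = part
        elif part in OPS:
            operation = part
        else:
            field_name = f"{field_name}__{part}"  # nested relation segment
    return field_name, casting, operation


print(parse_filter_key("created_at__date__gte"))  # ('created_at', 'date', 'gte')
print(parse_filter_key("status__in"))             # ('status', None, 'in')
```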
sibi_dst/df_helper/core/_params_config.py CHANGED

@@ -1,7 +1,7 @@
+from typing import Optional, Dict, Union, List
 
 from pydantic import BaseModel, model_validator, Field
 
-from typing import Optional, Dict, Union, List
 dataframe_params: Dict[str, Union[None, str, bool, int, None]] = {
     "fieldnames": None,
     "index_col": None,
@@ -25,6 +25,7 @@ dataframe_options: Dict[str, Union[bool, str, int, None]] = {
 
 LOOKUP_SEP = "__"
 
+
 class ParamsConfig(BaseModel):
     field_map: Optional[Dict] = Field(default_factory=dict)
     legacy_filters: bool = False
@@ -76,4 +77,4 @@ class ParamsConfig(BaseModel):
             new_filter_field = LOOKUP_SEP.join(new_parts)
             new_filters[new_filter_field] = value
 
-        self.filters = new_filters
+        self.filters = new_filters
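The legacy-filter handling visible in the last hunk rebuilds each filter key segment by segment and rejoins it with `LOOKUP_SEP`. A small, hedged illustration of that idea with a hypothetical `field_map`:

```python
LOOKUP_SEP = "__"

# Hypothetical legacy-to-current field renames; not taken from sibi_dst.
field_map = {"client_id": "customer_id"}
filters = {"client_id__gte": 100, "status__exact": "open"}

new_filters = {}
for key, value in filters.items():
    parts = key.split(LOOKUP_SEP)
    # Remap only the leading field segment, keep lookups such as gte/exact.
    new_parts = [field_map.get(parts[0], parts[0])] + parts[1:]
    new_filters[LOOKUP_SEP.join(new_parts)] = value

print(new_filters)  # {'customer_id__gte': 100, 'status__exact': 'open'}
```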
sibi_dst/utils/__init__.py CHANGED

@@ -1,5 +1,5 @@
 from __future__ import annotations
-
+
 from ._log_utils import Logger
 from ._date_utils import *
 from ._data_utils import DataUtils
@@ -9,13 +9,14 @@ from ._df_utils import DfUtils
 from ._storage_manager import StorageManager
 from ._parquet_saver import ParquetSaver
 from ._clickhouse_writer import ClickHouseWriter
-from ._data_wrapper import DataWrapper
 from ._airflow_manager import AirflowDAGManager
+from ._credentials import *
+from ._data_wrapper import DataWrapper
 
-__all__=[
+__all__ = [
+    "Logger",
     "ConfigManager",
     "ConfigLoader",
-    "Logger",
     "DateUtils",
     "BusinessDays",
     "FileUtils",
@@ -27,4 +28,4 @@ __all__=[
     "DfUtils",
     "ClickHouseWriter",
     "AirflowDAGManager",
-]
+]
sibi_dst/utils/_airflow_manager.py CHANGED

@@ -1,8 +1,9 @@
 import os
-from jinja2 import Template
 from datetime import datetime
+
 import fsspec
 import httpx
+from jinja2 import Template
 
 """
 A manager to dynamically generate, save, and upload Airflow DAGs via SSH using fsspec.
@@ -54,8 +55,8 @@ with DAG(
 {% endfor %}
 """
 
-class AirflowDAGManager:
 
+class AirflowDAGManager:
 
     def __init__(self, output_dir, remote_dags_path, ssh_host, ssh_user, ssh_password, url, auth, wrapper_module_path):
         """
@@ -208,4 +209,4 @@ class AirflowDAGManager:
             return response.json()
         except httpx.RequestError as e:
             print(f"Failed to trigger DAG {dag_id}: {e}")
-            raise
+            raise
sibi_dst/utils/_clickhouse_writer.py CHANGED

@@ -1,9 +1,12 @@
+from concurrent.futures import ThreadPoolExecutor
+
 import clickhouse_connect
+import pandas as pd
 from clickhouse_driver import Client
 from dask.dataframe import dd
-
+
 from sibi_dst.utils import Logger
-
+
 
 class ClickHouseWriter:
     dtype_to_clickhouse = {
@@ -19,20 +22,20 @@ class ClickHouseWriter:
     df: dd.DataFrame
 
     def __init__(self, logger=None, **kwargs):
-        self.clickhouse_host = kwargs.setdefault('host',"localhost")
-        self.clickhouse_port = kwargs.setdefault('port',8123)
-        self.clickhouse_dbname = kwargs.setdefault('database','sibi_data')
-        self.clickhouse_user = kwargs.setdefault('user','default')
-        self.clickhouse_password = kwargs.setdefault('password','')
-        self.clickhouse_table = kwargs.setdefault('table','test_sibi_table')
+        self.clickhouse_host = kwargs.setdefault('host', "localhost")
+        self.clickhouse_port = kwargs.setdefault('port', 8123)
+        self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
+        self.clickhouse_user = kwargs.setdefault('user', 'default')
+        self.clickhouse_password = kwargs.setdefault('password', '')
+        self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
 
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.client = None
-        self.order_by=kwargs.setdefault('order_by','id')
+        self.order_by = kwargs.setdefault('order_by', 'id')
 
     def save_to_clickhouse(self, df, **kwargs):
         self.df = df.copy()
-        self.order_by = kwargs.setdefault('order_by',self.order_by)
+        self.order_by = kwargs.setdefault('order_by', self.order_by)
         if len(self.df.head().index) == 0:
             self.logger.debug("Dataframe is empty")
             return
@@ -86,8 +89,8 @@ class ClickHouseWriter:
         if engine is None:
             engine = f"ENGINE = MergeTree() order by {self.order_by}"
         dtypes = self.df.dtypes
-        clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
-        create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
+        clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
+        create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
         self.logger.debug(f"Creating table SQL:{create_table_sql}")
         if self.client:
             self.client.command(create_table_sql)
@@ -200,4 +203,4 @@ class ClickHouseWriter:
             with ThreadPoolExecutor() as executor:
                 executor.map(write_partition, partitions, range(len(partitions)))
         except Exception as e:
-            self.logger.error(f"Error during multi-partition write: {e}")
+            self.logger.error(f"Error during multi-partition write: {e}")
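The table-creation hunk builds the CREATE TABLE statement from the DataFrame's dtypes and a dtype-to-ClickHouse map. A hedged sketch of that assembly; the mapping below is a partial illustration, not the writer's full table:

```python
import pandas as pd

# Partial, illustrative dtype mapping.
dtype_to_clickhouse = {
    "int64": "Int64",
    "float64": "Float64",
    "object": "String",
    "datetime64[ns]": "DateTime",
}

df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"], "amount": [1.5, 2.5]})

# One "`column` Type" fragment per column, falling back to String.
clickhouse_schema = ", ".join(
    f"`{col}` {dtype_to_clickhouse.get(str(dtype), 'String')}"
    for col, dtype in df.dtypes.items()
)
order_by = "id"
engine = f"ENGINE = MergeTree() order by {order_by}"
create_table_sql = f"CREATE TABLE IF NOT EXISTS test_sibi_table ({clickhouse_schema}) {engine};"
print(create_table_sql)
```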
sibi_dst/utils/_credentials.py CHANGED
sibi_dst/utils/_data_wrapper.py CHANGED

@@ -1,12 +1,16 @@
 import datetime
+from concurrent.futures import ThreadPoolExecutor
 from typing import Type, Any, Dict, Optional
+
 import fsspec
 import pandas as pd
 from IPython.display import display
-from sibi_dst.utils import Logger
 from tqdm import tqdm
+
+from sibi_dst.utils import Logger
 from sibi_dst.utils import ParquetSaver
 
+
 class DataWrapper:
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
@@ -29,7 +33,8 @@ class DataWrapper:
                  logger: Optional[Logger] = None,
                  max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
                  history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
-                 show_progress: bool = False
+                 show_progress: bool = False,
+                 timeout: Optional[int] = 300):
         self.dataclass = dataclass
         self.date_field = date_field
         self.data_path = self.ensure_forward_slash(data_path)
@@ -47,6 +52,7 @@
         self.max_age_minutes = max_age_minutes
         self.history_days_threshold = history_days_threshold
         self.show_progress = show_progress
+        self.timeout = timeout
 
         self.start_date = self.convert_to_date(start_date)
         self.end_date = self.convert_to_date(end_date)
@@ -73,31 +79,79 @@ class DataWrapper:
             yield date.date()
 
     def process(self):
-        """Execute the update plan
+        """Execute the update plan using 'update_priority' to determine processing order."""
         update_plan_table = self.generate_update_plan_with_conditions()
 
-        # Display the update plan table to the user if
+        # Display the update plan table to the user if requested
         if self.show_progress:
             display(update_plan_table)
 
-        #
-
-        ("
-
-
-
-
+        # Filter out rows that do not require updates (priority 0 means skip)
+        update_plan_table = update_plan_table[
+            (update_plan_table["update_required"] == True) & (update_plan_table["update_priority"] != 0)
+        ]
+
+        # Group by priority
+        priorities = sorted(update_plan_table["update_priority"].unique())
+
+        # We will process each priority level in its own thread.
+        # Each thread will handle all dates associated with that priority.
+        def process_priority(priority):
+            # Extract dates for the current priority
             dates_to_process = update_plan_table[
-
-
+                update_plan_table["update_priority"] == priority
+            ]["date"].tolist()
 
+            # If show_progress is True, wrap in a progress bar
             date_iterator = dates_to_process
             if self.show_progress:
-                date_iterator = tqdm(date_iterator, desc=f"{
+                date_iterator = tqdm(date_iterator, desc=f"Processing priority {priority}:{self.dataclass.__name__}",
+                                     unit="date")
 
+            # Process each date for this priority
             for current_date in date_iterator:
                 self.process_date(current_date)
 
+        # Launch a separate thread for each priority
+        with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
+            futures = {executor.submit(process_priority, p): p for p in priorities}
+            for future in futures:
+                try:
+                    future.result(timeout=self.timeout)
+                except TimeoutError:
+                    self.logger.error(f"Thread for {self.dataclass.__name__} timed out. Thread cancelled.")
+                    future.cancel()
+                    priority = futures[future]
+                    new_future = executor.submit(process_priority, priority)
+                    futures[new_future] = priority
+                    self.logger.info(f"Resubmitted task for priority {priority} after timeout.")
+
+    # def process(self):
+    #     """Execute the update plan following the specified hierarchy."""
+    #     update_plan_table = self.generate_update_plan_with_conditions()
+    #
+    #     # Display the update plan table to the user if show_progress is True
+    #     if self.show_progress:
+    #         display(update_plan_table)
+    #
+    #     # Process files according to the hierarchy, considering only `update_required` dates
+    #     for category, description in [
+    #         ("overwrite", "Processing files due to overwrite=True"),
+    #         ("history_days", "Processing files within history_days_threshold"),
+    #         ("missing_files", "Processing missing files")
+    #     ]:
+    #         # Filter dates in the category where `update_required` is True
+    #         dates_to_process = update_plan_table[
+    #             (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
+    #         ]["date"].tolist()
+    #
+    #         date_iterator = dates_to_process
+    #         if self.show_progress:
+    #             date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
+    #
+    #         for current_date in date_iterator:
+    #             self.process_date(current_date)
+
     def is_file_older_than(self, file_path: str) -> bool:
         """
         Check if a file is older than the specified max_age_minutes.
@@ -130,7 +184,7 @@
         data_object = self.dataclass(**self.class_params)
         df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
-        if len(df.index)==0:
+        if len(df.index) == 0:
             self.logger.error("No data found for the specified date.")
             return
 
@@ -178,12 +232,14 @@
                     category = "history_days"
                     update_required = True
                 else:
+                    category = "file age is recent"
                     update_required = False
             # Hierarchy 3: Missing files
             elif missing_file and current_date <= today:
                 category = "missing_files"
                 update_required = True
             else:
+                category = "No Update Required"
                 update_required = False
 
             # Collect condition descriptions for the update plan table
@@ -194,8 +250,18 @@
                 "missing_file": missing_file,
                 "update_required": update_required,
                 "update_category": category,
-                "datawrapper class":self.dataclass.__name__
+                "datawrapper class": self.dataclass.__name__
             })
+        priority_map = {
+            "overwrite": 1,
+            "history_days": 2,
+            "missing_files": 3
+        }
+
+        for row in rows:
+            category = row.get("update_category")
+            # Default to None if no category assigned (no update required)
+            row["update_priority"] = priority_map.get(category, 0)
 
         update_plan_table = pd.DataFrame(rows)
         return update_plan_table
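The core of this change is the rewritten process(): rows are filtered to those requiring an update, grouped by update_priority, and each priority level is processed in its own thread with a per-future timeout and resubmission. A simplified, runnable sketch of that control flow; the plan rows and the process_date stub are illustrative, not the package's code:

```python
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout

# Illustrative stand-in for the update plan table.
plan = [
    {"date": "2024-01-01", "update_priority": 1},
    {"date": "2024-01-02", "update_priority": 2},
    {"date": "2024-01-03", "update_priority": 2},
]


def process_date(date):
    print(f"processing {date}")


def process_priority(priority):
    # Each thread handles every date that shares its priority level.
    for row in plan:
        if row["update_priority"] == priority:
            process_date(row["date"])


priorities = sorted({row["update_priority"] for row in plan})
with ThreadPoolExecutor(max_workers=len(priorities)) as executor:
    futures = {executor.submit(process_priority, p): p for p in priorities}
    for future, priority in list(futures.items()):
        try:
            future.result(timeout=300)
        except FutureTimeout:
            # Mirror the resubmission-on-timeout behavior from the diff (once here).
            futures[executor.submit(process_priority, priority)] = priority
```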
sibi_dst/utils/_date_utils.py CHANGED

@@ -1,8 +1,9 @@
 import datetime
-from typing import Union, Tuple, Callable, Dict
+from typing import Union, Tuple, Callable, Dict
 
 import numpy as np
 import pandas as pd
+
 from sibi_dst.utils import Logger
 
 
@@ -32,7 +33,8 @@ class DateUtils:
         raise ValueError(f"Unsupported date format: {value}")
 
     @classmethod
-    def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+    def calc_week_range(cls, reference_date: Union[str, datetime.date, datetime.datetime, pd.Timestamp]) -> Tuple[
+        datetime.date, datetime.date]:
         """
         Calculate the start and end of the week for a given reference date.
         """
@@ -49,7 +51,8 @@
         return datetime.date(year, 1, 1), datetime.date(year, 12, 31)
 
     @classmethod
-    def get_first_day_of_the_quarter(cls, reference_date: Union[
+    def get_first_day_of_the_quarter(cls, reference_date: Union[
+        str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
         """
         Get the first day of the quarter for a given date.
         """
@@ -58,7 +61,8 @@
         return datetime.date(reference_date.year, 3 * quarter - 2, 1)
 
     @classmethod
-    def get_last_day_of_the_quarter(cls, reference_date: Union[
+    def get_last_day_of_the_quarter(cls, reference_date: Union[
+        str, datetime.date, datetime.datetime, pd.Timestamp]) -> datetime.date:
         """
         Get the last day of the quarter for a given date.
         """
@@ -116,10 +120,12 @@
             'current_month': lambda: cls.get_month_range(n=0),
             'last_month': lambda: cls.get_month_range(n=-1),
             'current_year': lambda: cls.get_year_timerange(today().year),
-            'current_quarter': lambda: (
+            'current_quarter': lambda: (
+                cls.get_first_day_of_the_quarter(today()), cls.get_last_day_of_the_quarter(today())),
             'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
         }
 
+
 class BusinessDays:
     def __init__(self, holiday_list, logger):
         """
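The quarter helpers shown as context build on the arithmetic that the quarter's first month is 3 * quarter - 2. A short worked example, assuming the usual quarter = (month - 1) // 3 + 1; the last-day computation here is one possible way to finish the calculation, not necessarily the package's:

```python
import datetime

reference_date = datetime.date(2024, 8, 15)

# Month 8 falls in quarter 3; its first month is 3*3 - 2 = 7 (July).
quarter = (reference_date.month - 1) // 3 + 1
first_day = datetime.date(reference_date.year, 3 * quarter - 2, 1)

# Last day: one day before the start of the next quarter (roll over the year for Q4).
if quarter == 4:
    next_quarter_start = datetime.date(reference_date.year + 1, 1, 1)
else:
    next_quarter_start = datetime.date(reference_date.year, 3 * quarter + 1, 1)
last_day = next_quarter_start - datetime.timedelta(days=1)

print(first_day, last_day)  # 2024-07-01 2024-09-30
```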
sibi_dst/utils/_df_utils.py CHANGED

@@ -1,7 +1,9 @@
-import pandas as pd
 import dask.dataframe as dd
+import pandas as pd
+
 from ._log_utils import Logger
 
+
 class DfUtils:
     def __init__(self, logger=None):
         """
@@ -210,7 +212,7 @@
         df['Total'] = df.sum(axis=1, numeric_only=True)
         return df
 
-    def summarise_data(self,df, summary_column, values_column, rule='D', agg_func='count'):
+    def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
         """
         Summarizes data by creating a pivot table and resampling.
 
@@ -233,10 +235,12 @@
         df = df.set_index(dd.to_datetime(df.index))
 
         # Group by index and summary columns
-        df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+        df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
+            agg_func).reset_index()
 
         # Pivot the table
-        df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+        df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
+                                          aggfunc='sum').fillna(0)
 
         # Resample
         df_pivot.index = dd.to_datetime(df_pivot.index)
@@ -269,4 +273,4 @@
         Returns:
             DataFrame: Resampled pivot table.
         """
-        return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
+        return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
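summarise_data groups by the datetime index plus a summary column, pivots, and resamples. Since the real method works on Dask frames, here is a hedged pandas illustration of the same sequence with made-up data:

```python
import pandas as pd

idx = pd.DatetimeIndex(
    ["2024-01-01", "2024-01-01", "2024-01-02", "2024-01-03"], name="date"
)
df = pd.DataFrame({"status": ["open", "closed", "open", "open"],
                   "order_id": [1, 2, 3, 4]}, index=idx)

# Group by the date index and the summary column, counting the values column.
grouped = df.groupby(["date", "status"])["order_id"].count().reset_index()

# Pivot so each status becomes a column, then resample to daily totals.
pivot = grouped.pivot_table(index="date", columns="status",
                            values="order_id", aggfunc="sum").fillna(0)
print(pivot.resample("D").sum())
```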
sibi_dst/utils/_file_utils.py CHANGED

@@ -1,10 +1,12 @@
 import shutil
 from pathlib import Path
 from typing import Optional
+
 import fsspec
 
 from sibi_dst.utils import Logger
 
+
 class FileUtils:
     def __init__(self, logger=None):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
@@ -27,7 +29,7 @@
             fs.mkdirs(path)
 
     @staticmethod
-    def construct_full_path(storage_path:str, parquet_filename: Optional[str]) -> Path:
+    def construct_full_path(storage_path: str, parquet_filename: Optional[str]) -> Path:
         """Construct and return the full path for the parquet file."""
         fs, base_path = fsspec.core.url_to_fs(storage_path)
         parquet_filename = parquet_filename or "default.parquet"
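construct_full_path relies on fsspec.core.url_to_fs to split a storage URL into a filesystem object and a base path. A brief sketch of that call; the path shown is illustrative:

```python
from pathlib import Path

import fsspec

# url_to_fs returns (filesystem, stripped path) for any supported protocol.
fs, base_path = fsspec.core.url_to_fs("file:///tmp/data")
full_path = Path(base_path) / "default.parquet"
print(type(fs).__name__, full_path)
```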
sibi_dst/utils/_filepath_generator.py CHANGED

@@ -1,7 +1,8 @@
 import datetime
-import fsspec
 import re
 
+import fsspec
+
 from sibi_dst.utils import Logger
 
 
@@ -150,6 +151,7 @@ class FilePathGenerator:
             return datetime.datetime.strptime(date, '%Y-%m-%d')
         return date
 
+
 """
 Usage:
 # Initialize the generator
@@ -182,4 +184,4 @@ for fp in file_paths:
 
 df_pandas = pd.concat(dataframes, ignore_index=True)
 print(df_pandas.head())
-"""
+"""
sibi_dst/utils/_log_utils.py CHANGED
sibi_dst/utils/_parquet_saver.py CHANGED

@@ -1,7 +1,6 @@
 from pathlib import Path
 from typing import Optional
 
-import dask_expr
 import fsspec
 import pyarrow as pa
 
@@ -103,4 +102,3 @@ class ParquetSaver:
         self.df_result.to_parquet(
             str(full_path), engine="pyarrow", schema=schema, write_index=False
         )
-
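For reference, writing a Dask DataFrame to Parquet with an explicit pyarrow schema, as the unchanged lines above do; the output path, columns, and schema here are illustrative:

```python
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa

df_result = dd.from_pandas(
    pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1
)
schema = pa.schema([("id", pa.int64()), ("name", pa.string())])

# Dask's pyarrow engine accepts an explicit schema and can skip the index.
df_result.to_parquet("/tmp/example_parquet", engine="pyarrow",
                     schema=schema, write_index=False)
```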