sibi-dst 2025.9.3-py3-none-any.whl → 2025.9.5-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- sibi_dst/__init__.py +6 -4
- sibi_dst/df_helper/__init__.py +1 -0
- sibi_dst/df_helper/_parquet_artifact.py +533 -113
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -281
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +349 -142
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -0
- sibi_dst/tests/test_baseclass.py +403 -0
- sibi_dst/utils/base.py +0 -254
- sibi_dst/utils/boilerplate/__init__.py +4 -1
- sibi_dst/utils/boilerplate/hybrid_data_loader.py +144 -0
- sibi_dst/utils/data_wrapper.py +460 -61
- sibi_dst/utils/parquet_saver.py +403 -161
- sibi_dst/utils/update_planner.py +553 -319
- sibi_dst/utils/write_gatekeeper.py +18 -0
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/METADATA +2 -2
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/RECORD +17 -14
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/WHEEL +0 -0
sibi_dst/utils/boilerplate/hybrid_data_loader.py (new file)
@@ -0,0 +1,144 @@
+import dask.dataframe as dd
+import datetime
+import pandas as pd
+from typing import Optional
+from sibi_dst.utils import Logger
+from sibi_dst.utils.dask_utils import dask_is_empty
+
+today = datetime.date.today()
+yesterday = today - datetime.timedelta(days=1)
+TODAY_STR = today.strftime('%Y-%m-%d')
+YESTERDAY_STR = yesterday.strftime('%Y-%m-%d')
+
+
+class HybridDataLoader:
+    """
+    A generic data loader that orchestrates loading from a historical
+    source and an optional live source.
+    """
+
+    def __init__(self, start_date: str, end_date: str, historical_reader, live_reader, date_field: str, **kwargs):
+        self.start_date = self._validate_date_format(start_date)
+        self.end_date = self._validate_date_format(end_date)
+        self.historical_reader = historical_reader
+        self.live_reader = live_reader
+        self.date_field = date_field
+
+        self.logger = kwargs.get('logger', Logger.default_logger(logger_name=__name__))
+        self.debug = kwargs.get('debug', False)
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+        # Validate date range
+        self._validate_date_range()
+
+        # Determine loading strategy
+        self._should_read_live = self.end_date == TODAY_STR
+        self._is_single_today = (self.start_date == TODAY_STR and self.end_date == TODAY_STR)
+        self._is_single_historical = (self.start_date == self.end_date and self.end_date != TODAY_STR)
+
+    def _validate_date_format(self, date_str: str) -> str:
+        """Validate that date string is in correct format."""
+        try:
+            datetime.datetime.strptime(date_str, '%Y-%m-%d')
+            return date_str
+        except ValueError:
+            raise ValueError(f"Date '{date_str}' is not in valid YYYY-MM-DD format")
+
+    def _validate_date_range(self):
+        """Validate that start date is not after end date."""
+        start = datetime.datetime.strptime(self.start_date, '%Y-%m-%d').date()
+        end = datetime.datetime.strptime(self.end_date, '%Y-%m-%d').date()
+        if end < start:
+            raise ValueError(f"End date ({self.end_date}) cannot be before start date ({self.start_date})")
+
+    def _align_schema_to_live(self, historical_df: dd.DataFrame, live_df: dd.DataFrame) -> dd.DataFrame:
+        """Forces the historical dataframe schema to match the live one."""
+        self.logger.debug("Aligning historical schema to match live schema.")
+        historical_cols = set(historical_df.columns)
+        live_cols = set(live_df.columns)
+
+        # Add missing columns to historical dataframe
+        for col in live_cols - historical_cols:
+            historical_df[col] = None
+
+        # Reorder columns to match live dataframe
+        return historical_df[list(live_df.columns)]
+
+    def _create_empty_dataframe(self) -> dd.DataFrame:
+        """Create an empty dask dataframe with proper structure."""
+        return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+    async def _load_today_data(self, **kwargs) -> Optional[dd.DataFrame]:
+        """Load today's data from the live reader."""
+        self.logger.debug(f"Loading today's live data...")
+        date_filter = {f"{self.date_field}__date": TODAY_STR}
+        filters = {**kwargs, **date_filter}
+
+        try:
+            today_df = await self.live_reader(
+                logger=self.logger,
+                debug=self.debug
+            ).aload(**filters)
+            return today_df
+        except Exception as e:
+            self.logger.error(f"Failed to load today's data: {e}")
+            if not self.debug:
+                return None
+            raise
+
+    async def _load_historical_data(self, start_date: str, end_date: str, **kwargs) -> dd.DataFrame:
+        """Load historical data from the historical reader."""
+        self.logger.debug(f"Loading historical data from {start_date} to {end_date}...")
+
+        try:
+            return await self.historical_reader(
+                parquet_start_date=start_date,
+                parquet_end_date=end_date,
+                logger=self.logger,
+                debug=self.debug
+            ).aload(**kwargs)
+        except Exception as e:
+            self.logger.error(f"Failed to load historical data from {start_date} to {end_date}: {e}")
+            if not self.debug:
+                return self._create_empty_dataframe()
+            raise
+
+    async def aload(self, **kwargs) -> dd.DataFrame:
+        """
+        Loads data from the historical source and, if required, the live source,
+        then concatenates them.
+        """
+        # Case 1: Only today's data requested
+        if self._is_single_today:
+            today_df = await self._load_today_data(**kwargs)
+            return today_df if today_df is not None else self._create_empty_dataframe()
+
+        # Case 2: Pure historical data (end date is not today)
+        if not self._should_read_live:
+            return await self._load_historical_data(self.start_date, self.end_date, **kwargs)
+
+        # Case 3: Mixed historical + live scenario (end date is today)
+        # Load historical data up to yesterday
+        historical_df = await self._load_historical_data(self.start_date, YESTERDAY_STR, **kwargs)
+
+        # Load today's data
+        today_df = await self._load_today_data(**kwargs)
+
+        # Combine dataframes
+        if today_df is not None and not dask_is_empty(today_df):
+            # Align schemas if needed
+            if len(historical_df.columns) > 0 and len(today_df.columns) > 0:
+                try:
+                    historical_df = self._align_schema_to_live(historical_df, today_df)
+                except Exception as e:
+                    self.logger.warning(f"Failed to align schemas: {e}")
+
+            return dd.concat([historical_df, today_df], ignore_index=True)
+        else:
+            return historical_df
+
+    def __repr__(self):
+        return (f"HybridDataLoader(start_date='{self.start_date}', "
+                f"end_date='{self.end_date}', "
+                f"loading_live={self._should_read_live})")
+
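
For context, a minimal usage sketch of the new HybridDataLoader follows. It assumes the class is re-exported from sibi_dst.utils.boilerplate (that package's __init__.py also changed in this release, though the export is not shown in this diff), and StubReader is a hypothetical stand-in written only to satisfy the contract the loader expects of its readers: constructible with the keyword arguments passed in _load_historical_data / _load_today_data, exposing an async aload().

import asyncio
import datetime

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.boilerplate import HybridDataLoader  # assumed re-export


class StubReader:
    # Hypothetical reader: swallows the kwargs HybridDataLoader passes
    # (logger, debug, parquet_start_date, ...) and returns a one-row frame.
    def __init__(self, **kwargs):
        self.kwargs = kwargs

    async def aload(self, **filters) -> dd.DataFrame:
        pdf = pd.DataFrame({"order_date": [datetime.date.today()], "amount": [1.0]})
        return dd.from_pandas(pdf, npartitions=1)


today = datetime.date.today().strftime("%Y-%m-%d")
loader = HybridDataLoader(
    start_date="2025-09-01",
    end_date=today,                # end date == today, so the live reader is also queried
    historical_reader=StubReader,
    live_reader=StubReader,
    date_field="order_date",       # hypothetical date column
)
ddf = asyncio.run(loader.aload())  # dask DataFrame: history up to yesterday + today's rows
print(ddf.compute())

Note the date-based dispatch: only an end_date equal to the current date triggers the live reader; any other range is served entirely from the historical (parquet) source, with today's slice filtered via the {date_field}__date key.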