sibi-dst 2025.9.3__py3-none-any.whl → 2025.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,144 @@
1
+ import dask.dataframe as dd
2
+ import datetime
3
+ import pandas as pd
4
+ from typing import Optional
5
+ from sibi_dst.utils import Logger
6
+ from sibi_dst.utils.dask_utils import dask_is_empty
7
+
8
# Date snapshot taken once at module import; everything downstream
# treats these values as "today"/"yesterday" for the process lifetime.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
TODAY_STR = today.isoformat()
YESTERDAY_STR = yesterday.isoformat()
12
+
13
+
14
class HybridDataLoader:
    """
    A generic data loader that orchestrates loading from a historical
    source and an optional live source.

    Date strings use the 'YYYY-MM-DD' format.  When the requested end
    date is today, rows up to yesterday come from the historical reader
    and today's rows come from the live reader; the two frames are
    schema-aligned and concatenated.
    """

    def __init__(self, start_date: str, end_date: str, historical_reader, live_reader, date_field: str, **kwargs):
        """
        :param start_date: Inclusive range start ('YYYY-MM-DD').
        :param end_date: Inclusive range end ('YYYY-MM-DD').
        :param historical_reader: Callable/class whose instances expose an
            async ``aload(**filters)`` returning a dask DataFrame.
        :param live_reader: Callable/class whose instances expose an async
            ``aload(**filters)`` returning a dask DataFrame of today's data.
        :param date_field: Column name used to build the live date filter.
        :param kwargs: Optional ``logger`` and ``debug`` entries.
        :raises ValueError: If a date is malformed or the range is inverted.
        """
        # Dates are normalized (zero-padded) so the string comparisons
        # against TODAY_STR below cannot be defeated by inputs such as
        # '2024-1-5', which strptime accepts but which would never compare
        # equal to the canonical form.
        self.start_date = self._validate_date_format(start_date)
        self.end_date = self._validate_date_format(end_date)
        self.historical_reader = historical_reader
        self.live_reader = live_reader
        self.date_field = date_field

        self.logger = kwargs.get('logger', Logger.default_logger(logger_name=__name__))
        self.debug = kwargs.get('debug', False)
        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)

        # Validate date range (start must not be after end).
        self._validate_date_range()

        # Loading strategy, fixed at construction time.
        # NOTE(review): TODAY_STR is captured at module import; a process
        # that lives across midnight will classify dates against a stale
        # "today" — confirm this is acceptable for the deployment model.
        self._should_read_live = self.end_date == TODAY_STR
        self._is_single_today = (self.start_date == TODAY_STR and self.end_date == TODAY_STR)
        # NOTE(review): currently unused by aload(); kept for callers that
        # may introspect it.
        self._is_single_historical = (self.start_date == self.end_date and self.end_date != TODAY_STR)

    def _validate_date_format(self, date_str: str) -> str:
        """Validate *date_str* and return it normalized to 'YYYY-MM-DD'.

        Returning the re-formatted value (instead of echoing the input)
        guarantees a canonical zero-padded string, which the loading
        strategy relies on for equality checks with TODAY_STR.

        :raises ValueError: If the string cannot be parsed as a date.
        """
        try:
            parsed = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        except ValueError as exc:
            raise ValueError(f"Date '{date_str}' is not in valid YYYY-MM-DD format") from exc
        return parsed.strftime('%Y-%m-%d')

    def _validate_date_range(self):
        """Raise ValueError if the start date is after the end date."""
        start = datetime.datetime.strptime(self.start_date, '%Y-%m-%d').date()
        end = datetime.datetime.strptime(self.end_date, '%Y-%m-%d').date()
        if end < start:
            raise ValueError(f"End date ({self.end_date}) cannot be before start date ({self.start_date})")

    def _align_schema_to_live(self, historical_df: dd.DataFrame, live_df: dd.DataFrame) -> dd.DataFrame:
        """Force the historical dataframe's schema to match the live one.

        Columns present only in the live frame are added to the historical
        frame as ``None`` (object dtype), then the historical frame is
        re-projected in the live frame's column order so ``dd.concat``
        lines the two up.
        """
        self.logger.debug("Aligning historical schema to match live schema.")
        missing = set(live_df.columns) - set(historical_df.columns)
        for col in missing:
            historical_df[col] = None
        # Reorder columns to match the live dataframe exactly.
        return historical_df[list(live_df.columns)]

    def _create_empty_dataframe(self) -> dd.DataFrame:
        """Return an empty single-partition dask dataframe (no columns)."""
        return dd.from_pandas(pd.DataFrame(), npartitions=1)

    async def _load_today_data(self, **kwargs) -> Optional[dd.DataFrame]:
        """Load today's rows via the live reader.

        Best-effort in non-debug mode: failures are logged and ``None`` is
        returned.  In debug mode the original exception is re-raised.
        """
        self.logger.debug("Loading today's live data...")
        date_filter = {f"{self.date_field}__date": TODAY_STR}
        # Caller filters first so the date filter always wins on conflict.
        filters = {**kwargs, **date_filter}

        try:
            today_df = await self.live_reader(
                logger=self.logger,
                debug=self.debug
            ).aload(**filters)
            return today_df
        except Exception as e:
            self.logger.error(f"Failed to load today's data: {e}")
            if not self.debug:
                return None
            raise

    async def _load_historical_data(self, start_date: str, end_date: str, **kwargs) -> dd.DataFrame:
        """Load rows in [start_date, end_date] via the historical reader.

        Best-effort in non-debug mode: failures are logged and an empty
        dataframe is returned.  In debug mode the exception is re-raised.
        """
        self.logger.debug(f"Loading historical data from {start_date} to {end_date}...")

        try:
            return await self.historical_reader(
                parquet_start_date=start_date,
                parquet_end_date=end_date,
                logger=self.logger,
                debug=self.debug
            ).aload(**kwargs)
        except Exception as e:
            self.logger.error(f"Failed to load historical data from {start_date} to {end_date}: {e}")
            if not self.debug:
                return self._create_empty_dataframe()
            raise

    async def aload(self, **kwargs) -> dd.DataFrame:
        """
        Load data from the historical source and, if required, the live
        source, then concatenate them.

        Three cases:
          1. start == end == today       -> live data only.
          2. end != today                -> historical data only.
          3. end == today, start earlier -> historical up to yesterday
             plus today's live data, schema-aligned and concatenated.
        """
        # Case 1: Only today's data requested.
        if self._is_single_today:
            today_df = await self._load_today_data(**kwargs)
            return today_df if today_df is not None else self._create_empty_dataframe()

        # Case 2: Pure historical data (end date is not today).
        if not self._should_read_live:
            return await self._load_historical_data(self.start_date, self.end_date, **kwargs)

        # Case 3: Mixed historical + live scenario (end date is today).
        historical_df = await self._load_historical_data(self.start_date, YESTERDAY_STR, **kwargs)
        today_df = await self._load_today_data(**kwargs)

        # No usable live data: the historical frame alone is the answer.
        if today_df is None or dask_is_empty(today_df):
            return historical_df

        # Align schemas before concatenating; alignment failures degrade
        # to a plain concat rather than aborting the whole load.
        if len(historical_df.columns) > 0 and len(today_df.columns) > 0:
            try:
                historical_df = self._align_schema_to_live(historical_df, today_df)
            except Exception as e:
                self.logger.warning(f"Failed to align schemas: {e}")

        return dd.concat([historical_df, today_df], ignore_index=True)

    def __repr__(self):
        return (f"HybridDataLoader(start_date='{self.start_date}', "
                f"end_date='{self.end_date}', "
                f"loading_live={self._should_read_live})")
144
+