sibi-flux 2025.12.0 (sibi_flux-2025.12.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/utils/boilerplate/base_cube_router.py
@@ -0,0 +1,283 @@
+ import asyncio
+ import json
+ from contextlib import suppress
+ from collections.abc import Mapping
+ from typing import Any, Optional, Type, Sequence, List
+
+ import pandas as pd
+ from fastapi import APIRouter, Body, HTTPException, Query, Request
+ from fastapi.encoders import jsonable_encoder
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel, ConfigDict, Field, create_model
+
+ from sibi_flux.dask_cluster import safe_compute
+ from sibi_flux.dask_cluster.client_manager import get_persistent_client
+ from sibi_flux.utils.boilerplate.base_data_cube import BaseDatacube
+ from sibi_flux.datacube._data_cube import Datacube
+
+
+ class BaseCubeRouter:
+     """
+     A reusable router class for Datacubes that provides:
+     - POST /: Data retrieval with dynamic filters (optional pagination).
+     - POST /stream: SSE streaming with dynamic filters.
+
+     Attributes:
+         cube_cls: The Datacube class to wrap.
+         router: The APIRouter instance containing the endpoints.
+         FiltersModel: Dynamic Pydantic model for validation.
+         pagination: Whether to enable pagination logic on the main endpoint.
+     """
+
+     def __init__(
+         self,
+         cube_cls: Type[BaseDatacube] | Type[Datacube],
+         prefix: str = "",
+         tags: Sequence[str] | None = None,
+         pagination: bool = True,
+     ):
+         self.cube_cls = cube_cls
+         self.pagination = pagination
+         self.router = APIRouter(prefix=prefix, tags=tags or [])
+         self.FiltersModel = self._create_filters_model()
+         self._setup_routes()
+
+     def _generate_docstring_and_example(self, field_defs: Mapping[str, Any]) -> str:
+         """
+         Generates a JSON-like representation of the model for the docstring.
+         """
+         example_dict = {}
+         for name, (type_, field) in field_defs.items():
+             # Simplistic type mapping for display
+             type_name = "string"
+             if type_ is Optional[bool]:
+                 type_name = True
+             elif type_ is Optional[int]:
+                 type_name = 0
+             elif type_ is Optional[float]:
+                 type_name = 0.0
+
+             example_dict[name] = type_name
+
+         pretty_json = json.dumps(example_dict, indent=2)
+         return f"Filter options:\n\n```json\n{pretty_json}\n```"
+
+     def _create_filters_model(self) -> Type[BaseModel]:
+         # Default behavior: if exclude_columns is empty/missing, exclude 'id'.
+         _exclude_columns = self.cube_cls.config.get("exclude_columns", [])
+         if not _exclude_columns:
+             _exclude_columns = ["id"]
+
+         try:
+             # Introspect dtypes by loading an empty slice/schema
+             # We use n_records=1 at init to hint the backend to limit query size
+             # Note: This requires the DB/Backend to be accessible at startup
+             cube = self.cube_cls(n_records=1)
+
+             # Try to load just metadata/empty df
+             # If backend doesn't support limit, this might raise Error or load full data.
+             # We catch exceptions to fallback.
+             df = cube.load()
+
+             if df is None:
+                 return self._fallback_create_model(_exclude_columns)
+
+             _field_definitions = {}
+
+             # Identify boolean fields from config as override/hint
+             _boolean_fields = getattr(self.cube_cls, "boolean_fields", [])
+             _field_map = self.cube_cls.config.get("field_map", {})
+             _target_columns = set(_field_map.values()) if _field_map else None
+
+             for col in df.columns:
+                 if col in _exclude_columns:
+                     continue
+
+                 # If field_map is provided, use only targeted columns
+                 if _target_columns is not None and col not in _target_columns:
+                     continue
+
+                 if col not in df.columns:
+                     continue
+
+                 dtype = df.dtypes[col]
+                 desc = f"Filter by {col}"
+                 field_type: Any = Optional[str]  # Default
+
+                 if col in _boolean_fields:
+                     field_type = Optional[bool]
+                     desc += " (bool)"
+                 elif pd.api.types.is_bool_dtype(dtype):
+                     field_type = Optional[bool]
+                     desc += " (bool)"
+                 elif pd.api.types.is_integer_dtype(dtype):
+                     field_type = Optional[int]
+                     desc += " (int)"
+                 elif pd.api.types.is_float_dtype(dtype):
+                     field_type = Optional[float]
+                     desc += " (float)"
+
+                 _field_definitions[col] = (
+                     field_type,
+                     Field(default=None, description=desc),
+                 )
+
+             model = create_model(
+                 f"{self.cube_cls.__name__}Filters",
+                 **_field_definitions,
+                 __config__=ConfigDict(extra="ignore"),
+             )
+             model.__doc__ = self._generate_docstring_and_example(_field_definitions)
+             return model
+
+         except Exception:
+             # Fallback to config-based generation
+             # print(f"DTO INTROSPECTION ERROR for {self.cube_cls.__name__}: {e}")
+             return self._fallback_create_model(_exclude_columns)
+
+     def _fallback_create_model(self, exclude_columns: Sequence[str]) -> Type[BaseModel]:
+         # Retrieve fields from the Cube configuration
+         _field_map = self.cube_cls.config.get("field_map", {})
+         _output_columns = list(_field_map.values()) if _field_map else []
+         _boolean_fields = getattr(self.cube_cls, "boolean_fields", [])
+
+         _field_definitions = {}
+         for col in _output_columns:
+             if col in exclude_columns:
+                 continue
+
+             if col in _boolean_fields:
+                 _field_definitions[col] = (
+                     Optional[bool],
+                     Field(default=None, description=f"Filter by {col} (bool)"),
+                 )
+             else:
+                 _field_definitions[col] = (
+                     Optional[str | int],
+                     Field(default=None, description=f"Filter by {col}"),
+                 )
+
+         FiltersModel: Any = create_model(
+             f"{self.cube_cls.__name__}Filters",
+             **_field_definitions,
+             __config__=ConfigDict(extra="ignore"),
+         )
+         FiltersModel.__doc__ = self._generate_docstring_and_example(_field_definitions)
+         return FiltersModel
+
+     async def _materialize_df(self, df: Any) -> pd.DataFrame:
+         if hasattr(df, "compute"):
+             # Use the shared persistent client if available
+             client = get_persistent_client()
+             return await asyncio.to_thread(safe_compute, df, dask_client=client)
+         return df
+
+     async def _stream_data_task(
+         self, cube: BaseDatacube, filters: Mapping[str, Any]
+     ) -> None:
+         async with cube:
+             await cube.emit("status", msg="loading")
+             try:
+                 df = await cube.aload(**filters)
+                 df = await self._materialize_df(df)
+
+                 if df is not None and not df.empty:
+                     records = df.to_dict(orient="records")
+                     await cube.emit("data", data=jsonable_encoder(records))
+
+                 await cube.emit("status", msg="complete")
+
+             except Exception as e:
+                 await cube.emit("error", detail=str(e))
+
+     def _setup_routes(self):
+         # We define the functions inside closure to capture 'self' reliably
+         # and use the dynamic FiltersModel for type annotation and FastAPI dependency.
+
+         FiltersModel = self.FiltersModel
+
+         async def _core_logic(cube, filters):
+             if isinstance(filters, dict):
+                 filters = FiltersModel(**filters)
+             filter_dict = filters.model_dump(exclude_unset=True)
+             try:
+                 df = await cube.aload(**filter_dict)
+             except Exception as e:
+                 raise HTTPException(status_code=500, detail=str(e))
+             df = await self._materialize_df(df)
+             return df
+
+         if self.pagination:
+
+             @self.router.post(
+                 "",
+                 summary=f"Get Paged {self.cube_cls.__name__} Data",
+                 description="Retrieve data with filtering and pagination.",
+             )
+             async def get_cube_data(
+                 filters: Any = Body(
+                     default_factory=lambda: FiltersModel(),
+                     description="Filters to apply.",
+                     openapi_examples={},
+                 ),
+                 page: int = Query(1, ge=1, description="Page number."),
+                 page_size: int = Query(
+                     50, ge=1, le=1000, description="Records per page."
+                 ),
+             ) -> List[dict[str, Any]]:
+                 cube = self.cube_cls()
+                 df = await _core_logic(cube, filters)
+
+                 if df is None or df.empty:
+                     return []
+
+                 start = (page - 1) * page_size
+                 end = start + page_size
+                 paged_df = df.iloc[start:end]
+                 return paged_df.to_dict(orient="records")
+
+         else:
+
+             @self.router.post(
+                 "",
+                 summary=f"Get {self.cube_cls.__name__} Data",
+                 description="Retrieve all data with filtering (pagination disabled).",
+             )
+             async def get_cube_data_all(
+                 filters: Any = Body(
+                     default_factory=lambda: FiltersModel(),
+                     description="Filters to apply.",
+                     openapi_examples={},
+                 ),
+             ) -> List[dict[str, Any]]:
+                 cube = self.cube_cls()
+                 df = await _core_logic(cube, filters)
+
+                 if df is None or df.empty:
+                     return []
+
+                 return df.to_dict(orient="records")
+
+         @self.router.post(
+             "/stream",
+             summary=f"Stream {self.cube_cls.__name__} Data",
+             description="Stream data updates via SSE.",
+         )
+         async def stream_cube_data(
+             request: Request,
+             filters: Any = Body(
+                 default_factory=lambda: FiltersModel(),
+                 description="Filters to apply.",
+                 openapi_examples={},
+             ),
+         ) -> StreamingResponse:
+             cube = self.cube_cls(auto_sse=True)
+             if isinstance(filters, dict):
+                 filters = FiltersModel(**filters)
+             filter_dict = filters.model_dump(exclude_unset=True)
+
+             asyncio.create_task(self._stream_data_task(cube, filter_dict))
+
+             return StreamingResponse(
+                 cube.get_sse().aiter_sse(), media_type="text/event-stream"
+             )
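
The router above builds its filter model by introspecting the cube's schema at construction time, so the cube's backend must be reachable when the router is created. A minimal usage sketch follows; it is not part of the wheel, and `OrdersCube`, its `config` values, and the route prefix are illustrative assumptions only.

```python
# Illustrative sketch only: OrdersCube and its config values are assumed.
from fastapi import FastAPI

from sibi_flux.utils.boilerplate.base_cube_router import BaseCubeRouter
from sibi_flux.utils.boilerplate.base_data_cube import BaseDatacube


class OrdersCube(BaseDatacube):
    # BaseCubeRouter reads config["exclude_columns"] and config["field_map"]
    # when it builds the dynamic FiltersModel.
    config = {"exclude_columns": ["id"], "field_map": {"order_id": "order_id"}}


app = FastAPI()
orders = BaseCubeRouter(OrdersCube, prefix="/orders", tags=["orders"], pagination=True)
app.include_router(orders.router)
# POST /orders        -> filtered, optionally paginated records
# POST /orders/stream -> the same data pushed as an SSE stream
```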
sibi_flux/utils/boilerplate/base_data_cube.py
@@ -0,0 +1,132 @@
+ """
+ Base DataCube module.
+
+ Provides the foundational classes for DataCube definitions and loading logic.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ from typing import Optional, Any, Dict
+
+ import dask.dataframe as dd
+ import pandas as pd
+
+ from sibi_flux.df_helper import DfHelper
+ from sibi_flux.dask_cluster import dask_is_empty
+
+
+ DataFrameType = dd.DataFrame | pd.DataFrame
+
+
+ class BaseDatacube(DfHelper):
+     """
+     Base cube with sync/async load hooks and resilient Dask handling.
+
+     Lifecycle:
+         1. load() / aload() calls parent DfHelper to fetch raw data.
+         2. If data exists, calls fix_data() (sync) or afix_data() (async).
+         3. Result is stored in self.df and returned.
+
+     Subclasses should override:
+         - fix_data(self, df): for CPU-bound transformations (Pandas/Dask map_partitions)
+         - afix_data(self, df): for I/O-bound transformations (DB lookups, etc.)
+     """
+
+     # Class-level config overrides (e.g. backend='parquet')
+     config: Dict[str, Any] = {}
+
+     def __init__(self, **kwargs):
+
+         combined_config = {**self.config, **kwargs}
+         super().__init__(**combined_config)
+
+         # State container
+         self.df: Optional[DataFrameType] = None
+
+     # -----------------------------------------------------------------------
+     # Hooks (Override these)
+     # -----------------------------------------------------------------------
+     def fix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
+         """
+         Synchronous transformation hook.
+
+         Args:
+             df: The loaded dataframe (Pandas or Dask)
+             **kwargs: The options passed to load()
+
+         Returns:
+             The transformed dataframe.
+         """
+         return df
+
+     async def afix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
+         """
+         Asynchronous transformation hook.
+
+         Defaults to calling fix_data(). Override this ONLY if you need
+         to perform async operations (like awaiting other DB calls).
+         """
+         return self.fix_data(df, **kwargs)
+
+     # -----------------------------------------------------------------------
+     # Public API
+     # -----------------------------------------------------------------------
+     def load(self, **kwargs) -> DataFrameType:
+         """
+         Synchronous load pipeline.
+         """
+         # 1. Load Raw Data (Delegate to DfHelper)
+         df = super().load(**kwargs)
+
+         # 2. Check Emptiness (Resilient)
+         if not self._is_empty(df):
+             # 3. Apply Transform Hook
+             df = self.fix_data(df, **kwargs)
+         else:
+             self.logger.debug(f"No data loaded by {self.__class__.__name__}")
+
+         # 4. Update State & Return
+         self.df = df
+         return df
+
+     async def aload(self, **kwargs) -> DataFrameType:
+         """
+         Asynchronous load pipeline.
+         """
+         # 1. Load Raw Data
+         df = await super().aload(**kwargs)
+
+         # 2. Check Emptiness (Non-blocking)
+         # _is_empty triggers dask.compute(), so we must offload to thread
+         is_empty = await asyncio.to_thread(self._is_empty, df)
+
+         if not is_empty:
+             # 3. Apply Async Transform Hook
+             # (Note: afix_data calls fix_data by default, so this covers both cases)
+             df = await self.afix_data(df, **kwargs)
+         else:
+             self.logger.debug(f"No data loaded by {self.__class__.__name__}")
+
+         # 4. Update State & Return
+         self.df = df
+         return df
+
+     # -----------------------------------------------------------------------
+     # Internals
+     # -----------------------------------------------------------------------
+     def _is_empty(self, df: Optional[DataFrameType]) -> bool:
+         """
+         Robust emptiness check using dask_cluster.
+         """
+         if df is None:
+             return True
+
+         if isinstance(df, pd.DataFrame):
+             return df.empty
+
+         # Use our new resilient checker
+         # This prevents the "compute whole graph to check length" disaster
+         # Pass the client if we have one (from DfHelper/DaskClientMixin)
+         client = getattr(self, "dask_client", None)
+         return dask_is_empty(df, dask_client=client, logger=self.logger)
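
The two hooks are the only methods subclasses are expected to touch: `fix_data()` for CPU-bound cleanup and `afix_data()` for async work. A minimal subclass sketch follows; the class name, the "amount" column, and the commented calls are assumptions, not part of the package.

```python
# Illustrative sketch only: SalesCube and the "amount" column are assumed.
import dask.dataframe as dd

from sibi_flux.utils.boilerplate.base_data_cube import BaseDatacube


class SalesCube(BaseDatacube):
    config = {}  # class-level overrides are merged into the DfHelper kwargs

    def fix_data(self, df, **kwargs):
        # CPU-bound cleanup: per-partition on Dask, direct on Pandas.
        if isinstance(df, dd.DataFrame):
            return df.map_partitions(lambda pdf: pdf.dropna(subset=["amount"]))
        return df.dropna(subset=["amount"])


# cube = SalesCube()
# df = cube.load()         # sync path: load -> _is_empty -> fix_data -> self.df
# df = await cube.aload()  # async path: aload -> afix_data (defaults to fix_data)
```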
sibi_flux/utils/boilerplate/base_pipeline_template.py
@@ -0,0 +1,54 @@
+ from __future__ import annotations
+
+ import pandas as pd
+
+ from sibi_flux.pipelines import BasePipeline
+
+
+ class PipelineTemplate:
+     """
+     A reusable base class for executing product-related pipelines end-to-end.
+     """
+
+     def __init__(
+         self,
+         start_date: str,
+         end_date: str,
+         fs_instance,
+         storage_path: str,
+         dataset_cls,
+         filename: str,
+         date_field: str = "last_activity_dt",
+         **kwargs,
+     ):
+         self.start_date = start_date
+         self.end_date = end_date
+         self.max_workers = kwargs.pop("max_workers", 4)
+         self.fs = fs_instance
+         self.storage_path = storage_path
+
+         self.pipeline = BasePipeline(
+             start_date=self.start_date,
+             end_date=self.end_date,
+             dataset_cls=dataset_cls,
+             parquet_storage_path=self.storage_path,
+             fs=self.fs,
+             filename=filename,
+             date_field=date_field,
+             max_workers=self.max_workers,
+         )
+
+     async def to_parquet(self, **kwargs) -> pd.DataFrame:
+         await self.pipeline.to_parquet(**kwargs)
+         df = await self.pipeline.from_parquet(**kwargs)
+         return df
+
+     async def from_parquet(self, **kwargs) -> pd.DataFrame:
+         df = await self.pipeline.from_parquet(**kwargs)
+         return df
+
+     async def to_clickhouse(self, clickhouse_conf, **kwargs) -> None:
+         cnf = clickhouse_conf.copy()
+         cnf["table"] = self.pipeline.filename
+         cnf["overwrite"] = True
+         await self.pipeline.to_clickhouse(cnf, **kwargs)
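
PipelineTemplate is a thin wrapper that forwards its arguments to `BasePipeline` and exposes three async steps. A hedged usage sketch follows, assuming a dataset class, an fsspec filesystem, and a storage path that are not part of this file.

```python
# Illustrative sketch only: ProductsDataset, the filesystem, and the path are assumed.
import asyncio

import fsspec

from sibi_flux.utils.boilerplate.base_pipeline_template import PipelineTemplate


async def main():
    tpl = PipelineTemplate(
        start_date="2025-01-01",
        end_date="2025-01-31",
        fs_instance=fsspec.filesystem("file"),
        storage_path="/tmp/warehouse/products",  # assumed path
        dataset_cls=ProductsDataset,             # assumed dataset class defined elsewhere
        filename="products",
        date_field="last_activity_dt",
        max_workers=4,
    )
    df = await tpl.to_parquet()  # writes Parquet, then reads it back as a DataFrame
    print(len(df))


# asyncio.run(main())
```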
sibi_flux/utils/boilerplate/hybrid_data_loader.py
@@ -0,0 +1,193 @@
+ import datetime
+ from typing import Optional
+ import pandas as pd
+ import dask.dataframe as dd
+ from sibi_flux.logger import Logger
+
+ TODAY = datetime.date.today()
+ YESTERDAY = TODAY - datetime.timedelta(days=1)
+ TODAY_STR = TODAY.strftime("%Y-%m-%d")
+ YESTERDAY_STR = YESTERDAY.strftime("%Y-%m-%d")
+
+
+ class HybridDataLoader:
+     """
+     Hybrid loader that merges historical (Parquet) and live (API/DB) data
+     in a consistent, schema-safe, timezone-normalized way.
+     """
+
+     def __init__(
+         self,
+         start_date: str,
+         end_date: str,
+         historical_reader,
+         live_reader,
+         date_field: str,
+         **kwargs,
+     ):
+         self.start_date = self._validate_date_format(start_date)
+         self.end_date = self._validate_date_format(end_date)
+         self.historical_reader = historical_reader
+         self.live_reader = live_reader
+         self.date_field = date_field
+
+         self.logger = kwargs.get("logger", Logger.default_logger(logger_name=__name__))
+         self.debug = kwargs.get("debug", False)
+         self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+         self._validate_date_range()
+
+         self._read_live_flag = self.end_date == TODAY_STR
+         self._is_single_today = self.start_date == self.end_date == TODAY_STR
+         self._is_single_historical = self.start_date == self.end_date != TODAY_STR
+
+     # ------------------------------------------------------------------ #
+     # Validation
+     # ------------------------------------------------------------------ #
+     @staticmethod
+     def _validate_date_format(date_str: str) -> str:
+         try:
+             datetime.datetime.strptime(date_str, "%Y-%m-%d")
+             return date_str
+         except ValueError:
+             raise ValueError(f"Date '{date_str}' is not in valid YYYY-MM-DD format")
+
+     def _validate_date_range(self):
+         start = datetime.datetime.strptime(self.start_date, "%Y-%m-%d").date()
+         end = datetime.datetime.strptime(self.end_date, "%Y-%m-%d").date()
+         if end < start:
+             raise ValueError(
+                 f"End date ({self.end_date}) cannot be before start date ({self.start_date})"
+             )
+
+     @staticmethod
+     def _normalize_datetimes(df: dd.DataFrame, cols: list[str]) -> dd.DataFrame:
+         """Normalize datetime columns to UTC safely."""
+         if not cols:
+             return df
+
+         def _to_ts(pdf: pd.DataFrame) -> pd.DataFrame:
+             for c in cols:
+                 if c in pdf.columns:
+                     pdf[c] = pd.to_datetime(pdf[c], errors="coerce", utc=True)
+             return pdf
+
+         return df.map_partitions(_to_ts)
+
+     @staticmethod
+     def _create_empty_dataframe(meta: Optional[pd.DataFrame] = None) -> dd.DataFrame:
+         if meta is not None and not meta.empty:
+             return dd.from_pandas(meta.iloc[0:0], npartitions=1)
+         return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+     # ------------------------------------------------------------------ #
+     # Data loading methods
+     # ------------------------------------------------------------------ #
+     async def _load_today_data(self, **kwargs) -> Optional[dd.DataFrame]:
+         """Load today's live data."""
+         self.logger.debug("Loading today's live data...")
+         date_filter = {f"{self.date_field}__date": TODAY_STR}
+         filters = {**kwargs, **date_filter}
+
+         try:
+             reader_obj = self.live_reader(logger=self.logger, debug=self.debug)
+             if not hasattr(reader_obj, "aload"):
+                 raise TypeError("live_reader must expose an async aload() method")
+             today_df = await reader_obj.aload(**filters)
+             if today_df is not None:
+                 today_df = self._normalize_datetimes(today_df, [self.date_field])
+             return today_df
+         except Exception as e:
+             self.logger.error(f"Failed to load today's data: {e}")
+             return None if not self.debug else (_ for _ in ()).throw(e)
+
+     async def _load_historical_data(
+         self, start_date: str, end_date: str, **kwargs
+     ) -> dd.DataFrame:
+         """Load historical data."""
+         self.logger.debug(f"Loading historical data from {start_date} to {end_date}...")
+         try:
+             reader_obj = self.historical_reader(
+                 parquet_start_date=start_date,
+                 parquet_end_date=end_date,
+                 logger=self.logger,
+                 debug=self.debug,
+             )
+             if not hasattr(reader_obj, "aload"):
+                 raise TypeError("historical_reader must expose an async aload() method")
+             df = await reader_obj.aload(**kwargs)
+             df = self._normalize_datetimes(df, [self.date_field])
+             return df
+         except Exception as e:
+             self.logger.error(f"Failed to load historical data: {e}")
+             if self.debug:
+                 raise
+             return self._create_empty_dataframe()
+
+     # ------------------------------------------------------------------ #
+     # Orchestrator
+     # ------------------------------------------------------------------ #
+     # ------------------------------------------------------------------ #
+     # Orchestrator
+     # ------------------------------------------------------------------ #
+     async def aload(self, **kwargs) -> dd.DataFrame:
+         """Load and concatenate data from historical and live sources."""
+         self.logger.debug(
+             f"[HybridLoader] start={self.start_date}, end={self.end_date}, "
+             f"read_live={self._read_live_flag}"
+         )
+
+         # Case 1: only today
+         if self._is_single_today:
+             today_df = await self._load_today_data(**kwargs)
+             return today_df if today_df is not None else self._create_empty_dataframe()
+
+         # Case 2: purely historical
+         if not self._read_live_flag:
+             return await self._load_historical_data(
+                 self.start_date, self.end_date, **kwargs
+             )
+
+         # Case 3: mixed historical + live
+         hist_df = await self._load_historical_data(
+             self.start_date, YESTERDAY_STR, **kwargs
+         )
+         live_df = await self._load_today_data(**kwargs)
+
+         if hist_df is None:
+             hist_df = self._create_empty_dataframe()
+         if live_df is None:
+             live_df = self._create_empty_dataframe()
+
+         # Standardize / Validate both before concat if schema is enforced
+         # (Assuming the caller might want to check schemas here)
+         # For now, we rely on the readers returning relatively clean data,
+         # but we can do a naive concat.
+
+         # Note: Previous implementation did heavy schema alignment.
+         # Dask concat handles most alignment if columns differ but dtypes are compatible.
+         # If columns are completely missing, Dask might warn or error.
+
+         try:
+             return dd.concat(
+                 [hist_df, live_df],
+                 ignore_unknown_divisions=True,
+                 interleave_partitions=True,
+             )
+         except Exception as e:
+             self.logger.warning(
+                 f"Simple concat failed: {e}. Attempting robust alignment..."
+             )
+             # Robust alignment fallback can be implemented here if strictly needed,
+             # possibly offloaded to a thread.
+             self.logger.error(
+                 "Robust alignment not yet reimplemented with DfValidator."
+             )
+             raise e
+
+     # ------------------------------------------------------------------ #
+     def __repr__(self):
+         return (
+             f"HybridDataLoader(start='{self.start_date}', end='{self.end_date}', "
+             f"read_live={self._read_live_flag})"
+         )
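
HybridDataLoader only takes the live branch when `end_date` equals today's date; otherwise it reads Parquet history alone. A hedged wiring sketch follows, assuming two reader classes that expose an async `aload()` method as the loader requires.

```python
# Illustrative sketch only: HistoricalOrders and LiveOrders are assumed reader classes.
import asyncio

from sibi_flux.utils.boilerplate.hybrid_data_loader import HybridDataLoader, TODAY_STR


async def main():
    loader = HybridDataLoader(
        start_date="2025-01-01",
        end_date=TODAY_STR,                  # ending today enables the live branch
        historical_reader=HistoricalOrders,  # assumed Parquet-backed reader class
        live_reader=LiveOrders,              # assumed API/DB reader class
        date_field="last_activity_dt",       # assumed date field
        debug=True,
    )
    ddf = await loader.aload()  # Dask DataFrame spanning both sources
    print(ddf.head())


# asyncio.run(main())
```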
sibi_flux/utils/clickhouse_writer/__init__.py
@@ -0,0 +1,6 @@
+ from ._clickhouse_writer import ClickHouseWriter, _to_bool
+
+ __all__ = [
+     "ClickHouseWriter",
+     "_to_bool",
+ ]