sibi_flux-2025.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +44 -0
- sibi_flux/__init__.py +49 -0
- sibi_flux/artifacts/__init__.py +7 -0
- sibi_flux/artifacts/base.py +166 -0
- sibi_flux/artifacts/parquet.py +360 -0
- sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
- sibi_flux/artifacts/parquet_engine/executor.py +204 -0
- sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
- sibi_flux/artifacts/parquet_engine/planner.py +544 -0
- sibi_flux/conf/settings.py +131 -0
- sibi_flux/core/__init__.py +5 -0
- sibi_flux/core/managed_resource/__init__.py +3 -0
- sibi_flux/core/managed_resource/_managed_resource.py +733 -0
- sibi_flux/core/type_maps/__init__.py +100 -0
- sibi_flux/dask_cluster/__init__.py +47 -0
- sibi_flux/dask_cluster/async_core.py +27 -0
- sibi_flux/dask_cluster/client_manager.py +549 -0
- sibi_flux/dask_cluster/core.py +322 -0
- sibi_flux/dask_cluster/exceptions.py +34 -0
- sibi_flux/dask_cluster/utils.py +49 -0
- sibi_flux/datacube/__init__.py +3 -0
- sibi_flux/datacube/_data_cube.py +332 -0
- sibi_flux/datacube/config_engine.py +152 -0
- sibi_flux/datacube/field_factory.py +48 -0
- sibi_flux/datacube/field_registry.py +122 -0
- sibi_flux/datacube/generator.py +677 -0
- sibi_flux/datacube/orchestrator.py +171 -0
- sibi_flux/dataset/__init__.py +3 -0
- sibi_flux/dataset/_dataset.py +162 -0
- sibi_flux/df_enricher/__init__.py +56 -0
- sibi_flux/df_enricher/async_enricher.py +201 -0
- sibi_flux/df_enricher/merger.py +253 -0
- sibi_flux/df_enricher/specs.py +45 -0
- sibi_flux/df_enricher/types.py +12 -0
- sibi_flux/df_helper/__init__.py +5 -0
- sibi_flux/df_helper/_df_helper.py +450 -0
- sibi_flux/df_helper/backends/__init__.py +34 -0
- sibi_flux/df_helper/backends/_params.py +173 -0
- sibi_flux/df_helper/backends/_strategies.py +295 -0
- sibi_flux/df_helper/backends/http/__init__.py +5 -0
- sibi_flux/df_helper/backends/http/_http_config.py +122 -0
- sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
- sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
- sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
- sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
- sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
- sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
- sibi_flux/df_helper/backends/utils.py +32 -0
- sibi_flux/df_helper/core/__init__.py +15 -0
- sibi_flux/df_helper/core/_defaults.py +104 -0
- sibi_flux/df_helper/core/_filter_handler.py +617 -0
- sibi_flux/df_helper/core/_params_config.py +185 -0
- sibi_flux/df_helper/core/_query_config.py +17 -0
- sibi_flux/df_validator/__init__.py +3 -0
- sibi_flux/df_validator/_df_validator.py +222 -0
- sibi_flux/logger/__init__.py +1 -0
- sibi_flux/logger/_logger.py +480 -0
- sibi_flux/mcp/__init__.py +26 -0
- sibi_flux/mcp/client.py +150 -0
- sibi_flux/mcp/router.py +126 -0
- sibi_flux/orchestration/__init__.py +9 -0
- sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
- sibi_flux/orchestration/_pipeline_executor.py +212 -0
- sibi_flux/osmnx_helper/__init__.py +22 -0
- sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
- sibi_flux/osmnx_helper/graph_loader.py +225 -0
- sibi_flux/osmnx_helper/utils.py +100 -0
- sibi_flux/pipelines/__init__.py +3 -0
- sibi_flux/pipelines/base.py +218 -0
- sibi_flux/py.typed +0 -0
- sibi_flux/readers/__init__.py +3 -0
- sibi_flux/readers/base.py +82 -0
- sibi_flux/readers/parquet.py +106 -0
- sibi_flux/utils/__init__.py +53 -0
- sibi_flux/utils/boilerplate/__init__.py +19 -0
- sibi_flux/utils/boilerplate/base_attacher.py +45 -0
- sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
- sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
- sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
- sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
- sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
- sibi_flux/utils/common.py +7 -0
- sibi_flux/utils/credentials/__init__.py +3 -0
- sibi_flux/utils/credentials/_config_manager.py +155 -0
- sibi_flux/utils/dask_utils.py +14 -0
- sibi_flux/utils/data_utils/__init__.py +3 -0
- sibi_flux/utils/data_utils/_data_utils.py +389 -0
- sibi_flux/utils/dataframe_utils.py +52 -0
- sibi_flux/utils/date_utils/__init__.py +10 -0
- sibi_flux/utils/date_utils/_business_days.py +220 -0
- sibi_flux/utils/date_utils/_date_utils.py +311 -0
- sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
- sibi_flux/utils/file_utils.py +48 -0
- sibi_flux/utils/filepath_generator/__init__.py +5 -0
- sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
- sibi_flux/utils/parquet_saver/__init__.py +6 -0
- sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
- sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
- sibi_flux/utils/retry.py +46 -0
- sibi_flux/utils/storage/__init__.py +7 -0
- sibi_flux/utils/storage/_fs_registry.py +112 -0
- sibi_flux/utils/storage/_storage_manager.py +257 -0
- sibi_flux/utils/storage/factory.py +33 -0
- sibi_flux-2025.12.0.dist-info/METADATA +283 -0
- sibi_flux-2025.12.0.dist-info/RECORD +110 -0
- sibi_flux-2025.12.0.dist-info/WHEEL +4 -0

sibi_flux/utils/boilerplate/base_cube_router.py
@@ -0,0 +1,283 @@
+import asyncio
+import json
+from contextlib import suppress
+from collections.abc import Mapping
+from typing import Any, Optional, Type, Sequence, List
+
+import pandas as pd
+from fastapi import APIRouter, Body, HTTPException, Query, Request
+from fastapi.encoders import jsonable_encoder
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel, ConfigDict, Field, create_model
+
+from sibi_flux.dask_cluster import safe_compute
+from sibi_flux.dask_cluster.client_manager import get_persistent_client
+from sibi_flux.utils.boilerplate.base_data_cube import BaseDatacube
+from sibi_flux.datacube._data_cube import Datacube
+
+
+class BaseCubeRouter:
+    """
+    A reusable router class for Datacubes that provides:
+    - POST /: Data retrieval with dynamic filters (optional pagination).
+    - POST /stream: SSE streaming with dynamic filters.
+
+    Attributes:
+        cube_cls: The Datacube class to wrap.
+        router: The APIRouter instance containing the endpoints.
+        FiltersModel: Dynamic Pydantic model for validation.
+        pagination: Whether to enable pagination logic on the main endpoint.
+    """
+
+    def __init__(
+        self,
+        cube_cls: Type[BaseDatacube] | Type[Datacube],
+        prefix: str = "",
+        tags: Sequence[str] | None = None,
+        pagination: bool = True,
+    ):
+        self.cube_cls = cube_cls
+        self.pagination = pagination
+        self.router = APIRouter(prefix=prefix, tags=tags or [])
+        self.FiltersModel = self._create_filters_model()
+        self._setup_routes()
+
+    def _generate_docstring_and_example(self, field_defs: Mapping[str, Any]) -> str:
+        """
+        Generates a JSON-like representation of the model for the docstring.
+        """
+        example_dict = {}
+        for name, (type_, field) in field_defs.items():
+            # Simplistic type mapping for display
+            type_name = "string"
+            if type_ is Optional[bool]:
+                type_name = True
+            elif type_ is Optional[int]:
+                type_name = 0
+            elif type_ is Optional[float]:
+                type_name = 0.0
+
+            example_dict[name] = type_name
+
+        pretty_json = json.dumps(example_dict, indent=2)
+        return f"Filter options:\n\n```json\n{pretty_json}\n```"
+
+    def _create_filters_model(self) -> Type[BaseModel]:
+        # Default behavior: if exclude_columns is empty/missing, exclude 'id'.
+        _exclude_columns = self.cube_cls.config.get("exclude_columns", [])
+        if not _exclude_columns:
+            _exclude_columns = ["id"]
+
+        try:
+            # Introspect dtypes by loading an empty slice/schema
+            # We use n_records=1 at init to hint the backend to limit query size
+            # Note: This requires the DB/Backend to be accessible at startup
+            cube = self.cube_cls(n_records=1)
+
+            # Try to load just metadata/empty df
+            # If backend doesn't support limit, this might raise Error or load full data.
+            # We catch exceptions to fallback.
+            df = cube.load()
+
+            if df is None:
+                return self._fallback_create_model(_exclude_columns)
+
+            _field_definitions = {}
+
+            # Identify boolean fields from config as override/hint
+            _boolean_fields = getattr(self.cube_cls, "boolean_fields", [])
+            _field_map = self.cube_cls.config.get("field_map", {})
+            _target_columns = set(_field_map.values()) if _field_map else None
+
+            for col in df.columns:
+                if col in _exclude_columns:
+                    continue
+
+                # If field_map is provided, use only targeted columns
+                if _target_columns is not None and col not in _target_columns:
+                    continue
+
+                if col not in df.columns:
+                    continue
+
+                dtype = df.dtypes[col]
+                desc = f"Filter by {col}"
+                field_type: Any = Optional[str]  # Default
+
+                if col in _boolean_fields:
+                    field_type = Optional[bool]
+                    desc += " (bool)"
+                elif pd.api.types.is_bool_dtype(dtype):
+                    field_type = Optional[bool]
+                    desc += " (bool)"
+                elif pd.api.types.is_integer_dtype(dtype):
+                    field_type = Optional[int]
+                    desc += " (int)"
+                elif pd.api.types.is_float_dtype(dtype):
+                    field_type = Optional[float]
+                    desc += " (float)"
+
+                _field_definitions[col] = (
+                    field_type,
+                    Field(default=None, description=desc),
+                )
+
+            model = create_model(
+                f"{self.cube_cls.__name__}Filters",
+                **_field_definitions,
+                __config__=ConfigDict(extra="ignore"),
+            )
+            model.__doc__ = self._generate_docstring_and_example(_field_definitions)
+            return model
+
+        except Exception:
+            # Fallback to config-based generation
+            # print(f"DTO INTROSPECTION ERROR for {self.cube_cls.__name__}: {e}")
+            return self._fallback_create_model(_exclude_columns)
+
+    def _fallback_create_model(self, exclude_columns: Sequence[str]) -> Type[BaseModel]:
+        # Retrieve fields from the Cube configuration
+        _field_map = self.cube_cls.config.get("field_map", {})
+        _output_columns = list(_field_map.values()) if _field_map else []
+        _boolean_fields = getattr(self.cube_cls, "boolean_fields", [])
+
+        _field_definitions = {}
+        for col in _output_columns:
+            if col in exclude_columns:
+                continue
+
+            if col in _boolean_fields:
+                _field_definitions[col] = (
+                    Optional[bool],
+                    Field(default=None, description=f"Filter by {col} (bool)"),
+                )
+            else:
+                _field_definitions[col] = (
+                    Optional[str | int],
+                    Field(default=None, description=f"Filter by {col}"),
+                )
+
+        FiltersModel: Any = create_model(
+            f"{self.cube_cls.__name__}Filters",
+            **_field_definitions,
+            __config__=ConfigDict(extra="ignore"),
+        )
+        FiltersModel.__doc__ = self._generate_docstring_and_example(_field_definitions)
+        return FiltersModel
+
+    async def _materialize_df(self, df: Any) -> pd.DataFrame:
+        if hasattr(df, "compute"):
+            # Use the shared persistent client if available
+            client = get_persistent_client()
+            return await asyncio.to_thread(safe_compute, df, dask_client=client)
+        return df
+
+    async def _stream_data_task(
+        self, cube: BaseDatacube, filters: Mapping[str, Any]
+    ) -> None:
+        async with cube:
+            await cube.emit("status", msg="loading")
+            try:
+                df = await cube.aload(**filters)
+                df = await self._materialize_df(df)
+
+                if df is not None and not df.empty:
+                    records = df.to_dict(orient="records")
+                    await cube.emit("data", data=jsonable_encoder(records))
+
+                await cube.emit("status", msg="complete")
+
+            except Exception as e:
+                await cube.emit("error", detail=str(e))
+
+    def _setup_routes(self):
+        # We define the functions inside closure to capture 'self' reliably
+        # and use the dynamic FiltersModel for type annotation and FastAPI dependency.
+
+        FiltersModel = self.FiltersModel
+
+        async def _core_logic(cube, filters):
+            if isinstance(filters, dict):
+                filters = FiltersModel(**filters)
+            filter_dict = filters.model_dump(exclude_unset=True)
+            try:
+                df = await cube.aload(**filter_dict)
+            except Exception as e:
+                raise HTTPException(status_code=500, detail=str(e))
+            df = await self._materialize_df(df)
+            return df
+
+        if self.pagination:
+
+            @self.router.post(
+                "",
+                summary=f"Get Paged {self.cube_cls.__name__} Data",
+                description="Retrieve data with filtering and pagination.",
+            )
+            async def get_cube_data(
+                filters: Any = Body(
+                    default_factory=lambda: FiltersModel(),
+                    description="Filters to apply.",
+                    openapi_examples={},
+                ),
+                page: int = Query(1, ge=1, description="Page number."),
+                page_size: int = Query(
+                    50, ge=1, le=1000, description="Records per page."
+                ),
+            ) -> List[dict[str, Any]]:
+                cube = self.cube_cls()
+                df = await _core_logic(cube, filters)
+
+                if df is None or df.empty:
+                    return []
+
+                start = (page - 1) * page_size
+                end = start + page_size
+                paged_df = df.iloc[start:end]
+                return paged_df.to_dict(orient="records")
+
+        else:
+
+            @self.router.post(
+                "",
+                summary=f"Get {self.cube_cls.__name__} Data",
+                description="Retrieve all data with filtering (pagination disabled).",
+            )
+            async def get_cube_data_all(
+                filters: Any = Body(
+                    default_factory=lambda: FiltersModel(),
+                    description="Filters to apply.",
+                    openapi_examples={},
+                ),
+            ) -> List[dict[str, Any]]:
+                cube = self.cube_cls()
+                df = await _core_logic(cube, filters)
+
+                if df is None or df.empty:
+                    return []
+
+                return df.to_dict(orient="records")
+
+        @self.router.post(
+            "/stream",
+            summary=f"Stream {self.cube_cls.__name__} Data",
+            description="Stream data updates via SSE.",
+        )
+        async def stream_cube_data(
+            request: Request,
+            filters: Any = Body(
+                default_factory=lambda: FiltersModel(),
+                description="Filters to apply.",
+                openapi_examples={},
+            ),
+        ) -> StreamingResponse:
+            cube = self.cube_cls(auto_sse=True)
+            if isinstance(filters, dict):
+                filters = FiltersModel(**filters)
+            filter_dict = filters.model_dump(exclude_unset=True)
+
+            asyncio.create_task(self._stream_data_task(cube, filter_dict))
+
+            return StreamingResponse(
+                cube.get_sse().aiter_sse(), media_type="text/event-stream"
+            )
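
The router above is designed to be mounted on a FastAPI application. The following is a minimal usage sketch: `OrdersCube` and its module are hypothetical stand-ins for a concrete Datacube subclass, while `BaseCubeRouter`, its constructor arguments, and the `.router` attribute come from the file shown above.

```python
# Hypothetical usage sketch: OrdersCube is an illustrative Datacube subclass,
# not something shipped in this package.
from fastapi import FastAPI

from sibi_flux.utils.boilerplate.base_cube_router import BaseCubeRouter
from myapp.cubes import OrdersCube  # hypothetical application code

app = FastAPI()

# Building the router introspects the cube to create a dynamic FiltersModel and
# registers POST "" (paged by default) and POST "/stream" (SSE) endpoints.
orders = BaseCubeRouter(cube_cls=OrdersCube, prefix="/orders", tags=["orders"])
app.include_router(orders.router)
```

Passing `pagination=False` replaces the paged endpoint with one that returns the full filtered result set.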

sibi_flux/utils/boilerplate/base_data_cube.py
@@ -0,0 +1,132 @@
+"""
+Base DataCube module.
+
+Provides the foundational classes for DataCube definitions and loading logic.
+"""
+
+from __future__ import annotations
+
+import asyncio
+from typing import Optional, Any, Dict
+
+import dask.dataframe as dd
+import pandas as pd
+
+from sibi_flux.df_helper import DfHelper
+from sibi_flux.dask_cluster import dask_is_empty
+
+
+DataFrameType = dd.DataFrame | pd.DataFrame
+
+
+class BaseDatacube(DfHelper):
+    """
+    Base cube with sync/async load hooks and resilient Dask handling.
+
+    Lifecycle:
+        1. load() / aload() calls parent DfHelper to fetch raw data.
+        2. If data exists, calls fix_data() (sync) or afix_data() (async).
+        3. Result is stored in self.df and returned.
+
+    Subclasses should override:
+        - fix_data(self, df): for CPU-bound transformations (Pandas/Dask map_partitions)
+        - afix_data(self, df): for I/O-bound transformations (DB lookups, etc.)
+    """
+
+    # Class-level config overrides (e.g. backend='parquet')
+    config: Dict[str, Any] = {}
+
+    def __init__(self, **kwargs):
+
+        combined_config = {**self.config, **kwargs}
+        super().__init__(**combined_config)
+
+        # State container
+        self.df: Optional[DataFrameType] = None
+
+    # -----------------------------------------------------------------------
+    # Hooks (Override these)
+    # -----------------------------------------------------------------------
+    def fix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
+        """
+        Synchronous transformation hook.
+
+        Args:
+            df: The loaded dataframe (Pandas or Dask)
+            **kwargs: The options passed to load()
+
+        Returns:
+            The transformed dataframe.
+        """
+        return df
+
+    async def afix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
+        """
+        Asynchronous transformation hook.
+
+        Defaults to calling fix_data(). Override this ONLY if you need
+        to perform async operations (like awaiting other DB calls).
+        """
+        return self.fix_data(df, **kwargs)
+
+    # -----------------------------------------------------------------------
+    # Public API
+    # -----------------------------------------------------------------------
+    def load(self, **kwargs) -> DataFrameType:
+        """
+        Synchronous load pipeline.
+        """
+        # 1. Load Raw Data (Delegate to DfHelper)
+        df = super().load(**kwargs)
+
+        # 2. Check Emptiness (Resilient)
+        if not self._is_empty(df):
+            # 3. Apply Transform Hook
+            df = self.fix_data(df, **kwargs)
+        else:
+            self.logger.debug(f"No data loaded by {self.__class__.__name__}")
+
+        # 4. Update State & Return
+        self.df = df
+        return df
+
+    async def aload(self, **kwargs) -> DataFrameType:
+        """
+        Asynchronous load pipeline.
+        """
+        # 1. Load Raw Data
+        df = await super().aload(**kwargs)
+
+        # 2. Check Emptiness (Non-blocking)
+        # _is_empty triggers dask.compute(), so we must offload to thread
+        is_empty = await asyncio.to_thread(self._is_empty, df)
+
+        if not is_empty:
+            # 3. Apply Async Transform Hook
+            # (Note: afix_data calls fix_data by default, so this covers both cases)
+            df = await self.afix_data(df, **kwargs)
+        else:
+            self.logger.debug(f"No data loaded by {self.__class__.__name__}")
+
+        # 4. Update State & Return
+        self.df = df
+        return df
+
+    # -----------------------------------------------------------------------
+    # Internals
+    # -----------------------------------------------------------------------
+    def _is_empty(self, df: Optional[DataFrameType]) -> bool:
+        """
+        Robust emptiness check using dask_cluster.
+        """
+        if df is None:
+            return True
+
+        if isinstance(df, pd.DataFrame):
+            return df.empty
+
+        # Use our new resilient checker
+        # This prevents the "compute whole graph to check length" disaster
+        # Pass the client if we have one (from DfHelper/DaskClientMixin)
+        client = getattr(self, "dask_client", None)
+        return dask_is_empty(df, dask_client=client, logger=self.logger)
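
Since the docstring directs subclasses to override `fix_data()` (and `afix_data()` only when async work is required), here is a minimal subclass sketch. The config entry and column names are illustrative assumptions, not defaults defined by the package.

```python
# Illustrative subclass: the config entry and column names are assumptions.
from sibi_flux.utils.boilerplate.base_data_cube import BaseDatacube, DataFrameType


class SalesCube(BaseDatacube):
    # Class-level config is merged with constructor kwargs in __init__.
    config = {"backend": "parquet"}

    def fix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
        # CPU-bound cleanup; rename works on both Pandas and Dask frames.
        return df.rename(columns={"amt": "amount"})


# load()/aload() fetch raw data via DfHelper, run the hook only when the frame
# is non-empty, and store the result on self.df before returning it.
```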

sibi_flux/utils/boilerplate/base_pipeline_template.py
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import pandas as pd
+
+from sibi_flux.pipelines import BasePipeline
+
+
+class PipelineTemplate:
+    """
+    A reusable base class for executing product-related pipelines end-to-end.
+    """
+
+    def __init__(
+        self,
+        start_date: str,
+        end_date: str,
+        fs_instance,
+        storage_path: str,
+        dataset_cls,
+        filename: str,
+        date_field: str = "last_activity_dt",
+        **kwargs,
+    ):
+        self.start_date = start_date
+        self.end_date = end_date
+        self.max_workers = kwargs.pop("max_workers", 4)
+        self.fs = fs_instance
+        self.storage_path = storage_path
+
+        self.pipeline = BasePipeline(
+            start_date=self.start_date,
+            end_date=self.end_date,
+            dataset_cls=dataset_cls,
+            parquet_storage_path=self.storage_path,
+            fs=self.fs,
+            filename=filename,
+            date_field=date_field,
+            max_workers=self.max_workers,
+        )
+
+    async def to_parquet(self, **kwargs) -> pd.DataFrame:
+        await self.pipeline.to_parquet(**kwargs)
+        df = await self.pipeline.from_parquet(**kwargs)
+        return df
+
+    async def from_parquet(self, **kwargs) -> pd.DataFrame:
+        df = await self.pipeline.from_parquet(**kwargs)
+        return df
+
+    async def to_clickhouse(self, clickhouse_conf, **kwargs) -> None:
+        cnf = clickhouse_conf.copy()
+        cnf["table"] = self.pipeline.filename
+        cnf["overwrite"] = True
+        await self.pipeline.to_clickhouse(cnf, **kwargs)
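
`PipelineTemplate` simply wires its constructor arguments into a `BasePipeline`. A hedged usage sketch follows, in which the fsspec filesystem, dataset class, and storage path are placeholders chosen for illustration.

```python
# Hypothetical wiring: the filesystem, dataset class, and paths are assumptions.
import asyncio

import fsspec

from sibi_flux.utils.boilerplate.base_pipeline_template import PipelineTemplate
from myapp.datasets import OrdersDataset  # hypothetical dataset class


async def main() -> None:
    pipeline = PipelineTemplate(
        start_date="2025-01-01",
        end_date="2025-01-31",
        fs_instance=fsspec.filesystem("file"),
        storage_path="/data/parquet/orders",
        dataset_cls=OrdersDataset,
        filename="orders",
        max_workers=4,
    )
    # to_parquet() writes the date range to Parquet, then reads it back.
    df = await pipeline.to_parquet()
    print(len(df))


asyncio.run(main())
```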

sibi_flux/utils/boilerplate/hybrid_data_loader.py
@@ -0,0 +1,193 @@
+import datetime
+from typing import Optional
+import pandas as pd
+import dask.dataframe as dd
+from sibi_flux.logger import Logger
+
+TODAY = datetime.date.today()
+YESTERDAY = TODAY - datetime.timedelta(days=1)
+TODAY_STR = TODAY.strftime("%Y-%m-%d")
+YESTERDAY_STR = YESTERDAY.strftime("%Y-%m-%d")
+
+
+class HybridDataLoader:
+    """
+    Hybrid loader that merges historical (Parquet) and live (API/DB) data
+    in a consistent, schema-safe, timezone-normalized way.
+    """
+
+    def __init__(
+        self,
+        start_date: str,
+        end_date: str,
+        historical_reader,
+        live_reader,
+        date_field: str,
+        **kwargs,
+    ):
+        self.start_date = self._validate_date_format(start_date)
+        self.end_date = self._validate_date_format(end_date)
+        self.historical_reader = historical_reader
+        self.live_reader = live_reader
+        self.date_field = date_field
+
+        self.logger = kwargs.get("logger", Logger.default_logger(logger_name=__name__))
+        self.debug = kwargs.get("debug", False)
+        self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
+
+        self._validate_date_range()
+
+        self._read_live_flag = self.end_date == TODAY_STR
+        self._is_single_today = self.start_date == self.end_date == TODAY_STR
+        self._is_single_historical = self.start_date == self.end_date != TODAY_STR
+
+    # ------------------------------------------------------------------ #
+    # Validation
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def _validate_date_format(date_str: str) -> str:
+        try:
+            datetime.datetime.strptime(date_str, "%Y-%m-%d")
+            return date_str
+        except ValueError:
+            raise ValueError(f"Date '{date_str}' is not in valid YYYY-MM-DD format")
+
+    def _validate_date_range(self):
+        start = datetime.datetime.strptime(self.start_date, "%Y-%m-%d").date()
+        end = datetime.datetime.strptime(self.end_date, "%Y-%m-%d").date()
+        if end < start:
+            raise ValueError(
+                f"End date ({self.end_date}) cannot be before start date ({self.start_date})"
+            )
+
+    @staticmethod
+    def _normalize_datetimes(df: dd.DataFrame, cols: list[str]) -> dd.DataFrame:
+        """Normalize datetime columns to UTC safely."""
+        if not cols:
+            return df
+
+        def _to_ts(pdf: pd.DataFrame) -> pd.DataFrame:
+            for c in cols:
+                if c in pdf.columns:
+                    pdf[c] = pd.to_datetime(pdf[c], errors="coerce", utc=True)
+            return pdf
+
+        return df.map_partitions(_to_ts)
+
+    @staticmethod
+    def _create_empty_dataframe(meta: Optional[pd.DataFrame] = None) -> dd.DataFrame:
+        if meta is not None and not meta.empty:
+            return dd.from_pandas(meta.iloc[0:0], npartitions=1)
+        return dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+    # ------------------------------------------------------------------ #
+    # Data loading methods
+    # ------------------------------------------------------------------ #
+    async def _load_today_data(self, **kwargs) -> Optional[dd.DataFrame]:
+        """Load today's live data."""
+        self.logger.debug("Loading today's live data...")
+        date_filter = {f"{self.date_field}__date": TODAY_STR}
+        filters = {**kwargs, **date_filter}
+
+        try:
+            reader_obj = self.live_reader(logger=self.logger, debug=self.debug)
+            if not hasattr(reader_obj, "aload"):
+                raise TypeError("live_reader must expose an async aload() method")
+            today_df = await reader_obj.aload(**filters)
+            if today_df is not None:
+                today_df = self._normalize_datetimes(today_df, [self.date_field])
+            return today_df
+        except Exception as e:
+            self.logger.error(f"Failed to load today's data: {e}")
+            return None if not self.debug else (_ for _ in ()).throw(e)
+
+    async def _load_historical_data(
+        self, start_date: str, end_date: str, **kwargs
+    ) -> dd.DataFrame:
+        """Load historical data."""
+        self.logger.debug(f"Loading historical data from {start_date} to {end_date}...")
+        try:
+            reader_obj = self.historical_reader(
+                parquet_start_date=start_date,
+                parquet_end_date=end_date,
+                logger=self.logger,
+                debug=self.debug,
+            )
+            if not hasattr(reader_obj, "aload"):
+                raise TypeError("historical_reader must expose an async aload() method")
+            df = await reader_obj.aload(**kwargs)
+            df = self._normalize_datetimes(df, [self.date_field])
+            return df
+        except Exception as e:
+            self.logger.error(f"Failed to load historical data: {e}")
+            if self.debug:
+                raise
+            return self._create_empty_dataframe()
+
+    # ------------------------------------------------------------------ #
+    # Orchestrator
+    # ------------------------------------------------------------------ #
+    # ------------------------------------------------------------------ #
+    # Orchestrator
+    # ------------------------------------------------------------------ #
+    async def aload(self, **kwargs) -> dd.DataFrame:
+        """Load and concatenate data from historical and live sources."""
+        self.logger.debug(
+            f"[HybridLoader] start={self.start_date}, end={self.end_date}, "
+            f"read_live={self._read_live_flag}"
+        )
+
+        # Case 1: only today
+        if self._is_single_today:
+            today_df = await self._load_today_data(**kwargs)
+            return today_df if today_df is not None else self._create_empty_dataframe()
+
+        # Case 2: purely historical
+        if not self._read_live_flag:
+            return await self._load_historical_data(
+                self.start_date, self.end_date, **kwargs
+            )
+
+        # Case 3: mixed historical + live
+        hist_df = await self._load_historical_data(
+            self.start_date, YESTERDAY_STR, **kwargs
+        )
+        live_df = await self._load_today_data(**kwargs)
+
+        if hist_df is None:
+            hist_df = self._create_empty_dataframe()
+        if live_df is None:
+            live_df = self._create_empty_dataframe()
+
+        # Standardize / Validate both before concat if schema is enforced
+        # (Assuming the caller might want to check schemas here)
+        # For now, we rely on the readers returning relatively clean data,
+        # but we can do a naive concat.
+
+        # Note: Previous implementation did heavy schema alignment.
+        # Dask concat handles most alignment if columns differ but dtypes are compatible.
+        # If columns are completely missing, Dask might warn or error.
+
+        try:
+            return dd.concat(
+                [hist_df, live_df],
+                ignore_unknown_divisions=True,
+                interleave_partitions=True,
+            )
+        except Exception as e:
+            self.logger.warning(
+                f"Simple concat failed: {e}. Attempting robust alignment..."
+            )
+            # Robust alignment fallback can be implemented here if strictly needed,
+            # possibly offloaded to a thread.
+            self.logger.error(
+                "Robust alignment not yet reimplemented with DfValidator."
+            )
+            raise e
+
+    # ------------------------------------------------------------------ #
+    def __repr__(self):
+        return (
+            f"HybridDataLoader(start='{self.start_date}', end='{self.end_date}', "
+            f"read_live={self._read_live_flag})"
+        )
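
To close, a hedged sketch of how `HybridDataLoader` might be driven. The reader classes are hypothetical; the only contract the loader imposes (visible above) is that their instances expose an async `aload()` method, that the historical reader accepts `parquet_start_date`/`parquet_end_date`, and that both accept `logger`/`debug`.

```python
# Hypothetical usage: ParquetOrdersReader and LiveOrdersCube are stand-ins for
# application-defined readers that satisfy the aload() contract shown above.
import asyncio

from sibi_flux.utils.boilerplate.hybrid_data_loader import HybridDataLoader
from myapp.readers import ParquetOrdersReader, LiveOrdersCube  # hypothetical


async def main() -> None:
    loader = HybridDataLoader(
        start_date="2025-01-01",
        end_date="2025-12-31",  # if this equals today's date, live data is merged in
        historical_reader=ParquetOrdersReader,
        live_reader=LiveOrdersCube,
        date_field="last_activity_dt",
        debug=False,
    )
    # Extra keyword arguments are passed through to the readers' aload() calls.
    ddf = await loader.aload(status="open")
    print(ddf.compute().head())


asyncio.run(main())
```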