sibi_flux-2025.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +44 -0
- sibi_flux/__init__.py +49 -0
- sibi_flux/artifacts/__init__.py +7 -0
- sibi_flux/artifacts/base.py +166 -0
- sibi_flux/artifacts/parquet.py +360 -0
- sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
- sibi_flux/artifacts/parquet_engine/executor.py +204 -0
- sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
- sibi_flux/artifacts/parquet_engine/planner.py +544 -0
- sibi_flux/conf/settings.py +131 -0
- sibi_flux/core/__init__.py +5 -0
- sibi_flux/core/managed_resource/__init__.py +3 -0
- sibi_flux/core/managed_resource/_managed_resource.py +733 -0
- sibi_flux/core/type_maps/__init__.py +100 -0
- sibi_flux/dask_cluster/__init__.py +47 -0
- sibi_flux/dask_cluster/async_core.py +27 -0
- sibi_flux/dask_cluster/client_manager.py +549 -0
- sibi_flux/dask_cluster/core.py +322 -0
- sibi_flux/dask_cluster/exceptions.py +34 -0
- sibi_flux/dask_cluster/utils.py +49 -0
- sibi_flux/datacube/__init__.py +3 -0
- sibi_flux/datacube/_data_cube.py +332 -0
- sibi_flux/datacube/config_engine.py +152 -0
- sibi_flux/datacube/field_factory.py +48 -0
- sibi_flux/datacube/field_registry.py +122 -0
- sibi_flux/datacube/generator.py +677 -0
- sibi_flux/datacube/orchestrator.py +171 -0
- sibi_flux/dataset/__init__.py +3 -0
- sibi_flux/dataset/_dataset.py +162 -0
- sibi_flux/df_enricher/__init__.py +56 -0
- sibi_flux/df_enricher/async_enricher.py +201 -0
- sibi_flux/df_enricher/merger.py +253 -0
- sibi_flux/df_enricher/specs.py +45 -0
- sibi_flux/df_enricher/types.py +12 -0
- sibi_flux/df_helper/__init__.py +5 -0
- sibi_flux/df_helper/_df_helper.py +450 -0
- sibi_flux/df_helper/backends/__init__.py +34 -0
- sibi_flux/df_helper/backends/_params.py +173 -0
- sibi_flux/df_helper/backends/_strategies.py +295 -0
- sibi_flux/df_helper/backends/http/__init__.py +5 -0
- sibi_flux/df_helper/backends/http/_http_config.py +122 -0
- sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
- sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
- sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
- sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
- sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
- sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
- sibi_flux/df_helper/backends/utils.py +32 -0
- sibi_flux/df_helper/core/__init__.py +15 -0
- sibi_flux/df_helper/core/_defaults.py +104 -0
- sibi_flux/df_helper/core/_filter_handler.py +617 -0
- sibi_flux/df_helper/core/_params_config.py +185 -0
- sibi_flux/df_helper/core/_query_config.py +17 -0
- sibi_flux/df_validator/__init__.py +3 -0
- sibi_flux/df_validator/_df_validator.py +222 -0
- sibi_flux/logger/__init__.py +1 -0
- sibi_flux/logger/_logger.py +480 -0
- sibi_flux/mcp/__init__.py +26 -0
- sibi_flux/mcp/client.py +150 -0
- sibi_flux/mcp/router.py +126 -0
- sibi_flux/orchestration/__init__.py +9 -0
- sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
- sibi_flux/orchestration/_pipeline_executor.py +212 -0
- sibi_flux/osmnx_helper/__init__.py +22 -0
- sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
- sibi_flux/osmnx_helper/graph_loader.py +225 -0
- sibi_flux/osmnx_helper/utils.py +100 -0
- sibi_flux/pipelines/__init__.py +3 -0
- sibi_flux/pipelines/base.py +218 -0
- sibi_flux/py.typed +0 -0
- sibi_flux/readers/__init__.py +3 -0
- sibi_flux/readers/base.py +82 -0
- sibi_flux/readers/parquet.py +106 -0
- sibi_flux/utils/__init__.py +53 -0
- sibi_flux/utils/boilerplate/__init__.py +19 -0
- sibi_flux/utils/boilerplate/base_attacher.py +45 -0
- sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
- sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
- sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
- sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
- sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
- sibi_flux/utils/common.py +7 -0
- sibi_flux/utils/credentials/__init__.py +3 -0
- sibi_flux/utils/credentials/_config_manager.py +155 -0
- sibi_flux/utils/dask_utils.py +14 -0
- sibi_flux/utils/data_utils/__init__.py +3 -0
- sibi_flux/utils/data_utils/_data_utils.py +389 -0
- sibi_flux/utils/dataframe_utils.py +52 -0
- sibi_flux/utils/date_utils/__init__.py +10 -0
- sibi_flux/utils/date_utils/_business_days.py +220 -0
- sibi_flux/utils/date_utils/_date_utils.py +311 -0
- sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
- sibi_flux/utils/file_utils.py +48 -0
- sibi_flux/utils/filepath_generator/__init__.py +5 -0
- sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
- sibi_flux/utils/parquet_saver/__init__.py +6 -0
- sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
- sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
- sibi_flux/utils/retry.py +46 -0
- sibi_flux/utils/storage/__init__.py +7 -0
- sibi_flux/utils/storage/_fs_registry.py +112 -0
- sibi_flux/utils/storage/_storage_manager.py +257 -0
- sibi_flux/utils/storage/factory.py +33 -0
- sibi_flux-2025.12.0.dist-info/METADATA +283 -0
- sibi_flux-2025.12.0.dist-info/RECORD +110 -0
- sibi_flux-2025.12.0.dist-info/WHEEL +4 -0

sibi_flux/datacube/_data_cube.py
@@ -0,0 +1,332 @@
from __future__ import annotations

import dask.dataframe as dd
import pandas as pd
import sqlalchemy.types as sa_types
from typing import Any, ClassVar, List, Literal, Optional
from collections.abc import Mapping, MutableMapping
from pydantic import BaseModel, Field, ConfigDict, SecretStr

from sibi_flux.df_helper import DfHelper
from sibi_flux.utils import DataUtils
from sibi_flux.dask_cluster import dask_is_empty
from sibi_flux.df_validator import DfValidator

# Define types for clarity
DataFrameType = dd.DataFrame | pd.DataFrame


class DatacubeConfig(BaseModel):
    """
    Pydantic model for strict validation of Datacube configuration.
    Allows extra fields to pass through to DfHelper/Backend.
    """

    model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)

    table: Optional[str] = None
    backend: str = "sqlalchemy"
    connection_url: Optional[str | SecretStr] = None
    field_map: Mapping[str, str] = Field(default_factory=dict)
    legacy_filters: bool = True
    debug: bool = False

    # Validation Config
    validation_schema: Mapping[str, str] = Field(default_factory=dict)
    enforce_schema: bool = False

    # Allow optional logger instance
    logger: Optional[Any] = None


class Datacube(DfHelper):
    """
    Enhanced Datacube that simplifies subclassing via declarative configuration
    and automated common transformations.

    Consolidates functionality from the legacy BaseDatacube.

    Attributes:
        table (str): Table name for database connection.
        backend (str): Backend type (e.g., 'sqlalchemy').
        connection_url (str): Database connection URL.
        field_map (Mapping): Mapping of field names.
        legacy_filters (bool): Use legacy filters.
        live (bool): Live data flag.

        boolean_columns (List[str]): Columns to convert to boolean.
        numeric_float_columns (List[str]): Columns to convert to float.
        numeric_int_columns (List[str]): Columns to convert to int.
        date_fields (List[str]): Columns to convert to datetime.
    """

    # Declarative Config Attributes
    table: str = ""
    backend: Literal["sqlalchemy", "parquet", "http"] = "sqlalchemy"
    connection_url: str = ""
    field_map: Mapping[str, str] = {}
    legacy_filters: bool = True
    pii_columns: List[str] = []

    # Validation
    validation_schema: ClassVar[Mapping[str, str]] = {}
    enforce_schema: ClassVar[bool] = False

    # Declarative Transformation Lists
    boolean_columns: List[str] = []
    numeric_float_columns: List[str] = []
    numeric_int_columns: List[str] = []
    date_fields: List[str] = []

    # Config container compatible with DfHelper
    config: MutableMapping[str, Any] = {}

    def __init__(self, **kwargs):
        # State container
        self.df: Optional[DataFrameType] = None
        self._schema_inferred = False

        # Build config dictionary from class attributes if not already present
        if not self.config:
            self.config = {
                "table": self.table,
                "backend": self.backend,
                "connection_url": self.connection_url,
                "field_map": self.field_map,
                "legacy_filters": self.legacy_filters,
                "validation_schema": self.validation_schema,
                "enforce_schema": self.enforce_schema,
            }

        # Merge kwargs into the preliminary config
        raw_config = {**self.config, **kwargs}

        # Validate configuration using Pydantic
        # This ensures types are correct and provides clear checks
        validated_model = DatacubeConfig.model_validate(raw_config)

        # Update self.config with the validated, normalized dictionary
        self.config = validated_model.model_dump(exclude_unset=True)

        # Copy class-level lists to instance to avoid mutation of shared class state
        # and to allow instance-specific inference
        self.boolean_columns = list(self.boolean_columns)
        self.numeric_float_columns = list(self.numeric_float_columns)
        self.numeric_int_columns = list(self.numeric_int_columns)
        self.date_fields = list(self.date_fields)

        # Initialize DfHelper with validated config
        # We model_dump() allowing extra fields so that backend-specific params
        # (like pool_size) are preserved and passed to DfHelper.
        super().__init__(**self.config)

        # Store dask_client if provided, for resilience checks and external access
        self.dask_client = self.config.get("dask_client")

    def _infer_schema(self):
        """
        Attempts to infer column types from the SQLAlchemy model reflection.
        Populates the transformation lists if they are missing metadata.
        """
        if self.config.get("backend") != "sqlalchemy":
            return

        try:
            # This triggers reflection matching the table name
            conn = self.get_sql_connection()
            model = conn.model
            if not model:
                return

            field_map = self.config.get("field_map", {})

            for col in model.__table__.columns:
                # Map DB column name to DataFrame column name (if renamed via field_map)
                df_col_name = field_map.get(col.name, col.name)

                # Check types and append if not already present
                if isinstance(col.type, sa_types.Boolean):
                    if df_col_name not in self.boolean_columns:
                        self.boolean_columns.append(df_col_name)

                elif isinstance(
                    col.type,
                    (sa_types.Integer, sa_types.BigInteger, sa_types.SmallInteger),
                ):
                    if (
                        df_col_name not in self.numeric_int_columns
                        and df_col_name not in self.boolean_columns
                        and df_col_name not in self.numeric_float_columns
                    ):
                        self.numeric_int_columns.append(df_col_name)

                elif isinstance(col.type, (sa_types.Float, sa_types.Numeric)):
                    # Numeric/Decimal often treated as float in analysis unless strict
                    if df_col_name not in self.numeric_float_columns:
                        self.numeric_float_columns.append(df_col_name)

                elif isinstance(
                    col.type, (sa_types.Date, sa_types.DateTime, sa_types.TIMESTAMP)
                ):
                    if df_col_name not in self.date_fields:
                        self.date_fields.append(df_col_name)

        except Exception as e:
            # Inference is an enhancement; failure shouldn't block app start
            # but we log it for debugging.
            self.logger.debug(f"Schema inference skipped: {e}")

    def _validate(self, df: DataFrameType) -> DataFrameType:
        """
        Runs DfValidator if a schema is configured.
        """
        schema = self.config.get("validation_schema")
        if not schema:
            return df

        validator = DfValidator(
            df, logger=self.logger, debug=self.config.get("debug", False)
        )

        # 1. Standardize (always good practice if validating)
        # Note: We skip full standardization for now to avoid side effects on legacy names,
        # but we could adhere to stricter contracts here.

        if self.config.get("enforce_schema", False):
            validator.apply_schema_map(schema)
        else:
            try:
                validator.validate_schema(schema)
            except TypeError as e:
                # Re-raise to block pipeline
                self.logger.error(f"Schema Validation Failed: {e}")
                raise e

        return validator.get_df()

    def get_ddl(self, table_name: Optional[str] = None) -> str:
        """
        Generates ClickHouse DDL for the current cube.
        Requires that self.df is loaded (or at least inferred).
        """
        if self.df is None:
            raise RuntimeError("Cannot generate DDL: Datacube has not been loaded.")

        name = table_name or self.config.get("table") or "datacube_table"
        validator = DfValidator(self.df, logger=self.logger)
        return validator.generate_clickhouse_ddl(name)

    # -----------------------------------------------------------------------
    # Lifecycle (Consolidated from BaseDatacube)
    # -----------------------------------------------------------------------
    def load(self, **kwargs) -> DataFrameType:
        """
        Synchronous load pipeline.
        """
        # 1. Load Raw Data (Delegate to DfHelper)
        df = super().load(**kwargs)

        # 2. Check Emptiness (Resilient)
        if not self._is_empty(df):
            # 3. Apply Transform Hook
            df = self.fix_data(df, **kwargs)
            # 4. Validate
            df = self._validate(df)
        else:
            self.logger.debug(f"No data loaded by {self.__class__.__name__}")

        # 5. Update State & Return
        self.df = df
        return df

    async def aload(self, **kwargs) -> DataFrameType:
        """
        Asynchronous load pipeline.
        """
        # 1. Load Raw Data
        df = await super().aload(**kwargs)

        # 2. Check Emptiness (Async Safe offload)
        import asyncio

        is_empty = await asyncio.to_thread(self._is_empty, df)

        if not is_empty:
            # 3. Apply Async Transform Hook
            df = await self.afix_data(df, **kwargs)
            # 4. Validate (CPU bound)
            df = await asyncio.to_thread(self._validate, df)
        else:
            self.logger.debug(f"No data loaded by {self.__class__.__name__}")

        # 5. Update State & Return
        self.df = df
        return df

    # -----------------------------------------------------------------------
    # Transformation Hooks
    # -----------------------------------------------------------------------
    def fix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
        """
        Applies declarative transformations automatically.
        Infers schema types on first run if using SQLAlchemy backend.
        Subclasses can override this to add custom logic, calling super().fix_data(df) first.
        """
        if not self._schema_inferred:
            self._infer_schema()
            self._schema_inferred = True

        utils = DataUtils(logger=self.logger)

        if self.pii_columns:
            df = utils.transform_pii_columns(df, self.pii_columns)

        if self.boolean_columns:
            df = utils.transform_boolean_columns(df, self.boolean_columns)

        if self.numeric_float_columns:
            df = utils.transform_numeric_columns(
                df, columns=self.numeric_float_columns, dtype=float
            )

        if self.numeric_int_columns:
            df = utils.transform_numeric_columns(
                df, columns=self.numeric_int_columns, dtype=int
            )

        if self.date_fields:
            if isinstance(df, dd.DataFrame):
                df = utils.convert_to_datetime_dask(df, self.date_fields)
            else:
                for col in self.date_fields:
                    if col in df.columns:
                        df[col] = pd.to_datetime(
                            df[col], errors="coerce", utc=True, format="mixed"
                        )

        return df

    async def afix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
        """
        Asynchronous transformation hook.
        Offloads synchronous fix_data (and schema inference) to a thread.
        """
        import asyncio

        return await asyncio.to_thread(self.fix_data, df, **kwargs)

    # -----------------------------------------------------------------------
    # Internals
    # -----------------------------------------------------------------------
    def _is_empty(self, df: Optional[DataFrameType]) -> bool:
        """
        Robust emptiness check using dask_cluster.
        """
        if df is None:
            return True

        if isinstance(df, pd.DataFrame):
            return df.empty

        # Use resilient checker
        client = getattr(self, "dask_client", None)
        return dask_is_empty(df, dask_client=client, logger=self.logger)
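
The Datacube above is driven entirely by declarative class attributes, so a short sketch of how a subclass might look can help when reviewing this file. Everything in the sketch (table name, connection URL, column names) is hypothetical and not part of the package; it only assumes that sibi_flux.datacube re-exports Datacube, as the small __init__.py in the file list suggests.

# Illustrative subclass of the Datacube shown above; all names are invented.
from sibi_flux.datacube import Datacube  # re-export assumed; otherwise import from sibi_flux.datacube._data_cube

class OrdersCube(Datacube):
    table = "orders"
    connection_url = "postgresql+psycopg2://user:password@db-host/warehouse"
    field_map = {"fecha_creacion": "created_at"}
    boolean_columns = ["is_active"]
    date_fields = ["created_at"]

cube = OrdersCube(debug=True)
df = cube.load()                    # DfHelper loads, then fix_data() and _validate() run automatically
ddl = cube.get_ddl("orders_cube")   # ClickHouse DDL once data has been loaded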

sibi_flux/datacube/config_engine.py
@@ -0,0 +1,152 @@
import re
import yaml
from pathlib import Path
from typing import Literal, Optional, Any
from collections.abc import Mapping, Sequence
from pydantic import BaseModel

# --- Schema Definitions ---


class GlobalSettings(BaseModel):
    cubes_root_path: str
    fields_module_root: str
    default_db_conn: str
    env_file: Optional[str] = ".env.linux"
    exclusions: list[str] = []


class DiscoveryRule(BaseModel):
    pattern: str
    match_type: Literal["prefix", "exact", "regex"] = "prefix"
    domain: str
    output_template: str

    # Optional: Override the DB connection for specific tables
    db_conn_override: Optional[str] = None


class GeneratorConfig(BaseModel):
    version: float
    settings: GlobalSettings
    discovery_rules: list[DiscoveryRule]


# --- Logic Engine ---


class ConfigurationEngine:
    def __init__(self, config_path: str = "generator_config.yaml"):
        # Resolve config path relative to this file if not absolute
        path = Path(config_path)
        if not path.is_absolute():
            path = Path(__file__).parent / config_path

        self.config_path = path
        self.config = self._load_config()

    def _load_config(self) -> GeneratorConfig:
        if not self.config_path.exists():
            raise FileNotFoundError(f"Config file not found: {self.config_path}")

        with open(self.config_path, "r") as f:
            raw_data = yaml.safe_load(f)
        return GeneratorConfig(**raw_data)

    def resolve_table(self, table_name: str) -> Optional[dict[str, Any]]:
        """
        Iterates through rules to find a match.
        Returns a dict with resolved paths and metadata.
        """
        # 1. Check Global Exclusions
        for pattern in self.config.settings.exclusions:
            # We assume exclusions are regex patterns for flexibility
            if re.search(pattern, table_name):
                return None

        # 2. Check Rules
        for rule in self.config.discovery_rules:
            match_data = self._check_match(rule, table_name)
            if match_data is not None:
                return self._build_result(rule, table_name, match_data)

        return None  # No match found

    def _check_match(
        self, rule: DiscoveryRule, table_name: str
    ) -> Optional[dict[str, str]]:
        """
        Checks if rule matches table_name.
        Returns capturing groups dict if match (empty dict for non-regex matches), None otherwise.
        """
        if rule.match_type == "exact":
            if table_name == rule.pattern:
                return {}

        elif rule.match_type == "prefix":
            if table_name.startswith(rule.pattern):
                return {}

        elif rule.match_type == "regex":
            m = re.search(rule.pattern, table_name)
            if m:
                # Combine numbered groups (as strings) and named groups
                groups = {str(i + 1): g for i, g in enumerate(m.groups())}
                groups.update(m.groupdict())
                return groups

        return None

    def _build_result(
        self, rule: DiscoveryRule, table_name: str, groups: dict[str, str]
    ) -> dict[str, Any]:
        """Constructs the final paths based on global settings + rule + captured groups"""

        # 1. Resolve Output Subpath with Template Substitution
        # Supports {feature}, {1}, etc.
        try:
            subpath = rule.output_template.format(**groups)
        except KeyError:
            # Fallback for safe substitution if keys missing? Or let it raise?
            # Let's try to be safe but warn? For now, strict formatting.
            subpath = rule.output_template

        # 2. Resolve Physical Path
        root = Path(self.config.settings.cubes_root_path)
        full_output_path = str(root / subpath)

        # 3. Resolve Field Map Path (Smart Guessing)
        # e.g., solutions.conf.transforms.fields.logistics.asm_tracking_X.field_map
        field_map_module = (
            f"{self.config.settings.fields_module_root}."
            f"{rule.domain}.{table_name}.field_map"
        )

        return {
            "domain": rule.domain,
            "path": full_output_path,  # 'path' key is expected by generator
            "save_to_path": full_output_path,
            "field_map": field_map_module,  # 'field_map' template logic relies on discovery result often
            "connection_obj": rule.db_conn_override
            or self.config.settings.default_db_conn,
        }


# --- Module Level Interface ---

_ENGINE = None


def get_engine() -> ConfigurationEngine:
    global _ENGINE
    if _ENGINE is None:
        _ENGINE = ConfigurationEngine()
    return _ENGINE


def match_rule(table_name: str) -> Optional[dict[str, str]]:
    """
    Finds the first matching rule for a table name.
    """
    engine = get_engine()
    return engine.resolve_table(table_name)
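
The Pydantic models above define the expected shape of generator_config.yaml, and resolve_table applies exclusions first, then the discovery rules in order. A hedged sketch follows; every value in the sample config (paths, domain, regex) is invented for illustration, and only the module import path is taken from the file list above.

# Illustrative only: config values are hypothetical.
import tempfile
from pathlib import Path
from sibi_flux.datacube.config_engine import ConfigurationEngine

SAMPLE_YAML = """\
version: 1.0
settings:
  cubes_root_path: /data/cubes
  fields_module_root: solutions.conf.transforms.fields
  default_db_conn: default_db
  exclusions:
    - "^tmp_"
discovery_rules:
  - pattern: "^logistics_(?P<feature>.+)$"
    match_type: regex
    domain: logistics
    output_template: "logistics/{feature}"
"""

config_file = Path(tempfile.mkdtemp()) / "generator_config.yaml"
config_file.write_text(SAMPLE_YAML)

engine = ConfigurationEngine(str(config_file))
print(engine.resolve_table("tmp_scratch"))             # None: filtered out by the "^tmp_" exclusion
print(engine.resolve_table("logistics_asm_tracking"))  # dict with domain, output path, field_map module, connection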

sibi_flux/datacube/field_factory.py
@@ -0,0 +1,48 @@
from typing import List, MutableMapping, Optional
from pathlib import Path
from .field_registry import GlobalFieldRegistry

# Singleton instance to avoid reloading YAML multiple times
_SHARED_REGISTRY: Optional[GlobalFieldRegistry] = None


class FieldMapFactory:
    """
    Factory for creating field translation maps.
    Uses a shared GlobalFieldRegistry instance to translate field names.
    """

    @classmethod
    def create(cls, table_name: str, columns: List[str]) -> MutableMapping[str, str]:
        """
        Creates a field map dictionary for the given table and columns.

        Args:
            table_name: The name of the database table (for context/logging if needed).
            columns: List of column names to translate.

        Returns:
            A MutableMapping (dict) where key=column_name, value=translated_name.
        """
        registry = cls._get_registry()

        mapping = {}
        for col in columns:
            # We use the registry solely for lookups.
            # If the key isn't there, we default to identity.
            if col in registry:
                mapping[col] = registry[col]
            else:
                mapping[col] = col

        return mapping

    @classmethod
    def _get_registry(cls) -> GlobalFieldRegistry:
        global _SHARED_REGISTRY
        if _SHARED_REGISTRY is None:
            # Assume standard location relative to project root
            registry_path = Path("solutions/conf/global_field_translations.yaml")
            _SHARED_REGISTRY = GlobalFieldRegistry(registry_path)

        return _SHARED_REGISTRY
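
FieldMapFactory.create falls back to identity for any column the shared registry does not know. A brief hedged example (column names invented) illustrates that behaviour; it assumes the hard-coded registry path above is resolved relative to the working directory, and if that file is absent the registry simply starts empty, so every column maps to itself.

# Hypothetical column names; behaviour follows the factory code above.
from sibi_flux.datacube.field_factory import FieldMapFactory

field_map = FieldMapFactory.create(
    table_name="asm_tracking",
    columns=["fecha_creacion", "estado", "order_id"],
)
# Known columns come back translated from the shared registry;
# unknown ones map to themselves, e.g. {"order_id": "order_id", ...}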

sibi_flux/datacube/field_registry.py
@@ -0,0 +1,122 @@
import yaml
import threading
from pathlib import Path
from typing import Dict, Iterator, Optional, Union
from collections.abc import MutableMapping


class GlobalFieldRegistry(MutableMapping):
    """
    Acts as the source of truth for field translations (Database Column -> English Attribute).
    Backed by a local YAML file. Thread-safe.
    """

    def __init__(self, file_path: Union[str, Path]):
        """
        Initialize the registry.

        Args:
            file_path: Path to the YAML file backing this registry.
        """
        self.file_path = Path(file_path).resolve()
        self._data: Dict[str, str] = {}
        self._dirty = False
        self._lock = threading.RLock()
        self._load()

    def _load(self) -> None:
        """Load state from the YAML file."""
        with self._lock:
            if self.file_path.exists():
                try:
                    with open(self.file_path, "r", encoding="utf-8") as f:
                        content = yaml.safe_load(f)
                        if content and isinstance(content, dict):
                            self._data = content
                        else:
                            self._data = {}
                except Exception:
                    self._data = {}
            else:
                self._data = {}

    def save(self) -> None:
        """Persist changes to the YAML file if any changes were made."""
        with self._lock:
            if not self._dirty:
                return

            # Ensure directory exists
            self.file_path.parent.mkdir(parents=True, exist_ok=True)

            with open(self.file_path, "w", encoding="utf-8") as f:
                for key, value in sorted(self._data.items()):
                    # Dump single line
                    # Note: yaml.dump adds a newline at the end, so we strip it
                    line = yaml.dump(
                        {key: value},
                        sort_keys=False,
                        allow_unicode=True,
                        default_flow_style=False,
                    ).strip()

                    # Append comment if translation matches original (and wasn't manually set to something else)
                    # We rely on the convention that non-translated fields equal their keys.
                    if key == value:
                        line += " # Translation missing"

                    f.write(line + "\n")

            self._dirty = False

    def register_field(
        self,
        original_name: str,
        suggested_translation: Optional[str] = None,
        force_update: bool = False,
    ) -> None:
        """
        Register a field if it does not exist, or update if force_update is True.

        Args:
            original_name: The original database column name.
            suggested_translation: Proposed English translation.
            force_update: If True, overwrite existing translation.
        """
        with self._lock:
            if original_name in self._data:
                if force_update and suggested_translation:
                    if self._data[original_name] != suggested_translation:
                        self._data[original_name] = suggested_translation
                        self._dirty = True
            else:
                # New field - use suggestion or default to identity
                val = suggested_translation if suggested_translation else original_name
                self._data[original_name] = val
                self._dirty = True

    # --- MutableMapping Implementation ---

    def __getitem__(self, key: str) -> str:
        with self._lock:
            return self._data[key]

    def __setitem__(self, key: str, value: str) -> None:
        with self._lock:
            if self._data.get(key) != value:
                self._data[key] = value
                self._dirty = True

    def __delitem__(self, key: str) -> None:
        with self._lock:
            if key in self._data:
                del self._data[key]
                self._dirty = True

    def __iter__(self) -> Iterator[str]:
        with self._lock:
            return iter(list(self._data))

    def __len__(self) -> int:
        with self._lock:
            return len(self._data)