sibi-flux 2025.12.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/datacube/_data_cube.py
@@ -0,0 +1,332 @@
+from __future__ import annotations
+import dask.dataframe as dd
+import pandas as pd
+import sqlalchemy.types as sa_types
+from typing import Any, ClassVar, List, Literal, Optional
+from collections.abc import Mapping, MutableMapping
+from pydantic import BaseModel, Field, ConfigDict, SecretStr
+
+from sibi_flux.df_helper import DfHelper
+from sibi_flux.utils import DataUtils
+from sibi_flux.dask_cluster import dask_is_empty
+from sibi_flux.df_validator import DfValidator
+
+# Define types for clarity
+DataFrameType = dd.DataFrame | pd.DataFrame
+
+
+class DatacubeConfig(BaseModel):
+    """
+    Pydantic model for strict validation of Datacube configuration.
+    Allows extra fields to pass through to DfHelper/Backend.
+    """
+
+    model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
+
+    table: Optional[str] = None
+    backend: str = "sqlalchemy"
+    connection_url: Optional[str | SecretStr] = None
+    field_map: Mapping[str, str] = Field(default_factory=dict)
+    legacy_filters: bool = True
+    debug: bool = False
+
+    # Validation Config
+    validation_schema: Mapping[str, str] = Field(default_factory=dict)
+    enforce_schema: bool = False
+
+    # Allow optional logger instance
+    logger: Optional[Any] = None
+
+
+class Datacube(DfHelper):
+    """
+    Enhanced Datacube that simplifies subclassing via declarative configuration
+    and automated common transformations.
+
+    Consolidates functionality from the legacy BaseDatacube.
+
+    Attributes:
+        table (str): Table name for database connection.
+        backend (str): Backend type (e.g., 'sqlalchemy').
+        connection_url (str): Database connection URL.
+        field_map (Mapping): Mapping of field names.
+        legacy_filters (bool): Use legacy filters.
+        live (bool): Live data flag.
+
+        boolean_columns (List[str]): Columns to convert to boolean.
+        numeric_float_columns (List[str]): Columns to convert to float.
+        numeric_int_columns (List[str]): Columns to convert to int.
+        date_fields (List[str]): Columns to convert to datetime.
+    """
+
+    # Declarative Config Attributes
+    table: str = ""
+    backend: Literal["sqlalchemy", "parquet", "http"] = "sqlalchemy"
+    connection_url: str = ""
+    field_map: Mapping[str, str] = {}
+    legacy_filters: bool = True
+    pii_columns: List[str] = []
+
+    # Validation
+    validation_schema: ClassVar[Mapping[str, str]] = {}
+    enforce_schema: ClassVar[bool] = False
+
+    # Declarative Transformation Lists
+    boolean_columns: List[str] = []
+    numeric_float_columns: List[str] = []
+    numeric_int_columns: List[str] = []
+    date_fields: List[str] = []
+
+    # Config container compatible with DfHelper
+    config: MutableMapping[str, Any] = {}
+
+    def __init__(self, **kwargs):
+        # State container
+        self.df: Optional[DataFrameType] = None
+        self._schema_inferred = False
+
+        # Build config dictionary from class attributes if not already present
+        if not self.config:
+            self.config = {
+                "table": self.table,
+                "backend": self.backend,
+                "connection_url": self.connection_url,
+                "field_map": self.field_map,
+                "legacy_filters": self.legacy_filters,
+                "validation_schema": self.validation_schema,
+                "enforce_schema": self.enforce_schema,
+            }
+
+        # Merge kwargs into the preliminary config
+        raw_config = {**self.config, **kwargs}
+
+        # Validate configuration using Pydantic
+        # This ensures types are correct and provides clear checks
+        validated_model = DatacubeConfig.model_validate(raw_config)
+
+        # Update self.config with the validated, normalized dictionary
+        self.config = validated_model.model_dump(exclude_unset=True)
+
+        # Copy class-level lists to instance to avoid mutation of shared class state
+        # and to allow instance-specific inference
+        self.boolean_columns = list(self.boolean_columns)
+        self.numeric_float_columns = list(self.numeric_float_columns)
+        self.numeric_int_columns = list(self.numeric_int_columns)
+        self.date_fields = list(self.date_fields)
+
+        # Initialize DfHelper with validated config
+        # We model_dump() allowing extra fields so that backend-specific params
+        # (like pool_size) are preserved and passed to DfHelper.
+        super().__init__(**self.config)
+
+        # Store dask_client if provided, for resilience checks and external access
+        self.dask_client = self.config.get("dask_client")
+
+    def _infer_schema(self):
+        """
+        Attempts to infer column types from the SQLAlchemy model reflection.
+        Populates the transformation lists if they are missing metadata.
+        """
+        if self.config.get("backend") != "sqlalchemy":
+            return
+
+        try:
+            # This triggers reflection matching the table name
+            conn = self.get_sql_connection()
+            model = conn.model
+            if not model:
+                return
+
+            field_map = self.config.get("field_map", {})
+
+            for col in model.__table__.columns:
+                # Map DB column name to DataFrame column name (if renamed via field_map)
+                df_col_name = field_map.get(col.name, col.name)
+
+                # Check types and append if not already present
+                if isinstance(col.type, sa_types.Boolean):
+                    if df_col_name not in self.boolean_columns:
+                        self.boolean_columns.append(df_col_name)
+
+                elif isinstance(
+                    col.type,
+                    (sa_types.Integer, sa_types.BigInteger, sa_types.SmallInteger),
+                ):
+                    if (
+                        df_col_name not in self.numeric_int_columns
+                        and df_col_name not in self.boolean_columns
+                        and df_col_name not in self.numeric_float_columns
+                    ):
+                        self.numeric_int_columns.append(df_col_name)
+
+                elif isinstance(col.type, (sa_types.Float, sa_types.Numeric)):
+                    # Numeric/Decimal often treated as float in analysis unless strict
+                    if df_col_name not in self.numeric_float_columns:
+                        self.numeric_float_columns.append(df_col_name)
+
+                elif isinstance(
+                    col.type, (sa_types.Date, sa_types.DateTime, sa_types.TIMESTAMP)
+                ):
+                    if df_col_name not in self.date_fields:
+                        self.date_fields.append(df_col_name)
+
+        except Exception as e:
+            # Inference is an enhancement; failure shouldn't block app start
+            # but we log it for debugging.
+            self.logger.debug(f"Schema inference skipped: {e}")
+
+    def _validate(self, df: DataFrameType) -> DataFrameType:
+        """
+        Runs DfValidator if a schema is configured.
+        """
+        schema = self.config.get("validation_schema")
+        if not schema:
+            return df
+
+        validator = DfValidator(
+            df, logger=self.logger, debug=self.config.get("debug", False)
+        )
+
+        # 1. Standardize (always good practice if validating)
+        # Note: We skip full standardization for now to avoid side effects on legacy names,
+        # but we could adhere to stricter contracts here.
+
+        if self.config.get("enforce_schema", False):
+            validator.apply_schema_map(schema)
+        else:
+            try:
+                validator.validate_schema(schema)
+            except TypeError as e:
+                # Re-raise to block pipeline
+                self.logger.error(f"Schema Validation Failed: {e}")
+                raise e
+
+        return validator.get_df()
+
+    def get_ddl(self, table_name: Optional[str] = None) -> str:
+        """
+        Generates ClickHouse DDL for the current cube.
+        Requires that self.df is loaded (or at least inferred).
+        """
+        if self.df is None:
+            raise RuntimeError("Cannot generate DDL: Datacube has not been loaded.")
+
+        name = table_name or self.config.get("table") or "datacube_table"
+        validator = DfValidator(self.df, logger=self.logger)
+        return validator.generate_clickhouse_ddl(name)
+
+    # -----------------------------------------------------------------------
+    # Lifecycle (Consolidated from BaseDatacube)
+    # -----------------------------------------------------------------------
+    def load(self, **kwargs) -> DataFrameType:
+        """
+        Synchronous load pipeline.
+        """
+        # 1. Load Raw Data (Delegate to DfHelper)
+        df = super().load(**kwargs)
+
+        # 2. Check Emptiness (Resilient)
+        if not self._is_empty(df):
+            # 3. Apply Transform Hook
+            df = self.fix_data(df, **kwargs)
+            # 4. Validate
+            df = self._validate(df)
+        else:
+            self.logger.debug(f"No data loaded by {self.__class__.__name__}")
+
+        # 4. Update State & Return
+        self.df = df
+        return df
+
+    async def aload(self, **kwargs) -> DataFrameType:
+        """
+        Asynchronous load pipeline.
+        """
+        # 1. Load Raw Data
+        df = await super().aload(**kwargs)
+
+        # 2. Check Emptiness (Async Safe offload)
+        import asyncio
+
+        is_empty = await asyncio.to_thread(self._is_empty, df)
+
+        if not is_empty:
+            # 3. Apply Async Transform Hook
+            df = await self.afix_data(df, **kwargs)
+            # 4. Validate (CPU bound)
+            df = await asyncio.to_thread(self._validate, df)
+        else:
+            self.logger.debug(f"No data loaded by {self.__class__.__name__}")
+
+        # 4. Update State & Return
+        self.df = df
+        return df
+
+    # -----------------------------------------------------------------------
+    # Transformation Hooks
+    # -----------------------------------------------------------------------
+    def fix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
+        """
+        Applies declarative transformations automatically.
+        Infers schema types on first run if using SQLAlchemy backend.
+        Subclasses can override this to add custom logic, calling super().fix_data(df) first.
+        """
+        if not self._schema_inferred:
+            self._infer_schema()
+            self._schema_inferred = True
+
+        utils = DataUtils(logger=self.logger)
+
+        if self.pii_columns:
+            df = utils.transform_pii_columns(df, self.pii_columns)
+
+        if self.boolean_columns:
+            df = utils.transform_boolean_columns(df, self.boolean_columns)
+
+        if self.numeric_float_columns:
+            df = utils.transform_numeric_columns(
+                df, columns=self.numeric_float_columns, dtype=float
+            )
+
+        if self.numeric_int_columns:
+            df = utils.transform_numeric_columns(
+                df, columns=self.numeric_int_columns, dtype=int
+            )
+
+        if self.date_fields:
+            if isinstance(df, dd.DataFrame):
+                df = utils.convert_to_datetime_dask(df, self.date_fields)
+            else:
+                for col in self.date_fields:
+                    if col in df.columns:
+                        df[col] = pd.to_datetime(
+                            df[col], errors="coerce", utc=True, format="mixed"
+                        )
+
+        return df
+
+    async def afix_data(self, df: DataFrameType, **kwargs) -> DataFrameType:
+        """
+        Asynchronous transformation hook.
+        Offloads synchronous fix_data (and schema inference) to a thread.
+        """
+        import asyncio
+
+        return await asyncio.to_thread(self.fix_data, df, **kwargs)
+
+    # -----------------------------------------------------------------------
+    # Internals
+    # -----------------------------------------------------------------------
+    def _is_empty(self, df: Optional[DataFrameType]) -> bool:
+        """
+        Robust emptiness check using dask_cluster.
+        """
+        if df is None:
+            return True
+
+        if isinstance(df, pd.DataFrame):
+            return df.empty
+
+        # Use resilient checker
+        client = getattr(self, "dask_client", None)
+        return dask_is_empty(df, dask_client=client, logger=self.logger)
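For orientation, a minimal usage sketch of the declarative API above. It assumes Datacube is re-exported from sibi_flux.datacube; the table, connection URL, and column names are invented for illustration, and any keyword arguments beyond those declared in DatacubeConfig simply pass through as extras.

from sibi_flux.datacube import Datacube  # assumed re-export; the class lives in _data_cube.py


class OrdersCube(Datacube):
    # Hypothetical table and connection; field_map renames DB columns to DataFrame names.
    table = "orders"
    backend = "sqlalchemy"
    connection_url = "postgresql+psycopg2://user:pass@db:5432/erp"
    field_map = {"fecha_creacion": "created_at"}
    boolean_columns = ["is_active"]
    numeric_float_columns = ["total_amount"]
    date_fields = ["created_at"]


cube = OrdersCube(debug=True)        # extra kwargs flow through DatacubeConfig (extra="allow") to DfHelper
df = cube.load()                     # super().load() -> fix_data() -> _validate()
print(cube.get_ddl("orders_cube"))   # ClickHouse DDL derived from the loaded frame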
sibi_flux/datacube/config_engine.py
@@ -0,0 +1,152 @@
+import re
+import yaml
+from pathlib import Path
+from typing import Literal, Optional, Any
+from collections.abc import Mapping, Sequence
+from pydantic import BaseModel
+
+# --- Schema Definitions ---
+
+
+class GlobalSettings(BaseModel):
+    cubes_root_path: str
+    fields_module_root: str
+    default_db_conn: str
+    env_file: Optional[str] = ".env.linux"
+    exclusions: list[str] = []
+
+
+class DiscoveryRule(BaseModel):
+    pattern: str
+    match_type: Literal["prefix", "exact", "regex"] = "prefix"
+    domain: str
+    output_template: str
+
+    # Optional: Override the DB connection for specific tables
+    db_conn_override: Optional[str] = None
+
+
+class GeneratorConfig(BaseModel):
+    version: float
+    settings: GlobalSettings
+    discovery_rules: list[DiscoveryRule]
+
+
+# --- Logic Engine ---
+
+
+class ConfigurationEngine:
+    def __init__(self, config_path: str = "generator_config.yaml"):
+        # Resolve config path relative to this file if not absolute
+        path = Path(config_path)
+        if not path.is_absolute():
+            path = Path(__file__).parent / config_path
+
+        self.config_path = path
+        self.config = self._load_config()
+
+    def _load_config(self) -> GeneratorConfig:
+        if not self.config_path.exists():
+            raise FileNotFoundError(f"Config file not found: {self.config_path}")
+
+        with open(self.config_path, "r") as f:
+            raw_data = yaml.safe_load(f)
+        return GeneratorConfig(**raw_data)
+
+    def resolve_table(self, table_name: str) -> Optional[dict[str, Any]]:
+        """
+        Iterates through rules to find a match.
+        Returns a dict with resolved paths and metadata.
+        """
+        # 1. Check Global Exclusions
+        for pattern in self.config.settings.exclusions:
+            # We assume exclusions are regex patterns for flexibility
+            if re.search(pattern, table_name):
+                return None
+
+        # 2. Check Rules
+        for rule in self.config.discovery_rules:
+            match_data = self._check_match(rule, table_name)
+            if match_data is not None:
+                return self._build_result(rule, table_name, match_data)
+
+        return None  # No match found
+
+    def _check_match(
+        self, rule: DiscoveryRule, table_name: str
+    ) -> Optional[dict[str, str]]:
+        """
+        Checks if rule matches table_name.
+        Returns capturing groups dict if match (empty dict for non-regex matches), None otherwise.
+        """
+        if rule.match_type == "exact":
+            if table_name == rule.pattern:
+                return {}
+
+        elif rule.match_type == "prefix":
+            if table_name.startswith(rule.pattern):
+                return {}
+
+        elif rule.match_type == "regex":
+            m = re.search(rule.pattern, table_name)
+            if m:
+                # Combine numbered groups (as strings) and named groups
+                groups = {str(i + 1): g for i, g in enumerate(m.groups())}
+                groups.update(m.groupdict())
+                return groups
+
+        return None
+
+    def _build_result(
+        self, rule: DiscoveryRule, table_name: str, groups: dict[str, str]
+    ) -> dict[str, Any]:
+        """Constructs the final paths based on global settings + rule + captured groups"""
+
+        # 1. Resolve Output Subpath with Template Substitution
+        # Supports {feature}, {1}, etc.
+        try:
+            subpath = rule.output_template.format(**groups)
+        except KeyError:
+            # Fallback for safe substitution if keys missing? Or let it raise?
+            # Let's try to be safe but warn? For now, strict formatting.
+            subpath = rule.output_template
+
+        # 2. Resolve Physical Path
+        root = Path(self.config.settings.cubes_root_path)
+        full_output_path = str(root / subpath)
+
+        # 3. Resolve Field Map Path (Smart Guessing)
+        # e.g., solutions.conf.transforms.fields.logistics.asm_tracking_X.field_map
+        field_map_module = (
+            f"{self.config.settings.fields_module_root}."
+            f"{rule.domain}.{table_name}.field_map"
+        )
+
+        return {
+            "domain": rule.domain,
+            "path": full_output_path,  # 'path' key is expected by generator
+            "save_to_path": full_output_path,
+            "field_map": field_map_module,  # 'field_map' template logic relies on discovery result often
+            "connection_obj": rule.db_conn_override
+            or self.config.settings.default_db_conn,
+        }
+
+
+# --- Module Level Interface ---
+
+_ENGINE = None
+
+
+def get_engine() -> ConfigurationEngine:
+    global _ENGINE
+    if _ENGINE is None:
+        _ENGINE = ConfigurationEngine()
+    return _ENGINE
+
+
+def match_rule(table_name: str) -> Optional[dict[str, str]]:
+    """
+    Finds the first matching rule for a table name.
+    """
+    engine = get_engine()
+    return engine.resolve_table(table_name)
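A minimal sketch of how the rule engine resolves a table name, assuming a generator_config.yaml shaped like the Pydantic models above; the paths, domain, and rule patterns are hypothetical, and the import path is an assumption.

import tempfile
from pathlib import Path

from sibi_flux.datacube.config_engine import ConfigurationEngine  # assumed module path

# Hypothetical config matching GlobalSettings / DiscoveryRule / GeneratorConfig.
CONFIG_YAML = """
version: 1.0
settings:
  cubes_root_path: /data/cubes
  fields_module_root: solutions.conf.transforms.fields
  default_db_conn: erp_default
  exclusions:
    - ^tmp_
discovery_rules:
  - pattern: "^logistics_(?P<feature>.+)$"
    match_type: regex
    domain: logistics
    output_template: "logistics/{feature}"
"""

with tempfile.TemporaryDirectory() as tmp:
    cfg = Path(tmp) / "generator_config.yaml"
    cfg.write_text(CONFIG_YAML)
    engine = ConfigurationEngine(config_path=str(cfg))
    print(engine.resolve_table("logistics_asm_tracking"))
    # -> {'domain': 'logistics', 'path': '/data/cubes/logistics/asm_tracking', ...}
    print(engine.resolve_table("tmp_scratch"))  # excluded by ^tmp_ -> None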
sibi_flux/datacube/field_factory.py
@@ -0,0 +1,48 @@
+from typing import List, MutableMapping, Optional
+from pathlib import Path
+from .field_registry import GlobalFieldRegistry
+
+# Singleton instance to avoid reloading YAML multiple times
+_SHARED_REGISTRY: Optional[GlobalFieldRegistry] = None
+
+
+class FieldMapFactory:
+    """
+    Factory for creating field translation maps.
+    Uses a shared GlobalFieldRegistry instance to translate field names.
+    """
+
+    @classmethod
+    def create(cls, table_name: str, columns: List[str]) -> MutableMapping[str, str]:
+        """
+        Creates a field map dictionary for the given table and columns.
+
+        Args:
+            table_name: The name of the database table (for context/logging if needed).
+            columns: List of column names to translate.
+
+        Returns:
+            A MutableMapping (dict) where key=column_name, value=translated_name.
+        """
+        registry = cls._get_registry()
+
+        mapping = {}
+        for col in columns:
+            # We use the registry solely for lookups.
+            # If the key isn't there, we default to identity.
+            if col in registry:
+                mapping[col] = registry[col]
+            else:
+                mapping[col] = col
+
+        return mapping
+
+    @classmethod
+    def _get_registry(cls) -> GlobalFieldRegistry:
+        global _SHARED_REGISTRY
+        if _SHARED_REGISTRY is None:
+            # Assume standard location relative to project root
+            registry_path = Path("solutions/conf/global_field_translations.yaml")
+            _SHARED_REGISTRY = GlobalFieldRegistry(registry_path)
+
+        return _SHARED_REGISTRY
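A short usage sketch for the factory, assuming the registry YAML exists at the hard-coded solutions/conf location; the table name, column names, and translations shown are illustrative only, and the import path is an assumption.

from sibi_flux.datacube.field_factory import FieldMapFactory  # assumed module path

# Columns known to the shared registry come back translated; unknown columns map to themselves.
field_map = FieldMapFactory.create(
    table_name="asm_tracking",                       # hypothetical table
    columns=["numero_guia", "fecha_entrega", "id"],  # hypothetical columns
)
# e.g. {"numero_guia": "tracking_number", "fecha_entrega": "delivery_date", "id": "id"}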
sibi_flux/datacube/field_registry.py
@@ -0,0 +1,122 @@
+import yaml
+import threading
+from pathlib import Path
+from typing import Dict, Iterator, Optional, Union
+from collections.abc import MutableMapping
+
+
+class GlobalFieldRegistry(MutableMapping):
+    """
+    Acts as the source of truth for field translations (Database Column -> English Attribute).
+    Backed by a local YAML file. Thread-safe.
+    """
+
+    def __init__(self, file_path: Union[str, Path]):
+        """
+        Initialize the registry.
+
+        Args:
+            file_path: Path to the YAML file backing this registry.
+        """
+        self.file_path = Path(file_path).resolve()
+        self._data: Dict[str, str] = {}
+        self._dirty = False
+        self._lock = threading.RLock()
+        self._load()
+
+    def _load(self) -> None:
+        """Load state from the YAML file."""
+        with self._lock:
+            if self.file_path.exists():
+                try:
+                    with open(self.file_path, "r", encoding="utf-8") as f:
+                        content = yaml.safe_load(f)
+                    if content and isinstance(content, dict):
+                        self._data = content
+                    else:
+                        self._data = {}
+                except Exception:
+                    self._data = {}
+            else:
+                self._data = {}
+
+    def save(self) -> None:
+        """Persist changes to the YAML file if any changes were made."""
+        with self._lock:
+            if not self._dirty:
+                return
+
+            # Ensure directory exists
+            self.file_path.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(self.file_path, "w", encoding="utf-8") as f:
+                for key, value in sorted(self._data.items()):
+                    # Dump single line
+                    # Note: yaml.dump adds a newline at the end, so we strip it
+                    line = yaml.dump(
+                        {key: value},
+                        sort_keys=False,
+                        allow_unicode=True,
+                        default_flow_style=False,
+                    ).strip()
+
+                    # Append comment if translation matches original (and wasn't manually set to something else)
+                    # We rely on the convention that non-translated fields equal their keys.
+                    if key == value:
+                        line += " # Translation missing"
+
+                    f.write(line + "\n")
+
+            self._dirty = False
+
+    def register_field(
+        self,
+        original_name: str,
+        suggested_translation: Optional[str] = None,
+        force_update: bool = False,
+    ) -> None:
+        """
+        Register a field if it does not exist, or update if force_update is True.
+
+        Args:
+            original_name: The original database column name.
+            suggested_translation: Proposed English translation.
+            force_update: If True, overwrite existing translation.
+        """
+        with self._lock:
+            if original_name in self._data:
+                if force_update and suggested_translation:
+                    if self._data[original_name] != suggested_translation:
+                        self._data[original_name] = suggested_translation
+                        self._dirty = True
+            else:
+                # New field - use suggestion or default to identity
+                val = suggested_translation if suggested_translation else original_name
+                self._data[original_name] = val
+                self._dirty = True
+
+    # --- MutableMapping Implementation ---
+
+    def __getitem__(self, key: str) -> str:
+        with self._lock:
+            return self._data[key]
+
+    def __setitem__(self, key: str, value: str) -> None:
+        with self._lock:
+            if self._data.get(key) != value:
+                self._data[key] = value
+                self._dirty = True
+
+    def __delitem__(self, key: str) -> None:
+        with self._lock:
+            if key in self._data:
+                del self._data[key]
+                self._dirty = True
+
+    def __iter__(self) -> Iterator[str]:
+        with self._lock:
+            return iter(list(self._data))
+
+    def __len__(self) -> int:
+        with self._lock:
+            return len(self._data)
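A minimal sketch of the registry's dict-style API against a throwaway YAML file; the field names and translations are illustrative, and the import path is an assumption.

from pathlib import Path

from sibi_flux.datacube.field_registry import GlobalFieldRegistry  # assumed module path

registry = GlobalFieldRegistry(Path("/tmp/global_field_translations.yaml"))

# New fields default to identity; passing a suggestion records a real translation.
registry.register_field("numero_guia", "tracking_number")
registry.register_field("bodega")              # stored as "bodega", flagged "# Translation missing" on save

registry["fecha_entrega"] = "delivery_date"    # MutableMapping writes also mark the registry dirty
print(registry.get("numero_guia"))             # -> "tracking_number" (get() is inherited from MutableMapping)

registry.save()                                # writes sorted, one-entry-per-line YAML back to the file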