sibi_flux-2025.12.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/datacube/generator.py
@@ -0,0 +1,677 @@
+ """
+ DataCube Generator module.
+
+ Handles introspection of database schemas and generation of Python DataCube classes.
+ """
+
+ import sqlalchemy as sa
+ from pathlib import Path
+ from typing import Optional, Set, Any, Callable, Tuple
+ from collections.abc import Mapping, MutableMapping, Iterable
+ from sqlalchemy import inspect
+ import re
+ import importlib
+ import sys
+ import os
+
+
+ def get_sa_type_classes(
+     type_names: Iterable[str], logger: Optional[Any] = None
+ ) -> Tuple[Any, ...]:
+     """Converts an iterable of type-name strings to actual SQLAlchemy type classes."""
+     classes = []
+     for name in type_names:
+         type_cls = getattr(sa, name, None)
+         if type_cls:
+             classes.append(type_cls)
+         elif logger:
+             logger.warning(f"SQLAlchemy type '{name}' not found.")
+     return tuple(classes)
+
+
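For example, the labels that later appear in `type_mappings` resolve to concrete classes like so (a minimal sketch; the type names are illustrative):

```python
import sqlalchemy as sa

# Mirrors a hypothetical "type_mappings" block from datacube_registry.yaml.
numeric_types = get_sa_type_classes(["Integer", "Numeric", "Float"])
assert all(issubclass(t, sa.types.TypeEngine) for t in numeric_types)

# Unknown names are skipped (and logged when a logger is supplied).
assert len(get_sa_type_classes(["Integer", "NotARealType"])) == 1
```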
+ def is_secure_path(target_file: str, allowed_dirs: Iterable[str]) -> bool:
+     """Verifies that a path resolves inside one of the sandbox directories, preventing traversal."""
+     try:
+         target_path = Path(target_file).resolve()
+         for allowed in allowed_dirs:
+             if target_path.is_relative_to(Path(allowed).resolve()):
+                 return True
+     except (ValueError, RuntimeError, OSError):
+         return False
+     return False
+
+
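A quick sanity check of the traversal guard (paths are illustrative):

```python
# A file inside the sandbox passes; a ".." escape resolves outside it and fails.
assert is_secure_path("/srv/app/datacubes/sales.py", ["/srv/app/datacubes"])
assert not is_secure_path("/srv/app/datacubes/../secrets.env", ["/srv/app/datacubes"])
```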
+ def generate_class_code(
+     inspector: Optional[sa.engine.Inspector],
+     table_name: str,
+     config_obj: str,
+     base_class: str,
+     mappings: Mapping[str, tuple],
+     class_name: Optional[str] = None,
+     class_suffix: str = "Dc",
+     backend: str = "sqlalchemy",
+     field_map_var: Optional[str] = None,
+     legacy_filters: bool = False,
+     field_map_dict: Optional[Mapping[str, str]] = None,
+     sticky_filters: Optional[Mapping[str, Any]] = None,
+ ) -> str:
+     """Introspects a DB table and generates a Python class definition as a string. If inspector is None, generates a shell class."""
+
+     # Initialize metadata dict based on the keys in our YAML mapping
+     meta: dict[str, list[str]] = {label: [] for label in mappings.keys()}
+
+     if inspector:
+         try:
+             columns = inspector.get_columns(table_name)
+             for col in columns:
+                 col_name = col["name"]
+                 # Apply alias if a map is provided
+                 if field_map_dict:
+                     col_name = field_map_dict.get(col_name, col_name)
+
+                 for label, sa_classes in mappings.items():
+                     if isinstance(col["type"], sa_classes):
+                         meta[label].append(col_name)
+         except (sa.exc.SQLAlchemyError, KeyError):
+             # If introspection fails for a specific table (e.g. it is missing), just skip its columns
+             pass
+
+     if not class_name:
+         class_name = (
+             "".join(w.capitalize() for w in table_name.split("_")) + class_suffix
+         )
+
+     # Generate attribute lines for every label defined in the mapping
+     metadata_lines = []
+     for label, cols in meta.items():
+         metadata_lines.append(f"    {label}: ClassVar[List[str]] = {repr(cols)}")
+
+     # Configuration block extensions
+     extra_config = []
+     if field_map_var:
+         extra_config.append(f"    field_map: ClassVar[Dict] = {field_map_var}")
+     if legacy_filters:
+         extra_config.append(f"    legacy_filters: ClassVar[bool] = {legacy_filters}")
+     if sticky_filters:
+         extra_config.append(
+             f"    sticky_filters: ClassVar[Dict] = {repr(sticky_filters)}"
+         )
+
+     attributes_str = "\n".join(metadata_lines)
+     extra_config_str = "\n".join(extra_config)
+     if extra_config_str:
+         extra_config_str = "\n" + extra_config_str
+
+     return f"""
+ class {class_name}({base_class}):
+     df: Optional[dd.DataFrame] = None
+
+     # Config
+     backend: ClassVar[str] = "{backend}"
+     connection_url: ClassVar[str] = {config_obj}.get("db_url")
+     table: ClassVar[str] = "{table_name}"{extra_config_str}
+
+     # Transformations & Metadata
+ {attributes_str}
+ """
+
+
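To make the output concrete, here is a hedged sketch of calling the generator without a live inspector (shell mode); the mapping labels, config object name, and base class are illustrative:

```python
code = generate_class_code(
    inspector=None,                      # no DB connection: emit a shell class
    table_name="sales_order",
    config_obj="default_db_conf",        # hypothetical config object name
    base_class="BaseDataCube",           # hypothetical base class
    mappings={"numeric_fields": (), "string_fields": ()},
    sticky_filters={"active": True},
)
print(code)
# class SalesOrderDc(BaseDataCube):
#     df: Optional[dd.DataFrame] = None
#
#     # Config
#     backend: ClassVar[str] = "sqlalchemy"
#     connection_url: ClassVar[str] = default_db_conf.get("db_url")
#     table: ClassVar[str] = "sales_order"
#     sticky_filters: ClassVar[Dict] = {'active': True}
#
#     # Transformations & Metadata
#     numeric_fields: ClassVar[List[str]] = []
#     string_fields: ClassVar[List[str]] = []
```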
+ def check_drift(
+     dc_class: Any,
+     db_column_names: Iterable[str],
+     attribute_names: Iterable[str],
+     field_map: Optional[Mapping[str, str]] = None,
+ ) -> list[str]:
+     """
+     Checks for drift between the datacube class metadata and database columns.
+     Returns a list of issue strings.
+     """
+     code_cols: Set[str] = set()
+     for attr in attribute_names:
+         # Get the list from the class (default to empty list)
+         cols = getattr(dc_class, attr, [])
+         code_cols.update(cols)
+
+     # Apply alias mapping to DB columns if present
+     # Map: DB_NAME -> CODE_NAME
+     if field_map:
+         mapped_db_cols = set()
+         for c in db_column_names:
+             mapped_db_cols.add(field_map.get(c, c))  # Use alias if it exists, else the raw name
+         final_db_cols = mapped_db_cols
+     else:
+         final_db_cols = set(db_column_names)
+
+     missing_in_code = final_db_cols - code_cols
+     extra_in_code = code_cols - final_db_cols
+
+     issues = []
+     if missing_in_code:
+         issues.append(f"DB has new cols: {missing_in_code}")
+     if extra_in_code:
+         issues.append(f"Code has stale cols: {extra_in_code}")
+
+     return issues
+
+
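A quick illustration of the drift report, using a throwaway class (all names hypothetical):

```python
class OrdersDc:
    numeric_fields = ["total"]
    string_fields = ["status", "legacy_code"]

issues = check_drift(
    OrdersDc,
    db_column_names=["total", "status", "created_at"],
    attribute_names=["numeric_fields", "string_fields"],
)
# issues == ["DB has new cols: {'created_at'}", "Code has stale cols: {'legacy_code'}"]
```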
+ def resolve_db_url(config_obj_name: str, imports: Iterable[str]) -> Optional[str]:
+     """
+     Attempts to resolve a database URL by dynamically importing the config object
+     described in the import statements.
+     """
+     # Ensure the current working directory is on sys.path to allow local imports
+     if os.getcwd() not in sys.path:
+         sys.path.insert(0, os.getcwd())
+
+     for imp in imports:
+         # Simple parsing for "from module import var" or "from module import var1, var2"
+         if imp.startswith("from ") and " import " in imp:
+             parts = imp.split(" import ")
+             module_name = parts[0][5:].strip()
+             imported_vars = [v.strip() for v in parts[1].split(",")]
+
+             if config_obj_name in imported_vars:
+                 try:
+                     mod = importlib.import_module(module_name)
+                     conf = getattr(mod, config_obj_name)
+                     # Support dict-like (.get) or object attribute access
+                     if hasattr(conf, "get"):
+                         return conf.get("db_url")
+                     elif hasattr(conf, "db_url"):
+                         return conf.db_url
+                 except (ImportError, AttributeError):
+                     pass
+     return None
+
+
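Given an import line such as the one below, the resolver imports the module and reads `db_url` off the named object (a sketch; `solutions.conf` and the config names are hypothetical):

```python
url = resolve_db_url(
    "default_db_conf",
    imports=["from solutions.conf import default_db_conf, replica_db_conf"],
)
# Returns default_db_conf.get("db_url") if solutions.conf is importable, else None.
```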
+ def load_environment(
+     env_file: Optional[Path] = None, logger: Optional[Any] = None
+ ) -> None:
+     """
+     Loads environment variables from the specified file, or from ./.env by default.
+     """
+     target = env_file if env_file else Path(os.getcwd()) / ".env"
+     if target.exists():
+         if logger:
+             logger(f"Loading environment from {target.name}...")
+         with open(target, "r") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line or line.startswith("#"):
+                     continue
+                 if "=" in line:
+                     key, value = line.split("=", 1)
+                     os.environ[key.strip()] = value.strip().strip("'").strip('"')
+     elif env_file and logger:
+         logger(f"Warning: Environment file '{env_file}' not found.")
+
+
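The parser skips comments and blank lines and strips single or double quotes from values, for example (a sketch that writes a throwaway `.env` into the working directory):

```python
from pathlib import Path

Path(".env").write_text('# local overrides\nDB_URL="postgresql://user:pass@localhost/app"\n')
load_environment()
assert os.environ["DB_URL"] == "postgresql://user:pass@localhost/app"  # quotes stripped
```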
+ def filter_global_imports(
+     global_imports: Iterable[str],
+     used_configs: Set[str],
+     ignored_prefixes: Optional[Iterable[str]] = None,
+ ) -> list[str]:
+     """
+     Filters global import lines, keeping only those that are used or generic.
+     Trims multi-name imports down to the names that are actually used.
+     """
+     filtered = []
+     ignored = ignored_prefixes or []
+     for line in global_imports:
+         if " import " in line and line.strip().startswith("from "):
+             parts = line.split(" import ")
+             prefix = parts[0]
+             imported_vars = [v.strip() for v in parts[1].split(",")]
+
+             # Check for intersection with used configs
+             common = [v for v in imported_vars if v in used_configs]
+
+             if common:
+                 # If we use some of the imported vars, keep only those
+                 filtered.append(f"{prefix} import {', '.join(common)}")
+             else:
+                 # If we use none, drop lines that look like config imports
+                 if any(p in prefix for p in ignored):
+                     continue
+                 # Otherwise keep generic imports (e.g. typing, os) untouched
+                 filtered.append(line)
+         else:
+             filtered.append(line)
+     return filtered
+
+
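For instance (module paths are illustrative):

```python
kept = filter_global_imports(
    [
        "from solutions.conf import default_db_conf, replica_db_conf",
        "import os",
        "from solutions.conf.unused import other_conf",
    ],
    used_configs={"default_db_conf"},
    ignored_prefixes=["solutions.conf"],
)
# kept == ["from solutions.conf import default_db_conf", "import os"]
```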
+ class DatacubeRegistry:
+     """
+     Encapsulates parsing logic for the datacube_registry.yaml.
+     """
+
+     def __init__(self, config_dict: MutableMapping[str, Any]):
+         self.config = config_dict
+         self.tables = self.config.setdefault("tables", {})
+         self.global_imports = self.config.get("global_imports", [])
+
+         # Resolve mappings: label -> tuple of SQLAlchemy type classes
+         raw_mappings = self.config.get("type_mappings", {})
+         self.processed_mappings = {
+             label: get_sa_type_classes(types) for label, types in raw_mappings.items()
+         }
+
+         # Defaults
+         # valid_libpaths must come from the config or the caller; an empty list
+         # means every path fails the secure-path check (fail closed).
+         self.valid_paths = self.config.get("valid_libpaths")
+         if not self.valid_paths:
+             self.valid_paths = []
+
+         self.valid_fieldmap_paths = self.config.get("valid_fieldmap_paths") or []
+         self.default_connection_obj = self.config.get(
+             "default_connection_obj", self.config.get("default_config_obj")
+         )
+         self.default_base_class = self.config.get("default_base_class")
+         self.default_base_import = self.config.get("default_base_import")
+         self.default_backend = self.config.get("default_backend", "sqlalchemy")
+         self.class_suffix = self.config.get("class_suffix", "Dc")
+
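Read together, the keys above imply a registry config shaped like this (a hedged sketch, shown as the dict the YAML would load into; every value is illustrative):

```python
registry = DatacubeRegistry({
    "global_imports": ["from solutions.conf import default_db_conf"],
    "type_mappings": {
        "numeric_fields": ["Integer", "Numeric", "Float"],
        "string_fields": ["String", "Text"],
    },
    "valid_libpaths": ["solutions/datacubes"],
    "default_connection_obj": "default_db_conf",
    "default_base_class": "BaseDataCube",
    "default_base_import": "from solutions.core import BaseDataCube",
    "tables": {
        "sales_order": {"save_to_path": "solutions/datacubes/sales.py"},
    },
})
```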
+     def get_table_details(self, table_name: str) -> dict[str, Any]:
+         return self.tables.get(table_name, {})
+
+     def group_tables_by_file(self) -> dict[str, list[tuple]]:
+         """
+         Groups tables by their target file path.
+         Returns: Dict[path_str, List[(table_name, conf_obj, base_cls, base_imp, cls_name)]]
+         """
+         file_groups: dict[str, list[tuple]] = {}
+         for table_name, details in self.tables.items():
+             target = details.get("save_to_path", details.get("path"))
+             if not target:
+                 continue
+
+             conf_obj = details.get(
+                 "connection_obj", details.get("config_obj", self.default_connection_obj)
+             )
+             base_cls = details.get("base_class", self.default_base_class)
+             base_imp = details.get("import_from", self.default_base_import)
+             cls_name = details.get("class_name")
+
+             file_groups.setdefault(target, []).append(
+                 (table_name, conf_obj, base_cls, base_imp, cls_name)
+             )
+         return file_groups
+
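Continuing the sketch above, grouping yields one bucket per generated module:

```python
groups = registry.group_tables_by_file()
# {"solutions/datacubes/sales.py": [
#     ("sales_order", "default_db_conf", "BaseDataCube",
#      "from solutions.core import BaseDataCube", None)]}
```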
+     def merge_discovered(
+         self, discovered_tables: Mapping[str, Mapping[str, Any]]
+     ) -> None:
+         """
+         Merges discovered tables into the existing configuration.
+         Non-None discovered values win; None values only fill missing keys.
+         """
+         for table, new_details in discovered_tables.items():
+             if table in self.tables:
+                 existing = self.tables[table]
+                 for k, v in new_details.items():
+                     if v is not None:
+                         existing[k] = v
+                     elif k not in existing:
+                         existing[k] = v
+
+                 if "class_name" not in existing:
+                     existing["class_name"] = new_details.get("class_name")
+
+                 self.tables[table] = existing
+             else:
+                 self.tables[table] = new_details
+
+     def prune_tables(self, keep_tables: Iterable[str]) -> list[str]:
+         """
+         Removes tables from the registry that are not in keep_tables.
+         Returns the names that were removed.
+         """
+         to_remove = [t for t in self.tables if t not in keep_tables]
+         for t in to_remove:
+             del self.tables[t]
+         return to_remove
+
+     def to_class_name(self, table_name: str) -> str:
+         """
+         Converts a snake_case table name to CamelCase plus the class suffix.
+         """
+         return (
+             "".join(w.capitalize() for w in table_name.split("_")) + self.class_suffix
+         )
+
+     def normalize_aliases(self, ignored_prefixes: Optional[list[str]] = None) -> None:
+         """Hook for alias normalization; a no-op in this release."""
+         pass
+
+     def discover(self, allowed_paths: Optional[list[str]] = None) -> list[str]:
+         """Hook for registry-driven discovery; returns nothing in this release."""
+         return []
+
+
+ def perform_discovery(
+     all_table_names: Iterable[str],
+     match_rule_callback: Callable[[str], Optional[Mapping[str, Any]]],
+     registry: DatacubeRegistry,
+     whitelist: Optional[Mapping[str, Mapping[str, Any]]] = None,
+     template: Optional[Mapping[str, Any]] = None,
+     field_map_template: Optional[str] = None,
+     default_connection: str = "default_db_conf",
+ ) -> dict[str, dict[str, Any]]:
+     """
+     Executes the discovery logic:
+     1. Filter by whitelist (if provided).
+     2. Match tables against rules.
+     3. Construct entries using (Template + Rule + Whitelist Override).
+     """
+     discovered = {}
+
+     candidates = all_table_names
+     if whitelist is not None:
+         candidates = [t for t in all_table_names if t in whitelist]
+
+     for table in candidates:
+         rule = match_rule_callback(table)
+         if not rule:
+             continue
+
+         path = rule["path"]
+         class_name = registry.to_class_name(table)
+
+         # dict() copies the template and accepts any Mapping
+         entry = dict(template or {})
+
+         if rule.get("domain") and field_map_template:
+             # e.g. "solutions.conf.transforms.fields.{domain}.{table}.field_map"
+             entry["field_map"] = field_map_template.format(
+                 domain=rule["domain"], table=table
+             )
+
+         entry.update(
+             {
+                 "class_name": class_name,
+                 "connection_obj": rule.get("connection_obj") or default_connection,
+                 "save_to_path": path,
+             }
+         )
+
+         if whitelist and table in whitelist:
+             overrides = whitelist[table]
+             if overrides:
+                 entry.update(overrides)
+
+         discovered[table] = entry
+
+     return discovered
+
+
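A hedged sketch of driving discovery with a prefix-based rule; the rule shape (`path`, `domain`, optional `connection_obj`) follows the accesses above, and the table names are illustrative:

```python
def match_rule(table: str):
    # Illustrative rule: route sales_* tables to one module under the "sales" domain.
    if table.startswith("sales_"):
        return {"path": "solutions/datacubes/sales.py", "domain": "sales"}
    return None

entries = perform_discovery(
    all_table_names=["sales_order", "audit_log"],
    match_rule_callback=match_rule,
    registry=registry,  # the DatacubeRegistry from the earlier sketch
    field_map_template="solutions.conf.transforms.fields.{domain}.{table}.field_map",
)
# entries["sales_order"]["class_name"] == "SalesOrderDc"
# entries["sales_order"]["field_map"].endswith("sales.sales_order.field_map")
# "audit_log" is skipped: no rule matched.
```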
+ def generate_field_map_files(
+     discovered_entries: Mapping[str, Mapping[str, Any]],
+     inspector: Any,
+     root_path: Path,
+     force: bool = False,
+     logger: Optional[Any] = None,
+     allowed_paths: Optional[Iterable[str]] = None,
+ ) -> None:
+     """
+     Generates or updates Python field map files for discovered tables.
+     If a file exists and force is False: performs a smart sync (comments out stale keys, appends new ones).
+     If force is True: overwrites the file completely.
+     """
+     key_pattern = re.compile(r'^\s*(?P<comment>#\s*)?[\'"](?P<key>\w+)[\'"]\s*:')
+
+     for table, entry in discovered_entries.items():
+         field_map = entry.get("field_map")
+         if not field_map:
+             continue
+
+         # Discovery supplies "domain" on matched entries; fall back to "common".
+         # root_path is expected to point at the field-map package root
+         # (e.g. solutions/conf/transforms/fields), and domain/table.py is appended.
+         domain = entry.get("domain", "common")
+         target_dir = root_path / domain
+         target_file = target_dir / f"{table}.py"
+
+         # Security check
+         if allowed_paths:
+             if not is_secure_path(str(target_file), allowed_paths):
+                 if logger:
+                     logger(
+                         f"[red]Blocked Field Map Generation: {target_file} is outside allowed paths.[/red]"
+                     )
+                 continue
+
+         target_dir.mkdir(parents=True, exist_ok=True)
+
+         if not (target_dir / "__init__.py").exists():
+             (target_dir / "__init__.py").touch()
+
+         try:
+             cols = inspector.get_columns(table)
+             db_col_names = {c["name"] for c in cols}
+             col_list = [c["name"] for c in cols]  # Preserve order
+
+             if target_file.exists() and not force:
+                 # --- Smart Sync ---
+                 with open(target_file, "r") as f:
+                     lines = f.readlines()
+
+                 new_lines = []
+                 existing_keys = set()
+
+                 for line in lines:
+                     match = key_pattern.match(line)
+                     if match:
+                         key = match.group("key")
+                         existing_keys.add(key)
+                         is_commented = bool(match.group("comment"))
+
+                         if key not in db_col_names:
+                             if not is_commented:
+                                 line = f"# {line}"
+                     new_lines.append(line)
+
+                 missing_cols = db_col_names - existing_keys
+
+                 # Insert new keys just before the closing brace
+                 insert_idx = len(new_lines)
+                 for i in range(len(new_lines) - 1, -1, -1):
+                     if "}" in new_lines[i]:
+                         insert_idx = i
+                         break
+
+                 to_insert = []
+                 for col in sorted(missing_cols):
+                     to_insert.append(
+                         f'    "{col}": "{col}",  # TODO: Translate to English\n'
+                     )
+
+                 new_lines[insert_idx:insert_idx] = to_insert
+
+                 with open(target_file, "w") as f:
+                     f.writelines(new_lines)
+
+                 if logger:
+                     logger(
+                         f"[green]Synced {target_file} (Added {len(missing_cols)}, Checked Stale)[/green]"
+                     )
+             else:
+                 # --- Fresh Generation ---
+                 lines = ["field_map = {"]
+                 for col in col_list:
+                     lines.append(f'    "{col}": "{col}",  # TODO: Translate to English')
+                 lines.append("}")
+
+                 with open(target_file, "w") as f:
+                     f.write("\n".join(lines))
+
+                 if logger:
+                     logger(f"[green]Generated {target_file}[/green]")
+
+         except (OSError, sa.exc.SQLAlchemyError) as e:
+             if logger:
+                 logger(f"[red]Failed to generate/sync fields for {table}: {e}[/red]")
+
+
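The fresh-generation branch emits a module like the sketch below (for a hypothetical `sales_order` table with two columns); subsequent sync runs comment out keys that have left the schema and append new ones just before the closing brace:

```python
# solutions/conf/transforms/fields/sales/sales_order.py (generated)
field_map = {
    "status": "status",  # TODO: Translate to English
    "total": "total",  # TODO: Translate to English
}
```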
+ def generate_datacube_module_code(
+     items: Iterable[tuple],
+     registry: DatacubeRegistry,
+     get_db_url_callback: Callable[[str], str],
+     logger: Optional[Any] = None,
+ ) -> Tuple[list[str], list[str]]:
+     """
+     Generates the code components (imports, class definitions) for a single DataCube module file.
+     Handles import aliasing to prevent variable shadowing.
+     """
+     # The concrete Dict/List names below are intentional: they appear in generated code.
+     imports: Set[str] = {
+         "from typing import Optional, List, ClassVar, Dict",
+         "import dask.dataframe as dd",
+     }
+     classes_code = []
+
+     for table_name, conf_obj, base_cls, base_imp, cls_name in items:
+         imports.add(base_imp)
+
+         details = registry.get_table_details(table_name)
+         field_map_str = details.get("field_map")
+         sticky_filters = details.get("sticky_filters")
+
+         field_map_var = None
+         legacy = False
+
+         field_map_dict = None
+         if field_map_str:
+             if "." in field_map_str:
+                 mod, var = field_map_str.rsplit(".", 1)
+                 # Create a unique alias for the import to avoid shadowing
+                 alias = f"{var}_{table_name}"
+                 imports.add(f"from {mod} import {var} as {alias}")
+                 field_map_var = alias
+                 legacy = True
+
+                 # Try to load the actual dict so aliases can be applied during introspection
+                 try:
+                     module = importlib.import_module(mod)
+                     field_map_dict = getattr(module, var)
+                     if not isinstance(field_map_dict, dict):
+                         if logger:
+                             logger(
+                                 f"[yellow]Warning: {var} in {mod} is not a dict. Alias mapping skipped.[/yellow]"
+                             )
+                         field_map_dict = None
+                 except (ImportError, AttributeError) as e:
+                     if logger:
+                         logger(
+                             f"[yellow]Warning: Could not load field_map {var} from {mod}: {e}. Alias mapping skipped.[/yellow]"
+                         )
+                     field_map_dict = None
+             else:
+                 if logger:
+                     logger(
+                         f"[yellow]Warning: Invalid field_map format '{field_map_str}' for {table_name}. Expected module.path.variable[/yellow]"
+                     )
+
+         try:
+             db_url = get_db_url_callback(conf_obj)
+             engine = sa.create_engine(db_url)
+             inspector = inspect(engine)
+
+             classes_code.append(
+                 generate_class_code(
+                     inspector,
+                     table_name,
+                     conf_obj,
+                     base_cls,
+                     registry.processed_mappings,
+                     class_name=cls_name,
+                     class_suffix=registry.class_suffix,
+                     backend=registry.default_backend,
+                     field_map_var=field_map_var,
+                     legacy_filters=legacy,
+                     field_map_dict=field_map_dict,
+                     sticky_filters=sticky_filters,
+                 )
+             )
+         except sa.exc.SQLAlchemyError as e:
+             if logger:
+                 logger(
+                     f"[yellow]Warning: Could not introspect {table_name} ({e}). Generating shell class.[/yellow]"
+                 )
+             classes_code.append(
+                 generate_class_code(
+                     None,
+                     table_name,
+                     conf_obj,
+                     base_cls,
+                     registry.processed_mappings,
+                     class_name=cls_name,
+                     class_suffix=registry.class_suffix,
+                     backend=registry.default_backend,
+                     field_map_var=field_map_var,
+                 )
+             )
+
+     return list(imports), classes_code
+
+
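Wiring the pieces together, a module file is then just the joined imports and class bodies (a sketch reusing the earlier `registry` and `groups`; the actual file writer lives elsewhere in the package):

```python
imports, classes = generate_datacube_module_code(
    items=groups["solutions/datacubes/sales.py"],
    registry=registry,
    get_db_url_callback=lambda conf: resolve_db_url(conf, registry.global_imports),
)
module_source = "\n".join(sorted(imports)) + "\n\n" + "\n".join(classes)
```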
+ def dump_db_schema(
+     engine: Any,
+     db_name: str,
+     output_dir: Path,
+     whitelist: Optional[Mapping[str, Any]] = None,
+     logger: Optional[Any] = None,
+ ) -> None:
+     """
+     Introspects the database and dumps a schema YAML file.
+     Structure:
+         <db_name>:
+             <table_name>:
+                 columns: [{name: ..., type: ...}, ...]
+     """
+     inspector = inspect(engine)
+     all_tables = inspector.get_table_names()
+
+     if whitelist:
+         tables_to_process = sorted([t for t in all_tables if t in whitelist])
+     else:
+         tables_to_process = sorted(all_tables)
+
+     schema_data = {}
+
+     for table in tables_to_process:
+         try:
+             # Columns
+             columns = []
+             for col in inspector.get_columns(table):
+                 columns.append({"name": col["name"], "type": str(col["type"])})
+
+             schema_data[table] = {"columns": columns}
+         except sa.exc.SQLAlchemyError as e:
+             if logger:
+                 logger(f"[red]Error dumping schema for table {table}: {e}[/red]")
+             schema_data[table] = {"error": str(e)}
+
+     # Top-level key is db_name
+     final_output = {db_name: schema_data}
+
+     output_dir.mkdir(parents=True, exist_ok=True)
+     out_file = output_dir / f"schema_{db_name}.yaml"
+
+     # Local import keeps PyYAML out of the module's import-time dependencies
+     import yaml
+
+     with open(out_file, "w") as f:
+         yaml.dump(final_output, f, sort_keys=False, default_flow_style=False)
+
+     if logger:
+         logger(
+             f"[green]Dumped schema to {out_file} ({len(tables_to_process)} tables)[/green]"
+         )
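End to end, against a throwaway in-memory SQLite database (a sketch; table and column names are illustrative):

```python
import sqlalchemy as sa
from pathlib import Path

engine = sa.create_engine("sqlite:///:memory:")
with engine.begin() as conn:
    conn.execute(sa.text("CREATE TABLE sales_order (id INTEGER, total NUMERIC)"))

dump_db_schema(engine, "demo", Path("schemas"))
# schemas/schema_demo.yaml:
# demo:
#   sales_order:
#     columns:
#     - name: id
#       type: INTEGER
#     - name: total
#       type: NUMERIC
```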