sibi_flux-2025.12.0-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +44 -0
- sibi_flux/__init__.py +49 -0
- sibi_flux/artifacts/__init__.py +7 -0
- sibi_flux/artifacts/base.py +166 -0
- sibi_flux/artifacts/parquet.py +360 -0
- sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
- sibi_flux/artifacts/parquet_engine/executor.py +204 -0
- sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
- sibi_flux/artifacts/parquet_engine/planner.py +544 -0
- sibi_flux/conf/settings.py +131 -0
- sibi_flux/core/__init__.py +5 -0
- sibi_flux/core/managed_resource/__init__.py +3 -0
- sibi_flux/core/managed_resource/_managed_resource.py +733 -0
- sibi_flux/core/type_maps/__init__.py +100 -0
- sibi_flux/dask_cluster/__init__.py +47 -0
- sibi_flux/dask_cluster/async_core.py +27 -0
- sibi_flux/dask_cluster/client_manager.py +549 -0
- sibi_flux/dask_cluster/core.py +322 -0
- sibi_flux/dask_cluster/exceptions.py +34 -0
- sibi_flux/dask_cluster/utils.py +49 -0
- sibi_flux/datacube/__init__.py +3 -0
- sibi_flux/datacube/_data_cube.py +332 -0
- sibi_flux/datacube/config_engine.py +152 -0
- sibi_flux/datacube/field_factory.py +48 -0
- sibi_flux/datacube/field_registry.py +122 -0
- sibi_flux/datacube/generator.py +677 -0
- sibi_flux/datacube/orchestrator.py +171 -0
- sibi_flux/dataset/__init__.py +3 -0
- sibi_flux/dataset/_dataset.py +162 -0
- sibi_flux/df_enricher/__init__.py +56 -0
- sibi_flux/df_enricher/async_enricher.py +201 -0
- sibi_flux/df_enricher/merger.py +253 -0
- sibi_flux/df_enricher/specs.py +45 -0
- sibi_flux/df_enricher/types.py +12 -0
- sibi_flux/df_helper/__init__.py +5 -0
- sibi_flux/df_helper/_df_helper.py +450 -0
- sibi_flux/df_helper/backends/__init__.py +34 -0
- sibi_flux/df_helper/backends/_params.py +173 -0
- sibi_flux/df_helper/backends/_strategies.py +295 -0
- sibi_flux/df_helper/backends/http/__init__.py +5 -0
- sibi_flux/df_helper/backends/http/_http_config.py +122 -0
- sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
- sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
- sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
- sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
- sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
- sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
- sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
- sibi_flux/df_helper/backends/utils.py +32 -0
- sibi_flux/df_helper/core/__init__.py +15 -0
- sibi_flux/df_helper/core/_defaults.py +104 -0
- sibi_flux/df_helper/core/_filter_handler.py +617 -0
- sibi_flux/df_helper/core/_params_config.py +185 -0
- sibi_flux/df_helper/core/_query_config.py +17 -0
- sibi_flux/df_validator/__init__.py +3 -0
- sibi_flux/df_validator/_df_validator.py +222 -0
- sibi_flux/logger/__init__.py +1 -0
- sibi_flux/logger/_logger.py +480 -0
- sibi_flux/mcp/__init__.py +26 -0
- sibi_flux/mcp/client.py +150 -0
- sibi_flux/mcp/router.py +126 -0
- sibi_flux/orchestration/__init__.py +9 -0
- sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
- sibi_flux/orchestration/_pipeline_executor.py +212 -0
- sibi_flux/osmnx_helper/__init__.py +22 -0
- sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
- sibi_flux/osmnx_helper/graph_loader.py +225 -0
- sibi_flux/osmnx_helper/utils.py +100 -0
- sibi_flux/pipelines/__init__.py +3 -0
- sibi_flux/pipelines/base.py +218 -0
- sibi_flux/py.typed +0 -0
- sibi_flux/readers/__init__.py +3 -0
- sibi_flux/readers/base.py +82 -0
- sibi_flux/readers/parquet.py +106 -0
- sibi_flux/utils/__init__.py +53 -0
- sibi_flux/utils/boilerplate/__init__.py +19 -0
- sibi_flux/utils/boilerplate/base_attacher.py +45 -0
- sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
- sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
- sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
- sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
- sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
- sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
- sibi_flux/utils/common.py +7 -0
- sibi_flux/utils/credentials/__init__.py +3 -0
- sibi_flux/utils/credentials/_config_manager.py +155 -0
- sibi_flux/utils/dask_utils.py +14 -0
- sibi_flux/utils/data_utils/__init__.py +3 -0
- sibi_flux/utils/data_utils/_data_utils.py +389 -0
- sibi_flux/utils/dataframe_utils.py +52 -0
- sibi_flux/utils/date_utils/__init__.py +10 -0
- sibi_flux/utils/date_utils/_business_days.py +220 -0
- sibi_flux/utils/date_utils/_date_utils.py +311 -0
- sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
- sibi_flux/utils/file_utils.py +48 -0
- sibi_flux/utils/filepath_generator/__init__.py +5 -0
- sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
- sibi_flux/utils/parquet_saver/__init__.py +6 -0
- sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
- sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
- sibi_flux/utils/retry.py +46 -0
- sibi_flux/utils/storage/__init__.py +7 -0
- sibi_flux/utils/storage/_fs_registry.py +112 -0
- sibi_flux/utils/storage/_storage_manager.py +257 -0
- sibi_flux/utils/storage/factory.py +33 -0
- sibi_flux-2025.12.0.dist-info/METADATA +283 -0
- sibi_flux-2025.12.0.dist-info/RECORD +110 -0
- sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/datacube/generator.py
@@ -0,0 +1,677 @@
+"""
+DataCube Generator module.
+
+Handles introspection of database schemas and generation of Python DataCube classes.
+"""
+
+import sqlalchemy as sa
+from pathlib import Path
+from typing import Optional, Set, Any, Callable, Tuple
+from collections.abc import Mapping, MutableMapping, Iterable, Sequence
+from sqlalchemy import inspect
+import re
+import importlib
+import sys
+import os
+
+
+def get_sa_type_classes(
+    type_names: Iterable[str], logger: Optional[Any] = None
+) -> Tuple[Any, ...]:
+    """Converts a list of strings to actual SQLAlchemy type classes."""
+    classes = []
+    for name in type_names:
+        type_cls = getattr(sa, name, None)
+        if type_cls:
+            classes.append(type_cls)
+        elif logger:
+            logger.warning(f"SQLAlchemy type '{name}' not found.")
+    return tuple(classes)
+
+
+def is_secure_path(target_file: str, allowed_dirs: Iterable[str]) -> bool:
+    """Verifies path is within sandbox using resolution to prevent traversal."""
+    try:
+        target_path = Path(target_file).resolve()
+        for allowed in allowed_dirs:
+            if target_path.is_relative_to(Path(allowed).resolve()):
+                return True
+    except (ValueError, RuntimeError, OSError):
+        return False
+    return False
+
+
+def generate_class_code(
+    inspector: Optional[sa.engine.Inspector],
+    table_name: str,
+    config_obj: str,
+    base_class: str,
+    mappings: Mapping[str, tuple],
+    class_name: Optional[str] = None,
+    class_suffix: str = "Dc",
+    backend: str = "sqlalchemy",
+    field_map_var: Optional[str] = None,
+    legacy_filters: bool = False,
+    field_map_dict: Optional[Mapping[str, str]] = None,
+    sticky_filters: Optional[Mapping[str, Any]] = None,
+) -> str:
+    """Introspects DB table and generates Python class string. If inspector is None, generates shell."""
+
+    # Initialize metadata dict based on the keys in our YAML mapping
+    meta: dict[str, list[str]] = {label: [] for label in mappings.keys()}
+
+    if inspector:
+        try:
+            columns = inspector.get_columns(table_name)
+            for col in columns:
+                col_name = col["name"]
+                # Apply alias if map is provided
+                if field_map_dict:
+                    col_name = field_map_dict.get(col_name, col_name)
+
+                for label, sa_classes in mappings.items():
+                    if isinstance(col["type"], sa_classes):
+                        meta[label].append(col_name)
+        except (sa.exc.SQLAlchemyError, KeyError):
+            # If introspection fails for specific table (e.g. missing), just skip columns
+            pass
+
+    if not class_name:
+        class_name = (
+            "".join(w.capitalize() for w in table_name.split("_")) + class_suffix
+        )
+
+    # Generate attribute lines for every label defined in the mapping
+    metadata_lines = []
+    for label, cols in meta.items():
+        metadata_lines.append(f"    {label}: ClassVar[List[str]] = {repr(cols)}")
+
+    # Configuration block extensions
+    extra_config = []
+    if field_map_var:
+        extra_config.append(f"    field_map: ClassVar[Dict] = {field_map_var}")
+    if legacy_filters:
+        extra_config.append(f"    legacy_filters: ClassVar[bool] = {legacy_filters}")
+    if sticky_filters:
+        extra_config.append(
+            f"    sticky_filters: ClassVar[Dict] = {repr(sticky_filters)}"
+        )
+
+    attributes_str = "\n".join(metadata_lines)
+    extra_config_str = "\n".join(extra_config)
+    if extra_config_str:
+        extra_config_str = "\n" + extra_config_str
+
+    return f"""
+class {class_name}({base_class}):
+    df: Optional[dd.DataFrame] = None
+
+    # Config
+    backend: ClassVar[str] = "{backend}"
+    connection_url: ClassVar[str] = {config_obj}.get("db_url")
+    table: ClassVar[str] = "{table_name}"{extra_config_str}
+
+    # Transformations & Metadata
+{attributes_str}
+"""
+
+
+def check_drift(
+    dc_class: Any,
+    db_column_names: Iterable[str],
+    attribute_names: Iterable[str],
+    field_map: Optional[Mapping[str, str]] = None,
+) -> list[str]:
+    """
+    Checks for drift between the datacube class metadata and database columns.
+    Returns a list of issue strings.
+    """
+    code_cols: Set[str] = set()
+    for attr in attribute_names:
+        # Get the list from the class (default to empty list)
+        cols = getattr(dc_class, attr, [])
+        code_cols.update(cols)
+
+    # Apply alias mapping to DB columns if present
+    # Map: DB_NAME -> CODE_NAME
+    if field_map:
+        mapped_db_cols = set()
+        for c in db_column_names:
+            mapped_db_cols.add(field_map.get(c, c))  # Use alias if exists, else raw
+        final_db_cols = mapped_db_cols
+    else:
+        final_db_cols = set(db_column_names)
+
+    missing_in_code = final_db_cols - code_cols
+    extra_in_code = code_cols - final_db_cols
+
+    issues = []
+    if missing_in_code:
+        issues.append(f"DB has new cols: {missing_in_code}")
+    if extra_in_code:
+        issues.append(f"Code has stale cols: {extra_in_code}")
+
+    return issues
+
+
+def resolve_db_url(config_obj_name: str, imports: Iterable[str]) -> Optional[str]:
+    """
+    Attempts to resolve a database URL by dynamically importing the config object
+    described in the import statements.
+    """
+    # Ensure current working directory is in path to allow local imports
+    if os.getcwd() not in sys.path:
+        sys.path.insert(0, os.getcwd())
+
+    for imp in imports:
+        # Simple parsing for "from module import var" or "from module import var1, var2"
+        if imp.startswith("from ") and " import " in imp:
+            parts = imp.split(" import ")
+            module_name = parts[0][5:].strip()
+            imported_vars = [v.strip() for v in parts[1].split(",")]
+
+            if config_obj_name in imported_vars:
+                try:
+                    mod = importlib.import_module(module_name)
+                    conf = getattr(mod, config_obj_name)
+                    # Support dict-like (.get) or object attribute access
+                    if hasattr(conf, "get"):
+                        return conf.get("db_url")
+                    elif hasattr(conf, "db_url"):
+                        return conf.db_url
+                except (ImportError, AttributeError):
+                    pass
+    return None
+
+
+def load_environment(
+    env_file: Optional[Path] = None, logger: Optional[Any] = None
+) -> None:
+    """
+    Loads environment variables from the specified file or .env.linux by default.
+    """
+    target = env_file if env_file else Path(os.getcwd()) / ".env"
+    if target.exists():
+        if logger:
+            logger(f"Loading environment from {target.name}...")
+        with open(target, "r") as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                if "=" in line:
+                    key, value = line.split("=", 1)
+                    os.environ[key.strip()] = value.strip().strip("'").strip('"')
+    elif env_file and logger:
+        logger(f"Warning: Environment file '{env_file}' not found.")
+
+
+def filter_global_imports(
+    global_imports: Iterable[str],
+    used_configs: Set[str],
+    ignored_prefixes: Optional[Iterable[str]] = None,
+) -> list[str]:
+    """
+    Filters global import lines, keeping only those that are used or generic.
+    Renames imports if they contain multiple items but only some are used.
+    """
+    filtered = []
+    ignored = ignored_prefixes or []
+    for line in global_imports:
+        if " import " in line and line.strip().startswith("from "):
+            parts = line.split(" import ")
+            prefix = parts[0]
+            imported_vars = [v.strip() for v in parts[1].split(",")]
+
+            # Check for intersection with used configs
+            common = [v for v in imported_vars if v in used_configs]
+
+            if common:
+                # If we use some of the imported vars, keep only those
+                filtered.append(f"{prefix} import {', '.join(common)}")
+            else:
+                # If we use NONE, check if it looks like a config import we should drop
+                if any(p in prefix for p in ignored):
+                    continue
+                # Otherwise keep generic imports (e.g. typing, os, etc if they appeared here)
+                filtered.append(line)
+        else:
+            filtered.append(line)
+    return filtered
+
+
+class DatacubeRegistry:
+    """
+    Encapsulates parsing logic for the datacube_registry.yaml.
+    """
+
+    def __init__(self, config_dict: MutableMapping[str, Any]):
+        self.config = config_dict
+        self.tables = self.config.setdefault("tables", {})
+        self.global_imports = self.config.get("global_imports", [])
+
+        # Resolve Mappings
+        raw_mappings = self.config.get("type_mappings", {})
+        self.processed_mappings = {
+            label: get_sa_type_classes(types) for label, types in raw_mappings.items()
+        }
+
+        # Defaults
+        # Defaults
+        self.valid_paths = self.config.get("valid_libpaths")
+        if not self.valid_paths:
+            # Make it empty/None? Or raise? Or let it be provided by caller?
+            # For now, let's just make it empty list so it fails secure check if not provided
+            self.valid_paths = []
+
+        self.valid_fieldmap_paths = self.config.get("valid_fieldmap_paths") or []
+        self.default_connection_obj = self.config.get(
+            "default_connection_obj", self.config.get("default_config_obj")
+        )
+        self.default_base_class = self.config.get("default_base_class")
+        self.default_base_import = self.config.get("default_base_import")
+        self.default_backend = self.config.get("default_backend", "sqlalchemy")
+        self.class_suffix = self.config.get("class_suffix", "Dc")
+
+    def get_table_details(self, table_name: str) -> dict[str, Any]:
+        return self.tables.get(table_name, {})
+
+    def group_tables_by_file(self) -> dict[str, list[tuple]]:
+        """
+        Groups tables by their target file path.
+        Returns: Dict[path_str, List[(table_name, conf_obj, base_cls, base_imp, cls_name)]]
+        """
+        file_groups: dict[str, list[tuple]] = {}
+        for table_name, details in self.tables.items():
+            target = details.get("save_to_path", details.get("path"))
+            if not target:
+                continue
+
+            conf_obj = details.get(
+                "connection_obj", details.get("config_obj", self.default_connection_obj)
+            )
+            base_cls = details.get("base_class", self.default_base_class)
+            base_imp = details.get("import_from", self.default_base_import)
+            cls_name = details.get("class_name")
+
+            file_groups.setdefault(target, []).append(
+                (table_name, conf_obj, base_cls, base_imp, cls_name)
+            )
+        return file_groups
+
+    def merge_discovered(
+        self, discovered_tables: Mapping[str, Mapping[str, Any]]
+    ) -> None:
+        """
+        Merges discovered tables into the existing configuration.
+        """
+        for table, new_details in discovered_tables.items():
+            if table in self.tables:
+                existing = self.tables[table]
+                for k, v in new_details.items():
+                    if v is not None:
+                        existing[k] = v
+                    elif k not in existing:
+                        existing[k] = v
+
+                if "class_name" not in existing:
+                    existing["class_name"] = new_details.get("class_name")
+
+                self.tables[table] = existing
+            else:
+                self.tables[table] = new_details
+
+    def prune_tables(self, keep_tables: Iterable[str]) -> list[str]:
+        """
+        Removes tables from the registry that are not in the keep_tables list.
+        """
+        to_remove = [t for t in self.tables if t not in keep_tables]
+        for t in to_remove:
+            del self.tables[t]
+        return to_remove
+
+    def to_class_name(self, table_name: str) -> str:
+        """
+        Converts snake_case table name to CamelCase + Suffix.
+        """
+        return (
+            "".join(w.capitalize() for w in table_name.split("_")) + self.class_suffix
+        )
+
+    def normalize_aliases(self, ignored_prefixes: Optional[list[str]] = None) -> None:
+        pass
+
+    def discover(self, allowed_paths: Optional[list[str]] = None) -> list[str]:
+        return []
+
+
+def perform_discovery(
+    all_table_names: Iterable[str],
+    match_rule_callback: Callable[[str], Optional[Mapping[str, Any]]],
+    registry: DatacubeRegistry,
+    whitelist: Optional[Mapping[str, Mapping[str, Any]]] = None,
+    template: Optional[Mapping[str, Any]] = None,
+    field_map_template: Optional[str] = None,
+    default_connection: str = "default_db_conf",
+) -> dict[str, dict[str, Any]]:
+    """
+    Executes the discovery logic:
+    1. Filter by whitelist (if provided).
+    2. Match tables against rules.
+    3. Construct entries using (Template + Rule + Whitelist Override).
+    """
+    discovered = {}
+
+    candidates = all_table_names
+    if whitelist is not None:
+        candidates = [t for t in all_table_names if t in whitelist]
+
+    for table in candidates:
+        rule = match_rule_callback(table)
+        if not rule:
+            continue
+
+        path = rule["path"]
+        class_name = registry.to_class_name(table)
+
+        # entry = (template or {}).copy()
+        # Refactoring Note: dict() creates a copy and supports Mapping
+        entry = dict(template or {})
+
+        if rule.get("domain") and field_map_template:
+            # e.g. "solutions.conf.transforms.fields.{domain}.{table}.field_map"
+            entry["field_map"] = field_map_template.format(
+                domain=rule["domain"], table=table
+            )
+
+        entry.update(
+            {
+                "class_name": class_name,
+                "connection_obj": rule.get("connection_obj") or default_connection,
+                "save_to_path": path,
+            }
+        )
+
+        if whitelist and table in whitelist:
+            overrides = whitelist[table]
+            if overrides:
+                entry.update(overrides)
+
+        discovered[table] = entry
+
+    return discovered
+
+
+def generate_field_map_files(
+    discovered_entries: Mapping[str, Mapping[str, Any]],
+    inspector: Any,
+    root_path: Path,
+    force: bool = False,
+    logger: Optional[Any] = None,
+    allowed_paths: Optional[Iterable[str]] = None,
+) -> None:
+    """
+    Generates or updates Python field map files for discovered tables.
+    If file exists and not force: performs smart sync (comment out stale, add new).
+    If force: overwrites completely.
+    """
+    key_pattern = re.compile(r'^\s*(?P<comment>#\s*)?[\'"](?P<key>\w+)[\'"]\s*:')
+
+    for table, entry in discovered_entries.items():
+        field_map = entry.get("field_map")
+        if not field_map:
+            continue
+
+        rule = entry  # entry has domain if matched via discovery
+        if "domain" not in rule and "field_map" in rule:
+            # Try to infer target dir from field_map string??
+            # Convention: solutions.conf.transforms.fields.<domain>.<table_name>.field_map
+            parts = field_map.split(".")
+            if len(parts) > 4:
+                # domain is parts[-3] if standard convention
+                pass
+
+        # We assume root_path is passed correctly (e.g. solutions/conf/transforms/fields)
+        # And we append domain/table.py
+        # But we really should trust the rule['domain'] if available
+        domain = rule.get("domain", "common")
+        target_dir = root_path / domain
+        target_file = target_dir / f"{table}.py"
+
+        # Security Check
+        if allowed_paths:
+            if not is_secure_path(str(target_file), allowed_paths):
+                if logger:
+                    logger(
+                        f"[red]Blocked Field Map Generation: {target_file} is outside allowed paths.[/red]"
+                    )
+                continue
+
+        target_dir.mkdir(parents=True, exist_ok=True)
+
+        if not (target_dir / "__init__.py").exists():
+            (target_dir / "__init__.py").touch()
+
+        try:
+            cols = inspector.get_columns(table)
+            db_col_names = {c["name"] for c in cols}
+            col_list = [c["name"] for c in cols]  # Preserve order
+
+            if target_file.exists() and not force:
+                # --- Smart Sync ---
+                with open(target_file, "r") as f:
+                    lines = f.readlines()
+
+                new_lines = []
+                existing_keys = set()
+
+                for line in lines:
+                    match = key_pattern.match(line)
+                    if match:
+                        key = match.group("key")
+                        existing_keys.add(key)
+                        is_commented = bool(match.group("comment"))
+
+                        if key not in db_col_names:
+                            if not is_commented:
+                                line = f"# {line}"
+                    new_lines.append(line)
+
+                missing_cols = db_col_names - existing_keys
+
+                insert_idx = len(new_lines)
+                for i in range(len(new_lines) - 1, -1, -1):
+                    if "}" in new_lines[i]:
+                        insert_idx = i
+                        break
+
+                to_insert = []
+                for col in sorted(missing_cols):
+                    to_insert.append(
+                        f'    "{col}": "{col}",  # TODO: Translate to English\n'
+                    )
+
+                new_lines[insert_idx:insert_idx] = to_insert
+
+                with open(target_file, "w") as f:
+                    f.writelines(new_lines)
+
+                if logger:
+                    logger(
+                        f"[green]Synced {target_file} (Added {len(missing_cols)}, Checked Stale)[/green]"
+                    )
+            else:
+                # --- Fresh Generation ---
+                lines = ["field_map = {"]
+                for col in col_list:
+                    lines.append(f'    "{col}": "{col}",  # TODO: Translate to English')
+                lines.append("}")
+
+                with open(target_file, "w") as f:
+                    f.write("\n".join(lines))
+
+                if logger:
+                    logger(f"[green]Generated {target_file}[/green]")
+
+        except (OSError, sa.exc.SQLAlchemyError) as e:
+            if logger:
+                logger(f"[red]Failed to generate/sync fields for {table}: {e}[/red]")
+
+
+def generate_datacube_module_code(
+    items: Iterable[tuple],
+    registry: DatacubeRegistry,
+    get_db_url_callback: Callable[[str], str],
+    logger: Optional[Any] = None,
+) -> Tuple[list[str], list[str]]:
+    """
+    Generates the code components (imports, class definitions) for a single DataCube module file.
+    Handles import aliasing to prevent variable shadowing.
+    """
+    # Refactoring Note: using concrete Dict in string literal is fine as it's code generation
+    imports: Set[str] = {
+        "from typing import Optional, List, ClassVar, Dict",
+        "import dask.dataframe as dd",
+    }
+    classes_code = []
+
+    for table_name, conf_obj, base_cls, base_imp, cls_name in items:
+        imports.add(base_imp)
+
+        details = registry.get_table_details(table_name)
+        field_map_str = details.get("field_map")
+        sticky_filters = details.get("sticky_filters")
+
+        field_map_var = None
+        legacy = False
+
+        field_map_dict = None
+        if field_map_str:
+            if "." in field_map_str:
+                mod, var = field_map_str.rsplit(".", 1)
+                # FIX: Create a unique alias for the import to avoid shadowing
+                alias = f"{var}_{table_name}"
+                imports.add(f"from {mod} import {var} as {alias}")
+                field_map_var = alias
+                legacy = True
+
+                # Try to load the actual dict for alias application during introspection
+                try:
+                    module = importlib.import_module(mod)
+                    field_map_dict = getattr(module, var)
+                    if not isinstance(field_map_dict, dict):
+                        if logger:
+                            logger(
+                                f"[yellow]Warning: {var} in {mod} is not a dict. Alias mapping skipped.[/yellow]"
+                            )
+                        field_map_dict = None
+                except (ImportError, AttributeError) as e:
+                    if logger:
+                        logger(
+                            f"[yellow]Warning: Could not load field_map {var} from {mod}: {e}. Alias mapping skipped.[/yellow]"
+                        )
+                    field_map_dict = None
+            else:
+                if logger:
+                    logger(
+                        f"[yellow]Warning: Invalid field_map format '{field_map_str}' for {table_name}. Expected module.path.variable[/yellow]"
+                    )
+
+        try:
+            db_url = get_db_url_callback(conf_obj)
+            engine = sa.create_engine(db_url)
+            inspector = inspect(engine)
+
+            classes_code.append(
+                generate_class_code(
+                    inspector,
+                    table_name,
+                    conf_obj,
+                    base_cls,
+                    registry.processed_mappings,
+                    class_name=cls_name,
+                    class_suffix=registry.class_suffix,
+                    backend=registry.default_backend,
+                    field_map_var=field_map_var,
+                    legacy_filters=legacy,
+                    field_map_dict=field_map_dict,
+                    sticky_filters=sticky_filters,
+                )
+            )
+        except sa.exc.SQLAlchemyError as e:
+            if logger:
+                logger(
+                    f"[yellow]Warning: Could not introspect {table_name} ({e}). Generating shell class.[/yellow]"
+                )
+            classes_code.append(
+                generate_class_code(
+                    None,
+                    table_name,
+                    conf_obj,
+                    base_cls,
+                    registry.processed_mappings,
+                    class_name=cls_name,
+                    class_suffix=registry.class_suffix,
+                    backend=registry.default_backend,
+                    field_map_var=field_map_var,
+                )
+            )
+
+    return list(imports), classes_code
+
+
+def dump_db_schema(
+    engine: Any,
+    db_name: str,
+    output_dir: Path,
+    whitelist: Optional[Mapping[str, Any]] = None,
+    logger: Optional[Any] = None,
+) -> None:
+    """
+    Introspects the database and dumps a comprehensive schema YAML file.
+    Structure:
+    <db_name>:
+        <table_name>:
+            columns: ...
+            indexes: ...
+            pk: ...
+            foreign_keys: ...
+    """
+    inspector = inspect(engine)
+    all_tables = inspector.get_table_names()
+
+    if whitelist:
+        tables_to_process = sorted([t for t in all_tables if t in whitelist])
+    else:
+        tables_to_process = sorted(all_tables)
+
+    schema_data = {}
+
+    for table in tables_to_process:
+        try:
+            # Columns
+            columns = []
+            for col in inspector.get_columns(table):
+                columns.append({"name": col["name"], "type": str(col["type"])})
+
+            schema_data[table] = {"columns": columns}
+        except sa.exc.SQLAlchemyError as e:
+            if logger:
+                logger(f"[red]Error dumping schema for table {table}: {e}[/red]")
+            schema_data[table] = {"error": str(e)}
+
+    # Top level key is db_name
+    final_output = {db_name: schema_data}
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    out_file = output_dir / f"schema_{db_name}.yaml"
+
+    import yaml
+
+    with open(out_file, "w") as f:
+        yaml.dump(final_output, f, sort_keys=False, default_flow_style=False)
+
+    if logger:
+        logger(
+            f"[green]Dumped schema to {out_file} ({len(tables_to_process)} tables)[/green]"
+        )
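For orientation, the sketch below shows one plausible way to wire the helpers added in generator.py together: build a DatacubeRegistry from a registry dict, group its tables by output file, and render each module with generate_datacube_module_code. It is a minimal sketch, not code from the package; the registry shape, type-mapping labels, table entry, BaseDataCube import string, and the in-memory SQLite URL are illustrative assumptions.

# Illustrative sketch only (not part of the wheel); assumptions noted above.
from sibi_flux.datacube.generator import (
    DatacubeRegistry,
    generate_datacube_module_code,
)

registry = DatacubeRegistry(
    {
        "type_mappings": {
            # label -> SQLAlchemy type-class names (labels are hypothetical)
            "datetime_fields": ["DateTime", "Date"],
            "numeric_fields": ["Integer", "Numeric", "Float"],
            "text_fields": ["String", "Text"],
        },
        "tables": {
            "sales_order": {  # hypothetical table entry
                "save_to_path": "solutions/datacubes/sales.py",
                "connection_obj": "default_db_conf",
                "base_class": "BaseDataCube",
                "import_from": "from sibi_flux.utils.boilerplate import BaseDataCube",
            },
        },
    }
)

# Group registry entries by target file, then render imports and class bodies.
for path, items in registry.group_tables_by_file().items():
    imports, classes = generate_datacube_module_code(
        items,
        registry,
        # Hypothetical resolver mapping a connection-object name to a DB URL;
        # an empty in-memory SQLite DB simply yields shell classes here.
        get_db_url_callback=lambda conf_obj: "sqlite://",
        logger=print,
    )
    module_source = "\n".join(sorted(imports)) + "\n\n" + "\n".join(classes)
    print(f"# {path}\n{module_source}")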