sibi-flux 2026.1.2-py3-none-any.whl → 2026.1.4-py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in those public registries.
sibi_flux/datacube/cli.py CHANGED
@@ -7,6 +7,7 @@ import typer
7
7
  import subprocess
8
8
  import importlib.util
9
9
  import importlib
10
+ import shutil
10
11
  import sqlalchemy as sa
11
12
  from pathlib import Path
12
13
  from typing import Optional, Callable, Set, Dict, Any, Iterable, Mapping
@@ -29,15 +30,106 @@ from sibi_flux.datacube.generator import (
29
30
  )
30
31
  from sibi_flux.datacube.orchestrator import DiscoveryOrchestrator
31
32
  from sibi_flux.datacube.field_factory import FieldMapFactory
33
+ from sibi_flux.init.rule_generator import RuleEngine
32
34
 
33
35
  import sibi_flux.datacube.generator
34
36
 
35
-
36
37
  app = typer.Typer(help="Sibi-Flux Data Cube Generator")
37
38
  console = Console()
38
39
 
39
40
  # --- Context Management ---
40
41
 
42
+
43
+ def _load_and_resolve_config(config_path: Path) -> dict:
44
+ if not config_path.exists():
45
+ return {}
46
+ with open(config_path, "r") as f:
47
+ config_data = yaml.safe_load(f) or {}
48
+
49
+ # Heuristic: Config is in generators/datacubes/discovery_params.yaml
50
+ # Project Root is 3 levels up from FILE
51
+ try:
52
+ project_root = config_path.parent.parent.parent
53
+ except Exception:
54
+ project_root = Path.cwd()
55
+
56
+ if "paths" in config_data:
57
+ if "target" in config_data["paths"]:
58
+ target = config_data["paths"]["target"]
59
+ for key in ["datacubes_dir", "field_maps_dir"]:
60
+ if key in target:
61
+ rel_path = target[key]
62
+ if rel_path and not Path(rel_path).is_absolute():
63
+ abs_path = (project_root / rel_path).resolve()
64
+ target[key] = str(abs_path)
65
+
66
+ # Resolve Registry File (Dual Support)
67
+ repos = config_data.get("paths", {}).get("repositories", {})
68
+
69
+ # New location
70
+ if "global_datacube_registry_file" in repos:
71
+ reg_file = repos["global_datacube_registry_file"]
72
+ if not Path(reg_file).is_absolute():
73
+ repos["global_datacube_registry_file"] = str(
74
+ (project_root / reg_file).resolve()
75
+ )
76
+
77
+ # Old location (fallback)
78
+ elif "global_datacube_registry_file" in config_data.get("paths", {}):
79
+ reg_file = config_data.get("paths", {})["global_datacube_registry_file"]
80
+ if not Path(reg_file).is_absolute():
81
+ config_data["paths"]["global_datacube_registry_file"] = str(
82
+ (project_root / reg_file).resolve()
83
+ )
84
+
85
+ # Resolve Repositories
86
+ if "repositories" in config_data["paths"]:
87
+ repos = config_data["paths"]["repositories"]
88
+ for key in [
89
+ "global_field_repository_file",
90
+ "global_field_translations_file",
91
+ ]:
92
+ if key in repos:
93
+ rel = repos[key]
94
+ if rel and not Path(rel).is_absolute():
95
+ config_data["paths"]["repositories"][key] = str(
96
+ (project_root / rel).resolve()
97
+ )
98
+
99
+ # Resolve Discovery Paths (Dual Support: root or paths.discovery)
100
+ discovery_block = None
101
+ if "paths" in config_data and "discovery" in config_data["paths"]:
102
+ discovery_block = config_data["paths"]["discovery"]
103
+ elif "discovery" in config_data:
104
+ discovery_block = config_data["discovery"]
105
+
106
+ if discovery_block:
107
+ for key in ["all_tables_file", "rules_file", "whitelist_file"]:
108
+ if key in discovery_block:
109
+ rel = discovery_block[key]
110
+ if rel and not Path(rel).is_absolute():
111
+ discovery_block[key] = str((project_root / rel).resolve())
112
+
113
+ # Normalize databases (id -> name mapping) for CLI compatibility
114
+ # Ensure this matches logic in gen_dc.py wrapper
115
+ if "databases" in config_data:
116
+ for db in config_data["databases"]:
117
+ if "id" in db and "name" not in db:
118
+ db["name"] = db["id"]
119
+ if "connection_ref" in db and "connection_obj" not in db:
120
+ db["connection_obj"] = db["connection_ref"]
121
+
122
+ # Normalize import_spec to global_import string for resolve_db_url
123
+ if "import_spec" in db and "global_import" not in db:
124
+ spec = db["import_spec"]
125
+ if "module" in spec and "symbol" in spec:
126
+ db["global_import"] = (
127
+ f"from {spec['module']} import {spec['symbol']}"
128
+ )
129
+
130
+ return config_data
131
+
132
+
41
133
  class CLIContext:
42
134
  def __init__(self):
43
135
  self.default_config: Optional[Path] = None
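For context, the new _load_and_resolve_config helper anchors relative paths in the params file against a project root inferred as three directory levels above the config file (generators/datacubes/discovery_params.yaml → project root). A minimal, self-contained sketch of that heuristic for the paths.target block, using only pathlib and PyYAML rather than the package's own helper:

    from pathlib import Path
    import yaml

    def resolve_target_paths(config_path: Path) -> dict:
        """Sketch: anchor relative target paths to the inferred project root."""
        config_data = yaml.safe_load(config_path.read_text()) or {}
        # Heuristic from the diff: generators/datacubes/<file> -> root is three levels up.
        project_root = config_path.resolve().parent.parent.parent
        target = config_data.get("paths", {}).get("target", {})
        for key in ("datacubes_dir", "field_maps_dir"):
            rel = target.get(key)
            if rel and not Path(rel).is_absolute():
                target[key] = str((project_root / rel).resolve())
        return config_data

The real helper applies the same treatment to the registry, repository, and discovery file entries, and also normalizes database entries (id → name, connection_ref → connection_obj, import_spec → a "from <module> import <symbol>" string) so that older and newer config layouts behave the same way.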
@@ -52,7 +144,7 @@ class CLIContext:
52
144
  field_translations_file: Path,
53
145
  valid_paths: list[str],
54
146
  valid_fieldmap_paths: list[str],
55
- params: Optional[dict] = None
147
+ params: Optional[dict] = None,
56
148
  ):
57
149
  self.default_config = default_config
58
150
  self.field_translations_file = field_translations_file
@@ -60,23 +152,59 @@ class CLIContext:
60
152
  self.valid_fieldmap_paths = valid_fieldmap_paths
61
153
  self.params = params or {}
62
154
 
155
+ def auto_configure(self):
156
+ """Attempts to find defaults if not configured."""
157
+ if self.default_config:
158
+ return
159
+
160
+ # Heuristic check for standard project layout
161
+ # Case 1: Run from project root -> generators/datacubes/discovery_params.yaml
162
+ candidate = Path("generators/datacubes/discovery_params.yaml")
163
+ if candidate.exists():
164
+ # Use shared resolver to get normalization and project root paths
165
+ raw_params = _load_and_resolve_config(candidate)
166
+
167
+ self.configure(
168
+ default_config=candidate.resolve(),
169
+ field_translations_file=(
170
+ candidate.parent.parent.parent
171
+ / "dataobjects/globals/global_field_translations.yaml"
172
+ ).resolve(),
173
+ valid_paths=[], # Would need params to populate
174
+ valid_fieldmap_paths=[],
175
+ params=raw_params,
176
+ )
177
+ console.print(f"[dim]Auto-configured context from {candidate}[/dim]")
178
+
179
+
63
180
  context = CLIContext()
181
+ context.auto_configure()
182
+
64
183
 
65
184
  def set_context_defaults(
66
185
  default_config: Path,
67
186
  field_translations_file: Path,
68
187
  valid_paths: list[str],
69
188
  valid_fieldmap_paths: list[str],
70
- params: Optional[dict] = None
189
+ params: Optional[dict] = None,
71
190
  ):
72
191
  """Configures the CLI context with project-specific defaults."""
73
- context.configure(default_config, field_translations_file, valid_paths, valid_fieldmap_paths, params)
74
-
192
+ context.configure(
193
+ default_config,
194
+ field_translations_file,
195
+ valid_paths,
196
+ valid_fieldmap_paths,
197
+ params,
198
+ )
199
+
75
200
  # Ensure directories exist based on configured params
76
201
  if params:
77
202
  ensure_directories_exist(params, logger=console.log)
78
203
 
79
- def _get_db_url_callback(registry: DatacubeRegistry, db_url_map: Optional[str]) -> Callable[[str], str]:
204
+
205
+ def _get_db_url_callback(
206
+ registry: DatacubeRegistry, db_url_map: Optional[str]
207
+ ) -> Callable[[str], str]:
80
208
  """Helper to create a callback that resolves DB URLs from CLI overrides or registry."""
81
209
  cli_urls = json.loads(db_url_map) if db_url_map else {}
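The reworked _get_db_url_callback (its body continues in the next hunk) keeps the existing precedence: an explicit --db-urls JSON mapping of config-object name to URL wins, otherwise the URL is resolved from the configured imports. A rough, standalone sketch of that precedence; resolve_from_imports stands in for the package's resolve_db_url call:

    import json
    from typing import Callable, Optional

    def make_url_resolver(
        db_url_map: Optional[str],
        resolve_from_imports: Callable[[str], Optional[str]],
    ) -> Callable[[str], str]:
        # --db-urls is an optional JSON object: {"<config object>": "<database URL>", ...}
        cli_urls = json.loads(db_url_map) if db_url_map else {}

        def get_url(conf_name: str) -> str:
            if conf_name in cli_urls:               # 1. explicit CLI override
                return cli_urls[conf_name]
            url = resolve_from_imports(conf_name)   # 2. resolved from imported config objects
            if url:
                return url
            raise ValueError(f"Could not resolve DB URL for '{conf_name}'")

        return get_url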
82
210
 
@@ -88,19 +216,29 @@ def _get_db_url_callback(registry: DatacubeRegistry, db_url_map: Optional[str])
88
216
  url = resolve_db_url(conf_name, registry.global_imports)
89
217
  if url:
90
218
  return url
91
- raise ValueError(f"Could not resolve DB URL for '{conf_name}'. Provide via --db-urls or check imports.")
219
+ raise ValueError(
220
+ f"Could not resolve DB URL for '{conf_name}'. Provide via --db-urls or check imports."
221
+ )
92
222
 
93
223
  return get_url
94
224
 
225
+
95
226
  # --- Commands ---
96
227
 
228
+
97
229
  @app.command()
98
230
  def sync(
99
231
  config_file: Optional[Path] = typer.Option(None, "--config"),
100
- db_url_map: Optional[str] = typer.Option(None, "--db-urls", help="Optional JSON mapping. If omitted, tries to resolve from code."),
232
+ db_url_map: Optional[str] = typer.Option(
233
+ None,
234
+ "--db-urls",
235
+ help="Optional JSON mapping. If omitted, tries to resolve from code.",
236
+ ),
101
237
  force: bool = typer.Option(False, "--force", "-f"),
102
- env_file: Optional[Path] = typer.Option(None, "--env-file", "-e", help="Path to environment file"),
103
- dry_run: bool = typer.Option(False, "--dry-run")
238
+ env_file: Optional[Path] = typer.Option(
239
+ None, "--env-file", "-e", help="Path to environment file"
240
+ ),
241
+ dry_run: bool = typer.Option(False, "--dry-run"),
104
242
  ) -> None:
105
243
  """Generates all Datacube classes based on the whitelists and field maps."""
106
244
  config_path = config_file or context.default_config
@@ -119,64 +257,188 @@ def sync(
119
257
  load_environment(env_path, logger=console.print)
120
258
 
121
259
  # Start with empty/default registry
122
- with open(config_path, 'r') as f:
123
- config_data = yaml.safe_load(f)
260
+ config_data = _load_and_resolve_config(config_path)
124
261
  registry = DatacubeRegistry(config_data, params=context.params)
125
-
262
+
263
+ # --- Aggregation Phase ---
126
264
  # --- Aggregation Phase ---
127
265
  params = context.params
128
266
  databases = params.get("databases", [])
129
-
267
+
268
+ # JIT DISCOVERY CHECK
269
+ # If using simplified whitelist workflow, registry might be empty.
270
+ # Auto-discover from whitelist in-memory.
271
+ if not registry.tables:
272
+ console.print(
273
+ "[dim]Registry empty. Attempting JIT Discovery from Whitelists...[/dim]"
274
+ )
275
+
276
+ # Prepare URL resolver for orchestrator usage if needed
277
+ import json
278
+
279
+ cli_urls = json.loads(db_url_map) if db_url_map else {}
280
+
281
+ for db in databases:
282
+ conn_obj = db.get("connection_ref") or db.get("connection_obj")
283
+ nm = db.get("id") or db.get("name")
284
+
285
+ # Resolve whitelist/rules paths (reusing logic from discover command or simplifying?)
286
+ # Orchestrator handles defaults if paths passed are relative/simple strings.
287
+ # We need to resolve full paths to be safe, or trust Orchestrator logic.
288
+ # Let's rely on params provided to orchestrator logic via context.params
289
+
290
+ try:
291
+ # Resolve DB URL
292
+ import_spec = db.get("import_spec")
293
+ if import_spec and isinstance(import_spec, dict):
294
+ imp = import_spec.get("module")
295
+ else:
296
+ imp = db.get("global_import")
297
+ db_imports = [imp] if imp else registry.global_imports
298
+
299
+ # Helper to resolve
300
+ if conn_obj in cli_urls:
301
+ db_conn_str = cli_urls[conn_obj]
302
+ else:
303
+ db_conn_str = resolve_db_url(conn_obj, db_imports)
304
+
305
+ if not db_conn_str:
306
+ console.print(
307
+ f"[yellow]Skipping JIT discovery for {nm}: No DB URL.[/yellow]"
308
+ )
309
+ continue
310
+
311
+ # Initialize Orchestrator
312
+ # We need to construct paths similar to 'discover' command logic
313
+ # Or let Orchestrator defaults handle it.
314
+ # Better to pass explicit defaults from params if available.
315
+
316
+ disc_paths = params.get("paths", {}).get("discovery", {}) or params.get(
317
+ "discovery", {}
318
+ )
319
+
320
+ whitelist_file = (
321
+ db.get("whitelist_file")
322
+ or disc_paths.get("whitelist_file")
323
+ or params.get("whitelist_file")
324
+ or f"discovery_whitelist_{conn_obj}.yaml"
325
+ )
326
+ rules_file = (
327
+ db.get("rules_file")
328
+ or disc_paths.get("rules_file")
329
+ or params.get("rules_file")
330
+ or f"discovery_rules_{conn_obj}.yaml"
331
+ )
332
+
333
+ # Anchoring
334
+ try:
335
+ prj_root = config_path.parent.parent.parent
336
+ except Exception:
337
+ prj_root = Path.cwd()
338
+
339
+ # Resolve Whitelist
340
+ if Path(whitelist_file).is_absolute():
341
+ wl_path = whitelist_file
342
+ else:
343
+ wl_path = str(prj_root / whitelist_file)
344
+
345
+ # Resolve Rules
346
+ if Path(rules_file).is_absolute():
347
+ r_path = rules_file
348
+ else:
349
+ r_path = str(prj_root / rules_file)
350
+
351
+ # console.print(f"DEBUG: {nm} -> WL Path: {wl_path} (Exists: {Path(wl_path).exists()})")
352
+
353
+ orchestrator = DiscoveryOrchestrator(
354
+ params=context.params,
355
+ rules_path=r_path,
356
+ whitelist_path=wl_path,
357
+ registry_path=str(config_path), # Not saving, but needed for init?
358
+ db_connection_str=db_conn_str,
359
+ db_config=db,
360
+ )
361
+
362
+ entries = orchestrator.discover()
363
+ registry.merge_discovered(entries)
364
+
365
+ except Exception as e:
366
+ console.print(f"[red]JIT Discovery failed for {nm}: {e}[/red]")
367
+
130
368
  # 0. Generate Field Maps (if enabled)
131
369
  # Check generation.enable_field_maps (defaults to True)
132
370
  if params.get("generation", {}).get("enable_field_maps", True):
133
371
  import json
372
+
134
373
  cli_urls = json.loads(db_url_map) if db_url_map else {}
135
-
374
+
136
375
  def get_url_safe(conf_name, db_imp):
137
- if conf_name in cli_urls: return cli_urls[conf_name]
138
- imp = [db_imp] if db_imp else registry.global_imports
139
- return resolve_db_url(conf_name, imp)
376
+ if conf_name in cli_urls:
377
+ return cli_urls[conf_name]
378
+ imp = [db_imp] if db_imp else registry.global_imports
379
+ return resolve_db_url(conf_name, imp)
140
380
 
141
- _run_field_map_generation(context, config_path, databases, get_url_safe, force=force)
381
+ _run_field_map_generation(
382
+ context, config_path, databases, get_url_safe, force=force
383
+ )
142
384
  # Ensure new modules are picked up
143
385
  importlib.invalidate_caches()
144
386
 
145
387
  # Inject valid paths for security from context
146
-
147
-
148
388
 
149
389
  # Inject valid paths for security from context
150
- registry.valid_paths = context.valid_paths
390
+ # Also inject the resolved datacubes_dir since params determines it.
391
+
392
+ # Resolving datacubes_dir locally just in case context.valid_paths misses it
393
+ # (Context might rely on static registry or defaults, but params can contain overrides)
394
+ dc_dir = params.get("paths", {}).get("target", {}).get("datacubes_dir")
395
+
396
+ valid_paths = set(context.valid_paths) # Use set for deduplication
397
+ if dc_dir:
398
+ # Resolve against project root if relative
399
+ if Path(dc_dir).is_absolute():
400
+ valid_paths.add(str(dc_dir))
401
+ else:
402
+ try:
403
+ # Heuristic re-resolution
404
+ prj_root = config_path.parent.parent.parent
405
+ valid_paths.add(str(prj_root / dc_dir))
406
+ except Exception:
407
+ valid_paths.add(str(Path.cwd() / dc_dir))
408
+
409
+ # Debug: Check if registry uses valid_paths correctly
410
+ registry.valid_paths = list(valid_paths)
151
411
  registry.valid_fieldmap_paths = context.valid_fieldmap_paths
152
-
412
+
153
413
  get_url = _get_db_url_callback(registry, db_url_map)
154
414
 
155
415
  # Group tables by target file
156
416
  file_groups = registry.group_tables_by_file()
157
417
 
158
418
  summary_table = Table(title="Sync Results")
159
-
160
-
161
419
 
162
420
  summary_table.add_column("File", style="magenta")
163
421
  summary_table.add_column("Classes", style="cyan")
164
422
  summary_table.add_column("Status")
165
423
 
424
+ generated_registry = {}
425
+
166
426
  for file_path_str, items in file_groups.items():
167
427
  if not is_secure_path(file_path_str, registry.valid_paths):
168
- console.print(f"[bold red]Blocked:[/bold red] {file_path_str} is outside allowed paths.")
428
+ console.print(
429
+ f"[bold red]Blocked:[/bold red] {file_path_str} is outside allowed paths."
430
+ )
169
431
  continue
170
432
 
171
433
  file_path = Path(file_path_str)
172
-
434
+
173
435
  is_append = False
174
436
  existing_content = ""
175
-
437
+
176
438
  if file_path.exists() and not force:
177
- with open(file_path, 'r') as f:
439
+ with open(file_path, "r") as f:
178
440
  existing_content = f.read()
179
-
441
+
180
442
  missing_items = []
181
443
  for item in items:
182
444
  # item is (table_name, conf_obj, base_cls, base_imp, cls_name)
@@ -184,44 +446,59 @@ def sync(
184
446
  cls_name = item[4]
185
447
  if f"class {cls_name}" not in existing_content:
186
448
  missing_items.append(item)
187
-
449
+
188
450
  if not missing_items:
189
- summary_table.add_row(file_path_str, str(len(items)), "[yellow]Skipped (All Exist)[/yellow]")
451
+ summary_table.add_row(
452
+ file_path_str,
453
+ str(len(items)),
454
+ "[yellow]Skipped (All Exist)[/yellow]",
455
+ )
190
456
  continue
191
-
457
+
192
458
  items = missing_items
193
459
  is_append = True
194
-
460
+
195
461
  if dry_run:
196
- status = "[blue]Dry Run (Append)[/blue]" if is_append else "[blue]Dry Run[/blue]"
462
+ status = (
463
+ "[blue]Dry Run (Append)[/blue]" if is_append else "[blue]Dry Run[/blue]"
464
+ )
197
465
  summary_table.add_row(file_path_str, str(len(items)), status)
198
466
  continue
199
467
 
200
468
  # Prepare File Content
201
469
  imports_list, classes_code = generate_datacube_module_code(
202
- items=items,
203
- registry=registry,
204
- get_db_url_callback=get_url,
205
- logger=console.print
470
+ items=items,
471
+ registry=registry,
472
+ get_db_url_callback=get_url,
473
+ logger=console.print,
206
474
  )
207
475
  imports = set(imports_list)
208
476
 
209
477
  # Collect used config objects for this file to filter imports
210
- used_configs = set(item[1] for item in items if item[1]) # item[1] is conf_obj
211
- filtered_global_imports = filter_global_imports(registry.global_imports, used_configs, ignored_prefixes=["solutions.conf"])
212
-
478
+ used_configs = set(item[1] for item in items if item[1]) # item[1] is conf_obj
479
+ filtered_global_imports = filter_global_imports(
480
+ registry.global_imports, used_configs, ignored_prefixes=["solutions.conf"]
481
+ )
482
+
213
483
  if not classes_code:
214
- if not is_append:
215
- summary_table.add_row(file_path_str, "0", "[red]Failed (No Classes Generated)[/red]")
216
- else:
217
- summary_table.add_row(file_path_str, "0", "[red]Failed to Append[/red]")
218
- continue
484
+ if not is_append:
485
+ summary_table.add_row(
486
+ file_path_str, "0", "[red]Failed (No Classes Generated)[/red]"
487
+ )
488
+ else:
489
+ summary_table.add_row(file_path_str, "0", "[red]Failed to Append[/red]")
490
+ continue
219
491
 
220
492
  if not is_append:
221
493
  # We are generating the field map with Mapping type hint, so we should allow it in the generator
222
494
  # but this file writes the datacube class.
223
-
224
- full_content = sorted(list(imports)) + filtered_global_imports + ["\n# --- Generated ---"] + classes_code
495
+
496
+ full_content = (
497
+ sorted(list(imports))
498
+ + filtered_global_imports
499
+ + ["\n# --- Generated ---"]
500
+ + classes_code
501
+ )
225
502
  file_path.parent.mkdir(parents=True, exist_ok=True)
226
503
  with open(file_path, "w") as f:
227
504
  f.write("\n".join(full_content))
@@ -233,24 +510,106 @@ def sync(
233
510
  status_msg = f"[green]Appended {len(classes_code)} Classes[/green]"
234
511
 
235
512
  # Format using Ruff
236
- subprocess.run(["uv", "run", "ruff", "format", str(file_path)], capture_output=True)
513
+ subprocess.run(
514
+ ["uv", "run", "ruff", "format", str(file_path)], capture_output=True
515
+ )
237
516
  summary_table.add_row(file_path_str, str(len(items)), status_msg)
238
517
 
518
+ # --- Registry Collection ---
519
+ # Collect metadata for generated datacubes
520
+ # Structure: {conf_obj: {table_name: {class_name: ..., path: ...}}}
521
+ for item in items:
522
+ t_name = item[0]
523
+ conf_obj = item[1]
524
+ cls_n = item[4]
525
+ # Calculate path relative to project root
526
+ try:
527
+ if "project_root" not in locals():
528
+ project_root = config_path.parent.parent.parent
529
+ rel_path = file_path.relative_to(project_root)
530
+ except Exception:
531
+ rel_path = file_path
532
+
533
+ if conf_obj not in generated_registry:
534
+ generated_registry[conf_obj] = {}
535
+
536
+ generated_registry[conf_obj][t_name] = {
537
+ "class_name": cls_n,
538
+ "path": str(rel_path),
539
+ }
540
+
239
541
  console.print(summary_table)
240
542
 
543
+ # --- Write Datacube Registry ---
544
+ reg_rel_path = params.get("paths", {}).get("repositories", {}).get(
545
+ "global_datacube_registry_file"
546
+ ) or params.get("global_datacube_registry_file")
547
+
548
+ if reg_rel_path and generated_registry:
549
+ try:
550
+ if Path(reg_rel_path).is_absolute():
551
+ reg_file = Path(reg_rel_path)
552
+ else:
553
+ if "project_root" not in locals():
554
+ project_root = config_path.parent.parent.parent
555
+ reg_file = project_root / reg_rel_path
556
+
557
+ reg_file.parent.mkdir(parents=True, exist_ok=True)
558
+
559
+ # Group Logic Applied above.
560
+ # Sort keys for stability
561
+ reg_data = {
562
+ k: dict(sorted(v.items()))
563
+ for k, v in sorted(generated_registry.items())
564
+ }
565
+
566
+ with open(reg_file, "w") as f:
567
+ yaml.dump(reg_data, f, sort_keys=False)
568
+
569
+ console.print(
570
+ f"[green]Updated Datacube Registry at {reg_rel_path} ({len(generated_registry)} entries)[/green]"
571
+ )
572
+ except Exception as e:
573
+ console.print(f"[red]Failed to write Datacube Registry: {e}[/red]")
574
+
575
+
241
576
  @app.command()
242
577
  def discover(
243
578
  config_file: Optional[Path] = typer.Option(None, "--config"),
244
- db_conf: str = typer.Option("replica_db_conf", help="Config object to use for discovery introspection"),
579
+ db_conf: str = typer.Option(
580
+ "replica_db_conf", help="Config object to use for discovery introspection"
581
+ ),
245
582
  db_url_map: Optional[str] = typer.Option(None, "--db-urls"),
246
- env_file: Optional[Path] = typer.Option(None, "--env-file", "-e", help="Path to environment file"),
247
- update: bool = typer.Option(False, "--update", help="Update the registry file in place"),
248
- prune: bool = typer.Option(False, "--prune", help="Remove tables from registry if they are not in the discovery result"),
249
- run_sync: bool = typer.Option(False, "--sync", help="Run sync immediately after update"),
250
- dry_run: bool = typer.Option(False, "--dry-run", help="Preview changes without saving (overrides --update)"),
251
- generate_fields: bool = typer.Option(False, "--generate-fields", help="Generate field_map files for discovered tables"),
252
- force: bool = typer.Option(False, "--force", "-f", help="Force overwrite of existing field maps"),
253
- fields_root: str = typer.Option("solutions.conf.transforms.fields", "--fields-root", help="Python path root for field maps"),
583
+ env_file: Optional[Path] = typer.Option(
584
+ None, "--env-file", "-e", help="Path to environment file"
585
+ ),
586
+ update: bool = typer.Option(
587
+ False, "--update", help="Update the registry file in place"
588
+ ),
589
+ prune: bool = typer.Option(
590
+ False,
591
+ "--prune",
592
+ help="Remove tables from registry if they are not in the discovery result",
593
+ ),
594
+ run_sync: bool = typer.Option(
595
+ False, "--sync", help="Run sync immediately after update"
596
+ ),
597
+ dry_run: bool = typer.Option(
598
+ False, "--dry-run", help="Preview changes without saving (overrides --update)"
599
+ ),
600
+ generate_fields: bool = typer.Option(
601
+ False,
602
+ "--generate-fields",
603
+ help="Generate field_map files for discovered tables",
604
+ ),
605
+ force: bool = typer.Option(
606
+ False, "--force", "-f", help="Force overwrite of existing field maps"
607
+ ),
608
+ fields_root: str = typer.Option(
609
+ "solutions.conf.transforms.fields",
610
+ "--fields-root",
611
+ help="Python path root for field maps",
612
+ ),
254
613
  ) -> None:
255
614
  config_path = config_file or context.default_config
256
615
  if not config_path:
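The registry written at the end of sync (previous hunk) groups generated classes by config object and table. A hedged sketch of the resulting YAML; the table and class names below are invented purely for illustration:

    import yaml

    # Shape collected per generated class during sync; "orders" / "OrdersDc" are examples only.
    generated_registry = {
        "replica_db_conf": {                      # connection/config object
            "orders": {                           # source table
                "class_name": "OrdersDc",
                "path": "dataobjects/gencubes/common/orders_cubes.py",  # relative to project root
            },
        },
    }
    reg_data = {k: dict(sorted(v.items())) for k, v in sorted(generated_registry.items())}
    print(yaml.dump(reg_data, sort_keys=False))

The output location comes from paths.repositories.global_datacube_registry_file (or the legacy top-level key), resolved against the project root when relative.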
@@ -258,7 +617,7 @@ def discover(
258
617
  raise typer.Exit(code=1)
259
618
 
260
619
  gen_config_path = config_path.parent / "generator_config.yaml"
261
-
620
+
262
621
  # Resolve env_file: CLI > Params > Default
263
622
  if env_file:
264
623
  env_path = env_file
@@ -266,24 +625,49 @@ def discover(
266
625
  env_path = Path(context.params.get("defaults", {})["env_file"])
267
626
  else:
268
627
  env_path = Path(".env.linux")
269
-
628
+
270
629
  load_environment(env_path, logger=console.print)
271
630
 
272
-
273
- with open(config_path, 'r') as f:
274
- config_data = yaml.safe_load(f)
631
+ # Load Registry Config (Bootstrap if missing)
632
+ if config_path.exists():
633
+ config_data = _load_and_resolve_config(config_path)
634
+ else:
635
+ # If registry file doesn't exist (e.g. first run), initialize with minimal settings
636
+ # We need "cubes_root_path" from params usually.
637
+ # But wait, config_path IS the registry file path.
638
+ console.print(
639
+ f"[yellow]Registry file {config_path} not found. Initializing empty registry.[/yellow]"
640
+ )
641
+ config_path.parent.mkdir(parents=True, exist_ok=True)
642
+ # Try to infer cubes_root_path from context params if available
643
+ cubes_root = "dataobjects/gencubes" # Default fallback
644
+ if context.params and "paths" in context.params:
645
+ cubes_root = (
646
+ context.params.get("paths", {})
647
+ .get("target", {})
648
+ .get("datacubes_dir", cubes_root)
649
+ )
650
+
651
+ config_data = {"settings": {"cubes_root_path": cubes_root}}
652
+ # We don't save it yet? Or should we?
653
+ # DatacubeRegistry will use this data. If we save later, it's fine.
654
+
275
655
  registry = DatacubeRegistry(config_data)
276
656
 
277
657
  import json
658
+
278
659
  # Resolve DB URL
279
660
  cli_urls = json.loads(db_url_map) if db_url_map else {}
661
+
280
662
  def get_url(conf_name):
281
663
  if conf_name in cli_urls:
282
664
  return cli_urls[conf_name]
283
665
  url = resolve_db_url(conf_name, registry.global_imports)
284
666
  if url:
285
667
  return url
286
- raise ValueError(f"Could not resolve DB URL for '{conf_name}'. Provide via --db-urls or check imports.")
668
+ raise ValueError(
669
+ f"Could not resolve DB URL for '{conf_name}'. Provide via --db-urls or check imports."
670
+ )
287
671
 
288
672
  # --- Initialize Global Field Registry (DEPRECATED) ---
289
673
  field_registry = None
@@ -297,34 +681,46 @@ def discover(
297
681
 
298
682
  params = context.params
299
683
  databases = params.get("databases", [])
300
-
684
+
301
685
  # Fallback to single DB mode if no databases defined (Backwards Compatibility)
302
686
  if not databases:
303
- databases = [{
304
- "name": db_conf,
305
- "connection_obj": db_conf,
306
- # Use standard name if not defined
307
- "whitelist_file": "discovery_whitelist.yaml",
308
- "rules_file": "discovery_rules.yaml"
309
- }]
687
+ databases = [
688
+ {
689
+ "name": db_conf,
690
+ "connection_obj": db_conf,
691
+ # Use standard name if not defined
692
+ "whitelist_file": "discovery_whitelist.yaml",
693
+ "rules_file": "discovery_rules.yaml",
694
+ }
695
+ ]
310
696
 
311
697
  # Filter if user requested specific DB via CLI (using db_conf arg as filter name)
312
- # The `db_conf` argument defaults to "replica_db_conf".
698
+ # The `db_conf` argument defaults to "replica_db_conf".
313
699
  target_db_name = None
314
-
700
+
315
701
  aggregated_entries = {}
316
702
  last_orchestrator = None
317
703
 
704
+ # Load existing all_tables data if accumulating
705
+ global_tables_file = params.get("all_tables_file") or "all_tables.yaml"
706
+ global_tables_path = config_path.parent / global_tables_file
707
+ all_tables_data = {}
708
+ if global_tables_path.exists():
709
+ with open(global_tables_path, "r") as f:
710
+ all_tables_data = yaml.safe_load(f) or {}
711
+
318
712
  for db_config in databases:
319
713
  db_name = db_config.get("id") or db_config.get("name", "unknown")
320
714
  conn_obj = db_config.get("connection_ref") or db_config.get("connection_obj")
321
-
715
+
322
716
  # Determine whitelist path
323
717
  wl_filename = db_config.get("whitelist_file")
324
718
  if not wl_filename:
325
- # Try global param fallback
326
- wl_filename = params.get("discovery", {}).get("whitelist_file") or params.get("whitelist_file")
327
-
719
+ # Try global param fallback
720
+ wl_filename = params.get("discovery", {}).get(
721
+ "whitelist_file"
722
+ ) or params.get("whitelist_file")
723
+
328
724
  if not wl_filename:
329
725
  # Default convention: discovery_whitelist_<db_name>.yaml
330
726
  wl_filename = f"discovery_whitelist_{conn_obj}.yaml"
@@ -333,14 +729,18 @@ def discover(
333
729
  # Determine rules path
334
730
  rules_filename = db_config.get("rules_file")
335
731
  if not rules_filename:
336
- rules_filename = params.get("discovery", {}).get("rules_file") or params.get("rules_file")
337
-
732
+ rules_filename = params.get("discovery", {}).get(
733
+ "rules_file"
734
+ ) or params.get("rules_file")
735
+
338
736
  if not rules_filename:
339
- rules_filename = f"discovery_rules_{conn_obj}.yaml"
737
+ rules_filename = f"discovery_rules_{conn_obj}.yaml"
340
738
  rules_path = config_path.parent / rules_filename
341
739
 
342
740
  # Determine blacklist path
343
- bl_filename = db_config.get("blacklist_file", f"discovery_blacklist_{conn_obj}.yaml")
741
+ bl_filename = db_config.get(
742
+ "blacklist_file", f"discovery_blacklist_{conn_obj}.yaml"
743
+ )
344
744
  blacklist_path = config_path.parent / bl_filename
345
745
 
346
746
  console.print(f"[bold cyan]Discovering: {db_name} ({conn_obj})[/]")
@@ -351,16 +751,18 @@ def discover(
351
751
  # Support proper import_spec from new config or legacy global_import
352
752
  import_spec = db_config.get("import_spec")
353
753
  if import_spec and isinstance(import_spec, dict):
354
- imp = import_spec.get("module")
754
+ imp = import_spec.get("module")
355
755
  else:
356
- imp = db_config.get("global_import")
756
+ imp = db_config.get("global_import")
357
757
  db_imports = [imp] if imp else registry.global_imports
358
758
  if not db_imports and registry.global_imports:
359
- db_imports = registry.global_imports
360
-
759
+ db_imports = registry.global_imports
760
+
361
761
  db_conn_str = resolve_db_url(conn_obj, db_imports)
362
762
  except Exception:
363
- console.print(f"[red]Could not resolve connection {conn_obj}. Skipping.[/red]")
763
+ console.print(
764
+ f"[red]Could not resolve connection {conn_obj}. Skipping.[/red]"
765
+ )
364
766
  continue
365
767
 
366
768
  orchestrator = DiscoveryOrchestrator(
@@ -370,43 +772,62 @@ def discover(
370
772
  whitelist_path=str(whitelist_path),
371
773
  registry_path=str(config_path),
372
774
  db_connection_str=db_conn_str,
373
- db_config=db_config
775
+ db_config=db_config,
374
776
  )
375
-
777
+
376
778
  try:
377
779
  entries = orchestrator.discover()
378
780
  aggregated_entries.update(entries)
379
781
  last_orchestrator = orchestrator
782
+
783
+ # --- Capture Raw Tables for all_tables.yaml ---
784
+ if hasattr(orchestrator, "raw_tables") and orchestrator.raw_tables:
785
+ # Sort for consistency
786
+ all_tables_data[conn_obj] = sorted(list(orchestrator.raw_tables))
787
+ console.print(
788
+ f"[green]Captured {len(orchestrator.raw_tables)} raw tables for {conn_obj}[/green]"
789
+ )
380
790
  except Exception as e:
381
- console.print(f"[red]Discovery failed for {db_name}: {e}[/red]")
382
- if not dry_run:
383
- raise # Fail hard if not dry run? Or continue? Let's buffer errors?
384
- # For now, log and continue might result in partial registry which is bad (prune would wipe missing).
385
- # Fail safe:
386
- return
791
+ console.print(f"[red]Discovery failed for {db_name}: {e}[/red]")
792
+ if not dry_run:
793
+ raise # Fail hard if not dry run? Or continue? Let's buffer errors?
794
+ # For now, log and continue might result in partial registry which is bad (prune would wipe missing).
795
+ # Fail safe:
796
+ return
387
797
 
388
798
  # Aggregate global imports from ALL databases to ensure registry has them
389
799
  aggregated_global_imports = set(params.get("global_imports", []))
390
800
  for db in databases:
391
801
  if "global_import" in db:
392
802
  aggregated_global_imports.add(db["global_import"])
393
-
803
+
394
804
  # Save Aggregated Registry
395
805
  if last_orchestrator:
396
806
  console.print("")
397
807
  # Inject aggregated imports into the last orchestrator's update logic?
398
808
  # The orchestrator's save_registry loads existing, updates tables, and saves.
399
809
  # It DOES NOT currently update global_imports. We need to add that cap.
400
-
810
+
401
811
  # Helper manual update for now, or update Orchestrator to support it?
402
- # Let's update Orchestrator.save_registry to accept global_imports update.
403
812
  last_orchestrator.save_registry(
404
- aggregated_entries,
405
- dry_run=dry_run,
406
- prune=prune,
407
- global_imports=list(aggregated_global_imports)
813
+ aggregated_entries,
814
+ dry_run=dry_run,
815
+ prune=prune,
816
+ global_imports=list(aggregated_global_imports),
408
817
  )
409
-
818
+
819
+ # Save all_tables.yaml
820
+ if not dry_run and all_tables_data:
821
+ with open(global_tables_path, "w") as f:
822
+ yaml.dump(all_tables_data, f, sort_keys=False)
823
+ console.print(
824
+ f"[bold green]Updated {global_tables_file} with raw tables from providers.[/bold green]"
825
+ )
826
+ elif dry_run:
827
+ console.print(
828
+ f"[yellow]DRY RUN: Would update {global_tables_file} with {len(all_tables_data)} providers.[/yellow]"
829
+ )
830
+
410
831
  # Save Registry changes (collected during discovery)
411
832
  if not dry_run and field_registry:
412
833
  field_registry.save()
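Both scan and, with this release, discover accumulate raw table names per connection into the same all_tables file, which propose_rules later consumes. A small sketch of that capture step; the connection name and resulting table names are placeholders:

    import sqlalchemy as sa
    from sqlalchemy import inspect

    def capture_tables(all_tables_data: dict, conn_obj: str, db_url: str) -> None:
        # One sorted list of raw table names per connection object.
        engine = sa.create_engine(db_url)
        all_tables_data[conn_obj] = sorted(inspect(engine).get_table_names())

    # yaml.dump(all_tables_data, sort_keys=False) then yields, e.g. (names illustrative):
    # replica_db_conf:
    # - another_table
    # - some_table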
@@ -415,29 +836,29 @@ def discover(
415
836
  if generate_fields and not dry_run:
416
837
  console.print("")
417
838
  console.rule("[bold blue]Generating Field Maps[/]")
418
-
839
+
419
840
  # Reload registry to ensure we have latest discovered tables
420
- with open(config_path, 'r') as f:
841
+ with open(config_path, "r") as f:
421
842
  updated_config_data = yaml.safe_load(f)
422
843
  updated_registry = DatacubeRegistry(updated_config_data, params=context.params)
423
844
 
424
845
  # Convert python path to physical path
425
- phys_root = Path(fields_root.replace('.', '/'))
426
-
846
+ phys_root = Path(fields_root.replace(".", "/"))
847
+
427
848
  # Group tables by connection to use correct inspector
428
849
  tables_by_conn = {}
429
850
  for t_name, t_data in updated_registry.tables.items():
430
- conn = t_data.get('connection_obj', updated_registry.default_connection_obj)
851
+ conn = t_data.get("connection_obj", updated_registry.default_connection_obj)
431
852
  if conn not in tables_by_conn:
432
853
  tables_by_conn[conn] = {}
433
854
  tables_by_conn[conn][t_name] = t_data
434
-
855
+
435
856
  for conn_obj, table_group in tables_by_conn.items():
436
857
  try:
437
858
  db_url = get_url(conn_obj)
438
859
  engine = sa.create_engine(db_url)
439
860
  inspector = inspect(engine)
440
-
861
+
441
862
  # Use the promoted Generator
442
863
  generate_field_map_files(
443
864
  discovered_entries=table_group,
@@ -445,35 +866,46 @@ def discover(
445
866
  root_path=phys_root,
446
867
  force=force,
447
868
  logger=console.print,
448
- allowed_paths=updated_registry.valid_fieldmap_paths if hasattr(updated_registry, 'valid_fieldmap_paths') else None
869
+ allowed_paths=(
870
+ updated_registry.valid_fieldmap_paths
871
+ if hasattr(updated_registry, "valid_fieldmap_paths")
872
+ else None
873
+ ),
449
874
  )
450
875
  except Exception as e:
451
- console.print(f"[red]Error generating fields for connection {conn_obj}: {e}[/red]")
452
-
876
+ console.print(
877
+ f"[red]Error generating fields for connection {conn_obj}: {e}[/red]"
878
+ )
879
+
453
880
  # Chained Sync
454
881
  if run_sync:
455
882
  if dry_run:
456
883
  console.print("[yellow]Skipping sync in dry-run mode.[/yellow]")
457
884
  elif not update:
458
- console.print("[yellow]Sync skipped: Registry not updated (use --update to enable chaining).[/yellow]")
885
+ console.print(
886
+ "[yellow]Sync skipped: Registry not updated (use --update to enable chaining).[/yellow]"
887
+ )
459
888
  else:
460
- console.print("")
461
- console.rule("[bold blue]Auto-Syncing Datacubes[/]")
462
- # Call sync command directly with current context options
463
- sync(
464
- config_file=config_path,
465
- db_url_map=db_url_map,
466
- force=True,
467
- env_file=env_file,
468
- dry_run=False
469
- )
889
+ console.print("")
890
+ console.rule("[bold blue]Auto-Syncing Datacubes[/]")
891
+ # Call sync command directly with current context options
892
+ sync(
893
+ config_file=config_path,
894
+ db_url_map=db_url_map,
895
+ force=True,
896
+ env_file=env_file,
897
+ dry_run=False,
898
+ )
899
+
470
900
 
471
901
  @app.command()
472
902
  def scan(
473
903
  config_file: Optional[Path] = typer.Option(None, "--config"),
474
904
  db_url_map: Optional[str] = typer.Option(None, "--db-urls"),
475
905
  env_file: Optional[Path] = typer.Option(None, "--env-file", "-e"),
476
- db_name: Optional[str] = typer.Option(None, "--db", help="Target specific database from params"),
906
+ db_name: Optional[str] = typer.Option(
907
+ None, "--db", help="Target specific database from params"
908
+ ),
477
909
  ) -> None:
478
910
  """
479
911
  Introspects configured databases and dumps table lists to YAML.
@@ -487,41 +919,41 @@ def scan(
487
919
  # Resolve env_file: CLI > Params > Default
488
920
  if env_file:
489
921
  env_path = env_file
490
- elif context.params and "env_file" in context.params:
491
- env_path = Path(context.params["env_file"])
922
+ elif context.params and context.params.get("defaults", {}).get("env_file"):
923
+ env_path = Path(context.params.get("defaults", {})["env_file"])
492
924
  else:
493
925
  env_path = Path(".env.linux")
494
926
  load_environment(env_path, logger=console.print)
495
927
 
496
- with open(config_path, 'r') as f:
497
- config_data = yaml.safe_load(f)
928
+ config_data = _load_and_resolve_config(config_path)
498
929
  registry = DatacubeRegistry(config_data, params=context.params)
499
930
 
500
931
  import json
932
+
501
933
  cli_urls = json.loads(db_url_map) if db_url_map else {}
502
-
934
+
503
935
  # Helper Resolution
504
936
  def get_url_safe(conf_name, db_imp):
505
- if conf_name in cli_urls: return cli_urls[conf_name]
506
- imp = [db_imp] if db_imp else registry.global_imports
507
- return resolve_db_url(conf_name, imp)
937
+ if conf_name in cli_urls:
938
+ return cli_urls[conf_name]
939
+ imp = [db_imp] if db_imp else registry.global_imports
940
+ return resolve_db_url(conf_name, imp)
508
941
 
509
942
  params = context.params
510
943
  databases = params.get("databases", [])
511
-
944
+
512
945
  # Filter targets
513
946
  target_dbs = databases
514
947
  if db_name:
515
948
  target_dbs = [d for d in databases if d.get("name") == db_name]
516
949
  if not target_dbs:
517
- console.print(f"[red]Database '{db_name}' not found.[/red]")
518
- raise typer.Exit(code=1)
950
+ console.print(f"[red]Database '{db_name}' not found.[/red]")
951
+ raise typer.Exit(code=1)
519
952
 
520
-
521
953
  # Resolve global output file
522
954
  global_tables_file = params.get("all_tables_file") or "all_tables.yaml"
523
955
  global_tables_path = config_path.parent / global_tables_file
524
-
956
+
525
957
  # Load existing data to preserve config for DBs not being scanned
526
958
  all_tables_data = {}
527
959
  if global_tables_path.exists():
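Several commands in this release (scan here, and drift and map below) switch the env-file lookup from a top-level params key to params.defaults.env_file. The shared precedence, as a standalone sketch:

    from pathlib import Path
    from typing import Optional

    def resolve_env_path(cli_env_file: Optional[Path], params: dict) -> Path:
        # Precedence: explicit --env-file flag > params defaults > .env.linux fallback.
        if cli_env_file:
            return cli_env_file
        default = (params or {}).get("defaults", {}).get("env_file")
        return Path(default) if default else Path(".env.linux")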
@@ -531,37 +963,47 @@ def scan(
531
963
  for db in target_dbs:
532
964
  name = db.get("name")
533
965
  conn_obj = db.get("connection_obj")
534
-
966
+
535
967
  console.print(f"[bold cyan]Scanning: {name} ...[/bold cyan]")
536
968
  try:
537
969
  db_url = get_url_safe(conn_obj, db.get("global_import"))
538
970
  if not db_url:
539
971
  console.print(f"[red]Could not resolve URL for {conn_obj}[/red]")
540
972
  continue
541
-
973
+
542
974
  engine = sa.create_engine(db_url)
543
975
  inspector = inspect(engine)
544
976
  tables = sorted(inspector.get_table_names())
545
-
977
+
546
978
  # Update shared dictionary
547
979
  all_tables_data[conn_obj] = tables
548
980
  console.print(f"[green]Found {len(tables)} tables for {conn_obj}[/green]")
549
-
981
+
550
982
  except Exception as e:
551
983
  console.print(f"[red]Scan failed for {name}: {e}[/red]")
552
- if params.get("debug"): raise e
984
+ if params.get("debug"):
985
+ raise e
553
986
 
554
987
  # Persist aggregated result
555
988
  with open(global_tables_path, "w") as f:
556
989
  yaml.dump(all_tables_data, f, sort_keys=False)
557
-
558
- console.print(f"[bold green]Updated table list at {global_tables_path}[/bold green]")
990
+
991
+ console.print(
992
+ f"[bold green]Updated table list at {global_tables_path}[/bold green]"
993
+ )
994
+
559
995
 
560
996
  @app.command()
561
997
  def drift(
562
998
  config_file: Optional[Path] = typer.Option(None, "--config"),
563
- db_url_map: Optional[str] = typer.Option(None, "--db-urls", help="Optional JSON mapping. If omitted, tries to resolve from code."),
564
- env_file: Optional[Path] = typer.Option(None, "--env-file", "-e", help="Path to environment file"),
999
+ db_url_map: Optional[str] = typer.Option(
1000
+ None,
1001
+ "--db-urls",
1002
+ help="Optional JSON mapping. If omitted, tries to resolve from code.",
1003
+ ),
1004
+ env_file: Optional[Path] = typer.Option(
1005
+ None, "--env-file", "-e", help="Path to environment file"
1006
+ ),
565
1007
  ) -> None:
566
1008
  """
567
1009
  Checks for 'drift' between the generated Python classes and the DB schema.
@@ -574,21 +1016,19 @@ def drift(
574
1016
  # Resolve env_file: CLI > Params > Default
575
1017
  if env_file:
576
1018
  env_path = env_file
577
- elif context.params and "env_file" in context.params:
578
- env_path = Path(context.params["env_file"])
1019
+ elif context.params and context.params.get("defaults", {}).get("env_file"):
1020
+ env_path = Path(context.params.get("defaults", {})["env_file"])
579
1021
  else:
580
1022
  env_path = Path(".env.linux")
581
1023
 
582
1024
  load_environment(env_path, logger=console.print)
583
1025
 
1026
+ config_data = _load_and_resolve_config(config_path)
584
1027
 
585
- with open(config_path, 'r') as f:
586
- config_data = yaml.safe_load(f)
587
-
588
1028
  registry = DatacubeRegistry(config_data)
589
1029
  get_url = _get_db_url_callback(registry, db_url_map)
590
1030
  cli_urls = json.loads(db_url_map) if db_url_map else {}
591
-
1031
+
592
1032
  drift_table = Table(title="Schema Drift Analysis")
593
1033
  drift_table.add_column("Class", style="cyan")
594
1034
  drift_table.add_column("Status", style="bold")
@@ -598,22 +1038,32 @@ def drift(
598
1038
  attribute_names = list(registry.processed_mappings.keys())
599
1039
 
600
1040
  for table_name, details in registry.tables.items():
601
- target = details.get('save_to_path', details.get('path'))
1041
+ target = details.get("save_to_path", details.get("path"))
602
1042
  if not target:
603
- drift_table.add_row(table_name, "[red]Config Error[/red]", "Missing save_to_path")
604
- continue
1043
+ drift_table.add_row(
1044
+ table_name, "[red]Config Error[/red]", "Missing save_to_path"
1045
+ )
1046
+ continue
605
1047
  path = Path(target)
606
1048
  if not path.exists():
607
- console.print(f"[yellow]Skipping {table_name}: File {path} not found.[/yellow]")
1049
+ console.print(
1050
+ f"[yellow]Skipping {table_name}: File {path} not found.[/yellow]"
1051
+ )
608
1052
  continue
609
1053
 
610
1054
  # 1. Determine Class Name
611
- provided_class_name = details.get('class_name')
612
- class_name = provided_class_name if provided_class_name else "".join(w.capitalize() for w in table_name.split('_')) + "Dc"
1055
+ provided_class_name = details.get("class_name")
1056
+ class_name = (
1057
+ provided_class_name
1058
+ if provided_class_name
1059
+ else "".join(w.capitalize() for w in table_name.split("_")) + "Dc"
1060
+ )
613
1061
 
614
1062
  # 2. Dynamically load the generated class from the file
615
1063
  try:
616
- spec = importlib.util.spec_from_file_location(f"dynamic_mod_{table_name}", path)
1064
+ spec = importlib.util.spec_from_file_location(
1065
+ f"dynamic_mod_{table_name}", path
1066
+ )
617
1067
  mod = importlib.util.module_from_spec(spec)
618
1068
  spec.loader.exec_module(mod)
619
1069
  dc_class = getattr(mod, class_name)
@@ -624,35 +1074,41 @@ def drift(
624
1074
  # 3. Determine DB URL
625
1075
  # Priority: CLI Override > Class Attribute > Registry Config
626
1076
  db_url = None
627
- conf_obj = details.get('connection_obj', details.get('config_obj', registry.default_connection_obj))
1077
+ conf_obj = details.get(
1078
+ "connection_obj", details.get("config_obj", registry.default_connection_obj)
1079
+ )
628
1080
 
629
1081
  if conf_obj in cli_urls:
630
1082
  db_url = cli_urls[conf_obj]
631
- elif hasattr(dc_class, 'connection_url'):
632
- db_url = getattr(dc_class, 'connection_url')
633
- elif hasattr(dc_class, 'config') and isinstance(dc_class.config, dict):
634
- db_url = dc_class.config.get('connection_url')
635
-
1083
+ elif hasattr(dc_class, "connection_url"):
1084
+ db_url = getattr(dc_class, "connection_url")
1085
+ elif hasattr(dc_class, "config") and isinstance(dc_class.config, dict):
1086
+ db_url = dc_class.config.get("connection_url")
1087
+
636
1088
  # Fallback to registry resolution
637
1089
  if not db_url:
638
1090
  try:
639
1091
  db_url = get_url(conf_obj)
640
1092
  except Exception:
641
- pass
1093
+ pass
642
1094
 
643
1095
  # 4. Introspect DB
644
1096
  try:
645
1097
  engine = sa.create_engine(db_url)
646
1098
  inspector = inspect(engine)
647
- db_cols = {c['name'] for c in inspector.get_columns(table_name)}
1099
+ db_cols = {c["name"] for c in inspector.get_columns(table_name)}
648
1100
  except Exception as e:
649
- drift_table.add_row(class_name, "[red]DB Error[/red]", repr(e))
650
- continue
1101
+ drift_table.add_row(class_name, "[red]DB Error[/red]", repr(e))
1102
+ continue
651
1103
 
652
1104
  # 5. Extract Field Map (if any)
653
- field_map = getattr(dc_class, 'field_map', None)
654
- if not field_map and hasattr(dc_class, 'config') and isinstance(dc_class.config, dict):
655
- field_map = dc_class.config.get('field_map')
1105
+ field_map = getattr(dc_class, "field_map", None)
1106
+ if (
1107
+ not field_map
1108
+ and hasattr(dc_class, "config")
1109
+ and isinstance(dc_class.config, dict)
1110
+ ):
1111
+ field_map = dc_class.config.get("field_map")
656
1112
 
657
1113
  # 6. Check Drift
658
1114
  issues = check_drift(dc_class, db_cols, attribute_names, field_map=field_map)
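drift loads each generated module straight from its file path before comparing class attributes with the live schema. The importlib pattern it relies on, as a self-contained sketch (the example path and class name are hypothetical):

    import importlib.util
    from pathlib import Path

    def load_generated_class(path: Path, class_name: str):
        # Load a generated module from disk without requiring it to be on sys.path.
        spec = importlib.util.spec_from_file_location(f"dynamic_mod_{path.stem}", path)
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        return getattr(mod, class_name)

    # e.g. load_generated_class(Path("dataobjects/gencubes/common/orders_cubes.py"), "OrdersDc")

When no class_name is configured, the default follows the table name: "".join(w.capitalize() for w in table.split("_")) + "Dc".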
@@ -665,12 +1121,80 @@ def drift(
665
1121
  console.print(drift_table)
666
1122
 
667
1123
 
1124
+ @app.command()
1125
+ def propose_rules(
1126
+ config_file: Optional[Path] = typer.Option(None, "--config"),
1127
+ dry_run: bool = typer.Option(
1128
+ False, "--dry-run", help="Preview rules without saving"
1129
+ ),
1130
+ ):
1131
+ """
1132
+ Analyzes all_tables.yaml and proposes new discovery rules.
1133
+ """
1134
+ config_path = config_file or context.default_config
1135
+ if not config_path:
1136
+ console.print("[red]No config file specified and no default configured.[/red]")
1137
+ raise typer.Exit(code=1)
1138
+
1139
+ # Resolve paths via helper to ensure Project Root logic is applied
1140
+ resolved_config = _load_and_resolve_config(config_path)
1141
+
1142
+ # We rely on all_tables.yaml being generated by scan/discover
1143
+ # The resolved config will have absolute paths for these if the helper worked.
1144
+ params = context.params
1145
+
1146
+ # Prefer resolved values if available
1147
+ if (
1148
+ "discovery" in resolved_config
1149
+ and "all_tables_file" in resolved_config["discovery"]
1150
+ ):
1151
+ all_tables_path = Path(resolved_config["discovery"]["all_tables_file"])
1152
+ else:
1153
+ # Fallback to manual resolution (legacy or if not in discovery block)
1154
+ raw_val = (
1155
+ params.get("discovery", {}).get("all_tables_file")
1156
+ or params.get("all_tables_file")
1157
+ or "all_tables.yaml"
1158
+ )
1159
+ all_tables_path = config_path.parent / raw_val
1160
+
1161
+ # Rules File
1162
+ if "discovery" in resolved_config and "rules_file" in resolved_config["discovery"]:
1163
+ rules_path = Path(resolved_config["discovery"]["rules_file"])
1164
+ else:
1165
+ raw_rules = (
1166
+ params.get("discovery", {}).get("rules_file")
1167
+ or params.get("rules_file")
1168
+ or "discovery_rules.yaml"
1169
+ )
1170
+ rules_path = config_path.parent / raw_rules
1171
+
1172
+ if not all_tables_path.exists():
1173
+ console.print(
1174
+ f"[red]Error: {all_tables_path} not found. Run 'dc-scan' first.[/red]"
1175
+ )
1176
+ raise typer.Exit(code=1)
1177
+
1178
+ engine = RuleEngine(all_tables_path, rules_path)
1179
+ engine.load()
1180
+ updates = engine.propose_rules()
1181
+
1182
+ if dry_run:
1183
+ console.print("[bold yellow]Proposed Updates:[/]")
1184
+ for conn, rules in updates.items():
1185
+ console.print(f"[cyan]{conn}:[/]")
1186
+ for r in rules:
1187
+ console.print(f" - {r}")
1188
+ else:
1189
+ engine.save_proposal(updates)
668
1190
 
669
1191
 
670
1192
  @app.command()
671
1193
  def match(
672
1194
  config_file: Optional[Path] = typer.Option(None, "--config"),
673
- db_name: Optional[str] = typer.Option(None, "--db", help="Target specific database from params"),
1195
+ db_name: Optional[str] = typer.Option(
1196
+ None, "--db", help="Target specific database from params"
1197
+ ),
674
1198
  ) -> None:
675
1199
  """
676
1200
  Applies discovery rules to scanned tables and generates whitelists (registry).
@@ -682,23 +1206,23 @@ def match(
682
1206
  raise typer.Exit(code=1)
683
1207
 
684
1208
  import yaml
685
-
1209
+
686
1210
  # Load Params
687
- params_path = config_path.parent / "discovery_params.yaml" # Assuming relative location or loaded via context
1211
+ params_path = (
1212
+ config_path.parent / "discovery_params.yaml"
1213
+ ) # Assuming relative location or loaded via context
688
1214
  # Context should already have params if set_context_defaults ran, but to be safe/standalone:
689
1215
  params = context.params
690
1216
  databases = params.get("databases", [])
691
1217
  folder_prefix = params.get("folder_prefix", "solutions/dataobjects/gencubes/")
692
1218
  fields_suffix = params.get("fields_module_root", "fields")
693
1219
 
694
-
695
-
696
1220
  target_dbs = databases
697
1221
  if db_name:
698
1222
  target_dbs = [d for d in databases if d.get("name") == db_name]
699
1223
  if not target_dbs:
700
- console.print(f"[red]Database '{db_name}' not found.[/red]")
701
- raise typer.Exit(code=1)
1224
+ console.print(f"[red]Database '{db_name}' not found.[/red]")
1225
+ raise typer.Exit(code=1)
702
1226
 
703
1227
  for db in target_dbs:
704
1228
  name = db.get("name")
@@ -706,36 +1230,40 @@ def match(
706
1230
  rules_file = db.get("rules_file")
707
1231
  whitelist_file = db.get("whitelist_file")
708
1232
  conn_obj = db.get("connection_obj")
709
-
1233
+
710
1234
  # Path Composition
711
1235
  db_domain = db.get("db_domain")
712
1236
  import_base = Path(folder_prefix)
713
1237
  if db_domain:
714
1238
  import_base = import_base / db_domain
715
1239
  import_base = import_base / fields_suffix
716
-
1240
+
717
1241
  try:
718
1242
  import_base = import_base.relative_to(Path.cwd())
719
1243
  except ValueError:
720
1244
  pass
721
1245
  fields_module_base = str(import_base).replace("/", ".")
722
-
1246
+
723
1247
  if not (all_tables_file and rules_file and whitelist_file):
724
- console.print(f"[yellow]Skipping {name}: Missing file config.[/yellow]")
725
- continue
1248
+ console.print(f"[yellow]Skipping {name}: Missing file config.[/yellow]")
1249
+ continue
726
1250
 
727
1251
  tables_path = config_path.parent / all_tables_file
728
1252
  rules_path = config_path.parent / rules_file
729
1253
  out_path = config_path.parent / whitelist_file
730
-
1254
+
731
1255
  if not tables_path.exists():
732
- console.print(f"[red]Skipping {name}: {all_tables_file} not found. Run 'scan' first.[/red]")
1256
+ console.print(
1257
+ f"[red]Skipping {name}: {all_tables_file} not found. Run 'scan' first.[/red]"
1258
+ )
733
1259
  continue
734
-
1260
+
735
1261
  if not rules_path.exists():
736
- console.print(f"[yellow]Skipping {name}: Rules file {rules_file} not found.[/yellow]")
737
- continue
738
-
1262
+ console.print(
1263
+ f"[yellow]Skipping {name}: Rules file {rules_file} not found.[/yellow]"
1264
+ )
1265
+ continue
1266
+
739
1267
  # Load existing whitelist to preserve customizations
740
1268
  existing_whitelist = {}
741
1269
  if out_path.exists():
@@ -745,71 +1273,79 @@ def match(
745
1273
 
746
1274
  with open(tables_path, "r") as f:
747
1275
  all_tables = yaml.safe_load(f) or []
748
-
1276
+
749
1277
  with open(rules_path, "r") as f:
750
1278
  rules_data = yaml.safe_load(f) or []
751
-
1279
+
752
1280
  # Match Logic
753
- console.print(f"[bold cyan]Matching: {name} ({len(all_tables)} tables)[/bold cyan]")
754
-
1281
+ console.print(
1282
+ f"[bold cyan]Matching: {name} ({len(all_tables)} tables)[/bold cyan]"
1283
+ )
1284
+
755
1285
  matches = {}
756
1286
  matched_count = 0
757
-
1287
+
758
1288
  for table in sorted(all_tables):
759
1289
  # Find first matching rule
760
1290
  matched_rule = None
761
1291
  for r in rules_data:
762
1292
  pattern = r.get("pattern")
763
1293
  mtype = r.get("match_type", "exact")
764
-
1294
+
765
1295
  is_match = False
766
1296
  if mtype == "exact" and table == pattern:
767
1297
  is_match = True
768
1298
  elif mtype == "prefix" and table.startswith(pattern):
769
1299
  is_match = True
770
1300
  elif mtype == "regex":
771
- import re
772
- if re.search(pattern, table):
773
- is_match = True
774
-
1301
+ import re
1302
+
1303
+ if re.search(pattern, table):
1304
+ is_match = True
1305
+
775
1306
  if is_match:
776
1307
  matched_rule = r
777
1308
  break
778
-
1309
+
779
1310
  if matched_rule:
780
1311
  # Construct Registry Entry
781
1312
  # Resolve path using folder_prefix
782
1313
  template = matched_rule.get("output_template", f"{table}_cubes.py")
783
1314
  domain = matched_rule.get("domain", "common")
784
-
1315
+
785
1316
  # Path Construction: folder_prefix + db_domain + domain + output_template
786
1317
  db_domain = db.get("db_domain", "")
787
-
1318
+
788
1319
  # Careful not to double slash if db_domain is empty, but Path handles it.
789
1320
  # template should now be just filename per rule updates.
790
1321
  full_path_obj = Path(folder_prefix)
791
1322
  if db_domain:
792
1323
  full_path_obj = full_path_obj / db_domain
793
-
1324
+
794
1325
  full_path_obj = full_path_obj / domain / template
795
1326
  full_path = str(full_path_obj)
796
-
1327
+
797
1328
  # Class Name Generation
798
1329
  # Check if exists in old whitelist
799
1330
  existing_entry = existing_whitelist.get(table, {})
800
-
1331
+
801
1332
  custom_name = existing_entry.get("custom_name")
802
-
1333
+
803
1334
  if custom_name:
804
1335
  class_name = custom_name
805
1336
  elif existing_entry.get("class_name"):
806
1337
  class_name = existing_entry.get("class_name")
807
1338
  else:
808
1339
  class_suffix = params.get("class_suffix", "Dc")
809
- class_name = "".join(w.capitalize() for w in table.split("_")) + class_suffix
1340
+ class_name = (
1341
+ "".join(w.capitalize() for w in table.split("_")) + class_suffix
1342
+ )
1343
+
1344
+ field_map_template = matched_rule.get(
1345
+ "field_map_template",
1346
+ f"{fields_module_base}.{{domain}}.{{table}}.field_map",
1347
+ )
810
1348
 
811
- field_map_template = matched_rule.get("field_map_template", f"{fields_module_base}.{{domain}}.{{table}}.field_map")
812
-
813
1349
  # Construct defaults
814
1350
  entry = {
815
1351
  "path": full_path,
@@ -817,17 +1353,17 @@ def match(
817
1353
  "domain": domain,
818
1354
  "class_name": class_name,
819
1355
  # Ensure field_map is assigned
820
- "field_map": field_map_template.format(domain=domain, table=table)
1356
+ "field_map": field_map_template.format(domain=domain, table=table),
821
1357
  }
822
-
1358
+
823
1359
  # Preserve custom_name if present, else default to None
824
1360
  entry["custom_name"] = custom_name if custom_name else None
825
-
826
- # Preserve other fields if needed?
1361
+
1362
+ # Preserve other fields if needed?
827
1363
  # User asked specifically for keys on class_name preservation.
828
- # But generally we might want to respect other overrides?
1364
+ # But generally we might want to respect other overrides?
829
1365
  # For now, strict to class_name per request + generation logic.
830
-
1366
+
831
1367
  matches[table] = entry
832
1368
  matched_count += 1
833
1369
 
@@ -835,15 +1371,20 @@ def match(
835
1371
  output_data = {"tables": matches}
836
1372
  with open(out_path, "w") as f:
837
1373
  yaml.dump(output_data, f, sort_keys=False)
838
-
839
- console.print(f"[green]Matched {matched_count} tables. Written to {out_path}[/green]")
1374
+
1375
+ console.print(
1376
+ f"[green]Matched {matched_count} tables. Written to {out_path}[/green]"
1377
+ )
1378
+
840
1379
 
841
1380
  @app.command()
842
1381
  def map(
843
1382
  config_file: Optional[Path] = typer.Option(None, "--config"),
844
1383
  db_url_map: Optional[str] = typer.Option(None, "--db-urls"),
845
1384
  env_file: Optional[Path] = typer.Option(None, "--env-file", "-e"),
846
- db_name: Optional[str] = typer.Option(None, "--db", help="Target specific database from params"),
1385
+ db_name: Optional[str] = typer.Option(
1386
+ None, "--db", help="Target specific database from params"
1387
+ ),
847
1388
  force: bool = typer.Option(False, "--force", "-f"),
848
1389
  ) -> None:
849
1390
  """
@@ -855,50 +1396,66 @@ def map(
855
1396
  console.print("[red]No config file specified.[/red]")
856
1397
  raise typer.Exit(code=1)
857
1398
 
1399
+ # Env Load
858
1400
  # Env Load
859
1401
  if env_file:
860
1402
  env_path = env_file
861
- elif context.params and "env_file" in context.params:
862
- env_path = Path(context.params["env_file"])
1403
+ elif context.params and context.params.get("defaults", {}).get("env_file"):
1404
+ env_path = Path(context.params.get("defaults", {})["env_file"])
863
1405
  else:
864
1406
  env_path = Path(".env.linux")
865
1407
  load_environment(env_path, logger=console.print)
866
-
1408
+
867
1409
  import json
1410
+
868
1411
  cli_urls = json.loads(db_url_map) if db_url_map else {}
869
- registry = DatacubeRegistry({}, params=context.params) # Dummy reg for imports
1412
+ registry = DatacubeRegistry({}, params=context.params) # Dummy reg for imports
870
1413
 
871
1414
  def get_url_safe(conf_name, db_imp):
872
- if conf_name in cli_urls: return cli_urls[conf_name]
873
- imp = [db_imp] if db_imp else registry.global_imports
874
- return resolve_db_url(conf_name, imp)
1415
+ if conf_name in cli_urls:
1416
+ return cli_urls[conf_name]
1417
+ imp = [db_imp] if db_imp else registry.global_imports
1418
+ return resolve_db_url(conf_name, imp)
875
1419
 
876
1420
  params = context.params
877
1421
  databases = params.get("databases", [])
878
-
1422
+
879
1423
  target_dbs = databases
880
1424
  if db_name:
881
1425
  target_dbs = [d for d in databases if d.get("name") == db_name]
882
-
883
- _run_field_map_generation(context, config_path, target_dbs, get_url_safe, force=force)
1426
+
1427
+ _run_field_map_generation(
1428
+ context, config_path, target_dbs, get_url_safe, force=force
1429
+ )
884
1430
  return
1431
+
1432
+
885
1433
  @app.command()
886
1434
  def init(
887
1435
  config_file: Optional[Path] = typer.Option(None, "--config"),
888
- db_conf: Optional[str] = typer.Option(None, help="Config object to use for introspection"),
889
- db_name: Optional[str] = typer.Option(None, "--db", help="Target specific database from params"),
1436
+ db_conf: Optional[str] = typer.Option(
1437
+ None, help="Config object to use for introspection"
1438
+ ),
1439
+ db_name: Optional[str] = typer.Option(
1440
+ None, "--db", help="Target specific database from params"
1441
+ ),
890
1442
  db_url_map: Optional[str] = typer.Option(None, "--db-urls"),
891
1443
  env_file: Optional[Path] = typer.Option(None, "--env-file", "-e"),
892
- dump_schema: Optional[Path] = typer.Option(None, "--dump-schema", help="Dump database schema"),
893
- init_whitelist: Optional[Path] = typer.Option(None, "--init-whitelist", help="Initialize whitelist from DB tables"),
894
- init_rules: Optional[Path] = typer.Option(None, "--init-rules", help="Initialize discovery rules from DB tables"),
895
- reset: bool = typer.Option(False, "--reset", help="Reset registry and config to defaults"),
1444
+ dump_schema: Optional[Path] = typer.Option(
1445
+ None, "--dump-schema", help="Dump database schema"
1446
+ ),
1447
+ init_rules: Optional[Path] = typer.Option(
1448
+ None, "--init-rules", help="Initialize discovery rules from DB tables"
1449
+ ),
1450
+ reset: bool = typer.Option(
1451
+ False, "--reset", help="Reset registry and config to defaults"
1452
+ ),
896
1453
  ) -> None:
897
1454
  """
898
- Initializes configuration, schema dumps, and whitelists.
1455
+ Initializes configuration and schema dumps.
899
1456
  """
900
1457
  params = context.params
901
-
1458
+
902
1459
  # Determine Targets
903
1460
  databases = params.get("databases", [])
904
1461
  target_dbs = []
@@ -907,203 +1464,657 @@ def init(
907
1464
  # Filter specific DB
908
1465
  target_dbs = [d for d in databases if d.get("name") == db_name]
909
1466
  if not target_dbs:
910
- console.print(f"[red]Database '{db_name}' not found in params.[/red]")
911
- raise typer.Exit(code=1)
1467
+ console.print(f"[red]Database '{db_name}' not found in params.[/red]")
1468
+ raise typer.Exit(code=1)
912
1469
  elif databases:
913
1470
  # All DBs
914
1471
  target_dbs = databases
915
1472
  else:
916
1473
  # Legacy Fallback
917
- target_db_conf = db_conf or params.get("default_connection_obj", "replica_db_conf")
918
- target_dbs = [{"name": target_db_conf, "connection_obj": target_db_conf, "whitelist_file": "discovery_whitelist.yaml"}]
1474
+ target_db_conf = db_conf or params.get(
1475
+ "default_connection_obj", "replica_db_conf"
1476
+ )
1477
+ target_dbs = [
1478
+ {
1479
+ "name": target_db_conf,
1480
+ "connection_obj": target_db_conf,
1481
+ "whitelist_file": "discovery_whitelist.yaml",
1482
+ }
1483
+ ]
919
1484
 
920
1485
  # Validate db_conf override if provided (only if single target or legacy)
921
1486
  if db_conf and not db_name and not databases:
922
- target_dbs[0]["connection_obj"] = db_conf
1487
+ target_dbs[0]["connection_obj"] = db_conf
923
1488
 
924
1489
  # Resolve env_file: CLI > Params > Default
925
1490
  if env_file:
926
1491
  env_path = env_file
927
- elif context.params and "env_file" in context.params:
928
- env_path = Path(context.params["env_file"])
1492
+ elif context.params and context.params.get("defaults", {}).get("env_file"):
1493
+ env_path = Path(context.params.get("defaults", {})["env_file"])
929
1494
  else:
930
1495
  env_path = Path(".env.linux")
931
1496
  load_environment(env_path, logger=console.print)
932
1497
 
933
-
934
1498
  # Resolve Context (Registry/Config Paths)
935
1499
  # Be robust if config_file doesn't exist yet
936
1500
  config_path = config_file or context.default_config
937
-
1501
+
938
1502
  if not config_path:
939
1503
  console.print("[red]No config file target specified.[/red]")
940
1504
  raise typer.Exit(code=1)
941
1505
 
942
1506
  import json
1507
+
943
1508
  cli_urls = json.loads(db_url_map) if db_url_map else {}
944
1509
 
945
1510
  # Helper to resolve URL without a full registry instance if file missing
946
1511
  def resolve_url_safe(conf_name):
947
- if conf_name in cli_urls: return cli_urls[conf_name]
948
-
1512
+ if conf_name in cli_urls:
1513
+ return cli_urls[conf_name]
1514
+
949
1515
  # Check params first
950
1516
  for db in databases:
951
- if db.get("connection_obj") == conf_name:
952
- imp = db.get("global_import")
953
- if imp:
954
- url = resolve_db_url(conf_name, [imp])
955
- if url: return url
956
-
1517
+ if db.get("connection_obj") == conf_name:
1518
+ imp = db.get("global_import")
1519
+ if imp:
1520
+ url = resolve_db_url(conf_name, [imp])
1521
+ if url:
1522
+ return url
1523
+
957
1524
  # Try loading defaults if registry file exists
958
1525
  if config_path.exists():
959
- with open(config_path, 'r') as f:
1526
+ with open(config_path, "r") as f:
960
1527
  data = yaml.safe_load(f)
961
1528
  # Minimal registry just to get imports/resolution
962
1529
  reg = DatacubeRegistry(data, params=context.params)
963
1530
  url = resolve_db_url(conf_name, reg.global_imports)
964
- if url: return url
965
-
1531
+ if url:
1532
+ return url
1533
+
966
1534
  # Fallback: try raw resolve (might fail if imports missing)
967
- url = resolve_db_url(conf_name, [])
968
- if url: return url
969
- raise ValueError("Cannot resolve DB URL. Please ensure registry exists or use --db-urls.")
1535
+ url = resolve_db_url(conf_name, [])
1536
+ if url:
1537
+ return url
1538
+ raise ValueError(
1539
+ "Cannot resolve DB URL. Please ensure registry exists or use --db-urls."
1540
+ )
970
1541
 
971
1542
  # 1. Reset / Initialize Files (Global)
972
1543
  if reset:
973
1544
  if typer.confirm("Are you sure you want to reset the registry?"):
974
- # We rely on params being provided by the wrapper/context now.
975
- if not params:
976
- console.print("[yellow]Warning: No params loaded from context. Defaults may be minimal.[/yellow]")
977
-
978
- # Registry Default
979
- default_registry = {
980
- "global_imports": params.get("global_imports", []),
981
- "tables": {}
982
- }
983
- with open(config_path, "w") as f:
984
- yaml.dump(default_registry, f, sort_keys=False)
985
- console.print(f"[green]Reset {config_path}[/green]")
1545
+ # We rely on params being provided by the wrapper/context now.
1546
+ if not params:
1547
+ console.print(
1548
+ "[yellow]Warning: No params loaded from context. Defaults may be minimal.[/yellow]"
1549
+ )
1550
+
1551
+ # Registry Default
1552
+ default_registry = {
1553
+ "global_imports": params.get("global_imports", []),
1554
+ "tables": {},
1555
+ }
1556
+ with open(config_path, "w") as f:
1557
+ yaml.dump(default_registry, f, sort_keys=False)
1558
+ console.print(f"[green]Reset {config_path}[/green]")
986
1559
 
987
1560
  # Loop over targets for DB-specific actions
988
1561
  for db in target_dbs:
989
1562
  db_name = db.get("name")
990
1563
  conn_obj = db.get("connection_obj")
991
-
1564
+
992
1565
  try:
993
1566
  db_url = resolve_url_safe(conn_obj)
994
1567
  engine = sa.create_engine(db_url)
995
1568
  except Exception as e:
996
- console.print(f"[red]Skipping {db_name}: Cannot check DB connection ({e})[/red]")
1569
+ console.print(
1570
+ f"[red]Skipping {db_name}: Cannot check DB connection ({e})[/red]"
1571
+ )
997
1572
  continue
998
1573
 
999
1574
  # 2. Dump Schema
1000
1575
  if dump_schema:
1001
1576
  from sibi_flux.datacube.generator import dump_db_schema
1577
+
1002
1578
  console.print(f"[bold]Dumping schema for {db_name}...[/bold]")
1003
1579
  dump_db_schema(
1004
1580
  engine=engine,
1005
1581
  db_name=db_name,
1006
1582
  output_dir=dump_schema,
1007
- logger=console.print
1583
+ logger=console.print,
1008
1584
  )
1009
-
1010
- # 3. Initialize Whitelist
1011
- if init_whitelist:
1585
+
1586
+ # 3. Initialize Rules (Global/Merged?)
1587
+ if init_rules:
1012
1588
  insp = inspect(engine)
1013
1589
  tables = insp.get_table_names()
1014
-
1015
- # Determine path
1016
- if init_whitelist.name == "discovery_whitelist.yaml": # CLI Default value check?
1017
- # Actually Typer might pass the value even if default.
1018
- # If valid path provided, usage is ambiguous with multi-db.
1019
- # If explicit path provided, we write to IT. (Overwriting per loop? Bad).
1020
- # Convention: If explicit path provided, we assume single DB mode or user knows what they do.
1021
- # BUT here we want to use the config-defined whitelist file if available.
1022
-
1023
- wl_file = db.get("whitelist_file", f"discovery_whitelist_{db_name}.yaml")
1024
- target_path = config_path.parent / wl_file
1590
+
1591
+ target_path = init_rules
1592
+ if not target_path.is_absolute():
1593
+ # Default to Project Root anchoring for consistency
1594
+ try:
1595
+ project_root = config_path.parent.parent.parent
1596
+ except Exception:
1597
+ project_root = Path.cwd()
1598
+
1599
+ target_path = project_root / target_path
1600
+
1601
+ # Ensure parent dir exists
1602
+ if not target_path.parent.exists():
1603
+ target_path.parent.mkdir(parents=True, exist_ok=True)
1604
+
1605
+ # Load existing to append?
1606
+ existing_rules = []
1607
+ if target_path.exists():
1608
+ with open(target_path, "r") as f:
1609
+ existing_rules = yaml.safe_load(f) or []
1610
+
1611
+ console.print(f"Appending rules for {db_name} tables...")
1612
+
1613
+ new_rules = []
1614
+ for table in sorted(tables):
1615
+ # Check existence
1616
+ if any(r["pattern"] == table for r in existing_rules):
1617
+ continue
1618
+
1619
+ new_rules.append(
1620
+ {
1621
+ "pattern": table,
1622
+ "match_type": "exact",
1623
+ "domain": "common",
1624
+ "output_template": f"common/{table}_cubes.py",
1625
+ "db_conn_override": conn_obj,
1626
+ }
1627
+ )
1628
+
1629
+ all_rules = existing_rules + new_rules
1630
+
1631
+ with open(target_path, "w") as f:
1632
+ yaml.dump(all_rules, f, sort_keys=False)
1633
+ console.print(f"[green]Rules updated for {db_name}.[/green]")
1634
+
1635
+
1636
+ @app.command()
1637
+ def whitelist(
1638
+ config_file: Optional[Path] = typer.Option(None, "--config"),
1639
+ db_name: Optional[str] = typer.Option(
1640
+ None, "--db", help="Target specific database from params"
1641
+ ),
1642
+ db_url_map: Optional[str] = typer.Option(None, "--db-urls"),
1643
+ env_file: Optional[Path] = typer.Option(None, "--env-file", "-e"),
1644
+ force: bool = typer.Option(False, "--force", "-f"),
1645
+ ) -> None:
1646
+ """
1647
+ Generates whitelist files based on discovery rules and database schema.
1648
+ """
1649
+ params = context.params
1650
+ databases = params.get("databases", [])
1651
+
1652
+ # 1. Determine Targets
1653
+ target_dbs = []
1654
+ if db_name:
1655
+ target_dbs = [d for d in databases if d.get("name") == db_name]
1656
+ if not target_dbs:
1657
+ console.print(f"[red]Database '{db_name}' not found in params.[/red]")
1658
+ raise typer.Exit(code=1)
1659
+ elif databases:
1660
+ target_dbs = databases
1661
+ if not target_dbs:
1662
+ # Legacy
1663
+ target_db_conf = params.get("default_connection_obj", "replica_db_conf")
1664
+ target_dbs = [{"name": target_db_conf, "connection_obj": target_db_conf}]
1665
+
1666
+ # 2. Env Load
1667
+ if env_file:
1668
+ env_path = env_file
1669
+ elif context.params and context.params.get("defaults", {}).get("env_file"):
1670
+ env_path = Path(context.params.get("defaults", {})["env_file"])
1671
+ else:
1672
+ env_path = Path(".env.linux")
1673
+ load_environment(env_path, logger=console.print)
1674
+
1675
+ # 3. Config Path
1676
+ config_path = config_file or context.default_config
1677
+ if not config_path:
1678
+ console.print("[red]No config file target specified.[/red]")
1679
+ raise typer.Exit(code=1)
1680
+
1681
+ import json
1682
+
1683
+ cli_urls = json.loads(db_url_map) if db_url_map else {}
1684
+
1685
+ config_data = _load_and_resolve_config(
1686
+ config_path
1687
+ ) # Since we need rules/paths resolved
1688
+
1689
+ registry = DatacubeRegistry(config_data, params=context.params)
1690
+
1691
+ def resolve_url_safe(conf_name, db_imp):
1692
+ if conf_name in cli_urls:
1693
+ return cli_urls[conf_name]
1694
+ imp = [db_imp] if db_imp else registry.global_imports
1695
+ return resolve_db_url(conf_name, imp)
1696
+
1697
+ # 4. Iterate and Generate
1698
+ for db in target_dbs:
1699
+ db_name = db.get("name")
1700
+ conn_obj = db.get("connection_obj")
1701
+
1702
+ try:
1703
+ db_url = resolve_url_safe(conn_obj, db.get("global_import"))
1704
+ engine = sa.create_engine(db_url)
1705
+ except Exception as e:
1706
+ console.print(
1707
+ f"[red]Skipping {db_name}: Cannot check DB connection ({e})[/red]"
1708
+ )
1709
+ continue
1710
+
1711
+ insp = inspect(engine)
1712
+ tables = insp.get_table_names()
1713
+
1714
+ # Determine path (Config Driven)
1715
+ discovery_cfg = params.get("paths", {}).get("discovery") or params.get(
1716
+ "discovery", {}
1717
+ )
1718
+ wl_file = (
1719
+ discovery_cfg.get("whitelist_file")
1720
+ or params.get("whitelist_file")
1721
+ or "whitelist.yaml"
1722
+ )
1723
+ target_path = Path(wl_file)
1724
+ if not target_path.is_absolute():
1725
+ try:
1726
+ project_root = config_path.parent.parent.parent
1727
+ except Exception:
1728
+ project_root = Path.cwd()
1729
+ target_path = project_root / target_path
1730
+
1731
+ # Load Rules
1732
+ discovery_cfg = params.get("paths", {}).get("discovery") or params.get(
1733
+ "discovery", {}
1734
+ )
1735
+ rules_file = (
1736
+ discovery_cfg.get("rules_file")
1737
+ or params.get("rules_file")
1738
+ or "discovery_rules.yaml"
1739
+ )
1740
+ if Path(rules_file).is_absolute():
1741
+ rules_path = Path(rules_file)
1742
+ else:
1743
+ try:
1744
+ prj_root = config_path.parent.parent.parent
1745
+ except Exception:
1746
+ prj_root = Path.cwd()
1747
+ rules_path = prj_root / rules_file
1748
+
1749
+ filtered_tables = {} # Default to ALL if no rules? No, tables is list.
1750
+ # If no rules, we might want to default to empty dicts for all tables?
1751
+ # Let's keep logic: if rules exist, filter.
1752
+
1753
+ # 4. Use ConfigurationEngine for logic
1754
+ # Initialize engine with the resolved rules file
1755
+ from sibi_flux.datacube.config_engine import ConfigurationEngine
1756
+
1757
+ # We need to construct a lightweight 'params' dict or use existing context.params
1758
+ # But we need to ensure 'engine' uses the correct 'rules_path' for THIS connection.
1759
+
1760
+ # NOTE: ConfigEngine takes 'params' and 'rules_path'.
1761
+ # It handles scoped dictionary rules (via 'context_key') correctly.
1762
+
1763
+ eng = ConfigurationEngine(
1764
+ context.params, rules_path=str(rules_path), context_key=conn_obj
1765
+ )
1766
+
1767
+ filtered_tables = {}
1768
+
1769
+ if rules_path.exists():
1770
+ for t in tables:
1771
+ # Resolve using engine (handles prefix, regex, template logic)
1772
+ # Pass mocked db_config for domain resolution logic inside engine
1773
+ mock_db_config = {
1774
+ "db_domain": db.get("db_domain", "common"),
1775
+ "connection_obj": conn_obj,
1776
+ }
1777
+
1778
+ res = eng.resolve_table(t, db_config=mock_db_config)
1779
+
1780
+ if res:
1781
+ # Calculate relative paths
1782
+ db_dom = db.get("db_domain")
1783
+
1784
+ # 1. Datacube Path Explicit
1785
+ # Structure: datacubes_dir / [db_domain] / domain / template
1786
+
1787
+ # datacubes_dir resolved from params
1788
+ dc_root_dir = (
1789
+ context.params.get("paths", {})
1790
+ .get("target", {})
1791
+ .get("datacubes_dir", "dataobjects/gencubes")
1792
+ )
1793
+
1794
+ dc_base = Path(dc_root_dir)
1795
+ if db_dom:
1796
+ dc_base = dc_base / db_dom
1797
+
1798
+ # Rule might provide 'output_template' (e.g. 'asm_cubes.py')
1799
+ # We assume template is just the filename now per robust rules
1800
+ template_name = res.get("output_template", Path(res["path"]).name)
1801
+
1802
+ full_dc_path = dc_base / res["domain"] / template_name
1803
+
1804
+ # 2. Field Map Path Explicit
1805
+ # Structure: field_maps_dir / [db_domain] / domain / table.py
1806
+
1807
+ fields_root_dir = (
1808
+ context.params.get("paths", {})
1809
+ .get("target", {})
1810
+ .get("field_maps_dir", "dataobjects/fields")
1811
+ )
1812
+
1813
+ fm_base = Path(fields_root_dir)
1814
+ if db_dom:
1815
+ fm_base = fm_base / db_dom
1816
+
1817
+ fm_path_full = fm_base / res["domain"] / f"{t}.py"
1818
+
1819
+ # Relativize logic
1820
+ def _safe_rel(p):
1821
+ try:
1822
+ pp = Path(p)
1823
+ # Heuristic for project root if not clear
1824
+ root = (
1825
+ config_path.parent.parent.parent
1826
+ if config_path.parent.name == "datacubes"
1827
+ else Path.cwd()
1828
+ )
1829
+ if pp.is_absolute():
1830
+ try:
1831
+ return pp.relative_to(root)
1832
+ except ValueError:
1833
+ return pp.resolve().relative_to(root.resolve())
1834
+ return pp
1835
+ except Exception:
1836
+ return Path(p)
1837
+
1838
+ rel_dc_path = _safe_rel(full_dc_path)
1839
+ rel_fm_path = _safe_rel(fm_path_full)
1840
+
1841
+ filtered_tables[t] = {
1842
+ "domain": res["domain"],
1843
+ "output_template": (
1844
+ Path(res["path"]).name
1845
+ if "output_template" not in res
1846
+ else res.get("output_template", Path(res["path"]).name)
1847
+ ),
1848
+ "datacube_path": str(rel_dc_path),
1849
+ "field_map_path": str(rel_fm_path),
1850
+ }
1851
+ # Recover template logic: if rule-based, template was used.
1852
+ # But since we have the path, template is secondary.
1853
+ # We'll stick to what we have.
1854
+
1855
+ else:
1856
+ console.print(
1857
+ f"[yellow]No rules found for {conn_obj}. Whitelisting ALL tables (No Paths Calculated).[/yellow]"
1858
+ )
1859
+ filtered_tables = {t: {} for t in tables}
1860
+
1861
+ # 5. Load Current Whitelist
1862
+ current_wl = {}
1863
+ if target_path.exists():
1864
+ try:
1865
+ with open(target_path, "r") as rf:
1866
+ current_wl = yaml.safe_load(rf) or {}
1867
+ except:
1868
+ pass
1869
+
1870
+ # Existing whitelist for this connection
1871
+ existing_tables_map = {}
1872
+ if conn_obj in current_wl:
1873
+ raw_wl = current_wl[conn_obj]
1874
+ if isinstance(raw_wl, list):
1875
+ # Upgrade legacy list to dict
1876
+ existing_tables_map = {t: {} for t in raw_wl}
1025
1877
  else:
1026
- target_path = init_whitelist
1027
- if not target_path.is_absolute():
1028
- target_path = config_path.parent / target_path
1029
-
1030
- console.print(f"Initializing whitelist at {target_path} for {db_name} with {len(tables)} tables...")
1031
- dump_data = sorted(tables)
1032
- with open(target_path, 'w') as f:
1033
- yaml.dump({'tables': dump_data}, f, sort_keys=False)
1034
- console.print(f"[green]whitelist initialized for {db_name}.[/green]")
1035
-
1036
- # 4. Initialize Rules (Global/Merged?)
1037
- # Rules are usually global. Initializing from ONE db might miss others,
1038
- # or overwriting rules file repeatedly.
1039
- # We'll skip complex merging for now and just append or warn.
1040
- if init_rules:
1041
- insp = inspect(engine)
1042
- tables = insp.get_table_names()
1043
-
1044
- target_path = init_rules
1045
- if not target_path.is_absolute():
1046
- target_path = config_path.parent / target_path
1047
-
1048
- # Load existing to append?
1049
- existing_rules = []
1050
- if target_path.exists():
1051
- with open(target_path, 'r') as f:
1052
- existing_rules = yaml.safe_load(f) or []
1053
-
1054
- console.print(f"Appending rules for {db_name} tables...")
1055
-
1056
- new_rules = []
1057
- for table in sorted(tables):
1058
- # Check existence
1059
- if any(r['pattern'] == table for r in existing_rules):
1060
- continue
1061
-
1062
- new_rules.append({
1063
- "pattern": table,
1064
- "match_type": "exact",
1065
- "domain": "common",
1066
- "output_template": f"common/{table}_cubes.py",
1067
- "db_conn_override": conn_obj
1068
- })
1069
-
1070
- all_rules = existing_rules + new_rules
1071
-
1072
- with open(target_path, 'w') as f:
1073
- yaml.dump(all_rules, f, sort_keys=False)
1074
- console.print(f"[green]Rules updated for {db_name}.[/green]")
1075
-
1076
-
1077
-
1078
- def _run_field_map_generation(context, config_path, target_dbs, url_resolver, force=False):
1878
+ existing_tables_map = raw_wl.get("tables", {})
1879
+
1880
+ # Calculate sets
1881
+ current_table_names = set(existing_tables_map.keys())
1882
+ new_table_names = set(filtered_tables.keys())
1883
+
1884
+ # Sync Logic:
1885
+ # 1. Start with intersection (preserve config, but update rule-based defaults if missing?)
1886
+ # We want to keep manual overrides in existing map, but maybe refresh defaults?
1887
+ retained = current_table_names.intersection(new_table_names)
1888
+
1889
+ # 2. Add new filtered tables (additions)
1890
+ added = new_table_names - current_table_names
1891
+
1892
+ # 3. Removed (in current but not in filtered)
1893
+ removed = current_table_names - new_table_names
1894
+
1895
+ # Construct new table map
1896
+ new_table_map = {}
1897
+
1898
+ # 1. Retained: Merge existing with new rule metadata
1899
+ # Priority: Existing (Manual) > New (Rule)
1900
+ # BUT: Enforce calculated paths to avoid stale absolute paths
1901
+ for t in retained:
1902
+ existing_meta = existing_tables_map[t]
1903
+ rule_meta = filtered_tables[t]
1904
+ # Merge: update rule defaults only if not set in existing
1905
+ merged = rule_meta.copy()
1906
+ merged.update(existing_meta) # Existing overwrites rule
1907
+
1908
+ # Restore calculated paths (Enforce Relative)
1909
+ if "datacube_path" in rule_meta:
1910
+ merged["datacube_path"] = rule_meta["datacube_path"]
1911
+ if "field_map_path" in rule_meta:
1912
+ merged["field_map_path"] = rule_meta["field_map_path"]
1913
+
1914
+ new_table_map[t] = merged
1915
+
1916
+ # 2. Add new
1917
+ for t in added:
1918
+ new_table_map[t] = filtered_tables[t]
1919
+
1920
+ # 3. Removed are omitted
1921
+
1922
+ # Update structure
1923
+ # Inject Global Paths
1924
+ paths_cfg = params.get("paths", {}).get("target", {})
1925
+
1926
+ # Helper to ensure relative paths
1927
+ def _to_rel(p):
1928
+ if not p:
1929
+ return p
1930
+ try:
1931
+ pp = Path(p)
1932
+ # Heuristic for project root if not clear
1933
+ root = (
1934
+ config_path.parent.parent.parent
1935
+ if config_path.parent.name == "datacubes"
1936
+ else Path.cwd()
1937
+ )
1938
+ if pp.is_absolute():
1939
+ try:
1940
+ return str(pp.relative_to(root))
1941
+ except ValueError:
1942
+ return str(pp.resolve().relative_to(root.resolve()))
1943
+ return p
1944
+ except Exception:
1945
+ pass
1946
+ return p
1947
+
1948
+ dc_dir = _to_rel(paths_cfg.get("datacubes_dir", "dataobjects/gencubes/"))
1949
+ fm_dir = _to_rel(
1950
+ paths_cfg.get("field_maps_dir", "dataobjects/gencubes/fields/")
1951
+ )
1952
+
1953
+ # Get db_domain from config (fallback to db_name if missing)
1954
+ db_domain = db.get("db_domain") or db.get("name")
1955
+
1956
+ rich_structure = {
1957
+ "db_domain": db_domain,
1958
+ "datacubes_dir": dc_dir,
1959
+ "field_maps_dir": fm_dir,
1960
+ "tables": dict(sorted(new_table_map.items())), # Sort keys for stability
1961
+ }
1962
+
1963
+ current_wl[conn_obj] = rich_structure
1964
+
1965
+ with open(target_path, "w") as f:
1966
+ yaml.dump(current_wl, f, sort_keys=False)
1967
+
1968
+ # Report
1969
+ console.print(f"[bold underline]Sync Report for {db_name}[/bold underline]")
1970
+ if added:
1971
+ console.print(
1972
+ f"[green] + Added {len(added)} tables:[/green] {', '.join(sorted(list(added))[:5])}{'...' if len(added)>5 else ''}"
1973
+ )
1974
+ if removed:
1975
+ console.print(
1976
+ f"[red] - Removed {len(removed)} tables:[/red] {', '.join(sorted(list(removed))[:5])}{'...' if len(removed)>5 else ''}"
1977
+ )
1978
+ if not added and not removed:
1979
+ console.print("[dim] No changes.[/dim]")
1980
+
1981
+ # Count keys directly
1982
+ final_count = len(rich_structure["tables"])
1983
+ console.print(f"[blue] Total Whitelisted: {final_count}[/blue]")
1984
+
1985
+
1986
+ def _run_field_map_generation(
1987
+ context, config_path, target_dbs, url_resolver, force=False
1988
+ ):
1079
1989
  """Shared logic for generating field map files."""
1080
1990
  params = context.params
1081
- folder_prefix = params.get("folder_prefix", "solutions/dataobjects/gencubes/")
1082
- fields_suffix = params.get("fields_module_root", "fields")
1083
-
1991
+ # Logic: modern nested > legacy flat > default
1992
+ folder_prefix = (
1993
+ params.get("paths", {}).get("target", {}).get("datacubes_dir")
1994
+ or params.get("folder_prefix")
1995
+ or "solutions/dataobjects/gencubes/"
1996
+ )
1997
+
1998
+ # Priority: explicit 'field_maps_dir' > legacy suffix construction
1999
+ configured_fm_dir = params.get("paths", {}).get("target", {}).get("field_maps_dir")
2000
+
2001
+ if configured_fm_dir:
2002
+ # If fm_dir is provided, it is the root for fields.
2003
+ # We don't append suffix to it unless it's just a root base?
2004
+ # Usually 'field_maps_dir' is the full relative path e.g. 'dataobjects/fields'
2005
+ fields_root_path_base = Path(configured_fm_dir)
2006
+ use_legacy_construction = False
2007
+ else:
2008
+ fields_suffix = (
2009
+ params.get("generation", {}).get("fields_subpackage")
2010
+ or params.get("fields_module_root")
2011
+ or "fields"
2012
+ )
2013
+ fields_root_path_base = Path(folder_prefix) / fields_suffix
2014
+ use_legacy_construction = True
2015
+
1084
2016
  from sibi_flux.datacube.field_mapper import FieldTranslationManager
1085
-
2017
+
2018
+ # Initialize Manager & Load Global Repo
1086
2019
  # Initialize Manager & Load Global Repo
1087
- repo_rel_path = context.params.get("global_repo_path", "solutions/conf/global_field_repository.yaml")
1088
- global_repo_path = Path(repo_rel_path).resolve()
1089
-
2020
+ repo_rel_path = (
2021
+ params.get("paths", {})
2022
+ .get("repositories", {})
2023
+ .get("global_field_repository_file")
2024
+ or params.get("global_repo_path")
2025
+ or "solutions/conf/global_field_repository.yaml"
2026
+ )
2027
+
2028
+ # Resolve relative to project root if needed
2029
+ if Path(repo_rel_path).is_absolute():
2030
+ global_repo_path = Path(repo_rel_path)
2031
+ else:
2032
+ try:
2033
+ # Heuristic: config is in generators/datacubes, project root is 3 levels up
2034
+ # But better to check if config_path is passed
2035
+ project_root = config_path.parent.parent.parent
2036
+ except Exception:
2037
+ project_root = Path.cwd()
2038
+ global_repo_path = (project_root / repo_rel_path).resolve()
2039
+
1090
2040
  manager = FieldTranslationManager()
1091
-
2041
+
2042
+ # 1. Load Repository (Definitions)
1092
2043
  if global_repo_path.exists():
1093
2044
  with open(global_repo_path, "r") as f:
1094
2045
  repo_data = yaml.safe_load(f) or []
1095
2046
  manager.load_from_list(repo_data)
1096
- console.print(f"[green]Loaded {len(manager.fields)} fields from Global Repository.[/green]")
1097
- else:
1098
- console.print(f"[dim]Global Repository not found at {global_repo_path}, creating new...[/dim]")
2047
+ console.print(
2048
+ f"[green]Loaded {len(manager.fields)} fields from Global Repository.[/green]"
2049
+ )
2050
+
2051
+ # 2. Load Translations (Overrides)
2052
+ trans_rel_path = params.get("paths", {}).get("repositories", {}).get(
2053
+ "global_field_translations_file"
2054
+ ) or params.get("global_field_translations_file")
2055
+ if trans_rel_path:
2056
+ if Path(trans_rel_path).is_absolute():
2057
+ trans_path = Path(trans_rel_path)
2058
+ else:
2059
+ trans_path = (project_root / trans_rel_path).resolve()
2060
+
2061
+ if trans_path.exists():
2062
+ with open(trans_path, "r") as f:
2063
+ trans_data = yaml.safe_load(f) or []
2064
+ manager.load_from_list(trans_data)
2065
+ console.print(
2066
+ f"[green]Loaded translations from {trans_rel_path}.[/green]"
2067
+ )
2068
+
2069
+ # --- GLOBAL CLEAN BUILD ---
2070
+ if force:
2071
+ # Determine global field maps root to wipe
2072
+ # Logic matches default resolution used later
2073
+ tgt = params.get("paths", {}).get("target", {})
2074
+ fm_dir = tgt.get("field_maps_dir")
2075
+
2076
+ # If not set in params, check if we can infer from default
2077
+ # But wait, whitelist overrides this per DB.
2078
+ # User said "fields folder is exclusive".
2079
+ # So we should wipe the configured 'field_maps_dir' from global params.
2080
+
2081
+ if fm_dir:
2082
+ if Path(fm_dir).is_absolute():
2083
+ abs_fm_dir = Path(fm_dir)
2084
+ else:
2085
+ abs_fm_dir = (project_root / fm_dir).resolve()
2086
+
2087
+ if abs_fm_dir.exists():
2088
+ console.print(
2089
+ f"[bold red]Global Clean: Removing entire fields directory {abs_fm_dir}[/bold red]"
2090
+ )
2091
+ try:
2092
+ shutil.rmtree(abs_fm_dir)
2093
+ abs_fm_dir.mkdir(parents=True, exist_ok=True)
2094
+ (abs_fm_dir / "__init__.py").touch()
2095
+ except Exception as e:
2096
+ console.print(f"[red]Failed to clean global fields dir: {e}[/red]")
2097
+ else:
2098
+ console.print(
2099
+ "[yellow]Warning: Could not determine global field_maps_dir for clean build.[/yellow]"
2100
+ )
1099
2101
 
1100
2102
  for db in target_dbs:
2103
+ # console.print(f"DEBUG: Processing DB entry: {db} (Type: {type(db)})")
2104
+ if isinstance(db, str):
2105
+ console.print(
2106
+ f"[red]Error: Database entry is a string '{db}', expected dict. Check config.[/red]"
2107
+ )
2108
+ continue
2109
+
1101
2110
  name = db.get("id") or db.get("name")
1102
2111
  conn_obj = db.get("connection_ref") or db.get("connection_obj")
1103
-
1104
- if not db.get("enable_field_map_generation", True) and not params.get("generation", {}).get("enable_field_maps", True):
1105
- console.print(f"[dim]Skipping {name}: Field map generation disabled.[/dim]")
1106
- continue
2112
+
2113
+ if not db.get("enable_field_map_generation", True) and not params.get(
2114
+ "generation", {}
2115
+ ).get("enable_field_maps", True):
2116
+ console.print(f"[dim]Skipping {name}: Field map generation disabled.[/dim]")
2117
+ continue
1107
2118
 
1108
2119
  # Language Settings
1109
2120
  source_lang = db.get("db_source_lang", "es")
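The new `whitelist` command introduced above writes one block per connection (db_domain, datacubes_dir, field_maps_dir and a tables map) and reconciles it against the previous file: tables still matched by the rules keep their manual metadata, newly matched tables are added, and everything else is dropped, with the freshly calculated relative paths always re-applied. A compact sketch of that sync step, assuming the same entry keys as the diff (the helper name and sample tables are illustrative):

    def sync_tables(existing, discovered):
        retained = existing.keys() & discovered.keys()
        added = discovered.keys() - existing.keys()
        removed = existing.keys() - discovered.keys()

        merged = {}
        for t in retained:
            entry = {**discovered[t], **existing[t]}          # existing (manual) values win...
            for key in ("datacube_path", "field_map_path"):   # ...except the calculated paths
                if key in discovered[t]:
                    entry[key] = discovered[t][key]
            merged[t] = entry
        for t in added:
            merged[t] = dict(discovered[t])

        # Removed tables are simply omitted; keys are sorted for stable YAML output.
        return dict(sorted(merged.items())), added, removed

    existing = {"orders": {"custom_name": "OrdersCube", "datacube_path": "/old/abs/orders_cubes.py"}}
    discovered = {
        "orders": {"domain": "sales", "datacube_path": "dataobjects/gencubes/sales/orders_cubes.py"},
        "clients": {"domain": "crm", "datacube_path": "dataobjects/gencubes/crm/clients_cubes.py"},
    }
    tables, added, removed = sync_tables(existing, discovered)
    # "orders" keeps custom_name but takes the recalculated datacube_path; "clients" is added.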
@@ -1111,96 +2122,298 @@ def _run_field_map_generation(context, config_path, target_dbs, url_resolver, fo
1111
2122
 
1112
2123
  # Path Composition
1113
2124
  db_domain = db.get("db_domain")
1114
- fields_root_path = Path(folder_prefix)
1115
- if db_domain:
1116
- fields_root_path = fields_root_path / db_domain
1117
- fields_root_path = fields_root_path / fields_suffix
1118
-
1119
-
2125
+
2126
+ if use_legacy_construction:
2127
+ fields_root_path = Path(folder_prefix)
2128
+ if db_domain:
2129
+ fields_root_path = fields_root_path / db_domain
2130
+ fields_root_path = fields_root_path / fields_suffix
2131
+ else:
2132
+ # Modern Logic: configured_fm_dir is root
2133
+ fields_root_path = fields_root_path_base
2134
+ if db_domain:
2135
+ fields_root_path = fields_root_path / db_domain
2136
+
2137
+ # Resolve global whitelist path
1120
2138
  # Resolve global whitelist path
1121
2139
  # Try nested discovery param first, then legacy flat
2140
+ discovery_cfg = params.get("paths", {}).get("discovery") or params.get(
2141
+ "discovery", {}
2142
+ )
1122
2143
  global_whitelist_file = (
1123
- params.get("discovery", {}).get("whitelist_file") or
1124
- params.get("whitelist_file") or
1125
- "whitelist.yaml"
2144
+ discovery_cfg.get("whitelist_file")
2145
+ or params.get("whitelist_file")
2146
+ or "whitelist.yaml"
1126
2147
  )
1127
-
2148
+
2149
+ # If absolute, use directly (handled by gen_dc.py resolution), else resolve
1128
2150
  # If absolute, use directly (handled by gen_dc.py resolution), else resolve
1129
2151
  if Path(global_whitelist_file).is_absolute():
1130
2152
  wl_path = Path(global_whitelist_file)
1131
2153
  else:
1132
- wl_path = config_path.parent / global_whitelist_file
1133
-
2154
+ # Use Project Root anchoring
2155
+ try:
2156
+ # Heuristic: config is in generators/datacubes
2157
+ project_root = config_path.parent.parent.parent
2158
+ except Exception:
2159
+ project_root = Path.cwd()
2160
+ wl_path = project_root / global_whitelist_file
2161
+
1134
2162
  if not wl_path.exists():
1135
- console.print(f"[yellow]Skipping {name}: Whitelist {global_whitelist_file} not found.[/yellow]")
2163
+ console.print(
2164
+ f"[yellow]Skipping {name}: Whitelist {global_whitelist_file} not found.[/yellow]"
2165
+ )
1136
2166
  continue
1137
-
2167
+
1138
2168
  with open(wl_path, "r") as f:
1139
2169
  registry_data = yaml.safe_load(f) or {}
1140
2170
 
1141
- # Default to empty dict if key not found
2171
+ # Load Rules for Domain Inference
2172
+ rules_filename = db.get("rules_file")
2173
+ if not rules_filename:
2174
+ # Check deep keys
2175
+ disc = params.get("paths", {}).get("discovery", {})
2176
+ # console.print(f"DEBUG: Discovery Block: {disc}")
2177
+ rules_filename = disc.get("rules_file") or params.get("discovery", {}).get(
2178
+ "rules_file"
2179
+ )
2180
+
2181
+ if not rules_filename:
2182
+ console.print(
2183
+ f"DEBUG: Fallback for {name}. Params keys: {disc.keys() if 'disc' in locals() else 'N/A'}"
2184
+ )
2185
+ rules_filename = f"discovery_rules_{conn_obj}.yaml"
2186
+
2187
+ # Resolve path relative to project root properly
2188
+ if Path(rules_filename).is_absolute():
2189
+ rules_path = Path(rules_filename)
2190
+ else:
2191
+ try:
2192
+ # Heuristic: config is in generators/datacubes
2193
+ # Reuse project_root calculation from above
2194
+ if "project_root" not in locals():
2195
+ try:
2196
+ project_root = config_path.parent.parent.parent
2197
+ except Exception:
2198
+ project_root = Path.cwd()
2199
+
2200
+ console.print(
2201
+ f"DEBUG: ConfigRoot={config_path.parent}, ProjRoot={project_root}, RulesFile={rules_filename}"
2202
+ )
2203
+ rules_path = project_root / rules_filename
2204
+ except Exception:
2205
+ rules_path = config_path.parent / rules_filename
2206
+
2207
+ rules = []
2208
+ # console.print(f"DEBUG: Checking rules path: {rules_path}")
2209
+ if rules_path.exists():
2210
+ with open(rules_path, "r") as f:
2211
+ rules = yaml.safe_load(f) or []
2212
+
2213
+ # PROPERLY HANDLE DICT vs LIST RULES
2214
+ if isinstance(rules, dict):
2215
+ # Try to find specific rules for this connection
2216
+ candidates = [conn_obj, name]
2217
+ found = False
2218
+ for key in candidates:
2219
+ if key in rules:
2220
+ rules = rules[key]
2221
+ found = True
2222
+ break
2223
+ if not found:
2224
+ rules = []
2225
+
2226
+ # console.print(f"DEBUG: Loaded {len(rules)} rules from {rules_path} for {conn_obj}")
2227
+
2228
+ # console.print(f"DEBUG: Loaded {len(rules)} rules from {rules_path}")
2229
+
2230
+ # Support List or Dict Format
1142
2231
  scoped_data = registry_data.get(conn_obj, {})
1143
- tables = scoped_data.get("tables", {})
2232
+ if isinstance(scoped_data, list):
2233
+ tables = {t: {} for t in scoped_data}
2234
+ else:
2235
+ tables = scoped_data.get("tables", {})
2236
+
1144
2237
  if not tables:
1145
- console.print(f"[dim]No tables in whitelist for {name}.[/dim]")
1146
- continue
2238
+ console.print(f"[dim]No tables in whitelist for {name}.[/dim]")
2239
+ continue
2240
+
2241
+ # Override paths from whitelist if present
2242
+ # This allows whitelist to drive output location efficiently
2243
+ # Retrieve db_domain from whitelist entry (preferred) or registry config
2244
+ # Hoisted from loop for Clean Build capability
2245
+ wl_entry = registry_data.get(conn_obj, {})
2246
+ db_domain = wl_entry.get("db_domain") or db.get("db_domain") or db.get("name")
2247
+
2248
+ wl_fields_dir = wl_entry.get("field_maps_dir")
2249
+ if wl_fields_dir:
2250
+ # Resolve relative to project root
2251
+ if Path(wl_fields_dir).is_absolute():
2252
+ fields_root_path = Path(wl_fields_dir)
2253
+ else:
2254
+ # Use cached or recalculated project_root
2255
+ if "project_root" not in locals():
2256
+ try:
2257
+ project_root = config_path.parent.parent.parent
2258
+ except Exception:
2259
+ project_root = Path.cwd()
2260
+ fields_root_path = project_root / wl_fields_dir
2261
+ else:
2262
+ # Fallback to global setting (calculated earlier, but we need to re-calc if not in scope or just reuse?)
2263
+
2264
+ # Logic: modern nested > legacy flat > default
2265
+ folder_prefix = (
2266
+ params.get("paths", {}).get("target", {}).get("datacubes_dir")
2267
+ or params.get("folder_prefix")
2268
+ or "solutions/dataobjects/gencubes/"
2269
+ )
2270
+ fields_suffix = (
2271
+ params.get("generation", {}).get("fields_subpackage")
2272
+ or params.get("fields_module_root")
2273
+ or "fields"
2274
+ )
2275
+ # Resolve
2276
+ if not Path(folder_prefix).is_absolute():
2277
+ try:
2278
+ prj_root = config_path.parent.parent.parent
2279
+ except Exception:
2280
+ prj_root = Path.cwd()
2281
+ folder_prefix = str(prj_root / folder_prefix)
2282
+
2283
+ # Construct
2284
+ fields_root_path = fields_root_path_base
2285
+
2286
+ # Ensure Parent Chain has __init__.py (Backtrack to root)
2287
+ try:
2288
+ # Make sure fields_root_path itself exists
2289
+ fields_root_path.mkdir(parents=True, exist_ok=True)
2290
+ if not (fields_root_path / "__init__.py").exists():
2291
+ (fields_root_path / "__init__.py").touch()
2292
+
2293
+ curr = fields_root_path.parent
2294
+ # Stop at project root (.) or root (/)
2295
+ while str(curr) != "." and str(curr) != "/" and len(curr.parts) > 0:
2296
+ if not (curr / "__init__.py").exists():
2297
+ (curr / "__init__.py").touch()
2298
+ curr = curr.parent
2299
+ except Exception:
2300
+ pass
2301
+
2302
+ # This is safe because db_domain is specific to this connection.
2303
+ db_target_root = fields_root_path / db_domain
2304
+
2305
+ # Ensure Root Exists
2306
+ db_target_root.mkdir(parents=True, exist_ok=True)
2307
+ if not (db_target_root / "__init__.py").exists():
2308
+ (db_target_root / "__init__.py").touch()
2309
+
2310
+ console.print(
2311
+ f"[bold cyan]Mapping fields for {name} ({len(tables)} tables) [Src:{source_lang} -> Tgt:{target_lang}]...[/bold cyan]"
2312
+ )
1147
2313
 
1148
- console.print(f"[bold cyan]Mapping fields for {name} ({len(tables)} tables) [Src:{source_lang} -> Tgt:{target_lang}]...[/bold cyan]")
1149
-
1150
2314
  try:
1151
2315
  db_url = url_resolver(conn_obj, db.get("global_import"))
1152
2316
  engine = sa.create_engine(db_url)
1153
2317
  inspector = inspect(engine)
1154
-
2318
+
1155
2319
  for table_name, details in tables.items():
1156
- if not details.get("field_map"):
1157
- continue
2320
+ domain = details.get("domain")
2321
+
2322
+ # If domain is missing (e.g. manual entry without rule), try fallback inference
2323
+ if not domain:
2324
+ # Infer from rules
2325
+ for r in rules:
2326
+ pat = r.get("pattern")
2327
+ mtype = r.get("match_type", "exact")
2328
+ is_match = False
2329
+ if mtype == "exact" and table_name == pat:
2330
+ is_match = True
2331
+ elif mtype == "prefix" and table_name.startswith(pat):
2332
+ is_match = True
2333
+ elif mtype == "regex":
2334
+ import re
2335
+
2336
+ if re.search(pat, table_name):
2337
+ is_match = True
2338
+
2339
+ if is_match:
2340
+ domain = r.get("domain")
2341
+ # console.print(f"DEBUG: {table_name} matched rule {pat} -> {domain}")
2342
+ break
2343
+
2344
+ if not domain:
2345
+ # console.print(f"DEBUG: No match for {table_name}, defaulting to common")
2346
+ domain = "common"
2347
+
2348
+ # Target Resolution
2349
+ explicit_fm_path = details.get("field_map_path")
2350
+ if explicit_fm_path:
2351
+ # Use Explicit Path from Whitelist
2352
+ if Path(explicit_fm_path).is_absolute():
2353
+ target_file = Path(explicit_fm_path)
2354
+ else:
2355
+ # Ensure project_root is available or fallback to cwd
2356
+ if "project_root" not in locals():
2357
+ try:
2358
+ project_root = config_path.parent.parent.parent
2359
+ except:
2360
+ project_root = Path.cwd()
2361
+ target_file = project_root / explicit_fm_path
2362
+
2363
+ target_dir = target_file.parent
2364
+ else:
2365
+ # Fallback Logic
2366
+ # Target Dir: fields/{db_domain}/{domain}
2367
+ target_dir = db_target_root / domain
2368
+ target_file = target_dir / f"{table_name}.py"
1158
2369
 
1159
- domain = details.get("domain", "common")
1160
-
1161
- # Target Dir
1162
- target_dir = fields_root_path / domain
2370
+ # Output Initialization
1163
2371
  target_dir.mkdir(parents=True, exist_ok=True)
1164
2372
  if not (target_dir / "__init__.py").exists():
1165
2373
  (target_dir / "__init__.py").touch()
1166
-
1167
- # Python Target File
1168
- target_file = target_dir / f"{table_name}.py"
1169
-
2374
+
1170
2375
  # Skip if exists and not force
1171
2376
  if target_file.exists() and not force:
1172
2377
  # console.print(f"DEBUG: Skipping existing {table_name}")
1173
2378
  continue
1174
-
2379
+
1175
2380
  try:
1176
- cols = inspector.get_columns(table_name)
2381
+ # Fix for SA 2.0 / Clickhouse: inspect connection, not engine
2382
+ with engine.connect() as conn:
2383
+ cols = inspect(conn).get_columns(table_name)
1177
2384
  # console.print(f"DEBUG: Generating {table_name} with {len(cols)} columns")
1178
2385
  field_map = {}
1179
-
1180
- full_table_name = f"{domain}.{table_name}"
1181
-
2386
+
2387
+ full_table_name = f"{db_domain}.{domain}.{table_name}"
2388
+
1182
2389
  for c in cols:
1183
2390
  col_name = c["name"]
1184
2391
  col_type = str(c["type"])
1185
-
2392
+
1186
2393
  # 1. Register / Get Canonical Field
1187
- trans_msg = manager.register_field(col_name, col_type, full_table_name)
2394
+ trans_msg = manager.register_field(
2395
+ col_name, col_type, full_table_name
2396
+ )
1188
2397
  if trans_msg:
1189
2398
  console.print(f" [dim]{trans_msg}[/dim]")
1190
-
2399
+
1191
2400
  # 2. Get Field Definition
1192
2401
  fid = manager._generate_id(col_name, col_type)
1193
2402
  field_def = manager.fields.get(fid)
1194
-
2403
+
1195
2404
  if field_def:
1196
2405
  # 3. Translate if needed
1197
2406
  if target_lang not in field_def.aliases:
1198
2407
  manager.translate_alias(fid, target_lang)
1199
-
2408
+
1200
2409
  # 4. Determine Target Column
1201
- target_alias = field_def.aliases.get(target_lang) or field_def.aliases.get("en") or col_name
2410
+ target_alias = (
2411
+ field_def.aliases.get(target_lang)
2412
+ or field_def.aliases.get("en")
2413
+ or col_name
2414
+ )
1202
2415
  target_col = manager.generate_target_column(target_alias)
1203
-
2416
+
1204
2417
  if not field_def.target_column:
1205
2418
  field_def.target_column = target_col
1206
2419
  else:
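Two smaller changes in the hunk above are easy to miss: fields are now registered under a db_domain.domain.table namespace, and column introspection runs against a live connection rather than the Engine, which the inline comment attributes to SQLAlchemy 2.0 / ClickHouse behaviour. A minimal sketch of that introspection pattern, with a placeholder URL and table name:

    import sqlalchemy as sa

    def fetch_columns(db_url: str, table_name: str):
        # Inspect the connection, not the engine, so dialects that require it under
        # SQLAlchemy 2.0 (e.g. ClickHouse, per the comment in the diff) still work.
        engine = sa.create_engine(db_url)
        with engine.connect() as conn:
            return sa.inspect(conn).get_columns(table_name)

    # e.g. fetch_columns("postgresql+psycopg2://user:pass@host/dbname", "orders")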
@@ -1220,27 +2433,36 @@ def _run_field_map_generation(context, config_path, target_dbs, url_resolver, fo
1220
2433
  for col_name in field_map.keys():
1221
2434
  lines.append(f' "{col_name}",')
1222
2435
  lines.append("]")
1223
-
2436
+
1224
2437
  lines.append("")
1225
- lines.append(f'field_map: Mapping[str, str] = FieldMapFactory.create("{table_name}", COLUMNS)')
1226
-
2438
+ lines.append(
2439
+ f'field_map: Mapping[str, str] = FieldMapFactory.create("{full_table_name}", COLUMNS)'
2440
+ )
2441
+
1227
2442
  # Generate metadata call
1228
2443
  lines.append("")
1229
- lines.append(f'metadata: Mapping[str, Any] = FieldMapFactory.create_metadata("{table_name}", COLUMNS)')
1230
-
2444
+ lines.append(
2445
+ f'metadata: Mapping[str, Any] = FieldMapFactory.create_metadata("{full_table_name}", COLUMNS)'
2446
+ )
2447
+
1231
2448
  with open(target_file, "w") as f:
1232
2449
  f.write("\n".join(lines))
1233
-
2450
+
1234
2451
  except Exception as e:
1235
- console.print(f"[red]Error mapping {table_name}: {e}[/red]")
1236
- if context.params.get("debug"): raise e
1237
-
2452
+ console.print(f"[red]Error processing {table_name}: {e}[/red]")
2453
+ continue
2454
+
1238
2455
  except Exception as e:
1239
- console.print(f"[red]Error processing DB {name}: {e}[/red]")
2456
+ console.print(f"[red]Error connecting/inspecting DB {name}: {e}[/red]")
1240
2457
 
1241
- # Save Global Repo
1242
- manager.save_to_yaml(global_repo_path)
1243
- console.print("Saved Global Field Repository.")
2458
+ # Save Updates to Global Repository
2459
+ try:
2460
+ manager.save_to_yaml(global_repo_path)
2461
+ console.print(
2462
+ f"[green]Updated Global Field Repository at {global_repo_path} ({len(manager.fields)} fields)[/green]"
2463
+ )
2464
+ except Exception as e:
2465
+ console.print(f"[red]Failed to save Global Field Repository: {e}[/red]")
1244
2466
 
1245
2467
 
1246
2468
  if __name__ == "__main__":