aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,799 @@
1
+ """Orchestrator for Coverage Simulator, Question Simulator, and Interactive Pipeline execution.
2
+
3
+ Coordinates the three top-level execution modes: the coverage simulator builds synthetic templates from seed questions via expansion and validation; the question simulator (QSim) generates natural-language questions from schema-profiled intent skeletons; the interactive pipeline answers live user questions with template reuse, SQL generation, and feedback learning.
4
+
5
+ Also provides artifact path utilities, QSim summary loading, and template list builders.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import os
12
+ import random
13
+ from collections import Counter
14
+ from datetime import datetime
15
+ from typing import Any
16
+
17
+ from platformdirs import user_data_dir
18
+
19
+ from .config import EngineConfig, PolicyConfig, QSimConfig, SimulatorConfig
20
+ from .contracts_base import (
21
+ QSimSummary,
22
+ RejectedTemplateInfo,
23
+ SchemaGraph,
24
+ SimulatorSummary,
25
+ TemplateInfo,
26
+ )
27
+ from .contracts_core import RuntimeIntent
28
+ from .core_utils import ask_user_choice, debug, log, normalize_question
29
+ from .expansion_ops import expand_gold_intents
30
+ from .intent_process import match_template_for_union
31
+ from .pipeline import (
32
+ check_and_handle_hard_block,
33
+ check_template_reuse,
34
+ compute_final_metrics,
35
+ confirm_intent_with_user,
36
+ display_final_results,
37
+ generate_and_validate_sql,
38
+ generate_join_candidates,
39
+ handle_direct_sql_reuse,
40
+ handle_user_feedback,
41
+ load_pipeline_resources,
42
+ parse_intent_via_llm,
43
+ save_result_csv,
44
+ )
45
+ from .qsim_ops import generate_all_intents, generate_all_questions
46
+ from .qsim_sample import instantiate_all
47
+ from .schema import compute_schema_limits, compute_schema_stats
48
+ from .simulator import (
49
+ get_next_simulator_version,
50
+ resolve_joins_for_table_set,
51
+ run_deterministic_simulation,
52
+ run_gold_intent_generation,
53
+ save_simulation_failures,
54
+ save_simulation_report,
55
+ )
56
+ from .templates import save_template_store, templates_to_store
57
+ from .utils import flatten_param_values, intent_key, validate_question
58
+ from .validation_execute import execute_sql, get_spark_sql_for_execution
59
+
60
+
61
def qsim_run_once(
    num_intents: int | None = None,
    num_questions: int | None = None,
    seed: int | None = None,
    artifacts_dir: str | None = None,
    schema: SchemaGraph | None = None,
) -> QSimSummary:
    """Run one question-simulation pass and persist its outputs as JSON.

    The run seeds ``random``, builds intent structures from the schema's
    column roles, instantiates them, generates questions for them, writes the
    resulting dicts to a timestamped JSON file, and appends a summary entry to
    ``qsim_summary.json``.

    Args:

        num_intents: Number of intent types requested; ``QSimConfig.INTENT_TYPES`` when ``None``.
        num_questions: Number of questions requested; ``QSimConfig.QUESTIONS_COUNT`` when ``None``.
        seed: Value passed to ``random.seed``; ``QSimConfig.RANDOM_SEED`` when ``None``.
        artifacts_dir: Output directory (created if missing); when falsy, relative file names are used.
        schema: Schema whose columns carry ``role`` metadata; must not be ``None``.

    Returns:

        The ``QSimSummary`` entry that was appended to the summary file.

    Raises:

        RuntimeError: If *schema* is ``None`` or no column has a non-``None`` role.
    """
    # Fall back to the configured defaults for anything the caller left unset.
    num_intents = QSimConfig.INTENT_TYPES if num_intents is None else num_intents
    num_questions = QSimConfig.QUESTIONS_COUNT if num_questions is None else num_questions
    seed = QSimConfig.RANDOM_SEED if seed is None else seed

    random.seed(seed)

    log(f"Starting question simulation: {num_intents} intent types, {num_questions} questions, seed={seed}")

    if schema is None:
        raise RuntimeError("Schema must be provided to qsim_run_once")

    # At least one column anywhere in the schema must carry a role.
    profiled = False
    for table_meta in schema.tables.values():
        if any(column.role is not None for column in table_meta.columns.values()):
            profiled = True
            break
    if not profiled:
        raise RuntimeError(
            "Schema profiling failed - no column roles found. Check database connection and column data."
        )

    column_total = sum(len(table_meta.columns) for table_meta in schema.tables.values())
    log(f" Loaded {len(schema.tables)} tables, {column_total} columns with metadata")

    # "table.column" -> role, keeping only columns with a truthy role.
    column_roles: dict[str, str] = {
        f"{table_name}.{col_name}": col_meta.role
        for table_name, table_meta in schema.tables.items()
        for col_name, col_meta in table_meta.columns.items()
        if col_meta.role
    }

    timestamp = datetime.now().strftime("%y%m%d%H%M%S")

    log("Generating QSimIntent structures...")
    intents = generate_all_intents(schema, column_roles, num_intents)
    log(f" Generated {len(intents)} QSimIntent structures")

    log("Instantiating QSimIntents with values...")
    instantiated = instantiate_all(intents, schema, num_questions)
    log(f" Created {len(instantiated)} QSimIntent variants with values")

    log("Generating NL questions via LLM...")
    results = generate_all_questions(instantiated, schema)
    log(f" Generated {len(results)} QSimIntents with questions")

    output_results = [generated.to_dict() for generated in results]

    def parent_of(intent_id):
        # Drop the trailing "_v..." part so variants group under one id.
        return intent_id.rsplit("_v", 1)[0] if "_v" in intent_id else intent_id

    intent_counts = Counter([parent_of(generated.intent_id) for generated in results])
    log(f" Questions per intent type: {dict(intent_counts)}")

    questions_name = f"qsim_intents_with_questions_{timestamp}.json"
    summary_name = "qsim_summary.json"
    if artifacts_dir:
        os.makedirs(artifacts_dir, exist_ok=True)
        qsim_questions_path = os.path.join(artifacts_dir, questions_name)
        qsim_summary_path = os.path.join(artifacts_dir, summary_name)
    else:
        qsim_questions_path = questions_name
        qsim_summary_path = summary_name

    log(f"Saving QSimIntents with questions to {qsim_questions_path}...")
    with open(qsim_questions_path, "w", encoding="utf-8") as questions_file:
        json.dump(output_results, questions_file, indent=2, ensure_ascii=False)

    summary_entry = QSimSummary(
        timestamp=timestamp,
        num_intents=len(intents),
        num_questions=len(output_results),
        seed=seed,
    )

    # Append this run to the existing summary history, if there is one.
    history = []
    if os.path.exists(qsim_summary_path):
        with open(qsim_summary_path, encoding="utf-8") as summary_file:
            history = json.load(summary_file)
    history.append(summary_entry.to_dict())
    with open(qsim_summary_path, "w", encoding="utf-8") as summary_file:
        json.dump(history, summary_file, indent=2, ensure_ascii=False)

    log(f"Question simulation complete: {len(output_results)} questions saved")
    print(f"QSim timestamp: {timestamp}")

    if output_results and EngineConfig.DEBUG:
        debug("[main_execution.qsim_run_once] samples:")
        for rank, sample in enumerate(output_results[:5], 1):
            debug(f"[main_execution.qsim_run_once] {rank}. {sample.get('question', 'N/A')}")

    return summary_entry
177
+
178
+
179
def load_generated_questions(path: str | None = None) -> list[dict[str, Any]]:
    """Read a QSim questions JSON file and return its parsed content.

    Args:

        path: File to read; ``QSimConfig.QUESTIONS_OUTPUT_PATH`` is used when ``None``.

    Returns:

        Whatever ``json.load`` produces for the file (expected to be a list of dicts).
    """
    source = QSimConfig.QUESTIONS_OUTPUT_PATH if path is None else path
    with open(source, encoding="utf-8") as handle:
        return json.load(handle)
195
+
196
+
197
def get_questions_only(results: list[dict[str, Any]], output_path: str | None = None) -> None:
    """Print the questions as a numbered list and write the same list to a text file.

    Args:

        results: Dicts that each contain a ``"question"`` key.
        output_path: Destination text file. When ``None``, it is derived from
            ``QSimConfig.QUESTIONS_OUTPUT_PATH`` by swapping a ``.json`` suffix
            for ``.txt``; if that path has no ``.json`` suffix, a timestamped
            ``qsim_<timestamp>_questions.txt`` name is used instead.
    """
    questions = [entry["question"] for entry in results]

    if output_path is None:
        default_json = QSimConfig.QUESTIONS_OUTPUT_PATH
        if not default_json.endswith(".json"):
            stamp = datetime.now().strftime("%y%m%d%H%M")
            output_path = f"qsim_{stamp}_questions.txt"
        else:
            output_path = default_json[: -len(".json")] + ".txt"

    # Numbering starts at 1 for both the console and the file output.
    for position, text in enumerate(questions, 1):
        print(f"{position}. {text}")

    with open(output_path, "w", encoding="utf-8") as sink:
        sink.writelines(f"{position}. {text}\n" for position, text in enumerate(questions, 1))
221
+
222
+
223
def simulator_run_once(
    schema: SchemaGraph,
    dialect: Any,
    seed_filepath: str,
    output_dir: str,
    store: dict[str, Any] | None = None,
    templates: dict[str, Any] | None = None,
    interactive_gold: bool = True,
    seed: int | None = None,
) -> SimulatorSummary:
    """Run one versioned pass of the coverage simulator.

    Steps, in the order they are logged:
        1. Gold intent generation from the seed file.
        2. Expansion of the gold intents into synthetic intents.
        3. Join resolution for each gold intent's tables, filling a shared cache.
        4. Deterministic simulation over the synthetic intents, followed by
           writing the report and failures files.

    When both *store* and *templates* are given, new templates are merged into
    *templates* (duplicates only contribute their value history to the
    existing entry) and the store is saved.

    Args:

        schema: Schema graph used by every phase.
        dialect: SQL dialect object handed to the simulation.
        seed_filepath: Path to the seed questions file.
        output_dir: Directory for output artefacts; created if missing.
        store: Existing template store dict; may be None.
        templates: Existing templates dict keyed by id; may be None.
        interactive_gold: Forwarded as ``interactive`` to gold intent generation.
        seed: Random seed; defaults to SimulatorConfig.RANDOM_SEED.

    Returns:

        SimulatorSummary with the version, total/success/failed counts and the
        success rate rounded to three decimals.
    """
    random.seed(SimulatorConfig.RANDOM_SEED if seed is None else seed)

    os.makedirs(output_dir, exist_ok=True)
    version = get_next_simulator_version(output_dir)

    def artifact_path(pattern):
        # Every artefact name is a pattern with a ``{version}`` placeholder.
        return os.path.join(output_dir, pattern.format(version=version))

    gold_filepath = artifact_path(SimulatorConfig.GOLD_OUTPUT_PATTERN)
    report_filepath = artifact_path(SimulatorConfig.REPORT_PATTERN)
    results_csv_filepath = artifact_path(SimulatorConfig.RESULTS_CSV_PATTERN)
    failures_filepath = artifact_path(SimulatorConfig.FAILURES_PATTERN)

    log(f"Starting simulator version {version}")
    print(f"Simulator version: {version}")

    limits = compute_schema_limits(compute_schema_stats(schema))
    log(
        f"Computed SchemaLimits: max_filters={limits.max_filters}, max_groupby={limits.max_groupby}, max_tables={limits.max_tables}"
    )

    log("PHASE 1: Gold Intent Generation")
    gold_intents_raw = run_gold_intent_generation(
        schema,
        seed_filepath,
        gold_filepath,
        interactive=interactive_gold,
    )
    # NOTE(review): ``SimulatorIntent`` is not among this module's visible
    # imports, so this raises NameError as soon as a raw gold intent is a dict.
    # ``RuntimeIntent`` is imported but unused - possibly the intended class;
    # confirm before changing.
    gold_sim_intents = [
        SimulatorIntent.from_dict(raw) if isinstance(raw, dict) else raw
        for raw in gold_intents_raw
    ]
    log(f"Gold intents: {len(gold_sim_intents)}")

    log("PHASE 2: Deterministic Multi-Depth Expansion")
    synthetic_intents = expand_gold_intents(gold_sim_intents, schema, limits)
    log(f"Synthetic intents (deduped): {len(synthetic_intents)}")

    log("PHASE 3: Join Resolution (cached per table-set)")
    join_cache: dict[frozenset[str], Any] = {}
    for gold_intent in gold_sim_intents:
        resolve_joins_for_table_set(
            gold_intent.tables or [],
            schema,
            gold_intent.intent_id or "gold",
            join_cache,
        )
    log(f"Join cache seeded with {len(join_cache)} table-set entries")

    log("PHASE 4: SQL Build + Validate + Execute + Question Generation")
    if store:
        next_id = int(store["next_id"])
    else:
        next_id = 1
    results, new_templates, updated_next_id = run_deterministic_simulation(
        synthetic_intents,
        schema,
        dialect,
        next_id,
        join_cache=join_cache,
        csv_output_path=results_csv_filepath,
    )
    log(f"Simulation results: {len(results)}, templates: {len(new_templates)}")

    save_simulation_report(results, report_filepath)
    save_simulation_failures(results, failures_filepath)

    def merge_key(template):
        # Templates count as duplicates when intent key, chosen join candidate
        # and SQL fingerprint all match.
        return (
            template.intent_key,
            getattr(template.intent_signature, "chosen_join_candidate_id", ""),
            template.sql_fp,
        )

    if store is not None and templates is not None:
        for fresh in new_templates:
            wanted = merge_key(fresh)
            for known in templates.values():
                if merge_key(known) != wanted:
                    continue
                # Duplicate: fold the new examples into the first matching
                # template, padding missing parallel entries with defaults.
                history = fresh.value_history
                for idx, asked in enumerate(history.questions):
                    if idx < len(history.param_values):
                        params = history.param_values[idx]
                    else:
                        params = {}
                    if idx < len(history.natural_language):
                        phrasing = history.natural_language[idx]
                    else:
                        phrasing = ""
                    known.value_history.add(params, asked, phrasing)
                break
            else:
                # No existing template matched, so register the new one.
                templates[fresh.id] = fresh
        store["next_id"] = updated_next_id
        store = templates_to_store(store, templates)
        save_template_store(store)

    total = len(results)
    success = len([outcome for outcome in results if outcome.success])
    failed = total - success
    if total > 0:
        success_rate = round(success / total, 3)
    else:
        success_rate = 0.0

    log(f"SIMULATOR COMPLETE: {len(new_templates)} synthetic templates created")
    return SimulatorSummary(
        version=version,
        total=total,
        success=success,
        failed=failed,
        success_rate=success_rate,
    )
390
+
391
+
392
+ def interactive_run_once(
393
+ schema: SchemaGraph | None = None,
394
+ store: Any | None = None,
395
+ templates: list | None = None,
396
+ rejected: list | None = None,
397
+ schema_terms: Any | None = None,
398
+ question: str | None = None,
399
+ engine: Any | None = None,
400
+ ) -> dict[str, Any] | None:
401
+ """Execute a single interactive pipeline iteration.
402
+
403
+ Reads a question from stdin (or uses the supplied *question*),
404
+ validates it, checks for template reuse, parses intent via LLM
405
+ if needed, generates SQL, executes it, and handles user
406
+ feedback.
407
+
408
+ Args:
409
+
410
+ schema: Pre-loaded ``SchemaGraph``; raises when ``None``.
411
+ store: Template store dict; raises when ``None``.
412
+ templates: List of accepted ``Template`` objects; raises
413
+ when ``None``.
414
+ rejected: List of ``RejectedTemplate`` objects; raises
415
+ when ``None``.
416
+ schema_terms: Set of schema term tokens; raises when
417
+ ``None``.
418
+ question: When provided the pipeline uses this question
419
+ instead of reading from stdin.
420
+ engine: SQLAlchemy engine for query execution; when
421
+ ``None`` the default engine is used.
422
+
423
+ Returns:
424
+
425
+ Always ``None`` in the code shown here: early exits return it
426
+ explicitly and the full run ends without returning a value.
427
+ """
428
+ if question is None:
429
+ print("Enter question")
430
+ try:
431
+ question = input().strip()
432
+ except (EOFError, KeyboardInterrupt):
433
+ print("\nUser terminated.")
434
+ return None
435
+
436
+ if not question:
437
+ print("\nInvalid input.")
438
+ return None
439
+ print(f"User input: {question}")
440
+
441
+ valid, query_type, corrected = validate_question(question)
442
+ if not valid:
443
+ if query_type == "restricted":
444
+ print("Restricted access, only querying is allowed. Please rephrase.")
445
+ else:
446
+ print("Unable to process your question. Please rephrase to a valid question.")
447
+ return None
448
+ if corrected != question:
449
+ debug(f"[main_execution.interactive_run_once] typo_corrected: '{question}' -> '{corrected}'")
450
+ question = corrected
451
+
452
+ q_norm = normalize_question(question)
453
+ debug(f"[main_execution.interactive_run_once] q_norm: {q_norm}")
454
+
455
+ dialect, _, schema, store, templates, rejected, schema_terms = load_pipeline_resources(
456
+ schema, store, templates, rejected, schema_terms
457
+ )
458
+
459
+ tmpl_match = check_template_reuse(q_norm, templates)
460
+
461
+ if tmpl_match.reuse_type == "direct_reuse":
462
+ log(f"direct SQL reuse via exact question match (trust=2, template='{tmpl_match.best_template.id}')")
463
+ debug("[main_execution.interactive_run_once] direct_reuse: question_match")
464
+ handled = handle_direct_sql_reuse(
465
+ q_norm,
466
+ tmpl_match.best_template,
467
+ dialect,
468
+ store,
469
+ templates,
470
+ schema,
471
+ engine=engine,
472
+ existing_nl=None,
473
+ )
474
+ if handled:
475
+ return None
476
+
477
+ intent = tmpl_match.intent
478
+ semantic_warnings: list[dict[str, Any]] = []
479
+
+ # Only call the LLM parser when the reuse check did not supply an intent.
480
+ if intent is None:
481
+ parsed = parse_intent_via_llm(q_norm, schema, templates, store)
482
+ if parsed is None:
483
+ return None
484
+ intent, semantic_warnings, _ = parsed
485
+
486
+ ikey = intent_key(intent)
487
+ debug(f"[main_execution.interactive_run_once] intent_key: {ikey[:32]}")
488
+
489
+ union_result = match_template_for_union(intent, templates)
490
+ matched_template = None
491
+ union_select_cols = None
492
+ cols_changed = False
493
+ has_union_match = union_result is not None
494
+ if union_result is not None:
495
+ matched_template, union_select_cols, cols_changed = union_result
496
+
497
+ if not confirm_intent_with_user(
498
+ intent, store, semantic_warnings, has_union_match=has_union_match,
499
+ ):
500
+ return None
501
+
502
+ join_candidates, cmap, cte_join_hints = generate_join_candidates(intent, schema)
503
+ if join_candidates is None:
504
+ save_template_store(store)
505
+ return None
506
+
507
+ (
508
+ hard_block_override,
509
+ hard_block_rejected_template,
510
+ matched_rejected_template,
511
+ proceed,
512
+ ) = check_and_handle_hard_block(rejected, ikey, intent)
513
+ if not proceed:
514
+ save_template_store(store)
515
+ return None
516
+
517
+ sql, ok = generate_and_validate_sql(
518
+ q_norm,
519
+ intent,
520
+ schema,
521
+ join_candidates,
522
+ cmap,
523
+ dialect,
524
+ store,
525
+ engine=engine,
526
+ cte_join_hints=cte_join_hints,
527
+ matched_template=matched_template,
528
+ union_select_cols=union_select_cols,
529
+ cols_changed=cols_changed,
530
+ )
531
+ if not ok:
532
+ return None
533
+
534
+ log("executing SQL")
535
+ spark_sql = get_spark_sql_for_execution(
536
+ intent.sql_param or "",
537
+ dict(flatten_param_values(intent)),
538
+ schema,
539
+ intent,
540
+ dialect,
541
+ )
+ # NOTE(review): ``engine`` is not passed to execute_sql, unlike the two calls above that receive it - confirm this is intended.
542
+ rows = execute_sql(
543
+ dialect,
544
+ sql,
545
+ spark_sql_for_execution=spark_sql if spark_sql else None,
546
+ )
547
+
548
+ conf = compute_final_metrics(sql, intent, schema, templates, join_candidates, store)
549
+
550
+ ux_summary = display_final_results(q_norm, intent, sql, rows)
551
+
+ # At or above the auto-accept threshold the user prompt is skipped and the answer is treated as "y".
552
+ if conf >= PolicyConfig.FINAL_SQL_AUTO_ACCEPT_THRESHOLD:
553
+ log(f"[AUTO-ACCEPT] confidence={conf:.3f} >= {PolicyConfig.FINAL_SQL_AUTO_ACCEPT_THRESHOLD}")
554
+ choice = "y"
555
+ else:
556
+ choice = ask_user_choice("\nIs this correct?", ["y", "n"])
557
+ if choice is None:
558
+ save_template_store(store)
559
+ return None
560
+
561
+ if choice == "y":
562
+ save_result_csv(rows, intent, sql)
563
+
564
+ handle_user_feedback(
565
+ choice,
566
+ intent,
567
+ sql,
568
+ schema,
569
+ store,
570
+ templates,
571
+ rejected,
572
+ q_norm,
573
+ hard_block_override,
574
+ hard_block_rejected_template,
575
+ matched_rejected_template,
576
+ ux_summary,
577
+ dialect=dialect,
578
+ )
579
+
580
+
581
def _obfuscate_trust_level(trust_level: int) -> str:
    """Map an internal numeric trust level to its user-facing label.

    Args:

        trust_level: Internal trust level.

    Returns:

        ``"high"`` for values of 2 or more, ``"moderate"`` for exactly 1,
        and ``"low"`` for everything else.
    """
    if trust_level >= 2:
        return "high"
    return "moderate" if trust_level == 1 else "low"
597
+
598
+
599
def _obfuscate_rejection_category(categories: dict[str, int]) -> str:
    """Reduce per-category rejection counts to one user-facing category.

    The internal category with the highest count decides the result; on a tie
    the category that comes first in the dict's iteration order wins.

    Args:

        categories: Dict mapping internal rejection category names to occurrence counts.

    Returns:

        One of ``"invalid_sql"``, ``"incorrect_results"``, ``"ambiguous_intent"``, or ``"other"``
        (also returned for an empty dict or an unrecognised category).
    """
    if not categories:
        return "other"

    dominant = max(categories, key=categories.get)

    # Internal category name -> label shown to users.
    public_labels = {
        "wrong_tables": "invalid_sql",
        "wrong_join": "invalid_sql",
        "too_many_rows": "incorrect_results",
        "too_few_rows": "incorrect_results",
        "wrong_filters_or_values": "incorrect_results",
        "wrong_columns_selected": "incorrect_results",
        "wrong_intent": "ambiguous_intent",
    }
    return public_labels.get(dominant, "other")
625
+
626
+
627
def get_templates_list(templates: dict[str, Any]) -> list[TemplateInfo]:
    """Summarise every template in *templates* as a ``TemplateInfo``.

    Args:

        templates: Dict of template objects; only the values are read.

    Returns:

        One ``TemplateInfo`` per template, in dict iteration order. Each uses
        the most recent question and natural-language entry from the
        template's value history (empty string when there is none) and the
        user-facing trust label.
    """

    def summarize(template):
        history = template.value_history
        latest_question = history.questions[-1] if history.questions else ""
        latest_phrasing = history.natural_language[-1] if history.natural_language else ""
        return TemplateInfo(
            id=template.id,
            natural_language=latest_phrasing,
            example_question=latest_question,
            trust_level=_obfuscate_trust_level(template.trust_level),
            source=template.source,
        )

    return [summarize(template) for template in templates.values()]
654
+
655
+
656
def get_rejected_templates_list(rejected: dict[str, Any]) -> list[RejectedTemplateInfo]:
    """Summarise every rejected template in *rejected* as a ``RejectedTemplateInfo``.

    Args:

        rejected: Dict of rejected-template objects; only the values are read.

    Returns:

        One ``RejectedTemplateInfo`` per entry, in dict iteration order. The
        rejection count is the number of recorded rejection categories, and
        the reported category is the user-facing label of the most frequent
        one.
    """

    def summarize(entry):
        history = entry.value_history
        latest_question = history.questions[-1] if history.questions else ""
        latest_phrasing = history.natural_language[-1] if history.natural_language else ""
        rejection_count = len(history.rejection_categories)
        # Occurrences per internal rejection category.
        tally = Counter(history.rejection_categories)
        return RejectedTemplateInfo(
            id=entry.id,
            natural_language=latest_phrasing,
            example_question=latest_question,
            rejection_category=_obfuscate_rejection_category(tally),
            rejection_count=rejection_count,
        )

    return [summarize(entry) for entry in rejected.values()]
688
+
689
+
690
def get_simulator_summary_from_dir(artifacts_dir: str, version: int) -> SimulatorSummary:
    """Rebuild a ``SimulatorSummary`` from a saved simulation report.

    Args:

        artifacts_dir: Directory containing ``simulation_report_v{version}.json``.
        version: Simulator run version number.

    Returns:

        ``SimulatorSummary`` built from the report's ``total``, ``success``
        and ``failed`` fields (each 0 when absent). The success rate is
        recomputed and rounded to three decimals.

    Raises:

        FileNotFoundError: If the report file for *version* does not exist.
    """
    report_path = os.path.join(artifacts_dir, f"simulation_report_v{version}.json")
    if not os.path.exists(report_path):
        raise FileNotFoundError(f"Simulator report v{version} not found")

    with open(report_path, encoding="utf-8") as handle:
        report = json.load(handle)

    total, success, failed = (report.get(field, 0) for field in ("total", "success", "failed"))

    # Avoid dividing by zero for an empty run.
    if total > 0:
        success_rate = round(success / total, 3)
    else:
        success_rate = 0.0

    return SimulatorSummary(
        version=version,
        total=total,
        success=success,
        failed=failed,
        success_rate=success_rate,
    )
725
+
726
+
727
def get_artifacts_dir(
    engine: str,
    host: str | None,
    database: str | None,
    schema: str | None,
    catalog: str | None,
) -> str:
    """Build the artifacts directory path under the platform user data directory.

    The folder name joins ``"artifacts"``, the engine and the connection
    identifiers with underscores, after replacing ``.``, ``-`` and ``:`` in
    each part with ``_``.

    Args:

        engine: SQL engine type; ``"postgresql"`` selects the host/database/schema layout,
            any other value selects the catalog/schema layout.
        host: Database host; ``"localhost"`` when falsy (PostgreSQL layout only).
        database: Database name; ``"db"`` when falsy (PostgreSQL layout only).
        schema: Schema name; ``"public"`` (PostgreSQL layout) or ``"schema"`` when falsy.
        catalog: Catalog name; ``"catalog"`` when falsy (non-PostgreSQL layout only).

    Returns:

        Path of the artifacts folder inside ``user_data_dir(appname="text2sql", appauthor=False)``.
        The directory is not created here.
    """
    if engine == "postgresql":
        segments = ["artifacts", engine, host or "localhost", database or "db", schema or "public"]
    else:
        segments = ["artifacts", engine, catalog or "catalog", schema or "schema"]

    # One pass per segment instead of three chained replace() calls.
    to_underscore = str.maketrans({".": "_", "-": "_", ":": "_"})
    folder_name = "_".join([segment.translate(to_underscore) for segment in segments])

    return os.path.join(user_data_dir(appname="text2sql", appauthor=False), folder_name)
762
+
763
+
764
def resolve_qsim_path(timestamp_or_result: str | QSimSummary, artifacts_dir: str) -> str:
    """Build the path of a QSim questions JSON file from a timestamp or a summary.

    Args:

        timestamp_or_result: Either a timestamp string or a ``QSimSummary`` whose ``timestamp`` is used.
        artifacts_dir: Directory where QSim output files are stored.

    Returns:

        *artifacts_dir* joined with ``qsim_intents_with_questions_{timestamp}.json``.
    """
    timestamp = (
        timestamp_or_result.timestamp
        if isinstance(timestamp_or_result, QSimSummary)
        else timestamp_or_result
    )
    file_name = f"qsim_intents_with_questions_{timestamp}.json"
    return os.path.join(artifacts_dir, file_name)
781
+
782
+
783
def get_qsim(artifacts_dir: str) -> list[QSimSummary]:
    """Load the QSim run summaries recorded in ``qsim_summary.json``.

    Args:

        artifacts_dir: Directory containing the ``qsim_summary.json`` file.

    Returns:

        ``QSimSummary`` instances in the order they are stored in the file,
        or an empty list if the summary file does not exist.
    """
    summary_file = os.path.join(artifacts_dir, "qsim_summary.json")
    if os.path.exists(summary_file):
        with open(summary_file, encoding="utf-8") as handle:
            stored_entries = json.load(handle)
        return [QSimSummary.from_dict(entry) for entry in stored_entries]
    return []