aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,952 @@
1
+ """Base schema and utility contracts for the text-to-SQL pipeline.
2
+
3
+ Defines the foundational dataclasses and enums shared across all pipeline stages: schema representation (ColumnMetadata, TableMetadata, SchemaGraph), intent validation results, SQL shape comparison, QSim skeleton structures, and result containers for query plans, validation, and template management.
4
+
5
+ Also provides type utility helpers for mapping raw SQL data types to the standardized value_type vocabulary used throughout the pipeline.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import re
11
+ from dataclasses import asdict, dataclass, field
12
+ from enum import Enum
13
+ from typing import Any
14
+
15
+ from .config import (
16
+ BOOLEAN_VALUE_PATTERNS,
17
+ COLUMN_TYPE_TO_VALUE_TYPE,
18
+ DATE_TYPE_TOKENS,
19
+ EXCLUDED_FILTER_PATTERNS,
20
+ NUMERIC_TYPE_TOKENS,
21
+ STRING_TYPE_TOKENS,
22
+ normalize_column_type,
23
+ )
24
+ from .core_utils import issue_sig
25
+
26
+
27
def _is_numeric_type(data_type: str) -> bool:
    """Check whether a SQL type string mentions a numeric type token.

    Args:
        data_type: Raw SQL data type string.

    Returns:
        True when at least one entry of ``NUMERIC_TYPE_TOKENS`` occurs as a
        substring of the lowercased type string, otherwise False.
    """
    lowered = data_type.lower()
    for token in NUMERIC_TYPE_TOKENS:
        if token in lowered:
            return True
    return False
40
+
41
+
42
def is_string_type(data_type: str) -> bool:
    """Check whether a SQL type string mentions a string/text type token.

    Args:
        data_type: Raw SQL data type string.

    Returns:
        True when at least one entry of ``STRING_TYPE_TOKENS`` occurs as a
        substring of the lowercased type string, otherwise False.
    """
    lowered = data_type.lower()
    for token in STRING_TYPE_TOKENS:
        if token in lowered:
            return True
    return False
55
+
56
+
57
def _is_date_type(data_type: str) -> bool:
    """Check whether a SQL type string mentions a temporal type token.

    Args:
        data_type: Raw SQL data type string.

    Returns:
        True when at least one entry of ``DATE_TYPE_TOKENS`` occurs as a
        substring of the lowercased type string, otherwise False.
    """
    lowered = data_type.lower()
    for token in DATE_TYPE_TOKENS:
        if token in lowered:
            return True
    return False
70
+
71
+
72
def _data_type_to_value_type(data_type: str) -> str:
    """Translate a raw SQL data type into the pipeline's value type label.

    The normalized type is looked up in ``COLUMN_TYPE_TO_VALUE_TYPE`` first.
    When that yields nothing truthy, substring heuristics are tried in the
    order numeric, temporal, string.

    Args:
        data_type: Raw SQL data type string such as 'varchar(255)' or
            'integer'.

    Returns:
        The mapped value type when the lookup succeeds; otherwise 'number',
        'date' or 'string'. Unrecognized types fall back to 'string'.
    """
    mapped = COLUMN_TYPE_TO_VALUE_TYPE.get(normalize_column_type(data_type))
    if mapped:
        return mapped

    # Heuristic fallbacks, evaluated in order; the first match wins.
    fallbacks = (
        (_is_numeric_type, "number"),
        (_is_date_type, "date"),
        (is_string_type, "string"),
    )
    for matches, value_type in fallbacks:
        if matches(data_type):
            return value_type
    return "string"
94
+
95
+
96
class ColumnRole(Enum):
    """Column role for profiling and question simulation.

    ``ColumnMetadata.role`` holds the string ``.value`` of a member (its
    capability properties compare against ``ColumnRole.<X>.value``), so the
    values below are part of the serialized format.
    """

    IDENTIFIER = "identifier"
    CATEGORICAL = "categorical"
    NUMERIC_CATEGORICAL = "numeric_categorical"
    NUMERIC_MEASURE = "numeric_measure"
    TEMPORAL = "temporal"
    BOOLEAN = "boolean"
    FREE_TEXT = "free_text"
    AUDIT = "audit"
107
+
108
+
109
class TableRole(Enum):
    """Table role for join constraint validation.

    NOTE(review): ``TableMetadata.role`` is typed ``str | None``; it
    presumably stores the ``.value`` of one of these members - confirm
    against the code that assigns it.
    """

    DIMENSION = "dimension"
    FACT = "fact"
    BRIDGE = "bridge"
    UNKNOWN = "unknown"
116
+
117
+
118
@dataclass
class FKEdge:
    """Foreign key relationship between two tables.

    Both column fields are lists, so one edge can carry several column
    names.
    NOTE(review): ``src_cols`` and ``dst_cols`` are presumably paired
    positionally - confirm against the code that builds the edges.
    """

    # Source side of the relationship.
    src_table: str
    src_cols: list[str]
    # Destination side of the relationship.
    dst_table: str
    dst_cols: list[str]
126
+
127
+
128
@dataclass
class ValueDomain:
    """Value domain for sampling concrete values during question generation.

    Every field is optional: ``values`` defaults to a fresh empty list and
    the bounds and data type default to None.
    """

    # Discrete candidate values, kept as strings.
    values: list[str] = field(default_factory=list)
    # Range bounds, kept as strings; None when not set.
    min_val: str | None = None
    max_val: str | None = None
    # Data type label for the domain; None when not set.
    data_type: str | None = None
137
+
138
+
139
@dataclass
class ColumnMetadata:
    """Consolidated column metadata with profile, role, and value domain.

    ``value_type`` is derived from ``data_type`` in ``__post_init__`` when it
    is left empty. ``role`` holds the string value of a ``ColumnRole`` member;
    the capability properties below compare it against
    ``ColumnRole.<X>.value``. The three ``*_override`` flags, when not None,
    replace the role-based answer of the matching capability property.
    """

    name: str
    data_type: str
    is_primary_key: bool = False
    is_foreign_key: bool = False
    fk_target: tuple[str, str] | None = None
    role: str | None = None
    value_type: str = ""
    row_count: int = 0
    distinct_count: int = 0
    distinct_ratio: float = 0.0
    null_ratio: float = 0.0
    min_val: str | None = None
    max_val: str | None = None
    top_k_values: list[str] = field(default_factory=list)
    is_aggregatable_override: bool | None = None
    is_groupable_override: bool | None = None
    is_filterable_override: bool | None = None
    valid_filter_ops: list[str] = field(default_factory=list)
    valid_aggregations: list[str] = field(default_factory=list)
    valid_having_ops: list[str] = field(default_factory=list)
    boolean_true_value: str | None = None
    boolean_false_value: str | None = None
    description: str = ""

    def __post_init__(self) -> None:
        """Auto-compute value_type from data_type when value_type is not explicitly set."""
        if not self.value_type and self.data_type:
            self.value_type = _data_type_to_value_type(self.data_type)

    @staticmethod
    def from_dict(d: dict[str, Any]) -> ColumnMetadata:
        """Create ColumnMetadata from dictionary.

        List-valued fields are copied, so the new instance never shares list
        objects with ``d``; a missing or null list becomes an empty list.

        Args:
            d: Dictionary with keys matching ColumnMetadata fields.

        Returns:
            Populated ColumnMetadata instance.
        """
        fk_target = None
        raw_target = d.get("fk_target")
        if raw_target:
            # JSON has no tuple type, so a serialized target arrives as a list.
            fk_target = tuple(raw_target) if isinstance(raw_target, list) else raw_target
        return ColumnMetadata(
            name=d.get("name", ""),
            data_type=d.get("data_type", ""),
            is_primary_key=d.get("is_primary_key", False),
            is_foreign_key=d.get("is_foreign_key", False),
            fk_target=fk_target,
            role=d.get("role"),
            value_type=d.get("value_type", ""),
            row_count=d.get("row_count", 0),
            distinct_count=d.get("distinct_count", 0),
            distinct_ratio=d.get("distinct_ratio", 0.0),
            null_ratio=d.get("null_ratio", 0.0),
            min_val=d.get("min_val"),
            max_val=d.get("max_val"),
            top_k_values=list(d.get("top_k_values") or []),
            is_aggregatable_override=d.get("is_aggregatable_override"),
            is_groupable_override=d.get("is_groupable_override"),
            is_filterable_override=d.get("is_filterable_override"),
            valid_filter_ops=list(d.get("valid_filter_ops") or []),
            valid_aggregations=list(d.get("valid_aggregations") or []),
            valid_having_ops=list(d.get("valid_having_ops") or []),
            boolean_true_value=d.get("boolean_true_value"),
            boolean_false_value=d.get("boolean_false_value"),
            description=d.get("description", ""),
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dictionary for JSON storage.

        Returns:
            Dictionary with all ColumnMetadata fields as primitives;
            ``fk_target`` is emitted as a list (or None).
        """
        return {
            "name": self.name,
            "data_type": self.data_type,
            "is_primary_key": self.is_primary_key,
            "is_foreign_key": self.is_foreign_key,
            "fk_target": list(self.fk_target) if self.fk_target else None,
            "role": self.role,
            "value_type": self.value_type,
            "row_count": self.row_count,
            "distinct_count": self.distinct_count,
            "distinct_ratio": self.distinct_ratio,
            "null_ratio": self.null_ratio,
            "min_val": self.min_val,
            "max_val": self.max_val,
            "top_k_values": self.top_k_values,
            "is_aggregatable_override": self.is_aggregatable_override,
            "is_groupable_override": self.is_groupable_override,
            "is_filterable_override": self.is_filterable_override,
            "valid_filter_ops": self.valid_filter_ops,
            "valid_aggregations": self.valid_aggregations,
            "valid_having_ops": self.valid_having_ops,
            "boolean_true_value": self.boolean_true_value,
            "boolean_false_value": self.boolean_false_value,
            "description": self.description,
        }

    @property
    def is_usable(self) -> bool:
        """Column has sufficient variance for meaningful queries.

        False for audit-role columns and for columns whose ``distinct_count``
        is at most 1 (which includes the default of 0).
        """
        if self.role == ColumnRole.AUDIT.value:
            return False
        if self.distinct_count is not None and self.distinct_count <= 1:
            return False
        return True

    @property
    def is_boolean_like(self) -> bool:
        """Column behaves like a boolean flag.

        True when the role is boolean or the data type contains 'bool'.
        Otherwise the column must be a non-key column with exactly two
        distinct values whose lowercased, stripped top-k values form a set
        listed in ``BOOLEAN_VALUE_PATTERNS``.
        """
        if self.role == ColumnRole.BOOLEAN.value:
            return True
        if "bool" in (self.data_type or "").lower():
            return True
        if self.distinct_count != 2:
            return False
        if self.is_primary_key or self.is_foreign_key:
            return False
        if not self.top_k_values or len(self.top_k_values) != 2:
            return False
        values_lower = frozenset(str(v).lower().strip() for v in self.top_k_values)
        return values_lower in BOOLEAN_VALUE_PATTERNS

    @property
    def is_filterable(self) -> bool:
        """Column can be used in WHERE clause.

        Name-based exclusion patterns are checked first and take precedence
        over ``is_filterable_override``.
        """
        if any(re.search(pattern, self.name, re.IGNORECASE) for pattern in EXCLUDED_FILTER_PATTERNS):
            return False
        if self.is_filterable_override is not None:
            return self.is_filterable_override
        if self.is_primary_key or self.is_foreign_key:
            return True
        return self.role in (
            ColumnRole.CATEGORICAL.value,
            ColumnRole.NUMERIC_CATEGORICAL.value,
            ColumnRole.NUMERIC_MEASURE.value,
            ColumnRole.TEMPORAL.value,
            ColumnRole.BOOLEAN.value,
        )

    def get_valid_filter_ops(self) -> list[str]:
        """Return valid filter operators for this column.

        Always includes null check operators regardless of column type.
        Duplicates are removed while keeping first-seen order, so the result
        is identical from run to run (a plain ``set`` round-trip would order
        the strings differently under hash randomisation).

        Returns:
            List of operator strings such as '=', '!=', 'like', 'between'.
        """
        null_ops = ["is null", "is not null"]
        if self.valid_filter_ops:
            return list(dict.fromkeys([*self.valid_filter_ops, *null_ops]))
        return null_ops

    def get_valid_aggregations(self) -> set[str]:
        """Return valid aggregation functions for this column.

        Returns:
            Set of lowercased names from ``valid_aggregations``; empty when
            none are stored.
        """
        if self.valid_aggregations:
            return {agg.lower() for agg in self.valid_aggregations}
        return set()

    def get_valid_having_ops(self) -> list[str]:
        """Return valid HAVING clause operators for this column.

        Returns:
            A copy of ``valid_having_ops``; empty when none are stored.
        """
        if self.valid_having_ops:
            return list(self.valid_having_ops)
        return []

    @property
    def is_groupable(self) -> bool:
        """Column can be used in GROUP BY clause."""
        if self.is_groupable_override is not None:
            return self.is_groupable_override
        if self.is_foreign_key:
            return True
        return self.role in (
            ColumnRole.CATEGORICAL.value,
            ColumnRole.NUMERIC_CATEGORICAL.value,
            ColumnRole.BOOLEAN.value,
            ColumnRole.TEMPORAL.value,
            ColumnRole.IDENTIFIER.value,
        )

    @property
    def is_aggregatable(self) -> bool:
        """Column can be used with SUM/AVG aggregations."""
        if self.is_aggregatable_override is not None:
            return self.is_aggregatable_override
        return self.role == ColumnRole.NUMERIC_MEASURE.value
356
+
357
+
358
@dataclass
class TableMetadata:
    """Table metadata with nested columns, foreign keys, partition columns, and role.

    ``composite_descriptive_ratios`` is keyed by a pair of column names. It
    is serialized with the pair joined as ``"c1|c2"`` and split back on the
    first ``|`` when loading.
    """

    name: str
    columns: dict[str, ColumnMetadata]
    primary_key: list[str]
    foreign_keys: list[FKEdge]
    partition_columns: list[str] = field(default_factory=list)
    role: str | None = None
    row_count: int = 0
    description: str = ""
    composite_descriptive_ratios: dict[tuple[str, str], float] = field(
        default_factory=dict,
    )

    @staticmethod
    def from_dict(d: dict[str, Any]) -> TableMetadata:
        """Create TableMetadata from dictionary.

        Container fields that are missing or null (as JSON ``null`` would
        produce) are treated as empty instead of raising. Lists are copied so
        the instance does not share them with ``d``.

        Args:
            d: Dictionary with keys matching TableMetadata fields.

        Returns:
            Populated TableMetadata instance with nested ColumnMetadata and
            FKEdge objects.
        """
        cols_raw = d.get("columns") or {}
        if isinstance(cols_raw, dict):
            columns = {k: ColumnMetadata.from_dict(v) for k, v in cols_raw.items()}
        else:
            columns = {}

        # Entries that are already FKEdge objects (not dicts) pass through as-is.
        fk_raw = d.get("foreign_keys") or []
        foreign_keys = [FKEdge(**fk) if isinstance(fk, dict) else fk for fk in fk_raw]

        # Keys without a "|" separator cannot be split into a pair and are skipped.
        ratios_raw = d.get("composite_descriptive_ratios") or {}
        ratios = {tuple(k.split("|", 1)): v for k, v in ratios_raw.items() if "|" in k}

        return TableMetadata(
            name=d.get("name", ""),
            columns=columns,
            primary_key=list(d.get("primary_key") or []),
            foreign_keys=foreign_keys,
            partition_columns=list(d.get("partition_columns") or []),
            role=d.get("role"),
            row_count=d.get("row_count", 0),
            description=d.get("description", ""),
            composite_descriptive_ratios=ratios,
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dictionary for JSON storage.

        Returns:
            Dictionary with all TableMetadata fields, with nested columns and
            foreign keys serialized recursively.
        """
        return {
            "name": self.name,
            "columns": {k: v.to_dict() for k, v in self.columns.items()},
            "primary_key": self.primary_key,
            "foreign_keys": [asdict(fk) for fk in self.foreign_keys],
            "partition_columns": self.partition_columns,
            "role": self.role,
            "row_count": self.row_count,
            "description": self.description,
            "composite_descriptive_ratios": {
                f"{c1}|{c2}": ratio
                for (c1, c2), ratio in self.composite_descriptive_ratios.items()
            },
        }

    @property
    def column_names(self) -> list[str]:
        """Get list of column names."""
        return list(self.columns)
432
+
433
+
434
@dataclass
class SchemaGraph:
    """Schema graph with nested tables, join paths, and metadata.

    ``to_dict`` does not emit ``schema_stats``; ``from_dict`` restores it
    only when the input dictionary happens to carry that key.
    """

    tables: dict[str, TableMetadata]
    join_paths_multi: dict[str, dict[str, list[list[dict[str, Any]]]]]
    schema_hash: str
    created_at: str = ""
    enum_values: dict[str, list[str]] | None = None
    schema_stats: dict[str, Any] | None = None

    @property
    def fk_edges(self) -> list[FKEdge]:
        """Get all FK edges from tables."""
        return [fk for table in self.tables.values() for fk in table.foreign_keys]

    @property
    def table_names(self) -> list[str]:
        """Get list of table names."""
        return list(self.tables)

    def get_column(self, table: str, column: str) -> ColumnMetadata | None:
        """Get column metadata by table and column name.

        Args:
            table: Table name to look up.
            column: Column name within that table.

        Returns:
            ColumnMetadata if found, otherwise None.
        """
        table_meta = self.tables.get(table)
        if table_meta is None:
            return None
        return table_meta.columns.get(column)

    @property
    def schema_literal_text(self) -> str:
        """Render the schema as plain text.

        Tables are emitted in sorted key order. Each table gets a
        ``TABLE <name>`` header with the optional role and description,
        followed by one line per column with its data type, PK/FK markers,
        role tag and description. Identifier, audit and empty roles get no
        role tag. When enum values exist, an ``ENUM TYPES:`` section follows,
        showing at most the first ten values of each enum.

        Returns:
            The rendered text with surrounding whitespace stripped.
        """
        skip_role_tags = {ColumnRole.IDENTIFIER.value, ColumnRole.AUDIT.value, ""}
        out: list[str] = []
        for table_key in sorted(self.tables):
            tm = self.tables[table_key]
            table_header = f"TABLE {tm.name}"
            if tm.role:
                table_header += f" ({tm.role})"
            if tm.description:
                table_header += f" — {tm.description}"
            out.append(table_header)
            for c in tm.columns.values():
                pk_marker = " [PK]" if c.is_primary_key else ""
                fk_marker = f" [FK->{c.fk_target[0]}.{c.fk_target[1]}]" if c.fk_target else ""
                role_tag = f" [{c.role}]" if c.role and c.role not in skip_role_tags else ""
                hint_tag = f" — {c.description}" if c.description else ""
                out.append(f" {c.name}: {c.data_type}{pk_marker}{fk_marker}{role_tag}{hint_tag}")
        if self.enum_values:
            out.append("")
            out.append("ENUM TYPES:")
            for enum_name, values in sorted(self.enum_values.items()):
                out.append(f" {enum_name}: {values[:10]}{'...' if len(values) > 10 else ''}")
        return "\n".join(out).strip()

    @staticmethod
    def from_dict(d: dict[str, Any]) -> SchemaGraph:
        """Create SchemaGraph from dictionary.

        Missing or null ``tables`` / ``join_paths_multi`` entries are treated
        as empty mappings instead of raising.

        Args:
            d: Dictionary with keys matching SchemaGraph fields, typically
                loaded from JSON.

        Returns:
            Populated SchemaGraph with nested TableMetadata instances.
        """
        tables_raw = d.get("tables") or {}
        tables = {k: TableMetadata.from_dict(v) for k, v in tables_raw.items()}
        return SchemaGraph(
            tables=tables,
            join_paths_multi=d.get("join_paths_multi") or {},
            schema_hash=d.get("schema_hash", ""),
            created_at=d.get("created_at", ""),
            enum_values=d.get("enum_values"),
            # Previously dropped even when the caller supplied it.
            schema_stats=d.get("schema_stats"),
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dictionary for JSON storage.

        ``schema_stats`` is not part of the serialized form.

        Returns:
            Dictionary with tables (serialized recursively), join paths,
            schema hash, creation timestamp and enum values.
        """
        return {
            "tables": {k: v.to_dict() for k, v in self.tables.items()},
            "join_paths_multi": self.join_paths_multi,
            "schema_hash": self.schema_hash,
            "created_at": self.created_at,
            "enum_values": self.enum_values,
        }
536
+
537
+
538
@dataclass
class ExpansionMetadata:
    """Metadata for intent expansion operations.

    Records the operator that was applied, the parent intent id (None when
    not set), the expansion depth and the path of expansion steps.
    """

    operator: str
    parent_intent_id: str | None = None
    depth: int = 0
    expansion_path: list[str] = field(default_factory=list)

    @staticmethod
    def from_dict(d: dict[str, Any]) -> ExpansionMetadata:
        """Build an ExpansionMetadata from a plain dictionary.

        Args:
            d: Dictionary with keys matching ExpansionMetadata fields;
                missing keys fall back to '', None, 0 and [] respectively.

        Returns:
            Populated ExpansionMetadata instance.
        """
        operator = d.get("operator", "")
        parent = d.get("parent_intent_id")
        depth = d.get("depth", 0)
        path = d.get("expansion_path", [])
        return ExpansionMetadata(
            operator=operator,
            parent_intent_id=parent,
            depth=depth,
            expansion_path=path,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary."""
        return asdict(self)
569
+
570
+
571
@dataclass
class CteOutputColumnMeta:
    """Metadata for CTE output column including source, role, and aggregation info.

    ``value_type`` is derived from ``data_type`` in ``__post_init__`` when it
    is left empty. The three ``get_valid_*`` accessors prefer the explicitly
    stored lists and otherwise fall back to defaults driven by the
    ``filterable`` / ``aggregatable`` flags.
    """

    source: str
    base_column: str = ""
    agg_func: str = ""
    role: str | None = None
    filterable: bool = True
    aggregatable: bool = True
    data_type: str = "unknown"
    value_type: str = ""
    groupable: bool = True
    valid_filter_ops: list[str] = field(default_factory=list)
    valid_aggregations: list[str] = field(default_factory=list)
    valid_having_ops: list[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Auto-compute value_type from data_type when not provided."""
        if not self.value_type and self.data_type:
            self.value_type = _data_type_to_value_type(self.data_type)

    def get_valid_filter_ops(self) -> list[str]:
        """Return valid filter operators for this CTE output column.

        Explicit ``valid_filter_ops`` win and are extended with the null
        checks. Duplicates are removed while keeping first-seen order, so the
        result is identical from run to run (a plain ``set`` round-trip would
        order the strings differently under hash randomisation). Without
        explicit operators, a filterable column gets the default comparison
        and membership operators; a non-filterable one only the null checks.

        Returns:
            List of operator strings.
        """
        null_ops = ["is null", "is not null"]
        if self.valid_filter_ops:
            return list(dict.fromkeys([*self.valid_filter_ops, *null_ops]))
        if self.filterable:
            return [
                "=",
                "!=",
                "<",
                "<=",
                ">",
                ">=",
                "in",
                "not in",
                "is null",
                "is not null",
            ]
        return null_ops

    def get_valid_aggregations(self) -> set[str]:
        """Return valid aggregation functions for this CTE output column.

        Returns:
            Lowercased explicit aggregations when stored; otherwise the full
            default set for aggregatable columns, or just 'count'.
        """
        if self.valid_aggregations:
            return {agg.lower() for agg in self.valid_aggregations}
        if self.aggregatable:
            return {"count", "sum", "avg", "min", "max"}
        return {"count"}

    def get_valid_having_ops(self) -> list[str]:
        """Return valid HAVING clause operators for this CTE output column.

        Returns:
            A copy of the explicit operators when stored; otherwise the
            comparison operators for aggregatable columns, or an empty list.
        """
        if self.valid_having_ops:
            return list(self.valid_having_ops)
        if self.aggregatable:
            return ["=", "!=", "<", "<=", ">", ">="]
        return []

    @staticmethod
    def from_dict(d: dict[str, Any]) -> CteOutputColumnMeta:
        """Create CteOutputColumnMeta from dictionary.

        Args:
            d: Dictionary with keys matching CteOutputColumnMeta fields.
                A missing ``source`` defaults to 'passthrough'.

        Returns:
            Populated CteOutputColumnMeta instance.
        """
        return CteOutputColumnMeta(
            source=d.get("source", "passthrough"),
            base_column=d.get("base_column", ""),
            agg_func=d.get("agg_func", ""),
            role=d.get("role"),
            filterable=d.get("filterable", True),
            aggregatable=d.get("aggregatable", True),
            data_type=d.get("data_type", "unknown"),
            value_type=d.get("value_type", ""),
            groupable=d.get("groupable", True),
            valid_filter_ops=d.get("valid_filter_ops", []),
            valid_aggregations=d.get("valid_aggregations", []),
            valid_having_ops=d.get("valid_having_ops", []),
        )

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary."""
        return asdict(self)
659
+
660
+
661
@dataclass
class RetryFailureContext:
    """Structured failure context for LLM retry guidance.

    All fields are required; the class is a plain data holder.
    NOTE(review): ``missing_tables`` is presumably ``required_tables`` minus
    ``used_tables`` - it is supplied by the caller, not computed here.
    """

    # Label for the kind of failure.
    failure_type: str
    # Tables the query was expected to use.
    required_tables: list[str]
    # Tables the failed attempt used.
    used_tables: set[str]
    # Tables reported as missing from the attempt.
    missing_tables: set[str]
    # Number of the attempt this context describes.
    attempt_number: int
670
+
671
+
672
@dataclass
class SQLShape:
    """Structural features of a SQL query for comparison.

    Holds counts of joins, CTEs, filters and HAVING conditions together with
    flags for GROUP BY, aggregation and DISTINCT.
    """

    num_joins: int
    has_group_by: bool
    has_agg: bool
    num_cte: int = 0
    num_filters: int = 0
    num_having: int = 0
    has_distinct: bool = False

    @staticmethod
    def from_dict(d: dict[str, Any]) -> SQLShape:
        """Build a SQLShape from a plain dictionary.

        Args:
            d: Dictionary with keys matching SQLShape fields; missing counts
                default to 0 and missing flags to False.

        Returns:
            Populated SQLShape instance.
        """
        fallbacks = {
            "num_joins": 0,
            "has_group_by": False,
            "has_agg": False,
            "num_cte": 0,
            "num_filters": 0,
            "num_having": 0,
            "has_distinct": False,
        }
        kwargs = {key: d.get(key, default) for key, default in fallbacks.items()}
        return SQLShape(**kwargs)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary."""
        return asdict(self)
707
+
708
+
709
@dataclass
class IntentIssue:
    """Issue detected during intent validation or resolution.

    ``severity`` is a free-form string; ``IntentValidationResult.is_valid``
    treats the value 'error' as blocking.
    """

    issue_id: str
    category: str
    severity: str
    message: str
    context: dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def from_dict(d: dict[str, Any]) -> IntentIssue:
        """Build an IntentIssue from a plain dictionary.

        Args:
            d: Dictionary with keys matching IntentIssue fields. Missing
                text fields default to '', ``severity`` to 'error' and
                ``context`` to an empty dict.

        Returns:
            Populated IntentIssue instance.
        """
        issue_id = d.get("issue_id", "")
        category = d.get("category", "")
        severity = d.get("severity", "error")
        message = d.get("message", "")
        context = d.get("context", {})
        return IntentIssue(
            issue_id=issue_id,
            category=category,
            severity=severity,
            message=message,
            context=context,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary."""
        return asdict(self)
742
+
743
+
744
@dataclass
class IntentValidationResult:
    """Result container for intent validation with issue tracking."""

    issues: list[IntentIssue] = field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """Return True when no issue has severity 'error'."""
        for issue in self.issues:
            if issue.severity == "error":
                return False
        return True

    @property
    def issue_signature(self) -> str:
        """Return the signature of all issue ids, as computed by ``issue_sig``.

        Intended for ABAB detection.
        """
        issue_ids = [issue.issue_id for issue in self.issues]
        return issue_sig(issue_ids)

    @staticmethod
    def from_dict(d: dict[str, Any]) -> IntentValidationResult:
        """Build an IntentValidationResult from a plain dictionary.

        Args:
            d: Dictionary with an 'issues' list of serialized IntentIssue
                dicts; a missing key yields no issues.

        Returns:
            Populated IntentValidationResult with deserialized IntentIssue
            objects.
        """
        parsed = [IntentIssue.from_dict(raw) for raw in d.get("issues", [])]
        return IntentValidationResult(issues=parsed)

    def to_dict(self) -> dict[str, Any]:
        """Return a dictionary with the serialized issues under 'issues'."""
        serialized = [issue.to_dict() for issue in self.issues]
        return {"issues": serialized}
780
+
781
+
782
@dataclass
class TemplateStats:
    """Template acceptance/rejection statistics."""

    accept: int = 0
    reject: int = 0

    @staticmethod
    def from_dict(d: dict[str, Any]) -> TemplateStats:
        """Build a TemplateStats from a plain dictionary.

        Both counters are passed through ``int()``, so numeric strings are
        accepted; missing keys count as 0.

        Args:
            d: Dictionary with 'accept' and 'reject' keys.

        Returns:
            Populated TemplateStats instance.
        """
        accepted = int(d.get("accept", 0))
        rejected = int(d.get("reject", 0))
        return TemplateStats(accept=accepted, reject=rejected)

    def to_dict(self) -> dict[str, Any]:
        """Return the counters as a plain dictionary."""
        return asdict(self)
809
+
810
+
811
@dataclass
class QSimSkeleton:
    """Structural skeleton for QSim intent before LLM fills semantics.

    Describes only the shape of a query: which tables it touches, how many
    filters and group-by entries it has, and which clauses are present.
    """

    # Tables the skeleton covers.
    tables: list[str]
    has_aggregation: bool
    # Counts of filter and GROUP BY entries.
    num_filters: int
    num_groupby: int
    has_orderby: bool
    has_having: bool
    # Optional flags; both default to False.
    has_distinct: bool = False
    has_expr_comparison: bool = False
823
+
824
+
825
@dataclass
class SkeletonPool:
    """Tiered skeleton pool with round-robin table set selection.

    Holds three tiers (a, b, c) of skeleton lists, each keyed by a table-set
    key, plus per-tier integer indices and a current table index.
    NOTE(review): the selection logic lives outside this class; the index
    fields are presumably cursors into the per-tier lists and into
    ``table_set_keys`` - confirm against the code that consumes the pool.
    """

    # Skeleton lists per tier, keyed by table-set key.
    tier_a_by_table_set: dict[str, list[QSimSkeleton]]
    tier_b_by_table_set: dict[str, list[QSimSkeleton]]
    tier_c_by_table_set: dict[str, list[QSimSkeleton]]
    # Table-set keys.
    table_set_keys: list[str]
    # Integer index per table-set key, one mapping per tier.
    tier_a_indices: dict[str, int]
    tier_b_indices: dict[str, int]
    tier_c_indices: dict[str, int]
    # Starts at 0.
    current_table_idx: int = 0
837
+
838
+
839
@dataclass
class QueryPlan:
    """Output of SQL generation stage.

    Bundles the SQL text with the id and path signature of the join
    candidate that was chosen for it.
    """

    # SQL text.
    sql: str
    # Id of the chosen join candidate.
    chosen_join_candidate_id: str
    # Signature of the chosen join path, as a list of strings.
    chosen_join_path_signature: list[str]
846
+
847
+
848
@dataclass
class ValidationResult:
    """Output of validation stage.

    ``valid`` is supplied by the caller; it is not derived from ``errors``.
    The collection fields each default to a fresh empty container.
    """

    valid: bool
    # Error and warning messages.
    errors: list[str] = field(default_factory=list)
    warnings: list[str] = field(default_factory=list)
    # NOTE(review): element type is not annotated; presumably table names.
    extra_tables: set = field(default_factory=set)
856
+
857
+
858
@dataclass
class TemplateInfo:
    """User-facing template information with obfuscated internals.

    All fields are required strings.
    """

    # Template id.
    id: str
    # Natural-language text for the template.
    natural_language: str
    # Example question for the template.
    example_question: str
    # Trust level and source labels.
    trust_level: str
    source: str
867
+
868
+
869
@dataclass
class RejectedTemplateInfo:
    """User-facing rejected template with generic categories.

    All fields are required.
    """

    # Template id.
    id: str
    # Natural-language text for the template.
    natural_language: str
    # Example question for the template.
    example_question: str
    # Rejection category label.
    rejection_category: str
    # Rejection count.
    rejection_count: int
878
+
879
+
880
@dataclass
class SimulatorSummary:
    """High-level simulator execution statistics.

    All values are supplied by the caller; ``success_rate`` is stored as
    given and is not computed from the counts here.
    """

    version: int
    # Total, success and failure counts.
    total: int
    success: int
    failed: int
    # NOTE(review): scale (0-1 vs. percent) is not visible here - confirm.
    success_rate: float
889
+
890
+
891
@dataclass
class QSimSummary:
    """Question simulator run metadata with timestamp, counts, and seed."""

    timestamp: str
    num_intents: int
    num_questions: int
    seed: int

    @staticmethod
    def from_dict(d: dict[str, Any]) -> QSimSummary:
        """Build a QSimSummary from a plain dictionary.

        Args:
            d: Dictionary with keys matching QSimSummary fields. A missing
                timestamp defaults to '', missing counts to 0 and a missing
                seed to 42.

        Returns:
            Populated QSimSummary instance.
        """
        timestamp = d.get("timestamp", "")
        intents = d.get("num_intents", 0)
        questions = d.get("num_questions", 0)
        seed = d.get("seed", 42)
        return QSimSummary(
            timestamp=timestamp,
            num_intents=intents,
            num_questions=questions,
            seed=seed,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary."""
        return asdict(self)
922
+
923
+
924
@dataclass
class QSimRange:
    """User-facing range limits for QSim parameters.

    NOTE(review): each pair is presumably (min, max); whether the bounds are
    inclusive is not visible here - confirm against the validation code.
    """

    # Pair of ints for the number of intents.
    num_intents_range: tuple[int, int]
    # Pair of ints for the number of questions.
    num_questions_range: tuple[int, int]
930
+
931
+
932
@dataclass
class SchemaLimits:
    """Internal schema-based limits for adaptive parameter validation.

    Three required integer limits, for filters, GROUP BY entries and tables.
    """

    max_filters: int
    max_groupby: int
    max_tables: int
939
+
940
+
941
@dataclass
class SkeletonLimits:
    """Schema-derived limits for QSim skeleton enumeration.

    Computed from column capabilities (filterable, groupable, aggregatable)
    for a given table set, and used when generating valid skeleton
    combinations. The values are supplied by the caller; nothing is computed
    in this class.
    """

    max_filters: int
    max_groupby: int
    max_having: int