aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
text2sql/config.py ADDED
@@ -0,0 +1,1063 @@
1
+ """Configuration classes and pipeline-wide constants for the text-to-SQL engine.
2
+
3
+ Holds all tunable thresholds, flag sets, and environment-driven settings used across pipeline stages.
4
+
5
+ PolicyConfig centralizes scoring penalties and safety rules.
6
+
7
+ EngineConfig selects the active runtime backend (PostgreSQL or Databricks) and LLM credentials.
8
+
9
+ QSimConfig and SimulatorConfig control synthetic question and intent generation.
10
+
11
+ Module-level constants define valid value types, operators, aggregation functions, and column-type mappings used for validation and intent resolution.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import re
18
+ from typing import ClassVar
19
+
20
# JSON-Schema-style description of an intent payload. Only "tables" is
# required at the top level; every filter/having entry must carry an "op",
# and every CTE step must carry a "cte_name".
INTENT_SCHEMA = {
    "type": "object",
    "required": ["tables"],
    "properties": {
        # Either a list of table names or a single table name.
        "tables": {"oneOf": [{"type": "array", "items": {"type": "string"}}, {"type": "string"}]},
        "aggregation_targets": {"type": "object", "additionalProperties": {"type": "string"}},
        "grain": {"type": "string"},
        "select_cols": {"type": "array", "items": {"oneOf": [{"type": "string"}, {"type": "object"}]}},
        "group_by_cols": {"type": "array", "items": {"type": "string"}},
        "order_by_cols": {"type": "array", "items": {"oneOf": [{"type": "string"}, {"type": "object"}]}},
        "filters_param": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["op"],
                "properties": {
                    # Plain string-typed fields, in their declared order.
                    **{
                        field: {"type": "string"}
                        for field in ("left_expr", "left_col", "op", "right_expr", "right_col", "value_type")
                    },
                    "value": {},  # unconstrained
                    "bool_op": {"type": "string"},
                    "filter_group": {"oneOf": [{"type": "integer"}, {"type": "null"}]},
                },
            },
        },
        "having_param": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["op"],
                "properties": {
                    # Plain string-typed fields, in their declared order.
                    **{
                        field: {"type": "string"}
                        for field in ("left_expr", "left_agg", "op", "right_expr", "right_agg", "value_type")
                    },
                    "value": {},  # unconstrained
                    "bool_op": {"type": "string"},
                    "filter_group": {"oneOf": [{"type": "integer"}, {"type": "null"}]},
                },
            },
        },
        "cte_steps": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["cte_name"],
                "properties": {
                    "cte_name": {"type": "string"},
                    "description": {"type": "string"},
                    "tables": {"type": "array", "items": {"type": "string"}},
                    "grain": {"type": "string"},
                    # Nested clauses are only checked for being arrays.
                    **{
                        field: {"type": "array"}
                        for field in (
                            "select_cols",
                            "group_by_cols",
                            "order_by_cols",
                            "filters_param",
                            "having_param",
                            "output_columns",
                        )
                    },
                },
            },
        },
        "limit": {"oneOf": [{"type": "integer"}, {"type": "null"}]},
        "natural_language": {"type": "string"},
        "intent_status": {"type": "string"},
    },
}
104
# --- Aggregate and scalar function vocabularies ---------------------------
VALID_AGGREGATION_FUNCTIONS = {"count", "sum", "avg", "min", "max"}

# Scalar functions grouped by the kind of argument they operate on.
SCALAR_FUNCTIONS_STRING = {"upper", "lower", "trim", "ltrim", "rtrim", "length"}
SCALAR_FUNCTIONS_NUMERIC = {"abs", "round", "floor", "ceil"}
SCALAR_FUNCTIONS_TEMPORAL = {"date_trunc", "date_part", "extract", "year", "month", "day"}

# Every accepted scalar function: the three groups above plus "coalesce".
VALID_SCALAR_FUNCTIONS = (
    SCALAR_FUNCTIONS_STRING | SCALAR_FUNCTIONS_NUMERIC | SCALAR_FUNCTIONS_TEMPORAL | {"coalesce"}
)

SCALAR_FUNCTIONS_LEADING_ARG = {"date_trunc", "date_part", "extract"}
DISALLOWED_EXTRACT_UNITS = {"epoch"}
# Same members as SCALAR_FUNCTIONS_NUMERIC, kept as an independent set object.
SCALAR_FUNCTIONS_AGG_COMPATIBLE = set(SCALAR_FUNCTIONS_NUMERIC)

# --- Operators, grains and row-count expectations -------------------------
VALID_ARITH_OPS = {"+", "-", "*", "/"}
VALID_GRAINS = {"scalar", "grouped", "row_level"}
VALID_EXPECTED_ROWS = {"one", "few", "many"}
VALID_FILTER_OPS = {
    "=", "!=", "<", "<=", ">", ">=",
    "like", "ilike", "not like", "not ilike",
    "in", "not in", "between",
    "is null", "is not null",
}
VALID_HAVING_OPS = {"=", "!=", "<", "<=", ">", ">=", "in", "not in", "between"}
# Same members as VALID_AGGREGATION_FUNCTIONS, kept as an independent set object.
VALID_AGG_FUNCS = set(VALID_AGGREGATION_FUNCTIONS)
159
# Lower-case function names and keywords; judging by the name, these are
# identifiers skipped when table names are qualified for Databricks
# (NOTE(review): confirm against the consumer of this constant).
DATABRICKS_TABLE_QUALIFY_SKIP_IDENTIFIERS: frozenset[str] = frozenset(
    (
        "avg case cast coalesce count date_part date_trunc extract lateral lower max "
        "min nullif replace substring sum trim try_cast unnest upper values"
    ).split()
)
184
# Canonical value types accepted by the pipeline.
VALID_VALUE_TYPES = {
    "integer", "string", "date", "number",
    "null", "boolean", "date_window", "date_diff",
}

# Time units accepted per construct. The window/diff/interval sets hold the
# same members but are separate set objects.
VALID_DATE_WINDOW_UNITS = {"day", "week", "month", "year", "hour", "minute", "second"}
VALID_DATE_DIFF_UNITS = set(VALID_DATE_WINDOW_UNITS)
VALID_EXTRACT_UNITS = {
    "year", "quarter", "month", "week", "day",
    "hour", "minute", "second",
    "dow", "doy",
}
VALID_INTERVAL_UNITS = set(VALID_DATE_WINDOW_UNITS)

VALID_FILTER_VALUE_TYPES = {
    "categorical", "numeric", "numeric_categorical",
    "temporal", "boolean", "null",
}
VALID_HAVING_VALUE_TYPES = {"number", "integer"}

# Maps raw type spellings onto canonical value types. Canonical names map to
# themselves. Insertion order matches the original literal.
VALUE_TYPE_NORMALIZATION = {
    **dict.fromkeys(("timestamp", "datetime", "timestamptz", "time"), "date"),
    **dict.fromkeys(("numeric", "decimal", "float", "double", "real", "money"), "number"),
    **dict.fromkeys(("bigint", "smallint", "int", "int2", "int4", "int8", "serial"), "integer"),
    **dict.fromkeys(("varchar", "char", "text", "bpchar", "uuid"), "string"),
    "bool": "boolean",
    "enum": "string",
    **{
        canonical: canonical
        for canonical in (
            "integer", "string", "date", "number",
            "boolean", "null", "date_window", "date_diff",
        )
    },
}
244
# Filter operators accepted for boolean-valued columns.
BOOLEAN_FILTER_OPS = {"=", "!=", "in", "not in", "is null", "is not null"}

# Categorical columns: equality, pattern matching, membership and null checks.
CATEGORICAL_FILTER_OPS = {
    "=", "!=",
    "like", "ilike", "not like", "not ilike",
    "in", "not in",
    "is null", "is not null",
}

# Numeric-categorical columns: comparisons, membership, ranges and null checks.
NUMERIC_CATEGORICAL_FILTER_OPS = {
    "=", "!=", "<", "<=", ">", ">=",
    "in", "not in", "between",
    "is null", "is not null",
}
270
# Filter operators accepted for numeric columns.
NUMERIC_FILTER_OPS = frozenset(
    {
        "=",
        "!=",
        "<",
        "<=",
        ">",
        ">=",
        "in",
        "not in",
        "between",
        "is null",
        "is not null",
    }
)
# List form of NUMERIC_FILTER_OPS. Sorted so the element order is identical in
# every process: iterating a frozenset of strings depends on per-process hash
# randomisation, which would make index- or seed-based selection from this
# list irreproducible between runs.
CTE_NUMERIC_FILTER_OPS = sorted(NUMERIC_FILTER_OPS)
286
# Temporal columns: comparisons, membership, ranges and null checks.
TEMPORAL_FILTER_OPS = {
    "=", "!=", "<", "<=", ">", ">=",
    "in", "not in", "between",
    "is null", "is not null",
}

# FK_FILTER_OPS has the same members as TEMPORAL_FILTER_OPS but is an
# independent set object, so mutating one never affects the other.
FK_FILTER_OPS = set(TEMPORAL_FILTER_OPS)
312
# Aggregations permitted per column role; AUDIT columns allow none.
ROLE_ALLOWED_AGGREGATIONS = {
    "IDENTIFIER": {"count"},
    "CATEGORICAL": {"count", "min", "max"},
    "NUMERIC_CATEGORICAL": {"count", "min", "max"},
    "NUMERIC_MEASURE": {"count", "sum", "avg", "min", "max"},
    "TEMPORAL": {"count", "min", "max"},
    "BOOLEAN": {"count", "sum"},
    "FREE_TEXT": {"count"},
    "AUDIT": set(),
}
NUMERIC_ONLY_AGGREGATIONS = {"sum", "avg"}

# Maps lower-case SQL column type names to pipeline value types.
# Insertion order matches the original literal.
COLUMN_TYPE_TO_VALUE_TYPE = {
    **dict.fromkeys(
        (
            "int", "integer", "bigint", "smallint", "tinyint", "int2", "int4", "int8",
            "long", "short", "serial", "bigserial", "smallserial",
        ),
        "integer",
    ),
    **dict.fromkeys(
        (
            "float", "double", "decimal", "numeric", "real", "float4", "float8",
            "double precision", "money",
        ),
        "number",
    ),
    **dict.fromkeys(
        (
            "varchar", "text", "char", "string", "character varying", "bpchar",
            "nchar", "nvarchar", "ntext", "clob",
        ),
        "string",
    ),
    **dict.fromkeys(
        (
            "date", "timestamp", "timestamptz", "datetime", "time",
            "timestamp without time zone", "timestamp with time zone",
        ),
        "date",
    ),
    **dict.fromkeys(("boolean", "bool"), "boolean"),
}

# Value types each aggregation function may be applied to.
AGGREGATION_ALLOWED_COLUMN_TYPES = {
    "count": ["integer", "string", "date", "number", "boolean"],
    "sum": ["integer", "number"],
    "avg": ["integer", "number"],
    "min": ["integer", "number", "string", "date"],
    "max": ["integer", "number", "string", "date"],
}
373
# Regex fragments for column names excluded from filter generation.
EXCLUDED_FILTER_PATTERNS = [
    r"password", r"picture", r"photo", r"image", r"blob",
    r"address.?2", r"address_line.?2",
]

# For each recognised two-valued vocabulary: the unordered value pair mapped
# to its (true_value, false_value) spelling.
BOOLEAN_TRUE_FALSE_MAP: dict[frozenset[str], tuple[str, str]] = {
    frozenset(true_false): true_false
    for true_false in (
        ("1", "0"),
        ("true", "false"),
        ("yes", "no"),
        ("y", "n"),
        ("t", "f"),
        ("on", "off"),
        ("active", "inactive"),
        ("enabled", "disabled"),
    )
}

# The set of recognised value pairs — exactly the keys of the map above.
BOOLEAN_VALUE_PATTERNS = frozenset(BOOLEAN_TRUE_FALSE_MAP)
404
# Type-name tokens grouped by broad category.
NUMERIC_TYPE_TOKENS = frozenset(
    {
        "int", "integer", "bigint", "smallint", "tinyint", "long", "short", "serial",
        "float", "double", "decimal", "numeric", "real", "number", "money",
    }
)
STRING_TYPE_TOKENS = frozenset(
    {"char", "varchar", "text", "string", "clob", "nchar", "nvarchar", "ntext", "bpchar"}
)
DATE_TYPE_TOKENS = frozenset({"date", "time", "timestamp", "timestamptz", "interval"})

# Upper-case aggregate call prefixes: "COUNT(", "SUM(", "AVG(", "MIN(", "MAX(".
AGG_PREFIXES = frozenset(f"{name}(" for name in ("COUNT", "SUM", "AVG", "MIN", "MAX"))

# Mirror image of each inequality operator.
OP_FLIP: dict[str, str] = {">": "<", "<": ">", ">=": "<=", "<=": ">="}

# Scalar functions listed as integer-valued, and the wider numeric-result set
# formed by adding the rounding/absolute-value functions.
INTEGER_SCALARS = frozenset({"extract", "date_part", "year", "month", "day", "length"})
NUMERIC_RESULT_SCALARS = INTEGER_SCALARS | {"abs", "round", "floor", "ceil"}

NUMERIC_RESULT_AGGS = frozenset({"count", "sum", "avg"})
NUMERIC_RESULT_OPS = frozenset({"=", "!=", "<", "<=", ">", ">="})
ARITHMETIC_ROLES = frozenset({"numeric_measure", "numeric_categorical"})
465
# Set of ordered (left_type, right_type) pairs listed as compatible.
# The relation is NOT symmetric: for example ("float", "numeric") is present
# but ("numeric", "float") is not, so a lookup must use the pair in the
# intended order. The pairs are expanded from a left-type -> right-types table.
COMPATIBLE_TYPE_PAIRS = {
    (left, right)
    for left, rights in {
        # Integer family
        "int": ("int", "integer", "bigint", "smallint", "tinyint", "long", "short", "numeric", "decimal"),
        "integer": ("integer", "int", "bigint", "smallint", "tinyint", "long", "short", "number"),
        "bigint": ("bigint", "int", "integer", "smallint", "tinyint", "long", "numeric"),
        "smallint": ("smallint", "int", "integer", "bigint", "tinyint"),
        "tinyint": ("tinyint", "int", "integer", "smallint", "bigint"),
        "long": ("long", "int", "integer", "bigint"),
        "short": ("short", "int", "integer", "smallint", "tinyint"),
        # Exact and floating-point numerics
        "numeric": ("numeric", "decimal", "int", "integer", "bigint"),
        "decimal": ("decimal", "numeric", "int", "integer"),
        "float": ("float", "double", "real", "numeric"),
        "double": ("double", "float", "real"),
        "real": ("real", "float", "double"),
        # Character types
        "varchar": ("varchar", "text", "char", "string"),
        "text": ("text", "varchar", "char", "string"),
        "char": ("char", "varchar", "text", "string"),
        "string": ("string", "varchar", "text", "char"),
        # Date/time types
        "date": ("date", "timestamp", "timestamptz"),
        "timestamp": ("timestamp", "date", "timestamptz"),
        "timestamptz": ("timestamptz", "timestamp", "date"),
        # Booleans
        "boolean": ("boolean", "bool"),
        "bool": ("bool", "boolean"),
        # Pipeline-level generic "number" value type
        "number": ("number", "integer", "numeric", "decimal", "float", "double", "real"),
    }.items()
    for right in rights
}
565
# Matches a quantity comparison phrase followed by an integer, e.g.
# "more than 5" or "at least 10" (case-insensitive).
AGG_QUANTITY_RE = re.compile(
    r"\b(?:more\s+than|greater\s+than|at\s+least|fewer\s+than|less\s+than|"
    r"no\s+more\s+than|no\s+fewer\s+than|over|under|exceeding|"
    r"above|below|a\s+minimum\s+of|a\s+maximum\s+of)\s+\d+\b",
    flags=re.I,
)
# Matches "in <N> <word>", "in exactly <N> <word>" or "exactly <N> <word>";
# group 1 is the number, group 2 the following word.
COUNT_THRESHOLD_TABLE_RE = re.compile(
    r"\b(?:in\s+(?:exactly\s+)?|exactly\s+)(\d+)\s+(\w+)\b",
    flags=re.I,
)

# Ordered lists of aggregate functions and HAVING comparison operators used
# for CTEs (NOTE(review): the consumer is not visible in this file).
CTE_FULL_AGGS = ["count", "sum", "avg", "min", "max"]
CTE_DEFAULT_AGGS = ["count", "min", "max"]
CTE_HAVING_COMPARE_OPS = ["=", "!=", "<", "<=", ">", ">="]

# Default extra argument list per scalar function name.
SCALAR_FUNC_DEFAULTS: dict[str, list] = {
    "round": [2],
    "trunc": [0],
    "truncate": [0],
    "coalesce": [0],
    "date_trunc": ["month"],
    "date_part": ["month"],
    "extract": ["year"],
}

# Ordered (keyword, unit) pairs; note that keyword "date" maps to unit "year".
DATE_UNIT_KEYWORDS = [
    ("month", "month"),
    ("day", "day"),
    ("week", "week"),
    ("quarter", "quarter"),
    ("year", "year"),
    ("date", "year"),
]

STRUCTURAL_IDENTITY_VALUES = frozenset({0, 1})
IN_OPS = frozenset({"in", "not in"})
# A comma with optional surrounding whitespace and optional adjacent quotes.
IN_STRING_SEPARATORS = re.compile(r"['\"]?\s*,\s*['\"]?")

# String spellings treated as boolean true / false.
BOOLEAN_TRUTHY_VALUES = frozenset({"1", "true", "t", "yes", "y", "on", "active", "enabled"})
BOOLEAN_FALSY_VALUES = frozenset({"0", "false", "f", "no", "n", "off", "inactive", "disabled"})

# Case-insensitive replacement for each eligible operator; the eligible set
# is exactly the keys of the map.
ILIKE_OP_MAP: dict[str, str] = {
    "=": "ilike",
    "!=": "not ilike",
    "like": "ilike",
    "not like": "not ilike",
}
ILIKE_ELIGIBLE_OPS = frozenset(ILIKE_OP_MAP)
607
# Lower-case SQL data type names treated as numeric.
NUMERIC_DATA_TYPES = frozenset(
    {
        "integer", "int", "int2", "int4", "int8", "smallint", "bigint",
        "serial", "bigserial",
        "numeric", "decimal", "real", "double precision",
        "float", "float4", "float8",
        "money",
    }
)

# Operator to use when the two sides of a comparison are swapped: the four
# inequalities flip direction, every other listed operator maps to itself.
REVERSE_OP_MAP = {
    ">": "<",
    "<": ">",
    ">=": "<=",
    "<=": ">=",
    **{
        unchanged: unchanged
        for unchanged in (
            "=", "!=",
            "like", "not like", "ilike", "not ilike",
            "in", "not in",
            "is null", "is not null",
        )
    },
}

# Matches a threshold phrase followed by an integer, e.g. "at most 10"
# (case-insensitive; no trailing word boundary is required).
THRESHOLD_RE = re.compile(
    r"\b(?:more\s+than|less\s+than|at\s+least|at\s+most|over|under|above|below"
    r"|exceeds?|greater\s+than|fewer\s+than|minimum\s+of|maximum\s+of)\s+\d+",
    flags=re.I,
)
649
# Natural-language words hinting at an aggregate (case-insensitive).
AGG_KEYWORDS_RE = re.compile(
    r"\b(?:total|count|number\s+of|average|avg|sum|how\s+many)\b",
    flags=re.I,
)
# Whole-string "FUNC(arg)": group 1 is the function name, group 2 the
# argument with surrounding whitespace trimmed.
AGG_PATTERN = re.compile(
    r"^(COUNT|SUM|AVG|MIN|MAX)\s*\(\s*(.+?)\s*\)$",
    flags=re.I,
)
# "<word>.<word>" reference; groups are the two parts around the dot.
TABLE_COL_PATTERN = re.compile(r"(\w+)\.(\w+)")

# Candidate literal values for HAVING conditions, per aggregate family.
HAVING_COUNT_VALUES = [1, 2, 3, 5, 10, 15, 20, 25, 50, 100]
HAVING_SUM_AVG_VALUES = [10.0, 50.0, 100.0, 250.0, 500.0, 750.0, 1000.0]
HAVING_MIN_MAX_VALUES = [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0]

# Opening phrases for generated questions, grouped by question shape.
QUESTION_STARTS_AGG = [
    "How many",
    "What is the total", "What is the average", "What is the minimum", "What is the maximum",
    "Find the sum of",
    "Calculate the",
    "Show the count of",
    "Get the total",
]
QUESTION_STARTS_LIST = [
    "List all", "Show me", "What are", "Which", "Find",
    "Display", "Get", "Return", "Retrieve",
]
QUESTION_STARTS_GROUP = [
    "Show me", "What is", "Group", "Break down",
    "Summarize", "Calculate", "Find the", "Get the",
]
687
+
688
# The four inequality operators.
RANGE_OPS = frozenset({">", "<", ">=", "<="})

# Matches any string that starts with the word COUNT (case-insensitive).
# NOTE(review): the name suggests it detects impossible HAVING clauses, but
# the pattern itself only checks the COUNT prefix — confirm with the caller.
IMPOSSIBLE_HAVING_RE = re.compile(r"^COUNT\b.*", flags=re.I)

# Lower-case SQL keywords.
SQL_KEYWORDS = frozenset(
    {
        # query clauses
        "select", "from", "distinct", "where", "group", "order", "having", "limit",
        # joins and aliasing
        "join", "inner", "outer", "left", "right", "cross", "on", "as",
        # DML / DDL
        "insert", "update", "delete", "create", "drop", "alter",
        "table", "index", "view", "into", "values", "set",
        # boolean / membership operators
        "and", "or", "not", "in",
    }
)

# An integer or decimal literal bounded by word boundaries.
NUMERIC_LITERAL_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
# "top 10", "first 5", "bottom 3", ... (case-insensitive).
TOP_N_RE = re.compile(r"\b(?:top|first|bottom|last|least|most)\s+\d+\b", flags=re.I)
# The words "distinct" or "unique" (case-insensitive).
DISTINCT_RE = re.compile(r"\b(?:distinct|unique)\b", flags=re.I)
732
+
733
+
734
def normalize_value_type(value_type: str) -> str:
    """Map a raw value-type string onto one of the pipeline's value types.

    The input is lower-cased and stripped, then looked up in
    VALUE_TYPE_NORMALIZATION. If it is not a key there but is already a
    member of VALID_VALUE_TYPES it is returned as-is.

    Args:
        value_type: Raw value type string (may be empty or None).

    Returns:
        The normalized value type, or 'string' when the input is empty or
        not recognised.
    """
    fallback = "string"
    if not value_type:
        return fallback

    key = value_type.lower().strip()
    try:
        return VALUE_TYPE_NORMALIZATION[key]
    except KeyError:
        # Not an alias; accept it only if it is already a canonical type.
        return key if key in VALID_VALUE_TYPES else fallback
753
+
754
+
755
def normalize_column_type(col_type: str) -> str:
    """Strip type parameters from a SQL type string for mapping lookup.

    Any parenthesised parameter list is removed, whatever it contains, so
    'varchar(255)', 'numeric( 10 , 2 )', 'varchar(max)' and "enum('a','b')"
    all reduce to the bare type name. Whitespace is then collapsed, so that
    'timestamp (6) without time zone' becomes 'timestamp without time zone'
    and can match a mapping key exactly.

    Args:
        col_type: Raw SQL type string such as 'varchar(255)' or
            'numeric(10,2)'. An empty or None value yields ''.

    Returns:
        Lowercased type name with parameter suffixes removed and runs of
        whitespace collapsed to single spaces.
    """
    if not col_type:
        # Nothing to normalize; '' will not match any mapping key.
        return ""
    # Drop "(…)" groups together with any whitespace directly before them.
    without_params = re.sub(r"\s*\([^()]*\)", "", col_type.lower())
    # split()/join trims the ends and collapses internal whitespace runs.
    return " ".join(without_params.split())
770
+
771
+
772
class PolicyConfig:
    """Pure thresholds, penalties, safety rules — no runtime state."""

    # --- Diagnostics ------------------------------------------------------
    DEBUG: ClassVar[bool] = True
    VERBOSE: ClassVar[bool] = True

    # --- Cache regeneration switches (all off by default) -----------------
    REGENERATE_TEMPLATE_STORE: ClassVar[bool] = False
    REGENERATE_SCHEMA_GRAPH: ClassVar[bool] = False
    REGENERATE_SKELETON_CACHE: ClassVar[bool] = False

    MAX_REPAIR_LOOPS = 2

    # --- Column classification thresholds ---------------------------------
    CATEGORICAL_MAX_CARDINALITY = 50
    CATEGORICAL_MAX_RATIO = 0.05
    FREE_TEXT_CATEGORICAL_MAX_CARDINALITY = 200
    IDENTIFIER_MIN_UNIQUENESS = 0.98
    CATEGORICAL_SAMPLE_SIZE = 20

    # --- Acceptance and matching thresholds -------------------------------
    AUTO_PROCEED_THRESHOLD = 0.85
    FINAL_SQL_AUTO_ACCEPT_THRESHOLD = 0.95
    FUZZY_MATCH_MAX_DISTANCE = 2

    PENALTY_CAP = 0.30

    # --- Trust promotion / demotion ---------------------------------------
    TRUST_PROMOTE_MIN_TOTAL = 2
    TRUST_PROMOTE_MAX_REJECT_RATIO = 0.25
    TRUST_DEMOTE_REJECT_RATIO_T2 = 0.25
    TRUST_DEMOTE_REJECT_RATIO_T1 = 0.5

    # --- Individual penalty weights ---------------------------------------
    PEN_BY_INTENT_KEY = 0.05
    PEN_BY_JOIN_SIG = 0.05
    PEN_BY_SQL_FP = 0.08
    PEN_BY_COLMAP_SIG = 0.05
    PEN_BY_RESULT_ISSUE = 0.06

    MAX_AVOID_EXAMPLES = 2

    # Lower-case filler words.
    STOPWORDS = set(
        (
            "a an the is are was were be been being am "
            "do does did have has had "
            "can could would should will shall may might "
            "me my i we you your it its please"
        ).split()
    )

    # Regex patterns for SQL constructs that are not allowed. The patterns
    # mix lower- and upper-case keywords, so matching is presumably
    # case-insensitive at the call site — NOTE(review): confirm.
    FORBIDDEN_SQL = [
        # Data-modifying, DDL and procedural statements
        r"\bupdate\b", r"\bdelete\b", r"\binsert\b", r"\bmerge\b",
        r"\balter\b", r"\bdrop\b", r"\btruncate\b", r"\bgrant\b",
        r"\brevoke\b", r"\bcreate\b", r"\bcomment\b", r"\brename\b",
        r"\bcall\b", r"\bexecute\b", r"\bdo\b", r"\bcopy\b",
        # A semicolon followed by further non-whitespace text
        r";\s*\S",
        # Query constructs that are rejected
        r"\bCASE\s+WHEN\b", r"\bUNION\b", r"\bINTERSECT\b", r"\bEXCEPT\b",
        r"\bLATERAL\b", r"\bOFFSET\b", r"\bFETCH\s+FIRST\b", r"\bDISTINCT\s+ON\b",
        r"\bARRAY\s*\[", r"\bARRAY_AGG\b",
        r"::json\b", r"\bjson_", r"\bjsonb_",
        r"\bEXISTS\s*\(", r"\bBETWEEN\b",
    ]

    # Every rejection category, in order.
    REJECT_CATEGORIES = [
        "wrong_intent", "wrong_tables", "wrong_join",
        "wrong_filters_or_having", "wrong_aggregation_or_grouping",
        "wrong_columns_selected",
        "too_many_rows", "too_few_rows",
        "invalid_structure", "other",
    ]

    # The subset of rejection categories treated as structural.
    STRUCTURAL_REJECT_CATEGORIES = {
        "wrong_intent", "wrong_tables", "wrong_join",
        "wrong_aggregation_or_grouping", "wrong_columns_selected",
        "invalid_structure",
    }
902
+
903
+
904
class PostgresRuntimeConfig:
    """PostgreSQL-specific runtime configuration.

    All settings are class-level attributes. PASSWORD and DATABASE have no
    default and must be assigned before db_url() is called.
    """

    HOST: ClassVar[str] = "localhost"
    PORT: ClassVar[int] = 5432
    USER: ClassVar[str] = "postgres"
    PASSWORD: ClassVar[str | None] = None
    DATABASE: ClassVar[str | None] = None
    SCHEMA: ClassVar[str] = "public"

    SQL_FILE_PATH: ClassVar[str | None] = None

    DEBUG: ClassVar[bool] = False

    @classmethod
    def db_url(cls) -> str:
        """Build the PostgreSQL SQLAlchemy connection URL from current config.

        USER and PASSWORD are percent-encoded so that characters with a
        meaning inside a URL ('@', ':', '/', '#', '%', spaces, ...) cannot
        corrupt the URL or be mis-parsed as host or port.

        Returns:
            Connection URL string for use with SQLAlchemy.

        Raises:
            ValueError: If PASSWORD or DATABASE are not set.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import quote

        if not cls.PASSWORD:
            raise ValueError("PostgreSQL password required")
        if not cls.DATABASE:
            raise ValueError("PostgreSQL database required")

        # safe="" also escapes '/'. quote() (not quote_plus) encodes a space
        # as %20, which is decoded correctly by every URL parser.
        user = quote(cls.USER, safe="")
        password = quote(cls.PASSWORD, safe="")
        return f"postgresql+psycopg2://{user}:{password}@{cls.HOST}:{cls.PORT}/{cls.DATABASE}"
935
+
936
+
937
class DatabricksRuntimeConfig:
    """Databricks-specific runtime configuration."""

    # Target catalog and schema; both are checked by validate().
    CATALOG: ClassVar[str | None] = None
    SCHEMA: ClassVar[str | None] = None

    SQL_FILE_PATH: ClassVar[str | None] = None

    # Connection parameters; all three must be set for has_native_connection().
    SERVER_HOSTNAME: ClassVar[str | None] = None
    HTTP_PATH: ClassVar[str | None] = None
    ACCESS_TOKEN: ClassVar[str | None] = None

    DEBUG: ClassVar[bool] = False

    @classmethod
    def has_native_connection(cls) -> bool:
        """Report whether SERVER_HOSTNAME, HTTP_PATH and ACCESS_TOKEN are all set.

        Returns:
            True only when each of the three settings is non-empty.
        """
        connection_params = (cls.SERVER_HOSTNAME, cls.HTTP_PATH, cls.ACCESS_TOKEN)
        return all(connection_params)

    @classmethod
    def validate(cls) -> None:
        """Check that the mandatory Databricks settings are present.

        CATALOG is checked before SCHEMA, so a missing catalog is reported
        first when both are unset.

        Raises:
            ValueError: If CATALOG or SCHEMA are not configured.
        """
        checks = (
            (cls.CATALOG, "Databricks catalog required"),
            (cls.SCHEMA, "Databricks schema required"),
        )
        for setting, message in checks:
            if not setting:
                raise ValueError(message)
968
+
969
+
970
class EngineConfig:
    """Engine selection and active runtime configuration."""

    # Name of the active backend and the config class that goes with it.
    TYPE: ClassVar[str] = "postgresql"
    RUNTIME: ClassVar[type] = PostgresRuntimeConfig

    # LLM credentials and model names. The token is read from the
    # OPENAI_API_KEY environment variable once, when this class body runs
    # (module import), and is None if the variable is unset.
    API_TOKEN: ClassVar[str | None] = os.getenv("OPENAI_API_KEY")
    OPENAI_MODEL: ClassVar[str] = "gpt-4o-mini"
    OPENAI_MODEL_INTENT: ClassVar[str] = "gpt-5-mini"
    OPENAI_MODEL_SQL: ClassVar[str] = "gpt-4.1-mini"
    OPENAI_MODEL_SCHEMA: ClassVar[str] = "gpt-5-mini"
    OPENAI_BASE_URL: ClassVar[str] = "https://api.openai.com/v1"

    # File names of the persisted schema graph and intent templates.
    SCHEMA_JSON_PATH: ClassVar[str] = "schema_graph.json"
    TEMPLATE_JSON_PATH: ClassVar[str] = "intent_templates.json"
986
+
987
+
988
class QSimConfig:
    """Question Simulator settings for NL question generation."""

    # --- Volume and per-intent size limits --------------------------------
    INTENT_TYPES = 20
    QUESTIONS_COUNT = 100
    MAX_TABLES_PER_INTENT = 3
    MAX_FILTERS_PER_INTENT = 4
    MAX_FILTER_COLUMNS = 2
    MAX_GROUP_BY_COLUMNS = 2

    # --- Variants per intent ----------------------------------------------
    MIN_AVG_VARIANTS_PER_INTENT = 1
    MAX_AVG_VARIANTS_PER_INTENT = 10

    # --- Mix ratios (the three table ratios sum to 1.0) -------------------
    MAX_NO_VARIANCE_RATIO = 0.25
    SINGLE_TABLE_RATIO = 0.40
    TWO_TABLE_RATIO = 0.40
    THREE_TABLE_RATIO = 0.20

    # --- Consecutive duplicate / failure limits ---------------------------
    MAX_CONSECUTIVE_DUPLICATES = 5
    MAX_CONSECUTIVE_FAILURES = 5

    # --- Minimum coverage ratios ------------------------------------------
    MIN_FILTER_RATIO = 0.70
    MIN_HAVING_RATIO = 0.15
    MIN_THREE_TABLE_RATIO = 0.10

    # --- Profiling sample settings ----------------------------------------
    PROFILING_SAMPLE_THRESHOLD = 100_000
    PROFILING_SAMPLE_SIZE = 10_000

    RANDOM_SEED = 42

    # Re-exposes the module-level list under the class; this is the same
    # list object, not a copy.
    EXCLUDED_FILTER_PATTERNS = EXCLUDED_FILTER_PATTERNS

    # --- Output file names ------------------------------------------------
    SKELETONS_JSON_PATH = "qsim_skeletons.json"
    QUESTIONS_OUTPUT_PATH = "qsim_intents_with_questions.json"

    MAX_ROLE_CLASSIFICATION_RETRIES = 2
1024
+
1025
+
1026
class SimulatorConfig:
    """Coverage Simulator settings for synthetic intent expansion and template generation."""

    # --- Structural limits ------------------------------------------------
    MAX_FILTERS = 3
    MAX_TABLES = 3
    MAX_GROUPBY = 2

    MAX_EXPR_COMPARISONS = 2
    MAX_HAVING_CONDITIONS = 2
    MAX_EXPANSION_DEPTH = 2

    # --- Output file name templates; "{version}" is a format placeholder --
    GOLD_OUTPUT_PATTERN = "gold_intents_v{version}.json"
    REPORT_PATTERN = "simulation_report_v{version}.json"
    RESULTS_CSV_PATTERN = "simulation_results_v{version}.csv"
    FAILURES_PATTERN = "simulation_failures_v{version}.json"

    RANDOM_SEED = 42

    # --- Values used when expanding intents -------------------------------
    EXTRACT_EXPANSION_UNITS: list[str] = ["year", "month", "day", "quarter", "dow"]
    DATE_TRUNC_EXPANSION_UNITS: list[str] = ["month", "quarter", "year"]
    LIMIT_EXPANSION_VALUES: list[int] = [10, 50, 100]

    # Date-window presets as {"unit", "offset"} dicts.
    DATE_WINDOW_EXPANSION_PRESETS: list[dict[str, int | str]] = [
        {"unit": unit, "offset": offset}
        for unit, offset in (
            ("day", 7),
            ("day", 30),
            ("day", 90),
            ("month", 1),
            ("month", 3),
            ("month", 6),
            ("month", 12),
            ("year", 1),
        )
    ]

    # Date-diff presets as {"unit", "amount"} dicts; all are expressed in days.
    DATE_DIFF_EXPANSION_PRESETS: list[dict[str, int | str]] = [
        {"unit": "day", "amount": days} for days in (7, 30, 90)
    ]