PyPI - aetherdialect - Versions diffs - 0.1.0__py3-none-any.whl - Mend

aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

aetherdialect-0.1.0.dist-info/METADATA +197 -0
aetherdialect-0.1.0.dist-info/RECORD +34 -0
aetherdialect-0.1.0.dist-info/WHEEL +5 -0
aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
text2sql/__init__.py +7 -0
text2sql/config.py +1063 -0
text2sql/contracts_base.py +952 -0
text2sql/contracts_core.py +1890 -0
text2sql/core_utils.py +834 -0
text2sql/dialect.py +1134 -0
text2sql/expansion_ops.py +1218 -0
text2sql/expansion_rules.py +496 -0
text2sql/intent_expr.py +1759 -0
text2sql/intent_process.py +2133 -0
text2sql/intent_repair.py +1733 -0
text2sql/intent_resolve.py +1292 -0
text2sql/live_testing.py +1117 -0
text2sql/main_execution.py +799 -0
text2sql/pipeline.py +1662 -0
text2sql/qsim_ops.py +1286 -0
text2sql/qsim_sample.py +609 -0
text2sql/qsim_struct.py +569 -0
text2sql/schema.py +973 -0
text2sql/schema_profiling.py +2075 -0
text2sql/simulator.py +970 -0
text2sql/sql_gen.py +1537 -0
text2sql/templates.py +1037 -0
text2sql/text2sql.py +726 -0
text2sql/utils.py +973 -0
text2sql/validation_agg.py +1033 -0
text2sql/validation_execute.py +1092 -0
text2sql/validation_schema.py +1847 -0
text2sql/validation_semantic.py +2122 -0

text2sql/config.py ADDED Viewed

@@ -0,0 +1,1063 @@
+"""Configuration classes and pipeline-wide constants for the text-to-SQL engine.
+Holds all tunable thresholds, flag sets, and environment-driven settings used across pipeline stages.
+PolicyConfig centralizes scoring penalties and safety rules.
+EngineConfig selects the active runtime backend (PostgreSQL or Databricks) and LLM credentials.
+QSimConfig and SimulatorConfig control synthetic question and intent generation.
+Module-level constants define valid value types, operators, aggregation functions, and column-type mappings used for validation and intent resolution.
+"""
+from __future__ import annotations
+import os
+import re
+from typing import ClassVar
+# Schema-style description of an intent payload.
+# Only "tables" is required at the top level; every filters_param / having_param
+# item must carry "op", and every cte_steps item must carry "cte_name".
+# NOTE(review): the consumer of this dict is not visible in this module —
+# presumably a JSON Schema validator; confirm before relying on keyword semantics.
+INTENT_SCHEMA = {
+    "type": "object",
+    "required": ["tables"],
+    "properties": {
+        # "tables" accepts either a list of table names or a single name string.
+        "tables": {
+            "oneOf": [
+                {"type": "array", "items": {"type": "string"}},
+                {"type": "string"},
+            ]
+        },
+        "aggregation_targets": {
+            "type": "object",
+            "additionalProperties": {"type": "string"},
+        },
+        "grain": {"type": "string"},
+        # Select / order-by entries may be plain strings or structured objects.
+        "select_cols": {
+            "type": "array",
+            "items": {"oneOf": [{"type": "string"}, {"type": "object"}]},
+        },
+        "group_by_cols": {"type": "array", "items": {"type": "string"}},
+        "order_by_cols": {
+            "type": "array",
+            "items": {"oneOf": [{"type": "string"}, {"type": "object"}]},
+        },
+        # Row-level filters: column/expression operands on either side of "op".
+        "filters_param": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "required": ["op"],
+                "properties": {
+                    "left_expr": {"type": "string"},
+                    "left_col": {"type": "string"},
+                    "op": {"type": "string"},
+                    "right_expr": {"type": "string"},
+                    "right_col": {"type": "string"},
+                    "value_type": {"type": "string"},
+                    # Empty sub-schema: no type constraint is declared for "value".
+                    "value": {},
+                    "bool_op": {"type": "string"},
+                    "filter_group": {"oneOf": [{"type": "integer"}, {"type": "null"}]},
+                },
+            },
+        },
+        # Post-aggregation filters: same shape as filters_param but with
+        # left_agg / right_agg in place of left_col / right_col.
+        "having_param": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "required": ["op"],
+                "properties": {
+                    "left_expr": {"type": "string"},
+                    "left_agg": {"type": "string"},
+                    "op": {"type": "string"},
+                    "right_expr": {"type": "string"},
+                    "right_agg": {"type": "string"},
+                    "value_type": {"type": "string"},
+                    "value": {},
+                    "bool_op": {"type": "string"},
+                    "filter_group": {"oneOf": [{"type": "integer"}, {"type": "null"}]},
+                },
+            },
+        },
+        # CTE steps repeat the top-level keys, but their array items are left untyped here.
+        "cte_steps": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "required": ["cte_name"],
+                "properties": {
+                    "cte_name": {"type": "string"},
+                    "description": {"type": "string"},
+                    "tables": {"type": "array", "items": {"type": "string"}},
+                    "grain": {"type": "string"},
+                    "select_cols": {"type": "array"},
+                    "group_by_cols": {"type": "array"},
+                    "order_by_cols": {"type": "array"},
+                    "filters_param": {"type": "array"},
+                    "having_param": {"type": "array"},
+                    "output_columns": {"type": "array"},
+                },
+            },
+        },
+        "limit": {"oneOf": [{"type": "integer"}, {"type": "null"}]},
+        "natural_language": {"type": "string"},
+        "intent_status": {"type": "string"},
+    },
+}
+# Lowercase names of the aggregate functions accepted in intents.
+VALID_AGGREGATION_FUNCTIONS = {"count", "sum", "avg", "min", "max"}
+# Lowercase names of the scalar functions accepted in intents; the three
+# SCALAR_FUNCTIONS_* category sets below partition this set, except for
+# "coalesce", which belongs to none of them.
+VALID_SCALAR_FUNCTIONS = {
+    "upper",
+    "lower",
+    "trim",
+    "ltrim",
+    "rtrim",
+    "length",
+    "abs",
+    "round",
+    "floor",
+    "ceil",
+    "date_trunc",
+    "date_part",
+    "extract",
+    "coalesce",
+    "year",
+    "month",
+    "day",
+}
+SCALAR_FUNCTIONS_STRING = {"upper", "lower", "trim", "ltrim", "rtrim", "length"}
+SCALAR_FUNCTIONS_NUMERIC = {"abs", "round", "floor", "ceil"}
+SCALAR_FUNCTIONS_TEMPORAL = {
+    "date_trunc",
+    "date_part",
+    "extract",
+    "year",
+    "month",
+    "day",
+}
+# NOTE(review): name suggests these take a unit/field argument before the
+# column operand — confirm against the SQL generator.
+SCALAR_FUNCTIONS_LEADING_ARG = {"date_trunc", "date_part", "extract"}
+DISALLOWED_EXTRACT_UNITS = {"epoch"}
+# Same members as SCALAR_FUNCTIONS_NUMERIC, kept as a separate set object.
+SCALAR_FUNCTIONS_AGG_COMPATIBLE = {"abs", "round", "floor", "ceil"}
+VALID_ARITH_OPS = {"+", "-", "*", "/"}
+VALID_GRAINS = {"scalar", "grouped", "row_level"}
+VALID_EXPECTED_ROWS = {"one", "few", "many"}
+# Operators accepted in filters_param items (all lowercase).
+VALID_FILTER_OPS = {
+    "=",
+    "!=",
+    "<",
+    "<=",
+    ">",
+    ">=",
+    "like",
+    "ilike",
+    "in",
+    "between",
+    "is null",
+    "is not null",
+    "not in",
+    "not like",
+    "not ilike",
+}
+# Operators accepted in having_param items: no LIKE family and no NULL tests.
+VALID_HAVING_OPS = {"=", "!=", "<", "<=", ">", ">=", "in", "not in", "between"}
+# Same members as VALID_AGGREGATION_FUNCTIONS above, defined as a second, independent set.
+VALID_AGG_FUNCS = {"count", "sum", "avg", "min", "max"}
+# Lowercase SQL function names and keywords, held in an immutable frozenset.
+# NOTE(review): judging by the name, identifiers in this set are left alone when
+# table names are catalog/schema-qualified for Databricks — the code that does
+# the qualification is not in this module; confirm there.
+DATABRICKS_TABLE_QUALIFY_SKIP_IDENTIFIERS: frozenset[str] = frozenset(
+    {
+        "avg",
+        "case",
+        "cast",
+        "coalesce",
+        "count",
+        "date_part",
+        "date_trunc",
+        "extract",
+        "lateral",
+        "lower",
+        "max",
+        "min",
+        "nullif",
+        "replace",
+        "substring",
+        "sum",
+        "trim",
+        "try_cast",
+        "unnest",
+        "upper",
+        "values",
+    }
+)
+# Canonical value-type names; normalize_value_type() always returns one of these.
+VALID_VALUE_TYPES = {
+    "integer",
+    "string",
+    "date",
+    "number",
+    "null",
+    "boolean",
+    "date_window",
+    "date_diff",
+}
+# The date-window, date-diff and interval unit sets currently hold identical
+# members but are three separate set objects.
+VALID_DATE_WINDOW_UNITS = {"day", "week", "month", "year", "hour", "minute", "second"}
+VALID_DATE_DIFF_UNITS = {"day", "week", "month", "year", "hour", "minute", "second"}
+# Extract units additionally allow "quarter", "dow" and "doy".
+VALID_EXTRACT_UNITS = {
+    "year", "month", "day", "week", "quarter",
+    "hour", "minute", "second", "dow", "doy",
+}
+VALID_INTERVAL_UNITS = {"day", "week", "month", "year", "hour", "minute", "second"}
+VALID_FILTER_VALUE_TYPES = {
+    "categorical",
+    "numeric",
+    "numeric_categorical",
+    "temporal",
+    "boolean",
+    "null",
+}
+VALID_HAVING_VALUE_TYPES = {"number", "integer"}
+# Maps lowercase type aliases to canonical value types. Every canonical name in
+# VALID_VALUE_TYPES also appears here mapped to itself, so a single lookup
+# covers both aliases and already-canonical names.
+VALUE_TYPE_NORMALIZATION = {
+    "timestamp": "date",
+    "datetime": "date",
+    "timestamptz": "date",
+    "time": "date",
+    "numeric": "number",
+    "decimal": "number",
+    "float": "number",
+    "double": "number",
+    "real": "number",
+    "money": "number",
+    "bigint": "integer",
+    "smallint": "integer",
+    "int": "integer",
+    "int2": "integer",
+    "int4": "integer",
+    "int8": "integer",
+    "serial": "integer",
+    "varchar": "string",
+    "char": "string",
+    "text": "string",
+    "bpchar": "string",
+    "uuid": "string",
+    "bool": "boolean",
+    "enum": "string",
+    "integer": "integer",
+    "string": "string",
+    "date": "date",
+    "number": "number",
+    "boolean": "boolean",
+    "null": "null",
+    "date_window": "date_window",
+    "date_diff": "date_diff",
+}
+# Filter operators grouped by the kind of column they apply to.
+# Boolean columns: equality, membership and NULL tests only.
+BOOLEAN_FILTER_OPS = {"=", "!=", "in", "not in", "is null", "is not null"}
+# Categorical columns: equality, pattern matching, membership and NULL tests
+# (no ordering comparisons, no "between").
+CATEGORICAL_FILTER_OPS = {
+    "=",
+    "!=",
+    "like",
+    "ilike",
+    "not like",
+    "not ilike",
+    "in",
+    "not in",
+    "is null",
+    "is not null",
+}
+# Numeric-categorical columns: ordering comparisons and "between" instead of
+# the LIKE family.
+NUMERIC_CATEGORICAL_FILTER_OPS = {
+    "=",
+    "!=",
+    "<",
+    "<=",
+    ">",
+    ">=",
+    "in",
+    "not in",
+    "between",
+    "is null",
+    "is not null",
+}
+# Operators allowed on numeric columns (immutable).
+NUMERIC_FILTER_OPS = frozenset(
+    {
+        "=",
+        "!=",
+        "<",
+        "<=",
+        ">",
+        ">=",
+        "in",
+        "not in",
+        "between",
+        "is null",
+        "is not null",
+    }
+)
+# List form of NUMERIC_FILTER_OPS for code that needs a sequence.
+# sorted() rather than list(): iterating a frozenset of strings yields a
+# different order in each interpreter process (string hash randomization), so
+# indexing or seeded sampling from a plain list() copy would not be
+# reproducible between runs. Sorting pins one stable order.
+CTE_NUMERIC_FILTER_OPS = sorted(NUMERIC_FILTER_OPS)
+# Operators allowed on temporal columns: comparisons, membership, "between"
+# and NULL tests (no LIKE family).
+TEMPORAL_FILTER_OPS = {
+    "=",
+    "!=",
+    "<",
+    "<=",
+    ">",
+    ">=",
+    "in",
+    "not in",
+    "between",
+    "is null",
+    "is not null",
+}
+# Operators allowed on foreign-key columns; currently the same members as
+# TEMPORAL_FILTER_OPS, kept as a separate set object.
+FK_FILTER_OPS = {
+    "=",
+    "!=",
+    "<",
+    "<=",
+    ">",
+    ">=",
+    "in",
+    "not in",
+    "between",
+    "is null",
+    "is not null",
+}
+# Aggregations permitted per column role (role names are UPPERCASE keys).
+# AUDIT columns map to an empty set, i.e. no aggregation is listed for them.
+ROLE_ALLOWED_AGGREGATIONS = {
+    "IDENTIFIER": {"count"},
+    "CATEGORICAL": {"count", "min", "max"},
+    "NUMERIC_CATEGORICAL": {"count", "min", "max"},
+    "NUMERIC_MEASURE": {"count", "sum", "avg", "min", "max"},
+    "TEMPORAL": {"count", "min", "max"},
+    "BOOLEAN": {"count", "sum"},
+    "FREE_TEXT": {"count"},
+    "AUDIT": set(),
+}
+NUMERIC_ONLY_AGGREGATIONS = {"sum", "avg"}
+# Maps lowercase SQL column type names (without length/precision parameters,
+# see normalize_column_type) to pipeline value types. All date/time flavours
+# collapse to "date".
+COLUMN_TYPE_TO_VALUE_TYPE = {
+    "int": "integer",
+    "integer": "integer",
+    "bigint": "integer",
+    "smallint": "integer",
+    "tinyint": "integer",
+    "int2": "integer",
+    "int4": "integer",
+    "int8": "integer",
+    "long": "integer",
+    "short": "integer",
+    "serial": "integer",
+    "bigserial": "integer",
+    "smallserial": "integer",
+    "float": "number",
+    "double": "number",
+    "decimal": "number",
+    "numeric": "number",
+    "real": "number",
+    "float4": "number",
+    "float8": "number",
+    "double precision": "number",
+    "money": "number",
+    "varchar": "string",
+    "text": "string",
+    "char": "string",
+    "string": "string",
+    "character varying": "string",
+    "bpchar": "string",
+    "nchar": "string",
+    "nvarchar": "string",
+    "ntext": "string",
+    "clob": "string",
+    "date": "date",
+    "timestamp": "date",
+    "timestamptz": "date",
+    "datetime": "date",
+    "time": "date",
+    "timestamp without time zone": "date",
+    "timestamp with time zone": "date",
+    "boolean": "boolean",
+    "bool": "boolean",
+}
+# Value types each aggregation may be applied to. Note "boolean" is listed only
+# under "count" here, whereas ROLE_ALLOWED_AGGREGATIONS also lists "sum" for
+# the BOOLEAN role.
+AGGREGATION_ALLOWED_COLUMN_TYPES = {
+    "count": ["integer", "string", "date", "number", "boolean"],
+    "sum": ["integer", "number"],
+    "avg": ["integer", "number"],
+    "min": ["integer", "number", "string", "date"],
+    "max": ["integer", "number", "string", "date"],
+}
+# Regex fragments (uncompiled strings). QSimConfig re-exports this same list
+# object as QSimConfig.EXCLUDED_FILTER_PATTERNS.
+# NOTE(review): presumably matched against column names to keep them out of
+# generated filters — the matching code is not in this module; confirm.
+EXCLUDED_FILTER_PATTERNS = [
+    r"password",
+    r"picture",
+    r"photo",
+    r"image",
+    r"blob",
+    r"address.?2",
+    r"address_line.?2",
+]
+# Each inner frozenset is one recognised pair of lowercase boolean-like values.
+BOOLEAN_VALUE_PATTERNS = frozenset(
+    [
+        frozenset(["0", "1"]),
+        frozenset(["true", "false"]),
+        frozenset(["yes", "no"]),
+        frozenset(["y", "n"]),
+        frozenset(["t", "f"]),
+        frozenset(["on", "off"]),
+        frozenset(["active", "inactive"]),
+        frozenset(["enabled", "disabled"]),
+    ]
+)
+# For each pair in BOOLEAN_VALUE_PATTERNS, the tuple is (true_value, false_value).
+# The keys mirror BOOLEAN_VALUE_PATTERNS exactly; keep both in sync when editing.
+BOOLEAN_TRUE_FALSE_MAP: dict[frozenset[str], tuple[str, str]] = {
+    frozenset(["0", "1"]): ("1", "0"),
+    frozenset(["true", "false"]): ("true", "false"),
+    frozenset(["yes", "no"]): ("yes", "no"),
+    frozenset(["y", "n"]): ("y", "n"),
+    frozenset(["t", "f"]): ("t", "f"),
+    frozenset(["on", "off"]): ("on", "off"),
+    frozenset(["active", "inactive"]): ("active", "inactive"),
+    frozenset(["enabled", "disabled"]): ("enabled", "disabled"),
+}
+# Lowercase tokens identifying numeric SQL types.
+# NOTE(review): "tokens" suggests substring/word matching against a type
+# string — the matching code is not in this module; confirm.
+NUMERIC_TYPE_TOKENS = frozenset(
+    {
+        "int",
+        "integer",
+        "float",
+        "double",
+        "decimal",
+        "numeric",
+        "real",
+        "number",
+        "serial",
+        "bigint",
+        "smallint",
+        "tinyint",
+        "money",
+        "long",
+        "short",
+    }
+)
+# Lowercase tokens identifying string SQL types.
+STRING_TYPE_TOKENS = frozenset(
+    {
+        "char",
+        "varchar",
+        "text",
+        "string",
+        "clob",
+        "nchar",
+        "nvarchar",
+        "ntext",
+        "bpchar",
+    }
+)
+# Lowercase tokens identifying date/time SQL types (includes "interval").
+DATE_TYPE_TOKENS = frozenset(
+    {
+        "date",
+        "time",
+        "timestamp",
+        "timestamptz",
+        "interval",
+    }
+)
+# UPPERCASE aggregate call prefixes, each including the opening parenthesis.
+AGG_PREFIXES = frozenset({"COUNT(", "SUM(", "AVG(", "MIN(", "MAX("})
+# Mirror image of each ordering operator, i.e. the operator to use when the
+# two operands of a comparison are swapped.
+OP_FLIP: dict[str, str] = {">": "<", "<": ">", ">=": "<=", "<=": ">="}
+NUMERIC_RESULT_SCALARS = frozenset(
+    {
+        "abs",
+        "round",
+        "floor",
+        "ceil",
+        "extract",
+        "date_part",
+        "year",
+        "month",
+        "day",
+        "length",
+    }
+)
+# Subset of NUMERIC_RESULT_SCALARS without abs/round/floor/ceil.
+INTEGER_SCALARS = frozenset({"extract", "date_part", "year", "month", "day", "length"})
+NUMERIC_RESULT_AGGS = frozenset({"count", "sum", "avg"})
+NUMERIC_RESULT_OPS = frozenset({"=", "!=", "<", "<=", ">", ">="})
+# Role names are lowercase here, while ROLE_ALLOWED_AGGREGATIONS uses UPPERCASE
+# keys. NOTE(review): callers presumably normalise case before lookup — confirm.
+ARITHMETIC_ROLES = frozenset({"numeric_measure", "numeric_categorical"})
+# Set of ordered (type_a, type_b) pairs of lowercase type names considered compatible.
+# The relation is NOT symmetric as written: e.g. ("float", "numeric") and
+# ("short", "smallint") are present while ("numeric", "float") and
+# ("smallint", "short") are not.
+# NOTE(review): whether lookups test both orderings is decided by the caller,
+# which is not visible here — confirm before adding or relying on a pair.
+COMPATIBLE_TYPE_PAIRS = {
+    ("int", "int"),
+    ("int", "integer"),
+    ("int", "bigint"),
+    ("int", "smallint"),
+    ("int", "tinyint"),
+    ("int", "long"),
+    ("int", "short"),
+    ("int", "numeric"),
+    ("int", "decimal"),
+    ("integer", "integer"),
+    ("integer", "int"),
+    ("integer", "bigint"),
+    ("integer", "smallint"),
+    ("integer", "tinyint"),
+    ("integer", "long"),
+    ("integer", "short"),
+    ("bigint", "bigint"),
+    ("bigint", "int"),
+    ("bigint", "integer"),
+    ("bigint", "smallint"),
+    ("bigint", "tinyint"),
+    ("bigint", "long"),
+    ("bigint", "numeric"),
+    ("smallint", "smallint"),
+    ("smallint", "int"),
+    ("smallint", "integer"),
+    ("smallint", "bigint"),
+    ("smallint", "tinyint"),
+    ("tinyint", "tinyint"),
+    ("tinyint", "int"),
+    ("tinyint", "integer"),
+    ("tinyint", "smallint"),
+    ("tinyint", "bigint"),
+    ("long", "long"),
+    ("long", "int"),
+    ("long", "integer"),
+    ("long", "bigint"),
+    ("short", "short"),
+    ("short", "int"),
+    ("short", "integer"),
+    ("short", "smallint"),
+    ("short", "tinyint"),
+    ("numeric", "numeric"),
+    ("numeric", "decimal"),
+    ("numeric", "int"),
+    ("numeric", "integer"),
+    ("numeric", "bigint"),
+    ("decimal", "decimal"),
+    ("decimal", "numeric"),
+    ("decimal", "int"),
+    ("decimal", "integer"),
+    ("float", "float"),
+    ("float", "double"),
+    ("float", "real"),
+    ("float", "numeric"),
+    ("double", "double"),
+    ("double", "float"),
+    ("double", "real"),
+    ("real", "real"),
+    ("real", "float"),
+    ("real", "double"),
+    ("varchar", "varchar"),
+    ("varchar", "text"),
+    ("varchar", "char"),
+    ("varchar", "string"),
+    ("text", "text"),
+    ("text", "varchar"),
+    ("text", "char"),
+    ("text", "string"),
+    ("char", "char"),
+    ("char", "varchar"),
+    ("char", "text"),
+    ("char", "string"),
+    ("string", "string"),
+    ("string", "varchar"),
+    ("string", "text"),
+    ("string", "char"),
+    ("date", "date"),
+    ("date", "timestamp"),
+    ("date", "timestamptz"),
+    ("timestamp", "timestamp"),
+    ("timestamp", "date"),
+    ("timestamp", "timestamptz"),
+    ("timestamptz", "timestamptz"),
+    ("timestamptz", "timestamp"),
+    ("timestamptz", "date"),
+    ("boolean", "boolean"),
+    ("boolean", "bool"),
+    ("bool", "bool"),
+    ("bool", "boolean"),
+    ("number", "number"),
+    ("number", "integer"),
+    ("number", "numeric"),
+    ("number", "decimal"),
+    ("number", "float"),
+    ("number", "double"),
+    ("number", "real"),
+    ("integer", "number"),
+}
+# Case-insensitive match for a comparative phrase followed by an integer,
+# e.g. "more than 5", "at least 10", "a maximum of 3".
+AGG_QUANTITY_RE = re.compile(
+    r"\b(?:more\s+than|greater\s+than|at\s+least|fewer\s+than|less\s+than|"
+    r"no\s+more\s+than|no\s+fewer\s+than|over|under|exceeding|"
+    r"above|below|a\s+minimum\s+of|a\s+maximum\s+of)\s+\d+\b",
+    re.IGNORECASE,
+)
+# Case-insensitive match for "in <N> <word>", "in exactly <N> <word>" or
+# "exactly <N> <word>"; group 1 captures the integer, group 2 the following word.
+COUNT_THRESHOLD_TABLE_RE = re.compile(
+    r"\b(?:in\s+(?:exactly\s+)?|exactly\s+)(\d+)\s+(\w+)\b",
+    re.IGNORECASE,
+)
+# Aggregation and comparison-operator lists for CTE handling (ordered lists, not sets).
+CTE_FULL_AGGS = ["count", "sum", "avg", "min", "max"]
+CTE_DEFAULT_AGGS = ["count", "min", "max"]
+CTE_HAVING_COMPARE_OPS = ["=", "!=", "<", "<=", ">", ">="]
+# Default extra argument list per scalar function name.
+# "trunc" and "truncate" appear here although they are not members of
+# VALID_SCALAR_FUNCTIONS.
+SCALAR_FUNC_DEFAULTS: dict[str, list] = {
+    "round": [2],
+    "trunc": [0],
+    "truncate": [0],
+    "coalesce": [0],
+    "date_trunc": ["month"],
+    "date_part": ["month"],
+    "extract": ["year"],
+}
+# Ordered (keyword, unit) pairs; the last entry maps the keyword "date" to the unit "year".
+# NOTE(review): being a list of tuples, order is presumably significant
+# (first match wins) — confirm against the consumer.
+DATE_UNIT_KEYWORDS = [
+    ("month", "month"),
+    ("day", "day"),
+    ("week", "week"),
+    ("quarter", "quarter"),
+    ("year", "year"),
+    ("date", "year"),
+]
+STRUCTURAL_IDENTITY_VALUES = frozenset({0, 1})
+IN_OPS = frozenset({"in", "not in"})
+# Matches a comma with optional surrounding whitespace and an optional quote
+# character (single or double) on either side.
+IN_STRING_SEPARATORS = re.compile(r"['\"]?\s*,\s*['\"]?")
+# Lowercase literals treated as true / false; together they cover exactly the
+# values used in BOOLEAN_TRUE_FALSE_MAP.
+BOOLEAN_TRUTHY_VALUES = frozenset({"1", "true", "t", "yes", "y", "on", "active", "enabled"})
+BOOLEAN_FALSY_VALUES = frozenset({"0", "false", "f", "no", "n", "off", "inactive", "disabled"})
+# Operators that have a case-insensitive counterpart in ILIKE_OP_MAP (its keys
+# are exactly this set).
+ILIKE_ELIGIBLE_OPS = frozenset({"=", "!=", "like", "not like"})
+ILIKE_OP_MAP: dict[str, str] = {
+    "=": "ilike",
+    "!=": "not ilike",
+    "like": "ilike",
+    "not like": "not ilike",
+}
+# Lowercase SQL data type names treated as numeric. Unlike NUMERIC_TYPE_TOKENS
+# this set includes multi-word and width-suffixed names ("double precision",
+# "int4", "float8") and omits "tinyint", "long", "short" and "number".
+NUMERIC_DATA_TYPES = frozenset(
+    {
+        "integer",
+        "int",
+        "int2",
+        "int4",
+        "int8",
+        "smallint",
+        "bigint",
+        "serial",
+        "bigserial",
+        "numeric",
+        "decimal",
+        "real",
+        "double precision",
+        "float",
+        "float4",
+        "float8",
+        "money",
+    }
+)
+# Operator to use when the operands of a comparison are swapped: ordering
+# operators are mirrored, every other listed operator maps to itself.
+# "between" has no entry.
+REVERSE_OP_MAP = {
+    ">": "<",
+    "<": ">",
+    ">=": "<=",
+    "<=": ">=",
+    "=": "=",
+    "!=": "!=",
+    "like": "like",
+    "not like": "not like",
+    "ilike": "ilike",
+    "not ilike": "not ilike",
+    "in": "in",
+    "not in": "not in",
+    "is null": "is null",
+    "is not null": "is not null",
+}
+# Case-insensitive match for a threshold phrase followed by an integer
+# ("at most 3", "exceeds 10", ...). Overlaps with AGG_QUANTITY_RE but the
+# alternatives differ (e.g. "at most" / "exceeds" here, "exceeding" there) and
+# there is no trailing word boundary after the digits.
+THRESHOLD_RE = re.compile(
+    r"\b(?:more\s+than|less\s+than|at\s+least|at\s+most|over|under|above|below"
+    r"|exceeds?|greater\s+than|fewer\s+than|minimum\s+of|maximum\s+of)\s+\d+",
+    re.IGNORECASE,
+)
+# Case-insensitive match for words that signal an aggregation in a question.
+AGG_KEYWORDS_RE = re.compile(r"\b(?:total|count|number\s+of|average|avg|sum|how\s+many)\b", re.IGNORECASE)
+# Matches a whole string of the form AGG(arg); group 1 is the function name,
+# group 2 the argument with surrounding whitespace excluded.
+AGG_PATTERN = re.compile(r"^(COUNT|SUM|AVG|MIN|MAX)\s*\(\s*(.+?)\s*\)$", re.IGNORECASE)
+# Matches "word.word"; group 1 and group 2 are the parts before and after the dot.
+TABLE_COL_PATTERN = re.compile(r"(\w+)\.(\w+)")
+# Candidate literal values for generated HAVING conditions, per aggregate family.
+HAVING_COUNT_VALUES = [1, 2, 3, 5, 10, 15, 20, 25, 50, 100]
+HAVING_SUM_AVG_VALUES = [10.0, 50.0, 100.0, 250.0, 500.0, 750.0, 1000.0]
+HAVING_MIN_MAX_VALUES = [1.0, 5.0, 10.0, 25.0, 50.0, 75.0, 100.0]
+# Opening phrases for generated natural-language questions, by question shape.
+QUESTION_STARTS_AGG = [
+    "How many",
+    "What is the total",
+    "What is the average",
+    "What is the minimum",
+    "What is the maximum",
+    "Find the sum of",
+    "Calculate the",
+    "Show the count of",
+    "Get the total",
+]
+QUESTION_STARTS_LIST = [
+    "List all",
+    "Show me",
+    "What are",
+    "Which",
+    "Find",
+    "Display",
+    "Get",
+    "Return",
+    "Retrieve",
+]
+QUESTION_STARTS_GROUP = [
+    "Show me",
+    "What is",
+    "Group",
+    "Break down",
+    "Summarize",
+    "Calculate",
+    "Find the",
+    "Get the",
+]
+RANGE_OPS = frozenset({">", "<", ">=", "<="})
+# Matches any string that starts with the word COUNT (case-insensitive).
+# NOTE(review): despite the name, the pattern alone does not encode what makes
+# a HAVING clause "impossible" — that logic must live at the call site; confirm.
+IMPOSSIBLE_HAVING_RE = re.compile(
+    r"^COUNT\b.*",
+    re.IGNORECASE,
+)
+# Lowercase SQL keywords. This is a selection, not the full reserved-word list
+# of any dialect (e.g. "by", "union" and "between" are absent).
+SQL_KEYWORDS = frozenset(
+    {
+        "select",
+        "from",
+        "distinct",
+        "where",
+        "group",
+        "order",
+        "having",
+        "limit",
+        "join",
+        "inner",
+        "outer",
+        "left",
+        "right",
+        "cross",
+        "on",
+        "as",
+        "insert",
+        "update",
+        "delete",
+        "create",
+        "drop",
+        "alter",
+        "table",
+        "index",
+        "view",
+        "into",
+        "values",
+        "set",
+        "and",
+        "or",
+        "not",
+        "in",
+    }
+)
+# Unsigned integer or decimal literal delimited by word boundaries (no sign, no exponent).
+NUMERIC_LITERAL_RE = re.compile(r"\b\d+(?:\.\d+)?\b")
+# Case-insensitive "top 5" / "first 10" / "bottom 3" style phrases.
+TOP_N_RE = re.compile(r"\b(?:top|first|bottom|last|least|most)\s+\d+\b", re.IGNORECASE)
+# Case-insensitive match for the words "distinct" or "unique".
+DISTINCT_RE = re.compile(r"\b(?:distinct|unique)\b", re.IGNORECASE)
+def normalize_value_type(value_type: str) -> str:
+    """Map a raw value type name onto one of the pipeline's canonical value types.
+    The input is lowercased and stripped, then resolved through
+    VALUE_TYPE_NORMALIZATION; names that are already canonical pass through.
+    Args:
+        value_type: Raw value type string as returned by the LLM or stored in the schema.
+    Returns:
+        One of the VALID_VALUE_TYPES values; 'string' for empty or unrecognised input.
+    """
+    fallback = "string"
+    if not value_type:
+        return fallback
+    key = value_type.lower().strip()
+    # Alias table first: it maps both aliases and canonical names.
+    mapped = VALUE_TYPE_NORMALIZATION.get(key)
+    if mapped is not None:
+        return mapped
+    return key if key in VALID_VALUE_TYPES else fallback
+def normalize_column_type(col_type: str) -> str:
+    """Strip type parameters from a SQL type string for mapping lookup.
+    Length/precision suffixes such as '(255)' or '( 10 , 2 )' are removed
+    (whitespace inside the parentheses is tolerated) and runs of whitespace are
+    collapsed to a single space, so 'timestamp (6) with time zone' normalizes
+    to 'timestamp with time zone' rather than a string with a double space.
+    Args:
+        col_type: Raw SQL type string such as 'varchar(255)' or 'numeric(10,2)'.
+    Returns:
+        Lowercased type name with parameter suffixes removed; an empty string
+        when col_type is empty or None.
+    """
+    # Mirror normalize_value_type: tolerate missing input instead of raising.
+    if not col_type:
+        return ""
+    without_params = re.sub(r"\(\s*\d+\s*(?:,\s*\d+\s*)?\)", "", col_type.lower())
+    # split()/join trims the ends and collapses any gap left by a removed suffix.
+    return " ".join(without_params.split())
+class PolicyConfig:
+    """Pure thresholds, penalties, safety rules — no runtime state."""
+    # Logging switches; both default to True in this class.
+    DEBUG: ClassVar[bool] = True
+    VERBOSE: ClassVar[bool] = True
+    # Regeneration switches; all default to False.
+    REGENERATE_TEMPLATE_STORE: ClassVar[bool] = False
+    REGENERATE_SCHEMA_GRAPH: ClassVar[bool] = False
+    REGENERATE_SKELETON_CACHE: ClassVar[bool] = False
+    MAX_REPAIR_LOOPS = 2
+    # Column-profiling thresholds (cardinality counts and ratios).
+    CATEGORICAL_MAX_CARDINALITY = 50
+    CATEGORICAL_MAX_RATIO = 0.05
+    FREE_TEXT_CATEGORICAL_MAX_CARDINALITY = 200
+    IDENTIFIER_MIN_UNIQUENESS = 0.98
+    CATEGORICAL_SAMPLE_SIZE = 20
+    # Confidence thresholds for proceeding / accepting without review.
+    AUTO_PROCEED_THRESHOLD = 0.85
+    FINAL_SQL_AUTO_ACCEPT_THRESHOLD = 0.95
+    FUZZY_MATCH_MAX_DISTANCE = 2
+    PENALTY_CAP = 0.30
+    # Trust promotion / demotion thresholds.
+    TRUST_PROMOTE_MIN_TOTAL = 2
+    TRUST_PROMOTE_MAX_REJECT_RATIO = 0.25
+    TRUST_DEMOTE_REJECT_RATIO_T2 = 0.25
+    TRUST_DEMOTE_REJECT_RATIO_T1 = 0.5
+    # Individual penalty weights; PENALTY_CAP above is 0.30.
+    PEN_BY_INTENT_KEY = 0.05
+    PEN_BY_JOIN_SIG = 0.05
+    PEN_BY_SQL_FP = 0.08
+    PEN_BY_COLMAP_SIG = 0.05
+    PEN_BY_RESULT_ISSUE = 0.06
+    MAX_AVOID_EXAMPLES = 2
+    # Lowercase English filler words.
+    STOPWORDS = {
+        "a",
+        "an",
+        "the",
+        "is",
+        "are",
+        "was",
+        "were",
+        "be",
+        "been",
+        "being",
+        "am",
+        "do",
+        "does",
+        "did",
+        "have",
+        "has",
+        "had",
+        "can",
+        "could",
+        "would",
+        "should",
+        "will",
+        "shall",
+        "may",
+        "might",
+        "me",
+        "my",
+        "i",
+        "we",
+        "you",
+        "your",
+        "it",
+        "its",
+        "please",
+    }
+    # Regex fragments (uncompiled) describing SQL that must not be produced:
+    # DML/DDL verbs, a second statement after ';', and a list of constructs
+    # (CASE WHEN, set operations, LATERAL, OFFSET, JSON functions, ...).
+    # The patterns mix lowercase and UPPERCASE spellings.
+    # NOTE(review): presumably compiled with re.IGNORECASE at the call site —
+    # otherwise the two groups would match different casings; confirm.
+    # NOTE(review): r"\bBETWEEN\b" is forbidden here although "between" is a
+    # member of VALID_FILTER_OPS — confirm this is intentional.
+    FORBIDDEN_SQL = [
+        r"\bupdate\b",
+        r"\bdelete\b",
+        r"\binsert\b",
+        r"\bmerge\b",
+        r"\balter\b",
+        r"\bdrop\b",
+        r"\btruncate\b",
+        r"\bgrant\b",
+        r"\brevoke\b",
+        r"\bcreate\b",
+        r"\bcomment\b",
+        r"\brename\b",
+        r"\bcall\b",
+        r"\bexecute\b",
+        r"\bdo\b",
+        r"\bcopy\b",
+        r";\s*\S",
+        r"\bCASE\s+WHEN\b",
+        r"\bUNION\b",
+        r"\bINTERSECT\b",
+        r"\bEXCEPT\b",
+        r"\bLATERAL\b",
+        r"\bOFFSET\b",
+        r"\bFETCH\s+FIRST\b",
+        r"\bDISTINCT\s+ON\b",
+        r"\bARRAY\s*\[",
+        r"\bARRAY_AGG\b",
+        r"::json\b",
+        r"\bjson_",
+        r"\bjsonb_",
+        r"\bEXISTS\s*\(",
+        r"\bBETWEEN\b",
+    ]
+    # All rejection category labels, in a fixed order.
+    REJECT_CATEGORIES = [
+        "wrong_intent",
+        "wrong_tables",
+        "wrong_join",
+        "wrong_filters_or_having",
+        "wrong_aggregation_or_grouping",
+        "wrong_columns_selected",
+        "too_many_rows",
+        "too_few_rows",
+        "invalid_structure",
+        "other",
+    ]
+    # Subset of REJECT_CATEGORIES; excludes the filter, row-count and "other" labels.
+    STRUCTURAL_REJECT_CATEGORIES = {
+        "wrong_intent",
+        "wrong_tables",
+        "wrong_join",
+        "wrong_aggregation_or_grouping",
+        "wrong_columns_selected",
+        "invalid_structure",
+    }
+class PostgresRuntimeConfig:
+    """PostgreSQL-specific runtime configuration.
+    All settings are class-level attributes. PASSWORD and DATABASE default to
+    None and must be assigned before db_url() is called.
+    """
+    HOST: ClassVar[str] = "localhost"
+    PORT: ClassVar[int] = 5432
+    USER: ClassVar[str] = "postgres"
+    PASSWORD: ClassVar[str | None] = None
+    DATABASE: ClassVar[str | None] = None
+    SCHEMA: ClassVar[str] = "public"
+    SQL_FILE_PATH: ClassVar[str | None] = None
+    DEBUG: ClassVar[bool] = False
+    @classmethod
+    def db_url(cls) -> str:
+        """Build the PostgreSQL SQLAlchemy connection URL from current config.
+        USER and PASSWORD are percent-encoded so that reserved characters such
+        as '@', ':' or '/' in a credential cannot be mistaken for URL
+        delimiters. Credentials without reserved characters produce exactly
+        the same URL as plain interpolation.
+        Returns:
+            Connection URL string for use with SQLAlchemy.
+        Raises:
+            ValueError: If PASSWORD or DATABASE are not set.
+        """
+        # Local import keeps the module's top-level import block untouched.
+        from urllib.parse import quote
+        if not cls.PASSWORD:
+            raise ValueError("PostgreSQL password required")
+        if not cls.DATABASE:
+            raise ValueError("PostgreSQL database required")
+        # safe="" also encodes '/', which quote() would otherwise leave as-is.
+        user = quote(cls.USER, safe="")
+        password = quote(cls.PASSWORD, safe="")
+        return f"postgresql+psycopg2://{user}:{password}@{cls.HOST}:{cls.PORT}/{cls.DATABASE}"
+class DatabricksRuntimeConfig:
+    """Runtime settings for the Databricks backend, held as class-level attributes."""
+    CATALOG: ClassVar[str | None] = None
+    SCHEMA: ClassVar[str | None] = None
+    SQL_FILE_PATH: ClassVar[str | None] = None
+    SERVER_HOSTNAME: ClassVar[str | None] = None
+    HTTP_PATH: ClassVar[str | None] = None
+    ACCESS_TOKEN: ClassVar[str | None] = None
+    DEBUG: ClassVar[bool] = False
+    @classmethod
+    def has_native_connection(cls) -> bool:
+        """Report whether SERVER_HOSTNAME, HTTP_PATH and ACCESS_TOKEN are all set."""
+        connector_params = (cls.SERVER_HOSTNAME, cls.HTTP_PATH, cls.ACCESS_TOKEN)
+        return all(connector_params)
+    @classmethod
+    def validate(cls) -> None:
+        """Check that the mandatory Databricks settings are present.
+        CATALOG is checked before SCHEMA, so a missing catalog is reported first.
+        Raises:
+            ValueError: If CATALOG or SCHEMA are not configured.
+        """
+        required = (
+            (cls.CATALOG, "Databricks catalog required"),
+            (cls.SCHEMA, "Databricks schema required"),
+        )
+        for setting, message in required:
+            if not setting:
+                raise ValueError(message)
+class EngineConfig:
+    """Engine selection and active runtime configuration."""
+    # Defaults select the PostgreSQL backend: TYPE names it, RUNTIME holds the
+    # matching runtime config class (the class object itself, not an instance).
+    TYPE: ClassVar[str] = "postgresql"
+    RUNTIME: ClassVar[type] = PostgresRuntimeConfig
+    # Read from the OPENAI_API_KEY environment variable once, when this class
+    # body executes (i.e. at module import); None if the variable is unset.
+    # Later changes to the environment are not picked up automatically.
+    API_TOKEN: ClassVar[str | None] = os.environ.get("OPENAI_API_KEY")
+    # Model identifiers: a general default plus per-purpose overrides.
+    OPENAI_MODEL: ClassVar[str] = "gpt-4o-mini"
+    OPENAI_MODEL_INTENT: ClassVar[str] = "gpt-5-mini"
+    OPENAI_MODEL_SQL: ClassVar[str] = "gpt-4.1-mini"
+    OPENAI_MODEL_SCHEMA: ClassVar[str] = "gpt-5-mini"
+    OPENAI_BASE_URL: ClassVar[str] = "https://api.openai.com/v1"
+    # Relative file names; they resolve against the process working directory
+    # unless callers join them with a base path.
+    SCHEMA_JSON_PATH: ClassVar[str] = "schema_graph.json"
+    TEMPLATE_JSON_PATH: ClassVar[str] = "intent_templates.json"
+class QSimConfig:
+    """Question Simulator settings for NL question generation."""
+    # Generation volume.
+    INTENT_TYPES = 20
+    QUESTIONS_COUNT = 100
+    # Upper bounds on the shape of a generated intent.
+    MAX_TABLES_PER_INTENT = 3
+    MAX_FILTERS_PER_INTENT = 4
+    MAX_FILTER_COLUMNS = 2
+    MAX_GROUP_BY_COLUMNS = 2
+    MIN_AVG_VARIANTS_PER_INTENT = 1
+    MAX_AVG_VARIANTS_PER_INTENT = 10
+    MAX_NO_VARIANCE_RATIO = 0.25
+    # Target mix by table count; the three ratios sum to 1.0.
+    SINGLE_TABLE_RATIO = 0.40
+    TWO_TABLE_RATIO = 0.40
+    THREE_TABLE_RATIO = 0.20
+    MAX_CONSECUTIVE_DUPLICATES = 5
+    MAX_CONSECUTIVE_FAILURES = 5
+    # Minimum coverage ratios.
+    MIN_FILTER_RATIO = 0.70
+    MIN_HAVING_RATIO = 0.15
+    MIN_THREE_TABLE_RATIO = 0.10
+    # NOTE(review): presumably tables above the threshold row count are
+    # profiled on a sample of PROFILING_SAMPLE_SIZE rows — confirm in the profiler.
+    PROFILING_SAMPLE_THRESHOLD = 100_000
+    PROFILING_SAMPLE_SIZE = 10_000
+    RANDOM_SEED = 42
+    # Binds the module-level EXCLUDED_FILTER_PATTERNS list as a class attribute;
+    # both names refer to the same list object, so in-place mutation is shared.
+    EXCLUDED_FILTER_PATTERNS = EXCLUDED_FILTER_PATTERNS
+    # Relative output file names.
+    SKELETONS_JSON_PATH = "qsim_skeletons.json"
+    QUESTIONS_OUTPUT_PATH = "qsim_intents_with_questions.json"
+    MAX_ROLE_CLASSIFICATION_RETRIES = 2
+class SimulatorConfig:
+    """Coverage Simulator settings for synthetic intent expansion and template generation."""
+    MAX_FILTERS = 3
+    MAX_TABLES = 3
+    MAX_GROUPBY = 2
+    MAX_EXPR_COMPARISONS = 2
+    MAX_HAVING_CONDITIONS = 2
+    MAX_EXPANSION_DEPTH = 2
+    GOLD_OUTPUT_PATTERN = "gold_intents_v{version}.json"
+    REPORT_PATTERN = "simulation_report_v{version}.json"
+    RESULTS_CSV_PATTERN = "simulation_results_v{version}.csv"
+    FAILURES_PATTERN = "simulation_failures_v{version}.json"
+    RANDOM_SEED = 42
+    EXTRACT_EXPANSION_UNITS: list[str] = ["year", "month", "day", "quarter", "dow"]
+    DATE_TRUNC_EXPANSION_UNITS: list[str] = ["month", "quarter", "year"]
+    LIMIT_EXPANSION_VALUES: list[int] = [10, 50, 100]
+    DATE_WINDOW_EXPANSION_PRESETS: list[dict[str, int | str]] = [
+        {"unit": "day", "offset": 7},
+        {"unit": "day", "offset": 30},
+        {"unit": "day", "offset": 90},
+        {"unit": "month", "offset": 1},
+        {"unit": "month", "offset": 3},
+        {"unit": "month", "offset": 6},
+        {"unit": "month", "offset": 12},
+        {"unit": "year", "offset": 1},
+    ]
+    DATE_DIFF_EXPANSION_PRESETS: list[dict[str, int | str]] = [
+        {"unit": "day", "amount": 7},
+        {"unit": "day", "amount": 30},
+        {"unit": "day", "amount": 90},
+    ]