aetherdialect 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aetherdialect-0.1.0/LICENSE +7 -0
- aetherdialect-0.1.0/PKG-INFO +197 -0
- aetherdialect-0.1.0/README.md +162 -0
- aetherdialect-0.1.0/pyproject.toml +81 -0
- aetherdialect-0.1.0/setup.cfg +4 -0
- aetherdialect-0.1.0/src/aetherdialect.egg-info/PKG-INFO +197 -0
- aetherdialect-0.1.0/src/aetherdialect.egg-info/SOURCES.txt +65 -0
- aetherdialect-0.1.0/src/aetherdialect.egg-info/dependency_links.txt +1 -0
- aetherdialect-0.1.0/src/aetherdialect.egg-info/requires.txt +26 -0
- aetherdialect-0.1.0/src/aetherdialect.egg-info/top_level.txt +1 -0
- aetherdialect-0.1.0/src/text2sql/__init__.py +7 -0
- aetherdialect-0.1.0/src/text2sql/config.py +1063 -0
- aetherdialect-0.1.0/src/text2sql/contracts_base.py +952 -0
- aetherdialect-0.1.0/src/text2sql/contracts_core.py +1890 -0
- aetherdialect-0.1.0/src/text2sql/core_utils.py +834 -0
- aetherdialect-0.1.0/src/text2sql/dialect.py +1134 -0
- aetherdialect-0.1.0/src/text2sql/expansion_ops.py +1218 -0
- aetherdialect-0.1.0/src/text2sql/expansion_rules.py +496 -0
- aetherdialect-0.1.0/src/text2sql/intent_expr.py +1759 -0
- aetherdialect-0.1.0/src/text2sql/intent_process.py +2133 -0
- aetherdialect-0.1.0/src/text2sql/intent_repair.py +1733 -0
- aetherdialect-0.1.0/src/text2sql/intent_resolve.py +1292 -0
- aetherdialect-0.1.0/src/text2sql/live_testing.py +1117 -0
- aetherdialect-0.1.0/src/text2sql/main_execution.py +799 -0
- aetherdialect-0.1.0/src/text2sql/pipeline.py +1662 -0
- aetherdialect-0.1.0/src/text2sql/qsim_ops.py +1286 -0
- aetherdialect-0.1.0/src/text2sql/qsim_sample.py +609 -0
- aetherdialect-0.1.0/src/text2sql/qsim_struct.py +569 -0
- aetherdialect-0.1.0/src/text2sql/schema.py +973 -0
- aetherdialect-0.1.0/src/text2sql/schema_profiling.py +2075 -0
- aetherdialect-0.1.0/src/text2sql/simulator.py +970 -0
- aetherdialect-0.1.0/src/text2sql/sql_gen.py +1537 -0
- aetherdialect-0.1.0/src/text2sql/templates.py +1037 -0
- aetherdialect-0.1.0/src/text2sql/text2sql.py +726 -0
- aetherdialect-0.1.0/src/text2sql/utils.py +973 -0
- aetherdialect-0.1.0/src/text2sql/validation_agg.py +1033 -0
- aetherdialect-0.1.0/src/text2sql/validation_execute.py +1092 -0
- aetherdialect-0.1.0/src/text2sql/validation_schema.py +1847 -0
- aetherdialect-0.1.0/src/text2sql/validation_semantic.py +2122 -0
- aetherdialect-0.1.0/tests/test_config.py +769 -0
- aetherdialect-0.1.0/tests/test_contracts.py +2240 -0
- aetherdialect-0.1.0/tests/test_core_utils.py +1077 -0
- aetherdialect-0.1.0/tests/test_dialect.py +475 -0
- aetherdialect-0.1.0/tests/test_expansion_ops.py +585 -0
- aetherdialect-0.1.0/tests/test_expansion_rules.py +434 -0
- aetherdialect-0.1.0/tests/test_intent_expr.py +2315 -0
- aetherdialect-0.1.0/tests/test_intent_process.py +1194 -0
- aetherdialect-0.1.0/tests/test_intent_repair.py +3519 -0
- aetherdialect-0.1.0/tests/test_intent_resolve.py +1959 -0
- aetherdialect-0.1.0/tests/test_live_testing.py +670 -0
- aetherdialect-0.1.0/tests/test_main_execution.py +277 -0
- aetherdialect-0.1.0/tests/test_pipeline_units.py +1226 -0
- aetherdialect-0.1.0/tests/test_qsim_ops.py +499 -0
- aetherdialect-0.1.0/tests/test_qsim_sample.py +802 -0
- aetherdialect-0.1.0/tests/test_qsim_struct.py +810 -0
- aetherdialect-0.1.0/tests/test_schema.py +537 -0
- aetherdialect-0.1.0/tests/test_schema_profiling.py +1056 -0
- aetherdialect-0.1.0/tests/test_simulator.py +389 -0
- aetherdialect-0.1.0/tests/test_simulator_pipeline.py +248 -0
- aetherdialect-0.1.0/tests/test_sql_gen.py +1327 -0
- aetherdialect-0.1.0/tests/test_templates.py +1005 -0
- aetherdialect-0.1.0/tests/test_text2sql.py +240 -0
- aetherdialect-0.1.0/tests/test_utils.py +1652 -0
- aetherdialect-0.1.0/tests/test_validation_agg.py +832 -0
- aetherdialect-0.1.0/tests/test_validation_execute.py +636 -0
- aetherdialect-0.1.0/tests/test_validation_schema.py +1201 -0
- aetherdialect-0.1.0/tests/test_validation_semantic.py +2251 -0
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright (C) 2026 Akul Ameya
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aetherdialect
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deterministic, validation-first Text-to-SQL system for business databases
|
|
5
|
+
Author-email: Akul Ameya <akul.ameya@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/akul-ameya/aetherdialect
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: jsonschema<5,>=4.0
|
|
12
|
+
Requires-Dist: openai<3,>=2.0.0
|
|
13
|
+
Requires-Dist: sqlglot<30,>=29.0
|
|
14
|
+
Requires-Dist: platformdirs<5,>=2.0.0
|
|
15
|
+
Requires-Dist: python-dotenv<2,>=1.0.0
|
|
16
|
+
Provides-Extra: databricks
|
|
17
|
+
Requires-Dist: pyspark<4,>=3.3; extra == "databricks"
|
|
18
|
+
Requires-Dist: databricks-sql-connector<4,>=3.0; extra == "databricks"
|
|
19
|
+
Provides-Extra: postgresql
|
|
20
|
+
Requires-Dist: SQLAlchemy<3,>=2.0; extra == "postgresql"
|
|
21
|
+
Requires-Dist: psycopg2-binary<3,>=2.9; extra == "postgresql"
|
|
22
|
+
Requires-Dist: pglast<8,>=5.0; extra == "postgresql"
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
26
|
+
Requires-Dist: vulture<3,>=2.11; extra == "dev"
|
|
27
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
28
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
29
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
30
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pre-commit>=3.0; extra == "dev"
|
|
32
|
+
Requires-Dist: black<25,>=24; extra == "dev"
|
|
33
|
+
Requires-Dist: docformatter<2,>=1.7; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# Deterministic, validation-first Text-to-SQL for business databases
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install aetherdialect
|
|
42
|
+
pip install "aetherdialect[postgresql]"
|
|
43
|
+
pip install "aetherdialect[databricks]"
|
|
44
|
+
pip install "aetherdialect[postgresql,databricks]"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Requires Python ≥ 3.10 and an [OpenAI API key](https://platform.openai.com/api-keys).
|
|
48
|
+
|
|
49
|
+
| Extra | Brings in | Use when |
|
|
50
|
+
| ------------ | ---------------------------------------------- | --------------------------- |
|
|
51
|
+
| `postgresql` | SQLAlchemy, PostgreSQL driver, `pglast` | `engine="postgresql"` |
|
|
52
|
+
| `databricks` | PySpark, Databricks SQL connector | `engine="databricks"` |
|
|
53
|
+
|
|
54
|
+
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect. The base package already depends on `sqlglot`; `pglast` is installed with the `postgresql` extra.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## Quickstart
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from text2sql import Text2SQL
|
|
62
|
+
|
|
63
|
+
t2s = Text2SQL(
|
|
64
|
+
engine="postgresql",
|
|
65
|
+
host="localhost",
|
|
66
|
+
database="mydb",
|
|
67
|
+
password="secret",
|
|
68
|
+
openai_api_key="sk-...",
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
t2s.run_interactive()
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Constructor options, modes, optional files, and the full method list are in **[USAGE.md](USAGE.md)**.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## What this is
|
|
79
|
+
|
|
80
|
+
A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**. It targets **stable business schemas** and **repeated analytical questions**, not open-ended “any SQL” generation.
|
|
81
|
+
|
|
82
|
+
- Natural language is turned into a **structured intent** (tables, select expressions, filters, grouping, ordering, optional CTEs) that is **shared across dialects**; dialect-specific SQL is produced later.
|
|
83
|
+
- **Templates** store previously accepted query patterns; **negative memory** records rejections so bad shapes are less likely to repeat.
|
|
84
|
+
- LLM calls run at **temperature 0**; for the same inputs and schema state, behavior is **repeatable**.
|
|
85
|
+
- **Bounded LLM use**: strong paths reuse templates or deterministic structure before asking the model for SQL.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Philosophy
|
|
90
|
+
|
|
91
|
+
- **Determinism over creativity** — prefer a correct, boring plan to a novel one.
|
|
92
|
+
- **Correct joins over clever SQL** — join paths come from **foreign keys** and precomputed paths, not free-form guessing.
|
|
93
|
+
- **Validate before execute** — schema checks, intent consistency, join shape, and dialect-safe **read-only** `SELECT` rules.
|
|
94
|
+
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed, generate or repair SQL; everything else is rules and stores.
|
|
95
|
+
- **Safe defaults for non-analysts** — narrow allowed SQL; you still choose **database credentials** (read-only vs write-capable is outside this library).
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## What it supports (at a glance)
|
|
100
|
+
|
|
101
|
+
**Backends**
|
|
102
|
+
|
|
103
|
+
- PostgreSQL via SQLAlchemy.
|
|
104
|
+
- Databricks via Unity Catalog introspection (with optional DDL file fallback when the catalog is empty).
|
|
105
|
+
|
|
106
|
+
**Schema**
|
|
107
|
+
|
|
108
|
+
- Load from **live introspection** (primary).
|
|
109
|
+
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
110
|
+
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
111
|
+
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling.
|
|
112
|
+
- Optional **human notes** (plain text) fed once when the schema graph is built: richer **descriptions**, **roles**, and optional **sensitivity** labels — without renaming tables or inventing columns.
|
|
113
|
+
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
114
|
+
|
|
115
|
+
**Intent / SQL shape (analytical subset)**
|
|
116
|
+
|
|
117
|
+
- **Queries:** `SELECT` only (enforced with pattern checks and dialect parsing). **CTEs** reuse the same intent model as the outer query.
|
|
118
|
+
- **Joins:** only along **FK-backed paths**; join type for injected joins is chosen **deterministically** from table roles (e.g. dimension side as `LEFT` where applicable).
|
|
119
|
+
- **Select list:** bare columns, **aggregates** (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, etc.), **arithmetic and string expressions** where the schema allows them, **`DISTINCT`**, and **scalar functions** subject to column metadata.
|
|
120
|
+
- **Filters / boolean logic:** comparisons, **`AND` / `OR`**, **`IN`**, **`LIKE`**; **`ILIKE` / `NOT ILIKE` on PostgreSQL only** (intent stays dialect-agnostic; SQL rendering differs). Null / boolean value normalization in the repair chain.
|
|
121
|
+
- **`BETWEEN`** in intent is **decomposed** into a pair of comparable predicates.
|
|
122
|
+
- **Grouping / ordering:** **`GROUP BY`**, **`HAVING`** (aggregate-aware), **`ORDER BY`**, **`LIMIT`**; rules tie **grain** (row-level vs grouped) to aggregates and grouped columns.
|
|
123
|
+
- **Dates:** structured **`date_window`** (anchor unit + offset) and **date-difference** filters between columns where supported.
|
|
124
|
+
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM` / `AVG` on select columns (main query and CTEs).
|
|
125
|
+
- **`CASE` / `WHEN`:** only in the **select list** in the intent model (not in `WHERE` / `HAVING`).
|
|
126
|
+
- **Arrays / lists:** membership-style filters; SQL uses dialect-appropriate forms; optional **UNNEST / EXPLODE-style** expansion in CTE select lists for typed array columns.
|
|
127
|
+
- **Metadata:** **UNIQUE** (and related) when reflection or DDL exposes it, for ranking “human readable” identifiers.
|
|
128
|
+
|
|
129
|
+
**Operational modes**
|
|
130
|
+
|
|
131
|
+
- **Interactive** — ask questions, accept/reject, results export.
|
|
132
|
+
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
133
|
+
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## What it is not
|
|
138
|
+
|
|
139
|
+
- Not a **full SQL** or **stored-procedure** generator: no `UNION`/`INTERSECT`/`EXCEPT`, no correlated subqueries, no `EXISTS`, no `LATERAL`, no **DML/DDL**, no arbitrary **RIGHT/FULL OUTER** join policy in the constrained path.
|
|
140
|
+
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT` / `EXPLAIN` as you prefer).
|
|
141
|
+
- Not **schema-agnostic**: quality depends on **FKs**, sensible types, and optional notes for domain language.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## How a question becomes SQL
|
|
146
|
+
|
|
147
|
+
1. **Template match** — if a trusted pattern fits, reuse parameterized SQL (often **no** SQL LLM call).
|
|
148
|
+
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain** (see below).
|
|
149
|
+
3. **Join resolution** — choose among valid **FK paths** for the intent’s tables; disambiguation may use the LLM when multiple paths tie.
|
|
150
|
+
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and optionally **`EXPLAIN`** when an engine is available.
|
|
151
|
+
5. **Execute** (where the mode allows) and **learn** — accept → promote template trust; reject → record negative pattern.
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Validation (layers)
|
|
156
|
+
|
|
157
|
+
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (`pglast` on PostgreSQL, **`sqlglot` (Spark)** on Databricks). When a live **engine** is passed in, **`EXPLAIN`** can be used as an extra executability check.
|
|
158
|
+
- **Schema vs intent** — tables/columns/CTEs, **selectability**, access and sensitivity policy, window / CASE / array shapes, filter and HAVING ops per column, aggregate roles in select/HAVING/ORDER BY, scalar function typing, filter value types vs column types, null/date-window/date-diff rules.
|
|
159
|
+
- **Semantic consistency** — grouped queries require proper aggregation; contradictory filters and impossible HAVING clauses are rejected; **grain** alignment is checked as one cross-check among several, not the sole criterion.
|
|
160
|
+
- **Joins** — paths must match the FK graph; guarded path avoids ad-hoc join guessing.
|
|
161
|
+
|
|
162
|
+
## Deterministic repairs (after intent parse)
|
|
163
|
+
|
|
164
|
+
Applied in order (high level): `COUNT(*)` normalization; CTE naming and output aliases; qualify CTE outputs; sanitize table names; grain rules for grouped CTE usage and **grain consistency**; strip redundant `GROUP BY`; normalize filters/HAVING; null-equality fixes; strip join-condition leakage into filters; per-CTE sort order; simplify expressions; `IN` value normalization; date-diff classification and raw-value fixes; **`BETWEEN` → paired predicates**; auto-repair filters/HAVING; strip impossible HAVING; FK filter type repair; filter value case / enum alignment; boolean and null filter values; expand FK selects to descriptive columns; deduplicate contradictory filters; redundant PK re-qualification; **window**, **CASE**, and **array** intent repairs; sensitivity and access policy enforcement on the intent.
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Learning and reuse
|
|
169
|
+
|
|
170
|
+
- **Accepted templates** — intent fingerprint, parameterized SQL, optional example question, **trust** that rises with validation and falls with rejection.
|
|
171
|
+
- **Rejected templates** — categorized failures so similar bad intents are discouraged.
|
|
172
|
+
- Persistence is under a **per-connection artifact directory** (see USAGE); you can back it up or reset it by removing that directory.
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Coverage simulator (brief)
|
|
177
|
+
|
|
178
|
+
1. Parse each **seed** line into a gold intent.
|
|
179
|
+
2. **Expand** with a fixed set of **deterministic operators** (filters, aggregates, joins, time windows, numeric transforms, distinct/limit/OR-groups, expressions, **window variants**, etc.), **deduplicated** across depths.
|
|
180
|
+
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
181
|
+
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
182
|
+
|
|
183
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, so its seed count reflects that cap. It returns **rough** LLM-call and execution estimates from a seed file and schema stats.
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## QSim (brief)
|
|
188
|
+
|
|
189
|
+
Generates **reproducible** question lists from the schema and profiled values. Same seed → same output. Use for regression-style testing or dataset building.
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## When to use it
|
|
194
|
+
|
|
195
|
+
**Good fit:** star/snowflake-style models, clear FKs, repeated BI-style questions, PostgreSQL or Databricks.
|
|
196
|
+
|
|
197
|
+
**Poor fit:** schemas without relationships, heavy procedural logic, or expectations of arbitrary SQL features outside the supported analytical subset.
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# Deterministic, validation-first Text-to-SQL for business databases
|
|
2
|
+
|
|
3
|
+
## Installation
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install aetherdialect
|
|
7
|
+
pip install "aetherdialect[postgresql]"
|
|
8
|
+
pip install "aetherdialect[databricks]"
|
|
9
|
+
pip install "aetherdialect[postgresql,databricks]"
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
Requires Python ≥ 3.10 and an [OpenAI API key](https://platform.openai.com/api-keys).
|
|
13
|
+
|
|
14
|
+
| Extra | Brings in | Use when |
|
|
15
|
+
| ------------ | ---------------------------------------------- | --------------------------- |
|
|
16
|
+
| `postgresql` | SQLAlchemy, PostgreSQL driver, `pglast` | `engine="postgresql"` |
|
|
17
|
+
| `databricks` | PySpark, Databricks SQL connector | `engine="databricks"` |
|
|
18
|
+
|
|
19
|
+
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect. The base package already depends on `sqlglot`; `pglast` is installed with the `postgresql` extra.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Quickstart
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from text2sql import Text2SQL
|
|
27
|
+
|
|
28
|
+
t2s = Text2SQL(
|
|
29
|
+
engine="postgresql",
|
|
30
|
+
host="localhost",
|
|
31
|
+
database="mydb",
|
|
32
|
+
password="secret",
|
|
33
|
+
openai_api_key="sk-...",
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
t2s.run_interactive()
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Constructor options, modes, optional files, and the full method list are in **[USAGE.md](USAGE.md)**.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## What this is
|
|
44
|
+
|
|
45
|
+
A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**. It targets **stable business schemas** and **repeated analytical questions**, not open-ended “any SQL” generation.
|
|
46
|
+
|
|
47
|
+
- Natural language is turned into a **structured intent** (tables, select expressions, filters, grouping, ordering, optional CTEs) that is **shared across dialects**; dialect-specific SQL is produced later.
|
|
48
|
+
- **Templates** store previously accepted query patterns; **negative memory** records rejections so bad shapes are less likely to repeat.
|
|
49
|
+
- LLM calls run at **temperature 0**; for the same inputs and schema state, behavior is **repeatable**.
|
|
50
|
+
- **Bounded LLM use**: strong paths reuse templates or deterministic structure before asking the model for SQL.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## Philosophy
|
|
55
|
+
|
|
56
|
+
- **Determinism over creativity** — prefer a correct, boring plan to a novel one.
|
|
57
|
+
- **Correct joins over clever SQL** — join paths come from **foreign keys** and precomputed paths, not free-form guessing.
|
|
58
|
+
- **Validate before execute** — schema checks, intent consistency, join shape, and dialect-safe **read-only** `SELECT` rules.
|
|
59
|
+
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed, generate or repair SQL; everything else is rules and stores.
|
|
60
|
+
- **Safe defaults for non-analysts** — narrow allowed SQL; you still choose **database credentials** (read-only vs write-capable is outside this library).
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## What it supports (at a glance)
|
|
65
|
+
|
|
66
|
+
**Backends**
|
|
67
|
+
|
|
68
|
+
- PostgreSQL via SQLAlchemy.
|
|
69
|
+
- Databricks via Unity Catalog introspection (with optional DDL file fallback when the catalog is empty).
|
|
70
|
+
|
|
71
|
+
**Schema**
|
|
72
|
+
|
|
73
|
+
- Load from **live introspection** (primary).
|
|
74
|
+
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
75
|
+
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
76
|
+
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling.
|
|
77
|
+
- Optional **human notes** (plain text) fed once when the schema graph is built: richer **descriptions**, **roles**, and optional **sensitivity** labels — without renaming tables or inventing columns.
|
|
78
|
+
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
79
|
+
|
|
80
|
+
**Intent / SQL shape (analytical subset)**
|
|
81
|
+
|
|
82
|
+
- **Queries:** `SELECT` only (enforced with pattern checks and dialect parsing). **CTEs** reuse the same intent model as the outer query.
|
|
83
|
+
- **Joins:** only along **FK-backed paths**; join type for injected joins is chosen **deterministically** from table roles (e.g. dimension side as `LEFT` where applicable).
|
|
84
|
+
- **Select list:** bare columns, **aggregates** (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, etc.), **arithmetic and string expressions** where the schema allows them, **`DISTINCT`**, and **scalar functions** subject to column metadata.
|
|
85
|
+
- **Filters / boolean logic:** comparisons, **`AND` / `OR`**, **`IN`**, **`LIKE`**; **`ILIKE` / `NOT ILIKE` on PostgreSQL only** (intent stays dialect-agnostic; SQL rendering differs). Null / boolean value normalization in the repair chain.
|
|
86
|
+
- **`BETWEEN`** in intent is **decomposed** into a pair of comparable predicates.
|
|
87
|
+
- **Grouping / ordering:** **`GROUP BY`**, **`HAVING`** (aggregate-aware), **`ORDER BY`**, **`LIMIT`**; rules tie **grain** (row-level vs grouped) to aggregates and grouped columns.
|
|
88
|
+
- **Dates:** structured **`date_window`** (anchor unit + offset) and **date-difference** filters between columns where supported.
|
|
89
|
+
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM` / `AVG` on select columns (main query and CTEs).
|
|
90
|
+
- **`CASE` / `WHEN`:** only in the **select list** in the intent model (not in `WHERE` / `HAVING`).
|
|
91
|
+
- **Arrays / lists:** membership-style filters; SQL uses dialect-appropriate forms; optional **UNNEST / EXPLODE-style** expansion in CTE select lists for typed array columns.
|
|
92
|
+
- **Metadata:** **UNIQUE** (and related) when reflection or DDL exposes it, for ranking “human readable” identifiers.
|
|
93
|
+
|
|
94
|
+
**Operational modes**
|
|
95
|
+
|
|
96
|
+
- **Interactive** — ask questions, accept/reject, results export.
|
|
97
|
+
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
98
|
+
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## What it is not
|
|
103
|
+
|
|
104
|
+
- Not a **full SQL** or **stored-procedure** generator: no `UNION`/`INTERSECT`/`EXCEPT`, no correlated subqueries, no `EXISTS`, no `LATERAL`, no **DML/DDL**, no arbitrary **RIGHT/FULL OUTER** join policy in the constrained path.
|
|
105
|
+
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT` / `EXPLAIN` as you prefer).
|
|
106
|
+
- Not **schema-agnostic**: quality depends on **FKs**, sensible types, and optional notes for domain language.
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## How a question becomes SQL
|
|
111
|
+
|
|
112
|
+
1. **Template match** — if a trusted pattern fits, reuse parameterized SQL (often **no** SQL LLM call).
|
|
113
|
+
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain** (see below).
|
|
114
|
+
3. **Join resolution** — choose among valid **FK paths** for the intent’s tables; disambiguation may use the LLM when multiple paths tie.
|
|
115
|
+
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and optionally **`EXPLAIN`** when an engine is available.
|
|
116
|
+
5. **Execute** (where the mode allows) and **learn** — accept → promote template trust; reject → record negative pattern.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Validation (layers)
|
|
121
|
+
|
|
122
|
+
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (`pglast` on PostgreSQL, **`sqlglot` (Spark)** on Databricks). When a live **engine** is passed in, **`EXPLAIN`** can be used as an extra executability check.
|
|
123
|
+
- **Schema vs intent** — tables/columns/CTEs, **selectability**, access and sensitivity policy, window / CASE / array shapes, filter and HAVING ops per column, aggregate roles in select/HAVING/ORDER BY, scalar function typing, filter value types vs column types, null/date-window/date-diff rules.
|
|
124
|
+
- **Semantic consistency** — grouped queries require proper aggregation; contradictory filters and impossible HAVING clauses are rejected; **grain** alignment is checked as one cross-check among several, not the sole criterion.
|
|
125
|
+
- **Joins** — paths must match the FK graph; guarded path avoids ad-hoc join guessing.
|
|
126
|
+
|
|
127
|
+
## Deterministic repairs (after intent parse)
|
|
128
|
+
|
|
129
|
+
Applied in order (high level): `COUNT(*)` normalization; CTE naming and output aliases; qualify CTE outputs; sanitize table names; grain rules for grouped CTE usage and **grain consistency**; strip redundant `GROUP BY`; normalize filters/HAVING; null-equality fixes; strip join-condition leakage into filters; per-CTE sort order; simplify expressions; `IN` value normalization; date-diff classification and raw-value fixes; **`BETWEEN` → paired predicates**; auto-repair filters/HAVING; strip impossible HAVING; FK filter type repair; filter value case / enum alignment; boolean and null filter values; expand FK selects to descriptive columns; deduplicate contradictory filters; redundant PK re-qualification; **window**, **CASE**, and **array** intent repairs; sensitivity and access policy enforcement on the intent.
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Learning and reuse
|
|
134
|
+
|
|
135
|
+
- **Accepted templates** — intent fingerprint, parameterized SQL, optional example question, **trust** that rises with validation and falls with rejection.
|
|
136
|
+
- **Rejected templates** — categorized failures so similar bad intents are discouraged.
|
|
137
|
+
- Persistence is under a **per-connection artifact directory** (see USAGE); you can back it up or reset it by removing that directory.
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## Coverage simulator (brief)
|
|
142
|
+
|
|
143
|
+
1. Parse each **seed** line into a gold intent.
|
|
144
|
+
2. **Expand** with a fixed set of **deterministic operators** (filters, aggregates, joins, time windows, numeric transforms, distinct/limit/OR-groups, expressions, **window variants**, etc.), **deduplicated** across depths.
|
|
145
|
+
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
146
|
+
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
147
|
+
|
|
148
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, so its seed count reflects that cap. It returns **rough** LLM-call and execution estimates from a seed file and schema stats.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## QSim (brief)
|
|
153
|
+
|
|
154
|
+
Generates **reproducible** question lists from the schema and profiled values. Same seed → same output. Use for regression-style testing or dataset building.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## When to use it
|
|
159
|
+
|
|
160
|
+
**Good fit:** star/snowflake-style models, clear FKs, repeated BI-style questions, PostgreSQL or Databricks.
|
|
161
|
+
|
|
162
|
+
**Poor fit:** schemas without relationships, heavy procedural logic, or expectations of arbitrary SQL features outside the supported analytical subset.
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "aetherdialect"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Deterministic, validation-first Text-to-SQL system for business databases"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [{name = "Akul Ameya", email = "akul.ameya@gmail.com"}]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"jsonschema>=4.0,<5",
|
|
15
|
+
"openai>=2.0.0,<3",
|
|
16
|
+
"sqlglot>=29.0,<30",
|
|
17
|
+
"platformdirs>=2.0.0,<5",
|
|
18
|
+
"python-dotenv>=1.0.0,<2",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
databricks = ["pyspark>=3.3,<4", "databricks-sql-connector>=3.0,<4"]
|
|
23
|
+
postgresql = [
|
|
24
|
+
"SQLAlchemy>=2.0,<3",
|
|
25
|
+
"psycopg2-binary>=2.9,<3",
|
|
26
|
+
"pglast>=5.0,<8",
|
|
27
|
+
]
|
|
28
|
+
dev = [
|
|
29
|
+
"pytest>=8.0",
|
|
30
|
+
"pytest-cov>=5.0",
|
|
31
|
+
"vulture>=2.11,<3",
|
|
32
|
+
"ruff>=0.4",
|
|
33
|
+
"mypy>=1.10",
|
|
34
|
+
"twine>=5.0",
|
|
35
|
+
"build>=1.0",
|
|
36
|
+
"pre-commit>=3.0",
|
|
37
|
+
"black>=24,<25",
|
|
38
|
+
"docformatter>=1.7,<2",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[tool.ruff]
|
|
42
|
+
line-length = 120
|
|
43
|
+
target-version = "py310"
|
|
44
|
+
src = ["src", "tests", "live_tests"]
|
|
45
|
+
exclude = [
|
|
46
|
+
"dev_workspace",
|
|
47
|
+
".venv",
|
|
48
|
+
"venv",
|
|
49
|
+
".pytest_cache",
|
|
50
|
+
"__pycache__",
|
|
51
|
+
"src/text2sql/__pycache__",
|
|
52
|
+
"tests/__pycache__",
|
|
53
|
+
"live_tests/__pycache__"
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
[tool.ruff.lint]
|
|
57
|
+
select = ["E", "F", "I", "B", "UP"]
|
|
58
|
+
ignore = ["E501"]
|
|
59
|
+
|
|
60
|
+
[tool.ruff.format]
|
|
61
|
+
quote-style = "double"
|
|
62
|
+
indent-style = "space"
|
|
63
|
+
line-ending = "auto"
|
|
64
|
+
|
|
65
|
+
[tool.mypy]
|
|
66
|
+
python_version = "3.10"
|
|
67
|
+
strict = true
|
|
68
|
+
|
|
69
|
+
[tool.pytest.ini_options]
|
|
70
|
+
testpaths = ["tests", "live_tests"]
|
|
71
|
+
markers = [
|
|
72
|
+
"live: marks tests that require a live LLM and database connection (deselect with '-m \"not live\"')",
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
[tool.setuptools.packages.find]
|
|
76
|
+
where = ["src"]
|
|
77
|
+
include = ["text2sql*"]
|
|
78
|
+
exclude = ["__pycache__"]
|
|
79
|
+
|
|
80
|
+
[project.urls]
|
|
81
|
+
Homepage = "https://github.com/akul-ameya/aetherdialect"
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: aetherdialect
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Deterministic, validation-first Text-to-SQL system for business databases
|
|
5
|
+
Author-email: Akul Ameya <akul.ameya@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/akul-ameya/aetherdialect
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: jsonschema<5,>=4.0
|
|
12
|
+
Requires-Dist: openai<3,>=2.0.0
|
|
13
|
+
Requires-Dist: sqlglot<30,>=29.0
|
|
14
|
+
Requires-Dist: platformdirs<5,>=2.0.0
|
|
15
|
+
Requires-Dist: python-dotenv<2,>=1.0.0
|
|
16
|
+
Provides-Extra: databricks
|
|
17
|
+
Requires-Dist: pyspark<4,>=3.3; extra == "databricks"
|
|
18
|
+
Requires-Dist: databricks-sql-connector<4,>=3.0; extra == "databricks"
|
|
19
|
+
Provides-Extra: postgresql
|
|
20
|
+
Requires-Dist: SQLAlchemy<3,>=2.0; extra == "postgresql"
|
|
21
|
+
Requires-Dist: psycopg2-binary<3,>=2.9; extra == "postgresql"
|
|
22
|
+
Requires-Dist: pglast<8,>=5.0; extra == "postgresql"
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
26
|
+
Requires-Dist: vulture<3,>=2.11; extra == "dev"
|
|
27
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
28
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
29
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
30
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pre-commit>=3.0; extra == "dev"
|
|
32
|
+
Requires-Dist: black<25,>=24; extra == "dev"
|
|
33
|
+
Requires-Dist: docformatter<2,>=1.7; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# Deterministic, validation-first Text-to-SQL for business databases
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install aetherdialect
|
|
42
|
+
pip install "aetherdialect[postgresql]"
|
|
43
|
+
pip install "aetherdialect[databricks]"
|
|
44
|
+
pip install "aetherdialect[postgresql,databricks]"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Requires Python ≥ 3.10 and an [OpenAI API key](https://platform.openai.com/api-keys).
|
|
48
|
+
|
|
49
|
+
| Extra | Brings in | Use when |
|
|
50
|
+
| ------------ | ---------------------------------------------- | --------------------------- |
|
|
51
|
+
| `postgresql` | SQLAlchemy, PostgreSQL driver, `pglast` | `engine="postgresql"` |
|
|
52
|
+
| `databricks` | PySpark, Databricks SQL connector | `engine="databricks"` |
|
|
53
|
+
|
|
54
|
+
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect. The base package already depends on `sqlglot`; `pglast` is installed with the `postgresql` extra.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## Quickstart
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from text2sql import Text2SQL
|
|
62
|
+
|
|
63
|
+
t2s = Text2SQL(
|
|
64
|
+
engine="postgresql",
|
|
65
|
+
host="localhost",
|
|
66
|
+
database="mydb",
|
|
67
|
+
password="secret",
|
|
68
|
+
openai_api_key="sk-...",
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
t2s.run_interactive()
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Constructor options, modes, optional files, and the full method list are in **[USAGE.md](USAGE.md)**.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## What this is
|
|
79
|
+
|
|
80
|
+
A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**. It targets **stable business schemas** and **repeated analytical questions**, not open-ended “any SQL” generation.
|
|
81
|
+
|
|
82
|
+
- Natural language is turned into a **structured intent** (tables, select expressions, filters, grouping, ordering, optional CTEs) that is **shared across dialects**; dialect-specific SQL is produced later.
|
|
83
|
+
- **Templates** store previously accepted query patterns; **negative memory** records rejections so bad shapes are less likely to repeat.
|
|
84
|
+
- LLM calls run at **temperature 0**; for the same inputs and schema state, behavior is **repeatable**.
|
|
85
|
+
- **Bounded LLM use**: strong paths reuse templates or deterministic structure before asking the model for SQL.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## Philosophy
|
|
90
|
+
|
|
91
|
+
- **Determinism over creativity** — prefer a correct, boring plan to a novel one.
|
|
92
|
+
- **Correct joins over clever SQL** — join paths come from **foreign keys** and precomputed paths, not free-form guessing.
|
|
93
|
+
- **Validate before execute** — schema checks, intent consistency, join shape, and dialect-safe **read-only** `SELECT` rules.
|
|
94
|
+
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed, generate or repair SQL; everything else is rules and stores.
|
|
95
|
+
- **Safe defaults for non-analysts** — narrow allowed SQL; you still choose **database credentials** (read-only vs write-capable is outside this library).
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## What it supports (at a glance)
|
|
100
|
+
|
|
101
|
+
**Backends**
|
|
102
|
+
|
|
103
|
+
- PostgreSQL via SQLAlchemy.
|
|
104
|
+
- Databricks via Unity Catalog introspection (with optional DDL file fallback when the catalog is empty).
|
|
105
|
+
|
|
106
|
+
**Schema**
|
|
107
|
+
|
|
108
|
+
- Load from **live introspection** (primary).
|
|
109
|
+
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
110
|
+
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
111
|
+
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling.
|
|
112
|
+
- Optional **human notes** (plain text) fed once when the schema graph is built: richer **descriptions**, **roles**, and optional **sensitivity** labels — without renaming tables or inventing columns.
|
|
113
|
+
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
114
|
+
|
|
115
|
+
**Intent / SQL shape (analytical subset)**
|
|
116
|
+
|
|
117
|
+
- **Queries:** `SELECT` only (enforced with pattern checks and dialect parsing). **CTEs** reuse the same intent model as the outer query.
|
|
118
|
+
- **Joins:** only along **FK-backed paths**; join type for injected joins is chosen **deterministically** from table roles (e.g. dimension side as `LEFT` where applicable).
|
|
119
|
+
- **Select list:** bare columns, **aggregates** (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, etc.), **arithmetic and string expressions** where the schema allows them, **`DISTINCT`**, and **scalar functions** subject to column metadata.
|
|
120
|
+
- **Filters / boolean logic:** comparisons, **`AND` / `OR`**, **`IN`**, **`LIKE`**; **`ILIKE` / `NOT ILIKE` on PostgreSQL only** (intent stays dialect-agnostic; SQL rendering differs). Null and boolean filter values are normalized in the repair chain.
|
|
121
|
+
- **`BETWEEN`** in intent is **decomposed** into a pair of comparable predicates.
|
|
122
|
+
- **Grouping / ordering:** **`GROUP BY`**, **`HAVING`** (aggregate-aware), **`ORDER BY`**, **`LIMIT`**; rules tie **grain** (row-level vs grouped) to aggregates and grouped columns.
|
|
123
|
+
- **Dates:** structured **`date_window`** (anchor unit + offset) and **date-difference** filters between columns where supported.
|
|
124
|
+
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM` / `AVG` on select columns (main query and CTEs).
|
|
125
|
+
- **`CASE` / `WHEN`:** only in the **select list** in the intent model (not in `WHERE` / `HAVING`).
|
|
126
|
+
- **Arrays / lists:** membership-style filters; SQL uses dialect-appropriate forms; optional **UNNEST / EXPLODE-style** expansion in CTE select lists for typed array columns.
|
|
127
|
+
- **Metadata:** **UNIQUE** (and related) constraints, when reflection or DDL exposes them, are used for ranking “human-readable” identifiers.
|
|
128
|
+
|
|
129
|
+
**Operational modes**
|
|
130
|
+
|
|
131
|
+
- **Interactive** — ask questions, accept/reject, results export.
|
|
132
|
+
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
133
|
+
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## What it is not
|
|
138
|
+
|
|
139
|
+
- Not a **full SQL** or **stored-procedure** generator: no `UNION`/`INTERSECT`/`EXCEPT`, no correlated subqueries, no `EXISTS`, no `LATERAL`, no **DML/DDL**, no arbitrary **RIGHT/FULL OUTER** join policy in the constrained path.
|
|
140
|
+
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT` / `EXPLAIN` as you prefer).
|
|
141
|
+
- Not **schema-agnostic**: quality depends on **FKs**, sensible types, and optional notes for domain language.
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## How a question becomes SQL
|
|
146
|
+
|
|
147
|
+
1. **Template match** — if a trusted pattern fits, reuse parameterized SQL (often **no** SQL LLM call).
|
|
148
|
+
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain** (see below).
|
|
149
|
+
3. **Join resolution** — choose among valid **FK paths** for the intent’s tables; disambiguation may use the LLM when multiple paths tie.
|
|
150
|
+
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and optionally **`EXPLAIN`** when an engine is available.
|
|
151
|
+
5. **Execute** (where the mode allows) and **learn** — accept → promote template trust; reject → record negative pattern.
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Validation (layers)
|
|
156
|
+
|
|
157
|
+
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (`pglast` on PostgreSQL, **`sqlglot` (Spark)** on Databricks). When a live **engine** is passed in, **`EXPLAIN`** can be used as an extra executability check.
|
|
158
|
+
- **Schema vs intent** — tables/columns/CTEs, **selectability**, access and sensitivity policy, window / CASE / array shapes, filter and HAVING ops per column, aggregate roles in select/HAVING/ORDER BY, scalar function typing, filter value types vs column types, null/date-window/date-diff rules.
|
|
159
|
+
- **Semantic consistency** — grouped queries require proper aggregation; contradictions and impossible HAVING are rejected; **grain** alignment is checked (one of several cross-checks, not the sole criterion).
|
|
160
|
+
- **Joins** — paths must match the FK graph; guarded path avoids ad-hoc join guessing.
|
|
161
|
+
|
|
162
|
+
## Deterministic repairs (after intent parse)
|
|
163
|
+
|
|
164
|
+
Applied in order (high level): `COUNT(*)` normalization; CTE naming and output aliases; qualify CTE outputs; sanitize table names; grain rules for grouped CTE usage and **grain consistency**; strip redundant `GROUP BY`; normalize filters/HAVING; null-equality fixes; strip join-condition leakage into filters; per-CTE sort order; simplify expressions; `IN` value normalization; date-diff classification and raw-value fixes; **`BETWEEN` → paired predicates**; auto-repair filters/HAVING; strip impossible HAVING; FK filter type repair; filter value case / enum alignment; boolean and null filter values; expand FK selects to descriptive columns; deduplicate contradictory filters; redundant PK re-qualification; **window**, **CASE**, and **array** intent repairs; sensitivity and access policy enforcement on the intent.
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Learning and reuse
|
|
169
|
+
|
|
170
|
+
- **Accepted templates** — intent fingerprint, parameterized SQL, optional example question, **trust** that rises with validation and falls with rejection.
|
|
171
|
+
- **Rejected templates** — categorized failures so similar bad intents are discouraged.
|
|
172
|
+
- Persistence is under a **per-connection artifact directory** (see USAGE); you can back it up or reset it by removing that directory.
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## Coverage simulator (brief)
|
|
177
|
+
|
|
178
|
+
1. Parse each **seed** line into a gold intent.
|
|
179
|
+
2. **Expand** with a fixed set of **deterministic operators** (filters, aggregates, joins, time windows, numeric transforms, distinct/limit/OR-groups, expressions, **window variants**, etc.), **deduplicated** across depths.
|
|
180
|
+
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
181
|
+
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
182
|
+
|
|
183
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, so its seed count reflects that cap; it returns **rough** LLM-call and execution estimates derived from the seed file and schema stats.
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## QSim (brief)
|
|
188
|
+
|
|
189
|
+
Generates **reproducible** question lists from the schema and profiled values. Same seed → same output. Use for regression-style testing or dataset building.
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## When to use it
|
|
194
|
+
|
|
195
|
+
**Good fit:** star/snowflake-style models, clear FKs, repeated BI-style questions, PostgreSQL or Databricks.
|
|
196
|
+
|
|
197
|
+
**Poor fit:** schemas without relationships, heavy procedural logic, or expectations of arbitrary SQL features outside the supported analytical subset.
|