aetherdialect 0.1.0__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aetherdialect-0.1.0/src/aetherdialect.egg-info → aetherdialect-0.1.2}/PKG-INFO +31 -29
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/README.md +27 -26
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/pyproject.toml +15 -4
- {aetherdialect-0.1.0 → aetherdialect-0.1.2/src/aetherdialect.egg-info}/PKG-INFO +31 -29
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/SOURCES.txt +3 -2
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/requires.txt +3 -2
- aetherdialect-0.1.2/src/text2sql/__init__.py +10 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/config.py +352 -104
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/contracts_base.py +1165 -952
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/contracts_core.py +798 -123
- aetherdialect-0.1.2/src/text2sql/core_utils.py +1247 -0
- aetherdialect-0.1.2/src/text2sql/dialect.py +2455 -0
- aetherdialect-0.1.2/src/text2sql/expansion_ops.py +1999 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/intent_expr.py +556 -289
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/intent_process.py +774 -587
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/intent_repair.py +1040 -472
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/intent_resolve.py +196 -107
- aetherdialect-0.1.2/src/text2sql/live_testing.py +1371 -0
- aetherdialect-0.1.2/src/text2sql/main_execution.py +1727 -0
- aetherdialect-0.1.2/src/text2sql/pipeline.py +2607 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/qsim_ops.py +1299 -1286
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/qsim_sample.py +839 -609
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/qsim_struct.py +558 -569
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/schema.py +520 -220
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/schema_profiling.py +1245 -486
- aetherdialect-0.1.2/src/text2sql/simulator.py +1215 -0
- aetherdialect-0.1.2/src/text2sql/sql_gen.py +2533 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/templates.py +457 -230
- aetherdialect-0.1.2/src/text2sql/text2sql.py +1006 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/utils.py +175 -108
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/validation_agg.py +1083 -1033
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/validation_execute.py +195 -142
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/validation_schema.py +1044 -193
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/text2sql/validation_semantic.py +2138 -2122
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_config.py +58 -72
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_contracts.py +123 -115
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_core_utils.py +28 -29
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_dialect.py +254 -91
- aetherdialect-0.1.2/tests/test_expansion_ops.py +1035 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_intent_expr.py +150 -59
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_intent_process.py +365 -172
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_intent_repair.py +324 -74
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_intent_resolve.py +73 -100
- aetherdialect-0.1.2/tests/test_join_bool_cte_matrix.py +383 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_live_testing.py +140 -60
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_main_execution.py +284 -277
- aetherdialect-0.1.2/tests/test_pipeline_session.py +160 -0
- aetherdialect-0.1.2/tests/test_pipeline_targeted.py +136 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_pipeline_units.py +550 -52
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_qsim_ops.py +7 -8
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_qsim_sample.py +3 -6
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_qsim_struct.py +17 -19
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_schema.py +46 -14
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_schema_profiling.py +199 -72
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_simulator.py +175 -10
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_simulator_pipeline.py +286 -248
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_sql_gen.py +533 -59
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_templates.py +214 -161
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_text2sql.py +312 -240
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_utils.py +39 -42
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_validation_agg.py +7 -14
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_validation_execute.py +72 -86
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_validation_schema.py +181 -8
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/tests/test_validation_semantic.py +61 -59
- aetherdialect-0.1.0/src/text2sql/__init__.py +0 -7
- aetherdialect-0.1.0/src/text2sql/core_utils.py +0 -834
- aetherdialect-0.1.0/src/text2sql/dialect.py +0 -1134
- aetherdialect-0.1.0/src/text2sql/expansion_ops.py +0 -1218
- aetherdialect-0.1.0/src/text2sql/expansion_rules.py +0 -496
- aetherdialect-0.1.0/src/text2sql/live_testing.py +0 -1117
- aetherdialect-0.1.0/src/text2sql/main_execution.py +0 -799
- aetherdialect-0.1.0/src/text2sql/pipeline.py +0 -1662
- aetherdialect-0.1.0/src/text2sql/simulator.py +0 -970
- aetherdialect-0.1.0/src/text2sql/sql_gen.py +0 -1537
- aetherdialect-0.1.0/src/text2sql/text2sql.py +0 -726
- aetherdialect-0.1.0/tests/test_expansion_ops.py +0 -585
- aetherdialect-0.1.0/tests/test_expansion_rules.py +0 -434
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/LICENSE +0 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/setup.cfg +0 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/dependency_links.txt +0 -0
- {aetherdialect-0.1.0 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aetherdialect
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Deterministic, validation-first Text-to-SQL system for business databases
|
|
5
5
|
Author-email: Akul Ameya <akul.ameya@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -10,14 +10,15 @@ Description-Content-Type: text/markdown
|
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Requires-Dist: jsonschema<5,>=4.0
|
|
12
12
|
Requires-Dist: openai<3,>=2.0.0
|
|
13
|
-
Requires-Dist: sqlglot<30,>=29.0
|
|
14
13
|
Requires-Dist: platformdirs<5,>=2.0.0
|
|
15
14
|
Requires-Dist: python-dotenv<2,>=1.0.0
|
|
15
|
+
Requires-Dist: SQLAlchemy<3,>=2.0
|
|
16
16
|
Provides-Extra: databricks
|
|
17
|
+
Requires-Dist: sqlglot<30,>=29.0; extra == "databricks"
|
|
17
18
|
Requires-Dist: pyspark<4,>=3.3; extra == "databricks"
|
|
18
19
|
Requires-Dist: databricks-sql-connector<4,>=3.0; extra == "databricks"
|
|
20
|
+
Requires-Dist: databricks-sqlalchemy<3,>=2.0; extra == "databricks"
|
|
19
21
|
Provides-Extra: postgresql
|
|
20
|
-
Requires-Dist: SQLAlchemy<3,>=2.0; extra == "postgresql"
|
|
21
22
|
Requires-Dist: psycopg2-binary<3,>=2.9; extra == "postgresql"
|
|
22
23
|
Requires-Dist: pglast<8,>=5.0; extra == "postgresql"
|
|
23
24
|
Provides-Extra: dev
|
|
@@ -35,6 +36,8 @@ Dynamic: license-file
|
|
|
35
36
|
|
|
36
37
|
# Deterministic, validation-first Text-to-SQL for business databases
|
|
37
38
|
|
|
39
|
+
Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
|
|
40
|
+
|
|
38
41
|
## Installation
|
|
39
42
|
|
|
40
43
|
```bash
|
|
@@ -44,14 +47,15 @@ pip install "text2sql[databricks]"
|
|
|
44
47
|
pip install "text2sql[postgresql,databricks]"
|
|
45
48
|
```
|
|
46
49
|
|
|
47
|
-
Requires Python ≥ 3.10 and an [OpenAI API key](https://platform.openai.com/api-keys).
|
|
50
|
+
Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
|
|
48
51
|
|
|
49
|
-
| Extra | Brings in
|
|
50
|
-
| ------------ |
|
|
51
|
-
|
|
|
52
|
-
| `
|
|
52
|
+
| Extra | Brings in | Use when |
|
|
53
|
+
| ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
|
|
54
|
+
| (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
|
|
55
|
+
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
|
|
56
|
+
| `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
|
|
53
57
|
|
|
54
|
-
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
58
|
+
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
55
59
|
|
|
56
60
|
---
|
|
57
61
|
|
|
@@ -71,7 +75,9 @@ t2s = Text2SQL(
|
|
|
71
75
|
t2s.run_interactive()
|
|
72
76
|
```
|
|
73
77
|
|
|
74
|
-
Constructor options,
|
|
78
|
+
Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
|
|
79
|
+
|
|
80
|
+
**Interactive two ways:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: one natural-language question is a **turn** that may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
|
|
75
81
|
|
|
76
82
|
---
|
|
77
83
|
|
|
@@ -91,7 +97,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
91
97
|
- **Determinism over creativity** — prefer a correct, boring plan to a novel one.
|
|
92
98
|
- **Correct joins over clever SQL** — join paths come from **foreign keys** and precomputed paths, not free-form guessing.
|
|
93
99
|
- **Validate before execute** — schema checks, intent consistency, join shape, and dialect-safe **read-only** `SELECT` rules.
|
|
94
|
-
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed,
|
|
100
|
+
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed, repair SQL; everything else is rules and stores.
|
|
95
101
|
- **Safe defaults for non-analysts** — narrow allowed SQL; you still choose **database credentials** (read-only vs write-capable is outside this library).
|
|
96
102
|
|
|
97
103
|
---
|
|
@@ -100,35 +106,35 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
100
106
|
|
|
101
107
|
**Backends**
|
|
102
108
|
|
|
103
|
-
- PostgreSQL
|
|
104
|
-
- Databricks
|
|
109
|
+
- PostgreSQL
|
|
110
|
+
- Databricks
|
|
105
111
|
|
|
106
112
|
**Schema**
|
|
107
113
|
|
|
108
114
|
- Load from **live introspection** (primary).
|
|
109
115
|
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
110
116
|
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
111
|
-
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling.
|
|
112
|
-
- Optional **human notes** (plain text)
|
|
117
|
+
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
|
|
118
|
+
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
|
|
113
119
|
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
114
120
|
|
|
115
121
|
**Intent / SQL shape (analytical subset)**
|
|
116
122
|
|
|
117
123
|
- **Queries:** `SELECT` only (enforced with pattern checks and dialect parsing). **CTEs** reuse the same intent model as the outer query.
|
|
118
|
-
- **Joins:**
|
|
124
|
+
- **Joins:** related tables are wired using the schema’s relationships; when more than one valid path could link the same tables, one coherent path is chosen for the whole query, and join style follows table roles (e.g. `LEFT` toward dimensions where that fits). Self-joins use **CTEs** instead of repeating the same base table in one `FROM` chain.
|
|
119
125
|
- **Select list:** bare columns, **aggregates** (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, etc.), **arithmetic and string expressions** where the schema allows them, **`DISTINCT`**, and **scalar functions** subject to column metadata.
|
|
120
126
|
- **Filters / boolean logic:** comparisons, **`AND` / `OR`**, **`IN`**, **`LIKE`**; **`ILIKE` / `NOT ILIKE` on PostgreSQL only** (intent stays dialect-agnostic; SQL rendering differs). Null / boolean value normalization in the repair chain.
|
|
121
127
|
- **`BETWEEN`** in intent is **decomposed** into a pair of comparable predicates.
|
|
122
128
|
- **Grouping / ordering:** **`GROUP BY`**, **`HAVING`** (aggregate-aware), **`ORDER BY`**, **`LIMIT`**; rules tie **grain** (row-level vs grouped) to aggregates and grouped columns.
|
|
123
129
|
- **Dates:** structured **`date_window`** (anchor unit + offset) and **date-difference** filters between columns where supported.
|
|
124
|
-
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM`
|
|
130
|
+
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM`/`AVG`, `LAG`, `LEAD`, `FIRST_VALUE`, and `LAST_VALUE` on select columns (main query and CTEs).
|
|
125
131
|
- **`CASE` / `WHEN`:** only in the **select list** in the intent model (not in `WHERE` / `HAVING`).
|
|
126
132
|
- **Arrays / lists:** membership-style filters; SQL uses dialect-appropriate forms; optional **UNNEST / EXPLODE-style** expansion in CTE select lists for typed array columns.
|
|
127
133
|
- **Metadata:** **UNIQUE** (and related) when reflection or DDL exposes it, for ranking “human readable” identifiers.
|
|
128
134
|
|
|
129
135
|
**Operational modes**
|
|
130
136
|
|
|
131
|
-
- **Interactive** — ask questions, accept/reject, results export.
|
|
137
|
+
- **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
|
|
132
138
|
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
133
139
|
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
134
140
|
|
|
@@ -137,7 +143,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
137
143
|
## What it is not
|
|
138
144
|
|
|
139
145
|
- Not a **full SQL** or **stored-procedure** generator: no `UNION`/`INTERSECT`/`EXCEPT`, no correlated subqueries, no `EXISTS`, no `LATERAL`, no **DML/DDL**, no arbitrary **RIGHT/FULL OUTER** join policy in the constrained path.
|
|
140
|
-
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT`
|
|
146
|
+
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT` and `EXPLAIN`).
|
|
141
147
|
- Not **schema-agnostic**: quality depends on **FKs**, sensible types, and optional notes for domain language.
|
|
142
148
|
|
|
143
149
|
---
|
|
@@ -145,31 +151,27 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
145
151
|
## How a question becomes SQL
|
|
146
152
|
|
|
147
153
|
1. **Template match** — if a trusted pattern fits, reuse parameterized SQL (often **no** SQL LLM call).
|
|
148
|
-
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain
|
|
154
|
+
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain**.
|
|
149
155
|
3. **Join resolution** — choose among valid **FK paths** for the intent’s tables; disambiguation may use the LLM when multiple paths tie.
|
|
150
|
-
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and
|
|
156
|
+
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and **`EXPLAIN`**.
|
|
151
157
|
5. **Execute** (where the mode allows) and **learn** — accept → promote template trust; reject → record negative pattern.
|
|
152
158
|
|
|
153
159
|
---
|
|
154
160
|
|
|
155
161
|
## Validation (layers)
|
|
156
162
|
|
|
157
|
-
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (
|
|
163
|
+
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (**`pglast`** or **`sqlglot`**). **`EXPLAIN`** is used as an extra executability check.
|
|
158
164
|
- **Schema vs intent** — tables/columns/CTEs, **selectability**, access and sensitivity policy, window / CASE / array shapes, filter and HAVING ops per column, aggregate roles in select/HAVING/ORDER BY, scalar function typing, filter value types vs column types, null/date-window/date-diff rules.
|
|
159
|
-
- **Semantic consistency** — grouped queries require proper aggregation; contradictions and impossible HAVING; **grain** alignment
|
|
165
|
+
- **Semantic consistency** — grouped queries require proper aggregation; contradictions and impossible HAVING; **grain** alignment.
|
|
160
166
|
- **Joins** — paths must match the FK graph; guarded path avoids ad-hoc join guessing.
|
|
161
167
|
|
|
162
|
-
## Deterministic repairs (after intent parse)
|
|
163
|
-
|
|
164
|
-
Applied in order (high level): `COUNT(*)` normalization; CTE naming and output aliases; qualify CTE outputs; sanitize table names; grain rules for grouped CTE usage and **grain consistency**; strip redundant `GROUP BY`; normalize filters/HAVING; null-equality fixes; strip join-condition leakage into filters; per-CTE sort order; simplify expressions; `IN` value normalization; date-diff classification and raw-value fixes; **`BETWEEN` → paired predicates**; auto-repair filters/HAVING; strip impossible HAVING; FK filter type repair; filter value case / enum alignment; boolean and null filter values; expand FK selects to descriptive columns; deduplicate contradictory filters; redundant PK re-qualification; **window**, **CASE**, and **array** intent repairs; sensitivity and access policy enforcement on the intent.
|
|
165
|
-
|
|
166
168
|
---
|
|
167
169
|
|
|
168
170
|
## Learning and reuse
|
|
169
171
|
|
|
170
172
|
- **Accepted templates** — intent fingerprint, parameterized SQL, optional example question, **trust** that rises with validation and falls with rejection.
|
|
171
173
|
- **Rejected templates** — categorized failures so similar bad intents are discouraged.
|
|
172
|
-
- Persistence is under a **per-connection artifact directory** (see USAGE); you can back it up or reset it by removing that directory.
|
|
174
|
+
- Persistence is under a **per-connection artifact directory** (see **[USAGE.md](USAGE.md)**); you can back it up or reset it by removing that directory.
|
|
173
175
|
|
|
174
176
|
---
|
|
175
177
|
|
|
@@ -180,7 +182,7 @@ Applied in order (high level): `COUNT(*)` normalization; CTE naming and output a
|
|
|
180
182
|
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
181
183
|
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
182
184
|
|
|
183
|
-
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader,
|
|
185
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** per-phase upper-bound lines to stdout, and returns **`None`**.
|
|
184
186
|
|
|
185
187
|
---
|
|
186
188
|
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Deterministic, validation-first Text-to-SQL for business databases
|
|
2
2
|
|
|
3
|
+
Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
|
|
4
|
+
|
|
3
5
|
## Installation
|
|
4
6
|
|
|
5
7
|
```bash
|
|
@@ -9,14 +11,15 @@ pip install "text2sql[databricks]"
|
|
|
9
11
|
pip install "text2sql[postgresql,databricks]"
|
|
10
12
|
```
|
|
11
13
|
|
|
12
|
-
Requires Python ≥ 3.10 and an [OpenAI API key](https://platform.openai.com/api-keys).
|
|
14
|
+
Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
|
|
13
15
|
|
|
14
|
-
| Extra | Brings in
|
|
15
|
-
| ------------ |
|
|
16
|
-
|
|
|
17
|
-
| `
|
|
16
|
+
| Extra | Brings in | Use when |
|
|
17
|
+
| ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
|
|
18
|
+
| (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
|
|
19
|
+
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
|
|
20
|
+
| `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
|
|
18
21
|
|
|
19
|
-
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
22
|
+
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
20
23
|
|
|
21
24
|
---
|
|
22
25
|
|
|
@@ -36,7 +39,9 @@ t2s = Text2SQL(
|
|
|
36
39
|
t2s.run_interactive()
|
|
37
40
|
```
|
|
38
41
|
|
|
39
|
-
Constructor options,
|
|
42
|
+
Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
|
|
43
|
+
|
|
44
|
+
**Interactive two ways:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: one natural-language question is a **turn** that may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
|
|
40
45
|
|
|
41
46
|
---
|
|
42
47
|
|
|
@@ -56,7 +61,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
56
61
|
- **Determinism over creativity** — prefer a correct, boring plan to a novel one.
|
|
57
62
|
- **Correct joins over clever SQL** — join paths come from **foreign keys** and precomputed paths, not free-form guessing.
|
|
58
63
|
- **Validate before execute** — schema checks, intent consistency, join shape, and dialect-safe **read-only** `SELECT` rules.
|
|
59
|
-
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed,
|
|
64
|
+
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed, repair SQL; everything else is rules and stores.
|
|
60
65
|
- **Safe defaults for non-analysts** — narrow allowed SQL; you still choose **database credentials** (read-only vs write-capable is outside this library).
|
|
61
66
|
|
|
62
67
|
---
|
|
@@ -65,35 +70,35 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
65
70
|
|
|
66
71
|
**Backends**
|
|
67
72
|
|
|
68
|
-
- PostgreSQL
|
|
69
|
-
- Databricks
|
|
73
|
+
- PostgreSQL
|
|
74
|
+
- Databricks
|
|
70
75
|
|
|
71
76
|
**Schema**
|
|
72
77
|
|
|
73
78
|
- Load from **live introspection** (primary).
|
|
74
79
|
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
75
80
|
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
76
|
-
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling.
|
|
77
|
-
- Optional **human notes** (plain text)
|
|
81
|
+
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
|
|
82
|
+
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
|
|
78
83
|
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
79
84
|
|
|
80
85
|
**Intent / SQL shape (analytical subset)**
|
|
81
86
|
|
|
82
87
|
- **Queries:** `SELECT` only (enforced with pattern checks and dialect parsing). **CTEs** reuse the same intent model as the outer query.
|
|
83
|
-
- **Joins:**
|
|
88
|
+
- **Joins:** related tables are wired using the schema’s relationships; when more than one valid path could link the same tables, one coherent path is chosen for the whole query, and join style follows table roles (e.g. `LEFT` toward dimensions where that fits). Self-joins use **CTEs** instead of repeating the same base table in one `FROM` chain.
|
|
84
89
|
- **Select list:** bare columns, **aggregates** (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, etc.), **arithmetic and string expressions** where the schema allows them, **`DISTINCT`**, and **scalar functions** subject to column metadata.
|
|
85
90
|
- **Filters / boolean logic:** comparisons, **`AND` / `OR`**, **`IN`**, **`LIKE`**; **`ILIKE` / `NOT ILIKE` on PostgreSQL only** (intent stays dialect-agnostic; SQL rendering differs). Null / boolean value normalization in the repair chain.
|
|
86
91
|
- **`BETWEEN`** in intent is **decomposed** into a pair of comparable predicates.
|
|
87
92
|
- **Grouping / ordering:** **`GROUP BY`**, **`HAVING`** (aggregate-aware), **`ORDER BY`**, **`LIMIT`**; rules tie **grain** (row-level vs grouped) to aggregates and grouped columns.
|
|
88
93
|
- **Dates:** structured **`date_window`** (anchor unit + offset) and **date-difference** filters between columns where supported.
|
|
89
|
-
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM`
|
|
94
|
+
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM`/`AVG`, `LAG`, `LEAD`, `FIRST_VALUE`, and `LAST_VALUE` on select columns (main query and CTEs).
|
|
90
95
|
- **`CASE` / `WHEN`:** only in the **select list** in the intent model (not in `WHERE` / `HAVING`).
|
|
91
96
|
- **Arrays / lists:** membership-style filters; SQL uses dialect-appropriate forms; optional **UNNEST / EXPLODE-style** expansion in CTE select lists for typed array columns.
|
|
92
97
|
- **Metadata:** **UNIQUE** (and related) when reflection or DDL exposes it, for ranking “human readable” identifiers.
|
|
93
98
|
|
|
94
99
|
**Operational modes**
|
|
95
100
|
|
|
96
|
-
- **Interactive** — ask questions, accept/reject, results export.
|
|
101
|
+
- **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
|
|
97
102
|
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
98
103
|
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
99
104
|
|
|
@@ -102,7 +107,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
102
107
|
## What it is not
|
|
103
108
|
|
|
104
109
|
- Not a **full SQL** or **stored-procedure** generator: no `UNION`/`INTERSECT`/`EXCEPT`, no correlated subqueries, no `EXISTS`, no `LATERAL`, no **DML/DDL**, no arbitrary **RIGHT/FULL OUTER** join policy in the constrained path.
|
|
105
|
-
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT`
|
|
110
|
+
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT` and `EXPLAIN`).
|
|
106
111
|
- Not **schema-agnostic**: quality depends on **FKs**, sensible types, and optional notes for domain language.
|
|
107
112
|
|
|
108
113
|
---
|
|
@@ -110,31 +115,27 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
110
115
|
## How a question becomes SQL
|
|
111
116
|
|
|
112
117
|
1. **Template match** — if a trusted pattern fits, reuse parameterized SQL (often **no** SQL LLM call).
|
|
113
|
-
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain
|
|
118
|
+
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain**.
|
|
114
119
|
3. **Join resolution** — choose among valid **FK paths** for the intent’s tables; disambiguation may use the LLM when multiple paths tie.
|
|
115
|
-
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and
|
|
120
|
+
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and **`EXPLAIN`**.
|
|
116
121
|
5. **Execute** (where the mode allows) and **learn** — accept → promote template trust; reject → record negative pattern.
|
|
117
122
|
|
|
118
123
|
---
|
|
119
124
|
|
|
120
125
|
## Validation (layers)
|
|
121
126
|
|
|
122
|
-
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (
|
|
127
|
+
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (**`pglast`** or **`sqlglot`**). **`EXPLAIN`** is used as an extra executability check.
|
|
123
128
|
- **Schema vs intent** — tables/columns/CTEs, **selectability**, access and sensitivity policy, window / CASE / array shapes, filter and HAVING ops per column, aggregate roles in select/HAVING/ORDER BY, scalar function typing, filter value types vs column types, null/date-window/date-diff rules.
|
|
124
|
-
- **Semantic consistency** — grouped queries require proper aggregation; contradictions and impossible HAVING; **grain** alignment
|
|
129
|
+
- **Semantic consistency** — grouped queries require proper aggregation; contradictions and impossible HAVING; **grain** alignment.
|
|
125
130
|
- **Joins** — paths must match the FK graph; guarded path avoids ad-hoc join guessing.
|
|
126
131
|
|
|
127
|
-
## Deterministic repairs (after intent parse)
|
|
128
|
-
|
|
129
|
-
Applied in order (high level): `COUNT(*)` normalization; CTE naming and output aliases; qualify CTE outputs; sanitize table names; grain rules for grouped CTE usage and **grain consistency**; strip redundant `GROUP BY`; normalize filters/HAVING; null-equality fixes; strip join-condition leakage into filters; per-CTE sort order; simplify expressions; `IN` value normalization; date-diff classification and raw-value fixes; **`BETWEEN` → paired predicates**; auto-repair filters/HAVING; strip impossible HAVING; FK filter type repair; filter value case / enum alignment; boolean and null filter values; expand FK selects to descriptive columns; deduplicate contradictory filters; redundant PK re-qualification; **window**, **CASE**, and **array** intent repairs; sensitivity and access policy enforcement on the intent.
|
|
130
|
-
|
|
131
132
|
---
|
|
132
133
|
|
|
133
134
|
## Learning and reuse
|
|
134
135
|
|
|
135
136
|
- **Accepted templates** — intent fingerprint, parameterized SQL, optional example question, **trust** that rises with validation and falls with rejection.
|
|
136
137
|
- **Rejected templates** — categorized failures so similar bad intents are discouraged.
|
|
137
|
-
- Persistence is under a **per-connection artifact directory** (see USAGE); you can back it up or reset it by removing that directory.
|
|
138
|
+
- Persistence is under a **per-connection artifact directory** (see **[USAGE.md](USAGE.md)**); you can back it up or reset it by removing that directory.
|
|
138
139
|
|
|
139
140
|
---
|
|
140
141
|
|
|
@@ -145,7 +146,7 @@ Applied in order (high level): `COUNT(*)` normalization; CTE naming and output a
|
|
|
145
146
|
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
146
147
|
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
147
148
|
|
|
148
|
-
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader,
|
|
149
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** upper-bound estimates for each phase to stdout, and returns **`None`**.
|
|
149
150
|
|
|
150
151
|
---
|
|
151
152
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "aetherdialect"
|
|
7
|
-
version = "0.1.
|
|
7
|
+
version = "0.1.2"
|
|
8
8
|
description = "Deterministic, validation-first Text-to-SQL system for business databases"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -13,15 +13,19 @@ authors = [{name = "Akul Ameya", email = "akul.ameya@gmail.com"}]
|
|
|
13
13
|
dependencies = [
|
|
14
14
|
"jsonschema>=4.0,<5",
|
|
15
15
|
"openai>=2.0.0,<3",
|
|
16
|
-
"sqlglot>=29.0,<30",
|
|
17
16
|
"platformdirs>=2.0.0,<5",
|
|
18
17
|
"python-dotenv>=1.0.0,<2",
|
|
18
|
+
"SQLAlchemy>=2.0,<3",
|
|
19
19
|
]
|
|
20
20
|
|
|
21
21
|
[project.optional-dependencies]
|
|
22
|
-
databricks = [
|
|
22
|
+
databricks = [
|
|
23
|
+
"sqlglot>=29.0,<30",
|
|
24
|
+
"pyspark>=3.3,<4",
|
|
25
|
+
"databricks-sql-connector>=3.0,<4",
|
|
26
|
+
"databricks-sqlalchemy>=2.0,<3",
|
|
27
|
+
]
|
|
23
28
|
postgresql = [
|
|
24
|
-
"SQLAlchemy>=2.0,<3",
|
|
25
29
|
"psycopg2-binary>=2.9,<3",
|
|
26
30
|
"pglast>=5.0,<8",
|
|
27
31
|
]
|
|
@@ -62,12 +66,19 @@ quote-style = "double"
|
|
|
62
66
|
indent-style = "space"
|
|
63
67
|
line-ending = "auto"
|
|
64
68
|
|
|
69
|
+
[tool.docformatter]
|
|
70
|
+
wrap-summaries = 72
|
|
71
|
+
wrap-descriptions = 72
|
|
72
|
+
style = "google"
|
|
73
|
+
force-wrap = true
|
|
74
|
+
|
|
65
75
|
[tool.mypy]
|
|
66
76
|
python_version = "3.10"
|
|
67
77
|
strict = true
|
|
68
78
|
|
|
69
79
|
[tool.pytest.ini_options]
|
|
70
80
|
testpaths = ["tests", "live_tests"]
|
|
81
|
+
pythonpath = ["src"]
|
|
71
82
|
markers = [
|
|
72
83
|
"live: marks tests that require a live LLM and database connection (deselect with '-m \"not live\"')",
|
|
73
84
|
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aetherdialect
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Deterministic, validation-first Text-to-SQL system for business databases
|
|
5
5
|
Author-email: Akul Ameya <akul.ameya@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -10,14 +10,15 @@ Description-Content-Type: text/markdown
|
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Requires-Dist: jsonschema<5,>=4.0
|
|
12
12
|
Requires-Dist: openai<3,>=2.0.0
|
|
13
|
-
Requires-Dist: sqlglot<30,>=29.0
|
|
14
13
|
Requires-Dist: platformdirs<5,>=2.0.0
|
|
15
14
|
Requires-Dist: python-dotenv<2,>=1.0.0
|
|
15
|
+
Requires-Dist: SQLAlchemy<3,>=2.0
|
|
16
16
|
Provides-Extra: databricks
|
|
17
|
+
Requires-Dist: sqlglot<30,>=29.0; extra == "databricks"
|
|
17
18
|
Requires-Dist: pyspark<4,>=3.3; extra == "databricks"
|
|
18
19
|
Requires-Dist: databricks-sql-connector<4,>=3.0; extra == "databricks"
|
|
20
|
+
Requires-Dist: databricks-sqlalchemy<3,>=2.0; extra == "databricks"
|
|
19
21
|
Provides-Extra: postgresql
|
|
20
|
-
Requires-Dist: SQLAlchemy<3,>=2.0; extra == "postgresql"
|
|
21
22
|
Requires-Dist: psycopg2-binary<3,>=2.9; extra == "postgresql"
|
|
22
23
|
Requires-Dist: pglast<8,>=5.0; extra == "postgresql"
|
|
23
24
|
Provides-Extra: dev
|
|
@@ -35,6 +36,8 @@ Dynamic: license-file
|
|
|
35
36
|
|
|
36
37
|
# Deterministic, validation-first Text-to-SQL for business databases
|
|
37
38
|
|
|
39
|
+
Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
|
|
40
|
+
|
|
38
41
|
## Installation
|
|
39
42
|
|
|
40
43
|
```bash
|
|
@@ -44,14 +47,15 @@ pip install "text2sql[databricks]"
|
|
|
44
47
|
pip install "text2sql[postgresql,databricks]"
|
|
45
48
|
```
|
|
46
49
|
|
|
47
|
-
Requires Python ≥ 3.10 and an [OpenAI API key](https://platform.openai.com/api-keys).
|
|
50
|
+
Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
|
|
48
51
|
|
|
49
|
-
| Extra | Brings in
|
|
50
|
-
| ------------ |
|
|
51
|
-
|
|
|
52
|
-
| `
|
|
52
|
+
| Extra | Brings in | Use when |
|
|
53
|
+
| ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
|
|
54
|
+
| (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
|
|
55
|
+
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
|
|
56
|
+
| `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
|
|
53
57
|
|
|
54
|
-
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
58
|
+
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
55
59
|
|
|
56
60
|
---
|
|
57
61
|
|
|
@@ -71,7 +75,9 @@ t2s = Text2SQL(
|
|
|
71
75
|
t2s.run_interactive()
|
|
72
76
|
```
|
|
73
77
|
|
|
74
|
-
Constructor options,
|
|
78
|
+
Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
|
|
79
|
+
|
|
80
|
+
**Two ways to run interactively:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: each natural-language question is one **turn**, which may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
|
|
75
81
|
|
|
76
82
|
---
|
|
77
83
|
|
|
@@ -91,7 +97,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
91
97
|
- **Determinism over creativity** — prefer a correct, boring plan to a novel one.
|
|
92
98
|
- **Correct joins over clever SQL** — join paths come from **foreign keys** and precomputed paths, not free-form guessing.
|
|
93
99
|
- **Validate before execute** — schema checks, intent consistency, join shape, and dialect-safe **read-only** `SELECT` rules.
|
|
94
|
-
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed,
|
|
100
|
+
- **Minimal LLM surface** — parse intent, resolve ambiguous joins when needed, and repair SQL; everything else is rules and stores.
|
|
95
101
|
- **Safe defaults for non-analysts** — narrow allowed SQL; you still choose **database credentials** (read-only vs write-capable is outside this library).
|
|
96
102
|
|
|
97
103
|
---
|
|
@@ -100,35 +106,35 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
100
106
|
|
|
101
107
|
**Backends**
|
|
102
108
|
|
|
103
|
-
- PostgreSQL
|
|
104
|
-
- Databricks
|
|
109
|
+
- PostgreSQL
|
|
110
|
+
- Databricks
|
|
105
111
|
|
|
106
112
|
**Schema**
|
|
107
113
|
|
|
108
114
|
- Load from **live introspection** (primary).
|
|
109
115
|
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
110
116
|
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
111
|
-
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling.
|
|
112
|
-
- Optional **human notes** (plain text)
|
|
117
|
+
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
|
|
118
|
+
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
|
|
113
119
|
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
114
120
|
|
|
115
121
|
**Intent / SQL shape (analytical subset)**
|
|
116
122
|
|
|
117
123
|
- **Queries:** `SELECT` only (enforced with pattern checks and dialect parsing). **CTEs** reuse the same intent model as the outer query.
|
|
118
|
-
- **Joins:**
|
|
124
|
+
- **Joins:** related tables are wired using the schema’s relationships; when more than one valid path could link the same tables, one coherent path is chosen for the whole query, and join style follows table roles (e.g. `LEFT` toward dimensions where that fits). Self-joins use **CTEs** instead of repeating the same base table in one `FROM` chain.
|
|
119
125
|
- **Select list:** bare columns, **aggregates** (`COUNT`, `SUM`, `AVG`, `MIN`, `MAX`, etc.), **arithmetic and string expressions** where the schema allows them, **`DISTINCT`**, and **scalar functions** subject to column metadata.
|
|
120
126
|
- **Filters / boolean logic:** comparisons, **`AND` / `OR`**, **`IN`**, **`LIKE`**; **`ILIKE` / `NOT ILIKE` on PostgreSQL only** (intent stays dialect-agnostic; SQL rendering differs). Null / boolean value normalization in the repair chain.
|
|
121
127
|
- **`BETWEEN`** in intent is **decomposed** into a pair of comparable predicates.
|
|
122
128
|
- **Grouping / ordering:** **`GROUP BY`**, **`HAVING`** (aggregate-aware), **`ORDER BY`**, **`LIMIT`**; rules tie **grain** (row-level vs grouped) to aggregates and grouped columns.
|
|
123
129
|
- **Dates:** structured **`date_window`** (anchor unit + offset) and **date-difference** filters between columns where supported.
|
|
124
|
-
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM`
|
|
130
|
+
- **Windows:** `ROW_NUMBER`, `RANK`, `DENSE_RANK`, and windowed `SUM`/`AVG`, `LAG`, `LEAD`, `FIRST_VALUE`, and `LAST_VALUE` on select columns (main query and CTEs).
|
|
125
131
|
- **`CASE` / `WHEN`:** only in the **select list** in the intent model (not in `WHERE` / `HAVING`).
|
|
126
132
|
- **Arrays / lists:** membership-style filters; SQL uses dialect-appropriate forms; optional **UNNEST / EXPLODE-style** expansion in CTE select lists for typed array columns.
|
|
127
133
|
- **Metadata:** **UNIQUE** (and related) when reflection or DDL exposes it, for ranking “human readable” identifiers.
|
|
128
134
|
|
|
129
135
|
**Operational modes**
|
|
130
136
|
|
|
131
|
-
- **Interactive** — ask questions, accept/reject, results export.
|
|
137
|
+
- **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
|
|
132
138
|
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
133
139
|
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
134
140
|
|
|
@@ -137,7 +143,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
137
143
|
## What it is not
|
|
138
144
|
|
|
139
145
|
- Not a **full SQL** or **stored-procedure** generator: no `UNION`/`INTERSECT`/`EXCEPT`, no correlated subqueries, no `EXISTS`, no `LATERAL`, no **DML/DDL**, no arbitrary **RIGHT/FULL OUTER** join policy in the constrained path.
|
|
140
|
-
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT`
|
|
146
|
+
- Not a substitute for **database security**: use credentials with **least privilege** (`SELECT` and `EXPLAIN`).
|
|
141
147
|
- Not **schema-agnostic**: quality depends on **FKs**, sensible types, and optional notes for domain language.
|
|
142
148
|
|
|
143
149
|
---
|
|
@@ -145,31 +151,27 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
145
151
|
## How a question becomes SQL
|
|
146
152
|
|
|
147
153
|
1. **Template match** — if a trusted pattern fits, reuse parameterized SQL (often **no** SQL LLM call).
|
|
148
|
-
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain
|
|
154
|
+
2. **Intent parse** — structured intent from the question + schema summary, then a long **deterministic repair chain**.
|
|
149
155
|
3. **Join resolution** — choose among valid **FK paths** for the intent’s tables; disambiguation may use the LLM when multiple paths tie.
|
|
150
|
-
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and
|
|
156
|
+
4. **SQL generation & validation** — deterministic skeleton, injected joins, LLM fill/repair **under constraints**, then **semantic validation**, **`SELECT`-only / forbidden-pattern checks**, **dialect AST** validation (**`pglast`** or **`sqlglot`**), and **`EXPLAIN`**.
|
|
151
157
|
5. **Execute** (where the mode allows) and **learn** — accept → promote template trust; reject → record negative pattern.
|
|
152
158
|
|
|
153
159
|
---
|
|
154
160
|
|
|
155
161
|
## Validation (layers)
|
|
156
162
|
|
|
157
|
-
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (
|
|
163
|
+
- **Safety / shape** — `SELECT`-only enforcement, configurable **forbidden SQL** substrings, then **dialect `ast_validate`** (**`pglast`** or **`sqlglot`**). **`EXPLAIN`** is used as an extra executability check.
|
|
158
164
|
- **Schema vs intent** — tables/columns/CTEs, **selectability**, access and sensitivity policy, window / CASE / array shapes, filter and HAVING ops per column, aggregate roles in select/HAVING/ORDER BY, scalar function typing, filter value types vs column types, null/date-window/date-diff rules.
|
|
159
|
-
- **Semantic consistency** — grouped queries require proper aggregation; contradictions and impossible HAVING; **grain** alignment
|
|
165
|
+
- **Semantic consistency** — grouped queries require proper aggregation; contradictions and impossible HAVING; **grain** alignment.
|
|
160
166
|
- **Joins** — paths must match the FK graph; guarded path avoids ad-hoc join guessing.
|
|
161
167
|
|
|
162
|
-
## Deterministic repairs (after intent parse)
|
|
163
|
-
|
|
164
|
-
Applied in order (high level): `COUNT(*)` normalization; CTE naming and output aliases; qualify CTE outputs; sanitize table names; grain rules for grouped CTE usage and **grain consistency**; strip redundant `GROUP BY`; normalize filters/HAVING; null-equality fixes; strip join-condition leakage into filters; per-CTE sort order; simplify expressions; `IN` value normalization; date-diff classification and raw-value fixes; **`BETWEEN` → paired predicates**; auto-repair filters/HAVING; strip impossible HAVING; FK filter type repair; filter value case / enum alignment; boolean and null filter values; expand FK selects to descriptive columns; deduplicate contradictory filters; redundant PK re-qualification; **window**, **CASE**, and **array** intent repairs; sensitivity and access policy enforcement on the intent.
|
|
165
|
-
|
|
166
168
|
---
|
|
167
169
|
|
|
168
170
|
## Learning and reuse
|
|
169
171
|
|
|
170
172
|
- **Accepted templates** — intent fingerprint, parameterized SQL, optional example question, **trust** that rises with validation and falls with rejection.
|
|
171
173
|
- **Rejected templates** — categorized failures so similar bad intents are discouraged.
|
|
172
|
-
- Persistence is under a **per-connection artifact directory** (see USAGE); you can back it up or reset it by removing that directory.
|
|
174
|
+
- Persistence is under a **per-connection artifact directory** (see **[USAGE.md](USAGE.md)**); you can back it up or reset it by removing that directory.
|
|
173
175
|
|
|
174
176
|
---
|
|
175
177
|
|
|
@@ -180,7 +182,7 @@ Applied in order (high level): `COUNT(*)` normalization; CTE naming and output a
|
|
|
180
182
|
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
181
183
|
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
182
184
|
|
|
183
|
-
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader,
|
|
185
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** upper-bound estimates for each phase to stdout, and returns **`None`**.
|
|
184
186
|
|
|
185
187
|
---
|
|
186
188
|
|
|
@@ -13,7 +13,6 @@ src/text2sql/contracts_core.py
|
|
|
13
13
|
src/text2sql/core_utils.py
|
|
14
14
|
src/text2sql/dialect.py
|
|
15
15
|
src/text2sql/expansion_ops.py
|
|
16
|
-
src/text2sql/expansion_rules.py
|
|
17
16
|
src/text2sql/intent_expr.py
|
|
18
17
|
src/text2sql/intent_process.py
|
|
19
18
|
src/text2sql/intent_repair.py
|
|
@@ -40,13 +39,15 @@ tests/test_contracts.py
|
|
|
40
39
|
tests/test_core_utils.py
|
|
41
40
|
tests/test_dialect.py
|
|
42
41
|
tests/test_expansion_ops.py
|
|
43
|
-
tests/test_expansion_rules.py
|
|
44
42
|
tests/test_intent_expr.py
|
|
45
43
|
tests/test_intent_process.py
|
|
46
44
|
tests/test_intent_repair.py
|
|
47
45
|
tests/test_intent_resolve.py
|
|
46
|
+
tests/test_join_bool_cte_matrix.py
|
|
48
47
|
tests/test_live_testing.py
|
|
49
48
|
tests/test_main_execution.py
|
|
49
|
+
tests/test_pipeline_session.py
|
|
50
|
+
tests/test_pipeline_targeted.py
|
|
50
51
|
tests/test_pipeline_units.py
|
|
51
52
|
tests/test_qsim_ops.py
|
|
52
53
|
tests/test_qsim_sample.py
|
|
@@ -1,12 +1,14 @@
|
|
|
1
1
|
jsonschema<5,>=4.0
|
|
2
2
|
openai<3,>=2.0.0
|
|
3
|
-
sqlglot<30,>=29.0
|
|
4
3
|
platformdirs<5,>=2.0.0
|
|
5
4
|
python-dotenv<2,>=1.0.0
|
|
5
|
+
SQLAlchemy<3,>=2.0
|
|
6
6
|
|
|
7
7
|
[databricks]
|
|
8
|
+
sqlglot<30,>=29.0
|
|
8
9
|
pyspark<4,>=3.3
|
|
9
10
|
databricks-sql-connector<4,>=3.0
|
|
11
|
+
databricks-sqlalchemy<3,>=2.0
|
|
10
12
|
|
|
11
13
|
[dev]
|
|
12
14
|
pytest>=8.0
|
|
@@ -21,6 +23,5 @@ black<25,>=24
|
|
|
21
23
|
docformatter<2,>=1.7
|
|
22
24
|
|
|
23
25
|
[postgresql]
|
|
24
|
-
SQLAlchemy<3,>=2.0
|
|
25
26
|
psycopg2-binary<3,>=2.9
|
|
26
27
|
pglast<8,>=5.0
|