aetherdialect 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {aetherdialect-0.1.1/src/aetherdialect.egg-info → aetherdialect-0.1.2}/PKG-INFO +14 -10
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/README.md +13 -9
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/pyproject.toml +1 -1
- {aetherdialect-0.1.1 → aetherdialect-0.1.2/src/aetherdialect.egg-info}/PKG-INFO +14 -10
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/SOURCES.txt +1 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/__init__.py +10 -10
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/config.py +110 -18
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/contracts_base.py +93 -60
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/contracts_core.py +238 -118
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/core_utils.py +380 -69
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/dialect.py +309 -161
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/expansion_ops.py +609 -203
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/intent_expr.py +173 -103
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/intent_process.py +185 -66
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/intent_repair.py +208 -96
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/intent_resolve.py +76 -66
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/live_testing.py +169 -47
- aetherdialect-0.1.2/src/text2sql/main_execution.py +1727 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/pipeline.py +632 -70
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/qsim_ops.py +73 -35
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/qsim_sample.py +84 -38
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/qsim_struct.py +69 -36
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/schema.py +134 -81
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/schema_profiling.py +826 -121
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/simulator.py +418 -211
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/sql_gen.py +340 -95
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/templates.py +293 -52
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/text2sql.py +247 -97
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/utils.py +93 -35
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/validation_agg.py +88 -30
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/validation_execute.py +49 -18
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/validation_schema.py +405 -136
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/validation_semantic.py +105 -76
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_config.py +7 -7
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_contracts.py +2 -2
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_core_utils.py +1 -1
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_dialect.py +109 -0
- aetherdialect-0.1.2/tests/test_expansion_ops.py +1035 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_intent_expr.py +12 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_intent_repair.py +67 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_live_testing.py +38 -18
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_main_execution.py +19 -10
- aetherdialect-0.1.2/tests/test_pipeline_session.py +160 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_pipeline_units.py +33 -16
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_qsim_struct.py +15 -15
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_schema_profiling.py +90 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_simulator.py +147 -5
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_sql_gen.py +45 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_templates.py +23 -18
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_text2sql.py +76 -1
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_validation_schema.py +57 -2
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_validation_semantic.py +31 -0
- aetherdialect-0.1.1/src/text2sql/main_execution.py +0 -786
- aetherdialect-0.1.1/tests/test_expansion_ops.py +0 -599
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/LICENSE +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/setup.cfg +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/dependency_links.txt +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/requires.txt +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/top_level.txt +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_intent_process.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_intent_resolve.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_join_bool_cte_matrix.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_pipeline_targeted.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_qsim_ops.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_qsim_sample.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_schema.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_simulator_pipeline.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_utils.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_validation_agg.py +0 -0
- {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_validation_execute.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aetherdialect
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Deterministic, validation-first Text-to-SQL system for business databases
|
|
5
5
|
Author-email: Akul Ameya <akul.ameya@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -36,6 +36,8 @@ Dynamic: license-file
|
|
|
36
36
|
|
|
37
37
|
# Deterministic, validation-first Text-to-SQL for business databases
|
|
38
38
|
|
|
39
|
+
Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
|
|
40
|
+
|
|
39
41
|
## Installation
|
|
40
42
|
|
|
41
43
|
```bash
|
|
@@ -47,11 +49,11 @@ pip install "text2sql[postgresql,databricks]"
|
|
|
47
49
|
|
|
48
50
|
Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
|
|
49
51
|
|
|
50
|
-
| Extra | Brings in
|
|
51
|
-
| ------------ |
|
|
52
|
-
| (base) | **SQLAlchemy** (shared introspection / execution interface)
|
|
53
|
-
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`**
|
|
54
|
-
| `databricks` |
|
|
52
|
+
| Extra | Brings in | Use when |
|
|
53
|
+
| ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
|
|
54
|
+
| (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
|
|
55
|
+
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
|
|
56
|
+
| `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
|
|
55
57
|
|
|
56
58
|
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
57
59
|
|
|
@@ -73,7 +75,9 @@ t2s = Text2SQL(
|
|
|
73
75
|
t2s.run_interactive()
|
|
74
76
|
```
|
|
75
77
|
|
|
76
|
-
Constructor options,
|
|
78
|
+
Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
|
|
79
|
+
|
|
80
|
+
**Interactive two ways:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: one natural-language question is a **turn** that may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
|
|
77
81
|
|
|
78
82
|
---
|
|
79
83
|
|
|
@@ -111,7 +115,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
111
115
|
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
112
116
|
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
113
117
|
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
|
|
114
|
-
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**):
|
|
118
|
+
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
|
|
115
119
|
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
116
120
|
|
|
117
121
|
**Intent / SQL shape (analytical subset)**
|
|
@@ -130,7 +134,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
130
134
|
|
|
131
135
|
**Operational modes**
|
|
132
136
|
|
|
133
|
-
- **Interactive** — ask questions, accept/reject, results export.
|
|
137
|
+
- **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
|
|
134
138
|
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
135
139
|
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
136
140
|
|
|
@@ -178,7 +182,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
178
182
|
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
179
183
|
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
180
184
|
|
|
181
|
-
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader,
|
|
185
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** per-phase upper-bound lines to stdout, and returns **`None`**.
|
|
182
186
|
|
|
183
187
|
---
|
|
184
188
|
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# Deterministic, validation-first Text-to-SQL for business databases
|
|
2
2
|
|
|
3
|
+
Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
|
|
4
|
+
|
|
3
5
|
## Installation
|
|
4
6
|
|
|
5
7
|
```bash
|
|
@@ -11,11 +13,11 @@ pip install "text2sql[postgresql,databricks]"
|
|
|
11
13
|
|
|
12
14
|
Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
|
|
13
15
|
|
|
14
|
-
| Extra | Brings in
|
|
15
|
-
| ------------ |
|
|
16
|
-
| (base) | **SQLAlchemy** (shared introspection / execution interface)
|
|
17
|
-
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`**
|
|
18
|
-
| `databricks` |
|
|
16
|
+
| Extra | Brings in | Use when |
|
|
17
|
+
| ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
|
|
18
|
+
| (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
|
|
19
|
+
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
|
|
20
|
+
| `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
|
|
19
21
|
|
|
20
22
|
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
21
23
|
|
|
@@ -37,7 +39,9 @@ t2s = Text2SQL(
|
|
|
37
39
|
t2s.run_interactive()
|
|
38
40
|
```
|
|
39
41
|
|
|
40
|
-
Constructor options,
|
|
42
|
+
Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
|
|
43
|
+
|
|
44
|
+
**Interactive two ways:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: one natural-language question is a **turn** that may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
|
|
41
45
|
|
|
42
46
|
---
|
|
43
47
|
|
|
@@ -75,7 +79,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
75
79
|
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
76
80
|
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
77
81
|
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
|
|
78
|
-
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**):
|
|
82
|
+
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
|
|
79
83
|
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
80
84
|
|
|
81
85
|
**Intent / SQL shape (analytical subset)**
|
|
@@ -94,7 +98,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
94
98
|
|
|
95
99
|
**Operational modes**
|
|
96
100
|
|
|
97
|
-
- **Interactive** — ask questions, accept/reject, results export.
|
|
101
|
+
- **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
|
|
98
102
|
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
99
103
|
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
100
104
|
|
|
@@ -142,7 +146,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
142
146
|
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
143
147
|
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
144
148
|
|
|
145
|
-
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader,
|
|
149
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** per-phase upper-bound lines to stdout, and returns **`None`**.
|
|
146
150
|
|
|
147
151
|
---
|
|
148
152
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: aetherdialect
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Deterministic, validation-first Text-to-SQL system for business databases
|
|
5
5
|
Author-email: Akul Ameya <akul.ameya@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -36,6 +36,8 @@ Dynamic: license-file
|
|
|
36
36
|
|
|
37
37
|
# Deterministic, validation-first Text-to-SQL for business databases
|
|
38
38
|
|
|
39
|
+
Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
|
|
40
|
+
|
|
39
41
|
## Installation
|
|
40
42
|
|
|
41
43
|
```bash
|
|
@@ -47,11 +49,11 @@ pip install "text2sql[postgresql,databricks]"
|
|
|
47
49
|
|
|
48
50
|
Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
|
|
49
51
|
|
|
50
|
-
| Extra | Brings in
|
|
51
|
-
| ------------ |
|
|
52
|
-
| (base) | **SQLAlchemy** (shared introspection / execution interface)
|
|
53
|
-
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`**
|
|
54
|
-
| `databricks` |
|
|
52
|
+
| Extra | Brings in | Use when |
|
|
53
|
+
| ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
|
|
54
|
+
| (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
|
|
55
|
+
| `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
|
|
56
|
+
| `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
|
|
55
57
|
|
|
56
58
|
**SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
|
|
57
59
|
|
|
@@ -73,7 +75,9 @@ t2s = Text2SQL(
|
|
|
73
75
|
t2s.run_interactive()
|
|
74
76
|
```
|
|
75
77
|
|
|
76
|
-
Constructor options,
|
|
78
|
+
Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
|
|
79
|
+
|
|
80
|
+
**Interactive two ways:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: one natural-language question is a **turn** that may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
|
|
77
81
|
|
|
78
82
|
---
|
|
79
83
|
|
|
@@ -111,7 +115,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
111
115
|
- Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
|
|
112
116
|
- Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
|
|
113
117
|
- **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
|
|
114
|
-
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**):
|
|
118
|
+
- Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
|
|
115
119
|
- Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
|
|
116
120
|
|
|
117
121
|
**Intent / SQL shape (analytical subset)**
|
|
@@ -130,7 +134,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
130
134
|
|
|
131
135
|
**Operational modes**
|
|
132
136
|
|
|
133
|
-
- **Interactive** — ask questions, accept/reject, results export.
|
|
137
|
+
- **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
|
|
134
138
|
- **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
|
|
135
139
|
- **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
|
|
136
140
|
|
|
@@ -178,7 +182,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
|
|
|
178
182
|
3. Resolve joins once per table set where possible; validate and **execute** as a gate.
|
|
179
183
|
4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
|
|
180
184
|
|
|
181
|
-
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader,
|
|
185
|
+
The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** per-phase upper-bound lines to stdout, and returns **`None`**.
|
|
182
186
|
|
|
183
187
|
---
|
|
184
188
|
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from importlib.metadata import PackageNotFoundError, version
|
|
2
|
-
|
|
3
|
-
from .text2sql import Text2SQL
|
|
4
|
-
|
|
5
|
-
try:
|
|
6
|
-
__version__ = version("aetherdialect")
|
|
7
|
-
except PackageNotFoundError:
|
|
8
|
-
__version__ = "0.0.0+dev"
|
|
9
|
-
|
|
10
|
-
__all__ = ["Text2SQL", "__version__"]
|
|
1
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
2
|
+
|
|
3
|
+
from .text2sql import Text2SQL
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
__version__ = version("aetherdialect")
|
|
7
|
+
except PackageNotFoundError:
|
|
8
|
+
__version__ = "0.0.0+dev"
|
|
9
|
+
|
|
10
|
+
__all__ = ["Text2SQL", "__version__"]
|
|
@@ -5,11 +5,13 @@ from __future__ import annotations
|
|
|
5
5
|
import os
|
|
6
6
|
import re
|
|
7
7
|
from enum import Enum
|
|
8
|
-
from typing import ClassVar, Protocol, runtime_checkable
|
|
8
|
+
from typing import Any, ClassVar, Protocol, runtime_checkable
|
|
9
9
|
from urllib.parse import quote
|
|
10
10
|
|
|
11
11
|
SUPPORTED_ENGINES: frozenset[str] = frozenset({"postgresql", "databricks"})
|
|
12
12
|
|
|
13
|
+
JSON_COMPACT_SEPARATORS: tuple[str, str] = (",", ":")
|
|
14
|
+
|
|
13
15
|
|
|
14
16
|
@runtime_checkable
|
|
15
17
|
class RuntimeConfig(Protocol):
|
|
@@ -117,6 +119,7 @@ _AGGREGATION_FUNCTION_NAMES_ORDERED: tuple[str, ...] = (
|
|
|
117
119
|
"max",
|
|
118
120
|
)
|
|
119
121
|
VALID_AGGREGATION_FUNCTIONS = frozenset(_AGGREGATION_FUNCTION_NAMES_ORDERED)
|
|
122
|
+
SELECTABILITY_AGG_FUNCS: frozenset[str] = VALID_AGGREGATION_FUNCTIONS
|
|
120
123
|
SQL_AGG_FUNC_CALL_RE = re.compile(
|
|
121
124
|
r"\b(?:count|sum|avg|min|max)\s*\(",
|
|
122
125
|
re.IGNORECASE,
|
|
@@ -421,6 +424,20 @@ BOOLEAN_TRUE_FALSE_MAP: dict[frozenset[str], tuple[str, str]] = {
|
|
|
421
424
|
frozenset(["active", "inactive"]): ("active", "inactive"),
|
|
422
425
|
frozenset(["enabled", "disabled"]): ("enabled", "disabled"),
|
|
423
426
|
}
|
|
427
|
+
|
|
428
|
+
BOOLEAN_NEGATION_PREFIXES: tuple[str, ...] = ("no ", "not ", "non ", "non-", "un", "in")
|
|
429
|
+
BOOLEAN_NEGATION_SUFFIXES: tuple[str, ...] = ()
|
|
430
|
+
BOOLEAN_ANTONYM_MIN_STEM_LEN: int = 3
|
|
431
|
+
BOOLEAN_AFFIRMATIVE_STRIP_PREFIXES: tuple[str, ...] = ("a ", "an ")
|
|
432
|
+
|
|
433
|
+
ARTIFACT_FORMAT_VERSION: int = 2
|
|
434
|
+
DESTRUCTIVE_REBUILD_ON_FORMAT_MISMATCH: bool = False
|
|
435
|
+
|
|
436
|
+
FAILURE_HINT_MAX_RECORDS: int = 200
|
|
437
|
+
FAILURE_HINT_MAX_CHARS_PER_RECORD: int = 500
|
|
438
|
+
FAILURE_HINT_MAX_MESSAGES: int = 5
|
|
439
|
+
FAILURE_HINT_MAX_INJECT_CHARS: int = 1200
|
|
440
|
+
FAILURE_HINT_FUZZY: bool = False
|
|
424
441
|
NUMERIC_TYPE_TOKENS = frozenset(
|
|
425
442
|
{
|
|
426
443
|
"int",
|
|
@@ -778,11 +795,11 @@ def normalize_value_type(value_type: str) -> str:
|
|
|
778
795
|
|
|
779
796
|
Args:
|
|
780
797
|
|
|
781
|
-
|
|
798
|
+
value_type: LLM or schema value type.
|
|
782
799
|
|
|
783
800
|
Returns:
|
|
784
801
|
|
|
785
|
-
|
|
802
|
+
Normalised name from `VALUE_TYPE_NORMALIZATION` / `VALID_VALUE_TYPES`, else `'string'`.
|
|
786
803
|
"""
|
|
787
804
|
if not value_type:
|
|
788
805
|
return "string"
|
|
@@ -800,11 +817,11 @@ def normalize_column_type(col_type: str) -> str:
|
|
|
800
817
|
|
|
801
818
|
Args:
|
|
802
819
|
|
|
803
|
-
|
|
820
|
+
col_type: Raw SQL type (e.g. `VARCHAR(255)`).
|
|
804
821
|
|
|
805
822
|
Returns:
|
|
806
823
|
|
|
807
|
-
|
|
824
|
+
Base type name for lookup tables.
|
|
808
825
|
"""
|
|
809
826
|
normalized = col_type.lower().strip()
|
|
810
827
|
normalized = re.sub(r"\(\d+(?:,\s*\d+)?\)", "", normalized)
|
|
@@ -1023,11 +1040,11 @@ class PostgresRuntimeConfig:
|
|
|
1023
1040
|
|
|
1024
1041
|
Returns:
|
|
1025
1042
|
|
|
1026
|
-
|
|
1043
|
+
SQLAlchemy connection URL.
|
|
1027
1044
|
|
|
1028
1045
|
Raises:
|
|
1029
1046
|
|
|
1030
|
-
|
|
1047
|
+
ValueError: If `PASSWORD` or `DATABASE` is unset.
|
|
1031
1048
|
"""
|
|
1032
1049
|
if not cls.PASSWORD:
|
|
1033
1050
|
raise ValueError("PostgreSQL password required")
|
|
@@ -1057,7 +1074,7 @@ class DatabricksRuntimeConfig:
|
|
|
1057
1074
|
|
|
1058
1075
|
Returns:
|
|
1059
1076
|
|
|
1060
|
-
|
|
1077
|
+
Whether `databricks-sql-connector` can be used.
|
|
1061
1078
|
"""
|
|
1062
1079
|
return bool(cls.SERVER_HOSTNAME and cls.HTTP_PATH and cls.ACCESS_TOKEN)
|
|
1063
1080
|
|
|
@@ -1068,11 +1085,11 @@ class DatabricksRuntimeConfig:
|
|
|
1068
1085
|
|
|
1069
1086
|
Returns:
|
|
1070
1087
|
|
|
1071
|
-
|
|
1088
|
+
None.
|
|
1072
1089
|
|
|
1073
1090
|
Raises:
|
|
1074
1091
|
|
|
1075
|
-
|
|
1092
|
+
ValueError: If either identifier is missing.
|
|
1076
1093
|
"""
|
|
1077
1094
|
if not cls.CATALOG:
|
|
1078
1095
|
raise ValueError("Databricks catalog required")
|
|
@@ -1086,7 +1103,7 @@ class DatabricksRuntimeConfig:
|
|
|
1086
1103
|
|
|
1087
1104
|
Returns:
|
|
1088
1105
|
|
|
1089
|
-
|
|
1106
|
+
URL string, or ``None`` when native warehouse credentials are not configured.
|
|
1090
1107
|
"""
|
|
1091
1108
|
if not cls.has_native_connection():
|
|
1092
1109
|
return None
|
|
@@ -1117,8 +1134,8 @@ class EngineConfig:
|
|
|
1117
1134
|
AZURE_OPENAI_BASE_URL: ClassVar[str | None] = os.environ.get("AZURE_OPENAI_BASE_URL")
|
|
1118
1135
|
AZURE_OPENAI_ENDPOINT: ClassVar[str | None] = os.environ.get("AZURE_OPENAI_ENDPOINT")
|
|
1119
1136
|
|
|
1120
|
-
SCHEMA_JSON_PATH: ClassVar[str] = "schema_graph.json"
|
|
1121
|
-
TEMPLATE_JSON_PATH: ClassVar[str] = "intent_templates.json"
|
|
1137
|
+
SCHEMA_JSON_PATH: ClassVar[str] = "schema_graph.json.gz"
|
|
1138
|
+
TEMPLATE_JSON_PATH: ClassVar[str] = "intent_templates.json.gz"
|
|
1122
1139
|
|
|
1123
1140
|
@classmethod
|
|
1124
1141
|
def azure_base_url(cls) -> str | None:
|
|
@@ -1171,12 +1188,86 @@ class QSimConfig:
|
|
|
1171
1188
|
|
|
1172
1189
|
EXCLUDED_FILTER_PATTERNS = EXCLUDED_FILTER_PATTERNS
|
|
1173
1190
|
|
|
1174
|
-
SKELETONS_JSON_PATH = "qsim_skeletons.json"
|
|
1175
|
-
QUESTIONS_OUTPUT_PATH = "qsim_intents_with_questions.json"
|
|
1191
|
+
SKELETONS_JSON_PATH = "qsim_skeletons.json.gz"
|
|
1176
1192
|
|
|
1177
1193
|
MAX_ROLE_CLASSIFICATION_RETRIES = 2
|
|
1178
1194
|
|
|
1179
1195
|
|
|
1196
|
+
class ExpansionOperatorId:
|
|
1197
|
+
"""Stable expansion operator ids; registry keys and expansion metadata stamps."""
|
|
1198
|
+
|
|
1199
|
+
FILTER_ADD = "FILTER_ADD"
|
|
1200
|
+
FILTER_EXPR_ADD = "FILTER_EXPR_ADD"
|
|
1201
|
+
AGG_CHANGE = "AGG_CHANGE"
|
|
1202
|
+
GROUPBY_ADD = "GROUPBY_ADD"
|
|
1203
|
+
ORDERBY_ADD = "ORDERBY_ADD"
|
|
1204
|
+
HAVING_VALUE_ADD = "HAVING_VALUE_ADD"
|
|
1205
|
+
HAVING_EXPR_ADD = "HAVING_EXPR_ADD"
|
|
1206
|
+
FILTER_REMOVE = "FILTER_REMOVE"
|
|
1207
|
+
GROUPBY_REMOVE = "GROUPBY_REMOVE"
|
|
1208
|
+
HAVING_REMOVE = "HAVING_REMOVE"
|
|
1209
|
+
JOIN_DIMENSION_ADD = "JOIN_DIMENSION_ADD"
|
|
1210
|
+
JOIN_FACT_ADD = "JOIN_FACT_ADD"
|
|
1211
|
+
DIMENSION_SWAP = "DIMENSION_SWAP"
|
|
1212
|
+
TABLE_REMOVE = "TABLE_REMOVE"
|
|
1213
|
+
BRIDGE_INTERMEDIATE_ADD = "BRIDGE_INTERMEDIATE_ADD"
|
|
1214
|
+
INCLUDE_GOLD = "INCLUDE_GOLD"
|
|
1215
|
+
TEMP_EXTRACT_GROUPBY = "TEMP_EXTRACT_GROUPBY"
|
|
1216
|
+
TEMP_DATE_TRUNC_GROUPBY = "TEMP_DATE_TRUNC_GROUPBY"
|
|
1217
|
+
TEMP_DATE_WINDOW_FILTER = "TEMP_DATE_WINDOW_FILTER"
|
|
1218
|
+
TEMP_DATE_DIFF_FILTER = "TEMP_DATE_DIFF_FILTER"
|
|
1219
|
+
NUM_ROUND_SELECT = "NUM_ROUND_SELECT"
|
|
1220
|
+
NUM_ABS_FILTER = "NUM_ABS_FILTER"
|
|
1221
|
+
DISTINCT_ADD = "DISTINCT_ADD"
|
|
1222
|
+
LIMIT_ADD = "LIMIT_ADD"
|
|
1223
|
+
FILTER_OR_GROUP = "FILTER_OR_GROUP"
|
|
1224
|
+
SELECT_EXPR_PAIR_MULTIPLY = "SELECT_EXPR_PAIR_MULTIPLY"
|
|
1225
|
+
WINDOW_RANK_ADD = "WINDOW_RANK_ADD"
|
|
1226
|
+
WINDOW_SUM_PARTITION_ADD = "WINDOW_SUM_PARTITION_ADD"
|
|
1227
|
+
SELECT_CASE_LABEL_ADD = "SELECT_CASE_LABEL_ADD"
|
|
1228
|
+
WINDOW_LAG_ADD = "WINDOW_LAG_ADD"
|
|
1229
|
+
WINDOW_LEAD_ADD = "WINDOW_LEAD_ADD"
|
|
1230
|
+
FILTER_ILIKE_ADD = "FILTER_ILIKE_ADD"
|
|
1231
|
+
FILTER_ARRAY_CONTAINS_ADD = "FILTER_ARRAY_CONTAINS_ADD"
|
|
1232
|
+
ORDERBY_REMOVE = "ORDERBY_REMOVE"
|
|
1233
|
+
LIMIT_REMOVE = "LIMIT_REMOVE"
|
|
1234
|
+
SELECT_COL_TRIM = "SELECT_COL_TRIM"
|
|
1235
|
+
WINDOW_STRIP = "WINDOW_STRIP"
|
|
1236
|
+
DISTINCT_REMOVE = "DISTINCT_REMOVE"
|
|
1237
|
+
|
|
1238
|
+
|
|
1239
|
+
SEED_NORMALIZATION_BATCH_SIZE: int = 20
|
|
1240
|
+
|
|
1241
|
+
INTERACTIVE_STAGE_DIRECT_REUSE = "direct_reuse_confirm"
|
|
1242
|
+
INTERACTIVE_STAGE_INTENT_WARNINGS = "intent_semantic_warnings"
|
|
1243
|
+
INTERACTIVE_STAGE_INTENT_CONFIRM = "intent_confirm"
|
|
1244
|
+
INTERACTIVE_STAGE_SCHEMA_INVALID = "intent_schema_invalid_continue"
|
|
1245
|
+
INTERACTIVE_STAGE_HARD_BLOCK = "hard_block_override"
|
|
1246
|
+
INTERACTIVE_STAGE_SQL_FEEDBACK = "sql_result_confirm"
|
|
1247
|
+
|
|
1248
|
+
PIPELINE_SUSPEND_ID_DIRECT_REUSE = "awaiting_direct_reuse_confirmation"
|
|
1249
|
+
PIPELINE_SUSPEND_ID_INTENT_WARNINGS = "awaiting_intent_semantic_continue"
|
|
1250
|
+
PIPELINE_SUSPEND_ID_INTENT_CONFIRM = "awaiting_intent_confirmation"
|
|
1251
|
+
PIPELINE_SUSPEND_ID_HARD_BLOCK = "awaiting_hard_block_override"
|
|
1252
|
+
PIPELINE_SUSPEND_ID_SQL = "awaiting_sql_result_confirmation"
|
|
1253
|
+
PIPELINE_SUSPEND_ID_SCHEMA_INVALID = "awaiting_schema_invalid_continuation"
|
|
1254
|
+
|
|
1255
|
+
SEED_NORMALIZATION_JSON = "seed_question_normalization.json"
|
|
1256
|
+
NORMALIZED_SEEDS_TXT = "seed_questions_normalized.txt"
|
|
1257
|
+
QSIM_QUESTIONS_PATTERN = "qsim_questions_v{version}.txt"
|
|
1258
|
+
|
|
1259
|
+
REALISM_DROP_REASON_CATEGORIES: frozenset[str] = frozenset(
|
|
1260
|
+
{
|
|
1261
|
+
"nonsensical_sql",
|
|
1262
|
+
"tautology",
|
|
1263
|
+
"overly_narrow_filter",
|
|
1264
|
+
"pii_smell",
|
|
1265
|
+
"unmeasurable_metric",
|
|
1266
|
+
"other",
|
|
1267
|
+
}
|
|
1268
|
+
)
|
|
1269
|
+
|
|
1270
|
+
|
|
1180
1271
|
class SimulatorConfig:
|
|
1181
1272
|
"""Simulator expansion depth, output filename patterns, and date/limit presets."""
|
|
1182
1273
|
|
|
@@ -1190,10 +1281,8 @@ class SimulatorConfig:
|
|
|
1190
1281
|
MAX_HAVING_CONDITIONS = 2
|
|
1191
1282
|
MAX_EXPANSION_DEPTH = 2
|
|
1192
1283
|
|
|
1193
|
-
|
|
1284
|
+
SIMULATOR_BUNDLE_PATTERN = "simulator_v{version}.zip"
|
|
1194
1285
|
REPORT_PATTERN = "simulation_report_v{version}.json"
|
|
1195
|
-
RESULTS_CSV_PATTERN = "simulation_results_v{version}.csv"
|
|
1196
|
-
FAILURES_PATTERN = "simulation_failures_v{version}.json"
|
|
1197
1286
|
|
|
1198
1287
|
RANDOM_SEED = DEFAULT_RANDOM_SEED
|
|
1199
1288
|
|
|
@@ -1217,3 +1306,6 @@ class SimulatorConfig:
|
|
|
1217
1306
|
{"unit": "day", "amount": 30},
|
|
1218
1307
|
{"unit": "day", "amount": 90},
|
|
1219
1308
|
]
|
|
1309
|
+
|
|
1310
|
+
|
|
1311
|
+
EMPTY_JOIN_CANDIDATES: dict[str, Any] = {"candidates": []}
|