aetherdialect 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {aetherdialect-0.1.1/src/aetherdialect.egg-info → aetherdialect-0.1.2}/PKG-INFO +14 -10
  2. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/README.md +13 -9
  3. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/pyproject.toml +1 -1
  4. {aetherdialect-0.1.1 → aetherdialect-0.1.2/src/aetherdialect.egg-info}/PKG-INFO +14 -10
  5. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/SOURCES.txt +1 -0
  6. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/__init__.py +10 -10
  7. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/config.py +110 -18
  8. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/contracts_base.py +93 -60
  9. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/contracts_core.py +238 -118
  10. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/core_utils.py +380 -69
  11. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/dialect.py +309 -161
  12. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/expansion_ops.py +609 -203
  13. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/intent_expr.py +173 -103
  14. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/intent_process.py +185 -66
  15. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/intent_repair.py +208 -96
  16. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/intent_resolve.py +76 -66
  17. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/live_testing.py +169 -47
  18. aetherdialect-0.1.2/src/text2sql/main_execution.py +1727 -0
  19. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/pipeline.py +632 -70
  20. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/qsim_ops.py +73 -35
  21. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/qsim_sample.py +84 -38
  22. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/qsim_struct.py +69 -36
  23. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/schema.py +134 -81
  24. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/schema_profiling.py +826 -121
  25. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/simulator.py +418 -211
  26. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/sql_gen.py +340 -95
  27. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/templates.py +293 -52
  28. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/text2sql.py +247 -97
  29. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/utils.py +93 -35
  30. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/validation_agg.py +88 -30
  31. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/validation_execute.py +49 -18
  32. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/validation_schema.py +405 -136
  33. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/text2sql/validation_semantic.py +105 -76
  34. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_config.py +7 -7
  35. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_contracts.py +2 -2
  36. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_core_utils.py +1 -1
  37. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_dialect.py +109 -0
  38. aetherdialect-0.1.2/tests/test_expansion_ops.py +1035 -0
  39. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_intent_expr.py +12 -0
  40. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_intent_repair.py +67 -0
  41. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_live_testing.py +38 -18
  42. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_main_execution.py +19 -10
  43. aetherdialect-0.1.2/tests/test_pipeline_session.py +160 -0
  44. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_pipeline_units.py +33 -16
  45. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_qsim_struct.py +15 -15
  46. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_schema_profiling.py +90 -0
  47. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_simulator.py +147 -5
  48. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_sql_gen.py +45 -0
  49. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_templates.py +23 -18
  50. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_text2sql.py +76 -1
  51. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_validation_schema.py +57 -2
  52. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_validation_semantic.py +31 -0
  53. aetherdialect-0.1.1/src/text2sql/main_execution.py +0 -786
  54. aetherdialect-0.1.1/tests/test_expansion_ops.py +0 -599
  55. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/LICENSE +0 -0
  56. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/setup.cfg +0 -0
  57. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/dependency_links.txt +0 -0
  58. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/requires.txt +0 -0
  59. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/src/aetherdialect.egg-info/top_level.txt +0 -0
  60. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_intent_process.py +0 -0
  61. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_intent_resolve.py +0 -0
  62. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_join_bool_cte_matrix.py +0 -0
  63. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_pipeline_targeted.py +0 -0
  64. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_qsim_ops.py +0 -0
  65. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_qsim_sample.py +0 -0
  66. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_schema.py +0 -0
  67. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_simulator_pipeline.py +0 -0
  68. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_utils.py +0 -0
  69. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_validation_agg.py +0 -0
  70. {aetherdialect-0.1.1 → aetherdialect-0.1.2}/tests/test_validation_execute.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aetherdialect
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Deterministic, validation-first Text-to-SQL system for business databases
5
5
  Author-email: Akul Ameya <akul.ameya@gmail.com>
6
6
  License: MIT
@@ -36,6 +36,8 @@ Dynamic: license-file
36
36
 
37
37
  # Deterministic, validation-first Text-to-SQL for business databases
38
38
 
39
+ Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
40
+
39
41
  ## Installation
40
42
 
41
43
  ```bash
@@ -47,11 +49,11 @@ pip install "text2sql[postgresql,databricks]"
47
49
 
48
50
  Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
49
51
 
50
- | Extra | Brings in | Use when |
51
- | ------------ | ------------------------------------------------------------------------- | --------------------- |
52
- | (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
53
- | `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
54
- | `databricks` | PySpark, Databricks SQL connector, **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
52
+ | Extra | Brings in | Use when |
53
+ | ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
54
+ | (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
55
+ | `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
56
+ | `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
55
57
 
56
58
  **SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
57
59
 
@@ -73,7 +75,9 @@ t2s = Text2SQL(
73
75
  t2s.run_interactive()
74
76
  ```
75
77
 
76
- Constructor options, modes, optional files, and the full method list are in **[USAGE.md](USAGE.md)**.
78
+ Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
79
+
80
+ **Interactive two ways:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: one natural-language question is a **turn** that may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
77
81
 
78
82
  ---
79
83
 
@@ -111,7 +115,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
111
115
  - Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
112
116
  - Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
113
117
  - **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
114
- - Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): read once when the schema graph is **first** created (not on cache load). The LLM uses them to refine **table and column roles**, natural-language **descriptions**, and optional **sensitivity** labels without renaming tables or inventing columns.
118
+ - Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
115
119
  - Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
116
120
 
117
121
  **Intent / SQL shape (analytical subset)**
@@ -130,7 +134,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
130
134
 
131
135
  **Operational modes**
132
136
 
133
- - **Interactive** — ask questions, accept/reject, results export.
137
+ - **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
134
138
  - **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
135
139
  - **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
136
140
 
@@ -178,7 +182,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
178
182
  3. Resolve joins once per table set where possible; validate and **execute** as a gate.
179
183
  4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
180
184
 
181
- The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, so its seed count reflects that cap. It returns **rough** LLM-call and execution estimates from a seed file and schema stats.
185
+ The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** per-phase upper-bound lines to stdout, and returns **`None`**.
182
186
 
183
187
  ---
184
188
 
@@ -1,5 +1,7 @@
1
1
  # Deterministic, validation-first Text-to-SQL for business databases
2
2
 
3
+ Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
4
+
3
5
  ## Installation
4
6
 
5
7
  ```bash
@@ -11,11 +13,11 @@ pip install "text2sql[postgresql,databricks]"
11
13
 
12
14
  Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
13
15
 
14
- | Extra | Brings in | Use when |
15
- | ------------ | ------------------------------------------------------------------------- | --------------------- |
16
- | (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
17
- | `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
18
- | `databricks` | PySpark, Databricks SQL connector, **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
16
+ | Extra | Brings in | Use when |
17
+ | ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
18
+ | (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
19
+ | `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
20
+ | `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
19
21
 
20
22
  **SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
21
23
 
@@ -37,7 +39,9 @@ t2s = Text2SQL(
37
39
  t2s.run_interactive()
38
40
  ```
39
41
 
40
- Constructor options, modes, optional files, and the full method list are in **[USAGE.md](USAGE.md)**.
42
+ Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
43
+
44
+ **Interactive two ways:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: one natural-language question is a **turn** that may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
41
45
 
42
46
  ---
43
47
 
@@ -75,7 +79,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
75
79
  - Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
76
80
  - Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
77
81
  - **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
78
- - Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): read once when the schema graph is **first** created (not on cache load). The LLM uses them to refine **table and column roles**, natural-language **descriptions**, and optional **sensitivity** labels without renaming tables or inventing columns.
82
+ - Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
79
83
  - Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
80
84
 
81
85
  **Intent / SQL shape (analytical subset)**
@@ -94,7 +98,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
94
98
 
95
99
  **Operational modes**
96
100
 
97
- - **Interactive** — ask questions, accept/reject, results export.
101
+ - **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
98
102
  - **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
99
103
  - **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
100
104
 
@@ -142,7 +146,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
142
146
  3. Resolve joins once per table set where possible; validate and **execute** as a gate.
143
147
  4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
144
148
 
145
- The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, so its seed count reflects that cap. It returns **rough** LLM-call and execution estimates from a seed file and schema stats.
149
+ The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** per-phase upper-bound lines to stdout, and returns **`None`**.
146
150
 
147
151
  ---
148
152
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "aetherdialect"
7
- version = "0.1.1"
7
+ version = "0.1.2"
8
8
  description = "Deterministic, validation-first Text-to-SQL system for business databases"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: aetherdialect
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: Deterministic, validation-first Text-to-SQL system for business databases
5
5
  Author-email: Akul Ameya <akul.ameya@gmail.com>
6
6
  License: MIT
@@ -36,6 +36,8 @@ Dynamic: license-file
36
36
 
37
37
  # Deterministic, validation-first Text-to-SQL for business databases
38
38
 
39
+ Questions resolve more reliably when you state analytical intent explicitly—entities, grain, filters, time scope, and ordering—instead of leaving those details implied.
40
+
39
41
  ## Installation
40
42
 
41
43
  ```bash
@@ -47,11 +49,11 @@ pip install "text2sql[postgresql,databricks]"
47
49
 
48
50
  Requires Python ≥ 3.10 and either an [OpenAI API key](https://platform.openai.com/api-keys) or Azure OpenAI credentials.
49
51
 
50
- | Extra | Brings in | Use when |
51
- | ------------ | ------------------------------------------------------------------------- | --------------------- |
52
- | (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
53
- | `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
54
- | `databricks` | PySpark, Databricks SQL connector, **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
52
+ | Extra | Brings in | Use when |
53
+ | ------------ | ------------------------------------------------------------------------------------------------ | --------------------- |
54
+ | (base) | **SQLAlchemy** (shared introspection / execution interface) | Always installed |
55
+ | `postgresql` | PostgreSQL driver (`psycopg2-binary`), **`pglast`** | `engine="postgresql"` |
56
+ | `databricks` | Databricks SQL connector (preferred), PySpark (fallback), **`databricks-sqlalchemy`**, `sqlglot` | `engine="databricks"` |
55
57
 
56
58
  **SQL parsing for validation:** PostgreSQL uses **`pglast`** for structural AST checks (join pairs, CTE bodies, `ast_validate`). Databricks / Spark SQL uses **`sqlglot`** with the **Spark** dialect.
57
59
 
@@ -73,7 +75,9 @@ t2s = Text2SQL(
73
75
  t2s.run_interactive()
74
76
  ```
75
77
 
76
- Constructor options, modes, optional files, and the full method list are in **[USAGE.md](USAGE.md)**.
78
+ Constructor options, credentials (`set_openai_api_key`, `set_azure_openai_api_key`, `set_env`), modes, and the full API are in **[USAGE.md](USAGE.md)**. Pass **`artifacts_dir=`** to put the per-connection cache under a root you choose; otherwise it lives under the platform user-data directory (see USAGE.md).
79
+
80
+ **Interactive two ways:** **`run_interactive()`** is a stdin loop. For your own UI or protocol, use **`Text2SQL.pipeline_session()`** and drive **`PipelineSession`** step by step: one natural-language question is a **turn** that may return several **`SessionEvent`** objects (prompt / answer / …) until **`done`** is true. Types and methods are documented in **[USAGE.md](USAGE.md)**.
77
81
 
78
82
  ---
79
83
 
@@ -111,7 +115,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
111
115
  - Optional **`CREATE TABLE` file** as extra or fallback (especially when you cannot reach all metadata from the driver).
112
116
  - Cached schema snapshot per connection fingerprint so restarts avoid re-reflecting unchanged databases.
113
117
  - **Table roles** (e.g. fact vs dimension), **column roles** (measure, categorical, temporal, identifier, etc.), **filter / aggregation / HAVING** allowances per column, **value domains** from profiling — all assigned when the graph is built (reflection, DDL, profiling, and optional notes).
114
- - Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): read once when the schema graph is **first** created (not on cache load). The LLM uses them to refine **table and column roles**, natural-language **descriptions**, and optional **sensitivity** labels without renaming tables or inventing columns.
118
+ - Optional **human notes** (plain text), via `Text2SQL(..., notes_file=...)` (see **[USAGE.md](USAGE.md)**): merged when the graph is built or when notes change; if the cache already contains notes and you omit `notes_file` on a later run, cached roles and hints are kept.
115
119
  - Optional **deny lists** for tables or columns so they stay out of prompts and can be stripped from intents.
116
120
 
117
121
  **Intent / SQL shape (analytical subset)**
@@ -130,7 +134,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
130
134
 
131
135
  **Operational modes**
132
136
 
133
- - **Interactive** — ask questions, accept/reject, results export.
137
+ - **Interactive** — ask questions, accept/reject, results export; via **`run_interactive()`** or a programmatic **`PipelineSession`** (see Quickstart above and **[USAGE.md](USAGE.md)**).
134
138
  - **Coverage simulator** — seed questions → gold intents → **deterministic expansion** (many operators, deduplicated) → validate/execute → NL question generation for new templates.
135
139
  - **QSim** — reproducible synthetic questions from schema and profiles (seeded randomness).
136
140
 
@@ -178,7 +182,7 @@ A **validation-first** text-to-SQL layer for **PostgreSQL** and **Databricks**.
178
182
  3. Resolve joins once per table set where possible; validate and **execute** as a gate.
179
183
  4. One LLM step to produce a **natural language question** from the SQL, with a **realism** filter.
180
184
 
181
- The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, so its seed count reflects that cap. It returns **rough** LLM-call and execution estimates from a seed file and schema stats.
185
+ The simulator loads at most **500** seed lines per run (fixed internal cap; larger files are truncated). **`estimate_simulator_costs`** uses the same loader, prints **rough** per-phase upper-bound lines to stdout, and returns **`None`**.
182
186
 
183
187
  ---
184
188
 
@@ -46,6 +46,7 @@ tests/test_intent_resolve.py
46
46
  tests/test_join_bool_cte_matrix.py
47
47
  tests/test_live_testing.py
48
48
  tests/test_main_execution.py
49
+ tests/test_pipeline_session.py
49
50
  tests/test_pipeline_targeted.py
50
51
  tests/test_pipeline_units.py
51
52
  tests/test_qsim_ops.py
@@ -1,10 +1,10 @@
1
- from importlib.metadata import PackageNotFoundError, version
2
-
3
- from .text2sql import Text2SQL
4
-
5
- try:
6
- __version__ = version("aetherdialect")
7
- except PackageNotFoundError:
8
- __version__ = "0.0.0+dev"
9
-
10
- __all__ = ["Text2SQL", "__version__"]
1
+ from importlib.metadata import PackageNotFoundError, version
2
+
3
+ from .text2sql import Text2SQL
4
+
5
+ try:
6
+ __version__ = version("aetherdialect")
7
+ except PackageNotFoundError:
8
+ __version__ = "0.0.0+dev"
9
+
10
+ __all__ = ["Text2SQL", "__version__"]
@@ -5,11 +5,13 @@ from __future__ import annotations
5
5
  import os
6
6
  import re
7
7
  from enum import Enum
8
- from typing import ClassVar, Protocol, runtime_checkable
8
+ from typing import Any, ClassVar, Protocol, runtime_checkable
9
9
  from urllib.parse import quote
10
10
 
11
11
  SUPPORTED_ENGINES: frozenset[str] = frozenset({"postgresql", "databricks"})
12
12
 
13
+ JSON_COMPACT_SEPARATORS: tuple[str, str] = (",", ":")
14
+
13
15
 
14
16
  @runtime_checkable
15
17
  class RuntimeConfig(Protocol):
@@ -117,6 +119,7 @@ _AGGREGATION_FUNCTION_NAMES_ORDERED: tuple[str, ...] = (
117
119
  "max",
118
120
  )
119
121
  VALID_AGGREGATION_FUNCTIONS = frozenset(_AGGREGATION_FUNCTION_NAMES_ORDERED)
122
+ SELECTABILITY_AGG_FUNCS: frozenset[str] = VALID_AGGREGATION_FUNCTIONS
120
123
  SQL_AGG_FUNC_CALL_RE = re.compile(
121
124
  r"\b(?:count|sum|avg|min|max)\s*\(",
122
125
  re.IGNORECASE,
@@ -421,6 +424,20 @@ BOOLEAN_TRUE_FALSE_MAP: dict[frozenset[str], tuple[str, str]] = {
421
424
  frozenset(["active", "inactive"]): ("active", "inactive"),
422
425
  frozenset(["enabled", "disabled"]): ("enabled", "disabled"),
423
426
  }
427
+
428
+ BOOLEAN_NEGATION_PREFIXES: tuple[str, ...] = ("no ", "not ", "non ", "non-", "un", "in")
429
+ BOOLEAN_NEGATION_SUFFIXES: tuple[str, ...] = ()
430
+ BOOLEAN_ANTONYM_MIN_STEM_LEN: int = 3
431
+ BOOLEAN_AFFIRMATIVE_STRIP_PREFIXES: tuple[str, ...] = ("a ", "an ")
432
+
433
+ ARTIFACT_FORMAT_VERSION: int = 2
434
+ DESTRUCTIVE_REBUILD_ON_FORMAT_MISMATCH: bool = False
435
+
436
+ FAILURE_HINT_MAX_RECORDS: int = 200
437
+ FAILURE_HINT_MAX_CHARS_PER_RECORD: int = 500
438
+ FAILURE_HINT_MAX_MESSAGES: int = 5
439
+ FAILURE_HINT_MAX_INJECT_CHARS: int = 1200
440
+ FAILURE_HINT_FUZZY: bool = False
424
441
  NUMERIC_TYPE_TOKENS = frozenset(
425
442
  {
426
443
  "int",
@@ -778,11 +795,11 @@ def normalize_value_type(value_type: str) -> str:
778
795
 
779
796
  Args:
780
797
 
781
- value_type: LLM or schema value type.
798
+ value_type: LLM or schema value type.
782
799
 
783
800
  Returns:
784
801
 
785
- Normalised name from `VALUE_TYPE_NORMALIZATION` / `VALID_VALUE_TYPES`, else `'string'`.
802
+ Normalised name from `VALUE_TYPE_NORMALIZATION` / `VALID_VALUE_TYPES`, else `'string'`.
786
803
  """
787
804
  if not value_type:
788
805
  return "string"
@@ -800,11 +817,11 @@ def normalize_column_type(col_type: str) -> str:
800
817
 
801
818
  Args:
802
819
 
803
- col_type: Raw SQL type (e.g. `VARCHAR(255)`).
820
+ col_type: Raw SQL type (e.g. `VARCHAR(255)`).
804
821
 
805
822
  Returns:
806
823
 
807
- Base type name for lookup tables.
824
+ Base type name for lookup tables.
808
825
  """
809
826
  normalized = col_type.lower().strip()
810
827
  normalized = re.sub(r"\(\d+(?:,\s*\d+)?\)", "", normalized)
@@ -1023,11 +1040,11 @@ class PostgresRuntimeConfig:
1023
1040
 
1024
1041
  Returns:
1025
1042
 
1026
- SQLAlchemy connection URL.
1043
+ SQLAlchemy connection URL.
1027
1044
 
1028
1045
  Raises:
1029
1046
 
1030
- ValueError: If `PASSWORD` or `DATABASE` is unset.
1047
+ ValueError: If `PASSWORD` or `DATABASE` is unset.
1031
1048
  """
1032
1049
  if not cls.PASSWORD:
1033
1050
  raise ValueError("PostgreSQL password required")
@@ -1057,7 +1074,7 @@ class DatabricksRuntimeConfig:
1057
1074
 
1058
1075
  Returns:
1059
1076
 
1060
- Whether `databricks-sql-connector` can be used.
1077
+ Whether `databricks-sql-connector` can be used.
1061
1078
  """
1062
1079
  return bool(cls.SERVER_HOSTNAME and cls.HTTP_PATH and cls.ACCESS_TOKEN)
1063
1080
 
@@ -1068,11 +1085,11 @@ class DatabricksRuntimeConfig:
1068
1085
 
1069
1086
  Returns:
1070
1087
 
1071
- None.
1088
+ None.
1072
1089
 
1073
1090
  Raises:
1074
1091
 
1075
- ValueError: If either identifier is missing.
1092
+ ValueError: If either identifier is missing.
1076
1093
  """
1077
1094
  if not cls.CATALOG:
1078
1095
  raise ValueError("Databricks catalog required")
@@ -1086,7 +1103,7 @@ class DatabricksRuntimeConfig:
1086
1103
 
1087
1104
  Returns:
1088
1105
 
1089
- URL string, or ``None`` when native warehouse credentials are not configured.
1106
+ URL string, or ``None`` when native warehouse credentials are not configured.
1090
1107
  """
1091
1108
  if not cls.has_native_connection():
1092
1109
  return None
@@ -1117,8 +1134,8 @@ class EngineConfig:
1117
1134
  AZURE_OPENAI_BASE_URL: ClassVar[str | None] = os.environ.get("AZURE_OPENAI_BASE_URL")
1118
1135
  AZURE_OPENAI_ENDPOINT: ClassVar[str | None] = os.environ.get("AZURE_OPENAI_ENDPOINT")
1119
1136
 
1120
- SCHEMA_JSON_PATH: ClassVar[str] = "schema_graph.json"
1121
- TEMPLATE_JSON_PATH: ClassVar[str] = "intent_templates.json"
1137
+ SCHEMA_JSON_PATH: ClassVar[str] = "schema_graph.json.gz"
1138
+ TEMPLATE_JSON_PATH: ClassVar[str] = "intent_templates.json.gz"
1122
1139
 
1123
1140
  @classmethod
1124
1141
  def azure_base_url(cls) -> str | None:
@@ -1171,12 +1188,86 @@ class QSimConfig:
1171
1188
 
1172
1189
  EXCLUDED_FILTER_PATTERNS = EXCLUDED_FILTER_PATTERNS
1173
1190
 
1174
- SKELETONS_JSON_PATH = "qsim_skeletons.json"
1175
- QUESTIONS_OUTPUT_PATH = "qsim_intents_with_questions.json"
1191
+ SKELETONS_JSON_PATH = "qsim_skeletons.json.gz"
1176
1192
 
1177
1193
  MAX_ROLE_CLASSIFICATION_RETRIES = 2
1178
1194
 
1179
1195
 
1196
+ class ExpansionOperatorId:
1197
+ """Stable expansion operator ids; registry keys and expansion metadata stamps."""
1198
+
1199
+ FILTER_ADD = "FILTER_ADD"
1200
+ FILTER_EXPR_ADD = "FILTER_EXPR_ADD"
1201
+ AGG_CHANGE = "AGG_CHANGE"
1202
+ GROUPBY_ADD = "GROUPBY_ADD"
1203
+ ORDERBY_ADD = "ORDERBY_ADD"
1204
+ HAVING_VALUE_ADD = "HAVING_VALUE_ADD"
1205
+ HAVING_EXPR_ADD = "HAVING_EXPR_ADD"
1206
+ FILTER_REMOVE = "FILTER_REMOVE"
1207
+ GROUPBY_REMOVE = "GROUPBY_REMOVE"
1208
+ HAVING_REMOVE = "HAVING_REMOVE"
1209
+ JOIN_DIMENSION_ADD = "JOIN_DIMENSION_ADD"
1210
+ JOIN_FACT_ADD = "JOIN_FACT_ADD"
1211
+ DIMENSION_SWAP = "DIMENSION_SWAP"
1212
+ TABLE_REMOVE = "TABLE_REMOVE"
1213
+ BRIDGE_INTERMEDIATE_ADD = "BRIDGE_INTERMEDIATE_ADD"
1214
+ INCLUDE_GOLD = "INCLUDE_GOLD"
1215
+ TEMP_EXTRACT_GROUPBY = "TEMP_EXTRACT_GROUPBY"
1216
+ TEMP_DATE_TRUNC_GROUPBY = "TEMP_DATE_TRUNC_GROUPBY"
1217
+ TEMP_DATE_WINDOW_FILTER = "TEMP_DATE_WINDOW_FILTER"
1218
+ TEMP_DATE_DIFF_FILTER = "TEMP_DATE_DIFF_FILTER"
1219
+ NUM_ROUND_SELECT = "NUM_ROUND_SELECT"
1220
+ NUM_ABS_FILTER = "NUM_ABS_FILTER"
1221
+ DISTINCT_ADD = "DISTINCT_ADD"
1222
+ LIMIT_ADD = "LIMIT_ADD"
1223
+ FILTER_OR_GROUP = "FILTER_OR_GROUP"
1224
+ SELECT_EXPR_PAIR_MULTIPLY = "SELECT_EXPR_PAIR_MULTIPLY"
1225
+ WINDOW_RANK_ADD = "WINDOW_RANK_ADD"
1226
+ WINDOW_SUM_PARTITION_ADD = "WINDOW_SUM_PARTITION_ADD"
1227
+ SELECT_CASE_LABEL_ADD = "SELECT_CASE_LABEL_ADD"
1228
+ WINDOW_LAG_ADD = "WINDOW_LAG_ADD"
1229
+ WINDOW_LEAD_ADD = "WINDOW_LEAD_ADD"
1230
+ FILTER_ILIKE_ADD = "FILTER_ILIKE_ADD"
1231
+ FILTER_ARRAY_CONTAINS_ADD = "FILTER_ARRAY_CONTAINS_ADD"
1232
+ ORDERBY_REMOVE = "ORDERBY_REMOVE"
1233
+ LIMIT_REMOVE = "LIMIT_REMOVE"
1234
+ SELECT_COL_TRIM = "SELECT_COL_TRIM"
1235
+ WINDOW_STRIP = "WINDOW_STRIP"
1236
+ DISTINCT_REMOVE = "DISTINCT_REMOVE"
1237
+
1238
+
1239
+ SEED_NORMALIZATION_BATCH_SIZE: int = 20
1240
+
1241
+ INTERACTIVE_STAGE_DIRECT_REUSE = "direct_reuse_confirm"
1242
+ INTERACTIVE_STAGE_INTENT_WARNINGS = "intent_semantic_warnings"
1243
+ INTERACTIVE_STAGE_INTENT_CONFIRM = "intent_confirm"
1244
+ INTERACTIVE_STAGE_SCHEMA_INVALID = "intent_schema_invalid_continue"
1245
+ INTERACTIVE_STAGE_HARD_BLOCK = "hard_block_override"
1246
+ INTERACTIVE_STAGE_SQL_FEEDBACK = "sql_result_confirm"
1247
+
1248
+ PIPELINE_SUSPEND_ID_DIRECT_REUSE = "awaiting_direct_reuse_confirmation"
1249
+ PIPELINE_SUSPEND_ID_INTENT_WARNINGS = "awaiting_intent_semantic_continue"
1250
+ PIPELINE_SUSPEND_ID_INTENT_CONFIRM = "awaiting_intent_confirmation"
1251
+ PIPELINE_SUSPEND_ID_HARD_BLOCK = "awaiting_hard_block_override"
1252
+ PIPELINE_SUSPEND_ID_SQL = "awaiting_sql_result_confirmation"
1253
+ PIPELINE_SUSPEND_ID_SCHEMA_INVALID = "awaiting_schema_invalid_continuation"
1254
+
1255
+ SEED_NORMALIZATION_JSON = "seed_question_normalization.json"
1256
+ NORMALIZED_SEEDS_TXT = "seed_questions_normalized.txt"
1257
+ QSIM_QUESTIONS_PATTERN = "qsim_questions_v{version}.txt"
1258
+
1259
+ REALISM_DROP_REASON_CATEGORIES: frozenset[str] = frozenset(
1260
+ {
1261
+ "nonsensical_sql",
1262
+ "tautology",
1263
+ "overly_narrow_filter",
1264
+ "pii_smell",
1265
+ "unmeasurable_metric",
1266
+ "other",
1267
+ }
1268
+ )
1269
+
1270
+
1180
1271
  class SimulatorConfig:
1181
1272
  """Simulator expansion depth, output filename patterns, and date/limit presets."""
1182
1273
 
@@ -1190,10 +1281,8 @@ class SimulatorConfig:
1190
1281
  MAX_HAVING_CONDITIONS = 2
1191
1282
  MAX_EXPANSION_DEPTH = 2
1192
1283
 
1193
- GOLD_OUTPUT_PATTERN = "gold_intents_v{version}.json"
1284
+ SIMULATOR_BUNDLE_PATTERN = "simulator_v{version}.zip"
1194
1285
  REPORT_PATTERN = "simulation_report_v{version}.json"
1195
- RESULTS_CSV_PATTERN = "simulation_results_v{version}.csv"
1196
- FAILURES_PATTERN = "simulation_failures_v{version}.json"
1197
1286
 
1198
1287
  RANDOM_SEED = DEFAULT_RANDOM_SEED
1199
1288
 
@@ -1217,3 +1306,6 @@ class SimulatorConfig:
1217
1306
  {"unit": "day", "amount": 30},
1218
1307
  {"unit": "day", "amount": 90},
1219
1308
  ]
1309
+
1310
+
1311
+ EMPTY_JOIN_CANDIDATES: dict[str, Any] = {"candidates": []}