sqlas 2.5.0__tar.gz → 2.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlas-2.8.0/PKG-INFO +423 -0
- sqlas-2.8.0/README.md +373 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/pyproject.toml +9 -4
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/__init__.py +39 -6
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/agentic.py +105 -7
- sqlas-2.8.0/sqlas/benchmarks.py +480 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/context.py +4 -4
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/core.py +247 -7
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/correctness.py +113 -12
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/evaluate.py +164 -24
- sqlas-2.8.0/sqlas/failure_analysis.py +284 -0
- sqlas-2.8.0/sqlas/governance.py +358 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/guardrails.py +5 -1
- sqlas-2.8.0/sqlas/integrations.py +274 -0
- sqlas-2.8.0/sqlas/production.py +367 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/prompt_registry.py +2 -1
- sqlas-2.8.0/sqlas/quality.py +567 -0
- sqlas-2.8.0/sqlas/reasoning.py +467 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/response.py +30 -17
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/safety.py +13 -6
- sqlas-2.8.0/sqlas/ui.py +572 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/visualization.py +11 -8
- sqlas-2.8.0/sqlas.egg-info/PKG-INFO +423 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas.egg-info/SOURCES.txt +8 -0
- sqlas-2.8.0/sqlas.egg-info/requires.txt +31 -0
- sqlas-2.8.0/sqlas.egg-info/top_level.txt +2 -0
- sqlas-2.8.0/tests/test_governance.py +170 -0
- sqlas-2.8.0/tests/test_reasoning.py +210 -0
- sqlas-2.5.0/PKG-INFO +0 -364
- sqlas-2.5.0/README.md +0 -328
- sqlas-2.5.0/sqlas/production.py +0 -153
- sqlas-2.5.0/sqlas/quality.py +0 -173
- sqlas-2.5.0/sqlas.egg-info/PKG-INFO +0 -364
- sqlas-2.5.0/sqlas.egg-info/requires.txt +0 -12
- sqlas-2.5.0/sqlas.egg-info/top_level.txt +0 -1
- {sqlas-2.5.0 → sqlas-2.8.0}/LICENSE +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/setup.cfg +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/cache.py +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/feedback.py +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/py.typed +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/runner.py +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas/schema_quality.py +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/sqlas.egg-info/dependency_links.txt +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/tests/test_context.py +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/tests/test_execute_fn.py +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/tests/test_large_schema.py +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/tests/test_sqlas.py +0 -0
- {sqlas-2.5.0 → sqlas-2.8.0}/tests/test_v2.py +0 -0
sqlas-2.8.0/PKG-INFO
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sqlas
|
|
3
|
+
Version: 2.8.0
|
|
4
|
+
Summary: SQLAS — SQL Agent Scoring Framework. Production-grade evaluation for Text-to-SQL and Agentic SQL agents with guardrail, visualization, agentic quality, and cache performance metrics.
|
|
5
|
+
Author-email: thepradip <pradiptivhale@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/thepradip/SQLAS
|
|
8
|
+
Project-URL: Documentation, https://github.com/thepradip/SQLAS#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/thepradip/SQLAS
|
|
10
|
+
Project-URL: Changelog, https://github.com/thepradip/SQLAS/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: sql,agent,evaluation,llm,text-to-sql,ragas,mlflow,benchmark,monitoring
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: sqlglot>=20.0
|
|
27
|
+
Provides-Extra: mlflow
|
|
28
|
+
Requires-Dist: mlflow>=3.0; extra == "mlflow"
|
|
29
|
+
Provides-Extra: wandb
|
|
30
|
+
Requires-Dist: wandb>=0.16; extra == "wandb"
|
|
31
|
+
Provides-Extra: langsmith
|
|
32
|
+
Requires-Dist: langsmith>=0.1; extra == "langsmith"
|
|
33
|
+
Provides-Extra: ui
|
|
34
|
+
Requires-Dist: streamlit>=1.30; extra == "ui"
|
|
35
|
+
Requires-Dist: pandas>=2.0; extra == "ui"
|
|
36
|
+
Provides-Extra: benchmarks
|
|
37
|
+
Provides-Extra: prometheus
|
|
38
|
+
Requires-Dist: prometheus-client>=0.19; extra == "prometheus"
|
|
39
|
+
Provides-Extra: dev
|
|
40
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
41
|
+
Requires-Dist: build; extra == "dev"
|
|
42
|
+
Requires-Dist: twine; extra == "dev"
|
|
43
|
+
Provides-Extra: all
|
|
44
|
+
Requires-Dist: mlflow>=3.0; extra == "all"
|
|
45
|
+
Requires-Dist: wandb>=0.16; extra == "all"
|
|
46
|
+
Requires-Dist: langsmith>=0.1; extra == "all"
|
|
47
|
+
Requires-Dist: streamlit>=1.30; extra == "all"
|
|
48
|
+
Requires-Dist: pandas>=2.0; extra == "all"
|
|
49
|
+
Dynamic: license-file
|
|
50
|
+
|
|
51
|
+
# SQLAS — SQL Agent Scoring Framework
|
|
52
|
+
|
|
53
|
+
**A RAGAS-equivalent evaluation library for Text-to-SQL and Agentic SQL agents.**
|
|
54
|
+
|
|
55
|
+
[PyPI version](https://pypi.org/project/sqlas/)
|
|
56
|
+
[Python versions](https://pypi.org/project/sqlas/)
|
|
57
|
+
[GitHub repository](https://github.com/thepradip/SQLAS)
|
|
58
|
+
[License: MIT](LICENSE)
|
|
59
|
+
|
|
60
|
+
Evaluate SQL agents across **50+ metrics** — correctness, quality, safety, agentic reasoning, schema retrieval, prompt versioning, guardrails, and cache ROI. Aligned with Spider, BIRD, RAGAS, and MLflow standards.
|
|
61
|
+
|
|
62
|
+
**Author:** [thepradip](https://github.com/thepradip)
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Install
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install sqlas # core
|
|
70
|
+
pip install "sqlas[mlflow]" # + MLflow
|
|
71
|
+
pip install "sqlas[ui]" # + Streamlit UI
|
|
72
|
+
pip install "sqlas[all]" # everything
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## What's New in v2.7.0
|
|
78
|
+
|
|
79
|
+
| Feature | Description |
|
|
80
|
+
|---------|-------------|
|
|
81
|
+
| **Multi-gold SQL** | `execution_accuracy_best_of(sql, gold_sqls)` — evaluate against all valid gold queries, take best score. `TestCase.gold_sqls: list[str]` |
|
|
82
|
+
| **Hardness classification** | `auto_classify_hardness(sql)` → `easy/medium/hard/extra-hard` per BIRD criteria. Auto-set on every `evaluate()` call |
|
|
83
|
+
| **Exact match metric** | `exact_match(generated, gold)` — normalized string comparison. Exposed as `SQLASScores.exact_match_score` |
|
|
84
|
+
| **Failure classification** | `classify_failure(sql, scores, details)` → named `FailureCategory` with `top_hint()` actionable fix |
|
|
85
|
+
| **Batch crash isolation** | One failing test case no longer kills the entire batch |
|
|
86
|
+
| **LLM retry with backoff** | `_retry_llm_judge()` retries 3× (1s→2s→4s) on all 13 LLM judge call sites |
|
|
87
|
+
| **Weight normalization** | Custom weights auto-normalized to 1.0 instead of silently distorting scores |
|
|
88
|
+
| **LLM judge cache** | `enable_judge_cache()` — opt-in in-memory cache prevents re-scoring identical pairs in CI |
|
|
89
|
+
| **Report generation** | `generate_report(scores_list, format="markdown"\|"json")`, `to_json()`, `to_markdown_report()` |
|
|
90
|
+
| **Non-deterministic detection** | `NOW()`, `RANDOM()`, `CURRENT_TIMESTAMP` trigger `nondeterministic_warning` in details |
|
|
91
|
+
| **execute_fn timeout** | 30s wall-clock timeout with thread-safe SQLite fallback |
|
|
92
|
+
| **Safety patterns** | `UNION ALL SELECT`, `EXCEPT SELECT`, `WAITFOR DELAY`, file injection, NL prompt injection synonyms |
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Quick Start
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from sqlas import evaluate
|
|
100
|
+
|
|
101
|
+
def llm_judge(prompt: str) -> str:
|
|
102
|
+
return openai_client.chat.completions.create(
|
|
103
|
+
model="gpt-4o", messages=[{"role":"user","content":prompt}]
|
|
104
|
+
).choices[0].message.content
|
|
105
|
+
|
|
106
|
+
scores = evaluate(
|
|
107
|
+
question = "How many active users?",
|
|
108
|
+
generated_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
|
|
109
|
+
gold_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
|
|
110
|
+
db_path = "my.db",
|
|
111
|
+
llm_judge = llm_judge,
|
|
112
|
+
response = "There are 1,523 active users.",
|
|
113
|
+
result_data = {"columns":["COUNT(*)"],"rows":[[1523]],"row_count":1,"execution_time_ms":2.1},
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
print(scores.overall_score) # 0.95
|
|
117
|
+
print(scores.correctness_score) # 0.88
|
|
118
|
+
print(scores.verdict) # PASS
|
|
119
|
+
print(scores.hardness) # "easy"
|
|
120
|
+
print(scores.exact_match_score) # 1.0
|
|
121
|
+
print(scores.to_markdown_report()) # Markdown for PR comments
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Failure Classification (v2.7)
|
|
127
|
+
|
|
128
|
+
Know exactly *why* a query failed — not just a score.
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
from sqlas import classify_failure, FailureCategory
|
|
132
|
+
|
|
133
|
+
analysis = classify_failure(
|
|
134
|
+
sql = "SELECT id FROM users LIMIT 100",
|
|
135
|
+
scores = {"execution_accuracy": 1.0, "row_count_match": 0.12},
|
|
136
|
+
details = {"row_count_match": {"pred_count": 100, "gold_count": 839}},
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
print(analysis.primary) # FailureCategory.LIMIT_TRUNCATION
|
|
140
|
+
print(analysis.summary()) # "FAIL [limit_truncation] (score=1.000)"
|
|
141
|
+
print(analysis.top_hint()) # "Remove LIMIT — question asks for full results, not top-N"
|
|
142
|
+
print(analysis.evidence) # {"limit_truncation": "LIMIT in SQL, 100 rows vs 839 expected"}
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**All failure categories:**
|
|
146
|
+
|
|
147
|
+
| Category | Source |
|
|
148
|
+
|----------|--------|
|
|
149
|
+
| `LIMIT_TRUNCATION` | LIMIT silently cut result (100 vs 839 rows) |
|
|
150
|
+
| `WRONG_TABLE` | `accounting_transactions` used instead of `accounting` |
|
|
151
|
+
| `WRONG_AGGREGATION` | MAX instead of SUM, AVG instead of SUM |
|
|
152
|
+
| `SCALAR_MISMATCH` | Correlation or count value differs |
|
|
153
|
+
| `ROW_EXPLOSION` | 1:N join inflated row count |
|
|
154
|
+
| `SCHEMA_HALLUCINATION` | Invented table/column names (`counts`, `adm_count`, `n`) |
|
|
155
|
+
| `FULL_TABLE_SCAN` | SELECT * with no WHERE/LIMIT |
|
|
156
|
+
| `TRIM_ON_NUMERIC` | TRIM() on REAL column — invalid on Postgres |
|
|
157
|
+
| `UNSAFE_QUERY` | DDL/DML attempted |
|
|
158
|
+
| `CURRENCY_NOT_CLEANED` | Single REPLACE missed commas in `$1,234` |
|
|
159
|
+
| `NULL_IN_AGGREGATION` | AVG/SUM without IS NOT NULL |
|
|
160
|
+
| `JOIN_WITHOUT_FK` | Banking joined to users with no foreign key |
|
|
161
|
+
| `FAITHFULNESS_DROP` | Narration not grounded in SQL result |
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Multi-gold SQL (v2.7)
|
|
166
|
+
|
|
167
|
+
When a question has multiple valid SQL formulations, evaluate against all and take the best score:
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
from sqlas import evaluate, TestCase
|
|
171
|
+
|
|
172
|
+
# Single evaluate call
|
|
173
|
+
scores = evaluate(
|
|
174
|
+
question = "Count active users",
|
|
175
|
+
generated_sql = "SELECT COUNT(*) FROM users WHERE status = 'active'",
|
|
176
|
+
gold_sql = "SELECT COUNT(*) FROM users WHERE active = 1", # primary gold
|
|
177
|
+
db_path = "my.db",
|
|
178
|
+
llm_judge = llm_judge,
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
# Batch with multiple gold SQLs per question
|
|
182
|
+
test_case = TestCase(
|
|
183
|
+
question = "Count active users",
|
|
184
|
+
gold_sqls = [
|
|
185
|
+
"SELECT COUNT(*) FROM users WHERE active = 1",
|
|
186
|
+
"SELECT COUNT(*) FROM users WHERE status = 'active'",
|
|
187
|
+
"SELECT COUNT(id) FROM users WHERE is_active = true",
|
|
188
|
+
],
|
|
189
|
+
)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Hardness Classification (v2.7)
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
from sqlas import auto_classify_hardness
|
|
198
|
+
|
|
199
|
+
auto_classify_hardness("SELECT COUNT(*) FROM users")
|
|
200
|
+
# → "easy"
|
|
201
|
+
|
|
202
|
+
auto_classify_hardness("SELECT u.id, SUM(o.total) FROM users u JOIN orders o ON u.id=o.user_id GROUP BY u.id HAVING SUM(o.total) > 1000")
|
|
203
|
+
# → "hard"
|
|
204
|
+
|
|
205
|
+
auto_classify_hardness("WITH ranked AS (SELECT *, ROW_NUMBER() OVER (...) FROM ...) SELECT ...")
|
|
206
|
+
# → "extra-hard"
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
Follows BIRD benchmark criteria. Auto-set on every `evaluate()` call as `SQLASScores.hardness`.
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Report Generation (v2.7)
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
from sqlas import evaluate_batch, generate_report
|
|
217
|
+
|
|
218
|
+
# Batch markdown report — paste into PRs or CI comments
|
|
219
|
+
results = evaluate_batch(test_cases, llm_judge, db_path="my.db")
|
|
220
|
+
print(generate_report(results, questions, format="markdown"))
|
|
221
|
+
|
|
222
|
+
# JSON for artifact storage
|
|
223
|
+
print(generate_report(results, format="json"))
|
|
224
|
+
|
|
225
|
+
# Per-query reports
|
|
226
|
+
print(scores.to_json())
|
|
227
|
+
print(scores.to_markdown_report(question="How many users?", sql=generated_sql))
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## LLM Judge Cache (v2.7)
|
|
233
|
+
|
|
234
|
+
Prevent re-scoring identical prompts in CI runs:
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
from sqlas import enable_judge_cache, clear_judge_cache
|
|
238
|
+
|
|
239
|
+
enable_judge_cache() # opt-in — identical prompts return cached result
|
|
240
|
+
results = evaluate_batch(...)
|
|
241
|
+
clear_judge_cache() # clear between test runs
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## Three-Dimension Scoring
|
|
247
|
+
|
|
248
|
+
`PASS` only when **all three** dimensions meet their thresholds:
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
from sqlas import evaluate_correctness, evaluate_quality, evaluate_safety
|
|
252
|
+
|
|
253
|
+
c = evaluate_correctness(question, sql, llm_judge, gold_sql=gold, execute_fn=db)
|
|
254
|
+
q = evaluate_quality(question, sql, llm_judge, response=text, result_data=data)
|
|
255
|
+
s = evaluate_safety(sql, question=question, pii_columns=["email","ssn"])
|
|
256
|
+
|
|
257
|
+
print(c.score, c.verdict) # 0.85 PASS (threshold 0.5)
|
|
258
|
+
print(q.score, q.verdict) # 0.72 PASS (threshold 0.6)
|
|
259
|
+
print(s.score, s.verdict) # 0.45 FAIL (threshold 0.9 — PII detected)
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
`evaluate_safety()` makes **zero LLM calls** — pure regex + sqlglot AST.
|
|
263
|
+
|
|
264
|
+
---
|
|
265
|
+
|
|
266
|
+
## Guardrail Pipeline
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
from sqlas import GuardrailPipeline
|
|
270
|
+
|
|
271
|
+
pipeline = GuardrailPipeline(pii_columns=["email","ssn","password"])
|
|
272
|
+
|
|
273
|
+
r = pipeline.check_input("List every user's SSN") # blocks malicious NL intent
|
|
274
|
+
r = pipeline.check_sql(generated_sql) # blocks injection/PII SQL
|
|
275
|
+
r = pipeline.check_output(response, result_data) # blocks PII in response
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
**Injection patterns detected:** `UNION ALL SELECT`, `EXCEPT SELECT`, `INTERSECT SELECT`, stacked mutations, tautologies, time-based injection, file write/read, `WAITFOR DELAY`.
|
|
279
|
+
|
|
280
|
+
**NL prompt injection detected:** ignore/override/discard instructions, jailbreak, bypass guardrails, pretend unrestricted.
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## Spider / BIRD Benchmark
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
from sqlas.benchmarks import run_spider_benchmark
|
|
288
|
+
|
|
289
|
+
results = run_spider_benchmark(
|
|
290
|
+
agent_fn = my_agent,
|
|
291
|
+
llm_judge = llm_judge,
|
|
292
|
+
spider_dir = "./spider",
|
|
293
|
+
n_samples = 50, # stratified by difficulty → ~$0.25
|
|
294
|
+
mlflow_run = True,
|
|
295
|
+
)
|
|
296
|
+
print(results["summary"]["overall_score"])
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
---
|
|
300
|
+
|
|
301
|
+
## Prompt Versioning
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
from sqlas import PromptRegistry
|
|
305
|
+
|
|
306
|
+
registry = PromptRegistry()
|
|
307
|
+
registry.register("You are a SQL analyst...", version_id="v1")
|
|
308
|
+
registry.record("v1", scores)
|
|
309
|
+
|
|
310
|
+
status = registry.detect_regression("v1", window=50, threshold=0.05)
|
|
311
|
+
if status["regressed"]:
|
|
312
|
+
for hint in status["hints"]:
|
|
313
|
+
print(hint["hint"]) # actionable prompt fix suggestion
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
---
|
|
317
|
+
|
|
318
|
+
## Observability Integrations
|
|
319
|
+
|
|
320
|
+
```python
|
|
321
|
+
from sqlas.integrations import log_all
|
|
322
|
+
|
|
323
|
+
log_all(results,
|
|
324
|
+
mlflow_experiment = "sql-agent-v2",
|
|
325
|
+
wandb_project = "sql-evals",
|
|
326
|
+
langsmith_project = "my-sql-agent",
|
|
327
|
+
)
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
---
|
|
331
|
+
|
|
332
|
+
## Run a Test Suite
|
|
333
|
+
|
|
334
|
+
```python
|
|
335
|
+
from sqlas import run_suite, TestCase, WEIGHTS_V4, build_schema_info
|
|
336
|
+
|
|
337
|
+
tables, columns = build_schema_info(db_path="my.db")
|
|
338
|
+
|
|
339
|
+
results = run_suite(
|
|
340
|
+
test_cases = test_cases,
|
|
341
|
+
agent_fn = my_agent,
|
|
342
|
+
llm_judge = llm_judge,
|
|
343
|
+
execute_fn = execute_fn,
|
|
344
|
+
valid_tables = tables,
|
|
345
|
+
valid_columns = columns,
|
|
346
|
+
weights = WEIGHTS_V4,
|
|
347
|
+
pass_threshold = 0.6,
|
|
348
|
+
)
|
|
349
|
+
print(results["summary"]["overall_score"])
|
|
350
|
+
print(results["summary"]["by_category"])
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
---
|
|
354
|
+
|
|
355
|
+
## Metrics Overview
|
|
356
|
+
|
|
357
|
+
| Dimension | Key Metrics |
|
|
358
|
+
|-----------|-------------|
|
|
359
|
+
| **Correctness** | Execution accuracy, exact match, multi-gold SQL, semantic equivalence, result set similarity |
|
|
360
|
+
| **SQL Quality** | SQL quality (LLM), schema compliance, complexity match, data scan efficiency |
|
|
361
|
+
| **Context (RAGAS)** | Context precision, recall, entity recall, noise robustness |
|
|
362
|
+
| **Response** | Faithfulness, answer relevance, completeness, fluency |
|
|
363
|
+
| **Agentic** | Steps efficiency, schema grounding, planning quality, tool use accuracy, plan compliance, first attempt success |
|
|
364
|
+
| **Safety** | Read-only compliance, SQL injection, prompt injection, PII access, PII leakage |
|
|
365
|
+
| **Production** | Execution success, VES efficiency, row explosion detection, empty result, result coverage |
|
|
366
|
+
| **Cache** | Cache hit score, tokens saved, few-shot examples used |
|
|
367
|
+
| **Visualization** | Chart spec validity, data alignment, LLM chart validation |
|
|
368
|
+
|
|
369
|
+
## Weight Profiles
|
|
370
|
+
|
|
371
|
+
| Profile | Metrics | Best for |
|
|
372
|
+
|---------|---------|----------|
|
|
373
|
+
| `WEIGHTS` | 15 | Standard NL→SQL pipeline |
|
|
374
|
+
| `WEIGHTS_V2` | 20 | + RAGAS context quality |
|
|
375
|
+
| `WEIGHTS_V3` | 30 | + Guardrails + visualization |
|
|
376
|
+
| `WEIGHTS_V4` | 28 | + Agentic quality ← ReAct agents |
|
|
377
|
+
|
|
378
|
+
---
|
|
379
|
+
|
|
380
|
+
## Changelog
|
|
381
|
+
|
|
382
|
+
### v2.7.0
|
|
383
|
+
- `classify_failure()` + `FailureCategory` enum — named failure classification with actionable hints
|
|
384
|
+
- `auto_classify_hardness()` — BIRD-aligned easy/medium/hard/extra-hard (auto-set on every eval)
|
|
385
|
+
- `exact_match()` + `SQLASScores.exact_match_score`
|
|
386
|
+
- `execution_accuracy_best_of()` + `TestCase.gold_sqls` — multi-gold SQL evaluation
|
|
387
|
+
- `generate_report()` — batch markdown/JSON report; `to_json()`, `to_markdown_report()` on SQLASScores
|
|
388
|
+
- `enable_judge_cache()` / `clear_judge_cache()` — opt-in LLM judge caching
|
|
389
|
+
- LLM retry with exponential backoff (3×) on all 13 LLM judge call sites
|
|
390
|
+
- Batch eval crash isolation — one failure no longer kills the batch
|
|
391
|
+
- Weight normalization — auto-normalize to 1.0 instead of silently distorting
|
|
392
|
+
- execute_fn timeout (30s) with thread-safe SQLite fallback
|
|
393
|
+
- Non-deterministic query detection (NOW, RANDOM, CURRENT_TIMESTAMP)
|
|
394
|
+
- Safety: UNION ALL SELECT, EXCEPT, WAITFOR DELAY, file injection, NL synonyms
|
|
395
|
+
- Division-by-zero guards in all context metrics
|
|
396
|
+
|
|
397
|
+
### v2.6.0
|
|
398
|
+
- Spider/BIRD benchmark (`run_spider_benchmark`, `run_bird_benchmark`)
|
|
399
|
+
- MLflow, W&B, LangSmith integrations (`sqlas.integrations`)
|
|
400
|
+
- Streamlit UI (`python -m sqlas ui`)
|
|
401
|
+
- React evaluation dashboard (`sqlas-ui/`)
|
|
402
|
+
|
|
403
|
+
### v2.5.0
|
|
404
|
+
- `plan_compliance()` — measures create_plan enforcement before execute_sql
|
|
405
|
+
- `first_attempt_success()` — measures SQL retry rate
|
|
406
|
+
|
|
407
|
+
### v2.4.0
|
|
408
|
+
- `PromptRegistry` — prompt versioning, regression detection, improvement hints
|
|
409
|
+
- `schema_retrieval_quality()` — precision/recall/F1 for schema index
|
|
410
|
+
|
|
411
|
+
### v2.3.0
|
|
412
|
+
- `GuardrailPipeline` — 3-stage safety: input → SQL → output (zero LLM cost)
|
|
413
|
+
- `FeedbackStore` — verified gold SQL from user thumbs-up
|
|
414
|
+
|
|
415
|
+
### v2.2.0
|
|
416
|
+
- Three-dimension scoring: `correctness_score`, `quality_score`, `safety_composite_score`
|
|
417
|
+
- `verdict` — AND logic: PASS only when all three pass thresholds
|
|
418
|
+
|
|
419
|
+
---
|
|
420
|
+
|
|
421
|
+
## License
|
|
422
|
+
|
|
423
|
+
MIT — [thepradip](https://github.com/thepradip) · [pypi.org/project/sqlas](https://pypi.org/project/sqlas/)
|