sqlas 1.3.0__tar.gz → 2.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sqlas-2.5.0/PKG-INFO +364 -0
- sqlas-2.5.0/README.md +328 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/pyproject.toml +2 -2
- sqlas-2.5.0/sqlas/__init__.py +90 -0
- sqlas-2.5.0/sqlas/agentic.py +317 -0
- sqlas-2.5.0/sqlas/cache.py +93 -0
- sqlas-2.5.0/sqlas/core.py +543 -0
- sqlas-2.5.0/sqlas/evaluate.py +822 -0
- sqlas-2.5.0/sqlas/feedback.py +177 -0
- sqlas-2.5.0/sqlas/guardrails.py +381 -0
- sqlas-2.5.0/sqlas/production.py +153 -0
- sqlas-2.5.0/sqlas/prompt_registry.py +378 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/quality.py +2 -1
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/runner.py +6 -1
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/safety.py +51 -9
- sqlas-2.5.0/sqlas/schema_quality.py +215 -0
- sqlas-2.5.0/sqlas.egg-info/PKG-INFO +364 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas.egg-info/SOURCES.txt +9 -1
- {sqlas-1.3.0 → sqlas-2.5.0}/tests/test_execute_fn.py +10 -5
- sqlas-2.5.0/tests/test_large_schema.py +285 -0
- sqlas-2.5.0/tests/test_v2.py +279 -0
- sqlas-1.3.0/PKG-INFO +0 -376
- sqlas-1.3.0/README.md +0 -340
- sqlas-1.3.0/sqlas/__init__.py +0 -90
- sqlas-1.3.0/sqlas/core.py +0 -273
- sqlas-1.3.0/sqlas/evaluate.py +0 -276
- sqlas-1.3.0/sqlas/production.py +0 -74
- sqlas-1.3.0/sqlas.egg-info/PKG-INFO +0 -376
- {sqlas-1.3.0 → sqlas-2.5.0}/LICENSE +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/setup.cfg +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/context.py +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/correctness.py +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/py.typed +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/response.py +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/visualization.py +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas.egg-info/dependency_links.txt +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas.egg-info/requires.txt +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/sqlas.egg-info/top_level.txt +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/tests/test_context.py +0 -0
- {sqlas-1.3.0 → sqlas-2.5.0}/tests/test_sqlas.py +0 -0
sqlas-2.5.0/PKG-INFO
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sqlas
|
|
3
|
+
Version: 2.5.0
|
|
4
|
+
Summary: SQLAS — SQL Agent Scoring Framework. Production-grade evaluation for Text-to-SQL and Agentic SQL agents with guardrail, visualization, agentic quality, and cache performance metrics.
|
|
5
|
+
Author-email: thepradip <pradiptivhale@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/thepradip/SQLAS
|
|
8
|
+
Project-URL: Documentation, https://github.com/thepradip/SQLAS#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/thepradip/SQLAS
|
|
10
|
+
Project-URL: Changelog, https://github.com/thepradip/SQLAS/blob/main/CHANGELOG.md
|
|
11
|
+
Keywords: sql,agent,evaluation,llm,text-to-sql,ragas,mlflow,benchmark,monitoring
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Database
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: sqlglot>=20.0
|
|
27
|
+
Provides-Extra: mlflow
|
|
28
|
+
Requires-Dist: mlflow>=3.0; extra == "mlflow"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: build; extra == "dev"
|
|
32
|
+
Requires-Dist: twine; extra == "dev"
|
|
33
|
+
Provides-Extra: all
|
|
34
|
+
Requires-Dist: mlflow>=3.0; extra == "all"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# SQLAS — SQL Agent Scoring Framework
|
|
38
|
+
|
|
39
|
+
**A RAGAS-equivalent evaluation library for Text-to-SQL and Agentic SQL agents.**
|
|
40
|
+
|
|
41
|
+
[PyPI version](https://pypi.org/project/sqlas/)
|
|
42
|
+
[PyPI package](https://pypi.org/project/sqlas/)
|
|
43
|
+
[License: MIT](LICENSE)
|
|
44
|
+
[GitHub repository](https://github.com/thepradip/SQLAS)
|
|
45
|
+
|
|
46
|
+
Evaluate SQL agents across 45 metrics — correctness, quality, safety, agentic reasoning, schema retrieval, prompt versioning, and guardrails. Aligned with Spider, BIRD, RAGAS, and MLflow standards.
|
|
47
|
+
|
|
48
|
+
**Author:** [thepradip](https://github.com/thepradip)
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install sqlas # core
|
|
56
|
+
pip install "sqlas[mlflow]" # + MLflow integration
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## What's New in v2.4.0
|
|
62
|
+
|
|
63
|
+
| Feature | Description |
|
|
64
|
+
|---|---|
|
|
65
|
+
| `PromptRegistry` | Version prompts, compare A/B, detect regressions, get data-driven improvement hints |
|
|
66
|
+
| `schema_retrieval_quality` | Measure precision/recall of schema index — did it return the right tables? |
|
|
67
|
+
| `evaluate_correctness/quality/safety` | Three standalone evaluators — run only what you need |
|
|
68
|
+
| `GuardrailPipeline` | Three-stage safety: input → SQL → output (zero LLM cost) |
|
|
69
|
+
| `FeedbackStore` | Thumbs-up stores verified gold SQL, auto-improves `execution_accuracy` |
|
|
70
|
+
| Three-dimension verdict | `PASS` only when correctness + quality + safety ALL pass their thresholds |
|
|
71
|
+
| `result_coverage` | Penalises truncated GROUP BY (score 0.3) — catches big-dataset evaluation blind spots |
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Quick Start
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from sqlas import evaluate
|
|
79
|
+
|
|
80
|
+
def llm_judge(prompt: str) -> str:
|
|
81
|
+
return openai_client.chat.completions.create(
|
|
82
|
+
model="gpt-4o",
|
|
83
|
+
messages=[{"role": "user", "content": prompt}],
|
|
84
|
+
).choices[0].message.content
|
|
85
|
+
|
|
86
|
+
scores = evaluate(
|
|
87
|
+
question = "How many active users are there?",
|
|
88
|
+
generated_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
|
|
89
|
+
gold_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
|
|
90
|
+
db_path = "my_database.db",
|
|
91
|
+
llm_judge = llm_judge,
|
|
92
|
+
response = "There are 1,523 active users.",
|
|
93
|
+
result_data = {"columns": ["COUNT(*)"], "rows": [[1523]],
|
|
94
|
+
"row_count": 1, "execution_time_ms": 2.1},
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
print(scores.overall_score) # 0.95
|
|
98
|
+
print(scores.correctness_score) # 0.88 (v2.2)
|
|
99
|
+
print(scores.quality_score) # 0.93 (v2.2)
|
|
100
|
+
print(scores.safety_composite_score) # 1.00 (v2.2)
|
|
101
|
+
print(scores.verdict) # PASS (v2.2 — AND logic)
|
|
102
|
+
print(scores.summary())
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
---
|
|
106
|
+
|
|
107
|
+
## Three-Dimension Scoring (v2.2)
|
|
108
|
+
|
|
109
|
+
`PASS` requires **all three** dimensions to exceed their thresholds. A safe-but-wrong query can no longer masquerade as a `PASS`.
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from sqlas import evaluate_correctness, evaluate_quality, evaluate_safety
|
|
113
|
+
|
|
114
|
+
# Run only the metrics you need — each is fully independent
|
|
115
|
+
c = evaluate_correctness(question, sql, llm_judge, gold_sql=gold, execute_fn=db)
|
|
116
|
+
q = evaluate_quality(question, sql, llm_judge, response=text, result_data=data)
|
|
117
|
+
s = evaluate_safety(sql, question=question, pii_columns=["email","ssn"])
|
|
118
|
+
|
|
119
|
+
print(c.score, c.verdict) # 0.85 PASS (threshold 0.5)
|
|
120
|
+
print(q.score, q.verdict) # 0.72 PASS (threshold 0.6)
|
|
121
|
+
print(s.score, s.verdict) # 0.45 FAIL (threshold 0.9 — PII detected)
|
|
122
|
+
print(s.issues) # ["PII_ACCESS: 'email'", "PII_ACCESS: 'ssn'"]
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
`evaluate_safety()` makes **zero LLM calls** — pure regex + sqlglot AST.
|
|
126
|
+
|
|
127
|
+
---
|
|
128
|
+
|
|
129
|
+
## Three-Stage Guardrail Pipeline (v2.3)
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from sqlas import GuardrailPipeline
|
|
133
|
+
|
|
134
|
+
pipeline = GuardrailPipeline(pii_columns=["email", "ssn", "password"])
|
|
135
|
+
|
|
136
|
+
# Stage 1 — before sending to LLM
|
|
137
|
+
r = pipeline.check_input("List every user's SSN and password")
|
|
138
|
+
if r.blocked: return {"error": r.block_reason}
|
|
139
|
+
# → BLOCK: DANGEROUS_INPUT: pii_bulk_request
|
|
140
|
+
|
|
141
|
+
# Stage 2 — after SQL generation, before execution
|
|
142
|
+
r = pipeline.check_sql("SELECT email, password FROM users")
|
|
143
|
+
if r.blocked: return {"error": r.block_reason}
|
|
144
|
+
# → score=0.80, issues=["PII_ACCESS: 'email'", "PII_ACCESS: 'password'"]
|
|
145
|
+
|
|
146
|
+
# Stage 3 — before returning response to user
|
|
147
|
+
r = pipeline.check_output(response, result_data)
|
|
148
|
+
if r.blocked: return {"error": r.block_reason}
|
|
149
|
+
# → scans result rows for PII patterns, blocks if found
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Prompt Versioning & Regression Detection (v2.4)
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from sqlas import PromptRegistry
|
|
158
|
+
|
|
159
|
+
registry = PromptRegistry()
|
|
160
|
+
|
|
161
|
+
# Register versions
|
|
162
|
+
registry.register("You are a SQL analyst...", version_id="v1", description="baseline")
|
|
163
|
+
registry.register("...Only cite exact numbers from the SQL result.", version_id="v2", description="grounding fix")
|
|
164
|
+
|
|
165
|
+
# Record scores after each evaluation
|
|
166
|
+
scores = evaluate(...)
|
|
167
|
+
registry.record("v2", scores)
|
|
168
|
+
|
|
169
|
+
# Compare versions
|
|
170
|
+
comp = registry.compare("v1", "v2")
|
|
171
|
+
print(comp["winner"]) # "v2"
|
|
172
|
+
print(comp["delta_overall"]) # +0.09
|
|
173
|
+
print(comp["improvements"]) # [{"metric": "faithfulness", "delta": "+0.27", ...}]
|
|
174
|
+
|
|
175
|
+
# Auto-detect regressions
|
|
176
|
+
status = registry.detect_regression("v2", window=50, threshold=0.05)
|
|
177
|
+
if status["regressed"]:
|
|
178
|
+
for hint in status["hints"]:
|
|
179
|
+
print(f"[{hint['severity']}] {hint['metric']} = {hint['score']}")
|
|
180
|
+
print(f" Fix: {hint['hint']}")
|
|
181
|
+
# [WARNING] faithfulness = 0.61
|
|
182
|
+
# Fix: Add to prompt: 'Only cite exact numbers from the SQL result...'
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Schema Retrieval Quality (v2.4)
|
|
188
|
+
|
|
189
|
+
Measures whether the schema index returned the right tables for a query — not just whether the SQL used valid tables.
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from sqlas import schema_retrieval_quality
|
|
193
|
+
|
|
194
|
+
score, details = schema_retrieval_quality(
|
|
195
|
+
retrieved_tables = schema_index.retrieve(question), # what index returned
|
|
196
|
+
generated_sql = agent_sql,
|
|
197
|
+
gold_tables = test_case.expected_tables, # ground truth
|
|
198
|
+
)
|
|
199
|
+
|
|
200
|
+
print(details["precision"]) # 0.50 — 2 of 4 retrieved tables were needed
|
|
201
|
+
print(details["recall"]) # 1.00 — both needed tables were retrieved
|
|
202
|
+
print(details["irrelevant"]) # ["lab_results", "medications"]
|
|
203
|
+
print(details["missing"]) # [] — no JOIN table was dropped
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Feedback Loop (v2.3)
|
|
209
|
+
|
|
210
|
+
Thumbs-up feedback stores verified gold SQL — future evaluations of the same question use it automatically.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from sqlas import FeedbackStore, FeedbackEntry
|
|
214
|
+
|
|
215
|
+
store = FeedbackStore()
|
|
216
|
+
|
|
217
|
+
# User gives thumbs up → store as gold SQL
|
|
218
|
+
store.store(FeedbackEntry(
|
|
219
|
+
question = "How many active users?",
|
|
220
|
+
sql = "SELECT COUNT(*) FROM users WHERE status = 'active'",
|
|
221
|
+
is_correct = True,
|
|
222
|
+
score = scores.overall_score,
|
|
223
|
+
))
|
|
224
|
+
|
|
225
|
+
# Next evaluation auto-uses stored gold SQL
|
|
226
|
+
c = evaluate_correctness(question, agent_sql, llm_judge, feedback_store=store)
|
|
227
|
+
# execution_accuracy is now verified (1.0) instead of unverified (0.5)
|
|
228
|
+
print(c.details["gold_sql_source"]) # "feedback_store"
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## Any Database (v2.1)
|
|
234
|
+
|
|
235
|
+
```python
|
|
236
|
+
from sqlas import build_schema_info, run_suite
|
|
237
|
+
|
|
238
|
+
# Auto-extract schema from any database
|
|
239
|
+
tables, columns = build_schema_info(db_path="my.db") # SQLite
|
|
240
|
+
tables, columns = build_schema_info(execute_fn=pg_execute_fn) # PostgreSQL / Snowflake / BigQuery
|
|
241
|
+
|
|
242
|
+
results = run_suite(
|
|
243
|
+
test_cases = test_cases,
|
|
244
|
+
agent_fn = my_agent,
|
|
245
|
+
llm_judge = llm_judge,
|
|
246
|
+
execute_fn = execute_fn,
|
|
247
|
+
valid_tables = tables, # 100+ tables — no problem
|
|
248
|
+
valid_columns = columns,
|
|
249
|
+
)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## Run a Test Suite
|
|
255
|
+
|
|
256
|
+
```python
|
|
257
|
+
from sqlas import run_suite, TestCase
|
|
258
|
+
|
|
259
|
+
test_cases = [
|
|
260
|
+
TestCase(question="How many users signed up this month?",
|
|
261
|
+
gold_sql="SELECT COUNT(*) FROM users WHERE created_at >= '2026-03-01'",
|
|
262
|
+
expected_tables=["users"], category="easy"),
|
|
263
|
+
TestCase(question="Average order value by country",
|
|
264
|
+
gold_sql="SELECT country, AVG(total) FROM orders GROUP BY country",
|
|
265
|
+
expected_tables=["orders"], category="medium"),
|
|
266
|
+
]
|
|
267
|
+
|
|
268
|
+
def my_agent(question: str) -> dict:
|
|
269
|
+
sql = generate_sql(question)
|
|
270
|
+
return {"sql": sql, "response": narrate(sql), "data": execute(sql)}
|
|
271
|
+
|
|
272
|
+
results = run_suite(
|
|
273
|
+
test_cases = test_cases,
|
|
274
|
+
agent_fn = my_agent,
|
|
275
|
+
llm_judge = llm_judge,
|
|
276
|
+
execute_fn = execute_fn,
|
|
277
|
+
pass_threshold = 0.6,
|
|
278
|
+
verbose = True,
|
|
279
|
+
)
|
|
280
|
+
print(results["summary"]["overall_score"])
|
|
281
|
+
print(results["summary"]["by_category"])
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
---
|
|
285
|
+
|
|
286
|
+
## Weight Profiles
|
|
287
|
+
|
|
288
|
+
| Profile | Metrics | Best for |
|
|
289
|
+
|---|---|---|
|
|
290
|
+
| `WEIGHTS` | 15 | Standard NL→SQL pipeline |
|
|
291
|
+
| `WEIGHTS_V2` | 20 | + RAGAS context quality |
|
|
292
|
+
| `WEIGHTS_V3` | 30 | + Guardrails + visualization |
|
|
293
|
+
| `WEIGHTS_V4` | 28 | + Agentic quality — ReAct agents |
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
## RAGAS Mapping
|
|
298
|
+
|
|
299
|
+
| RAGAS | SQLAS | Notes |
|
|
300
|
+
|---|---|---|
|
|
301
|
+
| Faithfulness | `faithfulness` | Claims grounded in SQL result |
|
|
302
|
+
| Answer Relevance | `answer_relevance` | Answers the question |
|
|
303
|
+
| Answer Correctness | `execution_accuracy` | SQL returns correct results |
|
|
304
|
+
| Context Precision | `context_precision` | Right schema elements used |
|
|
305
|
+
| Context Recall | `context_recall` | All required schema elements present |
|
|
306
|
+
| Noise Sensitivity | `noise_robustness` | Irrelevant schema ignored |
|
|
307
|
+
| — | `schema_retrieval_quality` | Did the index return the right tables? |
|
|
308
|
+
| — | `result_coverage` | Truncated GROUP BY detection |
|
|
309
|
+
| — | `agentic_score` | ReAct planning quality |
|
|
310
|
+
|
|
311
|
+
---
|
|
312
|
+
|
|
313
|
+
## LLM-Agnostic Judge
|
|
314
|
+
|
|
315
|
+
```python
|
|
316
|
+
# OpenAI
|
|
317
|
+
def judge(p): return openai.chat.completions.create(model="gpt-4o",
|
|
318
|
+
messages=[{"role":"user","content":p}]).choices[0].message.content
|
|
319
|
+
|
|
320
|
+
# Anthropic
|
|
321
|
+
def judge(p): return anthropic.messages.create(model="claude-opus-4-7",
|
|
322
|
+
max_tokens=500, messages=[{"role":"user","content":p}]).content[0].text
|
|
323
|
+
|
|
324
|
+
# Ollama (local, free)
|
|
325
|
+
def judge(p): return requests.post("http://localhost:11434/api/generate",
|
|
326
|
+
json={"model":"llama3","prompt":p,"stream":False}).json()["response"]
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
---
|
|
330
|
+
|
|
331
|
+
## Changelog
|
|
332
|
+
|
|
333
|
+
### v2.4.0
|
|
334
|
+
- `PromptRegistry` — version prompts, compare A/B, detect regressions, get improvement hints
|
|
335
|
+
- `schema_retrieval_quality` — precision/recall/F1 for schema index evaluation
|
|
336
|
+
- `prompt_id` + `schema_retrieval_*` fields on `SQLASScores`
|
|
337
|
+
|
|
338
|
+
### v2.3.0
|
|
339
|
+
- `GuardrailPipeline` — 3-stage safety: `check_input`, `check_sql`, `check_output`
|
|
340
|
+
- `FeedbackStore` + `FeedbackEntry` — verified gold SQL from user thumbs-up
|
|
341
|
+
- `evaluate_correctness/quality/safety` — standalone metric evaluators
|
|
342
|
+
|
|
343
|
+
### v2.2.0
|
|
344
|
+
- Three-dimension scoring: `correctness_score`, `quality_score`, `safety_composite_score`
|
|
345
|
+
- `verdict` — AND logic: `PASS` only when all three pass thresholds
|
|
346
|
+
- `CorrectnessResult`, `QualityResult`, `SafetyResult` dataclasses
|
|
347
|
+
|
|
348
|
+
### v2.1.0
|
|
349
|
+
- `build_schema_info()` — auto-extract schema from any DB
|
|
350
|
+
- `result_coverage` — truncation-aware GROUP BY penalty
|
|
351
|
+
- `execution_accuracy` capped at 0.5 without gold SQL (was incorrectly 1.0)
|
|
352
|
+
- 100+ table support with focused schema context
|
|
353
|
+
|
|
354
|
+
### v2.0.0
|
|
355
|
+
- Agentic quality: `steps_efficiency`, `schema_grounding`, `planning_quality`, `agentic_score`
|
|
356
|
+
- Cache metrics: `cache_hit_score`, `tokens_saved_score`, `few_shot_score`
|
|
357
|
+
- `WEIGHTS_V4` — 28-metric profile with 10% agentic dimension
|
|
358
|
+
- `read_only_compliance` upgraded to sqlglot AST
|
|
359
|
+
|
|
360
|
+
---
|
|
361
|
+
|
|
362
|
+
## License
|
|
363
|
+
|
|
364
|
+
MIT — [thepradip](https://github.com/thepradip) · [pypi.org/project/sqlas](https://pypi.org/project/sqlas/)
|
sqlas-2.5.0/README.md
ADDED
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# SQLAS — SQL Agent Scoring Framework
|
|
2
|
+
|
|
3
|
+
**A RAGAS-equivalent evaluation library for Text-to-SQL and Agentic SQL agents.**
|
|
4
|
+
|
|
5
|
+
[PyPI version](https://pypi.org/project/sqlas/)
|
|
6
|
+
[PyPI package](https://pypi.org/project/sqlas/)
|
|
7
|
+
[License: MIT](LICENSE)
|
|
8
|
+
[GitHub repository](https://github.com/thepradip/SQLAS)
|
|
9
|
+
|
|
10
|
+
Evaluate SQL agents across 45 metrics — correctness, quality, safety, agentic reasoning, schema retrieval, prompt versioning, and guardrails. Aligned with Spider, BIRD, RAGAS, and MLflow standards.
|
|
11
|
+
|
|
12
|
+
**Author:** [thepradip](https://github.com/thepradip)
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install sqlas # core
|
|
20
|
+
pip install "sqlas[mlflow]" # + MLflow integration
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## What's New in v2.4.0
|
|
26
|
+
|
|
27
|
+
| Feature | Description |
|
|
28
|
+
|---|---|
|
|
29
|
+
| `PromptRegistry` | Version prompts, compare A/B, detect regressions, get data-driven improvement hints |
|
|
30
|
+
| `schema_retrieval_quality` | Measure precision/recall of schema index — did it return the right tables? |
|
|
31
|
+
| `evaluate_correctness/quality/safety` | Three standalone evaluators — run only what you need |
|
|
32
|
+
| `GuardrailPipeline` | Three-stage safety: input → SQL → output (zero LLM cost) |
|
|
33
|
+
| `FeedbackStore` | Thumbs-up stores verified gold SQL, auto-improves `execution_accuracy` |
|
|
34
|
+
| Three-dimension verdict | `PASS` only when correctness + quality + safety ALL pass their thresholds |
|
|
35
|
+
| `result_coverage` | Penalises truncated GROUP BY (score 0.3) — catches big-dataset evaluation blind spots |
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from sqlas import evaluate
|
|
43
|
+
|
|
44
|
+
def llm_judge(prompt: str) -> str:
|
|
45
|
+
return openai_client.chat.completions.create(
|
|
46
|
+
model="gpt-4o",
|
|
47
|
+
messages=[{"role": "user", "content": prompt}],
|
|
48
|
+
).choices[0].message.content
|
|
49
|
+
|
|
50
|
+
scores = evaluate(
|
|
51
|
+
question = "How many active users are there?",
|
|
52
|
+
generated_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
|
|
53
|
+
gold_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
|
|
54
|
+
db_path = "my_database.db",
|
|
55
|
+
llm_judge = llm_judge,
|
|
56
|
+
response = "There are 1,523 active users.",
|
|
57
|
+
result_data = {"columns": ["COUNT(*)"], "rows": [[1523]],
|
|
58
|
+
"row_count": 1, "execution_time_ms": 2.1},
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
print(scores.overall_score) # 0.95
|
|
62
|
+
print(scores.correctness_score) # 0.88 (v2.2)
|
|
63
|
+
print(scores.quality_score) # 0.93 (v2.2)
|
|
64
|
+
print(scores.safety_composite_score) # 1.00 (v2.2)
|
|
65
|
+
print(scores.verdict) # PASS (v2.2 — AND logic)
|
|
66
|
+
print(scores.summary())
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Three-Dimension Scoring (v2.2)
|
|
72
|
+
|
|
73
|
+
`PASS` requires **all three** dimensions to exceed their thresholds. A safe-but-wrong query can no longer masquerade as a `PASS`.
|
|
74
|
+
|
|
75
|
+
```python
|
|
76
|
+
from sqlas import evaluate_correctness, evaluate_quality, evaluate_safety
|
|
77
|
+
|
|
78
|
+
# Run only the metrics you need — each is fully independent
|
|
79
|
+
c = evaluate_correctness(question, sql, llm_judge, gold_sql=gold, execute_fn=db)
|
|
80
|
+
q = evaluate_quality(question, sql, llm_judge, response=text, result_data=data)
|
|
81
|
+
s = evaluate_safety(sql, question=question, pii_columns=["email","ssn"])
|
|
82
|
+
|
|
83
|
+
print(c.score, c.verdict) # 0.85 PASS (threshold 0.5)
|
|
84
|
+
print(q.score, q.verdict) # 0.72 PASS (threshold 0.6)
|
|
85
|
+
print(s.score, s.verdict) # 0.45 FAIL (threshold 0.9 — PII detected)
|
|
86
|
+
print(s.issues) # ["PII_ACCESS: 'email'", "PII_ACCESS: 'ssn'"]
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
`evaluate_safety()` makes **zero LLM calls** — pure regex + sqlglot AST.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Three-Stage Guardrail Pipeline (v2.3)
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from sqlas import GuardrailPipeline
|
|
97
|
+
|
|
98
|
+
pipeline = GuardrailPipeline(pii_columns=["email", "ssn", "password"])
|
|
99
|
+
|
|
100
|
+
# Stage 1 — before sending to LLM
|
|
101
|
+
r = pipeline.check_input("List every user's SSN and password")
|
|
102
|
+
if r.blocked: return {"error": r.block_reason}
|
|
103
|
+
# → BLOCK: DANGEROUS_INPUT: pii_bulk_request
|
|
104
|
+
|
|
105
|
+
# Stage 2 — after SQL generation, before execution
|
|
106
|
+
r = pipeline.check_sql("SELECT email, password FROM users")
|
|
107
|
+
if r.blocked: return {"error": r.block_reason}
|
|
108
|
+
# → score=0.80, issues=["PII_ACCESS: 'email'", "PII_ACCESS: 'password'"]
|
|
109
|
+
|
|
110
|
+
# Stage 3 — before returning response to user
|
|
111
|
+
r = pipeline.check_output(response, result_data)
|
|
112
|
+
if r.blocked: return {"error": r.block_reason}
|
|
113
|
+
# → scans result rows for PII patterns, blocks if found
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Prompt Versioning & Regression Detection (v2.4)
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
from sqlas import PromptRegistry
|
|
122
|
+
|
|
123
|
+
registry = PromptRegistry()
|
|
124
|
+
|
|
125
|
+
# Register versions
|
|
126
|
+
registry.register("You are a SQL analyst...", version_id="v1", description="baseline")
|
|
127
|
+
registry.register("...Only cite exact numbers from the SQL result.", version_id="v2", description="grounding fix")
|
|
128
|
+
|
|
129
|
+
# Record scores after each evaluation
|
|
130
|
+
scores = evaluate(...)
|
|
131
|
+
registry.record("v2", scores)
|
|
132
|
+
|
|
133
|
+
# Compare versions
|
|
134
|
+
comp = registry.compare("v1", "v2")
|
|
135
|
+
print(comp["winner"]) # "v2"
|
|
136
|
+
print(comp["delta_overall"]) # +0.09
|
|
137
|
+
print(comp["improvements"]) # [{"metric": "faithfulness", "delta": "+0.27", ...}]
|
|
138
|
+
|
|
139
|
+
# Auto-detect regressions
|
|
140
|
+
status = registry.detect_regression("v2", window=50, threshold=0.05)
|
|
141
|
+
if status["regressed"]:
|
|
142
|
+
for hint in status["hints"]:
|
|
143
|
+
print(f"[{hint['severity']}] {hint['metric']} = {hint['score']}")
|
|
144
|
+
print(f" Fix: {hint['hint']}")
|
|
145
|
+
# [WARNING] faithfulness = 0.61
|
|
146
|
+
# Fix: Add to prompt: 'Only cite exact numbers from the SQL result...'
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Schema Retrieval Quality (v2.4)
|
|
152
|
+
|
|
153
|
+
Measures whether the schema index returned the right tables for a query — not just whether the SQL used valid tables.
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from sqlas import schema_retrieval_quality
|
|
157
|
+
|
|
158
|
+
score, details = schema_retrieval_quality(
|
|
159
|
+
retrieved_tables = schema_index.retrieve(question), # what index returned
|
|
160
|
+
generated_sql = agent_sql,
|
|
161
|
+
gold_tables = test_case.expected_tables, # ground truth
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
print(details["precision"]) # 0.50 — 2 of 4 retrieved tables were needed
|
|
165
|
+
print(details["recall"]) # 1.00 — both needed tables were retrieved
|
|
166
|
+
print(details["irrelevant"]) # ["lab_results", "medications"]
|
|
167
|
+
print(details["missing"]) # [] — no JOIN table was dropped
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Feedback Loop (v2.3)
|
|
173
|
+
|
|
174
|
+
Thumbs-up feedback stores verified gold SQL — future evaluations of the same question use it automatically.
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from sqlas import FeedbackStore, FeedbackEntry
|
|
178
|
+
|
|
179
|
+
store = FeedbackStore()
|
|
180
|
+
|
|
181
|
+
# User gives thumbs up → store as gold SQL
|
|
182
|
+
store.store(FeedbackEntry(
|
|
183
|
+
question = "How many active users?",
|
|
184
|
+
sql = "SELECT COUNT(*) FROM users WHERE status = 'active'",
|
|
185
|
+
is_correct = True,
|
|
186
|
+
score = scores.overall_score,
|
|
187
|
+
))
|
|
188
|
+
|
|
189
|
+
# Next evaluation auto-uses stored gold SQL
|
|
190
|
+
c = evaluate_correctness(question, agent_sql, llm_judge, feedback_store=store)
|
|
191
|
+
# execution_accuracy is now verified (1.0) instead of unverified (0.5)
|
|
192
|
+
print(c.details["gold_sql_source"]) # "feedback_store"
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Any Database (v2.1)
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
from sqlas import build_schema_info, run_suite
|
|
201
|
+
|
|
202
|
+
# Auto-extract schema from any database
|
|
203
|
+
tables, columns = build_schema_info(db_path="my.db") # SQLite
|
|
204
|
+
tables, columns = build_schema_info(execute_fn=pg_execute_fn) # PostgreSQL / Snowflake / BigQuery
|
|
205
|
+
|
|
206
|
+
results = run_suite(
|
|
207
|
+
test_cases = test_cases,
|
|
208
|
+
agent_fn = my_agent,
|
|
209
|
+
llm_judge = llm_judge,
|
|
210
|
+
execute_fn = execute_fn,
|
|
211
|
+
valid_tables = tables, # 100+ tables — no problem
|
|
212
|
+
valid_columns = columns,
|
|
213
|
+
)
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## Run a Test Suite
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from sqlas import run_suite, TestCase
|
|
222
|
+
|
|
223
|
+
test_cases = [
|
|
224
|
+
TestCase(question="How many users signed up this month?",
|
|
225
|
+
gold_sql="SELECT COUNT(*) FROM users WHERE created_at >= '2026-03-01'",
|
|
226
|
+
expected_tables=["users"], category="easy"),
|
|
227
|
+
TestCase(question="Average order value by country",
|
|
228
|
+
gold_sql="SELECT country, AVG(total) FROM orders GROUP BY country",
|
|
229
|
+
expected_tables=["orders"], category="medium"),
|
|
230
|
+
]
|
|
231
|
+
|
|
232
|
+
def my_agent(question: str) -> dict:
|
|
233
|
+
sql = generate_sql(question)
|
|
234
|
+
return {"sql": sql, "response": narrate(sql), "data": execute(sql)}
|
|
235
|
+
|
|
236
|
+
results = run_suite(
|
|
237
|
+
test_cases = test_cases,
|
|
238
|
+
agent_fn = my_agent,
|
|
239
|
+
llm_judge = llm_judge,
|
|
240
|
+
execute_fn = execute_fn,
|
|
241
|
+
pass_threshold = 0.6,
|
|
242
|
+
verbose = True,
|
|
243
|
+
)
|
|
244
|
+
print(results["summary"]["overall_score"])
|
|
245
|
+
print(results["summary"]["by_category"])
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
---
|
|
249
|
+
|
|
250
|
+
## Weight Profiles
|
|
251
|
+
|
|
252
|
+
| Profile | Metrics | Best for |
|
|
253
|
+
|---|---|---|
|
|
254
|
+
| `WEIGHTS` | 15 | Standard NL→SQL pipeline |
|
|
255
|
+
| `WEIGHTS_V2` | 20 | + RAGAS context quality |
|
|
256
|
+
| `WEIGHTS_V3` | 30 | + Guardrails + visualization |
|
|
257
|
+
| `WEIGHTS_V4` | 28 | + Agentic quality — ReAct agents |
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
## RAGAS Mapping
|
|
262
|
+
|
|
263
|
+
| RAGAS | SQLAS | Notes |
|
|
264
|
+
|---|---|---|
|
|
265
|
+
| Faithfulness | `faithfulness` | Claims grounded in SQL result |
|
|
266
|
+
| Answer Relevance | `answer_relevance` | Answers the question |
|
|
267
|
+
| Answer Correctness | `execution_accuracy` | SQL returns correct results |
|
|
268
|
+
| Context Precision | `context_precision` | Right schema elements used |
|
|
269
|
+
| Context Recall | `context_recall` | All required schema elements present |
|
|
270
|
+
| Noise Sensitivity | `noise_robustness` | Irrelevant schema ignored |
|
|
271
|
+
| — | `schema_retrieval_quality` | Did the index return the right tables? |
|
|
272
|
+
| — | `result_coverage` | Truncated GROUP BY detection |
|
|
273
|
+
| — | `agentic_score` | ReAct planning quality |
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## LLM-Agnostic Judge
|
|
278
|
+
|
|
279
|
+
```python
|
|
280
|
+
# OpenAI
|
|
281
|
+
def judge(p): return openai.chat.completions.create(model="gpt-4o",
|
|
282
|
+
messages=[{"role":"user","content":p}]).choices[0].message.content
|
|
283
|
+
|
|
284
|
+
# Anthropic
|
|
285
|
+
def judge(p): return anthropic.messages.create(model="claude-opus-4-7",
|
|
286
|
+
max_tokens=500, messages=[{"role":"user","content":p}]).content[0].text
|
|
287
|
+
|
|
288
|
+
# Ollama (local, free)
|
|
289
|
+
def judge(p): return requests.post("http://localhost:11434/api/generate",
|
|
290
|
+
json={"model":"llama3","prompt":p,"stream":False}).json()["response"]
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## Changelog
|
|
296
|
+
|
|
297
|
+
### v2.4.0
|
|
298
|
+
- `PromptRegistry` — version prompts, compare A/B, detect regressions, get improvement hints
|
|
299
|
+
- `schema_retrieval_quality` — precision/recall/F1 for schema index evaluation
|
|
300
|
+
- `prompt_id` + `schema_retrieval_*` fields on `SQLASScores`
|
|
301
|
+
|
|
302
|
+
### v2.3.0
|
|
303
|
+
- `GuardrailPipeline` — 3-stage safety: `check_input`, `check_sql`, `check_output`
|
|
304
|
+
- `FeedbackStore` + `FeedbackEntry` — verified gold SQL from user thumbs-up
|
|
305
|
+
- `evaluate_correctness/quality/safety` — standalone metric evaluators
|
|
306
|
+
|
|
307
|
+
### v2.2.0
|
|
308
|
+
- Three-dimension scoring: `correctness_score`, `quality_score`, `safety_composite_score`
|
|
309
|
+
- `verdict` — AND logic: `PASS` only when all three pass thresholds
|
|
310
|
+
- `CorrectnessResult`, `QualityResult`, `SafetyResult` dataclasses
|
|
311
|
+
|
|
312
|
+
### v2.1.0
|
|
313
|
+
- `build_schema_info()` — auto-extract schema from any DB
|
|
314
|
+
- `result_coverage` — truncation-aware GROUP BY penalty
|
|
315
|
+
- `execution_accuracy` capped at 0.5 without gold SQL (was incorrectly 1.0)
|
|
316
|
+
- 100+ table support with focused schema context
|
|
317
|
+
|
|
318
|
+
### v2.0.0
|
|
319
|
+
- Agentic quality: `steps_efficiency`, `schema_grounding`, `planning_quality`, `agentic_score`
|
|
320
|
+
- Cache metrics: `cache_hit_score`, `tokens_saved_score`, `few_shot_score`
|
|
321
|
+
- `WEIGHTS_V4` — 28-metric profile with 10% agentic dimension
|
|
322
|
+
- `read_only_compliance` upgraded to sqlglot AST
|
|
323
|
+
|
|
324
|
+
---
|
|
325
|
+
|
|
326
|
+
## License
|
|
327
|
+
|
|
328
|
+
MIT — [thepradip](https://github.com/thepradip) · [pypi.org/project/sqlas](https://pypi.org/project/sqlas/)
|
|
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sqlas"
|
|
7
|
-
version = "1.3.0"
|
|
8
|
-
description = "SQLAS — SQL Agent Scoring Framework.
|
|
7
|
+
version = "2.5.0"
|
|
8
|
+
description = "SQLAS — SQL Agent Scoring Framework. Production-grade evaluation for Text-to-SQL and Agentic SQL agents with guardrail, visualization, agentic quality, and cache performance metrics."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
11
11
|
authors = [{name = "thepradip", email = "pradiptivhale@gmail.com"}]
|