sqlas 2.0.0__tar.gz → 2.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. sqlas-2.6.0/PKG-INFO +378 -0
  2. sqlas-2.6.0/README.md +328 -0
  3. {sqlas-2.0.0 → sqlas-2.6.0}/pyproject.toml +9 -4
  4. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/__init__.py +24 -5
  5. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/agentic.py +104 -0
  6. sqlas-2.6.0/sqlas/benchmarks.py +480 -0
  7. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/core.py +214 -5
  8. sqlas-2.6.0/sqlas/evaluate.py +822 -0
  9. sqlas-2.6.0/sqlas/feedback.py +177 -0
  10. sqlas-2.6.0/sqlas/guardrails.py +381 -0
  11. sqlas-2.6.0/sqlas/integrations.py +274 -0
  12. sqlas-2.6.0/sqlas/production.py +153 -0
  13. sqlas-2.6.0/sqlas/prompt_registry.py +378 -0
  14. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/quality.py +2 -1
  15. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/runner.py +6 -1
  16. sqlas-2.6.0/sqlas/schema_quality.py +215 -0
  17. sqlas-2.6.0/sqlas/ui.py +572 -0
  18. sqlas-2.6.0/sqlas.egg-info/PKG-INFO +378 -0
  19. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas.egg-info/SOURCES.txt +8 -0
  20. sqlas-2.6.0/sqlas.egg-info/requires.txt +31 -0
  21. sqlas-2.6.0/sqlas.egg-info/top_level.txt +2 -0
  22. {sqlas-2.0.0 → sqlas-2.6.0}/tests/test_execute_fn.py +5 -2
  23. sqlas-2.6.0/tests/test_large_schema.py +285 -0
  24. sqlas-2.0.0/PKG-INFO +0 -376
  25. sqlas-2.0.0/README.md +0 -340
  26. sqlas-2.0.0/sqlas/evaluate.py +0 -318
  27. sqlas-2.0.0/sqlas/production.py +0 -74
  28. sqlas-2.0.0/sqlas.egg-info/PKG-INFO +0 -376
  29. sqlas-2.0.0/sqlas.egg-info/requires.txt +0 -12
  30. sqlas-2.0.0/sqlas.egg-info/top_level.txt +0 -1
  31. {sqlas-2.0.0 → sqlas-2.6.0}/LICENSE +0 -0
  32. {sqlas-2.0.0 → sqlas-2.6.0}/setup.cfg +0 -0
  33. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/cache.py +0 -0
  34. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/context.py +0 -0
  35. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/correctness.py +0 -0
  36. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/py.typed +0 -0
  37. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/response.py +0 -0
  38. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/safety.py +0 -0
  39. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas/visualization.py +0 -0
  40. {sqlas-2.0.0 → sqlas-2.6.0}/sqlas.egg-info/dependency_links.txt +0 -0
  41. {sqlas-2.0.0 → sqlas-2.6.0}/tests/test_context.py +0 -0
  42. {sqlas-2.0.0 → sqlas-2.6.0}/tests/test_sqlas.py +0 -0
  43. {sqlas-2.0.0 → sqlas-2.6.0}/tests/test_v2.py +0 -0
sqlas-2.6.0/PKG-INFO ADDED
@@ -0,0 +1,378 @@
1
+ Metadata-Version: 2.4
2
+ Name: sqlas
3
+ Version: 2.6.0
4
+ Summary: SQLAS — SQL Agent Scoring Framework. Production-grade evaluation for Text-to-SQL and Agentic SQL agents with guardrail, visualization, agentic quality, and cache performance metrics.
5
+ Author-email: thepradip <pradiptivhale@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/thepradip/SQLAS
8
+ Project-URL: Documentation, https://github.com/thepradip/SQLAS#readme
9
+ Project-URL: Repository, https://github.com/thepradip/SQLAS
10
+ Project-URL: Changelog, https://github.com/thepradip/SQLAS/blob/main/CHANGELOG.md
11
+ Keywords: sql,agent,evaluation,llm,text-to-sql,ragas,mlflow,benchmark,monitoring
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Database
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.10
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: sqlglot>=20.0
27
+ Provides-Extra: mlflow
28
+ Requires-Dist: mlflow>=3.0; extra == "mlflow"
29
+ Provides-Extra: wandb
30
+ Requires-Dist: wandb>=0.16; extra == "wandb"
31
+ Provides-Extra: langsmith
32
+ Requires-Dist: langsmith>=0.1; extra == "langsmith"
33
+ Provides-Extra: ui
34
+ Requires-Dist: streamlit>=1.30; extra == "ui"
35
+ Requires-Dist: pandas>=2.0; extra == "ui"
36
+ Provides-Extra: benchmarks
37
+ Provides-Extra: prometheus
38
+ Requires-Dist: prometheus-client>=0.19; extra == "prometheus"
39
+ Provides-Extra: dev
40
+ Requires-Dist: pytest>=7.0; extra == "dev"
41
+ Requires-Dist: build; extra == "dev"
42
+ Requires-Dist: twine; extra == "dev"
43
+ Provides-Extra: all
44
+ Requires-Dist: mlflow>=3.0; extra == "all"
45
+ Requires-Dist: wandb>=0.16; extra == "all"
46
+ Requires-Dist: langsmith>=0.1; extra == "all"
47
+ Requires-Dist: streamlit>=1.30; extra == "all"
48
+ Requires-Dist: pandas>=2.0; extra == "all"
49
+ Dynamic: license-file
50
+
51
+ # SQLAS — SQL Agent Scoring Framework
52
+
53
+ **A RAGAS-equivalent evaluation library for Text-to-SQL and Agentic SQL agents.**
54
+
55
+ [![PyPI](https://img.shields.io/pypi/v/sqlas)](https://pypi.org/project/sqlas/)
56
+ [![Python](https://img.shields.io/pypi/pyversions/sqlas)](https://pypi.org/project/sqlas/)
57
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
58
+ [![Tests](https://img.shields.io/badge/tests-140%20passing-brightgreen)](https://github.com/thepradip/SQLAS)
59
+
60
+ Evaluate SQL agents across 45 metrics — correctness, quality, safety, agentic reasoning, schema retrieval, prompt versioning, and guardrails. Aligned with Spider, BIRD, RAGAS, and MLflow standards.
61
+
62
+ **Author:** [thepradip](https://github.com/thepradip)
63
+
64
+ ---
65
+
66
+ ## Install
67
+
68
+ ```bash
69
+ pip install sqlas # core
70
+ pip install "sqlas[mlflow]" # + MLflow integration
71
+ ```
72
+
73
+ ---
74
+
75
+ ## What's New (v2.1 – v2.4)
76
+
77
+ | Feature | Description |
78
+ |---|---|
79
+ | `PromptRegistry` | Version prompts, compare A/B, detect regressions, get data-driven improvement hints |
80
+ | `schema_retrieval_quality` | Measure precision/recall of schema index — did it return the right tables? |
81
+ | `evaluate_correctness/quality/safety` | Three standalone evaluators — run only what you need |
82
+ | `GuardrailPipeline` | Three-stage safety: input → SQL → output (zero LLM cost) |
83
+ | `FeedbackStore` | Thumbs-up stores verified gold SQL, auto-improves `execution_accuracy` |
84
+ | Three-dimension verdict | `PASS` only when correctness + quality + safety ALL pass their thresholds |
85
+ | `result_coverage` | Penalises truncated GROUP BY (score 0.3) — catches big-dataset evaluation blind spots |
86
+
87
+ ---
88
+
89
+ ## Quick Start
90
+
91
+ ```python
92
+ from sqlas import evaluate
93
+
94
+ def llm_judge(prompt: str) -> str:
95
+     return openai_client.chat.completions.create(
96
+         model="gpt-4o",
97
+         messages=[{"role": "user", "content": prompt}],
98
+     ).choices[0].message.content
99
+
100
+ scores = evaluate(
101
+ question = "How many active users are there?",
102
+ generated_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
103
+ gold_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
104
+ db_path = "my_database.db",
105
+ llm_judge = llm_judge,
106
+ response = "There are 1,523 active users.",
107
+ result_data = {"columns": ["COUNT(*)"], "rows": [[1523]],
108
+ "row_count": 1, "execution_time_ms": 2.1},
109
+ )
110
+
111
+ print(scores.overall_score) # 0.95
112
+ print(scores.correctness_score) # 0.88 (v2.2)
113
+ print(scores.quality_score) # 0.93 (v2.2)
114
+ print(scores.safety_composite_score) # 1.00 (v2.2)
115
+ print(scores.verdict) # PASS (v2.2 — AND logic)
116
+ print(scores.summary())
117
+ ```
118
+
119
+ ---
120
+
121
+ ## Three-Dimension Scoring (v2.2)
122
+
123
+ `PASS` requires **all three** dimensions to exceed their thresholds. A safe-but-wrong query can no longer masquerade as a `PASS`.
124
+
125
+ ```python
126
+ from sqlas import evaluate_correctness, evaluate_quality, evaluate_safety
127
+
128
+ # Run only the metrics you need — each is fully independent
129
+ c = evaluate_correctness(question, sql, llm_judge, gold_sql=gold, execute_fn=db)
130
+ q = evaluate_quality(question, sql, llm_judge, response=text, result_data=data)
131
+ s = evaluate_safety(sql, question=question, pii_columns=["email","ssn"])
132
+
133
+ print(c.score, c.verdict) # 0.85 PASS (threshold 0.5)
134
+ print(q.score, q.verdict) # 0.72 PASS (threshold 0.6)
135
+ print(s.score, s.verdict) # 0.45 FAIL (threshold 0.9 — PII detected)
136
+ print(s.issues) # ["PII_ACCESS: 'email'", "PII_ACCESS: 'ssn'"]
137
+ ```
138
+
139
+ `evaluate_safety()` makes **zero LLM calls** — pure regex + sqlglot AST.
140
+
141
+ ---
142
+
143
+ ## Three-Stage Guardrail Pipeline (v2.3)
144
+
145
+ ```python
146
+ from sqlas import GuardrailPipeline
147
+
148
+ pipeline = GuardrailPipeline(pii_columns=["email", "ssn", "password"])
149
+
150
+ # Stage 1 — before sending to LLM
151
+ r = pipeline.check_input("List every user's SSN and password")
152
+ if r.blocked: return {"error": r.block_reason}
153
+ # → BLOCK: DANGEROUS_INPUT: pii_bulk_request
154
+
155
+ # Stage 2 — after SQL generation, before execution
156
+ r = pipeline.check_sql("SELECT email, password FROM users")
157
+ if r.blocked: return {"error": r.block_reason}
158
+ # → score=0.80, issues=["PII_ACCESS: 'email'", "PII_ACCESS: 'password'"]
159
+
160
+ # Stage 3 — before returning response to user
161
+ r = pipeline.check_output(response, result_data)
162
+ if r.blocked: return {"error": r.block_reason}
163
+ # → scans result rows for PII patterns, blocks if found
164
+ ```
165
+
166
+ ---
167
+
168
+ ## Prompt Versioning & Regression Detection (v2.4)
169
+
170
+ ```python
171
+ from sqlas import PromptRegistry
172
+
173
+ registry = PromptRegistry()
174
+
175
+ # Register versions
176
+ registry.register("You are a SQL analyst...", version_id="v1", description="baseline")
177
+ registry.register("...Only cite exact numbers from the SQL result.", version_id="v2", description="grounding fix")
178
+
179
+ # Record scores after each evaluation
180
+ scores = evaluate(...)
181
+ registry.record("v2", scores)
182
+
183
+ # Compare versions
184
+ comp = registry.compare("v1", "v2")
185
+ print(comp["winner"]) # "v2"
186
+ print(comp["delta_overall"]) # +0.09
187
+ print(comp["improvements"]) # [{"metric": "faithfulness", "delta": "+0.27", ...}]
188
+
189
+ # Auto-detect regressions
190
+ status = registry.detect_regression("v2", window=50, threshold=0.05)
191
+ if status["regressed"]:
192
+     for hint in status["hints"]:
193
+         print(f"[{hint['severity']}] {hint['metric']} = {hint['score']}")
194
+         print(f" Fix: {hint['hint']}")
195
+ # [WARNING] faithfulness = 0.61
196
+ # Fix: Add to prompt: 'Only cite exact numbers from the SQL result...'
197
+ ```
198
+
199
+ ---
200
+
201
+ ## Schema Retrieval Quality (v2.4)
202
+
203
+ Measures whether the schema index returned the right tables for a query — not just whether the SQL used valid tables.
204
+
205
+ ```python
206
+ from sqlas import schema_retrieval_quality
207
+
208
+ score, details = schema_retrieval_quality(
209
+ retrieved_tables = schema_index.retrieve(question), # what index returned
210
+ generated_sql = agent_sql,
211
+ gold_tables = test_case.expected_tables, # ground truth
212
+ )
213
+
214
+ print(details["precision"]) # 0.50 — 2 of 4 retrieved tables were needed
215
+ print(details["recall"]) # 1.00 — both needed tables were retrieved
216
+ print(details["irrelevant"]) # ["lab_results", "medications"]
217
+ print(details["missing"]) # [] — no JOIN table was dropped
218
+ ```
219
+
220
+ ---
221
+
222
+ ## Feedback Loop (v2.3)
223
+
224
+ Thumbs-up feedback stores verified gold SQL — future evaluations of the same question use it automatically.
225
+
226
+ ```python
227
+ from sqlas import FeedbackStore, FeedbackEntry
228
+
229
+ store = FeedbackStore()
230
+
231
+ # User gives thumbs up → store as gold SQL
232
+ store.store(FeedbackEntry(
233
+ question = "How many active users?",
234
+ sql = "SELECT COUNT(*) FROM users WHERE status = 'active'",
235
+ is_correct = True,
236
+ score = scores.overall_score,
237
+ ))
238
+
239
+ # Next evaluation auto-uses stored gold SQL
240
+ c = evaluate_correctness(question, agent_sql, llm_judge, feedback_store=store)
241
+ # execution_accuracy is now verified against the stored gold SQL (can reach 1.0) instead of being capped at 0.5 as unverified
242
+ print(c.details["gold_sql_source"]) # "feedback_store"
243
+ ```
244
+
245
+ ---
246
+
247
+ ## Any Database (v2.1)
248
+
249
+ ```python
250
+ from sqlas import build_schema_info, run_suite
251
+
252
+ # Auto-extract schema from any database
253
+ tables, columns = build_schema_info(db_path="my.db") # SQLite
254
+ tables, columns = build_schema_info(execute_fn=pg_execute_fn) # PostgreSQL / Snowflake / BigQuery
255
+
256
+ results = run_suite(
257
+ test_cases = test_cases,
258
+ agent_fn = my_agent,
259
+ llm_judge = llm_judge,
260
+ execute_fn = execute_fn,
261
+ valid_tables = tables, # 100+ tables — no problem
262
+ valid_columns = columns,
263
+ )
264
+ ```
265
+
266
+ ---
267
+
268
+ ## Run a Test Suite
269
+
270
+ ```python
271
+ from sqlas import run_suite, TestCase
272
+
273
+ test_cases = [
274
+ TestCase(question="How many users signed up this month?",
275
+ gold_sql="SELECT COUNT(*) FROM users WHERE created_at >= '2026-03-01'",
276
+ expected_tables=["users"], category="easy"),
277
+ TestCase(question="Average order value by country",
278
+ gold_sql="SELECT country, AVG(total) FROM orders GROUP BY country",
279
+ expected_tables=["orders"], category="medium"),
280
+ ]
281
+
282
+ def my_agent(question: str) -> dict:
283
+     sql = generate_sql(question)
284
+     return {"sql": sql, "response": narrate(sql), "data": execute(sql)}
285
+
286
+ results = run_suite(
287
+ test_cases = test_cases,
288
+ agent_fn = my_agent,
289
+ llm_judge = llm_judge,
290
+ execute_fn = execute_fn,
291
+ pass_threshold = 0.6,
292
+ verbose = True,
293
+ )
294
+ print(results["summary"]["overall_score"])
295
+ print(results["summary"]["by_category"])
296
+ ```
297
+
298
+ ---
299
+
300
+ ## Weight Profiles
301
+
302
+ | Profile | Metrics | Best for |
303
+ |---|---|---|
304
+ | `WEIGHTS` | 15 | Standard NL→SQL pipeline |
305
+ | `WEIGHTS_V2` | 20 | + RAGAS context quality |
306
+ | `WEIGHTS_V3` | 30 | + Guardrails + visualization |
307
+ | `WEIGHTS_V4` | 28 | + Agentic quality — ReAct agents |
308
+
309
+ ---
310
+
311
+ ## RAGAS Mapping
312
+
313
+ | RAGAS | SQLAS | Notes |
314
+ |---|---|---|
315
+ | Faithfulness | `faithfulness` | Claims grounded in SQL result |
316
+ | Answer Relevance | `answer_relevance` | Answers the question |
317
+ | Answer Correctness | `execution_accuracy` | SQL returns correct results |
318
+ | Context Precision | `context_precision` | Right schema elements used |
319
+ | Context Recall | `context_recall` | All required schema elements present |
320
+ | Noise Sensitivity | `noise_robustness` | Irrelevant schema ignored |
321
+ | — | `schema_retrieval_quality` | Did the index return the right tables? |
322
+ | — | `result_coverage` | Truncated GROUP BY detection |
323
+ | — | `agentic_score` | ReAct planning quality |
324
+
325
+ ---
326
+
327
+ ## LLM-Agnostic Judge
328
+
329
+ ```python
330
+ # OpenAI
331
+ def judge(p): return openai.chat.completions.create(model="gpt-4o",
332
+ messages=[{"role":"user","content":p}]).choices[0].message.content
333
+
334
+ # Anthropic
335
+ def judge(p): return anthropic.messages.create(model="claude-opus-4-7",
336
+ max_tokens=500, messages=[{"role":"user","content":p}]).content[0].text
337
+
338
+ # Ollama (local, free)
339
+ def judge(p): return requests.post("http://localhost:11434/api/generate",
340
+ json={"model":"llama3","prompt":p,"stream":False}).json()["response"]
341
+ ```
342
+
343
+ ---
344
+
345
+ ## Changelog
346
+
347
+ ### v2.4.0
348
+ - `PromptRegistry` — version prompts, compare A/B, detect regressions, get improvement hints
349
+ - `schema_retrieval_quality` — precision/recall/F1 for schema index evaluation
350
+ - `prompt_id` + `schema_retrieval_*` fields on `SQLASScores`
351
+
352
+ ### v2.3.0
353
+ - `GuardrailPipeline` — 3-stage safety: `check_input`, `check_sql`, `check_output`
354
+ - `FeedbackStore` + `FeedbackEntry` — verified gold SQL from user thumbs-up
355
+ - `evaluate_correctness/quality/safety` — standalone metric evaluators
356
+
357
+ ### v2.2.0
358
+ - Three-dimension scoring: `correctness_score`, `quality_score`, `safety_composite_score`
359
+ - `verdict` — AND logic: `PASS` only when all three pass thresholds
360
+ - `CorrectnessResult`, `QualityResult`, `SafetyResult` dataclasses
361
+
362
+ ### v2.1.0
363
+ - `build_schema_info()` — auto-extract schema from any DB
364
+ - `result_coverage` — truncation-aware GROUP BY penalty
365
+ - `execution_accuracy` capped at 0.5 without gold SQL (was incorrectly 1.0)
366
+ - 100+ table support with focused schema context
367
+
368
+ ### v2.0.0
369
+ - Agentic quality: `steps_efficiency`, `schema_grounding`, `planning_quality`, `agentic_score`
370
+ - Cache metrics: `cache_hit_score`, `tokens_saved_score`, `few_shot_score`
371
+ - `WEIGHTS_V4` — 28-metric profile with 10% agentic dimension
372
+ - `read_only_compliance` upgraded to sqlglot AST
373
+
374
+ ---
375
+
376
+ ## License
377
+
378
+ MIT — [thepradip](https://github.com/thepradip) · [pypi.org/project/sqlas](https://pypi.org/project/sqlas/)
sqlas-2.6.0/README.md ADDED
@@ -0,0 +1,328 @@
1
+ # SQLAS — SQL Agent Scoring Framework
2
+
3
+ **A RAGAS-equivalent evaluation library for Text-to-SQL and Agentic SQL agents.**
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/sqlas)](https://pypi.org/project/sqlas/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/sqlas)](https://pypi.org/project/sqlas/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
8
+ [![Tests](https://img.shields.io/badge/tests-140%20passing-brightgreen)](https://github.com/thepradip/SQLAS)
9
+
10
+ Evaluate SQL agents across 45 metrics — correctness, quality, safety, agentic reasoning, schema retrieval, prompt versioning, and guardrails. Aligned with Spider, BIRD, RAGAS, and MLflow standards.
11
+
12
+ **Author:** [thepradip](https://github.com/thepradip)
13
+
14
+ ---
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install sqlas # core
20
+ pip install "sqlas[mlflow]" # + MLflow integration
21
+ ```
22
+
23
+ ---
24
+
25
+ ## What's New (v2.1 – v2.4)
26
+
27
+ | Feature | Description |
28
+ |---|---|
29
+ | `PromptRegistry` | Version prompts, compare A/B, detect regressions, get data-driven improvement hints |
30
+ | `schema_retrieval_quality` | Measure precision/recall of schema index — did it return the right tables? |
31
+ | `evaluate_correctness/quality/safety` | Three standalone evaluators — run only what you need |
32
+ | `GuardrailPipeline` | Three-stage safety: input → SQL → output (zero LLM cost) |
33
+ | `FeedbackStore` | Thumbs-up stores verified gold SQL, auto-improves `execution_accuracy` |
34
+ | Three-dimension verdict | `PASS` only when correctness + quality + safety ALL pass their thresholds |
35
+ | `result_coverage` | Penalises truncated GROUP BY (score 0.3) — catches big-dataset evaluation blind spots |
36
+
37
+ ---
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from sqlas import evaluate
43
+
44
+ def llm_judge(prompt: str) -> str:
45
+     return openai_client.chat.completions.create(
46
+         model="gpt-4o",
47
+         messages=[{"role": "user", "content": prompt}],
48
+     ).choices[0].message.content
49
+
50
+ scores = evaluate(
51
+ question = "How many active users are there?",
52
+ generated_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
53
+ gold_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
54
+ db_path = "my_database.db",
55
+ llm_judge = llm_judge,
56
+ response = "There are 1,523 active users.",
57
+ result_data = {"columns": ["COUNT(*)"], "rows": [[1523]],
58
+ "row_count": 1, "execution_time_ms": 2.1},
59
+ )
60
+
61
+ print(scores.overall_score) # 0.95
62
+ print(scores.correctness_score) # 0.88 (v2.2)
63
+ print(scores.quality_score) # 0.93 (v2.2)
64
+ print(scores.safety_composite_score) # 1.00 (v2.2)
65
+ print(scores.verdict) # PASS (v2.2 — AND logic)
66
+ print(scores.summary())
67
+ ```
68
+
69
+ ---
70
+
71
+ ## Three-Dimension Scoring (v2.2)
72
+
73
+ `PASS` requires **all three** dimensions to exceed their thresholds. A safe-but-wrong query can no longer masquerade as a `PASS`.
74
+
75
+ ```python
76
+ from sqlas import evaluate_correctness, evaluate_quality, evaluate_safety
77
+
78
+ # Run only the metrics you need — each is fully independent
79
+ c = evaluate_correctness(question, sql, llm_judge, gold_sql=gold, execute_fn=db)
80
+ q = evaluate_quality(question, sql, llm_judge, response=text, result_data=data)
81
+ s = evaluate_safety(sql, question=question, pii_columns=["email","ssn"])
82
+
83
+ print(c.score, c.verdict) # 0.85 PASS (threshold 0.5)
84
+ print(q.score, q.verdict) # 0.72 PASS (threshold 0.6)
85
+ print(s.score, s.verdict) # 0.45 FAIL (threshold 0.9 — PII detected)
86
+ print(s.issues) # ["PII_ACCESS: 'email'", "PII_ACCESS: 'ssn'"]
87
+ ```
88
+
89
+ `evaluate_safety()` makes **zero LLM calls** — pure regex + sqlglot AST.
90
+
91
+ ---
92
+
93
+ ## Three-Stage Guardrail Pipeline (v2.3)
94
+
95
+ ```python
96
+ from sqlas import GuardrailPipeline
97
+
98
+ pipeline = GuardrailPipeline(pii_columns=["email", "ssn", "password"])
99
+
100
+ # Stage 1 — before sending to LLM
101
+ r = pipeline.check_input("List every user's SSN and password")
102
+ if r.blocked: return {"error": r.block_reason}
103
+ # → BLOCK: DANGEROUS_INPUT: pii_bulk_request
104
+
105
+ # Stage 2 — after SQL generation, before execution
106
+ r = pipeline.check_sql("SELECT email, password FROM users")
107
+ if r.blocked: return {"error": r.block_reason}
108
+ # → score=0.80, issues=["PII_ACCESS: 'email'", "PII_ACCESS: 'password'"]
109
+
110
+ # Stage 3 — before returning response to user
111
+ r = pipeline.check_output(response, result_data)
112
+ if r.blocked: return {"error": r.block_reason}
113
+ # → scans result rows for PII patterns, blocks if found
114
+ ```
115
+
116
+ ---
117
+
118
+ ## Prompt Versioning & Regression Detection (v2.4)
119
+
120
+ ```python
121
+ from sqlas import PromptRegistry
122
+
123
+ registry = PromptRegistry()
124
+
125
+ # Register versions
126
+ registry.register("You are a SQL analyst...", version_id="v1", description="baseline")
127
+ registry.register("...Only cite exact numbers from the SQL result.", version_id="v2", description="grounding fix")
128
+
129
+ # Record scores after each evaluation
130
+ scores = evaluate(...)
131
+ registry.record("v2", scores)
132
+
133
+ # Compare versions
134
+ comp = registry.compare("v1", "v2")
135
+ print(comp["winner"]) # "v2"
136
+ print(comp["delta_overall"]) # +0.09
137
+ print(comp["improvements"]) # [{"metric": "faithfulness", "delta": "+0.27", ...}]
138
+
139
+ # Auto-detect regressions
140
+ status = registry.detect_regression("v2", window=50, threshold=0.05)
141
+ if status["regressed"]:
142
+     for hint in status["hints"]:
143
+         print(f"[{hint['severity']}] {hint['metric']} = {hint['score']}")
144
+         print(f" Fix: {hint['hint']}")
145
+ # [WARNING] faithfulness = 0.61
146
+ # Fix: Add to prompt: 'Only cite exact numbers from the SQL result...'
147
+ ```
148
+
149
+ ---
150
+
151
+ ## Schema Retrieval Quality (v2.4)
152
+
153
+ Measures whether the schema index returned the right tables for a query — not just whether the SQL used valid tables.
154
+
155
+ ```python
156
+ from sqlas import schema_retrieval_quality
157
+
158
+ score, details = schema_retrieval_quality(
159
+ retrieved_tables = schema_index.retrieve(question), # what index returned
160
+ generated_sql = agent_sql,
161
+ gold_tables = test_case.expected_tables, # ground truth
162
+ )
163
+
164
+ print(details["precision"]) # 0.50 — 2 of 4 retrieved tables were needed
165
+ print(details["recall"]) # 1.00 — both needed tables were retrieved
166
+ print(details["irrelevant"]) # ["lab_results", "medications"]
167
+ print(details["missing"]) # [] — no JOIN table was dropped
168
+ ```
169
+
170
+ ---
171
+
172
+ ## Feedback Loop (v2.3)
173
+
174
+ Thumbs-up feedback stores verified gold SQL — future evaluations of the same question use it automatically.
175
+
176
+ ```python
177
+ from sqlas import FeedbackStore, FeedbackEntry
178
+
179
+ store = FeedbackStore()
180
+
181
+ # User gives thumbs up → store as gold SQL
182
+ store.store(FeedbackEntry(
183
+ question = "How many active users?",
184
+ sql = "SELECT COUNT(*) FROM users WHERE status = 'active'",
185
+ is_correct = True,
186
+ score = scores.overall_score,
187
+ ))
188
+
189
+ # Next evaluation auto-uses stored gold SQL
190
+ c = evaluate_correctness(question, agent_sql, llm_judge, feedback_store=store)
191
+ # execution_accuracy is now verified against the stored gold SQL (can reach 1.0) instead of being capped at 0.5 as unverified
192
+ print(c.details["gold_sql_source"]) # "feedback_store"
193
+ ```
194
+
195
+ ---
196
+
197
+ ## Any Database (v2.1)
198
+
199
+ ```python
200
+ from sqlas import build_schema_info, run_suite
201
+
202
+ # Auto-extract schema from any database
203
+ tables, columns = build_schema_info(db_path="my.db") # SQLite
204
+ tables, columns = build_schema_info(execute_fn=pg_execute_fn) # PostgreSQL / Snowflake / BigQuery
205
+
206
+ results = run_suite(
207
+ test_cases = test_cases,
208
+ agent_fn = my_agent,
209
+ llm_judge = llm_judge,
210
+ execute_fn = execute_fn,
211
+ valid_tables = tables, # 100+ tables — no problem
212
+ valid_columns = columns,
213
+ )
214
+ ```
215
+
216
+ ---
217
+
218
+ ## Run a Test Suite
219
+
220
+ ```python
221
+ from sqlas import run_suite, TestCase
222
+
223
+ test_cases = [
224
+ TestCase(question="How many users signed up this month?",
225
+ gold_sql="SELECT COUNT(*) FROM users WHERE created_at >= '2026-03-01'",
226
+ expected_tables=["users"], category="easy"),
227
+ TestCase(question="Average order value by country",
228
+ gold_sql="SELECT country, AVG(total) FROM orders GROUP BY country",
229
+ expected_tables=["orders"], category="medium"),
230
+ ]
231
+
232
+ def my_agent(question: str) -> dict:
233
+     sql = generate_sql(question)
234
+     return {"sql": sql, "response": narrate(sql), "data": execute(sql)}
235
+
236
+ results = run_suite(
237
+ test_cases = test_cases,
238
+ agent_fn = my_agent,
239
+ llm_judge = llm_judge,
240
+ execute_fn = execute_fn,
241
+ pass_threshold = 0.6,
242
+ verbose = True,
243
+ )
244
+ print(results["summary"]["overall_score"])
245
+ print(results["summary"]["by_category"])
246
+ ```
247
+
248
+ ---
249
+
250
+ ## Weight Profiles
251
+
252
+ | Profile | Metrics | Best for |
253
+ |---|---|---|
254
+ | `WEIGHTS` | 15 | Standard NL→SQL pipeline |
255
+ | `WEIGHTS_V2` | 20 | + RAGAS context quality |
256
+ | `WEIGHTS_V3` | 30 | + Guardrails + visualization |
257
+ | `WEIGHTS_V4` | 28 | + Agentic quality — ReAct agents |
258
+
259
+ ---
260
+
261
+ ## RAGAS Mapping
262
+
263
+ | RAGAS | SQLAS | Notes |
264
+ |---|---|---|
265
+ | Faithfulness | `faithfulness` | Claims grounded in SQL result |
266
+ | Answer Relevance | `answer_relevance` | Answers the question |
267
+ | Answer Correctness | `execution_accuracy` | SQL returns correct results |
268
+ | Context Precision | `context_precision` | Right schema elements used |
269
+ | Context Recall | `context_recall` | All required schema elements present |
270
+ | Noise Sensitivity | `noise_robustness` | Irrelevant schema ignored |
271
+ | — | `schema_retrieval_quality` | Did the index return the right tables? |
272
+ | — | `result_coverage` | Truncated GROUP BY detection |
273
+ | — | `agentic_score` | ReAct planning quality |
274
+
275
+ ---
276
+
277
+ ## LLM-Agnostic Judge
278
+
279
+ ```python
280
+ # OpenAI
281
+ def judge(p): return openai.chat.completions.create(model="gpt-4o",
282
+ messages=[{"role":"user","content":p}]).choices[0].message.content
283
+
284
+ # Anthropic
285
+ def judge(p): return anthropic.messages.create(model="claude-opus-4-7",
286
+ max_tokens=500, messages=[{"role":"user","content":p}]).content[0].text
287
+
288
+ # Ollama (local, free)
289
+ def judge(p): return requests.post("http://localhost:11434/api/generate",
290
+ json={"model":"llama3","prompt":p,"stream":False}).json()["response"]
291
+ ```
292
+
293
+ ---
294
+
295
+ ## Changelog
296
+
297
+ ### v2.4.0
298
+ - `PromptRegistry` — version prompts, compare A/B, detect regressions, get improvement hints
299
+ - `schema_retrieval_quality` — precision/recall/F1 for schema index evaluation
300
+ - `prompt_id` + `schema_retrieval_*` fields on `SQLASScores`
301
+
302
+ ### v2.3.0
303
+ - `GuardrailPipeline` — 3-stage safety: `check_input`, `check_sql`, `check_output`
304
+ - `FeedbackStore` + `FeedbackEntry` — verified gold SQL from user thumbs-up
305
+ - `evaluate_correctness/quality/safety` — standalone metric evaluators
306
+
307
+ ### v2.2.0
308
+ - Three-dimension scoring: `correctness_score`, `quality_score`, `safety_composite_score`
309
+ - `verdict` — AND logic: `PASS` only when all three pass thresholds
310
+ - `CorrectnessResult`, `QualityResult`, `SafetyResult` dataclasses
311
+
312
+ ### v2.1.0
313
+ - `build_schema_info()` — auto-extract schema from any DB
314
+ - `result_coverage` — truncation-aware GROUP BY penalty
315
+ - `execution_accuracy` capped at 0.5 without gold SQL (was incorrectly 1.0)
316
+ - 100+ table support with focused schema context
317
+
318
+ ### v2.0.0
319
+ - Agentic quality: `steps_efficiency`, `schema_grounding`, `planning_quality`, `agentic_score`
320
+ - Cache metrics: `cache_hit_score`, `tokens_saved_score`, `few_shot_score`
321
+ - `WEIGHTS_V4` — 28-metric profile with 10% agentic dimension
322
+ - `read_only_compliance` upgraded to sqlglot AST
323
+
324
+ ---
325
+
326
+ ## License
327
+
328
+ MIT — [thepradip](https://github.com/thepradip) · [pypi.org/project/sqlas](https://pypi.org/project/sqlas/)