sqlas 1.3.0__tar.gz → 2.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. sqlas-2.5.0/PKG-INFO +364 -0
  2. sqlas-2.5.0/README.md +328 -0
  3. {sqlas-1.3.0 → sqlas-2.5.0}/pyproject.toml +2 -2
  4. sqlas-2.5.0/sqlas/__init__.py +90 -0
  5. sqlas-2.5.0/sqlas/agentic.py +317 -0
  6. sqlas-2.5.0/sqlas/cache.py +93 -0
  7. sqlas-2.5.0/sqlas/core.py +543 -0
  8. sqlas-2.5.0/sqlas/evaluate.py +822 -0
  9. sqlas-2.5.0/sqlas/feedback.py +177 -0
  10. sqlas-2.5.0/sqlas/guardrails.py +381 -0
  11. sqlas-2.5.0/sqlas/production.py +153 -0
  12. sqlas-2.5.0/sqlas/prompt_registry.py +378 -0
  13. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/quality.py +2 -1
  14. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/runner.py +6 -1
  15. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/safety.py +51 -9
  16. sqlas-2.5.0/sqlas/schema_quality.py +215 -0
  17. sqlas-2.5.0/sqlas.egg-info/PKG-INFO +364 -0
  18. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas.egg-info/SOURCES.txt +9 -1
  19. {sqlas-1.3.0 → sqlas-2.5.0}/tests/test_execute_fn.py +10 -5
  20. sqlas-2.5.0/tests/test_large_schema.py +285 -0
  21. sqlas-2.5.0/tests/test_v2.py +279 -0
  22. sqlas-1.3.0/PKG-INFO +0 -376
  23. sqlas-1.3.0/README.md +0 -340
  24. sqlas-1.3.0/sqlas/__init__.py +0 -90
  25. sqlas-1.3.0/sqlas/core.py +0 -273
  26. sqlas-1.3.0/sqlas/evaluate.py +0 -276
  27. sqlas-1.3.0/sqlas/production.py +0 -74
  28. sqlas-1.3.0/sqlas.egg-info/PKG-INFO +0 -376
  29. {sqlas-1.3.0 → sqlas-2.5.0}/LICENSE +0 -0
  30. {sqlas-1.3.0 → sqlas-2.5.0}/setup.cfg +0 -0
  31. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/context.py +0 -0
  32. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/correctness.py +0 -0
  33. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/py.typed +0 -0
  34. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/response.py +0 -0
  35. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas/visualization.py +0 -0
  36. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas.egg-info/dependency_links.txt +0 -0
  37. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas.egg-info/requires.txt +0 -0
  38. {sqlas-1.3.0 → sqlas-2.5.0}/sqlas.egg-info/top_level.txt +0 -0
  39. {sqlas-1.3.0 → sqlas-2.5.0}/tests/test_context.py +0 -0
  40. {sqlas-1.3.0 → sqlas-2.5.0}/tests/test_sqlas.py +0 -0
sqlas-2.5.0/PKG-INFO ADDED
@@ -0,0 +1,364 @@
1
+ Metadata-Version: 2.4
2
+ Name: sqlas
3
+ Version: 2.5.0
4
+ Summary: SQLAS — SQL Agent Scoring Framework. Production-grade evaluation for Text-to-SQL and Agentic SQL agents with guardrail, visualization, agentic quality, and cache performance metrics.
5
+ Author-email: thepradip <pradiptivhale@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/thepradip/SQLAS
8
+ Project-URL: Documentation, https://github.com/thepradip/SQLAS#readme
9
+ Project-URL: Repository, https://github.com/thepradip/SQLAS
10
+ Project-URL: Changelog, https://github.com/thepradip/SQLAS/blob/main/CHANGELOG.md
11
+ Keywords: sql,agent,evaluation,llm,text-to-sql,ragas,mlflow,benchmark,monitoring
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Database
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Typing :: Typed
23
+ Requires-Python: >=3.10
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: sqlglot>=20.0
27
+ Provides-Extra: mlflow
28
+ Requires-Dist: mlflow>=3.0; extra == "mlflow"
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0; extra == "dev"
31
+ Requires-Dist: build; extra == "dev"
32
+ Requires-Dist: twine; extra == "dev"
33
+ Provides-Extra: all
34
+ Requires-Dist: mlflow>=3.0; extra == "all"
35
+ Dynamic: license-file
36
+
37
+ # SQLAS — SQL Agent Scoring Framework
38
+
39
+ **A RAGAS-equivalent evaluation library for Text-to-SQL and Agentic SQL agents.**
40
+
41
+ [![PyPI](https://img.shields.io/pypi/v/sqlas)](https://pypi.org/project/sqlas/)
42
+ [![Python](https://img.shields.io/pypi/pyversions/sqlas)](https://pypi.org/project/sqlas/)
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
44
+ [![Tests](https://img.shields.io/badge/tests-140%20passing-brightgreen)](https://github.com/thepradip/SQLAS)
45
+
46
+ Evaluate SQL agents across 45 metrics — correctness, quality, safety, agentic reasoning, schema retrieval, prompt versioning, and guardrails. Aligned with Spider, BIRD, RAGAS, and MLflow standards.
47
+
48
+ **Author:** [thepradip](https://github.com/thepradip)
49
+
50
+ ---
51
+
52
+ ## Install
53
+
54
+ ```bash
55
+ pip install sqlas # core
56
+ pip install "sqlas[mlflow]" # + MLflow integration
57
+ ```
58
+
59
+ ---
60
+
61
+ ## What's New in v2.4.0
62
+
63
+ | Feature | Description |
64
+ |---|---|
65
+ | `PromptRegistry` | Version prompts, compare A/B, detect regressions, get data-driven improvement hints |
66
+ | `schema_retrieval_quality` | Measure precision/recall of schema index — did it return the right tables? |
67
+ | `evaluate_correctness/quality/safety` | Three standalone evaluators — run only what you need |
68
+ | `GuardrailPipeline` | Three-stage safety: input → SQL → output (zero LLM cost) |
69
+ | `FeedbackStore` | Thumbs-up stores verified gold SQL, auto-improves `execution_accuracy` |
70
+ | Three-dimension verdict | `PASS` only when correctness + quality + safety ALL pass their thresholds |
71
+ | `result_coverage` | Penalises truncated GROUP BY (score 0.3) — catches big-dataset evaluation blind spots |
72
+
73
+ ---
74
+
75
+ ## Quick Start
76
+
77
+ ```python
78
+ from sqlas import evaluate
79
+
80
+ def llm_judge(prompt: str) -> str:
81
+     return openai_client.chat.completions.create(
82
+         model="gpt-4o",
83
+         messages=[{"role": "user", "content": prompt}],
84
+     ).choices[0].message.content
85
+
86
+ scores = evaluate(
87
+ question = "How many active users are there?",
88
+ generated_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
89
+ gold_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
90
+ db_path = "my_database.db",
91
+ llm_judge = llm_judge,
92
+ response = "There are 1,523 active users.",
93
+ result_data = {"columns": ["COUNT(*)"], "rows": [[1523]],
94
+ "row_count": 1, "execution_time_ms": 2.1},
95
+ )
96
+
97
+ print(scores.overall_score) # 0.95
98
+ print(scores.correctness_score) # 0.88 (v2.2)
99
+ print(scores.quality_score) # 0.93 (v2.2)
100
+ print(scores.safety_composite_score) # 1.00 (v2.2)
101
+ print(scores.verdict) # PASS (v2.2 — AND logic)
102
+ print(scores.summary())
103
+ ```
104
+
105
+ ---
106
+
107
+ ## Three-Dimension Scoring (v2.2)
108
+
109
+ `PASS` requires **all three** dimensions to exceed their thresholds. A safe-but-wrong query can no longer masquerade as `PASS`.
110
+
111
+ ```python
112
+ from sqlas import evaluate_correctness, evaluate_quality, evaluate_safety
113
+
114
+ # Run only the metrics you need — each is fully independent
115
+ c = evaluate_correctness(question, sql, llm_judge, gold_sql=gold, execute_fn=db)
116
+ q = evaluate_quality(question, sql, llm_judge, response=text, result_data=data)
117
+ s = evaluate_safety(sql, question=question, pii_columns=["email","ssn"])
118
+
119
+ print(c.score, c.verdict) # 0.85 PASS (threshold 0.5)
120
+ print(q.score, q.verdict) # 0.72 PASS (threshold 0.6)
121
+ print(s.score, s.verdict) # 0.45 FAIL (threshold 0.9 — PII detected)
122
+ print(s.issues) # ["PII_ACCESS: 'email'", "PII_ACCESS: 'ssn'"]
123
+ ```
124
+
125
+ `evaluate_safety()` makes **zero LLM calls** — pure regex + sqlglot AST.
126
+
127
+ ---
128
+
129
+ ## Three-Stage Guardrail Pipeline (v2.3)
130
+
131
+ ```python
132
+ from sqlas import GuardrailPipeline
133
+
134
+ pipeline = GuardrailPipeline(pii_columns=["email", "ssn", "password"])
135
+
136
+ # Stage 1 — before sending to LLM
137
+ r = pipeline.check_input("List every user's SSN and password")
138
+ if r.blocked: return {"error": r.block_reason}
139
+ # → BLOCK: DANGEROUS_INPUT: pii_bulk_request
140
+
141
+ # Stage 2 — after SQL generation, before execution
142
+ r = pipeline.check_sql("SELECT email, password FROM users")
143
+ if r.blocked: return {"error": r.block_reason}
144
+ # → score=0.80, issues=["PII_ACCESS: 'email'", "PII_ACCESS: 'password'"]
145
+
146
+ # Stage 3 — before returning response to user
147
+ r = pipeline.check_output(response, result_data)
148
+ if r.blocked: return {"error": r.block_reason}
149
+ # → scans result rows for PII patterns, blocks if found
150
+ ```
151
+
152
+ ---
153
+
154
+ ## Prompt Versioning & Regression Detection (v2.4)
155
+
156
+ ```python
157
+ from sqlas import PromptRegistry
158
+
159
+ registry = PromptRegistry()
160
+
161
+ # Register versions
162
+ registry.register("You are a SQL analyst...", version_id="v1", description="baseline")
163
+ registry.register("...Only cite exact numbers from the SQL result.", version_id="v2", description="grounding fix")
164
+
165
+ # Record scores after each evaluation
166
+ scores = evaluate(...)
167
+ registry.record("v2", scores)
168
+
169
+ # Compare versions
170
+ comp = registry.compare("v1", "v2")
171
+ print(comp["winner"]) # "v2"
172
+ print(comp["delta_overall"]) # +0.09
173
+ print(comp["improvements"]) # [{"metric": "faithfulness", "delta": "+0.27", ...}]
174
+
175
+ # Auto-detect regressions
176
+ status = registry.detect_regression("v2", window=50, threshold=0.05)
177
+ if status["regressed"]:
178
+     for hint in status["hints"]:
179
+         print(f"[{hint['severity']}] {hint['metric']} = {hint['score']}")
180
+         print(f" Fix: {hint['hint']}")
181
+ # [WARNING] faithfulness = 0.61
182
+ # Fix: Add to prompt: 'Only cite exact numbers from the SQL result...'
183
+ ```
184
+
185
+ ---
186
+
187
+ ## Schema Retrieval Quality (v2.4)
188
+
189
+ Measures whether the schema index returned the right tables for a query — not just whether the SQL used valid tables.
190
+
191
+ ```python
192
+ from sqlas import schema_retrieval_quality
193
+
194
+ score, details = schema_retrieval_quality(
195
+ retrieved_tables = schema_index.retrieve(question), # what index returned
196
+ generated_sql = agent_sql,
197
+ gold_tables = test_case.expected_tables, # ground truth
198
+ )
199
+
200
+ print(details["precision"]) # 0.50 — 2 of 4 retrieved tables were needed
201
+ print(details["recall"]) # 1.00 — both needed tables were retrieved
202
+ print(details["irrelevant"]) # ["lab_results", "medications"]
203
+ print(details["missing"]) # [] — no JOIN table was dropped
204
+ ```
205
+
206
+ ---
207
+
208
+ ## Feedback Loop (v2.3)
209
+
210
+ Thumbs-up feedback stores verified gold SQL — future evaluations of the same question use it automatically.
211
+
212
+ ```python
213
+ from sqlas import FeedbackStore, FeedbackEntry
214
+
215
+ store = FeedbackStore()
216
+
217
+ # User gives thumbs up → store as gold SQL
218
+ store.store(FeedbackEntry(
219
+ question = "How many active users?",
220
+ sql = "SELECT COUNT(*) FROM users WHERE status = 'active'",
221
+ is_correct = True,
222
+ score = scores.overall_score,
223
+ ))
224
+
225
+ # Next evaluation auto-uses stored gold SQL
226
+ c = evaluate_correctness(question, agent_sql, llm_judge, feedback_store=store)
227
+ # execution_accuracy is now verified (1.0) instead of unverified (0.5)
228
+ print(c.details["gold_sql_source"]) # "feedback_store"
229
+ ```
230
+
231
+ ---
232
+
233
+ ## Any Database (v2.1)
234
+
235
+ ```python
236
+ from sqlas import build_schema_info, run_suite
237
+
238
+ # Auto-extract schema from any database
239
+ tables, columns = build_schema_info(db_path="my.db") # SQLite
240
+ tables, columns = build_schema_info(execute_fn=pg_execute_fn) # PostgreSQL / Snowflake / BigQuery
241
+
242
+ results = run_suite(
243
+ test_cases = test_cases,
244
+ agent_fn = my_agent,
245
+ llm_judge = llm_judge,
246
+ execute_fn = execute_fn,
247
+ valid_tables = tables, # 100+ tables — no problem
248
+ valid_columns = columns,
249
+ )
250
+ ```
251
+
252
+ ---
253
+
254
+ ## Run a Test Suite
255
+
256
+ ```python
257
+ from sqlas import run_suite, TestCase
258
+
259
+ test_cases = [
260
+ TestCase(question="How many users signed up this month?",
261
+ gold_sql="SELECT COUNT(*) FROM users WHERE created_at >= '2026-03-01'",
262
+ expected_tables=["users"], category="easy"),
263
+ TestCase(question="Average order value by country",
264
+ gold_sql="SELECT country, AVG(total) FROM orders GROUP BY country",
265
+ expected_tables=["orders"], category="medium"),
266
+ ]
267
+
268
+ def my_agent(question: str) -> dict:
269
+     sql = generate_sql(question)
270
+     return {"sql": sql, "response": narrate(sql), "data": execute(sql)}
271
+
272
+ results = run_suite(
273
+ test_cases = test_cases,
274
+ agent_fn = my_agent,
275
+ llm_judge = llm_judge,
276
+ execute_fn = execute_fn,
277
+ pass_threshold = 0.6,
278
+ verbose = True,
279
+ )
280
+ print(results["summary"]["overall_score"])
281
+ print(results["summary"]["by_category"])
282
+ ```
283
+
284
+ ---
285
+
286
+ ## Weight Profiles
287
+
288
+ | Profile | Metrics | Best for |
289
+ |---|---|---|
290
+ | `WEIGHTS` | 15 | Standard NL→SQL pipeline |
291
+ | `WEIGHTS_V2` | 20 | + RAGAS context quality |
292
+ | `WEIGHTS_V3` | 30 | + Guardrails + visualization |
293
+ | `WEIGHTS_V4` | 28 | + Agentic quality — ReAct agents |
294
+
295
+ ---
296
+
297
+ ## RAGAS Mapping
298
+
299
+ | RAGAS | SQLAS | Notes |
300
+ |---|---|---|
301
+ | Faithfulness | `faithfulness` | Claims grounded in SQL result |
302
+ | Answer Relevance | `answer_relevance` | Answers the question |
303
+ | Answer Correctness | `execution_accuracy` | SQL returns correct results |
304
+ | Context Precision | `context_precision` | Right schema elements used |
305
+ | Context Recall | `context_recall` | All required schema elements present |
306
+ | Noise Sensitivity | `noise_robustness` | Irrelevant schema ignored |
307
+ | — | `schema_retrieval_quality` | Did the index return the right tables? |
308
+ | — | `result_coverage` | Truncated GROUP BY detection |
309
+ | — | `agentic_score` | ReAct planning quality |
310
+
311
+ ---
312
+
313
+ ## LLM-Agnostic Judge
314
+
315
+ ```python
316
+ # OpenAI
317
+ def judge(p): return openai.chat.completions.create(model="gpt-4o",
318
+ messages=[{"role":"user","content":p}]).choices[0].message.content
319
+
320
+ # Anthropic
321
+ def judge(p): return anthropic.messages.create(model="claude-opus-4-7",
322
+ max_tokens=500, messages=[{"role":"user","content":p}]).content[0].text
323
+
324
+ # Ollama (local, free)
325
+ def judge(p): return requests.post("http://localhost:11434/api/generate",
326
+ json={"model":"llama3","prompt":p,"stream":False}).json()["response"]
327
+ ```
328
+
329
+ ---
330
+
331
+ ## Changelog
332
+
333
+ ### v2.4.0
334
+ - `PromptRegistry` — version prompts, compare A/B, detect regressions, get improvement hints
335
+ - `schema_retrieval_quality` — precision/recall/F1 for schema index evaluation
336
+ - `prompt_id` + `schema_retrieval_*` fields on `SQLASScores`
337
+
338
+ ### v2.3.0
339
+ - `GuardrailPipeline` — 3-stage safety: `check_input`, `check_sql`, `check_output`
340
+ - `FeedbackStore` + `FeedbackEntry` — verified gold SQL from user thumbs-up
341
+ - `evaluate_correctness/quality/safety` — standalone metric evaluators
342
+
343
+ ### v2.2.0
344
+ - Three-dimension scoring: `correctness_score`, `quality_score`, `safety_composite_score`
345
+ - `verdict` — AND logic: `PASS` only when all three pass thresholds
346
+ - `CorrectnessResult`, `QualityResult`, `SafetyResult` dataclasses
347
+
348
+ ### v2.1.0
349
+ - `build_schema_info()` — auto-extract schema from any DB
350
+ - `result_coverage` — truncation-aware GROUP BY penalty
351
+ - `execution_accuracy` capped at 0.5 without gold SQL (was incorrectly 1.0)
352
+ - 100+ table support with focused schema context
353
+
354
+ ### v2.0.0
355
+ - Agentic quality: `steps_efficiency`, `schema_grounding`, `planning_quality`, `agentic_score`
356
+ - Cache metrics: `cache_hit_score`, `tokens_saved_score`, `few_shot_score`
357
+ - `WEIGHTS_V4` — 28-metric profile with 10% agentic dimension
358
+ - `read_only_compliance` upgraded to sqlglot AST
359
+
360
+ ---
361
+
362
+ ## License
363
+
364
+ MIT — [thepradip](https://github.com/thepradip) · [pypi.org/project/sqlas](https://pypi.org/project/sqlas/)
sqlas-2.5.0/README.md ADDED
@@ -0,0 +1,328 @@
1
+ # SQLAS — SQL Agent Scoring Framework
2
+
3
+ **A RAGAS-equivalent evaluation library for Text-to-SQL and Agentic SQL agents.**
4
+
5
+ [![PyPI](https://img.shields.io/pypi/v/sqlas)](https://pypi.org/project/sqlas/)
6
+ [![Python](https://img.shields.io/pypi/pyversions/sqlas)](https://pypi.org/project/sqlas/)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
8
+ [![Tests](https://img.shields.io/badge/tests-140%20passing-brightgreen)](https://github.com/thepradip/SQLAS)
9
+
10
+ Evaluate SQL agents across 45 metrics — correctness, quality, safety, agentic reasoning, schema retrieval, prompt versioning, and guardrails. Aligned with Spider, BIRD, RAGAS, and MLflow standards.
11
+
12
+ **Author:** [thepradip](https://github.com/thepradip)
13
+
14
+ ---
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ pip install sqlas # core
20
+ pip install "sqlas[mlflow]" # + MLflow integration
21
+ ```
22
+
23
+ ---
24
+
25
+ ## What's New in v2.4.0
26
+
27
+ | Feature | Description |
28
+ |---|---|
29
+ | `PromptRegistry` | Version prompts, compare A/B, detect regressions, get data-driven improvement hints |
30
+ | `schema_retrieval_quality` | Measure precision/recall of schema index — did it return the right tables? |
31
+ | `evaluate_correctness/quality/safety` | Three standalone evaluators — run only what you need |
32
+ | `GuardrailPipeline` | Three-stage safety: input → SQL → output (zero LLM cost) |
33
+ | `FeedbackStore` | Thumbs-up stores verified gold SQL, auto-improves `execution_accuracy` |
34
+ | Three-dimension verdict | `PASS` only when correctness + quality + safety ALL pass their thresholds |
35
+ | `result_coverage` | Penalises truncated GROUP BY (score 0.3) — catches big-dataset evaluation blind spots |
36
+
37
+ ---
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from sqlas import evaluate
43
+
44
+ def llm_judge(prompt: str) -> str:
45
+     return openai_client.chat.completions.create(
46
+         model="gpt-4o",
47
+         messages=[{"role": "user", "content": prompt}],
48
+     ).choices[0].message.content
49
+
50
+ scores = evaluate(
51
+ question = "How many active users are there?",
52
+ generated_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
53
+ gold_sql = "SELECT COUNT(*) FROM users WHERE active = 1",
54
+ db_path = "my_database.db",
55
+ llm_judge = llm_judge,
56
+ response = "There are 1,523 active users.",
57
+ result_data = {"columns": ["COUNT(*)"], "rows": [[1523]],
58
+ "row_count": 1, "execution_time_ms": 2.1},
59
+ )
60
+
61
+ print(scores.overall_score) # 0.95
62
+ print(scores.correctness_score) # 0.88 (v2.2)
63
+ print(scores.quality_score) # 0.93 (v2.2)
64
+ print(scores.safety_composite_score) # 1.00 (v2.2)
65
+ print(scores.verdict) # PASS (v2.2 — AND logic)
66
+ print(scores.summary())
67
+ ```
68
+
69
+ ---
70
+
71
+ ## Three-Dimension Scoring (v2.2)
72
+
73
+ `PASS` requires **all three** dimensions to exceed their thresholds. A safe-but-wrong query can no longer masquerade as `PASS`.
74
+
75
+ ```python
76
+ from sqlas import evaluate_correctness, evaluate_quality, evaluate_safety
77
+
78
+ # Run only the metrics you need — each is fully independent
79
+ c = evaluate_correctness(question, sql, llm_judge, gold_sql=gold, execute_fn=db)
80
+ q = evaluate_quality(question, sql, llm_judge, response=text, result_data=data)
81
+ s = evaluate_safety(sql, question=question, pii_columns=["email","ssn"])
82
+
83
+ print(c.score, c.verdict) # 0.85 PASS (threshold 0.5)
84
+ print(q.score, q.verdict) # 0.72 PASS (threshold 0.6)
85
+ print(s.score, s.verdict) # 0.45 FAIL (threshold 0.9 — PII detected)
86
+ print(s.issues) # ["PII_ACCESS: 'email'", "PII_ACCESS: 'ssn'"]
87
+ ```
88
+
89
+ `evaluate_safety()` makes **zero LLM calls** — pure regex + sqlglot AST.
90
+
91
+ ---
92
+
93
+ ## Three-Stage Guardrail Pipeline (v2.3)
94
+
95
+ ```python
96
+ from sqlas import GuardrailPipeline
97
+
98
+ pipeline = GuardrailPipeline(pii_columns=["email", "ssn", "password"])
99
+
100
+ # Stage 1 — before sending to LLM
101
+ r = pipeline.check_input("List every user's SSN and password")
102
+ if r.blocked: return {"error": r.block_reason}
103
+ # → BLOCK: DANGEROUS_INPUT: pii_bulk_request
104
+
105
+ # Stage 2 — after SQL generation, before execution
106
+ r = pipeline.check_sql("SELECT email, password FROM users")
107
+ if r.blocked: return {"error": r.block_reason}
108
+ # → score=0.80, issues=["PII_ACCESS: 'email'", "PII_ACCESS: 'password'"]
109
+
110
+ # Stage 3 — before returning response to user
111
+ r = pipeline.check_output(response, result_data)
112
+ if r.blocked: return {"error": r.block_reason}
113
+ # → scans result rows for PII patterns, blocks if found
114
+ ```
115
+
116
+ ---
117
+
118
+ ## Prompt Versioning & Regression Detection (v2.4)
119
+
120
+ ```python
121
+ from sqlas import PromptRegistry
122
+
123
+ registry = PromptRegistry()
124
+
125
+ # Register versions
126
+ registry.register("You are a SQL analyst...", version_id="v1", description="baseline")
127
+ registry.register("...Only cite exact numbers from the SQL result.", version_id="v2", description="grounding fix")
128
+
129
+ # Record scores after each evaluation
130
+ scores = evaluate(...)
131
+ registry.record("v2", scores)
132
+
133
+ # Compare versions
134
+ comp = registry.compare("v1", "v2")
135
+ print(comp["winner"]) # "v2"
136
+ print(comp["delta_overall"]) # +0.09
137
+ print(comp["improvements"]) # [{"metric": "faithfulness", "delta": "+0.27", ...}]
138
+
139
+ # Auto-detect regressions
140
+ status = registry.detect_regression("v2", window=50, threshold=0.05)
141
+ if status["regressed"]:
142
+     for hint in status["hints"]:
143
+         print(f"[{hint['severity']}] {hint['metric']} = {hint['score']}")
144
+         print(f" Fix: {hint['hint']}")
145
+ # [WARNING] faithfulness = 0.61
146
+ # Fix: Add to prompt: 'Only cite exact numbers from the SQL result...'
147
+ ```
148
+
149
+ ---
150
+
151
+ ## Schema Retrieval Quality (v2.4)
152
+
153
+ Measures whether the schema index returned the right tables for a query — not just whether the SQL used valid tables.
154
+
155
+ ```python
156
+ from sqlas import schema_retrieval_quality
157
+
158
+ score, details = schema_retrieval_quality(
159
+ retrieved_tables = schema_index.retrieve(question), # what index returned
160
+ generated_sql = agent_sql,
161
+ gold_tables = test_case.expected_tables, # ground truth
162
+ )
163
+
164
+ print(details["precision"]) # 0.50 — 2 of 4 retrieved tables were needed
165
+ print(details["recall"]) # 1.00 — both needed tables were retrieved
166
+ print(details["irrelevant"]) # ["lab_results", "medications"]
167
+ print(details["missing"]) # [] — no JOIN table was dropped
168
+ ```
169
+
170
+ ---
171
+
172
+ ## Feedback Loop (v2.3)
173
+
174
+ Thumbs-up feedback stores verified gold SQL — future evaluations of the same question use it automatically.
175
+
176
+ ```python
177
+ from sqlas import FeedbackStore, FeedbackEntry
178
+
179
+ store = FeedbackStore()
180
+
181
+ # User gives thumbs up → store as gold SQL
182
+ store.store(FeedbackEntry(
183
+ question = "How many active users?",
184
+ sql = "SELECT COUNT(*) FROM users WHERE status = 'active'",
185
+ is_correct = True,
186
+ score = scores.overall_score,
187
+ ))
188
+
189
+ # Next evaluation auto-uses stored gold SQL
190
+ c = evaluate_correctness(question, agent_sql, llm_judge, feedback_store=store)
191
+ # execution_accuracy is now verified (1.0) instead of unverified (0.5)
192
+ print(c.details["gold_sql_source"]) # "feedback_store"
193
+ ```
194
+
195
+ ---
196
+
197
+ ## Any Database (v2.1)
198
+
199
+ ```python
200
+ from sqlas import build_schema_info, run_suite
201
+
202
+ # Auto-extract schema from any database
203
+ tables, columns = build_schema_info(db_path="my.db") # SQLite
204
+ tables, columns = build_schema_info(execute_fn=pg_execute_fn) # PostgreSQL / Snowflake / BigQuery
205
+
206
+ results = run_suite(
207
+ test_cases = test_cases,
208
+ agent_fn = my_agent,
209
+ llm_judge = llm_judge,
210
+ execute_fn = execute_fn,
211
+ valid_tables = tables, # 100+ tables — no problem
212
+ valid_columns = columns,
213
+ )
214
+ ```
215
+
216
+ ---
217
+
218
+ ## Run a Test Suite
219
+
220
+ ```python
221
+ from sqlas import run_suite, TestCase
222
+
223
+ test_cases = [
224
+ TestCase(question="How many users signed up this month?",
225
+ gold_sql="SELECT COUNT(*) FROM users WHERE created_at >= '2026-03-01'",
226
+ expected_tables=["users"], category="easy"),
227
+ TestCase(question="Average order value by country",
228
+ gold_sql="SELECT country, AVG(total) FROM orders GROUP BY country",
229
+ expected_tables=["orders"], category="medium"),
230
+ ]
231
+
232
+ def my_agent(question: str) -> dict:
233
+     sql = generate_sql(question)
234
+     return {"sql": sql, "response": narrate(sql), "data": execute(sql)}
235
+
236
+ results = run_suite(
237
+ test_cases = test_cases,
238
+ agent_fn = my_agent,
239
+ llm_judge = llm_judge,
240
+ execute_fn = execute_fn,
241
+ pass_threshold = 0.6,
242
+ verbose = True,
243
+ )
244
+ print(results["summary"]["overall_score"])
245
+ print(results["summary"]["by_category"])
246
+ ```
247
+
248
+ ---
249
+
250
+ ## Weight Profiles
251
+
252
+ | Profile | Metrics | Best for |
253
+ |---|---|---|
254
+ | `WEIGHTS` | 15 | Standard NL→SQL pipeline |
255
+ | `WEIGHTS_V2` | 20 | + RAGAS context quality |
256
+ | `WEIGHTS_V3` | 30 | + Guardrails + visualization |
257
+ | `WEIGHTS_V4` | 28 | + Agentic quality — ReAct agents |
258
+
259
+ ---
260
+
261
+ ## RAGAS Mapping
262
+
263
+ | RAGAS | SQLAS | Notes |
264
+ |---|---|---|
265
+ | Faithfulness | `faithfulness` | Claims grounded in SQL result |
266
+ | Answer Relevance | `answer_relevance` | Answers the question |
267
+ | Answer Correctness | `execution_accuracy` | SQL returns correct results |
268
+ | Context Precision | `context_precision` | Right schema elements used |
269
+ | Context Recall | `context_recall` | All required schema elements present |
270
+ | Noise Sensitivity | `noise_robustness` | Irrelevant schema ignored |
271
+ | — | `schema_retrieval_quality` | Did the index return the right tables? |
272
+ | — | `result_coverage` | Truncated GROUP BY detection |
273
+ | — | `agentic_score` | ReAct planning quality |
274
+
275
+ ---
276
+
277
+ ## LLM-Agnostic Judge
278
+
279
+ ```python
280
+ # OpenAI
281
+ def judge(p): return openai.chat.completions.create(model="gpt-4o",
282
+ messages=[{"role":"user","content":p}]).choices[0].message.content
283
+
284
+ # Anthropic
285
+ def judge(p): return anthropic.messages.create(model="claude-opus-4-7",
286
+ max_tokens=500, messages=[{"role":"user","content":p}]).content[0].text
287
+
288
+ # Ollama (local, free)
289
+ def judge(p): return requests.post("http://localhost:11434/api/generate",
290
+ json={"model":"llama3","prompt":p,"stream":False}).json()["response"]
291
+ ```
292
+
293
+ ---
294
+
295
+ ## Changelog
296
+
297
+ ### v2.4.0
298
+ - `PromptRegistry` — version prompts, compare A/B, detect regressions, get improvement hints
299
+ - `schema_retrieval_quality` — precision/recall/F1 for schema index evaluation
300
+ - `prompt_id` + `schema_retrieval_*` fields on `SQLASScores`
301
+
302
+ ### v2.3.0
303
+ - `GuardrailPipeline` — 3-stage safety: `check_input`, `check_sql`, `check_output`
304
+ - `FeedbackStore` + `FeedbackEntry` — verified gold SQL from user thumbs-up
305
+ - `evaluate_correctness/quality/safety` — standalone metric evaluators
306
+
307
+ ### v2.2.0
308
+ - Three-dimension scoring: `correctness_score`, `quality_score`, `safety_composite_score`
309
+ - `verdict` — AND logic: `PASS` only when all three pass thresholds
310
+ - `CorrectnessResult`, `QualityResult`, `SafetyResult` dataclasses
311
+
312
+ ### v2.1.0
313
+ - `build_schema_info()` — auto-extract schema from any DB
314
+ - `result_coverage` — truncation-aware GROUP BY penalty
315
+ - `execution_accuracy` capped at 0.5 without gold SQL (was incorrectly 1.0)
316
+ - 100+ table support with focused schema context
317
+
318
+ ### v2.0.0
319
+ - Agentic quality: `steps_efficiency`, `schema_grounding`, `planning_quality`, `agentic_score`
320
+ - Cache metrics: `cache_hit_score`, `tokens_saved_score`, `few_shot_score`
321
+ - `WEIGHTS_V4` — 28-metric profile with 10% agentic dimension
322
+ - `read_only_compliance` upgraded to sqlglot AST
323
+
324
+ ---
325
+
326
+ ## License
327
+
328
+ MIT — [thepradip](https://github.com/thepradip) · [pypi.org/project/sqlas](https://pypi.org/project/sqlas/)
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sqlas"
7
- version = "1.3.0"
8
- description = "SQLAS — SQL Agent Scoring Framework. A RAGAS-equivalent evaluation library for Text-to-SQL and SQL AI agents with guardrail and visualization metrics."
7
+ version = "2.5.0"
8
+ description = "SQLAS — SQL Agent Scoring Framework. Production-grade evaluation for Text-to-SQL and Agentic SQL agents with guardrail, visualization, agentic quality, and cache performance metrics."
9
9
  readme = "README.md"
10
10
  license = "MIT"
11
11
  authors = [{name = "thepradip", email = "pradiptivhale@gmail.com"}]