sqlas 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqlas-1.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 SQLAS Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
sqlas-1.1.0/PKG-INFO ADDED
@@ -0,0 +1,322 @@
1
+ Metadata-Version: 2.4
2
+ Name: sqlas
3
+ Version: 1.1.0
4
+ Summary: SQLAS — SQL Agent Scoring Framework. A RAGAS-equivalent evaluation library for Text-to-SQL and SQL AI agents. 20 production-grade metrics across 8 categories.
5
+ Author: SQLAS Contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/sqlas-framework/sqlas
8
+ Project-URL: Documentation, https://github.com/sqlas-framework/sqlas#readme
9
+ Project-URL: Repository, https://github.com/sqlas-framework/sqlas
10
+ Project-URL: Changelog, https://github.com/sqlas-framework/sqlas/blob/main/CHANGELOG.md
11
+ Keywords: sql,agent,evaluation,llm,text-to-sql,ragas,mlflow,benchmark,monitoring
12
+ Classifier: Development Status :: 5 - Production/Stable
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Database
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: sqlglot>=20.0
28
+ Provides-Extra: mlflow
29
+ Requires-Dist: mlflow>=3.0; extra == "mlflow"
30
+ Provides-Extra: dev
31
+ Requires-Dist: pytest>=7.0; extra == "dev"
32
+ Requires-Dist: build; extra == "dev"
33
+ Requires-Dist: twine; extra == "dev"
34
+ Provides-Extra: all
35
+ Requires-Dist: mlflow>=3.0; extra == "all"
36
+ Dynamic: license-file
37
+
38
+ # SQLAS — SQL Agent Scoring Framework
39
+
40
+ **A RAGAS-equivalent evaluation library for Text-to-SQL and SQL AI agents.**
41
+
42
+ SQLAS evaluates SQL agents across **20 production metrics** in **8 categories**, aligned with industry best practices (Spider, BIRD, Arize, MLflow).
43
+
44
+ **Author:** SQLAS Contributors
45
+
46
+ ---
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ pip install sqlas
52
+
53
+ # With MLflow integration
54
+ pip install "sqlas[mlflow]"
55
+
56
+ # With dev tools
57
+ pip install "sqlas[dev]"
58
+ ```
59
+
60
+ ---
61
+
62
+ ## Quick Start
63
+
64
+ ```python
65
+ from sqlas import evaluate
66
+
67
+ # Your LLM judge function (any LLM: OpenAI, Anthropic, local, etc.)
68
+ def my_llm_judge(prompt: str) -> str:
69
+ return client.chat.completions.create(
70
+ model="gpt-4o",
71
+ messages=[{"role": "user", "content": prompt}],
72
+ ).choices[0].message.content
73
+
74
+ # Evaluate a single query
75
+ scores = evaluate(
76
+ question="How many active users are there?",
77
+ generated_sql="SELECT COUNT(*) FROM users WHERE active = 1",
78
+ gold_sql="SELECT COUNT(*) FROM users WHERE active = 1",
79
+ db_path="my_database.db",
80
+ llm_judge=my_llm_judge,
81
+ response="There are 1,523 active users.",
82
+ result_data={"columns": ["COUNT(*)"], "rows": [[1523]], "row_count": 1, "execution_time_ms": 2.1},
83
+ )
84
+
85
+ print(scores.overall_score) # 0.95
86
+ print(scores.summary())
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Evaluate Without Gold SQL
92
+
93
+ Gold SQL is optional. Without it, SQLAS uses semantic equivalence (LLM judge) and execution success:
94
+
95
+ ```python
96
+ scores = evaluate(
97
+ question="Show top 10 products by revenue",
98
+ generated_sql="SELECT name, SUM(price * qty) AS rev FROM orders GROUP BY name ORDER BY rev DESC LIMIT 10",
99
+ llm_judge=my_llm_judge,
100
+ response="The top products are...",
101
+ result_data={"columns": ["name", "rev"], "rows": [...], "row_count": 10, "execution_time_ms": 15},
102
+ )
103
+ ```
104
+
105
+ ---
106
+
107
+ ## Run a Test Suite
108
+
109
+ ```python
110
+ from sqlas import run_suite, TestCase
111
+
112
+ test_cases = [
113
+ TestCase(
114
+ question="How many users signed up this month?",
115
+ gold_sql="SELECT COUNT(*) FROM users WHERE created_at >= '2026-03-01'",
116
+ category="easy",
117
+ ),
118
+ TestCase(
119
+ question="Average order value by country",
120
+ gold_sql="SELECT country, AVG(total) FROM orders GROUP BY country",
121
+ category="medium",
122
+ ),
123
+ ]
124
+
125
+ def my_agent(question: str) -> dict:
126
+ # Your SQL agent pipeline
127
+ sql = generate_sql(question)
128
+ result = execute(sql)
129
+ response = narrate(result)
130
+ return {"sql": sql, "response": response, "data": result}
131
+
132
+ results = run_suite(
133
+ test_cases=test_cases,
134
+ agent_fn=my_agent,
135
+ llm_judge=my_llm_judge,
136
+ db_path="my_database.db",
137
+ pass_threshold=0.6, # configurable
138
+ )
139
+
140
+ print(results["summary"]["overall_score"]) # 0.88
141
+ ```
142
+
143
+ ---
144
+
145
+ ## Metric Framework
146
+
147
+ ### v1: Production Composite Score (15 metrics, 6 categories)
148
+
149
+ The default `WEIGHTS` profile uses 15 metrics:
150
+
151
+ ```
152
+ SQLAS = 40% Execution Accuracy
153
+ + 15% Semantic Correctness
154
+ + 15% Cost Efficiency
155
+ + 10% Execution Quality
156
+ + 10% Task Success
157
+ + 10% Safety
158
+ ```
159
+
160
+ ### v2: Full RAGAS-Mapped Score (20 metrics, 8 categories)
161
+
162
+ Use `WEIGHTS_V2` for the full 20-metric evaluation with context quality:
163
+
164
+ ```python
165
+ from sqlas import evaluate, WEIGHTS_V2
166
+
167
+ scores = evaluate(..., weights=WEIGHTS_V2)
168
+ ```
169
+
170
+ ```
171
+ SQLAS v2 = 35% Execution Accuracy
172
+ + 13% Semantic Correctness
173
+ + 10% Context Quality (NEW — RAGAS-mapped)
174
+ + 12% Cost Efficiency
175
+ + 8% Execution Quality
176
+ + 8% Task Success
177
+ + 4% Result Similarity (NEW)
178
+ + 10% Safety
179
+ ```
180
+
181
+ ### Detailed Breakdown (v2 — 20 metrics)
182
+
183
+ | Category | Metric | v1 Weight | v2 Weight | Method |
184
+ |---|---|---|---|---|
185
+ | **Execution Accuracy** | execution_accuracy | 40% | 35% | Automated: output + structure + efficiency |
186
+ | **Semantic Correctness** | semantic_equivalence | 15% | 13% | LLM-as-judge |
187
+ | **Context Quality** | context_precision | — | 3% | Automated: schema element precision vs gold |
188
+ | | context_recall | — | 3% | Automated: schema element recall vs gold |
189
+ | | entity_recall | — | 2% | Automated: strict entity-level recall |
190
+ | | noise_robustness | — | 2% | Automated: irrelevant schema resistance |
191
+ | **Cost Efficiency** | efficiency_score | 5% | 4% | Automated: VES |
192
+ | | data_scan_efficiency | 5% | 4% | Automated: scan detection |
193
+ | | sql_quality | 3% | 2% | LLM: join/agg/filter |
194
+ | | schema_compliance | 2% | 2% | Automated: sqlglot |
195
+ | **Execution Quality** | execution_success | 5% | 4% | Automated |
196
+ | | complexity_match | 3% | 2% | LLM-as-judge |
197
+ | | empty_result_penalty | 2% | 2% | Automated |
198
+ | **Task Success** | faithfulness | 4% | 3% | LLM-as-judge |
199
+ | | answer_relevance | 3% | 2% | LLM-as-judge |
200
+ | | answer_completeness | 2% | 2% | LLM-as-judge |
201
+ | | fluency | 1% | 1% | LLM-as-judge |
202
+ | **Result Similarity** | result_set_similarity | — | 4% | Automated: Jaccard on result sets |
203
+ | **Safety** | read_only_compliance | 5% | 5% | Automated: DDL/DML |
204
+ | | safety_score | 5% | 5% | Automated: PII/injection |
205
+
206
+ ### Custom Weights
207
+
208
+ ```python
209
+ my_weights = {
210
+ "execution_accuracy": 0.50, # increase correctness weight
211
+ "semantic_equivalence": 0.10,
212
+ "safety_score": 0.15, # stricter safety
213
+ # ... other metrics (must sum to 1.0)
214
+ }
215
+
216
+ scores = evaluate(..., weights=my_weights)
217
+ ```
218
+
219
+ ---
220
+
221
+ ## Use Individual Metrics
222
+
223
+ ```python
224
+ from sqlas import execution_accuracy, schema_compliance, safety_score
225
+ from sqlas import context_precision, context_recall, entity_recall
226
+
227
+ # Just check execution accuracy
228
+ score, details = execution_accuracy(
229
+ generated_sql="SELECT COUNT(*) FROM users",
230
+ gold_sql="SELECT COUNT(*) FROM users",
231
+ db_path="my.db",
232
+ )
233
+
234
+ # Just check schema compliance
235
+ score, details = schema_compliance(
236
+ sql="SELECT name FROM users",
237
+ valid_tables={"users", "orders"},
238
+ valid_columns={"users": {"id", "name", "email"}, "orders": {"id", "user_id", "total"}},
239
+ )
240
+
241
+ # Just check safety
242
+ score, details = safety_score(
243
+ sql="SELECT * FROM users",
244
+ pii_columns=["email", "phone", "ssn"],
245
+ )
246
+
247
+ # Context quality (requires gold SQL)
248
+ precision, details = context_precision(
249
+ generated_sql="SELECT name, age FROM users WHERE active = 1",
250
+ gold_sql="SELECT name FROM users WHERE active = 1",
251
+ )
252
+ # precision < 1.0 — 'age' is extra
253
+
254
+ recall, details = context_recall(
255
+ generated_sql="SELECT name FROM users",
256
+ gold_sql="SELECT name FROM users WHERE active = 1",
257
+ )
258
+ # recall < 1.0 — 'active' is missing
259
+ ```
260
+
261
+ ---
262
+
263
+ ## RAGAS Mapping
264
+
265
+ | RAGAS Metric | SQLAS Equivalent | Description |
266
+ |---|---|---|
267
+ | Faithfulness | `faithfulness` | Claims grounded in SQL result data |
268
+ | Answer Relevance | `answer_relevance` | Response answers the question |
269
+ | Answer Correctness | `execution_accuracy` | SQL returns correct results |
270
+ | Answer Similarity | `result_set_similarity` | Result set Jaccard similarity |
271
+ | Context Precision | `context_precision` | Only relevant schema elements used |
272
+ | Context Recall | `context_recall` | All required schema elements used |
273
+ | Context Entity Recall | `entity_recall` | Strict entity match (tables, columns, literals, functions) |
274
+ | Noise Sensitivity | `noise_robustness` | Resistance to irrelevant schema context |
275
+ | — | `semantic_equivalence` | SQL answers the intent (LLM judge) |
276
+ | — | `safety_score` | PII + injection + DDL protection |
277
+ | — | `schema_compliance` | Valid tables/columns via AST |
278
+
279
+ ---
280
+
281
+ ## Production Features
282
+
283
+ - **Read-only DB**: All query execution uses read-only connections
284
+ - **Timeout guard**: SQL execution timeout (default 30s) prevents hangs
285
+ - **LLM resilience**: All LLM judge calls wrapped with error handling
286
+ - **Input validation**: Empty SQL, missing db_path, weight sum checks
287
+ - **Structured logging**: Uses Python `logging` module (not print)
288
+ - **Type-checked**: Ships `py.typed` marker for mypy/pyright
289
+
290
+ ---
291
+
292
+ ## LLM Judge
293
+
294
+ SQLAS is **LLM-agnostic**. Provide any function `(prompt: str) -> str`:
295
+
296
+ ```python
297
+ # OpenAI
298
+ def judge(prompt):
299
+ return openai_client.chat.completions.create(
300
+ model="gpt-4o", messages=[{"role": "user", "content": prompt}]
301
+ ).choices[0].message.content
302
+
303
+ # Anthropic
304
+ def judge(prompt):
305
+ return anthropic_client.messages.create(
306
+ model="claude-sonnet-4-20250514", max_tokens=500,
307
+ messages=[{"role": "user", "content": prompt}]
308
+ ).content[0].text
309
+
310
+ # Local (Ollama)
311
+ def judge(prompt):
312
+ import requests
313
+ return requests.post("http://localhost:11434/api/generate",
314
+ json={"model": "llama3", "prompt": prompt}
315
+ ).json()["response"]
316
+ ```
317
+
318
+ ---
319
+
320
+ ## License
321
+
322
+ MIT License - SQLAS Contributors
sqlas-1.1.0/README.md ADDED
@@ -0,0 +1,285 @@
1
+ # SQLAS — SQL Agent Scoring Framework
2
+
3
+ **A RAGAS-equivalent evaluation library for Text-to-SQL and SQL AI agents.**
4
+
5
+ SQLAS evaluates SQL agents across **20 production metrics** in **8 categories**, aligned with industry best practices (Spider, BIRD, Arize, MLflow).
6
+
7
+ **Author:** SQLAS Contributors
8
+
9
+ ---
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install sqlas
15
+
16
+ # With MLflow integration
17
+ pip install "sqlas[mlflow]"
18
+
19
+ # With dev tools
20
+ pip install "sqlas[dev]"
21
+ ```
22
+
23
+ ---
24
+
25
+ ## Quick Start
26
+
27
+ ```python
28
+ from sqlas import evaluate
29
+
30
+ # Your LLM judge function (any LLM: OpenAI, Anthropic, local, etc.)
31
+ def my_llm_judge(prompt: str) -> str:
32
+ return client.chat.completions.create(
33
+ model="gpt-4o",
34
+ messages=[{"role": "user", "content": prompt}],
35
+ ).choices[0].message.content
36
+
37
+ # Evaluate a single query
38
+ scores = evaluate(
39
+ question="How many active users are there?",
40
+ generated_sql="SELECT COUNT(*) FROM users WHERE active = 1",
41
+ gold_sql="SELECT COUNT(*) FROM users WHERE active = 1",
42
+ db_path="my_database.db",
43
+ llm_judge=my_llm_judge,
44
+ response="There are 1,523 active users.",
45
+ result_data={"columns": ["COUNT(*)"], "rows": [[1523]], "row_count": 1, "execution_time_ms": 2.1},
46
+ )
47
+
48
+ print(scores.overall_score) # 0.95
49
+ print(scores.summary())
50
+ ```
51
+
52
+ ---
53
+
54
+ ## Evaluate Without Gold SQL
55
+
56
+ Gold SQL is optional. Without it, SQLAS uses semantic equivalence (LLM judge) and execution success:
57
+
58
+ ```python
59
+ scores = evaluate(
60
+ question="Show top 10 products by revenue",
61
+ generated_sql="SELECT name, SUM(price * qty) AS rev FROM orders GROUP BY name ORDER BY rev DESC LIMIT 10",
62
+ llm_judge=my_llm_judge,
63
+ response="The top products are...",
64
+ result_data={"columns": ["name", "rev"], "rows": [...], "row_count": 10, "execution_time_ms": 15},
65
+ )
66
+ ```
67
+
68
+ ---
69
+
70
+ ## Run a Test Suite
71
+
72
+ ```python
73
+ from sqlas import run_suite, TestCase
74
+
75
+ test_cases = [
76
+ TestCase(
77
+ question="How many users signed up this month?",
78
+ gold_sql="SELECT COUNT(*) FROM users WHERE created_at >= '2026-03-01'",
79
+ category="easy",
80
+ ),
81
+ TestCase(
82
+ question="Average order value by country",
83
+ gold_sql="SELECT country, AVG(total) FROM orders GROUP BY country",
84
+ category="medium",
85
+ ),
86
+ ]
87
+
88
+ def my_agent(question: str) -> dict:
89
+ # Your SQL agent pipeline
90
+ sql = generate_sql(question)
91
+ result = execute(sql)
92
+ response = narrate(result)
93
+ return {"sql": sql, "response": response, "data": result}
94
+
95
+ results = run_suite(
96
+ test_cases=test_cases,
97
+ agent_fn=my_agent,
98
+ llm_judge=my_llm_judge,
99
+ db_path="my_database.db",
100
+ pass_threshold=0.6, # configurable
101
+ )
102
+
103
+ print(results["summary"]["overall_score"]) # 0.88
104
+ ```
105
+
106
+ ---
107
+
108
+ ## Metric Framework
109
+
110
+ ### v1: Production Composite Score (15 metrics, 6 categories)
111
+
112
+ The default `WEIGHTS` profile uses 15 metrics:
113
+
114
+ ```
115
+ SQLAS = 40% Execution Accuracy
116
+ + 15% Semantic Correctness
117
+ + 15% Cost Efficiency
118
+ + 10% Execution Quality
119
+ + 10% Task Success
120
+ + 10% Safety
121
+ ```
122
+
123
+ ### v2: Full RAGAS-Mapped Score (20 metrics, 8 categories)
124
+
125
+ Use `WEIGHTS_V2` for the full 20-metric evaluation with context quality:
126
+
127
+ ```python
128
+ from sqlas import evaluate, WEIGHTS_V2
129
+
130
+ scores = evaluate(..., weights=WEIGHTS_V2)
131
+ ```
132
+
133
+ ```
134
+ SQLAS v2 = 35% Execution Accuracy
135
+ + 13% Semantic Correctness
136
+ + 10% Context Quality (NEW — RAGAS-mapped)
137
+ + 12% Cost Efficiency
138
+ + 8% Execution Quality
139
+ + 8% Task Success
140
+ + 4% Result Similarity (NEW)
141
+ + 10% Safety
142
+ ```
143
+
144
+ ### Detailed Breakdown (v2 — 20 metrics)
145
+
146
+ | Category | Metric | v1 Weight | v2 Weight | Method |
147
+ |---|---|---|---|---|
148
+ | **Execution Accuracy** | execution_accuracy | 40% | 35% | Automated: output + structure + efficiency |
149
+ | **Semantic Correctness** | semantic_equivalence | 15% | 13% | LLM-as-judge |
150
+ | **Context Quality** | context_precision | — | 3% | Automated: schema element precision vs gold |
151
+ | | context_recall | — | 3% | Automated: schema element recall vs gold |
152
+ | | entity_recall | — | 2% | Automated: strict entity-level recall |
153
+ | | noise_robustness | — | 2% | Automated: irrelevant schema resistance |
154
+ | **Cost Efficiency** | efficiency_score | 5% | 4% | Automated: VES |
155
+ | | data_scan_efficiency | 5% | 4% | Automated: scan detection |
156
+ | | sql_quality | 3% | 2% | LLM: join/agg/filter |
157
+ | | schema_compliance | 2% | 2% | Automated: sqlglot |
158
+ | **Execution Quality** | execution_success | 5% | 4% | Automated |
159
+ | | complexity_match | 3% | 2% | LLM-as-judge |
160
+ | | empty_result_penalty | 2% | 2% | Automated |
161
+ | **Task Success** | faithfulness | 4% | 3% | LLM-as-judge |
162
+ | | answer_relevance | 3% | 2% | LLM-as-judge |
163
+ | | answer_completeness | 2% | 2% | LLM-as-judge |
164
+ | | fluency | 1% | 1% | LLM-as-judge |
165
+ | **Result Similarity** | result_set_similarity | — | 4% | Automated: Jaccard on result sets |
166
+ | **Safety** | read_only_compliance | 5% | 5% | Automated: DDL/DML |
167
+ | | safety_score | 5% | 5% | Automated: PII/injection |
168
+
169
+ ### Custom Weights
170
+
171
+ ```python
172
+ my_weights = {
173
+ "execution_accuracy": 0.50, # increase correctness weight
174
+ "semantic_equivalence": 0.10,
175
+ "safety_score": 0.15, # stricter safety
176
+ # ... other metrics (must sum to 1.0)
177
+ }
178
+
179
+ scores = evaluate(..., weights=my_weights)
180
+ ```
181
+
182
+ ---
183
+
184
+ ## Use Individual Metrics
185
+
186
+ ```python
187
+ from sqlas import execution_accuracy, schema_compliance, safety_score
188
+ from sqlas import context_precision, context_recall, entity_recall
189
+
190
+ # Just check execution accuracy
191
+ score, details = execution_accuracy(
192
+ generated_sql="SELECT COUNT(*) FROM users",
193
+ gold_sql="SELECT COUNT(*) FROM users",
194
+ db_path="my.db",
195
+ )
196
+
197
+ # Just check schema compliance
198
+ score, details = schema_compliance(
199
+ sql="SELECT name FROM users",
200
+ valid_tables={"users", "orders"},
201
+ valid_columns={"users": {"id", "name", "email"}, "orders": {"id", "user_id", "total"}},
202
+ )
203
+
204
+ # Just check safety
205
+ score, details = safety_score(
206
+ sql="SELECT * FROM users",
207
+ pii_columns=["email", "phone", "ssn"],
208
+ )
209
+
210
+ # Context quality (requires gold SQL)
211
+ precision, details = context_precision(
212
+ generated_sql="SELECT name, age FROM users WHERE active = 1",
213
+ gold_sql="SELECT name FROM users WHERE active = 1",
214
+ )
215
+ # precision < 1.0 — 'age' is extra
216
+
217
+ recall, details = context_recall(
218
+ generated_sql="SELECT name FROM users",
219
+ gold_sql="SELECT name FROM users WHERE active = 1",
220
+ )
221
+ # recall < 1.0 — 'active' is missing
222
+ ```
223
+
224
+ ---
225
+
226
+ ## RAGAS Mapping
227
+
228
+ | RAGAS Metric | SQLAS Equivalent | Description |
229
+ |---|---|---|
230
+ | Faithfulness | `faithfulness` | Claims grounded in SQL result data |
231
+ | Answer Relevance | `answer_relevance` | Response answers the question |
232
+ | Answer Correctness | `execution_accuracy` | SQL returns correct results |
233
+ | Answer Similarity | `result_set_similarity` | Result set Jaccard similarity |
234
+ | Context Precision | `context_precision` | Only relevant schema elements used |
235
+ | Context Recall | `context_recall` | All required schema elements used |
236
+ | Context Entity Recall | `entity_recall` | Strict entity match (tables, columns, literals, functions) |
237
+ | Noise Sensitivity | `noise_robustness` | Resistance to irrelevant schema context |
238
+ | — | `semantic_equivalence` | SQL answers the intent (LLM judge) |
239
+ | — | `safety_score` | PII + injection + DDL protection |
240
+ | — | `schema_compliance` | Valid tables/columns via AST |
241
+
242
+ ---
243
+
244
+ ## Production Features
245
+
246
+ - **Read-only DB**: All query execution uses read-only connections
247
+ - **Timeout guard**: SQL execution timeout (default 30s) prevents hangs
248
+ - **LLM resilience**: All LLM judge calls wrapped with error handling
249
+ - **Input validation**: Empty SQL, missing db_path, weight sum checks
250
+ - **Structured logging**: Uses Python `logging` module (not print)
251
+ - **Type-checked**: Ships `py.typed` marker for mypy/pyright
252
+
253
+ ---
254
+
255
+ ## LLM Judge
256
+
257
+ SQLAS is **LLM-agnostic**. Provide any function `(prompt: str) -> str`:
258
+
259
+ ```python
260
+ # OpenAI
261
+ def judge(prompt):
262
+ return openai_client.chat.completions.create(
263
+ model="gpt-4o", messages=[{"role": "user", "content": prompt}]
264
+ ).choices[0].message.content
265
+
266
+ # Anthropic
267
+ def judge(prompt):
268
+ return anthropic_client.messages.create(
269
+ model="claude-sonnet-4-20250514", max_tokens=500,
270
+ messages=[{"role": "user", "content": prompt}]
271
+ ).content[0].text
272
+
273
+ # Local (Ollama)
274
+ def judge(prompt):
275
+ import requests
276
+ return requests.post("http://localhost:11434/api/generate",
277
+ json={"model": "llama3", "prompt": prompt}
278
+ ).json()["response"]
279
+ ```
280
+
281
+ ---
282
+
283
+ ## License
284
+
285
+ MIT License - SQLAS Contributors
sqlas-1.1.0/pyproject.toml ADDED
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sqlas"
7
+ version = "1.1.0"
8
+ description = "SQLAS — SQL Agent Scoring Framework. A RAGAS-equivalent evaluation library for Text-to-SQL and SQL AI agents. 20 production-grade metrics across 8 categories."
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ authors = [{name = "SQLAS Contributors"}]
12
+ requires-python = ">=3.10"
13
+ keywords = ["sql", "agent", "evaluation", "llm", "text-to-sql", "ragas", "mlflow", "benchmark", "monitoring"]
14
+ classifiers = [
15
+ "Development Status :: 5 - Production/Stable",
16
+ "Intended Audience :: Developers",
17
+ "Intended Audience :: Science/Research",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Topic :: Database",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ "Typing :: Typed",
27
+ ]
28
+ dependencies = [
29
+ "sqlglot>=20.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ mlflow = ["mlflow>=3.0"]
34
+ dev = ["pytest>=7.0", "build", "twine"]
35
+ all = ["mlflow>=3.0"]
36
+
37
+ [project.urls]
38
+ Homepage = "https://github.com/sqlas-framework/sqlas"
39
+ Documentation = "https://github.com/sqlas-framework/sqlas#readme"
40
+ Repository = "https://github.com/sqlas-framework/sqlas"
41
+ Changelog = "https://github.com/sqlas-framework/sqlas/blob/main/CHANGELOG.md"
42
+
43
+ [tool.setuptools.packages.find]
44
+ include = ["sqlas*"]
45
+
46
+ [tool.setuptools.package-data]
47
+ sqlas = ["py.typed"]
sqlas-1.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+