evalvault 1.70.1__py3-none-any.whl → 1.72.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +367 -3
- evalvault/adapters/inbound/api/main.py +17 -1
- evalvault/adapters/inbound/api/routers/calibration.py +133 -0
- evalvault/adapters/inbound/api/routers/runs.py +71 -1
- evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +1 -0
- evalvault/adapters/inbound/cli/commands/compare.py +1 -1
- evalvault/adapters/inbound/cli/commands/experiment.py +27 -1
- evalvault/adapters/inbound/cli/commands/graph_rag.py +303 -0
- evalvault/adapters/inbound/cli/commands/history.py +1 -1
- evalvault/adapters/inbound/cli/commands/regress.py +169 -1
- evalvault/adapters/inbound/cli/commands/run.py +225 -1
- evalvault/adapters/inbound/cli/commands/run_helpers.py +57 -0
- evalvault/adapters/outbound/analysis/network_analyzer_module.py +17 -4
- evalvault/adapters/outbound/dataset/__init__.py +6 -0
- evalvault/adapters/outbound/dataset/multiturn_json_loader.py +111 -0
- evalvault/adapters/outbound/report/__init__.py +6 -0
- evalvault/adapters/outbound/report/ci_report_formatter.py +43 -0
- evalvault/adapters/outbound/report/dashboard_generator.py +24 -9
- evalvault/adapters/outbound/report/pr_comment_formatter.py +50 -0
- evalvault/adapters/outbound/retriever/__init__.py +8 -0
- evalvault/adapters/outbound/retriever/graph_rag_adapter.py +326 -0
- evalvault/adapters/outbound/storage/base_sql.py +291 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +130 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +60 -0
- evalvault/adapters/outbound/storage/schema.sql +63 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +107 -0
- evalvault/domain/entities/__init__.py +20 -0
- evalvault/domain/entities/graph_rag.py +30 -0
- evalvault/domain/entities/multiturn.py +78 -0
- evalvault/domain/metrics/__init__.py +10 -0
- evalvault/domain/metrics/multiturn_metrics.py +113 -0
- evalvault/domain/metrics/registry.py +36 -0
- evalvault/domain/services/__init__.py +8 -0
- evalvault/domain/services/evaluator.py +5 -2
- evalvault/domain/services/graph_rag_experiment.py +155 -0
- evalvault/domain/services/multiturn_evaluator.py +187 -0
- evalvault/ports/inbound/__init__.py +2 -0
- evalvault/ports/inbound/multiturn_port.py +23 -0
- evalvault/ports/inbound/web_port.py +4 -0
- evalvault/ports/outbound/graph_retriever_port.py +24 -0
- evalvault/ports/outbound/storage_port.py +25 -0
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/METADATA +1 -1
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/RECORD +47 -33
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/WHEEL +0 -0
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -14,6 +14,9 @@ from evalvault.domain.entities import (
|
|
|
14
14
|
EvaluationRun,
|
|
15
15
|
FeedbackSummary,
|
|
16
16
|
MetricScore,
|
|
17
|
+
MultiTurnConversationRecord,
|
|
18
|
+
MultiTurnRunRecord,
|
|
19
|
+
MultiTurnTurnResult,
|
|
17
20
|
RunClusterMap,
|
|
18
21
|
RunClusterMapInfo,
|
|
19
22
|
SatisfactionFeedback,
|
|
@@ -72,6 +75,45 @@ class SQLQueries:
|
|
|
72
75
|
) VALUES ({values})
|
|
73
76
|
"""
|
|
74
77
|
|
|
78
|
+
def insert_multiturn_run(self) -> str:
    """Build the INSERT statement for one row of ``multiturn_runs``.

    The statement names 12 columns; ``self._values(12)`` supplies the
    matching VALUES list.
    """
    placeholders = self._values(12)
    return f"""
        INSERT INTO multiturn_runs (
            run_id, dataset_name, dataset_version, model_name,
            started_at, finished_at, conversation_count, turn_count,
            metrics_evaluated, drift_threshold, summary, metadata
        ) VALUES ({placeholders})
    """
|
|
87
|
+
|
|
88
|
+
def insert_multiturn_conversation(self) -> str:
    """Build the INSERT statement for one row of ``multiturn_conversations``.

    Seven columns are written; the auto-generated ``id`` column is not part
    of the statement.
    """
    placeholders = self._values(7)
    return f"""
        INSERT INTO multiturn_conversations (
            run_id, conversation_id, turn_count, drift_score, drift_threshold,
            drift_detected, summary
        ) VALUES ({placeholders})
    """
|
|
96
|
+
|
|
97
|
+
def insert_multiturn_turn(self) -> str:
    """Build the INSERT statement for one row of ``multiturn_turn_results``.

    When ``self._test_case_returning`` is non-empty it is appended to the
    stripped statement, separated by a single space; otherwise the statement
    is returned as written.
    """
    placeholders = self._values(8)
    statement = f"""
        INSERT INTO multiturn_turn_results (
            run_id, conversation_id, turn_id, turn_index, role,
            passed, latency_ms, metadata
        ) VALUES ({placeholders})
    """
    returning_clause = self._test_case_returning
    if not returning_clause:
        return statement
    return f"{statement.strip()} {returning_clause}"
|
|
108
|
+
|
|
109
|
+
def insert_multiturn_metric_score(self) -> str:
    """Build the INSERT statement for one row of ``multiturn_metric_scores``.

    Each row links a metric name, score and optional threshold to a turn
    result through ``turn_result_id``.
    """
    placeholders = self._values(4)
    return f"""
        INSERT INTO multiturn_metric_scores (
            turn_result_id, metric_name, score, threshold
        ) VALUES ({placeholders})
    """
|
|
116
|
+
|
|
75
117
|
def insert_cluster_map(self) -> str:
|
|
76
118
|
values = self._values(7)
|
|
77
119
|
return f"""
|
|
@@ -127,6 +169,41 @@ class SQLQueries:
|
|
|
127
169
|
ORDER BY id
|
|
128
170
|
"""
|
|
129
171
|
|
|
172
|
+
def select_multiturn_run(self) -> str:
    """Build the SELECT that loads a single ``multiturn_runs`` row by ``run_id``.

    The run id is bound through the backend's parameter marker
    (``self.placeholder``).
    """
    marker = self.placeholder
    return f"""
        SELECT run_id, dataset_name, dataset_version, model_name,
            started_at, finished_at, conversation_count, turn_count,
            metrics_evaluated, drift_threshold, summary, metadata, created_at
        FROM multiturn_runs
        WHERE run_id = {marker}
    """
|
|
180
|
+
|
|
181
|
+
def select_multiturn_conversations(self) -> str:
    """Build the SELECT that lists the conversations of one run.

    Rows are filtered by ``run_id`` and ordered by ``id``.
    """
    marker = self.placeholder
    return f"""
        SELECT run_id, conversation_id, turn_count, drift_score, drift_threshold,
            drift_detected, summary
        FROM multiturn_conversations
        WHERE run_id = {marker}
        ORDER BY id
    """
|
|
189
|
+
|
|
190
|
+
def select_multiturn_turn_results(self) -> str:
    """Build the SELECT that lists the turn results of one run.

    The row ``id`` is included in the result; rows are filtered by
    ``run_id`` and ordered by ``id``.
    """
    marker = self.placeholder
    return f"""
        SELECT id, run_id, conversation_id, turn_id, turn_index, role,
            passed, latency_ms, metadata
        FROM multiturn_turn_results
        WHERE run_id = {marker}
        ORDER BY id
    """
|
|
198
|
+
|
|
199
|
+
def select_multiturn_metric_scores(self) -> str:
    """Build the SELECT that lists the metric scores of one turn result.

    Rows are filtered by ``turn_result_id`` and ordered by ``id``.
    """
    marker = self.placeholder
    return f"""
        SELECT turn_result_id, metric_name, score, threshold
        FROM multiturn_metric_scores
        WHERE turn_result_id = {marker}
        ORDER BY id
    """
|
|
206
|
+
|
|
130
207
|
def select_cluster_map(self) -> str:
|
|
131
208
|
return f"""
|
|
132
209
|
SELECT test_case_id, cluster_id, source, map_id, created_at, metadata
|
|
@@ -223,6 +300,48 @@ class BaseSQLStorageAdapter(ABC):
|
|
|
223
300
|
conn.commit()
|
|
224
301
|
return run.run_id
|
|
225
302
|
|
|
303
|
+
def save_multiturn_run(
    self,
    run: MultiTurnRunRecord,
    conversations: list[MultiTurnConversationRecord],
    turn_results: list[MultiTurnTurnResult],
    *,
    metric_thresholds: dict[str, float] | None = None,
) -> str:
    """Persist a multi-turn run with its conversations, turns and metric scores.

    All rows are written through one connection and committed once, after the
    last insert.

    Args:
        run: Run-level record; its ``run_id`` is also used for every turn row.
        conversations: Conversation records, inserted in the given order.
        turn_results: Turn records, inserted in the given order. A turn whose
            ``metrics`` is empty or ``None`` produces no metric-score rows.
        metric_thresholds: Optional mapping of metric name to threshold. A
            metric without an entry is stored with a ``None`` threshold.

    Returns:
        ``run.run_id``.
    """
    # A missing mapping behaves like an empty one: every lookup yields None.
    thresholds = metric_thresholds or {}

    with self._get_connection() as conn:
        run_sql = self.queries.insert_multiturn_run()
        self._execute(conn, run_sql, self._multiturn_run_params(run))

        for conversation in conversations:
            conversation_sql = self.queries.insert_multiturn_conversation()
            conversation_row = self._multiturn_conversation_params(conversation)
            self._execute(conn, conversation_sql, conversation_row)

        for turn in turn_results:
            turn_sql = self.queries.insert_multiturn_turn()
            turn_row = self._multiturn_turn_params(run.run_id, turn)
            cursor = self._execute(conn, turn_sql, turn_row)
            # The generated turn row id is the parent key of its metric scores.
            turn_result_id = self._fetch_lastrowid(cursor)

            scores = turn.metrics or {}
            for metric_name, score in scores.items():
                metric_sql = self.queries.insert_multiturn_metric_score()
                metric_row = self._multiturn_metric_params(
                    turn_result_id, metric_name, score, thresholds.get(metric_name)
                )
                self._execute(conn, metric_sql, metric_row)

        conn.commit()
        return run.run_id
|
|
344
|
+
|
|
226
345
|
def _insert_test_case(self, conn, run_id: str, result: TestCaseResult) -> int:
|
|
227
346
|
cursor = self._execute(
|
|
228
347
|
conn,
|
|
@@ -515,6 +634,56 @@ class BaseSQLStorageAdapter(ABC):
|
|
|
515
634
|
metric.reason,
|
|
516
635
|
)
|
|
517
636
|
|
|
637
|
+
def _multiturn_run_params(self, run: MultiTurnRunRecord) -> Sequence[Any]:
    """Flatten a run record into the parameter tuple for the run INSERT.

    The 12 values follow the column order of ``insert_multiturn_run``.
    Timestamps go through ``self._serialize_datetime``; ``metrics_evaluated``,
    ``summary`` and ``metadata`` go through ``self._serialize_json``.
    """
    to_timestamp = self._serialize_datetime
    to_json = self._serialize_json
    return (
        run.run_id,
        run.dataset_name,
        run.dataset_version,
        run.model_name,
        to_timestamp(run.started_at),
        to_timestamp(run.finished_at),
        run.conversation_count,
        run.turn_count,
        to_json(run.metrics_evaluated),
        run.drift_threshold,
        to_json(run.summary),
        to_json(run.metadata),
    )
|
|
652
|
+
|
|
653
|
+
def _multiturn_conversation_params(
    self, conversation: MultiTurnConversationRecord
) -> Sequence[Any]:
    """Flatten a conversation record into the parameter tuple for its INSERT.

    The seven values follow the column order of
    ``insert_multiturn_conversation``; ``summary`` goes through
    ``self._serialize_json``.

    ``drift_detected`` is bound as a ``bool`` rather than an ``int``. The
    PostgreSQL schema declares the column ``BOOLEAN``, and PostgreSQL has no
    assignment cast from integer to boolean, so an integer parameter is
    rejected there. SQLite stores a Python ``bool`` as 0/1, so its
    ``INTEGER`` column receives the same values as before. A ``None`` flag
    is now stored as ``False``.
    """
    return (
        conversation.run_id,
        conversation.conversation_id,
        conversation.turn_count,
        conversation.drift_score,
        conversation.drift_threshold,
        bool(conversation.drift_detected),
        self._serialize_json(conversation.summary),
    )
|
|
665
|
+
|
|
666
|
+
def _multiturn_turn_params(self, run_id: str, turn: MultiTurnTurnResult) -> Sequence[Any]:
    """Flatten a turn result into the parameter tuple for its INSERT.

    The eight values follow the column order of ``insert_multiturn_turn``.
    ``run_id`` is passed in by the caller rather than read from the turn;
    ``metadata`` goes through ``self._serialize_json``.

    ``passed`` is bound as a ``bool`` rather than an ``int``. The PostgreSQL
    schema declares the column ``BOOLEAN``, and PostgreSQL has no assignment
    cast from integer to boolean, so an integer parameter is rejected there.
    SQLite stores a Python ``bool`` as 0/1, so its ``INTEGER`` column
    receives the same values as before. A ``None`` flag is now stored as
    ``False``.
    """
    return (
        run_id,
        turn.conversation_id,
        turn.turn_id,
        turn.turn_index,
        turn.role,
        bool(turn.passed),
        turn.latency_ms,
        self._serialize_json(turn.metadata),
    )
|
|
677
|
+
|
|
678
|
+
def _multiturn_metric_params(
    self,
    turn_result_id: int,
    metric_name: str,
    score: float,
    threshold: float | None,
) -> Sequence[Any]:
    """Pack one metric score into the parameter tuple for its INSERT.

    The values are returned unchanged, in the column order of
    ``insert_multiturn_metric_score``.
    """
    metric_row = (turn_result_id, metric_name, score, threshold)
    return metric_row
|
|
686
|
+
|
|
518
687
|
def _row_to_test_case(self, conn, row) -> TestCaseResult:
|
|
519
688
|
result_id = row["id"]
|
|
520
689
|
metrics = self._fetch_metric_scores(conn, result_id)
|
|
@@ -1132,3 +1301,125 @@ class BaseSQLStorageAdapter(ABC):
|
|
|
1132
1301
|
|
|
1133
1302
|
workbook.save(output)
|
|
1134
1303
|
return output
|
|
1304
|
+
|
|
1305
|
+
def export_multiturn_run_to_excel(self, run_id: str, output_path) -> Path:
    """Export one stored multi-turn run to an Excel workbook.

    The workbook has four sheets, in this order: ``MultiTurnRun``,
    ``MultiTurnConversations``, ``MultiTurnTurns`` and
    ``MultiTurnTurnMetrics``. Each sheet starts with a header row followed
    by one row per database record.

    Args:
        run_id: Identifier of the multi-turn run to export.
        output_path: Destination file; anything accepted by ``Path``. Missing
            parent directories are created.

    Returns:
        The destination as a ``Path``.

    Raises:
        KeyError: If no ``multiturn_runs`` row exists for ``run_id``.
    """
    # Imported here rather than at module level, so openpyxl is only needed
    # when an export is actually requested.
    from openpyxl import Workbook

    output = Path(output_path)
    # NOTE: the directory is created before the run lookup, so it exists even
    # when the KeyError below is raised.
    output.parent.mkdir(parents=True, exist_ok=True)
    placeholder = self.queries.placeholder

    with self._get_connection() as conn:
        run_row = self._execute(conn, self.queries.select_multiturn_run(), (run_id,)).fetchone()
        if not run_row:
            raise KeyError(f"Multiturn run not found: {run_id}")

        # The single run row is wrapped in a list so it can be normalized and
        # written like the other sheets.
        run_rows = self._normalize_rows(
            [run_row],
            json_columns={"metrics_evaluated", "summary", "metadata"},
        )

        conversation_rows = self._execute(
            conn, self.queries.select_multiturn_conversations(), (run_id,)
        ).fetchall()
        conversation_payloads = self._normalize_rows(
            conversation_rows,
            json_columns={"summary"},
        )

        turn_rows = self._execute(
            conn, self.queries.select_multiturn_turn_results(), (run_id,)
        ).fetchall()
        turn_payloads = self._normalize_rows(
            turn_rows,
            json_columns={"metadata"},
        )

        # Metric scores are joined to their turn so each row also carries the
        # conversation and turn identifiers. This query is written inline
        # instead of coming from ``self.queries``.
        metric_rows = self._execute(
            conn,
            (
                "SELECT m.turn_result_id, t.conversation_id, t.turn_id, t.turn_index, "
                "m.metric_name, m.score, m.threshold "
                "FROM multiturn_metric_scores m "
                "JOIN multiturn_turn_results t ON m.turn_result_id = t.id "
                f"WHERE t.run_id = {placeholder} ORDER BY m.id"
            ),
            (run_id,),
        ).fetchall()
        metric_payloads = self._normalize_rows(metric_rows)

    # (sheet title, rows, header/column names), in the order the sheets are created.
    sheet_order: list[tuple[str, list[dict[str, Any]], list[str]]] = [
        (
            "MultiTurnRun",
            run_rows,
            [
                "run_id",
                "dataset_name",
                "dataset_version",
                "model_name",
                "started_at",
                "finished_at",
                "conversation_count",
                "turn_count",
                "metrics_evaluated",
                "drift_threshold",
                "summary",
                "metadata",
                "created_at",
            ],
        ),
        (
            "MultiTurnConversations",
            conversation_payloads,
            [
                "run_id",
                "conversation_id",
                "turn_count",
                "drift_score",
                "drift_threshold",
                "drift_detected",
                "summary",
            ],
        ),
        (
            "MultiTurnTurns",
            turn_payloads,
            [
                "id",
                "run_id",
                "conversation_id",
                "turn_id",
                "turn_index",
                "role",
                "passed",
                "latency_ms",
                "metadata",
            ],
        ),
        (
            "MultiTurnTurnMetrics",
            metric_payloads,
            [
                "turn_result_id",
                "conversation_id",
                "turn_id",
                "turn_index",
                "metric_name",
                "score",
                "threshold",
            ],
        ),
    ]

    workbook = Workbook()
    # Drop the sheet a new Workbook starts with, so only the named sheets remain.
    default_sheet = workbook.active
    workbook.remove(default_sheet)

    for sheet_name, rows, headers in sheet_order:
        sheet = workbook.create_sheet(title=sheet_name)
        sheet.append(headers)
        for row in rows:
            # Cells are written in header order, one value per header.
            sheet.append([self._row_value(row, header) for header in headers])

    workbook.save(output)
    return output
|
|
@@ -221,6 +221,90 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
|
|
|
221
221
|
"CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id)"
|
|
222
222
|
)
|
|
223
223
|
|
|
224
|
+
conn.execute(
|
|
225
|
+
"""
|
|
226
|
+
CREATE TABLE IF NOT EXISTS multiturn_runs (
|
|
227
|
+
run_id UUID PRIMARY KEY,
|
|
228
|
+
dataset_name VARCHAR(255) NOT NULL,
|
|
229
|
+
dataset_version VARCHAR(50),
|
|
230
|
+
model_name VARCHAR(255),
|
|
231
|
+
started_at TIMESTAMP WITH TIME ZONE NOT NULL,
|
|
232
|
+
finished_at TIMESTAMP WITH TIME ZONE,
|
|
233
|
+
conversation_count INTEGER DEFAULT 0,
|
|
234
|
+
turn_count INTEGER DEFAULT 0,
|
|
235
|
+
metrics_evaluated JSONB,
|
|
236
|
+
drift_threshold DOUBLE PRECISION,
|
|
237
|
+
summary JSONB,
|
|
238
|
+
metadata JSONB,
|
|
239
|
+
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
|
|
240
|
+
)
|
|
241
|
+
"""
|
|
242
|
+
)
|
|
243
|
+
conn.execute(
|
|
244
|
+
"CREATE INDEX IF NOT EXISTS idx_multiturn_runs_dataset ON multiturn_runs(dataset_name)"
|
|
245
|
+
)
|
|
246
|
+
conn.execute(
|
|
247
|
+
"CREATE INDEX IF NOT EXISTS idx_multiturn_runs_started_at ON multiturn_runs(started_at DESC)"
|
|
248
|
+
)
|
|
249
|
+
conn.execute(
|
|
250
|
+
"""
|
|
251
|
+
CREATE TABLE IF NOT EXISTS multiturn_conversations (
|
|
252
|
+
id SERIAL PRIMARY KEY,
|
|
253
|
+
run_id UUID NOT NULL REFERENCES multiturn_runs(run_id) ON DELETE CASCADE,
|
|
254
|
+
conversation_id VARCHAR(255) NOT NULL,
|
|
255
|
+
turn_count INTEGER DEFAULT 0,
|
|
256
|
+
drift_score DOUBLE PRECISION,
|
|
257
|
+
drift_threshold DOUBLE PRECISION,
|
|
258
|
+
drift_detected BOOLEAN DEFAULT FALSE,
|
|
259
|
+
summary JSONB
|
|
260
|
+
)
|
|
261
|
+
"""
|
|
262
|
+
)
|
|
263
|
+
conn.execute(
|
|
264
|
+
"CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_run_id ON multiturn_conversations(run_id)"
|
|
265
|
+
)
|
|
266
|
+
conn.execute(
|
|
267
|
+
"CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_conv_id ON multiturn_conversations(conversation_id)"
|
|
268
|
+
)
|
|
269
|
+
conn.execute(
|
|
270
|
+
"""
|
|
271
|
+
CREATE TABLE IF NOT EXISTS multiturn_turn_results (
|
|
272
|
+
id SERIAL PRIMARY KEY,
|
|
273
|
+
run_id UUID NOT NULL REFERENCES multiturn_runs(run_id) ON DELETE CASCADE,
|
|
274
|
+
conversation_id VARCHAR(255) NOT NULL,
|
|
275
|
+
turn_id VARCHAR(255) NOT NULL,
|
|
276
|
+
turn_index INTEGER,
|
|
277
|
+
role VARCHAR(50) NOT NULL,
|
|
278
|
+
passed BOOLEAN DEFAULT FALSE,
|
|
279
|
+
latency_ms INTEGER,
|
|
280
|
+
metadata JSONB
|
|
281
|
+
)
|
|
282
|
+
"""
|
|
283
|
+
)
|
|
284
|
+
conn.execute(
|
|
285
|
+
"CREATE INDEX IF NOT EXISTS idx_multiturn_turns_run_id ON multiturn_turn_results(run_id)"
|
|
286
|
+
)
|
|
287
|
+
conn.execute(
|
|
288
|
+
"CREATE INDEX IF NOT EXISTS idx_multiturn_turns_conv_id ON multiturn_turn_results(conversation_id)"
|
|
289
|
+
)
|
|
290
|
+
conn.execute(
|
|
291
|
+
"""
|
|
292
|
+
CREATE TABLE IF NOT EXISTS multiturn_metric_scores (
|
|
293
|
+
id SERIAL PRIMARY KEY,
|
|
294
|
+
turn_result_id INTEGER NOT NULL REFERENCES multiturn_turn_results(id) ON DELETE CASCADE,
|
|
295
|
+
metric_name VARCHAR(100) NOT NULL,
|
|
296
|
+
score DECIMAL(5, 4) NOT NULL,
|
|
297
|
+
threshold DECIMAL(5, 4)
|
|
298
|
+
)
|
|
299
|
+
"""
|
|
300
|
+
)
|
|
301
|
+
conn.execute(
|
|
302
|
+
"CREATE INDEX IF NOT EXISTS idx_multiturn_scores_turn_id ON multiturn_metric_scores(turn_result_id)"
|
|
303
|
+
)
|
|
304
|
+
conn.execute(
|
|
305
|
+
"CREATE INDEX IF NOT EXISTS idx_multiturn_scores_metric_name ON multiturn_metric_scores(metric_name)"
|
|
306
|
+
)
|
|
307
|
+
|
|
224
308
|
# Prompt set methods
|
|
225
309
|
|
|
226
310
|
def save_prompt_set(self, bundle: PromptSetBundle) -> None:
|
|
@@ -874,6 +958,52 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
|
|
|
874
958
|
|
|
875
959
|
return report_id
|
|
876
960
|
|
|
961
|
+
def list_analysis_reports(
    self,
    *,
    run_id: str,
    report_type: str | None = None,
    format: str | None = None,
    limit: int = 20,
) -> list[dict[str, Any]]:
    """List stored analysis reports for a run, newest first.

    Args:
        run_id: Run whose reports are listed (always part of the filter).
        report_type: Optional filter on ``report_type``; ignored when falsy.
        format: Optional filter on ``format``; ignored when falsy.
        limit: Maximum number of rows, bound as the ``LIMIT`` parameter.

    Returns:
        One dict per report. ``metadata`` is passed through
        ``self._deserialize_json``; ``created_at`` is converted with
        ``isoformat()`` when it is a ``datetime`` and returned unchanged
        otherwise.
    """
    # Each filter keeps its SQL condition next to the value bound to it.
    filters: list[tuple[str, Any]] = [("run_id = %s", run_id)]
    if report_type:
        filters.append(("report_type = %s", report_type))
    if format:
        filters.append(("format = %s", format))

    where_sql = " AND ".join(condition for condition, _ in filters)
    sql = (
        "SELECT report_id, run_id, experiment_id, report_type, format, content, metadata, created_at "
        "FROM analysis_reports WHERE " + where_sql + " ORDER BY created_at DESC LIMIT %s"
    )
    bound = (*(value for _, value in filters), limit)

    with self._get_connection() as conn:
        fetched = conn.execute(sql, bound).fetchall()

    def _to_payload(record) -> dict[str, Any]:
        stamp = record["created_at"]
        if isinstance(stamp, datetime):
            stamp = stamp.isoformat()
        return {
            "report_id": record["report_id"],
            "run_id": record["run_id"],
            "experiment_id": record["experiment_id"],
            "report_type": record["report_type"],
            "format": record["format"],
            "content": record["content"],
            "metadata": self._deserialize_json(record["metadata"]),
            "created_at": stamp,
        }

    return [_to_payload(record) for record in fetched]
|
|
1006
|
+
|
|
877
1007
|
def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
|
|
878
1008
|
"""파이프라인 분석 결과 목록을 조회합니다."""
|
|
879
1009
|
query = """
|
|
@@ -86,6 +86,66 @@ CREATE TABLE IF NOT EXISTS metric_scores (
|
|
|
86
86
|
CREATE INDEX IF NOT EXISTS idx_scores_result_id ON metric_scores(result_id);
|
|
87
87
|
CREATE INDEX IF NOT EXISTS idx_scores_name ON metric_scores(name);
|
|
88
88
|
|
|
89
|
+
-- Multiturn evaluation tables
-- Four tables form a hierarchy: multiturn_runs is the parent of
-- multiturn_conversations and multiturn_turn_results (both keyed by run_id),
-- and multiturn_turn_results is the parent of multiturn_metric_scores.
-- Every child reference uses ON DELETE CASCADE, so deleting a run removes
-- its conversations, turns and metric scores.

-- One row per multi-turn evaluation run.
CREATE TABLE IF NOT EXISTS multiturn_runs (
    run_id UUID PRIMARY KEY,
    dataset_name VARCHAR(255) NOT NULL,
    dataset_version VARCHAR(50),
    model_name VARCHAR(255),
    started_at TIMESTAMP WITH TIME ZONE NOT NULL,
    finished_at TIMESTAMP WITH TIME ZONE,
    conversation_count INTEGER DEFAULT 0,
    turn_count INTEGER DEFAULT 0,
    metrics_evaluated JSONB,
    drift_threshold DOUBLE PRECISION,
    summary JSONB,
    metadata JSONB,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_multiturn_runs_dataset ON multiturn_runs(dataset_name);
CREATE INDEX IF NOT EXISTS idx_multiturn_runs_started_at ON multiturn_runs(started_at DESC);

-- One row per conversation in a run, with its drift result.
-- NOTE(review): there is no uniqueness constraint on (run_id, conversation_id);
-- confirm that duplicate conversation rows are acceptable.
CREATE TABLE IF NOT EXISTS multiturn_conversations (
    id SERIAL PRIMARY KEY,
    run_id UUID NOT NULL REFERENCES multiturn_runs(run_id) ON DELETE CASCADE,
    conversation_id VARCHAR(255) NOT NULL,
    turn_count INTEGER DEFAULT 0,
    drift_score DOUBLE PRECISION,
    drift_threshold DOUBLE PRECISION,
    drift_detected BOOLEAN DEFAULT FALSE,
    summary JSONB
);

CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_run_id ON multiturn_conversations(run_id);
CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_conv_id ON multiturn_conversations(conversation_id);

-- One row per evaluated turn. conversation_id is a plain column here, not a
-- foreign key to multiturn_conversations.
CREATE TABLE IF NOT EXISTS multiturn_turn_results (
    id SERIAL PRIMARY KEY,
    run_id UUID NOT NULL REFERENCES multiturn_runs(run_id) ON DELETE CASCADE,
    conversation_id VARCHAR(255) NOT NULL,
    turn_id VARCHAR(255) NOT NULL,
    turn_index INTEGER,
    role VARCHAR(50) NOT NULL,
    passed BOOLEAN DEFAULT FALSE,
    latency_ms INTEGER,
    metadata JSONB
);

CREATE INDEX IF NOT EXISTS idx_multiturn_turns_run_id ON multiturn_turn_results(run_id);
CREATE INDEX IF NOT EXISTS idx_multiturn_turns_conv_id ON multiturn_turn_results(conversation_id);

-- One row per (turn result, metric).
-- DECIMAL(5, 4) keeps four fractional digits and one integer digit, so the
-- largest storable magnitude is 9.9999.
CREATE TABLE IF NOT EXISTS multiturn_metric_scores (
    id SERIAL PRIMARY KEY,
    turn_result_id INTEGER NOT NULL REFERENCES multiturn_turn_results(id) ON DELETE CASCADE,
    metric_name VARCHAR(100) NOT NULL,
    score DECIMAL(5, 4) NOT NULL,
    threshold DECIMAL(5, 4)
);

CREATE INDEX IF NOT EXISTS idx_multiturn_scores_turn_id ON multiturn_metric_scores(turn_result_id);
CREATE INDEX IF NOT EXISTS idx_multiturn_scores_metric_name ON multiturn_metric_scores(metric_name);
|
|
148
|
+
|
|
89
149
|
-- Prompt storage tables
|
|
90
150
|
CREATE TABLE IF NOT EXISTS prompts (
|
|
91
151
|
prompt_id UUID PRIMARY KEY,
|
|
@@ -90,6 +90,69 @@ CREATE TABLE IF NOT EXISTS metric_scores (
|
|
|
90
90
|
CREATE INDEX IF NOT EXISTS idx_scores_result_id ON metric_scores(result_id);
|
|
91
91
|
CREATE INDEX IF NOT EXISTS idx_scores_metric_name ON metric_scores(metric_name);
|
|
92
92
|
|
|
93
|
+
-- Multiturn evaluation tables
-- Four tables form a hierarchy: multiturn_runs is the parent of
-- multiturn_conversations and multiturn_turn_results (both keyed by run_id),
-- and multiturn_turn_results is the parent of multiturn_metric_scores.
-- NOTE: SQLite applies the ON DELETE CASCADE clauses below only on
-- connections where foreign key enforcement is switched on
-- (PRAGMA foreign_keys = ON).

-- One row per multi-turn evaluation run. JSON values are stored as TEXT.
CREATE TABLE IF NOT EXISTS multiturn_runs (
    run_id TEXT PRIMARY KEY,
    dataset_name TEXT NOT NULL,
    dataset_version TEXT,
    model_name TEXT,
    started_at TIMESTAMP NOT NULL,
    finished_at TIMESTAMP,
    conversation_count INTEGER DEFAULT 0,
    turn_count INTEGER DEFAULT 0,
    metrics_evaluated TEXT,  -- JSON array of metric names
    drift_threshold REAL,
    summary TEXT,  -- JSON summary
    metadata TEXT,  -- JSON metadata
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_multiturn_runs_dataset ON multiturn_runs(dataset_name);
CREATE INDEX IF NOT EXISTS idx_multiturn_runs_started_at ON multiturn_runs(started_at DESC);

-- One row per conversation in a run, with its drift result.
-- drift_detected is an INTEGER flag that defaults to 0.
CREATE TABLE IF NOT EXISTS multiturn_conversations (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT NOT NULL,
    conversation_id TEXT NOT NULL,
    turn_count INTEGER DEFAULT 0,
    drift_score REAL,
    drift_threshold REAL,
    drift_detected INTEGER DEFAULT 0,
    summary TEXT,  -- JSON summary
    FOREIGN KEY (run_id) REFERENCES multiturn_runs(run_id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_run_id ON multiturn_conversations(run_id);
CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_conv_id ON multiturn_conversations(conversation_id);

-- One row per evaluated turn. conversation_id is a plain column here, not a
-- foreign key to multiturn_conversations. passed is an INTEGER flag that
-- defaults to 0.
CREATE TABLE IF NOT EXISTS multiturn_turn_results (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    run_id TEXT NOT NULL,
    conversation_id TEXT NOT NULL,
    turn_id TEXT NOT NULL,
    turn_index INTEGER,
    role TEXT NOT NULL,
    passed INTEGER DEFAULT 0,
    latency_ms INTEGER,
    metadata TEXT,  -- JSON metadata
    FOREIGN KEY (run_id) REFERENCES multiturn_runs(run_id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_multiturn_turns_run_id ON multiturn_turn_results(run_id);
CREATE INDEX IF NOT EXISTS idx_multiturn_turns_conv_id ON multiturn_turn_results(conversation_id);

-- One row per (turn result, metric); threshold may be NULL.
CREATE TABLE IF NOT EXISTS multiturn_metric_scores (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    turn_result_id INTEGER NOT NULL,
    metric_name TEXT NOT NULL,
    score REAL NOT NULL,
    threshold REAL,
    FOREIGN KEY (turn_result_id) REFERENCES multiturn_turn_results(id) ON DELETE CASCADE
);

CREATE INDEX IF NOT EXISTS idx_multiturn_scores_turn_id ON multiturn_metric_scores(turn_result_id);
CREATE INDEX IF NOT EXISTS idx_multiturn_scores_metric_name ON multiturn_metric_scores(metric_name);
|
|
155
|
+
|
|
93
156
|
-- Prompt storage tables
|
|
94
157
|
CREATE TABLE IF NOT EXISTS prompts (
|
|
95
158
|
prompt_id TEXT PRIMARY KEY,
|