evalvault 1.70.1-py3-none-any.whl → 1.71.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
Files changed (47)
  1. evalvault/adapters/inbound/api/adapter.py +367 -3
  2. evalvault/adapters/inbound/api/main.py +17 -1
  3. evalvault/adapters/inbound/api/routers/calibration.py +133 -0
  4. evalvault/adapters/inbound/api/routers/runs.py +71 -1
  5. evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
  6. evalvault/adapters/inbound/cli/commands/analyze.py +1 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +1 -1
  8. evalvault/adapters/inbound/cli/commands/experiment.py +27 -1
  9. evalvault/adapters/inbound/cli/commands/graph_rag.py +303 -0
  10. evalvault/adapters/inbound/cli/commands/history.py +1 -1
  11. evalvault/adapters/inbound/cli/commands/regress.py +169 -1
  12. evalvault/adapters/inbound/cli/commands/run.py +225 -1
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +57 -0
  14. evalvault/adapters/outbound/analysis/network_analyzer_module.py +17 -4
  15. evalvault/adapters/outbound/dataset/__init__.py +6 -0
  16. evalvault/adapters/outbound/dataset/multiturn_json_loader.py +111 -0
  17. evalvault/adapters/outbound/report/__init__.py +6 -0
  18. evalvault/adapters/outbound/report/ci_report_formatter.py +43 -0
  19. evalvault/adapters/outbound/report/dashboard_generator.py +24 -9
  20. evalvault/adapters/outbound/report/pr_comment_formatter.py +50 -0
  21. evalvault/adapters/outbound/retriever/__init__.py +8 -0
  22. evalvault/adapters/outbound/retriever/graph_rag_adapter.py +326 -0
  23. evalvault/adapters/outbound/storage/base_sql.py +291 -0
  24. evalvault/adapters/outbound/storage/postgres_adapter.py +130 -0
  25. evalvault/adapters/outbound/storage/postgres_schema.sql +60 -0
  26. evalvault/adapters/outbound/storage/schema.sql +63 -0
  27. evalvault/adapters/outbound/storage/sqlite_adapter.py +107 -0
  28. evalvault/domain/entities/__init__.py +20 -0
  29. evalvault/domain/entities/graph_rag.py +30 -0
  30. evalvault/domain/entities/multiturn.py +78 -0
  31. evalvault/domain/metrics/__init__.py +10 -0
  32. evalvault/domain/metrics/multiturn_metrics.py +113 -0
  33. evalvault/domain/metrics/registry.py +36 -0
  34. evalvault/domain/services/__init__.py +8 -0
  35. evalvault/domain/services/evaluator.py +5 -2
  36. evalvault/domain/services/graph_rag_experiment.py +155 -0
  37. evalvault/domain/services/multiturn_evaluator.py +187 -0
  38. evalvault/ports/inbound/__init__.py +2 -0
  39. evalvault/ports/inbound/multiturn_port.py +23 -0
  40. evalvault/ports/inbound/web_port.py +4 -0
  41. evalvault/ports/outbound/graph_retriever_port.py +24 -0
  42. evalvault/ports/outbound/storage_port.py +25 -0
  43. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/METADATA +1 -1
  44. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/RECORD +47 -33
  45. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/WHEEL +0 -0
  46. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/entry_points.txt +0 -0
  47. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -14,6 +14,9 @@ from evalvault.domain.entities import (
      EvaluationRun,
      FeedbackSummary,
      MetricScore,
+     MultiTurnConversationRecord,
+     MultiTurnRunRecord,
+     MultiTurnTurnResult,
      RunClusterMap,
      RunClusterMapInfo,
      SatisfactionFeedback,
@@ -72,6 +75,45 @@ class SQLQueries:
              ) VALUES ({values})
          """

+     def insert_multiturn_run(self) -> str:
+         values = self._values(12)
+         return f"""
+             INSERT INTO multiturn_runs (
+                 run_id, dataset_name, dataset_version, model_name,
+                 started_at, finished_at, conversation_count, turn_count,
+                 metrics_evaluated, drift_threshold, summary, metadata
+             ) VALUES ({values})
+         """
+
+     def insert_multiturn_conversation(self) -> str:
+         values = self._values(7)
+         return f"""
+             INSERT INTO multiturn_conversations (
+                 run_id, conversation_id, turn_count, drift_score, drift_threshold,
+                 drift_detected, summary
+             ) VALUES ({values})
+         """
+
+     def insert_multiturn_turn(self) -> str:
+         values = self._values(8)
+         query = f"""
+             INSERT INTO multiturn_turn_results (
+                 run_id, conversation_id, turn_id, turn_index, role,
+                 passed, latency_ms, metadata
+             ) VALUES ({values})
+         """
+         if self._test_case_returning:
+             query = f"{query.strip()} {self._test_case_returning}"
+         return query
+
+     def insert_multiturn_metric_score(self) -> str:
+         values = self._values(4)
+         return f"""
+             INSERT INTO multiturn_metric_scores (
+                 turn_result_id, metric_name, score, threshold
+             ) VALUES ({values})
+         """
+
      def insert_cluster_map(self) -> str:
          values = self._values(7)
          return f"""
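The four INSERT builders above extend the shared SQLQueries class (apparently the new base_sql.py storage layer listed in the file summary) and stay backend-agnostic by never hard-coding a parameter marker: self._values(n) and self.placeholder supply whatever the driver expects. Those helpers are not part of this hunk, so the sketch below is an assumption about their behavior, based on the fact that sqlite3 uses "?" and psycopg uses "%s" as placeholders.

# Sketch only -- _values()/placeholder are not shown in this diff; this models
# the behavior the builders above rely on, not the package's implementation.
class PlaceholderQueries:
    def __init__(self, placeholder: str) -> None:
        self.placeholder = placeholder  # "?" for sqlite3, "%s" for psycopg

    def _values(self, n: int) -> str:
        # e.g. "?, ?, ?" or "%s, %s, %s"
        return ", ".join([self.placeholder] * n)

print(PlaceholderQueries("?")._values(3))    # ?, ?, ?
print(PlaceholderQueries("%s")._values(3))   # %s, %s, %s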
@@ -127,6 +169,41 @@ class SQLQueries:
              ORDER BY id
          """

+     def select_multiturn_run(self) -> str:
+         return f"""
+             SELECT run_id, dataset_name, dataset_version, model_name,
+                    started_at, finished_at, conversation_count, turn_count,
+                    metrics_evaluated, drift_threshold, summary, metadata, created_at
+             FROM multiturn_runs
+             WHERE run_id = {self.placeholder}
+         """
+
+     def select_multiturn_conversations(self) -> str:
+         return f"""
+             SELECT run_id, conversation_id, turn_count, drift_score, drift_threshold,
+                    drift_detected, summary
+             FROM multiturn_conversations
+             WHERE run_id = {self.placeholder}
+             ORDER BY id
+         """
+
+     def select_multiturn_turn_results(self) -> str:
+         return f"""
+             SELECT id, run_id, conversation_id, turn_id, turn_index, role,
+                    passed, latency_ms, metadata
+             FROM multiturn_turn_results
+             WHERE run_id = {self.placeholder}
+             ORDER BY id
+         """
+
+     def select_multiturn_metric_scores(self) -> str:
+         return f"""
+             SELECT turn_result_id, metric_name, score, threshold
+             FROM multiturn_metric_scores
+             WHERE turn_result_id = {self.placeholder}
+             ORDER BY id
+         """
+
      def select_cluster_map(self) -> str:
          return f"""
              SELECT test_case_id, cluster_id, source, map_id, created_at, metadata
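The SELECT builders mirror the write side: one multiturn_runs row per run, its conversations and turn results fetched by run_id, and metric scores fetched per turn via the surrogate turn_result_id. A hedged sketch of that read pattern, reusing the _get_connection/_execute helpers that appear elsewhere in this diff (the loop below is illustrative, not the package's own loader):

# Illustrative read pattern implied by the new SELECT builders; not code from
# the package. "adapter" is any BaseSQLStorageAdapter subclass instance.
def load_multiturn_run(adapter, run_id: str) -> dict:
    q = adapter.queries
    with adapter._get_connection() as conn:
        run = adapter._execute(conn, q.select_multiturn_run(), (run_id,)).fetchone()
        conversations = adapter._execute(
            conn, q.select_multiturn_conversations(), (run_id,)
        ).fetchall()
        turns = adapter._execute(
            conn, q.select_multiturn_turn_results(), (run_id,)
        ).fetchall()
        # metric scores hang off the surrogate id of each turn row
        scores = {
            turn["id"]: adapter._execute(
                conn, q.select_multiturn_metric_scores(), (turn["id"],)
            ).fetchall()
            for turn in turns
        }
    return {"run": run, "conversations": conversations, "turns": turns, "scores": scores}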
@@ -223,6 +300,48 @@ class BaseSQLStorageAdapter(ABC):
              conn.commit()
          return run.run_id

+     def save_multiturn_run(
+         self,
+         run: MultiTurnRunRecord,
+         conversations: list[MultiTurnConversationRecord],
+         turn_results: list[MultiTurnTurnResult],
+         *,
+         metric_thresholds: dict[str, float] | None = None,
+     ) -> str:
+         with self._get_connection() as conn:
+             self._execute(
+                 conn, self.queries.insert_multiturn_run(), self._multiturn_run_params(run)
+             )
+
+             for conversation in conversations:
+                 self._execute(
+                     conn,
+                     self.queries.insert_multiturn_conversation(),
+                     self._multiturn_conversation_params(conversation),
+                 )
+
+             for turn in turn_results:
+                 cursor = self._execute(
+                     conn,
+                     self.queries.insert_multiturn_turn(),
+                     self._multiturn_turn_params(run.run_id, turn),
+                 )
+                 turn_result_id = self._fetch_lastrowid(cursor)
+                 for metric_name, score in (turn.metrics or {}).items():
+                     threshold = None
+                     if metric_thresholds and metric_name in metric_thresholds:
+                         threshold = metric_thresholds[metric_name]
+                     self._execute(
+                         conn,
+                         self.queries.insert_multiturn_metric_score(),
+                         self._multiturn_metric_params(
+                             turn_result_id, metric_name, score, threshold
+                         ),
+                     )
+
+             conn.commit()
+         return run.run_id
+
      def _insert_test_case(self, conn, run_id: str, result: TestCaseResult) -> int:
          cursor = self._execute(
              conn,
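save_multiturn_run persists a whole run in one transaction: the run header, one row per conversation, one row per turn, and one multiturn_metric_scores row per (turn, metric) pair, with thresholds looked up from the optional metric_thresholds mapping. A usage sketch follows; the entity field names are taken from the _multiturn_*_params helpers later in this diff, but the keyword-argument constructors and all concrete values are assumptions.

# Usage sketch only. Field names follow the param helpers shown further down in
# this diff; constructor signatures and every value here are hypothetical.
from datetime import datetime, timezone

from evalvault.domain.entities import (
    MultiTurnConversationRecord,
    MultiTurnRunRecord,
    MultiTurnTurnResult,
)

now = datetime.now(timezone.utc)
run = MultiTurnRunRecord(
    run_id="run-0001", dataset_name="support-conversations", dataset_version="v1",
    model_name="example-model", started_at=now, finished_at=now,
    conversation_count=1, turn_count=1, metrics_evaluated=["relevance"],
    drift_threshold=0.3, summary={"pass_rate": 1.0}, metadata={},
)
conversation = MultiTurnConversationRecord(
    run_id=run.run_id, conversation_id="conv-1", turn_count=1,
    drift_score=0.1, drift_threshold=0.3, drift_detected=False, summary={},
)
turn = MultiTurnTurnResult(
    conversation_id="conv-1", turn_id="t-1", turn_index=0, role="assistant",
    passed=True, latency_ms=420, metadata={}, metrics={"relevance": 0.92},
)
# "adapter" is a concrete storage adapter (the SQLite or PostgreSQL subclass).
adapter.save_multiturn_run(run, [conversation], [turn],
                           metric_thresholds={"relevance": 0.7})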
@@ -515,6 +634,56 @@ class BaseSQLStorageAdapter(ABC):
              metric.reason,
          )

+     def _multiturn_run_params(self, run: MultiTurnRunRecord) -> Sequence[Any]:
+         return (
+             run.run_id,
+             run.dataset_name,
+             run.dataset_version,
+             run.model_name,
+             self._serialize_datetime(run.started_at),
+             self._serialize_datetime(run.finished_at),
+             run.conversation_count,
+             run.turn_count,
+             self._serialize_json(run.metrics_evaluated),
+             run.drift_threshold,
+             self._serialize_json(run.summary),
+             self._serialize_json(run.metadata),
+         )
+
+     def _multiturn_conversation_params(
+         self, conversation: MultiTurnConversationRecord
+     ) -> Sequence[Any]:
+         return (
+             conversation.run_id,
+             conversation.conversation_id,
+             conversation.turn_count,
+             conversation.drift_score,
+             conversation.drift_threshold,
+             int(conversation.drift_detected),
+             self._serialize_json(conversation.summary),
+         )
+
+     def _multiturn_turn_params(self, run_id: str, turn: MultiTurnTurnResult) -> Sequence[Any]:
+         return (
+             run_id,
+             turn.conversation_id,
+             turn.turn_id,
+             turn.turn_index,
+             turn.role,
+             int(turn.passed),
+             turn.latency_ms,
+             self._serialize_json(turn.metadata),
+         )
+
+     def _multiturn_metric_params(
+         self,
+         turn_result_id: int,
+         metric_name: str,
+         score: float,
+         threshold: float | None,
+     ) -> Sequence[Any]:
+         return (turn_result_id, metric_name, score, threshold)
+
      def _row_to_test_case(self, conn, row) -> TestCaseResult:
          result_id = row["id"]
          metrics = self._fetch_metric_scores(conn, result_id)
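The param helpers flatten the entities into positional tuples in the column order of the INSERT builders: datetimes go through _serialize_datetime, dicts and lists through _serialize_json, and the booleans drift_detected and passed are cast to int so the same tuples also suit SQLite, which has no native BOOLEAN column type. The serializer implementations are not in this diff; a plausible sketch of what they would do:

# Assumed behavior of the serializers called above (not shown in this diff):
# ISO-8601 text for datetimes, JSON text for dicts/lists, None passed through.
import json
from datetime import datetime


def serialize_datetime(value: datetime | None) -> str | None:
    return value.isoformat() if value is not None else None


def serialize_json(value) -> str | None:
    return json.dumps(value, ensure_ascii=False) if value is not None else None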
@@ -1132,3 +1301,125 @@ class BaseSQLStorageAdapter(ABC):

          workbook.save(output)
          return output
+
+     def export_multiturn_run_to_excel(self, run_id: str, output_path) -> Path:
+         from openpyxl import Workbook
+
+         output = Path(output_path)
+         output.parent.mkdir(parents=True, exist_ok=True)
+         placeholder = self.queries.placeholder
+
+         with self._get_connection() as conn:
+             run_row = self._execute(conn, self.queries.select_multiturn_run(), (run_id,)).fetchone()
+             if not run_row:
+                 raise KeyError(f"Multiturn run not found: {run_id}")
+
+             run_rows = self._normalize_rows(
+                 [run_row],
+                 json_columns={"metrics_evaluated", "summary", "metadata"},
+             )
+
+             conversation_rows = self._execute(
+                 conn, self.queries.select_multiturn_conversations(), (run_id,)
+             ).fetchall()
+             conversation_payloads = self._normalize_rows(
+                 conversation_rows,
+                 json_columns={"summary"},
+             )
+
+             turn_rows = self._execute(
+                 conn, self.queries.select_multiturn_turn_results(), (run_id,)
+             ).fetchall()
+             turn_payloads = self._normalize_rows(
+                 turn_rows,
+                 json_columns={"metadata"},
+             )
+
+             metric_rows = self._execute(
+                 conn,
+                 (
+                     "SELECT m.turn_result_id, t.conversation_id, t.turn_id, t.turn_index, "
+                     "m.metric_name, m.score, m.threshold "
+                     "FROM multiturn_metric_scores m "
+                     "JOIN multiturn_turn_results t ON m.turn_result_id = t.id "
+                     f"WHERE t.run_id = {placeholder} ORDER BY m.id"
+                 ),
+                 (run_id,),
+             ).fetchall()
+             metric_payloads = self._normalize_rows(metric_rows)
+
+         sheet_order: list[tuple[str, list[dict[str, Any]], list[str]]] = [
+             (
+                 "MultiTurnRun",
+                 run_rows,
+                 [
+                     "run_id",
+                     "dataset_name",
+                     "dataset_version",
+                     "model_name",
+                     "started_at",
+                     "finished_at",
+                     "conversation_count",
+                     "turn_count",
+                     "metrics_evaluated",
+                     "drift_threshold",
+                     "summary",
+                     "metadata",
+                     "created_at",
+                 ],
+             ),
+             (
+                 "MultiTurnConversations",
+                 conversation_payloads,
+                 [
+                     "run_id",
+                     "conversation_id",
+                     "turn_count",
+                     "drift_score",
+                     "drift_threshold",
+                     "drift_detected",
+                     "summary",
+                 ],
+             ),
+             (
+                 "MultiTurnTurns",
+                 turn_payloads,
+                 [
+                     "id",
+                     "run_id",
+                     "conversation_id",
+                     "turn_id",
+                     "turn_index",
+                     "role",
+                     "passed",
+                     "latency_ms",
+                     "metadata",
+                 ],
+             ),
+             (
+                 "MultiTurnTurnMetrics",
+                 metric_payloads,
+                 [
+                     "turn_result_id",
+                     "conversation_id",
+                     "turn_id",
+                     "turn_index",
+                     "metric_name",
+                     "score",
+                     "threshold",
+                 ],
+             ),
+         ]
+
+         workbook = Workbook()
+         default_sheet = workbook.active
+         workbook.remove(default_sheet)
+
+         for sheet_name, rows, headers in sheet_order:
+             sheet = workbook.create_sheet(title=sheet_name)
+             sheet.append(headers)
+             for row in rows:
+                 sheet.append([self._row_value(row, header) for header in headers])
+
+         workbook.save(output)
+         return output
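export_multiturn_run_to_excel takes a stored run_id and an output path, raises KeyError for unknown runs, and writes four sheets: MultiTurnRun, MultiTurnConversations, MultiTurnTurns and MultiTurnTurnMetrics. A small usage sketch (the run id is hypothetical; load_workbook is standard openpyxl):

# Usage sketch: export a stored multi-turn run, then inspect the sheets.
from openpyxl import load_workbook

path = adapter.export_multiturn_run_to_excel("run-0001", "reports/multiturn_run.xlsx")
workbook = load_workbook(path)
print(workbook.sheetnames)
# ['MultiTurnRun', 'MultiTurnConversations', 'MultiTurnTurns', 'MultiTurnTurnMetrics']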
@@ -221,6 +221,90 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):
              "CREATE INDEX IF NOT EXISTS idx_feedback_test_case_id ON satisfaction_feedback(test_case_id)"
          )

+         conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS multiturn_runs (
+                 run_id UUID PRIMARY KEY,
+                 dataset_name VARCHAR(255) NOT NULL,
+                 dataset_version VARCHAR(50),
+                 model_name VARCHAR(255),
+                 started_at TIMESTAMP WITH TIME ZONE NOT NULL,
+                 finished_at TIMESTAMP WITH TIME ZONE,
+                 conversation_count INTEGER DEFAULT 0,
+                 turn_count INTEGER DEFAULT 0,
+                 metrics_evaluated JSONB,
+                 drift_threshold DOUBLE PRECISION,
+                 summary JSONB,
+                 metadata JSONB,
+                 created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+             )
+             """
+         )
+         conn.execute(
+             "CREATE INDEX IF NOT EXISTS idx_multiturn_runs_dataset ON multiturn_runs(dataset_name)"
+         )
+         conn.execute(
+             "CREATE INDEX IF NOT EXISTS idx_multiturn_runs_started_at ON multiturn_runs(started_at DESC)"
+         )
+         conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS multiturn_conversations (
+                 id SERIAL PRIMARY KEY,
+                 run_id UUID NOT NULL REFERENCES multiturn_runs(run_id) ON DELETE CASCADE,
+                 conversation_id VARCHAR(255) NOT NULL,
+                 turn_count INTEGER DEFAULT 0,
+                 drift_score DOUBLE PRECISION,
+                 drift_threshold DOUBLE PRECISION,
+                 drift_detected BOOLEAN DEFAULT FALSE,
+                 summary JSONB
+             )
+             """
+         )
+         conn.execute(
+             "CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_run_id ON multiturn_conversations(run_id)"
+         )
+         conn.execute(
+             "CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_conv_id ON multiturn_conversations(conversation_id)"
+         )
+         conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS multiturn_turn_results (
+                 id SERIAL PRIMARY KEY,
+                 run_id UUID NOT NULL REFERENCES multiturn_runs(run_id) ON DELETE CASCADE,
+                 conversation_id VARCHAR(255) NOT NULL,
+                 turn_id VARCHAR(255) NOT NULL,
+                 turn_index INTEGER,
+                 role VARCHAR(50) NOT NULL,
+                 passed BOOLEAN DEFAULT FALSE,
+                 latency_ms INTEGER,
+                 metadata JSONB
+             )
+             """
+         )
+         conn.execute(
+             "CREATE INDEX IF NOT EXISTS idx_multiturn_turns_run_id ON multiturn_turn_results(run_id)"
+         )
+         conn.execute(
+             "CREATE INDEX IF NOT EXISTS idx_multiturn_turns_conv_id ON multiturn_turn_results(conversation_id)"
+         )
+         conn.execute(
+             """
+             CREATE TABLE IF NOT EXISTS multiturn_metric_scores (
+                 id SERIAL PRIMARY KEY,
+                 turn_result_id INTEGER NOT NULL REFERENCES multiturn_turn_results(id) ON DELETE CASCADE,
+                 metric_name VARCHAR(100) NOT NULL,
+                 score DECIMAL(5, 4) NOT NULL,
+                 threshold DECIMAL(5, 4)
+             )
+             """
+         )
+         conn.execute(
+             "CREATE INDEX IF NOT EXISTS idx_multiturn_scores_turn_id ON multiturn_metric_scores(turn_result_id)"
+         )
+         conn.execute(
+             "CREATE INDEX IF NOT EXISTS idx_multiturn_scores_metric_name ON multiturn_metric_scores(metric_name)"
+         )
+
      # Prompt set methods

      def save_prompt_set(self, bundle: PromptSetBundle) -> None:
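The PostgreSQL adapter creates the same four tables at startup with CREATE TABLE IF NOT EXISTS, forming a runs -> conversations / turn results -> metric scores hierarchy in which every child declares ON DELETE CASCADE. One practical consequence, shown below with psycopg 3 (assumed to be the driver, since the adapter calls conn.execute directly): deleting the run row is enough to clean up everything beneath it.

# Illustration of the cascade, not package code. DSN and run id are
# hypothetical; psycopg 3 is assumed as the driver.
import uuid

import psycopg

run_id = uuid.UUID("00000000-0000-0000-0000-000000000000")
with psycopg.connect("postgresql://localhost/evalvault") as conn:
    # conversations, turn results and metric scores for this run go with it
    conn.execute("DELETE FROM multiturn_runs WHERE run_id = %s", (run_id,))
    conn.commit()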
@@ -874,6 +958,52 @@ class PostgreSQLStorageAdapter(BaseSQLStorageAdapter):

          return report_id

+     def list_analysis_reports(
+         self,
+         *,
+         run_id: str,
+         report_type: str | None = None,
+         format: str | None = None,
+         limit: int = 20,
+     ) -> list[dict[str, Any]]:
+         clauses = ["run_id = %s"]
+         params: list[Any] = [run_id]
+         if report_type:
+             clauses.append("report_type = %s")
+             params.append(report_type)
+         if format:
+             clauses.append("format = %s")
+             params.append(format)
+         params.append(limit)
+
+         query = (
+             "SELECT report_id, run_id, experiment_id, report_type, format, content, metadata, created_at "
+             "FROM analysis_reports WHERE "
+             + " AND ".join(clauses)
+             + " ORDER BY created_at DESC LIMIT %s"
+         )
+
+         with self._get_connection() as conn:
+             rows = conn.execute(query, tuple(params)).fetchall()
+
+         reports: list[dict[str, Any]] = []
+         for row in rows:
+             reports.append(
+                 {
+                     "report_id": row["report_id"],
+                     "run_id": row["run_id"],
+                     "experiment_id": row["experiment_id"],
+                     "report_type": row["report_type"],
+                     "format": row["format"],
+                     "content": row["content"],
+                     "metadata": self._deserialize_json(row["metadata"]),
+                     "created_at": row["created_at"].isoformat()
+                     if isinstance(row["created_at"], datetime)
+                     else row["created_at"],
+                 }
+             )
+         return reports
+
      def list_pipeline_results(self, limit: int = 50) -> list[dict[str, Any]]:
          """파이프라인 분석 결과 목록을 조회합니다."""
          query = """
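list_analysis_reports is new on the PostgreSQL adapter: it filters analysis_reports by run_id, optionally by report_type and format, returns the newest rows first, and caps the result at limit. Usage sketch (the run id and filter values are illustrative only):

# Usage sketch; run id and filter values are hypothetical examples.
reports = adapter.list_analysis_reports(
    run_id="00000000-0000-0000-0000-000000000000",
    report_type="summary",     # optional filter, example value
    format="markdown",         # optional filter, example value
    limit=5,
)
for report in reports:
    print(report["created_at"], report["report_type"], report["report_id"])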
@@ -86,6 +86,66 @@ CREATE TABLE IF NOT EXISTS metric_scores (
  CREATE INDEX IF NOT EXISTS idx_scores_result_id ON metric_scores(result_id);
  CREATE INDEX IF NOT EXISTS idx_scores_name ON metric_scores(name);

+ -- Multiturn evaluation tables
+ CREATE TABLE IF NOT EXISTS multiturn_runs (
+     run_id UUID PRIMARY KEY,
+     dataset_name VARCHAR(255) NOT NULL,
+     dataset_version VARCHAR(50),
+     model_name VARCHAR(255),
+     started_at TIMESTAMP WITH TIME ZONE NOT NULL,
+     finished_at TIMESTAMP WITH TIME ZONE,
+     conversation_count INTEGER DEFAULT 0,
+     turn_count INTEGER DEFAULT 0,
+     metrics_evaluated JSONB,
+     drift_threshold DOUBLE PRECISION,
+     summary JSONB,
+     metadata JSONB,
+     created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_multiturn_runs_dataset ON multiturn_runs(dataset_name);
+ CREATE INDEX IF NOT EXISTS idx_multiturn_runs_started_at ON multiturn_runs(started_at DESC);
+
+ CREATE TABLE IF NOT EXISTS multiturn_conversations (
+     id SERIAL PRIMARY KEY,
+     run_id UUID NOT NULL REFERENCES multiturn_runs(run_id) ON DELETE CASCADE,
+     conversation_id VARCHAR(255) NOT NULL,
+     turn_count INTEGER DEFAULT 0,
+     drift_score DOUBLE PRECISION,
+     drift_threshold DOUBLE PRECISION,
+     drift_detected BOOLEAN DEFAULT FALSE,
+     summary JSONB
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_run_id ON multiturn_conversations(run_id);
+ CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_conv_id ON multiturn_conversations(conversation_id);
+
+ CREATE TABLE IF NOT EXISTS multiturn_turn_results (
+     id SERIAL PRIMARY KEY,
+     run_id UUID NOT NULL REFERENCES multiturn_runs(run_id) ON DELETE CASCADE,
+     conversation_id VARCHAR(255) NOT NULL,
+     turn_id VARCHAR(255) NOT NULL,
+     turn_index INTEGER,
+     role VARCHAR(50) NOT NULL,
+     passed BOOLEAN DEFAULT FALSE,
+     latency_ms INTEGER,
+     metadata JSONB
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_multiturn_turns_run_id ON multiturn_turn_results(run_id);
+ CREATE INDEX IF NOT EXISTS idx_multiturn_turns_conv_id ON multiturn_turn_results(conversation_id);
+
+ CREATE TABLE IF NOT EXISTS multiturn_metric_scores (
+     id SERIAL PRIMARY KEY,
+     turn_result_id INTEGER NOT NULL REFERENCES multiturn_turn_results(id) ON DELETE CASCADE,
+     metric_name VARCHAR(100) NOT NULL,
+     score DECIMAL(5, 4) NOT NULL,
+     threshold DECIMAL(5, 4)
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_multiturn_scores_turn_id ON multiturn_metric_scores(turn_result_id);
+ CREATE INDEX IF NOT EXISTS idx_multiturn_scores_metric_name ON multiturn_metric_scores(metric_name);
+
  -- Prompt storage tables
  CREATE TABLE IF NOT EXISTS prompts (
      prompt_id UUID PRIMARY KEY,
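postgres_schema.sql ships the same DDL as the adapter's runtime table creation above, so the tables can also be provisioned ahead of time. Note that score and threshold are DECIMAL(5, 4), i.e. four digits after the decimal point, which fits normalized 0-1 metric scores. A hedged sketch of aggregating those scores for a run (illustrative SQL, not a query the package ships; assumes psycopg 3 and a hypothetical DSN and run id):

# Illustrative aggregate over the new tables; not part of the package.
import uuid

import psycopg

SQL = """
    SELECT s.metric_name, AVG(s.score) AS avg_score, COUNT(*) AS turns
    FROM multiturn_metric_scores s
    JOIN multiturn_turn_results t ON s.turn_result_id = t.id
    WHERE t.run_id = %s
    GROUP BY s.metric_name
    ORDER BY s.metric_name
"""

run_id = uuid.UUID("00000000-0000-0000-0000-000000000000")  # hypothetical
with psycopg.connect("postgresql://localhost/evalvault") as conn:
    for metric_name, avg_score, turns in conn.execute(SQL, (run_id,)):
        print(metric_name, avg_score, turns)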
@@ -90,6 +90,69 @@ CREATE TABLE IF NOT EXISTS metric_scores (
  CREATE INDEX IF NOT EXISTS idx_scores_result_id ON metric_scores(result_id);
  CREATE INDEX IF NOT EXISTS idx_scores_metric_name ON metric_scores(metric_name);

+ -- Multiturn evaluation tables
+ CREATE TABLE IF NOT EXISTS multiturn_runs (
+     run_id TEXT PRIMARY KEY,
+     dataset_name TEXT NOT NULL,
+     dataset_version TEXT,
+     model_name TEXT,
+     started_at TIMESTAMP NOT NULL,
+     finished_at TIMESTAMP,
+     conversation_count INTEGER DEFAULT 0,
+     turn_count INTEGER DEFAULT 0,
+     metrics_evaluated TEXT, -- JSON array of metric names
+     drift_threshold REAL,
+     summary TEXT, -- JSON summary
+     metadata TEXT, -- JSON metadata
+     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_multiturn_runs_dataset ON multiturn_runs(dataset_name);
+ CREATE INDEX IF NOT EXISTS idx_multiturn_runs_started_at ON multiturn_runs(started_at DESC);
+
+ CREATE TABLE IF NOT EXISTS multiturn_conversations (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     run_id TEXT NOT NULL,
+     conversation_id TEXT NOT NULL,
+     turn_count INTEGER DEFAULT 0,
+     drift_score REAL,
+     drift_threshold REAL,
+     drift_detected INTEGER DEFAULT 0,
+     summary TEXT, -- JSON summary
+     FOREIGN KEY (run_id) REFERENCES multiturn_runs(run_id) ON DELETE CASCADE
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_run_id ON multiturn_conversations(run_id);
+ CREATE INDEX IF NOT EXISTS idx_multiturn_conversations_conv_id ON multiturn_conversations(conversation_id);
+
+ CREATE TABLE IF NOT EXISTS multiturn_turn_results (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     run_id TEXT NOT NULL,
+     conversation_id TEXT NOT NULL,
+     turn_id TEXT NOT NULL,
+     turn_index INTEGER,
+     role TEXT NOT NULL,
+     passed INTEGER DEFAULT 0,
+     latency_ms INTEGER,
+     metadata TEXT, -- JSON metadata
+     FOREIGN KEY (run_id) REFERENCES multiturn_runs(run_id) ON DELETE CASCADE
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_multiturn_turns_run_id ON multiturn_turn_results(run_id);
+ CREATE INDEX IF NOT EXISTS idx_multiturn_turns_conv_id ON multiturn_turn_results(conversation_id);
+
+ CREATE TABLE IF NOT EXISTS multiturn_metric_scores (
+     id INTEGER PRIMARY KEY AUTOINCREMENT,
+     turn_result_id INTEGER NOT NULL,
+     metric_name TEXT NOT NULL,
+     score REAL NOT NULL,
+     threshold REAL,
+     FOREIGN KEY (turn_result_id) REFERENCES multiturn_turn_results(id) ON DELETE CASCADE
+ );
+
+ CREATE INDEX IF NOT EXISTS idx_multiturn_scores_turn_id ON multiturn_metric_scores(turn_result_id);
+ CREATE INDEX IF NOT EXISTS idx_multiturn_scores_metric_name ON multiturn_metric_scores(metric_name);
+
  -- Prompt storage tables
  CREATE TABLE IF NOT EXISTS prompts (
      prompt_id TEXT PRIMARY KEY,
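The SQLite variant mirrors the PostgreSQL schema with TEXT/REAL/INTEGER columns, JSON stored as TEXT, and booleans stored as 0/1. Its FOREIGN KEY ... ON DELETE CASCADE clauses only take effect when foreign-key enforcement is switched on for the connection, which SQLite leaves off by default; whether the SQLite adapter sets that pragma is not visible in this diff, so the sketch below does it explicitly.

# SQLite only honors ON DELETE CASCADE when foreign keys are enabled on the
# connection. Database path and run id are hypothetical.
import sqlite3

conn = sqlite3.connect("evalvault.db")
conn.execute("PRAGMA foreign_keys = ON")
conn.execute("DELETE FROM multiturn_runs WHERE run_id = ?", ("run-0001",))
conn.commit()
conn.close()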