evalvault 1.70.1__py3-none-any.whl → 1.71.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. evalvault/adapters/inbound/api/adapter.py +367 -3
  2. evalvault/adapters/inbound/api/main.py +17 -1
  3. evalvault/adapters/inbound/api/routers/calibration.py +133 -0
  4. evalvault/adapters/inbound/api/routers/runs.py +71 -1
  5. evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
  6. evalvault/adapters/inbound/cli/commands/analyze.py +1 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +1 -1
  8. evalvault/adapters/inbound/cli/commands/experiment.py +27 -1
  9. evalvault/adapters/inbound/cli/commands/graph_rag.py +303 -0
  10. evalvault/adapters/inbound/cli/commands/history.py +1 -1
  11. evalvault/adapters/inbound/cli/commands/regress.py +169 -1
  12. evalvault/adapters/inbound/cli/commands/run.py +225 -1
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +57 -0
  14. evalvault/adapters/outbound/analysis/network_analyzer_module.py +17 -4
  15. evalvault/adapters/outbound/dataset/__init__.py +6 -0
  16. evalvault/adapters/outbound/dataset/multiturn_json_loader.py +111 -0
  17. evalvault/adapters/outbound/report/__init__.py +6 -0
  18. evalvault/adapters/outbound/report/ci_report_formatter.py +43 -0
  19. evalvault/adapters/outbound/report/dashboard_generator.py +24 -9
  20. evalvault/adapters/outbound/report/pr_comment_formatter.py +50 -0
  21. evalvault/adapters/outbound/retriever/__init__.py +8 -0
  22. evalvault/adapters/outbound/retriever/graph_rag_adapter.py +326 -0
  23. evalvault/adapters/outbound/storage/base_sql.py +291 -0
  24. evalvault/adapters/outbound/storage/postgres_adapter.py +130 -0
  25. evalvault/adapters/outbound/storage/postgres_schema.sql +60 -0
  26. evalvault/adapters/outbound/storage/schema.sql +63 -0
  27. evalvault/adapters/outbound/storage/sqlite_adapter.py +107 -0
  28. evalvault/domain/entities/__init__.py +20 -0
  29. evalvault/domain/entities/graph_rag.py +30 -0
  30. evalvault/domain/entities/multiturn.py +78 -0
  31. evalvault/domain/metrics/__init__.py +10 -0
  32. evalvault/domain/metrics/multiturn_metrics.py +113 -0
  33. evalvault/domain/metrics/registry.py +36 -0
  34. evalvault/domain/services/__init__.py +8 -0
  35. evalvault/domain/services/evaluator.py +5 -2
  36. evalvault/domain/services/graph_rag_experiment.py +155 -0
  37. evalvault/domain/services/multiturn_evaluator.py +187 -0
  38. evalvault/ports/inbound/__init__.py +2 -0
  39. evalvault/ports/inbound/multiturn_port.py +23 -0
  40. evalvault/ports/inbound/web_port.py +4 -0
  41. evalvault/ports/outbound/graph_retriever_port.py +24 -0
  42. evalvault/ports/outbound/storage_port.py +25 -0
  43. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/METADATA +1 -1
  44. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/RECORD +47 -33
  45. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/WHEEL +0 -0
  46. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/entry_points.txt +0 -0
  47. {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/run.py

@@ -5,9 +5,11 @@ from __future__ import annotations
 import asyncio
 import os
 from collections.abc import Callable, Sequence
+from dataclasses import asdict
 from datetime import date, datetime
 from pathlib import Path
 from typing import Any, cast
+from uuid import uuid4
 
 import click
 import typer
@@ -15,7 +17,7 @@ from rich.console import Console
 from rich.table import Table
 
 from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis_pipeline_service
-from evalvault.adapters.outbound.dataset import get_loader
+from evalvault.adapters.outbound.dataset import get_loader, load_multiturn_dataset
 from evalvault.adapters.outbound.documents.versioned_loader import (
     load_versioned_chunks_from_pdf_dir,
 )
@@ -33,10 +35,16 @@ from evalvault.adapters.outbound.tracer.phoenix_tracer_adapter import PhoenixTra
 from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
 from evalvault.config.settings import Settings, apply_profile
 from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
+from evalvault.domain.entities.multiturn import (
+    MultiTurnConversationRecord,
+    MultiTurnRunRecord,
+    MultiTurnTurnResult,
+)
 from evalvault.domain.services.document_versioning import parse_contract_date
 from evalvault.domain.services.evaluator import RagasEvaluator
 from evalvault.domain.services.memory_aware_evaluator import MemoryAwareEvaluator
 from evalvault.domain.services.memory_based_analysis import MemoryBasedAnalysis
+from evalvault.domain.services.multiturn_evaluator import MultiTurnEvaluator
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
     build_prompt_bundle,
@@ -81,6 +89,7 @@ from .run_helpers import (
     _option_was_provided,
     _print_run_mode_banner,
     _resolve_thresholds,
+    _save_multiturn_to_db,
     _save_results,
     _save_to_db,
     _write_stage_events_jsonl,
@@ -221,21 +230,26 @@ def register_run_commands(
             False,
             "--auto-analyze",
             help="평가 완료 후 통합 분석을 자동 실행하고 보고서를 저장합니다.",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_output: Path | None = typer.Option(
             None,
             "--analysis-json",
             help="자동 분석 JSON 결과 파일 경로 (기본값: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_report: Path | None = typer.Option(
             None,
             "--analysis-report",
+            "--report",
             help="자동 분석 Markdown 보고서 경로 (기본값: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_dir: Path | None = typer.Option(
             None,
             "--analysis-dir",
             help="자동 분석 결과 저장 디렉터리 (기본: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         retriever: str | None = typer.Option(
             None,
@@ -428,6 +442,18 @@ def register_run_commands(
             help="실행 모드 선택: 'simple'은 간편 실행, 'full'은 모든 옵션 노출.",
             rich_help_panel="Run modes",
         ),
+        max_turns: int | None = typer.Option(
+            None,
+            "--max-turns",
+            help="멀티턴 모드에서 사용할 최대 턴 수 (지정 시 앞에서부터 절단).",
+            rich_help_panel="Multiturn options",
+        ),
+        drift_threshold: float = typer.Option(
+            0.1,
+            "--drift-threshold",
+            help="멀티턴 모드에서 드리프트 경고 임계값.",
+            rich_help_panel="Multiturn options",
+        ),
         db_path: Path | None = db_option(
             help_text="Path to SQLite database file for storing results.",
         ),
@@ -462,6 +488,7 @@ def register_run_commands(
             False,
             "--verbose",
             "-v",
+            "-V",
             help="Show detailed output.",
         ),
         parallel: bool = typer.Option(
@@ -983,6 +1010,191 @@ def register_run_commands(
         if threshold_profile:
             phoenix_trace_metadata["threshold.profile"] = str(threshold_profile).strip().lower()
 
+        if preset.name == "multiturn":
+            llm_factory = SettingsLLMFactory(settings)
+            korean_toolkit = try_create_korean_toolkit()
+            evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
+            try:
+                llm_adapter = get_llm_adapter(settings)
+            except Exception as exc:
+                provider = str(getattr(settings, "llm_provider", "")).strip().lower()
+                fixes: list[str]
+                if provider == "ollama":
+                    fixes = [
+                        "Ollama 서버가 실행 중인지 확인하세요 (기본: http://localhost:11434).",
+                        "필요 모델을 받아두세요: `ollama pull gpt-oss-safeguard:20b` 및 `ollama pull qwen3-embedding:0.6b`.",
+                        "URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
+                    ]
+                elif provider == "openai":
+                    fixes = [
+                        "`.env`에 `OPENAI_API_KEY`를 설정하세요.",
+                        "프록시/네트워크가 필요한 환경이면 연결 가능 여부를 확인하세요.",
+                    ]
+                elif provider == "vllm":
+                    fixes = [
+                        "`.env`의 `VLLM_BASE_URL`/`VLLM_MODEL` 설정을 확인하세요.",
+                        "vLLM 서버가 OpenAI 호환 API로 실행 중인지 확인하세요.",
+                    ]
+                else:
+                    fixes = ["--profile 또는 환경변수 설정을 확인하세요."]
+                print_cli_error(
+                    console,
+                    "LLM/임베딩 어댑터를 초기화하지 못했습니다.",
+                    details=str(exc),
+                    fixes=fixes,
+                )
+                raise typer.Exit(1) from exc
+
+            multiturn_started_at = datetime.now()
+            _log_timestamp(console, verbose, "멀티턴 데이터셋 로딩 시작")
+            try:
+                multiturn_dataset = load_multiturn_dataset(dataset)
+            except Exception as exc:
+                _log_duration(console, verbose, "멀티턴 데이터셋 로딩 실패", multiturn_started_at)
+                print_cli_error(
+                    console,
+                    "멀티턴 데이터셋을 불러오지 못했습니다.",
+                    details=str(exc),
+                    fixes=[
+                        "파일 경로/형식을 확인하세요.",
+                        "멀티턴 스키마(turns, conversation_id)가 문서와 동일한지 확인하세요.",
+                    ],
+                )
+                raise typer.Exit(1) from exc
+            _log_duration(console, verbose, "멀티턴 데이터셋 로딩 완료", multiturn_started_at)
+
+            if stream:
+                print_cli_warning(
+                    console,
+                    "멀티턴 모드에서는 streaming 옵션을 무시합니다.",
+                    tips=["--stream을 제거하거나 일반 모드로 실행하세요."],
+                )
+            if retriever:
+                print_cli_warning(
+                    console,
+                    "멀티턴 모드에서는 retriever 적용을 지원하지 않습니다.",
+                    tips=["단일 턴 모드에서 retriever를 사용하세요."],
+                )
+            if use_domain_memory:
+                print_cli_warning(
+                    console,
+                    "멀티턴 모드에서는 Domain Memory를 지원하지 않습니다.",
+                    tips=["--use-domain-memory 옵션을 제거하세요."],
+                )
+
+            if max_turns and max_turns > 0:
+                trimmed = 0
+                for case in multiturn_dataset.test_cases:
+                    if len(case.turns) > max_turns:
+                        case.turns = case.turns[:max_turns]
+                        trimmed += 1
+                if trimmed:
+                    console.print(f"[dim]Trimmed turns in {trimmed} conversation(s).[/dim]")
+
+            evaluation_started_at = datetime.now()
+            multiturn_evaluator = MultiTurnEvaluator(evaluator=evaluator, llm=llm_adapter)
+            results = []
+            drift_flags = 0
+            turn_count = 0
+            for case in multiturn_dataset.test_cases:
+                result = multiturn_evaluator.evaluate_conversation(case, metric_list)
+                drift = multiturn_evaluator.detect_drift(case, threshold=drift_threshold)
+                result.summary["drift_detected"] = drift.drift_detected
+                result.summary["drift_threshold"] = drift.drift_threshold
+                result.summary["drift_score"] = drift.drift_score
+                results.append(result)
+                turn_count += len(result.turn_results)
+                if drift.drift_detected:
+                    drift_flags += 1
+
+            multiturn_summary: dict[str, object] = {
+                "conversation_count": len(results),
+                "turn_count": turn_count,
+                "drift_detected_count": drift_flags,
+                "drift_threshold": drift_threshold,
+            }
+            for metric in metric_list:
+                scores = [
+                    result.summary.get(metric)
+                    for result in results
+                    if isinstance(result.summary.get(metric), (int, float))
+                ]
+                if scores:
+                    multiturn_summary[metric] = sum(scores) / len(scores)
+
+            payload = {
+                "dataset": {
+                    "name": multiturn_dataset.name,
+                    "version": multiturn_dataset.version,
+                    "metadata": multiturn_dataset.metadata,
+                    "source_file": multiturn_dataset.source_file,
+                },
+                "metrics": metric_list,
+                "summary": multiturn_summary,
+                "conversations": [asdict(item) for item in results],
+            }
+
+            table = Table(title="Multi-turn Summary", show_header=True, header_style="bold cyan")
+            table.add_column("Metric", style="bold")
+            table.add_column("Value", justify="right")
+            for metric in metric_list:
+                value = multiturn_summary.get(metric)
+                if isinstance(value, float):
+                    display = f"{value:.3f}"
+                else:
+                    display = str(value) if value is not None else "-"
+                table.add_row(metric, display)
+            table.add_row("conversation_count", str(multiturn_summary.get("conversation_count")))
+            table.add_row("turn_count", str(multiturn_summary.get("turn_count")))
+            table.add_row("drift_detected", str(multiturn_summary.get("drift_detected_count")))
+            console.print(table)
+
+            if output:
+                write_json(output, payload)
+                console.print(f"[green]멀티턴 결과 저장:[/green] {output}")
+            if db_path:
+                run_id = str(uuid4())
+                run_record = MultiTurnRunRecord(
+                    run_id=run_id,
+                    dataset_name=multiturn_dataset.name,
+                    dataset_version=multiturn_dataset.version,
+                    model_name=llm_adapter.get_model_name(),
+                    started_at=evaluation_started_at,
+                    finished_at=datetime.now(),
+                    conversation_count=len(results),
+                    turn_count=turn_count,
+                    metrics_evaluated=list(metric_list),
+                    drift_threshold=drift_threshold,
+                    summary=multiturn_summary,
+                    metadata={"dataset": multiturn_dataset.metadata},
+                )
+                conversation_records = [
+                    MultiTurnConversationRecord(
+                        run_id=run_id,
+                        conversation_id=conversation.conversation_id,
+                        turn_count=len(conversation.turn_results),
+                        drift_score=conversation.summary.get("drift_score"),
+                        drift_threshold=conversation.summary.get("drift_threshold"),
+                        drift_detected=bool(conversation.summary.get("drift_detected")),
+                        summary=dict(conversation.summary),
+                    )
+                    for conversation in results
+                ]
+                turn_results: list[MultiTurnTurnResult] = []
+                for conversation in results:
+                    for turn in conversation.turn_results:
+                        turn_results.append(turn)
+                _save_multiturn_to_db(
+                    db_path,
+                    run_record,
+                    conversation_records,
+                    turn_results,
+                    console,
+                    export_excel=True,
+                    excel_output_path=excel_output,
+                )
+            return
+
         # Load dataset or configure streaming metadata
         if stream:
             stream_started_at = datetime.now()
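For orientation, the JSON written by the new multiturn branch above (via `write_json(output, payload)`) roughly has the shape sketched below. This is a hand-built illustration derived from the `payload` dict in the hunk; the dataset name, scores, and counts are invented, and the `conversations` entries are simply whatever `asdict(...)` produces for the evaluator's per-conversation results.

```python
# Illustrative sketch only -- values are made up; the structure mirrors `payload` above.
payload_example = {
    "dataset": {
        "name": "demo-multiturn",            # multiturn_dataset.name
        "version": "1.0.0",                  # multiturn_dataset.version
        "metadata": {},                      # multiturn_dataset.metadata
        "source_file": "data/multiturn.json",
    },
    "metrics": ["turn_faithfulness", "context_coherence", "drift_rate"],
    "summary": {
        "conversation_count": 2,
        "turn_count": 8,
        "drift_detected_count": 1,
        "drift_threshold": 0.1,
        "turn_faithfulness": 0.87,           # mean over conversations that report the metric
    },
    "conversations": [
        # one asdict(...) dump per conversation result, including its summary
        # (with drift_detected / drift_threshold / drift_score) and per-turn results
    ],
}
```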
@@ -2120,21 +2332,26 @@ def register_run_commands(
             False,
             "--auto-analyze",
             help="평가 완료 후 통합 분석을 자동 실행하고 보고서를 저장합니다.",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_output: Path | None = typer.Option(
             None,
             "--analysis-json",
             help="자동 분석 JSON 결과 파일 경로 (기본값: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_report: Path | None = typer.Option(
             None,
             "--analysis-report",
+            "--report",
             help="자동 분석 Markdown 보고서 경로 (기본값: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_dir: Path | None = typer.Option(
             None,
             "--analysis-dir",
             help="자동 분석 결과 저장 디렉터리 (기본: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         retriever: str | None = typer.Option(
             None,
@@ -2273,6 +2490,7 @@
         verbose: bool = typer.Option(
             False,
             "--verbose",
+            "-V",
             help="Show detailed output.",
         ),
         parallel: bool = typer.Option(
@@ -2406,21 +2624,26 @@
             False,
             "--auto-analyze",
             help="평가 완료 후 통합 분석을 자동 실행하고 보고서를 저장합니다.",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_output: Path | None = typer.Option(
             None,
             "--analysis-json",
             help="자동 분석 JSON 결과 파일 경로 (기본값: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_report: Path | None = typer.Option(
             None,
             "--analysis-report",
+            "--report",
             help="자동 분석 Markdown 보고서 경로 (기본값: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         analysis_dir: Path | None = typer.Option(
             None,
             "--analysis-dir",
             help="자동 분석 결과 저장 디렉터리 (기본: reports/analysis).",
+            rich_help_panel="Auto Analysis",
         ),
         retriever: str | None = typer.Option(
             None,
@@ -2559,6 +2782,7 @@
         verbose: bool = typer.Option(
             False,
             "--verbose",
+            "-V",
             help="Show detailed output.",
         ),
         parallel: bool = typer.Option(
evalvault/adapters/inbound/cli/commands/run_helpers.py

@@ -29,6 +29,9 @@ from evalvault.domain.entities import (
     Dataset,
     EvaluationRun,
     GenerationData,
+    MultiTurnConversationRecord,
+    MultiTurnRunRecord,
+    MultiTurnTurnResult,
     PromptSetBundle,
     RAGTraceData,
     RetrievalData,
@@ -86,6 +89,14 @@ RUN_MODE_PRESETS: dict[str, RunModePreset] = {
         label="Full",
         description="모든 CLI 옵션과 Domain Memory, Prompt manifest를 활용하는 전체 모드.",
     ),
+    "multiturn": RunModePreset(
+        name="multiturn",
+        label="Multiturn",
+        description="멀티턴 대화 평가 전용 모드 (멀티턴 메트릭만 지원).",
+        default_metrics=("turn_faithfulness", "context_coherence", "drift_rate"),
+        allow_domain_memory=False,
+        allow_prompt_metadata=False,
+    ),
 }
 
 SUMMARY_METRIC_ORDER = (
@@ -490,6 +501,52 @@ def _save_to_db(
     )
 
 
+def _save_multiturn_to_db(
+    db_path: Path,
+    run_record: MultiTurnRunRecord,
+    conversations: list[MultiTurnConversationRecord],
+    turn_results: list[MultiTurnTurnResult],
+    console: Console,
+    *,
+    storage_cls: type[SQLiteStorageAdapter] = SQLiteStorageAdapter,
+    export_excel: bool = True,
+    excel_output_path: Path | None = None,
+    metric_thresholds: dict[str, float] | None = None,
+) -> None:
+    """Persist multiturn evaluation run to SQLite database."""
+    with console.status(f"[bold green]Saving multiturn run to {db_path}..."):
+        try:
+            storage = storage_cls(db_path=db_path)
+            storage.save_multiturn_run(
+                run_record,
+                conversations,
+                turn_results,
+                metric_thresholds=metric_thresholds,
+            )
+            if export_excel:
+                excel_path = excel_output_path or (
+                    db_path.parent / f"evalvault_multiturn_{run_record.run_id}.xlsx"
+                )
+                try:
+                    storage.export_multiturn_run_to_excel(run_record.run_id, excel_path)
+                    console.print(f"[green]Multiturn Excel export saved: {excel_path}[/green]")
+                except Exception as exc:
+                    print_cli_warning(
+                        console,
+                        "멀티턴 엑셀 내보내기에 실패했습니다.",
+                        tips=[str(exc)],
+                    )
+            console.print(f"[green]Multiturn results saved to database: {db_path}[/green]")
+            console.print(f"[dim]Run ID: {run_record.run_id}[/dim]")
+        except Exception as exc:  # pragma: no cover - persistence errors
+            print_cli_error(
+                console,
+                "멀티턴 결과를 데이터베이스에 저장하지 못했습니다.",
+                details=str(exc),
+                fixes=["경로 권한과 DB 파일 잠금 상태를 확인하세요."],
+            )
+
+
 def _save_results(output: Path, result, console: Console) -> None:
     """Write evaluation summary to disk."""
     with console.status(f"[bold green]Saving to {output}..."):
evalvault/adapters/outbound/analysis/network_analyzer_module.py

@@ -1,7 +1,11 @@
 from __future__ import annotations
 
+import contextlib
+import os
+import sys
 from dataclasses import dataclass, field
 from datetime import datetime
+from importlib import import_module
 from typing import Any
 
 import networkx as nx
@@ -12,10 +16,18 @@ from evalvault.adapters.outbound.analysis.pipeline_helpers import (
     to_serializable,
 )
 
-try:
-    import matplotlib.pyplot as plt
-except ImportError:
-    plt = None
+
+def _get_matplotlib_pyplot() -> Any | None:
+    try:
+        if "matplotlib.pyplot" in sys.modules:
+            return import_module("matplotlib.pyplot")
+        os.environ.setdefault("MPLBACKEND", "Agg")
+        matplotlib = import_module("matplotlib")
+        with contextlib.suppress(Exception):
+            matplotlib.use("Agg", force=True)
+        return import_module("matplotlib.pyplot")
+    except ModuleNotFoundError:
+        return None
 
 
 @dataclass
@@ -173,6 +185,7 @@ class NetworkAnalyzerModule(BaseAnalysisModule):
         output_path: str | None = None,
         figsize: tuple[int, int] = (12, 8),
     ) -> Any | None:
+        plt = _get_matplotlib_pyplot()
         if plt is None:
             return None
 
evalvault/adapters/outbound/dataset/__init__.py

@@ -6,6 +6,10 @@ from evalvault.adapters.outbound.dataset.excel_loader import ExcelDatasetLoader
 from evalvault.adapters.outbound.dataset.json_loader import JSONDatasetLoader
 from evalvault.adapters.outbound.dataset.loader_factory import get_loader, register_loader
 from evalvault.adapters.outbound.dataset.method_input_loader import MethodInputDatasetLoader
+from evalvault.adapters.outbound.dataset.multiturn_json_loader import (
+    MultiTurnDataset,
+    load_multiturn_dataset,
+)
 from evalvault.adapters.outbound.dataset.streaming_loader import (
     StreamingConfig,
     StreamingCSVLoader,
@@ -23,6 +27,7 @@ __all__ = [
     "ExcelDatasetLoader",
     "JSONDatasetLoader",
     "MethodInputDatasetLoader",
+    "MultiTurnDataset",
     "StreamingCSVLoader",
     "StreamingConfig",
     "StreamingDatasetLoader",
@@ -31,6 +36,7 @@ __all__ = [
     "StreamingTestCaseIterator",
     "get_loader",
     "load_in_chunks",
+    "load_multiturn_dataset",
     "register_loader",
     "stream_file",
 ]
evalvault/adapters/outbound/dataset/multiturn_json_loader.py

@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities.multiturn import ConversationTurn, MultiTurnTestCase
+
+
+@dataclass(frozen=True)
+class MultiTurnDataset:
+    name: str
+    version: str
+    test_cases: list[MultiTurnTestCase]
+    metadata: dict[str, Any]
+    source_file: str | None = None
+
+
+def load_multiturn_dataset(file_path: str | Path) -> MultiTurnDataset:
+    path = Path(file_path)
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {file_path}")
+    if not path.is_file():
+        raise ValueError(f"Path is not a file: {file_path}")
+
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"Invalid JSON file: {exc}") from exc
+
+    name = path.stem
+    version = "1.0.0"
+    metadata: dict[str, Any] = {}
+    raw_cases: list[dict[str, Any]]
+
+    if isinstance(payload, list):
+        raw_cases = payload
+    elif isinstance(payload, dict):
+        name = str(payload.get("name") or name)
+        version = str(payload.get("version") or version)
+        metadata = payload.get("metadata") or {}
+        if not isinstance(metadata, dict):
+            raise ValueError("metadata must be a JSON object")
+        raw_cases = payload.get("test_cases") or payload.get("conversations") or []
+    else:
+        raise ValueError("JSON must be an array or object with 'test_cases' key")
+
+    if not isinstance(raw_cases, list):
+        raise ValueError("test_cases must be a list")
+
+    test_cases: list[MultiTurnTestCase] = []
+    for idx, raw_case in enumerate(raw_cases, start=1):
+        if not isinstance(raw_case, dict):
+            raise ValueError(f"test_cases[{idx}] must be an object")
+        conversation_id = raw_case.get("conversation_id") or raw_case.get("id")
+        if not conversation_id:
+            raise ValueError(f"test_cases[{idx}] missing conversation_id")
+        raw_turns = raw_case.get("turns")
+        if not isinstance(raw_turns, list) or not raw_turns:
+            raise ValueError(f"test_cases[{idx}] missing turns list")
+
+        turns: list[ConversationTurn] = []
+        for t_idx, raw_turn in enumerate(raw_turns, start=1):
+            if not isinstance(raw_turn, dict):
+                raise ValueError(f"turns[{t_idx}] must be an object")
+            role = raw_turn.get("role")
+            if role not in {"user", "assistant"}:
+                raise ValueError(f"turns[{t_idx}] role must be 'user' or 'assistant'")
+            content = raw_turn.get("content")
+            if content is None:
+                raise ValueError(f"turns[{t_idx}] missing content")
+            turn_id = raw_turn.get("turn_id") or f"t{t_idx:02d}"
+            contexts = raw_turn.get("contexts")
+            if contexts is None:
+                contexts = []
+            if isinstance(contexts, str):
+                contexts = [contexts]
+            if not isinstance(contexts, list):
+                raise ValueError(f"turns[{t_idx}] contexts must be a list")
+            ground_truth = raw_turn.get("ground_truth")
+            metadata_value = raw_turn.get("metadata") or {}
+            if not isinstance(metadata_value, dict):
+                raise ValueError(f"turns[{t_idx}] metadata must be an object")
+            turns.append(
+                ConversationTurn(
+                    turn_id=str(turn_id),
+                    role=role,
+                    content=str(content),
+                    contexts=[str(ctx) for ctx in contexts],
+                    ground_truth=ground_truth,
+                    metadata=metadata_value,
+                )
+            )
+
+        test_cases.append(
+            MultiTurnTestCase(
+                conversation_id=str(conversation_id),
+                turns=turns,
+                expected_final_answer=raw_case.get("expected_final_answer"),
+                drift_tolerance=float(raw_case.get("drift_tolerance", 0.1)),
+            )
+        )
+
+    return MultiTurnDataset(
+        name=name,
+        version=version,
+        test_cases=test_cases,
+        metadata=metadata,
+        source_file=str(path),
+    )
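A minimal input file this loader accepts, sketched from the validation above. Only the key names and types are taken from the loader; the file name and every value below are illustrative.

```python
# Sketch of a minimal multiturn dataset accepted by load_multiturn_dataset (illustrative values).
import json
from pathlib import Path

from evalvault.adapters.outbound.dataset import load_multiturn_dataset

sample = {
    "name": "demo-multiturn",
    "version": "1.0.0",
    "metadata": {"domain": "demo"},
    "test_cases": [
        {
            "conversation_id": "conv-001",
            "drift_tolerance": 0.1,
            "turns": [
                {"role": "user", "content": "What does the policy cover?"},
                {
                    "role": "assistant",
                    "content": "It covers hospitalization and surgery costs.",
                    "contexts": ["Policy section 2: covered benefits ..."],
                    "ground_truth": "Hospitalization and surgery are covered.",
                },
            ],
        }
    ],
}

# Write the file, then load it back through the new loader.
path = Path("multiturn_demo.json")
path.write_text(json.dumps(sample, ensure_ascii=False, indent=2), encoding="utf-8")

dataset = load_multiturn_dataset(path)
print(dataset.name, len(dataset.test_cases), len(dataset.test_cases[0].turns))
```

Turns without `turn_id`, `contexts`, or `metadata` fall back to the defaults shown in the loader (`t01`-style ids, an empty context list, and an empty metadata dict).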
evalvault/adapters/outbound/report/__init__.py

@@ -1,5 +1,9 @@
 """Report generation adapters."""
 
+from evalvault.adapters.outbound.report.ci_report_formatter import (
+    CIGateMetricRow,
+    format_ci_regression_report,
+)
 from evalvault.adapters.outbound.report.dashboard_generator import DashboardGenerator
 from evalvault.adapters.outbound.report.llm_report_generator import (
     LLMReport,
@@ -9,7 +13,9 @@ from evalvault.adapters.outbound.report.llm_report_generator import (
 from evalvault.adapters.outbound.report.markdown_adapter import MarkdownReportAdapter
 
 __all__ = [
+    "CIGateMetricRow",
     "DashboardGenerator",
+    "format_ci_regression_report",
     "LLMReport",
     "LLMReportGenerator",
     "LLMReportSection",
evalvault/adapters/outbound/report/ci_report_formatter.py

@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class CIGateMetricRow:
+    metric: str
+    baseline_score: float
+    current_score: float
+    change_percent: float
+    status: str
+
+
+def format_ci_regression_report(
+    rows: list[CIGateMetricRow],
+    *,
+    regression_rate: float,
+    regression_threshold: float,
+    gate_passed: bool,
+) -> str:
+    lines: list[str] = ["## RAG Regression Gate Results", ""]
+    lines.append("| Metric | Baseline | Current | Change | Status |")
+    lines.append("|--------|----------|---------|--------|--------|")
+    for row in rows:
+        change = f"{row.change_percent:+.1f}%"
+        lines.append(
+            f"| {row.metric} | {row.baseline_score:.3f} | {row.current_score:.3f} | {change} | {row.status} |"
+        )
+    lines.append("")
+    if gate_passed:
+        status_line = "✅ PASSED"
+        comparison = "<"
+    else:
+        status_line = "❌ FAILED"
+        comparison = ">="
+    lines.append(
+        f"**Gate Status**: {status_line} (regression: {regression_rate:.1%} {comparison} {regression_threshold:.1%} threshold)"
+    )
+    return "\n".join(lines).strip()
+
+
+__all__ = ["CIGateMetricRow", "format_ci_regression_report"]
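A usage sketch for the new formatter. The rows, status labels, and thresholds below are invented for illustration (`status` is a free-form string, so "regressed"/"ok" are just example labels), and computing `regression_rate` and `gate_passed` is left to the caller.

```python
# Usage sketch; scores, status labels, and thresholds are illustrative only.
from evalvault.adapters.outbound.report import CIGateMetricRow, format_ci_regression_report

rows = [
    CIGateMetricRow(
        metric="faithfulness",
        baseline_score=0.912,
        current_score=0.874,
        change_percent=-4.2,
        status="regressed",
    ),
    CIGateMetricRow(
        metric="answer_relevancy",
        baseline_score=0.880,
        current_score=0.891,
        change_percent=1.2,
        status="ok",
    ),
]

# With one of two metrics regressed, the caller-computed regression rate is 0.5;
# against a 0.2 threshold the gate fails, so the report's last line reads
# "**Gate Status**: ❌ FAILED (regression: 50.0% >= 20.0% threshold)".
print(
    format_ci_regression_report(
        rows,
        regression_rate=0.5,
        regression_threshold=0.2,
        gate_passed=False,
    )
)
```

The returned string is Markdown, which matches how the PR comment formatter in this release (`pr_comment_formatter.py`) is expected to consume it.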