evalvault 1.70.1__py3-none-any.whl → 1.71.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +367 -3
- evalvault/adapters/inbound/api/main.py +17 -1
- evalvault/adapters/inbound/api/routers/calibration.py +133 -0
- evalvault/adapters/inbound/api/routers/runs.py +71 -1
- evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +1 -0
- evalvault/adapters/inbound/cli/commands/compare.py +1 -1
- evalvault/adapters/inbound/cli/commands/experiment.py +27 -1
- evalvault/adapters/inbound/cli/commands/graph_rag.py +303 -0
- evalvault/adapters/inbound/cli/commands/history.py +1 -1
- evalvault/adapters/inbound/cli/commands/regress.py +169 -1
- evalvault/adapters/inbound/cli/commands/run.py +225 -1
- evalvault/adapters/inbound/cli/commands/run_helpers.py +57 -0
- evalvault/adapters/outbound/analysis/network_analyzer_module.py +17 -4
- evalvault/adapters/outbound/dataset/__init__.py +6 -0
- evalvault/adapters/outbound/dataset/multiturn_json_loader.py +111 -0
- evalvault/adapters/outbound/report/__init__.py +6 -0
- evalvault/adapters/outbound/report/ci_report_formatter.py +43 -0
- evalvault/adapters/outbound/report/dashboard_generator.py +24 -9
- evalvault/adapters/outbound/report/pr_comment_formatter.py +50 -0
- evalvault/adapters/outbound/retriever/__init__.py +8 -0
- evalvault/adapters/outbound/retriever/graph_rag_adapter.py +326 -0
- evalvault/adapters/outbound/storage/base_sql.py +291 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +130 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +60 -0
- evalvault/adapters/outbound/storage/schema.sql +63 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +107 -0
- evalvault/domain/entities/__init__.py +20 -0
- evalvault/domain/entities/graph_rag.py +30 -0
- evalvault/domain/entities/multiturn.py +78 -0
- evalvault/domain/metrics/__init__.py +10 -0
- evalvault/domain/metrics/multiturn_metrics.py +113 -0
- evalvault/domain/metrics/registry.py +36 -0
- evalvault/domain/services/__init__.py +8 -0
- evalvault/domain/services/evaluator.py +5 -2
- evalvault/domain/services/graph_rag_experiment.py +155 -0
- evalvault/domain/services/multiturn_evaluator.py +187 -0
- evalvault/ports/inbound/__init__.py +2 -0
- evalvault/ports/inbound/multiturn_port.py +23 -0
- evalvault/ports/inbound/web_port.py +4 -0
- evalvault/ports/outbound/graph_retriever_port.py +24 -0
- evalvault/ports/outbound/storage_port.py +25 -0
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/METADATA +1 -1
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/RECORD +47 -33
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/WHEEL +0 -0
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/run.py (+225 -1)

```diff
@@ -5,9 +5,11 @@ from __future__ import annotations
 import asyncio
 import os
 from collections.abc import Callable, Sequence
+from dataclasses import asdict
 from datetime import date, datetime
 from pathlib import Path
 from typing import Any, cast
+from uuid import uuid4
 
 import click
 import typer
@@ -15,7 +17,7 @@ from rich.console import Console
 from rich.table import Table
 
 from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis_pipeline_service
-from evalvault.adapters.outbound.dataset import get_loader
+from evalvault.adapters.outbound.dataset import get_loader, load_multiturn_dataset
 from evalvault.adapters.outbound.documents.versioned_loader import (
     load_versioned_chunks_from_pdf_dir,
 )
@@ -33,10 +35,16 @@ from evalvault.adapters.outbound.tracer.phoenix_tracer_adapter import PhoenixTra
 from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
 from evalvault.config.settings import Settings, apply_profile
 from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
+from evalvault.domain.entities.multiturn import (
+    MultiTurnConversationRecord,
+    MultiTurnRunRecord,
+    MultiTurnTurnResult,
+)
 from evalvault.domain.services.document_versioning import parse_contract_date
 from evalvault.domain.services.evaluator import RagasEvaluator
 from evalvault.domain.services.memory_aware_evaluator import MemoryAwareEvaluator
 from evalvault.domain.services.memory_based_analysis import MemoryBasedAnalysis
+from evalvault.domain.services.multiturn_evaluator import MultiTurnEvaluator
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
     build_prompt_bundle,
@@ -81,6 +89,7 @@ from .run_helpers import (
     _option_was_provided,
     _print_run_mode_banner,
     _resolve_thresholds,
+    _save_multiturn_to_db,
     _save_results,
     _save_to_db,
     _write_stage_events_jsonl,
@@ -221,21 +230,26 @@ def register_run_commands(
         False,
         "--auto-analyze",
         help="평가 완료 후 통합 분석을 자동 실행하고 보고서를 저장합니다.",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_output: Path | None = typer.Option(
         None,
         "--analysis-json",
         help="자동 분석 JSON 결과 파일 경로 (기본값: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_report: Path | None = typer.Option(
         None,
         "--analysis-report",
+        "--report",
         help="자동 분석 Markdown 보고서 경로 (기본값: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_dir: Path | None = typer.Option(
         None,
         "--analysis-dir",
         help="자동 분석 결과 저장 디렉터리 (기본: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     retriever: str | None = typer.Option(
         None,
@@ -428,6 +442,18 @@ def register_run_commands(
         help="실행 모드 선택: 'simple'은 간편 실행, 'full'은 모든 옵션 노출.",
         rich_help_panel="Run modes",
     ),
+    max_turns: int | None = typer.Option(
+        None,
+        "--max-turns",
+        help="멀티턴 모드에서 사용할 최대 턴 수 (지정 시 앞에서부터 절단).",
+        rich_help_panel="Multiturn options",
+    ),
+    drift_threshold: float = typer.Option(
+        0.1,
+        "--drift-threshold",
+        help="멀티턴 모드에서 드리프트 경고 임계값.",
+        rich_help_panel="Multiturn options",
+    ),
     db_path: Path | None = db_option(
         help_text="Path to SQLite database file for storing results.",
     ),
@@ -462,6 +488,7 @@ def register_run_commands(
         False,
         "--verbose",
         "-v",
+        "-V",
         help="Show detailed output.",
     ),
     parallel: bool = typer.Option(
@@ -983,6 +1010,191 @@ def register_run_commands(
     if threshold_profile:
         phoenix_trace_metadata["threshold.profile"] = str(threshold_profile).strip().lower()
 
+    if preset.name == "multiturn":
+        llm_factory = SettingsLLMFactory(settings)
+        korean_toolkit = try_create_korean_toolkit()
+        evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
+        try:
+            llm_adapter = get_llm_adapter(settings)
+        except Exception as exc:
+            provider = str(getattr(settings, "llm_provider", "")).strip().lower()
+            fixes: list[str]
+            if provider == "ollama":
+                fixes = [
+                    "Ollama 서버가 실행 중인지 확인하세요 (기본: http://localhost:11434).",
+                    "필요 모델을 받아두세요: `ollama pull gpt-oss-safeguard:20b` 및 `ollama pull qwen3-embedding:0.6b`.",
+                    "URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
+                ]
+            elif provider == "openai":
+                fixes = [
+                    "`.env`에 `OPENAI_API_KEY`를 설정하세요.",
+                    "프록시/네트워크가 필요한 환경이면 연결 가능 여부를 확인하세요.",
+                ]
+            elif provider == "vllm":
+                fixes = [
+                    "`.env`의 `VLLM_BASE_URL`/`VLLM_MODEL` 설정을 확인하세요.",
+                    "vLLM 서버가 OpenAI 호환 API로 실행 중인지 확인하세요.",
+                ]
+            else:
+                fixes = ["--profile 또는 환경변수 설정을 확인하세요."]
+            print_cli_error(
+                console,
+                "LLM/임베딩 어댑터를 초기화하지 못했습니다.",
+                details=str(exc),
+                fixes=fixes,
+            )
+            raise typer.Exit(1) from exc
+
+        multiturn_started_at = datetime.now()
+        _log_timestamp(console, verbose, "멀티턴 데이터셋 로딩 시작")
+        try:
+            multiturn_dataset = load_multiturn_dataset(dataset)
+        except Exception as exc:
+            _log_duration(console, verbose, "멀티턴 데이터셋 로딩 실패", multiturn_started_at)
+            print_cli_error(
+                console,
+                "멀티턴 데이터셋을 불러오지 못했습니다.",
+                details=str(exc),
+                fixes=[
+                    "파일 경로/형식을 확인하세요.",
+                    "멀티턴 스키마(turns, conversation_id)가 문서와 동일한지 확인하세요.",
+                ],
+            )
+            raise typer.Exit(1) from exc
+        _log_duration(console, verbose, "멀티턴 데이터셋 로딩 완료", multiturn_started_at)
+
+        if stream:
+            print_cli_warning(
+                console,
+                "멀티턴 모드에서는 streaming 옵션을 무시합니다.",
+                tips=["--stream을 제거하거나 일반 모드로 실행하세요."],
+            )
+        if retriever:
+            print_cli_warning(
+                console,
+                "멀티턴 모드에서는 retriever 적용을 지원하지 않습니다.",
+                tips=["단일 턴 모드에서 retriever를 사용하세요."],
+            )
+        if use_domain_memory:
+            print_cli_warning(
+                console,
+                "멀티턴 모드에서는 Domain Memory를 지원하지 않습니다.",
+                tips=["--use-domain-memory 옵션을 제거하세요."],
+            )
+
+        if max_turns and max_turns > 0:
+            trimmed = 0
+            for case in multiturn_dataset.test_cases:
+                if len(case.turns) > max_turns:
+                    case.turns = case.turns[:max_turns]
+                    trimmed += 1
+            if trimmed:
+                console.print(f"[dim]Trimmed turns in {trimmed} conversation(s).[/dim]")
+
+        evaluation_started_at = datetime.now()
+        multiturn_evaluator = MultiTurnEvaluator(evaluator=evaluator, llm=llm_adapter)
+        results = []
+        drift_flags = 0
+        turn_count = 0
+        for case in multiturn_dataset.test_cases:
+            result = multiturn_evaluator.evaluate_conversation(case, metric_list)
+            drift = multiturn_evaluator.detect_drift(case, threshold=drift_threshold)
+            result.summary["drift_detected"] = drift.drift_detected
+            result.summary["drift_threshold"] = drift.drift_threshold
+            result.summary["drift_score"] = drift.drift_score
+            results.append(result)
+            turn_count += len(result.turn_results)
+            if drift.drift_detected:
+                drift_flags += 1
+
+        multiturn_summary: dict[str, object] = {
+            "conversation_count": len(results),
+            "turn_count": turn_count,
+            "drift_detected_count": drift_flags,
+            "drift_threshold": drift_threshold,
+        }
+        for metric in metric_list:
+            scores = [
+                result.summary.get(metric)
+                for result in results
+                if isinstance(result.summary.get(metric), (int, float))
+            ]
+            if scores:
+                multiturn_summary[metric] = sum(scores) / len(scores)
+
+        payload = {
+            "dataset": {
+                "name": multiturn_dataset.name,
+                "version": multiturn_dataset.version,
+                "metadata": multiturn_dataset.metadata,
+                "source_file": multiturn_dataset.source_file,
+            },
+            "metrics": metric_list,
+            "summary": multiturn_summary,
+            "conversations": [asdict(item) for item in results],
+        }
+
+        table = Table(title="Multi-turn Summary", show_header=True, header_style="bold cyan")
+        table.add_column("Metric", style="bold")
+        table.add_column("Value", justify="right")
+        for metric in metric_list:
+            value = multiturn_summary.get(metric)
+            if isinstance(value, float):
+                display = f"{value:.3f}"
+            else:
+                display = str(value) if value is not None else "-"
+            table.add_row(metric, display)
+        table.add_row("conversation_count", str(multiturn_summary.get("conversation_count")))
+        table.add_row("turn_count", str(multiturn_summary.get("turn_count")))
+        table.add_row("drift_detected", str(multiturn_summary.get("drift_detected_count")))
+        console.print(table)
+
+        if output:
+            write_json(output, payload)
+            console.print(f"[green]멀티턴 결과 저장:[/green] {output}")
+        if db_path:
+            run_id = str(uuid4())
+            run_record = MultiTurnRunRecord(
+                run_id=run_id,
+                dataset_name=multiturn_dataset.name,
+                dataset_version=multiturn_dataset.version,
+                model_name=llm_adapter.get_model_name(),
+                started_at=evaluation_started_at,
+                finished_at=datetime.now(),
+                conversation_count=len(results),
+                turn_count=turn_count,
+                metrics_evaluated=list(metric_list),
+                drift_threshold=drift_threshold,
+                summary=multiturn_summary,
+                metadata={"dataset": multiturn_dataset.metadata},
+            )
+            conversation_records = [
+                MultiTurnConversationRecord(
+                    run_id=run_id,
+                    conversation_id=conversation.conversation_id,
+                    turn_count=len(conversation.turn_results),
+                    drift_score=conversation.summary.get("drift_score"),
+                    drift_threshold=conversation.summary.get("drift_threshold"),
+                    drift_detected=bool(conversation.summary.get("drift_detected")),
+                    summary=dict(conversation.summary),
+                )
+                for conversation in results
+            ]
+            turn_results: list[MultiTurnTurnResult] = []
+            for conversation in results:
+                for turn in conversation.turn_results:
+                    turn_results.append(turn)
+            _save_multiturn_to_db(
+                db_path,
+                run_record,
+                conversation_records,
+                turn_results,
+                console,
+                export_excel=True,
+                excel_output_path=excel_output,
+            )
+        return
+
     # Load dataset or configure streaming metadata
     if stream:
         stream_started_at = datetime.now()
```
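The payload written via `--output` is plain JSON, so CI jobs or notebooks can consume multiturn results without going through the database. A minimal sketch, assuming the run was saved as `multiturn_results.json` (hypothetical name) and that `asdict()` keeps the field names used above (`conversation_id`, `summary`):

```python
# Sketch: read back a saved multiturn payload (file name is illustrative).
import json
from pathlib import Path

payload = json.loads(Path("multiturn_results.json").read_text(encoding="utf-8"))

summary = payload["summary"]
print(f"{summary['drift_detected_count']}/{summary['conversation_count']} conversations drifted")

# "conversations" holds asdict()-serialized conversation results.
for conv in payload["conversations"]:
    print(conv["conversation_id"], conv["summary"].get("drift_score"))
```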
evalvault/adapters/inbound/cli/commands/run.py (continued)

```diff
@@ -2120,21 +2332,26 @@ def register_run_commands(
         False,
         "--auto-analyze",
         help="평가 완료 후 통합 분석을 자동 실행하고 보고서를 저장합니다.",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_output: Path | None = typer.Option(
         None,
         "--analysis-json",
         help="자동 분석 JSON 결과 파일 경로 (기본값: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_report: Path | None = typer.Option(
         None,
         "--analysis-report",
+        "--report",
         help="자동 분석 Markdown 보고서 경로 (기본값: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_dir: Path | None = typer.Option(
         None,
         "--analysis-dir",
         help="자동 분석 결과 저장 디렉터리 (기본: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     retriever: str | None = typer.Option(
         None,
@@ -2273,6 +2490,7 @@ def register_run_commands(
     verbose: bool = typer.Option(
         False,
         "--verbose",
+        "-V",
         help="Show detailed output.",
     ),
     parallel: bool = typer.Option(
@@ -2406,21 +2624,26 @@ def register_run_commands(
         False,
         "--auto-analyze",
         help="평가 완료 후 통합 분석을 자동 실행하고 보고서를 저장합니다.",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_output: Path | None = typer.Option(
         None,
         "--analysis-json",
         help="자동 분석 JSON 결과 파일 경로 (기본값: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_report: Path | None = typer.Option(
         None,
         "--analysis-report",
+        "--report",
         help="자동 분석 Markdown 보고서 경로 (기본값: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     analysis_dir: Path | None = typer.Option(
         None,
         "--analysis-dir",
         help="자동 분석 결과 저장 디렉터리 (기본: reports/analysis).",
+        rich_help_panel="Auto Analysis",
     ),
     retriever: str | None = typer.Option(
         None,
@@ -2559,6 +2782,7 @@ def register_run_commands(
     verbose: bool = typer.Option(
         False,
         "--verbose",
+        "-V",
         help="Show detailed output.",
     ),
     parallel: bool = typer.Option(
```
evalvault/adapters/inbound/cli/commands/run_helpers.py (+57 -0)

```diff
@@ -29,6 +29,9 @@ from evalvault.domain.entities import (
     Dataset,
     EvaluationRun,
     GenerationData,
+    MultiTurnConversationRecord,
+    MultiTurnRunRecord,
+    MultiTurnTurnResult,
     PromptSetBundle,
     RAGTraceData,
     RetrievalData,
@@ -86,6 +89,14 @@ RUN_MODE_PRESETS: dict[str, RunModePreset] = {
         label="Full",
         description="모든 CLI 옵션과 Domain Memory, Prompt manifest를 활용하는 전체 모드.",
     ),
+    "multiturn": RunModePreset(
+        name="multiturn",
+        label="Multiturn",
+        description="멀티턴 대화 평가 전용 모드 (멀티턴 메트릭만 지원).",
+        default_metrics=("turn_faithfulness", "context_coherence", "drift_rate"),
+        allow_domain_memory=False,
+        allow_prompt_metadata=False,
+    ),
 }
 
 SUMMARY_METRIC_ORDER = (
```
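Since the preset is plain data, its defaults can be checked without running an evaluation. A small sketch, assuming `RUN_MODE_PRESETS` is importable from this module:

```python
# Sketch: inspect the new multiturn run-mode preset.
from evalvault.adapters.inbound.cli.commands.run_helpers import RUN_MODE_PRESETS

preset = RUN_MODE_PRESETS["multiturn"]
print(preset.default_metrics)      # ("turn_faithfulness", "context_coherence", "drift_rate")
print(preset.allow_domain_memory)  # False; run.py warns and ignores --use-domain-memory
```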
evalvault/adapters/inbound/cli/commands/run_helpers.py (continued)

```diff
@@ -490,6 +501,52 @@ def _save_to_db(
     )
 
 
+def _save_multiturn_to_db(
+    db_path: Path,
+    run_record: MultiTurnRunRecord,
+    conversations: list[MultiTurnConversationRecord],
+    turn_results: list[MultiTurnTurnResult],
+    console: Console,
+    *,
+    storage_cls: type[SQLiteStorageAdapter] = SQLiteStorageAdapter,
+    export_excel: bool = True,
+    excel_output_path: Path | None = None,
+    metric_thresholds: dict[str, float] | None = None,
+) -> None:
+    """Persist multiturn evaluation run to SQLite database."""
+    with console.status(f"[bold green]Saving multiturn run to {db_path}..."):
+        try:
+            storage = storage_cls(db_path=db_path)
+            storage.save_multiturn_run(
+                run_record,
+                conversations,
+                turn_results,
+                metric_thresholds=metric_thresholds,
+            )
+            if export_excel:
+                excel_path = excel_output_path or (
+                    db_path.parent / f"evalvault_multiturn_{run_record.run_id}.xlsx"
+                )
+                try:
+                    storage.export_multiturn_run_to_excel(run_record.run_id, excel_path)
+                    console.print(f"[green]Multiturn Excel export saved: {excel_path}[/green]")
+                except Exception as exc:
+                    print_cli_warning(
+                        console,
+                        "멀티턴 엑셀 내보내기에 실패했습니다.",
+                        tips=[str(exc)],
+                    )
+            console.print(f"[green]Multiturn results saved to database: {db_path}[/green]")
+            console.print(f"[dim]Run ID: {run_record.run_id}[/dim]")
+        except Exception as exc:  # pragma: no cover - persistence errors
+            print_cli_error(
+                console,
+                "멀티턴 결과를 데이터베이스에 저장하지 못했습니다.",
+                details=str(exc),
+                fixes=["경로 권한과 DB 파일 잠금 상태를 확인하세요."],
+            )
+
+
 def _save_results(output: Path, result, console: Console) -> None:
     """Write evaluation summary to disk."""
     with console.status(f"[bold green]Saving to {output}..."):
```
evalvault/adapters/outbound/analysis/network_analyzer_module.py (+17 -4)

```diff
@@ -1,7 +1,11 @@
 from __future__ import annotations
 
+import contextlib
+import os
+import sys
 from dataclasses import dataclass, field
 from datetime import datetime
+from importlib import import_module
 from typing import Any
 
 import networkx as nx
@@ -12,10 +16,18 @@ from evalvault.adapters.outbound.analysis.pipeline_helpers import (
     to_serializable,
 )
 
-
-
-
-
+
+def _get_matplotlib_pyplot() -> Any | None:
+    try:
+        if "matplotlib.pyplot" in sys.modules:
+            return import_module("matplotlib.pyplot")
+        os.environ.setdefault("MPLBACKEND", "Agg")
+        matplotlib = import_module("matplotlib")
+        with contextlib.suppress(Exception):
+            matplotlib.use("Agg", force=True)
+        return import_module("matplotlib.pyplot")
+    except ModuleNotFoundError:
+        return None
 
 
 @dataclass
@@ -173,6 +185,7 @@ class NetworkAnalyzerModule(BaseAnalysisModule):
         output_path: str | None = None,
         figsize: tuple[int, int] = (12, 8),
     ) -> Any | None:
+        plt = _get_matplotlib_pyplot()
         if plt is None:
             return None
 
```
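`_get_matplotlib_pyplot()` defers the matplotlib import, defaults the backend to the non-interactive Agg, and returns None when matplotlib is missing, so graph rendering degrades gracefully on headless machines and matplotlib stays optional. A standalone sketch of the pattern it encapsulates (assumes matplotlib is installed):

```python
# Sketch: the headless-rendering pattern behind _get_matplotlib_pyplot().
import os

os.environ.setdefault("MPLBACKEND", "Agg")  # no-op if the caller already picked a backend
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 8))
ax.plot([0, 1], [1, 0])
fig.savefig("network_graph.png")  # succeeds without a display server
```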
evalvault/adapters/outbound/dataset/__init__.py (+6 -0)

```diff
@@ -6,6 +6,10 @@ from evalvault.adapters.outbound.dataset.excel_loader import ExcelDatasetLoader
 from evalvault.adapters.outbound.dataset.json_loader import JSONDatasetLoader
 from evalvault.adapters.outbound.dataset.loader_factory import get_loader, register_loader
 from evalvault.adapters.outbound.dataset.method_input_loader import MethodInputDatasetLoader
+from evalvault.adapters.outbound.dataset.multiturn_json_loader import (
+    MultiTurnDataset,
+    load_multiturn_dataset,
+)
 from evalvault.adapters.outbound.dataset.streaming_loader import (
     StreamingConfig,
     StreamingCSVLoader,
@@ -23,6 +27,7 @@ __all__ = [
     "ExcelDatasetLoader",
     "JSONDatasetLoader",
     "MethodInputDatasetLoader",
+    "MultiTurnDataset",
     "StreamingCSVLoader",
     "StreamingConfig",
     "StreamingDatasetLoader",
@@ -31,6 +36,7 @@ __all__ = [
     "StreamingTestCaseIterator",
     "get_loader",
     "load_in_chunks",
+    "load_multiturn_dataset",
     "register_loader",
     "stream_file",
 ]
```
evalvault/adapters/outbound/dataset/multiturn_json_loader.py (+111 -0, new file)

```diff
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from evalvault.domain.entities.multiturn import ConversationTurn, MultiTurnTestCase
+
+
+@dataclass(frozen=True)
+class MultiTurnDataset:
+    name: str
+    version: str
+    test_cases: list[MultiTurnTestCase]
+    metadata: dict[str, Any]
+    source_file: str | None = None
+
+
+def load_multiturn_dataset(file_path: str | Path) -> MultiTurnDataset:
+    path = Path(file_path)
+    if not path.exists():
+        raise FileNotFoundError(f"File not found: {file_path}")
+    if not path.is_file():
+        raise ValueError(f"Path is not a file: {file_path}")
+
+    try:
+        payload = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"Invalid JSON file: {exc}") from exc
+
+    name = path.stem
+    version = "1.0.0"
+    metadata: dict[str, Any] = {}
+    raw_cases: list[dict[str, Any]]
+
+    if isinstance(payload, list):
+        raw_cases = payload
+    elif isinstance(payload, dict):
+        name = str(payload.get("name") or name)
+        version = str(payload.get("version") or version)
+        metadata = payload.get("metadata") or {}
+        if not isinstance(metadata, dict):
+            raise ValueError("metadata must be a JSON object")
+        raw_cases = payload.get("test_cases") or payload.get("conversations") or []
+    else:
+        raise ValueError("JSON must be an array or object with 'test_cases' key")
+
+    if not isinstance(raw_cases, list):
+        raise ValueError("test_cases must be a list")
+
+    test_cases: list[MultiTurnTestCase] = []
+    for idx, raw_case in enumerate(raw_cases, start=1):
+        if not isinstance(raw_case, dict):
+            raise ValueError(f"test_cases[{idx}] must be an object")
+        conversation_id = raw_case.get("conversation_id") or raw_case.get("id")
+        if not conversation_id:
+            raise ValueError(f"test_cases[{idx}] missing conversation_id")
+        raw_turns = raw_case.get("turns")
+        if not isinstance(raw_turns, list) or not raw_turns:
+            raise ValueError(f"test_cases[{idx}] missing turns list")
+
+        turns: list[ConversationTurn] = []
+        for t_idx, raw_turn in enumerate(raw_turns, start=1):
+            if not isinstance(raw_turn, dict):
+                raise ValueError(f"turns[{t_idx}] must be an object")
+            role = raw_turn.get("role")
+            if role not in {"user", "assistant"}:
+                raise ValueError(f"turns[{t_idx}] role must be 'user' or 'assistant'")
+            content = raw_turn.get("content")
+            if content is None:
+                raise ValueError(f"turns[{t_idx}] missing content")
+            turn_id = raw_turn.get("turn_id") or f"t{t_idx:02d}"
+            contexts = raw_turn.get("contexts")
+            if contexts is None:
+                contexts = []
+            if isinstance(contexts, str):
+                contexts = [contexts]
+            if not isinstance(contexts, list):
+                raise ValueError(f"turns[{t_idx}] contexts must be a list")
+            ground_truth = raw_turn.get("ground_truth")
+            metadata_value = raw_turn.get("metadata") or {}
+            if not isinstance(metadata_value, dict):
+                raise ValueError(f"turns[{t_idx}] metadata must be an object")
+            turns.append(
+                ConversationTurn(
+                    turn_id=str(turn_id),
+                    role=role,
+                    content=str(content),
+                    contexts=[str(ctx) for ctx in contexts],
+                    ground_truth=ground_truth,
+                    metadata=metadata_value,
+                )
+            )
+
+        test_cases.append(
+            MultiTurnTestCase(
+                conversation_id=str(conversation_id),
+                turns=turns,
+                expected_final_answer=raw_case.get("expected_final_answer"),
+                drift_tolerance=float(raw_case.get("drift_tolerance", 0.1)),
+            )
+        )
+
+    return MultiTurnDataset(
+        name=name,
+        version=version,
+        test_cases=test_cases,
+        metadata=metadata,
+        source_file=str(path),
+    )
```
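Putting the loader's validation rules together: a top-level list is accepted, as is an object whose cases live under `test_cases` (or the alias `conversations`); each turn needs a `user`/`assistant` role and `content`, while `turn_id`, `contexts` (string or list), `ground_truth`, and per-turn `metadata` are optional. A sketch with illustrative contents:

```python
# Sketch: build and load a minimal multiturn dataset file.
import json
from pathlib import Path

from evalvault.adapters.outbound.dataset import load_multiturn_dataset

doc = {
    "name": "demo-multiturn",        # optional; defaults to the file stem
    "version": "1.0.0",              # optional
    "metadata": {"locale": "ko"},    # optional; must be an object
    "test_cases": [
        {
            "conversation_id": "conv-001",
            "drift_tolerance": 0.1,  # optional; defaults to 0.1
            "turns": [
                {"role": "user", "content": "What is the coverage limit?"},
                {
                    "role": "assistant",
                    "content": "The base limit is 100M KRW.",
                    "contexts": "Clause 3: coverage limit 100M KRW",  # str is coerced to a list
                    "ground_truth": "100M KRW",
                },
            ],
        }
    ],
}
Path("conversations.json").write_text(json.dumps(doc), encoding="utf-8")

dataset = load_multiturn_dataset("conversations.json")
print(dataset.name, len(dataset.test_cases), len(dataset.test_cases[0].turns))
# demo-multiturn 1 2
```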
evalvault/adapters/outbound/report/__init__.py (+6 -0)

```diff
@@ -1,5 +1,9 @@
 """Report generation adapters."""
 
+from evalvault.adapters.outbound.report.ci_report_formatter import (
+    CIGateMetricRow,
+    format_ci_regression_report,
+)
 from evalvault.adapters.outbound.report.dashboard_generator import DashboardGenerator
 from evalvault.adapters.outbound.report.llm_report_generator import (
     LLMReport,
@@ -9,7 +13,9 @@ from evalvault.adapters.outbound.report.llm_report_generator import (
 from evalvault.adapters.outbound.report.markdown_adapter import MarkdownReportAdapter
 
 __all__ = [
+    "CIGateMetricRow",
     "DashboardGenerator",
+    "format_ci_regression_report",
     "LLMReport",
     "LLMReportGenerator",
     "LLMReportSection",
```
evalvault/adapters/outbound/report/ci_report_formatter.py (+43 -0, new file)

```diff
@@ -0,0 +1,43 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class CIGateMetricRow:
+    metric: str
+    baseline_score: float
+    current_score: float
+    change_percent: float
+    status: str
+
+
+def format_ci_regression_report(
+    rows: list[CIGateMetricRow],
+    *,
+    regression_rate: float,
+    regression_threshold: float,
+    gate_passed: bool,
+) -> str:
+    lines: list[str] = ["## RAG Regression Gate Results", ""]
+    lines.append("| Metric | Baseline | Current | Change | Status |")
+    lines.append("|--------|----------|---------|--------|--------|")
+    for row in rows:
+        change = f"{row.change_percent:+.1f}%"
+        lines.append(
+            f"| {row.metric} | {row.baseline_score:.3f} | {row.current_score:.3f} | {change} | {row.status} |"
+        )
+    lines.append("")
+    if gate_passed:
+        status_line = "✅ PASSED"
+        comparison = "<"
+    else:
+        status_line = "❌ FAILED"
+        comparison = ">="
+    lines.append(
+        f"**Gate Status**: {status_line} (regression: {regression_rate:.1%} {comparison} {regression_threshold:.1%} threshold)"
+    )
+    return "\n".join(lines).strip()
+
+
+__all__ = ["CIGateMetricRow", "format_ci_regression_report"]
```
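A usage sketch for the new formatter (scores and status labels are illustrative; the imports follow the `report/__init__.py` exports above):

```python
# Sketch: render a CI regression gate comment as Markdown.
from evalvault.adapters.outbound.report import CIGateMetricRow, format_ci_regression_report

rows = [
    CIGateMetricRow("faithfulness", 0.87, 0.84, -3.4, "❌"),
    CIGateMetricRow("answer_relevancy", 0.91, 0.92, 1.1, "✅"),
]
report = format_ci_regression_report(
    rows,
    regression_rate=0.5,
    regression_threshold=0.2,
    gate_passed=False,
)
print(report)
# ...Markdown table, then:
# **Gate Status**: ❌ FAILED (regression: 50.0% >= 20.0% threshold)
```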