evalvault 1.70.1__py3-none-any.whl → 1.71.0__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- evalvault/adapters/inbound/api/adapter.py +367 -3
- evalvault/adapters/inbound/api/main.py +17 -1
- evalvault/adapters/inbound/api/routers/calibration.py +133 -0
- evalvault/adapters/inbound/api/routers/runs.py +71 -1
- evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +1 -0
- evalvault/adapters/inbound/cli/commands/compare.py +1 -1
- evalvault/adapters/inbound/cli/commands/experiment.py +27 -1
- evalvault/adapters/inbound/cli/commands/graph_rag.py +303 -0
- evalvault/adapters/inbound/cli/commands/history.py +1 -1
- evalvault/adapters/inbound/cli/commands/regress.py +169 -1
- evalvault/adapters/inbound/cli/commands/run.py +225 -1
- evalvault/adapters/inbound/cli/commands/run_helpers.py +57 -0
- evalvault/adapters/outbound/analysis/network_analyzer_module.py +17 -4
- evalvault/adapters/outbound/dataset/__init__.py +6 -0
- evalvault/adapters/outbound/dataset/multiturn_json_loader.py +111 -0
- evalvault/adapters/outbound/report/__init__.py +6 -0
- evalvault/adapters/outbound/report/ci_report_formatter.py +43 -0
- evalvault/adapters/outbound/report/dashboard_generator.py +24 -9
- evalvault/adapters/outbound/report/pr_comment_formatter.py +50 -0
- evalvault/adapters/outbound/retriever/__init__.py +8 -0
- evalvault/adapters/outbound/retriever/graph_rag_adapter.py +326 -0
- evalvault/adapters/outbound/storage/base_sql.py +291 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +130 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +60 -0
- evalvault/adapters/outbound/storage/schema.sql +63 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +107 -0
- evalvault/domain/entities/__init__.py +20 -0
- evalvault/domain/entities/graph_rag.py +30 -0
- evalvault/domain/entities/multiturn.py +78 -0
- evalvault/domain/metrics/__init__.py +10 -0
- evalvault/domain/metrics/multiturn_metrics.py +113 -0
- evalvault/domain/metrics/registry.py +36 -0
- evalvault/domain/services/__init__.py +8 -0
- evalvault/domain/services/evaluator.py +5 -2
- evalvault/domain/services/graph_rag_experiment.py +155 -0
- evalvault/domain/services/multiturn_evaluator.py +187 -0
- evalvault/ports/inbound/__init__.py +2 -0
- evalvault/ports/inbound/multiturn_port.py +23 -0
- evalvault/ports/inbound/web_port.py +4 -0
- evalvault/ports/outbound/graph_retriever_port.py +24 -0
- evalvault/ports/outbound/storage_port.py +25 -0
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/METADATA +1 -1
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/RECORD +47 -33
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/WHEEL +0 -0
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.70.1.dist-info → evalvault-1.71.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/adapter.py
@@ -8,11 +8,20 @@ import json
 import logging
 import time
 from collections.abc import Callable
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, cast
 from urllib.request import urlopen
 
+from evalvault.adapters.outbound.analysis import (
+    CausalAnalysisAdapter,
+    NLPAnalysisAdapter,
+    StatisticalAnalysisAdapter,
+)
+from evalvault.adapters.outbound.cache import MemoryCacheAdapter
+from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
+from evalvault.adapters.outbound.report import MarkdownReportAdapter
 from evalvault.config.phoenix_support import PhoenixExperimentResolver
 from evalvault.config.settings import Settings
 from evalvault.domain.entities import (
@@ -20,6 +29,7 @@ from evalvault.domain.entities import (
     FeedbackSummary,
     SatisfactionFeedback,
 )
+from evalvault.domain.entities.analysis import AnalysisBundle
 from evalvault.domain.entities.debug import DebugReport
 from evalvault.domain.entities.prompt import PromptSetBundle
 from evalvault.domain.metrics.registry import (
@@ -29,8 +39,10 @@ from evalvault.domain.metrics.registry import (
     list_metric_names,
     list_metric_specs,
 )
+from evalvault.domain.services.analysis_service import AnalysisService
 from evalvault.domain.services.cluster_map_builder import build_cluster_map
 from evalvault.domain.services.debug_report_service import DebugReportService
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
     build_prompt_bundle,
@@ -990,6 +1002,188 @@ class WebUIAdapter:
         service = SatisfactionCalibrationService()
         return service.build_calibration(run, feedbacks, model=model)
 
+    def run_judge_calibration(
+        self,
+        *,
+        run_id: str,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool,
+        concurrency: int,
+    ) -> dict[str, object]:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        storage = self._storage
+        if holdout_ratio <= 0 or holdout_ratio >= 1:
+            raise ValueError("holdout_ratio must be between 0 and 1")
+        if seed < 0:
+            raise ValueError("seed must be >= 0")
+        if concurrency <= 0:
+            raise ValueError("concurrency must be >= 1")
+
+        run = self.get_run_details(run_id)
+        feedbacks = storage.list_feedback(run_id)
+        if labels_source in {"feedback", "hybrid"} and not feedbacks:
+            raise ValueError("Feedback labels are required for this labels_source")
+        resolved_metrics = metrics or list(run.metrics_evaluated)
+        if not resolved_metrics:
+            raise ValueError("No metrics available for calibration")
+
+        started_at = datetime.now(UTC)
+        service = JudgeCalibrationService()
+        result = service.calibrate(
+            run,
+            feedbacks,
+            labels_source=labels_source,
+            method=method,
+            metrics=resolved_metrics,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+            parallel=parallel,
+            concurrency=concurrency,
+        )
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+
+        reporter = JudgeCalibrationReporter()
+        timestamp = started_at.strftime("%Y%m%d_%H%M%S")
+        calibration_id = f"judge_calibration_{run_id}_{timestamp}"
+        base_dir = Path("reports/calibration")
+        output_path = base_dir / f"{calibration_id}.json"
+        artifacts_dir = base_dir / "artifacts" / calibration_id
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        artifacts_index = reporter.write_artifacts(result=result, artifacts_dir=artifacts_dir)
+
+        rendered = reporter.render_json(result)
+
+        status = "ok" if result.summary.gate_passed else "degraded"
+        summary_payload = {
+            "calibration_id": calibration_id,
+            "run_id": result.summary.run_id,
+            "labels_source": result.summary.labels_source,
+            "method": result.summary.method,
+            "metrics": list(result.summary.metrics),
+            "holdout_ratio": result.summary.holdout_ratio,
+            "seed": result.summary.seed,
+            "total_labels": result.summary.total_labels,
+            "total_samples": result.summary.total_samples,
+            "gate_passed": result.summary.gate_passed,
+            "gate_threshold": result.summary.gate_threshold,
+            "notes": list(result.summary.notes),
+            "created_at": started_at.astimezone(UTC).isoformat(),
+        }
+        payload = {
+            "calibration_id": calibration_id,
+            "status": status,
+            "started_at": started_at.astimezone(UTC).isoformat(),
+            "finished_at": finished_at.astimezone(UTC).isoformat(),
+            "duration_ms": duration_ms,
+            "artifacts": artifacts_index,
+            "summary": summary_payload,
+            "metrics": rendered["metrics"],
+            "case_results": rendered["case_results"],
+            "warnings": list(result.warnings),
+        }
+        output_path.write_text(
+            json.dumps(payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+        metadata = run.tracker_metadata or {}
+        history = metadata.get("judge_calibration_history")
+        if not isinstance(history, list):
+            history = []
+        history.append(
+            {
+                "calibration_id": calibration_id,
+                "run_id": run_id,
+                "labels_source": summary_payload["labels_source"],
+                "method": summary_payload["method"],
+                "metrics": summary_payload["metrics"],
+                "holdout_ratio": summary_payload["holdout_ratio"],
+                "seed": summary_payload["seed"],
+                "total_labels": summary_payload["total_labels"],
+                "total_samples": summary_payload["total_samples"],
+                "gate_passed": summary_payload["gate_passed"],
+                "gate_threshold": summary_payload["gate_threshold"],
+                "created_at": summary_payload["created_at"],
+                "output_path": str(output_path),
+                "artifacts": artifacts_index,
+            }
+        )
+        metadata["judge_calibration_history"] = history
+        storage.update_run_metadata(run_id, metadata)
+        return payload
+
+    def get_judge_calibration(self, calibration_id: str) -> dict[str, object]:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        entry = self._find_judge_calibration_entry(calibration_id)
+        output_path = Path(str(entry.get("output_path") or ""))
+        if not output_path.exists():
+            raise KeyError(f"Calibration output not found: {calibration_id}")
+        payload = json.loads(output_path.read_text(encoding="utf-8"))
+        return payload
+
+    def list_judge_calibrations(self, *, limit: int = 20) -> list[dict[str, object]]:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        storage = self._storage
+        scan_limit = max(100, limit * 5)
+        runs = storage.list_runs(limit=scan_limit)
+        entries: list[dict[str, object]] = []
+        for run in runs:
+            metadata = getattr(run, "tracker_metadata", {}) or {}
+            history = metadata.get("judge_calibration_history")
+            if not isinstance(history, list):
+                continue
+            for item in history:
+                if isinstance(item, dict):
+                    entries.append(
+                        {
+                            "calibration_id": item.get("calibration_id"),
+                            "run_id": item.get("run_id"),
+                            "labels_source": item.get("labels_source"),
+                            "method": item.get("method"),
+                            "metrics": item.get("metrics") or [],
+                            "holdout_ratio": item.get("holdout_ratio"),
+                            "seed": item.get("seed"),
+                            "total_labels": item.get("total_labels"),
+                            "total_samples": item.get("total_samples"),
+                            "gate_passed": item.get("gate_passed"),
+                            "gate_threshold": item.get("gate_threshold"),
+                            "created_at": item.get("created_at"),
+                        }
+                    )
+
+        def _sort_key(item: dict[str, object]) -> str:
+            value = item.get("created_at")
+            return value if isinstance(value, str) else ""
+
+        entries.sort(key=_sort_key, reverse=True)
+        return entries[:limit]
+
+    def _find_judge_calibration_entry(self, calibration_id: str) -> dict[str, object]:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        storage = self._storage
+        scan_limit = 1000
+        runs = storage.list_runs(limit=scan_limit)
+        for run in runs:
+            metadata = getattr(run, "tracker_metadata", {}) or {}
+            history = metadata.get("judge_calibration_history")
+            if not isinstance(history, list):
+                continue
+            for item in history:
+                if not isinstance(item, dict):
+                    continue
+                if item.get("calibration_id") == calibration_id:
+                    return item
+        raise KeyError(f"Calibration not found: {calibration_id}")
+
     def list_stage_events(self, run_id: str, *, stage_type: str | None = None) -> list[StageEvent]:
         """Stage 이벤트 목록 조회."""
         if self._storage is None or not hasattr(self._storage, "list_stage_events"):
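Each calibration run writes its full payload to reports/calibration/<calibration_id>.json and appends a summary entry to the run's tracker_metadata under judge_calibration_history. A minimal sketch of reading such an artifact back from disk; the calibration id below is hypothetical:

```python
# Sketch only: the calibration id is hypothetical; the path layout follows
# run_judge_calibration above (reports/calibration/<calibration_id>.json).
import json
from pathlib import Path

calibration_id = "judge_calibration_run_123_20250101_120000"
path = Path("reports/calibration") / f"{calibration_id}.json"
payload = json.loads(path.read_text(encoding="utf-8"))

# Summary fields mirror the JudgeCalibrationSummaryResponse schema below.
print(payload["summary"]["gate_passed"], payload["summary"]["metrics"])
```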
@@ -1153,6 +1347,110 @@ class WebUIAdapter:
             logger.error(f"Failed to delete run {run_id}: {e}")
             return False
 
+    def _build_analysis_bundle(
+        self,
+        run_id: str,
+        *,
+        include_nlp: bool,
+        include_causal: bool,
+    ) -> AnalysisBundle:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+
+        run = self._storage.get_run(run_id)
+        if not run.results:
+            raise ValueError("Run has no results to analyze")
+
+        analysis_adapter = StatisticalAnalysisAdapter()
+        cache_adapter = MemoryCacheAdapter()
+
+        nlp_adapter = None
+        if include_nlp:
+            settings = self._settings or Settings()
+            llm_adapter = self._llm_adapter
+            if llm_adapter is None:
+                from evalvault.adapters.outbound.llm import get_llm_adapter
+
+                try:
+                    llm_adapter = get_llm_adapter(settings)
+                except Exception as exc:
+                    logger.warning("LLM adapter initialization failed for report: %s", exc)
+                    llm_adapter = None
+            if llm_adapter is not None:
+                nlp_adapter = NLPAnalysisAdapter(
+                    llm_adapter=llm_adapter,
+                    use_embeddings=True,
+                )
+
+        causal_adapter = CausalAnalysisAdapter() if include_causal else None
+
+        service = AnalysisService(
+            analysis_adapter=analysis_adapter,
+            nlp_adapter=nlp_adapter,
+            causal_adapter=causal_adapter,
+            cache_adapter=cache_adapter,
+        )
+        return service.analyze_run(run, include_nlp=include_nlp, include_causal=include_causal)
+
+    @staticmethod
+    def _build_dashboard_payload(bundle: AnalysisBundle) -> dict[str, Any]:
+        payload: dict[str, Any] = {"run_id": bundle.run_id}
+        analysis = bundle.statistical
+        if analysis is None:
+            return payload
+
+        metrics_summary: dict[str, Any] = {}
+        for metric, stats in analysis.metrics_summary.items():
+            metrics_summary[metric] = {
+                "mean": stats.mean,
+                "std": stats.std,
+                "min": stats.min,
+                "max": stats.max,
+                "median": stats.median,
+                "percentile_25": stats.percentile_25,
+                "percentile_75": stats.percentile_75,
+                "count": stats.count,
+            }
+
+        payload.update(
+            {
+                "metrics_summary": metrics_summary,
+                "correlation_matrix": analysis.correlation_matrix,
+                "correlation_metrics": analysis.correlation_metrics,
+                "metric_pass_rates": analysis.metric_pass_rates,
+                "low_performers": [asdict(item) for item in analysis.low_performers],
+            }
+        )
+        return payload
+
+    def _find_cached_report(
+        self,
+        *,
+        run_id: str,
+        output_format: str,
+        include_nlp: bool,
+        include_causal: bool,
+    ) -> str | None:
+        if self._storage is None:
+            return None
+
+        reports = self._storage.list_analysis_reports(
+            run_id=run_id,
+            report_type="analysis",
+            format=output_format,
+            limit=10,
+        )
+        for report in reports:
+            metadata = report.get("metadata") or {}
+            if metadata.get("include_nlp") != include_nlp:
+                continue
+            if metadata.get("include_causal") != include_causal:
+                continue
+            content = report.get("content")
+            if content:
+                return content
+        return None
+
     def generate_report(
         self,
         run_id: str,
@@ -1160,6 +1458,8 @@ class WebUIAdapter:
         *,
         include_nlp: bool = True,
         include_causal: bool = True,
+        use_cache: bool = True,
+        save: bool = False,
     ) -> str:
         """보고서 생성.
 
@@ -1172,8 +1472,72 @@ class WebUIAdapter:
         Returns:
             생성된 보고서
         """
-
-
+        if use_cache:
+            cached = self._find_cached_report(
+                run_id=run_id,
+                output_format=output_format,
+                include_nlp=include_nlp,
+                include_causal=include_causal,
+            )
+            if cached is not None:
+                return cached
+
+        bundle = self._build_analysis_bundle(
+            run_id,
+            include_nlp=include_nlp,
+            include_causal=include_causal,
+        )
+
+        report_generator = self._report_generator or MarkdownReportAdapter()
+        if output_format == "html":
+            if isinstance(report_generator, MarkdownReportAdapter):
+                report_content = report_generator.generate_html(
+                    bundle,
+                    include_nlp=include_nlp,
+                    include_causal=include_causal,
+                )
+            else:
+                report_content = report_generator.generate_html(bundle, include_nlp=include_nlp)
+        elif isinstance(report_generator, MarkdownReportAdapter):
+            report_content = report_generator.generate_markdown(
+                bundle,
+                include_nlp=include_nlp,
+                include_causal=include_causal,
+            )
+        else:
+            report_content = report_generator.generate_markdown(bundle, include_nlp=include_nlp)
+
+        if save and self._storage is not None:
+            metadata = {
+                "include_nlp": include_nlp,
+                "include_causal": include_causal,
+                "source": "api",
+            }
+            self._storage.save_analysis_report(
+                report_id=None,
+                run_id=run_id,
+                experiment_id=None,
+                report_type="analysis",
+                format=output_format,
+                content=report_content,
+                metadata=metadata,
+            )
+
+        return report_content
+
+    def build_dashboard_payload(
+        self,
+        run_id: str,
+        *,
+        include_nlp: bool = True,
+        include_causal: bool = True,
+    ) -> dict[str, Any]:
+        bundle = self._build_analysis_bundle(
+            run_id,
+            include_nlp=include_nlp,
+            include_causal=include_causal,
+        )
+        return self._build_dashboard_payload(bundle)
 
     def get_available_metrics(self) -> list[str]:
         """사용 가능한 메트릭 목록 반환."""
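The report and dashboard helpers added above can also be driven directly from a WebUIAdapter instance rather than over HTTP. A minimal sketch, assuming an already-configured adapter (storage and LLM wired up) and a hypothetical run id:

```python
# Sketch only: `adapter` is assumed to be a configured WebUIAdapter and
# "run_123" a hypothetical run id; the keyword arguments come from the diff.
html_report = adapter.generate_report(
    "run_123",
    output_format="html",
    include_nlp=True,
    include_causal=True,
    use_cache=True,   # reuse a stored report when one matches these options
    save=True,        # persist the rendered report via save_analysis_report
)

payload = adapter.build_dashboard_payload("run_123", include_nlp=True, include_causal=True)
print(payload.get("metric_pass_rates"))
```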
evalvault/adapters/inbound/api/main.py
@@ -158,7 +158,17 @@ def create_app() -> FastAPI:
         allow_headers=["*"],
     )
 
-    from .routers import
+    from .routers import (
+        benchmark,
+        calibration,
+        chat,
+        config,
+        domain,
+        knowledge,
+        mcp,
+        pipeline,
+        runs,
+    )
 
     auth_dependencies = [Depends(require_api_token)]
 
@@ -210,6 +220,12 @@ def create_app() -> FastAPI:
         tags=["mcp"],
         dependencies=auth_dependencies,
    )
+    app.include_router(
+        calibration.router,
+        prefix="/api/v1/calibration",
+        tags=["calibration"],
+        dependencies=auth_dependencies,
+    )
 
     @app.get("/health")
     def health_check():
evalvault/adapters/inbound/api/routers/calibration.py (new file)
@@ -0,0 +1,133 @@
+from __future__ import annotations
+
+from typing import Literal
+
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel, ConfigDict, Field
+
+from evalvault.adapters.inbound.api.main import AdapterDep
+
+router = APIRouter()
+
+
+class JudgeCalibrationRequest(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
+    run_id: str
+    labels_source: Literal["feedback", "gold", "hybrid"] = "feedback"
+    method: Literal["platt", "isotonic", "temperature", "none"] = "isotonic"
+    metrics: list[str] | None = None
+    holdout_ratio: float = Field(0.2, gt=0.0, lt=1.0)
+    seed: int = Field(42, ge=0)
+    parallel: bool = False
+    concurrency: int = Field(8, ge=1)
+
+
+class JudgeCalibrationCaseResponse(BaseModel):
+    test_case_id: str
+    raw_score: float
+    calibrated_score: float
+    label: float | None = None
+    label_source: str | None = None
+
+
+class JudgeCalibrationMetricResponse(BaseModel):
+    metric: str
+    method: str
+    sample_count: int
+    label_count: int
+    mae: float | None
+    pearson: float | None
+    spearman: float | None
+    temperature: float | None = None
+    parameters: dict[str, float | None] = Field(default_factory=dict)
+    gate_passed: bool | None = None
+    warning: str | None = None
+
+
+class JudgeCalibrationSummaryResponse(BaseModel):
+    calibration_id: str
+    run_id: str
+    labels_source: str
+    method: str
+    metrics: list[str]
+    holdout_ratio: float
+    seed: int
+    total_labels: int
+    total_samples: int
+    gate_passed: bool
+    gate_threshold: float | None = None
+    notes: list[str] = Field(default_factory=list)
+    created_at: str
+
+
+class JudgeCalibrationResponse(BaseModel):
+    calibration_id: str
+    status: Literal["ok", "degraded"]
+    started_at: str
+    finished_at: str
+    duration_ms: int
+    artifacts: dict[str, str]
+    summary: JudgeCalibrationSummaryResponse
+    metrics: list[JudgeCalibrationMetricResponse]
+    case_results: dict[str, list[JudgeCalibrationCaseResponse]]
+    warnings: list[str]
+
+
+class JudgeCalibrationHistoryItem(BaseModel):
+    calibration_id: str
+    run_id: str
+    labels_source: str
+    method: str
+    metrics: list[str]
+    holdout_ratio: float
+    seed: int
+    total_labels: int
+    total_samples: int
+    gate_passed: bool
+    gate_threshold: float | None = None
+    created_at: str
+
+
+@router.post("/judge", response_model=JudgeCalibrationResponse)
+def run_judge_calibration(
+    request: JudgeCalibrationRequest, adapter: AdapterDep
+) -> JudgeCalibrationResponse:
+    try:
+        payload = adapter.run_judge_calibration(
+            run_id=request.run_id,
+            labels_source=request.labels_source,
+            method=request.method,
+            metrics=request.metrics or [],
+            holdout_ratio=request.holdout_ratio,
+            seed=request.seed,
+            parallel=request.parallel,
+            concurrency=request.concurrency,
+        )
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    except RuntimeError as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+    return JudgeCalibrationResponse.model_validate(payload)
+
+
+@router.get("/judge/{calibration_id}", response_model=JudgeCalibrationResponse)
+def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCalibrationResponse:
+    try:
+        payload = adapter.get_judge_calibration(calibration_id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    except RuntimeError as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+    return JudgeCalibrationResponse.model_validate(payload)
+
+
+@router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
+def list_calibrations(
+    adapter: AdapterDep,
+    limit: int = Query(20, ge=1, le=200),
+) -> list[JudgeCalibrationHistoryItem]:
+    entries = adapter.list_judge_calibrations(limit=limit)
+    return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
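With the router mounted under /api/v1/calibration (see the main.py hunk above), the judge-calibration endpoints can be exercised over HTTP. A minimal sketch using httpx; the base URL, auth header, and run id are assumptions, while the paths and request fields come from calibration.py:

```python
# Sketch only: base URL, token header, and run id are assumptions, not part of
# the diff; the paths and JSON fields mirror JudgeCalibrationRequest.
import httpx

headers = {"Authorization": "Bearer <api-token>"}  # assumed scheme for require_api_token

resp = httpx.post(
    "http://localhost:8000/api/v1/calibration/judge",
    json={
        "run_id": "run_123",
        "labels_source": "feedback",
        "method": "isotonic",
        "holdout_ratio": 0.2,
        "seed": 42,
    },
    headers=headers,
    timeout=120.0,
)
resp.raise_for_status()
calibration = resp.json()

# Fetch the same result later by its calibration_id.
detail = httpx.get(
    f"http://localhost:8000/api/v1/calibration/judge/{calibration['calibration_id']}",
    headers=headers,
)
```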
evalvault/adapters/inbound/api/routers/runs.py
@@ -6,11 +6,12 @@ import asyncio
 import csv
 import json
 from datetime import datetime
+from io import BytesIO
 from pathlib import Path
 from typing import Any, Literal
 
 from fastapi import APIRouter, File, HTTPException, Query, UploadFile
-from fastapi.responses import PlainTextResponse, Response, StreamingResponse
+from fastapi.responses import HTMLResponse, PlainTextResponse, Response, StreamingResponse
 from pydantic import BaseModel
 
 from evalvault.adapters.inbound.api.main import AdapterDep
@@ -21,6 +22,7 @@ from evalvault.adapters.outbound.dataset.templates import (
 )
 from evalvault.adapters.outbound.debug.report_renderer import render_markdown
 from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
+from evalvault.adapters.outbound.report import DashboardGenerator
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities import (
     CalibrationResult,
@@ -1159,6 +1161,74 @@ def get_improvement_guide(
         raise HTTPException(status_code=500, detail=str(e))
 
 
+@router.get("/{run_id}/analysis-report", response_model=None)
+def get_analysis_report(
+    run_id: str,
+    adapter: AdapterDep,
+    format: Literal["markdown", "html"] = Query("markdown", description="Report format"),
+    include_nlp: bool = Query(True, description="Include NLP analysis"),
+    include_causal: bool = Query(True, description="Include causal analysis"),
+    use_cache: bool = Query(True, description="Use cached report if available"),
+    save: bool = Query(False, description="Save report to database"),
+):
+    """Generate analysis report (Markdown/HTML)."""
+    try:
+        report = adapter.generate_report(
+            run_id,
+            output_format=format,
+            include_nlp=include_nlp,
+            include_causal=include_causal,
+            use_cache=use_cache,
+            save=save,
+        )
+        if format == "html":
+            return HTMLResponse(report)
+        return PlainTextResponse(report)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/{run_id}/dashboard", response_model=None)
+def get_dashboard(
+    run_id: str,
+    adapter: AdapterDep,
+    format: Literal["png", "svg", "pdf"] = Query("png", description="Dashboard format"),
+    include_nlp: bool = Query(True, description="Include NLP analysis"),
+    include_causal: bool = Query(True, description="Include causal analysis"),
+):
+    """Generate dashboard image for a run."""
+    try:
+        dashboard_payload = adapter.build_dashboard_payload(
+            run_id,
+            include_nlp=include_nlp,
+            include_causal=include_causal,
+        )
+        generator = DashboardGenerator()
+        fig = generator.generate_evaluation_dashboard(
+            run_id,
+            analysis_data=dashboard_payload,
+        )
+        buffer = BytesIO()
+        fig.savefig(buffer, format=format, dpi=300, bbox_inches="tight")
+        fig.clear()
+        media_types = {
+            "png": "image/png",
+            "svg": "image/svg+xml",
+            "pdf": "application/pdf",
+        }
+        return Response(content=buffer.getvalue(), media_type=media_types[format])
+    except ImportError as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.get("/{run_id}/report")
 def generate_llm_report(
     run_id: str,
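The two new run endpoints return a rendered analysis report and a dashboard image. A minimal sketch, assuming the runs router is mounted at /api/v1/runs (the prefix is not shown in this diff) and the same assumed auth header as above:

```python
# Sketch only: the /api/v1/runs prefix, auth header, and run id are assumptions;
# the paths and query parameters come from the runs.py hunk above.
import httpx

base = "http://localhost:8000/api/v1/runs"
headers = {"Authorization": "Bearer <api-token>"}

# Markdown analysis report, regenerated server-side and persisted.
report = httpx.get(
    f"{base}/run_123/analysis-report",
    params={"format": "markdown", "use_cache": False, "save": True},
    headers=headers,
)
print(report.text)

# PNG dashboard rendered by DashboardGenerator.
dashboard = httpx.get(f"{base}/run_123/dashboard", params={"format": "png"}, headers=headers)
with open("run_123_dashboard.png", "wb") as fh:
    fh.write(dashboard.content)
```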
|