evalvault-1.70.1-py3-none-any.whl → evalvault-1.72.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. evalvault/adapters/inbound/api/adapter.py +367 -3
  2. evalvault/adapters/inbound/api/main.py +17 -1
  3. evalvault/adapters/inbound/api/routers/calibration.py +133 -0
  4. evalvault/adapters/inbound/api/routers/runs.py +71 -1
  5. evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
  6. evalvault/adapters/inbound/cli/commands/analyze.py +1 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +1 -1
  8. evalvault/adapters/inbound/cli/commands/experiment.py +27 -1
  9. evalvault/adapters/inbound/cli/commands/graph_rag.py +303 -0
  10. evalvault/adapters/inbound/cli/commands/history.py +1 -1
  11. evalvault/adapters/inbound/cli/commands/regress.py +169 -1
  12. evalvault/adapters/inbound/cli/commands/run.py +225 -1
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +57 -0
  14. evalvault/adapters/outbound/analysis/network_analyzer_module.py +17 -4
  15. evalvault/adapters/outbound/dataset/__init__.py +6 -0
  16. evalvault/adapters/outbound/dataset/multiturn_json_loader.py +111 -0
  17. evalvault/adapters/outbound/report/__init__.py +6 -0
  18. evalvault/adapters/outbound/report/ci_report_formatter.py +43 -0
  19. evalvault/adapters/outbound/report/dashboard_generator.py +24 -9
  20. evalvault/adapters/outbound/report/pr_comment_formatter.py +50 -0
  21. evalvault/adapters/outbound/retriever/__init__.py +8 -0
  22. evalvault/adapters/outbound/retriever/graph_rag_adapter.py +326 -0
  23. evalvault/adapters/outbound/storage/base_sql.py +291 -0
  24. evalvault/adapters/outbound/storage/postgres_adapter.py +130 -0
  25. evalvault/adapters/outbound/storage/postgres_schema.sql +60 -0
  26. evalvault/adapters/outbound/storage/schema.sql +63 -0
  27. evalvault/adapters/outbound/storage/sqlite_adapter.py +107 -0
  28. evalvault/domain/entities/__init__.py +20 -0
  29. evalvault/domain/entities/graph_rag.py +30 -0
  30. evalvault/domain/entities/multiturn.py +78 -0
  31. evalvault/domain/metrics/__init__.py +10 -0
  32. evalvault/domain/metrics/multiturn_metrics.py +113 -0
  33. evalvault/domain/metrics/registry.py +36 -0
  34. evalvault/domain/services/__init__.py +8 -0
  35. evalvault/domain/services/evaluator.py +5 -2
  36. evalvault/domain/services/graph_rag_experiment.py +155 -0
  37. evalvault/domain/services/multiturn_evaluator.py +187 -0
  38. evalvault/ports/inbound/__init__.py +2 -0
  39. evalvault/ports/inbound/multiturn_port.py +23 -0
  40. evalvault/ports/inbound/web_port.py +4 -0
  41. evalvault/ports/outbound/graph_retriever_port.py +24 -0
  42. evalvault/ports/outbound/storage_port.py +25 -0
  43. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/METADATA +1 -1
  44. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/RECORD +47 -33
  45. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/WHEEL +0 -0
  46. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/entry_points.txt +0 -0
  47. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/licenses/LICENSE.md +0 -0
--- a/evalvault/adapters/inbound/api/adapter.py
+++ b/evalvault/adapters/inbound/api/adapter.py
@@ -8,11 +8,20 @@ import json
 import logging
 import time
 from collections.abc import Callable
-from dataclasses import dataclass
+from dataclasses import asdict, dataclass
+from datetime import UTC, datetime
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, cast
 from urllib.request import urlopen
 
+from evalvault.adapters.outbound.analysis import (
+    CausalAnalysisAdapter,
+    NLPAnalysisAdapter,
+    StatisticalAnalysisAdapter,
+)
+from evalvault.adapters.outbound.cache import MemoryCacheAdapter
+from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrationReporter
+from evalvault.adapters.outbound.report import MarkdownReportAdapter
 from evalvault.config.phoenix_support import PhoenixExperimentResolver
 from evalvault.config.settings import Settings
 from evalvault.domain.entities import (
@@ -20,6 +29,7 @@ from evalvault.domain.entities import (
     FeedbackSummary,
     SatisfactionFeedback,
 )
+from evalvault.domain.entities.analysis import AnalysisBundle
 from evalvault.domain.entities.debug import DebugReport
 from evalvault.domain.entities.prompt import PromptSetBundle
 from evalvault.domain.metrics.registry import (
@@ -29,8 +39,10 @@ from evalvault.domain.metrics.registry import (
     list_metric_names,
     list_metric_specs,
 )
+from evalvault.domain.services.analysis_service import AnalysisService
 from evalvault.domain.services.cluster_map_builder import build_cluster_map
 from evalvault.domain.services.debug_report_service import DebugReportService
+from evalvault.domain.services.judge_calibration_service import JudgeCalibrationService
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
     build_prompt_bundle,
@@ -990,6 +1002,188 @@ class WebUIAdapter:
         service = SatisfactionCalibrationService()
         return service.build_calibration(run, feedbacks, model=model)
 
+    def run_judge_calibration(
+        self,
+        *,
+        run_id: str,
+        labels_source: str,
+        method: str,
+        metrics: list[str],
+        holdout_ratio: float,
+        seed: int,
+        parallel: bool,
+        concurrency: int,
+    ) -> dict[str, object]:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        storage = self._storage
+        if holdout_ratio <= 0 or holdout_ratio >= 1:
+            raise ValueError("holdout_ratio must be between 0 and 1")
+        if seed < 0:
+            raise ValueError("seed must be >= 0")
+        if concurrency <= 0:
+            raise ValueError("concurrency must be >= 1")
+
+        run = self.get_run_details(run_id)
+        feedbacks = storage.list_feedback(run_id)
+        if labels_source in {"feedback", "hybrid"} and not feedbacks:
+            raise ValueError("Feedback labels are required for this labels_source")
+        resolved_metrics = metrics or list(run.metrics_evaluated)
+        if not resolved_metrics:
+            raise ValueError("No metrics available for calibration")
+
+        started_at = datetime.now(UTC)
+        service = JudgeCalibrationService()
+        result = service.calibrate(
+            run,
+            feedbacks,
+            labels_source=labels_source,
+            method=method,
+            metrics=resolved_metrics,
+            holdout_ratio=holdout_ratio,
+            seed=seed,
+            parallel=parallel,
+            concurrency=concurrency,
+        )
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+
+        reporter = JudgeCalibrationReporter()
+        timestamp = started_at.strftime("%Y%m%d_%H%M%S")
+        calibration_id = f"judge_calibration_{run_id}_{timestamp}"
+        base_dir = Path("reports/calibration")
+        output_path = base_dir / f"{calibration_id}.json"
+        artifacts_dir = base_dir / "artifacts" / calibration_id
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        artifacts_index = reporter.write_artifacts(result=result, artifacts_dir=artifacts_dir)
+
+        rendered = reporter.render_json(result)
+
+        status = "ok" if result.summary.gate_passed else "degraded"
+        summary_payload = {
+            "calibration_id": calibration_id,
+            "run_id": result.summary.run_id,
+            "labels_source": result.summary.labels_source,
+            "method": result.summary.method,
+            "metrics": list(result.summary.metrics),
+            "holdout_ratio": result.summary.holdout_ratio,
+            "seed": result.summary.seed,
+            "total_labels": result.summary.total_labels,
+            "total_samples": result.summary.total_samples,
+            "gate_passed": result.summary.gate_passed,
+            "gate_threshold": result.summary.gate_threshold,
+            "notes": list(result.summary.notes),
+            "created_at": started_at.astimezone(UTC).isoformat(),
+        }
+        payload = {
+            "calibration_id": calibration_id,
+            "status": status,
+            "started_at": started_at.astimezone(UTC).isoformat(),
+            "finished_at": finished_at.astimezone(UTC).isoformat(),
+            "duration_ms": duration_ms,
+            "artifacts": artifacts_index,
+            "summary": summary_payload,
+            "metrics": rendered["metrics"],
+            "case_results": rendered["case_results"],
+            "warnings": list(result.warnings),
+        }
+        output_path.write_text(
+            json.dumps(payload, ensure_ascii=False, indent=2),
+            encoding="utf-8",
+        )
+
+        metadata = run.tracker_metadata or {}
+        history = metadata.get("judge_calibration_history")
+        if not isinstance(history, list):
+            history = []
+        history.append(
+            {
+                "calibration_id": calibration_id,
+                "run_id": run_id,
+                "labels_source": summary_payload["labels_source"],
+                "method": summary_payload["method"],
+                "metrics": summary_payload["metrics"],
+                "holdout_ratio": summary_payload["holdout_ratio"],
+                "seed": summary_payload["seed"],
+                "total_labels": summary_payload["total_labels"],
+                "total_samples": summary_payload["total_samples"],
+                "gate_passed": summary_payload["gate_passed"],
+                "gate_threshold": summary_payload["gate_threshold"],
+                "created_at": summary_payload["created_at"],
+                "output_path": str(output_path),
+                "artifacts": artifacts_index,
+            }
+        )
+        metadata["judge_calibration_history"] = history
+        storage.update_run_metadata(run_id, metadata)
+        return payload
+
+    def get_judge_calibration(self, calibration_id: str) -> dict[str, object]:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        entry = self._find_judge_calibration_entry(calibration_id)
+        output_path = Path(str(entry.get("output_path") or ""))
+        if not output_path.exists():
+            raise KeyError(f"Calibration output not found: {calibration_id}")
+        payload = json.loads(output_path.read_text(encoding="utf-8"))
+        return payload
+
+    def list_judge_calibrations(self, *, limit: int = 20) -> list[dict[str, object]]:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        storage = self._storage
+        scan_limit = max(100, limit * 5)
+        runs = storage.list_runs(limit=scan_limit)
+        entries: list[dict[str, object]] = []
+        for run in runs:
+            metadata = getattr(run, "tracker_metadata", {}) or {}
+            history = metadata.get("judge_calibration_history")
+            if not isinstance(history, list):
+                continue
+            for item in history:
+                if isinstance(item, dict):
+                    entries.append(
+                        {
+                            "calibration_id": item.get("calibration_id"),
+                            "run_id": item.get("run_id"),
+                            "labels_source": item.get("labels_source"),
+                            "method": item.get("method"),
+                            "metrics": item.get("metrics") or [],
+                            "holdout_ratio": item.get("holdout_ratio"),
+                            "seed": item.get("seed"),
+                            "total_labels": item.get("total_labels"),
+                            "total_samples": item.get("total_samples"),
+                            "gate_passed": item.get("gate_passed"),
+                            "gate_threshold": item.get("gate_threshold"),
+                            "created_at": item.get("created_at"),
+                        }
+                    )
+
+        def _sort_key(item: dict[str, object]) -> str:
+            value = item.get("created_at")
+            return value if isinstance(value, str) else ""
+
+        entries.sort(key=_sort_key, reverse=True)
+        return entries[:limit]
+
+    def _find_judge_calibration_entry(self, calibration_id: str) -> dict[str, object]:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+        storage = self._storage
+        scan_limit = 1000
+        runs = storage.list_runs(limit=scan_limit)
+        for run in runs:
+            metadata = getattr(run, "tracker_metadata", {}) or {}
+            history = metadata.get("judge_calibration_history")
+            if not isinstance(history, list):
+                continue
+            for item in history:
+                if not isinstance(item, dict):
+                    continue
+                if item.get("calibration_id") == calibration_id:
+                    return item
+        raise KeyError(f"Calibration not found: {calibration_id}")
+
     def list_stage_events(self, run_id: str, *, stage_type: str | None = None) -> list[StageEvent]:
         """List stage events."""
         if self._storage is None or not hasattr(self._storage, "list_stage_events"):
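The new `run_judge_calibration` method writes its full result to `reports/calibration/<calibration_id>.json` and appends a history entry to the run's `tracker_metadata`. A minimal sketch of consuming that artifact from disk, assuming only the file location and payload keys visible in the hunk above (the helper below is illustrative, not part of the package):

```python
import json
from pathlib import Path


def summarize_calibration(calibration_id: str, base_dir: Path = Path("reports/calibration")) -> None:
    """Print the gate status and per-metric error stats from a saved calibration payload."""
    payload = json.loads((base_dir / f"{calibration_id}.json").read_text(encoding="utf-8"))
    summary = payload["summary"]
    print(
        f"run {summary['run_id']}: gate_passed={summary['gate_passed']} "
        f"(threshold={summary['gate_threshold']}, labels={summary['total_labels']})"
    )
    for metric in payload["metrics"]:
        # Entries follow JudgeCalibrationMetricResponse in the new calibration router.
        print(f"  {metric['metric']}: method={metric['method']}, mae={metric['mae']}")


# Hypothetical ID following the f-string pattern above:
# summarize_calibration("judge_calibration_run-123_20250101_120000")
```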
@@ -1153,6 +1347,110 @@ class WebUIAdapter:
             logger.error(f"Failed to delete run {run_id}: {e}")
             return False
 
+    def _build_analysis_bundle(
+        self,
+        run_id: str,
+        *,
+        include_nlp: bool,
+        include_causal: bool,
+    ) -> AnalysisBundle:
+        if self._storage is None:
+            raise RuntimeError("Storage not configured")
+
+        run = self._storage.get_run(run_id)
+        if not run.results:
+            raise ValueError("Run has no results to analyze")
+
+        analysis_adapter = StatisticalAnalysisAdapter()
+        cache_adapter = MemoryCacheAdapter()
+
+        nlp_adapter = None
+        if include_nlp:
+            settings = self._settings or Settings()
+            llm_adapter = self._llm_adapter
+            if llm_adapter is None:
+                from evalvault.adapters.outbound.llm import get_llm_adapter
+
+                try:
+                    llm_adapter = get_llm_adapter(settings)
+                except Exception as exc:
+                    logger.warning("LLM adapter initialization failed for report: %s", exc)
+                    llm_adapter = None
+            if llm_adapter is not None:
+                nlp_adapter = NLPAnalysisAdapter(
+                    llm_adapter=llm_adapter,
+                    use_embeddings=True,
+                )
+
+        causal_adapter = CausalAnalysisAdapter() if include_causal else None
+
+        service = AnalysisService(
+            analysis_adapter=analysis_adapter,
+            nlp_adapter=nlp_adapter,
+            causal_adapter=causal_adapter,
+            cache_adapter=cache_adapter,
+        )
+        return service.analyze_run(run, include_nlp=include_nlp, include_causal=include_causal)
+
+    @staticmethod
+    def _build_dashboard_payload(bundle: AnalysisBundle) -> dict[str, Any]:
+        payload: dict[str, Any] = {"run_id": bundle.run_id}
+        analysis = bundle.statistical
+        if analysis is None:
+            return payload
+
+        metrics_summary: dict[str, Any] = {}
+        for metric, stats in analysis.metrics_summary.items():
+            metrics_summary[metric] = {
+                "mean": stats.mean,
+                "std": stats.std,
+                "min": stats.min,
+                "max": stats.max,
+                "median": stats.median,
+                "percentile_25": stats.percentile_25,
+                "percentile_75": stats.percentile_75,
+                "count": stats.count,
+            }
+
+        payload.update(
+            {
+                "metrics_summary": metrics_summary,
+                "correlation_matrix": analysis.correlation_matrix,
+                "correlation_metrics": analysis.correlation_metrics,
+                "metric_pass_rates": analysis.metric_pass_rates,
+                "low_performers": [asdict(item) for item in analysis.low_performers],
+            }
+        )
+        return payload
+
+    def _find_cached_report(
+        self,
+        *,
+        run_id: str,
+        output_format: str,
+        include_nlp: bool,
+        include_causal: bool,
+    ) -> str | None:
+        if self._storage is None:
+            return None
+
+        reports = self._storage.list_analysis_reports(
+            run_id=run_id,
+            report_type="analysis",
+            format=output_format,
+            limit=10,
+        )
+        for report in reports:
+            metadata = report.get("metadata") or {}
+            if metadata.get("include_nlp") != include_nlp:
+                continue
+            if metadata.get("include_causal") != include_causal:
+                continue
+            content = report.get("content")
+            if content:
+                return content
+        return None
+
     def generate_report(
         self,
         run_id: str,
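`_build_dashboard_payload` flattens the statistical part of the `AnalysisBundle` into plain dicts for the dashboard generator. For orientation, its return value has roughly this shape (metric names and numbers below are invented; `low_performers` entries are just `asdict()` of whatever dataclass the analyzer uses):

```python
# Illustrative shape only; keys mirror _build_dashboard_payload above, values are made up.
dashboard_payload = {
    "run_id": "run-123",
    "metrics_summary": {
        "faithfulness": {
            "mean": 0.82, "std": 0.07, "min": 0.61, "max": 0.95,
            "median": 0.84, "percentile_25": 0.78, "percentile_75": 0.88, "count": 50,
        },
    },
    "correlation_matrix": [[1.0, 0.4], [0.4, 1.0]],
    "correlation_metrics": ["faithfulness", "answer_relevancy"],
    "metric_pass_rates": {"faithfulness": 0.9, "answer_relevancy": 0.86},
    "low_performers": [],  # asdict() of each low-performer entry; fields depend on the dataclass
}
```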
@@ -1160,6 +1458,8 @@ class WebUIAdapter:
         *,
         include_nlp: bool = True,
         include_causal: bool = True,
+        use_cache: bool = True,
+        save: bool = False,
     ) -> str:
         """Generate a report.
 
@@ -1172,8 +1472,72 @@ class WebUIAdapter:
         Returns:
             The generated report
         """
-        # TODO: implement the actual report generation logic
-        raise NotImplementedError("Report generation not yet implemented")
+        if use_cache:
+            cached = self._find_cached_report(
+                run_id=run_id,
+                output_format=output_format,
+                include_nlp=include_nlp,
+                include_causal=include_causal,
+            )
+            if cached is not None:
+                return cached
+
+        bundle = self._build_analysis_bundle(
+            run_id,
+            include_nlp=include_nlp,
+            include_causal=include_causal,
+        )
+
+        report_generator = self._report_generator or MarkdownReportAdapter()
+        if output_format == "html":
+            if isinstance(report_generator, MarkdownReportAdapter):
+                report_content = report_generator.generate_html(
+                    bundle,
+                    include_nlp=include_nlp,
+                    include_causal=include_causal,
+                )
+            else:
+                report_content = report_generator.generate_html(bundle, include_nlp=include_nlp)
+        elif isinstance(report_generator, MarkdownReportAdapter):
+            report_content = report_generator.generate_markdown(
+                bundle,
+                include_nlp=include_nlp,
+                include_causal=include_causal,
+            )
+        else:
+            report_content = report_generator.generate_markdown(bundle, include_nlp=include_nlp)
+
+        if save and self._storage is not None:
+            metadata = {
+                "include_nlp": include_nlp,
+                "include_causal": include_causal,
+                "source": "api",
+            }
+            self._storage.save_analysis_report(
+                report_id=None,
+                run_id=run_id,
+                experiment_id=None,
+                report_type="analysis",
+                format=output_format,
+                content=report_content,
+                metadata=metadata,
+            )
+
+        return report_content
+
+    def build_dashboard_payload(
+        self,
+        run_id: str,
+        *,
+        include_nlp: bool = True,
+        include_causal: bool = True,
+    ) -> dict[str, Any]:
+        bundle = self._build_analysis_bundle(
+            run_id,
+            include_nlp=include_nlp,
+            include_causal=include_causal,
+        )
+        return self._build_dashboard_payload(bundle)
 
     def get_available_metrics(self) -> list[str]:
         """Return the list of available metrics."""
--- a/evalvault/adapters/inbound/api/main.py
+++ b/evalvault/adapters/inbound/api/main.py
@@ -158,7 +158,17 @@ def create_app() -> FastAPI:
         allow_headers=["*"],
     )
 
-    from .routers import benchmark, chat, config, domain, knowledge, mcp, pipeline, runs
+    from .routers import (
+        benchmark,
+        calibration,
+        chat,
+        config,
+        domain,
+        knowledge,
+        mcp,
+        pipeline,
+        runs,
+    )
 
     auth_dependencies = [Depends(require_api_token)]
 
@@ -210,6 +220,12 @@ def create_app() -> FastAPI:
         tags=["mcp"],
         dependencies=auth_dependencies,
     )
+    app.include_router(
+        calibration.router,
+        prefix="/api/v1/calibration",
+        tags=["calibration"],
+        dependencies=auth_dependencies,
+    )
 
     @app.get("/health")
     def health_check():
--- /dev/null
+++ b/evalvault/adapters/inbound/api/routers/calibration.py
@@ -0,0 +1,133 @@
+from __future__ import annotations
+
+from typing import Literal
+
+from fastapi import APIRouter, HTTPException, Query
+from pydantic import BaseModel, ConfigDict, Field
+
+from evalvault.adapters.inbound.api.main import AdapterDep
+
+router = APIRouter()
+
+
+class JudgeCalibrationRequest(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+
+    run_id: str
+    labels_source: Literal["feedback", "gold", "hybrid"] = "feedback"
+    method: Literal["platt", "isotonic", "temperature", "none"] = "isotonic"
+    metrics: list[str] | None = None
+    holdout_ratio: float = Field(0.2, gt=0.0, lt=1.0)
+    seed: int = Field(42, ge=0)
+    parallel: bool = False
+    concurrency: int = Field(8, ge=1)
+
+
+class JudgeCalibrationCaseResponse(BaseModel):
+    test_case_id: str
+    raw_score: float
+    calibrated_score: float
+    label: float | None = None
+    label_source: str | None = None
+
+
+class JudgeCalibrationMetricResponse(BaseModel):
+    metric: str
+    method: str
+    sample_count: int
+    label_count: int
+    mae: float | None
+    pearson: float | None
+    spearman: float | None
+    temperature: float | None = None
+    parameters: dict[str, float | None] = Field(default_factory=dict)
+    gate_passed: bool | None = None
+    warning: str | None = None
+
+
+class JudgeCalibrationSummaryResponse(BaseModel):
+    calibration_id: str
+    run_id: str
+    labels_source: str
+    method: str
+    metrics: list[str]
+    holdout_ratio: float
+    seed: int
+    total_labels: int
+    total_samples: int
+    gate_passed: bool
+    gate_threshold: float | None = None
+    notes: list[str] = Field(default_factory=list)
+    created_at: str
+
+
+class JudgeCalibrationResponse(BaseModel):
+    calibration_id: str
+    status: Literal["ok", "degraded"]
+    started_at: str
+    finished_at: str
+    duration_ms: int
+    artifacts: dict[str, str]
+    summary: JudgeCalibrationSummaryResponse
+    metrics: list[JudgeCalibrationMetricResponse]
+    case_results: dict[str, list[JudgeCalibrationCaseResponse]]
+    warnings: list[str]
+
+
+class JudgeCalibrationHistoryItem(BaseModel):
+    calibration_id: str
+    run_id: str
+    labels_source: str
+    method: str
+    metrics: list[str]
+    holdout_ratio: float
+    seed: int
+    total_labels: int
+    total_samples: int
+    gate_passed: bool
+    gate_threshold: float | None = None
+    created_at: str
+
+
+@router.post("/judge", response_model=JudgeCalibrationResponse)
+def run_judge_calibration(
+    request: JudgeCalibrationRequest, adapter: AdapterDep
+) -> JudgeCalibrationResponse:
+    try:
+        payload = adapter.run_judge_calibration(
+            run_id=request.run_id,
+            labels_source=request.labels_source,
+            method=request.method,
+            metrics=request.metrics or [],
+            holdout_ratio=request.holdout_ratio,
+            seed=request.seed,
+            parallel=request.parallel,
+            concurrency=request.concurrency,
+        )
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    except RuntimeError as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+    return JudgeCalibrationResponse.model_validate(payload)
+
+
+@router.get("/judge/{calibration_id}", response_model=JudgeCalibrationResponse)
+def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCalibrationResponse:
+    try:
+        payload = adapter.get_judge_calibration(calibration_id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    except RuntimeError as exc:
+        raise HTTPException(status_code=500, detail=str(exc)) from exc
+    return JudgeCalibrationResponse.model_validate(payload)
+
+
+@router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
+def list_calibrations(
+    adapter: AdapterDep,
+    limit: int = Query(20, ge=1, le=200),
+) -> list[JudgeCalibrationHistoryItem]:
+    entries = adapter.list_judge_calibrations(limit=limit)
+    return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
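Combined with the registration in `main.py`, these handlers are served under `/api/v1/calibration` behind the same `require_api_token` dependency as the other routers. A sketch of exercising them over HTTP; the base URL is an assumption, and the auth header required by `require_api_token` is omitted because the token mechanism is not shown in this diff:

```python
import httpx

BASE = "http://localhost:8000/api/v1/calibration"  # assumed local deployment

# Start a judge calibration for an existing run (other request fields use their defaults).
resp = httpx.post(
    f"{BASE}/judge",
    json={"run_id": "run-123", "labels_source": "feedback", "method": "isotonic"},
    timeout=120.0,
)
resp.raise_for_status()
result = resp.json()
print(result["status"], result["summary"]["gate_passed"])

# Fetch the persisted result later by its calibration_id.
detail = httpx.get(f"{BASE}/judge/{result['calibration_id']}", timeout=30.0)
detail.raise_for_status()
print(detail.json()["duration_ms"])
```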
--- a/evalvault/adapters/inbound/api/routers/runs.py
+++ b/evalvault/adapters/inbound/api/routers/runs.py
@@ -6,11 +6,12 @@ import asyncio
 import csv
 import json
 from datetime import datetime
+from io import BytesIO
 from pathlib import Path
 from typing import Any, Literal
 
 from fastapi import APIRouter, File, HTTPException, Query, UploadFile
-from fastapi.responses import PlainTextResponse, Response, StreamingResponse
+from fastapi.responses import HTMLResponse, PlainTextResponse, Response, StreamingResponse
 from pydantic import BaseModel
 
 from evalvault.adapters.inbound.api.main import AdapterDep
@@ -21,6 +22,7 @@ from evalvault.adapters.outbound.dataset.templates import (
 )
 from evalvault.adapters.outbound.debug.report_renderer import render_markdown
 from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
+from evalvault.adapters.outbound.report import DashboardGenerator
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities import (
     CalibrationResult,
@@ -1159,6 +1161,74 @@ def get_improvement_guide(
         raise HTTPException(status_code=500, detail=str(e))
 
 
+@router.get("/{run_id}/analysis-report", response_model=None)
+def get_analysis_report(
+    run_id: str,
+    adapter: AdapterDep,
+    format: Literal["markdown", "html"] = Query("markdown", description="Report format"),
+    include_nlp: bool = Query(True, description="Include NLP analysis"),
+    include_causal: bool = Query(True, description="Include causal analysis"),
+    use_cache: bool = Query(True, description="Use cached report if available"),
+    save: bool = Query(False, description="Save report to database"),
+):
+    """Generate analysis report (Markdown/HTML)."""
+    try:
+        report = adapter.generate_report(
+            run_id,
+            output_format=format,
+            include_nlp=include_nlp,
+            include_causal=include_causal,
+            use_cache=use_cache,
+            save=save,
+        )
+        if format == "html":
+            return HTMLResponse(report)
+        return PlainTextResponse(report)
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.get("/{run_id}/dashboard", response_model=None)
+def get_dashboard(
+    run_id: str,
+    adapter: AdapterDep,
+    format: Literal["png", "svg", "pdf"] = Query("png", description="Dashboard format"),
+    include_nlp: bool = Query(True, description="Include NLP analysis"),
+    include_causal: bool = Query(True, description="Include causal analysis"),
+):
+    """Generate dashboard image for a run."""
+    try:
+        dashboard_payload = adapter.build_dashboard_payload(
+            run_id,
+            include_nlp=include_nlp,
+            include_causal=include_causal,
+        )
+        generator = DashboardGenerator()
+        fig = generator.generate_evaluation_dashboard(
+            run_id,
+            analysis_data=dashboard_payload,
+        )
+        buffer = BytesIO()
+        fig.savefig(buffer, format=format, dpi=300, bbox_inches="tight")
+        fig.clear()
+        media_types = {
+            "png": "image/png",
+            "svg": "image/svg+xml",
+            "pdf": "application/pdf",
+        }
+        return Response(content=buffer.getvalue(), media_type=media_types[format])
+    except ImportError as exc:
+        raise HTTPException(status_code=500, detail=str(exc))
+    except KeyError:
+        raise HTTPException(status_code=404, detail="Run not found")
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+
 @router.get("/{run_id}/report")
 def generate_llm_report(
     run_id: str,
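The two new `runs` endpoints expose the adapter methods above over HTTP: `/analysis-report` returns Markdown or HTML, and `/dashboard` streams an image rendered by `DashboardGenerator`. A sketch of fetching both, assuming the runs router is mounted under `/api/v1/runs` (the prefix is not visible in this diff) and again omitting authentication:

```python
import httpx

RUN_ID = "run-123"
BASE = f"http://localhost:8000/api/v1/runs/{RUN_ID}"  # prefix is an assumption

# Markdown analysis report; pass format=html to get an HTMLResponse instead.
report = httpx.get(f"{BASE}/analysis-report", params={"format": "markdown", "save": "true"})
report.raise_for_status()
print(report.text[:200])

# Dashboard image (PNG by default; svg and pdf are also accepted).
dashboard = httpx.get(f"{BASE}/dashboard", params={"format": "png"}, timeout=120.0)
dashboard.raise_for_status()
with open(f"dashboard_{RUN_ID}.png", "wb") as fh:
    fh.write(dashboard.content)
```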