evalvault-1.70.1-py3-none-any.whl → evalvault-1.72.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. evalvault/adapters/inbound/api/adapter.py +367 -3
  2. evalvault/adapters/inbound/api/main.py +17 -1
  3. evalvault/adapters/inbound/api/routers/calibration.py +133 -0
  4. evalvault/adapters/inbound/api/routers/runs.py +71 -1
  5. evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
  6. evalvault/adapters/inbound/cli/commands/analyze.py +1 -0
  7. evalvault/adapters/inbound/cli/commands/compare.py +1 -1
  8. evalvault/adapters/inbound/cli/commands/experiment.py +27 -1
  9. evalvault/adapters/inbound/cli/commands/graph_rag.py +303 -0
  10. evalvault/adapters/inbound/cli/commands/history.py +1 -1
  11. evalvault/adapters/inbound/cli/commands/regress.py +169 -1
  12. evalvault/adapters/inbound/cli/commands/run.py +225 -1
  13. evalvault/adapters/inbound/cli/commands/run_helpers.py +57 -0
  14. evalvault/adapters/outbound/analysis/network_analyzer_module.py +17 -4
  15. evalvault/adapters/outbound/dataset/__init__.py +6 -0
  16. evalvault/adapters/outbound/dataset/multiturn_json_loader.py +111 -0
  17. evalvault/adapters/outbound/report/__init__.py +6 -0
  18. evalvault/adapters/outbound/report/ci_report_formatter.py +43 -0
  19. evalvault/adapters/outbound/report/dashboard_generator.py +24 -9
  20. evalvault/adapters/outbound/report/pr_comment_formatter.py +50 -0
  21. evalvault/adapters/outbound/retriever/__init__.py +8 -0
  22. evalvault/adapters/outbound/retriever/graph_rag_adapter.py +326 -0
  23. evalvault/adapters/outbound/storage/base_sql.py +291 -0
  24. evalvault/adapters/outbound/storage/postgres_adapter.py +130 -0
  25. evalvault/adapters/outbound/storage/postgres_schema.sql +60 -0
  26. evalvault/adapters/outbound/storage/schema.sql +63 -0
  27. evalvault/adapters/outbound/storage/sqlite_adapter.py +107 -0
  28. evalvault/domain/entities/__init__.py +20 -0
  29. evalvault/domain/entities/graph_rag.py +30 -0
  30. evalvault/domain/entities/multiturn.py +78 -0
  31. evalvault/domain/metrics/__init__.py +10 -0
  32. evalvault/domain/metrics/multiturn_metrics.py +113 -0
  33. evalvault/domain/metrics/registry.py +36 -0
  34. evalvault/domain/services/__init__.py +8 -0
  35. evalvault/domain/services/evaluator.py +5 -2
  36. evalvault/domain/services/graph_rag_experiment.py +155 -0
  37. evalvault/domain/services/multiturn_evaluator.py +187 -0
  38. evalvault/ports/inbound/__init__.py +2 -0
  39. evalvault/ports/inbound/multiturn_port.py +23 -0
  40. evalvault/ports/inbound/web_port.py +4 -0
  41. evalvault/ports/outbound/graph_retriever_port.py +24 -0
  42. evalvault/ports/outbound/storage_port.py +25 -0
  43. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/METADATA +1 -1
  44. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/RECORD +47 -33
  45. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/WHEEL +0 -0
  46. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/entry_points.txt +0 -0
  47. {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -23,6 +23,7 @@ from .domain import create_domain_app
  from .experiment import register_experiment_commands
  from .gate import register_gate_commands
  from .generate import register_generate_commands
+ from .graph_rag import create_graph_rag_app
  from .history import register_history_commands
  from .init import register_init_command
  from .kg import create_kg_app
@@ -82,6 +83,7 @@ SUB_APPLICATIONS: tuple[SubAppModule, ...] = (
      SubAppModule("kg", create_kg_app),
      SubAppModule("domain", create_domain_app),
      SubAppModule("benchmark", create_benchmark_app),
+     SubAppModule("graphrag", create_graph_rag_app),
      SubAppModule("method", create_method_app),
      SubAppModule("ops", create_ops_app),
      SubAppModule("phoenix", create_phoenix_app),
@@ -298,6 +298,7 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
          _console.print(f"\n[green]리포트 생성: {report}[/green]")

      @app.command(name="analyze-compare")
+     @app.command(name="compare-analysis")
      def analyze_compare(
          run_id1: str = typer.Argument(..., help="첫 번째 Run ID"),
          run_id2: str = typer.Argument(..., help="두 번째 Run ID"),
@@ -54,7 +54,7 @@ def register_compare_commands(app: typer.Typer, console: Console) -> None:
              "t-test",
              "--test",
              "-t",
-             help="통계 검정 (t-test, mann-whitney)",
+             help="통계 검정 (t-test | mann-whitney)",
          ),
          output_format: str = typer.Option(
              "table",
@@ -12,7 +12,7 @@ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdap
  from evalvault.domain.services.experiment_manager import ExperimentManager

  from ..utils.options import db_option
- from ..utils.validators import parse_csv_option
+ from ..utils.validators import parse_csv_option, validate_choice


  def register_experiment_commands(app: typer.Typer, console: Console) -> None:
@@ -29,10 +29,24 @@ def register_experiment_commands(app: typer.Typer, console: Console) -> None:
              "-m",
              help="Comma-separated list of metrics to compare.",
          ),
+         control_retriever: str | None = typer.Option(
+             None,
+             "--control-retriever",
+             help="Control retriever (bm25, dense, hybrid, graphrag).",
+         ),
+         variant_retriever: str | None = typer.Option(
+             None,
+             "--variant-retriever",
+             help="Variant retriever (bm25, dense, hybrid, graphrag).",
+         ),
          db_path: Path = db_option(help_text="Path to database file."),
      ) -> None:
          """Create a new experiment for A/B testing."""

+         for retriever_name in (control_retriever, variant_retriever):
+             if retriever_name:
+                 validate_choice(retriever_name, ["bm25", "dense", "hybrid", "graphrag"], console)
+
          console.print("\n[bold]Creating Experiment[/bold]\n")
          storage = SQLiteStorageAdapter(db_path=db_path)
          manager = ExperimentManager(storage)
@@ -44,6 +58,18 @@ def register_experiment_commands(app: typer.Typer, console: Console) -> None:
              hypothesis=hypothesis,
              metrics=metric_list,
          )
+         if control_retriever:
+             manager.add_group_to_experiment(
+                 experiment.experiment_id,
+                 "control",
+                 f"retriever={control_retriever}",
+             )
+         if variant_retriever:
+             manager.add_group_to_experiment(
+                 experiment.experiment_id,
+                 "variant",
+                 f"retriever={variant_retriever}",
+             )
          console.print(f"[green]Created experiment:[/green] {experiment.experiment_id}")
          console.print(f" Name: {experiment.name}")
          console.print(f" Status: {experiment.status}")
@@ -0,0 +1,303 @@
+ """GraphRAG experiment commands for the EvalVault CLI."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ from dataclasses import asdict
+ from pathlib import Path
+ from typing import Any
+
+ import typer
+ from rich.console import Console
+ from rich.table import Table
+
+ from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
+ from evalvault.adapters.outbound.dataset import get_loader
+ from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
+ from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
+ from evalvault.adapters.outbound.retriever.graph_rag_adapter import GraphRAGAdapter
+ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+ from evalvault.config.settings import Settings, apply_profile
+ from evalvault.domain.services.analysis_service import AnalysisService
+ from evalvault.domain.services.evaluator import RagasEvaluator
+ from evalvault.domain.services.graph_rag_experiment import GraphRAGExperiment
+ from evalvault.ports.outbound.korean_nlp_port import RetrieverPort
+
+ from ..utils.console import print_cli_error
+ from ..utils.options import db_option, profile_option
+ from ..utils.validators import parse_csv_option, validate_choice
+ from .run import _build_dense_retriever
+ from .run_helpers import _is_oss_open_model, load_knowledge_graph, load_retriever_documents
+
+
+ def create_graph_rag_app(console: Console) -> typer.Typer:
+     app = typer.Typer(name="graphrag", help="GraphRAG experiment utilities.")
+
+     @app.command("compare")
+     def graphrag_compare(
+         dataset: Path = typer.Argument(
+             ...,
+             help="Path to dataset file (CSV, Excel, or JSON).",
+             exists=True,
+             readable=True,
+         ),
+         metrics: str = typer.Option(
+             "faithfulness,answer_relevancy",
+             "--metrics",
+             "-m",
+             help="Comma-separated list of metrics to evaluate.",
+         ),
+         baseline_retriever: str = typer.Option(
+             "bm25",
+             "--baseline-retriever",
+             help="Baseline retriever (bm25, dense, hybrid).",
+         ),
+         retriever_docs: Path = typer.Option(
+             ...,
+             "--retriever-docs",
+             help="Retriever documents file (.json/.jsonl/.txt).",
+             exists=True,
+             readable=True,
+         ),
+         kg_path: Path = typer.Option(
+             ...,
+             "--kg",
+             "-k",
+             help="Knowledge graph JSON file for GraphRAG.",
+             exists=True,
+             readable=True,
+         ),
+         retriever_top_k: int = typer.Option(
+             5,
+             "--retriever-top-k",
+             help="Retriever top-k to fill contexts.",
+         ),
+         graph_max_hops: int = typer.Option(
+             2,
+             "--graph-max-hops",
+             help="GraphRAG max hop depth.",
+         ),
+         graph_max_nodes: int = typer.Option(
+             20,
+             "--graph-max-nodes",
+             help="GraphRAG max nodes in subgraph.",
+         ),
+         model: str | None = typer.Option(
+             None,
+             "--model",
+             help="Model to use for evaluation (overrides profile).",
+         ),
+         db_path: Path | None = db_option(help_text="DB 경로 (저장 시 사용)."),
+         profile: str | None = profile_option(help_text="LLM 프로필"),
+         output: Path | None = typer.Option(
+             None,
+             "--output",
+             "-o",
+             help="JSON 출력 파일 경로",
+         ),
+         artifact_dir: Path = typer.Option(
+             Path("reports/analysis/artifacts"),
+             "--artifact-dir",
+             help="GraphRAG 아티팩트 저장 경로",
+         ),
+     ) -> None:
+         validate_choice(baseline_retriever, ["bm25", "dense", "hybrid"], console)
+
+         settings = Settings()
+         profile_name = profile or settings.evalvault_profile
+         if profile_name:
+             settings = apply_profile(settings, profile_name)
+
+         if model:
+             if _is_oss_open_model(model) and settings.llm_provider != "vllm":
+                 settings.llm_provider = "ollama"
+                 settings.ollama_model = model
+             elif settings.llm_provider == "ollama":
+                 settings.ollama_model = model
+             elif settings.llm_provider == "vllm":
+                 settings.vllm_model = model
+             else:
+                 settings.openai_model = model
+
+         if settings.llm_provider == "openai" and not settings.openai_api_key:
+             print_cli_error(console, "OPENAI_API_KEY가 설정되지 않았습니다.")
+             raise typer.Exit(1)
+
+         try:
+             llm_adapter = get_llm_adapter(settings)
+         except Exception as exc:
+             print_cli_error(console, "LLM 어댑터 초기화에 실패했습니다.", details=str(exc))
+             raise typer.Exit(1) from exc
+
+         loader = get_loader(dataset)
+         ds = loader.load(dataset)
+
+         documents, doc_ids = load_retriever_documents(retriever_docs)
+         baseline = _build_baseline_retriever(
+             baseline_retriever,
+             documents=documents,
+             settings=settings,
+             profile_name=profile_name,
+         )
+         if baseline is None:
+             print_cli_error(console, "Baseline retriever 초기화에 실패했습니다.")
+             raise typer.Exit(1)
+
+         kg_graph = load_knowledge_graph(kg_path)
+         graph_adapter = GraphRAGAdapter(kg_graph)
+
+         korean_toolkit = try_create_korean_toolkit()
+         evaluator = RagasEvaluator(
+             korean_toolkit=korean_toolkit,
+             llm_factory=SettingsLLMFactory(settings),
+         )
+         analysis_service = AnalysisService(analysis_adapter=StatisticalAnalysisAdapter())
+         experiment = GraphRAGExperiment(
+             evaluator=evaluator,
+             analysis_service=analysis_service,
+         )
+
+         metric_list = parse_csv_option(metrics)
+         if not metric_list:
+             print_cli_error(console, "평가 메트릭을 지정하세요.")
+             raise typer.Exit(1)
+
+         result = asyncio.run(
+             experiment.run_comparison(
+                 dataset=ds,
+                 baseline_retriever=baseline,
+                 graph_retriever=graph_adapter,
+                 metrics=metric_list,
+                 llm=llm_adapter,
+                 retriever_top_k=retriever_top_k,
+                 graph_max_hops=graph_max_hops,
+                 graph_max_nodes=graph_max_nodes,
+             )
+         )
+
+         artifacts_path = _write_graph_rag_artifacts(
+             result=result,
+             dataset=ds,
+             graph_retriever=graph_adapter,
+             artifact_root=artifact_dir,
+         )
+         console.print(f"[green]Saved GraphRAG artifacts:[/green] {artifacts_path}")
+
+         if db_path is not None:
+             storage = SQLiteStorageAdapter(db_path=db_path)
+             storage.save_run(result.baseline_run)
+             storage.save_run(result.graph_run)
+             console.print(f"[green]Saved baseline run:[/green] {result.baseline_run.run_id}")
+             console.print(f"[green]Saved graph run:[/green] {result.graph_run.run_id}")
+
+         _render_comparison_table(console, result)
+
+         if output:
+             payload = _build_output_payload(result, doc_ids)
+             output.parent.mkdir(parents=True, exist_ok=True)
+             with output.open("w", encoding="utf-8") as f:
+                 json.dump(payload, f, ensure_ascii=False, indent=2, default=str)
+             console.print(f"[green]Saved output:[/green] {output}")
+
+     return app
+
+
+ def _build_baseline_retriever(
+     mode: str,
+     *,
+     documents: list[str],
+     settings: Settings,
+     profile_name: str | None,
+ ) -> RetrieverPort | None:
+     if mode in {"bm25", "hybrid"}:
+         toolkit = try_create_korean_toolkit()
+         if toolkit is None:
+             return None
+         return toolkit.build_retriever(documents, use_hybrid=mode == "hybrid", verbose=False)
+     return _build_dense_retriever(
+         documents=documents,
+         settings=settings,
+         profile_name=profile_name,
+     )
+
+
+ def _render_comparison_table(console: Console, result: Any) -> None:
+     table = Table(show_header=True, header_style="bold cyan")
+     table.add_column("Metric")
+     table.add_column("Baseline", justify="right")
+     table.add_column("Graph", justify="right")
+     table.add_column("Diff%", justify="right")
+     table.add_column("P-Value", justify="right")
+
+     for comp in result.comparisons:
+         table.add_row(
+             comp.metric,
+             f"{comp.mean_a:.3f}",
+             f"{comp.mean_b:.3f}",
+             f"{comp.diff_percent:+.1f}%",
+             f"{comp.p_value:.4f}",
+         )
+
+     console.print("\n[bold]GraphRAG Comparison[/bold]\n")
+     console.print(table)
+     console.print()
+
+
+ def _build_output_payload(result: Any, doc_ids: list[str]) -> dict[str, Any]:
+     return {
+         "baseline": result.baseline_run.to_summary_dict(),
+         "graph": result.graph_run.to_summary_dict(),
+         "comparisons": [asdict(comp) for comp in result.comparisons],
+         "graph_contexts": result.graph_contexts,
+         "graph_subgraphs": {key: asdict(value) for key, value in result.graph_subgraphs.items()},
+         "retriever_doc_ids": doc_ids,
+     }
+
+
+ def _write_graph_rag_artifacts(
+     *,
+     result: Any,
+     dataset: Any,
+     graph_retriever: GraphRAGAdapter,
+     artifact_root: Path,
+ ) -> Path:
+     run_id = result.graph_run.run_id
+     base_dir = artifact_root / f"analysis_{run_id}"
+     graph_dir = base_dir / "graph_subgraphs"
+     entity_dir = base_dir / "entity_extraction"
+     graph_dir.mkdir(parents=True, exist_ok=True)
+     entity_dir.mkdir(parents=True, exist_ok=True)
+
+     graph_index: dict[str, str] = {}
+     for case_id, subgraph in result.graph_subgraphs.items():
+         safe_id = _safe_filename(case_id)
+         file_name = f"{safe_id}_subgraph.json"
+         file_path = graph_dir / file_name
+         with file_path.open("w", encoding="utf-8") as handle:
+             json.dump(asdict(subgraph), handle, ensure_ascii=False, indent=2)
+         graph_index[case_id] = str(Path("graph_subgraphs") / file_name)
+
+     entities_payload: dict[str, list[dict[str, object]]] = {}
+     for case in dataset.test_cases:
+         entities = graph_retriever.extract_entities(case.question)
+         entities_payload[case.id] = [asdict(entity) for entity in entities]
+     entities_path = entity_dir / "entities.json"
+     with entities_path.open("w", encoding="utf-8") as handle:
+         json.dump(entities_payload, handle, ensure_ascii=False, indent=2)
+
+     index_payload = {
+         "graph_subgraphs": graph_index,
+         "entity_extraction": str(Path("entity_extraction") / "entities.json"),
+     }
+     with (base_dir / "index.json").open("w", encoding="utf-8") as handle:
+         json.dump(index_payload, handle, ensure_ascii=False, indent=2)
+
+     return base_dir
+
+
+ def _safe_filename(value: str) -> str:
+     return value.replace("/", "_").replace("\\", "_").replace(" ", "_")
+
+
+ __all__ = ["create_graph_rag_app"]
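Note: _write_graph_rag_artifacts lays artifacts out under &lt;artifact-dir&gt;/analysis_&lt;run_id&gt;/, with per-case subgraph JSON files listed in index.json and question entities in entity_extraction/entities.json. A minimal reader sketch, assuming the default --artifact-dir and a placeholder run ID:

import json
from pathlib import Path

# Placeholder run ID; the directory name format comes from _write_graph_rag_artifacts above.
base_dir = Path("reports/analysis/artifacts") / "analysis_<run_id>"

index = json.loads((base_dir / "index.json").read_text(encoding="utf-8"))

# Per-case subgraphs are stored as relative paths keyed by test-case ID.
for case_id, rel_path in index["graph_subgraphs"].items():
    subgraph = json.loads((base_dir / rel_path).read_text(encoding="utf-8"))
    print(case_id, sorted(subgraph))

# Entities extracted from each question via GraphRAGAdapter.extract_entities().
entities = json.loads((base_dir / index["entity_extraction"]).read_text(encoding="utf-8"))
print(len(entities), "cases with extracted entities")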
@@ -28,7 +28,7 @@ def register_history_commands(app: typer.Typer, console: Console) -> None:
              10,
              "--limit",
              "-n",
-             help="Maximum number of runs to show.",
+             help="Maximum number of runs to show (default: 10).",
          ),
          dataset: str | None = typer.Option(
              None,
@@ -13,6 +13,13 @@ from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
  from evalvault.adapters.outbound.analysis.statistical_adapter import (
      StatisticalAnalysisAdapter,
  )
+ from evalvault.adapters.outbound.report.ci_report_formatter import (
+     CIGateMetricRow,
+     format_ci_regression_report,
+ )
+ from evalvault.adapters.outbound.report.pr_comment_formatter import (
+     format_ci_gate_pr_comment,
+ )
  from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
  from evalvault.domain.services.regression_gate_service import (
      RegressionGateReport,
@@ -25,13 +32,14 @@ from ..utils.options import db_option
  from ..utils.validators import parse_csv_option, validate_choice


- def _coerce_test_type(value: Literal["t-test", "mann-whitney"]) -> TestType:
+ def _coerce_test_type(value: str) -> TestType:
      if value == "t-test":
          return "t-test"
      return "mann-whitney"


  OutputFormat = Literal["table", "json", "github-actions"]
+ CIGateOutputFormat = Literal["github", "gitlab", "json", "pr-comment"]


  def _format_timestamp(value: datetime) -> str:
@@ -200,6 +208,166 @@ def register_regress_commands(app: typer.Typer, console: Console) -> None:
          if report.regression_detected:
              raise typer.Exit(2)

+     @app.command(name="ci-gate")
+     def ci_gate(
+         baseline_run_id: str = typer.Argument(..., help="Baseline run ID."),
+         current_run_id: str = typer.Argument(..., help="Current run ID."),
+         regression_threshold: float = typer.Option(
+             0.05,
+             "--regression-threshold",
+             help="Fail if regression rate exceeds this threshold (default: 0.05).",
+         ),
+         output_format: str = typer.Option(
+             "github",
+             "--format",
+             "-f",
+             help="Output format: github, gitlab, json, or pr-comment.",
+         ),
+         fail_on_regression: bool = typer.Option(
+             True,
+             "--fail-on-regression/--no-fail-on-regression",
+             help="Fail the command when regression rate exceeds threshold.",
+         ),
+         db_path: Path | None = db_option(default=None, help_text="Database path"),
+     ) -> None:
+         """CI/CD 파이프라인용 회귀 게이트 체크."""
+         started_at = datetime.now(UTC)
+         if db_path is None:
+             console.print("[red]Error:[/red] Database path is not configured.")
+             raise typer.Exit(1)
+
+         validate_choice(
+             output_format,
+             ["github", "gitlab", "json", "pr-comment"],
+             console,
+             value_label="format",
+         )
+
+         storage = SQLiteStorageAdapter(db_path=db_path)
+         analysis_adapter = StatisticalAnalysisAdapter()
+         service = RegressionGateService(storage=storage, analysis_adapter=analysis_adapter)
+
+         try:
+             current_run = storage.get_run(current_run_id)
+             storage.get_run(baseline_run_id)
+             report = service.run_gate(
+                 current_run_id,
+                 baseline_run_id,
+             )
+         except KeyError as exc:
+             finished_at = datetime.now(UTC)
+             duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+             payload = _build_envelope(
+                 report=None,
+                 status="error",
+                 started_at=started_at,
+                 finished_at=finished_at,
+                 duration_ms=duration_ms,
+                 message=str(exc),
+                 error_type=type(exc).__name__,
+             )
+             if output_format == "json":
+                 console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+             else:
+                 console.print(f"[red]Error:[/red] {exc}")
+             raise typer.Exit(3) from exc
+         except (ValueError, RuntimeError) as exc:
+             finished_at = datetime.now(UTC)
+             duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+             payload = _build_envelope(
+                 report=None,
+                 status="error",
+                 started_at=started_at,
+                 finished_at=finished_at,
+                 duration_ms=duration_ms,
+                 message=str(exc),
+                 error_type=type(exc).__name__,
+             )
+             if output_format == "json":
+                 console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+             else:
+                 console.print(f"[red]Error:[/red] {exc}")
+             raise typer.Exit(1) from exc
+
+         thresholds = dict.fromkeys(current_run.metrics_evaluated, 0.7)
+         thresholds.update(current_run.thresholds or {})
+
+         rows: list[CIGateMetricRow] = []
+         threshold_failures = []
+         regressed_metrics = []
+         for result in report.results:
+             avg_score = current_run.get_avg_score(result.metric)
+             threshold = thresholds.get(result.metric, 0.7)
+             threshold_passed = avg_score is not None and avg_score >= threshold
+             if not threshold_passed:
+                 threshold_failures.append(result.metric)
+             if result.regression:
+                 regressed_metrics.append(result.metric)
+             if result.regression:
+                 status = "⚠️"
+             elif threshold_passed:
+                 status = "✅"
+             else:
+                 status = "❌"
+             rows.append(
+                 CIGateMetricRow(
+                     metric=result.metric,
+                     baseline_score=result.baseline_score,
+                     current_score=result.candidate_score,
+                     change_percent=result.diff_percent,
+                     status=status,
+                 )
+             )
+
+         regression_rate = len(regressed_metrics) / len(report.results) if report.results else 0.0
+         all_thresholds_passed = not threshold_failures
+         gate_passed = all_thresholds_passed and regression_rate < regression_threshold
+
+         finished_at = datetime.now(UTC)
+         duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+         payload = {
+             "baseline_run_id": baseline_run_id,
+             "current_run_id": current_run_id,
+             "gate_passed": gate_passed,
+             "all_thresholds_passed": all_thresholds_passed,
+             "regression_rate": regression_rate,
+             "regression_threshold": regression_threshold,
+             "regressed_metrics": regressed_metrics,
+             "threshold_failures": threshold_failures,
+             "started_at": _format_timestamp(started_at),
+             "finished_at": _format_timestamp(finished_at),
+             "duration_ms": duration_ms,
+             "report": report.to_dict(),
+         }
+
+         if output_format == "json":
+             console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+         elif output_format == "pr-comment":
+             markdown = format_ci_gate_pr_comment(
+                 rows,
+                 baseline_run_id=baseline_run_id,
+                 current_run_id=current_run_id,
+                 regression_rate=regression_rate,
+                 regression_threshold=regression_threshold,
+                 gate_passed=gate_passed,
+                 threshold_failures=threshold_failures,
+                 regressed_metrics=regressed_metrics,
+             )
+             console.print(markdown)
+         else:
+             markdown = format_ci_regression_report(
+                 rows,
+                 regression_rate=regression_rate,
+                 regression_threshold=regression_threshold,
+                 gate_passed=gate_passed,
+             )
+             console.print(markdown)
+
+         if not all_thresholds_passed:
+             raise typer.Exit(1)
+         if not gate_passed and fail_on_regression:
+             raise typer.Exit(2)
+

  def _render_table(report: RegressionGateReport, console: Console) -> None:
      console.print(f"\n[bold]Regression Gate Check: {report.candidate_run_id}[/bold]\n")