evalvault 1.70.1__py3-none-any.whl → 1.72.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +367 -3
- evalvault/adapters/inbound/api/main.py +17 -1
- evalvault/adapters/inbound/api/routers/calibration.py +133 -0
- evalvault/adapters/inbound/api/routers/runs.py +71 -1
- evalvault/adapters/inbound/cli/commands/__init__.py +2 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +1 -0
- evalvault/adapters/inbound/cli/commands/compare.py +1 -1
- evalvault/adapters/inbound/cli/commands/experiment.py +27 -1
- evalvault/adapters/inbound/cli/commands/graph_rag.py +303 -0
- evalvault/adapters/inbound/cli/commands/history.py +1 -1
- evalvault/adapters/inbound/cli/commands/regress.py +169 -1
- evalvault/adapters/inbound/cli/commands/run.py +225 -1
- evalvault/adapters/inbound/cli/commands/run_helpers.py +57 -0
- evalvault/adapters/outbound/analysis/network_analyzer_module.py +17 -4
- evalvault/adapters/outbound/dataset/__init__.py +6 -0
- evalvault/adapters/outbound/dataset/multiturn_json_loader.py +111 -0
- evalvault/adapters/outbound/report/__init__.py +6 -0
- evalvault/adapters/outbound/report/ci_report_formatter.py +43 -0
- evalvault/adapters/outbound/report/dashboard_generator.py +24 -9
- evalvault/adapters/outbound/report/pr_comment_formatter.py +50 -0
- evalvault/adapters/outbound/retriever/__init__.py +8 -0
- evalvault/adapters/outbound/retriever/graph_rag_adapter.py +326 -0
- evalvault/adapters/outbound/storage/base_sql.py +291 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +130 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +60 -0
- evalvault/adapters/outbound/storage/schema.sql +63 -0
- evalvault/adapters/outbound/storage/sqlite_adapter.py +107 -0
- evalvault/domain/entities/__init__.py +20 -0
- evalvault/domain/entities/graph_rag.py +30 -0
- evalvault/domain/entities/multiturn.py +78 -0
- evalvault/domain/metrics/__init__.py +10 -0
- evalvault/domain/metrics/multiturn_metrics.py +113 -0
- evalvault/domain/metrics/registry.py +36 -0
- evalvault/domain/services/__init__.py +8 -0
- evalvault/domain/services/evaluator.py +5 -2
- evalvault/domain/services/graph_rag_experiment.py +155 -0
- evalvault/domain/services/multiturn_evaluator.py +187 -0
- evalvault/ports/inbound/__init__.py +2 -0
- evalvault/ports/inbound/multiturn_port.py +23 -0
- evalvault/ports/inbound/web_port.py +4 -0
- evalvault/ports/outbound/graph_retriever_port.py +24 -0
- evalvault/ports/outbound/storage_port.py +25 -0
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/METADATA +1 -1
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/RECORD +47 -33
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/WHEEL +0 -0
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.70.1.dist-info → evalvault-1.72.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/cli/commands/__init__.py

@@ -23,6 +23,7 @@ from .domain import create_domain_app
 from .experiment import register_experiment_commands
 from .gate import register_gate_commands
 from .generate import register_generate_commands
+from .graph_rag import create_graph_rag_app
 from .history import register_history_commands
 from .init import register_init_command
 from .kg import create_kg_app

@@ -82,6 +83,7 @@ SUB_APPLICATIONS: tuple[SubAppModule, ...] = (
     SubAppModule("kg", create_kg_app),
     SubAppModule("domain", create_domain_app),
     SubAppModule("benchmark", create_benchmark_app),
+    SubAppModule("graphrag", create_graph_rag_app),
     SubAppModule("method", create_method_app),
     SubAppModule("ops", create_ops_app),
     SubAppModule("phoenix", create_phoenix_app),
evalvault/adapters/inbound/cli/commands/analyze.py

@@ -298,6 +298,7 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
         _console.print(f"\n[green]리포트 생성: {report}[/green]")
 
     @app.command(name="analyze-compare")
+    @app.command(name="compare-analysis")
     def analyze_compare(
         run_id1: str = typer.Argument(..., help="첫 번째 Run ID"),
         run_id2: str = typer.Argument(..., help="두 번째 Run ID"),
evalvault/adapters/inbound/cli/commands/experiment.py

@@ -12,7 +12,7 @@ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdap
 from evalvault.domain.services.experiment_manager import ExperimentManager
 
 from ..utils.options import db_option
-from ..utils.validators import parse_csv_option
+from ..utils.validators import parse_csv_option, validate_choice
 
 
 def register_experiment_commands(app: typer.Typer, console: Console) -> None:

@@ -29,10 +29,24 @@ def register_experiment_commands(app: typer.Typer, console: Console) -> None:
             "-m",
             help="Comma-separated list of metrics to compare.",
         ),
+        control_retriever: str | None = typer.Option(
+            None,
+            "--control-retriever",
+            help="Control retriever (bm25, dense, hybrid, graphrag).",
+        ),
+        variant_retriever: str | None = typer.Option(
+            None,
+            "--variant-retriever",
+            help="Variant retriever (bm25, dense, hybrid, graphrag).",
+        ),
         db_path: Path = db_option(help_text="Path to database file."),
     ) -> None:
         """Create a new experiment for A/B testing."""
 
+        for retriever_name in (control_retriever, variant_retriever):
+            if retriever_name:
+                validate_choice(retriever_name, ["bm25", "dense", "hybrid", "graphrag"], console)
+
         console.print("\n[bold]Creating Experiment[/bold]\n")
         storage = SQLiteStorageAdapter(db_path=db_path)
         manager = ExperimentManager(storage)

@@ -44,6 +58,18 @@ def register_experiment_commands(app: typer.Typer, console: Console) -> None:
             hypothesis=hypothesis,
             metrics=metric_list,
         )
+        if control_retriever:
+            manager.add_group_to_experiment(
+                experiment.experiment_id,
+                "control",
+                f"retriever={control_retriever}",
+            )
+        if variant_retriever:
+            manager.add_group_to_experiment(
+                experiment.experiment_id,
+                "variant",
+                f"retriever={variant_retriever}",
+            )
         console.print(f"[green]Created experiment:[/green] {experiment.experiment_id}")
         console.print(f"  Name: {experiment.name}")
         console.print(f"  Status: {experiment.status}")
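The new options store the retriever choice on each experiment group as a plain `retriever=<name>` settings string (see the hunk above). Below is a minimal sketch of reading that convention back out of a stored group settings string; `parse_retriever_setting` is a hypothetical helper for illustration, not part of the package:

```python
def parse_retriever_setting(group_settings: str) -> str | None:
    """Return the retriever name from a 'retriever=<name>' settings string, if present."""
    for part in group_settings.split(","):
        key, _, value = part.strip().partition("=")
        if key == "retriever" and value:
            return value
    return None


# The experiment command writes f"retriever={control_retriever}" per group, so:
assert parse_retriever_setting("retriever=graphrag") == "graphrag"
assert parse_retriever_setting("retriever=bm25") == "bm25"
```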
evalvault/adapters/inbound/cli/commands/graph_rag.py (new file)

@@ -0,0 +1,303 @@
+"""GraphRAG experiment commands for the EvalVault CLI."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+from dataclasses import asdict
+from pathlib import Path
+from typing import Any
+
+import typer
+from rich.console import Console
+from rich.table import Table
+
+from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
+from evalvault.adapters.outbound.dataset import get_loader
+from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
+from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
+from evalvault.adapters.outbound.retriever.graph_rag_adapter import GraphRAGAdapter
+from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
+from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.services.analysis_service import AnalysisService
+from evalvault.domain.services.evaluator import RagasEvaluator
+from evalvault.domain.services.graph_rag_experiment import GraphRAGExperiment
+from evalvault.ports.outbound.korean_nlp_port import RetrieverPort
+
+from ..utils.console import print_cli_error
+from ..utils.options import db_option, profile_option
+from ..utils.validators import parse_csv_option, validate_choice
+from .run import _build_dense_retriever
+from .run_helpers import _is_oss_open_model, load_knowledge_graph, load_retriever_documents
+
+
+def create_graph_rag_app(console: Console) -> typer.Typer:
+    app = typer.Typer(name="graphrag", help="GraphRAG experiment utilities.")
+
+    @app.command("compare")
+    def graphrag_compare(
+        dataset: Path = typer.Argument(
+            ...,
+            help="Path to dataset file (CSV, Excel, or JSON).",
+            exists=True,
+            readable=True,
+        ),
+        metrics: str = typer.Option(
+            "faithfulness,answer_relevancy",
+            "--metrics",
+            "-m",
+            help="Comma-separated list of metrics to evaluate.",
+        ),
+        baseline_retriever: str = typer.Option(
+            "bm25",
+            "--baseline-retriever",
+            help="Baseline retriever (bm25, dense, hybrid).",
+        ),
+        retriever_docs: Path = typer.Option(
+            ...,
+            "--retriever-docs",
+            help="Retriever documents file (.json/.jsonl/.txt).",
+            exists=True,
+            readable=True,
+        ),
+        kg_path: Path = typer.Option(
+            ...,
+            "--kg",
+            "-k",
+            help="Knowledge graph JSON file for GraphRAG.",
+            exists=True,
+            readable=True,
+        ),
+        retriever_top_k: int = typer.Option(
+            5,
+            "--retriever-top-k",
+            help="Retriever top-k to fill contexts.",
+        ),
+        graph_max_hops: int = typer.Option(
+            2,
+            "--graph-max-hops",
+            help="GraphRAG max hop depth.",
+        ),
+        graph_max_nodes: int = typer.Option(
+            20,
+            "--graph-max-nodes",
+            help="GraphRAG max nodes in subgraph.",
+        ),
+        model: str | None = typer.Option(
+            None,
+            "--model",
+            help="Model to use for evaluation (overrides profile).",
+        ),
+        db_path: Path | None = db_option(help_text="DB 경로 (저장 시 사용)."),
+        profile: str | None = profile_option(help_text="LLM 프로필"),
+        output: Path | None = typer.Option(
+            None,
+            "--output",
+            "-o",
+            help="JSON 출력 파일 경로",
+        ),
+        artifact_dir: Path = typer.Option(
+            Path("reports/analysis/artifacts"),
+            "--artifact-dir",
+            help="GraphRAG 아티팩트 저장 경로",
+        ),
+    ) -> None:
+        validate_choice(baseline_retriever, ["bm25", "dense", "hybrid"], console)
+
+        settings = Settings()
+        profile_name = profile or settings.evalvault_profile
+        if profile_name:
+            settings = apply_profile(settings, profile_name)
+
+        if model:
+            if _is_oss_open_model(model) and settings.llm_provider != "vllm":
+                settings.llm_provider = "ollama"
+                settings.ollama_model = model
+            elif settings.llm_provider == "ollama":
+                settings.ollama_model = model
+            elif settings.llm_provider == "vllm":
+                settings.vllm_model = model
+            else:
+                settings.openai_model = model
+
+        if settings.llm_provider == "openai" and not settings.openai_api_key:
+            print_cli_error(console, "OPENAI_API_KEY가 설정되지 않았습니다.")
+            raise typer.Exit(1)
+
+        try:
+            llm_adapter = get_llm_adapter(settings)
+        except Exception as exc:
+            print_cli_error(console, "LLM 어댑터 초기화에 실패했습니다.", details=str(exc))
+            raise typer.Exit(1) from exc
+
+        loader = get_loader(dataset)
+        ds = loader.load(dataset)
+
+        documents, doc_ids = load_retriever_documents(retriever_docs)
+        baseline = _build_baseline_retriever(
+            baseline_retriever,
+            documents=documents,
+            settings=settings,
+            profile_name=profile_name,
+        )
+        if baseline is None:
+            print_cli_error(console, "Baseline retriever 초기화에 실패했습니다.")
+            raise typer.Exit(1)
+
+        kg_graph = load_knowledge_graph(kg_path)
+        graph_adapter = GraphRAGAdapter(kg_graph)
+
+        korean_toolkit = try_create_korean_toolkit()
+        evaluator = RagasEvaluator(
+            korean_toolkit=korean_toolkit,
+            llm_factory=SettingsLLMFactory(settings),
+        )
+        analysis_service = AnalysisService(analysis_adapter=StatisticalAnalysisAdapter())
+        experiment = GraphRAGExperiment(
+            evaluator=evaluator,
+            analysis_service=analysis_service,
+        )
+
+        metric_list = parse_csv_option(metrics)
+        if not metric_list:
+            print_cli_error(console, "평가 메트릭을 지정하세요.")
+            raise typer.Exit(1)
+
+        result = asyncio.run(
+            experiment.run_comparison(
+                dataset=ds,
+                baseline_retriever=baseline,
+                graph_retriever=graph_adapter,
+                metrics=metric_list,
+                llm=llm_adapter,
+                retriever_top_k=retriever_top_k,
+                graph_max_hops=graph_max_hops,
+                graph_max_nodes=graph_max_nodes,
+            )
+        )
+
+        artifacts_path = _write_graph_rag_artifacts(
+            result=result,
+            dataset=ds,
+            graph_retriever=graph_adapter,
+            artifact_root=artifact_dir,
+        )
+        console.print(f"[green]Saved GraphRAG artifacts:[/green] {artifacts_path}")
+
+        if db_path is not None:
+            storage = SQLiteStorageAdapter(db_path=db_path)
+            storage.save_run(result.baseline_run)
+            storage.save_run(result.graph_run)
+            console.print(f"[green]Saved baseline run:[/green] {result.baseline_run.run_id}")
+            console.print(f"[green]Saved graph run:[/green] {result.graph_run.run_id}")
+
+        _render_comparison_table(console, result)
+
+        if output:
+            payload = _build_output_payload(result, doc_ids)
+            output.parent.mkdir(parents=True, exist_ok=True)
+            with output.open("w", encoding="utf-8") as f:
+                json.dump(payload, f, ensure_ascii=False, indent=2, default=str)
+            console.print(f"[green]Saved output:[/green] {output}")
+
+    return app
+
+
+def _build_baseline_retriever(
+    mode: str,
+    *,
+    documents: list[str],
+    settings: Settings,
+    profile_name: str | None,
+) -> RetrieverPort | None:
+    if mode in {"bm25", "hybrid"}:
+        toolkit = try_create_korean_toolkit()
+        if toolkit is None:
+            return None
+        return toolkit.build_retriever(documents, use_hybrid=mode == "hybrid", verbose=False)
+    return _build_dense_retriever(
+        documents=documents,
+        settings=settings,
+        profile_name=profile_name,
+    )
+
+
+def _render_comparison_table(console: Console, result: Any) -> None:
+    table = Table(show_header=True, header_style="bold cyan")
+    table.add_column("Metric")
+    table.add_column("Baseline", justify="right")
+    table.add_column("Graph", justify="right")
+    table.add_column("Diff%", justify="right")
+    table.add_column("P-Value", justify="right")
+
+    for comp in result.comparisons:
+        table.add_row(
+            comp.metric,
+            f"{comp.mean_a:.3f}",
+            f"{comp.mean_b:.3f}",
+            f"{comp.diff_percent:+.1f}%",
+            f"{comp.p_value:.4f}",
+        )
+
+    console.print("\n[bold]GraphRAG Comparison[/bold]\n")
+    console.print(table)
+    console.print()
+
+
+def _build_output_payload(result: Any, doc_ids: list[str]) -> dict[str, Any]:
+    return {
+        "baseline": result.baseline_run.to_summary_dict(),
+        "graph": result.graph_run.to_summary_dict(),
+        "comparisons": [asdict(comp) for comp in result.comparisons],
+        "graph_contexts": result.graph_contexts,
+        "graph_subgraphs": {key: asdict(value) for key, value in result.graph_subgraphs.items()},
+        "retriever_doc_ids": doc_ids,
+    }
+
+
+def _write_graph_rag_artifacts(
+    *,
+    result: Any,
+    dataset: Any,
+    graph_retriever: GraphRAGAdapter,
+    artifact_root: Path,
+) -> Path:
+    run_id = result.graph_run.run_id
+    base_dir = artifact_root / f"analysis_{run_id}"
+    graph_dir = base_dir / "graph_subgraphs"
+    entity_dir = base_dir / "entity_extraction"
+    graph_dir.mkdir(parents=True, exist_ok=True)
+    entity_dir.mkdir(parents=True, exist_ok=True)
+
+    graph_index: dict[str, str] = {}
+    for case_id, subgraph in result.graph_subgraphs.items():
+        safe_id = _safe_filename(case_id)
+        file_name = f"{safe_id}_subgraph.json"
+        file_path = graph_dir / file_name
+        with file_path.open("w", encoding="utf-8") as handle:
+            json.dump(asdict(subgraph), handle, ensure_ascii=False, indent=2)
+        graph_index[case_id] = str(Path("graph_subgraphs") / file_name)
+
+    entities_payload: dict[str, list[dict[str, object]]] = {}
+    for case in dataset.test_cases:
+        entities = graph_retriever.extract_entities(case.question)
+        entities_payload[case.id] = [asdict(entity) for entity in entities]
+    entities_path = entity_dir / "entities.json"
+    with entities_path.open("w", encoding="utf-8") as handle:
+        json.dump(entities_payload, handle, ensure_ascii=False, indent=2)
+
+    index_payload = {
+        "graph_subgraphs": graph_index,
+        "entity_extraction": str(Path("entity_extraction") / "entities.json"),
+    }
+    with (base_dir / "index.json").open("w", encoding="utf-8") as handle:
+        json.dump(index_payload, handle, ensure_ascii=False, indent=2)
+
+    return base_dir
+
+
+def _safe_filename(value: str) -> str:
+    return value.replace("/", "_").replace("\\", "_").replace(" ", "_")
+
+
+__all__ = ["create_graph_rag_app"]
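For reference, the `--output` JSON written by `graphrag compare` mirrors `_build_output_payload()` (run summaries plus per-metric comparison rows), and `--artifact-dir` gains an `analysis_<run_id>/` folder laid out by `_write_graph_rag_artifacts()`. A minimal sketch of reading those files back, assuming placeholder paths and the default artifact directory; nothing here is an API of the package itself:

```python
import json
from pathlib import Path

# Placeholder paths: wherever --output (-o) and --artifact-dir pointed.
output_path = Path("graphrag_compare.json")
artifact_root = Path("reports/analysis/artifacts")

# The --output payload carries "comparisons" rows with the fields shown in
# _render_comparison_table: metric, mean_a, mean_b, diff_percent, p_value.
payload = json.loads(output_path.read_text(encoding="utf-8"))
for comp in payload["comparisons"]:
    print(comp["metric"], comp["mean_a"], comp["mean_b"], comp["diff_percent"], comp["p_value"])

# Each comparison also writes analysis_<run_id>/index.json, mapping test-case IDs
# to per-case subgraph dumps, next to entity_extraction/entities.json.
for index_file in sorted(artifact_root.glob("analysis_*/index.json")):
    index = json.loads(index_file.read_text(encoding="utf-8"))
    print(index_file.parent.name, len(index["graph_subgraphs"]), index["entity_extraction"])
```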
evalvault/adapters/inbound/cli/commands/regress.py

@@ -13,6 +13,13 @@ from evalvault.adapters.inbound.cli.utils.analysis_io import write_json
 from evalvault.adapters.outbound.analysis.statistical_adapter import (
     StatisticalAnalysisAdapter,
 )
+from evalvault.adapters.outbound.report.ci_report_formatter import (
+    CIGateMetricRow,
+    format_ci_regression_report,
+)
+from evalvault.adapters.outbound.report.pr_comment_formatter import (
+    format_ci_gate_pr_comment,
+)
 from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
 from evalvault.domain.services.regression_gate_service import (
     RegressionGateReport,
@@ -25,13 +32,14 @@ from ..utils.options import db_option
 from ..utils.validators import parse_csv_option, validate_choice
 
 
-def _coerce_test_type(value:
+def _coerce_test_type(value: str) -> TestType:
     if value == "t-test":
         return "t-test"
     return "mann-whitney"
 
 
 OutputFormat = Literal["table", "json", "github-actions"]
+CIGateOutputFormat = Literal["github", "gitlab", "json", "pr-comment"]
 
 
 def _format_timestamp(value: datetime) -> str:
@@ -200,6 +208,166 @@ def register_regress_commands(app: typer.Typer, console: Console) -> None:
         if report.regression_detected:
             raise typer.Exit(2)
 
+    @app.command(name="ci-gate")
+    def ci_gate(
+        baseline_run_id: str = typer.Argument(..., help="Baseline run ID."),
+        current_run_id: str = typer.Argument(..., help="Current run ID."),
+        regression_threshold: float = typer.Option(
+            0.05,
+            "--regression-threshold",
+            help="Fail if regression rate exceeds this threshold (default: 0.05).",
+        ),
+        output_format: str = typer.Option(
+            "github",
+            "--format",
+            "-f",
+            help="Output format: github, gitlab, json, or pr-comment.",
+        ),
+        fail_on_regression: bool = typer.Option(
+            True,
+            "--fail-on-regression/--no-fail-on-regression",
+            help="Fail the command when regression rate exceeds threshold.",
+        ),
+        db_path: Path | None = db_option(default=None, help_text="Database path"),
+    ) -> None:
+        """CI/CD 파이프라인용 회귀 게이트 체크."""
+        started_at = datetime.now(UTC)
+        if db_path is None:
+            console.print("[red]Error:[/red] Database path is not configured.")
+            raise typer.Exit(1)
+
+        validate_choice(
+            output_format,
+            ["github", "gitlab", "json", "pr-comment"],
+            console,
+            value_label="format",
+        )
+
+        storage = SQLiteStorageAdapter(db_path=db_path)
+        analysis_adapter = StatisticalAnalysisAdapter()
+        service = RegressionGateService(storage=storage, analysis_adapter=analysis_adapter)
+
+        try:
+            current_run = storage.get_run(current_run_id)
+            storage.get_run(baseline_run_id)
+            report = service.run_gate(
+                current_run_id,
+                baseline_run_id,
+            )
+        except KeyError as exc:
+            finished_at = datetime.now(UTC)
+            duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+            payload = _build_envelope(
+                report=None,
+                status="error",
+                started_at=started_at,
+                finished_at=finished_at,
+                duration_ms=duration_ms,
+                message=str(exc),
+                error_type=type(exc).__name__,
+            )
+            if output_format == "json":
+                console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+            else:
+                console.print(f"[red]Error:[/red] {exc}")
+            raise typer.Exit(3) from exc
+        except (ValueError, RuntimeError) as exc:
+            finished_at = datetime.now(UTC)
+            duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+            payload = _build_envelope(
+                report=None,
+                status="error",
+                started_at=started_at,
+                finished_at=finished_at,
+                duration_ms=duration_ms,
+                message=str(exc),
+                error_type=type(exc).__name__,
+            )
+            if output_format == "json":
+                console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+            else:
+                console.print(f"[red]Error:[/red] {exc}")
+            raise typer.Exit(1) from exc
+
+        thresholds = dict.fromkeys(current_run.metrics_evaluated, 0.7)
+        thresholds.update(current_run.thresholds or {})
+
+        rows: list[CIGateMetricRow] = []
+        threshold_failures = []
+        regressed_metrics = []
+        for result in report.results:
+            avg_score = current_run.get_avg_score(result.metric)
+            threshold = thresholds.get(result.metric, 0.7)
+            threshold_passed = avg_score is not None and avg_score >= threshold
+            if not threshold_passed:
+                threshold_failures.append(result.metric)
+            if result.regression:
+                regressed_metrics.append(result.metric)
+            if result.regression:
+                status = "⚠️"
+            elif threshold_passed:
+                status = "✅"
+            else:
+                status = "❌"
+            rows.append(
+                CIGateMetricRow(
+                    metric=result.metric,
+                    baseline_score=result.baseline_score,
+                    current_score=result.candidate_score,
+                    change_percent=result.diff_percent,
+                    status=status,
+                )
+            )
+
+        regression_rate = len(regressed_metrics) / len(report.results) if report.results else 0.0
+        all_thresholds_passed = not threshold_failures
+        gate_passed = all_thresholds_passed and regression_rate < regression_threshold
+
+        finished_at = datetime.now(UTC)
+        duration_ms = int((finished_at - started_at).total_seconds() * 1000)
+        payload = {
+            "baseline_run_id": baseline_run_id,
+            "current_run_id": current_run_id,
+            "gate_passed": gate_passed,
+            "all_thresholds_passed": all_thresholds_passed,
+            "regression_rate": regression_rate,
+            "regression_threshold": regression_threshold,
+            "regressed_metrics": regressed_metrics,
+            "threshold_failures": threshold_failures,
+            "started_at": _format_timestamp(started_at),
+            "finished_at": _format_timestamp(finished_at),
+            "duration_ms": duration_ms,
+            "report": report.to_dict(),
+        }
+
+        if output_format == "json":
+            console.print(json.dumps(payload, ensure_ascii=False, indent=2))
+        elif output_format == "pr-comment":
+            markdown = format_ci_gate_pr_comment(
+                rows,
+                baseline_run_id=baseline_run_id,
+                current_run_id=current_run_id,
+                regression_rate=regression_rate,
+                regression_threshold=regression_threshold,
+                gate_passed=gate_passed,
+                threshold_failures=threshold_failures,
+                regressed_metrics=regressed_metrics,
+            )
+            console.print(markdown)
+        else:
+            markdown = format_ci_regression_report(
+                rows,
+                regression_rate=regression_rate,
+                regression_threshold=regression_threshold,
+                gate_passed=gate_passed,
+            )
+            console.print(markdown)
+
+        if not all_thresholds_passed:
+            raise typer.Exit(1)
+        if not gate_passed and fail_on_regression:
+            raise typer.Exit(2)
+
 
 def _render_table(report: RegressionGateReport, console: Console) -> None:
     console.print(f"\n[bold]Regression Gate Check: {report.candidate_run_id}[/bold]\n")