evalvault-1.58.0-py3-none-any.whl → evalvault-1.60.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +13 -25
- evalvault/adapters/inbound/api/routers/pipeline.py +64 -0
- evalvault/adapters/inbound/api/routers/runs.py +16 -0
- evalvault/adapters/inbound/cli/app.py +3 -13
- evalvault/adapters/inbound/cli/commands/analyze.py +258 -2
- evalvault/adapters/inbound/cli/commands/config.py +4 -40
- evalvault/adapters/inbound/cli/commands/pipeline.py +9 -9
- evalvault/adapters/inbound/cli/commands/run.py +60 -26
- evalvault/adapters/inbound/cli/utils/analysis_io.py +2 -2
- evalvault/adapters/outbound/analysis/__init__.py +13 -3
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +2 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +2 -1
- evalvault/adapters/outbound/analysis/hypothesis_generator_module.py +359 -0
- evalvault/adapters/outbound/analysis/llm_report_module.py +225 -16
- evalvault/adapters/outbound/analysis/network_analyzer_module.py +250 -0
- evalvault/adapters/outbound/analysis/pipeline_factory.py +3 -0
- evalvault/adapters/outbound/analysis/pipeline_helpers.py +1 -1
- evalvault/adapters/outbound/analysis/priority_summary_module.py +1 -1
- evalvault/adapters/outbound/analysis/retrieval_benchmark_module.py +3 -2
- evalvault/adapters/outbound/analysis/timeseries_advanced_module.py +349 -0
- evalvault/adapters/outbound/benchmark/lm_eval_adapter.py +1 -1
- evalvault/adapters/outbound/improvement/insight_generator.py +13 -10
- evalvault/adapters/outbound/improvement/pattern_detector.py +11 -13
- evalvault/adapters/outbound/improvement/playbook_loader.py +3 -3
- evalvault/adapters/outbound/llm/__init__.py +63 -63
- evalvault/adapters/outbound/llm/instructor_factory.py +101 -7
- evalvault/adapters/outbound/llm/ollama_adapter.py +8 -1
- evalvault/adapters/outbound/llm/token_aware_chat.py +1 -1
- evalvault/adapters/outbound/report/__init__.py +2 -0
- evalvault/adapters/outbound/report/dashboard_generator.py +197 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +1 -1
- evalvault/adapters/outbound/tracer/open_rag_log_handler.py +3 -3
- evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py +3 -3
- evalvault/adapters/outbound/tracer/open_rag_trace_helpers.py +4 -4
- evalvault/config/settings.py +10 -0
- evalvault/domain/entities/analysis_pipeline.py +13 -3
- evalvault/domain/metrics/analysis_registry.py +217 -0
- evalvault/domain/metrics/registry.py +185 -0
- evalvault/domain/services/analysis_service.py +3 -3
- evalvault/domain/services/evaluator.py +1 -1
- evalvault/domain/services/pipeline_template_registry.py +205 -23
- evalvault/domain/services/visual_space_service.py +1 -1
- evalvault/ports/inbound/web_port.py +4 -0
- {evalvault-1.58.0.dist-info → evalvault-1.60.0.dist-info}/METADATA +10 -4
- {evalvault-1.58.0.dist-info → evalvault-1.60.0.dist-info}/RECORD +48 -42
- {evalvault-1.58.0.dist-info → evalvault-1.60.0.dist-info}/WHEEL +0 -0
- {evalvault-1.58.0.dist-info → evalvault-1.60.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.58.0.dist-info → evalvault-1.60.0.dist-info}/licenses/LICENSE.md +0 -0
evalvault/adapters/inbound/api/adapter.py

@@ -15,6 +15,13 @@ from urllib.request import urlopen
 from evalvault.config.phoenix_support import PhoenixExperimentResolver
 from evalvault.config.settings import Settings
 from evalvault.domain.entities.prompt import PromptSetBundle
+from evalvault.domain.metrics.registry import (
+    get_metric_descriptions as registry_metric_descriptions,
+)
+from evalvault.domain.metrics.registry import (
+    list_metric_names,
+    list_metric_specs,
+)
 from evalvault.domain.services.cluster_map_builder import build_cluster_map
 from evalvault.domain.services.prompt_registry import (
     PromptInput,
@@ -42,21 +49,6 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
-# 지원하는 메트릭 목록
-AVAILABLE_METRICS = [
-    "faithfulness",
-    "answer_relevancy",
-    "context_precision",
-    "context_recall",
-    "factual_correctness",
-    "semantic_similarity",
-    "summary_score",
-    "summary_faithfulness",
-    "insurance_term_accuracy",
-    "entity_preservation",
-    "contextual_relevancy",
-]
-
 
 @dataclass
 class GateResult:
@@ -978,19 +970,15 @@ class WebUIAdapter:
 
     def get_available_metrics(self) -> list[str]:
         """사용 가능한 메트릭 목록 반환."""
-        return AVAILABLE_METRICS
+        return list_metric_names()
+
+    def get_metric_specs(self) -> list[dict[str, object]]:
+        """메트릭 스펙 목록 반환."""
+        return [spec.to_dict() for spec in list_metric_specs()]
 
     def get_metric_descriptions(self) -> dict[str, str]:
         """메트릭별 설명 반환."""
-        return {
-            "faithfulness": "답변이 컨텍스트에 충실한지 평가",
-            "answer_relevancy": "답변이 질문과 관련있는지 평가",
-            "context_precision": "검색된 컨텍스트의 정밀도 평가",
-            "context_recall": "필요한 정보가 검색되었는지 평가",
-            "factual_correctness": "ground_truth 대비 사실적 정확성 평가",
-            "semantic_similarity": "답변과 ground_truth 간 의미적 유사도 평가",
-            "insurance_term_accuracy": "보험 용어 정확성 평가",
-        }
+        return registry_metric_descriptions()
 
     def create_dataset_from_upload(
         self,
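The registry module these call sites rely on (evalvault/domain/metrics/registry.py, +185 lines) is new in 1.60.0 and its body is not part of this diff. Judging only from the calls above and the MetricSpecResponse model further down, a minimal sketch of the shape it must expose might look like this; field defaults and the example entries are assumptions, not the real contents:

    from dataclasses import asdict, dataclass


    @dataclass(frozen=True)
    class MetricSpec:
        """Sketch only; field names mirror MetricSpecResponse in routers/runs.py."""

        name: str
        description: str
        requires_ground_truth: bool = False
        requires_embeddings: bool = False
        source: str = "ragas"          # hypothetical default
        category: str = "rag"          # hypothetical default
        signal_group: str = "quality"  # hypothetical default

        def to_dict(self) -> dict[str, object]:
            return asdict(self)


    # Hypothetical entries; the real registry defines the full metric set.
    _SPECS: list[MetricSpec] = [
        MetricSpec("faithfulness", "답변이 컨텍스트에 충실한지 평가"),
        MetricSpec("context_recall", "필요한 정보가 검색되었는지 평가", requires_ground_truth=True),
    ]


    def list_metric_specs() -> list[MetricSpec]:
        return list(_SPECS)


    def list_metric_names() -> list[str]:
        return [spec.name for spec in _SPECS]


    def get_metric_descriptions() -> dict[str, str]:
        return {spec.name: spec.description for spec in _SPECS}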
evalvault/adapters/inbound/api/routers/pipeline.py

@@ -11,6 +11,7 @@ from evalvault.adapters.outbound.llm import get_llm_adapter
 from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
 from evalvault.config.settings import get_settings
 from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
+from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs
 from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
 
 router = APIRouter(tags=["pipeline"])
@@ -71,6 +72,54 @@ INTENT_CATALOG = {
         "description": "시간에 따른 추세를 분석합니다.",
         "sample_query": "메트릭 추세를 분석해줘",
     },
+    AnalysisIntent.ANALYZE_STATISTICAL: {
+        "label": "기술 통계량",
+        "category": "analysis",
+        "description": "메트릭별 기초 통계량을 계산합니다.",
+        "sample_query": "기초 통계 분석해줘",
+    },
+    AnalysisIntent.ANALYZE_NLP: {
+        "label": "NLP 분석",
+        "category": "analysis",
+        "description": "질문/답변 텍스트를 분석합니다.",
+        "sample_query": "텍스트 분석해줘",
+    },
+    AnalysisIntent.ANALYZE_CAUSAL: {
+        "label": "인과 관계 분석",
+        "category": "analysis",
+        "description": "요인별 영향도와 인과 관계를 분석합니다.",
+        "sample_query": "인과 관계 분석해줘",
+    },
+    AnalysisIntent.ANALYZE_NETWORK: {
+        "label": "네트워크 분석",
+        "category": "analysis",
+        "description": "메트릭 간 상관관계 네트워크를 분석합니다.",
+        "sample_query": "메트릭 네트워크 분석해줘",
+    },
+    AnalysisIntent.ANALYZE_PLAYBOOK: {
+        "label": "플레이북 분석",
+        "category": "analysis",
+        "description": "개선 플레이북 기반 진단을 수행합니다.",
+        "sample_query": "플레이북으로 분석해줘",
+    },
+    AnalysisIntent.DETECT_ANOMALIES: {
+        "label": "이상 탐지",
+        "category": "timeseries",
+        "description": "시계열 이상 패턴을 탐지합니다.",
+        "sample_query": "이상 탐지해줘",
+    },
+    AnalysisIntent.FORECAST_PERFORMANCE: {
+        "label": "성능 예측",
+        "category": "timeseries",
+        "description": "미래 성능을 예측합니다.",
+        "sample_query": "성능 예측해줘",
+    },
+    AnalysisIntent.GENERATE_HYPOTHESES: {
+        "label": "가설 생성",
+        "category": "generation",
+        "description": "성능 저하 원인에 대한 가설을 생성합니다.",
+        "sample_query": "가설 생성해줘",
+    },
     AnalysisIntent.BENCHMARK_RETRIEVAL: {
         "label": "검색 벤치마크",
         "category": "benchmark",
@@ -172,6 +221,15 @@ class PipelineResultResponse(PipelineResultSummary):
     final_output: dict[str, Any] | None = None
 
 
+class AnalysisMetricSpecResponse(BaseModel):
+    key: str
+    label: str
+    description: str
+    signal_group: str
+    module_id: str
+    output_path: list[str]
+
+
 def _serialize_payload(value: Any) -> Any:
     try:
         return jsonable_encoder(value)
@@ -318,6 +376,12 @@ async def list_intents():
         raise HTTPException(status_code=500, detail=str(exc)) from exc
 
 
+@router.get("/options/analysis-metric-specs", response_model=list[AnalysisMetricSpecResponse])
+async def list_analysis_metric_specs_endpoint():
+    """List analysis metric specs for pipeline outputs."""
+    return [spec.to_dict() for spec in list_analysis_metric_specs()]
+
+
 @router.post("/results", response_model=PipelineResultSummary)
 async def save_pipeline_result(payload: PipelineResultPayload):
     """Save a pipeline analysis result for history."""
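Once the API is running, the new endpoint can be exercised directly. The base URL and the "/api/pipeline" prefix below are assumptions, since this diff only shows APIRouter(tags=["pipeline"]) without a prefix:

    import requests

    # Base URL and router prefix are assumptions; adjust to your deployment.
    resp = requests.get("http://localhost:8000/api/pipeline/options/analysis-metric-specs")
    resp.raise_for_status()
    for spec in resp.json():
        # Fields follow AnalysisMetricSpecResponse above.
        print(spec["key"], spec["signal_group"], "->", "/".join(spec["output_path"]))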
evalvault/adapters/inbound/api/routers/runs.py

@@ -113,6 +113,16 @@ class ModelItemResponse(BaseModel):
     supports_tools: bool | None = None
 
 
+class MetricSpecResponse(BaseModel):
+    name: str
+    description: str
+    requires_ground_truth: bool
+    requires_embeddings: bool
+    source: str
+    category: str
+    signal_group: str
+
+
 class ClusterMapItemResponse(BaseModel):
     test_case_id: str
     cluster_id: str
@@ -395,6 +405,12 @@ def list_metrics(adapter: AdapterDep):
     return adapter.get_available_metrics()
 
 
+@router.get("/options/metric-specs", response_model=list[MetricSpecResponse])
+def list_metric_specs(adapter: AdapterDep):
+    """Get available metrics with metadata."""
+    return adapter.get_metric_specs()
+
+
 @router.get("/options/cluster-maps", response_model=list[ClusterMapFileResponse])
 def list_cluster_maps():
     """List available cluster map CSV files."""
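The runs router gets the same treatment, which lets a frontend decide, for example, which metrics to hide when a dataset has no ground truth. The "/api/runs" prefix here is again an assumption:

    import requests

    specs = requests.get("http://localhost:8000/api/runs/options/metric-specs").json()
    needs_gt = [s["name"] for s in specs if s["requires_ground_truth"]]
    print("Metrics requiring ground truth:", needs_gt)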
evalvault/adapters/inbound/cli/app.py

@@ -14,6 +14,8 @@ import typer
 from rich import print as rprint
 from rich.console import Console
 
+from evalvault.domain.metrics.registry import list_metric_names
+
 from .commands import attach_sub_apps, register_all_commands
 
 
@@ -32,19 +34,7 @@ app = typer.Typer(
 )
 console = Console()
 
-AVAILABLE_METRICS = [
-    "faithfulness",
-    "answer_relevancy",
-    "context_precision",
-    "context_recall",
-    "factual_correctness",
-    "semantic_similarity",
-    "summary_score",
-    "summary_faithfulness",
-    "insurance_term_accuracy",
-    "entity_preservation",
-    "contextual_relevancy",
-]
+AVAILABLE_METRICS = list_metric_names()
 
 register_all_commands(app, console, available_metrics=AVAILABLE_METRICS)
 attach_sub_apps(app, console)
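With this change the CLI, the web adapter, and the config command all read metric availability from the one registry, so adding a spec in registry.py propagates everywhere. A quick check against an installed package:

    from evalvault.domain.metrics.registry import list_metric_names

    # Should match what the API's /options/metrics endpoint returns.
    print(list_metric_names())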
evalvault/adapters/inbound/cli/commands/analyze.py

@@ -12,15 +12,18 @@ from rich.table import Table
 
 from evalvault.adapters.outbound.analysis import (
     CausalAnalysisAdapter,
+    HypothesisGeneratorModule,
+    NetworkAnalyzerModule,
     NLPAnalysisAdapter,
     StatisticalAnalysisAdapter,
+    TimeSeriesAdvancedModule,
 )
 from evalvault.adapters.outbound.analysis.pipeline_factory import (
     build_analysis_pipeline_service,
 )
 from evalvault.adapters.outbound.cache import MemoryCacheAdapter
 from evalvault.adapters.outbound.llm import get_llm_adapter
-from evalvault.adapters.outbound.report import MarkdownReportAdapter
+from evalvault.adapters.outbound.report import DashboardGenerator, MarkdownReportAdapter
 from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
 from evalvault.config.phoenix_support import get_phoenix_trace_url
 from evalvault.config.settings import Settings, apply_profile
@@ -64,6 +67,37 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
             "-L",
             help="플레이북 분석에서 LLM 인사이트 생성",
         ),
+        dashboard: bool = typer.Option(False, "--dashboard", help="시각화 대시보드 생성"),
+        dashboard_format: str = typer.Option(
+            "png", "--dashboard-format", help="대시보드 출력 형식 (png, svg, pdf)"
+        ),
+        anomaly_detect: bool = typer.Option(
+            False, "--anomaly-detect", "-A", help="이상치 탐지 실행 (Phase 2)"
+        ),
+        window_size: int = typer.Option(
+            200, "--window-size", "-w", help="이상치 탐지 윈도 크기", min=50, max=500
+        ),
+        forecast: bool = typer.Option(False, "--forecast", "-F", help="성능 예측 실행 (Phase 2)"),
+        forecast_horizon: int = typer.Option(
+            3, "--forecast-horizon", help="예측 범위(런 개수)", min=1, max=10
+        ),
+        network: bool = typer.Option(
+            False, "--network", help="메트릭 상관관계 네트워크 생성 (Phase 3)"
+        ),
+        min_correlation: float = typer.Option(
+            0.5, "--min-correlation", help="네트워크 최소 상관계수", min=0, max=1
+        ),
+        generate_hypothesis: bool = typer.Option(
+            False, "--generate-hypothesis", "-H", help="가설 자동 생성 (Phase 4)"
+        ),
+        hypothesis_method: str = typer.Option(
+            "heuristic",
+            "--hypothesis-method",
+            help="가설 생성 방식 (heuristic, hyporefine, union)",
+        ),
+        num_hypotheses: int = typer.Option(
+            5, "--num-hypotheses", help="생성할 가설 수", min=1, max=20
+        ),
         output: Path | None = typer.Option(None, "--output", "-o", help="JSON 출력 파일"),
         report: Path | None = typer.Option(
             None, "--report", "-r", help="리포트 출력 파일 (*.md 또는 *.html)"
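The new flags can be smoke-tested without a shell session via Typer's test runner; the "analyze" command name and the run ID below are assumptions for illustration:

    from typer.testing import CliRunner

    from evalvault.adapters.inbound.cli.app import app

    runner = CliRunner()
    result = runner.invoke(
        app,
        [
            "analyze", "<run-id>",  # command name and run ID assumed
            "--anomaly-detect", "--window-size", "100",
            "--forecast", "--forecast-horizon", "5",
            "--network", "--min-correlation", "0.6",
        ],
    )
    print(result.stdout)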
@@ -77,6 +111,9 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
         """평가 실행 결과를 분석하고 통계 인사이트를 표시합니다."""
 
         resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            _console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
+            raise typer.Exit(1)
         storage = SQLiteStorageAdapter(db_path=resolved_db_path)
 
         try:
@@ -161,6 +198,97 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
             storage.save_analysis(analysis)
             _console.print(f"\n[green]분석 결과 DB 저장: {resolved_db_path}[/green]")
 
+        if dashboard:
+            dashboard_gen = DashboardGenerator()
+            _console.print("\n[bold cyan]Generating visualization dashboard...[/bold cyan]")
+
+            fig = dashboard_gen.generate_evaluation_dashboard(run_id)
+
+            output_dir = Path("reports/dashboard")
+            output_dir.mkdir(parents=True, exist_ok=True)
+
+            output_path = output_dir / f"dashboard_{run_id[:8]}.{dashboard_format}"
+            fig.savefig(output_path, dpi=300, bbox_inches="tight")
+            _console.print(f"\n[green]Dashboard saved to: {output_path}[/green]")
+
+        if anomaly_detect or forecast:
+            ts_analyzer = TimeSeriesAdvancedModule(window_size=window_size)
+            run_history = storage.list_runs(limit=50)
+
+            if not run_history or len(run_history) < 5:
+                _console.print("[yellow]Need at least 5 runs for time series analysis.[/yellow]")
+            else:
+                if anomaly_detect:
+                    _console.print("\n[bold cyan]Running anomaly detection...[/bold cyan]")
+                    history_data = [
+                        {
+                            "run_id": r.run_id,
+                            "pass_rate": r.pass_rate,
+                            "timestamp": r.started_at,
+                        }
+                        for r in run_history
+                    ]
+                    anomaly_result = ts_analyzer.detect_anomalies(history_data)
+                    _display_anomaly_detection(anomaly_result)
+
+                if forecast:
+                    _console.print("\n[bold cyan]Running performance forecasting...[/bold cyan]")
+                    history_data = [
+                        {"run_id": r.run_id, "pass_rate": r.pass_rate} for r in run_history
+                    ]
+                    forecast_result = ts_analyzer.forecast_performance(
+                        history_data, horizon=forecast_horizon
+                    )
+                    _display_forecast_result(forecast_result)
+
+        if network:
+            _console.print("\n[bold cyan]Building metric correlation network...[/bold cyan]")
+            net_analyzer = NetworkAnalyzerModule()
+
+            if not bundle.statistical or not bundle.statistical.significant_correlations:
+                _console.print("[yellow]No significant correlations for network analysis.[/yellow]")
+            else:
+                correlations_data = [
+                    {
+                        "variable1": corr.variable1,
+                        "variable2": corr.variable2,
+                        "correlation": corr.correlation,
+                        "p_value": corr.p_value,
+                        "is_significant": corr.is_significant,
+                    }
+                    for corr in bundle.statistical.significant_correlations
+                ]
+                graph = net_analyzer.build_correlation_network(
+                    correlations_data, min_correlation=min_correlation
+                )
+                net_result = net_analyzer.analyze_metric_network(graph)
+                _display_network_analysis(net_result)
+
+        if generate_hypothesis:
+            _console.print(
+                f"\n[bold cyan]Generating hypotheses ({hypothesis_method})...[/bold cyan]"
+            )
+            hyp_gen = HypothesisGeneratorModule(
+                method=hypothesis_method, num_hypotheses=num_hypotheses
+            )
+
+            metric_scores = {}
+            for metric_name, stats in analysis.metrics_summary.items():
+                metric_scores[metric_name] = stats.mean
+
+            low_performers_data = [
+                {
+                    "question": lp.test_case_id,
+                    "metric_name": lp.metric_name,
+                }
+                for lp in (analysis.low_performers or [])
+            ]
+
+            hypotheses = hyp_gen.generate_simple_hypotheses(
+                run_id, metric_scores, low_performers_data
+            )
+            _display_hypothesis_generation(hypotheses, hypothesis_method)
+
         if output:
             _export_analysis_json(analysis, output, bundle.nlp if nlp else None, improvement_report)
             _console.print(f"\n[green]분석 결과 내보냄: {output}[/green]")
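TimeSeriesAdvancedModule (new file, +349 lines) is not shown in this diff, so its actual algorithm is unknown. Given the window_size parameter and the {run_id, pass_rate, timestamp} records it consumes, a z-score detector over pass rates is one plausible reading; a minimal self-contained sketch of that idea, not the module's code:

    import statistics

    def detect_anomalies_sketch(history: list[dict], threshold: float = 2.0) -> list[dict]:
        """Flag runs whose pass_rate deviates more than threshold std-devs from the mean."""
        rates = [run["pass_rate"] for run in history]
        mean, stdev = statistics.mean(rates), statistics.pstdev(rates)
        results = []
        for run in history:
            score = abs(run["pass_rate"] - mean) / stdev if stdev else 0.0
            results.append({
                "run_id": run["run_id"],
                "anomaly_score": score,
                "is_anomaly": score > threshold,
                "severity": "high" if score > 3 else "medium" if score > 2 else "low",
            })
        return results

    history = [
        {"run_id": f"run-{i}", "pass_rate": r}
        for i, r in enumerate([0.91, 0.93, 0.90, 0.92, 0.55, 0.94])
    ]
    for item in detect_anomalies_sketch(history):
        if item["is_anomaly"]:
            print(item["run_id"], f"score={item['anomaly_score']:.2f}", item["severity"])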
@@ -192,6 +320,9 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
         """두 실행을 통계적으로 비교합니다."""
 
         resolved_db_path = db_path or Settings().evalvault_db_path
+        if resolved_db_path is None:
+            _console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
+            raise typer.Exit(1)
         storage = SQLiteStorageAdapter(db_path=resolved_db_path)
 
         try:
@@ -220,7 +351,15 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
         _console.print(f"  Phoenix 트레이스: {trace_b}")
         _console.print(f"  검정: {test}\n")
 
-        comparisons = service.compare_runs(run_a, run_b, metrics=metric_list)
+        if test == "t-test":
+            test_type = "t-test"
+        elif test == "mann-whitney":
+            test_type = "mann-whitney"
+        else:
+            _console.print(f"[red]Error: Unsupported test type: {test}[/red]")
+            raise typer.Exit(1)
+
+        comparisons = service.compare_runs(run_a, run_b, metrics=metric_list, test_type=test_type)
 
         if not comparisons:
             _console.print("[yellow]비교할 공통 메트릭이 없습니다.[/yellow]")
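For reference, the two accepted values map onto standard SciPy tests, which is presumably the kind of comparison compare_runs performs (the service's internals are not in this diff); a sketch with made-up per-case scores:

    from scipy import stats

    scores_a = [0.82, 0.79, 0.88, 0.91, 0.76]  # hypothetical metric scores, run A
    scores_b = [0.90, 0.86, 0.93, 0.95, 0.84]  # hypothetical metric scores, run B

    # Parametric: assumes roughly normal score distributions.
    t_stat, t_p = stats.ttest_ind(scores_a, scores_b)
    # Non-parametric rank test: safer for small or skewed samples.
    u_stat, u_p = stats.mannwhitneyu(scores_a, scores_b, alternative="two-sided")

    print(f"t-test p={t_p:.3f}, mann-whitney p={u_p:.3f}")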
@@ -942,6 +1081,123 @@ def _generate_report(
         file.write(content)
 
 
+def _display_anomaly_detection(anomaly_result) -> None:
+    _console.print("\n[bold]Anomaly Detection Results[/bold]")
+    _console.print(f"Detection method: {anomaly_result.detection_method}")
+    _console.print(f"Threshold: {anomaly_result.threshold:.2f}")
+    _console.print(f"Total runs: {anomaly_result.total_runs}")
+
+    if anomaly_result.anomalies:
+        detected = [a for a in anomaly_result.anomalies if a.is_anomaly]
+        if detected:
+            _console.print(f"\n[red]Detected {len(detected)} anomalies:[/red]")
+            table = Table(show_header=True, header_style="bold cyan")
+            table.add_column("Run ID")
+            table.add_column("Score", justify="right")
+            table.add_column("Pass Rate", justify="right")
+            table.add_column("Severity")
+
+            for anomaly in detected[:10]:
+                severity_color = (
+                    "red"
+                    if anomaly.severity == "high"
+                    else "yellow"
+                    if anomaly.severity == "medium"
+                    else "green"
+                )
+                table.add_row(
+                    anomaly.run_id[:12] + "...",
+                    f"{anomaly.anomaly_score:.2f}",
+                    f"{anomaly.pass_rate:.1%}",
+                    f"[{severity_color}]{anomaly.severity}[/{severity_color}]",
+                )
+            _console.print(table)
+        else:
+            _console.print("[green]No anomalies detected.[/green]")
+
+    if anomaly_result.insights:
+        _console.print("\n[bold]Insights:[/bold]")
+        for insight in anomaly_result.insights:
+            _console.print(f"  • {insight}")
+
+
+def _display_forecast_result(forecast_result) -> None:
+    _console.print("\n[bold]Forecast Results[/bold]")
+    _console.print(f"Method: {forecast_result.method}")
+    _console.print(f"Horizon: {forecast_result.horizon} runs")
+
+    if forecast_result.predicted_values:
+        _console.print("\n[bold]Predicted Pass Rates:[/bold]")
+        table = Table(show_header=True, header_style="bold cyan")
+        table.add_column("Run")
+        table.add_column("Predicted", justify="right")
+
+        for i, value in enumerate(forecast_result.predicted_values, 1):
+            table.add_row(f"+{i}", f"{value:.1%}")
+        _console.print(table)
+
+        avg_forecast = sum(forecast_result.predicted_values) / len(forecast_result.predicted_values)
+        _console.print(f"\nAverage forecast: {avg_forecast:.1%}")
+
+
+def _display_network_analysis(net_result) -> None:
+    _console.print("\n[bold]Network Analysis Results[/bold]")
+    _console.print(f"Nodes (metrics): {net_result.node_count}")
+    _console.print(f"Edges (correlations): {net_result.edge_count}")
+    _console.print(f"Density: {net_result.density:.3f}")
+    _console.print(f"Avg clustering: {net_result.avg_clustering:.3f}")
+
+    if net_result.communities:
+        _console.print(f"\n[bold]Communities ({len(net_result.communities)}):[/bold]")
+        for i, community in enumerate(net_result.communities):
+            if len(community) > 1:
+                _console.print(f"  Community {i + 1}: {', '.join(community)}")
+
+    if net_result.hub_metrics:
+        _console.print("\n[bold]Hub Metrics:[/bold]")
+        for metric in net_result.hub_metrics:
+            _console.print(f"  • {metric}")
+
+    if net_result.insights:
+        _console.print("\n[bold]Insights:[/bold]")
+        for insight in net_result.insights:
+            _console.print(f"  • {insight}")
+
+
+def _display_hypothesis_generation(hypotheses, method: str) -> None:
+    _console.print("\n[bold]Hypothesis Generation Results[/bold]")
+    _console.print(f"Method: {method}")
+    _console.print(f"Total hypotheses: {len(hypotheses)}")
+
+    if hypotheses:
+        _console.print("\n[bold]Generated Hypotheses:[/bold]")
+        table = Table(show_header=True, header_style="bold cyan")
+        table.add_column("#")
+        table.add_column("Hypothesis")
+        table.add_column("Metric")
+        table.add_column("Confidence", justify="right")
+        table.add_column("Evidence")
+
+        for i, hyp in enumerate(hypotheses[:10], 1):
+            confidence_color = (
+                "green" if hyp.confidence >= 0.8 else "yellow" if hyp.confidence >= 0.6 else "red"
+            )
+            table.add_row(
+                str(i),
+                hyp.text[:60] + "..." if len(hyp.text) > 60 else hyp.text,
+                hyp.metric_name or "-",
+                f"[{confidence_color}]{hyp.confidence:.2f}[/{confidence_color}]",
+                hyp.evidence[:30] + "..." if len(hyp.evidence) > 30 else hyp.evidence,
+            )
+        _console.print(table)
+
+        high_conf = [h for h in hypotheses if h.confidence >= 0.8]
+        if high_conf:
+            _console.print(
+                f"\n[green]High confidence hypotheses: {len(high_conf)}/{len(hypotheses)}[/green]"
+            )
+
+
 __all__ = [
     "register_analyze_commands",
     "_perform_playbook_analysis",
evalvault/adapters/inbound/cli/commands/config.py

@@ -7,6 +7,7 @@ from rich.console import Console
 from rich.table import Table
 
 from evalvault.config.settings import Settings, apply_profile
+from evalvault.domain.metrics.registry import list_metric_specs
 
 
 def register_config_commands(app: typer.Typer, console: Console) -> None:
@@ -22,46 +23,9 @@ def register_config_commands(app: typer.Typer, console: Console) -> None:
     table.add_column("Description")
     table.add_column("Requires Ground Truth", justify="center")
 
-    table.add_row(
-        "faithfulness",
-        "…",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "answer_relevancy",
-        "Measures how relevant the answer is to the question",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "context_precision",
-        "Measures ranking quality of retrieved contexts",
-        "[green]Yes[/green]",
-    )
-    table.add_row(
-        "context_recall",
-        "Measures if all relevant info is in retrieved contexts",
-        "[green]Yes[/green]",
-    )
-    table.add_row(
-        "summary_score",
-        "Measures summary coverage and conciseness against contexts",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "summary_faithfulness",
-        "Measures whether summary statements are grounded in contexts",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "entity_preservation",
-        "Measures preservation of key insurance entities in summaries",
-        "[red]No[/red]",
-    )
-    table.add_row(
-        "insurance_term_accuracy",
-        "Measures if insurance terms in answer are grounded in contexts",
-        "[red]No[/red]",
-    )
+    for spec in list_metric_specs():
+        needs_gt = "[green]Yes[/green]" if spec.requires_ground_truth else "[red]No[/red]"
+        table.add_row(spec.name, spec.description, needs_gt)
 
     console.print(table)
     console.print("\n[dim]Use --metrics flag with 'run' command to specify metrics.[/dim]")
evalvault/adapters/inbound/cli/commands/pipeline.py

@@ -13,6 +13,7 @@ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdap
 from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
 from evalvault.config.settings import Settings
 
+from ..utils.analysis_io import serialize_pipeline_result
 from ..utils.options import db_option
 
 
@@ -36,7 +37,7 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
             "-o",
             help="Output file for results (JSON format).",
         ),
-        db_path: Path = db_option(help_text="Path to database file."),
+        db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Analyze evaluation results using natural language query."""
         from evalvault.adapters.outbound.analysis.pipeline_factory import (
@@ -52,6 +53,10 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
         if settings.phoenix_enabled:
             ensure_phoenix_instrumentation(settings, console=console)
 
+        if db_path is None:
+            console.print("[red]Error: Database path is not configured.[/red]")
+            raise typer.Exit(1)
+
         storage = SQLiteStorageAdapter(db_path=db_path)
         llm_adapter = None
         try:
@@ -100,15 +105,10 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
             console.print(f"    [red]{node_id}:[/red] {node_result.error}")
 
         if output:
-            payload = {
-                "query": query,
-                "intent": result.intent.value if result.intent else None,
-                "is_complete": result.is_complete,
-                "duration_ms": result.total_duration_ms,
-                "final_output": result.final_output,
-            }
+            payload = serialize_pipeline_result(result)
+            payload["query"] = query
             with open(output, "w", encoding="utf-8") as f:
-                json.dump(
+                json.dump(payload, f, ensure_ascii=False, indent=2)
             console.print(f"\n[green]Results saved to {output}[/green]")
 
         console.print()