evalvault 1.58.0__py3-none-any.whl → 1.59.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/routers/pipeline.py +48 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +258 -2
- evalvault/adapters/inbound/cli/commands/pipeline.py +5 -1
- evalvault/adapters/inbound/cli/commands/run.py +60 -26
- evalvault/adapters/inbound/cli/utils/analysis_io.py +2 -2
- evalvault/adapters/outbound/analysis/__init__.py +13 -3
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +2 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +2 -1
- evalvault/adapters/outbound/analysis/hypothesis_generator_module.py +359 -0
- evalvault/adapters/outbound/analysis/llm_report_module.py +9 -9
- evalvault/adapters/outbound/analysis/network_analyzer_module.py +250 -0
- evalvault/adapters/outbound/analysis/pipeline_factory.py +3 -0
- evalvault/adapters/outbound/analysis/pipeline_helpers.py +1 -1
- evalvault/adapters/outbound/analysis/priority_summary_module.py +1 -1
- evalvault/adapters/outbound/analysis/retrieval_benchmark_module.py +3 -2
- evalvault/adapters/outbound/analysis/timeseries_advanced_module.py +349 -0
- evalvault/adapters/outbound/benchmark/lm_eval_adapter.py +1 -1
- evalvault/adapters/outbound/improvement/insight_generator.py +13 -10
- evalvault/adapters/outbound/improvement/pattern_detector.py +11 -13
- evalvault/adapters/outbound/improvement/playbook_loader.py +3 -3
- evalvault/adapters/outbound/llm/__init__.py +63 -63
- evalvault/adapters/outbound/llm/instructor_factory.py +101 -7
- evalvault/adapters/outbound/llm/ollama_adapter.py +8 -1
- evalvault/adapters/outbound/llm/token_aware_chat.py +1 -1
- evalvault/adapters/outbound/report/__init__.py +2 -0
- evalvault/adapters/outbound/report/dashboard_generator.py +197 -0
- evalvault/adapters/outbound/storage/postgres_adapter.py +1 -1
- evalvault/adapters/outbound/tracer/open_rag_log_handler.py +3 -3
- evalvault/adapters/outbound/tracer/open_rag_trace_adapter.py +3 -3
- evalvault/adapters/outbound/tracer/open_rag_trace_helpers.py +4 -4
- evalvault/config/settings.py +10 -0
- evalvault/domain/entities/analysis_pipeline.py +13 -3
- evalvault/domain/services/analysis_service.py +3 -3
- evalvault/domain/services/evaluator.py +1 -1
- evalvault/domain/services/pipeline_template_registry.py +197 -127
- evalvault/domain/services/visual_space_service.py +1 -1
- {evalvault-1.58.0.dist-info → evalvault-1.59.0.dist-info}/METADATA +10 -4
- {evalvault-1.58.0.dist-info → evalvault-1.59.0.dist-info}/RECORD +41 -37
- {evalvault-1.58.0.dist-info → evalvault-1.59.0.dist-info}/WHEEL +0 -0
- {evalvault-1.58.0.dist-info → evalvault-1.59.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.58.0.dist-info → evalvault-1.59.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -71,6 +71,54 @@ INTENT_CATALOG = {
|
|
|
71
71
|
"description": "시간에 따른 추세를 분석합니다.",
|
|
72
72
|
"sample_query": "메트릭 추세를 분석해줘",
|
|
73
73
|
},
|
|
74
|
+
AnalysisIntent.ANALYZE_STATISTICAL: {
|
|
75
|
+
"label": "기술 통계량",
|
|
76
|
+
"category": "analysis",
|
|
77
|
+
"description": "메트릭별 기초 통계량을 계산합니다.",
|
|
78
|
+
"sample_query": "기초 통계 분석해줘",
|
|
79
|
+
},
|
|
80
|
+
AnalysisIntent.ANALYZE_NLP: {
|
|
81
|
+
"label": "NLP 분석",
|
|
82
|
+
"category": "analysis",
|
|
83
|
+
"description": "질문/답변 텍스트를 분석합니다.",
|
|
84
|
+
"sample_query": "텍스트 분석해줘",
|
|
85
|
+
},
|
|
86
|
+
AnalysisIntent.ANALYZE_CAUSAL: {
|
|
87
|
+
"label": "인과 관계 분석",
|
|
88
|
+
"category": "analysis",
|
|
89
|
+
"description": "요인별 영향도와 인과 관계를 분석합니다.",
|
|
90
|
+
"sample_query": "인과 관계 분석해줘",
|
|
91
|
+
},
|
|
92
|
+
AnalysisIntent.ANALYZE_NETWORK: {
|
|
93
|
+
"label": "네트워크 분석",
|
|
94
|
+
"category": "analysis",
|
|
95
|
+
"description": "메트릭 간 상관관계 네트워크를 분석합니다.",
|
|
96
|
+
"sample_query": "메트릭 네트워크 분석해줘",
|
|
97
|
+
},
|
|
98
|
+
AnalysisIntent.ANALYZE_PLAYBOOK: {
|
|
99
|
+
"label": "플레이북 분석",
|
|
100
|
+
"category": "analysis",
|
|
101
|
+
"description": "개선 플레이북 기반 진단을 수행합니다.",
|
|
102
|
+
"sample_query": "플레이북으로 분석해줘",
|
|
103
|
+
},
|
|
104
|
+
AnalysisIntent.DETECT_ANOMALIES: {
|
|
105
|
+
"label": "이상 탐지",
|
|
106
|
+
"category": "timeseries",
|
|
107
|
+
"description": "시계열 이상 패턴을 탐지합니다.",
|
|
108
|
+
"sample_query": "이상 탐지해줘",
|
|
109
|
+
},
|
|
110
|
+
AnalysisIntent.FORECAST_PERFORMANCE: {
|
|
111
|
+
"label": "성능 예측",
|
|
112
|
+
"category": "timeseries",
|
|
113
|
+
"description": "미래 성능을 예측합니다.",
|
|
114
|
+
"sample_query": "성능 예측해줘",
|
|
115
|
+
},
|
|
116
|
+
AnalysisIntent.GENERATE_HYPOTHESES: {
|
|
117
|
+
"label": "가설 생성",
|
|
118
|
+
"category": "generation",
|
|
119
|
+
"description": "성능 저하 원인에 대한 가설을 생성합니다.",
|
|
120
|
+
"sample_query": "가설 생성해줘",
|
|
121
|
+
},
|
|
74
122
|
AnalysisIntent.BENCHMARK_RETRIEVAL: {
|
|
75
123
|
"label": "검색 벤치마크",
|
|
76
124
|
"category": "benchmark",
|
|
@@ -12,15 +12,18 @@ from rich.table import Table
|
|
|
12
12
|
|
|
13
13
|
from evalvault.adapters.outbound.analysis import (
|
|
14
14
|
CausalAnalysisAdapter,
|
|
15
|
+
HypothesisGeneratorModule,
|
|
16
|
+
NetworkAnalyzerModule,
|
|
15
17
|
NLPAnalysisAdapter,
|
|
16
18
|
StatisticalAnalysisAdapter,
|
|
19
|
+
TimeSeriesAdvancedModule,
|
|
17
20
|
)
|
|
18
21
|
from evalvault.adapters.outbound.analysis.pipeline_factory import (
|
|
19
22
|
build_analysis_pipeline_service,
|
|
20
23
|
)
|
|
21
24
|
from evalvault.adapters.outbound.cache import MemoryCacheAdapter
|
|
22
25
|
from evalvault.adapters.outbound.llm import get_llm_adapter
|
|
23
|
-
from evalvault.adapters.outbound.report import MarkdownReportAdapter
|
|
26
|
+
from evalvault.adapters.outbound.report import DashboardGenerator, MarkdownReportAdapter
|
|
24
27
|
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
|
|
25
28
|
from evalvault.config.phoenix_support import get_phoenix_trace_url
|
|
26
29
|
from evalvault.config.settings import Settings, apply_profile
|
|
@@ -64,6 +67,37 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
64
67
|
"-L",
|
|
65
68
|
help="플레이북 분석에서 LLM 인사이트 생성",
|
|
66
69
|
),
|
|
70
|
+
dashboard: bool = typer.Option(False, "--dashboard", help="시각화 대시보드 생성"),
|
|
71
|
+
dashboard_format: str = typer.Option(
|
|
72
|
+
"png", "--dashboard-format", help="대시보드 출력 형식 (png, svg, pdf)"
|
|
73
|
+
),
|
|
74
|
+
anomaly_detect: bool = typer.Option(
|
|
75
|
+
False, "--anomaly-detect", "-A", help="이상치 탐지 실행 (Phase 2)"
|
|
76
|
+
),
|
|
77
|
+
window_size: int = typer.Option(
|
|
78
|
+
200, "--window-size", "-w", help="이상치 탐지 윈도 크기", min=50, max=500
|
|
79
|
+
),
|
|
80
|
+
forecast: bool = typer.Option(False, "--forecast", "-F", help="성능 예측 실행 (Phase 2)"),
|
|
81
|
+
forecast_horizon: int = typer.Option(
|
|
82
|
+
3, "--forecast-horizon", help="예측 범위(런 개수)", min=1, max=10
|
|
83
|
+
),
|
|
84
|
+
network: bool = typer.Option(
|
|
85
|
+
False, "--network", help="메트릭 상관관계 네트워크 생성 (Phase 3)"
|
|
86
|
+
),
|
|
87
|
+
min_correlation: float = typer.Option(
|
|
88
|
+
0.5, "--min-correlation", help="네트워크 최소 상관계수", min=0, max=1
|
|
89
|
+
),
|
|
90
|
+
generate_hypothesis: bool = typer.Option(
|
|
91
|
+
False, "--generate-hypothesis", "-H", help="가설 자동 생성 (Phase 4)"
|
|
92
|
+
),
|
|
93
|
+
hypothesis_method: str = typer.Option(
|
|
94
|
+
"heuristic",
|
|
95
|
+
"--hypothesis-method",
|
|
96
|
+
help="가설 생성 방식 (heuristic, hyporefine, union)",
|
|
97
|
+
),
|
|
98
|
+
num_hypotheses: int = typer.Option(
|
|
99
|
+
5, "--num-hypotheses", help="생성할 가설 수", min=1, max=20
|
|
100
|
+
),
|
|
67
101
|
output: Path | None = typer.Option(None, "--output", "-o", help="JSON 출력 파일"),
|
|
68
102
|
report: Path | None = typer.Option(
|
|
69
103
|
None, "--report", "-r", help="리포트 출력 파일 (*.md 또는 *.html)"
|
|
@@ -77,6 +111,9 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
77
111
|
"""평가 실행 결과를 분석하고 통계 인사이트를 표시합니다."""
|
|
78
112
|
|
|
79
113
|
resolved_db_path = db_path or Settings().evalvault_db_path
|
|
114
|
+
if resolved_db_path is None:
|
|
115
|
+
_console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
|
|
116
|
+
raise typer.Exit(1)
|
|
80
117
|
storage = SQLiteStorageAdapter(db_path=resolved_db_path)
|
|
81
118
|
|
|
82
119
|
try:
|
|
@@ -161,6 +198,97 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
161
198
|
storage.save_analysis(analysis)
|
|
162
199
|
_console.print(f"\n[green]분석 결과 DB 저장: {resolved_db_path}[/green]")
|
|
163
200
|
|
|
201
|
+
if dashboard:
|
|
202
|
+
dashboard_gen = DashboardGenerator()
|
|
203
|
+
_console.print("\n[bold cyan]Generating visualization dashboard...[/bold cyan]")
|
|
204
|
+
|
|
205
|
+
fig = dashboard_gen.generate_evaluation_dashboard(run_id)
|
|
206
|
+
|
|
207
|
+
output_dir = Path("reports/dashboard")
|
|
208
|
+
output_dir.mkdir(parents=True, exist_ok=True)
|
|
209
|
+
|
|
210
|
+
output_path = output_dir / f"dashboard_{run_id[:8]}.{dashboard_format}"
|
|
211
|
+
fig.savefig(output_path, dpi=300, bbox_inches="tight")
|
|
212
|
+
_console.print(f"\n[green]Dashboard saved to: {output_path}[/green]")
|
|
213
|
+
|
|
214
|
+
if anomaly_detect or forecast:
|
|
215
|
+
ts_analyzer = TimeSeriesAdvancedModule(window_size=window_size)
|
|
216
|
+
run_history = storage.list_runs(limit=50)
|
|
217
|
+
|
|
218
|
+
if not run_history or len(run_history) < 5:
|
|
219
|
+
_console.print("[yellow]Need at least 5 runs for time series analysis.[/yellow]")
|
|
220
|
+
else:
|
|
221
|
+
if anomaly_detect:
|
|
222
|
+
_console.print("\n[bold cyan]Running anomaly detection...[/bold cyan]")
|
|
223
|
+
history_data = [
|
|
224
|
+
{
|
|
225
|
+
"run_id": r.run_id,
|
|
226
|
+
"pass_rate": r.pass_rate,
|
|
227
|
+
"timestamp": r.started_at,
|
|
228
|
+
}
|
|
229
|
+
for r in run_history
|
|
230
|
+
]
|
|
231
|
+
anomaly_result = ts_analyzer.detect_anomalies(history_data)
|
|
232
|
+
_display_anomaly_detection(anomaly_result)
|
|
233
|
+
|
|
234
|
+
if forecast:
|
|
235
|
+
_console.print("\n[bold cyan]Running performance forecasting...[/bold cyan]")
|
|
236
|
+
history_data = [
|
|
237
|
+
{"run_id": r.run_id, "pass_rate": r.pass_rate} for r in run_history
|
|
238
|
+
]
|
|
239
|
+
forecast_result = ts_analyzer.forecast_performance(
|
|
240
|
+
history_data, horizon=forecast_horizon
|
|
241
|
+
)
|
|
242
|
+
_display_forecast_result(forecast_result)
|
|
243
|
+
|
|
244
|
+
if network:
|
|
245
|
+
_console.print("\n[bold cyan]Building metric correlation network...[/bold cyan]")
|
|
246
|
+
net_analyzer = NetworkAnalyzerModule()
|
|
247
|
+
|
|
248
|
+
if not bundle.statistical or not bundle.statistical.significant_correlations:
|
|
249
|
+
_console.print("[yellow]No significant correlations for network analysis.[/yellow]")
|
|
250
|
+
else:
|
|
251
|
+
correlations_data = [
|
|
252
|
+
{
|
|
253
|
+
"variable1": corr.variable1,
|
|
254
|
+
"variable2": corr.variable2,
|
|
255
|
+
"correlation": corr.correlation,
|
|
256
|
+
"p_value": corr.p_value,
|
|
257
|
+
"is_significant": corr.is_significant,
|
|
258
|
+
}
|
|
259
|
+
for corr in bundle.statistical.significant_correlations
|
|
260
|
+
]
|
|
261
|
+
graph = net_analyzer.build_correlation_network(
|
|
262
|
+
correlations_data, min_correlation=min_correlation
|
|
263
|
+
)
|
|
264
|
+
net_result = net_analyzer.analyze_metric_network(graph)
|
|
265
|
+
_display_network_analysis(net_result)
|
|
266
|
+
|
|
267
|
+
if generate_hypothesis:
|
|
268
|
+
_console.print(
|
|
269
|
+
f"\n[bold cyan]Generating hypotheses ({hypothesis_method})...[/bold cyan]"
|
|
270
|
+
)
|
|
271
|
+
hyp_gen = HypothesisGeneratorModule(
|
|
272
|
+
method=hypothesis_method, num_hypotheses=num_hypotheses
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
metric_scores = {}
|
|
276
|
+
for metric_name, stats in analysis.metrics_summary.items():
|
|
277
|
+
metric_scores[metric_name] = stats.mean
|
|
278
|
+
|
|
279
|
+
low_performers_data = [
|
|
280
|
+
{
|
|
281
|
+
"question": lp.test_case_id,
|
|
282
|
+
"metric_name": lp.metric_name,
|
|
283
|
+
}
|
|
284
|
+
for lp in (analysis.low_performers or [])
|
|
285
|
+
]
|
|
286
|
+
|
|
287
|
+
hypotheses = hyp_gen.generate_simple_hypotheses(
|
|
288
|
+
run_id, metric_scores, low_performers_data
|
|
289
|
+
)
|
|
290
|
+
_display_hypothesis_generation(hypotheses, hypothesis_method)
|
|
291
|
+
|
|
164
292
|
if output:
|
|
165
293
|
_export_analysis_json(analysis, output, bundle.nlp if nlp else None, improvement_report)
|
|
166
294
|
_console.print(f"\n[green]분석 결과 내보냄: {output}[/green]")
|
|
@@ -192,6 +320,9 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
192
320
|
"""두 실행을 통계적으로 비교합니다."""
|
|
193
321
|
|
|
194
322
|
resolved_db_path = db_path or Settings().evalvault_db_path
|
|
323
|
+
if resolved_db_path is None:
|
|
324
|
+
_console.print("[red]오류: DB 경로가 설정되지 않았습니다.[/red]")
|
|
325
|
+
raise typer.Exit(1)
|
|
195
326
|
storage = SQLiteStorageAdapter(db_path=resolved_db_path)
|
|
196
327
|
|
|
197
328
|
try:
|
|
@@ -220,7 +351,15 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
220
351
|
_console.print(f" Phoenix 트레이스: {trace_b}")
|
|
221
352
|
_console.print(f" 검정: {test}\n")
|
|
222
353
|
|
|
223
|
-
|
|
354
|
+
if test == "t-test":
|
|
355
|
+
test_type = "t-test"
|
|
356
|
+
elif test == "mann-whitney":
|
|
357
|
+
test_type = "mann-whitney"
|
|
358
|
+
else:
|
|
359
|
+
_console.print(f"[red]Error: Unsupported test type: {test}[/red]")
|
|
360
|
+
raise typer.Exit(1)
|
|
361
|
+
|
|
362
|
+
comparisons = service.compare_runs(run_a, run_b, metrics=metric_list, test_type=test_type)
|
|
224
363
|
|
|
225
364
|
if not comparisons:
|
|
226
365
|
_console.print("[yellow]비교할 공통 메트릭이 없습니다.[/yellow]")
|
|
@@ -942,6 +1081,123 @@ def _generate_report(
|
|
|
942
1081
|
file.write(content)
|
|
943
1082
|
|
|
944
1083
|
|
|
1084
|
+
def _display_anomaly_detection(anomaly_result) -> None:
    """Print the anomaly-detection summary, the flagged runs, and any insights.

    Args:
        anomaly_result: Result object exposing ``detection_method``,
            ``threshold``, ``total_runs``, ``anomalies`` and ``insights``.
            Each entry of ``anomalies`` is read for ``is_anomaly``,
            ``run_id``, ``anomaly_score``, ``pass_rate`` and ``severity``.

    At most the first 10 flagged runs are listed in the table.
    """
    _console.print("\n[bold]Anomaly Detection Results[/bold]")
    _console.print(f"Detection method: {anomaly_result.detection_method}")
    _console.print(f"Threshold: {anomaly_result.threshold:.2f}")
    _console.print(f"Total runs: {anomaly_result.total_runs}")

    # An empty/missing anomaly list and a list with nothing flagged mean the
    # same thing to the user, so both paths report an explicit outcome.
    flagged = [a for a in (anomaly_result.anomalies or []) if a.is_anomaly]
    if flagged:
        _console.print(f"\n[red]Detected {len(flagged)} anomalies:[/red]")
        table = Table(show_header=True, header_style="bold cyan")
        table.add_column("Run ID")
        table.add_column("Score", justify="right")
        table.add_column("Pass Rate", justify="right")
        table.add_column("Severity")

        # Any severity other than "high"/"medium" is rendered in green.
        severity_colors = {"high": "red", "medium": "yellow"}
        for anomaly in flagged[:10]:
            severity_color = severity_colors.get(anomaly.severity, "green")
            run_id = anomaly.run_id
            # Only add the ellipsis when the identifier was actually shortened.
            run_label = run_id[:12] + "..." if len(run_id) > 12 else run_id
            table.add_row(
                run_label,
                f"{anomaly.anomaly_score:.2f}",
                f"{anomaly.pass_rate:.1%}",
                f"[{severity_color}]{anomaly.severity}[/{severity_color}]",
            )
        _console.print(table)
    else:
        _console.print("[green]No anomalies detected.[/green]")

    if anomaly_result.insights:
        _console.print("\n[bold]Insights:[/bold]")
        for insight in anomaly_result.insights:
            _console.print(f" • {insight}")
|
|
1122
|
+
|
|
1123
|
+
|
|
1124
|
+
def _display_forecast_result(forecast_result) -> None:
    """Print the forecast header and, when predictions exist, a table of them.

    Args:
        forecast_result: Result object exposing ``method``, ``horizon`` and
            ``predicted_values`` (numeric values formatted as percentages).

    When ``predicted_values`` is empty only the header lines are printed;
    otherwise each prediction is listed as ``+1``, ``+2``, ... followed by
    the arithmetic mean of all predictions.
    """
    _console.print("\n[bold]Forecast Results[/bold]")
    _console.print(f"Method: {forecast_result.method}")
    _console.print(f"Horizon: {forecast_result.horizon} runs")

    predictions = forecast_result.predicted_values
    if not predictions:
        return

    _console.print("\n[bold]Predicted Pass Rates:[/bold]")
    prediction_table = Table(show_header=True, header_style="bold cyan")
    prediction_table.add_column("Run")
    prediction_table.add_column("Predicted", justify="right")
    for step, predicted in enumerate(predictions, start=1):
        prediction_table.add_row(f"+{step}", f"{predicted:.1%}")
    _console.print(prediction_table)

    mean_prediction = sum(predictions) / len(predictions)
    _console.print(f"\nAverage forecast: {mean_prediction:.1%}")
|
|
1141
|
+
|
|
1142
|
+
|
|
1143
|
+
def _display_network_analysis(net_result) -> None:
    """Print graph statistics, multi-member communities, hubs, and insights.

    Args:
        net_result: Result object exposing ``node_count``, ``edge_count``,
            ``density``, ``avg_clustering``, ``communities``, ``hub_metrics``
            and ``insights``.

    Communities with a single member are not listed, but they still count
    toward the community total and keep their position in the numbering.
    """

    def _print_bullets(heading, entries) -> None:
        # Emit a heading followed by one bullet per entry; skip empty lists.
        if not entries:
            return
        _console.print(heading)
        for entry in entries:
            _console.print(f" • {entry}")

    _console.print("\n[bold]Network Analysis Results[/bold]")
    _console.print(f"Nodes (metrics): {net_result.node_count}")
    _console.print(f"Edges (correlations): {net_result.edge_count}")
    _console.print(f"Density: {net_result.density:.3f}")
    _console.print(f"Avg clustering: {net_result.avg_clustering:.3f}")

    groups = net_result.communities
    if groups:
        _console.print(f"\n[bold]Communities ({len(groups)}):[/bold]")
        for number, members in enumerate(groups, start=1):
            if len(members) <= 1:
                continue
            _console.print(f" Community {number}: {', '.join(members)}")

    _print_bullets("\n[bold]Hub Metrics:[/bold]", net_result.hub_metrics)
    _print_bullets("\n[bold]Insights:[/bold]", net_result.insights)
|
|
1165
|
+
|
|
1166
|
+
|
|
1167
|
+
def _display_hypothesis_generation(hypotheses, method: str) -> None:
    """Print generated hypotheses as a table plus a high-confidence tally.

    Args:
        hypotheses: Sequence of hypothesis objects read for ``text``,
            ``metric_name``, ``confidence`` and ``evidence``.
        method: Name of the generation method, echoed in the header.

    Only the first 10 hypotheses are shown in the table. The high-confidence
    tally (confidence >= 0.8) is computed over the full sequence.
    """

    def _clip(value, limit):
        # Shorten to ``limit`` characters, marking the cut with an ellipsis.
        if len(value) > limit:
            return value[:limit] + "..."
        return value

    _console.print("\n[bold]Hypothesis Generation Results[/bold]")
    _console.print(f"Method: {method}")
    _console.print(f"Total hypotheses: {len(hypotheses)}")

    if not hypotheses:
        return

    _console.print("\n[bold]Generated Hypotheses:[/bold]")
    table = Table(show_header=True, header_style="bold cyan")
    table.add_column("#")
    table.add_column("Hypothesis")
    table.add_column("Metric")
    table.add_column("Confidence", justify="right")
    table.add_column("Evidence")

    for index, hypothesis in enumerate(hypotheses[:10], start=1):
        # Colour bands: >= 0.8 green, >= 0.6 yellow, otherwise red.
        if hypothesis.confidence >= 0.8:
            tone = "green"
        elif hypothesis.confidence >= 0.6:
            tone = "yellow"
        else:
            tone = "red"
        table.add_row(
            str(index),
            _clip(hypothesis.text, 60),
            hypothesis.metric_name or "-",
            f"[{tone}]{hypothesis.confidence:.2f}[/{tone}]",
            _clip(hypothesis.evidence, 30),
        )
    _console.print(table)

    confident = [item for item in hypotheses if item.confidence >= 0.8]
    if confident:
        _console.print(
            f"\n[green]High confidence hypotheses: {len(confident)}/{len(hypotheses)}[/green]"
        )
|
|
1199
|
+
|
|
1200
|
+
|
|
945
1201
|
__all__ = [
|
|
946
1202
|
"register_analyze_commands",
|
|
947
1203
|
"_perform_playbook_analysis",
|
|
@@ -36,7 +36,7 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
|
|
|
36
36
|
"-o",
|
|
37
37
|
help="Output file for results (JSON format).",
|
|
38
38
|
),
|
|
39
|
-
db_path: Path = db_option(help_text="Path to database file."),
|
|
39
|
+
db_path: Path | None = db_option(help_text="Path to database file."),
|
|
40
40
|
) -> None:
|
|
41
41
|
"""Analyze evaluation results using natural language query."""
|
|
42
42
|
from evalvault.adapters.outbound.analysis.pipeline_factory import (
|
|
@@ -52,6 +52,10 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
|
|
|
52
52
|
if settings.phoenix_enabled:
|
|
53
53
|
ensure_phoenix_instrumentation(settings, console=console)
|
|
54
54
|
|
|
55
|
+
if db_path is None:
|
|
56
|
+
console.print("[red]Error: Database path is not configured.[/red]")
|
|
57
|
+
raise typer.Exit(1)
|
|
58
|
+
|
|
55
59
|
storage = SQLiteStorageAdapter(db_path=db_path)
|
|
56
60
|
llm_adapter = None
|
|
57
61
|
try:
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import asyncio
|
|
6
|
+
import os
|
|
6
7
|
from collections.abc import Callable, Sequence
|
|
7
8
|
from datetime import date, datetime
|
|
8
9
|
from pathlib import Path
|
|
@@ -794,6 +795,12 @@ def register_run_commands(
|
|
|
794
795
|
)
|
|
795
796
|
raise typer.Exit(1)
|
|
796
797
|
|
|
798
|
+
ollama_env_url = os.environ.get("OLLAMA_BASE_URL")
|
|
799
|
+
if ollama_env_url:
|
|
800
|
+
normalized_url = ollama_env_url.strip()
|
|
801
|
+
if normalized_url and "://" not in normalized_url:
|
|
802
|
+
os.environ["OLLAMA_BASE_URL"] = f"http://{normalized_url}"
|
|
803
|
+
|
|
797
804
|
settings = Settings()
|
|
798
805
|
|
|
799
806
|
# Apply profile (CLI > .env > default)
|
|
@@ -915,6 +922,15 @@ def register_run_commands(
|
|
|
915
922
|
raise typer.Exit(1) from exc
|
|
916
923
|
|
|
917
924
|
if settings.llm_provider == "ollama":
|
|
925
|
+
base_url = getattr(settings, "ollama_base_url", "")
|
|
926
|
+
if not isinstance(base_url, str):
|
|
927
|
+
base_url = ""
|
|
928
|
+
base_url = base_url.strip()
|
|
929
|
+
if not base_url:
|
|
930
|
+
base_url = "http://localhost:11434"
|
|
931
|
+
elif "://" not in base_url:
|
|
932
|
+
base_url = f"http://{base_url}"
|
|
933
|
+
settings.ollama_base_url = base_url
|
|
918
934
|
display_model = f"ollama/{settings.ollama_model}"
|
|
919
935
|
elif settings.llm_provider == "vllm":
|
|
920
936
|
display_model = f"vllm/{settings.vllm_model}"
|
|
@@ -1421,37 +1437,55 @@ def register_run_commands(
|
|
|
1421
1437
|
ensure_phoenix_instrumentation(settings, console=console, force=True)
|
|
1422
1438
|
|
|
1423
1439
|
evaluator = RagasEvaluator()
|
|
1440
|
+
llm_adapter = None
|
|
1424
1441
|
try:
|
|
1425
1442
|
llm_adapter = get_llm_adapter(settings)
|
|
1426
1443
|
except Exception as exc:
|
|
1427
1444
|
provider = str(getattr(settings, "llm_provider", "")).strip().lower()
|
|
1428
|
-
|
|
1429
|
-
if provider == "ollama":
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1445
|
+
recovered = False
|
|
1446
|
+
if provider == "ollama" and "http://" in str(exc):
|
|
1447
|
+
base_url = getattr(settings, "ollama_base_url", "")
|
|
1448
|
+
if not isinstance(base_url, str) or not base_url.strip():
|
|
1449
|
+
base_url = "http://localhost:11434"
|
|
1450
|
+
elif "://" not in base_url:
|
|
1451
|
+
base_url = f"http://{base_url.strip()}"
|
|
1452
|
+
settings.ollama_base_url = base_url
|
|
1453
|
+
try:
|
|
1454
|
+
llm_adapter = get_llm_adapter(settings)
|
|
1455
|
+
recovered = True
|
|
1456
|
+
except Exception as retry_exc:
|
|
1457
|
+
exc = retry_exc
|
|
1458
|
+
|
|
1459
|
+
if not recovered:
|
|
1460
|
+
fixes: list[str] = []
|
|
1461
|
+
if provider == "ollama":
|
|
1462
|
+
fixes = [
|
|
1463
|
+
"Ollama 서버가 실행 중인지 확인하세요 (기본: http://localhost:11434).",
|
|
1464
|
+
"필요 모델을 받아두세요: `ollama pull gemma3:1b` 및 `ollama pull qwen3-embedding:0.6b`.",
|
|
1465
|
+
"URL을 바꿨다면 .env의 `OLLAMA_BASE_URL`을 확인하세요.",
|
|
1466
|
+
]
|
|
1467
|
+
elif provider == "openai":
|
|
1468
|
+
fixes = [
|
|
1469
|
+
"`.env`에 `OPENAI_API_KEY`를 설정하세요.",
|
|
1470
|
+
"프록시/네트워크가 필요한 환경이면 연결 가능 여부를 확인하세요.",
|
|
1471
|
+
]
|
|
1472
|
+
elif provider == "vllm":
|
|
1473
|
+
fixes = [
|
|
1474
|
+
"`.env`의 `VLLM_BASE_URL`/`VLLM_MODEL` 설정을 확인하세요.",
|
|
1475
|
+
"vLLM 서버가 OpenAI 호환 API로 실행 중인지 확인하세요.",
|
|
1476
|
+
]
|
|
1477
|
+
else:
|
|
1478
|
+
fixes = ["--profile 또는 환경변수 설정을 확인하세요."]
|
|
1447
1479
|
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1480
|
+
print_cli_error(
|
|
1481
|
+
console,
|
|
1482
|
+
"LLM/임베딩 어댑터를 초기화하지 못했습니다.",
|
|
1483
|
+
details=str(exc),
|
|
1484
|
+
fixes=fixes,
|
|
1485
|
+
)
|
|
1486
|
+
raise typer.Exit(1) from exc
|
|
1487
|
+
|
|
1488
|
+
assert llm_adapter is not None
|
|
1455
1489
|
|
|
1456
1490
|
memory_adapter: SQLiteDomainMemoryAdapter | None = None
|
|
1457
1491
|
memory_evaluator: MemoryAwareEvaluator | None = None
|
|
@@ -187,9 +187,9 @@ def build_metric_scorecard(
|
|
|
187
187
|
threshold = _resolve_threshold(run, metric)
|
|
188
188
|
pass_rate = pass_rates.get(metric) if isinstance(pass_rates, dict) else None
|
|
189
189
|
status = "unknown"
|
|
190
|
-
if isinstance(mean,
|
|
190
|
+
if isinstance(mean, int | float):
|
|
191
191
|
status = "pass" if float(mean) >= threshold else "risk"
|
|
192
|
-
elif isinstance(pass_rate,
|
|
192
|
+
elif isinstance(pass_rate, int | float):
|
|
193
193
|
status = "pass" if float(pass_rate) >= 0.7 else "risk"
|
|
194
194
|
scorecard.append(
|
|
195
195
|
{
|
|
@@ -42,6 +42,9 @@ from evalvault.adapters.outbound.analysis.hybrid_rrf_module import HybridRRFModu
|
|
|
42
42
|
from evalvault.adapters.outbound.analysis.hybrid_weighted_module import (
|
|
43
43
|
HybridWeightedModule,
|
|
44
44
|
)
|
|
45
|
+
from evalvault.adapters.outbound.analysis.hypothesis_generator_module import (
|
|
46
|
+
HypothesisGeneratorModule,
|
|
47
|
+
)
|
|
45
48
|
from evalvault.adapters.outbound.analysis.llm_report_module import LLMReportModule
|
|
46
49
|
from evalvault.adapters.outbound.analysis.low_performer_extractor_module import (
|
|
47
50
|
LowPerformerExtractorModule,
|
|
@@ -53,6 +56,9 @@ from evalvault.adapters.outbound.analysis.morpheme_analyzer_module import (
|
|
|
53
56
|
from evalvault.adapters.outbound.analysis.morpheme_quality_checker_module import (
|
|
54
57
|
MorphemeQualityCheckerModule,
|
|
55
58
|
)
|
|
59
|
+
from evalvault.adapters.outbound.analysis.network_analyzer_module import (
|
|
60
|
+
NetworkAnalyzerModule,
|
|
61
|
+
)
|
|
56
62
|
from evalvault.adapters.outbound.analysis.nlp_adapter import NLPAnalysisAdapter
|
|
57
63
|
from evalvault.adapters.outbound.analysis.nlp_analyzer_module import NLPAnalyzerModule
|
|
58
64
|
from evalvault.adapters.outbound.analysis.pattern_detector_module import (
|
|
@@ -103,6 +109,9 @@ from evalvault.adapters.outbound.analysis.summary_report_module import (
|
|
|
103
109
|
from evalvault.adapters.outbound.analysis.time_series_analyzer_module import (
|
|
104
110
|
TimeSeriesAnalyzerModule,
|
|
105
111
|
)
|
|
112
|
+
from evalvault.adapters.outbound.analysis.timeseries_advanced_module import (
|
|
113
|
+
TimeSeriesAdvancedModule,
|
|
114
|
+
)
|
|
106
115
|
from evalvault.adapters.outbound.analysis.trend_detector_module import (
|
|
107
116
|
TrendDetectorModule,
|
|
108
117
|
)
|
|
@@ -111,16 +120,16 @@ from evalvault.adapters.outbound.analysis.verification_report_module import (
|
|
|
111
120
|
)
|
|
112
121
|
|
|
113
122
|
__all__ = [
|
|
114
|
-
|
|
115
|
-
"
|
|
123
|
+
"TimeSeriesAdvancedModule",
|
|
124
|
+
"NetworkAnalyzerModule",
|
|
116
125
|
"NLPAnalysisAdapter",
|
|
117
126
|
"StatisticalAnalysisAdapter",
|
|
118
127
|
"BaseAnalysisAdapter",
|
|
119
128
|
"AnalysisDataProcessor",
|
|
120
|
-
# Phase 14
|
|
121
129
|
"BaseAnalysisModule",
|
|
122
130
|
"AnalysisReportModule",
|
|
123
131
|
"BM25SearcherModule",
|
|
132
|
+
"CausalAnalysisAdapter",
|
|
124
133
|
"CausalAnalyzerModule",
|
|
125
134
|
"ComparisonReportModule",
|
|
126
135
|
"DataLoaderModule",
|
|
@@ -131,6 +140,7 @@ __all__ = [
|
|
|
131
140
|
"EmbeddingSearcherModule",
|
|
132
141
|
"HybridRRFModule",
|
|
133
142
|
"HybridWeightedModule",
|
|
143
|
+
"HypothesisGeneratorModule",
|
|
134
144
|
"LowPerformerExtractorModule",
|
|
135
145
|
"LLMReportModule",
|
|
136
146
|
"ModelAnalyzerModule",
|
|
@@ -8,7 +8,6 @@ import numpy as np
|
|
|
8
8
|
|
|
9
9
|
from evalvault.adapters.outbound.analysis.base_module import BaseAnalysisModule
|
|
10
10
|
from evalvault.adapters.outbound.analysis.pipeline_helpers import get_upstream_output
|
|
11
|
-
from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
|
|
12
11
|
from evalvault.adapters.outbound.nlp.korean.dense_retriever import KoreanDenseRetriever
|
|
13
12
|
from evalvault.config.settings import Settings
|
|
14
13
|
from evalvault.domain.entities import EvaluationRun
|
|
@@ -140,6 +139,8 @@ class EmbeddingAnalyzerModule(BaseAnalysisModule):
|
|
|
140
139
|
|
|
141
140
|
if backend_hint == "ollama" or embedding_profile in {"dev", "prod"}:
|
|
142
141
|
try:
|
|
142
|
+
from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
|
|
143
|
+
|
|
143
144
|
adapter = OllamaAdapter(settings)
|
|
144
145
|
retriever = KoreanDenseRetriever(
|
|
145
146
|
model_name=model_name or settings.ollama_embedding_model,
|
|
@@ -12,7 +12,6 @@ from evalvault.adapters.outbound.analysis.pipeline_helpers import (
|
|
|
12
12
|
recall_at_k,
|
|
13
13
|
safe_mean,
|
|
14
14
|
)
|
|
15
|
-
from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
|
|
16
15
|
from evalvault.adapters.outbound.nlp.korean.dense_retriever import KoreanDenseRetriever
|
|
17
16
|
from evalvault.config.settings import Settings
|
|
18
17
|
from evalvault.domain.entities import EvaluationRun
|
|
@@ -66,6 +65,8 @@ class EmbeddingSearcherModule(BaseAnalysisModule):
|
|
|
66
65
|
|
|
67
66
|
if embedding_profile in {"dev", "prod"}:
|
|
68
67
|
try:
|
|
68
|
+
from evalvault.adapters.outbound.llm.ollama_adapter import OllamaAdapter
|
|
69
|
+
|
|
69
70
|
adapter = OllamaAdapter(settings)
|
|
70
71
|
retriever = KoreanDenseRetriever(
|
|
71
72
|
model_name=settings.ollama_embedding_model,
|