evalvault 1.75.0__py3-none-any.whl → 1.77.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +123 -64
- evalvault/adapters/inbound/api/main.py +2 -0
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/cli/app.py +3 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +6 -1
- evalvault/adapters/inbound/cli/commands/method.py +3 -3
- evalvault/adapters/inbound/cli/commands/run.py +153 -30
- evalvault/adapters/inbound/cli/commands/run_helpers.py +166 -62
- evalvault/adapters/outbound/analysis/llm_report_module.py +515 -33
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/phoenix/sync_service.py +100 -1
- evalvault/adapters/outbound/report/markdown_adapter.py +92 -0
- evalvault/adapters/outbound/storage/factory.py +1 -4
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +178 -12
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/runtime_services.py +122 -0
- evalvault/config/settings.py +40 -4
- evalvault/domain/services/evaluator.py +2 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/METADATA +2 -1
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/RECORD +25 -24
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/WHEEL +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -33,7 +33,7 @@ from evalvault.adapters.outbound.phoenix.sync_service import (
|
|
|
33
33
|
from evalvault.adapters.outbound.storage.factory import build_storage_adapter
|
|
34
34
|
from evalvault.adapters.outbound.tracer.phoenix_tracer_adapter import PhoenixTracerAdapter
|
|
35
35
|
from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
|
|
36
|
-
from evalvault.config.settings import Settings, apply_profile
|
|
36
|
+
from evalvault.config.settings import Settings, apply_profile, resolve_tracker_providers
|
|
37
37
|
from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
|
|
38
38
|
from evalvault.domain.entities.multiturn import (
|
|
39
39
|
MultiTurnConversationRecord,
|
|
@@ -86,7 +86,8 @@ from .run_helpers import (
|
|
|
86
86
|
_display_results,
|
|
87
87
|
_evaluate_streaming_run,
|
|
88
88
|
_is_oss_open_model,
|
|
89
|
-
|
|
89
|
+
_log_analysis_artifacts,
|
|
90
|
+
_log_to_trackers,
|
|
90
91
|
_option_was_provided,
|
|
91
92
|
_print_run_mode_banner,
|
|
92
93
|
_resolve_thresholds,
|
|
@@ -178,6 +179,14 @@ def _log_duration(
|
|
|
178
179
|
_log_timestamp(console, verbose, f"{message} ({elapsed:.2f}s)")
|
|
179
180
|
|
|
180
181
|
|
|
182
|
+
def _infer_phoenix_model_provider(model_name: str) -> str:
|
|
183
|
+
if not model_name:
|
|
184
|
+
return "OPENAI"
|
|
185
|
+
provider = model_name.split("/")[0].upper() if "/" in model_name else "OPENAI"
|
|
186
|
+
allowed = {"OPENAI", "AZURE_OPENAI", "ANTHROPIC", "GOOGLE", "DEEPSEEK", "XAI", "AWS", "OLLAMA"}
|
|
187
|
+
return provider if provider in allowed else "OPENAI"
|
|
188
|
+
|
|
189
|
+
|
|
181
190
|
def register_run_commands(
|
|
182
191
|
app: typer.Typer,
|
|
183
192
|
console: Console,
|
|
@@ -358,10 +367,13 @@ def register_run_commands(
|
|
|
358
367
|
help="Store stage events in the SQLite database (requires --db).",
|
|
359
368
|
),
|
|
360
369
|
tracker: str = typer.Option(
|
|
361
|
-
"
|
|
370
|
+
"mlflow+phoenix",
|
|
362
371
|
"--tracker",
|
|
363
372
|
"-t",
|
|
364
|
-
help=
|
|
373
|
+
help=(
|
|
374
|
+
"Tracker to log results: 'langfuse', 'mlflow', 'phoenix', 'none', "
|
|
375
|
+
"or combinations like 'mlflow+phoenix'."
|
|
376
|
+
),
|
|
365
377
|
rich_help_panel="Simple mode preset",
|
|
366
378
|
),
|
|
367
379
|
langfuse: bool = typer.Option(
|
|
@@ -667,13 +679,24 @@ def register_run_commands(
|
|
|
667
679
|
tracker_override = _option_was_provided(ctx, "tracker") or langfuse
|
|
668
680
|
selected_tracker = tracker
|
|
669
681
|
if preset.default_tracker:
|
|
670
|
-
if tracker_override
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
682
|
+
if tracker_override:
|
|
683
|
+
try:
|
|
684
|
+
providers = resolve_tracker_providers(tracker)
|
|
685
|
+
except ValueError as exc:
|
|
686
|
+
print_cli_error(console, "Tracker 설정이 올바르지 않습니다.", details=str(exc))
|
|
687
|
+
raise typer.Exit(2) from exc
|
|
688
|
+
if providers == ["none"]:
|
|
689
|
+
selected_tracker = preset.default_tracker
|
|
690
|
+
elif preset.default_tracker not in providers:
|
|
691
|
+
print_cli_warning(
|
|
692
|
+
console,
|
|
693
|
+
f"Simple 모드는 tracker에 {preset.default_tracker}가 포함되어야 합니다.",
|
|
694
|
+
tips=["다른 Tracker를 사용하려면 --mode full을 사용하세요."],
|
|
695
|
+
)
|
|
696
|
+
providers.append(preset.default_tracker)
|
|
697
|
+
selected_tracker = "+".join(providers)
|
|
698
|
+
else:
|
|
699
|
+
selected_tracker = preset.default_tracker
|
|
677
700
|
tracker = selected_tracker
|
|
678
701
|
|
|
679
702
|
prompt_manifest_value = prompt_manifest
|
|
@@ -852,7 +875,7 @@ def register_run_commands(
|
|
|
852
875
|
if profile_name:
|
|
853
876
|
settings = apply_profile(settings, profile_name)
|
|
854
877
|
|
|
855
|
-
if db_path is None:
|
|
878
|
+
if db_path is None and settings.db_backend == "sqlite":
|
|
856
879
|
db_path = Path(settings.evalvault_db_path)
|
|
857
880
|
|
|
858
881
|
excel_output: Path | None = None
|
|
@@ -1198,6 +1221,7 @@ def register_run_commands(
|
|
|
1198
1221
|
for turn in conversation.turn_results:
|
|
1199
1222
|
turn_results.append(turn)
|
|
1200
1223
|
_save_multiturn_to_db(
|
|
1224
|
+
settings,
|
|
1201
1225
|
db_path,
|
|
1202
1226
|
run_record,
|
|
1203
1227
|
conversation_records,
|
|
@@ -1646,10 +1670,29 @@ def register_run_commands(
|
|
|
1646
1670
|
)
|
|
1647
1671
|
raise typer.Exit(2) from exc
|
|
1648
1672
|
|
|
1673
|
+
effective_tracker = tracker
|
|
1674
|
+
if langfuse:
|
|
1675
|
+
effective_tracker = "langfuse"
|
|
1676
|
+
print_cli_warning(
|
|
1677
|
+
console,
|
|
1678
|
+
"--langfuse 플래그는 곧 제거됩니다.",
|
|
1679
|
+
tips=["대신 --tracker langfuse를 사용하세요."],
|
|
1680
|
+
)
|
|
1681
|
+
|
|
1682
|
+
try:
|
|
1683
|
+
effective_providers = resolve_tracker_providers(effective_tracker)
|
|
1684
|
+
except ValueError as exc:
|
|
1685
|
+
print_cli_error(console, "Tracker 설정이 올바르지 않습니다.", details=str(exc))
|
|
1686
|
+
raise typer.Exit(2) from exc
|
|
1687
|
+
|
|
1649
1688
|
phoenix_dataset_name = phoenix_dataset
|
|
1650
1689
|
if phoenix_experiment and not phoenix_dataset_name:
|
|
1651
1690
|
phoenix_dataset_name = f"{ds.name}:{ds.version}"
|
|
1652
1691
|
|
|
1692
|
+
auto_phoenix_sync = "phoenix" in effective_providers and not stream
|
|
1693
|
+
if auto_phoenix_sync and not phoenix_dataset_name:
|
|
1694
|
+
phoenix_dataset_name = f"{ds.name}:{ds.version}"
|
|
1695
|
+
|
|
1653
1696
|
phoenix_dataset_description_value = phoenix_dataset_description
|
|
1654
1697
|
if phoenix_dataset_name and not phoenix_dataset_description_value:
|
|
1655
1698
|
desc_source = ds.metadata.get("description") if isinstance(ds.metadata, dict) else None
|
|
@@ -1659,13 +1702,23 @@ def register_run_commands(
|
|
|
1659
1702
|
phoenix_dataset_result: dict[str, Any] | None = None
|
|
1660
1703
|
phoenix_experiment_result: dict[str, Any] | None = None
|
|
1661
1704
|
|
|
1662
|
-
if phoenix_dataset_name or phoenix_experiment:
|
|
1705
|
+
if phoenix_dataset_name or phoenix_experiment or auto_phoenix_sync:
|
|
1663
1706
|
try:
|
|
1707
|
+
phoenix_endpoint = getattr(settings, "phoenix_endpoint", None)
|
|
1708
|
+
if not isinstance(phoenix_endpoint, str) or not phoenix_endpoint.strip():
|
|
1709
|
+
phoenix_endpoint = "http://localhost:6006/v1/traces"
|
|
1664
1710
|
phoenix_sync_service = PhoenixSyncService(
|
|
1665
|
-
endpoint=
|
|
1711
|
+
endpoint=phoenix_endpoint,
|
|
1666
1712
|
api_token=getattr(settings, "phoenix_api_token", None),
|
|
1667
1713
|
)
|
|
1668
1714
|
except PhoenixSyncError as exc:
|
|
1715
|
+
if auto_phoenix_sync:
|
|
1716
|
+
print_cli_error(
|
|
1717
|
+
console,
|
|
1718
|
+
"Phoenix Sync 서비스를 초기화할 수 없습니다.",
|
|
1719
|
+
details=str(exc),
|
|
1720
|
+
)
|
|
1721
|
+
raise typer.Exit(2) from exc
|
|
1669
1722
|
print_cli_warning(
|
|
1670
1723
|
console,
|
|
1671
1724
|
"Phoenix Sync 서비스를 초기화할 수 없습니다.",
|
|
@@ -1673,19 +1726,10 @@ def register_run_commands(
|
|
|
1673
1726
|
)
|
|
1674
1727
|
phoenix_sync_service = None
|
|
1675
1728
|
|
|
1676
|
-
effective_tracker = tracker
|
|
1677
|
-
if langfuse and tracker == "none" and not preset.default_tracker:
|
|
1678
|
-
effective_tracker = "langfuse"
|
|
1679
|
-
print_cli_warning(
|
|
1680
|
-
console,
|
|
1681
|
-
"--langfuse 플래그는 곧 제거됩니다.",
|
|
1682
|
-
tips=["대신 --tracker langfuse를 사용하세요."],
|
|
1683
|
-
)
|
|
1684
|
-
|
|
1685
1729
|
config_wants_phoenix = getattr(settings, "phoenix_enabled", False)
|
|
1686
1730
|
if not isinstance(config_wants_phoenix, bool):
|
|
1687
1731
|
config_wants_phoenix = False
|
|
1688
|
-
should_enable_phoenix =
|
|
1732
|
+
should_enable_phoenix = "phoenix" in effective_providers or config_wants_phoenix
|
|
1689
1733
|
if should_enable_phoenix:
|
|
1690
1734
|
ensure_phoenix_instrumentation(settings, console=console, force=True)
|
|
1691
1735
|
|
|
@@ -2032,6 +2076,9 @@ def register_run_commands(
|
|
|
2032
2076
|
)
|
|
2033
2077
|
if prompt_bundle:
|
|
2034
2078
|
result.tracker_metadata["prompt_set"] = build_prompt_summary(prompt_bundle)
|
|
2079
|
+
result.tracker_metadata["prompt_set_detail"] = prompt_bundle.to_dict(
|
|
2080
|
+
include_content=True
|
|
2081
|
+
)
|
|
2035
2082
|
|
|
2036
2083
|
if retriever_instance or used_versioned_prefill:
|
|
2037
2084
|
retriever_tracker_meta: dict[str, Any] = {
|
|
@@ -2105,13 +2152,29 @@ def register_run_commands(
|
|
|
2105
2152
|
)
|
|
2106
2153
|
console.print(f"[dim]View datasets: {dataset_info.url}[/dim]")
|
|
2107
2154
|
except PhoenixSyncError as exc:
|
|
2155
|
+
if auto_phoenix_sync:
|
|
2156
|
+
print_cli_error(
|
|
2157
|
+
console,
|
|
2158
|
+
"Phoenix Dataset 업로드에 실패했습니다.",
|
|
2159
|
+
details=str(exc),
|
|
2160
|
+
)
|
|
2161
|
+
raise typer.Exit(2) from exc
|
|
2108
2162
|
print_cli_warning(
|
|
2109
2163
|
console,
|
|
2110
2164
|
"Phoenix Dataset 업로드에 실패했습니다.",
|
|
2111
2165
|
tips=[str(exc)],
|
|
2112
2166
|
)
|
|
2167
|
+
if auto_phoenix_sync and not phoenix_experiment:
|
|
2168
|
+
phoenix_experiment = f"{result.model_name}-{result.run_id[:8]}"
|
|
2113
2169
|
if phoenix_experiment:
|
|
2114
2170
|
if not phoenix_dataset_result:
|
|
2171
|
+
if auto_phoenix_sync:
|
|
2172
|
+
print_cli_error(
|
|
2173
|
+
console,
|
|
2174
|
+
"Dataset 업로드에 실패해 Phoenix Experiment 생성을 진행할 수 없습니다.",
|
|
2175
|
+
details="Phoenix dataset 업로드가 필요합니다.",
|
|
2176
|
+
)
|
|
2177
|
+
raise typer.Exit(2)
|
|
2115
2178
|
print_cli_warning(
|
|
2116
2179
|
console,
|
|
2117
2180
|
"Dataset 업로드에 실패해 Phoenix Experiment 생성을 건너뜁니다.",
|
|
@@ -2169,6 +2232,41 @@ def register_run_commands(
|
|
|
2169
2232
|
phoenix_meta = result.tracker_metadata.setdefault("phoenix", {})
|
|
2170
2233
|
phoenix_meta.setdefault("schema_version", 2)
|
|
2171
2234
|
phoenix_meta["prompts"] = prompt_metadata_entries
|
|
2235
|
+
if phoenix_sync_service and "phoenix" in effective_providers:
|
|
2236
|
+
try:
|
|
2237
|
+
prompt_set_summary = result.tracker_metadata.get("prompt_set") or {}
|
|
2238
|
+
prompt_set_name = prompt_set_summary.get("prompt_set_name")
|
|
2239
|
+
prompt_entries = list(prompt_metadata_entries)
|
|
2240
|
+
prompt_set_detail = result.tracker_metadata.get("prompt_set_detail")
|
|
2241
|
+
if isinstance(prompt_set_detail, dict):
|
|
2242
|
+
for item in prompt_set_detail.get("items", []):
|
|
2243
|
+
prompt = item.get("prompt") or {}
|
|
2244
|
+
if not isinstance(prompt, dict):
|
|
2245
|
+
continue
|
|
2246
|
+
prompt_entries.append(
|
|
2247
|
+
{
|
|
2248
|
+
"name": prompt.get("name"),
|
|
2249
|
+
"role": item.get("role"),
|
|
2250
|
+
"kind": prompt.get("kind"),
|
|
2251
|
+
"checksum": prompt.get("checksum"),
|
|
2252
|
+
"content": prompt.get("content"),
|
|
2253
|
+
"source": prompt.get("source"),
|
|
2254
|
+
}
|
|
2255
|
+
)
|
|
2256
|
+
synced = phoenix_sync_service.sync_prompts(
|
|
2257
|
+
prompt_entries=prompt_entries,
|
|
2258
|
+
model_name=result.model_name,
|
|
2259
|
+
model_provider=_infer_phoenix_model_provider(result.model_name),
|
|
2260
|
+
prompt_set_name=prompt_set_name,
|
|
2261
|
+
)
|
|
2262
|
+
if synced:
|
|
2263
|
+
phoenix_meta["prompts"] = synced
|
|
2264
|
+
except PhoenixSyncError as exc:
|
|
2265
|
+
print_cli_warning(
|
|
2266
|
+
console,
|
|
2267
|
+
"Phoenix Prompt 동기화에 실패했습니다.",
|
|
2268
|
+
tips=[str(exc)],
|
|
2269
|
+
)
|
|
2172
2270
|
|
|
2173
2271
|
if stage_events or stage_store:
|
|
2174
2272
|
stage_event_builder = StageEventBuilder()
|
|
@@ -2187,7 +2285,7 @@ def register_run_commands(
|
|
|
2187
2285
|
|
|
2188
2286
|
if effective_tracker != "none":
|
|
2189
2287
|
phoenix_opts = None
|
|
2190
|
-
if
|
|
2288
|
+
if "phoenix" in effective_providers:
|
|
2191
2289
|
phoenix_opts = {
|
|
2192
2290
|
"max_traces": phoenix_max_traces,
|
|
2193
2291
|
"metadata": phoenix_trace_metadata or None,
|
|
@@ -2198,7 +2296,7 @@ def register_run_commands(
|
|
|
2198
2296
|
verbose,
|
|
2199
2297
|
f"Tracker 로깅 시작 ({effective_tracker})",
|
|
2200
2298
|
)
|
|
2201
|
-
|
|
2299
|
+
_log_to_trackers(
|
|
2202
2300
|
settings,
|
|
2203
2301
|
result,
|
|
2204
2302
|
console,
|
|
@@ -2210,6 +2308,7 @@ def register_run_commands(
|
|
|
2210
2308
|
db_started_at = datetime.now()
|
|
2211
2309
|
_log_timestamp(console, verbose, "DB 저장 시작")
|
|
2212
2310
|
_save_to_db(
|
|
2311
|
+
settings,
|
|
2213
2312
|
db_path,
|
|
2214
2313
|
result,
|
|
2215
2314
|
console,
|
|
@@ -2276,6 +2375,12 @@ def register_run_commands(
|
|
|
2276
2375
|
pipeline_result,
|
|
2277
2376
|
artifacts_dir=artifacts_dir,
|
|
2278
2377
|
)
|
|
2378
|
+
result.tracker_metadata["analysis_artifacts"] = {
|
|
2379
|
+
"dir": artifact_index.get("dir"),
|
|
2380
|
+
"index": artifact_index.get("index"),
|
|
2381
|
+
"output": str(analysis_output_path),
|
|
2382
|
+
"report": str(analysis_report_path),
|
|
2383
|
+
}
|
|
2279
2384
|
payload = serialize_pipeline_result(pipeline_result)
|
|
2280
2385
|
payload["run_id"] = result.run_id
|
|
2281
2386
|
payload["artifacts"] = artifact_index
|
|
@@ -2292,6 +2397,18 @@ def register_run_commands(
|
|
|
2292
2397
|
"[green]자동 분석 상세 결과 저장:[/green] "
|
|
2293
2398
|
f"{artifact_index['dir']} (index: {artifact_index['index']})\n"
|
|
2294
2399
|
)
|
|
2400
|
+
if effective_tracker != "none":
|
|
2401
|
+
_log_analysis_artifacts(
|
|
2402
|
+
settings,
|
|
2403
|
+
result,
|
|
2404
|
+
console,
|
|
2405
|
+
effective_tracker,
|
|
2406
|
+
analysis_payload=payload,
|
|
2407
|
+
artifact_index=artifact_index,
|
|
2408
|
+
report_text=report_text,
|
|
2409
|
+
output_path=analysis_output_path,
|
|
2410
|
+
report_path=analysis_report_path,
|
|
2411
|
+
)
|
|
2295
2412
|
|
|
2296
2413
|
@app.command(
|
|
2297
2414
|
name="run-simple",
|
|
@@ -2395,10 +2512,13 @@ def register_run_commands(
|
|
|
2395
2512
|
help="Store stage events in the SQLite database (requires --db).",
|
|
2396
2513
|
),
|
|
2397
2514
|
tracker: str = typer.Option(
|
|
2398
|
-
"
|
|
2515
|
+
"mlflow+phoenix",
|
|
2399
2516
|
"--tracker",
|
|
2400
2517
|
"-t",
|
|
2401
|
-
help=
|
|
2518
|
+
help=(
|
|
2519
|
+
"Tracker to log results: 'langfuse', 'mlflow', 'phoenix', 'none', "
|
|
2520
|
+
"or combinations like 'mlflow+phoenix'."
|
|
2521
|
+
),
|
|
2402
2522
|
),
|
|
2403
2523
|
langfuse: bool = typer.Option(
|
|
2404
2524
|
False,
|
|
@@ -2687,10 +2807,13 @@ def register_run_commands(
|
|
|
2687
2807
|
help="Store stage events in the SQLite database (requires --db).",
|
|
2688
2808
|
),
|
|
2689
2809
|
tracker: str = typer.Option(
|
|
2690
|
-
"
|
|
2810
|
+
"mlflow+phoenix",
|
|
2691
2811
|
"--tracker",
|
|
2692
2812
|
"-t",
|
|
2693
|
-
help=
|
|
2813
|
+
help=(
|
|
2814
|
+
"Tracker to log results: 'langfuse', 'mlflow', 'phoenix', 'none', "
|
|
2815
|
+
"or combinations like 'mlflow+phoenix'."
|
|
2816
|
+
),
|
|
2694
2817
|
),
|
|
2695
2818
|
langfuse: bool = typer.Option(
|
|
2696
2819
|
False,
|