evalvault 1.76.0__py3-none-any.whl → 1.77.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +24 -1
- evalvault/adapters/inbound/api/main.py +2 -0
- evalvault/adapters/inbound/cli/app.py +3 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +6 -1
- evalvault/adapters/inbound/cli/commands/method.py +1 -1
- evalvault/adapters/inbound/cli/commands/run.py +9 -4
- evalvault/adapters/inbound/cli/commands/run_helpers.py +18 -16
- evalvault/adapters/outbound/analysis/llm_report_module.py +515 -33
- evalvault/adapters/outbound/phoenix/sync_service.py +1 -1
- evalvault/adapters/outbound/report/markdown_adapter.py +92 -0
- evalvault/adapters/outbound/storage/factory.py +1 -4
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +25 -8
- evalvault/config/runtime_services.py +122 -0
- {evalvault-1.76.0.dist-info → evalvault-1.77.0.dist-info}/METADATA +2 -1
- {evalvault-1.76.0.dist-info → evalvault-1.77.0.dist-info}/RECORD +18 -17
- {evalvault-1.76.0.dist-info → evalvault-1.77.0.dist-info}/WHEEL +0 -0
- {evalvault-1.76.0.dist-info → evalvault-1.77.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.76.0.dist-info → evalvault-1.77.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -567,11 +567,31 @@ class WebUIAdapter:
|
|
|
567
567
|
|
|
568
568
|
# 2. 진행률 초기화
|
|
569
569
|
start_time = time.monotonic()
|
|
570
|
+
total_cases = len(dataset.test_cases)
|
|
571
|
+
|
|
572
|
+
def emit_progress(message: str, *, status: str = "running") -> None:
|
|
573
|
+
if not on_progress:
|
|
574
|
+
return
|
|
575
|
+
elapsed = time.monotonic() - start_time
|
|
576
|
+
rate = (total_cases / elapsed) if total_cases > 0 and elapsed > 0 else None
|
|
577
|
+
on_progress(
|
|
578
|
+
EvalProgress(
|
|
579
|
+
current=total_cases,
|
|
580
|
+
total=total_cases,
|
|
581
|
+
current_metric=message,
|
|
582
|
+
percent=100.0 if total_cases > 0 else 0.0,
|
|
583
|
+
status=status,
|
|
584
|
+
elapsed_seconds=elapsed,
|
|
585
|
+
eta_seconds=0.0,
|
|
586
|
+
rate=rate,
|
|
587
|
+
)
|
|
588
|
+
)
|
|
589
|
+
|
|
570
590
|
if on_progress:
|
|
571
591
|
on_progress(
|
|
572
592
|
EvalProgress(
|
|
573
593
|
current=0,
|
|
574
|
-
total=
|
|
594
|
+
total=total_cases,
|
|
575
595
|
current_metric="",
|
|
576
596
|
percent=0.0,
|
|
577
597
|
status="running",
|
|
@@ -728,6 +748,7 @@ class WebUIAdapter:
|
|
|
728
748
|
)
|
|
729
749
|
|
|
730
750
|
if trackers:
|
|
751
|
+
emit_progress("Logging trackers...", status="finalizing")
|
|
731
752
|
result.tracker_metadata.setdefault("tracker_providers", tracker_providers)
|
|
732
753
|
for provider, tracker in trackers:
|
|
733
754
|
try:
|
|
@@ -750,6 +771,7 @@ class WebUIAdapter:
|
|
|
750
771
|
raise RuntimeError(f"Tracker logging failed for {provider}: {exc}") from exc
|
|
751
772
|
|
|
752
773
|
if stage_store and self._storage and hasattr(self._storage, "save_stage_events"):
|
|
774
|
+
emit_progress("Storing stage events...", status="finalizing")
|
|
753
775
|
try:
|
|
754
776
|
prompt_metadata_entries = self._build_prompt_metadata_entries(prompt_bundle)
|
|
755
777
|
stage_event_builder = StageEventBuilder()
|
|
@@ -791,6 +813,7 @@ class WebUIAdapter:
|
|
|
791
813
|
|
|
792
814
|
# 5. 결과 저장
|
|
793
815
|
if self._storage:
|
|
816
|
+
emit_progress("Saving evaluation run...", status="finalizing")
|
|
794
817
|
logger.info(f"Saving evaluation run: {result.run_id}")
|
|
795
818
|
if prompt_bundle:
|
|
796
819
|
self._storage.save_prompt_set(prompt_bundle)
|
|
@@ -15,6 +15,7 @@ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
|
15
15
|
from starlette.responses import JSONResponse
|
|
16
16
|
|
|
17
17
|
from evalvault.adapters.inbound.api.adapter import WebUIAdapter, create_adapter
|
|
18
|
+
from evalvault.config.runtime_services import ensure_local_observability
|
|
18
19
|
from evalvault.config.settings import Settings, get_settings, is_production_profile
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
@@ -63,6 +64,7 @@ async def lifespan(app: FastAPI):
|
|
|
63
64
|
# Startup: Initialize adapter
|
|
64
65
|
adapter = create_adapter()
|
|
65
66
|
app.state.adapter = adapter
|
|
67
|
+
ensure_local_observability(get_settings())
|
|
66
68
|
try:
|
|
67
69
|
from evalvault.adapters.inbound.api.routers.chat import warm_rag_index
|
|
68
70
|
|
|
@@ -14,6 +14,8 @@ import typer
|
|
|
14
14
|
from rich import print as rprint
|
|
15
15
|
from rich.console import Console
|
|
16
16
|
|
|
17
|
+
from evalvault.config.runtime_services import ensure_local_observability
|
|
18
|
+
from evalvault.config.settings import get_settings
|
|
17
19
|
from evalvault.domain.metrics.registry import list_metric_names
|
|
18
20
|
|
|
19
21
|
from .commands import attach_sub_apps, register_all_commands
|
|
@@ -61,6 +63,7 @@ def main(
|
|
|
61
63
|
),
|
|
62
64
|
) -> None:
|
|
63
65
|
"""EvalVault - RAG evaluation system."""
|
|
66
|
+
ensure_local_observability(get_settings())
|
|
64
67
|
|
|
65
68
|
|
|
66
69
|
if __name__ == "__main__": # pragma: no cover
|
|
@@ -358,6 +358,11 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
358
358
|
profile: str | None = profile_option(
|
|
359
359
|
help_text="비교 리포트용 LLM 프로필 (dev, prod, openai)",
|
|
360
360
|
),
|
|
361
|
+
use_llm_report: bool = typer.Option(
|
|
362
|
+
True,
|
|
363
|
+
"--use-llm-report/--no-llm-report",
|
|
364
|
+
help="LLM 보고서 사용 여부",
|
|
365
|
+
),
|
|
361
366
|
) -> None:
|
|
362
367
|
"""두 실행을 통계적으로 비교합니다."""
|
|
363
368
|
|
|
@@ -461,7 +466,7 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
461
466
|
compare_metrics=metric_list,
|
|
462
467
|
test_type=test,
|
|
463
468
|
report_type="comparison",
|
|
464
|
-
use_llm_report=
|
|
469
|
+
use_llm_report=use_llm_report,
|
|
465
470
|
)
|
|
466
471
|
|
|
467
472
|
artifacts_dir = resolve_artifact_dir(
|
|
@@ -875,7 +875,7 @@ def register_run_commands(
|
|
|
875
875
|
if profile_name:
|
|
876
876
|
settings = apply_profile(settings, profile_name)
|
|
877
877
|
|
|
878
|
-
if db_path is None:
|
|
878
|
+
if db_path is None and settings.db_backend == "sqlite":
|
|
879
879
|
db_path = Path(settings.evalvault_db_path)
|
|
880
880
|
|
|
881
881
|
excel_output: Path | None = None
|
|
@@ -1221,6 +1221,7 @@ def register_run_commands(
|
|
|
1221
1221
|
for turn in conversation.turn_results:
|
|
1222
1222
|
turn_results.append(turn)
|
|
1223
1223
|
_save_multiturn_to_db(
|
|
1224
|
+
settings,
|
|
1224
1225
|
db_path,
|
|
1225
1226
|
run_record,
|
|
1226
1227
|
conversation_records,
|
|
@@ -1670,7 +1671,7 @@ def register_run_commands(
|
|
|
1670
1671
|
raise typer.Exit(2) from exc
|
|
1671
1672
|
|
|
1672
1673
|
effective_tracker = tracker
|
|
1673
|
-
if langfuse
|
|
1674
|
+
if langfuse:
|
|
1674
1675
|
effective_tracker = "langfuse"
|
|
1675
1676
|
print_cli_warning(
|
|
1676
1677
|
console,
|
|
@@ -1688,7 +1689,7 @@ def register_run_commands(
|
|
|
1688
1689
|
if phoenix_experiment and not phoenix_dataset_name:
|
|
1689
1690
|
phoenix_dataset_name = f"{ds.name}:{ds.version}"
|
|
1690
1691
|
|
|
1691
|
-
auto_phoenix_sync = "phoenix" in effective_providers
|
|
1692
|
+
auto_phoenix_sync = "phoenix" in effective_providers and not stream
|
|
1692
1693
|
if auto_phoenix_sync and not phoenix_dataset_name:
|
|
1693
1694
|
phoenix_dataset_name = f"{ds.name}:{ds.version}"
|
|
1694
1695
|
|
|
@@ -1703,8 +1704,11 @@ def register_run_commands(
|
|
|
1703
1704
|
|
|
1704
1705
|
if phoenix_dataset_name or phoenix_experiment or auto_phoenix_sync:
|
|
1705
1706
|
try:
|
|
1707
|
+
phoenix_endpoint = getattr(settings, "phoenix_endpoint", None)
|
|
1708
|
+
if not isinstance(phoenix_endpoint, str) or not phoenix_endpoint.strip():
|
|
1709
|
+
phoenix_endpoint = "http://localhost:6006/v1/traces"
|
|
1706
1710
|
phoenix_sync_service = PhoenixSyncService(
|
|
1707
|
-
endpoint=
|
|
1711
|
+
endpoint=phoenix_endpoint,
|
|
1708
1712
|
api_token=getattr(settings, "phoenix_api_token", None),
|
|
1709
1713
|
)
|
|
1710
1714
|
except PhoenixSyncError as exc:
|
|
@@ -2304,6 +2308,7 @@ def register_run_commands(
|
|
|
2304
2308
|
db_started_at = datetime.now()
|
|
2305
2309
|
_log_timestamp(console, verbose, "DB 저장 시작")
|
|
2306
2310
|
_save_to_db(
|
|
2311
|
+
settings,
|
|
2307
2312
|
db_path,
|
|
2308
2313
|
result,
|
|
2309
2314
|
console,
|
|
@@ -345,20 +345,18 @@ def _get_tracker(
|
|
|
345
345
|
)
|
|
346
346
|
|
|
347
347
|
elif tracker_type == "mlflow":
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
print_cli_warning(console, message + " 로깅을 건너뜁니다.", tips=tips)
|
|
355
|
-
return None
|
|
348
|
+
tracking_uri = getattr(settings, "mlflow_tracking_uri", None)
|
|
349
|
+
if not isinstance(tracking_uri, str) or not tracking_uri.strip():
|
|
350
|
+
tracking_uri = f"sqlite:///{Path.cwd() / 'mlruns.db'}"
|
|
351
|
+
experiment_name = getattr(settings, "mlflow_experiment_name", None)
|
|
352
|
+
if not isinstance(experiment_name, str) or not experiment_name.strip():
|
|
353
|
+
experiment_name = "evalvault"
|
|
356
354
|
try:
|
|
357
355
|
from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
|
|
358
356
|
|
|
359
357
|
return MLflowAdapter(
|
|
360
|
-
tracking_uri=
|
|
361
|
-
experiment_name=
|
|
358
|
+
tracking_uri=tracking_uri,
|
|
359
|
+
experiment_name=experiment_name,
|
|
362
360
|
)
|
|
363
361
|
except ImportError:
|
|
364
362
|
message = "MLflow extra가 설치되지 않았습니다."
|
|
@@ -373,8 +371,11 @@ def _get_tracker(
|
|
|
373
371
|
try:
|
|
374
372
|
from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter
|
|
375
373
|
|
|
374
|
+
endpoint = getattr(settings, "phoenix_endpoint", None)
|
|
375
|
+
if not isinstance(endpoint, str) or not endpoint.strip():
|
|
376
|
+
endpoint = "http://localhost:6006/v1/traces"
|
|
376
377
|
return PhoenixAdapter(
|
|
377
|
-
endpoint=
|
|
378
|
+
endpoint=endpoint,
|
|
378
379
|
service_name="evalvault",
|
|
379
380
|
project_name=getattr(settings, "phoenix_project_name", None),
|
|
380
381
|
annotations_enabled=getattr(settings, "phoenix_annotations_enabled", True),
|
|
@@ -407,9 +408,6 @@ def _resolve_tracker_list(tracker_type: str) -> list[str]:
|
|
|
407
408
|
unknown = [entry for entry in providers if entry not in supported]
|
|
408
409
|
if unknown:
|
|
409
410
|
raise ValueError(f"Unknown tracker provider(s): {', '.join(unknown)}")
|
|
410
|
-
required = {"mlflow", "phoenix"}
|
|
411
|
-
if not required.issubset(set(providers)):
|
|
412
|
-
raise ValueError("tracker must include both 'mlflow' and 'phoenix'")
|
|
413
411
|
return providers
|
|
414
412
|
|
|
415
413
|
|
|
@@ -555,6 +553,7 @@ def _log_analysis_artifacts(
|
|
|
555
553
|
|
|
556
554
|
|
|
557
555
|
def _save_to_db(
|
|
556
|
+
settings: Settings,
|
|
558
557
|
db_path: Path | None,
|
|
559
558
|
result,
|
|
560
559
|
console: Console,
|
|
@@ -563,7 +562,7 @@ def _save_to_db(
|
|
|
563
562
|
export_excel: bool = True,
|
|
564
563
|
) -> None:
|
|
565
564
|
"""Persist evaluation run (and optional prompt set) to database."""
|
|
566
|
-
storage = build_storage_adapter(settings=
|
|
565
|
+
storage = build_storage_adapter(settings=settings, db_path=db_path)
|
|
567
566
|
storage_label = (
|
|
568
567
|
"PostgreSQL" if isinstance(storage, PostgreSQLStorageAdapter) else f"SQLite ({db_path})"
|
|
569
568
|
)
|
|
@@ -607,6 +606,7 @@ def _save_to_db(
|
|
|
607
606
|
|
|
608
607
|
|
|
609
608
|
def _save_multiturn_to_db(
|
|
609
|
+
settings: Settings,
|
|
610
610
|
db_path: Path | None,
|
|
611
611
|
run_record: MultiTurnRunRecord,
|
|
612
612
|
conversations: list[MultiTurnConversationRecord],
|
|
@@ -618,7 +618,7 @@ def _save_multiturn_to_db(
|
|
|
618
618
|
metric_thresholds: dict[str, float] | None = None,
|
|
619
619
|
) -> None:
|
|
620
620
|
"""Persist multiturn evaluation run to database."""
|
|
621
|
-
storage = build_storage_adapter(settings=
|
|
621
|
+
storage = build_storage_adapter(settings=settings, db_path=db_path)
|
|
622
622
|
storage_label = (
|
|
623
623
|
"PostgreSQL" if isinstance(storage, PostgreSQLStorageAdapter) else f"SQLite ({db_path})"
|
|
624
624
|
)
|
|
@@ -840,6 +840,8 @@ def log_phoenix_traces(
|
|
|
840
840
|
return 0
|
|
841
841
|
|
|
842
842
|
limit = max_traces if max_traces is not None else run.total_test_cases
|
|
843
|
+
if not isinstance(limit, int):
|
|
844
|
+
limit = None
|
|
843
845
|
|
|
844
846
|
count = 0
|
|
845
847
|
for result in run.results:
|