evalvault-1.76.0-py3-none-any.whl → evalvault-1.77.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
@@ -567,11 +567,31 @@ class WebUIAdapter:
 
         # 2. 진행률 초기화
         start_time = time.monotonic()
+        total_cases = len(dataset.test_cases)
+
+        def emit_progress(message: str, *, status: str = "running") -> None:
+            if not on_progress:
+                return
+            elapsed = time.monotonic() - start_time
+            rate = (total_cases / elapsed) if total_cases > 0 and elapsed > 0 else None
+            on_progress(
+                EvalProgress(
+                    current=total_cases,
+                    total=total_cases,
+                    current_metric=message,
+                    percent=100.0 if total_cases > 0 else 0.0,
+                    status=status,
+                    elapsed_seconds=elapsed,
+                    eta_seconds=0.0,
+                    rate=rate,
+                )
+            )
+
         if on_progress:
             on_progress(
                 EvalProgress(
                     current=0,
-                    total=len(dataset.test_cases),
+                    total=total_cases,
                     current_metric="",
                     percent=0.0,
                     status="running",
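
The new emit_progress closure reports a fully populated EvalProgress (current equal to total, percent 100, status supplied by the caller), so the wrap-up steps after the per-case loop can still be surfaced to the UI. A consumer only needs the fields visible in this hunk. Below is a minimal sketch of such an on_progress callback, not part of the diff; the stand-in dataclass mirrors only the fields shown above and is not the package's real EvalProgress class:

from dataclasses import dataclass


@dataclass
class EvalProgressStandIn:
    # Fields taken from the hunk above; the real class may carry more.
    current: int
    total: int
    current_metric: str
    percent: float
    status: str
    elapsed_seconds: float
    eta_seconds: float
    rate: float | None


def print_progress(progress: EvalProgressStandIn) -> None:
    """Example on_progress callback: render one line per update."""
    rate = f"{progress.rate:.2f} cases/s" if progress.rate else "n/a"
    print(
        f"[{progress.status}] {progress.percent:5.1f}% "
        f"({progress.current}/{progress.total}) "
        f"{progress.current_metric or 'running metrics'} "
        f"elapsed={progress.elapsed_seconds:.1f}s rate={rate}"
    )
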
@@ -728,6 +748,7 @@ class WebUIAdapter:
         )
 
         if trackers:
+            emit_progress("Logging trackers...", status="finalizing")
             result.tracker_metadata.setdefault("tracker_providers", tracker_providers)
             for provider, tracker in trackers:
                 try:
@@ -750,6 +771,7 @@ class WebUIAdapter:
                     raise RuntimeError(f"Tracker logging failed for {provider}: {exc}") from exc
 
         if stage_store and self._storage and hasattr(self._storage, "save_stage_events"):
+            emit_progress("Storing stage events...", status="finalizing")
             try:
                 prompt_metadata_entries = self._build_prompt_metadata_entries(prompt_bundle)
                 stage_event_builder = StageEventBuilder()
@@ -791,6 +813,7 @@ class WebUIAdapter:
 
         # 5. 결과 저장
        if self._storage:
+            emit_progress("Saving evaluation run...", status="finalizing")
             logger.info(f"Saving evaluation run: {result.run_id}")
             if prompt_bundle:
                 self._storage.save_prompt_set(prompt_bundle)
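
Taken together, the three hunks above report the post-evaluation steps (tracker logging, stage-event storage, run persistence) as progress updates with status "finalizing" and a human-readable message in current_metric. A callback can key off that status to separate wrap-up work from metric progress; a small duck-typed sketch, again not part of the diff:

def print_finalizing_only(progress) -> None:
    """Surface only the wrap-up messages emitted via emit_progress
    ("Logging trackers...", "Storing stage events...", "Saving evaluation run...")."""
    if getattr(progress, "status", None) != "finalizing":
        return
    print(f"finalizing: {progress.current_metric} (elapsed {progress.elapsed_seconds:.1f}s)")
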
@@ -15,6 +15,7 @@ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
 from starlette.responses import JSONResponse
 
 from evalvault.adapters.inbound.api.adapter import WebUIAdapter, create_adapter
+from evalvault.config.runtime_services import ensure_local_observability
 from evalvault.config.settings import Settings, get_settings, is_production_profile
 
 logger = logging.getLogger(__name__)
@@ -63,6 +64,7 @@ async def lifespan(app: FastAPI):
     # Startup: Initialize adapter
     adapter = create_adapter()
     app.state.adapter = adapter
+    ensure_local_observability(get_settings())
     try:
         from evalvault.adapters.inbound.api.routers.chat import warm_rag_index
 
@@ -14,6 +14,8 @@ import typer
 from rich import print as rprint
 from rich.console import Console
 
+from evalvault.config.runtime_services import ensure_local_observability
+from evalvault.config.settings import get_settings
 from evalvault.domain.metrics.registry import list_metric_names
 
 from .commands import attach_sub_apps, register_all_commands
@@ -61,6 +63,7 @@ def main(
     ),
 ) -> None:
     """EvalVault - RAG evaluation system."""
+    ensure_local_observability(get_settings())
 
 
 if __name__ == "__main__":  # pragma: no cover
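
The two hunks above, together with the API hunks before them, wire the same startup hook into both entrypoints: the FastAPI lifespan and the Typer root callback each call ensure_local_observability(get_settings()) once when the process starts. The hook's body is not part of this diff, so the sketch below uses no-op stand-ins and only illustrates the shared-startup shape:

from contextlib import asynccontextmanager

import typer
from fastapi import FastAPI


def get_settings_stub() -> dict:
    # Stand-in for evalvault.config.settings.get_settings().
    return {}


def ensure_local_observability_stub(settings: dict) -> None:
    """No-op stand-in; the real hook lives in evalvault.config.runtime_services."""


@asynccontextmanager
async def lifespan(app: FastAPI):
    ensure_local_observability_stub(get_settings_stub())  # API process startup
    yield


app = FastAPI(lifespan=lifespan)

cli = typer.Typer()


@cli.callback()
def main() -> None:
    ensure_local_observability_stub(get_settings_stub())  # CLI process startup
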
@@ -358,6 +358,11 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
         profile: str | None = profile_option(
             help_text="비교 리포트용 LLM 프로필 (dev, prod, openai)",
         ),
+        use_llm_report: bool = typer.Option(
+            True,
+            "--use-llm-report/--no-llm-report",
+            help="LLM 보고서 사용 여부",
+        ),
     ) -> None:
         """두 실행을 통계적으로 비교합니다."""
 
@@ -461,7 +466,7 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
             compare_metrics=metric_list,
             test_type=test,
             report_type="comparison",
-            use_llm_report=True,
+            use_llm_report=use_llm_report,
         )
 
         artifacts_dir = resolve_artifact_dir(
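
The two hunks above add an opt-out for LLM-generated comparison reports: a boolean Typer flag that defaults to True and is threaded into the report configuration in place of the previously hard-coded use_llm_report=True. A self-contained sketch of the same flag idiom; the command name here is illustrative, not the actual evalvault CLI surface:

import typer

app = typer.Typer()


@app.command()
def compare(
    # Same Typer pattern as the diff: default True plus a
    # "--use-llm-report/--no-llm-report" switch pair to opt out.
    use_llm_report: bool = typer.Option(
        True,
        "--use-llm-report/--no-llm-report",
        help="Whether to generate an LLM-written comparison report.",
    ),
) -> None:
    typer.echo(f"LLM report enabled: {use_llm_report}")


if __name__ == "__main__":
    app()
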
@@ -424,7 +424,7 @@ def create_method_app(console: Console) -> typer.Typer:
         if eval_output:
             _save_results(eval_output, result, console)
 
-        _save_to_db(db_path, result, console)
+        _save_to_db(settings, db_path, result, console)
 
     return method_app
 
@@ -875,7 +875,7 @@ def register_run_commands(
         if profile_name:
             settings = apply_profile(settings, profile_name)
 
-        if db_path is None:
+        if db_path is None and settings.db_backend == "sqlite":
             db_path = Path(settings.evalvault_db_path)
 
         excel_output: Path | None = None
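
The two hunks above thread the resolved settings into _save_to_db and only fall back to the default SQLite file when SQLite is actually the configured backend, so a PostgreSQL-backed run no longer inherits a spurious local .db path. A small sketch of that resolution logic, using the setting names visible in the hunk:

from pathlib import Path


def resolve_db_path(db_path: Path | None, db_backend: str, sqlite_path: str) -> Path | None:
    """Fill in the SQLite default only when SQLite is the active backend."""
    if db_path is None and db_backend == "sqlite":
        return Path(sqlite_path)
    return db_path  # an explicit path, or a non-SQLite backend, is left untouched


# resolve_db_path(None, "sqlite", "evalvault.db") -> Path("evalvault.db")
# resolve_db_path(None, "postgresql", "evalvault.db") -> None
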
@@ -1221,6 +1221,7 @@ def register_run_commands(
             for turn in conversation.turn_results:
                 turn_results.append(turn)
         _save_multiturn_to_db(
+            settings,
             db_path,
             run_record,
             conversation_records,
@@ -1670,7 +1671,7 @@ def register_run_commands(
             raise typer.Exit(2) from exc
 
         effective_tracker = tracker
-        if langfuse and tracker == "none" and not preset.default_tracker:
+        if langfuse:
             effective_tracker = "langfuse"
             print_cli_warning(
                 console,
@@ -1688,7 +1689,7 @@ def register_run_commands(
         if phoenix_experiment and not phoenix_dataset_name:
             phoenix_dataset_name = f"{ds.name}:{ds.version}"
 
-        auto_phoenix_sync = "phoenix" in effective_providers
+        auto_phoenix_sync = "phoenix" in effective_providers and not stream
         if auto_phoenix_sync and not phoenix_dataset_name:
             phoenix_dataset_name = f"{ds.name}:{ds.version}"
 
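
The two hunks above adjust tracker selection: --langfuse now forces the effective tracker to langfuse unconditionally (with a CLI warning), instead of only when no tracker or preset default was set, and automatic Phoenix dataset sync is skipped for streaming runs. A compact sketch of the new decision logic, not part of the diff:

def effective_tracker_sketch(tracker: str, langfuse: bool) -> str:
    """1.77.0 behaviour: --langfuse wins over any explicit or preset tracker."""
    return "langfuse" if langfuse else tracker


def auto_phoenix_sync_sketch(providers: list[str], stream: bool) -> bool:
    """Phoenix auto-sync only applies to non-streaming runs with the phoenix provider."""
    return "phoenix" in providers and not stream
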
@@ -1703,8 +1704,11 @@ def register_run_commands(
 
         if phoenix_dataset_name or phoenix_experiment or auto_phoenix_sync:
             try:
+                phoenix_endpoint = getattr(settings, "phoenix_endpoint", None)
+                if not isinstance(phoenix_endpoint, str) or not phoenix_endpoint.strip():
+                    phoenix_endpoint = "http://localhost:6006/v1/traces"
                 phoenix_sync_service = PhoenixSyncService(
-                    endpoint=settings.phoenix_endpoint,
+                    endpoint=phoenix_endpoint,
                     api_token=getattr(settings, "phoenix_api_token", None),
                 )
             except PhoenixSyncError as exc:
@@ -2304,6 +2308,7 @@ def register_run_commands(
         db_started_at = datetime.now()
         _log_timestamp(console, verbose, "DB 저장 시작")
         _save_to_db(
+            settings,
             db_path,
             result,
             console,
@@ -345,20 +345,18 @@ def _get_tracker(
         )
 
     elif tracker_type == "mlflow":
-        if not settings.mlflow_tracking_uri:
-            message = "MLflow tracking URI가 설정되지 않았습니다."
-            tips = ["MLFLOW_TRACKING_URI 환경 변수를 설정하세요."]
-            if required:
-                print_cli_error(console, message, fixes=tips)
-                raise typer.Exit(2)
-            print_cli_warning(console, message + " 로깅을 건너뜁니다.", tips=tips)
-            return None
+        tracking_uri = getattr(settings, "mlflow_tracking_uri", None)
+        if not isinstance(tracking_uri, str) or not tracking_uri.strip():
+            tracking_uri = f"sqlite:///{Path.cwd() / 'mlruns.db'}"
+        experiment_name = getattr(settings, "mlflow_experiment_name", None)
+        if not isinstance(experiment_name, str) or not experiment_name.strip():
+            experiment_name = "evalvault"
         try:
             from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
 
             return MLflowAdapter(
-                tracking_uri=settings.mlflow_tracking_uri,
-                experiment_name=settings.mlflow_experiment_name,
+                tracking_uri=tracking_uri,
+                experiment_name=experiment_name,
             )
         except ImportError:
             message = "MLflow extra가 설치되지 않았습니다."
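
The MLflow branch above no longer aborts (or skips logging) when no tracking URI is configured; it falls back to a local SQLite tracking store in the current working directory and an "evalvault" experiment name. A self-contained sketch of the same fallback, with a plain object standing in for the real Settings:

from pathlib import Path


def resolve_mlflow_target(settings: object) -> tuple[str, str]:
    """Return (tracking_uri, experiment_name), defaulting to local values."""
    tracking_uri = getattr(settings, "mlflow_tracking_uri", None)
    if not isinstance(tracking_uri, str) or not tracking_uri.strip():
        # Same default as the hunk: a SQLite file under the working directory.
        tracking_uri = f"sqlite:///{Path.cwd() / 'mlruns.db'}"
    experiment_name = getattr(settings, "mlflow_experiment_name", None)
    if not isinstance(experiment_name, str) or not experiment_name.strip():
        experiment_name = "evalvault"
    return tracking_uri, experiment_name


# With an empty settings object both defaults apply:
# resolve_mlflow_target(object()) -> ("sqlite:///<cwd>/mlruns.db", "evalvault")
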
@@ -373,8 +371,11 @@ def _get_tracker(
         try:
             from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter
 
+            endpoint = getattr(settings, "phoenix_endpoint", None)
+            if not isinstance(endpoint, str) or not endpoint.strip():
+                endpoint = "http://localhost:6006/v1/traces"
             return PhoenixAdapter(
-                endpoint=settings.phoenix_endpoint,
+                endpoint=endpoint,
                 service_name="evalvault",
                 project_name=getattr(settings, "phoenix_project_name", None),
                 annotations_enabled=getattr(settings, "phoenix_annotations_enabled", True),
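
The same endpoint fallback now appears twice in this release: in the Phoenix sync setup earlier and in the Phoenix tracker branch above. In both places a missing or blank settings.phoenix_endpoint defaults to the local Phoenix collector at http://localhost:6006/v1/traces. A sketch of that pattern as a reusable helper; the helper itself is hypothetical, the package inlines the check at each call site:

def phoenix_endpoint_or_default(settings: object) -> str:
    """Prefer a configured endpoint; otherwise assume a local Phoenix collector."""
    endpoint = getattr(settings, "phoenix_endpoint", None)
    if not isinstance(endpoint, str) or not endpoint.strip():
        return "http://localhost:6006/v1/traces"
    return endpoint
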
@@ -407,9 +408,6 @@ def _resolve_tracker_list(tracker_type: str) -> list[str]:
     unknown = [entry for entry in providers if entry not in supported]
     if unknown:
         raise ValueError(f"Unknown tracker provider(s): {', '.join(unknown)}")
-    required = {"mlflow", "phoenix"}
-    if not required.issubset(set(providers)):
-        raise ValueError("tracker must include both 'mlflow' and 'phoenix'")
     return providers
 
 
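
Removing the required-set check above means a tracker list no longer has to contain both "mlflow" and "phoenix"; any subset of the supported providers is accepted, and unknown names still raise. A standalone sketch of the relaxed validation; the comma splitting and the exact supported set are assumptions, only the checks shown in the hunk come from the diff:

def resolve_tracker_list_sketch(tracker_type: str) -> list[str]:
    """Validate a comma-separated provider list against an assumed supported set."""
    supported = {"mlflow", "phoenix", "langfuse"}  # assumed; not taken from the diff
    providers = [entry.strip() for entry in tracker_type.split(",") if entry.strip()]
    unknown = [entry for entry in providers if entry not in supported]
    if unknown:
        raise ValueError(f"Unknown tracker provider(s): {', '.join(unknown)}")
    # 1.76.0 additionally required both "mlflow" and "phoenix"; 1.77.0 does not.
    return providers


# resolve_tracker_list_sketch("mlflow") -> ["mlflow"]  (now valid on its own)
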
@@ -555,6 +553,7 @@ def _log_analysis_artifacts(
 
 
 def _save_to_db(
+    settings: Settings,
     db_path: Path | None,
     result,
     console: Console,
@@ -563,7 +562,7 @@ def _save_to_db(
     export_excel: bool = True,
 ) -> None:
     """Persist evaluation run (and optional prompt set) to database."""
-    storage = build_storage_adapter(settings=Settings(), db_path=db_path)
+    storage = build_storage_adapter(settings=settings, db_path=db_path)
     storage_label = (
         "PostgreSQL" if isinstance(storage, PostgreSQLStorageAdapter) else f"SQLite ({db_path})"
     )
@@ -607,6 +606,7 @@
 
 
 def _save_multiturn_to_db(
+    settings: Settings,
     db_path: Path | None,
     run_record: MultiTurnRunRecord,
     conversations: list[MultiTurnConversationRecord],
@@ -618,7 +618,7 @@ def _save_multiturn_to_db(
     metric_thresholds: dict[str, float] | None = None,
 ) -> None:
     """Persist multiturn evaluation run to database."""
-    storage = build_storage_adapter(settings=Settings(), db_path=db_path)
+    storage = build_storage_adapter(settings=settings, db_path=db_path)
     storage_label = (
         "PostgreSQL" if isinstance(storage, PostgreSQLStorageAdapter) else f"SQLite ({db_path})"
     )
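
Both persistence helpers now take the caller's resolved Settings instead of constructing a fresh Settings(), so profile overrides applied earlier in the run command and the PostgreSQL/SQLite backend choice carry through to build_storage_adapter. A sketch of the dependency-injection shape, with a simplified stand-in for the real Settings class:

from dataclasses import dataclass
from pathlib import Path


@dataclass
class SettingsStandIn:
    # Simplified stand-in; field names follow the settings used in the hunks.
    db_backend: str = "sqlite"
    evalvault_db_path: str = "evalvault.db"


def storage_label_sketch(settings: SettingsStandIn, db_path: Path | None) -> str:
    """Pick the storage target from the caller's settings, as 1.77.0 does."""
    if settings.db_backend != "sqlite":
        return "PostgreSQL"  # non-SQLite backends ignore the SQLite db_path
    return f"SQLite ({db_path or Path(settings.evalvault_db_path)})"


# A profile that switched db_backend away from "sqlite" is now respected here,
# instead of being lost to a freshly constructed default Settings().
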
@@ -840,6 +840,8 @@ def log_phoenix_traces(
         return 0
 
     limit = max_traces if max_traces is not None else run.total_test_cases
+    if not isinstance(limit, int):
+        limit = None
 
     count = 0
     for result in run.results:
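
The new guard above treats a non-integer limit (for example when run.total_test_cases is unset) as "no limit" rather than letting a later comparison fail. A tiny sketch of the loop shape under that assumption:

def take_limited(results: list[str], limit: object) -> list[str]:
    """Log at most `limit` traces; a non-int limit means log them all."""
    if not isinstance(limit, int):
        limit = None
    taken: list[str] = []
    for result in results:
        if limit is not None and len(taken) >= limit:
            break
        taken.append(result)
    return taken


# take_limited(["a", "b", "c"], None) -> all three
# take_limited(["a", "b", "c"], 2)    -> ["a", "b"]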