evalvault 1.74.0__py3-none-any.whl → 1.76.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
Files changed (59):
  1. evalvault/adapters/inbound/api/adapter.py +127 -80
  2. evalvault/adapters/inbound/api/routers/calibration.py +9 -9
  3. evalvault/adapters/inbound/api/routers/chat.py +303 -17
  4. evalvault/adapters/inbound/api/routers/config.py +3 -1
  5. evalvault/adapters/inbound/api/routers/domain.py +10 -5
  6. evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
  7. evalvault/adapters/inbound/api/routers/runs.py +23 -4
  8. evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
  9. evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
  10. evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
  11. evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
  12. evalvault/adapters/inbound/cli/commands/compare.py +2 -7
  13. evalvault/adapters/inbound/cli/commands/debug.py +3 -2
  14. evalvault/adapters/inbound/cli/commands/domain.py +12 -12
  15. evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
  16. evalvault/adapters/inbound/cli/commands/gate.py +3 -2
  17. evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
  18. evalvault/adapters/inbound/cli/commands/history.py +3 -12
  19. evalvault/adapters/inbound/cli/commands/method.py +3 -4
  20. evalvault/adapters/inbound/cli/commands/ops.py +2 -2
  21. evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
  22. evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
  23. evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
  24. evalvault/adapters/inbound/cli/commands/regress.py +5 -4
  25. evalvault/adapters/inbound/cli/commands/run.py +188 -59
  26. evalvault/adapters/inbound/cli/commands/run_helpers.py +181 -70
  27. evalvault/adapters/inbound/cli/commands/stage.py +6 -25
  28. evalvault/adapters/inbound/cli/utils/options.py +10 -4
  29. evalvault/adapters/inbound/mcp/tools.py +11 -8
  30. evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
  31. evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
  32. evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
  33. evalvault/adapters/outbound/domain_memory/factory.py +68 -0
  34. evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
  35. evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
  36. evalvault/adapters/outbound/llm/factory.py +1 -1
  37. evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
  38. evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
  39. evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
  40. evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
  41. evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
  42. evalvault/adapters/outbound/storage/base_sql.py +3 -2
  43. evalvault/adapters/outbound/storage/factory.py +53 -0
  44. evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
  45. evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
  46. evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
  47. evalvault/config/instrumentation.py +8 -6
  48. evalvault/config/phoenix_support.py +5 -0
  49. evalvault/config/settings.py +71 -11
  50. evalvault/domain/services/domain_learning_hook.py +2 -1
  51. evalvault/domain/services/evaluator.py +2 -0
  52. evalvault/ports/inbound/web_port.py +3 -1
  53. evalvault/ports/outbound/storage_port.py +2 -0
  54. evalvault-1.76.0.dist-info/METADATA +221 -0
  55. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +58 -53
  56. evalvault-1.74.0.dist-info/METADATA +0 -585
  57. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
  58. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
  59. {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -24,7 +24,7 @@ from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrat
24
24
  from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
25
25
  from evalvault.adapters.outbound.report import MarkdownReportAdapter
26
26
  from evalvault.config.phoenix_support import PhoenixExperimentResolver
27
- from evalvault.config.settings import Settings
27
+ from evalvault.config.settings import Settings, resolve_tracker_providers
28
28
  from evalvault.domain.entities import (
29
29
  CalibrationResult,
30
30
  FeedbackSummary,
@@ -127,14 +127,15 @@ class WebUIAdapter:
127
127
  llm_adapter: LLM 어댑터 (선택적)
128
128
  data_loader: 데이터 로더 (선택적)
129
129
  """
130
- resolved_settings = settings
130
+ resolved_settings = settings or Settings()
131
131
  if storage is None:
132
- resolved_settings = settings or Settings()
133
- db_path = getattr(resolved_settings, "evalvault_db_path", None)
134
- if db_path:
135
- from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
132
+ from evalvault.adapters.outbound.storage.factory import build_storage_adapter
136
133
 
137
- storage = SQLiteStorageAdapter(db_path=db_path)
134
+ try:
135
+ storage = build_storage_adapter(settings=resolved_settings)
136
+ except Exception as exc:
137
+ logger.warning("Storage initialization failed: %s", exc)
138
+ storage = None
138
139
 
139
140
  self._storage = storage
140
141
  self._evaluator = evaluator
@@ -216,56 +217,83 @@ class WebUIAdapter:
216
217
  logger.warning(f"Failed to create LLM adapter for {model_id}: {e}, using default")
217
218
  return self._llm_adapter
218
219
 
219
- def _get_tracker(
220
+ def _get_trackers(
220
221
  self,
221
222
  settings: Settings,
222
223
  tracker_config: dict[str, Any] | None,
223
- ) -> tuple[str | None, Any | None]:
224
- provider = (tracker_config or {}).get("provider") or "none"
225
- provider = provider.lower()
226
-
227
- if provider in {"none", ""}:
228
- return None, None
229
-
230
- if provider == "langfuse":
231
- if not settings.langfuse_public_key or not settings.langfuse_secret_key:
232
- logger.warning("Langfuse credentials missing; skipping tracker logging.")
233
- return None, None
234
- from evalvault.adapters.outbound.tracker.langfuse_adapter import LangfuseAdapter
235
-
236
- return provider, LangfuseAdapter(
237
- public_key=settings.langfuse_public_key,
238
- secret_key=settings.langfuse_secret_key,
239
- host=settings.langfuse_host,
240
- )
224
+ ) -> list[tuple[str, Any]]:
225
+ provider = (tracker_config or {}).get("provider") or settings.tracker_provider or "none"
226
+ providers = resolve_tracker_providers(provider)
227
+ if not providers or providers == ["none"]:
228
+ return []
229
+ required = {"mlflow", "phoenix"}
230
+ if not required.issubset(set(providers)):
231
+ raise RuntimeError("Tracker must include both mlflow and phoenix")
232
+
233
+ trackers: list[tuple[str, Any]] = []
234
+ for entry in providers:
235
+ if entry == "langfuse":
236
+ if not settings.langfuse_public_key or not settings.langfuse_secret_key:
237
+ raise RuntimeError("Langfuse credentials missing")
238
+ from evalvault.adapters.outbound.tracker.langfuse_adapter import LangfuseAdapter
239
+
240
+ trackers.append(
241
+ (
242
+ entry,
243
+ LangfuseAdapter(
244
+ public_key=settings.langfuse_public_key,
245
+ secret_key=settings.langfuse_secret_key,
246
+ host=settings.langfuse_host,
247
+ ),
248
+ )
249
+ )
250
+ continue
241
251
 
242
- if provider == "phoenix":
243
- from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
252
+ if entry == "phoenix":
253
+ from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
244
254
 
245
- ensure_phoenix_instrumentation(settings, force=True)
246
- try:
247
- from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter
248
- except ImportError as exc:
249
- logger.warning("Phoenix extras not installed: %s", exc)
250
- return None, None
251
- return provider, PhoenixAdapter(endpoint=settings.phoenix_endpoint)
252
-
253
- if provider == "mlflow":
254
- if not settings.mlflow_tracking_uri:
255
- logger.warning("MLflow tracking URI missing; skipping tracker logging.")
256
- return None, None
257
- try:
258
- from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
259
- except ImportError as exc:
260
- logger.warning("MLflow adapter unavailable: %s", exc)
261
- return None, None
262
- return provider, MLflowAdapter(
263
- tracking_uri=settings.mlflow_tracking_uri,
264
- experiment_name=settings.mlflow_experiment_name,
265
- )
255
+ ensure_phoenix_instrumentation(settings, force=True)
256
+ try:
257
+ from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter
258
+ except ImportError as exc:
259
+ raise RuntimeError("Phoenix extras not installed") from exc
260
+ trackers.append(
261
+ (
262
+ entry,
263
+ PhoenixAdapter(
264
+ endpoint=settings.phoenix_endpoint,
265
+ project_name=getattr(settings, "phoenix_project_name", None),
266
+ annotations_enabled=getattr(
267
+ settings,
268
+ "phoenix_annotations_enabled",
269
+ True,
270
+ ),
271
+ ),
272
+ )
273
+ )
274
+ continue
275
+
276
+ if entry == "mlflow":
277
+ if not settings.mlflow_tracking_uri:
278
+ raise RuntimeError("MLflow tracking URI missing")
279
+ try:
280
+ from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
281
+ except ImportError as exc:
282
+ raise RuntimeError("MLflow adapter unavailable") from exc
283
+ trackers.append(
284
+ (
285
+ entry,
286
+ MLflowAdapter(
287
+ tracking_uri=settings.mlflow_tracking_uri,
288
+ experiment_name=settings.mlflow_experiment_name,
289
+ ),
290
+ )
291
+ )
292
+ continue
293
+
294
+ raise RuntimeError(f"Unknown tracker provider: {entry}")
266
295
 
267
- logger.warning("Unknown tracker provider: %s", provider)
268
- return None, None
296
+ return trackers
269
297
 
270
298
  @staticmethod
271
299
  def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
@@ -424,7 +452,11 @@ class WebUIAdapter:
424
452
  dataset.metadata["domain"] = requested_domain
425
453
 
426
454
  settings = self._settings or Settings()
427
- tracker_provider, tracker = self._get_tracker(settings, request.tracker_config)
455
+ try:
456
+ trackers = self._get_trackers(settings, request.tracker_config)
457
+ except RuntimeError as exc:
458
+ raise RuntimeError(f"Tracker configuration error: {exc}") from exc
459
+ tracker_providers = [provider for provider, _ in trackers]
428
460
  stage_store = bool(request.stage_store)
429
461
 
430
462
  retriever_instance = None
@@ -450,7 +482,12 @@ class WebUIAdapter:
450
482
  memory_domain = memory_config.get("domain") or dataset.metadata.get("domain") or "default"
451
483
  memory_language = memory_config.get("language") or "ko"
452
484
  memory_augment = bool(memory_config.get("augment_context"))
453
- memory_db_path = memory_config.get("db_path") or settings.evalvault_memory_db_path
485
+ if memory_config.get("db_path"):
486
+ memory_db_path = memory_config.get("db_path")
487
+ elif settings.db_backend == "sqlite":
488
+ memory_db_path = settings.evalvault_memory_db_path
489
+ else:
490
+ memory_db_path = None
454
491
  memory_evaluator = None
455
492
  requested_thresholds = request.thresholds or {}
456
493
  if request.threshold_profile or requested_thresholds:
@@ -472,16 +509,17 @@ class WebUIAdapter:
472
509
  memory_active = False
473
510
  if memory_enabled:
474
511
  try:
475
- from evalvault.adapters.outbound.domain_memory.sqlite_adapter import (
476
- SQLiteDomainMemoryAdapter,
477
- )
512
+ from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
478
513
  from evalvault.adapters.outbound.tracer.phoenix_tracer_adapter import (
479
514
  PhoenixTracerAdapter,
480
515
  )
481
516
  from evalvault.domain.services.memory_aware_evaluator import MemoryAwareEvaluator
482
517
 
483
- tracer = PhoenixTracerAdapter() if tracker_provider == "phoenix" else None
484
- memory_adapter = SQLiteDomainMemoryAdapter(memory_db_path)
518
+ tracer = PhoenixTracerAdapter() if "phoenix" in tracker_providers else None
519
+ memory_adapter = build_domain_memory_adapter(
520
+ settings=self._settings,
521
+ db_path=Path(memory_db_path) if memory_db_path else None,
522
+ )
485
523
  memory_evaluator = MemoryAwareEvaluator(
486
524
  evaluator=self._evaluator,
487
525
  memory_port=memory_adapter,
@@ -689,22 +727,27 @@ class WebUIAdapter:
689
727
  str(request.threshold_profile).strip().lower()
690
728
  )
691
729
 
692
- if tracker and tracker_provider:
693
- try:
694
- trace_id = tracker.log_evaluation_run(result)
695
- if tracker_provider == "phoenix":
696
- endpoint = settings.phoenix_endpoint or "http://localhost:6006/v1/traces"
697
- phoenix_meta = result.tracker_metadata.setdefault("phoenix", {})
698
- phoenix_meta.update(
699
- {
700
- "trace_id": trace_id,
701
- "endpoint": endpoint,
702
- "trace_url": self._build_phoenix_trace_url(endpoint, trace_id),
703
- "schema_version": 2,
704
- }
705
- )
706
- except Exception as exc:
707
- logger.warning("Tracker logging failed: %s", exc)
730
+ if trackers:
731
+ result.tracker_metadata.setdefault("tracker_providers", tracker_providers)
732
+ for provider, tracker in trackers:
733
+ try:
734
+ trace_id = tracker.log_evaluation_run(result)
735
+ provider_meta = result.tracker_metadata.setdefault(provider, {})
736
+ if isinstance(provider_meta, dict):
737
+ provider_meta.setdefault("trace_id", trace_id)
738
+ if provider == "phoenix":
739
+ endpoint = settings.phoenix_endpoint or "http://localhost:6006/v1/traces"
740
+ phoenix_meta = result.tracker_metadata.setdefault("phoenix", {})
741
+ phoenix_meta.update(
742
+ {
743
+ "trace_id": trace_id,
744
+ "endpoint": endpoint,
745
+ "trace_url": self._build_phoenix_trace_url(endpoint, trace_id),
746
+ "schema_version": 2,
747
+ }
748
+ )
749
+ except Exception as exc:
750
+ raise RuntimeError(f"Tracker logging failed for {provider}: {exc}") from exc
708
751
 
709
752
  if stage_store and self._storage and hasattr(self._storage, "save_stage_events"):
710
753
  try:
@@ -814,6 +857,7 @@ class WebUIAdapter:
814
857
  def list_runs(
815
858
  self,
816
859
  limit: int = 50,
860
+ offset: int = 0,
817
861
  filters: RunFilters | None = None,
818
862
  ) -> list[RunSummary]:
819
863
  """평가 목록 조회.
@@ -833,7 +877,7 @@ class WebUIAdapter:
833
877
 
834
878
  try:
835
879
  # 저장소에서 평가 목록 조회
836
- runs = self._storage.list_runs(limit=limit)
880
+ runs = self._storage.list_runs(limit=limit, offset=offset)
837
881
 
838
882
  # RunSummary로 변환
839
883
  summaries = []
@@ -1029,7 +1073,11 @@ class WebUIAdapter:
1029
1073
  run = self.get_run_details(run_id)
1030
1074
  feedbacks = storage.list_feedback(run_id)
1031
1075
  if labels_source in {"feedback", "hybrid"} and not feedbacks:
1032
- raise ValueError("Feedback labels are required for this labels_source")
1076
+ raise ValueError(
1077
+ f"No feedback labels found for run '{run_id}'. "
1078
+ f"Calibration with labels_source='{labels_source}' requires at least one feedback label. "
1079
+ "Please add feedback labels via the UI or API, or use labels_source='gold' if gold labels are available."
1080
+ )
1033
1081
  resolved_metrics = metrics or list(run.metrics_evaluated)
1034
1082
  if not resolved_metrics:
1035
1083
  raise ValueError("No metrics available for calibration")
@@ -2198,16 +2246,15 @@ def create_adapter() -> WebUIAdapter:
2198
2246
  """
2199
2247
  from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
2200
2248
  from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
2201
- from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
2249
+ from evalvault.adapters.outbound.storage.factory import build_storage_adapter
2202
2250
  from evalvault.config.settings import get_settings
2203
2251
  from evalvault.domain.services.evaluator import RagasEvaluator
2204
2252
 
2205
2253
  # 설정 로드
2206
2254
  settings = get_settings()
2207
2255
 
2208
- # Storage 생성 (기본 SQLite)
2209
- db_path = Path(settings.evalvault_db_path)
2210
- storage = SQLiteStorageAdapter(db_path=db_path)
2256
+ # Storage 생성
2257
+ storage = build_storage_adapter(settings=settings)
2211
2258
 
2212
2259
  # LLM adapter 생성 (API 키 없으면 None)
2213
2260
  llm_adapter = None
@@ -113,6 +113,15 @@ def run_judge_calibration(
113
113
  return JudgeCalibrationResponse.model_validate(payload)
114
114
 
115
115
 
116
+ @router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
117
+ def list_calibrations(
118
+ adapter: AdapterDep,
119
+ limit: int = Query(20, ge=1, le=200),
120
+ ) -> list[JudgeCalibrationHistoryItem]:
121
+ entries = adapter.list_judge_calibrations(limit=limit)
122
+ return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
123
+
124
+
116
125
  @router.get("/judge/{calibration_id}", response_model=JudgeCalibrationResponse)
117
126
  def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCalibrationResponse:
118
127
  try:
@@ -122,12 +131,3 @@ def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCal
122
131
  except RuntimeError as exc:
123
132
  raise HTTPException(status_code=500, detail=str(exc)) from exc
124
133
  return JudgeCalibrationResponse.model_validate(payload)
125
-
126
-
127
- @router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
128
- def list_calibrations(
129
- adapter: AdapterDep,
130
- limit: int = Query(20, ge=1, le=200),
131
- ) -> list[JudgeCalibrationHistoryItem]:
132
- entries = adapter.list_judge_calibrations(limit=limit)
133
- return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]