evalvault 1.74.0__py3-none-any.whl → 1.76.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +127 -80
- evalvault/adapters/inbound/api/routers/calibration.py +9 -9
- evalvault/adapters/inbound/api/routers/chat.py +303 -17
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/api/routers/domain.py +10 -5
- evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
- evalvault/adapters/inbound/api/routers/runs.py +23 -4
- evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
- evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
- evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
- evalvault/adapters/inbound/cli/commands/compare.py +2 -7
- evalvault/adapters/inbound/cli/commands/debug.py +3 -2
- evalvault/adapters/inbound/cli/commands/domain.py +12 -12
- evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
- evalvault/adapters/inbound/cli/commands/gate.py +3 -2
- evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
- evalvault/adapters/inbound/cli/commands/history.py +3 -12
- evalvault/adapters/inbound/cli/commands/method.py +3 -4
- evalvault/adapters/inbound/cli/commands/ops.py +2 -2
- evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
- evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
- evalvault/adapters/inbound/cli/commands/regress.py +5 -4
- evalvault/adapters/inbound/cli/commands/run.py +188 -59
- evalvault/adapters/inbound/cli/commands/run_helpers.py +181 -70
- evalvault/adapters/inbound/cli/commands/stage.py +6 -25
- evalvault/adapters/inbound/cli/utils/options.py +10 -4
- evalvault/adapters/inbound/mcp/tools.py +11 -8
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
- evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
- evalvault/adapters/outbound/domain_memory/factory.py +68 -0
- evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
- evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
- evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
- evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
- evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
- evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
- evalvault/adapters/outbound/storage/base_sql.py +3 -2
- evalvault/adapters/outbound/storage/factory.py +53 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/settings.py +71 -11
- evalvault/domain/services/domain_learning_hook.py +2 -1
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/ports/inbound/web_port.py +3 -1
- evalvault/ports/outbound/storage_port.py +2 -0
- evalvault-1.76.0.dist-info/METADATA +221 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +58 -53
- evalvault-1.74.0.dist-info/METADATA +0 -585
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -24,7 +24,7 @@ from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrat
|
|
|
24
24
|
from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
|
|
25
25
|
from evalvault.adapters.outbound.report import MarkdownReportAdapter
|
|
26
26
|
from evalvault.config.phoenix_support import PhoenixExperimentResolver
|
|
27
|
-
from evalvault.config.settings import Settings
|
|
27
|
+
from evalvault.config.settings import Settings, resolve_tracker_providers
|
|
28
28
|
from evalvault.domain.entities import (
|
|
29
29
|
CalibrationResult,
|
|
30
30
|
FeedbackSummary,
|
|
@@ -127,14 +127,15 @@ class WebUIAdapter:
|
|
|
127
127
|
llm_adapter: LLM 어댑터 (선택적)
|
|
128
128
|
data_loader: 데이터 로더 (선택적)
|
|
129
129
|
"""
|
|
130
|
-
resolved_settings = settings
|
|
130
|
+
resolved_settings = settings or Settings()
|
|
131
131
|
if storage is None:
|
|
132
|
-
|
|
133
|
-
db_path = getattr(resolved_settings, "evalvault_db_path", None)
|
|
134
|
-
if db_path:
|
|
135
|
-
from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
|
|
132
|
+
from evalvault.adapters.outbound.storage.factory import build_storage_adapter
|
|
136
133
|
|
|
137
|
-
|
|
134
|
+
try:
|
|
135
|
+
storage = build_storage_adapter(settings=resolved_settings)
|
|
136
|
+
except Exception as exc:
|
|
137
|
+
logger.warning("Storage initialization failed: %s", exc)
|
|
138
|
+
storage = None
|
|
138
139
|
|
|
139
140
|
self._storage = storage
|
|
140
141
|
self._evaluator = evaluator
|
|
@@ -216,56 +217,83 @@ class WebUIAdapter:
|
|
|
216
217
|
logger.warning(f"Failed to create LLM adapter for {model_id}: {e}, using default")
|
|
217
218
|
return self._llm_adapter
|
|
218
219
|
|
|
219
|
-
def
|
|
220
|
+
def _get_trackers(
|
|
220
221
|
self,
|
|
221
222
|
settings: Settings,
|
|
222
223
|
tracker_config: dict[str, Any] | None,
|
|
223
|
-
) -> tuple[str
|
|
224
|
-
provider = (tracker_config or {}).get("provider") or "none"
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
224
|
+
) -> list[tuple[str, Any]]:
|
|
225
|
+
provider = (tracker_config or {}).get("provider") or settings.tracker_provider or "none"
|
|
226
|
+
providers = resolve_tracker_providers(provider)
|
|
227
|
+
if not providers or providers == ["none"]:
|
|
228
|
+
return []
|
|
229
|
+
required = {"mlflow", "phoenix"}
|
|
230
|
+
if not required.issubset(set(providers)):
|
|
231
|
+
raise RuntimeError("Tracker must include both mlflow and phoenix")
|
|
232
|
+
|
|
233
|
+
trackers: list[tuple[str, Any]] = []
|
|
234
|
+
for entry in providers:
|
|
235
|
+
if entry == "langfuse":
|
|
236
|
+
if not settings.langfuse_public_key or not settings.langfuse_secret_key:
|
|
237
|
+
raise RuntimeError("Langfuse credentials missing")
|
|
238
|
+
from evalvault.adapters.outbound.tracker.langfuse_adapter import LangfuseAdapter
|
|
239
|
+
|
|
240
|
+
trackers.append(
|
|
241
|
+
(
|
|
242
|
+
entry,
|
|
243
|
+
LangfuseAdapter(
|
|
244
|
+
public_key=settings.langfuse_public_key,
|
|
245
|
+
secret_key=settings.langfuse_secret_key,
|
|
246
|
+
host=settings.langfuse_host,
|
|
247
|
+
),
|
|
248
|
+
)
|
|
249
|
+
)
|
|
250
|
+
continue
|
|
241
251
|
|
|
242
|
-
|
|
243
|
-
|
|
252
|
+
if entry == "phoenix":
|
|
253
|
+
from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
|
|
244
254
|
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
255
|
+
ensure_phoenix_instrumentation(settings, force=True)
|
|
256
|
+
try:
|
|
257
|
+
from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter
|
|
258
|
+
except ImportError as exc:
|
|
259
|
+
raise RuntimeError("Phoenix extras not installed") from exc
|
|
260
|
+
trackers.append(
|
|
261
|
+
(
|
|
262
|
+
entry,
|
|
263
|
+
PhoenixAdapter(
|
|
264
|
+
endpoint=settings.phoenix_endpoint,
|
|
265
|
+
project_name=getattr(settings, "phoenix_project_name", None),
|
|
266
|
+
annotations_enabled=getattr(
|
|
267
|
+
settings,
|
|
268
|
+
"phoenix_annotations_enabled",
|
|
269
|
+
True,
|
|
270
|
+
),
|
|
271
|
+
),
|
|
272
|
+
)
|
|
273
|
+
)
|
|
274
|
+
continue
|
|
275
|
+
|
|
276
|
+
if entry == "mlflow":
|
|
277
|
+
if not settings.mlflow_tracking_uri:
|
|
278
|
+
raise RuntimeError("MLflow tracking URI missing")
|
|
279
|
+
try:
|
|
280
|
+
from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
|
|
281
|
+
except ImportError as exc:
|
|
282
|
+
raise RuntimeError("MLflow adapter unavailable") from exc
|
|
283
|
+
trackers.append(
|
|
284
|
+
(
|
|
285
|
+
entry,
|
|
286
|
+
MLflowAdapter(
|
|
287
|
+
tracking_uri=settings.mlflow_tracking_uri,
|
|
288
|
+
experiment_name=settings.mlflow_experiment_name,
|
|
289
|
+
),
|
|
290
|
+
)
|
|
291
|
+
)
|
|
292
|
+
continue
|
|
293
|
+
|
|
294
|
+
raise RuntimeError(f"Unknown tracker provider: {entry}")
|
|
266
295
|
|
|
267
|
-
|
|
268
|
-
return None, None
|
|
296
|
+
return trackers
|
|
269
297
|
|
|
270
298
|
@staticmethod
|
|
271
299
|
def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
|
|
@@ -424,7 +452,11 @@ class WebUIAdapter:
|
|
|
424
452
|
dataset.metadata["domain"] = requested_domain
|
|
425
453
|
|
|
426
454
|
settings = self._settings or Settings()
|
|
427
|
-
|
|
455
|
+
try:
|
|
456
|
+
trackers = self._get_trackers(settings, request.tracker_config)
|
|
457
|
+
except RuntimeError as exc:
|
|
458
|
+
raise RuntimeError(f"Tracker configuration error: {exc}") from exc
|
|
459
|
+
tracker_providers = [provider for provider, _ in trackers]
|
|
428
460
|
stage_store = bool(request.stage_store)
|
|
429
461
|
|
|
430
462
|
retriever_instance = None
|
|
@@ -450,7 +482,12 @@ class WebUIAdapter:
|
|
|
450
482
|
memory_domain = memory_config.get("domain") or dataset.metadata.get("domain") or "default"
|
|
451
483
|
memory_language = memory_config.get("language") or "ko"
|
|
452
484
|
memory_augment = bool(memory_config.get("augment_context"))
|
|
453
|
-
|
|
485
|
+
if memory_config.get("db_path"):
|
|
486
|
+
memory_db_path = memory_config.get("db_path")
|
|
487
|
+
elif settings.db_backend == "sqlite":
|
|
488
|
+
memory_db_path = settings.evalvault_memory_db_path
|
|
489
|
+
else:
|
|
490
|
+
memory_db_path = None
|
|
454
491
|
memory_evaluator = None
|
|
455
492
|
requested_thresholds = request.thresholds or {}
|
|
456
493
|
if request.threshold_profile or requested_thresholds:
|
|
@@ -472,16 +509,17 @@ class WebUIAdapter:
|
|
|
472
509
|
memory_active = False
|
|
473
510
|
if memory_enabled:
|
|
474
511
|
try:
|
|
475
|
-
from evalvault.adapters.outbound.domain_memory
|
|
476
|
-
SQLiteDomainMemoryAdapter,
|
|
477
|
-
)
|
|
512
|
+
from evalvault.adapters.outbound.domain_memory import build_domain_memory_adapter
|
|
478
513
|
from evalvault.adapters.outbound.tracer.phoenix_tracer_adapter import (
|
|
479
514
|
PhoenixTracerAdapter,
|
|
480
515
|
)
|
|
481
516
|
from evalvault.domain.services.memory_aware_evaluator import MemoryAwareEvaluator
|
|
482
517
|
|
|
483
|
-
tracer = PhoenixTracerAdapter() if
|
|
484
|
-
memory_adapter =
|
|
518
|
+
tracer = PhoenixTracerAdapter() if "phoenix" in tracker_providers else None
|
|
519
|
+
memory_adapter = build_domain_memory_adapter(
|
|
520
|
+
settings=self._settings,
|
|
521
|
+
db_path=Path(memory_db_path) if memory_db_path else None,
|
|
522
|
+
)
|
|
485
523
|
memory_evaluator = MemoryAwareEvaluator(
|
|
486
524
|
evaluator=self._evaluator,
|
|
487
525
|
memory_port=memory_adapter,
|
|
@@ -689,22 +727,27 @@ class WebUIAdapter:
|
|
|
689
727
|
str(request.threshold_profile).strip().lower()
|
|
690
728
|
)
|
|
691
729
|
|
|
692
|
-
if
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
730
|
+
if trackers:
|
|
731
|
+
result.tracker_metadata.setdefault("tracker_providers", tracker_providers)
|
|
732
|
+
for provider, tracker in trackers:
|
|
733
|
+
try:
|
|
734
|
+
trace_id = tracker.log_evaluation_run(result)
|
|
735
|
+
provider_meta = result.tracker_metadata.setdefault(provider, {})
|
|
736
|
+
if isinstance(provider_meta, dict):
|
|
737
|
+
provider_meta.setdefault("trace_id", trace_id)
|
|
738
|
+
if provider == "phoenix":
|
|
739
|
+
endpoint = settings.phoenix_endpoint or "http://localhost:6006/v1/traces"
|
|
740
|
+
phoenix_meta = result.tracker_metadata.setdefault("phoenix", {})
|
|
741
|
+
phoenix_meta.update(
|
|
742
|
+
{
|
|
743
|
+
"trace_id": trace_id,
|
|
744
|
+
"endpoint": endpoint,
|
|
745
|
+
"trace_url": self._build_phoenix_trace_url(endpoint, trace_id),
|
|
746
|
+
"schema_version": 2,
|
|
747
|
+
}
|
|
748
|
+
)
|
|
749
|
+
except Exception as exc:
|
|
750
|
+
raise RuntimeError(f"Tracker logging failed for {provider}: {exc}") from exc
|
|
708
751
|
|
|
709
752
|
if stage_store and self._storage and hasattr(self._storage, "save_stage_events"):
|
|
710
753
|
try:
|
|
@@ -814,6 +857,7 @@ class WebUIAdapter:
|
|
|
814
857
|
def list_runs(
|
|
815
858
|
self,
|
|
816
859
|
limit: int = 50,
|
|
860
|
+
offset: int = 0,
|
|
817
861
|
filters: RunFilters | None = None,
|
|
818
862
|
) -> list[RunSummary]:
|
|
819
863
|
"""평가 목록 조회.
|
|
@@ -833,7 +877,7 @@ class WebUIAdapter:
|
|
|
833
877
|
|
|
834
878
|
try:
|
|
835
879
|
# 저장소에서 평가 목록 조회
|
|
836
|
-
runs = self._storage.list_runs(limit=limit)
|
|
880
|
+
runs = self._storage.list_runs(limit=limit, offset=offset)
|
|
837
881
|
|
|
838
882
|
# RunSummary로 변환
|
|
839
883
|
summaries = []
|
|
@@ -1029,7 +1073,11 @@ class WebUIAdapter:
|
|
|
1029
1073
|
run = self.get_run_details(run_id)
|
|
1030
1074
|
feedbacks = storage.list_feedback(run_id)
|
|
1031
1075
|
if labels_source in {"feedback", "hybrid"} and not feedbacks:
|
|
1032
|
-
raise ValueError(
|
|
1076
|
+
raise ValueError(
|
|
1077
|
+
f"No feedback labels found for run '{run_id}'. "
|
|
1078
|
+
f"Calibration with labels_source='{labels_source}' requires at least one feedback label. "
|
|
1079
|
+
"Please add feedback labels via the UI or API, or use labels_source='gold' if gold labels are available."
|
|
1080
|
+
)
|
|
1033
1081
|
resolved_metrics = metrics or list(run.metrics_evaluated)
|
|
1034
1082
|
if not resolved_metrics:
|
|
1035
1083
|
raise ValueError("No metrics available for calibration")
|
|
@@ -2198,16 +2246,15 @@ def create_adapter() -> WebUIAdapter:
|
|
|
2198
2246
|
"""
|
|
2199
2247
|
from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
|
|
2200
2248
|
from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
|
|
2201
|
-
from evalvault.adapters.outbound.storage.
|
|
2249
|
+
from evalvault.adapters.outbound.storage.factory import build_storage_adapter
|
|
2202
2250
|
from evalvault.config.settings import get_settings
|
|
2203
2251
|
from evalvault.domain.services.evaluator import RagasEvaluator
|
|
2204
2252
|
|
|
2205
2253
|
# 설정 로드
|
|
2206
2254
|
settings = get_settings()
|
|
2207
2255
|
|
|
2208
|
-
# Storage 생성
|
|
2209
|
-
|
|
2210
|
-
storage = SQLiteStorageAdapter(db_path=db_path)
|
|
2256
|
+
# Storage 생성
|
|
2257
|
+
storage = build_storage_adapter(settings=settings)
|
|
2211
2258
|
|
|
2212
2259
|
# LLM adapter 생성 (API 키 없으면 None)
|
|
2213
2260
|
llm_adapter = None
|
|
@@ -113,6 +113,15 @@ def run_judge_calibration(
|
|
|
113
113
|
return JudgeCalibrationResponse.model_validate(payload)
|
|
114
114
|
|
|
115
115
|
|
|
116
|
+
@router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
|
|
117
|
+
def list_calibrations(
|
|
118
|
+
adapter: AdapterDep,
|
|
119
|
+
limit: int = Query(20, ge=1, le=200),
|
|
120
|
+
) -> list[JudgeCalibrationHistoryItem]:
|
|
121
|
+
entries = adapter.list_judge_calibrations(limit=limit)
|
|
122
|
+
return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
|
|
123
|
+
|
|
124
|
+
|
|
116
125
|
@router.get("/judge/{calibration_id}", response_model=JudgeCalibrationResponse)
|
|
117
126
|
def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCalibrationResponse:
|
|
118
127
|
try:
|
|
@@ -122,12 +131,3 @@ def get_calibration_result(calibration_id: str, adapter: AdapterDep) -> JudgeCal
|
|
|
122
131
|
except RuntimeError as exc:
|
|
123
132
|
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
124
133
|
return JudgeCalibrationResponse.model_validate(payload)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
@router.get("/judge/history", response_model=list[JudgeCalibrationHistoryItem])
|
|
128
|
-
def list_calibrations(
|
|
129
|
-
adapter: AdapterDep,
|
|
130
|
-
limit: int = Query(20, ge=1, le=200),
|
|
131
|
-
) -> list[JudgeCalibrationHistoryItem]:
|
|
132
|
-
entries = adapter.list_judge_calibrations(limit=limit)
|
|
133
|
-
return [JudgeCalibrationHistoryItem.model_validate(entry) for entry in entries]
|