evalvault 1.75.0__py3-none-any.whl → 1.77.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +123 -64
- evalvault/adapters/inbound/api/main.py +2 -0
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/cli/app.py +3 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +6 -1
- evalvault/adapters/inbound/cli/commands/method.py +3 -3
- evalvault/adapters/inbound/cli/commands/run.py +153 -30
- evalvault/adapters/inbound/cli/commands/run_helpers.py +166 -62
- evalvault/adapters/outbound/analysis/llm_report_module.py +515 -33
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/phoenix/sync_service.py +100 -1
- evalvault/adapters/outbound/report/markdown_adapter.py +92 -0
- evalvault/adapters/outbound/storage/factory.py +1 -4
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +178 -12
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/runtime_services.py +122 -0
- evalvault/config/settings.py +40 -4
- evalvault/domain/services/evaluator.py +2 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/METADATA +2 -1
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/RECORD +25 -24
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/WHEEL +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -24,7 +24,7 @@ from evalvault.adapters.outbound.judge_calibration_reporter import JudgeCalibrat
|
|
|
24
24
|
from evalvault.adapters.outbound.ops.report_renderer import render_json, render_markdown
|
|
25
25
|
from evalvault.adapters.outbound.report import MarkdownReportAdapter
|
|
26
26
|
from evalvault.config.phoenix_support import PhoenixExperimentResolver
|
|
27
|
-
from evalvault.config.settings import Settings
|
|
27
|
+
from evalvault.config.settings import Settings, resolve_tracker_providers
|
|
28
28
|
from evalvault.domain.entities import (
|
|
29
29
|
CalibrationResult,
|
|
30
30
|
FeedbackSummary,
|
|
@@ -217,56 +217,83 @@ class WebUIAdapter:
|
|
|
217
217
|
logger.warning(f"Failed to create LLM adapter for {model_id}: {e}, using default")
|
|
218
218
|
return self._llm_adapter
|
|
219
219
|
|
|
220
|
-
def
|
|
220
|
+
def _get_trackers(
|
|
221
221
|
self,
|
|
222
222
|
settings: Settings,
|
|
223
223
|
tracker_config: dict[str, Any] | None,
|
|
224
|
-
) -> tuple[str
|
|
225
|
-
provider = (tracker_config or {}).get("provider") or "none"
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
224
|
+
) -> list[tuple[str, Any]]:
|
|
225
|
+
provider = (tracker_config or {}).get("provider") or settings.tracker_provider or "none"
|
|
226
|
+
providers = resolve_tracker_providers(provider)
|
|
227
|
+
if not providers or providers == ["none"]:
|
|
228
|
+
return []
|
|
229
|
+
required = {"mlflow", "phoenix"}
|
|
230
|
+
if not required.issubset(set(providers)):
|
|
231
|
+
raise RuntimeError("Tracker must include both mlflow and phoenix")
|
|
232
|
+
|
|
233
|
+
trackers: list[tuple[str, Any]] = []
|
|
234
|
+
for entry in providers:
|
|
235
|
+
if entry == "langfuse":
|
|
236
|
+
if not settings.langfuse_public_key or not settings.langfuse_secret_key:
|
|
237
|
+
raise RuntimeError("Langfuse credentials missing")
|
|
238
|
+
from evalvault.adapters.outbound.tracker.langfuse_adapter import LangfuseAdapter
|
|
239
|
+
|
|
240
|
+
trackers.append(
|
|
241
|
+
(
|
|
242
|
+
entry,
|
|
243
|
+
LangfuseAdapter(
|
|
244
|
+
public_key=settings.langfuse_public_key,
|
|
245
|
+
secret_key=settings.langfuse_secret_key,
|
|
246
|
+
host=settings.langfuse_host,
|
|
247
|
+
),
|
|
248
|
+
)
|
|
249
|
+
)
|
|
250
|
+
continue
|
|
242
251
|
|
|
243
|
-
|
|
244
|
-
|
|
252
|
+
if entry == "phoenix":
|
|
253
|
+
from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
|
|
245
254
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
)
|
|
255
|
+
ensure_phoenix_instrumentation(settings, force=True)
|
|
256
|
+
try:
|
|
257
|
+
from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter
|
|
258
|
+
except ImportError as exc:
|
|
259
|
+
raise RuntimeError("Phoenix extras not installed") from exc
|
|
260
|
+
trackers.append(
|
|
261
|
+
(
|
|
262
|
+
entry,
|
|
263
|
+
PhoenixAdapter(
|
|
264
|
+
endpoint=settings.phoenix_endpoint,
|
|
265
|
+
project_name=getattr(settings, "phoenix_project_name", None),
|
|
266
|
+
annotations_enabled=getattr(
|
|
267
|
+
settings,
|
|
268
|
+
"phoenix_annotations_enabled",
|
|
269
|
+
True,
|
|
270
|
+
),
|
|
271
|
+
),
|
|
272
|
+
)
|
|
273
|
+
)
|
|
274
|
+
continue
|
|
267
275
|
|
|
268
|
-
|
|
269
|
-
|
|
276
|
+
if entry == "mlflow":
|
|
277
|
+
if not settings.mlflow_tracking_uri:
|
|
278
|
+
raise RuntimeError("MLflow tracking URI missing")
|
|
279
|
+
try:
|
|
280
|
+
from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
|
|
281
|
+
except ImportError as exc:
|
|
282
|
+
raise RuntimeError("MLflow adapter unavailable") from exc
|
|
283
|
+
trackers.append(
|
|
284
|
+
(
|
|
285
|
+
entry,
|
|
286
|
+
MLflowAdapter(
|
|
287
|
+
tracking_uri=settings.mlflow_tracking_uri,
|
|
288
|
+
experiment_name=settings.mlflow_experiment_name,
|
|
289
|
+
),
|
|
290
|
+
)
|
|
291
|
+
)
|
|
292
|
+
continue
|
|
293
|
+
|
|
294
|
+
raise RuntimeError(f"Unknown tracker provider: {entry}")
|
|
295
|
+
|
|
296
|
+
return trackers
|
|
270
297
|
|
|
271
298
|
@staticmethod
|
|
272
299
|
def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
|
|
@@ -425,7 +452,11 @@ class WebUIAdapter:
|
|
|
425
452
|
dataset.metadata["domain"] = requested_domain
|
|
426
453
|
|
|
427
454
|
settings = self._settings or Settings()
|
|
428
|
-
|
|
455
|
+
try:
|
|
456
|
+
trackers = self._get_trackers(settings, request.tracker_config)
|
|
457
|
+
except RuntimeError as exc:
|
|
458
|
+
raise RuntimeError(f"Tracker configuration error: {exc}") from exc
|
|
459
|
+
tracker_providers = [provider for provider, _ in trackers]
|
|
429
460
|
stage_store = bool(request.stage_store)
|
|
430
461
|
|
|
431
462
|
retriever_instance = None
|
|
@@ -484,7 +515,7 @@ class WebUIAdapter:
|
|
|
484
515
|
)
|
|
485
516
|
from evalvault.domain.services.memory_aware_evaluator import MemoryAwareEvaluator
|
|
486
517
|
|
|
487
|
-
tracer = PhoenixTracerAdapter() if
|
|
518
|
+
tracer = PhoenixTracerAdapter() if "phoenix" in tracker_providers else None
|
|
488
519
|
memory_adapter = build_domain_memory_adapter(
|
|
489
520
|
settings=self._settings,
|
|
490
521
|
db_path=Path(memory_db_path) if memory_db_path else None,
|
|
@@ -536,11 +567,31 @@ class WebUIAdapter:
|
|
|
536
567
|
|
|
537
568
|
# 2. 진행률 초기화
|
|
538
569
|
start_time = time.monotonic()
|
|
570
|
+
total_cases = len(dataset.test_cases)
|
|
571
|
+
|
|
572
|
+
def emit_progress(message: str, *, status: str = "running") -> None:
|
|
573
|
+
if not on_progress:
|
|
574
|
+
return
|
|
575
|
+
elapsed = time.monotonic() - start_time
|
|
576
|
+
rate = (total_cases / elapsed) if total_cases > 0 and elapsed > 0 else None
|
|
577
|
+
on_progress(
|
|
578
|
+
EvalProgress(
|
|
579
|
+
current=total_cases,
|
|
580
|
+
total=total_cases,
|
|
581
|
+
current_metric=message,
|
|
582
|
+
percent=100.0 if total_cases > 0 else 0.0,
|
|
583
|
+
status=status,
|
|
584
|
+
elapsed_seconds=elapsed,
|
|
585
|
+
eta_seconds=0.0,
|
|
586
|
+
rate=rate,
|
|
587
|
+
)
|
|
588
|
+
)
|
|
589
|
+
|
|
539
590
|
if on_progress:
|
|
540
591
|
on_progress(
|
|
541
592
|
EvalProgress(
|
|
542
593
|
current=0,
|
|
543
|
-
total=
|
|
594
|
+
total=total_cases,
|
|
544
595
|
current_metric="",
|
|
545
596
|
percent=0.0,
|
|
546
597
|
status="running",
|
|
@@ -696,24 +747,31 @@ class WebUIAdapter:
|
|
|
696
747
|
str(request.threshold_profile).strip().lower()
|
|
697
748
|
)
|
|
698
749
|
|
|
699
|
-
if
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
750
|
+
if trackers:
|
|
751
|
+
emit_progress("Logging trackers...", status="finalizing")
|
|
752
|
+
result.tracker_metadata.setdefault("tracker_providers", tracker_providers)
|
|
753
|
+
for provider, tracker in trackers:
|
|
754
|
+
try:
|
|
755
|
+
trace_id = tracker.log_evaluation_run(result)
|
|
756
|
+
provider_meta = result.tracker_metadata.setdefault(provider, {})
|
|
757
|
+
if isinstance(provider_meta, dict):
|
|
758
|
+
provider_meta.setdefault("trace_id", trace_id)
|
|
759
|
+
if provider == "phoenix":
|
|
760
|
+
endpoint = settings.phoenix_endpoint or "http://localhost:6006/v1/traces"
|
|
761
|
+
phoenix_meta = result.tracker_metadata.setdefault("phoenix", {})
|
|
762
|
+
phoenix_meta.update(
|
|
763
|
+
{
|
|
764
|
+
"trace_id": trace_id,
|
|
765
|
+
"endpoint": endpoint,
|
|
766
|
+
"trace_url": self._build_phoenix_trace_url(endpoint, trace_id),
|
|
767
|
+
"schema_version": 2,
|
|
768
|
+
}
|
|
769
|
+
)
|
|
770
|
+
except Exception as exc:
|
|
771
|
+
raise RuntimeError(f"Tracker logging failed for {provider}: {exc}") from exc
|
|
715
772
|
|
|
716
773
|
if stage_store and self._storage and hasattr(self._storage, "save_stage_events"):
|
|
774
|
+
emit_progress("Storing stage events...", status="finalizing")
|
|
717
775
|
try:
|
|
718
776
|
prompt_metadata_entries = self._build_prompt_metadata_entries(prompt_bundle)
|
|
719
777
|
stage_event_builder = StageEventBuilder()
|
|
@@ -755,6 +813,7 @@ class WebUIAdapter:
|
|
|
755
813
|
|
|
756
814
|
# 5. 결과 저장
|
|
757
815
|
if self._storage:
|
|
816
|
+
emit_progress("Saving evaluation run...", status="finalizing")
|
|
758
817
|
logger.info(f"Saving evaluation run: {result.run_id}")
|
|
759
818
|
if prompt_bundle:
|
|
760
819
|
self._storage.save_prompt_set(prompt_bundle)
|
|
@@ -15,6 +15,7 @@ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
|
15
15
|
from starlette.responses import JSONResponse
|
|
16
16
|
|
|
17
17
|
from evalvault.adapters.inbound.api.adapter import WebUIAdapter, create_adapter
|
|
18
|
+
from evalvault.config.runtime_services import ensure_local_observability
|
|
18
19
|
from evalvault.config.settings import Settings, get_settings, is_production_profile
|
|
19
20
|
|
|
20
21
|
logger = logging.getLogger(__name__)
|
|
@@ -63,6 +64,7 @@ async def lifespan(app: FastAPI):
|
|
|
63
64
|
# Startup: Initialize adapter
|
|
64
65
|
adapter = create_adapter()
|
|
65
66
|
app.state.adapter = adapter
|
|
67
|
+
ensure_local_observability(get_settings())
|
|
66
68
|
try:
|
|
67
69
|
from evalvault.adapters.inbound.api.routers.chat import warm_rag_index
|
|
68
70
|
|
|
@@ -71,7 +71,9 @@ class ConfigUpdateRequest(BaseModel):
|
|
|
71
71
|
phoenix_endpoint: str | None = None
|
|
72
72
|
phoenix_enabled: bool | None = None
|
|
73
73
|
phoenix_sample_rate: float | None = None
|
|
74
|
-
|
|
74
|
+
phoenix_project_name: str | None = None
|
|
75
|
+
phoenix_annotations_enabled: bool | None = None
|
|
76
|
+
tracker_provider: str | None = None
|
|
75
77
|
postgres_host: str | None = None
|
|
76
78
|
postgres_port: int | None = None
|
|
77
79
|
postgres_database: str | None = None
|
|
@@ -14,6 +14,8 @@ import typer
|
|
|
14
14
|
from rich import print as rprint
|
|
15
15
|
from rich.console import Console
|
|
16
16
|
|
|
17
|
+
from evalvault.config.runtime_services import ensure_local_observability
|
|
18
|
+
from evalvault.config.settings import get_settings
|
|
17
19
|
from evalvault.domain.metrics.registry import list_metric_names
|
|
18
20
|
|
|
19
21
|
from .commands import attach_sub_apps, register_all_commands
|
|
@@ -61,6 +63,7 @@ def main(
|
|
|
61
63
|
),
|
|
62
64
|
) -> None:
|
|
63
65
|
"""EvalVault - RAG evaluation system."""
|
|
66
|
+
ensure_local_observability(get_settings())
|
|
64
67
|
|
|
65
68
|
|
|
66
69
|
if __name__ == "__main__": # pragma: no cover
|
|
@@ -358,6 +358,11 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
358
358
|
profile: str | None = profile_option(
|
|
359
359
|
help_text="비교 리포트용 LLM 프로필 (dev, prod, openai)",
|
|
360
360
|
),
|
|
361
|
+
use_llm_report: bool = typer.Option(
|
|
362
|
+
True,
|
|
363
|
+
"--use-llm-report/--no-llm-report",
|
|
364
|
+
help="LLM 보고서 사용 여부",
|
|
365
|
+
),
|
|
361
366
|
) -> None:
|
|
362
367
|
"""두 실행을 통계적으로 비교합니다."""
|
|
363
368
|
|
|
@@ -461,7 +466,7 @@ def register_analyze_commands(app: typer.Typer, console: Console) -> None:
|
|
|
461
466
|
compare_metrics=metric_list,
|
|
462
467
|
test_type=test,
|
|
463
468
|
report_type="comparison",
|
|
464
|
-
use_llm_report=
|
|
469
|
+
use_llm_report=use_llm_report,
|
|
465
470
|
)
|
|
466
471
|
|
|
467
472
|
artifacts_dir = resolve_artifact_dir(
|
|
@@ -31,7 +31,7 @@ from ..utils.validators import parse_csv_option, validate_choices
|
|
|
31
31
|
from .run_helpers import (
|
|
32
32
|
_display_results,
|
|
33
33
|
_is_oss_open_model,
|
|
34
|
-
|
|
34
|
+
_log_to_trackers,
|
|
35
35
|
_resolve_thresholds,
|
|
36
36
|
_save_results,
|
|
37
37
|
_save_to_db,
|
|
@@ -419,12 +419,12 @@ def create_method_app(console: Console) -> typer.Typer:
|
|
|
419
419
|
_display_results(result, console)
|
|
420
420
|
|
|
421
421
|
if tracker and tracker != "none":
|
|
422
|
-
|
|
422
|
+
_log_to_trackers(settings, result, console, tracker_type=tracker)
|
|
423
423
|
|
|
424
424
|
if eval_output:
|
|
425
425
|
_save_results(eval_output, result, console)
|
|
426
426
|
|
|
427
|
-
_save_to_db(db_path, result, console)
|
|
427
|
+
_save_to_db(settings, db_path, result, console)
|
|
428
428
|
|
|
429
429
|
return method_app
|
|
430
430
|
|