evalvault 1.75.0__py3-none-any.whl → 1.77.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +123 -64
- evalvault/adapters/inbound/api/main.py +2 -0
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/cli/app.py +3 -0
- evalvault/adapters/inbound/cli/commands/analyze.py +6 -1
- evalvault/adapters/inbound/cli/commands/method.py +3 -3
- evalvault/adapters/inbound/cli/commands/run.py +153 -30
- evalvault/adapters/inbound/cli/commands/run_helpers.py +166 -62
- evalvault/adapters/outbound/analysis/llm_report_module.py +515 -33
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/phoenix/sync_service.py +100 -1
- evalvault/adapters/outbound/report/markdown_adapter.py +92 -0
- evalvault/adapters/outbound/storage/factory.py +1 -4
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +178 -12
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/runtime_services.py +122 -0
- evalvault/config/settings.py +40 -4
- evalvault/domain/services/evaluator.py +2 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/METADATA +2 -1
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/RECORD +25 -24
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/WHEEL +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.75.0.dist-info → evalvault-1.77.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -6,7 +6,7 @@ import json
|
|
|
6
6
|
from collections.abc import Callable, Sequence
|
|
7
7
|
from dataclasses import asdict, dataclass
|
|
8
8
|
from pathlib import Path
|
|
9
|
-
from typing import Any
|
|
9
|
+
from typing import Any
|
|
10
10
|
|
|
11
11
|
import click
|
|
12
12
|
import typer
|
|
@@ -25,7 +25,7 @@ from evalvault.config.phoenix_support import (
|
|
|
25
25
|
instrumentation_span,
|
|
26
26
|
set_span_attributes,
|
|
27
27
|
)
|
|
28
|
-
from evalvault.config.settings import Settings
|
|
28
|
+
from evalvault.config.settings import Settings, resolve_tracker_providers
|
|
29
29
|
from evalvault.domain.entities import (
|
|
30
30
|
Dataset,
|
|
31
31
|
EvaluationRun,
|
|
@@ -58,7 +58,7 @@ from evalvault.ports.outbound.tracker_port import TrackerPort
|
|
|
58
58
|
from ..utils.console import print_cli_error, print_cli_warning
|
|
59
59
|
from ..utils.formatters import format_score, format_status
|
|
60
60
|
|
|
61
|
-
TrackerType =
|
|
61
|
+
TrackerType = str
|
|
62
62
|
apply_retriever_to_dataset = retriever_context.apply_retriever_to_dataset
|
|
63
63
|
|
|
64
64
|
|
|
@@ -319,15 +319,22 @@ def _display_memory_insights(insights: dict[str, Any], console: Console) -> None
|
|
|
319
319
|
console.print(Panel(panel_body, title="Domain Memory Insights", border_style="magenta"))
|
|
320
320
|
|
|
321
321
|
|
|
322
|
-
def _get_tracker(
|
|
322
|
+
def _get_tracker(
|
|
323
|
+
settings: Settings,
|
|
324
|
+
tracker_type: str,
|
|
325
|
+
console: Console,
|
|
326
|
+
*,
|
|
327
|
+
required: bool = False,
|
|
328
|
+
) -> TrackerPort | None:
|
|
323
329
|
"""Get the appropriate tracker adapter based on type."""
|
|
324
330
|
if tracker_type == "langfuse":
|
|
325
331
|
if not settings.langfuse_public_key or not settings.langfuse_secret_key:
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
332
|
+
message = "Langfuse 자격 증명이 설정되지 않았습니다."
|
|
333
|
+
tips = ["LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY를 .env에 추가하세요."]
|
|
334
|
+
if required:
|
|
335
|
+
print_cli_error(console, message, fixes=tips)
|
|
336
|
+
raise typer.Exit(2)
|
|
337
|
+
print_cli_warning(console, message + " 로깅을 건너뜁니다.", tips=tips)
|
|
331
338
|
return None
|
|
332
339
|
from evalvault.adapters.outbound.tracker.langfuse_adapter import LangfuseAdapter
|
|
333
340
|
|
|
@@ -338,42 +345,48 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> Tra
|
|
|
338
345
|
)
|
|
339
346
|
|
|
340
347
|
elif tracker_type == "mlflow":
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
return None
|
|
348
|
+
tracking_uri = getattr(settings, "mlflow_tracking_uri", None)
|
|
349
|
+
if not isinstance(tracking_uri, str) or not tracking_uri.strip():
|
|
350
|
+
tracking_uri = f"sqlite:///{Path.cwd() / 'mlruns.db'}"
|
|
351
|
+
experiment_name = getattr(settings, "mlflow_experiment_name", None)
|
|
352
|
+
if not isinstance(experiment_name, str) or not experiment_name.strip():
|
|
353
|
+
experiment_name = "evalvault"
|
|
348
354
|
try:
|
|
349
355
|
from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
|
|
350
356
|
|
|
351
357
|
return MLflowAdapter(
|
|
352
|
-
tracking_uri=
|
|
353
|
-
experiment_name=
|
|
358
|
+
tracking_uri=tracking_uri,
|
|
359
|
+
experiment_name=experiment_name,
|
|
354
360
|
)
|
|
355
361
|
except ImportError:
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
362
|
+
message = "MLflow extra가 설치되지 않았습니다."
|
|
363
|
+
tips = ["uv sync --extra mlflow 명령으로 구성요소를 설치하세요."]
|
|
364
|
+
if required:
|
|
365
|
+
print_cli_error(console, message, fixes=tips)
|
|
366
|
+
raise typer.Exit(2)
|
|
367
|
+
print_cli_warning(console, message, tips=tips)
|
|
361
368
|
return None
|
|
362
369
|
|
|
363
370
|
elif tracker_type == "phoenix":
|
|
364
371
|
try:
|
|
365
372
|
from evalvault.adapters.outbound.tracker.phoenix_adapter import PhoenixAdapter
|
|
366
373
|
|
|
374
|
+
endpoint = getattr(settings, "phoenix_endpoint", None)
|
|
375
|
+
if not isinstance(endpoint, str) or not endpoint.strip():
|
|
376
|
+
endpoint = "http://localhost:6006/v1/traces"
|
|
367
377
|
return PhoenixAdapter(
|
|
368
|
-
endpoint=
|
|
378
|
+
endpoint=endpoint,
|
|
369
379
|
service_name="evalvault",
|
|
380
|
+
project_name=getattr(settings, "phoenix_project_name", None),
|
|
381
|
+
annotations_enabled=getattr(settings, "phoenix_annotations_enabled", True),
|
|
370
382
|
)
|
|
371
383
|
except ImportError:
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
384
|
+
message = "Phoenix extra가 설치되지 않았습니다."
|
|
385
|
+
tips = ["uv sync --extra phoenix 명령으로 의존성을 추가하세요."]
|
|
386
|
+
if required:
|
|
387
|
+
print_cli_error(console, message, fixes=tips)
|
|
388
|
+
raise typer.Exit(2)
|
|
389
|
+
print_cli_warning(console, message, tips=tips)
|
|
377
390
|
return None
|
|
378
391
|
|
|
379
392
|
else:
|
|
@@ -385,6 +398,19 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> Tra
|
|
|
385
398
|
return None
|
|
386
399
|
|
|
387
400
|
|
|
401
|
+
def _resolve_tracker_list(tracker_type: str) -> list[str]:
|
|
402
|
+
providers = resolve_tracker_providers(tracker_type)
|
|
403
|
+
if not providers:
|
|
404
|
+
return []
|
|
405
|
+
if providers == ["none"]:
|
|
406
|
+
return ["none"]
|
|
407
|
+
supported = {"langfuse", "mlflow", "phoenix"}
|
|
408
|
+
unknown = [entry for entry in providers if entry not in supported]
|
|
409
|
+
if unknown:
|
|
410
|
+
raise ValueError(f"Unknown tracker provider(s): {', '.join(unknown)}")
|
|
411
|
+
return providers
|
|
412
|
+
|
|
413
|
+
|
|
388
414
|
def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
|
|
389
415
|
"""Build a Phoenix UI URL for the given trace ID."""
|
|
390
416
|
|
|
@@ -395,7 +421,7 @@ def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
|
|
|
395
421
|
return f"{base.rstrip('/')}/#/traces/{trace_id}"
|
|
396
422
|
|
|
397
423
|
|
|
398
|
-
def
|
|
424
|
+
def _log_to_trackers(
|
|
399
425
|
settings: Settings,
|
|
400
426
|
result,
|
|
401
427
|
console: Console,
|
|
@@ -404,18 +430,39 @@ def _log_to_tracker(
|
|
|
404
430
|
phoenix_options: dict[str, Any] | None = None,
|
|
405
431
|
log_phoenix_traces_fn: Callable[..., int] | None = None,
|
|
406
432
|
) -> None:
|
|
407
|
-
"""Log evaluation results to the specified tracker."""
|
|
408
|
-
|
|
409
|
-
|
|
433
|
+
"""Log evaluation results to the specified tracker(s)."""
|
|
434
|
+
try:
|
|
435
|
+
tracker_types = _resolve_tracker_list(tracker_type)
|
|
436
|
+
except ValueError as exc:
|
|
437
|
+
print_cli_error(console, "Tracker 설정이 올바르지 않습니다.", details=str(exc))
|
|
438
|
+
raise typer.Exit(2) from exc
|
|
439
|
+
if not tracker_types or tracker_types == ["none"]:
|
|
410
440
|
return
|
|
411
441
|
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
442
|
+
result.tracker_metadata.setdefault("tracker_providers", tracker_types)
|
|
443
|
+
for provider in tracker_types:
|
|
444
|
+
tracker = _get_tracker(settings, provider, console, required=True)
|
|
445
|
+
if tracker is None:
|
|
446
|
+
raise typer.Exit(2)
|
|
447
|
+
tracker_name = provider.capitalize()
|
|
448
|
+
trace_id: str | None = None
|
|
449
|
+
with console.status(f"[bold green]Logging to {tracker_name}..."):
|
|
450
|
+
try:
|
|
451
|
+
trace_id = tracker.log_evaluation_run(result)
|
|
452
|
+
console.print(f"[green]Logged to {tracker_name}[/green] (trace_id: {trace_id})")
|
|
453
|
+
except Exception as exc:
|
|
454
|
+
print_cli_error(
|
|
455
|
+
console,
|
|
456
|
+
f"{tracker_name} 로깅에 실패했습니다.",
|
|
457
|
+
details=str(exc),
|
|
458
|
+
)
|
|
459
|
+
raise typer.Exit(2) from exc
|
|
460
|
+
|
|
461
|
+
if trace_id:
|
|
462
|
+
provider_meta = result.tracker_metadata.setdefault(provider, {})
|
|
463
|
+
if isinstance(provider_meta, dict):
|
|
464
|
+
provider_meta.setdefault("trace_id", trace_id)
|
|
465
|
+
if provider == "phoenix":
|
|
419
466
|
endpoint = getattr(settings, "phoenix_endpoint", "http://localhost:6006/v1/traces")
|
|
420
467
|
if not isinstance(endpoint, str) or not endpoint:
|
|
421
468
|
endpoint = "http://localhost:6006/v1/traces"
|
|
@@ -431,30 +478,82 @@ def _log_to_tracker(
|
|
|
431
478
|
trace_url = get_phoenix_trace_url(result.tracker_metadata)
|
|
432
479
|
if trace_url:
|
|
433
480
|
console.print(f"[dim]Phoenix Trace: {trace_url}[/dim]")
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
481
|
+
|
|
482
|
+
options = phoenix_options or {}
|
|
483
|
+
log_traces = log_phoenix_traces_fn or log_phoenix_traces
|
|
484
|
+
extra = log_traces(
|
|
485
|
+
tracker,
|
|
486
|
+
result,
|
|
487
|
+
max_traces=options.get("max_traces"),
|
|
488
|
+
metadata=options.get("metadata"),
|
|
489
|
+
)
|
|
490
|
+
if extra:
|
|
491
|
+
console.print(
|
|
492
|
+
f"[dim]Recorded {extra} Phoenix RAG trace(s) for detailed observability.[/dim]"
|
|
493
|
+
)
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _log_analysis_artifacts(
|
|
497
|
+
settings: Settings,
|
|
498
|
+
result: EvaluationRun,
|
|
499
|
+
console: Console,
|
|
500
|
+
tracker_type: str,
|
|
501
|
+
*,
|
|
502
|
+
analysis_payload: dict[str, Any],
|
|
503
|
+
artifact_index: dict[str, Any],
|
|
504
|
+
report_text: str,
|
|
505
|
+
output_path: Path,
|
|
506
|
+
report_path: Path,
|
|
507
|
+
) -> None:
|
|
508
|
+
"""Log analysis artifacts to tracker(s) as a separate trace/run."""
|
|
509
|
+
try:
|
|
510
|
+
tracker_types = _resolve_tracker_list(tracker_type)
|
|
511
|
+
except ValueError as exc:
|
|
512
|
+
print_cli_error(console, "Tracker 설정이 올바르지 않습니다.", details=str(exc))
|
|
513
|
+
raise typer.Exit(2) from exc
|
|
514
|
+
if not tracker_types or tracker_types == ["none"]:
|
|
515
|
+
return
|
|
516
|
+
|
|
517
|
+
metadata = {
|
|
518
|
+
"run_id": result.run_id,
|
|
519
|
+
"dataset_name": result.dataset_name,
|
|
520
|
+
"dataset_version": result.dataset_version,
|
|
521
|
+
"analysis_output": str(output_path),
|
|
522
|
+
"analysis_report": str(report_path),
|
|
523
|
+
"analysis_artifacts_dir": artifact_index.get("dir"),
|
|
524
|
+
"event_type": "analysis",
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
for provider in tracker_types:
|
|
528
|
+
tracker = _get_tracker(settings, provider, console, required=True)
|
|
529
|
+
if tracker is None:
|
|
530
|
+
raise typer.Exit(2)
|
|
531
|
+
trace_name = f"analysis-{result.run_id[:8]}"
|
|
532
|
+
try:
|
|
533
|
+
trace_id = tracker.start_trace(trace_name, metadata=metadata)
|
|
534
|
+
tracker.save_artifact(
|
|
535
|
+
trace_id, "analysis_payload", analysis_payload, artifact_type="json"
|
|
439
536
|
)
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
extra = log_traces(
|
|
446
|
-
tracker,
|
|
447
|
-
result,
|
|
448
|
-
max_traces=options.get("max_traces"),
|
|
449
|
-
metadata=options.get("metadata"),
|
|
450
|
-
)
|
|
451
|
-
if extra:
|
|
537
|
+
tracker.save_artifact(
|
|
538
|
+
trace_id, "analysis_artifacts", artifact_index, artifact_type="json"
|
|
539
|
+
)
|
|
540
|
+
tracker.save_artifact(trace_id, "analysis_report", report_text, artifact_type="text")
|
|
541
|
+
tracker.end_trace(trace_id)
|
|
452
542
|
console.print(
|
|
453
|
-
f"[
|
|
543
|
+
f"[green]Logged analysis artifacts to {provider.capitalize()}[/green] "
|
|
544
|
+
f"(trace_id: {trace_id})"
|
|
454
545
|
)
|
|
546
|
+
except Exception as exc:
|
|
547
|
+
print_cli_error(
|
|
548
|
+
console,
|
|
549
|
+
f"{provider.capitalize()} 분석 로깅에 실패했습니다.",
|
|
550
|
+
details=str(exc),
|
|
551
|
+
)
|
|
552
|
+
raise typer.Exit(2) from exc
|
|
455
553
|
|
|
456
554
|
|
|
457
555
|
def _save_to_db(
|
|
556
|
+
settings: Settings,
|
|
458
557
|
db_path: Path | None,
|
|
459
558
|
result,
|
|
460
559
|
console: Console,
|
|
@@ -463,7 +562,7 @@ def _save_to_db(
|
|
|
463
562
|
export_excel: bool = True,
|
|
464
563
|
) -> None:
|
|
465
564
|
"""Persist evaluation run (and optional prompt set) to database."""
|
|
466
|
-
storage = build_storage_adapter(settings=
|
|
565
|
+
storage = build_storage_adapter(settings=settings, db_path=db_path)
|
|
467
566
|
storage_label = (
|
|
468
567
|
"PostgreSQL" if isinstance(storage, PostgreSQLStorageAdapter) else f"SQLite ({db_path})"
|
|
469
568
|
)
|
|
@@ -507,6 +606,7 @@ def _save_to_db(
|
|
|
507
606
|
|
|
508
607
|
|
|
509
608
|
def _save_multiturn_to_db(
|
|
609
|
+
settings: Settings,
|
|
510
610
|
db_path: Path | None,
|
|
511
611
|
run_record: MultiTurnRunRecord,
|
|
512
612
|
conversations: list[MultiTurnConversationRecord],
|
|
@@ -518,7 +618,7 @@ def _save_multiturn_to_db(
|
|
|
518
618
|
metric_thresholds: dict[str, float] | None = None,
|
|
519
619
|
) -> None:
|
|
520
620
|
"""Persist multiturn evaluation run to database."""
|
|
521
|
-
storage = build_storage_adapter(settings=
|
|
621
|
+
storage = build_storage_adapter(settings=settings, db_path=db_path)
|
|
522
622
|
storage_label = (
|
|
523
623
|
"PostgreSQL" if isinstance(storage, PostgreSQLStorageAdapter) else f"SQLite ({db_path})"
|
|
524
624
|
)
|
|
@@ -740,6 +840,8 @@ def log_phoenix_traces(
|
|
|
740
840
|
return 0
|
|
741
841
|
|
|
742
842
|
limit = max_traces if max_traces is not None else run.total_test_cases
|
|
843
|
+
if not isinstance(limit, int):
|
|
844
|
+
limit = None
|
|
743
845
|
|
|
744
846
|
count = 0
|
|
745
847
|
for result in run.results:
|
|
@@ -1173,8 +1275,10 @@ def _collect_prompt_metadata(
|
|
|
1173
1275
|
prompt_path=target,
|
|
1174
1276
|
content=content,
|
|
1175
1277
|
)
|
|
1176
|
-
|
|
1177
|
-
|
|
1278
|
+
summary_dict = asdict(summary)
|
|
1279
|
+
summary_dict["content_preview"] = _build_content_preview(content)
|
|
1280
|
+
summary_dict["content"] = content
|
|
1281
|
+
summaries.append(summary_dict)
|
|
1178
1282
|
|
|
1179
1283
|
return summaries
|
|
1180
1284
|
|