evalvault-1.74.0-py3-none-any.whl → evalvault-1.76.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +127 -80
- evalvault/adapters/inbound/api/routers/calibration.py +9 -9
- evalvault/adapters/inbound/api/routers/chat.py +303 -17
- evalvault/adapters/inbound/api/routers/config.py +3 -1
- evalvault/adapters/inbound/api/routers/domain.py +10 -5
- evalvault/adapters/inbound/api/routers/pipeline.py +3 -3
- evalvault/adapters/inbound/api/routers/runs.py +23 -4
- evalvault/adapters/inbound/cli/commands/analyze.py +10 -12
- evalvault/adapters/inbound/cli/commands/benchmark.py +10 -8
- evalvault/adapters/inbound/cli/commands/calibrate.py +2 -7
- evalvault/adapters/inbound/cli/commands/calibrate_judge.py +2 -7
- evalvault/adapters/inbound/cli/commands/compare.py +2 -7
- evalvault/adapters/inbound/cli/commands/debug.py +3 -2
- evalvault/adapters/inbound/cli/commands/domain.py +12 -12
- evalvault/adapters/inbound/cli/commands/experiment.py +9 -8
- evalvault/adapters/inbound/cli/commands/gate.py +3 -2
- evalvault/adapters/inbound/cli/commands/graph_rag.py +2 -2
- evalvault/adapters/inbound/cli/commands/history.py +3 -12
- evalvault/adapters/inbound/cli/commands/method.py +3 -4
- evalvault/adapters/inbound/cli/commands/ops.py +2 -2
- evalvault/adapters/inbound/cli/commands/pipeline.py +2 -2
- evalvault/adapters/inbound/cli/commands/profile_difficulty.py +3 -12
- evalvault/adapters/inbound/cli/commands/prompts.py +4 -18
- evalvault/adapters/inbound/cli/commands/regress.py +5 -4
- evalvault/adapters/inbound/cli/commands/run.py +188 -59
- evalvault/adapters/inbound/cli/commands/run_helpers.py +181 -70
- evalvault/adapters/inbound/cli/commands/stage.py +6 -25
- evalvault/adapters/inbound/cli/utils/options.py +10 -4
- evalvault/adapters/inbound/mcp/tools.py +11 -8
- evalvault/adapters/outbound/analysis/embedding_analyzer_module.py +17 -1
- evalvault/adapters/outbound/analysis/embedding_searcher_module.py +14 -0
- evalvault/adapters/outbound/domain_memory/__init__.py +8 -4
- evalvault/adapters/outbound/domain_memory/factory.py +68 -0
- evalvault/adapters/outbound/domain_memory/postgres_adapter.py +1062 -0
- evalvault/adapters/outbound/domain_memory/postgres_domain_memory_schema.sql +177 -0
- evalvault/adapters/outbound/llm/factory.py +1 -1
- evalvault/adapters/outbound/llm/vllm_adapter.py +23 -0
- evalvault/adapters/outbound/nlp/korean/dense_retriever.py +10 -7
- evalvault/adapters/outbound/nlp/korean/toolkit.py +15 -4
- evalvault/adapters/outbound/phoenix/sync_service.py +99 -0
- evalvault/adapters/outbound/retriever/pgvector_store.py +165 -0
- evalvault/adapters/outbound/storage/base_sql.py +3 -2
- evalvault/adapters/outbound/storage/factory.py +53 -0
- evalvault/adapters/outbound/storage/postgres_schema.sql +2 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +209 -54
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +158 -9
- evalvault/config/instrumentation.py +8 -6
- evalvault/config/phoenix_support.py +5 -0
- evalvault/config/settings.py +71 -11
- evalvault/domain/services/domain_learning_hook.py +2 -1
- evalvault/domain/services/evaluator.py +2 -0
- evalvault/ports/inbound/web_port.py +3 -1
- evalvault/ports/outbound/storage_port.py +2 -0
- evalvault-1.76.0.dist-info/METADATA +221 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/RECORD +58 -53
- evalvault-1.74.0.dist-info/METADATA +0 -585
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/WHEEL +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.74.0.dist-info → evalvault-1.76.0.dist-info}/licenses/LICENSE.md +0 -0
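The headline change in 1.76.0 is pluggable storage: call sites across the CLI and API stop instantiating SQLiteStorageAdapter directly and route through a new build_storage_adapter factory (storage/factory.py), with a full PostgreSQL adapter and schema added alongside. The factory body is not shown in this diff; the sketch below infers its behavior from the call sites visible in the hunks (build_storage_adapter(settings=Settings(), db_path=db_path), the db_backend setting, and the "PostgreSQL is default when omitted" help text). The sqlite_adapter module path and both constructor signatures are assumptions, not the shipped code.

    # Minimal sketch of the new storage factory, inferred from this diff's call
    # sites; module path and constructor signatures below are assumptions.
    from pathlib import Path

    from evalvault.adapters.outbound.storage.postgres_adapter import PostgreSQLStorageAdapter
    from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter  # assumed path
    from evalvault.config.settings import Settings


    def build_storage_adapter(*, settings: Settings, db_path: Path | None = None):
        # An explicit --db path pins the run to SQLite; otherwise the configured
        # backend decides, with PostgreSQL as the new default.
        if db_path is not None or getattr(settings, "db_backend", "postgres") == "sqlite":
            return SQLiteStorageAdapter(db_path=db_path or settings.evalvault_db_path)
        return PostgreSQLStorageAdapter(settings=settings)  # assumed constructor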
@@ -6,7 +6,7 @@ import json
 from collections.abc import Callable, Sequence
 from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import Any
+from typing import Any
 
 import click
 import typer
@@ -18,13 +18,14 @@ from rich.table import Table
 from evalvault.adapters.outbound.dataset import StreamingConfig, StreamingDatasetLoader
 from evalvault.adapters.outbound.dataset.thresholds import extract_thresholds_from_rows
 from evalvault.adapters.outbound.kg.networkx_adapter import NetworkXKnowledgeGraph
-from evalvault.adapters.outbound.storage.
+from evalvault.adapters.outbound.storage.factory import build_storage_adapter
+from evalvault.adapters.outbound.storage.postgres_adapter import PostgreSQLStorageAdapter
 from evalvault.config.phoenix_support import (
     get_phoenix_trace_url,
     instrumentation_span,
     set_span_attributes,
 )
-from evalvault.config.settings import Settings
+from evalvault.config.settings import Settings, resolve_tracker_providers
 from evalvault.domain.entities import (
     Dataset,
     EvaluationRun,
@@ -57,7 +58,7 @@ from evalvault.ports.outbound.tracker_port import TrackerPort
 from ..utils.console import print_cli_error, print_cli_warning
 from ..utils.formatters import format_score, format_status
 
-TrackerType =
+TrackerType = str
 apply_retriever_to_dataset = retriever_context.apply_retriever_to_dataset
 
 
@@ -318,15 +319,22 @@ def _display_memory_insights(insights: dict[str, Any], console: Console) -> None
     console.print(Panel(panel_body, title="Domain Memory Insights", border_style="magenta"))
 
 
-def _get_tracker(
+def _get_tracker(
+    settings: Settings,
+    tracker_type: str,
+    console: Console,
+    *,
+    required: bool = False,
+) -> TrackerPort | None:
     """Get the appropriate tracker adapter based on type."""
     if tracker_type == "langfuse":
         if not settings.langfuse_public_key or not settings.langfuse_secret_key:
-
-
-
-
-
+            message = "Langfuse 자격 증명이 설정되지 않았습니다."
+            tips = ["LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY를 .env에 추가하세요."]
+            if required:
+                print_cli_error(console, message, fixes=tips)
+                raise typer.Exit(2)
+            print_cli_warning(console, message + " 로깅을 건너뜁니다.", tips=tips)
             return None
         from evalvault.adapters.outbound.tracker.langfuse_adapter import LangfuseAdapter
 
@@ -338,11 +346,12 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> Tra
 
     elif tracker_type == "mlflow":
         if not settings.mlflow_tracking_uri:
-
-
-
-
-
+            message = "MLflow tracking URI가 설정되지 않았습니다."
+            tips = ["MLFLOW_TRACKING_URI 환경 변수를 설정하세요."]
+            if required:
+                print_cli_error(console, message, fixes=tips)
+                raise typer.Exit(2)
+            print_cli_warning(console, message + " 로깅을 건너뜁니다.", tips=tips)
             return None
         try:
             from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
@@ -352,11 +361,12 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> Tra
                 experiment_name=settings.mlflow_experiment_name,
             )
         except ImportError:
-
-
-
-
-
+            message = "MLflow extra가 설치되지 않았습니다."
+            tips = ["uv sync --extra mlflow 명령으로 구성요소를 설치하세요."]
+            if required:
+                print_cli_error(console, message, fixes=tips)
+                raise typer.Exit(2)
+            print_cli_warning(console, message, tips=tips)
             return None
 
     elif tracker_type == "phoenix":
@@ -366,13 +376,16 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> Tra
             return PhoenixAdapter(
                 endpoint=settings.phoenix_endpoint,
                 service_name="evalvault",
+                project_name=getattr(settings, "phoenix_project_name", None),
+                annotations_enabled=getattr(settings, "phoenix_annotations_enabled", True),
             )
         except ImportError:
-
-
-
-
-
+            message = "Phoenix extra가 설치되지 않았습니다."
+            tips = ["uv sync --extra phoenix 명령으로 의존성을 추가하세요."]
+            if required:
+                print_cli_error(console, message, fixes=tips)
+                raise typer.Exit(2)
+            print_cli_warning(console, message, tips=tips)
             return None
 
     else:
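Taken together, the three branches above give _get_tracker a uniform contract: a misconfigured or missing tracker produces a warning and None in best-effort mode, and print_cli_error plus exit code 2 when required=True. A short sketch of the two modes (settings and console are stand-ins for illustration):

    # Behavior of the reworked _get_tracker, per the hunks above.
    _get_tracker(settings, "mlflow", console)                 # no MLFLOW_TRACKING_URI:
                                                              #   warns, returns None
    _get_tracker(settings, "mlflow", console, required=True)  # same condition:
                                                              #   print_cli_error + typer.Exit(2)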
@@ -384,6 +397,22 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> Tra
         return None
 
 
+def _resolve_tracker_list(tracker_type: str) -> list[str]:
+    providers = resolve_tracker_providers(tracker_type)
+    if not providers:
+        return []
+    if providers == ["none"]:
+        return ["none"]
+    supported = {"langfuse", "mlflow", "phoenix"}
+    unknown = [entry for entry in providers if entry not in supported]
+    if unknown:
+        raise ValueError(f"Unknown tracker provider(s): {', '.join(unknown)}")
+    required = {"mlflow", "phoenix"}
+    if not required.issubset(set(providers)):
+        raise ValueError("tracker must include both 'mlflow' and 'phoenix'")
+    return providers
+
+
 def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
     """Build a Phoenix UI URL for the given trace ID."""
 
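_resolve_tracker_list is the gatekeeper for the new multi-tracker logging, and it encodes a strict rule: once concrete providers are named, the list must contain both mlflow and phoenix. Assuming resolve_tracker_providers (added to config/settings.py in this release, body not shown) splits a comma-separated tracker string into provider names, it behaves like this:

    # Expected behavior, assuming resolve_tracker_providers("a,b") -> ["a", "b"].
    _resolve_tracker_list("none")                     # -> ["none"] (logging skipped)
    _resolve_tracker_list("mlflow,phoenix")           # -> ["mlflow", "phoenix"]
    _resolve_tracker_list("langfuse,mlflow,phoenix")  # -> all three providers
    _resolve_tracker_list("wandb,mlflow,phoenix")     # ValueError: Unknown tracker provider(s): wandb
    _resolve_tracker_list("mlflow")                   # ValueError: tracker must include both
                                                      #   'mlflow' and 'phoenix'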
@@ -394,7 +423,7 @@ def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
     return f"{base.rstrip('/')}/#/traces/{trace_id}"
 
 
-def _log_to_tracker(
+def _log_to_trackers(
     settings: Settings,
     result,
     console: Console,
@@ -403,18 +432,39 @@ def _log_to_tracker(
     phoenix_options: dict[str, Any] | None = None,
     log_phoenix_traces_fn: Callable[..., int] | None = None,
 ) -> None:
-    """Log evaluation results to the specified tracker."""
-
-
+    """Log evaluation results to the specified tracker(s)."""
+    try:
+        tracker_types = _resolve_tracker_list(tracker_type)
+    except ValueError as exc:
+        print_cli_error(console, "Tracker 설정이 올바르지 않습니다.", details=str(exc))
+        raise typer.Exit(2) from exc
+    if not tracker_types or tracker_types == ["none"]:
         return
 
-
-
-
-
-
-
-
+    result.tracker_metadata.setdefault("tracker_providers", tracker_types)
+    for provider in tracker_types:
+        tracker = _get_tracker(settings, provider, console, required=True)
+        if tracker is None:
+            raise typer.Exit(2)
+        tracker_name = provider.capitalize()
+        trace_id: str | None = None
+        with console.status(f"[bold green]Logging to {tracker_name}..."):
+            try:
+                trace_id = tracker.log_evaluation_run(result)
+                console.print(f"[green]Logged to {tracker_name}[/green] (trace_id: {trace_id})")
+            except Exception as exc:
+                print_cli_error(
+                    console,
+                    f"{tracker_name} 로깅에 실패했습니다.",
+                    details=str(exc),
+                )
+                raise typer.Exit(2) from exc
+
+        if trace_id:
+            provider_meta = result.tracker_metadata.setdefault(provider, {})
+            if isinstance(provider_meta, dict):
+                provider_meta.setdefault("trace_id", trace_id)
+        if provider == "phoenix":
             endpoint = getattr(settings, "phoenix_endpoint", "http://localhost:6006/v1/traces")
             if not isinstance(endpoint, str) or not endpoint:
                 endpoint = "http://localhost:6006/v1/traces"
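A successful pass through _log_to_trackers therefore leaves an audit trail in result.tracker_metadata; with tracker_type "mlflow,phoenix" it would look roughly like this (trace ids are placeholders):

    # Illustrative tracker_metadata after logging to both providers:
    {
        "tracker_providers": ["mlflow", "phoenix"],
        "mlflow": {"trace_id": "<mlflow run id>"},
        "phoenix": {"trace_id": "<phoenix trace id>"},
    }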
@@ -430,42 +480,96 @@ def _log_to_tracker(
             trace_url = get_phoenix_trace_url(result.tracker_metadata)
             if trace_url:
                 console.print(f"[dim]Phoenix Trace: {trace_url}[/dim]")
-
-
-
-
-
+
+            options = phoenix_options or {}
+            log_traces = log_phoenix_traces_fn or log_phoenix_traces
+            extra = log_traces(
+                tracker,
+                result,
+                max_traces=options.get("max_traces"),
+                metadata=options.get("metadata"),
+            )
+            if extra:
+                console.print(
+                    f"[dim]Recorded {extra} Phoenix RAG trace(s) for detailed observability.[/dim]"
+                )
+
+
+def _log_analysis_artifacts(
+    settings: Settings,
+    result: EvaluationRun,
+    console: Console,
+    tracker_type: str,
+    *,
+    analysis_payload: dict[str, Any],
+    artifact_index: dict[str, Any],
+    report_text: str,
+    output_path: Path,
+    report_path: Path,
+) -> None:
+    """Log analysis artifacts to tracker(s) as a separate trace/run."""
+    try:
+        tracker_types = _resolve_tracker_list(tracker_type)
+    except ValueError as exc:
+        print_cli_error(console, "Tracker 설정이 올바르지 않습니다.", details=str(exc))
+        raise typer.Exit(2) from exc
+    if not tracker_types or tracker_types == ["none"]:
+        return
+
+    metadata = {
+        "run_id": result.run_id,
+        "dataset_name": result.dataset_name,
+        "dataset_version": result.dataset_version,
+        "analysis_output": str(output_path),
+        "analysis_report": str(report_path),
+        "analysis_artifacts_dir": artifact_index.get("dir"),
+        "event_type": "analysis",
+    }
+
+    for provider in tracker_types:
+        tracker = _get_tracker(settings, provider, console, required=True)
+        if tracker is None:
+            raise typer.Exit(2)
+        trace_name = f"analysis-{result.run_id[:8]}"
+        try:
+            trace_id = tracker.start_trace(trace_name, metadata=metadata)
+            tracker.save_artifact(
+                trace_id, "analysis_payload", analysis_payload, artifact_type="json"
             )
-
-
-
-
-
-        extra = log_traces(
-            tracker,
-            result,
-            max_traces=options.get("max_traces"),
-            metadata=options.get("metadata"),
-        )
-        if extra:
+            tracker.save_artifact(
+                trace_id, "analysis_artifacts", artifact_index, artifact_type="json"
+            )
+            tracker.save_artifact(trace_id, "analysis_report", report_text, artifact_type="text")
+            tracker.end_trace(trace_id)
             console.print(
-                f"[
+                f"[green]Logged analysis artifacts to {provider.capitalize()}[/green] "
+                f"(trace_id: {trace_id})"
             )
+        except Exception as exc:
+            print_cli_error(
+                console,
+                f"{provider.capitalize()} 분석 로깅에 실패했습니다.",
+                details=str(exc),
+            )
+            raise typer.Exit(2) from exc
 
 
 def _save_to_db(
-    db_path: Path,
+    db_path: Path | None,
     result,
     console: Console,
     *,
-    storage_cls: type[SQLiteStorageAdapter] = SQLiteStorageAdapter,
     prompt_bundle: PromptSetBundle | None = None,
     export_excel: bool = True,
 ) -> None:
-    """Persist evaluation run (and optional prompt set) to
-
+    """Persist evaluation run (and optional prompt set) to database."""
+    storage = build_storage_adapter(settings=Settings(), db_path=db_path)
+    storage_label = (
+        "PostgreSQL" if isinstance(storage, PostgreSQLStorageAdapter) else f"SQLite ({db_path})"
+    )
+    export_base = db_path.parent if db_path else Path("data/exports")
+    with console.status(f"[bold green]Saving to database {storage_label}..."):
         try:
-            storage = storage_cls(db_path=db_path)
             if prompt_bundle:
                 storage.save_prompt_set(prompt_bundle)
             storage.save_run(result)
@@ -475,7 +579,8 @@ def _save_to_db(
                     prompt_bundle.prompt_set.prompt_set_id,
                 )
             if export_excel:
-
+                export_base.mkdir(parents=True, exist_ok=True)
+                excel_path = export_base / f"evalvault_run_{result.run_id}.xlsx"
                 try:
                     storage.export_run_to_excel(result.run_id, excel_path)
                     console.print(f"[green]Excel export saved: {excel_path}[/green]")
@@ -485,7 +590,7 @@
                         "엑셀 내보내기에 실패했습니다.",
                         tips=[str(exc)],
                     )
-            console.print(f"[green]Results saved to database: {
+            console.print(f"[green]Results saved to database: {storage_label}[/green]")
             console.print(f"[dim]Run ID: {result.run_id}[/dim]")
             if prompt_bundle:
                 console.print(
@@ -502,21 +607,24 @@ def _save_to_db(
 
 
 def _save_multiturn_to_db(
-    db_path: Path,
+    db_path: Path | None,
     run_record: MultiTurnRunRecord,
     conversations: list[MultiTurnConversationRecord],
     turn_results: list[MultiTurnTurnResult],
     console: Console,
     *,
-    storage_cls: type[SQLiteStorageAdapter] = SQLiteStorageAdapter,
     export_excel: bool = True,
     excel_output_path: Path | None = None,
     metric_thresholds: dict[str, float] | None = None,
 ) -> None:
-    """Persist multiturn evaluation run to
-
+    """Persist multiturn evaluation run to database."""
+    storage = build_storage_adapter(settings=Settings(), db_path=db_path)
+    storage_label = (
+        "PostgreSQL" if isinstance(storage, PostgreSQLStorageAdapter) else f"SQLite ({db_path})"
+    )
+    export_base = db_path.parent if db_path else Path("data/exports")
+    with console.status(f"[bold green]Saving multiturn run to {storage_label}..."):
         try:
-            storage = storage_cls(db_path=db_path)
             storage.save_multiturn_run(
                 run_record,
                 conversations,
@@ -524,8 +632,9 @@ def _save_multiturn_to_db(
                 metric_thresholds=metric_thresholds,
             )
             if export_excel:
+                export_base.mkdir(parents=True, exist_ok=True)
                 excel_path = excel_output_path or (
-
+                    export_base / f"evalvault_multiturn_{run_record.run_id}.xlsx"
                 )
                 try:
                     storage.export_multiturn_run_to_excel(run_record.run_id, excel_path)
@@ -536,7 +645,7 @@
                         "멀티턴 엑셀 내보내기에 실패했습니다.",
                         tips=[str(exc)],
                     )
-            console.print(f"[green]Multiturn results saved to database: {
+            console.print(f"[green]Multiturn results saved to database: {storage_label}[/green]")
             console.print(f"[dim]Run ID: {run_record.run_id}[/dim]")
         except Exception as exc:  # pragma: no cover - persistence errors
             print_cli_error(
@@ -1164,8 +1273,10 @@ def _collect_prompt_metadata(
             prompt_path=target,
             content=content,
         )
-
-
+        summary_dict = asdict(summary)
+        summary_dict["content_preview"] = _build_content_preview(content)
+        summary_dict["content"] = content
+        summaries.append(summary_dict)
 
     return summaries
 
@@ -16,7 +16,7 @@ from rich.table import Table
 from evalvault.adapters.outbound.improvement.stage_metric_playbook_loader import (
     StageMetricPlaybookLoader,
 )
-from evalvault.adapters.outbound.storage.
+from evalvault.adapters.outbound.storage.factory import build_storage_adapter
 from evalvault.config.settings import Settings
 from evalvault.domain.entities.stage import REQUIRED_STAGE_TYPES, StageEvent, StageMetric
 from evalvault.domain.services.stage_metric_guide_service import StageMetricGuideService
@@ -28,13 +28,6 @@ from ..utils.options import db_option
 logger = logging.getLogger(__name__)
 
 
-def _resolve_db_path(db_path: Path | None) -> Path:
-    resolved = db_path or Settings().evalvault_db_path
-    if resolved is None:
-        raise typer.BadParameter("Database path is not configured.")
-    return resolved
-
-
 @dataclass
 class ValidationStats:
     """Tracks StageEvent validation failures by error type."""
@@ -122,8 +115,7 @@ def create_stage_app(console: Console) -> typer.Typer:
             console.print("[yellow]No valid stage events found in the input file.[/yellow]")
             raise typer.Exit(1)
 
-
-        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        storage = build_storage_adapter(settings=Settings(), db_path=db_path)
         stored = storage.save_stage_events(events)
 
         console.print(f"[green]Stored {stored} stage event(s).[/green]")
@@ -147,8 +139,7 @@
         db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """List stage events for a run."""
-
-        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        storage = build_storage_adapter(settings=Settings(), db_path=db_path)
         events = storage.list_stage_events(run_id, stage_type=stage_type)
 
         if not events:
@@ -184,8 +175,7 @@
         db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Show summary stats for stage events."""
-
-        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        storage = build_storage_adapter(settings=Settings(), db_path=db_path)
         events = storage.list_stage_events(run_id)
         if not events:
             console.print("[yellow]No stage events found.[/yellow]")
@@ -218,8 +208,7 @@
         db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Compute stage metrics from stored events."""
-
-        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        storage = build_storage_adapter(settings=Settings(), db_path=db_path)
         events = storage.list_stage_events(run_id)
         if not events:
             console.print("[yellow]No stage events found.[/yellow]")
@@ -276,8 +265,7 @@
         db_path: Path | None = db_option(help_text="Path to database file."),
     ) -> None:
         """Report stage summary, metrics, and improvement guides."""
-
-        storage = SQLiteStorageAdapter(db_path=resolved_db_path)
+        storage = build_storage_adapter(settings=Settings(), db_path=db_path)
        events = storage.list_stage_events(run_id)
        if not events:
            console.print("[yellow]No stage events found.[/yellow]")
@@ -547,13 +535,6 @@ def _load_default_profile() -> str | None:
     return None
 
 
-def _resolve_db_path(db_path: Path | None) -> Path:
-    resolved = db_path or Settings().evalvault_db_path
-    if resolved is None:
-        raise typer.BadParameter("Database path is not configured.")
-    return resolved
-
-
 def _print_stage_summary(console: Console, summary_data) -> None:
     table = Table(show_header=True, header_style="bold cyan")
     table.add_column("Stage Type")
@@ -31,11 +31,11 @@ def profile_option(
 def db_option(
     *,
     default: str | Path | None = _UNSET,
-    help_text: str = "
+    help_text: str = "SQLite DB path (PostgreSQL is default when omitted).",
 ) -> Path | None:
     """Shared --db / -D option definition."""
 
-    resolved_default =
+    resolved_default = None if default is _UNSET else default
     normalized_default = _normalize_path(resolved_default)
     return typer.Option(
         normalized_default,
@@ -49,11 +49,17 @@
 def memory_db_option(
     *,
     default: str | Path | None = _UNSET,
-    help_text: str = "
+    help_text: str = "Domain Memory SQLite path (Postgres is default when omitted).",
 ) -> Path | None:
     """Shared option factory for the domain memory database path."""
 
-
+    if default is _UNSET:
+        settings = Settings()
+        resolved_default = (
+            settings.evalvault_memory_db_path if settings.db_backend == "sqlite" else None
+        )
+    else:
+        resolved_default = default
     normalized_default = _normalize_path(resolved_default)
     return typer.Option(
         normalized_default,
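Both option factories now defer to Settings instead of hard-coding a SQLite default. The net effect, sketched with illustrative Settings values:

    # Defaults produced by the factories above (a sketch; Settings values vary).
    settings = Settings()
    # db_backend == "postgres": both options default to None, and
    # build_storage_adapter / the domain-memory layer pick PostgreSQL downstream.
    # db_backend == "sqlite": memory_db_option defaults to
    # settings.evalvault_memory_db_path, while db_option still defaults to None
    # and an explicit --db path opts a command into SQLite.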
@@ -20,12 +20,13 @@ from evalvault.adapters.outbound.analysis.pipeline_factory import build_analysis
 from evalvault.adapters.outbound.analysis.statistical_adapter import StatisticalAnalysisAdapter
 from evalvault.adapters.outbound.llm import SettingsLLMFactory, get_llm_adapter
 from evalvault.adapters.outbound.nlp.korean.toolkit_factory import try_create_korean_toolkit
-from evalvault.adapters.outbound.storage.
+from evalvault.adapters.outbound.storage.factory import build_storage_adapter
 from evalvault.config.settings import Settings, apply_profile
 from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
 from evalvault.domain.services.analysis_service import AnalysisService
 from evalvault.domain.services.evaluator import RagasEvaluator
 from evalvault.ports.inbound.web_port import EvalRequest, RunFilters, RunSummary
+from evalvault.ports.outbound.storage_port import StoragePort
 
 from .schemas import (
     AnalyzeCompareRequest,
@@ -82,7 +83,7 @@ def list_runs(payload: dict[str, Any] | ListRunsRequest) -> ListRunsResponse:
             errors=[_error("EVAL_DB_UNSAFE_PATH", str(exc), stage=ErrorStage.storage)]
         )
 
-    storage =
+    storage = build_storage_adapter(settings=Settings(), db_path=db_path)
     adapter = WebUIAdapter(storage=storage, settings=Settings())
 
     filters = RunFilters(
@@ -123,7 +124,7 @@ def get_run_summary(payload: dict[str, Any] | GetRunSummaryRequest) -> GetRunSum
             errors=[_error("EVAL_DB_UNSAFE_PATH", str(exc), stage=ErrorStage.storage)]
         )
 
-    storage =
+    storage = build_storage_adapter(settings=Settings(), db_path=db_path)
     try:
         run = storage.get_run(request.run_id)
     except KeyError as exc:
@@ -175,7 +176,7 @@ def run_evaluation(payload: dict[str, Any] | RunEvaluationRequest) -> RunEvaluat
             errors=[_error("EVAL_LLM_INIT_FAILED", str(exc), stage=ErrorStage.evaluate)],
         )
 
-    storage =
+    storage = build_storage_adapter(settings=Settings(), db_path=db_path)
     llm_factory = SettingsLLMFactory(settings)
     korean_toolkit = try_create_korean_toolkit()
     evaluator = RagasEvaluator(korean_toolkit=korean_toolkit, llm_factory=llm_factory)
@@ -266,7 +267,7 @@ def analyze_compare(payload: dict[str, Any] | AnalyzeCompareRequest) -> AnalyzeC
             errors=[_error("EVAL_DB_UNSAFE_PATH", str(exc), stage=ErrorStage.storage)],
         )
 
-    storage =
+    storage = build_storage_adapter(settings=Settings(), db_path=db_path)
     try:
         run_a = storage.get_run(request.run_id_a)
         run_b = storage.get_run(request.run_id_b)
@@ -503,9 +504,11 @@ def _serialize_run_summary(summary: RunSummary) -> RunSummaryPayload:
     return RunSummaryPayload.model_validate(payload)
 
 
-def _resolve_db_path(db_path: Path | None) -> Path:
+def _resolve_db_path(db_path: Path | None) -> Path | None:
+    settings = Settings()
     if db_path is None:
-        settings
+        if getattr(settings, "db_backend", "postgres") != "sqlite":
+            return None
         db_path = Path(settings.evalvault_db_path)
     resolved = db_path.expanduser().resolve()
     _ensure_allowed_path(resolved)
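The API-side _resolve_db_path now mirrors the CLI: None is a meaningful result that routes the request to PostgreSQL rather than an error. Summarized:

    # New contract of _resolve_db_path, per the hunk above:
    _resolve_db_path(Path("./eval.db"))  # explicit path: expanded, resolved, and
                                         #   checked by _ensure_allowed_path, as before
    _resolve_db_path(None)               # db_backend != "sqlite": returns None, and
                                         #   build_storage_adapter selects PostgreSQL
    _resolve_db_path(None)               # db_backend == "sqlite": falls back to
                                         #   Path(settings.evalvault_db_path)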
@@ -547,7 +550,7 @@ def _run_auto_analysis(
     *,
     run_id: str,
     run: Any,
-    storage:
+    storage: StoragePort,
     llm_adapter: Any,
     analysis_output: Path | None,
     analysis_report: Path | None,
@@ -152,6 +152,20 @@ class EmbeddingAnalyzerModule(BaseAnalysisModule):
             errors.append(str(exc))
             retriever = None
 
+        if retriever is None and (backend_hint == "vllm" or embedding_profile == "vllm"):
+            try:
+                from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
+
+                adapter = VLLMAdapter(settings)
+                retriever = KoreanDenseRetriever(
+                    model_name=model_name or settings.vllm_embedding_model,
+                    ollama_adapter=adapter,
+                    profile=embedding_profile,
+                )
+            except Exception as exc:
+                errors.append(str(exc))
+                retriever = None
+
         if retriever is None and backend_hint != "ollama":
             try:
                 retriever = KoreanDenseRetriever(model_name=model_name)
@@ -166,7 +180,9 @@ class EmbeddingAnalyzerModule(BaseAnalysisModule):
             batch_size=batch_size if isinstance(batch_size, int) else None,
         )
         meta = {
-            "backend": "
+            "backend": "vllm"
+            if backend_hint == "vllm" or embedding_profile == "vllm"
+            else "ollama"
             if retriever.model_name.startswith("qwen3")
             else "sentence-transformers",
             "model": retriever.model_name,
@@ -77,6 +77,20 @@ class EmbeddingSearcherModule(BaseAnalysisModule):
             errors.append(str(exc))
             retriever = None
 
+        if retriever is None and embedding_profile == "vllm":
+            try:
+                from evalvault.adapters.outbound.llm.vllm_adapter import VLLMAdapter
+
+                adapter = VLLMAdapter(settings)
+                retriever = KoreanDenseRetriever(
+                    model_name=settings.vllm_embedding_model,
+                    ollama_adapter=adapter,
+                    profile=embedding_profile,
+                )
+            except Exception as exc:
+                errors.append(str(exc))
+                retriever = None
+
         if retriever is None:
             try:
                 retriever = KoreanDenseRetriever(model_name=model_name)
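Both embedding modules now share the same fallback discipline: try the vLLM-backed retriever when the profile asks for it, record the failure, and fall through to the next backend. Condensed to its skeleton (module-specific guards omitted; note that the vLLM adapter is handed to KoreanDenseRetriever through the existing ollama_adapter parameter):

    # The shared fallback skeleton, condensed from the two hunks above.
    def _build_retriever(settings, model_name, embedding_profile, errors):
        attempts = [
            # 1) vLLM-backed embeddings when the profile requests them
            lambda: KoreanDenseRetriever(
                model_name=model_name or settings.vllm_embedding_model,
                ollama_adapter=VLLMAdapter(settings),  # adapter slot reused despite its name
                profile=embedding_profile,
            ),
            # 2) default backend (sentence-transformers / ollama heuristics)
            lambda: KoreanDenseRetriever(model_name=model_name),
        ]
        for attempt in attempts:
            try:
                return attempt()
            except Exception as exc:  # record and fall through to the next backend
                errors.append(str(exc))
        return None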
@@ -1,7 +1,11 @@
 """Domain Memory adapters for factual, experiential, and working memory layers."""
 
-from evalvault.adapters.outbound.domain_memory.
-
-
+from evalvault.adapters.outbound.domain_memory.factory import build_domain_memory_adapter
+from evalvault.adapters.outbound.domain_memory.postgres_adapter import PostgresDomainMemoryAdapter
+from evalvault.adapters.outbound.domain_memory.sqlite_adapter import SQLiteDomainMemoryAdapter
 
-__all__ = [
+__all__ = [
+    "SQLiteDomainMemoryAdapter",
+    "PostgresDomainMemoryAdapter",
+    "build_domain_memory_adapter",
+]