evalvault-1.75.0-py3-none-any.whl → evalvault-1.76.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import json
 from collections.abc import Callable, Sequence
 from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import Any, Literal
+from typing import Any
 
 import click
 import typer
@@ -25,7 +25,7 @@ from evalvault.config.phoenix_support import (
     instrumentation_span,
     set_span_attributes,
 )
-from evalvault.config.settings import Settings
+from evalvault.config.settings import Settings, resolve_tracker_providers
 from evalvault.domain.entities import (
     Dataset,
     EvaluationRun,
@@ -58,7 +58,7 @@ from evalvault.ports.outbound.tracker_port import TrackerPort
 from ..utils.console import print_cli_error, print_cli_warning
 from ..utils.formatters import format_score, format_status
 
-TrackerType = Literal["langfuse", "mlflow", "phoenix", "none"]
+TrackerType = str
 apply_retriever_to_dataset = retriever_context.apply_retriever_to_dataset
 
 
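Note: `TrackerType` widens from a `Literal` to a plain `str` because the tracker option now accepts a comma-separated provider list, resolved via the newly imported `resolve_tracker_providers`. That helper's implementation is not part of this diff; a minimal sketch of its assumed behavior:

```python
# Hypothetical sketch of resolve_tracker_providers, which this diff imports
# from evalvault.config.settings but does not show. Assumed behavior: split a
# comma-separated tracker option into a normalized, de-duplicated list.
def resolve_tracker_providers(tracker_type: str) -> list[str]:
    providers: list[str] = []
    for entry in tracker_type.split(","):
        name = entry.strip().lower()
        if name and name not in providers:
            providers.append(name)
    return providers


assert resolve_tracker_providers("mlflow, phoenix") == ["mlflow", "phoenix"]
assert resolve_tracker_providers("none") == ["none"]
```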
@@ -319,15 +319,22 @@ def _display_memory_insights(insights: dict[str, Any], console: Console) -> None:
     console.print(Panel(panel_body, title="Domain Memory Insights", border_style="magenta"))
 
 
-def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> TrackerPort | None:
+def _get_tracker(
+    settings: Settings,
+    tracker_type: str,
+    console: Console,
+    *,
+    required: bool = False,
+) -> TrackerPort | None:
     """Get the appropriate tracker adapter based on type."""
     if tracker_type == "langfuse":
         if not settings.langfuse_public_key or not settings.langfuse_secret_key:
-            print_cli_warning(
-                console,
-                "Langfuse 자격 증명이 설정되지 않아 로깅을 건너뜁니다.",
-                tips=["LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY를 .env에 추가하세요."],
-            )
+            message = "Langfuse 자격 증명이 설정되지 않았습니다."
+            tips = ["LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY를 .env에 추가하세요."]
+            if required:
+                print_cli_error(console, message, fixes=tips)
+                raise typer.Exit(2)
+            print_cli_warning(console, message + " 로깅을 건너뜁니다.", tips=tips)
             return None
         from evalvault.adapters.outbound.tracker.langfuse_adapter import LangfuseAdapter
 
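Note: the new keyword-only `required` flag escalates a missing-configuration warning into a hard failure with exit code 2; the same warn-or-fail pattern recurs in the MLflow and Phoenix branches below. A stand-alone sketch of the pattern, with `print`/`SystemExit` standing in for the rich console helpers and `typer.Exit`:

```python
# Stand-alone sketch of the warn-vs-fail pattern (assumed semantics); the
# real code uses a rich Console with print_cli_warning/print_cli_error and
# raises typer.Exit(2).
def ensure_configured(value: str | None, message: str, *, required: bool = False) -> bool:
    if value:
        return True
    if required:
        print(f"error: {message}")
        raise SystemExit(2)  # hard failure when the tracker was explicitly requested
    print(f"warning: {message} Skipping logging.")
    return False


ensure_configured(None, "MLflow tracking URI is not set.")  # warns and returns False
```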
@@ -339,11 +346,12 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> TrackerPort | None:
 
     elif tracker_type == "mlflow":
         if not settings.mlflow_tracking_uri:
-            print_cli_warning(
-                console,
-                "MLflow tracking URI가 설정되지 않아 로깅을 건너뜁니다.",
-                tips=["MLFLOW_TRACKING_URI 환경 변수를 설정하세요."],
-            )
+            message = "MLflow tracking URI가 설정되지 않았습니다."
+            tips = ["MLFLOW_TRACKING_URI 환경 변수를 설정하세요."]
+            if required:
+                print_cli_error(console, message, fixes=tips)
+                raise typer.Exit(2)
+            print_cli_warning(console, message + " 로깅을 건너뜁니다.", tips=tips)
             return None
         try:
             from evalvault.adapters.outbound.tracker.mlflow_adapter import MLflowAdapter
@@ -353,11 +361,12 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> TrackerPort | None:
                 experiment_name=settings.mlflow_experiment_name,
             )
         except ImportError:
-            print_cli_warning(
-                console,
-                "MLflow extra가 설치되지 않았습니다.",
-                tips=["uv sync --extra mlflow 명령으로 구성요소를 설치하세요."],
-            )
+            message = "MLflow extra가 설치되지 않았습니다."
+            tips = ["uv sync --extra mlflow 명령으로 구성요소를 설치하세요."]
+            if required:
+                print_cli_error(console, message, fixes=tips)
+                raise typer.Exit(2)
+            print_cli_warning(console, message, tips=tips)
             return None
 
     elif tracker_type == "phoenix":
@@ -367,13 +376,16 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> TrackerPort | None:
             return PhoenixAdapter(
                 endpoint=settings.phoenix_endpoint,
                 service_name="evalvault",
+                project_name=getattr(settings, "phoenix_project_name", None),
+                annotations_enabled=getattr(settings, "phoenix_annotations_enabled", True),
             )
         except ImportError:
-            print_cli_warning(
-                console,
-                "Phoenix extra가 설치되지 않았습니다.",
-                tips=["uv sync --extra phoenix 명령으로 의존성을 추가하세요."],
-            )
+            message = "Phoenix extra가 설치되지 않았습니다."
+            tips = ["uv sync --extra phoenix 명령으로 의존성을 추가하세요."]
+            if required:
+                print_cli_error(console, message, fixes=tips)
+                raise typer.Exit(2)
+            print_cli_warning(console, message, tips=tips)
             return None
 
     else:
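Note: the two new `PhoenixAdapter` arguments are read with `getattr` defaults, presumably so `Settings` objects that predate the new Phoenix fields keep working. A toy illustration of that compatibility behavior:

```python
# Toy illustration: getattr with a default never raises, so settings objects
# without the new Phoenix fields fall back to sensible values.
class OldSettings:  # hypothetical pre-1.76 settings object
    phoenix_endpoint = "http://localhost:6006/v1/traces"


settings = OldSettings()
assert getattr(settings, "phoenix_project_name", None) is None
assert getattr(settings, "phoenix_annotations_enabled", True) is True
```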
@@ -385,6 +397,19 @@ def _get_tracker(settings: Settings, tracker_type: str, console: Console) -> TrackerPort | None:
     return None
 
 
+def _resolve_tracker_list(tracker_type: str) -> list[str]:
+    providers = resolve_tracker_providers(tracker_type)
+    if not providers:
+        return []
+    if providers == ["none"]:
+        return ["none"]
+    supported = {"langfuse", "mlflow", "phoenix"}
+    unknown = [entry for entry in providers if entry not in supported]
+    if unknown:
+        raise ValueError(f"Unknown tracker provider(s): {', '.join(unknown)}")
+    return providers
+
+
 def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
     """Build a Phoenix UI URL for the given trace ID."""
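Note: with the validation above, `_resolve_tracker_list` accepts any mix of the three supported providers and rejects unknown names. Expected input/output pairs, assuming comma-splitting as sketched earlier:

```python
# Expected behavior of _resolve_tracker_list (input -> result), assuming
# resolve_tracker_providers splits and normalizes a comma-separated string:
#   ""                -> []                      (nothing to log)
#   "none"            -> ["none"]                (callers skip logging)
#   "langfuse"        -> ["langfuse"]
#   "mlflow,phoenix"  -> ["mlflow", "phoenix"]
#   "mlflow,wandb"    -> ValueError: Unknown tracker provider(s): wandb
```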
@@ -395,7 +420,7 @@ def _build_phoenix_trace_url(endpoint: str, trace_id: str) -> str:
     return f"{base.rstrip('/')}/#/traces/{trace_id}"
 
 
-def _log_to_tracker(
+def _log_to_trackers(
     settings: Settings,
     result,
     console: Console,
@@ -404,18 +429,39 @@ def _log_to_tracker(
     phoenix_options: dict[str, Any] | None = None,
     log_phoenix_traces_fn: Callable[..., int] | None = None,
 ) -> None:
-    """Log evaluation results to the specified tracker."""
-    tracker = _get_tracker(settings, tracker_type, console)
-    if tracker is None:
+    """Log evaluation results to the specified tracker(s)."""
+    try:
+        tracker_types = _resolve_tracker_list(tracker_type)
+    except ValueError as exc:
+        print_cli_error(console, "Tracker 설정이 올바르지 않습니다.", details=str(exc))
+        raise typer.Exit(2) from exc
+    if not tracker_types or tracker_types == ["none"]:
         return
 
-    tracker_name = tracker_type.capitalize()
-    trace_id: str | None = None
-    with console.status(f"[bold green]Logging to {tracker_name}..."):
-        try:
-            trace_id = tracker.log_evaluation_run(result)
-            console.print(f"[green]Logged to {tracker_name}[/green] (trace_id: {trace_id})")
-            if trace_id and tracker_type == "phoenix":
+    result.tracker_metadata.setdefault("tracker_providers", tracker_types)
+    for provider in tracker_types:
+        tracker = _get_tracker(settings, provider, console, required=True)
+        if tracker is None:
+            raise typer.Exit(2)
+        tracker_name = provider.capitalize()
+        trace_id: str | None = None
+        with console.status(f"[bold green]Logging to {tracker_name}..."):
+            try:
+                trace_id = tracker.log_evaluation_run(result)
+                console.print(f"[green]Logged to {tracker_name}[/green] (trace_id: {trace_id})")
+            except Exception as exc:
+                print_cli_error(
+                    console,
+                    f"{tracker_name} 로깅에 실패했습니다.",
+                    details=str(exc),
+                )
+                raise typer.Exit(2) from exc
+
+        if trace_id:
+            provider_meta = result.tracker_metadata.setdefault(provider, {})
+            if isinstance(provider_meta, dict):
+                provider_meta.setdefault("trace_id", trace_id)
+        if provider == "phoenix":
             endpoint = getattr(settings, "phoenix_endpoint", "http://localhost:6006/v1/traces")
             if not isinstance(endpoint, str) or not endpoint:
                 endpoint = "http://localhost:6006/v1/traces"
@@ -431,27 +477,78 @@ def _log_to_tracker(
             trace_url = get_phoenix_trace_url(result.tracker_metadata)
             if trace_url:
                 console.print(f"[dim]Phoenix Trace: {trace_url}[/dim]")
-        except Exception as exc:  # pragma: no cover - telemetry best-effort
-            print_cli_warning(
-                console,
-                f"{tracker_name} 로깅에 실패했습니다.",
-                tips=[str(exc)],
+
+            options = phoenix_options or {}
+            log_traces = log_phoenix_traces_fn or log_phoenix_traces
+            extra = log_traces(
+                tracker,
+                result,
+                max_traces=options.get("max_traces"),
+                metadata=options.get("metadata"),
+            )
+            if extra:
+                console.print(
+                    f"[dim]Recorded {extra} Phoenix RAG trace(s) for detailed observability.[/dim]"
+                )
+
+
+def _log_analysis_artifacts(
+    settings: Settings,
+    result: EvaluationRun,
+    console: Console,
+    tracker_type: str,
+    *,
+    analysis_payload: dict[str, Any],
+    artifact_index: dict[str, Any],
+    report_text: str,
+    output_path: Path,
+    report_path: Path,
+) -> None:
+    """Log analysis artifacts to tracker(s) as a separate trace/run."""
+    try:
+        tracker_types = _resolve_tracker_list(tracker_type)
+    except ValueError as exc:
+        print_cli_error(console, "Tracker 설정이 올바르지 않습니다.", details=str(exc))
+        raise typer.Exit(2) from exc
+    if not tracker_types or tracker_types == ["none"]:
+        return
+
+    metadata = {
+        "run_id": result.run_id,
+        "dataset_name": result.dataset_name,
+        "dataset_version": result.dataset_version,
+        "analysis_output": str(output_path),
+        "analysis_report": str(report_path),
+        "analysis_artifacts_dir": artifact_index.get("dir"),
+        "event_type": "analysis",
+    }
+
+    for provider in tracker_types:
+        tracker = _get_tracker(settings, provider, console, required=True)
+        if tracker is None:
+            raise typer.Exit(2)
+        trace_name = f"analysis-{result.run_id[:8]}"
+        try:
+            trace_id = tracker.start_trace(trace_name, metadata=metadata)
+            tracker.save_artifact(
+                trace_id, "analysis_payload", analysis_payload, artifact_type="json"
             )
-            return
-
-    if tracker_type == "phoenix":
-        options = phoenix_options or {}
-        log_traces = log_phoenix_traces_fn or log_phoenix_traces
-        extra = log_traces(
-            tracker,
-            result,
-            max_traces=options.get("max_traces"),
-            metadata=options.get("metadata"),
-        )
-        if extra:
+            tracker.save_artifact(
+                trace_id, "analysis_artifacts", artifact_index, artifact_type="json"
+            )
+            tracker.save_artifact(trace_id, "analysis_report", report_text, artifact_type="text")
+            tracker.end_trace(trace_id)
             console.print(
-                f"[dim]Recorded {extra} Phoenix RAG trace(s) for detailed observability.[/dim]"
+                f"[green]Logged analysis artifacts to {provider.capitalize()}[/green] "
+                f"(trace_id: {trace_id})"
             )
+        except Exception as exc:
+            print_cli_error(
+                console,
+                f"{provider.capitalize()} 분석 로깅에 실패했습니다.",
+                details=str(exc),
+            )
+            raise typer.Exit(2) from exc
 
 
 def _save_to_db(
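Note: `_log_analysis_artifacts` drives trackers through an explicit trace lifecycle (`start_trace`, three `save_artifact` calls, `end_trace`) rather than `log_evaluation_run`. A hypothetical `Protocol` covering only the calls made in this diff; the real `TrackerPort` in `evalvault.ports.outbound.tracker_port` may declare more:

```python
# Hypothetical Protocol matching only the TrackerPort calls made in this
# diff; the real port in evalvault.ports.outbound.tracker_port may differ.
from typing import Any, Protocol


class TrackerPortSketch(Protocol):
    def log_evaluation_run(self, result: Any) -> str: ...

    def start_trace(self, name: str, *, metadata: dict[str, Any] | None = None) -> str: ...

    def save_artifact(
        self, trace_id: str, name: str, payload: Any, *, artifact_type: str = "json"
    ) -> None: ...

    def end_trace(self, trace_id: str) -> None: ...
```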
@@ -1173,8 +1270,10 @@ def _collect_prompt_metadata(
             prompt_path=target,
             content=content,
         )
-        summary.content_preview = _build_content_preview(content)
-        summaries.append(asdict(summary))
+        summary_dict = asdict(summary)
+        summary_dict["content_preview"] = _build_content_preview(content)
+        summary_dict["content"] = content
+        summaries.append(summary_dict)
 
     return summaries
 
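Note: a plausible motivation for the `_collect_prompt_metadata` change, assuming `content_preview` is not a declared field of the summary dataclass: `dataclasses.asdict` emits only declared fields, so ad-hoc instance attributes are silently dropped, while keys added to the resulting dict survive:

```python
# Why build the dict first: dataclasses.asdict serializes declared fields
# only, so an attribute bolted onto the instance disappears.
from dataclasses import asdict, dataclass


@dataclass
class Summary:  # stand-in for the real prompt summary dataclass
    name: str


s = Summary("system_prompt")
s.content_preview = "You answer from context."  # not a declared field
assert "content_preview" not in asdict(s)       # silently dropped

d = asdict(s)
d["content_preview"] = "You answer from context."  # survives as a dict key
assert "content_preview" in d
```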
@@ -97,7 +97,7 @@ def _resolve_faithfulness_fallback_config(
 
 def _default_faithfulness_fallback_model(provider: str) -> str | None:
     if provider == "ollama":
-        return "gpt-oss-safeguard:20b"
+        return "qwen3:8b"
     if provider == "vllm":
         return "gpt-oss-120b"
     return None
@@ -104,6 +104,22 @@ class PhoenixSyncService:
                 dataset_description=description,
             )
         except Exception as exc:  # pragma: no cover - HTTP/serialization errors
+            message = str(exc)
+            if "already exists" in message:
+                existing = self._find_dataset_by_name(dataset_name)
+                if existing:
+                    dataset_obj = self._client.datasets.get_dataset(dataset=existing["id"])
+                    dataset_url = self._client.experiments.get_dataset_experiments_url(
+                        dataset_obj.id
+                    )
+                    return PhoenixDatasetInfo(
+                        dataset_id=dataset_obj.id,
+                        dataset_name=dataset_obj.name,
+                        dataset_version_id=dataset_obj.version_id,
+                        url=dataset_url,
+                        description=description,
+                        example_count=getattr(dataset_obj, "examples", None),
+                    )
             raise PhoenixSyncError(f"Dataset upload failed: {exc}") from exc
 
         dataset_url = self._client.experiments.get_dataset_experiments_url(phoenix_dataset.id)
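Note: the dataset upload becomes idempotent: an "already exists" error is resolved to the existing dataset instead of aborting the sync. The control flow reduced to a stub, with `create`/`find_existing` standing in for the Phoenix client calls:

```python
# Control-flow stub of the idempotent upload; create/find_existing stand in
# for the Phoenix client calls used above.
def upload_or_reuse(create, find_existing, name: str):
    try:
        return create(name)
    except Exception as exc:
        if "already exists" in str(exc):
            existing = find_existing(name)
            if existing is not None:
                return existing  # reuse instead of failing the sync
        raise RuntimeError(f"Dataset upload failed: {exc}") from exc
```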
@@ -173,6 +189,74 @@ class PhoenixSyncService:
         )
         return examples
 
+    def _find_dataset_by_name(self, dataset_name: str) -> dict[str, Any] | None:
+        try:
+            datasets = self._client.datasets.list()
+        except Exception:
+            return None
+        for entry in datasets:
+            if entry.get("name") == dataset_name:
+                return entry
+        return None
+
+    def sync_prompts(
+        self,
+        *,
+        prompt_entries: list[dict[str, Any]],
+        model_name: str,
+        model_provider: str,
+        prompt_set_name: str | None = None,
+    ) -> list[dict[str, Any]]:
+        """Create prompt versions in Phoenix Prompt Management."""
+
+        if not prompt_entries:
+            return []
+
+        try:
+            from phoenix.client.resources.prompts import PromptVersion
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise PhoenixSyncError("Phoenix prompt client unavailable") from exc
+
+        synced: list[dict[str, Any]] = []
+        for index, entry in enumerate(prompt_entries, start=1):
+            name = entry.get("name") or entry.get("role") or f"prompt_{index}"
+            content = entry.get("content") or entry.get("content_preview") or ""
+            if not content:
+                continue
+            prompt_version = PromptVersion(
+                [{"role": "system", "content": content}],
+                model_name=model_name,
+                model_provider=model_provider,
+                template_format="NONE",
+            )
+            prompt_metadata = {
+                "kind": entry.get("kind"),
+                "role": entry.get("role"),
+                "checksum": entry.get("checksum"),
+                "status": entry.get("status"),
+                "source": entry.get("source") or entry.get("path"),
+                "order": index,
+            }
+            if prompt_set_name:
+                prompt_metadata["prompt_set"] = prompt_set_name
+            try:
+                version = self._client.prompts.create(
+                    version=prompt_version,
+                    name=name,
+                    prompt_description=entry.get("notes"),
+                    prompt_metadata=_as_serializable(prompt_metadata),
+                )
+                synced.append(
+                    {
+                        **entry,
+                        "phoenix_prompt_version_id": getattr(version, "id", None),
+                    }
+                )
+            except Exception as exc:  # pragma: no cover - HTTP errors
+                raise PhoenixSyncError(f"Prompt sync failed: {exc}") from exc
+
+        return synced
+
     def _build_input_payload(self, test_case: TestCase) -> dict[str, Any]:
         return {
             "question": test_case.question,
@@ -258,6 +342,21 @@ def build_experiment_metadata(
         "total_test_cases": run.total_test_cases,
         "metrics": metrics,
     }
+    if run.results:
+        latencies = [r.latency_ms for r in run.results if r.latency_ms]
+        tokens = [r.tokens_used for r in run.results if r.tokens_used]
+        costs = [r.cost_usd for r in run.results if r.cost_usd is not None]
+        if latencies:
+            payload["avg_latency_ms"] = round(sum(latencies) / len(latencies), 2)
+        if tokens:
+            payload["avg_tokens"] = round(sum(tokens) / len(tokens), 2)
+        if costs:
+            payload["avg_cost_usd"] = round(sum(costs) / len(costs), 6)
+        if run.total_tokens:
+            payload["total_tokens"] = run.total_tokens
+        if run.total_cost_usd is not None:
+            payload["total_cost_usd"] = run.total_cost_usd
+        payload["error_rate"] = round(1 - run.pass_rate, 4)
     if reliability_snapshot:
         payload["reliability_snapshot"] = reliability_snapshot
     if dataset.metadata:
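Note: the new experiment-metadata aggregates, checked with toy numbers; falsy latency/token values and `None` costs are filtered out before averaging:

```python
# Toy numbers through the new aggregate fields:
latencies = [120.0, 80.0, 100.0]      # r.latency_ms, falsy values skipped
tokens = [500, 700]                   # r.tokens_used, falsy values skipped
costs = [0.001, 0.003]                # r.cost_usd, None values skipped
assert round(sum(latencies) / len(latencies), 2) == 100.0   # avg_latency_ms
assert round(sum(tokens) / len(tokens), 2) == 600.0         # avg_tokens
assert round(sum(costs) / len(costs), 6) == 0.002           # avg_cost_usd
assert round(1 - 0.92, 4) == 0.08                           # error_rate at pass_rate 0.92
```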