evalvault 1.59.0__py3-none-any.whl → 1.60.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -15,6 +15,13 @@ from urllib.request import urlopen
15
15
  from evalvault.config.phoenix_support import PhoenixExperimentResolver
16
16
  from evalvault.config.settings import Settings
17
17
  from evalvault.domain.entities.prompt import PromptSetBundle
18
+ from evalvault.domain.metrics.registry import (
19
+ get_metric_descriptions as registry_metric_descriptions,
20
+ )
21
+ from evalvault.domain.metrics.registry import (
22
+ list_metric_names,
23
+ list_metric_specs,
24
+ )
18
25
  from evalvault.domain.services.cluster_map_builder import build_cluster_map
19
26
  from evalvault.domain.services.prompt_registry import (
20
27
  PromptInput,
@@ -42,21 +49,6 @@ if TYPE_CHECKING:
42
49
 
43
50
  logger = logging.getLogger(__name__)
44
51
 
45
- # 지원하는 메트릭 목록
46
- AVAILABLE_METRICS = [
47
- "faithfulness",
48
- "answer_relevancy",
49
- "context_precision",
50
- "context_recall",
51
- "factual_correctness",
52
- "semantic_similarity",
53
- "summary_score",
54
- "summary_faithfulness",
55
- "insurance_term_accuracy",
56
- "entity_preservation",
57
- "contextual_relevancy",
58
- ]
59
-
60
52
 
61
53
  @dataclass
62
54
  class GateResult:
@@ -978,19 +970,15 @@ class WebUIAdapter:
978
970
 
979
971
  def get_available_metrics(self) -> list[str]:
980
972
  """사용 가능한 메트릭 목록 반환."""
981
- return AVAILABLE_METRICS.copy()
973
+ return list_metric_names()
974
+
975
+ def get_metric_specs(self) -> list[dict[str, object]]:
976
+ """메트릭 스펙 목록 반환."""
977
+ return [spec.to_dict() for spec in list_metric_specs()]
982
978
 
983
979
  def get_metric_descriptions(self) -> dict[str, str]:
984
980
  """메트릭별 설명 반환."""
985
- return {
986
- "faithfulness": "답변이 컨텍스트에 충실한지 평가",
987
- "answer_relevancy": "답변이 질문과 관련있는지 평가",
988
- "context_precision": "검색된 컨텍스트의 정밀도 평가",
989
- "context_recall": "필요한 정보가 검색되었는지 평가",
990
- "factual_correctness": "ground_truth 대비 사실적 정확성 평가",
991
- "semantic_similarity": "답변과 ground_truth 간 의미적 유사도 평가",
992
- "insurance_term_accuracy": "보험 용어 정확성 평가",
993
- }
981
+ return registry_metric_descriptions()
994
982
 
995
983
  def create_dataset_from_upload(
996
984
  self,
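
The adapter now delegates to the shared metric registry instead of module-level constants. A minimal sketch of the data its three methods now expose, calling the registry functions directly (the adapter methods are thin wrappers around them):

from evalvault.domain.metrics.registry import (
    get_metric_descriptions,
    list_metric_names,
    list_metric_specs,
)

# Same data the WebUIAdapter methods now return.
names = list_metric_names()                  # e.g. ["faithfulness", "answer_relevancy", ...]
descriptions = get_metric_descriptions()     # {metric name: description}
specs = [spec.to_dict() for spec in list_metric_specs()]

assert set(descriptions) == set(names)       # every metric has a description
assert {s["name"] for s in specs} == set(names)
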
@@ -11,6 +11,7 @@ from evalvault.adapters.outbound.llm import get_llm_adapter
11
11
  from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
12
12
  from evalvault.config.settings import get_settings
13
13
  from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
14
+ from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs
14
15
  from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
15
16
 
16
17
  router = APIRouter(tags=["pipeline"])
@@ -220,6 +221,15 @@ class PipelineResultResponse(PipelineResultSummary):
220
221
  final_output: dict[str, Any] | None = None
221
222
 
222
223
 
224
+ class AnalysisMetricSpecResponse(BaseModel):
225
+ key: str
226
+ label: str
227
+ description: str
228
+ signal_group: str
229
+ module_id: str
230
+ output_path: list[str]
231
+
232
+
223
233
  def _serialize_payload(value: Any) -> Any:
224
234
  try:
225
235
  return jsonable_encoder(value)
@@ -366,6 +376,12 @@ async def list_intents():
366
376
  raise HTTPException(status_code=500, detail=str(exc)) from exc
367
377
 
368
378
 
379
+ @router.get("/options/analysis-metric-specs", response_model=list[AnalysisMetricSpecResponse])
380
+ async def list_analysis_metric_specs_endpoint():
381
+ """List analysis metric specs for pipeline outputs."""
382
+ return [spec.to_dict() for spec in list_analysis_metric_specs()]
383
+
384
+
369
385
  @router.post("/results", response_model=PipelineResultSummary)
370
386
  async def save_pipeline_result(payload: PipelineResultPayload):
371
387
  """Save a pipeline analysis result for history."""
@@ -113,6 +113,16 @@ class ModelItemResponse(BaseModel):
113
113
  supports_tools: bool | None = None
114
114
 
115
115
 
116
+ class MetricSpecResponse(BaseModel):
117
+ name: str
118
+ description: str
119
+ requires_ground_truth: bool
120
+ requires_embeddings: bool
121
+ source: str
122
+ category: str
123
+ signal_group: str
124
+
125
+
116
126
  class ClusterMapItemResponse(BaseModel):
117
127
  test_case_id: str
118
128
  cluster_id: str
@@ -395,6 +405,12 @@ def list_metrics(adapter: AdapterDep):
395
405
  return adapter.get_available_metrics()
396
406
 
397
407
 
408
+ @router.get("/options/metric-specs", response_model=list[MetricSpecResponse])
409
+ def list_metric_specs(adapter: AdapterDep):
410
+ """Get available metrics with metadata."""
411
+ return adapter.get_metric_specs()
412
+
413
+
398
414
  @router.get("/options/cluster-maps", response_model=list[ClusterMapFileResponse])
399
415
  def list_cluster_maps():
400
416
  """List available cluster map CSV files."""
@@ -14,6 +14,8 @@ import typer
14
14
  from rich import print as rprint
15
15
  from rich.console import Console
16
16
 
17
+ from evalvault.domain.metrics.registry import list_metric_names
18
+
17
19
  from .commands import attach_sub_apps, register_all_commands
18
20
 
19
21
 
@@ -32,19 +34,7 @@ app = typer.Typer(
32
34
  )
33
35
  console = Console()
34
36
 
35
- AVAILABLE_METRICS: list[str] = [
36
- "faithfulness",
37
- "answer_relevancy",
38
- "context_precision",
39
- "context_recall",
40
- "factual_correctness",
41
- "semantic_similarity",
42
- "summary_score",
43
- "summary_faithfulness",
44
- "insurance_term_accuracy",
45
- "entity_preservation",
46
- "contextual_relevancy",
47
- ]
37
+ AVAILABLE_METRICS = list_metric_names()
48
38
 
49
39
  register_all_commands(app, console, available_metrics=AVAILABLE_METRICS)
50
40
  attach_sub_apps(app, console)
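
AVAILABLE_METRICS is now sourced from the registry rather than a hard-coded list, so the CLI automatically advertises metrics the old list omitted, such as the retrieval metrics mrr, ndcg, and hit_rate defined in the registry. A quick check:

from evalvault.domain.metrics.registry import list_metric_names

names = list_metric_names()
# Present in the registry but missing from the previously hard-coded list.
assert {"mrr", "ndcg", "hit_rate"} <= set(names)
print(names)
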
@@ -7,6 +7,7 @@ from rich.console import Console
7
7
  from rich.table import Table
8
8
 
9
9
  from evalvault.config.settings import Settings, apply_profile
10
+ from evalvault.domain.metrics.registry import list_metric_specs
10
11
 
11
12
 
12
13
  def register_config_commands(app: typer.Typer, console: Console) -> None:
@@ -22,46 +23,9 @@ def register_config_commands(app: typer.Typer, console: Console) -> None:
22
23
  table.add_column("Description")
23
24
  table.add_column("Requires Ground Truth", justify="center")
24
25
 
25
- table.add_row(
26
- "faithfulness",
27
- "Measures factual accuracy of the answer based on contexts",
28
- "[red]No[/red]",
29
- )
30
- table.add_row(
31
- "answer_relevancy",
32
- "Measures how relevant the answer is to the question",
33
- "[red]No[/red]",
34
- )
35
- table.add_row(
36
- "context_precision",
37
- "Measures ranking quality of retrieved contexts",
38
- "[green]Yes[/green]",
39
- )
40
- table.add_row(
41
- "context_recall",
42
- "Measures if all relevant info is in retrieved contexts",
43
- "[green]Yes[/green]",
44
- )
45
- table.add_row(
46
- "summary_score",
47
- "Measures summary coverage and conciseness against contexts",
48
- "[red]No[/red]",
49
- )
50
- table.add_row(
51
- "summary_faithfulness",
52
- "Measures whether summary statements are grounded in contexts",
53
- "[red]No[/red]",
54
- )
55
- table.add_row(
56
- "entity_preservation",
57
- "Measures preservation of key insurance entities in summaries",
58
- "[red]No[/red]",
59
- )
60
- table.add_row(
61
- "insurance_term_accuracy",
62
- "Measures if insurance terms in answer are grounded in contexts",
63
- "[red]No[/red]",
64
- )
26
+ for spec in list_metric_specs():
27
+ needs_gt = "[green]Yes[/green]" if spec.requires_ground_truth else "[red]No[/red]"
28
+ table.add_row(spec.name, spec.description, needs_gt)
65
29
 
66
30
  console.print(table)
67
31
  console.print("\n[dim]Use --metrics flag with 'run' command to specify metrics.[/dim]")
@@ -13,6 +13,7 @@ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdap
13
13
  from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
14
14
  from evalvault.config.settings import Settings
15
15
 
16
+ from ..utils.analysis_io import serialize_pipeline_result
16
17
  from ..utils.options import db_option
17
18
 
18
19
 
@@ -104,15 +105,10 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
104
105
  console.print(f" [red]{node_id}:[/red] {node_result.error}")
105
106
 
106
107
  if output:
107
- data = {
108
- "query": query,
109
- "intent": result.intent.value if result.intent else None,
110
- "is_complete": result.is_complete,
111
- "duration_ms": result.total_duration_ms,
112
- "final_output": result.final_output,
113
- }
108
+ payload = serialize_pipeline_result(result)
109
+ payload["query"] = query
114
110
  with open(output, "w", encoding="utf-8") as f:
115
- json.dump(data, f, ensure_ascii=False, indent=2)
111
+ json.dump(payload, f, ensure_ascii=False, indent=2)
116
112
  console.print(f"\n[green]Results saved to {output}[/green]")
117
113
 
118
114
  console.print()
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import asyncio
6
6
  import json
7
+ import re
7
8
  from typing import Any
8
9
 
9
10
  from evalvault.adapters.outbound.analysis.base_module import BaseAnalysisModule
@@ -13,6 +14,7 @@ from evalvault.adapters.outbound.analysis.pipeline_helpers import (
13
14
  truncate_text,
14
15
  )
15
16
  from evalvault.domain.entities import EvaluationRun
17
+ from evalvault.domain.metrics.registry import get_metric_spec_map
16
18
  from evalvault.ports.outbound.llm_port import LLMPort
17
19
 
18
20
 
@@ -54,6 +56,18 @@ class LLMReportModule(BaseAnalysisModule):
54
56
  report = self._llm_adapter.generate_text(
55
57
  self._build_prompt(context, evidence),
56
58
  )
59
+ report_type = context.get("report_type") or "analysis"
60
+ is_valid, reasons = self._validate_report(
61
+ report,
62
+ report_type=report_type,
63
+ evidence=evidence,
64
+ )
65
+ if not is_valid:
66
+ output = self._fallback_report(context, evidence, llm_used=False)
67
+ output["llm_error"] = (
68
+ "LLM report validation failed: " + "; ".join(reasons)
69
+ ).strip()
70
+ return output
57
71
  return self._build_output(context, evidence, report, llm_used=True)
58
72
  except Exception as exc:
59
73
  output = self._fallback_report(context, evidence, llm_used=False)
@@ -91,6 +105,18 @@ class LLMReportModule(BaseAnalysisModule):
91
105
  self._llm_adapter.generate_text,
92
106
  self._build_prompt(context, evidence),
93
107
  )
108
+ report_type = context.get("report_type") or "analysis"
109
+ is_valid, reasons = self._validate_report(
110
+ report,
111
+ report_type=report_type,
112
+ evidence=evidence,
113
+ )
114
+ if not is_valid:
115
+ output = self._fallback_report(context, evidence, llm_used=False)
116
+ output["llm_error"] = (
117
+ "LLM report validation failed: " + "; ".join(reasons)
118
+ ).strip()
119
+ return output
94
120
  return self._build_output(context, evidence, report, llm_used=True)
95
121
  except Exception as exc:
96
122
  output = self._fallback_report(context, evidence, llm_used=False)
@@ -367,6 +393,107 @@ class LLMReportModule(BaseAnalysisModule):
367
393
  )
368
394
  return scorecard
369
395
 
396
+ def _build_signal_group_summary(
397
+ self,
398
+ scorecard: list[dict[str, Any]],
399
+ ) -> dict[str, dict[str, Any]]:
400
+ spec_map = get_metric_spec_map()
401
+ summary: dict[str, dict[str, Any]] = {}
402
+ for row in scorecard:
403
+ metric = row.get("metric")
404
+ if not metric:
405
+ continue
406
+ spec = spec_map.get(metric)
407
+ group = spec.signal_group if spec else "unknown"
408
+ bucket = summary.setdefault(
409
+ group,
410
+ {
411
+ "metrics": [],
412
+ "mean_avg": None,
413
+ "pass_rate_avg": None,
414
+ "risk_count": 0,
415
+ "total": 0,
416
+ "_mean_values": [],
417
+ "_pass_rates": [],
418
+ },
419
+ )
420
+ bucket["metrics"].append(metric)
421
+ mean = row.get("mean")
422
+ if isinstance(mean, int | float):
423
+ bucket["_mean_values"].append(float(mean))
424
+ pass_rate = row.get("pass_rate")
425
+ if isinstance(pass_rate, int | float):
426
+ bucket["_pass_rates"].append(float(pass_rate))
427
+ if row.get("status") == "risk":
428
+ bucket["risk_count"] += 1
429
+ bucket["total"] += 1
430
+
431
+ for bucket in summary.values():
432
+ mean_values = bucket.pop("_mean_values", [])
433
+ pass_rates = bucket.pop("_pass_rates", [])
434
+ bucket["mean_avg"] = round(safe_mean(mean_values), 4) if mean_values else None
435
+ bucket["pass_rate_avg"] = round(safe_mean(pass_rates), 4) if pass_rates else None
436
+ return summary
437
+
438
+ def _build_risk_metrics(
439
+ self,
440
+ scorecard: list[dict[str, Any]],
441
+ *,
442
+ limit: int = 6,
443
+ ) -> list[dict[str, Any]]:
444
+ risk_rows: list[dict[str, Any]] = []
445
+ for row in scorecard:
446
+ status = row.get("status")
447
+ pass_rate = row.get("pass_rate")
448
+ is_risk = status == "risk" or (
449
+ isinstance(pass_rate, int | float) and float(pass_rate) < 0.7
450
+ )
451
+ if not is_risk:
452
+ continue
453
+ risk_rows.append(
454
+ {
455
+ "metric": row.get("metric"),
456
+ "mean": row.get("mean"),
457
+ "threshold": row.get("threshold"),
458
+ "pass_rate": pass_rate,
459
+ "gap": row.get("gap"),
460
+ "status": status,
461
+ }
462
+ )
463
+
464
+ def _sort_key(item: dict[str, Any]) -> tuple[float, float]:
465
+ gap = item.get("gap")
466
+ gap_value = float(gap) if isinstance(gap, int | float) else 0.0
467
+ pass_rate = item.get("pass_rate")
468
+ pass_value = float(pass_rate) if isinstance(pass_rate, int | float) else 1.0
469
+ return (gap_value, -pass_value)
470
+
471
+ risk_rows.sort(key=_sort_key, reverse=True)
472
+ return risk_rows[:limit]
473
+
474
+ def _build_significant_changes(
475
+ self,
476
+ comparison_scorecard: list[dict[str, Any]],
477
+ *,
478
+ limit: int = 6,
479
+ ) -> list[dict[str, Any]]:
480
+ changes: list[dict[str, Any]] = []
481
+ for row in comparison_scorecard:
482
+ if not row.get("is_significant"):
483
+ continue
484
+ changes.append(
485
+ {
486
+ "metric": row.get("metric"),
487
+ "diff": row.get("diff"),
488
+ "diff_percent": row.get("diff_percent"),
489
+ "effect_size": row.get("effect_size"),
490
+ "effect_level": row.get("effect_level"),
491
+ "direction": row.get("direction"),
492
+ "winner": row.get("winner"),
493
+ }
494
+ )
495
+ return changes[:limit]
496
+
370
497
  def _build_comparison_scorecard(
371
498
  self,
372
499
  comparison_details: dict[str, Any] | None,
@@ -397,6 +524,62 @@ class LLMReportModule(BaseAnalysisModule):
397
524
  )
398
525
  return scorecard
399
526
 
527
+ def _validate_report(
528
+ self,
529
+ report: str,
530
+ *,
531
+ report_type: str,
532
+ evidence: list[dict[str, Any]],
533
+ ) -> tuple[bool, list[str]]:
534
+ reasons: list[str] = []
535
+ normalized = report_type or "analysis"
536
+
537
+ if not re.search(r"[가-힣]", report):
538
+ reasons.append("한국어 본문 미검출")
539
+
540
+ required_sections = {
541
+ "comparison": [
542
+ "요약",
543
+ "변경 사항 요약",
544
+ "지표 비교 스코어카드",
545
+ "통계적 신뢰도",
546
+ "원인 분석",
547
+ "개선 제안",
548
+ "다음 단계",
549
+ "부록(산출물)",
550
+ ],
551
+ "summary": [
552
+ "요약",
553
+ "지표 스코어카드",
554
+ "개선 제안",
555
+ "다음 단계",
556
+ "부록(산출물)",
557
+ ],
558
+ "analysis": [
559
+ "요약",
560
+ "지표 스코어카드",
561
+ "데이터 품질/신뢰도",
562
+ "증거 기반 인사이트",
563
+ "원인 가설",
564
+ "개선 제안",
565
+ "다음 단계",
566
+ "부록(산출물)",
567
+ ],
568
+ }
569
+ for section in required_sections.get(normalized, []):
570
+ if section not in report:
571
+ reasons.append(f"섹션 누락: {section}")
572
+
573
+ if evidence:
574
+ if normalized == "comparison":
575
+ if not re.search(r"\[(A|B)\\d+\\]", report):
576
+ reasons.append("증거 인용([A1]/[B1]) 누락")
577
+ else:
578
+ if not re.search(r"\\[E\\d+\\]", report):
579
+ reasons.append("증거 인용([E1]) 누락")
580
+
581
+ return len(reasons) == 0, reasons
582
+
400
583
  def _build_priority_highlights(self, priority_summary: dict[str, Any] | None) -> dict[str, Any]:
401
584
  priority_summary = priority_summary or {}
402
585
  bottom_cases = priority_summary.get("bottom_cases", [])
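
The validator's evidence-citation checks come down to two regular expressions: comparison reports must cite runs as [A1]/[B1], all other reports cite evidence items as [E1]. A quick standalone check of the patterns:

import re

CITATION_DEFAULT = re.compile(r"\[E\d+\]")         # e.g. [E1], [E12]
CITATION_COMPARISON = re.compile(r"\[(A|B)\d+\]")  # e.g. [A1], [B3]

assert CITATION_DEFAULT.search("faithfulness dropped, see [E2]")
assert CITATION_COMPARISON.search("run B ahead on context_recall [B1]")
assert not CITATION_DEFAULT.search("a report with no citations")
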
@@ -598,6 +781,9 @@ class LLMReportModule(BaseAnalysisModule):
598
781
  priority_highlights = self._build_priority_highlights(context.get("priority_summary"))
599
782
  prompt_change_summary = self._summarize_prompt_changes(context.get("change_summary"))
600
783
  artifact_manifest = self._build_artifact_manifest(context.get("artifact_nodes") or [])
784
+ signal_group_summary = self._build_signal_group_summary(scorecard)
785
+ risk_metrics = self._build_risk_metrics(scorecard)
786
+ significant_changes = self._build_significant_changes(comparison_scorecard)
601
787
  change_summary = context.get("change_summary")
602
788
  if isinstance(change_summary, dict) and prompt_change_summary:
603
789
  change_summary = dict(change_summary)
@@ -635,12 +821,15 @@ class LLMReportModule(BaseAnalysisModule):
635
821
  "comparison": context.get("comparison"),
636
822
  "comparison_details": comparison_details,
637
823
  "comparison_scorecard": comparison_scorecard,
824
+ "significant_changes": significant_changes,
638
825
  "change_summary": change_summary,
639
826
  "priority_summary": context.get("priority_summary"),
640
827
  "priority_highlights": priority_highlights,
641
828
  "quality_checks": context.get("quality_checks"),
642
829
  "quality_summary": quality_summary,
643
830
  "scorecard": scorecard,
831
+ "signal_group_summary": signal_group_summary,
832
+ "risk_metrics": risk_metrics,
644
833
  "artifact_manifest": artifact_manifest,
645
834
  }
646
835
 
@@ -648,6 +837,19 @@ class LLMReportModule(BaseAnalysisModule):
648
837
  evidence_json = json.dumps(evidence, ensure_ascii=False, indent=2)
649
838
 
650
839
  report_type = context.get("report_type")
840
+ common_requirements = (
841
+ "공통 원칙:\n"
842
+ "1) 모든 주장/원인/개선안은 summary_json 또는 evidence에 근거해야 함\n"
843
+ "2) 숫자/지표는 scorecard, comparison_scorecard, risk_metrics에서 직접 인용\n"
844
+ "3) 근거가 부족하면 '추가 데이터 필요'를 명시하고 추측 금지\n"
845
+ "4) 2026-01 기준 널리 쓰이는 RAG 개선 패턴을 우선 고려하되, "
846
+ "현재 데이터 이슈와 연결되는 항목만 선택\n"
847
+ "4-1) 개선 패턴 예시: 하이브리드 검색+리랭커, 쿼리 재작성, "
848
+ "동적 청크/컨텍스트 압축, 메타데이터/필터링, 인용/검증 단계, "
849
+ "신뢰도 기반 답변 거절, 평가셋 확장/하드 네거티브, 피드백 루프\n"
850
+ "5) 개선안마다 기대되는 영향 지표, 검증 방법(실험/재평가), 리스크를 함께 서술\n"
851
+ "6) 신뢰도/타당성 제약(표본 수, 커버리지, 유의성, 데이터 변경)을 명시\n"
852
+ )
651
853
  if report_type == "comparison":
652
854
  requirements = (
653
855
  "요구사항:\n"
@@ -656,9 +858,11 @@ class LLMReportModule(BaseAnalysisModule):
656
858
  "3) 섹션: 요약, 변경 사항 요약, 지표 비교 스코어카드, 통계적 신뢰도, "
657
859
  "원인 분석, 개선 제안, 다음 단계, 부록(산출물)\n"
658
860
  "4) 요약은 한 문장 결론 + 핵심 3개 bullet(지표/변경 사항/사용자 영향)\n"
659
- "5) 스코어카드는 Markdown 표로 작성 (metric, A, B, diff, p-value, effect, 상태)\n"
861
+ "5) 스코어카드는 Markdown 표로 작성 (metric, A, B, diff, "
862
+ "p-value, effect, 상태)\n"
660
863
  "6) 데이터셋 차이가 있으면 비교 해석 제한을 명확히 표기\n"
661
- "7) 변경 사항이 없거나 근거가 약하면 '추가 데이터 필요'라고 명시\n"
864
+ "7) 유의한 변화는 significant_changes를 활용해 강조\n"
865
+ "8) 변경 사항이 없거나 근거가 약하면 '추가 데이터 필요'라고 명시\n"
662
866
  )
663
867
  elif report_type == "summary":
664
868
  requirements = (
@@ -667,8 +871,10 @@ class LLMReportModule(BaseAnalysisModule):
667
871
  "2) 핵심 주장/개선안에는 evidence_id를 [E1] 형식으로 인용\n"
668
872
  "3) 섹션: 요약, 지표 스코어카드, 개선 제안, 다음 단계, 부록(산출물)\n"
669
873
  "4) 요약은 한 문장 결론 + 핵심 3개 bullet\n"
670
- "5) 스코어카드는 Markdown 표로 작성 (metric, 평균, threshold, pass_rate, 상태)\n"
671
- "6) 근거가 부족하면 '추가 데이터 필요'라고 명시\n"
874
+ "5) 스코어카드는 Markdown 표로 작성 (metric, 평균, threshold, "
875
+ "pass_rate, 상태)\n"
876
+ "6) risk_metrics를 활용해 상위 위험 지표 3개를 명확히 언급\n"
877
+ "7) 근거가 부족하면 '추가 데이터 필요'라고 명시\n"
672
878
  )
673
879
  else:
674
880
  requirements = (
@@ -678,15 +884,18 @@ class LLMReportModule(BaseAnalysisModule):
678
884
  "3) 섹션: 요약, 지표 스코어카드, 데이터 품질/신뢰도, 증거 기반 인사이트, "
679
885
  "원인 가설, 개선 제안, 다음 단계, 부록(산출물)\n"
680
886
  "4) 요약은 한 문장 결론 + 핵심 3개 bullet(지표/원인/사용자 영향)\n"
681
- "5) 스코어카드는 Markdown 표로 작성 (metric, 평균, threshold, pass_rate, 상태)\n"
682
- "6) 사용자 영향은 신뢰/이해/인지부하 관점으로 1~2문장\n"
683
- "7) 근거가 부족하면 '추가 데이터 필요'라고 명시\n"
887
+ "5) 스코어카드는 Markdown 표로 작성 (metric, 평균, threshold, "
888
+ "pass_rate, 상태)\n"
889
+ "6) signal_group_summary로 축별 약점/강점을 분해\n"
890
+ "7) 사용자 영향은 신뢰/이해/인지부하 관점으로 1~2문장\n"
891
+ "8) 근거가 부족하면 '추가 데이터 필요'라고 명시\n"
684
892
  )
685
893
 
686
894
  return (
687
895
  "당신은 RAG 평가 분석 보고서 작성자입니다. "
688
896
  "아래 데이터와 증거를 기반으로 Markdown 보고서를 작성하세요.\n"
689
897
  "\n"
898
+ f"{common_requirements}\n"
690
899
  f"{requirements}\n"
691
900
  "[요약 데이터]\n"
692
901
  f"{summary_json}\n"
@@ -0,0 +1,217 @@
1
+ """Analysis metric registry for pipeline outputs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Literal
7
+
8
+ from evalvault.domain.metrics.registry import SignalGroup
9
+
10
+ AnalysisMetricSource = Literal[
11
+ "retrieval_analyzer",
12
+ "embedding_analyzer",
13
+ "bm25_searcher",
14
+ "embedding_searcher",
15
+ "hybrid_rrf",
16
+ "hybrid_weighted",
17
+ ]
18
+
19
+
20
+ @dataclass(frozen=True)
21
+ class AnalysisMetricSpec:
22
+ key: str
23
+ label: str
24
+ description: str
25
+ signal_group: SignalGroup
26
+ module_id: AnalysisMetricSource
27
+ output_path: tuple[str, ...]
28
+
29
+ def to_dict(self) -> dict[str, object]:
30
+ return {
31
+ "key": self.key,
32
+ "label": self.label,
33
+ "description": self.description,
34
+ "signal_group": self.signal_group,
35
+ "module_id": self.module_id,
36
+ "output_path": list(self.output_path),
37
+ }
38
+
39
+
40
+ _ANALYSIS_METRICS: tuple[AnalysisMetricSpec, ...] = (
41
+ AnalysisMetricSpec(
42
+ key="retrieval.avg_contexts",
43
+ label="Avg contexts per query",
44
+ description="Average number of contexts retrieved per query",
45
+ signal_group="retrieval_effectiveness",
46
+ module_id="retrieval_analyzer",
47
+ output_path=("summary", "avg_contexts"),
48
+ ),
49
+ AnalysisMetricSpec(
50
+ key="retrieval.empty_context_rate",
51
+ label="Empty context rate",
52
+ description="Share of queries with empty contexts",
53
+ signal_group="retrieval_effectiveness",
54
+ module_id="retrieval_analyzer",
55
+ output_path=("summary", "empty_context_rate"),
56
+ ),
57
+ AnalysisMetricSpec(
58
+ key="retrieval.avg_context_tokens",
59
+ label="Avg context tokens",
60
+ description="Average token count across contexts",
61
+ signal_group="retrieval_effectiveness",
62
+ module_id="retrieval_analyzer",
63
+ output_path=("summary", "avg_context_tokens"),
64
+ ),
65
+ AnalysisMetricSpec(
66
+ key="retrieval.keyword_overlap",
67
+ label="Keyword overlap",
68
+ description="Keyword overlap between question and contexts",
69
+ signal_group="retrieval_effectiveness",
70
+ module_id="retrieval_analyzer",
71
+ output_path=("summary", "avg_keyword_overlap"),
72
+ ),
73
+ AnalysisMetricSpec(
74
+ key="retrieval.ground_truth_hit_rate",
75
+ label="Ground truth hit rate",
76
+ description="Share of cases where ground truth appears in contexts",
77
+ signal_group="retrieval_effectiveness",
78
+ module_id="retrieval_analyzer",
79
+ output_path=("summary", "ground_truth_hit_rate"),
80
+ ),
81
+ AnalysisMetricSpec(
82
+ key="retrieval.avg_faithfulness_proxy",
83
+ label="Context faithfulness proxy",
84
+ description="Proxy faithfulness from context-grounding check",
85
+ signal_group="groundedness",
86
+ module_id="retrieval_analyzer",
87
+ output_path=("summary", "avg_faithfulness"),
88
+ ),
89
+ AnalysisMetricSpec(
90
+ key="retrieval.avg_retrieval_score",
91
+ label="Avg retrieval score",
92
+ description="Average retrieval score from metadata",
93
+ signal_group="retrieval_effectiveness",
94
+ module_id="retrieval_analyzer",
95
+ output_path=("summary", "avg_retrieval_score"),
96
+ ),
97
+ AnalysisMetricSpec(
98
+ key="retrieval.avg_retrieval_time_ms",
99
+ label="Avg retrieval latency (ms)",
100
+ description="Average retrieval latency in milliseconds",
101
+ signal_group="efficiency",
102
+ module_id="retrieval_analyzer",
103
+ output_path=("summary", "avg_retrieval_time_ms"),
104
+ ),
105
+ AnalysisMetricSpec(
106
+ key="bm25.avg_recall_at_k",
107
+ label="BM25 avg recall@k",
108
+ description="Average recall@k for BM25 retrieval",
109
+ signal_group="retrieval_effectiveness",
110
+ module_id="bm25_searcher",
111
+ output_path=("summary", "avg_recall_at_k"),
112
+ ),
113
+ AnalysisMetricSpec(
114
+ key="bm25.avg_top_score",
115
+ label="BM25 avg top score",
116
+ description="Average top score for BM25 retrieval",
117
+ signal_group="retrieval_effectiveness",
118
+ module_id="bm25_searcher",
119
+ output_path=("summary", "avg_top_score"),
120
+ ),
121
+ AnalysisMetricSpec(
122
+ key="embedding.avg_recall_at_k",
123
+ label="Embedding avg recall@k",
124
+ description="Average recall@k for dense retrieval",
125
+ signal_group="retrieval_effectiveness",
126
+ module_id="embedding_searcher",
127
+ output_path=("summary", "avg_recall_at_k"),
128
+ ),
129
+ AnalysisMetricSpec(
130
+ key="embedding.avg_top_score",
131
+ label="Embedding avg top score",
132
+ description="Average top score for dense retrieval",
133
+ signal_group="retrieval_effectiveness",
134
+ module_id="embedding_searcher",
135
+ output_path=("summary", "avg_top_score"),
136
+ ),
137
+ AnalysisMetricSpec(
138
+ key="hybrid_rrf.avg_recall_at_k",
139
+ label="Hybrid RRF avg recall@k",
140
+ description="Average recall@k for RRF hybrid retrieval",
141
+ signal_group="retrieval_effectiveness",
142
+ module_id="hybrid_rrf",
143
+ output_path=("summary", "avg_recall_at_k"),
144
+ ),
145
+ AnalysisMetricSpec(
146
+ key="hybrid_rrf.avg_top_score",
147
+ label="Hybrid RRF avg top score",
148
+ description="Average top score for RRF hybrid retrieval",
149
+ signal_group="retrieval_effectiveness",
150
+ module_id="hybrid_rrf",
151
+ output_path=("summary", "avg_top_score"),
152
+ ),
153
+ AnalysisMetricSpec(
154
+ key="hybrid_weighted.avg_recall_at_k",
155
+ label="Hybrid weighted avg recall@k",
156
+ description="Average recall@k for weighted hybrid retrieval",
157
+ signal_group="retrieval_effectiveness",
158
+ module_id="hybrid_weighted",
159
+ output_path=("summary", "avg_recall_at_k"),
160
+ ),
161
+ AnalysisMetricSpec(
162
+ key="hybrid_weighted.avg_top_score",
163
+ label="Hybrid weighted avg top score",
164
+ description="Average top score for weighted hybrid retrieval",
165
+ signal_group="retrieval_effectiveness",
166
+ module_id="hybrid_weighted",
167
+ output_path=("summary", "avg_top_score"),
168
+ ),
169
+ AnalysisMetricSpec(
170
+ key="embedding.avg_norm",
171
+ label="Embedding avg norm",
172
+ description="Average embedding vector norm",
173
+ signal_group="embedding_quality",
174
+ module_id="embedding_analyzer",
175
+ output_path=("summary", "avg_norm"),
176
+ ),
177
+ AnalysisMetricSpec(
178
+ key="embedding.norm_std",
179
+ label="Embedding norm std",
180
+ description="Std-dev of embedding norms",
181
+ signal_group="embedding_quality",
182
+ module_id="embedding_analyzer",
183
+ output_path=("summary", "norm_std"),
184
+ ),
185
+ AnalysisMetricSpec(
186
+ key="embedding.norm_min",
187
+ label="Embedding norm min",
188
+ description="Minimum embedding norm",
189
+ signal_group="embedding_quality",
190
+ module_id="embedding_analyzer",
191
+ output_path=("summary", "norm_min"),
192
+ ),
193
+ AnalysisMetricSpec(
194
+ key="embedding.norm_max",
195
+ label="Embedding norm max",
196
+ description="Maximum embedding norm",
197
+ signal_group="embedding_quality",
198
+ module_id="embedding_analyzer",
199
+ output_path=("summary", "norm_max"),
200
+ ),
201
+ AnalysisMetricSpec(
202
+ key="embedding.mean_cosine_to_centroid",
203
+ label="Embedding mean cosine",
204
+ description="Mean cosine similarity to centroid",
205
+ signal_group="embedding_quality",
206
+ module_id="embedding_analyzer",
207
+ output_path=("summary", "mean_cosine_to_centroid"),
208
+ ),
209
+ )
210
+
211
+
212
+ def list_analysis_metric_specs() -> list[AnalysisMetricSpec]:
213
+ return list(_ANALYSIS_METRICS)
214
+
215
+
216
+ def list_analysis_metric_keys() -> list[str]:
217
+ return [spec.key for spec in _ANALYSIS_METRICS]
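
Each spec carries an output_path pointing into the owning module's output dict. A sketch of how a consumer might resolve a value with it (the extract_metric helper below is hypothetical, not part of the package):

from typing import Any

from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs


def extract_metric(module_outputs: dict[str, dict[str, Any]], key: str) -> Any:
    """Hypothetical helper: walk spec.output_path inside the owning module's output."""
    spec = next(s for s in list_analysis_metric_specs() if s.key == key)
    value: Any = module_outputs.get(spec.module_id, {})
    for part in spec.output_path:
        if not isinstance(value, dict):
            return None
        value = value.get(part)
    return value


outputs = {"retrieval_analyzer": {"summary": {"avg_contexts": 3.4}}}
print(extract_metric(outputs, "retrieval.avg_contexts"))  # 3.4
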
@@ -0,0 +1,185 @@
1
+ """Metric registry for CLI/Web UI integrations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Literal
7
+
8
+ MetricSource = Literal["ragas", "custom"]
9
+ MetricCategory = Literal["qa", "summary", "retrieval", "domain"]
10
+ SignalGroup = Literal[
11
+ "groundedness",
12
+ "intent_alignment",
13
+ "retrieval_effectiveness",
14
+ "summary_fidelity",
15
+ "embedding_quality",
16
+ "efficiency",
17
+ ]
18
+
19
+
20
+ @dataclass(frozen=True)
21
+ class MetricSpec:
22
+ name: str
23
+ description: str
24
+ requires_ground_truth: bool
25
+ requires_embeddings: bool
26
+ source: MetricSource
27
+ category: MetricCategory
28
+ signal_group: SignalGroup
29
+
30
+ def to_dict(self) -> dict[str, object]:
31
+ return {
32
+ "name": self.name,
33
+ "description": self.description,
34
+ "requires_ground_truth": self.requires_ground_truth,
35
+ "requires_embeddings": self.requires_embeddings,
36
+ "source": self.source,
37
+ "category": self.category,
38
+ "signal_group": self.signal_group,
39
+ }
40
+
41
+
42
+ _METRIC_SPECS: tuple[MetricSpec, ...] = (
43
+ MetricSpec(
44
+ name="faithfulness",
45
+ description="Measures factual accuracy of the answer based on contexts",
46
+ requires_ground_truth=False,
47
+ requires_embeddings=False,
48
+ source="ragas",
49
+ category="qa",
50
+ signal_group="groundedness",
51
+ ),
52
+ MetricSpec(
53
+ name="answer_relevancy",
54
+ description="Measures how relevant the answer is to the question",
55
+ requires_ground_truth=False,
56
+ requires_embeddings=True,
57
+ source="ragas",
58
+ category="qa",
59
+ signal_group="intent_alignment",
60
+ ),
61
+ MetricSpec(
62
+ name="context_precision",
63
+ description="Measures ranking quality of retrieved contexts",
64
+ requires_ground_truth=True,
65
+ requires_embeddings=False,
66
+ source="ragas",
67
+ category="qa",
68
+ signal_group="retrieval_effectiveness",
69
+ ),
70
+ MetricSpec(
71
+ name="context_recall",
72
+ description="Measures if all relevant info is in retrieved contexts",
73
+ requires_ground_truth=True,
74
+ requires_embeddings=False,
75
+ source="ragas",
76
+ category="qa",
77
+ signal_group="retrieval_effectiveness",
78
+ ),
79
+ MetricSpec(
80
+ name="factual_correctness",
81
+ description="Measures factual correctness against ground truth",
82
+ requires_ground_truth=True,
83
+ requires_embeddings=False,
84
+ source="ragas",
85
+ category="qa",
86
+ signal_group="groundedness",
87
+ ),
88
+ MetricSpec(
89
+ name="semantic_similarity",
90
+ description="Measures semantic similarity between answer and ground truth",
91
+ requires_ground_truth=True,
92
+ requires_embeddings=True,
93
+ source="ragas",
94
+ category="qa",
95
+ signal_group="intent_alignment",
96
+ ),
97
+ MetricSpec(
98
+ name="mrr",
99
+ description="Measures reciprocal rank of the first relevant context",
100
+ requires_ground_truth=True,
101
+ requires_embeddings=False,
102
+ source="custom",
103
+ category="retrieval",
104
+ signal_group="retrieval_effectiveness",
105
+ ),
106
+ MetricSpec(
107
+ name="ndcg",
108
+ description="Measures ranking quality across relevant contexts",
109
+ requires_ground_truth=True,
110
+ requires_embeddings=False,
111
+ source="custom",
112
+ category="retrieval",
113
+ signal_group="retrieval_effectiveness",
114
+ ),
115
+ MetricSpec(
116
+ name="hit_rate",
117
+ description="Measures whether any relevant context appears in top K",
118
+ requires_ground_truth=True,
119
+ requires_embeddings=False,
120
+ source="custom",
121
+ category="retrieval",
122
+ signal_group="retrieval_effectiveness",
123
+ ),
124
+ MetricSpec(
125
+ name="summary_score",
126
+ description="Measures summary coverage and conciseness against contexts",
127
+ requires_ground_truth=False,
128
+ requires_embeddings=False,
129
+ source="ragas",
130
+ category="summary",
131
+ signal_group="summary_fidelity",
132
+ ),
133
+ MetricSpec(
134
+ name="summary_faithfulness",
135
+ description="Measures whether summary statements are grounded in contexts",
136
+ requires_ground_truth=False,
137
+ requires_embeddings=False,
138
+ source="ragas",
139
+ category="summary",
140
+ signal_group="summary_fidelity",
141
+ ),
142
+ MetricSpec(
143
+ name="entity_preservation",
144
+ description="Measures preservation of key insurance entities in summaries",
145
+ requires_ground_truth=False,
146
+ requires_embeddings=False,
147
+ source="custom",
148
+ category="summary",
149
+ signal_group="summary_fidelity",
150
+ ),
151
+ MetricSpec(
152
+ name="insurance_term_accuracy",
153
+ description="Measures if insurance terms in answer are grounded in contexts",
154
+ requires_ground_truth=False,
155
+ requires_embeddings=False,
156
+ source="custom",
157
+ category="domain",
158
+ signal_group="groundedness",
159
+ ),
160
+ MetricSpec(
161
+ name="contextual_relevancy",
162
+ description="Measures how well contexts align with the question intent",
163
+ requires_ground_truth=False,
164
+ requires_embeddings=False,
165
+ source="custom",
166
+ category="qa",
167
+ signal_group="retrieval_effectiveness",
168
+ ),
169
+ )
170
+
171
+
172
+ def list_metric_specs() -> list[MetricSpec]:
173
+ return list(_METRIC_SPECS)
174
+
175
+
176
+ def list_metric_names() -> list[str]:
177
+ return [spec.name for spec in _METRIC_SPECS]
178
+
179
+
180
+ def get_metric_descriptions() -> dict[str, str]:
181
+ return {spec.name: spec.description for spec in _METRIC_SPECS}
182
+
183
+
184
+ def get_metric_spec_map() -> dict[str, MetricSpec]:
185
+ return {spec.name: spec for spec in _METRIC_SPECS}
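
The registry is plain data, so simple groupings fall out directly; for example, metric names by category:

from collections import defaultdict

from evalvault.domain.metrics.registry import list_metric_specs

by_category: dict[str, list[str]] = defaultdict(list)
for spec in list_metric_specs():
    by_category[spec.category].append(spec.name)

print(dict(by_category))
# {'qa': [...], 'retrieval': ['mrr', 'ndcg', 'hit_rate'],
#  'summary': [...], 'domain': ['insurance_term_accuracy']}
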
@@ -68,6 +68,118 @@ class PipelineTemplateRegistry:
68
68
  self._templates[AnalysisIntent.BENCHMARK_RETRIEVAL] = (
69
69
  self._create_benchmark_retrieval_template()
70
70
  )
71
+ # 보고서 템플릿
72
+ self._templates[AnalysisIntent.GENERATE_SUMMARY] = self._create_generate_summary_template()
73
+ self._templates[AnalysisIntent.GENERATE_DETAILED] = (
74
+ self._create_generate_detailed_template()
75
+ )
76
+ self._templates[AnalysisIntent.GENERATE_COMPARISON] = (
77
+ self._create_generate_comparison_template()
78
+ )
79
+
80
+ def get_template(self, intent: AnalysisIntent) -> AnalysisPipeline | None:
81
+ """의도에 대응하는 파이프라인 템플릿 조회."""
82
+ return self._templates.get(intent)
83
+
84
+ # =========================================================================
85
+ # Verification Templates
86
+ # =========================================================================
87
+
88
+ def _create_verify_morpheme_template(self) -> AnalysisPipeline:
89
+ """형태소 검증 템플릿."""
90
+ nodes = [
91
+ AnalysisNode(
92
+ id="load_data",
93
+ name="데이터 로드",
94
+ module="data_loader",
95
+ ),
96
+ AnalysisNode(
97
+ id="morpheme_analysis",
98
+ name="형태소 분석",
99
+ module="morpheme_analyzer",
100
+ depends_on=["load_data"],
101
+ ),
102
+ AnalysisNode(
103
+ id="quality_check",
104
+ name="형태소 품질 점검",
105
+ module="morpheme_quality_checker",
106
+ depends_on=["morpheme_analysis"],
107
+ ),
108
+ AnalysisNode(
109
+ id="report",
110
+ name="검증 보고서",
111
+ module="verification_report",
112
+ depends_on=["quality_check"],
113
+ ),
114
+ ]
115
+ return AnalysisPipeline(
116
+ intent=AnalysisIntent.VERIFY_MORPHEME,
117
+ nodes=nodes,
118
+ )
119
+
120
+ def _create_verify_embedding_template(self) -> AnalysisPipeline:
121
+ """임베딩 품질 검증 템플릿."""
122
+ nodes = [
123
+ AnalysisNode(
124
+ id="load_data",
125
+ name="데이터 로드",
126
+ module="data_loader",
127
+ ),
128
+ AnalysisNode(
129
+ id="embedding_analysis",
130
+ name="임베딩 분석",
131
+ module="embedding_analyzer",
132
+ depends_on=["load_data"],
133
+ ),
134
+ AnalysisNode(
135
+ id="quality_check",
136
+ name="임베딩 분포 점검",
137
+ module="embedding_distribution",
138
+ depends_on=["embedding_analysis"],
139
+ ),
140
+ AnalysisNode(
141
+ id="report",
142
+ name="검증 보고서",
143
+ module="verification_report",
144
+ depends_on=["quality_check"],
145
+ ),
146
+ ]
147
+ return AnalysisPipeline(
148
+ intent=AnalysisIntent.VERIFY_EMBEDDING,
149
+ nodes=nodes,
150
+ )
151
+
152
+ def _create_verify_retrieval_template(self) -> AnalysisPipeline:
153
+ """검색 품질 검증 템플릿."""
154
+ nodes = [
155
+ AnalysisNode(
156
+ id="load_data",
157
+ name="데이터 로드",
158
+ module="data_loader",
159
+ ),
160
+ AnalysisNode(
161
+ id="retrieval_analysis",
162
+ name="검색 분석",
163
+ module="retrieval_analyzer",
164
+ depends_on=["load_data"],
165
+ ),
166
+ AnalysisNode(
167
+ id="quality_check",
168
+ name="검색 품질 점검",
169
+ module="retrieval_quality_checker",
170
+ depends_on=["retrieval_analysis"],
171
+ ),
172
+ AnalysisNode(
173
+ id="report",
174
+ name="검증 보고서",
175
+ module="verification_report",
176
+ depends_on=["quality_check"],
177
+ ),
178
+ ]
179
+ return AnalysisPipeline(
180
+ intent=AnalysisIntent.VERIFY_RETRIEVAL,
181
+ nodes=nodes,
182
+ )
71
183
 
72
184
  # =========================================================================
73
185
  # Comparison Templates
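
The three new verification templates follow the same four-node chain (data load, analysis, quality check, verification report), and the report templates are now registered alongside them. A minimal lookup sketch, assuming PipelineTemplateRegistry can be constructed without arguments:

from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
from evalvault.domain.services.pipeline_template_registry import PipelineTemplateRegistry

registry = PipelineTemplateRegistry()  # assumption: no constructor arguments
pipeline = registry.get_template(AnalysisIntent.GENERATE_SUMMARY)
if pipeline is not None:
    print(pipeline.intent, [node.id for node in pipeline.nodes])
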
@@ -188,6 +188,10 @@ class WebUIPort(Protocol):
188
188
  """
189
189
  ...
190
190
 
191
+ def get_metric_specs(self) -> list[dict[str, object]]:
192
+ """메트릭 스펙 목록 조회."""
193
+ ...
194
+
191
195
  def list_stage_events(
192
196
  self,
193
197
  run_id: str,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: evalvault
3
- Version: 1.59.0
3
+ Version: 1.60.0
4
4
  Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
5
5
  Project-URL: Homepage, https://github.com/ntts9990/EvalVault
6
6
  Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -5,23 +5,23 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
5
5
  evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  evalvault/adapters/inbound/__init__.py,sha256=bWSL3styP4BIMeVk04nPI_9rKTln-puBH5lYg6XtnNo,91
7
7
  evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
8
- evalvault/adapters/inbound/api/adapter.py,sha256=5d4ii_OeXs1aPjK58hh_WwfEEjiEAQf0xznTy9retro,60329
8
+ evalvault/adapters/inbound/api/adapter.py,sha256=6L95Csns-ac_9Q1rbVjYA8G7mu0wb981G5lsbvcqzcI,59820
9
9
  evalvault/adapters/inbound/api/main.py,sha256=KdlAxKn0QfGI3UuoTrBDBbUs2xCvP8lnWOY1ce3svcU,2619
10
10
  evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
11
11
  evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
12
12
  evalvault/adapters/inbound/api/routers/config.py,sha256=CN-FH2cn0Ive-BD3WacWY6PFfuMtZEHP5_out3fvST4,3957
13
13
  evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
14
14
  evalvault/adapters/inbound/api/routers/knowledge.py,sha256=7mgyoUM1PepFb4X8_Ntn0vd7ZZYcNbM3_9nyD10g4Aw,5307
15
- evalvault/adapters/inbound/api/routers/pipeline.py,sha256=tWuXwM-AH_NVDzemtsxbi5Dyn5kYyc1vPFS1sg2TPuw,16655
16
- evalvault/adapters/inbound/api/routers/runs.py,sha256=W3QaSMN3ByqNLynh_uWkMv0_-NvsVKedbuKsEAAoZr0,33160
15
+ evalvault/adapters/inbound/api/routers/pipeline.py,sha256=8UgQzNFHcuqS61s69mOrPee4OMwfxVdvRWHJ2_qYBF0,17175
16
+ evalvault/adapters/inbound/api/routers/runs.py,sha256=Xn0Tj6sbxijdG9-x7rXFiLvKOAzdJ18QSZR0j5VEMYQ,33561
17
17
  evalvault/adapters/inbound/cli/__init__.py,sha256=a42flC5NK-VfbdbBrE49IrUL5zAyKdXZYJVM6E3NTE0,675
18
- evalvault/adapters/inbound/cli/app.py,sha256=Gf_VWXK2aUzVL63F5ulqPd88MgO1n823uISGhGHsdEI,1813
18
+ evalvault/adapters/inbound/cli/app.py,sha256=ytNgHRg9ZTAl33AkB1wIL8RKfQ_Cf8fsy0gSsLTs7Ew,1603
19
19
  evalvault/adapters/inbound/cli/commands/__init__.py,sha256=ciIHbHgP0gtasVi4l5cHjVojERrb-uipga_E0EwCrqM,3431
20
20
  evalvault/adapters/inbound/cli/commands/agent.py,sha256=YlOYMEzzS1aSKDKD_a7UK3St18X6GXGkdTatrzyd8Zc,7555
21
21
  evalvault/adapters/inbound/cli/commands/analyze.py,sha256=aMi1BEDOX3yhN-ppBftDssPQLB5TdzIfpx9U7CZEgWo,48932
22
22
  evalvault/adapters/inbound/cli/commands/api.py,sha256=YdbJ_-QEajnFcjTa7P2heLMjFKpeQ4nWP_p-HvfYkEo,1943
23
23
  evalvault/adapters/inbound/cli/commands/benchmark.py,sha256=RZ4nRTF7d6hDZug-Pw8dGcFEyWdOKclwqkvS-gN4VWo,41097
24
- evalvault/adapters/inbound/cli/commands/config.py,sha256=r3DH2a0-PgJIzpyB7teiykDulhUwUJUkiFWLrbjhF6k,7148
24
+ evalvault/adapters/inbound/cli/commands/config.py,sha256=Mv9IQHBFHZ3I2stUzHDgLDn-Znt_Awdy3j-sk5ruUmw,6069
25
25
  evalvault/adapters/inbound/cli/commands/debug.py,sha256=KU-hL1gLhpjV2ZybDQgGMwRfm-hCynkrqY4UzETfL9k,2234
26
26
  evalvault/adapters/inbound/cli/commands/domain.py,sha256=dL9iqBlnr5mDeS1unXW6uxE0qp6yfnxj-ls6k3EenwI,27279
27
27
  evalvault/adapters/inbound/cli/commands/experiment.py,sha256=jficaFOsZ9EMHrPHCOZjq6jpFrgmqCwmIo--wA_OcvQ,10389
@@ -33,7 +33,7 @@ evalvault/adapters/inbound/cli/commands/kg.py,sha256=ycV9Xj6SUUJLTyTfLZcjXDVLcZq
33
33
  evalvault/adapters/inbound/cli/commands/langfuse.py,sha256=aExhZ5WYT0FzJI4v1sF-a1jqy9b1BF46_HBtfiQjVGI,4085
34
34
  evalvault/adapters/inbound/cli/commands/method.py,sha256=K1UacoKwV9w8sLeQK8qHyTuZqFZrlcj6yS_y2izfRlo,18853
35
35
  evalvault/adapters/inbound/cli/commands/phoenix.py,sha256=LQi3KTLq1ybjjBuz92oQ6lYyBS3mHrCHk0qe-7bqB4U,15611
36
- evalvault/adapters/inbound/cli/commands/pipeline.py,sha256=Hg3A2LGTLw_rjd6ZgT5lOVsTASXIyq2DimUna24FRv0,7936
36
+ evalvault/adapters/inbound/cli/commands/pipeline.py,sha256=NeqWLzO9kRDuZd0pHAIHglP3F7VzoNOU4JI0QcSZ120,7788
37
37
  evalvault/adapters/inbound/cli/commands/prompts.py,sha256=6UwQtKJf3JYhcNI4tQqjjsL-sp_cmu2VV7gETkCcmkk,5490
38
38
  evalvault/adapters/inbound/cli/commands/run.py,sha256=6d_AnONUiroNMF1xZt8O1sbtqb5HcE53ZMAU-UOp1cA,115469
39
39
  evalvault/adapters/inbound/cli/commands/run_helpers.py,sha256=50nYzf4DUniJd7fQgT2cyh_FWVTWZzW0UMXCg-EHBuY,39764
@@ -65,7 +65,7 @@ evalvault/adapters/outbound/analysis/embedding_searcher_module.py,sha256=j6w_jIG
65
65
  evalvault/adapters/outbound/analysis/hybrid_rrf_module.py,sha256=kaHSc7z3Jg_KrRLBqPMTV_9XXsL6v1dmbz-3dDO6IMw,3255
66
66
  evalvault/adapters/outbound/analysis/hybrid_weighted_module.py,sha256=AO-7thmnFGerUDWd8l9ydxeAkHkACo7Raf9O0RfW_nE,3671
67
67
  evalvault/adapters/outbound/analysis/hypothesis_generator_module.py,sha256=tx9fWgS0rBoK5eJPmwK5POoV78yN03hkFmWhCx71Ln0,13337
68
- evalvault/adapters/outbound/analysis/llm_report_module.py,sha256=KjIM2MET6gl9jUpxRo0rDVIzqSXFw-I4y0QoG_TULFA,38773
68
+ evalvault/adapters/outbound/analysis/llm_report_module.py,sha256=RIACcqy7DAgllcr_sea4Ap3rE8NXEJTeAx46GWL7Dq4,47250
69
69
  evalvault/adapters/outbound/analysis/low_performer_extractor_module.py,sha256=Pt0Tmtc5Etqp_3SBDCPAzqWI2EF9woSg0mmBucEHlQw,1291
70
70
  evalvault/adapters/outbound/analysis/model_analyzer_module.py,sha256=28rHdXBXYIFpLHixbbZcv6-j2QVgl3yaGN0vU1Q0gFc,2682
71
71
  evalvault/adapters/outbound/analysis/morpheme_analyzer_module.py,sha256=Hrh4mluMsOhQHPrliD2w0FVKokJpfikXOFKT6sNwk74,4158
@@ -204,11 +204,13 @@ evalvault/domain/entities/rag_trace.py,sha256=sZgnkG4fK6KOe3Np6TYAZ_tPnsRbOmucDS
204
204
  evalvault/domain/entities/result.py,sha256=OaGHMDLWMW2O4fNVuVTUvWFVBQ1iu93OD_oI3NumrCQ,10697
205
205
  evalvault/domain/entities/stage.py,sha256=dbVzhgpP_p2p2eDJBWe7mwyyl6zUTP9kEKN_YRUvufY,7183
206
206
  evalvault/domain/metrics/__init__.py,sha256=fxjC5Z_8OuBIeMn80bYgnZZxpNoay2wH-qtG3NqCUvk,797
207
+ evalvault/domain/metrics/analysis_registry.py,sha256=JZpBrBs7-JExHKYuEML6Vg_uYLm-WniBE3BfiU5OtJg,7641
207
208
  evalvault/domain/metrics/confidence.py,sha256=AX4oeN28OvmMkwD0pT-jskkOlXh87C1pe2W9P1sF69g,17224
208
209
  evalvault/domain/metrics/contextual_relevancy.py,sha256=xAPYUv_0TM4j4WOutOSGftNln_l-2Ev6qpANeu4REv8,11057
209
210
  evalvault/domain/metrics/entity_preservation.py,sha256=uSCbaETceE5PbGn-230Rm8pryOA8jDkkeOwAkWxA65g,6500
210
211
  evalvault/domain/metrics/insurance.py,sha256=5NPeAi_86rpuZRgV4KhzomGrq3Uw2jjglN6FfA_AO8o,4040
211
212
  evalvault/domain/metrics/no_answer.py,sha256=x6vRyOa1jw-qsnw9kOYT8YMPdLElaDRu7zjNCpyJfqM,8237
213
+ evalvault/domain/metrics/registry.py,sha256=QKjo4RNHxCqObGg36xJP3KAHqFpHM50Jy7GeSksdz0Y,5665
212
214
  evalvault/domain/metrics/retrieval_rank.py,sha256=F55ByadJBowyKHKBmKAZ0T0qN_R1_7UNu-MiLnT4Ypg,14675
213
215
  evalvault/domain/metrics/terms_dictionary.json,sha256=-ZQmpx6yMOYoAOpcLj-xK2LkAeCbAw0EUb6-syIOKS0,3801
214
216
  evalvault/domain/metrics/text_match.py,sha256=P-YTZs9ekDqEmxLNBP8eXnMRymPdC8V4dJPtwG2ajVM,10219
@@ -241,7 +243,7 @@ evalvault/domain/services/memory_aware_evaluator.py,sha256=vTiYoxiMfZ_CMjSBjqwkB
241
243
  evalvault/domain/services/memory_based_analysis.py,sha256=oh2irCy3le7fWiTtL31SMEhPyu7fyBVz-giO2hlNifE,4499
242
244
  evalvault/domain/services/method_runner.py,sha256=pABqKZeaALpWZYDfzAbd-VOZt2djQggRNIPuuPQeUSw,3571
243
245
  evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xyW1H6Dj4Hxdn8XQSvQNSoQ,19436
244
- evalvault/domain/services/pipeline_template_registry.py,sha256=c1rvYsTQU5MdAsmbZ7LlnuF6TD3p4IXlzgq_i18J3f8,24039
246
+ evalvault/domain/services/pipeline_template_registry.py,sha256=j2WQwXrCvYd-dbtxOUTmgTZZAgNtu0eUvqgdryerCbc,27964
245
247
  evalvault/domain/services/prompt_manifest.py,sha256=5s5Kd6-_Dn-xrjjlU99CVo6njsPhvE50H5m_85U-H6U,5612
246
248
  evalvault/domain/services/prompt_registry.py,sha256=81tq__u2fFxTEG8bWnyJ2Qdb9N89jcqIdSfOAKEbEvg,3029
247
249
  evalvault/domain/services/prompt_status.py,sha256=r1dFLGz4SfRxXaxsULQsr0-HpJkG9YfZ_yLIxF1MMBo,6731
@@ -262,7 +264,7 @@ evalvault/ports/inbound/__init__.py,sha256=2Wsc0vNzH8_ZaErk4OHxP93hRonLUkMbn3W28
262
264
  evalvault/ports/inbound/analysis_pipeline_port.py,sha256=RJfKtp22AYEqnmRk6RDawAK52rEmyAhuk0FUPJQUwQU,1758
263
265
  evalvault/ports/inbound/evaluator_port.py,sha256=rDvouIRUjBD7uICgrpeo11vNPvo27_0CdylRHPodPSE,1323
264
266
  evalvault/ports/inbound/learning_hook_port.py,sha256=ehpRyRNUY1PRtzIoaCyDM_QRxp6WjEQvwPskAxI4CPc,3109
265
- evalvault/ports/inbound/web_port.py,sha256=kjDyNXkgRwbevmSnm25URk-qHjGN9K9ML83FAvwhbpM,5448
267
+ evalvault/ports/inbound/web_port.py,sha256=ljggDzHGUfh_H2j86F9upGFwR-ZXIJTunR2ahKMkn-A,5566
266
268
  evalvault/ports/outbound/__init__.py,sha256=jEmLbY3lZ9osue6pG5dc345BdMikBEWq4cnX7ocEul0,3276
267
269
  evalvault/ports/outbound/analysis_cache_port.py,sha256=zPSdUVK_yw3PMWPII2YvS1WLmCGlg5bDScSuYINW9yc,1386
268
270
  evalvault/ports/outbound/analysis_module_port.py,sha256=QYzkvie9-BbONj8ZgiQUjm8I-bn8mgzlXTzIXMhehmQ,1881
@@ -288,8 +290,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
288
290
  evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
289
291
  evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
290
292
  evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
291
- evalvault-1.59.0.dist-info/METADATA,sha256=unwBGPN_vReQ3ohlNQZjMhPy8GBTxDqy1eSPvprX7dk,14058
292
- evalvault-1.59.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
293
- evalvault-1.59.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
294
- evalvault-1.59.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
295
- evalvault-1.59.0.dist-info/RECORD,,
293
+ evalvault-1.60.0.dist-info/METADATA,sha256=STLcsvyERi1Xlx36zsmSzl5dz-skmdRTH-SXHoBK27E,14058
294
+ evalvault-1.60.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
295
+ evalvault-1.60.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
296
+ evalvault-1.60.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
297
+ evalvault-1.60.0.dist-info/RECORD,,