evalvault 1.59.0__py3-none-any.whl → 1.60.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
@@ -15,6 +15,13 @@ from urllib.request import urlopen
15
15
  from evalvault.config.phoenix_support import PhoenixExperimentResolver
16
16
  from evalvault.config.settings import Settings
17
17
  from evalvault.domain.entities.prompt import PromptSetBundle
18
+ from evalvault.domain.metrics.registry import (
19
+ get_metric_descriptions as registry_metric_descriptions,
20
+ )
21
+ from evalvault.domain.metrics.registry import (
22
+ list_metric_names,
23
+ list_metric_specs,
24
+ )
18
25
  from evalvault.domain.services.cluster_map_builder import build_cluster_map
19
26
  from evalvault.domain.services.prompt_registry import (
20
27
  PromptInput,
@@ -42,21 +49,6 @@ if TYPE_CHECKING:
42
49
 
43
50
  logger = logging.getLogger(__name__)
44
51
 
45
- # 지원하는 메트릭 목록
46
- AVAILABLE_METRICS = [
47
- "faithfulness",
48
- "answer_relevancy",
49
- "context_precision",
50
- "context_recall",
51
- "factual_correctness",
52
- "semantic_similarity",
53
- "summary_score",
54
- "summary_faithfulness",
55
- "insurance_term_accuracy",
56
- "entity_preservation",
57
- "contextual_relevancy",
58
- ]
59
-
60
52
 
61
53
  @dataclass
62
54
  class GateResult:
@@ -978,19 +970,15 @@ class WebUIAdapter:
978
970
 
979
971
  def get_available_metrics(self) -> list[str]:
980
972
  """사용 가능한 메트릭 목록 반환."""
981
- return AVAILABLE_METRICS.copy()
973
+ return list_metric_names()
974
+
975
+ def get_metric_specs(self) -> list[dict[str, object]]:
976
+ """메트릭 스펙 목록 반환."""
977
+ return [spec.to_dict() for spec in list_metric_specs()]
982
978
 
983
979
  def get_metric_descriptions(self) -> dict[str, str]:
984
980
  """메트릭별 설명 반환."""
985
- return {
986
- "faithfulness": "답변이 컨텍스트에 충실한지 평가",
987
- "answer_relevancy": "답변이 질문과 관련있는지 평가",
988
- "context_precision": "검색된 컨텍스트의 정밀도 평가",
989
- "context_recall": "필요한 정보가 검색되었는지 평가",
990
- "factual_correctness": "ground_truth 대비 사실적 정확성 평가",
991
- "semantic_similarity": "답변과 ground_truth 간 의미적 유사도 평가",
992
- "insurance_term_accuracy": "보험 용어 정확성 평가",
993
- }
981
+ return registry_metric_descriptions()
994
982
 
995
983
  def create_dataset_from_upload(
996
984
  self,
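
The adapter now delegates to the shared metric registry instead of module-level constants. A minimal sketch of the data its three methods now expose, calling the registry functions directly (the adapter methods are thin wrappers around them):

from evalvault.domain.metrics.registry import (
    get_metric_descriptions,
    list_metric_names,
    list_metric_specs,
)

# Same data the WebUIAdapter methods now return.
names = list_metric_names()                  # e.g. ["faithfulness", "answer_relevancy", ...]
descriptions = get_metric_descriptions()     # {metric name: description}
specs = [spec.to_dict() for spec in list_metric_specs()]

assert set(descriptions) == set(names)       # every metric has a description
assert {s["name"] for s in specs} == set(names)
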
@@ -11,6 +11,7 @@ from evalvault.adapters.outbound.llm import get_llm_adapter
11
11
  from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdapter
12
12
  from evalvault.config.settings import get_settings
13
13
  from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
14
+ from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs
14
15
  from evalvault.domain.services.pipeline_orchestrator import AnalysisPipelineService
15
16
 
16
17
  router = APIRouter(tags=["pipeline"])
@@ -220,6 +221,15 @@ class PipelineResultResponse(PipelineResultSummary):
220
221
  final_output: dict[str, Any] | None = None
221
222
 
222
223
 
224
+ class AnalysisMetricSpecResponse(BaseModel):
225
+ key: str
226
+ label: str
227
+ description: str
228
+ signal_group: str
229
+ module_id: str
230
+ output_path: list[str]
231
+
232
+
223
233
  def _serialize_payload(value: Any) -> Any:
224
234
  try:
225
235
  return jsonable_encoder(value)
@@ -366,6 +376,12 @@ async def list_intents():
366
376
  raise HTTPException(status_code=500, detail=str(exc)) from exc
367
377
 
368
378
 
379
+ @router.get("/options/analysis-metric-specs", response_model=list[AnalysisMetricSpecResponse])
380
+ async def list_analysis_metric_specs_endpoint():
381
+ """List analysis metric specs for pipeline outputs."""
382
+ return [spec.to_dict() for spec in list_analysis_metric_specs()]
383
+
384
+
369
385
  @router.post("/results", response_model=PipelineResultSummary)
370
386
  async def save_pipeline_result(payload: PipelineResultPayload):
371
387
  """Save a pipeline analysis result for history."""
@@ -113,6 +113,16 @@ class ModelItemResponse(BaseModel):
113
113
  supports_tools: bool | None = None
114
114
 
115
115
 
116
+ class MetricSpecResponse(BaseModel):
117
+ name: str
118
+ description: str
119
+ requires_ground_truth: bool
120
+ requires_embeddings: bool
121
+ source: str
122
+ category: str
123
+ signal_group: str
124
+
125
+
116
126
  class ClusterMapItemResponse(BaseModel):
117
127
  test_case_id: str
118
128
  cluster_id: str
@@ -395,6 +405,12 @@ def list_metrics(adapter: AdapterDep):
395
405
  return adapter.get_available_metrics()
396
406
 
397
407
 
408
+ @router.get("/options/metric-specs", response_model=list[MetricSpecResponse])
409
+ def list_metric_specs(adapter: AdapterDep):
410
+ """Get available metrics with metadata."""
411
+ return adapter.get_metric_specs()
412
+
413
+
398
414
  @router.get("/options/cluster-maps", response_model=list[ClusterMapFileResponse])
399
415
  def list_cluster_maps():
400
416
  """List available cluster map CSV files."""
@@ -14,6 +14,8 @@ import typer
14
14
  from rich import print as rprint
15
15
  from rich.console import Console
16
16
 
17
+ from evalvault.domain.metrics.registry import list_metric_names
18
+
17
19
  from .commands import attach_sub_apps, register_all_commands
18
20
 
19
21
 
@@ -32,19 +34,7 @@ app = typer.Typer(
32
34
  )
33
35
  console = Console()
34
36
 
35
- AVAILABLE_METRICS: list[str] = [
36
- "faithfulness",
37
- "answer_relevancy",
38
- "context_precision",
39
- "context_recall",
40
- "factual_correctness",
41
- "semantic_similarity",
42
- "summary_score",
43
- "summary_faithfulness",
44
- "insurance_term_accuracy",
45
- "entity_preservation",
46
- "contextual_relevancy",
47
- ]
37
+ AVAILABLE_METRICS = list_metric_names()
48
38
 
49
39
  register_all_commands(app, console, available_metrics=AVAILABLE_METRICS)
50
40
  attach_sub_apps(app, console)
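
AVAILABLE_METRICS is now sourced from the registry rather than a hard-coded list, so the CLI automatically advertises metrics the old list omitted, such as the retrieval metrics mrr, ndcg, and hit_rate defined in the registry. A quick check:

from evalvault.domain.metrics.registry import list_metric_names

names = list_metric_names()
# Present in the registry but missing from the previously hard-coded list.
assert {"mrr", "ndcg", "hit_rate"} <= set(names)
print(names)
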
@@ -7,6 +7,7 @@ from rich.console import Console
7
7
  from rich.table import Table
8
8
 
9
9
  from evalvault.config.settings import Settings, apply_profile
10
+ from evalvault.domain.metrics.registry import list_metric_specs
10
11
 
11
12
 
12
13
  def register_config_commands(app: typer.Typer, console: Console) -> None:
@@ -22,46 +23,9 @@ def register_config_commands(app: typer.Typer, console: Console) -> None:
22
23
  table.add_column("Description")
23
24
  table.add_column("Requires Ground Truth", justify="center")
24
25
 
25
- table.add_row(
26
- "faithfulness",
27
- "Measures factual accuracy of the answer based on contexts",
28
- "[red]No[/red]",
29
- )
30
- table.add_row(
31
- "answer_relevancy",
32
- "Measures how relevant the answer is to the question",
33
- "[red]No[/red]",
34
- )
35
- table.add_row(
36
- "context_precision",
37
- "Measures ranking quality of retrieved contexts",
38
- "[green]Yes[/green]",
39
- )
40
- table.add_row(
41
- "context_recall",
42
- "Measures if all relevant info is in retrieved contexts",
43
- "[green]Yes[/green]",
44
- )
45
- table.add_row(
46
- "summary_score",
47
- "Measures summary coverage and conciseness against contexts",
48
- "[red]No[/red]",
49
- )
50
- table.add_row(
51
- "summary_faithfulness",
52
- "Measures whether summary statements are grounded in contexts",
53
- "[red]No[/red]",
54
- )
55
- table.add_row(
56
- "entity_preservation",
57
- "Measures preservation of key insurance entities in summaries",
58
- "[red]No[/red]",
59
- )
60
- table.add_row(
61
- "insurance_term_accuracy",
62
- "Measures if insurance terms in answer are grounded in contexts",
63
- "[red]No[/red]",
64
- )
26
+ for spec in list_metric_specs():
27
+ needs_gt = "[green]Yes[/green]" if spec.requires_ground_truth else "[red]No[/red]"
28
+ table.add_row(spec.name, spec.description, needs_gt)
65
29
 
66
30
  console.print(table)
67
31
  console.print("\n[dim]Use --metrics flag with 'run' command to specify metrics.[/dim]")
@@ -13,6 +13,7 @@ from evalvault.adapters.outbound.storage.sqlite_adapter import SQLiteStorageAdap
13
13
  from evalvault.config.phoenix_support import ensure_phoenix_instrumentation
14
14
  from evalvault.config.settings import Settings
15
15
 
16
+ from ..utils.analysis_io import serialize_pipeline_result
16
17
  from ..utils.options import db_option
17
18
 
18
19
 
@@ -104,15 +105,10 @@ def register_pipeline_commands(app: typer.Typer, console) -> None:
104
105
  console.print(f" [red]{node_id}:[/red] {node_result.error}")
105
106
 
106
107
  if output:
107
- data = {
108
- "query": query,
109
- "intent": result.intent.value if result.intent else None,
110
- "is_complete": result.is_complete,
111
- "duration_ms": result.total_duration_ms,
112
- "final_output": result.final_output,
113
- }
108
+ payload = serialize_pipeline_result(result)
109
+ payload["query"] = query
114
110
  with open(output, "w", encoding="utf-8") as f:
115
- json.dump(data, f, ensure_ascii=False, indent=2)
111
+ json.dump(payload, f, ensure_ascii=False, indent=2)
116
112
  console.print(f"\n[green]Results saved to {output}[/green]")
117
113
 
118
114
  console.print()
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import asyncio
6
6
  import json
7
+ import re
7
8
  from typing import Any
8
9
 
9
10
  from evalvault.adapters.outbound.analysis.base_module import BaseAnalysisModule
@@ -13,6 +14,7 @@ from evalvault.adapters.outbound.analysis.pipeline_helpers import (
13
14
  truncate_text,
14
15
  )
15
16
  from evalvault.domain.entities import EvaluationRun
17
+ from evalvault.domain.metrics.registry import get_metric_spec_map
16
18
  from evalvault.ports.outbound.llm_port import LLMPort
17
19
 
18
20
 
@@ -54,6 +56,18 @@ class LLMReportModule(BaseAnalysisModule):
54
56
  report = self._llm_adapter.generate_text(
55
57
  self._build_prompt(context, evidence),
56
58
  )
59
+ report_type = context.get("report_type") or "analysis"
60
+ is_valid, reasons = self._validate_report(
61
+ report,
62
+ report_type=report_type,
63
+ evidence=evidence,
64
+ )
65
+ if not is_valid:
66
+ output = self._fallback_report(context, evidence, llm_used=False)
67
+ output["llm_error"] = (
68
+ "LLM report validation failed: " + "; ".join(reasons)
69
+ ).strip()
70
+ return output
57
71
  return self._build_output(context, evidence, report, llm_used=True)
58
72
  except Exception as exc:
59
73
  output = self._fallback_report(context, evidence, llm_used=False)
@@ -91,6 +105,18 @@ class LLMReportModule(BaseAnalysisModule):
91
105
  self._llm_adapter.generate_text,
92
106
  self._build_prompt(context, evidence),
93
107
  )
108
+ report_type = context.get("report_type") or "analysis"
109
+ is_valid, reasons = self._validate_report(
110
+ report,
111
+ report_type=report_type,
112
+ evidence=evidence,
113
+ )
114
+ if not is_valid:
115
+ output = self._fallback_report(context, evidence, llm_used=False)
116
+ output["llm_error"] = (
117
+ "LLM report validation failed: " + "; ".join(reasons)
118
+ ).strip()
119
+ return output
94
120
  return self._build_output(context, evidence, report, llm_used=True)
95
121
  except Exception as exc:
96
122
  output = self._fallback_report(context, evidence, llm_used=False)
@@ -367,6 +393,107 @@ class LLMReportModule(BaseAnalysisModule):
367
393
  )
368
394
  return scorecard
369
395
 
396
+ def _build_signal_group_summary(
397
+ self,
398
+ scorecard: list[dict[str, Any]],
399
+ ) -> dict[str, dict[str, Any]]:
400
+ spec_map = get_metric_spec_map()
401
+ summary: dict[str, dict[str, Any]] = {}
402
+ for row in scorecard:
403
+ metric = row.get("metric")
404
+ if not metric:
405
+ continue
406
+ spec = spec_map.get(metric)
407
+ group = spec.signal_group if spec else "unknown"
408
+ bucket = summary.setdefault(
409
+ group,
410
+ {
411
+ "metrics": [],
412
+ "mean_avg": None,
413
+ "pass_rate_avg": None,
414
+ "risk_count": 0,
415
+ "total": 0,
416
+ "_mean_values": [],
417
+ "_pass_rates": [],
418
+ },
419
+ )
420
+ bucket["metrics"].append(metric)
421
+ mean = row.get("mean")
422
+ if isinstance(mean, int | float):
423
+ bucket["_mean_values"].append(float(mean))
424
+ pass_rate = row.get("pass_rate")
425
+ if isinstance(pass_rate, int | float):
426
+ bucket["_pass_rates"].append(float(pass_rate))
427
+ if row.get("status") == "risk":
428
+ bucket["risk_count"] += 1
429
+ bucket["total"] += 1
430
+
431
+ for bucket in summary.values():
432
+ mean_values = bucket.pop("_mean_values", [])
433
+ pass_rates = bucket.pop("_pass_rates", [])
434
+ bucket["mean_avg"] = round(safe_mean(mean_values), 4) if mean_values else None
435
+ bucket["pass_rate_avg"] = round(safe_mean(pass_rates), 4) if pass_rates else None
436
+ return summary
437
+
438
+ def _build_risk_metrics(
439
+ self,
440
+ scorecard: list[dict[str, Any]],
441
+ *,
442
+ limit: int = 6,
443
+ ) -> list[dict[str, Any]]:
444
+ risk_rows: list[dict[str, Any]] = []
445
+ for row in scorecard:
446
+ status = row.get("status")
447
+ pass_rate = row.get("pass_rate")
448
+ is_risk = status == "risk" or (
449
+ isinstance(pass_rate, int | float) and float(pass_rate) < 0.7
450
+ )
451
+ if not is_risk:
452
+ continue
453
+ risk_rows.append(
454
+ {
455
+ "metric": row.get("metric"),
456
+ "mean": row.get("mean"),
457
+ "threshold": row.get("threshold"),
458
+ "pass_rate": pass_rate,
459
+ "gap": row.get("gap"),
460
+ "status": status,
461
+ }
462
+ )
463
+
464
+ def _sort_key(item: dict[str, Any]) -> tuple[float, float]:
465
+ gap = item.get("gap")
466
+ gap_value = float(gap) if isinstance(gap, int | float) else 0.0
467
+ pass_rate = item.get("pass_rate")
468
+ pass_value = float(pass_rate) if isinstance(pass_rate, int | float) else 1.0
469
+ return (gap_value, -pass_value)
470
+
471
+ risk_rows.sort(key=_sort_key, reverse=True)
472
+ return risk_rows[:limit]
473
+
474
+ def _build_significant_changes(
475
+ self,
476
+ comparison_scorecard: list[dict[str, Any]],
477
+ *,
478
+ limit: int = 6,
479
+ ) -> list[dict[str, Any]]:
480
+ changes: list[dict[str, Any]] = []
481
+ for row in comparison_scorecard:
482
+ if not row.get("is_significant"):
483
+ continue
484
+ changes.append(
485
+ {
486
+ "metric": row.get("metric"),
487
+ "diff": row.get("diff"),
488
+ "diff_percent": row.get("diff_percent"),
489
+ "effect_size": row.get("effect_size"),
490
+ "effect_level": row.get("effect_level"),
491
+ "direction": row.get("direction"),
492
+ "winner": row.get("winner"),
493
+ }
494
+ )
495
+ return changes[:limit]
496
+
370
497
  def _build_comparison_scorecard(
371
498
  self,
372
499
  comparison_details: dict[str, Any] | None,
@@ -397,6 +524,62 @@ class LLMReportModule(BaseAnalysisModule):
397
524
  )
398
525
  return scorecard
399
526
 
527
+ def _validate_report(
528
+ self,
529
+ report: str,
530
+ *,
531
+ report_type: str,
532
+ evidence: list[dict[str, Any]],
533
+ ) -> tuple[bool, list[str]]:
534
+ reasons: list[str] = []
535
+ normalized = report_type or "analysis"
536
+
537
+ if not re.search(r"[가-힣]", report):
538
+ reasons.append("한국어 본문 미검출")
539
+
540
+ required_sections = {
541
+ "comparison": [
542
+ "요약",
543
+ "변경 사항 요약",
544
+ "지표 비교 스코어카드",
545
+ "통계적 신뢰도",
546
+ "원인 분석",
547
+ "개선 제안",
548
+ "다음 단계",
549
+ "부록(산출물)",
550
+ ],
551
+ "summary": [
552
+ "요약",
553
+ "지표 스코어카드",
554
+ "개선 제안",
555
+ "다음 단계",
556
+ "부록(산출물)",
557
+ ],
558
+ "analysis": [
559
+ "요약",
560
+ "지표 스코어카드",
561
+ "데이터 품질/신뢰도",
562
+ "증거 기반 인사이트",
563
+ "원인 가설",
564
+ "개선 제안",
565
+ "다음 단계",
566
+ "부록(산출물)",
567
+ ],
568
+ }
569
+ for section in required_sections.get(normalized, []):
570
+ if section not in report:
571
+ reasons.append(f"섹션 누락: {section}")
572
+
573
+ if evidence:
574
+ if normalized == "comparison":
575
+ if not re.search(r"\[(A|B)\\d+\\]", report):
576
+ reasons.append("증거 인용([A1]/[B1]) 누락")
577
+ else:
578
+ if not re.search(r"\\[E\\d+\\]", report):
579
+ reasons.append("증거 인용([E1]) 누락")
580
+
581
+ return len(reasons) == 0, reasons
582
+
400
583
  def _build_priority_highlights(self, priority_summary: dict[str, Any] | None) -> dict[str, Any]:
401
584
  priority_summary = priority_summary or {}
402
585
  bottom_cases = priority_summary.get("bottom_cases", [])
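
The validator's evidence-citation checks come down to two regular expressions: comparison reports must cite runs as [A1]/[B1], all other reports cite evidence items as [E1]. A quick standalone check of the patterns:

import re

CITATION_DEFAULT = re.compile(r"\[E\d+\]")         # e.g. [E1], [E12]
CITATION_COMPARISON = re.compile(r"\[(A|B)\d+\]")  # e.g. [A1], [B3]

assert CITATION_DEFAULT.search("faithfulness dropped, see [E2]")
assert CITATION_COMPARISON.search("run B ahead on context_recall [B1]")
assert not CITATION_DEFAULT.search("a report with no citations")
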
@@ -598,6 +781,9 @@ class LLMReportModule(BaseAnalysisModule):
598
781
  priority_highlights = self._build_priority_highlights(context.get("priority_summary"))
599
782
  prompt_change_summary = self._summarize_prompt_changes(context.get("change_summary"))
600
783
  artifact_manifest = self._build_artifact_manifest(context.get("artifact_nodes") or [])
784
+ signal_group_summary = self._build_signal_group_summary(scorecard)
785
+ risk_metrics = self._build_risk_metrics(scorecard)
786
+ significant_changes = self._build_significant_changes(comparison_scorecard)
601
787
  change_summary = context.get("change_summary")
602
788
  if isinstance(change_summary, dict) and prompt_change_summary:
603
789
  change_summary = dict(change_summary)
@@ -635,12 +821,15 @@ class LLMReportModule(BaseAnalysisModule):
635
821
  "comparison": context.get("comparison"),
636
822
  "comparison_details": comparison_details,
637
823
  "comparison_scorecard": comparison_scorecard,
824
+ "significant_changes": significant_changes,
638
825
  "change_summary": change_summary,
639
826
  "priority_summary": context.get("priority_summary"),
640
827
  "priority_highlights": priority_highlights,
641
828
  "quality_checks": context.get("quality_checks"),
642
829
  "quality_summary": quality_summary,
643
830
  "scorecard": scorecard,
831
+ "signal_group_summary": signal_group_summary,
832
+ "risk_metrics": risk_metrics,
644
833
  "artifact_manifest": artifact_manifest,
645
834
  }
646
835
 
@@ -648,6 +837,19 @@ class LLMReportModule(BaseAnalysisModule):
648
837
  evidence_json = json.dumps(evidence, ensure_ascii=False, indent=2)
649
838
 
650
839
  report_type = context.get("report_type")
840
+ common_requirements = (
841
+ "공통 원칙:\n"
842
+ "1) 모든 주장/원인/개선안은 summary_json 또는 evidence에 근거해야 함\n"
843
+ "2) 숫자/지표는 scorecard, comparison_scorecard, risk_metrics에서 직접 인용\n"
844
+ "3) 근거가 부족하면 '추가 데이터 필요'를 명시하고 추측 금지\n"
845
+ "4) 2026-01 기준 널리 쓰이는 RAG 개선 패턴을 우선 고려하되, "
846
+ "현재 데이터 이슈와 연결되는 항목만 선택\n"
847
+ "4-1) 개선 패턴 예시: 하이브리드 검색+리랭커, 쿼리 재작성, "
848
+ "동적 청크/컨텍스트 압축, 메타데이터/필터링, 인용/검증 단계, "
849
+ "신뢰도 기반 답변 거절, 평가셋 확장/하드 네거티브, 피드백 루프\n"
850
+ "5) 개선안마다 기대되는 영향 지표, 검증 방법(실험/재평가), 리스크를 함께 서술\n"
851
+ "6) 신뢰도/타당성 제약(표본 수, 커버리지, 유의성, 데이터 변경)을 명시\n"
852
+ )
651
853
  if report_type == "comparison":
652
854
  requirements = (
653
855
  "요구사항:\n"
@@ -656,9 +858,11 @@ class LLMReportModule(BaseAnalysisModule):
656
858
  "3) 섹션: 요약, 변경 사항 요약, 지표 비교 스코어카드, 통계적 신뢰도, "
657
859
  "원인 분석, 개선 제안, 다음 단계, 부록(산출물)\n"
658
860
  "4) 요약은 한 문장 결론 + 핵심 3개 bullet(지표/변경 사항/사용자 영향)\n"
659
- "5) 스코어카드는 Markdown 표로 작성 (metric, A, B, diff, p-value, effect, 상태)\n"
861
+ "5) 스코어카드는 Markdown 표로 작성 (metric, A, B, diff, "
862
+ "p-value, effect, 상태)\n"
660
863
  "6) 데이터셋 차이가 있으면 비교 해석 제한을 명확히 표기\n"
661
- "7) 변경 사항이 없거나 근거가 약하면 '추가 데이터 필요'라고 명시\n"
864
+ "7) 유의한 변화는 significant_changes를 활용해 강조\n"
865
+ "8) 변경 사항이 없거나 근거가 약하면 '추가 데이터 필요'라고 명시\n"
662
866
  )
663
867
  elif report_type == "summary":
664
868
  requirements = (
@@ -667,8 +871,10 @@ class LLMReportModule(BaseAnalysisModule):
667
871
  "2) 핵심 주장/개선안에는 evidence_id를 [E1] 형식으로 인용\n"
668
872
  "3) 섹션: 요약, 지표 스코어카드, 개선 제안, 다음 단계, 부록(산출물)\n"
669
873
  "4) 요약은 한 문장 결론 + 핵심 3개 bullet\n"
670
- "5) 스코어카드는 Markdown 표로 작성 (metric, 평균, threshold, pass_rate, 상태)\n"
671
- "6) 근거가 부족하면 '추가 데이터 필요'라고 명시\n"
874
+ "5) 스코어카드는 Markdown 표로 작성 (metric, 평균, threshold, "
875
+ "pass_rate, 상태)\n"
876
+ "6) risk_metrics를 활용해 상위 위험 지표 3개를 명확히 언급\n"
877
+ "7) 근거가 부족하면 '추가 데이터 필요'라고 명시\n"
672
878
  )
673
879
  else:
674
880
  requirements = (
@@ -678,15 +884,18 @@ class LLMReportModule(BaseAnalysisModule):
678
884
  "3) 섹션: 요약, 지표 스코어카드, 데이터 품질/신뢰도, 증거 기반 인사이트, "
679
885
  "원인 가설, 개선 제안, 다음 단계, 부록(산출물)\n"
680
886
  "4) 요약은 한 문장 결론 + 핵심 3개 bullet(지표/원인/사용자 영향)\n"
681
- "5) 스코어카드는 Markdown 표로 작성 (metric, 평균, threshold, pass_rate, 상태)\n"
682
- "6) 사용자 영향은 신뢰/이해/인지부하 관점으로 1~2문장\n"
683
- "7) 근거가 부족하면 '추가 데이터 필요'라고 명시\n"
887
+ "5) 스코어카드는 Markdown 표로 작성 (metric, 평균, threshold, "
888
+ "pass_rate, 상태)\n"
889
+ "6) signal_group_summary로 축별 약점/강점을 분해\n"
890
+ "7) 사용자 영향은 신뢰/이해/인지부하 관점으로 1~2문장\n"
891
+ "8) 근거가 부족하면 '추가 데이터 필요'라고 명시\n"
684
892
  )
685
893
 
686
894
  return (
687
895
  "당신은 RAG 평가 분석 보고서 작성자입니다. "
688
896
  "아래 데이터와 증거를 기반으로 Markdown 보고서를 작성하세요.\n"
689
897
  "\n"
898
+ f"{common_requirements}\n"
690
899
  f"{requirements}\n"
691
900
  "[요약 데이터]\n"
692
901
  f"{summary_json}\n"
@@ -0,0 +1,217 @@
1
+ """Analysis metric registry for pipeline outputs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Literal
7
+
8
+ from evalvault.domain.metrics.registry import SignalGroup
9
+
10
+ AnalysisMetricSource = Literal[
11
+ "retrieval_analyzer",
12
+ "embedding_analyzer",
13
+ "bm25_searcher",
14
+ "embedding_searcher",
15
+ "hybrid_rrf",
16
+ "hybrid_weighted",
17
+ ]
18
+
19
+
20
+ @dataclass(frozen=True)
21
+ class AnalysisMetricSpec:
22
+ key: str
23
+ label: str
24
+ description: str
25
+ signal_group: SignalGroup
26
+ module_id: AnalysisMetricSource
27
+ output_path: tuple[str, ...]
28
+
29
+ def to_dict(self) -> dict[str, object]:
30
+ return {
31
+ "key": self.key,
32
+ "label": self.label,
33
+ "description": self.description,
34
+ "signal_group": self.signal_group,
35
+ "module_id": self.module_id,
36
+ "output_path": list(self.output_path),
37
+ }
38
+
39
+
40
+ _ANALYSIS_METRICS: tuple[AnalysisMetricSpec, ...] = (
41
+ AnalysisMetricSpec(
42
+ key="retrieval.avg_contexts",
43
+ label="Avg contexts per query",
44
+ description="Average number of contexts retrieved per query",
45
+ signal_group="retrieval_effectiveness",
46
+ module_id="retrieval_analyzer",
47
+ output_path=("summary", "avg_contexts"),
48
+ ),
49
+ AnalysisMetricSpec(
50
+ key="retrieval.empty_context_rate",
51
+ label="Empty context rate",
52
+ description="Share of queries with empty contexts",
53
+ signal_group="retrieval_effectiveness",
54
+ module_id="retrieval_analyzer",
55
+ output_path=("summary", "empty_context_rate"),
56
+ ),
57
+ AnalysisMetricSpec(
58
+ key="retrieval.avg_context_tokens",
59
+ label="Avg context tokens",
60
+ description="Average token count across contexts",
61
+ signal_group="retrieval_effectiveness",
62
+ module_id="retrieval_analyzer",
63
+ output_path=("summary", "avg_context_tokens"),
64
+ ),
65
+ AnalysisMetricSpec(
66
+ key="retrieval.keyword_overlap",
67
+ label="Keyword overlap",
68
+ description="Keyword overlap between question and contexts",
69
+ signal_group="retrieval_effectiveness",
70
+ module_id="retrieval_analyzer",
71
+ output_path=("summary", "avg_keyword_overlap"),
72
+ ),
73
+ AnalysisMetricSpec(
74
+ key="retrieval.ground_truth_hit_rate",
75
+ label="Ground truth hit rate",
76
+ description="Share of cases where ground truth appears in contexts",
77
+ signal_group="retrieval_effectiveness",
78
+ module_id="retrieval_analyzer",
79
+ output_path=("summary", "ground_truth_hit_rate"),
80
+ ),
81
+ AnalysisMetricSpec(
82
+ key="retrieval.avg_faithfulness_proxy",
83
+ label="Context faithfulness proxy",
84
+ description="Proxy faithfulness from context-grounding check",
85
+ signal_group="groundedness",
86
+ module_id="retrieval_analyzer",
87
+ output_path=("summary", "avg_faithfulness"),
88
+ ),
89
+ AnalysisMetricSpec(
90
+ key="retrieval.avg_retrieval_score",
91
+ label="Avg retrieval score",
92
+ description="Average retrieval score from metadata",
93
+ signal_group="retrieval_effectiveness",
94
+ module_id="retrieval_analyzer",
95
+ output_path=("summary", "avg_retrieval_score"),
96
+ ),
97
+ AnalysisMetricSpec(
98
+ key="retrieval.avg_retrieval_time_ms",
99
+ label="Avg retrieval latency (ms)",
100
+ description="Average retrieval latency in milliseconds",
101
+ signal_group="efficiency",
102
+ module_id="retrieval_analyzer",
103
+ output_path=("summary", "avg_retrieval_time_ms"),
104
+ ),
105
+ AnalysisMetricSpec(
106
+ key="bm25.avg_recall_at_k",
107
+ label="BM25 avg recall@k",
108
+ description="Average recall@k for BM25 retrieval",
109
+ signal_group="retrieval_effectiveness",
110
+ module_id="bm25_searcher",
111
+ output_path=("summary", "avg_recall_at_k"),
112
+ ),
113
+ AnalysisMetricSpec(
114
+ key="bm25.avg_top_score",
115
+ label="BM25 avg top score",
116
+ description="Average top score for BM25 retrieval",
117
+ signal_group="retrieval_effectiveness",
118
+ module_id="bm25_searcher",
119
+ output_path=("summary", "avg_top_score"),
120
+ ),
121
+ AnalysisMetricSpec(
122
+ key="embedding.avg_recall_at_k",
123
+ label="Embedding avg recall@k",
124
+ description="Average recall@k for dense retrieval",
125
+ signal_group="retrieval_effectiveness",
126
+ module_id="embedding_searcher",
127
+ output_path=("summary", "avg_recall_at_k"),
128
+ ),
129
+ AnalysisMetricSpec(
130
+ key="embedding.avg_top_score",
131
+ label="Embedding avg top score",
132
+ description="Average top score for dense retrieval",
133
+ signal_group="retrieval_effectiveness",
134
+ module_id="embedding_searcher",
135
+ output_path=("summary", "avg_top_score"),
136
+ ),
137
+ AnalysisMetricSpec(
138
+ key="hybrid_rrf.avg_recall_at_k",
139
+ label="Hybrid RRF avg recall@k",
140
+ description="Average recall@k for RRF hybrid retrieval",
141
+ signal_group="retrieval_effectiveness",
142
+ module_id="hybrid_rrf",
143
+ output_path=("summary", "avg_recall_at_k"),
144
+ ),
145
+ AnalysisMetricSpec(
146
+ key="hybrid_rrf.avg_top_score",
147
+ label="Hybrid RRF avg top score",
148
+ description="Average top score for RRF hybrid retrieval",
149
+ signal_group="retrieval_effectiveness",
150
+ module_id="hybrid_rrf",
151
+ output_path=("summary", "avg_top_score"),
152
+ ),
153
+ AnalysisMetricSpec(
154
+ key="hybrid_weighted.avg_recall_at_k",
155
+ label="Hybrid weighted avg recall@k",
156
+ description="Average recall@k for weighted hybrid retrieval",
157
+ signal_group="retrieval_effectiveness",
158
+ module_id="hybrid_weighted",
159
+ output_path=("summary", "avg_recall_at_k"),
160
+ ),
161
+ AnalysisMetricSpec(
162
+ key="hybrid_weighted.avg_top_score",
163
+ label="Hybrid weighted avg top score",
164
+ description="Average top score for weighted hybrid retrieval",
165
+ signal_group="retrieval_effectiveness",
166
+ module_id="hybrid_weighted",
167
+ output_path=("summary", "avg_top_score"),
168
+ ),
169
+ AnalysisMetricSpec(
170
+ key="embedding.avg_norm",
171
+ label="Embedding avg norm",
172
+ description="Average embedding vector norm",
173
+ signal_group="embedding_quality",
174
+ module_id="embedding_analyzer",
175
+ output_path=("summary", "avg_norm"),
176
+ ),
177
+ AnalysisMetricSpec(
178
+ key="embedding.norm_std",
179
+ label="Embedding norm std",
180
+ description="Std-dev of embedding norms",
181
+ signal_group="embedding_quality",
182
+ module_id="embedding_analyzer",
183
+ output_path=("summary", "norm_std"),
184
+ ),
185
+ AnalysisMetricSpec(
186
+ key="embedding.norm_min",
187
+ label="Embedding norm min",
188
+ description="Minimum embedding norm",
189
+ signal_group="embedding_quality",
190
+ module_id="embedding_analyzer",
191
+ output_path=("summary", "norm_min"),
192
+ ),
193
+ AnalysisMetricSpec(
194
+ key="embedding.norm_max",
195
+ label="Embedding norm max",
196
+ description="Maximum embedding norm",
197
+ signal_group="embedding_quality",
198
+ module_id="embedding_analyzer",
199
+ output_path=("summary", "norm_max"),
200
+ ),
201
+ AnalysisMetricSpec(
202
+ key="embedding.mean_cosine_to_centroid",
203
+ label="Embedding mean cosine",
204
+ description="Mean cosine similarity to centroid",
205
+ signal_group="embedding_quality",
206
+ module_id="embedding_analyzer",
207
+ output_path=("summary", "mean_cosine_to_centroid"),
208
+ ),
209
+ )
210
+
211
+
212
+ def list_analysis_metric_specs() -> list[AnalysisMetricSpec]:
213
+ return list(_ANALYSIS_METRICS)
214
+
215
+
216
+ def list_analysis_metric_keys() -> list[str]:
217
+ return [spec.key for spec in _ANALYSIS_METRICS]
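
Each spec carries an output_path pointing into the owning module's output dict. A sketch of how a consumer might resolve a value with it (the extract_metric helper below is hypothetical, not part of the package):

from typing import Any

from evalvault.domain.metrics.analysis_registry import list_analysis_metric_specs


def extract_metric(module_outputs: dict[str, dict[str, Any]], key: str) -> Any:
    """Hypothetical helper: walk spec.output_path inside the owning module's output."""
    spec = next(s for s in list_analysis_metric_specs() if s.key == key)
    value: Any = module_outputs.get(spec.module_id, {})
    for part in spec.output_path:
        if not isinstance(value, dict):
            return None
        value = value.get(part)
    return value


outputs = {"retrieval_analyzer": {"summary": {"avg_contexts": 3.4}}}
print(extract_metric(outputs, "retrieval.avg_contexts"))  # 3.4
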
@@ -0,0 +1,185 @@
1
+ """Metric registry for CLI/Web UI integrations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Literal
7
+
8
+ MetricSource = Literal["ragas", "custom"]
9
+ MetricCategory = Literal["qa", "summary", "retrieval", "domain"]
10
+ SignalGroup = Literal[
11
+ "groundedness",
12
+ "intent_alignment",
13
+ "retrieval_effectiveness",
14
+ "summary_fidelity",
15
+ "embedding_quality",
16
+ "efficiency",
17
+ ]
18
+
19
+
20
+ @dataclass(frozen=True)
21
+ class MetricSpec:
22
+ name: str
23
+ description: str
24
+ requires_ground_truth: bool
25
+ requires_embeddings: bool
26
+ source: MetricSource
27
+ category: MetricCategory
28
+ signal_group: SignalGroup
29
+
30
+ def to_dict(self) -> dict[str, object]:
31
+ return {
32
+ "name": self.name,
33
+ "description": self.description,
34
+ "requires_ground_truth": self.requires_ground_truth,
35
+ "requires_embeddings": self.requires_embeddings,
36
+ "source": self.source,
37
+ "category": self.category,
38
+ "signal_group": self.signal_group,
39
+ }
40
+
41
+
42
+ _METRIC_SPECS: tuple[MetricSpec, ...] = (
43
+ MetricSpec(
44
+ name="faithfulness",
45
+ description="Measures factual accuracy of the answer based on contexts",
46
+ requires_ground_truth=False,
47
+ requires_embeddings=False,
48
+ source="ragas",
49
+ category="qa",
50
+ signal_group="groundedness",
51
+ ),
52
+ MetricSpec(
53
+ name="answer_relevancy",
54
+ description="Measures how relevant the answer is to the question",
55
+ requires_ground_truth=False,
56
+ requires_embeddings=True,
57
+ source="ragas",
58
+ category="qa",
59
+ signal_group="intent_alignment",
60
+ ),
61
+ MetricSpec(
62
+ name="context_precision",
63
+ description="Measures ranking quality of retrieved contexts",
64
+ requires_ground_truth=True,
65
+ requires_embeddings=False,
66
+ source="ragas",
67
+ category="qa",
68
+ signal_group="retrieval_effectiveness",
69
+ ),
70
+ MetricSpec(
71
+ name="context_recall",
72
+ description="Measures if all relevant info is in retrieved contexts",
73
+ requires_ground_truth=True,
74
+ requires_embeddings=False,
75
+ source="ragas",
76
+ category="qa",
77
+ signal_group="retrieval_effectiveness",
78
+ ),
79
+ MetricSpec(
80
+ name="factual_correctness",
81
+ description="Measures factual correctness against ground truth",
82
+ requires_ground_truth=True,
83
+ requires_embeddings=False,
84
+ source="ragas",
85
+ category="qa",
86
+ signal_group="groundedness",
87
+ ),
88
+ MetricSpec(
89
+ name="semantic_similarity",
90
+ description="Measures semantic similarity between answer and ground truth",
91
+ requires_ground_truth=True,
92
+ requires_embeddings=True,
93
+ source="ragas",
94
+ category="qa",
95
+ signal_group="intent_alignment",
96
+ ),
97
+ MetricSpec(
98
+ name="mrr",
99
+ description="Measures reciprocal rank of the first relevant context",
100
+ requires_ground_truth=True,
101
+ requires_embeddings=False,
102
+ source="custom",
103
+ category="retrieval",
104
+ signal_group="retrieval_effectiveness",
105
+ ),
106
+ MetricSpec(
107
+ name="ndcg",
108
+ description="Measures ranking quality across relevant contexts",
109
+ requires_ground_truth=True,
110
+ requires_embeddings=False,
111
+ source="custom",
112
+ category="retrieval",
113
+ signal_group="retrieval_effectiveness",
114
+ ),
115
+ MetricSpec(
116
+ name="hit_rate",
117
+ description="Measures whether any relevant context appears in top K",
118
+ requires_ground_truth=True,
119
+ requires_embeddings=False,
120
+ source="custom",
121
+ category="retrieval",
122
+ signal_group="retrieval_effectiveness",
123
+ ),
124
+ MetricSpec(
125
+ name="summary_score",
126
+ description="Measures summary coverage and conciseness against contexts",
127
+ requires_ground_truth=False,
128
+ requires_embeddings=False,
129
+ source="ragas",
130
+ category="summary",
131
+ signal_group="summary_fidelity",
132
+ ),
133
+ MetricSpec(
134
+ name="summary_faithfulness",
135
+ description="Measures whether summary statements are grounded in contexts",
136
+ requires_ground_truth=False,
137
+ requires_embeddings=False,
138
+ source="ragas",
139
+ category="summary",
140
+ signal_group="summary_fidelity",
141
+ ),
142
+ MetricSpec(
143
+ name="entity_preservation",
144
+ description="Measures preservation of key insurance entities in summaries",
145
+ requires_ground_truth=False,
146
+ requires_embeddings=False,
147
+ source="custom",
148
+ category="summary",
149
+ signal_group="summary_fidelity",
150
+ ),
151
+ MetricSpec(
152
+ name="insurance_term_accuracy",
153
+ description="Measures if insurance terms in answer are grounded in contexts",
154
+ requires_ground_truth=False,
155
+ requires_embeddings=False,
156
+ source="custom",
157
+ category="domain",
158
+ signal_group="groundedness",
159
+ ),
160
+ MetricSpec(
161
+ name="contextual_relevancy",
162
+ description="Measures how well contexts align with the question intent",
163
+ requires_ground_truth=False,
164
+ requires_embeddings=False,
165
+ source="custom",
166
+ category="qa",
167
+ signal_group="retrieval_effectiveness",
168
+ ),
169
+ )
170
+
171
+
172
+ def list_metric_specs() -> list[MetricSpec]:
173
+ return list(_METRIC_SPECS)
174
+
175
+
176
+ def list_metric_names() -> list[str]:
177
+ return [spec.name for spec in _METRIC_SPECS]
178
+
179
+
180
+ def get_metric_descriptions() -> dict[str, str]:
181
+ return {spec.name: spec.description for spec in _METRIC_SPECS}
182
+
183
+
184
+ def get_metric_spec_map() -> dict[str, MetricSpec]:
185
+ return {spec.name: spec for spec in _METRIC_SPECS}
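
The registry is plain data, so simple groupings fall out directly; for example, metric names by category:

from collections import defaultdict

from evalvault.domain.metrics.registry import list_metric_specs

by_category: dict[str, list[str]] = defaultdict(list)
for spec in list_metric_specs():
    by_category[spec.category].append(spec.name)

print(dict(by_category))
# {'qa': [...], 'retrieval': ['mrr', 'ndcg', 'hit_rate'],
#  'summary': [...], 'domain': ['insurance_term_accuracy']}
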
@@ -68,6 +68,118 @@ class PipelineTemplateRegistry:
68
68
  self._templates[AnalysisIntent.BENCHMARK_RETRIEVAL] = (
69
69
  self._create_benchmark_retrieval_template()
70
70
  )
71
+ # 보고서 템플릿
72
+ self._templates[AnalysisIntent.GENERATE_SUMMARY] = self._create_generate_summary_template()
73
+ self._templates[AnalysisIntent.GENERATE_DETAILED] = (
74
+ self._create_generate_detailed_template()
75
+ )
76
+ self._templates[AnalysisIntent.GENERATE_COMPARISON] = (
77
+ self._create_generate_comparison_template()
78
+ )
79
+
80
+ def get_template(self, intent: AnalysisIntent) -> AnalysisPipeline | None:
81
+ """의도에 대응하는 파이프라인 템플릿 조회."""
82
+ return self._templates.get(intent)
83
+
84
+ # =========================================================================
85
+ # Verification Templates
86
+ # =========================================================================
87
+
88
+ def _create_verify_morpheme_template(self) -> AnalysisPipeline:
89
+ """형태소 검증 템플릿."""
90
+ nodes = [
91
+ AnalysisNode(
92
+ id="load_data",
93
+ name="데이터 로드",
94
+ module="data_loader",
95
+ ),
96
+ AnalysisNode(
97
+ id="morpheme_analysis",
98
+ name="형태소 분석",
99
+ module="morpheme_analyzer",
100
+ depends_on=["load_data"],
101
+ ),
102
+ AnalysisNode(
103
+ id="quality_check",
104
+ name="형태소 품질 점검",
105
+ module="morpheme_quality_checker",
106
+ depends_on=["morpheme_analysis"],
107
+ ),
108
+ AnalysisNode(
109
+ id="report",
110
+ name="검증 보고서",
111
+ module="verification_report",
112
+ depends_on=["quality_check"],
113
+ ),
114
+ ]
115
+ return AnalysisPipeline(
116
+ intent=AnalysisIntent.VERIFY_MORPHEME,
117
+ nodes=nodes,
118
+ )
119
+
120
+ def _create_verify_embedding_template(self) -> AnalysisPipeline:
121
+ """임베딩 품질 검증 템플릿."""
122
+ nodes = [
123
+ AnalysisNode(
124
+ id="load_data",
125
+ name="데이터 로드",
126
+ module="data_loader",
127
+ ),
128
+ AnalysisNode(
129
+ id="embedding_analysis",
130
+ name="임베딩 분석",
131
+ module="embedding_analyzer",
132
+ depends_on=["load_data"],
133
+ ),
134
+ AnalysisNode(
135
+ id="quality_check",
136
+ name="임베딩 분포 점검",
137
+ module="embedding_distribution",
138
+ depends_on=["embedding_analysis"],
139
+ ),
140
+ AnalysisNode(
141
+ id="report",
142
+ name="검증 보고서",
143
+ module="verification_report",
144
+ depends_on=["quality_check"],
145
+ ),
146
+ ]
147
+ return AnalysisPipeline(
148
+ intent=AnalysisIntent.VERIFY_EMBEDDING,
149
+ nodes=nodes,
150
+ )
151
+
152
+ def _create_verify_retrieval_template(self) -> AnalysisPipeline:
153
+ """검색 품질 검증 템플릿."""
154
+ nodes = [
155
+ AnalysisNode(
156
+ id="load_data",
157
+ name="데이터 로드",
158
+ module="data_loader",
159
+ ),
160
+ AnalysisNode(
161
+ id="retrieval_analysis",
162
+ name="검색 분석",
163
+ module="retrieval_analyzer",
164
+ depends_on=["load_data"],
165
+ ),
166
+ AnalysisNode(
167
+ id="quality_check",
168
+ name="검색 품질 점검",
169
+ module="retrieval_quality_checker",
170
+ depends_on=["retrieval_analysis"],
171
+ ),
172
+ AnalysisNode(
173
+ id="report",
174
+ name="검증 보고서",
175
+ module="verification_report",
176
+ depends_on=["quality_check"],
177
+ ),
178
+ ]
179
+ return AnalysisPipeline(
180
+ intent=AnalysisIntent.VERIFY_RETRIEVAL,
181
+ nodes=nodes,
182
+ )
71
183
 
72
184
  # =========================================================================
73
185
  # Comparison Templates
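
The three new verification templates follow the same four-node chain (data load, analysis, quality check, verification report), and the report templates are now registered alongside them. A minimal lookup sketch, assuming PipelineTemplateRegistry can be constructed without arguments:

from evalvault.domain.entities.analysis_pipeline import AnalysisIntent
from evalvault.domain.services.pipeline_template_registry import PipelineTemplateRegistry

registry = PipelineTemplateRegistry()  # assumption: no constructor arguments
pipeline = registry.get_template(AnalysisIntent.GENERATE_SUMMARY)
if pipeline is not None:
    print(pipeline.intent, [node.id for node in pipeline.nodes])
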
@@ -188,6 +188,10 @@ class WebUIPort(Protocol):
188
188
  """
189
189
  ...
190
190
 
191
+ def get_metric_specs(self) -> list[dict[str, object]]:
192
+ """메트릭 스펙 목록 조회."""
193
+ ...
194
+
191
195
  def list_stage_events(
192
196
  self,
193
197
  run_id: str,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: evalvault
3
- Version: 1.59.0
3
+ Version: 1.60.0
4
4
  Summary: RAG evaluation system using Ragas with Phoenix/Langfuse tracing
5
5
  Project-URL: Homepage, https://github.com/ntts9990/EvalVault
6
6
  Project-URL: Documentation, https://github.com/ntts9990/EvalVault#readme
@@ -5,23 +5,23 @@ evalvault/mkdocs_helpers.py,sha256=1AKVQ1W2_VO4qclhfyefyU9Dz1Hzkh1DWDwsFMe24jc,3
5
5
  evalvault/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  evalvault/adapters/inbound/__init__.py,sha256=bWSL3styP4BIMeVk04nPI_9rKTln-puBH5lYg6XtnNo,91
7
7
  evalvault/adapters/inbound/api/__init__.py,sha256=LeVVttCA3tLKoHA2PO4z3y8VkfVcf3Bq8CZSzo91lf4,34
8
- evalvault/adapters/inbound/api/adapter.py,sha256=5d4ii_OeXs1aPjK58hh_WwfEEjiEAQf0xznTy9retro,60329
8
+ evalvault/adapters/inbound/api/adapter.py,sha256=6L95Csns-ac_9Q1rbVjYA8G7mu0wb981G5lsbvcqzcI,59820
9
9
  evalvault/adapters/inbound/api/main.py,sha256=KdlAxKn0QfGI3UuoTrBDBbUs2xCvP8lnWOY1ce3svcU,2619
10
10
  evalvault/adapters/inbound/api/routers/__init__.py,sha256=q07_YF9TnBl68bqcRCvhPU4-zRTyvmPoHVehwO6W7QM,19
11
11
  evalvault/adapters/inbound/api/routers/benchmark.py,sha256=yevntbZcNtMvbVODsITUBgR1Ka4pdFQrXBJJ4K4Jyr4,4477
12
12
  evalvault/adapters/inbound/api/routers/config.py,sha256=CN-FH2cn0Ive-BD3WacWY6PFfuMtZEHP5_out3fvST4,3957
13
13
  evalvault/adapters/inbound/api/routers/domain.py,sha256=RsR7GIFMjccDN7vpG1uDyk9n1DnCTH18JDGAX7o4Qqc,3648
14
14
  evalvault/adapters/inbound/api/routers/knowledge.py,sha256=7mgyoUM1PepFb4X8_Ntn0vd7ZZYcNbM3_9nyD10g4Aw,5307
15
- evalvault/adapters/inbound/api/routers/pipeline.py,sha256=tWuXwM-AH_NVDzemtsxbi5Dyn5kYyc1vPFS1sg2TPuw,16655
16
- evalvault/adapters/inbound/api/routers/runs.py,sha256=W3QaSMN3ByqNLynh_uWkMv0_-NvsVKedbuKsEAAoZr0,33160
15
+ evalvault/adapters/inbound/api/routers/pipeline.py,sha256=8UgQzNFHcuqS61s69mOrPee4OMwfxVdvRWHJ2_qYBF0,17175
16
+ evalvault/adapters/inbound/api/routers/runs.py,sha256=Xn0Tj6sbxijdG9-x7rXFiLvKOAzdJ18QSZR0j5VEMYQ,33561
17
17
  evalvault/adapters/inbound/cli/__init__.py,sha256=a42flC5NK-VfbdbBrE49IrUL5zAyKdXZYJVM6E3NTE0,675
18
- evalvault/adapters/inbound/cli/app.py,sha256=Gf_VWXK2aUzVL63F5ulqPd88MgO1n823uISGhGHsdEI,1813
18
+ evalvault/adapters/inbound/cli/app.py,sha256=ytNgHRg9ZTAl33AkB1wIL8RKfQ_Cf8fsy0gSsLTs7Ew,1603
19
19
  evalvault/adapters/inbound/cli/commands/__init__.py,sha256=ciIHbHgP0gtasVi4l5cHjVojERrb-uipga_E0EwCrqM,3431
20
20
  evalvault/adapters/inbound/cli/commands/agent.py,sha256=YlOYMEzzS1aSKDKD_a7UK3St18X6GXGkdTatrzyd8Zc,7555
21
21
  evalvault/adapters/inbound/cli/commands/analyze.py,sha256=aMi1BEDOX3yhN-ppBftDssPQLB5TdzIfpx9U7CZEgWo,48932
22
22
  evalvault/adapters/inbound/cli/commands/api.py,sha256=YdbJ_-QEajnFcjTa7P2heLMjFKpeQ4nWP_p-HvfYkEo,1943
23
23
  evalvault/adapters/inbound/cli/commands/benchmark.py,sha256=RZ4nRTF7d6hDZug-Pw8dGcFEyWdOKclwqkvS-gN4VWo,41097
24
- evalvault/adapters/inbound/cli/commands/config.py,sha256=r3DH2a0-PgJIzpyB7teiykDulhUwUJUkiFWLrbjhF6k,7148
24
+ evalvault/adapters/inbound/cli/commands/config.py,sha256=Mv9IQHBFHZ3I2stUzHDgLDn-Znt_Awdy3j-sk5ruUmw,6069
25
25
  evalvault/adapters/inbound/cli/commands/debug.py,sha256=KU-hL1gLhpjV2ZybDQgGMwRfm-hCynkrqY4UzETfL9k,2234
26
26
  evalvault/adapters/inbound/cli/commands/domain.py,sha256=dL9iqBlnr5mDeS1unXW6uxE0qp6yfnxj-ls6k3EenwI,27279
27
27
  evalvault/adapters/inbound/cli/commands/experiment.py,sha256=jficaFOsZ9EMHrPHCOZjq6jpFrgmqCwmIo--wA_OcvQ,10389
@@ -33,7 +33,7 @@ evalvault/adapters/inbound/cli/commands/kg.py,sha256=ycV9Xj6SUUJLTyTfLZcjXDVLcZq
33
33
  evalvault/adapters/inbound/cli/commands/langfuse.py,sha256=aExhZ5WYT0FzJI4v1sF-a1jqy9b1BF46_HBtfiQjVGI,4085
34
34
  evalvault/adapters/inbound/cli/commands/method.py,sha256=K1UacoKwV9w8sLeQK8qHyTuZqFZrlcj6yS_y2izfRlo,18853
35
35
  evalvault/adapters/inbound/cli/commands/phoenix.py,sha256=LQi3KTLq1ybjjBuz92oQ6lYyBS3mHrCHk0qe-7bqB4U,15611
36
- evalvault/adapters/inbound/cli/commands/pipeline.py,sha256=Hg3A2LGTLw_rjd6ZgT5lOVsTASXIyq2DimUna24FRv0,7936
36
+ evalvault/adapters/inbound/cli/commands/pipeline.py,sha256=NeqWLzO9kRDuZd0pHAIHglP3F7VzoNOU4JI0QcSZ120,7788
37
37
  evalvault/adapters/inbound/cli/commands/prompts.py,sha256=6UwQtKJf3JYhcNI4tQqjjsL-sp_cmu2VV7gETkCcmkk,5490
38
38
  evalvault/adapters/inbound/cli/commands/run.py,sha256=6d_AnONUiroNMF1xZt8O1sbtqb5HcE53ZMAU-UOp1cA,115469
39
39
  evalvault/adapters/inbound/cli/commands/run_helpers.py,sha256=50nYzf4DUniJd7fQgT2cyh_FWVTWZzW0UMXCg-EHBuY,39764
@@ -65,7 +65,7 @@ evalvault/adapters/outbound/analysis/embedding_searcher_module.py,sha256=j6w_jIG
65
65
  evalvault/adapters/outbound/analysis/hybrid_rrf_module.py,sha256=kaHSc7z3Jg_KrRLBqPMTV_9XXsL6v1dmbz-3dDO6IMw,3255
66
66
  evalvault/adapters/outbound/analysis/hybrid_weighted_module.py,sha256=AO-7thmnFGerUDWd8l9ydxeAkHkACo7Raf9O0RfW_nE,3671
67
67
  evalvault/adapters/outbound/analysis/hypothesis_generator_module.py,sha256=tx9fWgS0rBoK5eJPmwK5POoV78yN03hkFmWhCx71Ln0,13337
68
- evalvault/adapters/outbound/analysis/llm_report_module.py,sha256=KjIM2MET6gl9jUpxRo0rDVIzqSXFw-I4y0QoG_TULFA,38773
68
+ evalvault/adapters/outbound/analysis/llm_report_module.py,sha256=RIACcqy7DAgllcr_sea4Ap3rE8NXEJTeAx46GWL7Dq4,47250
69
69
  evalvault/adapters/outbound/analysis/low_performer_extractor_module.py,sha256=Pt0Tmtc5Etqp_3SBDCPAzqWI2EF9woSg0mmBucEHlQw,1291
70
70
  evalvault/adapters/outbound/analysis/model_analyzer_module.py,sha256=28rHdXBXYIFpLHixbbZcv6-j2QVgl3yaGN0vU1Q0gFc,2682
71
71
  evalvault/adapters/outbound/analysis/morpheme_analyzer_module.py,sha256=Hrh4mluMsOhQHPrliD2w0FVKokJpfikXOFKT6sNwk74,4158
@@ -204,11 +204,13 @@ evalvault/domain/entities/rag_trace.py,sha256=sZgnkG4fK6KOe3Np6TYAZ_tPnsRbOmucDS
204
204
  evalvault/domain/entities/result.py,sha256=OaGHMDLWMW2O4fNVuVTUvWFVBQ1iu93OD_oI3NumrCQ,10697
205
205
  evalvault/domain/entities/stage.py,sha256=dbVzhgpP_p2p2eDJBWe7mwyyl6zUTP9kEKN_YRUvufY,7183
206
206
  evalvault/domain/metrics/__init__.py,sha256=fxjC5Z_8OuBIeMn80bYgnZZxpNoay2wH-qtG3NqCUvk,797
207
+ evalvault/domain/metrics/analysis_registry.py,sha256=JZpBrBs7-JExHKYuEML6Vg_uYLm-WniBE3BfiU5OtJg,7641
207
208
  evalvault/domain/metrics/confidence.py,sha256=AX4oeN28OvmMkwD0pT-jskkOlXh87C1pe2W9P1sF69g,17224
208
209
  evalvault/domain/metrics/contextual_relevancy.py,sha256=xAPYUv_0TM4j4WOutOSGftNln_l-2Ev6qpANeu4REv8,11057
209
210
  evalvault/domain/metrics/entity_preservation.py,sha256=uSCbaETceE5PbGn-230Rm8pryOA8jDkkeOwAkWxA65g,6500
210
211
  evalvault/domain/metrics/insurance.py,sha256=5NPeAi_86rpuZRgV4KhzomGrq3Uw2jjglN6FfA_AO8o,4040
211
212
  evalvault/domain/metrics/no_answer.py,sha256=x6vRyOa1jw-qsnw9kOYT8YMPdLElaDRu7zjNCpyJfqM,8237
213
+ evalvault/domain/metrics/registry.py,sha256=QKjo4RNHxCqObGg36xJP3KAHqFpHM50Jy7GeSksdz0Y,5665
212
214
  evalvault/domain/metrics/retrieval_rank.py,sha256=F55ByadJBowyKHKBmKAZ0T0qN_R1_7UNu-MiLnT4Ypg,14675
213
215
  evalvault/domain/metrics/terms_dictionary.json,sha256=-ZQmpx6yMOYoAOpcLj-xK2LkAeCbAw0EUb6-syIOKS0,3801
214
216
  evalvault/domain/metrics/text_match.py,sha256=P-YTZs9ekDqEmxLNBP8eXnMRymPdC8V4dJPtwG2ajVM,10219
@@ -241,7 +243,7 @@ evalvault/domain/services/memory_aware_evaluator.py,sha256=vTiYoxiMfZ_CMjSBjqwkB
241
243
  evalvault/domain/services/memory_based_analysis.py,sha256=oh2irCy3le7fWiTtL31SMEhPyu7fyBVz-giO2hlNifE,4499
242
244
  evalvault/domain/services/method_runner.py,sha256=pABqKZeaALpWZYDfzAbd-VOZt2djQggRNIPuuPQeUSw,3571
243
245
  evalvault/domain/services/pipeline_orchestrator.py,sha256=yriVlEVZYDtt0Vwt4Ae6xyW1H6Dj4Hxdn8XQSvQNSoQ,19436
244
- evalvault/domain/services/pipeline_template_registry.py,sha256=c1rvYsTQU5MdAsmbZ7LlnuF6TD3p4IXlzgq_i18J3f8,24039
246
+ evalvault/domain/services/pipeline_template_registry.py,sha256=j2WQwXrCvYd-dbtxOUTmgTZZAgNtu0eUvqgdryerCbc,27964
245
247
  evalvault/domain/services/prompt_manifest.py,sha256=5s5Kd6-_Dn-xrjjlU99CVo6njsPhvE50H5m_85U-H6U,5612
246
248
  evalvault/domain/services/prompt_registry.py,sha256=81tq__u2fFxTEG8bWnyJ2Qdb9N89jcqIdSfOAKEbEvg,3029
247
249
  evalvault/domain/services/prompt_status.py,sha256=r1dFLGz4SfRxXaxsULQsr0-HpJkG9YfZ_yLIxF1MMBo,6731
@@ -262,7 +264,7 @@ evalvault/ports/inbound/__init__.py,sha256=2Wsc0vNzH8_ZaErk4OHxP93hRonLUkMbn3W28
262
264
  evalvault/ports/inbound/analysis_pipeline_port.py,sha256=RJfKtp22AYEqnmRk6RDawAK52rEmyAhuk0FUPJQUwQU,1758
263
265
  evalvault/ports/inbound/evaluator_port.py,sha256=rDvouIRUjBD7uICgrpeo11vNPvo27_0CdylRHPodPSE,1323
264
266
  evalvault/ports/inbound/learning_hook_port.py,sha256=ehpRyRNUY1PRtzIoaCyDM_QRxp6WjEQvwPskAxI4CPc,3109
265
- evalvault/ports/inbound/web_port.py,sha256=kjDyNXkgRwbevmSnm25URk-qHjGN9K9ML83FAvwhbpM,5448
267
+ evalvault/ports/inbound/web_port.py,sha256=ljggDzHGUfh_H2j86F9upGFwR-ZXIJTunR2ahKMkn-A,5566
266
268
  evalvault/ports/outbound/__init__.py,sha256=jEmLbY3lZ9osue6pG5dc345BdMikBEWq4cnX7ocEul0,3276
267
269
  evalvault/ports/outbound/analysis_cache_port.py,sha256=zPSdUVK_yw3PMWPII2YvS1WLmCGlg5bDScSuYINW9yc,1386
268
270
  evalvault/ports/outbound/analysis_module_port.py,sha256=QYzkvie9-BbONj8ZgiQUjm8I-bn8mgzlXTzIXMhehmQ,1881
@@ -288,8 +290,8 @@ evalvault/reports/__init__.py,sha256=Bb1X4871msAN8I6PM6nKGED3psPwZt88hXZBAOdH06Y
288
290
  evalvault/reports/release_notes.py,sha256=pZj0PBFT-4F_Ty-Kv5P69BuoOnmTCn4kznDcORFJd0w,4011
289
291
  evalvault/scripts/__init__.py,sha256=NwEeIFQbkX4ml2R_PhtIoNtArDSX_suuoymgG_7Kwso,89
290
292
  evalvault/scripts/regression_runner.py,sha256=SxZori5BZ8jVQ057Mf5V5FPgIVDccrV5oRONmnhuk8w,8438
291
- evalvault-1.59.0.dist-info/METADATA,sha256=unwBGPN_vReQ3ohlNQZjMhPy8GBTxDqy1eSPvprX7dk,14058
292
- evalvault-1.59.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
293
- evalvault-1.59.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
294
- evalvault-1.59.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
295
- evalvault-1.59.0.dist-info/RECORD,,
293
+ evalvault-1.60.0.dist-info/METADATA,sha256=STLcsvyERi1Xlx36zsmSzl5dz-skmdRTH-SXHoBK27E,14058
294
+ evalvault-1.60.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
295
+ evalvault-1.60.0.dist-info/entry_points.txt,sha256=Oj9Xc5gYcyUYYNmQfWI8NYGw7nN-3M-h2ipHIMlVn6o,65
296
+ evalvault-1.60.0.dist-info/licenses/LICENSE.md,sha256=3RNWY4jjtrQ_yYa-D-7I3XO12Ti7YzxsLV_dpykujvo,11358
297
+ evalvault-1.60.0.dist-info/RECORD,,