evalvault 1.65.0__py3-none-any.whl → 1.66.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalvault/adapters/inbound/api/adapter.py +14 -0
- evalvault/adapters/inbound/api/main.py +14 -4
- evalvault/adapters/inbound/api/routers/chat.py +543 -0
- evalvault/adapters/inbound/cli/commands/run.py +14 -0
- evalvault/adapters/inbound/cli/commands/run_helpers.py +21 -2
- evalvault/adapters/outbound/report/llm_report_generator.py +13 -1
- evalvault/adapters/outbound/storage/base_sql.py +41 -1
- evalvault/adapters/outbound/tracker/langfuse_adapter.py +1 -0
- evalvault/adapters/outbound/tracker/mlflow_adapter.py +5 -0
- evalvault/adapters/outbound/tracker/phoenix_adapter.py +29 -2
- evalvault/config/settings.py +21 -0
- evalvault/domain/entities/prompt.py +1 -1
- evalvault/domain/metrics/__init__.py +8 -0
- evalvault/domain/metrics/registry.py +39 -3
- evalvault/domain/metrics/summary_accuracy.py +189 -0
- evalvault/domain/metrics/summary_needs_followup.py +45 -0
- evalvault/domain/metrics/summary_non_definitive.py +41 -0
- evalvault/domain/metrics/summary_risk_coverage.py +45 -0
- evalvault/domain/services/custom_metric_snapshot.py +233 -0
- evalvault/domain/services/evaluator.py +280 -27
- evalvault/domain/services/prompt_registry.py +39 -10
- evalvault/domain/services/threshold_profiles.py +4 -0
- evalvault/domain/services/visual_space_service.py +79 -4
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/METADATA +25 -1
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/RECORD +28 -22
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/WHEEL +0 -0
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/entry_points.txt +0 -0
- {evalvault-1.65.0.dist-info → evalvault-1.66.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -499,8 +499,20 @@ SUMMARY_RECOMMENDED_THRESHOLDS = {
     "summary_faithfulness": 0.90,
     "summary_score": 0.85,
     "entity_preservation": 0.90,
+    "summary_accuracy": 0.90,
+    "summary_risk_coverage": 0.90,
+    "summary_non_definitive": 0.80,
+    "summary_needs_followup": 0.80,
 }
-SUMMARY_METRIC_ORDER = (
+SUMMARY_METRIC_ORDER = (
+    "summary_faithfulness",
+    "summary_score",
+    "entity_preservation",
+    "summary_accuracy",
+    "summary_risk_coverage",
+    "summary_non_definitive",
+    "summary_needs_followup",
+)


 @dataclass
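A note on how the two constants above relate: every name in SUMMARY_METRIC_ORDER has an entry in SUMMARY_RECOMMENDED_THRESHOLDS, so the order can drive threshold-aware rendering. A minimal sketch, assuming both constants are imported from the patched module (not named in this excerpt); the `scores` dict is illustrative:

```python
# Illustrative only: pair each ordered summary metric with its recommended threshold.
scores = {"summary_accuracy": 0.93, "summary_non_definitive": 0.70}  # hypothetical run output

for name in SUMMARY_METRIC_ORDER:
    threshold = SUMMARY_RECOMMENDED_THRESHOLDS[name]  # every ordered name has a threshold above
    if name in scores:
        verdict = "pass" if scores[name] >= threshold else "fail"
        print(f"{name}: {scores[name]:.2f} ({verdict} @ {threshold})")
```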
evalvault/adapters/outbound/storage/base_sql.py
CHANGED
@@ -664,6 +664,8 @@ class BaseSQLStorageAdapter(ABC):
     def export_run_to_excel(self, run_id: str, output_path) -> Path:
         from openpyxl import Workbook

+        from evalvault.domain.metrics.registry import get_metric_spec_map
+
         output = Path(output_path)
         output.parent.mkdir(parents=True, exist_ok=True)

@@ -837,6 +839,23 @@ class BaseSQLStorageAdapter(ABC):

         summary_rows: list[dict[str, Any]] = []
         run_payload = run_rows[0] if run_rows else {}
+        custom_metric_rows: list[dict[str, Any]] = []
+        run_metadata = self._deserialize_json(run_payload.get("metadata")) if run_payload else None
+        if isinstance(run_metadata, dict):
+            custom_snapshot = run_metadata.get("custom_metric_snapshot")
+            if isinstance(custom_snapshot, dict):
+                entries = custom_snapshot.get("metrics")
+                if isinstance(entries, list):
+                    for entry in entries:
+                        if isinstance(entry, dict):
+                            row = dict(entry)
+                            row["schema_version"] = custom_snapshot.get("schema_version")
+                            custom_metric_rows.append(row)
+        if custom_metric_rows:
+            custom_metric_rows = self._normalize_rows(
+                custom_metric_rows,
+                json_columns={"inputs", "rules"},
+            )
         prompt_set_id = None
         prompt_set_name = None
         if run_prompt_payloads:
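The reads above imply the snapshot layout stored in the run's metadata. A sketch of the assumed shape (inferred from this export logic and the CustomMetrics sheet columns later in this diff; values are illustrative, not an authoritative schema):

```python
# Inferred from the export logic above; only "schema_version" and "metrics" are read here.
run_metadata = {
    "custom_metric_snapshot": {
        "schema_version": 1,
        "metrics": [
            {
                "metric_name": "summary_accuracy",
                "source": "custom",
                "inputs": ["answer", "contexts"],  # serialized to JSON by _normalize_rows
                "rules": ["entity grounding"],     # serialized to JSON by _normalize_rows
            },
        ],
    },
}
```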
@@ -878,14 +897,17 @@
                 if isinstance(threshold, (int, float)) and score >= threshold:
                     entry["pass_count"] += 1

+        metric_spec_map = get_metric_spec_map()
         for entry in metrics_index.values():
             count = entry["count"] or 0
+            spec = metric_spec_map.get(entry["metric_name"])
             metric_summary_rows.append(
                 {
                     "metric_name": entry["metric_name"],
                     "avg_score": (entry["score_sum"] / count) if count else None,
                     "pass_rate": (entry["pass_count"] / count) if count else None,
                     "samples": count,
+                    "source": spec.source if spec else None,
                 }
             )

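`get_metric_spec_map` is used purely as a name-to-spec lookup so each summary row can carry its metric's source. A minimal usage sketch, assuming it returns a plain `dict[str, MetricSpec]` built from the registry entries shown later in this diff:

```python
from evalvault.domain.metrics.registry import get_metric_spec_map

spec_map = get_metric_spec_map()
spec = spec_map.get("summary_accuracy")
print(spec.source if spec else None)  # "custom", per the registry hunk below
```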
@@ -956,7 +978,25 @@
             (
                 "MetricsSummary",
                 metric_summary_rows,
-                ["metric_name", "avg_score", "pass_rate", "samples"],
+                ["metric_name", "avg_score", "pass_rate", "samples", "source"],
+            ),
+            (
+                "CustomMetrics",
+                custom_metric_rows,
+                [
+                    "schema_version",
+                    "metric_name",
+                    "source",
+                    "description",
+                    "evaluation_method",
+                    "inputs",
+                    "output",
+                    "evaluation_process",
+                    "rules",
+                    "notes",
+                    "implementation_path",
+                    "implementation_hash",
+                ],
             ),
             (
                 "RunPromptSets",
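Each `(title, rows, columns)` tuple above presumably becomes one worksheet in the exported workbook. The writer itself is outside this diff; a hypothetical sketch of that step with openpyxl:

```python
from openpyxl import Workbook


def write_sheet(wb: Workbook, title: str, rows: list[dict], columns: list[str]) -> None:
    """Hypothetical helper: one header row, then one row per dict, in column order."""
    ws = wb.create_sheet(title)
    ws.append(columns)
    for row in rows:
        ws.append([row.get(col) for col in columns])
```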
evalvault/adapters/outbound/tracker/langfuse_adapter.py
CHANGED
@@ -360,6 +360,7 @@ class LangfuseAdapter(TrackerPort):
             "summary": trace_output["summary"],
             "metrics": metric_summary,
             "phoenix_links": phoenix_links or {},
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
             "test_cases": [
                 {
                     "test_case_id": result.test_case_id,
evalvault/adapters/outbound/tracker/mlflow_adapter.py
CHANGED
@@ -220,6 +220,11 @@ class MLflowAdapter(TrackerPort):
             results_data.append(result_dict)

         self.save_artifact(trace_id, "test_results", results_data)
+        self.save_artifact(
+            trace_id,
+            "custom_metric_snapshot",
+            (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+        )

         # 6. End MLflow run
         self.end_trace(trace_id)
evalvault/adapters/outbound/tracker/phoenix_adapter.py
CHANGED
@@ -352,13 +352,40 @@ class PhoenixAdapter(TrackerPort):
                 "version": run.dataset_version,
                 "total_test_cases": run.total_test_cases,
             },
+            "evaluation_config": {
+                "model": run.model_name,
+                "metrics": run.metrics_evaluated,
+                "thresholds": run.thresholds,
+            },
             "summary": {
-                "
+                "total_test_cases": run.total_test_cases,
+                "passed": run.passed_test_cases,
+                "failed": run.total_test_cases - run.passed_test_cases,
+                "pass_rate": round(run.pass_rate, 4),
+                "duration_seconds": round(run.duration_seconds, 2)
+                if run.duration_seconds
+                else None,
                 "total_tokens": run.total_tokens,
-                "duration_seconds": run.duration_seconds,
             },
             "metrics": metric_summary,
+            "custom_metrics": (run.tracker_metadata or {}).get("custom_metric_snapshot"),
+            "test_cases": [
+                {
+                    "test_case_id": result.test_case_id,
+                    "all_passed": result.all_passed,
+                    "metrics": {
+                        metric.name: {
+                            "score": metric.score,
+                            "threshold": metric.threshold,
+                            "passed": metric.passed,
+                        }
+                        for metric in result.metrics
+                    },
+                }
+                for result in run.results
+            ],
         }
+
         self.save_artifact(trace_id, "ragas_evaluation", structured_artifact)

         # End the trace
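Assembled, the `structured_artifact` saved above looks roughly like this (all values are illustrative, and keys unchanged by this hunk are omitted):

```python
structured_artifact = {
    "evaluation_config": {
        "model": "gpt-4o-mini",  # illustrative
        "metrics": ["summary_accuracy"],
        "thresholds": {"summary_accuracy": 0.90},
    },
    "summary": {
        "total_test_cases": 10,
        "passed": 9,
        "failed": 1,
        "pass_rate": 0.9,
        "duration_seconds": 12.34,
        "total_tokens": 5432,
    },
    "metrics": {},           # metric_summary (shape not shown in this excerpt)
    "custom_metrics": None,  # or the custom_metric_snapshot dict, when present
    "test_cases": [
        {
            "test_case_id": "tc-1",
            "all_passed": True,
            "metrics": {"summary_accuracy": {"score": 0.95, "threshold": 0.90, "passed": True}},
        },
    ],
}
```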
evalvault/config/settings.py
CHANGED
@@ -321,6 +321,27 @@ class Settings(BaseSettings):
         default="https://cloud.langfuse.com", description="Langfuse host URL"
     )

+    mcp_enabled: bool = Field(
+        default=False,
+        description="Enable MCP JSON-RPC endpoint over HTTP.",
+    )
+    mcp_protocol_version: str = Field(
+        default="2025-11-25",
+        description="MCP protocol version to advertise.",
+    )
+    mcp_server_version: str = Field(
+        default="0.1.0",
+        description="EvalVault MCP server version.",
+    )
+    mcp_auth_tokens: str | None = Field(
+        default=None,
+        description="Comma-separated bearer tokens for MCP endpoint (required).",
+    )
+    mcp_allowed_tools: str | None = Field(
+        default=None,
+        description="Comma-separated allowlist of MCP tool names.",
+    )
+
     # MLflow Configuration (optional)
     mlflow_tracking_uri: str | None = Field(default=None, description="MLflow tracking server URI")
     mlflow_experiment_name: str = Field(default="evalvault", description="MLflow experiment name")
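Because `Settings` extends pydantic's `BaseSettings`, these fields can presumably be supplied from the environment. A sketch assuming the default field-name-to-variable mapping (no custom `env_prefix`, which this excerpt does not show); the tool names are hypothetical:

```python
import os

os.environ["MCP_ENABLED"] = "true"
os.environ["MCP_AUTH_TOKENS"] = "token-one,token-two"  # comma-separated, required for access
os.environ["MCP_ALLOWED_TOOLS"] = "list_runs,get_run"  # hypothetical tool names
```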
evalvault/domain/metrics/__init__.py
CHANGED
@@ -6,6 +6,10 @@ from evalvault.domain.metrics.entity_preservation import EntityPreservation
 from evalvault.domain.metrics.insurance import InsuranceTermAccuracy
 from evalvault.domain.metrics.no_answer import NoAnswerAccuracy, is_no_answer
 from evalvault.domain.metrics.retrieval_rank import MRR, NDCG, HitRate
+from evalvault.domain.metrics.summary_accuracy import SummaryAccuracy
+from evalvault.domain.metrics.summary_needs_followup import SummaryNeedsFollowup
+from evalvault.domain.metrics.summary_non_definitive import SummaryNonDefinitive
+from evalvault.domain.metrics.summary_risk_coverage import SummaryRiskCoverage
 from evalvault.domain.metrics.text_match import ExactMatch, F1Score

 __all__ = [
@@ -19,5 +23,9 @@ __all__ = [
     "MRR",
     "NDCG",
     "NoAnswerAccuracy",
+    "SummaryAccuracy",
+    "SummaryNeedsFollowup",
+    "SummaryNonDefinitive",
+    "SummaryRiskCoverage",
     "is_no_answer",
 ]
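With these exports in place, the new metrics are importable from the package's metrics namespace:

```python
from evalvault.domain.metrics import (
    SummaryAccuracy,
    SummaryNeedsFollowup,
    SummaryNonDefinitive,
    SummaryRiskCoverage,
)
```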
evalvault/domain/metrics/registry.py
CHANGED
@@ -123,7 +123,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
         name="summary_score",
-        description="Measures summary coverage and conciseness against contexts",
+        description="(LLM) Measures summary coverage and conciseness against contexts",
         requires_ground_truth=False,
         requires_embeddings=False,
         source="ragas",
@@ -132,7 +132,7 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
         name="summary_faithfulness",
-        description="Measures whether summary statements are grounded in contexts",
+        description="(LLM) Measures whether summary statements are grounded in contexts",
         requires_ground_truth=False,
         requires_embeddings=False,
         source="ragas",
@@ -141,7 +141,43 @@ _METRIC_SPECS: tuple[MetricSpec, ...] = (
     ),
     MetricSpec(
         name="entity_preservation",
-        description="Measures preservation of key insurance entities in summaries",
+        description="(Rule) Measures preservation of key insurance entities in summaries",
+        requires_ground_truth=False,
+        requires_embeddings=False,
+        source="custom",
+        category="summary",
+        signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+        name="summary_accuracy",
+        description="(Rule) Measures whether summary entities are grounded in contexts",
+        requires_ground_truth=False,
+        requires_embeddings=False,
+        source="custom",
+        category="summary",
+        signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+        name="summary_risk_coverage",
+        description="(Rule) Measures coverage of expected insurance risk tags in summaries",
+        requires_ground_truth=False,
+        requires_embeddings=False,
+        source="custom",
+        category="summary",
+        signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+        name="summary_non_definitive",
+        description="(Rule) Measures avoidance of definitive claims in summaries",
+        requires_ground_truth=False,
+        requires_embeddings=False,
+        source="custom",
+        category="summary",
+        signal_group="summary_fidelity",
+    ),
+    MetricSpec(
+        name="summary_needs_followup",
+        description="(Rule) Measures follow-up guidance when required",
         requires_ground_truth=False,
         requires_embeddings=False,
         source="custom",
evalvault/domain/metrics/summary_accuracy.py
ADDED
@@ -0,0 +1,189 @@
+from __future__ import annotations
+
+import re
+from decimal import Decimal, InvalidOperation
+
+
+class SummaryAccuracy:
+    """Measure whether summary entities are supported by contexts."""
+
+    name = "summary_accuracy"
+
+    _PERCENT_RE = re.compile(r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>%|퍼센트|percent)", re.I)
+    _CURRENCY_RE = re.compile(
+        r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>원|만원|억원|달러|usd|krw|won)",
+        re.I,
+    )
+    _CURRENCY_PREFIX_RE = re.compile(r"(?P<unit>[$₩])\s*(?P<number>\d+(?:[.,]\d+)?)")
+    _DURATION_RE = re.compile(
+        r"(?P<number>\d+(?:[.,]\d+)?)\s*(?P<unit>년|개월|월|일|years?|months?|days?)",
+        re.I,
+    )
+    _DATE_RE = re.compile(r"\b\d{4}[./-]\d{1,2}[./-]\d{1,2}\b")
+
+    _CURRENCY_MULTIPLIERS = {"만원": Decimal("10000"), "억원": Decimal("100000000")}
+    _KRW_UNITS = {"원", "krw", "won", "₩", "만원", "억원"}
+    _USD_UNITS = {"달러", "usd", "$"}
+    _DURATION_UNITS = {
+        "년": "year",
+        "year": "year",
+        "years": "year",
+        "개월": "month",
+        "월": "month",
+        "month": "month",
+        "months": "month",
+        "일": "day",
+        "day": "day",
+        "days": "day",
+    }
+
+    _KEYWORDS_KO = (
+        "면책",
+        "제외",
+        "단서",
+        "다만",
+        "조건",
+        "자기부담",
+        "한도",
+        "감액",
+    )
+    _KEYWORDS_EN = (
+        "exclusion",
+        "excluded",
+        "exception",
+        "except",
+        "condition",
+        "deductible",
+        "limit",
+        "cap",
+        "waiting period",
+        "co-pay",
+        "copay",
+        "co-insurance",
+        "coinsurance",
+    )
+
+    def score(self, answer: str, contexts: list[str]) -> float:
+        if not contexts:
+            return 0.0
+
+        context_text = " ".join([ctx for ctx in contexts if ctx])
+        context_entities = self._extract_entities(context_text)
+        summary_entities = self._extract_entities(answer or "")
+
+        if not summary_entities:
+            return 0.5 if context_entities else 0.0
+        if not context_entities:
+            return 0.0
+
+        supported = summary_entities.intersection(context_entities)
+        return len(supported) / len(summary_entities)
+
+    def _extract_entities(self, text: str) -> set[str]:
+        entities = set()
+        entities.update(self._extract_numeric_entities(text))
+        entities.update(self._extract_keyword_entities(text))
+        return entities
+
+    def _extract_numeric_entities(self, text: str) -> set[str]:
+        entities: set[str] = set()
+
+        for match in self._PERCENT_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            if number:
+                entities.add(f"percent:{number}")
+
+        for match in self._CURRENCY_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit").lower()
+            normalized = self._normalize_currency(number, unit)
+            if normalized:
+                entities.add(f"currency:{normalized}")
+
+        for match in self._CURRENCY_PREFIX_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit")
+            normalized = self._normalize_currency(number, unit)
+            if normalized:
+                entities.add(f"currency:{normalized}")
+
+        for match in self._DURATION_RE.finditer(text):
+            number = self._normalize_number(match.group("number"))
+            unit = match.group("unit").lower()
+            normalized = self._normalize_duration(number, unit)
+            if normalized:
+                entities.add(f"duration:{normalized}")
+
+        for match in self._DATE_RE.finditer(text):
+            entities.add(f"date:{self._normalize_date(match.group(0))}")
+
+        return entities
+
+    def _extract_keyword_entities(self, text: str) -> set[str]:
+        entities: set[str] = set()
+        lower = text.lower()
+
+        for keyword in self._KEYWORDS_KO:
+            if keyword in text:
+                entities.add(f"kw:{keyword}")
+
+        for keyword in self._KEYWORDS_EN:
+            if keyword in lower:
+                entities.add(f"kw:{keyword}")
+
+        return entities
+
+    def _normalize_currency(self, number: str | None, unit: str) -> str | None:
+        if number is None:
+            return None
+        try:
+            value = Decimal(number)
+        except InvalidOperation:
+            return None
+
+        unit_key = unit.lower()
+        multiplier = self._CURRENCY_MULTIPLIERS.get(unit_key)
+        if multiplier:
+            value *= multiplier
+
+        if unit_key in self._KRW_UNITS:
+            currency = "krw"
+        elif unit_key in self._USD_UNITS:
+            currency = "usd"
+        else:
+            currency = unit_key
+
+        return f"{currency}:{self._format_decimal(value)}"
+
+    def _normalize_duration(self, number: str | None, unit: str) -> str | None:
+        if number is None:
+            return None
+        try:
+            value = Decimal(number)
+        except InvalidOperation:
+            return None
+        base_unit = self._DURATION_UNITS.get(unit, unit)
+        return f"{self._format_decimal(value)}{base_unit}"
+
+    @staticmethod
+    def _normalize_date(raw: str) -> str:
+        return re.sub(r"[./-]", "", raw)
+
+    @staticmethod
+    def _normalize_number(raw: str | None) -> str | None:
+        if raw is None:
+            return None
+        cleaned = raw.replace(",", "").strip()
+        if not cleaned:
+            return None
+        try:
+            value = Decimal(cleaned)
+        except InvalidOperation:
+            return None
+        return SummaryAccuracy._format_decimal(value)
+
+    @staticmethod
+    def _format_decimal(value: Decimal) -> str:
+        if value == value.to_integral_value():
+            return str(value.to_integral_value())
+        return format(value.normalize(), "f").rstrip("0").rstrip(".")
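A worked example of the new metric. Per the extraction rules above, "1,000만원" normalizes to `currency:krw:10000000`, "20%" to `percent:20`, and "한도" / "자기부담" are keyword entities:

```python
metric = SummaryAccuracy()
contexts = ["이 특약의 보장 한도는 1,000만원이고 자기부담 비율은 20%입니다."]

# All four summary entities appear in the contexts -> 1.0
print(metric.score("보장 한도는 1,000만원, 자기부담은 20%입니다.", contexts))  # 1.0

# "2,000만원" is not grounded -> 3 of 4 entities supported -> 0.75
print(metric.score("보장 한도는 2,000만원, 자기부담은 20%입니다.", contexts))  # 0.75
```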
evalvault/domain/metrics/summary_needs_followup.py
ADDED
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+
+class SummaryNeedsFollowup:
+    """Check if follow-up guidance appears when required."""
+
+    name = "summary_needs_followup"
+
+    _FOLLOWUP_KEYWORDS = [
+        "확인 필요",
+        "추가 확인",
+        "담당자 확인",
+        "재문의",
+        "추가 문의",
+        "서류 확인",
+        "follow up",
+        "follow-up",
+    ]
+
+    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
+        text = answer or ""
+        has_followup = self._has_followup(text)
+        expected = self._expects_followup(metadata)
+
+        if expected:
+            return 1.0 if has_followup else 0.0
+        return 1.0 if not has_followup else 0.0
+
+    def _expects_followup(self, metadata: dict | None) -> bool:
+        if not metadata:
+            return False
+        raw = metadata.get("summary_tags")
+        if not raw:
+            return False
+        if isinstance(raw, list):
+            tags = [str(item).strip().lower() for item in raw if str(item).strip()]
+        else:
+            tags = [str(raw).strip().lower()]
+        return "needs_followup" in tags
+
+    def _has_followup(self, text: str) -> bool:
+        lowered = text.lower()
+        return any(
+            keyword in text or keyword.lower() in lowered for keyword in self._FOLLOWUP_KEYWORDS
+        )
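Usage sketch for the symmetric pass/fail logic above; the `summary_tags` metadata drives whether follow-up guidance is expected:

```python
metric = SummaryNeedsFollowup()
tagged = {"summary_tags": ["needs_followup"]}

print(metric.score("정확한 보장 여부는 담당자 확인이 필요합니다.", [], tagged))  # 1.0 ("담당자 확인" present)
print(metric.score("해당 치료비는 보장 대상입니다.", [], tagged))               # 0.0 (guidance missing)
print(metric.score("해당 치료비는 보장 대상입니다.", [], {}))                   # 1.0 (no tag, none expected)
```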
evalvault/domain/metrics/summary_non_definitive.py
ADDED
@@ -0,0 +1,41 @@
+from __future__ import annotations
+
+import re
+
+
+class SummaryNonDefinitive:
+    """Penalize definitive statements in summaries."""
+
+    name = "summary_non_definitive"
+
+    _DEFINITIVE_PATTERNS_KO = [
+        r"무조건",
+        r"반드시",
+        r"100%",
+        r"전액\s*지급",
+        r"확실히",
+        r"분명히",
+        r"절대",
+        r"항상",
+    ]
+    _DEFINITIVE_PATTERNS_EN = [
+        r"always",
+        r"guaranteed",
+        r"definitely",
+        r"certainly",
+        r"absolutely",
+        r"100%",
+    ]
+
+    def score(self, answer: str, contexts: list[str]) -> float:
+        text = answer or ""
+        if self._has_definitive_pattern(text):
+            return 0.0
+        return 1.0
+
+    def _has_definitive_pattern(self, text: str) -> bool:
+        for pattern in self._DEFINITIVE_PATTERNS_KO:
+            if re.search(pattern, text):
+                return True
+        lowered = text.lower()
+        return any(re.search(pattern, lowered) for pattern in self._DEFINITIVE_PATTERNS_EN)
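Usage sketch; any single definitive pattern zeroes the score:

```python
metric = SummaryNonDefinitive()

print(metric.score("이 특약은 무조건 전액 지급됩니다.", []))           # 0.0 ("무조건", "전액 지급")
print(metric.score("It is always covered at 100%.", []))              # 0.0 ("always", "100%")
print(metric.score("약관상 조건을 충족하면 지급될 수 있습니다.", []))   # 1.0 (hedged phrasing)
```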
evalvault/domain/metrics/summary_risk_coverage.py
ADDED
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+
+class SummaryRiskCoverage:
+    """Measure coverage of expected insurance risk tags in summary."""
+
+    name = "summary_risk_coverage"
+
+    _TAG_KEYWORDS = {
+        "exclusion": ["면책", "보장 제외", "지급 불가", "exclusion"],
+        "deductible": ["자기부담", "본인부담금", "deductible", "copay"],
+        "limit": ["한도", "상한", "최대", "limit", "cap"],
+        "waiting_period": ["면책기간", "대기기간", "waiting period"],
+        "condition": ["조건", "단서", "다만", "condition"],
+        "documents_required": ["서류", "진단서", "영수증", "documents"],
+        "needs_followup": ["확인 필요", "추가 확인", "담당자 확인", "재문의", "follow up"],
+    }
+
+    def score(self, answer: str, contexts: list[str], metadata: dict | None = None) -> float:
+        expected_tags = self._extract_expected_tags(metadata)
+        if not expected_tags:
+            return 1.0
+
+        text = answer or ""
+        covered = 0
+        for tag in expected_tags:
+            if self._has_tag_keyword(text, tag):
+                covered += 1
+
+        return covered / len(expected_tags)
+
+    def _extract_expected_tags(self, metadata: dict | None) -> list[str]:
+        if not metadata:
+            return []
+        raw = metadata.get("summary_tags")
+        if not raw:
+            return []
+        if isinstance(raw, list):
+            return [str(item).strip().lower() for item in raw if str(item).strip()]
+        return [str(raw).strip().lower()]
+
+    def _has_tag_keyword(self, text: str, tag: str) -> bool:
+        keywords = self._TAG_KEYWORDS.get(tag, [])
+        lowered = text.lower()
+        return any(keyword in text or keyword.lower() in lowered for keyword in keywords)
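Usage sketch; the score is the fraction of expected tags whose keywords appear in the summary:

```python
metric = SummaryRiskCoverage()
metadata = {"summary_tags": ["exclusion", "limit", "deductible"]}

# "면책" covers exclusion and "한도" covers limit; deductible is never mentioned -> 2/3
score = metric.score("면책 사유에 해당하면 지급되지 않으며, 보장 한도가 적용됩니다.", [], metadata)
print(round(score, 3))  # 0.667

print(metric.score("요약 내용", [], {}))  # 1.0 (no expected tags -> vacuously covered)
```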