PyPI - evalgate-sdk - Versions diffs - 3.3.1__py3-none-any.whl - Mend

evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (80) hide show

evalgate_sdk/__init__.py +707 -0
evalgate_sdk/_version.py +3 -0
evalgate_sdk/assertions.py +1362 -0
evalgate_sdk/auto.py +247 -0
evalgate_sdk/batch.py +174 -0
evalgate_sdk/cache.py +111 -0
evalgate_sdk/ci_context.py +123 -0
evalgate_sdk/cli/__init__.py +111 -0
evalgate_sdk/cli/api.py +261 -0
evalgate_sdk/cli/cli_constants.py +20 -0
evalgate_sdk/cli/commands.py +1041 -0
evalgate_sdk/cli/config.py +228 -0
evalgate_sdk/cli/env.py +43 -0
evalgate_sdk/cli/formatters/types.py +132 -0
evalgate_sdk/cli/golden_commands.py +322 -0
evalgate_sdk/cli/manifest.py +301 -0
evalgate_sdk/cli/new_commands.py +435 -0
evalgate_sdk/cli/policy_packs.py +103 -0
evalgate_sdk/cli/profiles.py +12 -0
evalgate_sdk/cli/regression_gate.py +312 -0
evalgate_sdk/cli/render/__init__.py +1 -0
evalgate_sdk/cli/render/snippet.py +18 -0
evalgate_sdk/cli/render/sort.py +29 -0
evalgate_sdk/cli/report/__init__.py +1 -0
evalgate_sdk/cli/report/build_check_report.py +209 -0
evalgate_sdk/cli/traces.py +186 -0
evalgate_sdk/cli/workspace.py +63 -0
evalgate_sdk/client.py +609 -0
evalgate_sdk/cluster.py +359 -0
evalgate_sdk/collector.py +161 -0
evalgate_sdk/constants.py +6 -0
evalgate_sdk/context.py +151 -0
evalgate_sdk/errors.py +236 -0
evalgate_sdk/export.py +238 -0
evalgate_sdk/formatters/__init__.py +11 -0
evalgate_sdk/formatters/github.py +51 -0
evalgate_sdk/formatters/human.py +68 -0
evalgate_sdk/formatters/json_fmt.py +11 -0
evalgate_sdk/formatters/pr_comment.py +80 -0
evalgate_sdk/golden.py +426 -0
evalgate_sdk/integrations/__init__.py +1 -0
evalgate_sdk/integrations/anthropic.py +99 -0
evalgate_sdk/integrations/autogen.py +62 -0
evalgate_sdk/integrations/crewai.py +61 -0
evalgate_sdk/integrations/langchain.py +100 -0
evalgate_sdk/integrations/openai.py +155 -0
evalgate_sdk/integrations/openai_eval.py +221 -0
evalgate_sdk/local.py +144 -0
evalgate_sdk/logger.py +123 -0
evalgate_sdk/matchers.py +62 -0
evalgate_sdk/otel.py +256 -0
evalgate_sdk/pagination.py +145 -0
evalgate_sdk/py.typed +0 -0
evalgate_sdk/pytest_plugin.py +96 -0
evalgate_sdk/reason_codes.py +103 -0
evalgate_sdk/regression.py +196 -0
evalgate_sdk/replay_decision.py +115 -0
evalgate_sdk/runtime/__init__.py +50 -0
evalgate_sdk/runtime/adapters/__init__.py +1 -0
evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
evalgate_sdk/runtime/context.py +68 -0
evalgate_sdk/runtime/eval.py +318 -0
evalgate_sdk/runtime/execution_mode.py +170 -0
evalgate_sdk/runtime/executor.py +92 -0
evalgate_sdk/runtime/registry.py +125 -0
evalgate_sdk/runtime/run_report.py +249 -0
evalgate_sdk/runtime/types.py +143 -0
evalgate_sdk/snapshot.py +219 -0
evalgate_sdk/streaming.py +124 -0
evalgate_sdk/synthesize.py +226 -0
evalgate_sdk/testing.py +128 -0
evalgate_sdk/types.py +666 -0
evalgate_sdk/utils/__init__.py +1 -0
evalgate_sdk/utils/input_hash.py +42 -0
evalgate_sdk/workflows.py +264 -0
evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0

evalgate_sdk/cli/config.py ADDED Viewed

@@ -0,0 +1,228 @@
+"""EvalGate config loader.
+Discovery: evalgate.config.json → evalai.config.json → pyproject.toml [tool.evalgate] / [tool.evalai].
+Port of ``cli/config.ts``.
+"""
+from __future__ import annotations
+import json
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Literal
+from evalgate_sdk.cli.profiles import PROFILES
+# Type-level set of profile names. Presumably mirrors the keys of PROFILES
+# (imported above) — confirm against cli/profiles.py.
+ProfileName = Literal["strict", "balanced", "fast"]
+# JSON config file names probed in every directory by find_config_path, in
+# priority order: the current "evalgate" name first, then the older "evalai" name.
+CONFIG_FILES = [
+    "evalgate.config.json",
+    "evalai.config.json",
+]
+@dataclass
+class EvalGateConfig:
+    """Loaded configuration.
+    Every field defaults to ``None``, i.e. "not set". Instances are built by
+    ``_dict_to_config``, which accepts both camelCase and snake_case keys for
+    the multi-word fields.
+    """
+    evaluation_id: str | None = None
+    # Loaded from the file but not copied into the merged CLI options by
+    # merge_config_with_args.
+    api_key: str | None = None
+    base_url: str | None = None
+    # Gate settings. When unset here and on the command line they may be
+    # filled from the selected profile (see merge_config_with_args).
+    min_score: int | None = None
+    min_n: int | None = None
+    max_drop: int | None = None
+    warn_drop: int | None = None
+    allow_weak_evidence: bool | None = None
+    baseline: str | None = None  # "published" | "previous" | "production" | "auto"
+    # Profile name looked up in PROFILES by merge_config_with_args.
+    profile: str | None = None
+    # Monorepo overrides: maps a path relative to the config file's directory
+    # to a dict of override keys (see load_config).
+    packages: dict[str, Any] | None = None
+# Deprecated alias — remove in v4
+EvalAIConfig = EvalGateConfig
+def find_config_path(cwd: str | None = None) -> str | None:
+    """Find config file path in directory, walking up to root."""
+    directory = os.path.abspath(cwd or os.getcwd())
+    root = os.path.splitdrive(directory)[0] + os.sep
+    while True:
+        for name in CONFIG_FILES:
+            candidate = os.path.join(directory, name)
+            if os.path.isfile(candidate):
+                return candidate
+        # Check pyproject.toml for [tool.evalgate] or [tool.evalai]
+        pyproject = os.path.join(directory, "pyproject.toml")
+        if os.path.isfile(pyproject):
+            try:
+                text = Path(pyproject).read_text(encoding="utf-8")
+                if "[tool.evalgate]" in text or "[tool.evalai]" in text:
+                    return pyproject
+            except OSError:
+                pass
+        parent = os.path.dirname(directory)
+        if parent == directory or directory == root:
+            break
+        directory = parent
+    return None
+def load_config(cwd: str | None = None) -> EvalAIConfig | None:
+    """Load config from file system."""
+    config_path = find_config_path(cwd)
+    if not config_path:
+        return None
+    try:
+        if config_path.endswith("pyproject.toml"):
+            return _load_from_pyproject(config_path, cwd)
+        with open(config_path, encoding="utf-8") as f:
+            data = json.load(f)
+        config = _dict_to_config(data)
+        # Monorepo package resolution
+        if config.packages and cwd:
+            config_dir = os.path.dirname(config_path)
+            rel = os.path.relpath(os.path.abspath(cwd), config_dir).replace("\\", "/")
+            pkg_config = config.packages.get(rel)
+            if pkg_config:
+                merged = _dict_to_config({**_config_to_dict(config), **pkg_config})
+                merged.packages = config.packages
+                return merged
+            for key, val in config.packages.items():
+                if rel == key or rel.startswith(f"{key}/"):
+                    merged = _dict_to_config({**_config_to_dict(config), **val})
+                    merged.packages = config.packages
+                    return merged
+        return config
+    except Exception as exc:
+        import warnings
+        warnings.warn(f"[EvalGate] Failed to load config from {config_path}: {exc}", stacklevel=2)
+        return None
+def merge_config_with_args(
+    config: EvalAIConfig | None,
+    args: dict[str, Any],
+) -> dict[str, Any]:
+    """Merge config with CLI args. Priority: args > profile > config > defaults."""
+    merged: dict[str, Any] = {}
+    if config:
+        if config.evaluation_id:
+            merged["evaluation_id"] = config.evaluation_id
+        if config.base_url:
+            merged["base_url"] = config.base_url
+        if config.min_score is not None:
+            merged["min_score"] = config.min_score
+        if config.min_n is not None:
+            merged["min_n"] = config.min_n
+        if config.max_drop is not None:
+            merged["max_drop"] = config.max_drop
+        if config.warn_drop is not None:
+            merged["warn_drop"] = config.warn_drop
+        if config.allow_weak_evidence is not None:
+            merged["allow_weak_evidence"] = config.allow_weak_evidence
+        if config.baseline:
+            merged["baseline"] = config.baseline
+        if config.profile:
+            merged["profile"] = config.profile
+    # Profile defaults
+    profile_name = args.get("profile") or merged.get("profile")
+    if profile_name and profile_name in PROFILES:
+        profile = PROFILES[profile_name]
+        for key in ("min_score", "max_drop", "warn_drop", "min_n", "allow_weak_evidence"):
+            if merged.get(key) is None and args.get(key) is None and key in profile:
+                merged[key] = profile[key]
+    # Args override
+    for key in (
+        "evaluation_id",
+        "base_url",
+        "min_score",
+        "max_drop",
+        "warn_drop",
+        "min_n",
+        "allow_weak_evidence",
+        "baseline",
+        "profile",
+    ):
+        if args.get(key) is not None:
+            merged[key] = args[key]
+    return merged
+def _first_defined(*values: Any) -> Any:
+    """Return the first value that is not None (preserves 0, False, empty string)."""
+    for v in values:
+        if v is not None:
+            return v
+    return None
+def _dict_to_config(d: dict[str, Any]) -> EvalAIConfig:
+    return EvalAIConfig(
+        evaluation_id=_first_defined(d.get("evaluationId"), d.get("evaluation_id")),
+        api_key=_first_defined(d.get("apiKey"), d.get("api_key")),
+        base_url=_first_defined(d.get("baseUrl"), d.get("base_url")),
+        min_score=_first_defined(d.get("minScore"), d.get("min_score")),
+        min_n=_first_defined(d.get("minN"), d.get("min_n")),
+        max_drop=_first_defined(d.get("maxDrop"), d.get("max_drop")),
+        warn_drop=_first_defined(d.get("warnDrop"), d.get("warn_drop")),
+        allow_weak_evidence=_first_defined(d.get("allowWeakEvidence"), d.get("allow_weak_evidence")),
+        baseline=d.get("baseline"),
+        profile=d.get("profile"),
+        packages=d.get("packages"),
+    )
+def _config_to_dict(c: EvalAIConfig) -> dict[str, Any]:
+    return {
+        k: v
+        for k, v in {
+            "evaluation_id": c.evaluation_id,
+            "api_key": c.api_key,
+            "base_url": c.base_url,
+            "min_score": c.min_score,
+            "min_n": c.min_n,
+            "max_drop": c.max_drop,
+            "warn_drop": c.warn_drop,
+            "allow_weak_evidence": c.allow_weak_evidence,
+            "baseline": c.baseline,
+            "profile": c.profile,
+        }.items()
+        if v is not None
+    }
+def _load_from_pyproject(path: str, cwd: str | None) -> EvalAIConfig | None:
+    """Load config from pyproject.toml [tool.evalgate] or [tool.evalai]."""
+    try:
+        import tomllib  # type: ignore[import-not-found]
+    except ImportError:
+        try:
+            import tomli as tomllib  # type: ignore[no-redef]
+        except ImportError:
+            return None
+    try:
+        with open(path, "rb") as f:
+            data = tomllib.load(f)
+        tool = data.get("tool", {})
+        cfg = tool.get("evalgate") or tool.get("evalai")
+        if cfg:
+            return _dict_to_config(cfg)
+    except Exception:
+        pass
+    return None

evalgate_sdk/cli/env.py ADDED Viewed

@@ -0,0 +1,43 @@
+"""Centralized environment detection for CLI commands.
+Port of ``cli/env.ts``.
+"""
+from __future__ import annotations
+import os
+import re
+def is_ci() -> bool:
+    """Check if running in a CI environment."""
+    return bool(
+        os.environ.get("GITHUB_ACTIONS")
+        or os.environ.get("CI")
+        or os.environ.get("CONTINUOUS_INTEGRATION")
+        or os.environ.get("BUILDKITE")
+        or os.environ.get("CIRCLECI")
+        or os.environ.get("TRAVIS")
+        or os.environ.get("JENKINS_URL")
+    )
+def is_github_actions() -> bool:
+    """Check if running in GitHub Actions."""
+    return os.environ.get("GITHUB_ACTIONS") == "true"
+def get_github_step_summary_path() -> str | None:
+    """Get GitHub Step Summary path if available."""
+    return os.environ.get("GITHUB_STEP_SUMMARY")
+_GIT_REF_PATTERN = re.compile(
+    r"^(main|master|develop|dev|origin/|remotes/|feature/|hotfix/|release/"
+    r"|v\d+\.\d+\.\d+|.*\.\.\..*).*"
+)
+def is_git_ref(ref: str) -> bool:
+    """Check if string looks like a git reference."""
+    return bool(_GIT_REF_PATTERN.match(ref))

evalgate_sdk/cli/formatters/types.py ADDED Viewed

@@ -0,0 +1,132 @@
+"""CheckReport and related types for formatters.
+Port of ``cli/formatters/types.ts``.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Any, Literal
+# Allowed values for CheckReport.verdict.
+GateVerdict = Literal["pass", "warn", "fail"]
+# Allowed values for CheckReport.gate_mode.
+GateMode = Literal["enforced", "neutral"]
+# Closed set of reason codes. Note that CheckReport.reason_code is annotated
+# as plain ``str``, so this alias is not enforced on that field.
+# NOTE(review): the list appears to contain two overlapping naming schemes
+# (e.g. SCORE_TOO_LOW / LOW_SCORE, POLICY_FAILED / POLICY_VIOLATION) — confirm
+# which codes are still emitted.
+FailureReasonCode = Literal[
+    "PASS",
+    "WARN_REGRESSION",
+    "LOW_SAMPLE_SIZE",
+    "BASELINE_MISSING",
+    "SCORE_TOO_LOW",
+    "DELTA_TOO_HIGH",
+    "COST_BUDGET_EXCEEDED",
+    "LATENCY_BUDGET_EXCEEDED",
+    "POLICY_FAILED",
+    "UNKNOWN",
+    "LOW_SCORE",
+    "LOW_PASS_RATE",
+    "SAFETY_RISK",
+    "LATENCY_RISK",
+    "COST_RISK",
+    "MAX_DROP_EXCEEDED",
+    "INSUFFICIENT_EVIDENCE",
+    "POLICY_VIOLATION",
+]
+# Default for CheckReport.schema_version.
+CHECK_REPORT_SCHEMA_VERSION = 1
+@dataclass
+class ScoreBreakdown01:
+    """Per-component score breakdown; every component is optional.
+    Used as the type of ``CheckReport.breakdown_01``.
+    """
+    # NOTE(review): the "01" suffix suggests each value is normalised to
+    # the range 0..1 — confirm against the producer of this report.
+    pass_rate: float | None = None
+    safety: float | None = None
+    judge: float | None = None
+    schema: float | None = None
+    latency: float | None = None
+    cost: float | None = None
+@dataclass
+class ScoreContribPts:
+    """Per-category point contributions; every category is optional.
+    Used as the type of ``CheckReport.contrib_pts``.
+    """
+    # NOTE(review): presumably points contributed to the overall score — confirm scale.
+    pass_rate_pts: float | None = None
+    safety_pts: float | None = None
+    compliance_pts: float | None = None
+    performance_pts: float | None = None
+@dataclass
+class GateThresholds:
+    """Threshold settings attached to a report via ``CheckReport.thresholds``.
+    Every field is optional and defaults to ``None``.
+    """
+    # Score / rate limits.
+    min_score: float | None = None
+    min_pass_rate: float | None = None
+    min_safety: float | None = None
+    max_drop: float | None = None
+    warn_drop: float | None = None
+    # Sample-size and evidence settings.
+    min_n: int | None = None
+    allow_weak_evidence: bool | None = None
+    baseline: str | None = None
+    # Budgets; units are given by the field-name suffixes (USD, milliseconds).
+    max_cost_usd: float | None = None
+    max_latency_ms: float | None = None
+    max_cost_delta_usd: float | None = None
+@dataclass
+class FailedCase:
+    """One entry of ``CheckReport.failed_cases``; every field is optional."""
+    test_case_id: int | None = None
+    status: str | None = None
+    name: str | None = None
+    # Each of input / expected_output / output has a companion *_snippet field.
+    # NOTE(review): presumably a shortened form for display — confirm.
+    input: str | None = None
+    input_snippet: str | None = None
+    expected_output: str | None = None
+    expected_snippet: str | None = None
+    output: str | None = None
+    output_snippet: str | None = None
+    reason: str | None = None
+@dataclass
+class CiContext:
+    """CI metadata attached to a report via ``CheckReport.ci``; every field is optional."""
+    provider: str | None = None
+    repo: str | None = None
+    sha: str | None = None
+    branch: str | None = None
+    pr: int | None = None  # integer; presumably the pull-request number — confirm
+    run_url: str | None = None
+    actor: str | None = None
+@dataclass
+class CheckReport:
+    """Result of a gate check as consumed by the formatters.
+    A default-constructed report is an enforced failure with reason code
+    ``"UNKNOWN"``; most remaining fields are optional and default to ``None``.
+    """
+    evaluation_id: str = ""
+    verdict: GateVerdict = "fail"
+    gate_applied: bool = True
+    gate_mode: GateMode = "enforced"
+    # Annotated as plain str, so values outside FailureReasonCode are accepted.
+    reason_code: str = "UNKNOWN"
+    schema_version: int = CHECK_REPORT_SCHEMA_VERSION
+    run_id: int | None = None
+    actionable_message: str | None = None
+    reason_message: str | None = None
+    # Scores.
+    score: float | None = None
+    baseline_score: float | None = None
+    delta: float | None = None
+    pass_rate: float | None = None
+    safety_pass_rate: float | None = None
+    flags: list[str] | None = None
+    breakdown_01: ScoreBreakdown01 | None = None
+    contrib_pts: ScoreContribPts | None = None
+    thresholds: GateThresholds | None = None
+    # Evidence and baseline state.
+    n: int | None = None
+    evidence_level: str | None = None
+    baseline_missing: bool | None = None
+    baseline_status: str | None = None
+    dashboard_url: str | None = None
+    # default_factory gives each report its own list instead of a shared one.
+    failed_cases: list[FailedCase] = field(default_factory=list)
+    failed_cases_shown: int | None = None
+    failed_cases_more: int | None = None
+    request_id: str | None = None
+    duration_ms: float | None = None
+    ci: CiContext | None = None
+    explain: bool | None = None
+    share_url: str | None = None
+    policy: str | None = None
+    baseline_run_id: int | None = None
+    ci_run_url: str | None = None
+    policy_evidence: dict[str, Any] | None = None