evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,228 @@
1
+ """EvalGate config loader.
2
+
3
+ Discovery: evalgate.config.json → evalai.config.json → pyproject.toml [tool.evalgate] / [tool.evalai].
4
+ Port of ``cli/config.ts``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import os
11
+ from dataclasses import dataclass
12
+ from pathlib import Path
13
+ from typing import Any, Literal
14
+
15
+ from evalgate_sdk.cli.profiles import PROFILES
16
+
17
# Profile names accepted by the type checker for the ``profile`` setting.
ProfileName = Literal["strict", "balanced", "fast"]

# JSON config file names probed in each directory; earlier entries win.
CONFIG_FILES = ["evalgate.config.json", "evalai.config.json"]
23
+
24
+
25
@dataclass
class EvalGateConfig:
    """Settings read from an EvalGate config file.

    Every field is optional; ``None`` means the file did not provide it.
    Field order is part of the positional constructor signature.
    """

    # --- identity / connection -------------------------------------------
    evaluation_id: str | None = None
    api_key: str | None = None
    base_url: str | None = None

    # --- gate thresholds --------------------------------------------------
    min_score: int | None = None
    min_n: int | None = None
    max_drop: int | None = None
    warn_drop: int | None = None
    allow_weak_evidence: bool | None = None

    # --- comparison target and presets -----------------------------------
    baseline: str | None = None  # "published" | "previous" | "production" | "auto"
    profile: str | None = None

    # --- monorepo overrides, keyed by path relative to the config file ---
    packages: dict[str, Any] | None = None


# Deprecated alias — remove in v4
EvalAIConfig = EvalGateConfig
44
+
45
+
46
def find_config_path(cwd: str | None = None) -> str | None:
    """Locate the nearest EvalGate config file, searching upward from *cwd*.

    In each directory the JSON names in ``CONFIG_FILES`` are tried first, then
    a ``pyproject.toml`` that contains a ``[tool.evalgate]`` or
    ``[tool.evalai]`` table header. The search moves to the parent directory
    until the filesystem root has been checked.

    Args:
        cwd: Directory to start from; defaults to the process working directory.

    Returns:
        Absolute path of the first matching file, or ``None`` if none is found.
    """
    directory = os.path.abspath(cwd or os.getcwd())
    root = os.path.splitdrive(directory)[0] + os.sep

    while True:
        for name in CONFIG_FILES:
            candidate = os.path.join(directory, name)
            if os.path.isfile(candidate):
                return candidate

        # Check pyproject.toml for [tool.evalgate] or [tool.evalai]
        pyproject = os.path.join(directory, "pyproject.toml")
        if os.path.isfile(pyproject):
            try:
                text = Path(pyproject).read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                # An unreadable or non-UTF-8 pyproject.toml must not abort the
                # search; treat it as "no EvalGate section here" and keep going.
                text = ""
            if "[tool.evalgate]" in text or "[tool.evalai]" in text:
                return pyproject

        parent = os.path.dirname(directory)
        if parent == directory or directory == root:
            break
        directory = parent

    return None
73
+
74
+
75
def load_config(cwd: str | None = None) -> EvalAIConfig | None:
    """Discover and load the EvalGate config that applies to *cwd*.

    JSON configs may carry a ``packages`` mapping of per-package overrides
    keyed by path relative to the config file. When *cwd* is given and lies at
    or below one of those paths, the override is merged over the root
    settings. If several keys match, the most specific (longest) one wins.

    Args:
        cwd: Directory the command runs in; also the start of config discovery.

    Returns:
        The loaded config, or ``None`` when no config file exists or loading
        failed (a warning is emitted in the failure case).
    """
    config_path = find_config_path(cwd)
    if not config_path:
        return None

    try:
        if config_path.endswith("pyproject.toml"):
            return _load_from_pyproject(config_path, cwd)

        with open(config_path, encoding="utf-8") as f:
            data = json.load(f)

        config = _dict_to_config(data)

        # Monorepo package resolution
        if config.packages and cwd:
            config_dir = os.path.dirname(config_path)
            rel = os.path.relpath(os.path.abspath(cwd), config_dir).replace("\\", "/")
            matches = [
                key
                for key in config.packages
                if rel == key or rel.startswith(f"{key}/")
            ]
            if matches:
                # An exact match is always the longest candidate, so max() covers
                # both the exact and the nested-directory case without letting a
                # broad key ("apps") shadow a narrower one ("apps/web").
                best = max(matches, key=len)
                merged = _dict_to_config({**_config_to_dict(config), **config.packages[best]})
                merged.packages = config.packages
                return merged

        return config
    except Exception as exc:
        # Best-effort by design: a broken config degrades to "no config".
        import warnings

        warnings.warn(f"[EvalGate] Failed to load config from {config_path}: {exc}", stacklevel=2)
        return None
111
+
112
+
113
def merge_config_with_args(
    config: EvalAIConfig | None,
    args: dict[str, Any],
) -> dict[str, Any]:
    """Combine file config, profile defaults and CLI args into one dict.

    Effective precedence, highest first: CLI args, then config-file values,
    then defaults of the selected profile. A profile only fills threshold keys
    that neither the config nor the args provide. ``api_key`` is never copied
    into the result.

    Args:
        config: Loaded config, or ``None`` when no config file was found.
        args: Parsed CLI arguments; ``None`` values count as "not given".

    Returns:
        A new dict containing only the settings that ended up with a value.
    """
    # Config string settings are skipped when empty; the rest only when None.
    skip_when_falsy = ("evaluation_id", "base_url", "baseline", "profile")
    config_fields = (
        "evaluation_id",
        "base_url",
        "min_score",
        "min_n",
        "max_drop",
        "warn_drop",
        "allow_weak_evidence",
        "baseline",
        "profile",
    )
    profile_fields = ("min_score", "max_drop", "warn_drop", "min_n", "allow_weak_evidence")
    arg_fields = (
        "evaluation_id",
        "base_url",
        "min_score",
        "max_drop",
        "warn_drop",
        "min_n",
        "allow_weak_evidence",
        "baseline",
        "profile",
    )

    merged: dict[str, Any] = {}

    if config:
        for name in config_fields:
            value = getattr(config, name)
            provided = bool(value) if name in skip_when_falsy else value is not None
            if provided:
                merged[name] = value

    # Profile defaults
    selected = args.get("profile") or merged.get("profile")
    if selected and selected in PROFILES:
        defaults = PROFILES[selected]
        for name in profile_fields:
            already_set = merged.get(name) is not None or args.get(name) is not None
            if not already_set and name in defaults:
                merged[name] = defaults[name]

    # Args override
    merged.update({name: args[name] for name in arg_fields if args.get(name) is not None})

    return merged
164
+
165
+
166
+ def _first_defined(*values: Any) -> Any:
167
+ """Return the first value that is not None (preserves 0, False, empty string)."""
168
+ for v in values:
169
+ if v is not None:
170
+ return v
171
+ return None
172
+
173
+
174
def _dict_to_config(d: dict[str, Any]) -> EvalAIConfig:
    """Build a config object from a raw mapping.

    Aliased settings accept both spellings; the camelCase key is consulted
    first and the snake_case key is the fallback. ``baseline``, ``profile``
    and ``packages`` have a single spelling.
    """
    # (constructor keyword, camelCase key) — the snake_case key equals the keyword.
    aliased = (
        ("evaluation_id", "evaluationId"),
        ("api_key", "apiKey"),
        ("base_url", "baseUrl"),
        ("min_score", "minScore"),
        ("min_n", "minN"),
        ("max_drop", "maxDrop"),
        ("warn_drop", "warnDrop"),
        ("allow_weak_evidence", "allowWeakEvidence"),
    )
    kwargs: dict[str, Any] = {
        snake: _first_defined(d.get(camel), d.get(snake)) for snake, camel in aliased
    }
    for plain in ("baseline", "profile", "packages"):
        kwargs[plain] = d.get(plain)
    return EvalAIConfig(**kwargs)
188
+
189
+
190
+ def _config_to_dict(c: EvalAIConfig) -> dict[str, Any]:
191
+ return {
192
+ k: v
193
+ for k, v in {
194
+ "evaluation_id": c.evaluation_id,
195
+ "api_key": c.api_key,
196
+ "base_url": c.base_url,
197
+ "min_score": c.min_score,
198
+ "min_n": c.min_n,
199
+ "max_drop": c.max_drop,
200
+ "warn_drop": c.warn_drop,
201
+ "allow_weak_evidence": c.allow_weak_evidence,
202
+ "baseline": c.baseline,
203
+ "profile": c.profile,
204
+ }.items()
205
+ if v is not None
206
+ }
207
+
208
+
209
+ def _load_from_pyproject(path: str, cwd: str | None) -> EvalAIConfig | None:
210
+ """Load config from pyproject.toml [tool.evalgate] or [tool.evalai]."""
211
+ try:
212
+ import tomllib # type: ignore[import-not-found]
213
+ except ImportError:
214
+ try:
215
+ import tomli as tomllib # type: ignore[no-redef]
216
+ except ImportError:
217
+ return None
218
+
219
+ try:
220
+ with open(path, "rb") as f:
221
+ data = tomllib.load(f)
222
+ tool = data.get("tool", {})
223
+ cfg = tool.get("evalgate") or tool.get("evalai")
224
+ if cfg:
225
+ return _dict_to_config(cfg)
226
+ except Exception:
227
+ pass
228
+ return None
@@ -0,0 +1,43 @@
1
+ """Centralized environment detection for CLI commands.
2
+
3
+ Port of ``cli/env.ts``.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import os
9
+ import re
10
+
11
+
12
def is_ci() -> bool:
    """Report whether any known CI marker variable is set to a non-empty value."""
    markers = (
        "GITHUB_ACTIONS",
        "CI",
        "CONTINUOUS_INTEGRATION",
        "BUILDKITE",
        "CIRCLECI",
        "TRAVIS",
        "JENKINS_URL",
    )
    return any(os.environ.get(marker) for marker in markers)
23
+
24
+
25
def is_github_actions() -> bool:
    """Report whether ``GITHUB_ACTIONS`` is set to exactly ``"true"``."""
    flag = os.environ.get("GITHUB_ACTIONS")
    return flag == "true"
28
+
29
+
30
def get_github_step_summary_path() -> str | None:
    """Return the value of ``GITHUB_STEP_SUMMARY``, or ``None`` when it is unset."""
    return os.getenv("GITHUB_STEP_SUMMARY")
33
+
34
+
35
# Prefix heuristic: common branch names, remote/flow prefixes, vX.Y.Z tags,
# or anything containing a three-dot range ("a...b").
_GIT_REF_PATTERN = re.compile(
    r"^(main|master|develop|dev|origin/|remotes/|feature/|hotfix/|release/"
    r"|v\d+\.\d+\.\d+|.*\.\.\..*).*"
)


def is_git_ref(ref: str) -> bool:
    """Tell whether *ref* starts like a git reference according to the heuristic.

    The match is anchored at the start only, so any string that merely begins
    with one of the known prefixes (e.g. ``"mainframe"``) also qualifies.
    """
    return _GIT_REF_PATTERN.match(ref) is not None
@@ -0,0 +1,132 @@
1
+ """CheckReport and related types for formatters.
2
+
3
+ Port of ``cli/formatters/types.ts``.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass, field
9
+ from typing import Any, Literal
10
+
11
# Values allowed for ``CheckReport.verdict``.
GateVerdict = Literal["pass", "warn", "fail"]

# Values allowed for ``CheckReport.gate_mode``.
GateMode = Literal["enforced", "neutral"]

# Known reason codes. Order is preserved as declared; do not reorder.
FailureReasonCode = Literal[
    "PASS",
    "WARN_REGRESSION",
    "LOW_SAMPLE_SIZE",
    "BASELINE_MISSING",
    "SCORE_TOO_LOW",
    "DELTA_TOO_HIGH",
    "COST_BUDGET_EXCEEDED",
    "LATENCY_BUDGET_EXCEEDED",
    "POLICY_FAILED",
    "UNKNOWN",
    "LOW_SCORE",
    "LOW_PASS_RATE",
    "SAFETY_RISK",
    "LATENCY_RISK",
    "COST_RISK",
    "MAX_DROP_EXCEEDED",
    "INSUFFICIENT_EVIDENCE",
    "POLICY_VIOLATION",
]

# Default for ``CheckReport.schema_version``.
CHECK_REPORT_SCHEMA_VERSION = 1
36
+
37
+
38
@dataclass
class ScoreBreakdown01:
    """Optional per-dimension score components; unset components are ``None``.

    NOTE(review): the ``01`` suffix suggests values normalised to [0, 1] —
    confirm against the producer.
    """

    # Quality dimensions
    pass_rate: float | None = None
    safety: float | None = None
    judge: float | None = None
    schema: float | None = None
    # Operational dimensions
    latency: float | None = None
    cost: float | None = None
46
+
47
+
48
@dataclass
class ScoreContribPts:
    """Optional per-category point contributions; unset entries are ``None``."""

    pass_rate_pts: float | None = None
    safety_pts: float | None = None
    compliance_pts: float | None = None
    performance_pts: float | None = None
54
+
55
+
56
@dataclass
class GateThresholds:
    """Threshold settings attached to a check report; ``None`` means not set."""

    # Score floors
    min_score: float | None = None
    min_pass_rate: float | None = None
    min_safety: float | None = None
    # Regression limits relative to the baseline
    max_drop: float | None = None
    warn_drop: float | None = None
    # Evidence requirements
    min_n: int | None = None
    allow_weak_evidence: bool | None = None
    baseline: str | None = None
    # Cost / latency budgets
    max_cost_usd: float | None = None
    max_latency_ms: float | None = None
    max_cost_delta_usd: float | None = None
69
+
70
+
71
@dataclass
class FailedCase:
    """One failing test case as listed in a check report; all fields optional."""

    # Identification
    test_case_id: int | None = None
    status: str | None = None
    name: str | None = None
    # Each payload has a full value and a separate ``*_snippet`` field.
    input: str | None = None
    input_snippet: str | None = None
    expected_output: str | None = None
    expected_snippet: str | None = None
    output: str | None = None
    output_snippet: str | None = None
    # Explanation
    reason: str | None = None
83
+
84
+
85
@dataclass
class CiContext:
    """Optional CI metadata attached to a check report."""

    provider: str | None = None
    # Source revision
    repo: str | None = None
    sha: str | None = None
    branch: str | None = None
    pr: int | None = None
    # Run details
    run_url: str | None = None
    actor: str | None = None
94
+
95
+
96
@dataclass
class CheckReport:
    """Result record of a gate check, consumed by the formatters.

    Defaults describe an enforced, failing report with reason ``"UNKNOWN"``;
    everything else is optional. Field order is part of the positional
    constructor signature and must not change.
    """

    # --- core outcome -----------------------------------------------------
    evaluation_id: str = ""
    verdict: GateVerdict = "fail"
    gate_applied: bool = True
    gate_mode: GateMode = "enforced"
    reason_code: str = "UNKNOWN"
    schema_version: int = CHECK_REPORT_SCHEMA_VERSION
    run_id: int | None = None
    actionable_message: str | None = None
    reason_message: str | None = None

    # --- scores -----------------------------------------------------------
    score: float | None = None
    baseline_score: float | None = None
    delta: float | None = None
    pass_rate: float | None = None
    safety_pass_rate: float | None = None
    flags: list[str] | None = None
    breakdown_01: ScoreBreakdown01 | None = None
    contrib_pts: ScoreContribPts | None = None
    thresholds: GateThresholds | None = None

    # --- evidence / baseline ---------------------------------------------
    n: int | None = None
    evidence_level: str | None = None
    baseline_missing: bool | None = None
    baseline_status: str | None = None
    dashboard_url: str | None = None

    # --- failing cases (fresh list per instance) -------------------------
    failed_cases: list[FailedCase] = field(default_factory=list)
    failed_cases_shown: int | None = None
    failed_cases_more: int | None = None

    # --- request / environment metadata ----------------------------------
    request_id: str | None = None
    duration_ms: float | None = None
    ci: CiContext | None = None
    explain: bool | None = None
    share_url: str | None = None
    policy: str | None = None
    baseline_run_id: int | None = None
    ci_run_url: str | None = None
    policy_evidence: dict[str, Any] | None = None