evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,103 @@
1
+ """Standardized failure reason codes (T10).
2
+
3
+ Port of the TypeScript SDK's ``reason-codes.ts``.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Literal
9
+
10
# Closed set of reason-code strings. Entries after the marker comment are
# legacy aliases.
ReasonCode = Literal[
    "PASS",
    "WARN_REGRESSION",
    "LOW_SAMPLE_SIZE",
    "BASELINE_MISSING",
    "SCORE_TOO_LOW",
    "DELTA_TOO_HIGH",
    "COST_BUDGET_EXCEEDED",
    "LATENCY_BUDGET_EXCEEDED",
    "POLICY_FAILED",
    "UNKNOWN",
    # Legacy aliases
    "LOW_SCORE",
    "LOW_PASS_RATE",
    "SAFETY_RISK",
    "LATENCY_RISK",
    "COST_RISK",
    "MAX_DROP_EXCEEDED",
    "INSUFFICIENT_EVIDENCE",
    "POLICY_VIOLATION",
]

# Per-code metadata. Each row below is (code, label, severity, description)
# and is expanded into ``{"label": ..., "severity": ..., "description": ...}``.
# Severity is one of "info", "warn" or "fail".
REASON_CODES: dict[str, dict[str, str]] = {
    code: {"label": label, "severity": severity, "description": description}
    for code, label, severity, description in (
        ("PASS", "Pass", "info", "All checks passed."),
        ("WARN_REGRESSION", "Warning: Regression", "warn", "Score dropped but within tolerance."),
        ("LOW_SAMPLE_SIZE", "Low Sample Size", "warn", "Not enough data points for confidence."),
        ("BASELINE_MISSING", "Baseline Missing", "warn", "No baseline to compare against."),
        ("SCORE_TOO_LOW", "Score Too Low", "fail", "Score is below the minimum threshold."),
        ("DELTA_TOO_HIGH", "Delta Too High", "fail", "Score dropped more than the allowed delta."),
        ("COST_BUDGET_EXCEEDED", "Cost Budget Exceeded", "fail", "Evaluation cost exceeded the budget."),
        ("LATENCY_BUDGET_EXCEEDED", "Latency Budget Exceeded", "fail", "Response latency exceeded the limit."),
        ("POLICY_FAILED", "Policy Failed", "fail", "One or more policy checks failed."),
        ("UNKNOWN", "Unknown", "fail", "Unknown failure reason."),
        # Legacy aliases
        ("LOW_SCORE", "Low Score", "fail", "Score is below the minimum threshold."),
        ("LOW_PASS_RATE", "Low Pass Rate", "fail", "Pass rate is below the minimum threshold."),
        ("SAFETY_RISK", "Safety Risk", "fail", "Safety check failed."),
        ("LATENCY_RISK", "Latency Risk", "warn", "Latency is near or above the limit."),
        ("COST_RISK", "Cost Risk", "warn", "Cost is near or above the budget."),
        ("MAX_DROP_EXCEEDED", "Max Drop Exceeded", "fail", "Score dropped more than the allowed maximum."),
        ("INSUFFICIENT_EVIDENCE", "Insufficient Evidence", "warn", "Not enough data to make a determination."),
        ("POLICY_VIOLATION", "Policy Violation", "fail", "A policy was violated."),
    )
}
93
+
94
+
95
def get_reason_info(code: str) -> dict[str, str]:
    """Look up the metadata entry for a reason code.

    Args:
        code: Reason code string to resolve.

    Returns:
        The ``REASON_CODES`` entry for *code* (keys ``label``, ``severity``,
        ``description``). Codes missing from the table resolve to the
        ``"UNKNOWN"`` entry. The table's own dict object is returned, not a
        copy.
    """
    fallback = REASON_CODES["UNKNOWN"]
    if code in REASON_CODES:
        return REASON_CODES[code]
    return fallback
98
+
99
+
100
def is_blocking(code: str) -> bool:
    """Tell whether a reason code should block a deployment.

    A code blocks when its resolved severity is ``"fail"``; the severity is
    obtained through ``get_reason_info``.
    """
    severity = get_reason_info(code)["severity"]
    return severity == "fail"
@@ -0,0 +1,196 @@
1
+ """Regression gate constants, types, and helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ from dataclasses import dataclass, field
8
+ from typing import Any, Literal
9
+
10
+
11
class GATE_EXIT:
    """Integer exit codes for the regression gate.

    Values can be read as attributes (``GATE_EXIT.PASS``) or by name using
    subscript syntax (``GATE_EXIT["PASS"]``).
    """

    PASS = 0
    REGRESSION = 1
    INFRA_ERROR = 2
    CONFIDENCE_FAILED = 3
    CONFIDENCE_MISSING = 4

    def __class_getitem__(cls, key: str) -> int:
        """Resolve ``GATE_EXIT[key]`` to the class attribute named *key*."""
        # Python invokes __class_getitem__ as an implicit classmethod when the
        # class object itself is subscripted; an unknown name surfaces as the
        # AttributeError raised by getattr.
        code = getattr(cls, key)
        return code
21
+
22
+
23
class GATE_CATEGORY:
    """String labels classifying a gate outcome or an individual delta."""

    PASS = "pass"
    REGRESSION = "regression"
    INFRA_ERROR = "infra_error"
27
+
28
+
29
# Default ``version`` value for the Baseline and RegressionReport dataclasses.
REPORT_SCHEMA_VERSION = 1
30
+
31
+
32
class ARTIFACTS:
    """Path constants for the JSON artifact files kept under ``evals/``."""

    BASELINE = "evals/baseline.json"
    REGRESSION_REPORT = "evals/regression-report.json"
    CONFIDENCE_SUMMARY = "evals/confidence-summary.json"
    LATENCY_BENCHMARK = "evals/latency-benchmark.json"
37
+
38
+
39
@dataclass
class BaselineTolerance:
    """Tolerance thresholds stored with a baseline."""

    # Largest absolute score decrease accepted before a test counts as a
    # regression (used by evaluate_regression unless max_drop overrides it).
    score_drop: float = 0.05
    # NOTE(review): the two fields below are not read by the code visible
    # here; units are presumably percent and a 0-1 ratio respectively.
    latency_increase_pct: float = 20.0
    min_confidence: float = 0.8
44
+
45
+
46
@dataclass
class Baseline:
    """Reference scores and latencies that later runs are compared against."""

    # Schema version of the stored baseline.
    version: int = REPORT_SCHEMA_VERSION
    # Score per test ID.
    scores: dict[str, float] = field(default_factory=dict)
    # Latency per key; presumably keyed by test ID like ``scores``.
    latencies: dict[str, float] = field(default_factory=dict)
    # Creation timestamp as a string, or None when not recorded.
    created_at: str | None = None
    # Thresholds applied when comparing against this baseline.
    tolerance: BaselineTolerance = field(default_factory=BaselineTolerance)
    # Free-form extra data.
    metadata: dict[str, Any] = field(default_factory=dict)
54
+
55
+
56
@dataclass
class RegressionDelta:
    """Comparison of one metric for one test between baseline and current run."""

    test_id: str
    # Name of the compared metric (evaluate_regression always uses "score").
    metric: str
    baseline_value: float
    current_value: float
    # current_value - baseline_value
    delta: float
    # delta expressed as a percentage of the baseline value
    delta_pct: float
    # One of the GATE_CATEGORY labels; defaults to the passing label.
    category: str = GATE_CATEGORY.PASS
    severity: Literal["low", "medium", "high", "critical"] = "low"
66
+
67
+
68
@dataclass
class RegressionReport:
    """Result of a regression gate evaluation."""

    # Schema version of the report.
    version: int = REPORT_SCHEMA_VERSION
    run_id: str = ""
    # Overall outcome as a GATE_EXIT code and its GATE_CATEGORY label.
    gate_exit: int = GATE_EXIT.PASS
    gate_category: str = GATE_CATEGORY.PASS
    # One entry per compared test.
    deltas: list[RegressionDelta] = field(default_factory=list)
    # Aggregate counts (evaluate_regression fills total/regressions/passed).
    summary: dict[str, Any] = field(default_factory=dict)
    created_at: str | None = None
77
+
78
+
79
def evaluate_regression(
    baseline: Baseline,
    current_scores: dict[str, float],
    *,
    min_score: float | None = None,
    max_drop: float | None = None,
) -> RegressionReport:
    """Compare current scores against a baseline and produce a regression report.

    A baseline test is flagged as a regression when its score fell by more
    than the allowed drop, or when ``min_score`` is given and the current
    score is below it. Tests present only in ``current_scores`` are checked
    against ``min_score`` alone and are reported only when they fail it.

    Args:
        baseline: Baseline scores and tolerance configuration.
        current_scores: Current run scores keyed by test ID.
        min_score: Optional absolute floor; any score below this is a failure.
        max_drop: Optional override for ``baseline.tolerance.score_drop``.

    Returns:
        A ``RegressionReport`` whose ``deltas`` hold one entry per compared
        test, whose ``gate_exit`` / ``gate_category`` are the regression
        values if any delta failed (pass values otherwise), and whose
        ``summary`` has ``total``, ``regressions`` and ``passed`` counts.
    """
    allowed_drop = baseline.tolerance.score_drop if max_drop is None else max_drop
    deltas: list[RegressionDelta] = []

    for test_id, baseline_score in baseline.scores.items():
        current = current_scores.get(test_id)
        if current is None:
            # NOTE(review): a baseline test with no current score is skipped
            # silently, so it neither passes nor fails the gate.
            continue

        delta = current - baseline_score
        # A zero baseline has no meaningful relative change. Use the float
        # 0.0 (not int 0) so the field keeps its declared type and matches
        # the value used for new tests below.
        delta_pct = delta / baseline_score * 100 if baseline_score != 0 else 0.0

        # NOTE(review): a NaN score compares False in both checks and is
        # therefore recorded as passing.
        dropped_too_far = delta < -allowed_drop
        below_floor = min_score is not None and current < min_score

        if dropped_too_far or below_floor:
            severity: Literal["low", "medium", "high", "critical"] = "high" if abs(delta_pct) > 20 else "medium"
            deltas.append(
                RegressionDelta(
                    test_id=test_id,
                    metric="score",
                    baseline_value=baseline_score,
                    current_value=current,
                    delta=delta,
                    delta_pct=delta_pct,
                    category=GATE_CATEGORY.REGRESSION,
                    severity=severity,
                )
            )
        else:
            deltas.append(
                RegressionDelta(
                    test_id=test_id,
                    metric="score",
                    baseline_value=baseline_score,
                    current_value=current,
                    delta=delta,
                    delta_pct=delta_pct,
                )
            )

    # Check min_score for tests not in the baseline
    if min_score is not None:
        for test_id, current in current_scores.items():
            if test_id in baseline.scores:
                continue
            if current < min_score:
                deltas.append(
                    RegressionDelta(
                        test_id=test_id,
                        metric="score",
                        baseline_value=0.0,
                        current_value=current,
                        delta=current,
                        delta_pct=0.0,
                        category=GATE_CATEGORY.REGRESSION,
                        severity="medium",
                    )
                )

    # Every failing branch above tags its delta as a regression, so the gate
    # outcome can be derived from the deltas instead of tracked separately.
    regressions = sum(1 for d in deltas if d.category == GATE_CATEGORY.REGRESSION)
    passed = sum(1 for d in deltas if d.category == GATE_CATEGORY.PASS)
    regressed = regressions > 0

    return RegressionReport(
        gate_exit=GATE_EXIT.REGRESSION if regressed else GATE_EXIT.PASS,
        gate_category=GATE_CATEGORY.REGRESSION if regressed else GATE_CATEGORY.PASS,
        deltas=deltas,
        summary={
            "total": len(deltas),
            "regressions": regressions,
            "passed": passed,
        },
    )
174
+
175
+
176
def compute_baseline_checksum(baseline: Baseline) -> str:
    """Return the hex SHA-256 digest of a baseline's canonical JSON form.

    Only ``version``, ``scores`` and ``latencies`` take part in the digest.
    They are serialised as compact JSON with sorted keys, so the result does
    not depend on dictionary insertion order and changes whenever one of
    those values changes.
    """
    canonical = {
        "version": baseline.version,
        "scores": dict(sorted(baseline.scores.items())),
        "latencies": dict(sorted(baseline.latencies.items())),
    }
    encoded = json.dumps(canonical, sort_keys=True, separators=(",", ":")).encode("utf-8")
    digest = hashlib.sha256()
    digest.update(encoded)
    return digest.hexdigest()
192
+
193
+
194
def verify_baseline_checksum(baseline: Baseline, expected_checksum: str) -> bool:
    """Check a baseline against a previously recorded checksum.

    Returns ``True`` when the checksum recomputed by
    ``compute_baseline_checksum`` equals *expected_checksum* exactly.
    """
    actual_checksum = compute_baseline_checksum(baseline)
    return actual_checksum == expected_checksum
@@ -0,0 +1,115 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal
5
+
6
+ from evalgate_sdk.golden import RunMetrics, extract_run_metrics
7
+
8
+
9
+ @dataclass(slots=True)
10
+ class NormalizedBudgetConfig:
11
+ mode: Literal["traces", "cost"]
12
+ max_traces: int | None = None
13
+ max_cost_usd: float | None = None
14
+
15
+
16
+ @dataclass(slots=True)
17
+ class ReplayDecision:
18
+ action: Literal["keep", "discard"]
19
+ reason: Literal["pass_rate_improved", "pass_rate_declined", "budget_exceeded"]
20
+ previous_pass_rate: float
21
+ new_pass_rate: float
22
+ previous_corrected_pass_rate: float | None
23
+ new_corrected_pass_rate: float | None
24
+ comparison_basis: Literal["corrected", "raw"]
25
+ budget_used: float
26
+ budget_limit: float
27
+
28
+ def to_dict(self) -> dict[str, float | str | None]:
29
+ return {
30
+ "action": self.action,
31
+ "reason": self.reason,
32
+ "previousPassRate": self.previous_pass_rate,
33
+ "newPassRate": self.new_pass_rate,
34
+ "previousCorrectedPassRate": self.previous_corrected_pass_rate,
35
+ "newCorrectedPassRate": self.new_corrected_pass_rate,
36
+ "comparisonBasis": self.comparison_basis,
37
+ "budgetUsed": self.budget_used,
38
+ "budgetLimit": self.budget_limit,
39
+ }
40
+
41
+
42
def determine_comparison_basis(
    previous_corrected: float | None,
    new_corrected: float | None,
) -> Literal["corrected", "raw"]:
    """Choose which pass rates two runs should be compared on.

    Returns ``"corrected"`` only when both runs have a corrected pass rate;
    if either one is ``None`` the comparison falls back to ``"raw"``.
    """
    if previous_corrected is None or new_corrected is None:
        return "raw"
    return "corrected"
49
+
50
+
51
def _budget_values(metrics: RunMetrics, budget: NormalizedBudgetConfig) -> tuple[float, float]:
    """Return ``(used, limit)`` for the budget mode in effect.

    In ``"traces"`` mode the pair is the run's result count and
    ``max_traces``; in any other mode it is the run's total cost and
    ``max_cost_usd``. Missing (``None``) values count as zero.
    """
    in_trace_mode = budget.mode == "traces"
    if in_trace_mode:
        return float(metrics.total_results), float(budget.max_traces or 0)
    return float(metrics.total_cost_usd or 0.0), float(budget.max_cost_usd or 0.0)
59
+
60
+
61
def evaluate_replay_outcome(
    previous_run: dict,
    new_run: dict,
    budget_config: NormalizedBudgetConfig,
) -> ReplayDecision:
    """Decide whether a replayed run should be kept or discarded.

    The new run is discarded with reason ``"budget_exceeded"`` when its
    budget usage is above the limit. Otherwise the two runs are compared on
    corrected pass rates when both have one (raw pass rates if not): a
    strictly higher rate keeps the new run, anything else discards it with
    reason ``"pass_rate_declined"``.

    Args:
        previous_run: Run payload passed to ``extract_run_metrics``.
        new_run: Run payload for the replay, handled the same way.
        budget_config: Budget mode and limits applied to the new run.

    Returns:
        A ``ReplayDecision`` carrying the action, the reason, both raw and
        corrected pass rates, the comparison basis and the budget figures.
    """
    before = extract_run_metrics(previous_run)
    after = extract_run_metrics(new_run)
    budget_used, budget_limit = _budget_values(after, budget_config)
    basis = determine_comparison_basis(
        before.corrected_pass_rate_ratio,
        after.corrected_pass_rate_ratio,
    )

    action: Literal["keep", "discard"]
    reason: Literal["pass_rate_improved", "pass_rate_declined", "budget_exceeded"]

    if budget_used > budget_limit:
        # The budget check takes precedence over any pass-rate comparison.
        action, reason = "discard", "budget_exceeded"
    else:
        if basis == "corrected":
            rate_before = before.corrected_pass_rate_ratio
            rate_after = after.corrected_pass_rate_ratio
        else:
            rate_before = before.pass_rate_ratio
            rate_after = after.pass_rate_ratio
        # Fall back to the raw rate if a selected rate is still None.
        if rate_before is None:
            rate_before = before.pass_rate_ratio
        if rate_after is None:
            rate_after = after.pass_rate_ratio

        if rate_after > rate_before:
            action, reason = "keep", "pass_rate_improved"
        else:
            # An unchanged rate also lands here.
            action, reason = "discard", "pass_rate_declined"

    return ReplayDecision(
        action=action,
        reason=reason,
        previous_pass_rate=before.pass_rate_ratio,
        new_pass_rate=after.pass_rate_ratio,
        previous_corrected_pass_rate=before.corrected_pass_rate_ratio,
        new_corrected_pass_rate=after.corrected_pass_rate_ratio,
        comparison_basis=basis,
        budget_used=budget_used,
        budget_limit=budget_limit,
    )
@@ -0,0 +1,50 @@
1
+ """Runtime foundation — defineEval DSL, registry, executor."""
2
+
3
+ from evalgate_sdk.runtime.eval import create_result, define_eval, define_suite
4
+ from evalgate_sdk.runtime.executor import create_local_executor, default_local_executor
5
+ from evalgate_sdk.runtime.registry import (
6
+ create_eval_runtime,
7
+ dispose_active_runtime,
8
+ get_active_runtime,
9
+ set_active_runtime,
10
+ with_runtime,
11
+ )
12
+ from evalgate_sdk.runtime.types import (
13
+ EvalContext,
14
+ EvalExecutionError,
15
+ EvalResult,
16
+ EvalRuntimeError,
17
+ EvalSpec,
18
+ ExecutorCapabilities,
19
+ SpecConfig,
20
+ SpecExecutionError,
21
+ SpecOptions,
22
+ SpecRegistrationError,
23
+ )
24
+ from evalgate_sdk.runtime.types import (
25
+ RuntimeError as EvalSDKRuntimeError,
26
+ )
27
+
28
+ __all__ = [
29
+ "define_eval",
30
+ "define_suite",
31
+ "create_result",
32
+ "create_local_executor",
33
+ "default_local_executor",
34
+ "create_eval_runtime",
35
+ "get_active_runtime",
36
+ "set_active_runtime",
37
+ "dispose_active_runtime",
38
+ "with_runtime",
39
+ "EvalSpec",
40
+ "EvalContext",
41
+ "EvalResult",
42
+ "SpecConfig",
43
+ "SpecOptions",
44
+ "ExecutorCapabilities",
45
+ "EvalRuntimeError",
46
+ "SpecRegistrationError",
47
+ "SpecExecutionError",
48
+ "EvalSDKRuntimeError",
49
+ "EvalExecutionError",
50
+ ]
@@ -0,0 +1 @@
1
+ """Runtime adapters for legacy → DSL migration."""