evalgate-sdk 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgate_sdk/__init__.py +707 -0
- evalgate_sdk/_version.py +3 -0
- evalgate_sdk/assertions.py +1362 -0
- evalgate_sdk/auto.py +247 -0
- evalgate_sdk/batch.py +174 -0
- evalgate_sdk/cache.py +111 -0
- evalgate_sdk/ci_context.py +123 -0
- evalgate_sdk/cli/__init__.py +111 -0
- evalgate_sdk/cli/api.py +261 -0
- evalgate_sdk/cli/cli_constants.py +20 -0
- evalgate_sdk/cli/commands.py +1041 -0
- evalgate_sdk/cli/config.py +228 -0
- evalgate_sdk/cli/env.py +43 -0
- evalgate_sdk/cli/formatters/types.py +132 -0
- evalgate_sdk/cli/golden_commands.py +322 -0
- evalgate_sdk/cli/manifest.py +301 -0
- evalgate_sdk/cli/new_commands.py +435 -0
- evalgate_sdk/cli/policy_packs.py +103 -0
- evalgate_sdk/cli/profiles.py +12 -0
- evalgate_sdk/cli/regression_gate.py +312 -0
- evalgate_sdk/cli/render/__init__.py +1 -0
- evalgate_sdk/cli/render/snippet.py +18 -0
- evalgate_sdk/cli/render/sort.py +29 -0
- evalgate_sdk/cli/report/__init__.py +1 -0
- evalgate_sdk/cli/report/build_check_report.py +209 -0
- evalgate_sdk/cli/traces.py +186 -0
- evalgate_sdk/cli/workspace.py +63 -0
- evalgate_sdk/client.py +609 -0
- evalgate_sdk/cluster.py +359 -0
- evalgate_sdk/collector.py +161 -0
- evalgate_sdk/constants.py +6 -0
- evalgate_sdk/context.py +151 -0
- evalgate_sdk/errors.py +236 -0
- evalgate_sdk/export.py +238 -0
- evalgate_sdk/formatters/__init__.py +11 -0
- evalgate_sdk/formatters/github.py +51 -0
- evalgate_sdk/formatters/human.py +68 -0
- evalgate_sdk/formatters/json_fmt.py +11 -0
- evalgate_sdk/formatters/pr_comment.py +80 -0
- evalgate_sdk/golden.py +426 -0
- evalgate_sdk/integrations/__init__.py +1 -0
- evalgate_sdk/integrations/anthropic.py +99 -0
- evalgate_sdk/integrations/autogen.py +62 -0
- evalgate_sdk/integrations/crewai.py +61 -0
- evalgate_sdk/integrations/langchain.py +100 -0
- evalgate_sdk/integrations/openai.py +155 -0
- evalgate_sdk/integrations/openai_eval.py +221 -0
- evalgate_sdk/local.py +144 -0
- evalgate_sdk/logger.py +123 -0
- evalgate_sdk/matchers.py +62 -0
- evalgate_sdk/otel.py +256 -0
- evalgate_sdk/pagination.py +145 -0
- evalgate_sdk/py.typed +0 -0
- evalgate_sdk/pytest_plugin.py +96 -0
- evalgate_sdk/reason_codes.py +103 -0
- evalgate_sdk/regression.py +196 -0
- evalgate_sdk/replay_decision.py +115 -0
- evalgate_sdk/runtime/__init__.py +50 -0
- evalgate_sdk/runtime/adapters/__init__.py +1 -0
- evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
- evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
- evalgate_sdk/runtime/context.py +68 -0
- evalgate_sdk/runtime/eval.py +318 -0
- evalgate_sdk/runtime/execution_mode.py +170 -0
- evalgate_sdk/runtime/executor.py +92 -0
- evalgate_sdk/runtime/registry.py +125 -0
- evalgate_sdk/runtime/run_report.py +249 -0
- evalgate_sdk/runtime/types.py +143 -0
- evalgate_sdk/snapshot.py +219 -0
- evalgate_sdk/streaming.py +124 -0
- evalgate_sdk/synthesize.py +226 -0
- evalgate_sdk/testing.py +128 -0
- evalgate_sdk/types.py +666 -0
- evalgate_sdk/utils/__init__.py +1 -0
- evalgate_sdk/utils/input_hash.py +42 -0
- evalgate_sdk/workflows.py +264 -0
- evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
- evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
- evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
- evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""Standardized failure reason codes (T10).
|
|
2
|
+
|
|
3
|
+
Port of the TypeScript SDK's ``reason-codes.ts``.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Literal
|
|
9
|
+
|
|
10
|
+
# Closed set of reason-code strings; the members match the keys of REASON_CODES.
ReasonCode = Literal[
    # Current codes
    "PASS", "WARN_REGRESSION", "LOW_SAMPLE_SIZE", "BASELINE_MISSING",
    "SCORE_TOO_LOW", "DELTA_TOO_HIGH",
    "COST_BUDGET_EXCEEDED", "LATENCY_BUDGET_EXCEEDED",
    "POLICY_FAILED", "UNKNOWN",
    # Legacy aliases
    "LOW_SCORE", "LOW_PASS_RATE",
    "SAFETY_RISK", "LATENCY_RISK", "COST_RISK",
    "MAX_DROP_EXCEEDED", "INSUFFICIENT_EVIDENCE", "POLICY_VIOLATION",
]
|
|
31
|
+
|
|
32
|
+
# Display metadata for every reason code, built from (code, label, severity, description)
# rows. Severity is one of "info", "warn" or "fail"; is_blocking() tests for "fail".
REASON_CODES: dict[str, dict[str, str]] = {
    code: {"label": label, "severity": severity, "description": description}
    for code, label, severity, description in (
        ("PASS", "Pass", "info", "All checks passed."),
        ("WARN_REGRESSION", "Warning: Regression", "warn", "Score dropped but within tolerance."),
        ("LOW_SAMPLE_SIZE", "Low Sample Size", "warn", "Not enough data points for confidence."),
        ("BASELINE_MISSING", "Baseline Missing", "warn", "No baseline to compare against."),
        ("SCORE_TOO_LOW", "Score Too Low", "fail", "Score is below the minimum threshold."),
        ("DELTA_TOO_HIGH", "Delta Too High", "fail", "Score dropped more than the allowed delta."),
        ("COST_BUDGET_EXCEEDED", "Cost Budget Exceeded", "fail", "Evaluation cost exceeded the budget."),
        ("LATENCY_BUDGET_EXCEEDED", "Latency Budget Exceeded", "fail", "Response latency exceeded the limit."),
        ("POLICY_FAILED", "Policy Failed", "fail", "One or more policy checks failed."),
        ("UNKNOWN", "Unknown", "fail", "Unknown failure reason."),
        # Legacy aliases
        ("LOW_SCORE", "Low Score", "fail", "Score is below the minimum threshold."),
        ("LOW_PASS_RATE", "Low Pass Rate", "fail", "Pass rate is below the minimum threshold."),
        ("SAFETY_RISK", "Safety Risk", "fail", "Safety check failed."),
        ("LATENCY_RISK", "Latency Risk", "warn", "Latency is near or above the limit."),
        ("COST_RISK", "Cost Risk", "warn", "Cost is near or above the budget."),
        ("MAX_DROP_EXCEEDED", "Max Drop Exceeded", "fail", "Score dropped more than the allowed maximum."),
        ("INSUFFICIENT_EVIDENCE", "Insufficient Evidence", "warn", "Not enough data to make a determination."),
        ("POLICY_VIOLATION", "Policy Violation", "fail", "A policy was violated."),
    )
}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def get_reason_info(code: str) -> dict[str, str]:
    """Return the label, severity, and description for a reason code.

    Args:
        code: Reason code to look up.

    Returns:
        A new dict with the ``"label"``, ``"severity"`` and ``"description"``
        of *code*. Codes that are not keys of ``REASON_CODES`` resolve to the
        ``"UNKNOWN"`` entry.
    """
    entry = REASON_CODES.get(code, REASON_CODES["UNKNOWN"])
    # Hand back a copy: the table is shared module state, and returning the
    # entry itself would let a caller's edits leak into every later lookup.
    return dict(entry)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def is_blocking(code: str) -> bool:
    """Tell whether *code* should block a deployment.

    A code blocks when the severity reported by ``get_reason_info`` is ``"fail"``.
    """
    return get_reason_info(code)["severity"] == "fail"
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Regression gate constants, types, and helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from dataclasses import dataclass, field
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class GATE_EXIT:
    """Named integer exit codes for the regression gate."""

    PASS = 0
    REGRESSION = 1
    INFRA_ERROR = 2
    CONFIDENCE_FAILED = 3
    CONFIDENCE_MISSING = 4

    @classmethod
    def __class_getitem__(cls, key: str) -> int:
        """Resolve ``GATE_EXIT['NAME']`` to the class attribute called ``NAME``."""
        # Plain attribute lookup: an unknown name raises AttributeError.
        value = getattr(cls, key)
        return value
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class GATE_CATEGORY:
    """String labels for the outcome category of a gate result or a single delta."""

    PASS = "pass"
    REGRESSION = "regression"
    INFRA_ERROR = "infra_error"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Default ``version`` value for Baseline and RegressionReport instances.
REPORT_SCHEMA_VERSION = 1
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ARTIFACTS:
    """Relative paths of the JSON artifacts, all located under ``evals/``."""

    BASELINE = "evals/baseline.json"
    REGRESSION_REPORT = "evals/regression-report.json"
    CONFIDENCE_SUMMARY = "evals/confidence-summary.json"
    LATENCY_BENCHMARK = "evals/latency-benchmark.json"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class BaselineTolerance:
    """Tolerance thresholds carried by a ``Baseline``."""

    # Allowed score drop; evaluate_regression() falls back to it when no max_drop is passed.
    score_drop: float = 0.05
    # Latency and confidence thresholds (not read by evaluate_regression()).
    latency_increase_pct: float = 20.0
    min_confidence: float = 0.8
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class Baseline:
    """Stored reference values that a later run is compared against."""

    version: int = REPORT_SCHEMA_VERSION
    # Per-test reference scores; evaluate_regression() iterates these by test ID.
    scores: dict[str, float] = field(default_factory=dict)
    # Latency values by key; included in the baseline checksum together with scores.
    latencies: dict[str, float] = field(default_factory=dict)
    created_at: str | None = None
    # Every instance gets its own tolerance object and metadata dict.
    tolerance: BaselineTolerance = field(default_factory=BaselineTolerance)
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class RegressionDelta:
    """Comparison of one metric for one test between baseline and current run."""

    test_id: str
    metric: str
    baseline_value: float
    current_value: float
    # As filled in by evaluate_regression(): current minus baseline, and that
    # difference as a percentage of the baseline value.
    delta: float
    delta_pct: float
    # A delta counts as passing with low severity unless the producer says otherwise.
    category: str = GATE_CATEGORY.PASS
    severity: Literal["low", "medium", "high", "critical"] = "low"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class RegressionReport:
    """Result of a regression gate evaluation."""

    version: int = REPORT_SCHEMA_VERSION
    run_id: str = ""
    # Overall verdict as a GATE_EXIT code and a GATE_CATEGORY label.
    gate_exit: int = GATE_EXIT.PASS
    gate_category: str = GATE_CATEGORY.PASS
    # One entry per compared test.
    deltas: list[RegressionDelta] = field(default_factory=list)
    # evaluate_regression() stores the "total", "regressions" and "passed" counts here.
    summary: dict[str, Any] = field(default_factory=dict)
    created_at: str | None = None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def evaluate_regression(
    baseline: Baseline,
    current_scores: dict[str, float],
    *,
    min_score: float | None = None,
    max_drop: float | None = None,
) -> RegressionReport:
    """Compare current scores against a baseline and produce a regression report.

    Args:
        baseline: Baseline scores and tolerance configuration.
        current_scores: Current run scores keyed by test ID.
        min_score: Optional absolute floor; any score below it is a failure,
            including scores of tests that have no baseline entry.
        max_drop: Optional override for ``baseline.tolerance.score_drop``.

    Returns:
        A ``RegressionReport`` with one delta per baseline test that has a
        current score, plus one delta for every non-baseline test that falls
        below ``min_score``. ``gate_exit`` and ``gate_category`` are set to
        REGRESSION as soon as any delta fails, and ``summary`` holds the
        ``total``, ``regressions`` and ``passed`` counts.
    """
    # Slack for binary floating-point rounding: 0.85 - 0.9 evaluates to
    # -0.05000000000000004, which would otherwise flag a drop that is exactly
    # equal to the allowed tolerance.
    rounding_slack = 1e-9
    effective_drop = max_drop if max_drop is not None else baseline.tolerance.score_drop

    deltas: list[RegressionDelta] = []
    any_failed = False

    for test_id, baseline_score in baseline.scores.items():
        current = current_scores.get(test_id)
        if current is None:
            # A baseline test with no current score is not compared and does
            # not influence the gate.
            continue

        delta = current - baseline_score
        # A zero baseline has no meaningful relative change; report 0.0.
        delta_pct = (delta / baseline_score * 100) if baseline_score != 0 else 0.0

        dropped_too_far = delta < -(effective_drop + rounding_slack)
        below_floor = min_score is not None and current < min_score

        # Starts out with the dataclass defaults (PASS category, low severity).
        entry = RegressionDelta(
            test_id=test_id,
            metric="score",
            baseline_value=baseline_score,
            current_value=current,
            delta=delta,
            delta_pct=delta_pct,
        )
        if dropped_too_far or below_floor:
            entry.category = GATE_CATEGORY.REGRESSION
            entry.severity = "high" if abs(delta_pct) > 20 else "medium"
            any_failed = True
        deltas.append(entry)

    # Check min_score for tests not in the baseline. Only failing ones are
    # recorded; there is no baseline value, so 0.0 stands in for it.
    if min_score is not None:
        for test_id, current in current_scores.items():
            if test_id in baseline.scores:
                continue
            if current < min_score:
                deltas.append(
                    RegressionDelta(
                        test_id=test_id,
                        metric="score",
                        baseline_value=0.0,
                        current_value=current,
                        delta=current,
                        delta_pct=0.0,
                        category=GATE_CATEGORY.REGRESSION,
                        severity="medium",
                    )
                )
                any_failed = True

    return RegressionReport(
        gate_exit=GATE_EXIT.REGRESSION if any_failed else GATE_EXIT.PASS,
        gate_category=GATE_CATEGORY.REGRESSION if any_failed else GATE_CATEGORY.PASS,
        deltas=deltas,
        summary={
            "total": len(deltas),
            "regressions": sum(1 for d in deltas if d.category == GATE_CATEGORY.REGRESSION),
            "passed": sum(1 for d in deltas if d.category == GATE_CATEGORY.PASS),
        },
    )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def compute_baseline_checksum(baseline: Baseline) -> str:
    """Return a deterministic SHA-256 hex digest of a baseline's stored values.

    Exactly three fields are hashed: ``version``, ``scores`` and ``latencies``.
    They are serialized as compact JSON with sorted keys, so the order in which
    entries were inserted does not affect the digest, while a change to any of
    those three fields does.
    """

    def _key_sorted(mapping):
        # Rebuild the mapping with its keys in ascending order.
        return {key: mapping[key] for key in sorted(mapping)}

    canonical = {
        "version": baseline.version,
        "scores": _key_sorted(baseline.scores),
        "latencies": _key_sorted(baseline.latencies),
    }
    encoded = json.dumps(canonical, sort_keys=True, separators=(",", ":")).encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def verify_baseline_checksum(baseline: Baseline, expected_checksum: str) -> bool:
    """Check a baseline against a previously recorded checksum.

    Returns ``True`` only when the digest freshly computed by
    ``compute_baseline_checksum`` equals *expected_checksum* exactly.
    """
    actual_checksum = compute_baseline_checksum(baseline)
    return actual_checksum == expected_checksum
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from evalgate_sdk.golden import RunMetrics, extract_run_metrics
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(slots=True)
|
|
10
|
+
class NormalizedBudgetConfig:
|
|
11
|
+
mode: Literal["traces", "cost"]
|
|
12
|
+
max_traces: int | None = None
|
|
13
|
+
max_cost_usd: float | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(slots=True)
|
|
17
|
+
class ReplayDecision:
|
|
18
|
+
action: Literal["keep", "discard"]
|
|
19
|
+
reason: Literal["pass_rate_improved", "pass_rate_declined", "budget_exceeded"]
|
|
20
|
+
previous_pass_rate: float
|
|
21
|
+
new_pass_rate: float
|
|
22
|
+
previous_corrected_pass_rate: float | None
|
|
23
|
+
new_corrected_pass_rate: float | None
|
|
24
|
+
comparison_basis: Literal["corrected", "raw"]
|
|
25
|
+
budget_used: float
|
|
26
|
+
budget_limit: float
|
|
27
|
+
|
|
28
|
+
def to_dict(self) -> dict[str, float | str | None]:
|
|
29
|
+
return {
|
|
30
|
+
"action": self.action,
|
|
31
|
+
"reason": self.reason,
|
|
32
|
+
"previousPassRate": self.previous_pass_rate,
|
|
33
|
+
"newPassRate": self.new_pass_rate,
|
|
34
|
+
"previousCorrectedPassRate": self.previous_corrected_pass_rate,
|
|
35
|
+
"newCorrectedPassRate": self.new_corrected_pass_rate,
|
|
36
|
+
"comparisonBasis": self.comparison_basis,
|
|
37
|
+
"budgetUsed": self.budget_used,
|
|
38
|
+
"budgetLimit": self.budget_limit,
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def determine_comparison_basis(
    previous_corrected: float | None,
    new_corrected: float | None,
) -> Literal["corrected", "raw"]:
    """Choose which pass rates two runs should be compared on.

    Returns ``"corrected"`` only when both runs have a corrected pass rate;
    if either one is ``None`` the result is ``"raw"``.
    """
    if previous_corrected is None or new_corrected is None:
        return "raw"
    return "corrected"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _budget_values(metrics: RunMetrics, budget: NormalizedBudgetConfig) -> tuple[float, float]:
    """Return ``(used, limit)`` as floats for the budget mode in effect.

    In ``"traces"`` mode the pair is the run's result count and ``max_traces``;
    in any other mode it is the run's total cost and ``max_cost_usd``. A limit
    of ``None`` becomes ``0.0``, as does a missing total cost.
    """
    if budget.mode != "traces":
        return float(metrics.total_cost_usd or 0.0), float(budget.max_cost_usd or 0.0)
    return float(metrics.total_results), float(budget.max_traces or 0)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def evaluate_replay_outcome(
    previous_run: dict,
    new_run: dict,
    budget_config: NormalizedBudgetConfig,
) -> ReplayDecision:
    """Decide whether a replayed run should be kept or discarded.

    The new run is discarded with reason ``"budget_exceeded"`` when its budget
    usage is above the limit. Otherwise the two runs are compared on corrected
    pass rates when both have one, and on raw pass rates if not: the new run is
    kept only when its rate is strictly higher. An equal or lower rate is
    discarded with reason ``"pass_rate_declined"``.
    """
    previous_metrics = extract_run_metrics(previous_run)
    new_metrics = extract_run_metrics(new_run)
    budget_used, budget_limit = _budget_values(new_metrics, budget_config)
    comparison_basis = determine_comparison_basis(
        previous_metrics.corrected_pass_rate_ratio,
        new_metrics.corrected_pass_rate_ratio,
    )

    def _decision(action, reason) -> ReplayDecision:
        # All three outcomes report the same figures; only the verdict differs.
        return ReplayDecision(
            action=action,
            reason=reason,
            previous_pass_rate=previous_metrics.pass_rate_ratio,
            new_pass_rate=new_metrics.pass_rate_ratio,
            previous_corrected_pass_rate=previous_metrics.corrected_pass_rate_ratio,
            new_corrected_pass_rate=new_metrics.corrected_pass_rate_ratio,
            comparison_basis=comparison_basis,
            budget_used=budget_used,
            budget_limit=budget_limit,
        )

    # The budget check comes first: an over-budget run is never kept.
    if budget_used > budget_limit:
        return _decision("discard", "budget_exceeded")

    if comparison_basis == "corrected":
        previous_rate = previous_metrics.corrected_pass_rate_ratio
        new_rate = new_metrics.corrected_pass_rate_ratio
    else:
        previous_rate = previous_metrics.pass_rate_ratio
        new_rate = new_metrics.pass_rate_ratio
    # Fall back to the raw rate should a selected rate still be None.
    if previous_rate is None:
        previous_rate = previous_metrics.pass_rate_ratio
    if new_rate is None:
        new_rate = new_metrics.pass_rate_ratio

    if new_rate > previous_rate:
        return _decision("keep", "pass_rate_improved")
    return _decision("discard", "pass_rate_declined")
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Runtime foundation — defineEval DSL, registry, executor."""
|
|
2
|
+
|
|
3
|
+
from evalgate_sdk.runtime.eval import create_result, define_eval, define_suite
|
|
4
|
+
from evalgate_sdk.runtime.executor import create_local_executor, default_local_executor
|
|
5
|
+
from evalgate_sdk.runtime.registry import (
|
|
6
|
+
create_eval_runtime,
|
|
7
|
+
dispose_active_runtime,
|
|
8
|
+
get_active_runtime,
|
|
9
|
+
set_active_runtime,
|
|
10
|
+
with_runtime,
|
|
11
|
+
)
|
|
12
|
+
from evalgate_sdk.runtime.types import (
|
|
13
|
+
EvalContext,
|
|
14
|
+
EvalExecutionError,
|
|
15
|
+
EvalResult,
|
|
16
|
+
EvalRuntimeError,
|
|
17
|
+
EvalSpec,
|
|
18
|
+
ExecutorCapabilities,
|
|
19
|
+
SpecConfig,
|
|
20
|
+
SpecExecutionError,
|
|
21
|
+
SpecOptions,
|
|
22
|
+
SpecRegistrationError,
|
|
23
|
+
)
|
|
24
|
+
from evalgate_sdk.runtime.types import (
|
|
25
|
+
RuntimeError as EvalSDKRuntimeError,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
# Public names of the runtime package, in the original export order.
__all__ = [
    # DSL entry points
    "define_eval", "define_suite", "create_result",
    # Executors
    "create_local_executor", "default_local_executor",
    # Runtime registry
    "create_eval_runtime", "get_active_runtime", "set_active_runtime",
    "dispose_active_runtime", "with_runtime",
    # Types
    "EvalSpec", "EvalContext", "EvalResult",
    "SpecConfig", "SpecOptions", "ExecutorCapabilities",
    # Errors
    "EvalRuntimeError", "SpecRegistrationError", "SpecExecutionError",
    "EvalSDKRuntimeError", "EvalExecutionError",
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Runtime adapters for legacy → DSL migration."""
|