cortexops 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,195 @@
1
+ """CortexOps CLI — cortexops <command> [options]
2
+
3
+ Commands:
4
+ eval run Run an evaluation suite
5
+ eval diff Diff two eval runs
6
+ failures Show recent failures
7
+ traces List recent traces (documented here, but no `traces` subcommand is registered in main())
8
+ version Print SDK version
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import sys
17
+ from pathlib import Path
18
+
19
+
20
def cmd_eval_run(args: argparse.Namespace) -> int:
    """Run an evaluation suite locally.

    CLI form: ``cortexops eval run --dataset golden_v1.yaml --project my-agent``

    Args:
        args: Parsed CLI namespace. Reads ``dataset``, ``project``, ``agent``,
            ``fail_on``, ``output`` and ``quiet``.

    Returns:
        0 when no case failed. 1 when at least one case failed, the
        ``--fail-on`` gate tripped, the dataset file is missing, the agent
        could not be imported, or the ``--fail-on`` expression is invalid.
    """
    # Puts the directory two levels above this file's directory at the front
    # of sys.path before the SDK is imported.
    # NOTE(review): presumably so a source checkout works uninstalled — confirm.
    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

    from cortexops import EvalSuite
    from cortexops.eval import EvalThresholdError

    # NOTE(review): --project is only echoed below; it is never forwarded to
    # EvalSuite.run, so it does not currently override the dataset's project.
    print(f"CortexOps eval\n dataset : {args.dataset}\n project : {args.project or 'from dataset'}")
    if args.fail_on:
        print(f" fail-on : {args.fail_on}")
    print()

    def passthrough_agent(inp: dict) -> dict:
        """Placeholder — replace with your actual agent import."""
        return {"output": f"[no agent bound] input was: {inp}"}

    if args.agent:
        # Loaded separately so an unimportable agent produces a one-line
        # error instead of a traceback.
        try:
            agent = _load_agent(args.agent)
        except (ImportError, AttributeError) as e:
            print(f"Error: could not load agent '{args.agent}': {e}", file=sys.stderr)
            return 1
    else:
        agent = passthrough_agent

    try:
        summary = EvalSuite.run(
            dataset=args.dataset,
            agent=agent,
            verbose=not args.quiet,
            fail_on=args.fail_on,
        )
    except EvalThresholdError as e:
        print(f"\nCI gate FAILED: {e}", file=sys.stderr)
        return 1
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        # Raised for a malformed --fail-on expression or an unknown metric name.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    if args.output:
        payload = json.dumps(summary.model_dump(mode="json"), indent=2)
        Path(args.output).write_text(payload, encoding="utf-8")
        print(f"\nResults written to {args.output}")

    return 0 if summary.failed == 0 else 1
56
+
57
+
58
def cmd_eval_diff(args: argparse.Namespace) -> int:
    """Compare two eval runs via the backend API.

    CLI form: ``cortexops eval diff <run_a> <run_b> --api-key cxo-...``

    Args:
        args: Parsed CLI namespace. Reads ``run_a``, ``run_b``, ``api_key``
            and ``base_url``. The key falls back to ``CORTEXOPS_API_KEY``.

    Returns:
        1 when no API key is available, the request fails, or the diff
        reports regressions; 0 otherwise.
    """
    api_key = args.api_key or os.getenv("CORTEXOPS_API_KEY")
    if not api_key:
        print("Error: --api-key or CORTEXOPS_API_KEY required for diff", file=sys.stderr)
        return 1

    # Imported only once credentials are known, so the missing-key path does
    # not depend on the SDK import succeeding.
    from cortexops import CortexClient

    client = CortexClient(api_key=api_key, base_url=args.base_url)
    try:
        diff = client.diff(args.run_a, args.run_b)
    except Exception as e:  # CLI boundary: report any client failure as one line
        print(f"Error fetching diff: {e}", file=sys.stderr)
        return 1

    # `or` covers keys that are present but null in the response.
    delta_tc = diff.get("task_completion_delta") or 0
    delta_tool = diff.get("tool_accuracy_delta") or 0
    # str() so non-string entries cannot break the join below.
    regressions = [str(item) for item in diff.get("regressions") or []]
    improvements = [str(item) for item in diff.get("improvements") or []]

    def signed(value: float) -> str:
        """Format a fraction as a percentage with an explicit sign."""
        return f"{value:+.1%}"

    print(f"Diff: {args.run_a[:8]} → {args.run_b[:8]}")
    print(f" Task completion : {signed(delta_tc)}")
    # The tool-accuracy delta is divided by 100 before percent formatting.
    # NOTE(review): presumably the API reports it on a 0-100 scale — confirm.
    print(f" Tool accuracy : {signed(delta_tool / 100)}")
    if regressions:
        print(f" Regressions ({len(regressions)}): {', '.join(regressions)}")
    if improvements:
        print(f" Improvements ({len(improvements)}): {', '.join(improvements)}")

    return 1 if regressions else 0
89
+
90
+
91
def cmd_failures(args: argparse.Namespace) -> int:
    """List failed traces for a project.

    CLI form: ``cortexops failures --project my-agent --limit 20``

    Args:
        args: Parsed CLI namespace. Reads ``project``, ``limit``, ``api_key``
            and ``base_url``. The key falls back to ``CORTEXOPS_API_KEY``.

    Returns:
        1 when no API key is available or the request fails; 0 otherwise,
        including when no failures are found.
    """
    api_key = args.api_key or os.getenv("CORTEXOPS_API_KEY")
    if not api_key:
        print("Error: --api-key or CORTEXOPS_API_KEY required", file=sys.stderr)
        return 1

    # Imported only once credentials are known, so the missing-key path does
    # not depend on the SDK import succeeding.
    from cortexops import CortexClient

    client = CortexClient(api_key=api_key, base_url=args.base_url)
    try:
        traces = client.list_traces(project=args.project, limit=args.limit)
    except Exception as e:  # CLI boundary: report any client failure as one line
        print(f"Error: {e}", file=sys.stderr)
        return 1

    failed = [t for t in traces if t.get("status") == "failed"]
    if not failed:
        print(f"No failures found for project '{args.project}'")
        return 0

    print(f"Failures — {args.project} (last {len(traces)} traces)")
    print(f"{'Trace ID':<36} {'Failure kind':<28} Latency")
    print("-" * 78)
    for t in failed[:args.limit]:
        # Tolerate missing or null fields so one odd record cannot abort the table.
        trace_id = str(t.get("trace_id") or "unknown")
        kind = str(t.get("failure_kind") or "unknown")
        latency_ms = t.get("total_latency_ms") or 0
        print(f"{trace_id:<36} {kind:<28} {latency_ms:.0f}ms")
    return 0
121
+
122
+
123
def cmd_version(_: argparse.Namespace) -> int:
    """Print ``cortexops <version>`` for the installed SDK and return 0.

    The parsed namespace is accepted only so this handler has the same
    signature as the other commands; it is not read.
    """
    from cortexops import __version__ as sdk_version

    banner = f"cortexops {sdk_version}"
    print(banner)
    return 0
127
+
128
+
129
def _load_agent(agent_path: str):
    """Load an agent from a ``module:object`` spec such as ``mymodule:my_agent``.

    The part before the last ``:`` is imported as a module; the part after it
    is resolved attribute by attribute, so ``pkg.mod:factory.agent`` works too.

    Args:
        agent_path: Spec in the form ``module:object``.

    Returns:
        The object the spec points at.

    Raises:
        SystemExit: With code 1 when the spec has no ``:`` or when either
            side of it is empty.
        ImportError: When the module cannot be imported.
        AttributeError: When the object does not exist on the module.
    """
    import importlib

    module_path, separator, attr_path = agent_path.rpartition(":")
    # Reject "mod:" and ":attr" here; otherwise they fail later with an
    # unhelpful "Empty module name" or empty-attribute error.
    if not (separator and module_path and attr_path):
        print(f"Error: --agent must be in the format 'module:object', got '{agent_path}'", file=sys.stderr)
        sys.exit(1)

    target = importlib.import_module(module_path)
    for part in attr_path.split("."):
        target = getattr(target, part)
    return target
138
+
139
+
140
def main(argv: list[str] | None = None) -> None:
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the console entry point uses.

    Raises:
        SystemExit: Always. The exit code is the handler's return value, or 0
            after printing help when no runnable command was given.
    """
    parser = argparse.ArgumentParser(
        prog="cortexops",
        description="CortexOps — reliability infrastructure for AI agents",
    )
    sub = parser.add_subparsers(dest="command")

    # ── eval ──────────────────────────────────────────────────────────────
    eval_parser = sub.add_parser("eval", help="Evaluation commands")
    eval_sub = eval_parser.add_subparsers(dest="eval_command")

    run_p = eval_sub.add_parser("run", help="Run an eval suite")
    run_p.add_argument("--dataset", "-d", required=True, help="Path to golden dataset YAML")
    run_p.add_argument("--project", "-p", default=None, help="Project name (overrides dataset)")
    run_p.add_argument("--agent", "-a", default=None, help="Agent to evaluate (module:object)")
    run_p.add_argument("--fail-on", default=None, help="e.g. 'task_completion < 0.90'")
    run_p.add_argument("--output", "-o", default=None, help="Save JSON results to file")
    run_p.add_argument("--quiet", "-q", action="store_true", help="Suppress per-case output")

    diff_p = eval_sub.add_parser("diff", help="Diff two eval runs")
    diff_p.add_argument("run_a", help="First run ID")
    diff_p.add_argument("run_b", help="Second run ID")
    diff_p.add_argument("--api-key", default=None)
    diff_p.add_argument("--base-url", default="https://api.cortexops.ai")

    # ── failures ──────────────────────────────────────────────────────────
    fail_p = sub.add_parser("failures", help="List recent agent failures")
    fail_p.add_argument("--project", "-p", required=True)
    fail_p.add_argument("--limit", "-n", type=int, default=20)
    fail_p.add_argument("--api-key", default=None)
    fail_p.add_argument("--base-url", default="https://api.cortexops.ai")

    # ── version ───────────────────────────────────────────────────────────
    sub.add_parser("version", help="Print version and exit")

    args = parser.parse_args(argv)

    # Keyed by (command, eval sub-command); commands without a second level
    # use None because only the eval parser defines `eval_command`.
    handlers = {
        ("eval", "run"): cmd_eval_run,
        ("eval", "diff"): cmd_eval_diff,
        ("failures", None): cmd_failures,
        ("version", None): cmd_version,
    }

    key = (args.command, getattr(args, "eval_command", None))
    handler = handlers.get(key)

    if handler is None:
        # A bare `cortexops eval` should list the eval subcommands rather
        # than repeat the top-level help.
        help_source = eval_parser if args.command == "eval" else parser
        help_source.print_help()
        sys.exit(0)

    sys.exit(handler(args))


if __name__ == "__main__":
    main()
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+ from urllib.parse import urljoin
5
+
6
+ from .models import EvalSummary, Trace
7
+
8
+
9
class CortexClient:
    """HTTP client for the CortexOps backend API.

    Used by the SDK to push traces and pull eval history.
    Not required for local-only usage.

    Every request sends the API key as a Bearer token and raises the
    underlying httpx error on a non-2xx response (``raise_for_status``).

    Usage:
        client = CortexClient(api_key="cxo-...", base_url="https://api.cortexops.ai")
        client.push_trace(tracer.last_trace())
        history = client.list_runs(project="payments-agent", limit=10)
    """

    DEFAULT_BASE_URL = "https://api.cortexops.ai"

    def __init__(
        self,
        api_key: str,
        base_url: str | None = None,
        timeout: float = 10.0,
    ) -> None:
        """Store credentials and connection settings; no request is made here.

        Args:
            api_key: Token sent in the ``Authorization`` header.
            base_url: API root. Falls back to ``DEFAULT_BASE_URL`` when falsy;
                trailing slashes are stripped.
            timeout: Per-request timeout passed to httpx.
        """
        self.api_key = api_key
        self.base_url = (base_url or self.DEFAULT_BASE_URL).rstrip("/")
        self.timeout = timeout

    def _headers(self) -> dict[str, str]:
        """Return the headers attached to every request."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def _url(self, path: str) -> str:
        """Join *path* onto the base URL, keeping any path prefix the base has."""
        # The trailing "/" on the base and the stripped leading "/" on the
        # path make urljoin append instead of replacing the base's path.
        return urljoin(self.base_url + "/", path.lstrip("/"))

    @staticmethod
    def _segment(value: str) -> str:
        """Percent-encode *value* for use as a single URL path segment."""
        from urllib.parse import quote

        # safe="" also encodes "/", so an ID cannot add extra path segments
        # or start a query string or fragment.
        return quote(str(value), safe="")

    def _get(self, path: str, params: dict | None = None) -> Any:
        """GET *path* and return the decoded JSON body (object or array)."""
        import httpx

        r = httpx.get(
            self._url(path),
            headers=self._headers(),
            params=params,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def _post(self, path: str, data: dict) -> dict:
        """POST *data* as JSON to *path* and return the decoded JSON body."""
        import httpx

        r = httpx.post(
            self._url(path),
            headers=self._headers(),
            json=data,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def push_trace(self, trace: Trace) -> dict:
        """Upload one trace, serialised with ``model_dump(mode="json")``."""
        return self._post("/v1/traces", trace.model_dump(mode="json"))

    def get_trace(self, trace_id: str) -> dict:
        """Fetch a single trace by ID."""
        return self._get(f"/v1/traces/{self._segment(trace_id)}")

    def list_traces(self, project: str, limit: int = 50) -> list[dict]:
        """Request up to *limit* traces for *project*."""
        return self._get("/v1/traces", {"project": project, "limit": limit})

    def push_eval(self, summary: EvalSummary) -> dict:
        """Upload an eval summary, serialised with ``model_dump(mode="json")``."""
        return self._post("/v1/evals", summary.model_dump(mode="json"))

    def list_runs(self, project: str, limit: int = 10) -> list[dict]:
        """Request up to *limit* eval runs for *project*."""
        return self._get("/v1/evals", {"project": project, "limit": limit})

    def run_eval(self, dataset: str, project: str) -> dict:
        """Trigger a server-side eval run (async via Celery)."""
        return self._post("/v1/evals/run", {"dataset": dataset, "project": project})

    def get_eval(self, run_id: str) -> dict:
        """Fetch a single eval run by ID."""
        return self._get(f"/v1/evals/{self._segment(run_id)}")

    def diff(self, run_id_a: str, run_id_b: str) -> dict:
        """Fetch the diff between two eval runs (sent as query params ``a`` and ``b``)."""
        return self._get("/v1/evals/diff", {"a": run_id_a, "b": run_id_b})
@@ -0,0 +1,216 @@
1
+ from __future__ import annotations
2
+
3
+ import statistics
4
+ import time
5
+ from pathlib import Path
6
+ from typing import Any
7
+
8
+ import yaml
9
+
10
+ from .metrics import compute_case_result
11
+ from .models import (
12
+ EvalCase,
13
+ EvalDataset,
14
+ EvalSummary,
15
+ FailureKind,
16
+ RunStatus,
17
+ Trace,
18
+ TraceNode,
19
+ )
20
+
21
+
22
class EvalSuite:
    """Run evaluation suites against any instrumented agent.

    Usage:
        results = EvalSuite.run(
            dataset="golden_v1.yaml",
            agent=your_langgraph_app,  # wrapped or raw callable
        )
        print(results.summary())
    """

    @classmethod
    def run(
        cls,
        dataset: str | Path | dict | EvalDataset,
        agent: Any,
        *,
        metrics: list[str] | None = None,
        verbose: bool = True,
        fail_on: str | None = None,
    ) -> EvalSummary:
        """Run a full eval suite.

        Args:
            dataset: Path to YAML, dict, or EvalDataset object.
            agent: Object with an ``invoke`` method, or any callable; it is
                called with a dict input.
            metrics: Accepted for API compatibility; this method does not
                currently read it.
            verbose: Print case-by-case progress and the final summary.
            fail_on: Threshold expression like "task_completion < 0.90".

        Returns:
            The aggregated EvalSummary for all cases.

        Raises:
            FileNotFoundError: If *dataset* is a path that does not exist.
            ValueError: If the dataset is not a mapping, or *fail_on* is
                malformed or names an unknown metric.
            EvalThresholdError: If the *fail_on* condition is met.
        """
        ds = cls._load_dataset(dataset)
        total = len(ds.cases)
        case_results = []

        for index, case in enumerate(ds.cases, start=1):
            if verbose:
                print(f" [{index}/{total}] {case.id} ... ", end="", flush=True)

            trace = cls._run_case(agent, case)
            result = compute_case_result(case, trace)
            case_results.append(result)

            if verbose:
                status = "pass" if result.passed else "FAIL"
                print(f"{status} ({result.score:.0f})")

        latencies_sorted = sorted(r.latency_ms for r in case_results)
        n = len(case_results)

        summary = EvalSummary(
            project=ds.project,
            dataset_version=ds.version,
            total_cases=n,
            passed=sum(1 for r in case_results if r.passed),
            failed=sum(1 for r in case_results if not r.passed),
            # A "warning" is a failed case that still scored at least 60.
            warnings=sum(1 for r in case_results if not r.passed and r.score >= 60),
            task_completion_rate=sum(1 for r in case_results if r.task_completion) / max(n, 1),
            tool_accuracy=statistics.mean(r.tool_accuracy for r in case_results) if case_results else 0.0,
            latency_p50_ms=cls._percentile(latencies_sorted, 0.50),
            latency_p95_ms=cls._percentile(latencies_sorted, 0.95),
            case_results=case_results,
        )

        if verbose:
            print()
            print(summary.summary())

        if fail_on:
            cls._check_threshold(summary, fail_on)

        return summary

    @staticmethod
    def _percentile(sorted_values: list, fraction: float) -> float:
        """Return the nearest-rank percentile of an ascending list.

        The rank is ``ceil(fraction * n)``, clamped to ``1..n``. Truncating
        with ``int(fraction * n) - 1`` instead under-reports on small samples,
        for example returning the minimum as the p50 of three values.

        Args:
            sorted_values: Values in ascending order.
            fraction: Percentile as a fraction, e.g. 0.95.

        Returns:
            The selected element, or 0.0 for an empty list.
        """
        import math

        n = len(sorted_values)
        if n == 0:
            return 0.0
        rank = min(max(math.ceil(fraction * n), 1), n)
        return sorted_values[rank - 1]

    @classmethod
    def _run_case(cls, agent: Any, case: EvalCase) -> Trace:
        """Invoke *agent* on one case and wrap the outcome in a Trace.

        Any exception raised while calling the agent or building the trace is
        caught and recorded as a FAILED trace; it is not propagated.
        """
        input_data = case.input if isinstance(case.input, dict) else {"input": case.input}
        t0 = time.perf_counter()

        try:
            # Prefer an `invoke` method when present; otherwise call directly.
            if hasattr(agent, "invoke"):
                output = agent.invoke(input_data)
            elif callable(agent):
                output = agent(input_data)
            else:
                raise TypeError(f"Agent {type(agent).__name__} is not callable")
            latency_ms = (time.perf_counter() - t0) * 1000
            output_dict = output if isinstance(output, dict) else {"output": str(output)}

            return Trace(
                project="eval",
                case_id=case.id,
                input=input_data,
                output=output_dict,
                total_latency_ms=latency_ms,
                status=RunStatus.COMPLETED,
                nodes=[
                    TraceNode(
                        node_id="eval_root",
                        node_name="agent",
                        input=input_data,
                        output=output_dict,
                        latency_ms=latency_ms,
                    )
                ],
            )

        except Exception as exc:
            latency_ms = (time.perf_counter() - t0) * 1000
            return Trace(
                project="eval",
                case_id=case.id,
                input=input_data,
                output={},
                total_latency_ms=latency_ms,
                status=RunStatus.FAILED,
                failure_kind=FailureKind.UNKNOWN,
                failure_detail=str(exc),
            )

    @classmethod
    def _load_dataset(cls, dataset: str | Path | dict | EvalDataset) -> EvalDataset:
        """Normalise *dataset* (object, dict, or YAML path) into an EvalDataset.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the YAML document is not a mapping (e.g. empty file).
        """
        if isinstance(dataset, EvalDataset):
            return dataset

        if isinstance(dataset, dict):
            return cls._parse_dataset_dict(dataset)

        path = Path(dataset)
        if not path.exists():
            raise FileNotFoundError(f"Dataset not found: {path}")

        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        # safe_load returns None for an empty file and a list for a top-level
        # sequence; neither has the .get() the parser relies on.
        if not isinstance(raw, dict):
            raise ValueError(
                f"Dataset {path} must contain a YAML mapping, got {type(raw).__name__}"
            )
        return cls._parse_dataset_dict(raw)

    @classmethod
    def _parse_dataset_dict(cls, raw: dict) -> EvalDataset:
        """Build an EvalDataset from a plain dict.

        Each case must provide ``id`` and ``input``; other case fields are
        optional and fall back to the defaults below.
        """
        cases = [
            EvalCase(
                id=str(c["id"]),
                input=c["input"],
                expected_tool_calls=c.get("expected_tool_calls", []),
                expected_output_contains=c.get("expected_output_contains", []),
                expected_output_not_contains=c.get("expected_output_not_contains", []),
                max_latency_ms=c.get("max_latency_ms"),
                judge=c.get("judge", "rule"),
                judge_criteria=c.get("judge_criteria"),
                tags=c.get("tags", []),
            )
            # `or []` also covers an explicit `cases: null`.
            for c in raw.get("cases") or []
        ]
        return EvalDataset(
            version=raw.get("version", 1),
            project=raw.get("project", "unknown"),
            description=raw.get("description", ""),
            cases=cases,
        )

    @classmethod
    def _check_threshold(cls, summary: EvalSummary, fail_on: str) -> None:
        """Parse and evaluate a threshold expression like 'task_completion < 0.90'.

        Supported metrics are ``task_completion``, ``tool_accuracy`` and
        ``pass_rate``; supported operators are ``<``, ``<=``, ``>``, ``>=``.

        Raises:
            ValueError: If the expression is malformed or the metric unknown.
            EvalThresholdError: If the expression evaluates to true.
        """
        import re

        # fullmatch rejects trailing garbage such as "0.9x" or "90%", which an
        # unanchored match would silently truncate.
        m = re.fullmatch(
            r"(\w+)\s*(<=|>=|<|>)\s*(\d+(?:\.\d*)?|\.\d+)",
            fail_on.strip(),
        )
        if not m:
            raise ValueError(f"Invalid fail_on expression: '{fail_on}'")

        metric, op, threshold_str = m.groups()
        threshold = float(threshold_str)

        actual = {
            "task_completion": summary.task_completion_rate,
            # Divided by 100 so it compares against a 0-1 threshold like the
            # other metrics.
            "tool_accuracy": summary.tool_accuracy / 100.0,
            "pass_rate": summary.passed / max(summary.total_cases, 1),
        }.get(metric)

        if actual is None:
            raise ValueError(f"Unknown metric in fail_on: '{metric}'")

        failed = {
            "<": actual < threshold,
            "<=": actual <= threshold,
            ">": actual > threshold,
            ">=": actual >= threshold,
        }.get(op, False)

        if failed:
            raise EvalThresholdError(
                f"Eval gate failed: {metric}={actual:.3f} {op} {threshold} "
                f"(project={summary.project})"
            )


class EvalThresholdError(Exception):
    """Raised when an eval run fails a CI threshold gate."""
@@ -0,0 +1,155 @@
1
+ """LLM-as-judge metric for CortexOps.
2
+
3
+ Uses an LLM (default: gpt-4o-mini) to score open-ended agent output
4
+ against natural language criteria. Works with any OpenAI-compatible API.
5
+
6
+ Usage in golden dataset:
7
+ - id: refund_explanation_01
8
+ input: "Why was my refund rejected?"
9
+ judge: llm
10
+ judge_criteria: >
11
+ The response should explain the rejection reason clearly,
12
+ be empathetic, and offer a next step to the customer.
13
+ It must NOT contain jargon or mention internal system errors.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import os
20
+ from typing import Any
21
+
22
+ from .metrics import Metric
23
+ from .models import EvalCase, FailureKind, Trace
24
+
25
+
26
# System message sent to the judge model on every call. The JSON keys it
# requests ("score", "passed", "reasoning") are the ones LLMJudgeMetric.score
# reads from the reply, and the 70-point pass mark matches the default that
# score() applies when the reply omits "passed".
JUDGE_SYSTEM_PROMPT = """You are a strict but fair evaluator of AI agent outputs.
You will be given:
- The user's input to the agent
- The agent's output
- Evaluation criteria

Score the output from 0 to 100 and explain your reasoning briefly.

Respond ONLY with valid JSON in this exact format:
{
"score": <integer 0-100>,
"passed": <true|false>,
"reasoning": "<one sentence>"
}

Rules:
- 90-100: Fully meets all criteria, no issues
- 70-89: Mostly meets criteria, minor gaps
- 50-69: Partially meets criteria, notable gaps
- 0-49: Fails to meet criteria or contains harmful/incorrect content
- passed = true only if score >= 70
"""
48
+
49
+
50
class LLMJudgeMetric(Metric):
    """Score agent output using an LLM judge.

    Falls back to a heuristic score if the LLM API is unavailable,
    so evals never block on API failures.

    Args:
        model: OpenAI model to use. Default: gpt-4o-mini.
        api_key: API key. Falls back to the OPENAI_API_KEY env var, then
            CORTEXOPS_JUDGE_API_KEY.
        base_url: OpenAI-compatible base URL. Useful for local LLMs.
        temperature: Judge temperature. Keep low (0.1) for consistency.
        timeout: HTTP timeout in seconds.
    """

    name = "llm_judge"

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        api_key: str | None = None,
        base_url: str = "https://api.openai.com/v1",
        temperature: float = 0.1,
        timeout: float = 30.0,
    ) -> None:
        self.model = model
        self.api_key = api_key or os.getenv("OPENAI_API_KEY") or os.getenv("CORTEXOPS_JUDGE_API_KEY")
        self.base_url = base_url.rstrip("/")
        self.temperature = temperature
        self.timeout = timeout

    @staticmethod
    def _output_text(trace: Trace) -> str:
        """Return the text to judge: the "output" value, else the whole output dict."""
        return str(trace.output.get("output", trace.output))

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Score one case with the LLM judge.

        Cases without ``judge_criteria``, or whose ``judge`` is not "llm",
        are not judged and receive a neutral 100.

        Returns:
            ``(score, failure_kind, detail)``. ``failure_kind`` is None when
            the case passed. Any error while calling or parsing the judge
            switches to the keyword heuristic instead of raising.
        """
        if not case.judge_criteria or case.judge != "llm":
            return 100.0, None, None

        user_input = str(case.input)
        agent_output = self._output_text(trace)

        try:
            result = self._call_judge(user_input, agent_output, case.judge_criteria)
            # Clamp so an out-of-range reply cannot leave the 0-100 scale.
            score = min(100.0, max(0.0, float(result.get("score", 0))))
            passed = result.get("passed", score >= 70)
            reasoning = result.get("reasoning", "")

            if not passed:
                return score, FailureKind.OUTPUT_FORMAT, f"LLM judge: {reasoning}"
            return score, None, None

        except Exception as exc:  # deliberate best-effort: never block an eval
            return self._heuristic_fallback(case, trace, str(exc))

    def _call_judge(self, user_input: str, agent_output: str, criteria: str) -> dict[str, Any]:
        """Call the chat-completions endpoint and return the parsed JSON verdict.

        Raises:
            ValueError: If no API key is configured.
            Exception: HTTP errors from httpx, or JSON/shape errors when the
                reply is not in the expected format.
        """
        # Checked before importing httpx so a missing key is reported as such
        # even where httpx is not installed.
        if not self.api_key:
            raise ValueError(
                "No API key found for LLM judge. Set OPENAI_API_KEY or pass api_key= to LLMJudgeMetric()."
            )

        import httpx

        user_message = (
            f"USER INPUT:\n{user_input}\n\n"
            f"AGENT OUTPUT:\n{agent_output}\n\n"
            f"EVALUATION CRITERIA:\n{criteria}"
        )

        response = httpx.post(
            f"{self.base_url}/chat/completions",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "messages": [
                    {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
                    {"role": "user", "content": user_message},
                ],
                "temperature": self.temperature,
                "response_format": {"type": "json_object"},
            },
            timeout=self.timeout,
        )
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"]
        return json.loads(content)

    def _heuristic_fallback(
        self, case: EvalCase, trace: Trace, error: str
    ) -> tuple[float, FailureKind | None, str | None]:
        """Simple keyword fallback when the LLM is unavailable.

        Scores 50 plus up to 50 points for the share of distinct criteria
        words (longer than four characters) that appear in the output.
        """
        import re

        output = self._output_text(trace).lower()
        # Tokenise on alphanumerics so trailing punctuation ("clearly,") does
        # not stop a word from matching; de-duplicate so repeats are not
        # counted twice.
        criteria_words = re.findall(r"[a-z0-9]+", (case.judge_criteria or "").lower())
        meaningful_words = list(dict.fromkeys(w for w in criteria_words if len(w) > 4))

        if not meaningful_words:
            return 70.0, None, f"LLM judge unavailable ({error[:60]}); heuristic used"

        hits = sum(1 for w in meaningful_words if w in output)
        ratio = hits / len(meaningful_words)
        score = 50.0 + 50.0 * ratio

        if score < 70:
            # Keep the cause in the message: this is the branch users debug.
            return (
                score,
                FailureKind.OUTPUT_FORMAT,
                f"LLM judge unavailable ({error[:60]}); heuristic score {score:.0f}",
            )
        return score, None, f"LLM judge unavailable ({error[:60]}); heuristic used"