cortexops 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortexops/LICENSE +21 -0
- cortexops/README.md +106 -0
- cortexops/__init__.py +58 -0
- cortexops/cli.py +195 -0
- cortexops/client.py +84 -0
- cortexops/cortexops/__init__.py +58 -0
- cortexops/cortexops/cli.py +195 -0
- cortexops/cortexops/client.py +84 -0
- cortexops/cortexops/eval.py +216 -0
- cortexops/cortexops/judge.py +155 -0
- cortexops/cortexops/metrics.py +184 -0
- cortexops/cortexops/models.py +141 -0
- cortexops/cortexops/tracer.py +210 -0
- cortexops/eval.py +216 -0
- cortexops/judge.py +155 -0
- cortexops/metrics.py +184 -0
- cortexops/models.py +141 -0
- cortexops/pyproject.toml +87 -0
- cortexops/tests/__init__.py +0 -0
- cortexops/tests/test_cortexops.py +211 -0
- cortexops/tests/test_enhancements.py +222 -0
- cortexops/tracer.py +210 -0
- cortexops-0.1.0.dist-info/METADATA +169 -0
- cortexops-0.1.0.dist-info/RECORD +27 -0
- cortexops-0.1.0.dist-info/WHEEL +4 -0
- cortexops-0.1.0.dist-info/entry_points.txt +2 -0
- cortexops-0.1.0.dist-info/licenses/LICENSE +21 -0
cortexops/eval.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import statistics
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
from .metrics import compute_case_result
|
|
11
|
+
from .models import (
|
|
12
|
+
EvalCase,
|
|
13
|
+
EvalDataset,
|
|
14
|
+
EvalSummary,
|
|
15
|
+
FailureKind,
|
|
16
|
+
RunStatus,
|
|
17
|
+
Trace,
|
|
18
|
+
TraceNode,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class EvalSuite:
    """Run evaluation suites against any instrumented agent.

    Usage:
        results = EvalSuite.run(
            dataset="golden_v1.yaml",
            agent=your_langgraph_app,  # wrapped or raw callable
        )
        print(results.summary())
    """

    @classmethod
    def run(
        cls,
        dataset: str | Path | dict | EvalDataset,
        agent: Any,
        *,
        metrics: list[str] | None = None,
        verbose: bool = True,
        fail_on: str | None = None,
    ) -> EvalSummary:
        """Run a full eval suite.

        Args:
            dataset: Path to YAML, dict, or EvalDataset object.
            agent: Object exposing ``invoke(input)`` or any callable that
                accepts the case input as a dict.
            metrics: Optional subset of metrics to run. Accepted for API
                compatibility; this method does not currently read it.
            verbose: Print case-by-case progress and the final summary.
            fail_on: Threshold expression like "task_completion < 0.90".

        Returns:
            The aggregated EvalSummary for the run.

        Raises:
            EvalThresholdError: If ``fail_on`` is given and its condition holds.
            ValueError: If ``fail_on`` is malformed or names an unknown
                metric, or if the dataset file is not a YAML mapping.
            FileNotFoundError: If ``dataset`` is a path that does not exist.
        """
        ds = cls._load_dataset(dataset)
        total = len(ds.cases)
        case_results = []

        for position, case in enumerate(ds.cases, start=1):
            if verbose:
                print(f" [{position}/{total}] {case.id} ... ", end="", flush=True)

            trace = cls._run_case(agent, case)
            result = compute_case_result(case, trace)
            case_results.append(result)

            if verbose:
                status = "pass" if result.passed else "FAIL"
                print(f"{status} ({result.score:.0f})")

        latencies_sorted = sorted(r.latency_ms for r in case_results)
        n = len(latencies_sorted)

        summary = EvalSummary(
            project=ds.project,
            dataset_version=ds.version,
            total_cases=len(case_results),
            passed=sum(1 for r in case_results if r.passed),
            failed=sum(1 for r in case_results if not r.passed),
            # "Warnings" are failed cases that still scored at least 60.
            warnings=sum(1 for r in case_results if not r.passed and r.score >= 60),
            task_completion_rate=sum(1 for r in case_results if r.task_completion) / max(n, 1),
            tool_accuracy=statistics.mean(r.tool_accuracy for r in case_results) if case_results else 0.0,
            latency_p50_ms=cls._percentile(latencies_sorted, 0.50),
            latency_p95_ms=cls._percentile(latencies_sorted, 0.95),
            case_results=case_results,
        )

        if verbose:
            print()
            print(summary.summary())

        if fail_on:
            cls._check_threshold(summary, fail_on)

        return summary

    @staticmethod
    def _percentile(sorted_values: list[float], fraction: float) -> float:
        """Return the nearest-rank percentile of an ascending list.

        Uses rank ``ceil(n * fraction)`` (1-based), clamped to the list
        bounds, so p95 of two samples is the larger one and a single sample
        is its own percentile. Returns 0.0 for an empty list.
        """
        import math

        if not sorted_values:
            return 0.0
        count = len(sorted_values)
        # round() guards against float noise such as 19.000000000000004.
        rank = math.ceil(round(count * fraction, 9))
        index = min(max(rank, 1), count) - 1
        return sorted_values[index]

    @classmethod
    def _run_case(cls, agent: Any, case: EvalCase) -> Trace:
        """Invoke the agent on one case and wrap the outcome in a Trace.

        Any exception raised while invoking the agent or building the trace
        is captured as a FAILED trace instead of propagating, so one bad
        case does not abort the whole suite.
        """
        # Agents always receive a dict; bare inputs are wrapped under "input".
        input_data = case.input if isinstance(case.input, dict) else {"input": case.input}
        t0 = time.perf_counter()

        try:
            if hasattr(agent, "invoke"):
                output = agent.invoke(input_data)
            elif callable(agent):
                output = agent(input_data)
            else:
                raise TypeError(f"Agent {type(agent).__name__} is not callable")
            latency_ms = (time.perf_counter() - t0) * 1000
            output_dict = output if isinstance(output, dict) else {"output": str(output)}

            return Trace(
                project="eval",
                case_id=case.id,
                input=input_data,
                output=output_dict,
                total_latency_ms=latency_ms,
                status=RunStatus.COMPLETED,
                nodes=[
                    TraceNode(
                        node_id="eval_root",
                        node_name="agent",
                        input=input_data,
                        output=output_dict,
                        latency_ms=latency_ms,
                    )
                ],
            )

        except Exception as exc:
            latency_ms = (time.perf_counter() - t0) * 1000
            return Trace(
                project="eval",
                case_id=case.id,
                input=input_data,
                output={},
                total_latency_ms=latency_ms,
                status=RunStatus.FAILED,
                failure_kind=FailureKind.UNKNOWN,
                failure_detail=str(exc),
            )

    @classmethod
    def _load_dataset(cls, dataset: str | Path | dict | EvalDataset) -> EvalDataset:
        """Normalize any accepted dataset form to an EvalDataset.

        Raises:
            FileNotFoundError: If a path is given and does not exist.
            ValueError: If the YAML document is not a mapping (for example,
                an empty file).
        """
        if isinstance(dataset, EvalDataset):
            return dataset

        if isinstance(dataset, dict):
            return cls._parse_dataset_dict(dataset)

        path = Path(dataset)
        if not path.exists():
            raise FileNotFoundError(f"Dataset not found: {path}")

        raw = yaml.safe_load(path.read_text(encoding="utf-8"))
        if not isinstance(raw, dict):
            raise ValueError(
                f"Dataset {path} must contain a YAML mapping at the top level, "
                f"got {type(raw).__name__}"
            )
        return cls._parse_dataset_dict(raw)

    @classmethod
    def _parse_dataset_dict(cls, raw: dict) -> EvalDataset:
        """Build an EvalDataset from a plain dict (e.g. parsed YAML).

        Each case must provide "id" and "input"; every other key falls back
        to a default.
        """
        cases = [
            EvalCase(
                id=str(c["id"]),
                input=c["input"],
                expected_tool_calls=c.get("expected_tool_calls", []),
                expected_output_contains=c.get("expected_output_contains", []),
                expected_output_not_contains=c.get("expected_output_not_contains", []),
                max_latency_ms=c.get("max_latency_ms"),
                judge=c.get("judge", "rule"),
                judge_criteria=c.get("judge_criteria"),
                tags=c.get("tags", []),
            )
            # "cases:" with no value parses to None; treat it as no cases.
            for c in (raw.get("cases") or [])
        ]
        return EvalDataset(
            version=raw.get("version", 1),
            project=raw.get("project", "unknown"),
            description=raw.get("description", ""),
            cases=cases,
        )

    @classmethod
    def _check_threshold(cls, summary: EvalSummary, fail_on: str) -> None:
        """Parse and evaluate a threshold expression like 'task_completion < 0.90'.

        Supported metrics are ``task_completion``, ``tool_accuracy`` (scaled
        to 0-1) and ``pass_rate``; supported operators are <, <=, > and >=.
        The whole expression must match: trailing text is rejected rather
        than silently ignored.

        Raises:
            ValueError: If the expression is malformed or the metric unknown.
            EvalThresholdError: If the condition evaluates to true.
        """
        import operator
        import re

        m = re.fullmatch(r"(\w+)\s*([<>]=?)\s*(\d+(?:\.\d*)?|\.\d+)", fail_on.strip())
        if not m:
            raise ValueError(f"Invalid fail_on expression: '{fail_on}'")

        metric, op, threshold_str = m.groups()
        threshold = float(threshold_str)

        actual = {
            "task_completion": summary.task_completion_rate,
            "tool_accuracy": summary.tool_accuracy / 100.0,
            "pass_rate": summary.passed / max(summary.total_cases, 1),
        }.get(metric)

        if actual is None:
            raise ValueError(f"Unknown metric in fail_on: '{metric}'")

        compare = {
            "<": operator.lt,
            "<=": operator.le,
            ">": operator.gt,
            ">=": operator.ge,
        }[op]

        if compare(actual, threshold):
            raise EvalThresholdError(
                f"Eval gate failed: {metric}={actual:.3f} {op} {threshold} "
                f"(project={summary.project})"
            )
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class EvalThresholdError(Exception):
    """Signal that an eval run tripped a ``fail_on`` CI threshold gate.

    Raised by ``EvalSuite._check_threshold`` when the parsed condition
    evaluates to true for the run's summary.
    """
|
cortexops/judge.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""LLM-as-judge metric for CortexOps.
|
|
2
|
+
|
|
3
|
+
Uses an LLM (default: gpt-4o-mini) to score open-ended agent output
|
|
4
|
+
against natural language criteria. Works with any OpenAI-compatible API.
|
|
5
|
+
|
|
6
|
+
Usage in golden dataset:
|
|
7
|
+
- id: refund_explanation_01
|
|
8
|
+
input: "Why was my refund rejected?"
|
|
9
|
+
judge: llm
|
|
10
|
+
judge_criteria: >
|
|
11
|
+
The response should explain the rejection reason clearly,
|
|
12
|
+
be empathetic, and offer a next step to the customer.
|
|
13
|
+
It must NOT contain jargon or mention internal system errors.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from .metrics import Metric
|
|
23
|
+
from .models import EvalCase, FailureKind, Trace
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
JUDGE_SYSTEM_PROMPT = """You are a strict but fair evaluator of AI agent outputs.
|
|
27
|
+
You will be given:
|
|
28
|
+
- The user's input to the agent
|
|
29
|
+
- The agent's output
|
|
30
|
+
- Evaluation criteria
|
|
31
|
+
|
|
32
|
+
Score the output from 0 to 100 and explain your reasoning briefly.
|
|
33
|
+
|
|
34
|
+
Respond ONLY with valid JSON in this exact format:
|
|
35
|
+
{
|
|
36
|
+
"score": <integer 0-100>,
|
|
37
|
+
"passed": <true|false>,
|
|
38
|
+
"reasoning": "<one sentence>"
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
Rules:
|
|
42
|
+
- 90-100: Fully meets all criteria, no issues
|
|
43
|
+
- 70-89: Mostly meets criteria, minor gaps
|
|
44
|
+
- 50-69: Partially meets criteria, notable gaps
|
|
45
|
+
- 0-49: Fails to meet criteria or contains harmful/incorrect content
|
|
46
|
+
- passed = true only if score >= 70
|
|
47
|
+
"""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class LLMJudgeMetric(Metric):
    """Score agent output using an LLM judge.

    Falls back to a keyword heuristic if the judge call fails for any
    reason (missing key, HTTP error, unparsable reply), so evals never
    block on API failures.

    Args:
        model: OpenAI model to use. Default: gpt-4o-mini.
        api_key: API key. Falls back to the OPENAI_API_KEY env var, then
            CORTEXOPS_JUDGE_API_KEY.
        base_url: OpenAI-compatible base URL. Useful for local LLMs.
        temperature: Judge temperature. Keep low (0.1) for consistency.
        timeout: HTTP timeout in seconds.
    """

    name = "llm_judge"

    def __init__(
        self,
        model: str = "gpt-4o-mini",
        api_key: str | None = None,
        base_url: str = "https://api.openai.com/v1",
        temperature: float = 0.1,
        timeout: float = 30.0,
    ) -> None:
        self.model = model
        self.api_key = api_key or os.getenv("OPENAI_API_KEY") or os.getenv("CORTEXOPS_JUDGE_API_KEY")
        # Normalize so the endpoint path can be appended with a single "/".
        self.base_url = base_url.rstrip("/")
        self.temperature = temperature
        self.timeout = timeout

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return (score 0-100, failure_kind or None, failure_detail or None).

        Cases without criteria, or not marked ``judge == "llm"``, score 100
        without calling the judge.
        """
        if not case.judge_criteria:
            return 100.0, None, None

        if case.judge != "llm":
            return 100.0, None, None

        user_input = str(case.input)
        # Prefer the "output" key; otherwise judge the whole output dict.
        agent_output = str(trace.output.get("output", trace.output))

        try:
            result = self._call_judge(user_input, agent_output, case.judge_criteria)
            # Clamp to the 0-100 range the Metric contract promises, in case
            # the judge replies with an out-of-range number.
            score = max(0.0, min(100.0, float(result.get("score", 0))))
            passed = result.get("passed", score >= 70)
            reasoning = result.get("reasoning", "")
        except Exception as exc:
            return self._heuristic_fallback(case, trace, str(exc))

        if not passed:
            return score, FailureKind.OUTPUT_FORMAT, f"LLM judge: {reasoning}"
        return score, None, None

    def _call_judge(self, user_input: str, agent_output: str, criteria: str) -> dict[str, Any]:
        """POST one chat-completions request and return the parsed JSON verdict.

        Raises:
            ValueError: If no API key is configured.
            Exception: Any HTTP or JSON decoding error from the request;
                ``score`` converts these into the heuristic fallback.
        """
        import httpx

        if not self.api_key:
            raise ValueError(
                "No API key found for LLM judge. Set OPENAI_API_KEY or pass api_key= to LLMJudgeMetric()."
            )

        user_message = (
            f"USER INPUT:\n{user_input}\n\n"
            f"AGENT OUTPUT:\n{agent_output}\n\n"
            f"EVALUATION CRITERIA:\n{criteria}"
        )

        response = httpx.post(
            f"{self.base_url}/chat/completions",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "messages": [
                    {"role": "system", "content": JUDGE_SYSTEM_PROMPT},
                    {"role": "user", "content": user_message},
                ],
                "temperature": self.temperature,
                "response_format": {"type": "json_object"},
            },
            timeout=self.timeout,
        )
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"]
        return json.loads(content)

    def _heuristic_fallback(
        self, case: EvalCase, trace: Trace, error: str
    ) -> tuple[float, FailureKind | None, str | None]:
        """Simple keyword fallback when the LLM is unavailable.

        Scores 50 plus 50 times the share of criteria words (longer than
        four characters) that appear in the lower-cased output.
        """
        import string

        output = str(trace.output.get("output", "")).lower()
        # Strip surrounding punctuation so "clearly," can match "clearly".
        criteria_words = [
            w.strip(string.punctuation) for w in (case.judge_criteria or "").lower().split()
        ]
        meaningful_words = [w for w in criteria_words if len(w) > 4]

        if not meaningful_words:
            return 70.0, None, f"LLM judge unavailable ({error[:60]}); heuristic used"

        hits = sum(1 for w in meaningful_words if w in output)
        ratio = hits / len(meaningful_words)
        score = 50.0 + 50.0 * ratio

        if score < 70:
            return score, FailureKind.OUTPUT_FORMAT, f"LLM judge unavailable; heuristic score {score:.0f}"
        return score, None, f"LLM judge unavailable ({error[:60]}); heuristic used"
|
cortexops/metrics.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from .models import CaseResult, EvalCase, FailureKind, Trace
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Metric(ABC):
    """Abstract base for every CortexOps eval metric.

    To add a custom metric, derive from this class, set ``name`` and
    provide ``score()``.
    """

    # Identifier for the metric; subclasses override it.
    name: str = "base"

    @abstractmethod
    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return (score 0-100, failure_kind or None, failure_detail or None)."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class TaskCompletionMetric(Metric):
    """Checks whether the agent produced a non-empty, non-error output.

    Also enforces the case's required and prohibited output keywords
    (case-insensitive substring checks).
    """

    name = "task_completion"

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return (score 0-100, failure_kind or None, failure_detail or None)."""
        payload = trace.output or {}
        # First of "output", "result", "answer" that is present wins.
        answer = payload.get("output", payload.get("result", payload.get("answer", "")))
        output_str = str(answer)
        lowered = output_str.lower()

        if not output_str or lowered in ("none", "null", ""):
            return 0.0, FailureKind.UNKNOWN, "Agent produced no output"

        error_patterns = [r"error:", r"exception:", r"traceback", r"failed to"]
        if any(re.search(pat, output_str, re.IGNORECASE) for pat in error_patterns):
            return 20.0, FailureKind.UNKNOWN, f"Output contains error signal: {output_str[:100]}"

        required = case.expected_output_contains
        if required:
            missing = [kw for kw in required if kw.lower() not in lowered]
            if missing:
                # Partial credit: 50 plus 50 times the share of keywords found.
                ratio = (len(required) - len(missing)) / len(required)
                return (
                    50.0 + 50.0 * ratio,
                    FailureKind.OUTPUT_FORMAT,
                    f"Missing expected content: {missing}",
                )

        prohibited = case.expected_output_not_contains
        if prohibited:
            violations = [kw for kw in prohibited if kw.lower() in lowered]
            if violations:
                return (
                    30.0,
                    FailureKind.HALLUCINATION,
                    f"Output contains prohibited content: {violations}",
                )

        return 100.0, None, None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class ToolAccuracyMetric(Metric):
    """Checks whether expected tool calls were actually made.

    Looks in two places (in priority order):
    1. Trace node tool_calls (full instrumentation via CortexTracer.record_tool_call)
    2. output['tool_calls_made'] list (lightweight self-reporting from the agent)
    """

    name = "tool_accuracy"

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return the percentage of expected tool names that were observed."""
        if not case.expected_tool_calls:
            return 100.0, None, None

        # Priority 1: tool calls recorded on the trace nodes.
        observed = {call.name for call in trace.tool_calls()}

        # Priority 2: names the agent self-reported in its output dict.
        if not observed:
            self_reported = trace.output.get("tool_calls_made", [])
            if isinstance(self_reported, list):
                observed = set(self_reported)

        wanted = set(case.expected_tool_calls)
        absent = wanted - observed
        if not absent:
            return 100.0, None, None

        found_ratio = len(wanted - absent) / len(wanted)
        return (
            round(found_ratio * 100, 1),
            FailureKind.TOOL_CALL_MISMATCH,
            f"Missing tool calls: {sorted(absent)}",
        )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class LatencyMetric(Metric):
    """Checks whether the agent responded within the required latency budget."""

    name = "latency"

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return (score 0-100, failure_kind or None, failure_detail or None).

        Cases without a budget, or runs within budget, score 100. Over
        budget, the score drops linearly with the relative overage and
        reaches 0 at twice the budget. A zero or negative budget that is
        exceeded scores 0, because the relative overage is undefined there.
        """
        budget = case.max_latency_ms
        if budget is None:
            return 100.0, None, None

        elapsed = trace.total_latency_ms
        if elapsed <= budget:
            return 100.0, None, None

        if budget <= 0:
            # Avoid dividing by zero (or by a negative budget, which would
            # push the score above 100).
            penalized = 0.0
        else:
            overage = elapsed - budget
            penalized = max(0.0, 100.0 - (overage / budget) * 100)

        return (
            penalized,
            FailureKind.TIMEOUT,
            f"Latency {elapsed:.0f}ms exceeded budget {budget:.0f}ms",
        )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class HallucinationMetric(Metric):
    """Detects common hallucination signals in agent output.

    Runs a fixed list of case-insensitive regexes over the string form of
    the whole output dict; the first pattern that matches fails the case.
    """

    name = "hallucination"

    HALLUCINATION_PATTERNS = [
        r"\bas of (january|february|march|april|may|june|july|august|september|october|november|december) 20[0-9]{2}\b",
        r"\bi (don't|do not) have (access|information|data)\b",
        r"\bi cannot (access|retrieve|look up)\b",
    ]

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return 40 with a HALLUCINATION failure on the first match, else 100."""
        rendered = str(trace.output)
        matched = next(
            (pat for pat in self.HALLUCINATION_PATTERNS if re.search(pat, rendered, re.IGNORECASE)),
            None,
        )
        if matched is None:
            return 100.0, None, None
        return (
            40.0,
            FailureKind.HALLUCINATION,
            f"Hallucination signal detected: pattern '{matched}'",
        )
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def compute_case_result(case: EvalCase, trace: Trace, extra_metrics: "list[Metric] | None" = None) -> CaseResult:
    """Score one case's trace with every applicable metric.

    The four built-in metrics always run, in the order task completion,
    tool accuracy, latency, hallucination. The LLM judge is appended when
    ``case.judge == "llm"``, followed by any ``extra_metrics``.

    The final score is the plain mean of all metric scores, and the case
    passes at 80 or above. ``failure_kind`` and ``failure_detail`` come
    from the first metric that scored below 100 while no failure kind had
    been recorded yet.
    """
    metrics: list[Metric] = [
        TaskCompletionMetric(),
        ToolAccuracyMetric(),
        LatencyMetric(),
        HallucinationMetric(),
    ]

    if case.judge == "llm":
        # Imported lazily, only when a case actually asks for the LLM judge.
        from .judge import LLMJudgeMetric

        metrics.append(LLMJudgeMetric())

    if extra_metrics:
        metrics.extend(extra_metrics)

    scores: list[float] = []
    failure_kind: FailureKind | None = None
    failure_detail: str | None = None

    for metric in metrics:
        s, fk, fd = metric.score(case, trace)
        scores.append(s)
        if s < 100.0 and failure_kind is None:
            failure_kind = fk
            failure_detail = fd

    final_score = sum(scores) / len(scores)
    # The first three entries are always the task, tool and latency metrics
    # built above, so reuse their scores instead of evaluating them again.
    task_ok_score, tool_score, lat_score = scores[:3]

    return CaseResult(
        case_id=case.id,
        passed=final_score >= 80.0,
        score=round(final_score, 1),
        task_completion=task_ok_score >= 80.0,
        tool_accuracy=round(tool_score, 1),
        latency_ms=trace.total_latency_ms,
        latency_ok=lat_score >= 80.0,
        failure_kind=failure_kind,
        failure_detail=failure_detail,
        trace=trace,
    )
|
cortexops/models.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import uuid
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RunStatus(str, Enum):
    """Lifecycle state of a traced run; members are also plain strings."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class ToolCallStatus(str, Enum):
    """Outcome of a single tool call; members are also plain strings."""

    SUCCESS = "success"
    ERROR = "error"
    TIMEOUT = "timeout"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class FailureKind(str, Enum):
    """Category attached to a failed case or trace; members are also strings."""

    TOOL_CALL_MISMATCH = "tool_call_mismatch"
    HALLUCINATION = "hallucination"
    PLAN_DEVIATION = "plan_deviation"
    TIMEOUT = "timeout"
    CONTEXT_OVERFLOW = "context_overflow"
    OUTPUT_FORMAT = "output_format"
    UNKNOWN = "unknown"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ToolCall(BaseModel):
    # One tool invocation, stored in TraceNode.tool_calls.

    # Tool name; ToolAccuracyMetric compares it with expected_tool_calls.
    name: str
    # Arguments of the call; each instance gets its own empty dict.
    args: dict[str, Any] = Field(default_factory=dict)
    result: Any = None
    status: ToolCallStatus = ToolCallStatus.SUCCESS
    latency_ms: float = 0.0
    error: str | None = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TraceNode(BaseModel):
    # One step of a trace, stored in Trace.nodes.

    node_id: str
    node_name: str
    input: dict[str, Any] = Field(default_factory=dict)
    output: dict[str, Any] = Field(default_factory=dict)
    # Tool calls made in this node; Trace.tool_calls() flattens them.
    tool_calls: list[ToolCall] = Field(default_factory=list)
    llm_prompt: str | None = None
    llm_response: str | None = None
    latency_ms: float = 0.0
    # Naive UTC timestamp taken when the node object is created.
    timestamp: datetime = Field(default_factory=datetime.utcnow)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Trace(BaseModel):
    # Record of one agent run: input, output, nodes, timing and status.

    # Fresh UUID4 string per instance.
    trace_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    project: str
    # Fresh UUID4 string per instance, generated independently of trace_id.
    run_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    case_id: str | None = None
    nodes: list[TraceNode] = Field(default_factory=list)
    input: dict[str, Any] = Field(default_factory=dict)
    output: dict[str, Any] = Field(default_factory=dict)
    total_latency_ms: float = 0.0
    status: RunStatus = RunStatus.COMPLETED
    failure_kind: FailureKind | None = None
    failure_detail: str | None = None
    # Naive UTC timestamp taken when the trace object is created.
    timestamp: datetime = Field(default_factory=datetime.utcnow)

    def tool_calls(self) -> list[ToolCall]:
        """Return every tool call from every node, in node order."""
        collected: list[ToolCall] = []
        for node in self.nodes:
            collected.extend(node.tool_calls)
        return collected

    def total_tool_calls(self) -> int:
        """Return how many tool calls the trace holds across all nodes."""
        calls = self.tool_calls()
        return len(calls)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class EvalCase(BaseModel):
    # One evaluation case: the agent input plus the expectations checked on it.

    id: str
    # A bare string is wrapped as {"input": ...} by EvalSuite._run_case.
    input: str | dict[str, Any]
    expected_tool_calls: list[str] = Field(default_factory=list)
    # Keywords that must / must not appear in the output (case-insensitive).
    expected_output_contains: list[str] = Field(default_factory=list)
    expected_output_not_contains: list[str] = Field(default_factory=list)
    # None means no latency budget is enforced.
    max_latency_ms: float | None = None
    judge: str = "rule"  # "rule" | "llm"
    judge_criteria: str | None = None
    tags: list[str] = Field(default_factory=list)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class EvalDataset(BaseModel):
    # A named, versioned collection of eval cases.

    version: int = 1
    project: str
    description: str = ""
    # Cases in the order EvalSuite.run executes them.
    cases: list[EvalCase] = Field(default_factory=list)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class CaseResult(BaseModel):
    # Scored outcome of one eval case, as built by compute_case_result.

    case_id: str
    passed: bool
    score: float  # 0.0 - 100.0
    task_completion: bool
    tool_accuracy: float
    latency_ms: float
    latency_ok: bool
    failure_kind: FailureKind | None = None
    failure_detail: str | None = None
    # The trace that was scored, when the producer attaches it.
    trace: Trace | None = None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class EvalSummary(BaseModel):
    # Aggregate result of one eval run, as built by EvalSuite.run.

    # Fresh UUID4 string per summary.
    run_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    project: str
    dataset_version: int
    total_cases: int
    passed: int
    failed: int
    warnings: int
    # Fraction in 0-1; summary() renders it as a percentage.
    task_completion_rate: float
    # On a 0-100 scale; summary() renders it as "/100".
    tool_accuracy: float
    latency_p50_ms: float
    latency_p95_ms: float
    regressions: int = 0
    baseline_run_id: str | None = None
    case_results: list[CaseResult] = Field(default_factory=list)
    # Naive UTC timestamp taken when the summary object is created.
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12; moving
    # to an aware timestamp would change serialized output, so confirm with
    # consumers before switching.
    timestamp: datetime = Field(default_factory=datetime.utcnow)

    def summary(self) -> str:
        """Return a multi-line, human-readable report of the run.

        Lists the headline numbers, the regression count when non-zero, and
        one line per failed case with its failure kind and score.
        """
        lines = [
            f"CortexOps eval — {self.project}",
            f" Run ID : {self.run_id}",
            f" Cases : {self.total_cases} ({self.passed} passed, {self.failed} failed)",
            f" Task completion : {self.task_completion_rate:.1%}",
            f" Tool accuracy : {self.tool_accuracy:.1f}/100",
            f" Latency p50/p95 : {self.latency_p50_ms:.0f}ms / {self.latency_p95_ms:.0f}ms",
        ]
        if self.regressions:
            lines.append(f" Regressions : {self.regressions} (vs baseline {self.baseline_run_id})")
        failing = [r for r in self.case_results if not r.passed]
        if failing:
            lines.append(" Failed cases:")
            for r in failing:
                # Format the enum's value explicitly: f-string rendering of
                # a (str, Enum) member differs between Python versions
                # ("timeout" vs "FailureKind.TIMEOUT").
                kind_label = getattr(r.failure_kind, "value", r.failure_kind) or "unknown"
                lines.append(f" - {r.case_id}: {kind_label} (score {r.score:.0f})")
        return "\n".join(lines)
|