cortexops 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cortexops-0.2.0 → cortexops-0.4.0}/PKG-INFO +170 -33
- cortexops-0.4.0/README.md +243 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/__init__.py +2 -2
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/cli.py +82 -0
- cortexops-0.4.0/cortexops/dataset.py +242 -0
- cortexops-0.4.0/cortexops/judge.py +393 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/pyproject.toml +13 -9
- cortexops-0.4.0/cortexops/tracer.py +696 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/pyproject.toml +1 -1
- {cortexops-0.2.0 → cortexops-0.4.0}/tests/test_enhancements.py +175 -1
- cortexops-0.2.0/cortexops/README.md +0 -106
- cortexops-0.2.0/cortexops/judge.py +0 -154
- cortexops-0.2.0/cortexops/tracer.py +0 -278
- {cortexops-0.2.0 → cortexops-0.4.0}/.gitignore +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/LICENSE +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/LICENSE +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0/cortexops}/README.md +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/auth.py +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/client.py +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/eval.py +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/metrics.py +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/cortexops/models.py +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/tests/__init__.py +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/tests/conftest.py +0 -0
- {cortexops-0.2.0 → cortexops-0.4.0}/tests/test_cortexops.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cortexops
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Reliability infrastructure for AI agents — evaluation, observability, and regression testing
|
|
5
5
|
Project-URL: Homepage, https://getcortexops.com
|
|
6
6
|
Project-URL: Repository, https://github.com/ashishodu2023/cortexops
|
|
@@ -68,35 +68,63 @@ Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
|
|
|
68
68
|
[](https://pypi.org/project/cortexops/)
|
|
69
69
|
[](https://www.python.org/downloads/)
|
|
70
70
|
[](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
|
|
71
|
-
[](
|
|
71
|
+
[](LICENSE)
|
|
72
72
|
|
|
73
73
|
---
|
|
74
74
|
|
|
75
|
-
##
|
|
75
|
+
## What's New in v0.4.0
|
|
76
76
|
|
|
77
|
-
|
|
77
|
+
### LLM-as-judge evaluation
|
|
78
|
+
```python
|
|
79
|
+
from cortexops.judge import LLMJudge
|
|
80
|
+
|
|
81
|
+
judge = LLMJudge(api_key="sk-...")
|
|
82
|
+
result = judge.evaluate(
|
|
83
|
+
case_id="case-001",
|
|
84
|
+
input="Process refund for order #4821",
|
|
85
|
+
output="Refund of $49.99 approved and processed.",
|
|
86
|
+
rubric="task_completion",
|
|
87
|
+
)
|
|
88
|
+
print(result.score, result.passed, result.reasoning)
|
|
89
|
+
```
|
|
78
90
|
|
|
79
|
-
|
|
80
|
-
|
|
91
|
+
### Golden dataset API
|
|
92
|
+
```python
|
|
93
|
+
from cortexops.dataset import GoldenDataset
|
|
81
94
|
|
|
82
|
-
|
|
95
|
+
ds = GoldenDataset(name="refund-agent-v1")
|
|
96
|
+
ds.add(input="Refund order #4821", expected="refund_approved")
|
|
97
|
+
ds.add(input="Cancel subscription", expected="subscription_cancelled")
|
|
98
|
+
ds.save("datasets/refund_agent.yaml")
|
|
83
99
|
|
|
84
|
-
|
|
100
|
+
results = ds.run(agent=your_agent, fail_on="task_completion < 0.90")
|
|
101
|
+
```
|
|
85
102
|
|
|
103
|
+
### CI/CD eval gate
|
|
86
104
|
```bash
|
|
87
|
-
|
|
105
|
+
cortexops eval run \
|
|
106
|
+
--dataset datasets/refund_agent.yaml \
|
|
107
|
+
--judge \
|
|
108
|
+
--fail-on "task_completion < 0.90"
|
|
109
|
+
# Exit code 1 if regression detected — drop into GitHub Actions
|
|
110
|
+
```
|
|
88
111
|
|
|
89
|
-
# With HTTP client (for pushing traces to hosted API):
|
|
90
|
-
pip install cortexops[http]
|
|
91
112
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
113
|
+
## The problem
|
|
114
|
+
|
|
115
|
+
You deployed an agent. You have no idea if it regressed overnight.
|
|
116
|
+
|
|
117
|
+
No standard eval format. No failure traces. No CI gate before the next prompt change ships.
|
|
118
|
+
CortexOps fixes that.
|
|
95
119
|
|
|
96
120
|
---
|
|
97
121
|
|
|
98
122
|
## Quickstart
|
|
99
123
|
|
|
124
|
+
```bash
|
|
125
|
+
pip install cortexops # v0.4.0
|
|
126
|
+
```
|
|
127
|
+
|
|
100
128
|
```python
|
|
101
129
|
from cortexops import CortexTracer, EvalSuite
|
|
102
130
|
|
|
@@ -109,14 +137,25 @@ results = EvalSuite.run(
|
|
|
109
137
|
dataset="golden_v1.yaml",
|
|
110
138
|
agent=graph,
|
|
111
139
|
)
|
|
140
|
+
|
|
112
141
|
print(results.summary())
|
|
142
|
+
# CortexOps eval — payments-agent
|
|
143
|
+
# Cases : 9 (7 passed, 2 failed)
|
|
144
|
+
# Task completion : 91.4%
|
|
145
|
+
# Tool accuracy : 97.0/100
|
|
146
|
+
# Latency p50/p95 : 42ms / 187ms
|
|
147
|
+
# Failed cases:
|
|
148
|
+
# - escalation_router: tool_call_mismatch (score 41)
|
|
113
149
|
```
|
|
114
150
|
|
|
115
151
|
---
|
|
116
152
|
|
|
117
|
-
## Golden dataset
|
|
153
|
+
## Golden dataset format
|
|
154
|
+
|
|
155
|
+
Define test cases in YAML. Run them locally or in CI.
|
|
118
156
|
|
|
119
157
|
```yaml
|
|
158
|
+
# golden_v1.yaml
|
|
120
159
|
version: 1
|
|
121
160
|
project: payments-agent
|
|
122
161
|
|
|
@@ -127,25 +166,90 @@ cases:
|
|
|
127
166
|
expected_output_contains: ["approved", "REF-8821"]
|
|
128
167
|
max_latency_ms: 3000
|
|
129
168
|
|
|
130
|
-
- id:
|
|
131
|
-
input: "
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
169
|
+
- id: dispute_escalation_01
|
|
170
|
+
input: "I was charged twice — this is unauthorized"
|
|
171
|
+
expected_tool_calls: [classify_dispute, route_escalation]
|
|
172
|
+
expected_output_contains: ["escalated"]
|
|
173
|
+
max_latency_ms: 5000
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
## CI eval gate
|
|
179
|
+
|
|
180
|
+
Add to `.github/workflows/eval.yml`:
|
|
181
|
+
|
|
182
|
+
```yaml
|
|
183
|
+
- name: CortexOps eval gate
|
|
184
|
+
run: |
|
|
185
|
+
python examples/langgraph_payments/run_eval.py \
|
|
186
|
+
--dataset golden_v1.yaml \
|
|
187
|
+
--fail-on "task_completion < 0.90"
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
If the eval drops below threshold, the job exits non-zero and the PR is blocked.
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Repo structure
|
|
195
|
+
|
|
196
|
+
```
|
|
197
|
+
cortexops/
|
|
198
|
+
├── sdk/ # pip install cortexops # v0.4.0
|
|
199
|
+
│ ├── cortexops/
|
|
200
|
+
│ │ ├── tracer.py # CortexTracer — wraps LangGraph / CrewAI
|
|
201
|
+
│ │ ├── eval.py # EvalSuite — golden dataset runner
|
|
202
|
+
│ │ ├── metrics.py # task_completion, tool_accuracy, latency, hallucination
|
|
203
|
+
│ │ ├── models.py # Pydantic data models
|
|
204
|
+
│ │ └── client.py # HTTP client for hosted API
|
|
205
|
+
│ └── tests/
|
|
206
|
+
├── backend/ # FastAPI + Celery + SQLite/Postgres
|
|
207
|
+
│ ├── app/
|
|
208
|
+
│ │ ├── main.py
|
|
209
|
+
│ │ ├── routers/ # /v1/evals, /v1/traces
|
|
210
|
+
│ │ ├── models/ # DB records + API schemas
|
|
211
|
+
│ │ └── worker/ # Celery async eval tasks
|
|
212
|
+
│ └── Dockerfile
|
|
213
|
+
├── frontend/ # React + TypeScript dashboard
|
|
214
|
+
├── examples/
|
|
215
|
+
│ └── langgraph_payments/ # Full runnable demo
|
|
216
|
+
│ ├── agent.py
|
|
217
|
+
│ ├── golden_v1.yaml
|
|
218
|
+
│ └── run_eval.py
|
|
219
|
+
└── docker-compose.yml
|
|
136
220
|
```
|
|
137
221
|
|
|
138
222
|
---
|
|
139
223
|
|
|
140
|
-
##
|
|
224
|
+
## Run the full stack locally
|
|
141
225
|
|
|
142
226
|
```bash
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
227
|
+
git clone https://github.com/ashishodu2023/cortexops
|
|
228
|
+
cd cortexops
|
|
229
|
+
|
|
230
|
+
# Start API + worker + Redis
|
|
231
|
+
docker compose up --build
|
|
232
|
+
|
|
233
|
+
# In another terminal — run the demo eval
|
|
234
|
+
cd examples/langgraph_payments
|
|
235
|
+
pip install -e ../../sdk/
|
|
236
|
+
python run_eval.py
|
|
237
|
+
|
|
238
|
+
# API docs at http://localhost:8000/docs
|
|
239
|
+
# Dashboard at http://localhost:3000
|
|
146
240
|
```
|
|
147
241
|
|
|
148
|
-
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Supported frameworks
|
|
245
|
+
|
|
246
|
+
| Framework | Status |
|
|
247
|
+
|---|---|
|
|
248
|
+
| LangGraph | Stable |
|
|
249
|
+
| CrewAI | Stable |
|
|
250
|
+
| AutoGen | Beta |
|
|
251
|
+
| LlamaIndex agents | Coming soon |
|
|
252
|
+
| Custom callables | Supported via `CortexTracer.wrap()` |
|
|
149
253
|
|
|
150
254
|
---
|
|
151
255
|
|
|
@@ -153,16 +257,49 @@ Exits non-zero if the threshold is not met — blocks the PR.
|
|
|
153
257
|
|
|
154
258
|
| Metric | What it checks |
|
|
155
259
|
|---|---|
|
|
156
|
-
| `task_completion` |
|
|
260
|
+
| `task_completion` | Agent produced a valid, non-error output |
|
|
157
261
|
| `tool_accuracy` | Expected tool calls were actually made |
|
|
158
262
|
| `latency` | Response within `max_latency_ms` budget |
|
|
159
|
-
| `hallucination` |
|
|
160
|
-
|
|
263
|
+
| `hallucination` | Detects fabrication signals in output |
|
|
264
|
+
|
|
265
|
+
Add custom metrics by subclassing `cortexops.Metric`.
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## Contributing
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
git clone https://github.com/ashishodu2023/cortexops
|
|
273
|
+
cd cortexops/sdk
|
|
274
|
+
pip install -e ".[dev]"
|
|
275
|
+
pytest tests/ -v
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). Issues labeled `good first issue` are a great place to start.
|
|
161
279
|
|
|
162
280
|
---
|
|
163
281
|
|
|
164
|
-
##
|
|
282
|
+
## Citation
|
|
283
|
+
|
|
284
|
+
```bibtex
|
|
285
|
+
@software{cortexops2025,
|
|
286
|
+
author = {Ashish and others},
|
|
287
|
+
title = {CortexOps: Reliability Infrastructure for AI Agents},
|
|
288
|
+
year = {2025},
|
|
289
|
+
url = {https://github.com/ashishodu2023/cortexops},
|
|
290
|
+
}
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## License
|
|
296
|
+
|
|
297
|
+
MIT — see [LICENSE](LICENSE).
|
|
298
|
+
|
|
299
|
+
---
|
|
165
300
|
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
301
|
+
<p align="center">
|
|
302
|
+
<a href="https://cortexops.ai">cortexops.ai</a> ·
|
|
303
|
+
<a href="https://github.com/ashishodu2023/cortexops/issues">Issues</a> ·
|
|
304
|
+
<a href="https://github.com/ashishodu2023/cortexops/discussions">Discussions</a>
|
|
305
|
+
</p>
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
# CortexOps
|
|
2
|
+
|
|
3
|
+
**Reliability infrastructure for AI agents.**
|
|
4
|
+
Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
|
|
5
|
+
|
|
6
|
+
[](https://pypi.org/project/cortexops/)
|
|
7
|
+
[](https://www.python.org/downloads/)
|
|
8
|
+
[](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
|
|
9
|
+
[](LICENSE)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## What's New in v0.4.0
|
|
14
|
+
|
|
15
|
+
### LLM-as-judge evaluation
|
|
16
|
+
```python
|
|
17
|
+
from cortexops.judge import LLMJudge
|
|
18
|
+
|
|
19
|
+
judge = LLMJudge(api_key="sk-...")
|
|
20
|
+
result = judge.evaluate(
|
|
21
|
+
case_id="case-001",
|
|
22
|
+
input="Process refund for order #4821",
|
|
23
|
+
output="Refund of $49.99 approved and processed.",
|
|
24
|
+
rubric="task_completion",
|
|
25
|
+
)
|
|
26
|
+
print(result.score, result.passed, result.reasoning)
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### Golden dataset API
|
|
30
|
+
```python
|
|
31
|
+
from cortexops.dataset import GoldenDataset
|
|
32
|
+
|
|
33
|
+
ds = GoldenDataset(name="refund-agent-v1")
|
|
34
|
+
ds.add(input="Refund order #4821", expected="refund_approved")
|
|
35
|
+
ds.add(input="Cancel subscription", expected="subscription_cancelled")
|
|
36
|
+
ds.save("datasets/refund_agent.yaml")
|
|
37
|
+
|
|
38
|
+
results = ds.run(agent=your_agent, fail_on="task_completion < 0.90")
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### CI/CD eval gate
|
|
42
|
+
```bash
|
|
43
|
+
cortexops eval run \
|
|
44
|
+
--dataset datasets/refund_agent.yaml \
|
|
45
|
+
--judge \
|
|
46
|
+
--fail-on "task_completion < 0.90"
|
|
47
|
+
# Exit code 1 if regression detected — drop into GitHub Actions
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
## The problem
|
|
52
|
+
|
|
53
|
+
You deployed an agent. You have no idea if it regressed overnight.
|
|
54
|
+
|
|
55
|
+
No standard eval format. No failure traces. No CI gate before the next prompt change ships.
|
|
56
|
+
CortexOps fixes that.
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Quickstart
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install cortexops # v0.4.0
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from cortexops import CortexTracer, EvalSuite
|
|
68
|
+
|
|
69
|
+
# Wrap your LangGraph app — zero refactor required
|
|
70
|
+
tracer = CortexTracer(project="payments-agent")
|
|
71
|
+
graph = tracer.wrap(your_langgraph_app)
|
|
72
|
+
|
|
73
|
+
# Run evaluations against a golden dataset
|
|
74
|
+
results = EvalSuite.run(
|
|
75
|
+
dataset="golden_v1.yaml",
|
|
76
|
+
agent=graph,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
print(results.summary())
|
|
80
|
+
# CortexOps eval — payments-agent
|
|
81
|
+
# Cases : 9 (7 passed, 2 failed)
|
|
82
|
+
# Task completion : 91.4%
|
|
83
|
+
# Tool accuracy : 97.0/100
|
|
84
|
+
# Latency p50/p95 : 42ms / 187ms
|
|
85
|
+
# Failed cases:
|
|
86
|
+
# - escalation_router: tool_call_mismatch (score 41)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## Golden dataset format
|
|
92
|
+
|
|
93
|
+
Define test cases in YAML. Run them locally or in CI.
|
|
94
|
+
|
|
95
|
+
```yaml
|
|
96
|
+
# golden_v1.yaml
|
|
97
|
+
version: 1
|
|
98
|
+
project: payments-agent
|
|
99
|
+
|
|
100
|
+
cases:
|
|
101
|
+
- id: refund_lookup_01
|
|
102
|
+
input: "What is the status of refund REF-8821?"
|
|
103
|
+
expected_tool_calls: [lookup_refund]
|
|
104
|
+
expected_output_contains: ["approved", "REF-8821"]
|
|
105
|
+
max_latency_ms: 3000
|
|
106
|
+
|
|
107
|
+
- id: dispute_escalation_01
|
|
108
|
+
input: "I was charged twice — this is unauthorized"
|
|
109
|
+
expected_tool_calls: [classify_dispute, route_escalation]
|
|
110
|
+
expected_output_contains: ["escalated"]
|
|
111
|
+
max_latency_ms: 5000
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## CI eval gate
|
|
117
|
+
|
|
118
|
+
Add to `.github/workflows/eval.yml`:
|
|
119
|
+
|
|
120
|
+
```yaml
|
|
121
|
+
- name: CortexOps eval gate
|
|
122
|
+
run: |
|
|
123
|
+
python examples/langgraph_payments/run_eval.py \
|
|
124
|
+
--dataset golden_v1.yaml \
|
|
125
|
+
--fail-on "task_completion < 0.90"
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
If the eval drops below threshold, the job exits non-zero and the PR is blocked.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Repo structure
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
cortexops/
|
|
136
|
+
├── sdk/ # pip install cortexops # v0.4.0
|
|
137
|
+
│ ├── cortexops/
|
|
138
|
+
│ │ ├── tracer.py # CortexTracer — wraps LangGraph / CrewAI
|
|
139
|
+
│ │ ├── eval.py # EvalSuite — golden dataset runner
|
|
140
|
+
│ │ ├── metrics.py # task_completion, tool_accuracy, latency, hallucination
|
|
141
|
+
│ │ ├── models.py # Pydantic data models
|
|
142
|
+
│ │ └── client.py # HTTP client for hosted API
|
|
143
|
+
│ └── tests/
|
|
144
|
+
├── backend/ # FastAPI + Celery + SQLite/Postgres
|
|
145
|
+
│ ├── app/
|
|
146
|
+
│ │ ├── main.py
|
|
147
|
+
│ │ ├── routers/ # /v1/evals, /v1/traces
|
|
148
|
+
│ │ ├── models/ # DB records + API schemas
|
|
149
|
+
│ │ └── worker/ # Celery async eval tasks
|
|
150
|
+
│ └── Dockerfile
|
|
151
|
+
├── frontend/ # React + TypeScript dashboard
|
|
152
|
+
├── examples/
|
|
153
|
+
│ └── langgraph_payments/ # Full runnable demo
|
|
154
|
+
│ ├── agent.py
|
|
155
|
+
│ ├── golden_v1.yaml
|
|
156
|
+
│ └── run_eval.py
|
|
157
|
+
└── docker-compose.yml
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Run the full stack locally
|
|
163
|
+
|
|
164
|
+
```bash
|
|
165
|
+
git clone https://github.com/ashishodu2023/cortexops
|
|
166
|
+
cd cortexops
|
|
167
|
+
|
|
168
|
+
# Start API + worker + Redis
|
|
169
|
+
docker compose up --build
|
|
170
|
+
|
|
171
|
+
# In another terminal — run the demo eval
|
|
172
|
+
cd examples/langgraph_payments
|
|
173
|
+
pip install -e ../../sdk/
|
|
174
|
+
python run_eval.py
|
|
175
|
+
|
|
176
|
+
# API docs at http://localhost:8000/docs
|
|
177
|
+
# Dashboard at http://localhost:3000
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## Supported frameworks
|
|
183
|
+
|
|
184
|
+
| Framework | Status |
|
|
185
|
+
|---|---|
|
|
186
|
+
| LangGraph | Stable |
|
|
187
|
+
| CrewAI | Stable |
|
|
188
|
+
| AutoGen | Beta |
|
|
189
|
+
| LlamaIndex agents | Coming soon |
|
|
190
|
+
| Custom callables | Supported via `CortexTracer.wrap()` |
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Built-in metrics
|
|
195
|
+
|
|
196
|
+
| Metric | What it checks |
|
|
197
|
+
|---|---|
|
|
198
|
+
| `task_completion` | Agent produced a valid, non-error output |
|
|
199
|
+
| `tool_accuracy` | Expected tool calls were actually made |
|
|
200
|
+
| `latency` | Response within `max_latency_ms` budget |
|
|
201
|
+
| `hallucination` | Detects fabrication signals in output |
|
|
202
|
+
|
|
203
|
+
Add custom metrics by subclassing `cortexops.Metric`.
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Contributing
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
git clone https://github.com/ashishodu2023/cortexops
|
|
211
|
+
cd cortexops/sdk
|
|
212
|
+
pip install -e ".[dev]"
|
|
213
|
+
pytest tests/ -v
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). Issues labeled `good first issue` are a great place to start.
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## Citation
|
|
221
|
+
|
|
222
|
+
```bibtex
|
|
223
|
+
@software{cortexops2025,
|
|
224
|
+
author = {Ashish and others},
|
|
225
|
+
title = {CortexOps: Reliability Infrastructure for AI Agents},
|
|
226
|
+
year = {2025},
|
|
227
|
+
url = {https://github.com/ashishodu2023/cortexops},
|
|
228
|
+
}
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## License
|
|
234
|
+
|
|
235
|
+
MIT — see [LICENSE](LICENSE).
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
<p align="center">
|
|
240
|
+
<a href="https://cortexops.ai">cortexops.ai</a> ·
|
|
241
|
+
<a href="https://github.com/ashishodu2023/cortexops/issues">Issues</a> ·
|
|
242
|
+
<a href="https://github.com/ashishodu2023/cortexops/discussions">Discussions</a>
|
|
243
|
+
</p>
|
|
@@ -10,6 +10,7 @@ Quickstart:
|
|
|
10
10
|
print(results.summary())
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
|
+
from .auth import cmd_login, cmd_logout, cmd_whoami, load_credentials, save_credentials
|
|
13
14
|
from .client import CortexClient
|
|
14
15
|
from .eval import EvalSuite, EvalThresholdError
|
|
15
16
|
from .judge import LLMJudgeMetric
|
|
@@ -31,10 +32,9 @@ from .models import (
|
|
|
31
32
|
Trace,
|
|
32
33
|
TraceNode,
|
|
33
34
|
)
|
|
34
|
-
from .auth import cmd_login, cmd_logout, cmd_whoami, save_credentials, load_credentials
|
|
35
35
|
from .tracer import CortexTracer
|
|
36
36
|
|
|
37
|
-
__version__ = "0.2.0"
|
|
37
|
+
__version__ = "0.4.0"
|
|
38
38
|
|
|
39
39
|
__all__ = [
|
|
40
40
|
"CortexTracer",
|
|
@@ -127,6 +127,88 @@ def cmd_version(_: argparse.Namespace) -> int:
|
|
|
127
127
|
return 0
|
|
128
128
|
|
|
129
129
|
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def cmd_dataset_create(args: argparse.Namespace) -> int:
|
|
133
|
+
"""cortexops dataset create --name my-dataset --output dataset.yaml"""
|
|
134
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
|
135
|
+
from cortexops.dataset import GoldenDataset
|
|
136
|
+
|
|
137
|
+
ds = GoldenDataset(name=args.name, description=args.description or "")
|
|
138
|
+
ds.save(args.output)
|
|
139
|
+
print(f"Created dataset: {args.output}")
|
|
140
|
+
print(" Add cases by editing the YAML file or using ds.add() in Python.")
|
|
141
|
+
return 0
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def cmd_eval_judge(args: argparse.Namespace) -> int:
|
|
145
|
+
"""cortexops eval judge --input <str> --output <str> --rubric task_completion"""
|
|
146
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
|
147
|
+
from cortexops.judge import RUBRICS, LLMJudge
|
|
148
|
+
|
|
149
|
+
api_key = args.api_key or os.getenv("OPENAI_API_KEY", "")
|
|
150
|
+
if not api_key:
|
|
151
|
+
print("Error: OPENAI_API_KEY not set. Pass --api-key or set the env var.", file=sys.stderr)
|
|
152
|
+
return 1
|
|
153
|
+
|
|
154
|
+
judge = LLMJudge(api_key=api_key, model=args.model or "gpt-4o-mini")
|
|
155
|
+
result = judge.evaluate(
|
|
156
|
+
case_id="cli-eval",
|
|
157
|
+
input=args.input,
|
|
158
|
+
output=args.output,
|
|
159
|
+
rubric=args.rubric or "task_completion",
|
|
160
|
+
expected=args.expected,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
icon = "✓ PASS" if result.passed else "✗ FAIL"
|
|
164
|
+
print(f"\nLLM Judge Result: {icon}")
|
|
165
|
+
rubric_obj = RUBRICS.get(args.rubric or 'task_completion')
|
|
166
|
+
threshold = rubric_obj.pass_threshold if rubric_obj else 0.70
|
|
167
|
+
print(f" Score: {result.score:.3f} (threshold: {threshold:.2f})")
|
|
168
|
+
print(f" Model: {result.model} ({result.latency_ms}ms)")
|
|
169
|
+
print(f" Reasoning: {result.reasoning}")
|
|
170
|
+
if args.verbose:
|
|
171
|
+
print(f" Criteria: {result.criteria_scores}")
|
|
172
|
+
|
|
173
|
+
return 0 if result.passed else 1
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def cmd_eval_run_with_judge(args: argparse.Namespace) -> int:
|
|
177
|
+
"""cortexops eval run --dataset d.yaml --judge --fail-on task_completion<0.90"""
|
|
178
|
+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
|
179
|
+
from cortexops.dataset import GoldenDataset
|
|
180
|
+
|
|
181
|
+
ds = GoldenDataset.load(args.dataset)
|
|
182
|
+
print("CortexOps eval gate")
|
|
183
|
+
print(f" dataset : {args.dataset} ({len(ds)} cases)")
|
|
184
|
+
print(f" project : {args.project or ds.name}")
|
|
185
|
+
if args.fail_on:
|
|
186
|
+
print(f" fail-on : {args.fail_on}")
|
|
187
|
+
if args.judge:
|
|
188
|
+
print(f" judge : LLM-as-judge ({args.model or 'gpt-4o-mini'})")
|
|
189
|
+
print()
|
|
190
|
+
|
|
191
|
+
def passthrough_agent(inp):
|
|
192
|
+
return {"output": f"[no agent] input: {inp}"}
|
|
193
|
+
|
|
194
|
+
try:
|
|
195
|
+
agent = _load_agent(args.agent) if getattr(args, "agent", None) else passthrough_agent
|
|
196
|
+
result = ds.run(
|
|
197
|
+
agent=agent,
|
|
198
|
+
fail_on=args.fail_on,
|
|
199
|
+
verbose=True,
|
|
200
|
+
use_judge=getattr(args, "judge", False),
|
|
201
|
+
judge_rubric=getattr(args, "rubric", "task_completion"),
|
|
202
|
+
judge_api_key=os.getenv("OPENAI_API_KEY"),
|
|
203
|
+
)
|
|
204
|
+
result.print_report()
|
|
205
|
+
return 0 if result.passed() else 1
|
|
206
|
+
except Exception as e:
|
|
207
|
+
print(f"\nEval failed: {e}", file=sys.stderr)
|
|
208
|
+
return 1
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
|
|
130
212
|
def _load_agent(agent_path: str):
|
|
131
213
|
"""Load an agent from a dotted path like 'mymodule:my_agent'."""
|
|
132
214
|
if ":" not in agent_path:
|