cortexops 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cortexops
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Reliability infrastructure for AI agents — evaluation, observability, and regression testing
5
5
  Project-URL: Homepage, https://getcortexops.com
6
6
  Project-URL: Repository, https://github.com/ashishodu2023/cortexops
@@ -68,35 +68,63 @@ Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
68
68
  [![PyPI version](https://img.shields.io/pypi/v/cortexops.svg)](https://pypi.org/project/cortexops/)
69
69
  [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
70
70
  [![CI](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml/badge.svg)](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
71
- [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://github.com/ashishodu2023/cortexops/blob/main/LICENSE)
71
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
72
72
 
73
73
  ---
74
74
 
75
- ## The problem
75
+ ## What's New in v0.4.0
76
76
 
77
- You deployed an agent. You have no idea if it regressed overnight.
77
+ ### LLM-as-judge evaluation
78
+ ```python
79
+ from cortexops.judge import LLMJudge
80
+
81
+ judge = LLMJudge(api_key="sk-...")
82
+ result = judge.evaluate(
83
+ case_id="case-001",
84
+ input="Process refund for order #4821",
85
+ output="Refund of $49.99 approved and processed.",
86
+ rubric="task_completion",
87
+ )
88
+ print(result.score, result.passed, result.reasoning)
89
+ ```
78
90
 
79
- No standard eval format. No failure traces. No CI gate before the next prompt change ships.
80
- CortexOps fixes that.
91
+ ### Golden dataset API
92
+ ```python
93
+ from cortexops.dataset import GoldenDataset
81
94
 
82
- ---
95
+ ds = GoldenDataset(name="refund-agent-v1")
96
+ ds.add(input="Refund order #4821", expected="refund_approved")
97
+ ds.add(input="Cancel subscription", expected="subscription_cancelled")
98
+ ds.save("datasets/refund_agent.yaml")
83
99
 
84
- ## Install
100
+ results = ds.run(agent=your_agent, fail_on="task_completion < 0.90")
101
+ ```
85
102
 
103
+ ### CI/CD eval gate
86
104
  ```bash
87
- pip install cortexops
105
+ cortexops eval run \
106
+ --dataset datasets/refund_agent.yaml \
107
+ --judge \
108
+ --fail-on "task_completion < 0.90"
109
+ # Exit code 1 if regression detected — drop into GitHub Actions
110
+ ```
88
111
 
89
- # With HTTP client (for pushing traces to hosted API):
90
- pip install cortexops[http]
91
112
 
92
- # With LLM judge support:
93
- pip install cortexops[llm]
94
- ```
113
+ ## The problem
114
+
115
+ You deployed an agent. You have no idea if it regressed overnight.
116
+
117
+ No standard eval format. No failure traces. No CI gate before the next prompt change ships.
118
+ CortexOps fixes that.
95
119
 
96
120
  ---
97
121
 
98
122
  ## Quickstart
99
123
 
124
+ ```bash
125
+ pip install cortexops # v0.4.0
126
+ ```
127
+
100
128
  ```python
101
129
  from cortexops import CortexTracer, EvalSuite
102
130
 
@@ -109,14 +137,25 @@ results = EvalSuite.run(
109
137
  dataset="golden_v1.yaml",
110
138
  agent=graph,
111
139
  )
140
+
112
141
  print(results.summary())
142
+ # CortexOps eval — payments-agent
143
+ # Cases : 9 (7 passed, 2 failed)
144
+ # Task completion : 91.4%
145
+ # Tool accuracy : 97.0/100
146
+ # Latency p50/p95 : 42ms / 187ms
147
+ # Failed cases:
148
+ # - escalation_router: tool_call_mismatch (score 41)
113
149
  ```
114
150
 
115
151
  ---
116
152
 
117
- ## Golden dataset (YAML)
153
+ ## Golden dataset format
154
+
155
+ Define test cases in YAML. Run them locally or in CI.
118
156
 
119
157
  ```yaml
158
+ # golden_v1.yaml
120
159
  version: 1
121
160
  project: payments-agent
122
161
 
@@ -127,25 +166,90 @@ cases:
127
166
  expected_output_contains: ["approved", "REF-8821"]
128
167
  max_latency_ms: 3000
129
168
 
130
- - id: open_ended_explanation_01
131
- input: "Why was my refund rejected?"
132
- judge: llm
133
- judge_criteria: >
134
- The response must explain the rejection reason clearly,
135
- be empathetic, and offer a concrete next step. No jargon.
169
+ - id: dispute_escalation_01
170
+ input: "I was charged twice — this is unauthorized"
171
+ expected_tool_calls: [classify_dispute, route_escalation]
172
+ expected_output_contains: ["escalated"]
173
+ max_latency_ms: 5000
174
+ ```
175
+
176
+ ---
177
+
178
+ ## CI eval gate
179
+
180
+ Add to `.github/workflows/eval.yml`:
181
+
182
+ ```yaml
183
+ - name: CortexOps eval gate
184
+ run: |
185
+ python examples/langgraph_payments/run_eval.py \
186
+ --dataset golden_v1.yaml \
187
+ --fail-on "task_completion < 0.90"
188
+ ```
189
+
190
+ If the eval score drops below the threshold, the job exits non-zero and the PR is blocked.
191
+
192
+ ---
193
+
194
+ ## Repo structure
195
+
196
+ ```
197
+ cortexops/
198
+ ├── sdk/ # pip install cortexops # v0.4.0
199
+ │ ├── cortexops/
200
+ │ │ ├── tracer.py # CortexTracer — wraps LangGraph / CrewAI
201
+ │ │ ├── eval.py # EvalSuite — golden dataset runner
202
+ │ │ ├── metrics.py # task_completion, tool_accuracy, latency, hallucination
203
+ │ │ ├── models.py # Pydantic data models
204
+ │ │ └── client.py # HTTP client for hosted API
205
+ │ └── tests/
206
+ ├── backend/ # FastAPI + Celery + SQLite/Postgres
207
+ │ ├── app/
208
+ │ │ ├── main.py
209
+ │ │ ├── routers/ # /v1/evals, /v1/traces
210
+ │ │ ├── models/ # DB records + API schemas
211
+ │ │ └── worker/ # Celery async eval tasks
212
+ │ └── Dockerfile
213
+ ├── frontend/ # React + TypeScript dashboard
214
+ ├── examples/
215
+ │ └── langgraph_payments/ # Full runnable demo
216
+ │ ├── agent.py
217
+ │ ├── golden_v1.yaml
218
+ │ └── run_eval.py
219
+ └── docker-compose.yml
136
220
  ```
137
221
 
138
222
  ---
139
223
 
140
- ## CI gate
224
+ ## Run the full stack locally
141
225
 
142
226
  ```bash
143
- cortexops eval run \
144
- --dataset golden_v1.yaml \
145
- --fail-on "task_completion < 0.90"
227
+ git clone https://github.com/ashishodu2023/cortexops
228
+ cd cortexops
229
+
230
+ # Start API + worker + Redis
231
+ docker compose up --build
232
+
233
+ # In another terminal — run the demo eval
234
+ cd examples/langgraph_payments
235
+ pip install -e ../../sdk/
236
+ python run_eval.py
237
+
238
+ # API docs at http://localhost:8000/docs
239
+ # Dashboard at http://localhost:3000
146
240
  ```
147
241
 
148
- Exits non-zero if the threshold is not met — blocks the PR.
242
+ ---
243
+
244
+ ## Supported frameworks
245
+
246
+ | Framework | Status |
247
+ |---|---|
248
+ | LangGraph | Stable |
249
+ | CrewAI | Stable |
250
+ | AutoGen | Beta |
251
+ | LlamaIndex agents | Coming soon |
252
+ | Custom callables | Supported via `CortexTracer.wrap()` |
149
253
 
150
254
  ---
151
255
 
@@ -153,16 +257,49 @@ Exits non-zero if the threshold is not met — blocks the PR.
153
257
 
154
258
  | Metric | What it checks |
155
259
  |---|---|
156
- | `task_completion` | Non-empty, non-error output with expected content |
260
+ | `task_completion` | Agent produced a valid, non-error output |
157
261
  | `tool_accuracy` | Expected tool calls were actually made |
158
262
  | `latency` | Response within `max_latency_ms` budget |
159
- | `hallucination` | Fabrication signals in output |
160
- | `llm_judge` | GPT-4o scores against natural-language criteria |
263
+ | `hallucination` | Detects fabrication signals in output |
264
+
265
+ Add custom metrics by subclassing `cortexops.Metric`.
266
+
267
+ ---
268
+
269
+ ## Contributing
270
+
271
+ ```bash
272
+ git clone https://github.com/ashishodu2023/cortexops
273
+ cd cortexops/sdk
274
+ pip install -e ".[dev]"
275
+ pytest tests/ -v
276
+ ```
277
+
278
+ See [CONTRIBUTING.md](CONTRIBUTING.md). Issues labeled `good first issue` are a great place to start.
161
279
 
162
280
  ---
163
281
 
164
- ## Links
282
+ ## Citation
283
+
284
+ ```bibtex
285
+ @software{cortexops2025,
286
+ author = {Ashish and others},
287
+ title = {CortexOps: Reliability Infrastructure for AI Agents},
288
+ year = {2025},
289
+ url = {https://github.com/ashishodu2023/cortexops},
290
+ }
291
+ ```
292
+
293
+ ---
294
+
295
+ ## License
296
+
297
+ MIT — see [LICENSE](LICENSE).
298
+
299
+ ---
165
300
 
166
- - **Docs**: [docs.cortexops.ai](https://docs.cortexops.ai)
167
- - **Repo**: [github.com/ashishodu2023/cortexops](https://github.com/ashishodu2023/cortexops)
168
- - **Issues**: [GitHub Issues](https://github.com/ashishodu2023/cortexops/issues)
301
+ <p align="center">
302
+ <a href="https://cortexops.ai">cortexops.ai</a> ·
303
+ <a href="https://github.com/ashishodu2023/cortexops/issues">Issues</a> ·
304
+ <a href="https://github.com/ashishodu2023/cortexops/discussions">Discussions</a>
305
+ </p>
@@ -0,0 +1,243 @@
1
+ # CortexOps
2
+
3
+ **Reliability infrastructure for AI agents.**
4
+ Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
5
+
6
+ [![PyPI version](https://img.shields.io/pypi/v/cortexops.svg)](https://pypi.org/project/cortexops/)
7
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
8
+ [![CI](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml/badge.svg)](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
9
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
10
+
11
+ ---
12
+
13
+ ## What's New in v0.4.0
14
+
15
+ ### LLM-as-judge evaluation
16
+ ```python
17
+ from cortexops.judge import LLMJudge
18
+
19
+ judge = LLMJudge(api_key="sk-...")
20
+ result = judge.evaluate(
21
+ case_id="case-001",
22
+ input="Process refund for order #4821",
23
+ output="Refund of $49.99 approved and processed.",
24
+ rubric="task_completion",
25
+ )
26
+ print(result.score, result.passed, result.reasoning)
27
+ ```
28
+
29
+ ### Golden dataset API
30
+ ```python
31
+ from cortexops.dataset import GoldenDataset
32
+
33
+ ds = GoldenDataset(name="refund-agent-v1")
34
+ ds.add(input="Refund order #4821", expected="refund_approved")
35
+ ds.add(input="Cancel subscription", expected="subscription_cancelled")
36
+ ds.save("datasets/refund_agent.yaml")
37
+
38
+ results = ds.run(agent=your_agent, fail_on="task_completion < 0.90")
39
+ ```
40
+
41
+ ### CI/CD eval gate
42
+ ```bash
43
+ cortexops eval run \
44
+ --dataset datasets/refund_agent.yaml \
45
+ --judge \
46
+ --fail-on "task_completion < 0.90"
47
+ # Exit code 1 if regression detected — drop into GitHub Actions
48
+ ```
49
+
50
+
51
+ ## The problem
52
+
53
+ You deployed an agent. You have no idea if it regressed overnight.
54
+
55
+ No standard eval format. No failure traces. No CI gate before the next prompt change ships.
56
+ CortexOps fixes that.
57
+
58
+ ---
59
+
60
+ ## Quickstart
61
+
62
+ ```bash
63
+ pip install cortexops # v0.4.0
64
+ ```
65
+
66
+ ```python
67
+ from cortexops import CortexTracer, EvalSuite
68
+
69
+ # Wrap your LangGraph app — zero refactor required
70
+ tracer = CortexTracer(project="payments-agent")
71
+ graph = tracer.wrap(your_langgraph_app)
72
+
73
+ # Run evaluations against a golden dataset
74
+ results = EvalSuite.run(
75
+ dataset="golden_v1.yaml",
76
+ agent=graph,
77
+ )
78
+
79
+ print(results.summary())
80
+ # CortexOps eval — payments-agent
81
+ # Cases : 9 (7 passed, 2 failed)
82
+ # Task completion : 91.4%
83
+ # Tool accuracy : 97.0/100
84
+ # Latency p50/p95 : 42ms / 187ms
85
+ # Failed cases:
86
+ # - escalation_router: tool_call_mismatch (score 41)
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Golden dataset format
92
+
93
+ Define test cases in YAML. Run them locally or in CI.
94
+
95
+ ```yaml
96
+ # golden_v1.yaml
97
+ version: 1
98
+ project: payments-agent
99
+
100
+ cases:
101
+ - id: refund_lookup_01
102
+ input: "What is the status of refund REF-8821?"
103
+ expected_tool_calls: [lookup_refund]
104
+ expected_output_contains: ["approved", "REF-8821"]
105
+ max_latency_ms: 3000
106
+
107
+ - id: dispute_escalation_01
108
+ input: "I was charged twice — this is unauthorized"
109
+ expected_tool_calls: [classify_dispute, route_escalation]
110
+ expected_output_contains: ["escalated"]
111
+ max_latency_ms: 5000
112
+ ```
113
+
114
+ ---
115
+
116
+ ## CI eval gate
117
+
118
+ Add to `.github/workflows/eval.yml`:
119
+
120
+ ```yaml
121
+ - name: CortexOps eval gate
122
+ run: |
123
+ python examples/langgraph_payments/run_eval.py \
124
+ --dataset golden_v1.yaml \
125
+ --fail-on "task_completion < 0.90"
126
+ ```
127
+
128
+ If the eval score drops below the threshold, the job exits non-zero and the PR is blocked.
129
+
130
+ ---
131
+
132
+ ## Repo structure
133
+
134
+ ```
135
+ cortexops/
136
+ ├── sdk/ # pip install cortexops # v0.4.0
137
+ │ ├── cortexops/
138
+ │ │ ├── tracer.py # CortexTracer — wraps LangGraph / CrewAI
139
+ │ │ ├── eval.py # EvalSuite — golden dataset runner
140
+ │ │ ├── metrics.py # task_completion, tool_accuracy, latency, hallucination
141
+ │ │ ├── models.py # Pydantic data models
142
+ │ │ └── client.py # HTTP client for hosted API
143
+ │ └── tests/
144
+ ├── backend/ # FastAPI + Celery + SQLite/Postgres
145
+ │ ├── app/
146
+ │ │ ├── main.py
147
+ │ │ ├── routers/ # /v1/evals, /v1/traces
148
+ │ │ ├── models/ # DB records + API schemas
149
+ │ │ └── worker/ # Celery async eval tasks
150
+ │ └── Dockerfile
151
+ ├── frontend/ # React + TypeScript dashboard
152
+ ├── examples/
153
+ │ └── langgraph_payments/ # Full runnable demo
154
+ │ ├── agent.py
155
+ │ ├── golden_v1.yaml
156
+ │ └── run_eval.py
157
+ └── docker-compose.yml
158
+ ```
159
+
160
+ ---
161
+
162
+ ## Run the full stack locally
163
+
164
+ ```bash
165
+ git clone https://github.com/ashishodu2023/cortexops
166
+ cd cortexops
167
+
168
+ # Start API + worker + Redis
169
+ docker compose up --build
170
+
171
+ # In another terminal — run the demo eval
172
+ cd examples/langgraph_payments
173
+ pip install -e ../../sdk/
174
+ python run_eval.py
175
+
176
+ # API docs at http://localhost:8000/docs
177
+ # Dashboard at http://localhost:3000
178
+ ```
179
+
180
+ ---
181
+
182
+ ## Supported frameworks
183
+
184
+ | Framework | Status |
185
+ |---|---|
186
+ | LangGraph | Stable |
187
+ | CrewAI | Stable |
188
+ | AutoGen | Beta |
189
+ | LlamaIndex agents | Coming soon |
190
+ | Custom callables | Supported via `CortexTracer.wrap()` |
191
+
192
+ ---
193
+
194
+ ## Built-in metrics
195
+
196
+ | Metric | What it checks |
197
+ |---|---|
198
+ | `task_completion` | Agent produced a valid, non-error output |
199
+ | `tool_accuracy` | Expected tool calls were actually made |
200
+ | `latency` | Response within `max_latency_ms` budget |
201
+ | `hallucination` | Detects fabrication signals in output |
202
+
203
+ Add custom metrics by subclassing `cortexops.Metric`.
204
+
205
+ ---
206
+
207
+ ## Contributing
208
+
209
+ ```bash
210
+ git clone https://github.com/ashishodu2023/cortexops
211
+ cd cortexops/sdk
212
+ pip install -e ".[dev]"
213
+ pytest tests/ -v
214
+ ```
215
+
216
+ See [CONTRIBUTING.md](CONTRIBUTING.md). Issues labeled `good first issue` are a great place to start.
217
+
218
+ ---
219
+
220
+ ## Citation
221
+
222
+ ```bibtex
223
+ @software{cortexops2025,
224
+ author = {Ashish and others},
225
+ title = {CortexOps: Reliability Infrastructure for AI Agents},
226
+ year = {2025},
227
+ url = {https://github.com/ashishodu2023/cortexops},
228
+ }
229
+ ```
230
+
231
+ ---
232
+
233
+ ## License
234
+
235
+ MIT — see [LICENSE](LICENSE).
236
+
237
+ ---
238
+
239
+ <p align="center">
240
+ <a href="https://cortexops.ai">cortexops.ai</a> ·
241
+ <a href="https://github.com/ashishodu2023/cortexops/issues">Issues</a> ·
242
+ <a href="https://github.com/ashishodu2023/cortexops/discussions">Discussions</a>
243
+ </p>
@@ -34,7 +34,7 @@ from .models import (
34
34
  )
35
35
  from .tracer import CortexTracer
36
36
 
37
- __version__ = "0.3.0"
37
+ __version__ = "0.4.0"
38
38
 
39
39
  __all__ = [
40
40
  "CortexTracer",
@@ -127,6 +127,88 @@ def cmd_version(_: argparse.Namespace) -> int:
127
127
  return 0
128
128
 
129
129
 
130
+
131
+
132
+ def cmd_dataset_create(args: argparse.Namespace) -> int:
133
+ """cortexops dataset create --name my-dataset --output dataset.yaml"""
134
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
135
+ from cortexops.dataset import GoldenDataset
136
+
137
+ ds = GoldenDataset(name=args.name, description=args.description or "")
138
+ ds.save(args.output)
139
+ print(f"Created dataset: {args.output}")
140
+ print(" Add cases by editing the YAML file or using ds.add() in Python.")
141
+ return 0
142
+
143
+
144
+ def cmd_eval_judge(args: argparse.Namespace) -> int:
145
+ """cortexops eval judge --input <str> --output <str> --rubric task_completion"""
146
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
147
+ from cortexops.judge import RUBRICS, LLMJudge
148
+
149
+ api_key = args.api_key or os.getenv("OPENAI_API_KEY", "")
150
+ if not api_key:
151
+ print("Error: OPENAI_API_KEY not set. Pass --api-key or set the env var.", file=sys.stderr)
152
+ return 1
153
+
154
+ judge = LLMJudge(api_key=api_key, model=args.model or "gpt-4o-mini")
155
+ result = judge.evaluate(
156
+ case_id="cli-eval",
157
+ input=args.input,
158
+ output=args.output,
159
+ rubric=args.rubric or "task_completion",
160
+ expected=args.expected,
161
+ )
162
+
163
+ icon = "✓ PASS" if result.passed else "✗ FAIL"
164
+ print(f"\nLLM Judge Result: {icon}")
165
+ rubric_obj = RUBRICS.get(args.rubric or 'task_completion')
166
+ threshold = rubric_obj.pass_threshold if rubric_obj else 0.70
167
+ print(f" Score: {result.score:.3f} (threshold: {threshold:.2f})")
168
+ print(f" Model: {result.model} ({result.latency_ms}ms)")
169
+ print(f" Reasoning: {result.reasoning}")
170
+ if args.verbose:
171
+ print(f" Criteria: {result.criteria_scores}")
172
+
173
+ return 0 if result.passed else 1
174
+
175
+
176
+ def cmd_eval_run_with_judge(args: argparse.Namespace) -> int:
177
+ """cortexops eval run --dataset d.yaml --judge --fail-on 'task_completion < 0.90'"""
178
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
179
+ from cortexops.dataset import GoldenDataset
180
+
181
+ ds = GoldenDataset.load(args.dataset)
182
+ print("CortexOps eval gate")
183
+ print(f" dataset : {args.dataset} ({len(ds)} cases)")
184
+ print(f" project : {args.project or ds.name}")
185
+ if args.fail_on:
186
+ print(f" fail-on : {args.fail_on}")
187
+ if args.judge:
188
+ print(f" judge : LLM-as-judge ({args.model or 'gpt-4o-mini'})")
189
+ print()
190
+
191
+ def passthrough_agent(inp):
192
+ return {"output": f"[no agent] input: {inp}"}
193
+
194
+ try:
195
+ agent = _load_agent(args.agent) if getattr(args, "agent", None) else passthrough_agent
196
+ result = ds.run(
197
+ agent=agent,
198
+ fail_on=args.fail_on,
199
+ verbose=True,
200
+ use_judge=getattr(args, "judge", False),
201
+ judge_rubric=getattr(args, "rubric", "task_completion"),
202
+ judge_api_key=os.getenv("OPENAI_API_KEY"),
203
+ )
204
+ result.print_report()
205
+ return 0 if result.passed() else 1
206
+ except Exception as e:
207
+ print(f"\nEval failed: {e}", file=sys.stderr)
208
+ return 1
209
+
210
+
211
+
130
212
  def _load_agent(agent_path: str):
131
213
  """Load an agent from a dotted path like 'mymodule:my_agent'."""
132
214
  if ":" not in agent_path: