cortexops 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortexops-0.1.0/.gitignore +34 -0
- cortexops-0.1.0/LICENSE +21 -0
- cortexops-0.1.0/PKG-INFO +169 -0
- cortexops-0.1.0/README.md +106 -0
- cortexops-0.1.0/cortexops/LICENSE +21 -0
- cortexops-0.1.0/cortexops/README.md +106 -0
- cortexops-0.1.0/cortexops/__init__.py +58 -0
- cortexops-0.1.0/cortexops/cli.py +195 -0
- cortexops-0.1.0/cortexops/client.py +84 -0
- cortexops-0.1.0/cortexops/cortexops/__init__.py +58 -0
- cortexops-0.1.0/cortexops/cortexops/cli.py +195 -0
- cortexops-0.1.0/cortexops/cortexops/client.py +84 -0
- cortexops-0.1.0/cortexops/cortexops/eval.py +216 -0
- cortexops-0.1.0/cortexops/cortexops/judge.py +155 -0
- cortexops-0.1.0/cortexops/cortexops/metrics.py +184 -0
- cortexops-0.1.0/cortexops/cortexops/models.py +141 -0
- cortexops-0.1.0/cortexops/cortexops/tracer.py +210 -0
- cortexops-0.1.0/cortexops/eval.py +216 -0
- cortexops-0.1.0/cortexops/judge.py +155 -0
- cortexops-0.1.0/cortexops/metrics.py +184 -0
- cortexops-0.1.0/cortexops/models.py +141 -0
- cortexops-0.1.0/cortexops/pyproject.toml +87 -0
- cortexops-0.1.0/cortexops/tests/__init__.py +0 -0
- cortexops-0.1.0/cortexops/tests/test_cortexops.py +211 -0
- cortexops-0.1.0/cortexops/tests/test_enhancements.py +222 -0
- cortexops-0.1.0/cortexops/tracer.py +210 -0
- cortexops-0.1.0/pyproject.toml +88 -0
- cortexops-0.1.0/tests/__init__.py +0 -0
- cortexops-0.1.0/tests/test_cortexops.py +211 -0
- cortexops-0.1.0/tests/test_enhancements.py +222 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
.venv/
|
|
6
|
+
venv/
|
|
7
|
+
.env
|
|
8
|
+
*.egg-info/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
PKG-INFO
|
|
12
|
+
*.whl
|
|
13
|
+
*.tar.gz
|
|
14
|
+
|
|
15
|
+
# Test / lint caches
|
|
16
|
+
.pytest_cache/
|
|
17
|
+
.ruff_cache/
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
|
|
20
|
+
# Package managers
|
|
21
|
+
uv.lock
|
|
22
|
+
.python-version
|
|
23
|
+
|
|
24
|
+
# Database
|
|
25
|
+
*.db
|
|
26
|
+
*.sqlite
|
|
27
|
+
|
|
28
|
+
# IDE
|
|
29
|
+
.vscode/
|
|
30
|
+
.idea/
|
|
31
|
+
|
|
32
|
+
# OS
|
|
33
|
+
.DS_Store
|
|
34
|
+
Thumbs.db
|
cortexops-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 CortexOps Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
cortexops-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cortexops
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reliability infrastructure for AI agents — evaluation, observability, and regression testing
|
|
5
|
+
Project-URL: Homepage, https://cortexops.ai
|
|
6
|
+
Project-URL: Repository, https://github.com/ashishodu2023/cortexops
|
|
7
|
+
Project-URL: Documentation, https://docs.cortexops.ai
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/ashishodu2023/cortexops/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/ashishodu2023/cortexops/releases
|
|
10
|
+
Author-email: Ashish <ashishodu2023@gmail.com>
|
|
11
|
+
License: MIT License
|
|
12
|
+
|
|
13
|
+
Copyright (c) 2025 CortexOps Contributors
|
|
14
|
+
|
|
15
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
16
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
17
|
+
in the Software without restriction, including without limitation the rights
|
|
18
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
19
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
20
|
+
furnished to do so, subject to the following conditions:
|
|
21
|
+
|
|
22
|
+
The above copyright notice and this permission notice shall be included in all
|
|
23
|
+
copies or substantial portions of the Software.
|
|
24
|
+
|
|
25
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
26
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
27
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
28
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
29
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
30
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
31
|
+
SOFTWARE.
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Keywords: agents,ai,autogen,crewai,evaluation,langgraph,llm,observability,testing
|
|
34
|
+
Classifier: Development Status :: 3 - Alpha
|
|
35
|
+
Classifier: Intended Audience :: Developers
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Operating System :: OS Independent
|
|
38
|
+
Classifier: Programming Language :: Python :: 3
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
43
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
44
|
+
Classifier: Topic :: Software Development :: Testing
|
|
45
|
+
Classifier: Typing :: Typed
|
|
46
|
+
Requires-Python: >=3.10
|
|
47
|
+
Requires-Dist: pydantic>=2.0
|
|
48
|
+
Requires-Dist: pyyaml>=6.0
|
|
49
|
+
Requires-Dist: setuptools>=82.0.1
|
|
50
|
+
Provides-Extra: all
|
|
51
|
+
Requires-Dist: httpx>=0.27; extra == 'all'
|
|
52
|
+
Provides-Extra: dev
|
|
53
|
+
Requires-Dist: httpx>=0.27; extra == 'dev'
|
|
54
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
55
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
56
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
57
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
58
|
+
Provides-Extra: http
|
|
59
|
+
Requires-Dist: httpx>=0.27; extra == 'http'
|
|
60
|
+
Provides-Extra: llm
|
|
61
|
+
Requires-Dist: httpx>=0.27; extra == 'llm'
|
|
62
|
+
Description-Content-Type: text/markdown
|
|
63
|
+
|
|
64
|
+
# CortexOps
|
|
65
|
+
|
|
66
|
+
**Reliability infrastructure for AI agents.**
|
|
67
|
+
Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
|
|
68
|
+
|
|
69
|
+
[PyPI package](https://pypi.org/project/cortexops/)
|
|
70
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
71
|
+
[Eval workflow status](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
|
|
72
|
+
[License: MIT](https://github.com/ashishodu2023/cortexops/blob/main/LICENSE)
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## The problem
|
|
77
|
+
|
|
78
|
+
You deployed an agent. You have no idea if it regressed overnight.
|
|
79
|
+
|
|
80
|
+
No standard eval format. No failure traces. No CI gate before the next prompt change ships.
|
|
81
|
+
CortexOps fixes that.
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Install
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
pip install cortexops
|
|
89
|
+
|
|
90
|
+
# With HTTP client (for pushing traces to hosted API):
|
|
91
|
+
pip install cortexops[http]
|
|
92
|
+
|
|
93
|
+
# With LLM judge support:
|
|
94
|
+
pip install cortexops[llm]
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Quickstart
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from cortexops import CortexTracer, EvalSuite
|
|
103
|
+
|
|
104
|
+
# Wrap your LangGraph app — zero refactor required
|
|
105
|
+
tracer = CortexTracer(project="payments-agent")
|
|
106
|
+
graph = tracer.wrap(your_langgraph_app)
|
|
107
|
+
|
|
108
|
+
# Run evaluations against a golden dataset
|
|
109
|
+
results = EvalSuite.run(
|
|
110
|
+
dataset="golden_v1.yaml",
|
|
111
|
+
agent=graph,
|
|
112
|
+
)
|
|
113
|
+
print(results.summary())
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Golden dataset (YAML)
|
|
119
|
+
|
|
120
|
+
```yaml
|
|
121
|
+
version: 1
|
|
122
|
+
project: payments-agent
|
|
123
|
+
|
|
124
|
+
cases:
|
|
125
|
+
- id: refund_lookup_01
|
|
126
|
+
input: "What is the status of refund REF-8821?"
|
|
127
|
+
expected_tool_calls: [lookup_refund]
|
|
128
|
+
expected_output_contains: ["approved", "REF-8821"]
|
|
129
|
+
max_latency_ms: 3000
|
|
130
|
+
|
|
131
|
+
- id: open_ended_explanation_01
|
|
132
|
+
input: "Why was my refund rejected?"
|
|
133
|
+
judge: llm
|
|
134
|
+
judge_criteria: >
|
|
135
|
+
The response must explain the rejection reason clearly,
|
|
136
|
+
be empathetic, and offer a concrete next step. No jargon.
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
## CI gate
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
cortexops eval run \
|
|
145
|
+
--dataset golden_v1.yaml \
|
|
146
|
+
--fail-on "task_completion < 0.90"
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Exits non-zero if the threshold is not met — blocks the PR.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Built-in metrics
|
|
154
|
+
|
|
155
|
+
| Metric | What it checks |
|
|
156
|
+
|---|---|
|
|
157
|
+
| `task_completion` | Non-empty, non-error output with expected content |
|
|
158
|
+
| `tool_accuracy` | Expected tool calls were actually made |
|
|
159
|
+
| `latency` | Response within `max_latency_ms` budget |
|
|
160
|
+
| `hallucination` | Fabrication signals in output |
|
|
161
|
+
| `llm_judge` | GPT-4o scores against natural-language criteria |
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Links
|
|
166
|
+
|
|
167
|
+
- **Docs**: [docs.cortexops.ai](https://docs.cortexops.ai)
|
|
168
|
+
- **Repo**: [github.com/ashishodu2023/cortexops](https://github.com/ashishodu2023/cortexops)
|
|
169
|
+
- **Issues**: [GitHub Issues](https://github.com/ashishodu2023/cortexops/issues)
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# CortexOps
|
|
2
|
+
|
|
3
|
+
**Reliability infrastructure for AI agents.**
|
|
4
|
+
Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
|
|
5
|
+
|
|
6
|
+
[PyPI package](https://pypi.org/project/cortexops/)
|
|
7
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
8
|
+
[Eval workflow status](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
|
|
9
|
+
[License: MIT](https://github.com/ashishodu2023/cortexops/blob/main/LICENSE)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## The problem
|
|
14
|
+
|
|
15
|
+
You deployed an agent. You have no idea if it regressed overnight.
|
|
16
|
+
|
|
17
|
+
No standard eval format. No failure traces. No CI gate before the next prompt change ships.
|
|
18
|
+
CortexOps fixes that.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install cortexops
|
|
26
|
+
|
|
27
|
+
# With HTTP client (for pushing traces to hosted API):
|
|
28
|
+
pip install cortexops[http]
|
|
29
|
+
|
|
30
|
+
# With LLM judge support:
|
|
31
|
+
pip install cortexops[llm]
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quickstart
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from cortexops import CortexTracer, EvalSuite
|
|
40
|
+
|
|
41
|
+
# Wrap your LangGraph app — zero refactor required
|
|
42
|
+
tracer = CortexTracer(project="payments-agent")
|
|
43
|
+
graph = tracer.wrap(your_langgraph_app)
|
|
44
|
+
|
|
45
|
+
# Run evaluations against a golden dataset
|
|
46
|
+
results = EvalSuite.run(
|
|
47
|
+
dataset="golden_v1.yaml",
|
|
48
|
+
agent=graph,
|
|
49
|
+
)
|
|
50
|
+
print(results.summary())
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Golden dataset (YAML)
|
|
56
|
+
|
|
57
|
+
```yaml
|
|
58
|
+
version: 1
|
|
59
|
+
project: payments-agent
|
|
60
|
+
|
|
61
|
+
cases:
|
|
62
|
+
- id: refund_lookup_01
|
|
63
|
+
input: "What is the status of refund REF-8821?"
|
|
64
|
+
expected_tool_calls: [lookup_refund]
|
|
65
|
+
expected_output_contains: ["approved", "REF-8821"]
|
|
66
|
+
max_latency_ms: 3000
|
|
67
|
+
|
|
68
|
+
- id: open_ended_explanation_01
|
|
69
|
+
input: "Why was my refund rejected?"
|
|
70
|
+
judge: llm
|
|
71
|
+
judge_criteria: >
|
|
72
|
+
The response must explain the rejection reason clearly,
|
|
73
|
+
be empathetic, and offer a concrete next step. No jargon.
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## CI gate
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
cortexops eval run \
|
|
82
|
+
--dataset golden_v1.yaml \
|
|
83
|
+
--fail-on "task_completion < 0.90"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Exits non-zero if the threshold is not met — blocks the PR.
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Built-in metrics
|
|
91
|
+
|
|
92
|
+
| Metric | What it checks |
|
|
93
|
+
|---|---|
|
|
94
|
+
| `task_completion` | Non-empty, non-error output with expected content |
|
|
95
|
+
| `tool_accuracy` | Expected tool calls were actually made |
|
|
96
|
+
| `latency` | Response within `max_latency_ms` budget |
|
|
97
|
+
| `hallucination` | Fabrication signals in output |
|
|
98
|
+
| `llm_judge` | GPT-4o scores against natural-language criteria |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Links
|
|
103
|
+
|
|
104
|
+
- **Docs**: [docs.cortexops.ai](https://docs.cortexops.ai)
|
|
105
|
+
- **Repo**: [github.com/ashishodu2023/cortexops](https://github.com/ashishodu2023/cortexops)
|
|
106
|
+
- **Issues**: [GitHub Issues](https://github.com/ashishodu2023/cortexops/issues)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 CortexOps Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# CortexOps
|
|
2
|
+
|
|
3
|
+
**Reliability infrastructure for AI agents.**
|
|
4
|
+
Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
|
|
5
|
+
|
|
6
|
+
[PyPI package](https://pypi.org/project/cortexops/)
|
|
7
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
8
|
+
[Eval workflow status](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
|
|
9
|
+
[License: MIT](https://github.com/ashishodu2023/cortexops/blob/main/LICENSE)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## The problem
|
|
14
|
+
|
|
15
|
+
You deployed an agent. You have no idea if it regressed overnight.
|
|
16
|
+
|
|
17
|
+
No standard eval format. No failure traces. No CI gate before the next prompt change ships.
|
|
18
|
+
CortexOps fixes that.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install cortexops
|
|
26
|
+
|
|
27
|
+
# With HTTP client (for pushing traces to hosted API):
|
|
28
|
+
pip install cortexops[http]
|
|
29
|
+
|
|
30
|
+
# With LLM judge support:
|
|
31
|
+
pip install cortexops[llm]
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quickstart
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from cortexops import CortexTracer, EvalSuite
|
|
40
|
+
|
|
41
|
+
# Wrap your LangGraph app — zero refactor required
|
|
42
|
+
tracer = CortexTracer(project="payments-agent")
|
|
43
|
+
graph = tracer.wrap(your_langgraph_app)
|
|
44
|
+
|
|
45
|
+
# Run evaluations against a golden dataset
|
|
46
|
+
results = EvalSuite.run(
|
|
47
|
+
dataset="golden_v1.yaml",
|
|
48
|
+
agent=graph,
|
|
49
|
+
)
|
|
50
|
+
print(results.summary())
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Golden dataset (YAML)
|
|
56
|
+
|
|
57
|
+
```yaml
|
|
58
|
+
version: 1
|
|
59
|
+
project: payments-agent
|
|
60
|
+
|
|
61
|
+
cases:
|
|
62
|
+
- id: refund_lookup_01
|
|
63
|
+
input: "What is the status of refund REF-8821?"
|
|
64
|
+
expected_tool_calls: [lookup_refund]
|
|
65
|
+
expected_output_contains: ["approved", "REF-8821"]
|
|
66
|
+
max_latency_ms: 3000
|
|
67
|
+
|
|
68
|
+
- id: open_ended_explanation_01
|
|
69
|
+
input: "Why was my refund rejected?"
|
|
70
|
+
judge: llm
|
|
71
|
+
judge_criteria: >
|
|
72
|
+
The response must explain the rejection reason clearly,
|
|
73
|
+
be empathetic, and offer a concrete next step. No jargon.
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## CI gate
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
cortexops eval run \
|
|
82
|
+
--dataset golden_v1.yaml \
|
|
83
|
+
--fail-on "task_completion < 0.90"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Exits non-zero if the threshold is not met — blocks the PR.
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Built-in metrics
|
|
91
|
+
|
|
92
|
+
| Metric | What it checks |
|
|
93
|
+
|---|---|
|
|
94
|
+
| `task_completion` | Non-empty, non-error output with expected content |
|
|
95
|
+
| `tool_accuracy` | Expected tool calls were actually made |
|
|
96
|
+
| `latency` | Response within `max_latency_ms` budget |
|
|
97
|
+
| `hallucination` | Fabrication signals in output |
|
|
98
|
+
| `llm_judge` | GPT-4o scores against natural-language criteria |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Links
|
|
103
|
+
|
|
104
|
+
- **Docs**: [docs.cortexops.ai](https://docs.cortexops.ai)
|
|
105
|
+
- **Repo**: [github.com/ashishodu2023/cortexops](https://github.com/ashishodu2023/cortexops)
|
|
106
|
+
- **Issues**: [GitHub Issues](https://github.com/ashishodu2023/cortexops/issues)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""CortexOps — Reliability infrastructure for AI agents.
|
|
2
|
+
|
|
3
|
+
Quickstart:
|
|
4
|
+
from cortexops import CortexTracer, EvalSuite
|
|
5
|
+
|
|
6
|
+
tracer = CortexTracer(project="my-agent")
|
|
7
|
+
graph = tracer.wrap(your_langgraph_app)
|
|
8
|
+
|
|
9
|
+
results = EvalSuite.run(dataset="golden_v1.yaml", agent=graph)
|
|
10
|
+
print(results.summary())
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .client import CortexClient
|
|
14
|
+
from .eval import EvalSuite, EvalThresholdError
|
|
15
|
+
from .judge import LLMJudgeMetric
|
|
16
|
+
from .metrics import (
|
|
17
|
+
HallucinationMetric,
|
|
18
|
+
LatencyMetric,
|
|
19
|
+
Metric,
|
|
20
|
+
TaskCompletionMetric,
|
|
21
|
+
ToolAccuracyMetric,
|
|
22
|
+
)
|
|
23
|
+
from .models import (
|
|
24
|
+
CaseResult,
|
|
25
|
+
EvalCase,
|
|
26
|
+
EvalDataset,
|
|
27
|
+
EvalSummary,
|
|
28
|
+
FailureKind,
|
|
29
|
+
RunStatus,
|
|
30
|
+
Trace,
|
|
31
|
+
TraceNode,
|
|
32
|
+
ToolCall,
|
|
33
|
+
)
|
|
34
|
+
from .tracer import CortexTracer
|
|
35
|
+
|
|
36
|
+
__version__ = "0.1.0"
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"CortexTracer",
|
|
40
|
+
"EvalSuite",
|
|
41
|
+
"EvalThresholdError",
|
|
42
|
+
"CortexClient",
|
|
43
|
+
"Metric",
|
|
44
|
+
"TaskCompletionMetric",
|
|
45
|
+
"ToolAccuracyMetric",
|
|
46
|
+
"LatencyMetric",
|
|
47
|
+
"HallucinationMetric",
|
|
48
|
+
"LLMJudgeMetric",
|
|
49
|
+
"Trace",
|
|
50
|
+
"TraceNode",
|
|
51
|
+
"ToolCall",
|
|
52
|
+
"EvalCase",
|
|
53
|
+
"EvalDataset",
|
|
54
|
+
"EvalSummary",
|
|
55
|
+
"CaseResult",
|
|
56
|
+
"FailureKind",
|
|
57
|
+
"RunStatus",
|
|
58
|
+
]
|