cortexops 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cortexops/LICENSE +21 -0
- cortexops/README.md +106 -0
- cortexops/__init__.py +58 -0
- cortexops/cli.py +195 -0
- cortexops/client.py +84 -0
- cortexops/cortexops/__init__.py +58 -0
- cortexops/cortexops/cli.py +195 -0
- cortexops/cortexops/client.py +84 -0
- cortexops/cortexops/eval.py +216 -0
- cortexops/cortexops/judge.py +155 -0
- cortexops/cortexops/metrics.py +184 -0
- cortexops/cortexops/models.py +141 -0
- cortexops/cortexops/tracer.py +210 -0
- cortexops/eval.py +216 -0
- cortexops/judge.py +155 -0
- cortexops/metrics.py +184 -0
- cortexops/models.py +141 -0
- cortexops/pyproject.toml +87 -0
- cortexops/tests/__init__.py +0 -0
- cortexops/tests/test_cortexops.py +211 -0
- cortexops/tests/test_enhancements.py +222 -0
- cortexops/tracer.py +210 -0
- cortexops-0.1.0.dist-info/METADATA +169 -0
- cortexops-0.1.0.dist-info/RECORD +27 -0
- cortexops-0.1.0.dist-info/WHEEL +4 -0
- cortexops-0.1.0.dist-info/entry_points.txt +2 -0
- cortexops-0.1.0.dist-info/licenses/LICENSE +21 -0
cortexops/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 CortexOps Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
cortexops/README.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# CortexOps
|
|
2
|
+
|
|
3
|
+
**Reliability infrastructure for AI agents.**
|
|
4
|
+
Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
|
|
5
|
+
|
|
6
|
+
[PyPI](https://pypi.org/project/cortexops/)
|
|
7
|
+
[Python](https://www.python.org/downloads/)
|
|
8
|
+
[Eval CI](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
|
|
9
|
+
[License: MIT](https://github.com/ashishodu2023/cortexops/blob/main/LICENSE)
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## The problem
|
|
14
|
+
|
|
15
|
+
You deployed an agent. You have no idea if it regressed overnight.
|
|
16
|
+
|
|
17
|
+
No standard eval format. No failure traces. No CI gate before the next prompt change ships.
|
|
18
|
+
CortexOps fixes that.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install cortexops
|
|
26
|
+
|
|
27
|
+
# With HTTP client (for pushing traces to hosted API):
|
|
28
|
+
pip install cortexops[http]
|
|
29
|
+
|
|
30
|
+
# With LLM judge support:
|
|
31
|
+
pip install cortexops[llm]
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Quickstart
|
|
37
|
+
|
|
38
|
+
```python
|
|
39
|
+
from cortexops import CortexTracer, EvalSuite
|
|
40
|
+
|
|
41
|
+
# Wrap your LangGraph app — zero refactor required
|
|
42
|
+
tracer = CortexTracer(project="payments-agent")
|
|
43
|
+
graph = tracer.wrap(your_langgraph_app)
|
|
44
|
+
|
|
45
|
+
# Run evaluations against a golden dataset
|
|
46
|
+
results = EvalSuite.run(
|
|
47
|
+
dataset="golden_v1.yaml",
|
|
48
|
+
agent=graph,
|
|
49
|
+
)
|
|
50
|
+
print(results.summary())
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Golden dataset (YAML)
|
|
56
|
+
|
|
57
|
+
```yaml
|
|
58
|
+
version: 1
|
|
59
|
+
project: payments-agent
|
|
60
|
+
|
|
61
|
+
cases:
|
|
62
|
+
- id: refund_lookup_01
|
|
63
|
+
input: "What is the status of refund REF-8821?"
|
|
64
|
+
expected_tool_calls: [lookup_refund]
|
|
65
|
+
expected_output_contains: ["approved", "REF-8821"]
|
|
66
|
+
max_latency_ms: 3000
|
|
67
|
+
|
|
68
|
+
- id: open_ended_explanation_01
|
|
69
|
+
input: "Why was my refund rejected?"
|
|
70
|
+
judge: llm
|
|
71
|
+
judge_criteria: >
|
|
72
|
+
The response must explain the rejection reason clearly,
|
|
73
|
+
be empathetic, and offer a concrete next step. No jargon.
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## CI gate
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
cortexops eval run \
|
|
82
|
+
--dataset golden_v1.yaml \
|
|
83
|
+
--fail-on "task_completion < 0.90"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Exits non-zero if the threshold is not met — blocks the PR.
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Built-in metrics
|
|
91
|
+
|
|
92
|
+
| Metric | What it checks |
|
|
93
|
+
|---|---|
|
|
94
|
+
| `task_completion` | Non-empty, non-error output with expected content |
|
|
95
|
+
| `tool_accuracy` | Expected tool calls were actually made |
|
|
96
|
+
| `latency` | Response within `max_latency_ms` budget |
|
|
97
|
+
| `hallucination` | Fabrication signals in output |
|
|
98
|
+
| `llm_judge` | GPT-4o scores against natural-language criteria |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Links
|
|
103
|
+
|
|
104
|
+
- **Docs**: [docs.cortexops.ai](https://docs.cortexops.ai)
|
|
105
|
+
- **Repo**: [github.com/ashishodu2023/cortexops](https://github.com/ashishodu2023/cortexops)
|
|
106
|
+
- **Issues**: [GitHub Issues](https://github.com/ashishodu2023/cortexops/issues)
|
cortexops/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""CortexOps — Reliability infrastructure for AI agents.
|
|
2
|
+
|
|
3
|
+
Quickstart:
|
|
4
|
+
from cortexops import CortexTracer, EvalSuite
|
|
5
|
+
|
|
6
|
+
tracer = CortexTracer(project="my-agent")
|
|
7
|
+
graph = tracer.wrap(your_langgraph_app)
|
|
8
|
+
|
|
9
|
+
results = EvalSuite.run(dataset="golden_v1.yaml", agent=graph)
|
|
10
|
+
print(results.summary())
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .client import CortexClient
|
|
14
|
+
from .eval import EvalSuite, EvalThresholdError
|
|
15
|
+
from .judge import LLMJudgeMetric
|
|
16
|
+
from .metrics import (
|
|
17
|
+
HallucinationMetric,
|
|
18
|
+
LatencyMetric,
|
|
19
|
+
Metric,
|
|
20
|
+
TaskCompletionMetric,
|
|
21
|
+
ToolAccuracyMetric,
|
|
22
|
+
)
|
|
23
|
+
from .models import (
|
|
24
|
+
CaseResult,
|
|
25
|
+
EvalCase,
|
|
26
|
+
EvalDataset,
|
|
27
|
+
EvalSummary,
|
|
28
|
+
FailureKind,
|
|
29
|
+
RunStatus,
|
|
30
|
+
Trace,
|
|
31
|
+
TraceNode,
|
|
32
|
+
ToolCall,
|
|
33
|
+
)
|
|
34
|
+
from .tracer import CortexTracer
|
|
35
|
+
|
|
36
|
+
__version__ = "0.1.0"
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"CortexTracer",
|
|
40
|
+
"EvalSuite",
|
|
41
|
+
"EvalThresholdError",
|
|
42
|
+
"CortexClient",
|
|
43
|
+
"Metric",
|
|
44
|
+
"TaskCompletionMetric",
|
|
45
|
+
"ToolAccuracyMetric",
|
|
46
|
+
"LatencyMetric",
|
|
47
|
+
"HallucinationMetric",
|
|
48
|
+
"LLMJudgeMetric",
|
|
49
|
+
"Trace",
|
|
50
|
+
"TraceNode",
|
|
51
|
+
"ToolCall",
|
|
52
|
+
"EvalCase",
|
|
53
|
+
"EvalDataset",
|
|
54
|
+
"EvalSummary",
|
|
55
|
+
"CaseResult",
|
|
56
|
+
"FailureKind",
|
|
57
|
+
"RunStatus",
|
|
58
|
+
]
|
cortexops/cli.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""CortexOps CLI — cortexops <command> [options]
|
|
2
|
+
|
|
3
|
+
Commands:
|
|
4
|
+
eval run Run an evaluation suite
|
|
5
|
+
eval diff Diff two eval runs
|
|
6
|
+
failures Show recent failures
|
|
7
|
+
traces List recent traces (not yet implemented)
|
|
8
|
+
version Print SDK version
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import sys
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cmd_eval_run(args: argparse.Namespace) -> int:
    """Run an eval suite and translate the outcome into a process exit code.

    Usage: cortexops eval run --dataset golden_v1.yaml --project my-agent

    Args:
        args: Parsed CLI namespace. Reads ``dataset``, ``project``, ``agent``,
            ``fail_on``, ``output`` and ``quiet``.

    Returns:
        0 when ``summary.failed`` is zero. 1 when any case failed, the
        ``--fail-on`` threshold raised ``EvalThresholdError``, the dataset
        file was not found, or the ``--agent`` reference could not be
        imported.
    """
    # Puts the directory two levels above this file on sys.path before the
    # SDK import. Presumably a source-checkout convenience — TODO confirm it
    # is still wanted for installed wheels.
    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

    from cortexops import EvalSuite
    from cortexops.eval import EvalThresholdError

    # NOTE(review): args.project is only echoed here; it is not forwarded to
    # EvalSuite.run, so "overrides dataset" in the --project help text does
    # not appear to be implemented — confirm.
    print(f"CortexOps eval\n dataset : {args.dataset}\n project : {args.project or 'from dataset'}")
    if args.fail_on:
        print(f" fail-on : {args.fail_on}")
    print()

    def passthrough_agent(inp: dict) -> dict:
        """Placeholder — replace with your actual agent import."""
        return {"output": f"[no agent bound] input was: {inp}"}

    if args.agent:
        # A bad --agent reference is a user error: report it cleanly instead
        # of letting the import machinery's traceback reach the terminal.
        try:
            agent = _load_agent(args.agent)
        except (ImportError, AttributeError) as e:
            print(f"Error: could not load agent '{args.agent}': {e}", file=sys.stderr)
            return 1
    else:
        agent = passthrough_agent

    try:
        summary = EvalSuite.run(
            dataset=args.dataset,
            agent=agent,
            verbose=not args.quiet,
            fail_on=args.fail_on,
        )
    except EvalThresholdError as e:
        print(f"\nCI gate FAILED: {e}", file=sys.stderr)
        return 1
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    if args.output:
        payload = json.dumps(summary.model_dump(mode="json"), indent=2)
        Path(args.output).write_text(payload, encoding="utf-8")
        print(f"\nResults written to {args.output}")

    return 0 if summary.failed == 0 else 1
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def cmd_eval_diff(args: argparse.Namespace) -> int:
    """Fetch the diff between two eval runs from the API and print it.

    Usage: cortexops eval diff <run_a> <run_b> --api-key cxo-...

    The API key comes from ``--api-key`` or, failing that, the
    ``CORTEXOPS_API_KEY`` environment variable.

    Returns:
        1 when no API key is available, the request raises, or the diff
        reports at least one regression; 0 otherwise.
    """
    from cortexops import CortexClient

    token = args.api_key or os.getenv("CORTEXOPS_API_KEY")
    if not token:
        print("Error: --api-key or CORTEXOPS_API_KEY required for diff", file=sys.stderr)
        return 1

    def signed_pct(value):
        """Render a fraction as a percentage, with '+' on non-negative values."""
        if value >= 0:
            return f"+{value:.1%}"
        return f"{value:.1%}"

    api = CortexClient(api_key=token, base_url=args.base_url)
    try:
        report = api.diff(args.run_a, args.run_b)
    except Exception as exc:
        print(f"Error fetching diff: {exc}", file=sys.stderr)
        return 1

    completion_delta = report.get("task_completion_delta", 0)
    tool_delta = report.get("tool_accuracy_delta", 0)
    worse = report.get("regressions", [])
    better = report.get("improvements", [])

    print(f"Diff: {args.run_a[:8]} → {args.run_b[:8]}")
    print(f" Task completion : {signed_pct(completion_delta)}")
    # NOTE(review): only the tool-accuracy delta is divided by 100 —
    # presumably the API reports it on a 0-100 scale; confirm.
    print(f" Tool accuracy : {signed_pct(tool_delta / 100)}")
    if worse:
        print(f" Regressions ({len(worse)}): {', '.join(worse)}")
    if better:
        print(f" Improvements ({len(better)}): {', '.join(better)}")

    if worse:
        return 1
    return 0
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def cmd_failures(args: argparse.Namespace) -> int:
    """Print a table of recent failed traces for one project.

    Usage: cortexops failures --project my-agent --limit 20

    Fetches up to ``args.limit`` traces for ``args.project`` and keeps those
    whose ``status`` is ``"failed"``. The API key comes from ``--api-key`` or
    the ``CORTEXOPS_API_KEY`` environment variable.

    Returns:
        1 when no API key is available or the request raises; 0 otherwise
        (including when there are no failures to show).
    """
    from cortexops import CortexClient

    api_key = args.api_key or os.getenv("CORTEXOPS_API_KEY")
    if not api_key:
        print("Error: --api-key or CORTEXOPS_API_KEY required", file=sys.stderr)
        return 1

    client = CortexClient(api_key=api_key, base_url=args.base_url)
    try:
        traces = client.list_traces(project=args.project, limit=args.limit)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    failed = [t for t in traces if t.get("status") == "failed"]
    if not failed:
        print(f"No failures found for project '{args.project}'")
        return 0

    print(f"Failures — {args.project} (last {len(traces)} traces)")
    print(f"{'Trace ID':<36} {'Failure kind':<28} Latency")
    print("-" * 78)
    for t in failed[:args.limit]:
        # Any of these fields may be absent or null in a trace record; fall
        # back to a placeholder rather than crash halfway through the table.
        trace_id = t.get("trace_id") or "unknown"
        kind = t.get("failure_kind") or "unknown"
        latency_ms = t.get("total_latency_ms") or 0
        print(f"{trace_id:<36} {kind:<28} {latency_ms:.0f}ms")
    return 0
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def cmd_version(_: argparse.Namespace) -> int:
    """Print ``cortexops <version>`` and return exit code 0."""
    import cortexops

    print(f"cortexops {cortexops.__version__}")
    return 0
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _load_agent(agent_path: str):
    """Resolve a ``module:object`` reference to the object it names.

    The text before the last ``:`` is imported as a module and the text after
    it is looked up on that module. Dotted object paths such as
    ``pkg.mod:factory.agent`` are walked one attribute at a time.

    Args:
        agent_path: Reference in the form ``module:object``.

    Returns:
        The resolved object.

    Raises:
        ImportError: The module part cannot be imported.
        AttributeError: The object part does not exist on the module.
        SystemExit: With code 1 when ``agent_path`` has no ``:`` or when
            either side of it is empty.
    """
    import importlib

    module_path, sep, attr_path = agent_path.rpartition(":")
    # An empty module or object part is rejected up front with the same usage
    # message, instead of surfacing later as an opaque import/getattr error.
    if not sep or not module_path or not attr_path:
        print(f"Error: --agent must be in the format 'module:object', got '{agent_path}'", file=sys.stderr)
        sys.exit(1)

    target = importlib.import_module(module_path)
    for name in attr_path.split("."):
        target = getattr(target, name)
    return target
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def main(argv: list[str] | None = None) -> None:
    """Parse the command line, dispatch to a ``cmd_*`` handler, and exit.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the console entry point
            relies on.

    Raises:
        SystemExit: Always. The code is the handler's return value, or 0
            after printing help when no runnable command was given.
    """
    parser = argparse.ArgumentParser(
        prog="cortexops",
        description="CortexOps — reliability infrastructure for AI agents",
    )
    sub = parser.add_subparsers(dest="command")

    # eval
    eval_parser = sub.add_parser("eval", help="Evaluation commands")
    eval_sub = eval_parser.add_subparsers(dest="eval_command")

    run_p = eval_sub.add_parser("run", help="Run an eval suite")
    run_p.add_argument("--dataset", "-d", required=True, help="Path to golden dataset YAML")
    run_p.add_argument("--project", "-p", default=None, help="Project name (overrides dataset)")
    run_p.add_argument("--agent", "-a", default=None, help="Agent to evaluate (module:object)")
    run_p.add_argument("--fail-on", default=None, help="e.g. 'task_completion < 0.90'")
    run_p.add_argument("--output", "-o", default=None, help="Save JSON results to file")
    run_p.add_argument("--quiet", "-q", action="store_true", help="Suppress per-case output")

    diff_p = eval_sub.add_parser("diff", help="Diff two eval runs")
    diff_p.add_argument("run_a", help="First run ID")
    diff_p.add_argument("run_b", help="Second run ID")
    diff_p.add_argument("--api-key", default=None)
    diff_p.add_argument("--base-url", default="https://api.cortexops.ai")

    # failures
    fail_p = sub.add_parser("failures", help="List recent agent failures")
    fail_p.add_argument("--project", "-p", required=True)
    fail_p.add_argument("--limit", "-n", type=int, default=20)
    fail_p.add_argument("--api-key", default=None)
    fail_p.add_argument("--base-url", default="https://api.cortexops.ai")

    # version
    sub.add_parser("version", help="Print version and exit")

    args = parser.parse_args(argv)

    # Keyed by (command, eval sub-command); only the "eval" parser defines
    # eval_command, so every other command is looked up with None.
    handlers = {
        ("eval", "run"): cmd_eval_run,
        ("eval", "diff"): cmd_eval_diff,
        ("failures", None): cmd_failures,
        ("version", None): cmd_version,
    }

    key = (args.command, getattr(args, "eval_command", None))
    handler = handlers.get(key)

    if handler is None:
        # A bare `cortexops eval` gets the eval-specific help; anything else
        # without a runnable command gets the top-level help.
        help_source = eval_parser if args.command == "eval" else parser
        help_source.print_help()
        sys.exit(0)

    sys.exit(handler(args))


if __name__ == "__main__":
    main()
|
cortexops/client.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
from urllib.parse import urljoin
|
|
5
|
+
|
|
6
|
+
from .models import EvalSummary, Trace
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CortexClient:
    """HTTP client for the CortexOps backend API.

    Used by the SDK to push traces and pull eval history.
    Not required for local-only usage.

    Every request sends the API key as a bearer token and raises on a
    non-success HTTP status (via ``raise_for_status``). ``httpx`` is imported
    lazily, so it is only needed once a request is actually made.

    Usage:
        client = CortexClient(api_key="cxo-...", base_url="https://api.cortexops.ai")
        client.push_trace(tracer.last_trace())
        history = client.list_runs(project="payments-agent", limit=10)
    """

    DEFAULT_BASE_URL = "https://api.cortexops.ai"

    def __init__(
        self,
        api_key: str,
        base_url: str | None = None,
        timeout: float = 10.0,
    ) -> None:
        """Store connection settings; no network traffic happens here.

        Args:
            api_key: Token sent in the ``Authorization: Bearer`` header.
            base_url: API root. Falls back to ``DEFAULT_BASE_URL`` when
                ``None`` or empty; trailing slashes are stripped.
            timeout: Per-request timeout passed through to httpx.
        """
        self.api_key = api_key
        self.base_url = (base_url or self.DEFAULT_BASE_URL).rstrip("/")
        self.timeout = timeout

    def _headers(self) -> dict[str, str]:
        """Return the headers sent with every request."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def _url(self, path: str) -> str:
        """Join ``path`` onto the base URL, keeping any base path prefix."""
        # The trailing "/" on the base and the stripped leading "/" on the
        # path stop urljoin from discarding a prefix such as "/api".
        return urljoin(self.base_url + "/", path.lstrip("/"))

    @staticmethod
    def _segment(value: str) -> str:
        """Percent-encode ``value`` for use as a single URL path segment."""
        from urllib.parse import quote

        # safe="" also encodes "/", so an ID cannot add extra path segments.
        return quote(str(value), safe="")

    @staticmethod
    def _httpx():
        """Import and return httpx, with an install hint if it is missing."""
        try:
            import httpx
        except ImportError as err:
            raise ImportError(
                "CortexClient needs the optional 'httpx' dependency; "
                "install it with: pip install cortexops[http]"
            ) from err
        return httpx

    def _get(self, path: str, params: dict | None = None) -> Any:
        """GET ``path`` and return the decoded JSON body."""
        r = self._httpx().get(
            self._url(path),
            headers=self._headers(),
            params=params,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def _post(self, path: str, data: dict) -> Any:
        """POST ``data`` as JSON to ``path`` and return the decoded JSON body."""
        r = self._httpx().post(
            self._url(path),
            headers=self._headers(),
            json=data,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def push_trace(self, trace: Trace) -> dict:
        """Upload one trace, serialized with ``model_dump(mode="json")``."""
        return self._post("/v1/traces", trace.model_dump(mode="json"))

    def get_trace(self, trace_id: str) -> dict:
        """Fetch a single trace by ID."""
        return self._get(f"/v1/traces/{self._segment(trace_id)}")

    def list_traces(self, project: str, limit: int = 50) -> list[dict]:
        """List traces for ``project``, requesting at most ``limit``."""
        return self._get("/v1/traces", {"project": project, "limit": limit})

    def push_eval(self, summary: EvalSummary) -> dict:
        """Upload an eval summary, serialized with ``model_dump(mode="json")``."""
        return self._post("/v1/evals", summary.model_dump(mode="json"))

    def list_runs(self, project: str, limit: int = 10) -> list[dict]:
        """List eval runs for ``project``, requesting at most ``limit``."""
        return self._get("/v1/evals", {"project": project, "limit": limit})

    def run_eval(self, dataset: str, project: str) -> dict:
        """Trigger a server-side eval run (async via Celery)."""
        return self._post("/v1/evals/run", {"dataset": dataset, "project": project})

    def get_eval(self, run_id: str) -> dict:
        """Fetch a single eval run by ID."""
        return self._get(f"/v1/evals/{self._segment(run_id)}")

    def diff(self, run_id_a: str, run_id_b: str) -> dict:
        """Fetch the diff between two eval runs (sent as query params a and b)."""
        return self._get("/v1/evals/diff", {"a": run_id_a, "b": run_id_b})
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""CortexOps — Reliability infrastructure for AI agents.
|
|
2
|
+
|
|
3
|
+
Quickstart:
|
|
4
|
+
from cortexops import CortexTracer, EvalSuite
|
|
5
|
+
|
|
6
|
+
tracer = CortexTracer(project="my-agent")
|
|
7
|
+
graph = tracer.wrap(your_langgraph_app)
|
|
8
|
+
|
|
9
|
+
results = EvalSuite.run(dataset="golden_v1.yaml", agent=graph)
|
|
10
|
+
print(results.summary())
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from .client import CortexClient
|
|
14
|
+
from .eval import EvalSuite, EvalThresholdError
|
|
15
|
+
from .judge import LLMJudgeMetric
|
|
16
|
+
from .metrics import (
|
|
17
|
+
HallucinationMetric,
|
|
18
|
+
LatencyMetric,
|
|
19
|
+
Metric,
|
|
20
|
+
TaskCompletionMetric,
|
|
21
|
+
ToolAccuracyMetric,
|
|
22
|
+
)
|
|
23
|
+
from .models import (
|
|
24
|
+
CaseResult,
|
|
25
|
+
EvalCase,
|
|
26
|
+
EvalDataset,
|
|
27
|
+
EvalSummary,
|
|
28
|
+
FailureKind,
|
|
29
|
+
RunStatus,
|
|
30
|
+
Trace,
|
|
31
|
+
TraceNode,
|
|
32
|
+
ToolCall,
|
|
33
|
+
)
|
|
34
|
+
from .tracer import CortexTracer
|
|
35
|
+
|
|
36
|
+
__version__ = "0.1.0"
|
|
37
|
+
|
|
38
|
+
__all__ = [
|
|
39
|
+
"CortexTracer",
|
|
40
|
+
"EvalSuite",
|
|
41
|
+
"EvalThresholdError",
|
|
42
|
+
"CortexClient",
|
|
43
|
+
"Metric",
|
|
44
|
+
"TaskCompletionMetric",
|
|
45
|
+
"ToolAccuracyMetric",
|
|
46
|
+
"LatencyMetric",
|
|
47
|
+
"HallucinationMetric",
|
|
48
|
+
"LLMJudgeMetric",
|
|
49
|
+
"Trace",
|
|
50
|
+
"TraceNode",
|
|
51
|
+
"ToolCall",
|
|
52
|
+
"EvalCase",
|
|
53
|
+
"EvalDataset",
|
|
54
|
+
"EvalSummary",
|
|
55
|
+
"CaseResult",
|
|
56
|
+
"FailureKind",
|
|
57
|
+
"RunStatus",
|
|
58
|
+
]
|