cortexops 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cortexops/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 CortexOps Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
cortexops/README.md ADDED
@@ -0,0 +1,106 @@
1
+ # CortexOps
2
+
3
+ **Reliability infrastructure for AI agents.**
4
+ Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
5
+
6
+ [![PyPI version](https://img.shields.io/pypi/v/cortexops.svg)](https://pypi.org/project/cortexops/)
7
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
8
+ [![CI](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml/badge.svg)](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
9
+ [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://github.com/ashishodu2023/cortexops/blob/main/LICENSE)
10
+
11
+ ---
12
+
13
+ ## The problem
14
+
15
+ You deployed an agent. You have no idea if it regressed overnight.
16
+
17
+ No standard eval format. No failure traces. No CI gate before the next prompt change ships.
18
+ CortexOps fixes that.
19
+
20
+ ---
21
+
22
+ ## Install
23
+
24
+ ```bash
25
+ pip install cortexops
26
+
27
+ # With HTTP client (for pushing traces to hosted API):
28
+ pip install cortexops[http]
29
+
30
+ # With LLM judge support:
31
+ pip install cortexops[llm]
32
+ ```
33
+
34
+ ---
35
+
36
+ ## Quickstart
37
+
38
+ ```python
39
+ from cortexops import CortexTracer, EvalSuite
40
+
41
+ # Wrap your LangGraph app — zero refactor required
42
+ tracer = CortexTracer(project="payments-agent")
43
+ graph = tracer.wrap(your_langgraph_app)
44
+
45
+ # Run evaluations against a golden dataset
46
+ results = EvalSuite.run(
47
+ dataset="golden_v1.yaml",
48
+ agent=graph,
49
+ )
50
+ print(results.summary())
51
+ ```
52
+
53
+ ---
54
+
55
+ ## Golden dataset (YAML)
56
+
57
+ ```yaml
58
+ version: 1
59
+ project: payments-agent
60
+
61
+ cases:
62
+ - id: refund_lookup_01
63
+ input: "What is the status of refund REF-8821?"
64
+ expected_tool_calls: [lookup_refund]
65
+ expected_output_contains: ["approved", "REF-8821"]
66
+ max_latency_ms: 3000
67
+
68
+ - id: open_ended_explanation_01
69
+ input: "Why was my refund rejected?"
70
+ judge: llm
71
+ judge_criteria: >
72
+ The response must explain the rejection reason clearly,
73
+ be empathetic, and offer a concrete next step. No jargon.
74
+ ```
75
+
76
+ ---
77
+
78
+ ## CI gate
79
+
80
+ ```bash
81
+ cortexops eval run \
82
+ --dataset golden_v1.yaml \
83
+ --fail-on "task_completion < 0.90"
84
+ ```
85
+
86
+ Exits non-zero if the threshold is not met — blocks the PR.
87
+
88
+ ---
89
+
90
+ ## Built-in metrics
91
+
92
+ | Metric | What it checks |
93
+ |---|---|
94
+ | `task_completion` | Non-empty, non-error output with expected content |
95
+ | `tool_accuracy` | Expected tool calls were actually made |
96
+ | `latency` | Response within `max_latency_ms` budget |
97
+ | `hallucination` | Fabrication signals in output |
98
+ | `llm_judge` | GPT-4o scores against natural-language criteria |
99
+
100
+ ---
101
+
102
+ ## Links
103
+
104
+ - **Docs**: [docs.cortexops.ai](https://docs.cortexops.ai)
105
+ - **Repo**: [github.com/ashishodu2023/cortexops](https://github.com/ashishodu2023/cortexops)
106
+ - **Issues**: [GitHub Issues](https://github.com/ashishodu2023/cortexops/issues)
cortexops/__init__.py ADDED
@@ -0,0 +1,58 @@
1
+ """CortexOps — Reliability infrastructure for AI agents.
2
+
3
+ Quickstart:
4
+ from cortexops import CortexTracer, EvalSuite
5
+
6
+ tracer = CortexTracer(project="my-agent")
7
+ graph = tracer.wrap(your_langgraph_app)
8
+
9
+ results = EvalSuite.run(dataset="golden_v1.yaml", agent=graph)
10
+ print(results.summary())
11
+ """
12
+
13
+ from .client import CortexClient
14
+ from .eval import EvalSuite, EvalThresholdError
15
+ from .judge import LLMJudgeMetric
16
+ from .metrics import (
17
+ HallucinationMetric,
18
+ LatencyMetric,
19
+ Metric,
20
+ TaskCompletionMetric,
21
+ ToolAccuracyMetric,
22
+ )
23
+ from .models import (
24
+ CaseResult,
25
+ EvalCase,
26
+ EvalDataset,
27
+ EvalSummary,
28
+ FailureKind,
29
+ RunStatus,
30
+ Trace,
31
+ TraceNode,
32
+ ToolCall,
33
+ )
34
+ from .tracer import CortexTracer
35
+
36
# SDK version string; printed by the ``cortexops version`` CLI command.
__version__ = "0.1.0"

# Public names exported by ``from cortexops import *``. Every entry is
# imported at the top of this module.
__all__ = [
    # Entry points: tracer (.tracer), eval suite (.eval), HTTP client (.client)
    "CortexTracer",
    "EvalSuite",
    "EvalThresholdError",
    "CortexClient",
    # Metrics (.metrics, plus the LLM judge from .judge)
    "Metric",
    "TaskCompletionMetric",
    "ToolAccuracyMetric",
    "LatencyMetric",
    "HallucinationMetric",
    "LLMJudgeMetric",
    # Data models (.models)
    "Trace",
    "TraceNode",
    "ToolCall",
    "EvalCase",
    "EvalDataset",
    "EvalSummary",
    "CaseResult",
    "FailureKind",
    "RunStatus",
]
cortexops/cli.py ADDED
@@ -0,0 +1,195 @@
1
+ """CortexOps CLI — cortexops <command> [options]
2
+
3
+ Commands:
4
+ eval run Run an evaluation suite
5
+ eval diff Diff two eval runs
6
+ failures Show recent failures
7
+ traces List recent traces (not yet wired into the argument parser)
8
+ version Print SDK version
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import sys
17
+ from pathlib import Path
18
+
19
+
20
def cmd_eval_run(args: argparse.Namespace) -> int:
    """Handle ``cortexops eval run``: evaluate an agent against a dataset.

    Reads ``dataset``, ``project``, ``agent``, ``fail_on``, ``output`` and
    ``quiet`` from *args*. When ``--agent`` is not given, a placeholder agent
    that simply echoes its input is evaluated instead.

    Returns:
        0 when the summary reports no failed cases; 1 when cases failed, the
        ``--fail-on`` gate raised ``EvalThresholdError``, or a
        ``FileNotFoundError`` was raised while loading/running.
    """
    # Make the directory three levels above this file importable before
    # pulling in the package.
    extra_path = Path(__file__).resolve().parents[2]
    sys.path.insert(0, str(extra_path))

    from cortexops import EvalSuite
    from cortexops.eval import EvalThresholdError

    # NOTE(review): args.project is only echoed here and never forwarded to
    # EvalSuite.run, although the flag's help text says it overrides the
    # dataset — confirm the intended behaviour.
    print(f"CortexOps eval\n dataset : {args.dataset}\n project : {args.project or 'from dataset'}")
    if args.fail_on:
        print(f" fail-on : {args.fail_on}")
    print()

    def passthrough_agent(inp: dict) -> dict:
        """Stand-in agent used when no ``--agent`` was supplied."""
        return {"output": f"[no agent bound] input was: {inp}"}

    try:
        if args.agent:
            agent = _load_agent(args.agent)
        else:
            agent = passthrough_agent
        summary = EvalSuite.run(
            dataset=args.dataset,
            agent=agent,
            verbose=not args.quiet,
            fail_on=args.fail_on,
        )
    except EvalThresholdError as gate_error:
        print(f"\nCI gate FAILED: {gate_error}", file=sys.stderr)
        return 1
    except FileNotFoundError as missing:
        print(f"Error: {missing}", file=sys.stderr)
        return 1

    if args.output:
        serialized = json.dumps(summary.model_dump(mode="json"), indent=2)
        Path(args.output).write_text(serialized)
        print(f"\nResults written to {args.output}")

    if summary.failed == 0:
        return 0
    return 1
56
+
57
+
58
def cmd_eval_diff(args: argparse.Namespace) -> int:
    """Handle ``cortexops eval diff <run_a> <run_b> --api-key cxo-...``.

    Fetches the diff between two eval runs from the API and prints the metric
    deltas plus any regressed / improved case ids.

    Args:
        args: Parsed CLI namespace providing ``run_a``, ``run_b``, ``api_key``
            and ``base_url``. The key falls back to ``CORTEXOPS_API_KEY``.

    Returns:
        1 when no API key is available, the request fails, or the diff lists
        regressions; 0 otherwise.
    """
    api_key = args.api_key or os.getenv("CORTEXOPS_API_KEY")
    if not api_key:
        print("Error: --api-key or CORTEXOPS_API_KEY required for diff", file=sys.stderr)
        return 1

    # Imported only once credentials are known to be present.
    from cortexops import CortexClient

    client = CortexClient(api_key=api_key, base_url=args.base_url)
    try:
        diff = client.diff(args.run_a, args.run_b)
    except Exception as e:  # CLI boundary: report any failure and exit non-zero
        print(f"Error fetching diff: {e}", file=sys.stderr)
        return 1

    # ``or`` fallbacks also cover keys that are present but null in the
    # payload, which ``dict.get(key, default)`` would pass through as None.
    delta_tc = diff.get("task_completion_delta") or 0
    delta_tool = diff.get("tool_accuracy_delta") or 0
    regressions = diff.get("regressions") or []
    improvements = diff.get("improvements") or []

    def sign(v: float) -> str:
        """Format a fraction as a percentage with an explicit leading sign."""
        return f"+{v:.1%}" if v >= 0 else f"{v:.1%}"

    print(f"Diff: {args.run_a[:8]} → {args.run_b[:8]}")
    print(f" Task completion : {sign(delta_tc)}")
    # NOTE(review): the tool-accuracy delta is divided by 100 while the task
    # completion delta is not — presumably the API reports it in percentage
    # points; confirm against the backend.
    print(f" Tool accuracy : {sign(delta_tool / 100)}")
    if regressions:
        # str() each entry so non-string ids cannot break the join.
        print(f" Regressions ({len(regressions)}): {', '.join(map(str, regressions))}")
    if improvements:
        print(f" Improvements ({len(improvements)}): {', '.join(map(str, improvements))}")

    return 1 if regressions else 0
89
+
90
+
91
def cmd_failures(args: argparse.Namespace) -> int:
    """Handle ``cortexops failures --project my-agent --limit 20``.

    Lists recent traces for the project and prints a table of those whose
    ``status`` is ``"failed"``.

    Args:
        args: Parsed CLI namespace providing ``project``, ``limit``,
            ``api_key`` and ``base_url``. The key falls back to
            ``CORTEXOPS_API_KEY``.

    Returns:
        1 when no API key is available or the request fails; 0 otherwise,
        including when no failures are found.
    """
    api_key = args.api_key or os.getenv("CORTEXOPS_API_KEY")
    if not api_key:
        print("Error: --api-key or CORTEXOPS_API_KEY required", file=sys.stderr)
        return 1

    # Imported only once credentials are known to be present.
    from cortexops import CortexClient

    client = CortexClient(api_key=api_key, base_url=args.base_url)
    try:
        traces = client.list_traces(project=args.project, limit=args.limit)
    except Exception as e:  # CLI boundary: report any failure and exit non-zero
        print(f"Error: {e}", file=sys.stderr)
        return 1

    failed = [t for t in traces if t.get("status") == "failed"]
    if not failed:
        print(f"No failures found for project '{args.project}'")
        return 0

    print(f"Failures — {args.project} (last {len(traces)} traces)")
    print(f"{'Trace ID':<36} {'Failure kind':<28} Latency")
    print("-" * 78)
    for t in failed[:args.limit]:
        # Every column tolerates a missing or null field so one incomplete
        # record cannot abort the whole listing.
        trace_id = t.get("trace_id") or "unknown"
        failure_kind = t.get("failure_kind") or "unknown"
        latency_ms = t.get("total_latency_ms") or 0
        print(f"{trace_id:<36} {failure_kind:<28} {latency_ms:.0f}ms")
    return 0
121
+
122
+
123
def cmd_version(_: argparse.Namespace) -> int:
    """Print ``cortexops <version>`` to stdout and return exit status 0."""
    import cortexops

    print(f"cortexops {cortexops.__version__}")
    return 0
127
+
128
+
129
def _load_agent(agent_path: str):
    """Load an agent from a ``module:object`` path such as ``mymodule:my_agent``.

    Args:
        agent_path: Importable module path and attribute name separated by a
            colon. Only the last colon separates the two halves.

    Returns:
        The attribute named after the colon, looked up on the imported module.

    Raises:
        SystemExit: With code 1, after printing a message to stderr, when the
            path is malformed, the module cannot be imported, or the module
            has no such attribute.
    """
    import importlib

    module_path, colon, attr = agent_path.rpartition(":")
    # Reject "name", "module:" and ":object" alike — all three are malformed.
    if not (colon and module_path and attr):
        print(f"Error: --agent must be in the format 'module:object', got '{agent_path}'", file=sys.stderr)
        sys.exit(1)

    # Report load failures the same way as a malformed path instead of
    # letting a traceback escape from the CLI.
    try:
        module = importlib.import_module(module_path)
    except ImportError as exc:
        print(f"Error: could not import agent module '{module_path}': {exc}", file=sys.stderr)
        sys.exit(1)

    try:
        return getattr(module, attr)
    except AttributeError:
        print(f"Error: module '{module_path}' has no attribute '{attr}'", file=sys.stderr)
        sys.exit(1)
138
+
139
+
140
def main() -> None:
    """Parse ``sys.argv``, run the selected command, and exit with its status.

    When no registered command (or no ``eval`` sub-command) is selected, the
    top-level help is printed and the process exits with status 0.
    """
    # (command, eval sub-command) -> handler; the second slot is None for
    # commands that have no sub-commands.
    dispatch = {
        ("eval", "run"): cmd_eval_run,
        ("eval", "diff"): cmd_eval_diff,
        ("failures", None): cmd_failures,
        ("version", None): cmd_version,
    }

    def add_api_options(target: argparse.ArgumentParser) -> None:
        """Attach the credential/endpoint flags shared by API-backed commands."""
        target.add_argument("--api-key", default=None)
        target.add_argument("--base-url", default="https://api.cortexops.ai")

    root = argparse.ArgumentParser(
        prog="cortexops",
        description="CortexOps — reliability infrastructure for AI agents",
    )
    commands = root.add_subparsers(dest="command")

    # eval: container command with its own "run" and "diff" sub-commands
    eval_cmd = commands.add_parser("eval", help="Evaluation commands")
    eval_commands = eval_cmd.add_subparsers(dest="eval_command")

    run_cmd = eval_commands.add_parser("run", help="Run an eval suite")
    run_cmd.add_argument("--dataset", "-d", required=True, help="Path to golden dataset YAML")
    run_cmd.add_argument("--project", "-p", default=None, help="Project name (overrides dataset)")
    run_cmd.add_argument("--agent", "-a", default=None, help="Agent to evaluate (module:object)")
    run_cmd.add_argument("--fail-on", default=None, help="e.g. 'task_completion < 0.90'")
    run_cmd.add_argument("--output", "-o", default=None, help="Save JSON results to file")
    run_cmd.add_argument("--quiet", "-q", action="store_true", help="Suppress per-case output")

    diff_cmd = eval_commands.add_parser("diff", help="Diff two eval runs")
    diff_cmd.add_argument("run_a", help="First run ID")
    diff_cmd.add_argument("run_b", help="Second run ID")
    add_api_options(diff_cmd)

    # failures
    failures_cmd = commands.add_parser("failures", help="List recent agent failures")
    failures_cmd.add_argument("--project", "-p", required=True)
    failures_cmd.add_argument("--limit", "-n", type=int, default=20)
    add_api_options(failures_cmd)

    # version
    commands.add_parser("version", help="Print version and exit")

    args = root.parse_args()

    # Only the "eval" parser defines eval_command, hence the getattr default.
    selected = (args.command, getattr(args, "eval_command", None))
    handler = dispatch.get(selected)
    if handler is not None:
        sys.exit(handler(args))

    root.print_help()
    sys.exit(0)


if __name__ == "__main__":
    main()
cortexops/client.py ADDED
@@ -0,0 +1,84 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+ from urllib.parse import urljoin
5
+
6
+ from .models import EvalSummary, Trace
7
+
8
+
9
class CortexClient:
    """HTTP client for the CortexOps backend API.

    Used by the SDK to push traces and pull eval history.
    Not required for local-only usage.

    ``httpx`` is imported lazily inside the request helpers, so constructing
    a client does not require it to be installed.

    Usage:
        client = CortexClient(api_key="cxo-...", base_url="https://api.cortexops.ai")
        client.push_trace(tracer.last_trace())
        history = client.list_runs(project="payments-agent", limit=10)
    """

    DEFAULT_BASE_URL = "https://api.cortexops.ai"

    def __init__(
        self,
        api_key: str,
        base_url: str | None = None,
        timeout: float = 10.0,
    ) -> None:
        """Store credentials and connection settings.

        Args:
            api_key: Token sent as a ``Bearer`` credential on every request.
            base_url: API root; falls back to ``DEFAULT_BASE_URL`` when empty
                or None. Trailing slashes are stripped.
            timeout: Timeout passed to every ``httpx`` request.
        """
        self.api_key = api_key
        self.base_url = (base_url or self.DEFAULT_BASE_URL).rstrip("/")
        self.timeout = timeout

    def _headers(self) -> dict[str, str]:
        """Return the auth and content-type headers sent with each request."""
        return {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }

    def _url(self, path: str) -> str:
        """Join *path* onto the base URL, keeping any path prefix of the base."""
        return urljoin(self.base_url + "/", path.lstrip("/"))

    @staticmethod
    def _item_path(collection: str, item_id: str) -> str:
        """Return ``collection/<item_id>`` with the id percent-encoded.

        Encoding keeps characters such as ``/``, ``?`` or ``#`` inside an id
        from changing which endpoint the request reaches.
        """
        from urllib.parse import quote

        return f"{collection}/{quote(str(item_id), safe='')}"

    def _get(self, path: str, params: dict | None = None) -> Any:
        """GET *path* and return the decoded JSON body.

        Raises whatever ``httpx`` raises for transport errors, and for error
        statuses via ``raise_for_status()``.
        """
        import httpx

        r = httpx.get(
            self._url(path),
            headers=self._headers(),
            params=params,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def _post(self, path: str, data: dict) -> Any:
        """POST *data* as JSON to *path* and return the decoded JSON body.

        Raises whatever ``httpx`` raises for transport errors, and for error
        statuses via ``raise_for_status()``.
        """
        import httpx

        r = httpx.post(
            self._url(path),
            headers=self._headers(),
            json=data,
            timeout=self.timeout,
        )
        r.raise_for_status()
        return r.json()

    def push_trace(self, trace: Trace) -> dict:
        """Upload *trace* (serialized with ``model_dump(mode="json")``)."""
        return self._post("/v1/traces", trace.model_dump(mode="json"))

    def get_trace(self, trace_id: str) -> dict:
        """Fetch a single trace by id."""
        return self._get(self._item_path("/v1/traces", trace_id))

    def list_traces(self, project: str, limit: int = 50) -> list[dict]:
        """List traces for *project*, requesting at most *limit* entries."""
        return self._get("/v1/traces", {"project": project, "limit": limit})

    def push_eval(self, summary: EvalSummary) -> dict:
        """Upload *summary* (serialized with ``model_dump(mode="json")``)."""
        return self._post("/v1/evals", summary.model_dump(mode="json"))

    def list_runs(self, project: str, limit: int = 10) -> list[dict]:
        """List eval runs for *project*, requesting at most *limit* entries."""
        return self._get("/v1/evals", {"project": project, "limit": limit})

    def run_eval(self, dataset: str, project: str) -> dict:
        """Trigger a server-side eval run (async via Celery)."""
        return self._post("/v1/evals/run", {"dataset": dataset, "project": project})

    def get_eval(self, run_id: str) -> dict:
        """Fetch a single eval run by id."""
        return self._get(self._item_path("/v1/evals", run_id))

    def diff(self, run_id_a: str, run_id_b: str) -> dict:
        """Fetch the diff between two eval runs, sent as query params ``a`` and ``b``."""
        return self._get("/v1/evals/diff", {"a": run_id_a, "b": run_id_b})
@@ -0,0 +1,58 @@
1
+ """CortexOps — Reliability infrastructure for AI agents.
2
+
3
+ Quickstart:
4
+ from cortexops import CortexTracer, EvalSuite
5
+
6
+ tracer = CortexTracer(project="my-agent")
7
+ graph = tracer.wrap(your_langgraph_app)
8
+
9
+ results = EvalSuite.run(dataset="golden_v1.yaml", agent=graph)
10
+ print(results.summary())
11
+ """
12
+
13
+ from .client import CortexClient
14
+ from .eval import EvalSuite, EvalThresholdError
15
+ from .judge import LLMJudgeMetric
16
+ from .metrics import (
17
+ HallucinationMetric,
18
+ LatencyMetric,
19
+ Metric,
20
+ TaskCompletionMetric,
21
+ ToolAccuracyMetric,
22
+ )
23
+ from .models import (
24
+ CaseResult,
25
+ EvalCase,
26
+ EvalDataset,
27
+ EvalSummary,
28
+ FailureKind,
29
+ RunStatus,
30
+ Trace,
31
+ TraceNode,
32
+ ToolCall,
33
+ )
34
+ from .tracer import CortexTracer
35
+
36
# SDK version string.
__version__ = "0.1.0"

# Public names exported by ``from cortexops import *``. Every entry is
# imported at the top of this module.
__all__ = [
    # Entry points: tracer (.tracer), eval suite (.eval), HTTP client (.client)
    "CortexTracer",
    "EvalSuite",
    "EvalThresholdError",
    "CortexClient",
    # Metrics (.metrics, plus the LLM judge from .judge)
    "Metric",
    "TaskCompletionMetric",
    "ToolAccuracyMetric",
    "LatencyMetric",
    "HallucinationMetric",
    "LLMJudgeMetric",
    # Data models (.models)
    "Trace",
    "TraceNode",
    "ToolCall",
    "EvalCase",
    "EvalDataset",
    "EvalSummary",
    "CaseResult",
    "FailureKind",
    "RunStatus",
]