cortexops 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,87 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "cortexops"
7
+ version = "0.1.0"
8
+ description = "Reliability infrastructure for AI agents — evaluation, observability, and regression testing"
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ authors = [
12
+ { name = "Ashish", email = "ashishodu2023@gmail.com" },
13
+ ]
14
+ keywords = [
15
+ "llm", "agents", "evaluation", "observability",
16
+ "langgraph", "crewai", "autogen", "ai", "testing",
17
+ ]
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Intended Audience :: Developers",
21
+ "License :: OSI Approved :: MIT License",
22
+ "Operating System :: OS Independent",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Topic :: Software Development :: Testing",
28
+ "Topic :: Software Development :: Quality Assurance",
29
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
30
+ "Typing :: Typed",
31
+ ]
32
+ requires-python = ">=3.10"
33
+ dependencies = [
34
+ "pydantic>=2.0",
35
+ "pyyaml>=6.0",
36
+ ]
37
+
38
+ [project.optional-dependencies]
39
+ http = ["httpx>=0.27"]
40
+ llm = ["httpx>=0.27"]
41
+ all = ["httpx>=0.27"]
42
+ dev = [
43
+ "pytest>=8.0",
44
+ "pytest-asyncio>=0.23",
45
+ "httpx>=0.27",
46
+ "ruff>=0.4",
47
+ "mypy>=1.10",
48
+ ]
49
+
50
+ [project.urls]
51
+ Homepage = "https://cortexops.ai"
52
+ Repository = "https://github.com/ashishodu2023/cortexops"
53
+ Documentation = "https://docs.cortexops.ai"
54
+ "Bug Tracker" = "https://github.com/ashishodu2023/cortexops/issues"
55
+ Changelog = "https://github.com/ashishodu2023/cortexops/releases"
56
+
57
+ [project.scripts]
58
+ cortexops = "cortexops.cli:main"
59
+
60
+ [tool.hatch.build.targets.wheel]
61
+ packages = ["cortexops"]
62
+
63
+ [tool.hatch.build.targets.sdist]
64
+ include = [
65
+ "cortexops/",
66
+ "tests/",
67
+ "README.md",
68
+ "LICENSE",
69
+ "pyproject.toml",
70
+ ]
71
+
72
+ [tool.ruff]
73
+ line-length = 100
74
+ target-version = "py310"
75
+
76
+ [tool.ruff.lint]
77
+ select = ["E", "F", "I", "UP", "B"]
78
+ ignore = ["B008"]
79
+
80
+ [tool.mypy]
81
+ python_version = "3.10"
82
+ strict = true
83
+ ignore_missing_imports = true
84
+
85
+ [tool.pytest.ini_options]
86
+ asyncio_mode = "auto"
87
+ testpaths = ["tests"]
File without changes
@@ -0,0 +1,211 @@
1
+ """Tests for CortexOps SDK — tracer, eval, and metrics."""
2
+
3
+ import pytest
4
+
5
+ from cortexops import (
6
+ CortexTracer,
7
+ EvalSuite,
8
+ EvalThresholdError,
9
+ FailureKind,
10
+ RunStatus,
11
+ )
12
+ from cortexops.models import EvalCase, EvalDataset, Trace, TraceNode, ToolCall, ToolCallStatus
13
+ from cortexops.metrics import TaskCompletionMetric, ToolAccuracyMetric, LatencyMetric
14
+
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Helpers
18
+ # ---------------------------------------------------------------------------
19
+
20
def make_trace(
    output: dict,
    tool_calls: list[str] | None = None,
    latency_ms: float = 100.0,
) -> Trace:
    """Build a completed, single-node ``Trace`` for use in metric tests.

    Args:
        output: Payload stored both on the node and on the trace itself.
        tool_calls: Names of tools to record on the node; each one is recorded
            with ``ToolCallStatus.SUCCESS``. ``None`` means no tool calls.
        latency_ms: Value used for both the node latency and the trace total.

    Returns:
        A ``Trace`` for project ``"test"`` with status ``RunStatus.COMPLETED``.
    """
    recorded_calls = []
    for tool_name in tool_calls or []:
        recorded_calls.append(ToolCall(name=tool_name, status=ToolCallStatus.SUCCESS))

    agent_node = TraceNode(
        node_id="n1",
        node_name="agent",
        output=output,
        tool_calls=recorded_calls,
        latency_ms=latency_ms,
    )
    return Trace(
        project="test",
        total_latency_ms=latency_ms,
        output=output,
        nodes=[agent_node],
        status=RunStatus.COMPLETED,
    )
24
+
25
+
26
def echo_agent(input: dict) -> dict:
    """Return the ``"input"`` field of the request prefixed with ``"Processed: "``.

    A missing ``"input"`` key is treated as an empty string.
    """
    received = input.get("input", "")
    return {"output": f"Processed: {received}"}
28
+
29
+
30
def failing_agent(input: dict) -> dict:
    """Agent stub that never succeeds: always raises ``RuntimeError``.

    Raises:
        RuntimeError: unconditionally, with the message ``"agent exploded"``.
    """
    failure = RuntimeError("agent exploded")
    raise failure
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # CortexTracer
36
+ # ---------------------------------------------------------------------------
37
+
38
class TestCortexTracer:
    """Checks for ``CortexTracer.wrap`` and the traces it records."""

    def test_wraps_callable_and_records_trace(self):
        """A wrapped agent still returns its result and leaves a completed trace."""
        tracer = CortexTracer(project="test")
        traced_agent = tracer.wrap(echo_agent)

        outcome = traced_agent({"input": "hello"})
        assert "Processed" in str(outcome)

        recorded = tracer.last_trace()
        assert recorded is not None
        assert recorded.project == "test"
        assert recorded.status == RunStatus.COMPLETED

    def test_records_failure_on_exception(self):
        """The agent's exception propagates and the trace is marked as failed."""
        tracer = CortexTracer(project="test")
        traced_agent = tracer.wrap(failing_agent)

        with pytest.raises(RuntimeError):
            traced_agent({"input": "boom"})

        recorded = tracer.last_trace()
        assert recorded.status == RunStatus.FAILED
        assert recorded.failure_kind == FailureKind.UNKNOWN

    def test_latency_is_captured(self):
        """The recorded total latency is a non-negative number."""
        tracer = CortexTracer(project="test")
        traced_agent = tracer.wrap(echo_agent)
        traced_agent({"input": "timing"})

        recorded = tracer.last_trace()
        assert recorded.total_latency_ms >= 0

    def test_clear_resets_traces(self):
        """``clear()`` drops every stored trace, including the last one."""
        tracer = CortexTracer(project="test")
        traced_agent = tracer.wrap(echo_agent)
        for payload in ("a", "b"):
            traced_agent({"input": payload})
        assert len(tracer.traces()) == 2

        tracer.clear()
        assert len(tracer.traces()) == 0
        assert tracer.last_trace() is None
74
+
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Metrics
78
+ # ---------------------------------------------------------------------------
79
+
80
class TestTaskCompletionMetric:
    """Checks for ``TaskCompletionMetric.score`` against hand-built traces."""

    # One metric instance is shared by every test in the class.
    metric = TaskCompletionMetric()

    def test_passes_with_output(self):
        """Non-empty output with no expected keywords scores 100 with no failure kind."""
        eval_case = EvalCase(id="c1", input="test")
        run = make_trace({"output": "the refund was approved"})

        score, failure_kind, _ = self.metric.score(eval_case, run)

        assert score == 100.0
        assert failure_kind is None

    def test_fails_empty_output(self):
        """An empty output string scores 0."""
        eval_case = EvalCase(id="c1", input="test")
        run = make_trace({"output": ""})

        score, failure_kind, _ = self.metric.score(eval_case, run)

        assert score == 0.0

    def test_partial_score_missing_keywords(self):
        """One of two expected keywords present gives a score in [50, 100)."""
        eval_case = EvalCase(
            id="c1",
            input="test",
            expected_output_contains=["approved", "REF-8821"],
        )
        run = make_trace({"output": "The refund was approved"})

        score, failure_kind, _ = self.metric.score(eval_case, run)

        assert 50.0 <= score < 100.0
        assert failure_kind == FailureKind.OUTPUT_FORMAT

    def test_full_score_all_keywords_present(self):
        """Every expected keyword present in the output scores 100."""
        eval_case = EvalCase(
            id="c1",
            input="test",
            expected_output_contains=["approved", "REF-8821"],
        )
        run = make_trace({"output": "Refund REF-8821 was approved successfully"})

        score, failure_kind, _ = self.metric.score(eval_case, run)

        assert score == 100.0
108
+
109
+
110
class TestToolAccuracyMetric:
    """Checks for ``ToolAccuracyMetric.score`` with varying recorded tool calls."""

    # One metric instance is shared by every test in the class.
    metric = ToolAccuracyMetric()

    def test_no_expected_tools_is_perfect(self):
        """A case that expects no tool calls scores 100."""
        eval_case = EvalCase(id="c1", input="test")
        run = make_trace({})

        score, _, _ = self.metric.score(eval_case, run)

        assert score == 100.0

    def test_all_tools_called(self):
        """Calling every expected tool scores 100."""
        expected_tools = ["lookup_refund", "send_email"]
        eval_case = EvalCase(id="c1", input="test", expected_tool_calls=expected_tools)
        run = make_trace({}, tool_calls=["lookup_refund", "send_email"])

        score, _, _ = self.metric.score(eval_case, run)

        assert score == 100.0

    def test_missing_tool_reduces_score(self):
        """One of two expected tools missing halves the score and names the missing tool."""
        expected_tools = ["lookup_refund", "send_email"]
        eval_case = EvalCase(id="c1", input="test", expected_tool_calls=expected_tools)
        run = make_trace({}, tool_calls=["lookup_refund"])

        score, failure_kind, failure_detail = self.metric.score(eval_case, run)

        assert score == 50.0
        assert failure_kind == FailureKind.TOOL_CALL_MISMATCH
        assert "send_email" in failure_detail
132
+
133
+
134
class TestLatencyMetric:
    """Checks for ``LatencyMetric.score`` against a case's ``max_latency_ms`` budget."""

    # One metric instance is shared by every test in the class.
    metric = LatencyMetric()

    def test_within_budget(self):
        """A trace faster than the budget scores 100."""
        eval_case = EvalCase(id="c1", input="test", max_latency_ms=2000)
        run = make_trace({}, latency_ms=800)

        score, _, _ = self.metric.score(eval_case, run)

        assert score == 100.0

    def test_over_budget(self):
        """A trace slower than the budget loses points and is flagged as a timeout."""
        eval_case = EvalCase(id="c1", input="test", max_latency_ms=1000)
        run = make_trace({}, latency_ms=2000)

        score, failure_kind, _ = self.metric.score(eval_case, run)

        assert score < 100.0
        assert failure_kind == FailureKind.TIMEOUT

    def test_no_budget_always_passes(self):
        """Without ``max_latency_ms`` on the case, even a very slow trace scores 100."""
        eval_case = EvalCase(id="c1", input="test")
        run = make_trace({}, latency_ms=99999)

        score, _, _ = self.metric.score(eval_case, run)

        assert score == 100.0
155
+
156
+
157
+ # ---------------------------------------------------------------------------
158
+ # EvalSuite
159
+ # ---------------------------------------------------------------------------
160
+
161
class TestEvalSuite:
    """End-to-end checks for ``EvalSuite.run`` over a two-case dataset."""

    def _make_dataset(self) -> EvalDataset:
        """Return the shared two-case dataset for project ``"test-agent"``."""
        arithmetic_case = EvalCase(
            id="case_01",
            input="What is 2+2?",
            expected_output_contains=["4"],
        )
        greeting_case = EvalCase(
            id="case_02",
            input="Say hello",
            expected_output_contains=["hello"],
        )
        return EvalDataset(
            version=1,
            project="test-agent",
            cases=[arithmetic_case, greeting_case],
        )

    def test_run_passes_with_matching_agent(self):
        """An agent that answers both cases correctly passes everything."""

        def smart_agent(inp: dict) -> dict:
            question = inp.get("input", "")
            answer = "The answer is 4" if "2+2" in question else "hello there"
            return {"output": answer}

        report = EvalSuite.run(dataset=self._make_dataset(), agent=smart_agent, verbose=False)

        assert report.total_cases == 2
        assert report.passed == 2
        assert report.task_completion_rate == 1.0

    def test_run_detects_failures(self):
        """An agent whose output never contains the expected keywords completes nothing."""

        def dumb_agent(inp: dict) -> dict:
            return {"output": "I don't know"}

        report = EvalSuite.run(dataset=self._make_dataset(), agent=dumb_agent, verbose=False)

        # task_completion should be 0 — agent never produced expected keywords
        assert report.task_completion_rate == 0.0

    def test_fail_on_threshold_raises(self):
        """A ``fail_on`` expression that matches the result raises ``EvalThresholdError``."""

        def bad_agent(inp: dict) -> dict:
            return {"output": "nothing useful"}

        dataset = self._make_dataset()
        with pytest.raises(EvalThresholdError):
            # task_completion will be 0.0 < 0.5 → CI gate fires
            EvalSuite.run(
                dataset=dataset,
                agent=bad_agent,
                verbose=False,
                fail_on="task_completion < 0.5",
            )

    def test_summary_string_renders(self):
        """The rendered summary mentions the project name and the task-completion line."""

        def agent(inp: dict) -> dict:
            return {"output": inp.get("input", "")}

        report = EvalSuite.run(dataset=self._make_dataset(), agent=agent, verbose=False)
        rendered = report.summary()

        assert "test-agent" in rendered
        assert "Task completion" in rendered
@@ -0,0 +1,222 @@
1
+ """Tests for CortexOps enhancements — LLM judge, CLI, alerting."""
2
+
3
+ import sys
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
10
+ sys.path.insert(0, str(Path(__file__).resolve().parents[3] / "backend"))
11
+ os.environ.setdefault("DATABASE_URL", "sqlite+aiosqlite:///./test.db")
12
+
13
+ from cortexops.judge import LLMJudgeMetric
14
+ from cortexops.models import EvalCase, RunStatus, Trace, TraceNode
15
+
16
+
17
def make_trace(output: str, latency_ms: float = 100.0) -> Trace:
    """Build a completed, single-node ``Trace`` whose payload is ``{"output": output}``.

    Args:
        output: Text stored under the ``"output"`` key on both the node and the trace.
        latency_ms: Value used for both the node latency and the trace total.

    Returns:
        A ``Trace`` for project ``"test"`` with status ``RunStatus.COMPLETED``.
    """
    agent_node = TraceNode(
        node_id="n1",
        node_name="agent",
        output={"output": output},
        latency_ms=latency_ms,
    )
    trace_fields = {
        "project": "test",
        "total_latency_ms": latency_ms,
        "output": {"output": output},
        "nodes": [agent_node],
        "status": RunStatus.COMPLETED,
    }
    return Trace(**trace_fields)
26
+
27
+
28
+ # ---------------------------------------------------------------------------
29
+ # LLM judge metric
30
+ # ---------------------------------------------------------------------------
31
+
32
class TestLLMJudgeMetric:
    """Checks for ``LLMJudgeMetric.score`` across judge modes and criteria."""

    def test_skips_when_judge_is_rule(self):
        """With ``judge="rule"`` the metric scores 100 and reports no failure kind."""
        judge = LLMJudgeMetric()
        eval_case = EvalCase(
            id="c1",
            input="test",
            judge="rule",
            judge_criteria="must be helpful",
        )
        run = make_trace("here is a helpful response")

        score, failure_kind, _ = judge.score(eval_case, run)

        assert score == 100.0
        assert failure_kind is None

    def test_skips_when_no_criteria(self):
        """With ``judge="llm"`` but no criteria the metric scores 100."""
        judge = LLMJudgeMetric()
        eval_case = EvalCase(id="c1", input="test", judge="llm")
        run = make_trace("some output")

        score, failure_kind, _ = judge.score(eval_case, run)

        assert score == 100.0

    def test_heuristic_fallback_high_match(self):
        """Output that echoes the criteria wording scores above 50 and carries a detail."""
        # The test name indicates the placeholder key is meant to exercise the
        # metric's heuristic fallback; that path itself is not visible here.
        judge = LLMJudgeMetric(api_key="placeholder")
        eval_case = EvalCase(
            id="c1",
            input="Explain refund policy",
            judge="llm",
            judge_criteria="response should mention refund policy clearly and offer assistance",
        )
        run = make_trace(
            "Our refund policy allows returns within 30 days. I am happy to assist you."
        )

        score, failure_kind, failure_detail = judge.score(eval_case, run)

        assert score > 50.0
        assert failure_detail is not None

    def test_heuristic_fallback_low_match(self):
        """Output unrelated to the criteria must not receive a perfect score."""
        judge = LLMJudgeMetric(api_key="placeholder")
        eval_case = EvalCase(
            id="c1",
            input="Explain refund policy",
            judge="llm",
            judge_criteria="response should mention refund policy clearly and offer assistance",
        )
        run = make_trace("I cannot help with that request.")

        score, failure_kind, _ = judge.score(eval_case, run)

        assert score < 100.0
72
+
73
+
74
+ # ---------------------------------------------------------------------------
75
+ # Alerting
76
+ # ---------------------------------------------------------------------------
77
+
78
class TestAlertPayload:
    """Checks for ``SlackAlerter.should_alert`` given different ``AlertPayload`` values."""

    def _get_classes(self):
        """Import the alerting classes lazily and return ``(AlertPayload, SlackAlerter)``."""
        from app.services.alerting import AlertPayload, SlackAlerter

        classes = (AlertPayload, SlackAlerter)
        return classes

    def test_should_alert_on_failures(self):
        """A completion rate of 0.8 under a 0.90 threshold, with failures, triggers an alert."""
        AlertPayload, SlackAlerter = self._get_classes()
        failed_case = {"case_id": "c1", "failure_kind": "tool_call_mismatch", "score": 40}
        payload = AlertPayload(
            project="test",
            run_id="abc",
            task_completion_rate=0.8,
            tool_accuracy=90.0,
            passed=8,
            failed=2,
            total_cases=10,
            regressions=0,
            failed_cases=[failed_case],
        )
        alerter = SlackAlerter(webhook_url=None, threshold=0.90)

        decision = alerter.should_alert(payload)

        assert decision is True

    def test_no_alert_when_passing(self):
        """A clean run above the threshold with no regressions does not alert."""
        AlertPayload, SlackAlerter = self._get_classes()
        payload = AlertPayload(
            project="test",
            run_id="abc",
            task_completion_rate=0.95,
            tool_accuracy=98.0,
            passed=10,
            failed=0,
            total_cases=10,
            regressions=0,
            failed_cases=[],
        )
        alerter = SlackAlerter(webhook_url=None, threshold=0.90)

        decision = alerter.should_alert(payload)

        assert decision is False

    def test_alert_on_regression(self):
        """Regressions trigger an alert even when the completion rate is above threshold."""
        AlertPayload, SlackAlerter = self._get_classes()
        payload = AlertPayload(
            project="test",
            run_id="abc",
            task_completion_rate=0.95,
            tool_accuracy=98.0,
            passed=10,
            failed=0,
            total_cases=10,
            regressions=2,
            failed_cases=[],
        )
        alerter = SlackAlerter(webhook_url=None, threshold=0.90)

        decision = alerter.should_alert(payload)

        assert decision is True
113
+
114
+
115
+ # ---------------------------------------------------------------------------
116
+ # Prompt diff logic
117
+ # ---------------------------------------------------------------------------
118
+
119
class TestPromptDiff:
    """Checks that ``difflib.unified_diff`` reports prompt changes as expected."""

    @staticmethod
    def _unified_diff_lines(old: str, new: str) -> list[str]:
        """Return the unified diff between two prompt versions as a list of lines."""
        import difflib

        return list(
            difflib.unified_diff(
                old.splitlines(keepends=True),
                new.splitlines(keepends=True),
                fromfile="v1",
                tofile="v2",
                lineterm="",
            )
        )

    def test_unified_diff_detects_changes(self):
        """An edited and extended prompt yields at least one added and one removed line."""
        v1 = "You are a helpful assistant.\nAlways respond in English."
        v2 = "You are a helpful payments assistant.\nAlways respond in English.\nBe concise."

        diff = self._unified_diff_lines(v1, v2)

        # Skip the "+++"/"---" file headers so only content lines are counted.
        additions = sum(
            1 for line in diff if line.startswith("+") and not line.startswith("+++")
        )
        deletions = sum(
            1 for line in diff if line.startswith("-") and not line.startswith("---")
        )

        assert additions >= 1
        assert deletions >= 1

    def test_identical_prompts_no_diff(self):
        """Two identical prompts produce an empty diff (no headers, no hunks)."""
        v1 = v2 = "You are a helpful assistant."

        assert self._unified_diff_lines(v1, v2) == []
147
+
148
+
149
+ # ---------------------------------------------------------------------------
150
+ # CLI imports
151
+ # ---------------------------------------------------------------------------
152
+
153
class TestCLIImports:
    """Smoke checks that the ``cortexops.cli`` entry points import and run."""

    def test_cli_module_imports(self):
        """``main``, ``cmd_eval_run`` and ``cmd_version`` are importable callables."""
        from cortexops.cli import main, cmd_eval_run, cmd_version

        for entry_point in (main, cmd_eval_run, cmd_version):
            assert callable(entry_point)

    def test_version_command(self, capsys):
        """``cmd_version`` prints the package name and the version string to stdout."""
        import argparse
        from cortexops.cli import cmd_version

        empty_args = argparse.Namespace()
        cmd_version(empty_args)

        printed = capsys.readouterr().out
        assert "cortexops" in printed
        assert "0.1.0" in printed
167
+
168
+
169
+ # ---------------------------------------------------------------------------
170
+ # API key generation
171
+ # ---------------------------------------------------------------------------
172
+
173
class TestApiKeyGeneration:
    """Checks for ``app.auth.generate_api_key`` and ``app.auth.hash_key``."""

    def test_generate_produces_cxo_prefix(self):
        """A generated key starts with ``"cxo-"`` and has the expected total length."""
        from app.auth import generate_api_key

        raw, hashed = generate_api_key()
        assert raw.startswith("cxo-")
        # The "cxo-" prefix already contains the dash, so the expected length is
        # 4 + 64 = 68, not 69.
        # NOTE(review): assumes generate_api_key() appends 32 random bytes as 64
        # hex characters, as the original comment described — confirm against app.auth.
        assert len(raw) == len("cxo-") + 64

    def test_hash_is_deterministic(self):
        """Hashing is stable for equal inputs and differs for different inputs."""
        from app.auth import hash_key

        assert hash_key("test-key") == hash_key("test-key")
        assert hash_key("key-a") != hash_key("key-b")

    def test_generated_keys_unique(self):
        """Twenty generated raw keys are all distinct."""
        from app.auth import generate_api_key

        keys = {generate_api_key()[0] for _ in range(20)}
        assert len(keys) == 20
189
+
190
+
191
+ # ---------------------------------------------------------------------------
192
+ # Auth key generation — pure logic, no FastAPI dependency
193
+ # ---------------------------------------------------------------------------
194
+
195
class TestApiKeyPureFunctions:
    """Tests the pure key generation logic, independent of FastAPI."""

    def _hash(self, raw: str) -> str:
        """Return the hex SHA-256 digest of ``raw`` (UTF-8 encoded)."""
        import hashlib

        return hashlib.sha256(raw.encode()).hexdigest()

    def _gen(self):
        """Return ``(raw, hashed)``: a fresh ``cxo-`` key and its SHA-256 hex digest."""
        import secrets

        raw = f"cxo-{secrets.token_hex(32)}"
        # Reuse _hash so generation and hashing cannot drift apart.
        return raw, self._hash(raw)

    def test_key_has_cxo_prefix(self):
        """A generated key starts with ``"cxo-"``."""
        raw, _ = self._gen()
        assert raw.startswith("cxo-")

    def test_hash_is_deterministic(self):
        """Hashing is stable for equal inputs and differs for different inputs."""
        assert self._hash("test-key") == self._hash("test-key")
        assert self._hash("key-a") != self._hash("key-b")

    def test_generated_keys_are_unique(self):
        """Twenty generated raw keys are all distinct."""
        keys = {self._gen()[0] for _ in range(20)}
        assert len(keys) == 20

    def test_raw_key_length(self):
        """A raw key is 68 characters long."""
        raw, _ = self._gen()
        # "cxo-" prefix (4 chars, dash included) + 64 hex chars from token_hex(32)
        assert len(raw) == 68