PyPI - cortexops - Versions diffs - 0.1.0__py3-none-any.whl - Mend

cortexops 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

cortexops/LICENSE +21 -0
cortexops/README.md +106 -0
cortexops/__init__.py +58 -0
cortexops/cli.py +195 -0
cortexops/client.py +84 -0
cortexops/cortexops/__init__.py +58 -0
cortexops/cortexops/cli.py +195 -0
cortexops/cortexops/client.py +84 -0
cortexops/cortexops/eval.py +216 -0
cortexops/cortexops/judge.py +155 -0
cortexops/cortexops/metrics.py +184 -0
cortexops/cortexops/models.py +141 -0
cortexops/cortexops/tracer.py +210 -0
cortexops/eval.py +216 -0
cortexops/judge.py +155 -0
cortexops/metrics.py +184 -0
cortexops/models.py +141 -0
cortexops/pyproject.toml +87 -0
cortexops/tests/__init__.py +0 -0
cortexops/tests/test_cortexops.py +211 -0
cortexops/tests/test_enhancements.py +222 -0
cortexops/tracer.py +210 -0
cortexops-0.1.0.dist-info/METADATA +169 -0
cortexops-0.1.0.dist-info/RECORD +27 -0
cortexops-0.1.0.dist-info/WHEEL +4 -0
cortexops-0.1.0.dist-info/entry_points.txt +2 -0
cortexops-0.1.0.dist-info/licenses/LICENSE +21 -0

cortexops/pyproject.toml ADDED Viewed

@@ -0,0 +1,87 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "cortexops"
+version = "0.1.0"
+description = "Reliability infrastructure for AI agents — evaluation, observability, and regression testing"
+readme = "README.md"
+license = { file = "LICENSE" }
+authors = [
+    { name = "Ashish", email = "ashishodu2023@gmail.com" },
+]
+keywords = [
+    "llm", "agents", "evaluation", "observability",
+    "langgraph", "crewai", "autogen", "ai", "testing",
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Software Development :: Testing",
+    "Topic :: Software Development :: Quality Assurance",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Typing :: Typed",
+]
+requires-python = ">=3.10"
+dependencies = [
+    "pydantic>=2.0",
+    "pyyaml>=6.0",
+]
+[project.optional-dependencies]
+# The http, llm, and all extras currently list the same single requirement (httpx).
+http = ["httpx>=0.27"]
+llm  = ["httpx>=0.27"]
+all  = ["httpx>=0.27"]
+# Extra installed for development: pytest + pytest-asyncio, httpx, ruff, and mypy.
+dev  = [
+    "pytest>=8.0",
+    "pytest-asyncio>=0.23",
+    "httpx>=0.27",
+    "ruff>=0.4",
+    "mypy>=1.10",
+]
+[project.urls]
+Homepage      = "https://cortexops.ai"
+Repository    = "https://github.com/ashishodu2023/cortexops"
+Documentation = "https://docs.cortexops.ai"
+"Bug Tracker" = "https://github.com/ashishodu2023/cortexops/issues"
+Changelog     = "https://github.com/ashishodu2023/cortexops/releases"
+[project.scripts]
+cortexops = "cortexops.cli:main"
+[tool.hatch.build.targets.wheel]
+packages = ["cortexops"]
+[tool.hatch.build.targets.sdist]
+include = [
+    "cortexops/",
+    "tests/",
+    "README.md",
+    "LICENSE",
+    "pyproject.toml",
+]
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+[tool.ruff.lint]
+select = ["E", "F", "I", "UP", "B"]
+ignore = ["B008"]
+[tool.mypy]
+python_version = "3.10"
+strict = true
+ignore_missing_imports = true
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]

cortexops/tests/__init__.py ADDED Viewed

File without changes

cortexops/tests/test_cortexops.py ADDED Viewed

@@ -0,0 +1,211 @@
+"""Tests for CortexOps SDK — tracer, eval, and metrics."""
+import pytest
+from cortexops import (
+    CortexTracer,
+    EvalSuite,
+    EvalThresholdError,
+    FailureKind,
+    RunStatus,
+)
+from cortexops.models import EvalCase, EvalDataset, Trace, TraceNode, ToolCall, ToolCallStatus
+from cortexops.metrics import TaskCompletionMetric, ToolAccuracyMetric, LatencyMetric
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def make_trace(output: dict, tool_calls: list[str] | None = None, latency_ms: float = 100.0) -> Trace:
+    tcs = [ToolCall(name=n, status=ToolCallStatus.SUCCESS) for n in (tool_calls or [])]
+    node = TraceNode(node_id="n1", node_name="agent", output=output, tool_calls=tcs, latency_ms=latency_ms)
+    return Trace(project="test", total_latency_ms=latency_ms, output=output, nodes=[node], status=RunStatus.COMPLETED)
+def echo_agent(input: dict) -> dict:
+    return {"output": f"Processed: {input.get('input', '')}"}
+def failing_agent(input: dict) -> dict:
+    raise RuntimeError("agent exploded")
+# ---------------------------------------------------------------------------
+# CortexTracer
+# ---------------------------------------------------------------------------
+class TestCortexTracer:
+    def test_wraps_callable_and_records_trace(self):
+        tracer = CortexTracer(project="test")
+        wrapped = tracer.wrap(echo_agent)
+        result = wrapped({"input": "hello"})
+        assert "Processed" in str(result)
+        trace = tracer.last_trace()
+        assert trace is not None
+        assert trace.project == "test"
+        assert trace.status == RunStatus.COMPLETED
+    def test_records_failure_on_exception(self):
+        tracer = CortexTracer(project="test")
+        wrapped = tracer.wrap(failing_agent)
+        with pytest.raises(RuntimeError):
+            wrapped({"input": "boom"})
+        trace = tracer.last_trace()
+        assert trace.status == RunStatus.FAILED
+        assert trace.failure_kind == FailureKind.UNKNOWN
+    def test_latency_is_captured(self):
+        tracer = CortexTracer(project="test")
+        wrapped = tracer.wrap(echo_agent)
+        wrapped({"input": "timing"})
+        trace = tracer.last_trace()
+        assert trace.total_latency_ms >= 0
+    def test_clear_resets_traces(self):
+        tracer = CortexTracer(project="test")
+        wrapped = tracer.wrap(echo_agent)
+        wrapped({"input": "a"})
+        wrapped({"input": "b"})
+        assert len(tracer.traces()) == 2
+        tracer.clear()
+        assert len(tracer.traces()) == 0
+        assert tracer.last_trace() is None
+# ---------------------------------------------------------------------------
+# Metrics
+# ---------------------------------------------------------------------------
+class TestTaskCompletionMetric:
+    metric = TaskCompletionMetric()
+    def test_passes_with_output(self):
+        case = EvalCase(id="c1", input="test")
+        trace = make_trace({"output": "the refund was approved"})
+        score, fk, _ = self.metric.score(case, trace)
+        assert score == 100.0
+        assert fk is None
+    def test_fails_empty_output(self):
+        case = EvalCase(id="c1", input="test")
+        trace = make_trace({"output": ""})
+        score, fk, _ = self.metric.score(case, trace)
+        assert score == 0.0
+    def test_partial_score_missing_keywords(self):
+        case = EvalCase(id="c1", input="test", expected_output_contains=["approved", "REF-8821"])
+        trace = make_trace({"output": "The refund was approved"})
+        score, fk, _ = self.metric.score(case, trace)
+        assert 50.0 <= score < 100.0
+        assert fk == FailureKind.OUTPUT_FORMAT
+    def test_full_score_all_keywords_present(self):
+        case = EvalCase(id="c1", input="test", expected_output_contains=["approved", "REF-8821"])
+        trace = make_trace({"output": "Refund REF-8821 was approved successfully"})
+        score, fk, _ = self.metric.score(case, trace)
+        assert score == 100.0
+class TestToolAccuracyMetric:
+    metric = ToolAccuracyMetric()
+    def test_no_expected_tools_is_perfect(self):
+        case = EvalCase(id="c1", input="test")
+        trace = make_trace({})
+        score, _, _ = self.metric.score(case, trace)
+        assert score == 100.0
+    def test_all_tools_called(self):
+        case = EvalCase(id="c1", input="test", expected_tool_calls=["lookup_refund", "send_email"])
+        trace = make_trace({}, tool_calls=["lookup_refund", "send_email"])
+        score, _, _ = self.metric.score(case, trace)
+        assert score == 100.0
+    def test_missing_tool_reduces_score(self):
+        case = EvalCase(id="c1", input="test", expected_tool_calls=["lookup_refund", "send_email"])
+        trace = make_trace({}, tool_calls=["lookup_refund"])
+        score, fk, fd = self.metric.score(case, trace)
+        assert score == 50.0
+        assert fk == FailureKind.TOOL_CALL_MISMATCH
+        assert "send_email" in fd
+class TestLatencyMetric:
+    metric = LatencyMetric()
+    def test_within_budget(self):
+        case = EvalCase(id="c1", input="test", max_latency_ms=2000)
+        trace = make_trace({}, latency_ms=800)
+        score, _, _ = self.metric.score(case, trace)
+        assert score == 100.0
+    def test_over_budget(self):
+        case = EvalCase(id="c1", input="test", max_latency_ms=1000)
+        trace = make_trace({}, latency_ms=2000)
+        score, fk, _ = self.metric.score(case, trace)
+        assert score < 100.0
+        assert fk == FailureKind.TIMEOUT
+    def test_no_budget_always_passes(self):
+        case = EvalCase(id="c1", input="test")
+        trace = make_trace({}, latency_ms=99999)
+        score, _, _ = self.metric.score(case, trace)
+        assert score == 100.0
+# ---------------------------------------------------------------------------
+# EvalSuite
+# ---------------------------------------------------------------------------
+class TestEvalSuite:
+    def _make_dataset(self) -> EvalDataset:
+        return EvalDataset(
+            version=1,
+            project="test-agent",
+            cases=[
+                EvalCase(id="case_01", input="What is 2+2?", expected_output_contains=["4"]),
+                EvalCase(id="case_02", input="Say hello", expected_output_contains=["hello"]),
+            ],
+        )
+    def test_run_passes_with_matching_agent(self):
+        def smart_agent(inp: dict) -> dict:
+            q = inp.get("input", "")
+            if "2+2" in q:
+                return {"output": "The answer is 4"}
+            return {"output": "hello there"}
+        ds = self._make_dataset()
+        summary = EvalSuite.run(dataset=ds, agent=smart_agent, verbose=False)
+        assert summary.total_cases == 2
+        assert summary.passed == 2
+        assert summary.task_completion_rate == 1.0
+    def test_run_detects_failures(self):
+        def dumb_agent(inp: dict) -> dict:
+            return {"output": "I don't know"}
+        ds = self._make_dataset()
+        summary = EvalSuite.run(dataset=ds, agent=dumb_agent, verbose=False)
+        # task_completion should be 0 — agent never produced expected keywords
+        assert summary.task_completion_rate == 0.0
+    def test_fail_on_threshold_raises(self):
+        def bad_agent(inp: dict) -> dict:
+            return {"output": "nothing useful"}
+        ds = self._make_dataset()
+        with pytest.raises(EvalThresholdError):
+            # task_completion will be 0.0 < 0.5 → CI gate fires
+            EvalSuite.run(dataset=ds, agent=bad_agent, verbose=False, fail_on="task_completion < 0.5")
+    def test_summary_string_renders(self):
+        def agent(inp: dict) -> dict:
+            return {"output": inp.get("input", "")}
+        ds = self._make_dataset()
+        summary = EvalSuite.run(dataset=ds, agent=agent, verbose=False)
+        text = summary.summary()
+        assert "test-agent" in text
+        assert "Task completion" in text

cortexops/tests/test_enhancements.py ADDED Viewed

@@ -0,0 +1,222 @@
+"""Tests for CortexOps enhancements — LLM judge, CLI, alerting."""
+import sys
+import os
+from pathlib import Path
+import pytest
+sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+sys.path.insert(0, str(Path(__file__).resolve().parents[3] / "backend"))
+os.environ.setdefault("DATABASE_URL", "sqlite+aiosqlite:///./test.db")
+from cortexops.judge import LLMJudgeMetric
+from cortexops.models import EvalCase, RunStatus, Trace, TraceNode
+def make_trace(output: str, latency_ms: float = 100.0) -> Trace:
+    node = TraceNode(node_id="n1", node_name="agent", output={"output": output}, latency_ms=latency_ms)
+    return Trace(
+        project="test",
+        total_latency_ms=latency_ms,
+        output={"output": output},
+        nodes=[node],
+        status=RunStatus.COMPLETED,
+    )
+# ---------------------------------------------------------------------------
+# LLM judge metric
+# ---------------------------------------------------------------------------
+class TestLLMJudgeMetric:
+    def test_skips_when_judge_is_rule(self):
+        metric = LLMJudgeMetric()
+        case = EvalCase(id="c1", input="test", judge="rule", judge_criteria="must be helpful")
+        trace = make_trace("here is a helpful response")
+        score, fk, _ = metric.score(case, trace)
+        assert score == 100.0
+        assert fk is None
+    def test_skips_when_no_criteria(self):
+        metric = LLMJudgeMetric()
+        case = EvalCase(id="c1", input="test", judge="llm")
+        trace = make_trace("some output")
+        score, fk, _ = metric.score(case, trace)
+        assert score == 100.0
+    def test_heuristic_fallback_high_match(self):
+        metric = LLMJudgeMetric(api_key="placeholder")
+        case = EvalCase(
+            id="c1",
+            input="Explain refund policy",
+            judge="llm",
+            judge_criteria="response should mention refund policy clearly and offer assistance",
+        )
+        trace = make_trace("Our refund policy allows returns within 30 days. I am happy to assist you.")
+        score, fk, fd = metric.score(case, trace)
+        assert score > 50.0
+        assert fd is not None
+    def test_heuristic_fallback_low_match(self):
+        metric = LLMJudgeMetric(api_key="placeholder")
+        case = EvalCase(
+            id="c1",
+            input="Explain refund policy",
+            judge="llm",
+            judge_criteria="response should mention refund policy clearly and offer assistance",
+        )
+        trace = make_trace("I cannot help with that request.")
+        score, fk, _ = metric.score(case, trace)
+        assert score < 100.0
+# ---------------------------------------------------------------------------
+# Alerting
+# ---------------------------------------------------------------------------
+class TestAlertPayload:
+    def _get_classes(self):
+        from app.services.alerting import AlertPayload, SlackAlerter
+        return AlertPayload, SlackAlerter
+    def test_should_alert_on_failures(self):
+        AlertPayload, SlackAlerter = self._get_classes()
+        payload = AlertPayload(
+            project="test", run_id="abc",
+            task_completion_rate=0.8, tool_accuracy=90.0,
+            passed=8, failed=2, total_cases=10, regressions=0,
+            failed_cases=[{"case_id": "c1", "failure_kind": "tool_call_mismatch", "score": 40}],
+        )
+        alerter = SlackAlerter(webhook_url=None, threshold=0.90)
+        assert alerter.should_alert(payload) is True
+    def test_no_alert_when_passing(self):
+        AlertPayload, SlackAlerter = self._get_classes()
+        payload = AlertPayload(
+            project="test", run_id="abc",
+            task_completion_rate=0.95, tool_accuracy=98.0,
+            passed=10, failed=0, total_cases=10, regressions=0, failed_cases=[],
+        )
+        alerter = SlackAlerter(webhook_url=None, threshold=0.90)
+        assert alerter.should_alert(payload) is False
+    def test_alert_on_regression(self):
+        AlertPayload, SlackAlerter = self._get_classes()
+        payload = AlertPayload(
+            project="test", run_id="abc",
+            task_completion_rate=0.95, tool_accuracy=98.0,
+            passed=10, failed=0, total_cases=10, regressions=2, failed_cases=[],
+        )
+        alerter = SlackAlerter(webhook_url=None, threshold=0.90)
+        assert alerter.should_alert(payload) is True
+# ---------------------------------------------------------------------------
+# Prompt diff logic
+# ---------------------------------------------------------------------------
+class TestPromptDiff:
+    def test_unified_diff_detects_changes(self):
+        import difflib
+        v1 = "You are a helpful assistant.\nAlways respond in English."
+        v2 = "You are a helpful payments assistant.\nAlways respond in English.\nBe concise."
+        diff = list(difflib.unified_diff(
+            v1.splitlines(keepends=True),
+            v2.splitlines(keepends=True),
+            fromfile="v1", tofile="v2", lineterm="",
+        ))
+        additions = sum(1 for l in diff if l.startswith("+") and not l.startswith("+++"))
+        deletions = sum(1 for l in diff if l.startswith("-") and not l.startswith("---"))
+        assert additions >= 1
+        assert deletions >= 1
+    def test_identical_prompts_no_diff(self):
+        import difflib
+        v1 = v2 = "You are a helpful assistant."
+        diff = list(difflib.unified_diff(
+            v1.splitlines(keepends=True),
+            v2.splitlines(keepends=True),
+            fromfile="v1", tofile="v2", lineterm="",
+        ))
+        assert diff == []
+# ---------------------------------------------------------------------------
+# CLI imports
+# ---------------------------------------------------------------------------
+class TestCLIImports:
+    def test_cli_module_imports(self):
+        from cortexops.cli import main, cmd_eval_run, cmd_version
+        assert callable(main)
+        assert callable(cmd_eval_run)
+        assert callable(cmd_version)
+    def test_version_command(self, capsys):
+        import argparse
+        from cortexops.cli import cmd_version
+        cmd_version(argparse.Namespace())
+        captured = capsys.readouterr()
+        assert "cortexops" in captured.out
+        assert "0.1.0" in captured.out
+# ---------------------------------------------------------------------------
+# API key generation
+# ---------------------------------------------------------------------------
+class TestApiKeyGeneration:
+    def test_generate_produces_cxo_prefix(self):
+        from app.auth import generate_api_key
+        raw, hashed = generate_api_key()
+        assert raw.startswith("cxo-")
+        assert len(raw) == 4 + 1 + 64  # "cxo-" + 32 hex bytes = 69 chars
+    def test_hash_is_deterministic(self):
+        from app.auth import hash_key
+        assert hash_key("test-key") == hash_key("test-key")
+        assert hash_key("key-a") != hash_key("key-b")
+    def test_generated_keys_unique(self):
+        from app.auth import generate_api_key
+        keys = {generate_api_key()[0] for _ in range(20)}
+        assert len(keys) == 20
+# ---------------------------------------------------------------------------
+# Auth key generation — pure logic, no FastAPI dependency
+# ---------------------------------------------------------------------------
+class TestApiKeyPureFunctions:
+    """Tests the pure key generation logic, independent of FastAPI."""
+    def _gen(self):
+        import secrets, hashlib
+        raw = f"cxo-{secrets.token_hex(32)}"
+        hashed = hashlib.sha256(raw.encode()).hexdigest()
+        return raw, hashed
+    def _hash(self, raw: str) -> str:
+        import hashlib
+        return hashlib.sha256(raw.encode()).hexdigest()
+    def test_key_has_cxo_prefix(self):
+        raw, _ = self._gen()
+        assert raw.startswith("cxo-")
+    def test_hash_is_deterministic(self):
+        assert self._hash("test-key") == self._hash("test-key")
+        assert self._hash("key-a") != self._hash("key-b")
+    def test_generated_keys_are_unique(self):
+        keys = {self._gen()[0] for _ in range(20)}
+        assert len(keys) == 20
+    def test_raw_key_length(self):
+        raw, _ = self._gen()
+        assert len(raw) == 68  # "cxo-" (4) + "-" (0 included in prefix) + 64 hex chars