PyPI - cortexops - Versions diffs - 0.2.0__tar.gz → 0.4.0__tar.gz - Mend

cortexops 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25)

{cortexops-0.2.0 → cortexops-0.4.0}/PKG-INFO +170 -33
cortexops-0.4.0/README.md +243 -0
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/__init__.py +2 -2
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/cli.py +82 -0
cortexops-0.4.0/cortexops/dataset.py +242 -0
cortexops-0.4.0/cortexops/judge.py +393 -0
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/pyproject.toml +13 -9
cortexops-0.4.0/cortexops/tracer.py +696 -0
{cortexops-0.2.0 → cortexops-0.4.0}/pyproject.toml +1 -1
{cortexops-0.2.0 → cortexops-0.4.0}/tests/test_enhancements.py +175 -1
cortexops-0.2.0/cortexops/README.md +0 -106
cortexops-0.2.0/cortexops/judge.py +0 -154
cortexops-0.2.0/cortexops/tracer.py +0 -278
{cortexops-0.2.0 → cortexops-0.4.0}/.gitignore +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/LICENSE +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/LICENSE +0 -0
{cortexops-0.2.0 → cortexops-0.4.0/cortexops}/README.md +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/auth.py +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/client.py +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/eval.py +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/metrics.py +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/models.py +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/tests/__init__.py +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/tests/conftest.py +0 -0
{cortexops-0.2.0 → cortexops-0.4.0}/tests/test_cortexops.py +0 -0

{cortexops-0.2.0 → cortexops-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: cortexops
-Version: 0.2.0
+Version: 0.4.0
 Summary: Reliability infrastructure for AI agents — evaluation, observability, and regression testing
 Project-URL: Homepage, https://getcortexops.com
 Project-URL: Repository, https://github.com/ashishodu2023/cortexops
@@ -68,35 +68,63 @@ Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
 [![PyPI version](https://img.shields.io/pypi/v/cortexops.svg)](https://pypi.org/project/cortexops/)
 [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
 [![CI](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml/badge.svg)](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
-[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://github.com/ashishodu2023/cortexops/blob/main/LICENSE)
+[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
 ---
-## The problem
+## What's New in v0.4.0
-You deployed an agent. You have no idea if it regressed overnight.
+### LLM-as-judge evaluation
+```python
+from cortexops.judge import LLMJudge
+judge = LLMJudge(api_key="sk-...")
+result = judge.evaluate(
+    case_id="case-001",
+    input="Process refund for order #4821",
+    output="Refund of $49.99 approved and processed.",
+    rubric="task_completion",
+)
+print(result.score, result.passed, result.reasoning)
+```
-No standard eval format. No failure traces. No CI gate before the next prompt change ships.
-CortexOps fixes that.
+### Golden dataset API
+```python
+from cortexops.dataset import GoldenDataset
----
+ds = GoldenDataset(name="refund-agent-v1")
+ds.add(input="Refund order #4821", expected="refund_approved")
+ds.add(input="Cancel subscription", expected="subscription_cancelled")
+ds.save("datasets/refund_agent.yaml")
-## Install
+results = ds.run(agent=your_agent, fail_on="task_completion < 0.90")
+```
+### CI/CD eval gate
 ```bash
-pip install cortexops
+cortexops eval run \
+  --dataset datasets/refund_agent.yaml \
+  --judge \
+  --fail-on "task_completion < 0.90"
+# Exit code 1 if regression detected — drop into GitHub Actions
+```
-# With HTTP client (for pushing traces to hosted API):
-pip install cortexops[http]
-# With LLM judge support:
-pip install cortexops[llm]
-```
+## The problem
+You deployed an agent. You have no idea if it regressed overnight.
+No standard eval format. No failure traces. No CI gate before the next prompt change ships.
+CortexOps fixes that.
 ---
 ## Quickstart
+```bash
+pip install cortexops  # v0.4.0
+```
 ```python
 from cortexops import CortexTracer, EvalSuite
@@ -109,14 +137,25 @@ results = EvalSuite.run(
     dataset="golden_v1.yaml",
     agent=graph,
 )
 print(results.summary())
+# CortexOps eval — payments-agent
+#   Cases           : 9  (7 passed, 2 failed)
+#   Task completion : 91.4%
+#   Tool accuracy   : 97.0/100
+#   Latency p50/p95 : 42ms / 187ms
+#   Failed cases:
+#     - escalation_router: tool_call_mismatch (score 41)
 ```
 ---
-## Golden dataset (YAML)
+## Golden dataset format
+Define test cases in YAML. Run them locally or in CI.
 ```yaml
+# golden_v1.yaml
 version: 1
 project: payments-agent
@@ -127,25 +166,90 @@ cases:
     expected_output_contains: ["approved", "REF-8821"]
     max_latency_ms: 3000
-  - id: open_ended_explanation_01
-    input: "Why was my refund rejected?"
-    judge: llm
-    judge_criteria: >
-      The response must explain the rejection reason clearly,
-      be empathetic, and offer a concrete next step. No jargon.
+  - id: dispute_escalation_01
+    input: "I was charged twice — this is unauthorized"
+    expected_tool_calls: [classify_dispute, route_escalation]
+    expected_output_contains: ["escalated"]
+    max_latency_ms: 5000
+```
+---
+## CI eval gate
+Add to `.github/workflows/eval.yml`:
+```yaml
+- name: CortexOps eval gate
+  run: |
+    python examples/langgraph_payments/run_eval.py \
+      --dataset golden_v1.yaml \
+      --fail-on "task_completion < 0.90"
+```
+If the eval drops below threshold, the job exits non-zero and the PR is blocked.
+---
+## Repo structure
+```
+cortexops/
+├── sdk/                        # pip install cortexops  # v0.4.0
+│   ├── cortexops/
+│   │   ├── tracer.py           # CortexTracer — wraps LangGraph / CrewAI
+│   │   ├── eval.py             # EvalSuite — golden dataset runner
+│   │   ├── metrics.py          # task_completion, tool_accuracy, latency, hallucination
+│   │   ├── models.py           # Pydantic data models
+│   │   └── client.py           # HTTP client for hosted API
+│   └── tests/
+├── backend/                    # FastAPI + Celery + SQLite/Postgres
+│   ├── app/
+│   │   ├── main.py
+│   │   ├── routers/            # /v1/evals, /v1/traces
+│   │   ├── models/             # DB records + API schemas
+│   │   └── worker/             # Celery async eval tasks
+│   └── Dockerfile
+├── frontend/                   # React + TypeScript dashboard
+├── examples/
+│   └── langgraph_payments/     # Full runnable demo
+│       ├── agent.py
+│       ├── golden_v1.yaml
+│       └── run_eval.py
+└── docker-compose.yml
 ```
 ---
-## CI gate
+## Run the full stack locally
 ```bash
-cortexops eval run \
-  --dataset golden_v1.yaml \
-  --fail-on "task_completion < 0.90"
+git clone https://github.com/ashishodu2023/cortexops
+cd cortexops
+# Start API + worker + Redis
+docker compose up --build
+# In another terminal — run the demo eval
+cd examples/langgraph_payments
+pip install -e ../../sdk/
+python run_eval.py
+# API docs at http://localhost:8000/docs
+# Dashboard at http://localhost:3000
 ```
-Exits non-zero if the threshold is not met — blocks the PR.
+---
+## Supported frameworks
+| Framework | Status |
+|---|---|
+| LangGraph | Stable |
+| CrewAI | Stable |
+| AutoGen | Beta |
+| LlamaIndex agents | Coming soon |
+| Custom callables | Supported via `CortexTracer.wrap()` |
 ---
@@ -153,16 +257,49 @@ Exits non-zero if the threshold is not met — blocks the PR.
 | Metric | What it checks |
 |---|---|
-| `task_completion` | Non-empty, non-error output with expected content |
+| `task_completion` | Agent produced a valid, non-error output |
 | `tool_accuracy` | Expected tool calls were actually made |
 | `latency` | Response within `max_latency_ms` budget |
-| `hallucination` | Fabrication signals in output |
-| `llm_judge` | GPT-4o scores against natural-language criteria |
+| `hallucination` | Detects fabrication signals in output |
+Add custom metrics by subclassing `cortexops.Metric`.
+---
+## Contributing
+```bash
+git clone https://github.com/ashishodu2023/cortexops
+cd cortexops/sdk
+pip install -e ".[dev]"
+pytest tests/ -v
+```
+See [CONTRIBUTING.md](CONTRIBUTING.md). Issues labeled `good first issue` are a great place to start.
 ---
-## Links
+## Citation
+```bibtex
+@software{cortexops2025,
+  author  = {Ashish, et al.},
+  title   = {CortexOps: Reliability Infrastructure for AI Agents},
+  year    = {2025},
+  url     = {https://github.com/ashishodu2023/cortexops},
+}
+```
+---
+## License
+MIT — see [LICENSE](LICENSE).
+---
-- **Docs**: [docs.cortexops.ai](https://docs.cortexops.ai)
-- **Repo**: [github.com/ashishodu2023/cortexops](https://github.com/ashishodu2023/cortexops)
-- **Issues**: [GitHub Issues](https://github.com/ashishodu2023/cortexops/issues)
+<p align="center">
+  <a href="https://cortexops.ai">cortexops.ai</a> ·
+  <a href="https://github.com/ashishodu2023/cortexops/issues">Issues</a> ·
+  <a href="https://github.com/ashishodu2023/cortexops/discussions">Discussions</a>
+</p>

cortexops-0.4.0/README.md ADDED Viewed

@@ -0,0 +1,243 @@
+# CortexOps
+**Reliability infrastructure for AI agents.**
+Evaluate · Observe · Operate — for LangGraph, CrewAI, and AutoGen.
+[![PyPI version](https://img.shields.io/pypi/v/cortexops.svg)](https://pypi.org/project/cortexops/)
+[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
+[![CI](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml/badge.svg)](https://github.com/ashishodu2023/cortexops/actions/workflows/eval.yml)
+[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE)
+---
+## What's New in v0.4.0
+### LLM-as-judge evaluation
+```python
+from cortexops.judge import LLMJudge
+judge = LLMJudge(api_key="sk-...")
+result = judge.evaluate(
+    case_id="case-001",
+    input="Process refund for order #4821",
+    output="Refund of $49.99 approved and processed.",
+    rubric="task_completion",
+)
+print(result.score, result.passed, result.reasoning)
+```
+### Golden dataset API
+```python
+from cortexops.dataset import GoldenDataset
+ds = GoldenDataset(name="refund-agent-v1")
+ds.add(input="Refund order #4821", expected="refund_approved")
+ds.add(input="Cancel subscription", expected="subscription_cancelled")
+ds.save("datasets/refund_agent.yaml")
+results = ds.run(agent=your_agent, fail_on="task_completion < 0.90")
+```
+### CI/CD eval gate
+```bash
+cortexops eval run \
+  --dataset datasets/refund_agent.yaml \
+  --judge \
+  --fail-on "task_completion < 0.90"
+# Exit code 1 if regression detected — drop into GitHub Actions
+```
+## The problem
+You deployed an agent. You have no idea if it regressed overnight.
+No standard eval format. No failure traces. No CI gate before the next prompt change ships.
+CortexOps fixes that.
+---
+## Quickstart
+```bash
+pip install cortexops  # v0.4.0
+```
+```python
+from cortexops import CortexTracer, EvalSuite
+# Wrap your LangGraph app — zero refactor required
+tracer = CortexTracer(project="payments-agent")
+graph  = tracer.wrap(your_langgraph_app)
+# Run evaluations against a golden dataset
+results = EvalSuite.run(
+    dataset="golden_v1.yaml",
+    agent=graph,
+)
+print(results.summary())
+# CortexOps eval — payments-agent
+#   Cases           : 9  (7 passed, 2 failed)
+#   Task completion : 91.4%
+#   Tool accuracy   : 97.0/100
+#   Latency p50/p95 : 42ms / 187ms
+#   Failed cases:
+#     - escalation_router: tool_call_mismatch (score 41)
+```
+---
+## Golden dataset format
+Define test cases in YAML. Run them locally or in CI.
+```yaml
+# golden_v1.yaml
+version: 1
+project: payments-agent
+cases:
+  - id: refund_lookup_01
+    input: "What is the status of refund REF-8821?"
+    expected_tool_calls: [lookup_refund]
+    expected_output_contains: ["approved", "REF-8821"]
+    max_latency_ms: 3000
+  - id: dispute_escalation_01
+    input: "I was charged twice — this is unauthorized"
+    expected_tool_calls: [classify_dispute, route_escalation]
+    expected_output_contains: ["escalated"]
+    max_latency_ms: 5000
+```
+---
+## CI eval gate
+Add to `.github/workflows/eval.yml`:
+```yaml
+- name: CortexOps eval gate
+  run: |
+    python examples/langgraph_payments/run_eval.py \
+      --dataset golden_v1.yaml \
+      --fail-on "task_completion < 0.90"
+```
+If the eval drops below threshold, the job exits non-zero and the PR is blocked.
+---
+## Repo structure
+```
+cortexops/
+├── sdk/                        # pip install cortexops  # v0.4.0
+│   ├── cortexops/
+│   │   ├── tracer.py           # CortexTracer — wraps LangGraph / CrewAI
+│   │   ├── eval.py             # EvalSuite — golden dataset runner
+│   │   ├── metrics.py          # task_completion, tool_accuracy, latency, hallucination
+│   │   ├── models.py           # Pydantic data models
+│   │   └── client.py           # HTTP client for hosted API
+│   └── tests/
+├── backend/                    # FastAPI + Celery + SQLite/Postgres
+│   ├── app/
+│   │   ├── main.py
+│   │   ├── routers/            # /v1/evals, /v1/traces
+│   │   ├── models/             # DB records + API schemas
+│   │   └── worker/             # Celery async eval tasks
+│   └── Dockerfile
+├── frontend/                   # React + TypeScript dashboard
+├── examples/
+│   └── langgraph_payments/     # Full runnable demo
+│       ├── agent.py
+│       ├── golden_v1.yaml
+│       └── run_eval.py
+└── docker-compose.yml
+```
+---
+## Run the full stack locally
+```bash
+git clone https://github.com/ashishodu2023/cortexops
+cd cortexops
+# Start API + worker + Redis
+docker compose up --build
+# In another terminal — run the demo eval
+cd examples/langgraph_payments
+pip install -e ../../sdk/
+python run_eval.py
+# API docs at http://localhost:8000/docs
+# Dashboard at http://localhost:3000
+```
+---
+## Supported frameworks
+| Framework | Status |
+|---|---|
+| LangGraph | Stable |
+| CrewAI | Stable |
+| AutoGen | Beta |
+| LlamaIndex agents | Coming soon |
+| Custom callables | Supported via `CortexTracer.wrap()` |
+---
+## Built-in metrics
+| Metric | What it checks |
+|---|---|
+| `task_completion` | Agent produced a valid, non-error output |
+| `tool_accuracy` | Expected tool calls were actually made |
+| `latency` | Response within `max_latency_ms` budget |
+| `hallucination` | Detects fabrication signals in output |
+Add custom metrics by subclassing `cortexops.Metric`.
+---
+## Contributing
+```bash
+git clone https://github.com/ashishodu2023/cortexops
+cd cortexops/sdk
+pip install -e ".[dev]"
+pytest tests/ -v
+```
+See [CONTRIBUTING.md](CONTRIBUTING.md). Issues labeled `good first issue` are a great place to start.
+---
+## Citation
+```bibtex
+@software{cortexops2025,
+  author  = {Ashish, et al.},
+  title   = {CortexOps: Reliability Infrastructure for AI Agents},
+  year    = {2025},
+  url     = {https://github.com/ashishodu2023/cortexops},
+}
+```
+---
+## License
+MIT — see [LICENSE](LICENSE).
+---
+<p align="center">
+  <a href="https://cortexops.ai">cortexops.ai</a> ·
+  <a href="https://github.com/ashishodu2023/cortexops/issues">Issues</a> ·
+  <a href="https://github.com/ashishodu2023/cortexops/discussions">Discussions</a>
+</p>

{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/__init__.py RENAMED Viewed

@@ -10,6 +10,7 @@ Quickstart:
     print(results.summary())
 """
+from .auth import cmd_login, cmd_logout, cmd_whoami, load_credentials, save_credentials
 from .client import CortexClient
 from .eval import EvalSuite, EvalThresholdError
 from .judge import LLMJudgeMetric
@@ -31,10 +32,9 @@ from .models import (
     Trace,
     TraceNode,
 )
-from .auth import cmd_login, cmd_logout, cmd_whoami, save_credentials, load_credentials
 from .tracer import CortexTracer
-__version__ = "0.2.0"
+__version__ = "0.4.0"
 __all__ = [
     "CortexTracer",

{cortexops-0.2.0 → cortexops-0.4.0}/cortexops/cli.py RENAMED Viewed

@@ -127,6 +127,88 @@ def cmd_version(_: argparse.Namespace) -> int:
     return 0
+def cmd_dataset_create(args: argparse.Namespace) -> int:
+    """cortexops dataset create --name my-dataset --output dataset.yaml"""
+    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+    from cortexops.dataset import GoldenDataset
+    ds = GoldenDataset(name=args.name, description=args.description or "")
+    ds.save(args.output)
+    print(f"Created dataset: {args.output}")
+    print("  Add cases by editing the YAML file or using ds.add() in Python.")
+    return 0
+def cmd_eval_judge(args: argparse.Namespace) -> int:
+    """cortexops eval judge --input <str> --output <str> --rubric task_completion"""
+    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+    from cortexops.judge import RUBRICS, LLMJudge
+    api_key = args.api_key or os.getenv("OPENAI_API_KEY", "")
+    if not api_key:
+        print("Error: OPENAI_API_KEY not set. Pass --api-key or set the env var.", file=sys.stderr)
+        return 1
+    judge = LLMJudge(api_key=api_key, model=args.model or "gpt-4o-mini")
+    result = judge.evaluate(
+        case_id="cli-eval",
+        input=args.input,
+        output=args.output,
+        rubric=args.rubric or "task_completion",
+        expected=args.expected,
+    )
+    icon = "✓ PASS" if result.passed else "✗ FAIL"
+    print(f"\nLLM Judge Result: {icon}")
+    rubric_obj   = RUBRICS.get(args.rubric or 'task_completion')
+    threshold    = rubric_obj.pass_threshold if rubric_obj else 0.70
+    print(f"  Score:     {result.score:.3f} (threshold: {threshold:.2f})")
+    print(f"  Model:     {result.model}  ({result.latency_ms}ms)")
+    print(f"  Reasoning: {result.reasoning}")
+    if args.verbose:
+        print(f"  Criteria:  {result.criteria_scores}")
+    return 0 if result.passed else 1
+def cmd_eval_run_with_judge(args: argparse.Namespace) -> int:
+    """cortexops eval run --dataset d.yaml --judge --fail-on task_completion<0.90"""
+    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
+    from cortexops.dataset import GoldenDataset
+    ds = GoldenDataset.load(args.dataset)
+    print("CortexOps eval gate")
+    print(f"  dataset : {args.dataset} ({len(ds)} cases)")
+    print(f"  project : {args.project or ds.name}")
+    if args.fail_on:
+        print(f"  fail-on : {args.fail_on}")
+    if args.judge:
+        print(f"  judge   : LLM-as-judge ({args.model or 'gpt-4o-mini'})")
+    print()
+    def passthrough_agent(inp):
+        return {"output": f"[no agent] input: {inp}"}
+    try:
+        agent = _load_agent(args.agent) if getattr(args, "agent", None) else passthrough_agent
+        result = ds.run(
+            agent=agent,
+            fail_on=args.fail_on,
+            verbose=True,
+            use_judge=getattr(args, "judge", False),
+            judge_rubric=getattr(args, "rubric", "task_completion"),
+            judge_api_key=os.getenv("OPENAI_API_KEY"),
+        )
+        result.print_report()
+        return 0 if result.passed() else 1
+    except Exception as e:
+        print(f"\nEval failed: {e}", file=sys.stderr)
+        return 1
 def _load_agent(agent_path: str):
     """Load an agent from a dotted path like 'mymodule:my_agent'."""
     if ":" not in agent_path:

cortexops 0.2.0__tar.gz → 0.4.0__tar.gz

cortexops 0.2.0__tar.gz → 0.4.0__tar.gz