cortexops 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,184 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any
6
+
7
+ from .models import CaseResult, EvalCase, FailureKind, Trace
8
+
9
+
10
class Metric(ABC):
    """Abstract contract shared by every CortexOps eval metric.

    To add a custom metric, derive from this class, give it a ``name`` and
    provide a concrete ``score()``.
    """

    # Identifier of the metric; concrete metrics override it.
    name: str = "base"

    @abstractmethod
    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Evaluate ``trace`` against ``case``.

        Returns a 3-tuple ``(score, failure_kind, failure_detail)`` where the
        score lies in 0-100 and the last two items are ``None`` when nothing
        went wrong.
        """
20
+
21
+
22
class TaskCompletionMetric(Metric):
    """Checks whether the agent produced a non-empty, non-error output.

    The answer text is read from the first of the ``output``, ``result`` or
    ``answer`` keys present in ``trace.output``.
    """

    name = "task_completion"

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return (score 0-100, failure_kind or None, failure_detail or None).

        Scoring ladder, first match wins:
          * 0   - no usable output (empty, whitespace-only, "none" or "null")
          * 20  - output looks like an error message
          * 50-100 (exclusive) - some expected keywords are missing
          * 30  - a prohibited keyword is present
          * 100 - everything checks out
        """
        output = trace.output or {}
        output_str = str(output.get("output", output.get("result", output.get("answer", ""))))

        # Whitespace-only text carries no answer, so it is treated as empty too.
        if output_str.strip().lower() in ("", "none", "null"):
            return 0.0, FailureKind.UNKNOWN, "Agent produced no output"

        error_patterns = [r"error:", r"exception:", r"traceback", r"failed to"]
        for pat in error_patterns:
            if re.search(pat, output_str, re.IGNORECASE):
                return 20.0, FailureKind.UNKNOWN, f"Output contains error signal: {output_str[:100]}"

        # Lower-case once; every keyword check below is case-insensitive.
        haystack = output_str.lower()

        expected = case.expected_output_contains
        if expected:
            missing = [kw for kw in expected if kw.lower() not in haystack]
            if missing:
                hits = len(expected) - len(missing)
                ratio = hits / len(expected)
                return (
                    50.0 + 50.0 * ratio,
                    FailureKind.OUTPUT_FORMAT,
                    f"Missing expected content: {missing}",
                )

        if case.expected_output_not_contains:
            violations = [kw for kw in case.expected_output_not_contains if kw.lower() in haystack]
            if violations:
                return (
                    30.0,
                    FailureKind.HALLUCINATION,
                    f"Output contains prohibited content: {violations}",
                )

        return 100.0, None, None
60
+
61
+
62
class ToolAccuracyMetric(Metric):
    """Checks whether expected tool calls were actually made.

    Looks in two places (in priority order):
    1. Trace node tool_calls (full instrumentation via CortexTracer.record_tool_call)
    2. output['tool_calls_made'] list (lightweight self-reporting from the agent)
    """

    name = "tool_accuracy"

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return (score 0-100, failure_kind or None, failure_detail or None).

        The score is the percentage of ``case.expected_tool_calls`` that were
        observed, rounded to one decimal; 100 when nothing is expected.
        """
        if not case.expected_tool_calls:
            return 100.0, None, None

        # Priority 1: instrumented trace nodes
        actual_calls = {tc.name for tc in trace.tool_calls()}

        # Priority 2: agent self-reported via output dict
        if not actual_calls:
            # Guard against a missing output dict, as TaskCompletionMetric does.
            reported = (trace.output or {}).get("tool_calls_made", [])
            if isinstance(reported, list):
                # Only string names can match an expected tool; skipping other
                # entries also avoids a TypeError on unhashable items (dicts).
                actual_calls = {item for item in reported if isinstance(item, str)}

        expected = set(case.expected_tool_calls)
        missing = expected - actual_calls

        if not missing:
            return 100.0, None, None

        ratio = len(expected - missing) / len(expected)
        return (
            round(ratio * 100, 1),
            FailureKind.TOOL_CALL_MISMATCH,
            f"Missing tool calls: {sorted(missing)}",
        )
97
+
98
+
99
class LatencyMetric(Metric):
    """Checks whether the agent responded within the required latency budget."""

    name = "latency"

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return (score 0-100, failure_kind or None, failure_detail or None).

        A case without ``max_latency_ms`` always scores 100. When the budget
        is exceeded the score drops linearly with the relative overage and
        reaches 0 once the latency is twice the budget.
        """
        budget = case.max_latency_ms
        if budget is None:
            return 100.0, None, None

        elapsed = trace.total_latency_ms
        if elapsed <= budget:
            return 100.0, None, None

        if budget <= 0:
            # A zero budget would divide by zero and a negative one would push
            # the score above 100; any overage on such a budget scores 0.
            latency_score = 0.0
        else:
            overage = elapsed - budget
            latency_score = max(0.0, 100.0 - (overage / budget) * 100)

        return (
            latency_score,
            FailureKind.TIMEOUT,
            f"Latency {trace.total_latency_ms:.0f}ms exceeded budget {case.max_latency_ms:.0f}ms",
        )
115
+
116
+
117
class HallucinationMetric(Metric):
    """Regex-based screen for hallucination signals in the agent output.

    The stringified ``trace.output`` is matched case-insensitively against
    ``HALLUCINATION_PATTERNS``; the first pattern that matches decides the
    result.
    """

    name = "hallucination"

    HALLUCINATION_PATTERNS = [
        r"\bas of (january|february|march|april|may|june|july|august|september|october|november|december) 20[0-9]{2}\b",
        r"\bi (don't|do not) have (access|information|data)\b",
        r"\bi cannot (access|retrieve|look up)\b",
    ]

    def score(self, case: EvalCase, trace: Trace) -> tuple[float, FailureKind | None, str | None]:
        """Return (40, HALLUCINATION, detail) on the first match, else (100, None, None)."""
        text = str(trace.output)
        triggered = next(
            (
                pattern
                for pattern in self.HALLUCINATION_PATTERNS
                if re.search(pattern, text, re.IGNORECASE)
            ),
            None,
        )
        if triggered is None:
            return 100.0, None, None
        detail = f"Hallucination signal detected: pattern '{triggered}'"
        return 40.0, FailureKind.HALLUCINATION, detail
140
+
141
+
142
def compute_case_result(case: EvalCase, trace: Trace, extra_metrics: "list[Metric] | None" = None) -> CaseResult:
    """Score ``trace`` against ``case`` with all metrics and build a CaseResult.

    The four built-in metrics always run. An LLM judge is appended when
    ``case.judge == "llm"``, followed by any ``extra_metrics``. The final
    score is the plain average of all metric scores and the case passes at
    80 or above. The reported failure kind/detail come from the first metric
    (in execution order) that scored below 100.
    """
    metrics: list[Metric] = [
        TaskCompletionMetric(),
        ToolAccuracyMetric(),
        LatencyMetric(),
        HallucinationMetric(),
    ]

    if case.judge == "llm":
        # Imported lazily so the judge module is only loaded when a case asks for it.
        from .judge import LLMJudgeMetric
        metrics.append(LLMJudgeMetric())

    if extra_metrics:
        metrics.extend(extra_metrics)

    scores: list[float] = []
    failure_kind: FailureKind | None = None
    failure_detail: str | None = None

    for metric in metrics:
        s, fk, fd = metric.score(case, trace)
        scores.append(s)
        if s < 100.0 and failure_kind is None:
            failure_kind = fk
            failure_detail = fd

    final_score = sum(scores) / len(scores)

    # The first three entries are always the task-completion, tool-accuracy
    # and latency metrics (see the list above); reuse their scores instead of
    # running each metric a second time.
    task_ok_score, tool_score, lat_score = scores[:3]

    return CaseResult(
        case_id=case.id,
        passed=final_score >= 80.0,
        score=round(final_score, 1),
        task_completion=task_ok_score >= 80.0,
        tool_accuracy=round(tool_score, 1),
        latency_ms=trace.total_latency_ms,
        latency_ok=lat_score >= 80.0,
        failure_kind=failure_kind,
        failure_detail=failure_detail,
        trace=trace,
    )
@@ -0,0 +1,141 @@
1
+ from __future__ import annotations
2
+
3
+ import uuid
4
+ from datetime import datetime
5
+ from enum import Enum
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
class RunStatus(str, Enum):
    """Lifecycle state of a run; members are also plain strings."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
16
+
17
+
18
class ToolCallStatus(str, Enum):
    """Outcome of a single tool call; members are also plain strings."""

    SUCCESS = "success"
    ERROR = "error"
    TIMEOUT = "timeout"
22
+
23
+
24
class FailureKind(str, Enum):
    """Category attached to a failed trace or eval case; members are also plain strings."""

    TOOL_CALL_MISMATCH = "tool_call_mismatch"
    HALLUCINATION = "hallucination"
    PLAN_DEVIATION = "plan_deviation"
    TIMEOUT = "timeout"
    CONTEXT_OVERFLOW = "context_overflow"
    OUTPUT_FORMAT = "output_format"
    # Catch-all for failures that fit no other category.
    UNKNOWN = "unknown"
32
+
33
+
34
class ToolCall(BaseModel):
    """Record of one tool invocation made by an agent."""

    # Tool name.
    name: str
    # Arguments passed to the tool; empty dict when none were recorded.
    args: dict[str, Any] = Field(default_factory=dict)
    # Whatever the tool returned, if recorded.
    result: Any = None
    status: ToolCallStatus = ToolCallStatus.SUCCESS
    latency_ms: float = 0.0
    # Error message, if any.
    error: str | None = None
41
+
42
+
43
class TraceNode(BaseModel):
    """One instrumented step (node) inside a trace."""

    node_id: str
    node_name: str
    input: dict[str, Any] = Field(default_factory=dict)
    output: dict[str, Any] = Field(default_factory=dict)
    # Tool calls recorded against this node.
    tool_calls: list[ToolCall] = Field(default_factory=list)
    llm_prompt: str | None = None
    llm_response: str | None = None
    latency_ms: float = 0.0
    # NOTE(review): datetime.utcnow returns a naive datetime and is deprecated
    # since Python 3.12 - consider a timezone-aware default.
    timestamp: datetime = Field(default_factory=datetime.utcnow)
53
+
54
+
55
class Trace(BaseModel):
    """Record of a single agent run: its input, output, timing and nodes."""

    trace_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    project: str
    run_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    case_id: str | None = None
    nodes: list[TraceNode] = Field(default_factory=list)
    input: dict[str, Any] = Field(default_factory=dict)
    output: dict[str, Any] = Field(default_factory=dict)
    total_latency_ms: float = 0.0
    status: RunStatus = RunStatus.COMPLETED
    failure_kind: FailureKind | None = None
    failure_detail: str | None = None
    timestamp: datetime = Field(default_factory=datetime.utcnow)

    def tool_calls(self) -> list[ToolCall]:
        """Collect the tool calls of every node into one flat list, in node order."""
        collected: list[ToolCall] = []
        for step in self.nodes:
            collected.extend(step.tool_calls)
        return collected

    def total_tool_calls(self) -> int:
        """Count the tool calls recorded across all nodes."""
        every_call = self.tool_calls()
        return len(every_call)
74
+
75
+
76
class EvalCase(BaseModel):
    """One evaluation case: the input plus the expectations checked against a trace."""

    id: str
    input: str | dict[str, Any]
    # Tool names that must appear among the tool calls of the run.
    expected_tool_calls: list[str] = Field(default_factory=list)
    # Keywords that must / must not occur in the output (matched case-insensitively).
    expected_output_contains: list[str] = Field(default_factory=list)
    expected_output_not_contains: list[str] = Field(default_factory=list)
    # Latency budget in milliseconds; None disables the latency check.
    max_latency_ms: float | None = None
    judge: str = "rule"  # "rule" | "llm"
    judge_criteria: str | None = None
    tags: list[str] = Field(default_factory=list)
86
+
87
+
88
class EvalDataset(BaseModel):
    """A versioned collection of eval cases for one project."""

    version: int = 1
    project: str
    description: str = ""
    cases: list[EvalCase] = Field(default_factory=list)
93
+
94
+
95
class CaseResult(BaseModel):
    """Outcome of evaluating a single case."""

    case_id: str
    passed: bool
    score: float  # 0.0 - 100.0
    task_completion: bool
    tool_accuracy: float
    latency_ms: float
    latency_ok: bool
    failure_kind: FailureKind | None = None
    failure_detail: str | None = None
    # The trace the result was computed from, when retained.
    trace: Trace | None = None
106
+
107
+
108
class EvalSummary(BaseModel):
    """Aggregated result of an eval run over a dataset."""

    run_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    project: str
    dataset_version: int
    total_cases: int
    passed: int
    failed: int
    warnings: int
    # Formatted with ``:.1%`` in summary(), i.e. treated there as a 0-1 fraction.
    task_completion_rate: float
    # Printed as ``x/100`` in summary().
    tool_accuracy: float
    latency_p50_ms: float
    latency_p95_ms: float
    regressions: int = 0
    baseline_run_id: str | None = None
    case_results: list[CaseResult] = Field(default_factory=list)
    # NOTE(review): datetime.utcnow returns a naive datetime and is deprecated
    # since Python 3.12 - consider a timezone-aware default.
    timestamp: datetime = Field(default_factory=datetime.utcnow)

    def summary(self) -> str:
        """Render a multi-line, human-readable report of the run.

        The regressions line is only included when ``regressions`` is
        non-zero, and the failed-case list only when at least one case
        did not pass.
        """
        lines = [
            f"CortexOps eval — {self.project}",
            f" Run ID : {self.run_id}",
            f" Cases : {self.total_cases} ({self.passed} passed, {self.failed} failed)",
            f" Task completion : {self.task_completion_rate:.1%}",
            f" Tool accuracy : {self.tool_accuracy:.1f}/100",
            f" Latency p50/p95 : {self.latency_p50_ms:.0f}ms / {self.latency_p95_ms:.0f}ms",
        ]
        if self.regressions:
            lines.append(f" Regressions : {self.regressions} (vs baseline {self.baseline_run_id})")
        failing = [r for r in self.case_results if not r.passed]
        if failing:
            lines.append(" Failed cases:")
            for r in failing:
                # Use the enum's value explicitly: formatting a (str, Enum)
                # member inside an f-string prints "FailureKind.X" instead of
                # the value on recent Python versions.
                kind = r.failure_kind.value if r.failure_kind else "unknown"
                lines.append(f" - {r.case_id}: {kind} (score {r.score:.0f})")
        return "\n".join(lines)
@@ -0,0 +1,210 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ import uuid
5
+ from contextlib import contextmanager
6
+ from typing import Any, Callable
7
+
8
+ from .models import FailureKind, RunStatus, Trace, TraceNode, ToolCall, ToolCallStatus
9
+
10
+
11
class CortexTracer:
    """Instruments AI agents with zero-refactor tracing.

    Supports LangGraph StateGraph and CrewAI Crew out of the box.
    Falls back to a generic callable wrapper for any other agent type.

    Usage:
        tracer = CortexTracer(project="payments-agent")
        graph = tracer.wrap(your_langgraph_app)
        result = graph.invoke({"messages": [...]})
        trace = tracer.last_trace()

    Note: ``environment``, ``sample_rate`` and ``local_store`` are stored on
    the instance but not consulted anywhere in this class.
    """

    def __init__(
        self,
        project: str,
        api_key: str | None = None,
        environment: str = "development",
        sample_rate: float = 1.0,
        local_store: bool = True,
    ) -> None:
        """Create a tracer for ``project``.

        When ``api_key`` is set, every finished trace is also posted to the
        CortexOps API (see ``_flush_trace``).
        """
        self.project = project
        self.api_key = api_key
        self.environment = environment
        self.sample_rate = sample_rate
        self.local_store = local_store
        self._traces: list[Trace] = []
        self._current_trace: Trace | None = None
        # Nodes whose ``trace_node`` context is currently open, innermost last.
        self._node_stack: list[TraceNode] = []

    def wrap(self, agent: Any) -> Any:
        """Auto-detect agent type and return an instrumented wrapper.

        Detection is by class name (``CompiledStateGraph`` / ``Crew``), then by
        being callable or exposing ``invoke``.

        Raises:
            TypeError: if the agent matches none of the supported shapes.
        """
        agent_type = type(agent).__name__

        if agent_type == "CompiledStateGraph":
            return self._wrap_langgraph(agent)

        if agent_type == "Crew":
            return self._wrap_crewai(agent)

        if callable(agent) or hasattr(agent, "invoke"):
            return self._wrap_callable(agent)

        raise TypeError(
            f"CortexTracer.wrap() does not support {agent_type}. "
            "Pass a LangGraph CompiledStateGraph, CrewAI Crew, or any callable."
        )

    def _wrap_langgraph(self, graph: Any) -> Any:
        """Return a proxy that traces ``invoke``/``ainvoke`` and forwards everything else."""
        tracer = self

        class InstrumentedGraph:
            def invoke(self_, input: dict, config: dict | None = None, **kwargs) -> dict:
                return tracer._run_traced(
                    fn=lambda: graph.invoke(input, config, **kwargs),
                    input=input,
                    framework="langgraph",
                )

            async def ainvoke(self_, input: dict, config: dict | None = None, **kwargs) -> dict:
                import asyncio

                # Runs the synchronous invoke in the default executor so the
                # event loop is not blocked. Inside a coroutine the running
                # loop is the right one to use.
                loop = asyncio.get_running_loop()
                return await loop.run_in_executor(
                    None,
                    lambda: tracer._run_traced(
                        fn=lambda: graph.invoke(input, config, **kwargs),
                        input=input,
                        framework="langgraph",
                    ),
                )

            def stream(self_, input: dict, config: dict | None = None, **kwargs):
                # Streaming is passed straight through and is not traced.
                return graph.stream(input, config, **kwargs)

            def __getattr__(self_, name: str):
                return getattr(graph, name)

        return InstrumentedGraph()

    def _wrap_crewai(self, crew: Any) -> Any:
        """Return a proxy that traces ``kickoff`` and forwards everything else."""
        tracer = self

        class InstrumentedCrew:
            def kickoff(self_, inputs: dict | None = None) -> Any:
                return tracer._run_traced(
                    fn=lambda: crew.kickoff(inputs=inputs),
                    input=inputs or {},
                    framework="crewai",
                )

            def __getattr__(self_, name: str):
                return getattr(crew, name)

        return InstrumentedCrew()

    def _wrap_callable(self, fn: Any) -> Any:
        """Wrap an object exposing ``invoke`` or a plain callable with tracing."""
        tracer = self

        if hasattr(fn, "invoke"):
            # Object with .invoke() — wrap that method
            original_invoke = fn.invoke

            class InvokeWrapper:
                def invoke(self_, *args, **kwargs):
                    # First positional argument is taken as the input; with
                    # no positionals the keyword arguments are used instead.
                    input_data = args[0] if args else kwargs
                    return tracer._run_traced(
                        fn=lambda: original_invoke(*args, **kwargs),
                        input=input_data if isinstance(input_data, dict) else {"input": input_data},
                        framework="generic",
                    )

                def __getattr__(self_, name: str):
                    return getattr(fn, name)

            return InvokeWrapper()

        # Plain callable
        def wrapper(*args, **kwargs):
            input_data = {"args": list(args), "kwargs": kwargs}
            return tracer._run_traced(fn=lambda: fn(*args, **kwargs), input=input_data, framework="generic")

        return wrapper

    def _run_traced(self, fn: Callable, input: dict, framework: str) -> Any:
        """Run ``fn`` while recording a Trace; exceptions are recorded and re-raised.

        The trace is stored (and flushed when an API key is set) whether the
        call succeeds or fails. Non-dict results are stored as
        ``{"result": str(result)}``.
        """
        trace = Trace(
            project=self.project,
            input=input,
        )
        self._current_trace = trace
        t0 = time.perf_counter()

        try:
            result = fn()
            trace.total_latency_ms = (time.perf_counter() - t0) * 1000
            trace.status = RunStatus.COMPLETED
            trace.output = result if isinstance(result, dict) else {"result": str(result)}
        except Exception as exc:
            trace.total_latency_ms = (time.perf_counter() - t0) * 1000
            trace.status = RunStatus.FAILED
            trace.failure_kind = FailureKind.UNKNOWN
            trace.failure_detail = str(exc)
            raise
        finally:
            self._traces.append(trace)
            if self.api_key:
                self._flush_trace(trace)

        return result

    @contextmanager
    def trace_node(self, node_name: str):
        """Context manager to manually instrument a single node.

        Yields the TraceNode so callers can fill in its fields. While the
        context is open, ``record_tool_call`` attaches calls to this node.
        On exit the node's latency is set and the node is appended to the
        current trace, if there is one.
        """
        node = TraceNode(node_id=str(uuid.uuid4()), node_name=node_name)
        # Mark the node as active so tool calls recorded inside the block
        # land on it rather than on a previously finished node.
        self._node_stack.append(node)
        t0 = time.perf_counter()
        try:
            yield node
        finally:
            node.latency_ms = (time.perf_counter() - t0) * 1000
            if self._node_stack and self._node_stack[-1] is node:
                self._node_stack.pop()
            if self._current_trace:
                self._current_trace.nodes.append(node)

    def record_tool_call(
        self,
        name: str,
        args: dict | None = None,
        result: Any = None,
        error: str | None = None,
        latency_ms: float = 0.0,
    ) -> ToolCall:
        """Manually record a tool call onto the current active trace.

        The call is attached, in order of preference, to the innermost open
        ``trace_node``, else to the last node of the current trace. If the
        current trace has no node yet, an implicit node named ``tool_calls``
        is created so the call is not lost. Without a current trace or open
        node the ToolCall is only returned.
        """
        tc = ToolCall(
            name=name,
            args=args or {},
            result=result,
            status=ToolCallStatus.ERROR if error else ToolCallStatus.SUCCESS,
            latency_ms=latency_ms,
            error=error,
        )
        if self._node_stack:
            self._node_stack[-1].tool_calls.append(tc)
        elif self._current_trace is not None:
            if not self._current_trace.nodes:
                # No node exists to hold the call; create a holder node.
                self._current_trace.nodes.append(
                    TraceNode(node_id=str(uuid.uuid4()), node_name="tool_calls")
                )
            self._current_trace.nodes[-1].tool_calls.append(tc)
        return tc

    def last_trace(self) -> Trace | None:
        """Return the most recently stored trace, or None when there is none."""
        return self._traces[-1] if self._traces else None

    def traces(self) -> list[Trace]:
        """Return a shallow copy of all stored traces, oldest first."""
        return list(self._traces)

    def clear(self) -> None:
        """Forget all stored traces and the current trace."""
        self._traces.clear()
        self._current_trace = None

    def _flush_trace(self, trace: Trace) -> None:
        """Best-effort upload of ``trace``; failures are logged at debug level only."""
        try:
            import httpx
            httpx.post(
                "https://api.cortexops.ai/v1/traces",
                json=trace.model_dump(mode="json"),
                headers={"Authorization": f"Bearer {self.api_key}"},
                timeout=2.0,
            )
        except Exception:
            # Tracing must never break the agent, so the error is not raised;
            # it is logged so that a failing upload can still be diagnosed.
            import logging
            logging.getLogger(__name__).debug("CortexOps trace upload failed", exc_info=True)