evalcraft 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. evalcraft/__init__.py +50 -0
  2. evalcraft/adapters/__init__.py +47 -0
  3. evalcraft/adapters/anthropic_adapter.py +298 -0
  4. evalcraft/adapters/autogen_adapter.py +362 -0
  5. evalcraft/adapters/crewai_adapter.py +361 -0
  6. evalcraft/adapters/langgraph_adapter.py +546 -0
  7. evalcraft/adapters/llamaindex_adapter.py +434 -0
  8. evalcraft/adapters/openai_adapter.py +305 -0
  9. evalcraft/alerts/__init__.py +7 -0
  10. evalcraft/alerts/email.py +195 -0
  11. evalcraft/alerts/slack.py +201 -0
  12. evalcraft/alerts/webhook.py +82 -0
  13. evalcraft/capture/__init__.py +0 -0
  14. evalcraft/capture/recorder.py +318 -0
  15. evalcraft/cli/__init__.py +0 -0
  16. evalcraft/cli/init_cmd.py +245 -0
  17. evalcraft/cli/main.py +1128 -0
  18. evalcraft/cli/templates/__init__.py +1 -0
  19. evalcraft/cli/templates/conftest.py +42 -0
  20. evalcraft/cli/templates/evalcraft.toml +40 -0
  21. evalcraft/cli/templates/test_agent_anthropic.py +148 -0
  22. evalcraft/cli/templates/test_agent_crewai.py +138 -0
  23. evalcraft/cli/templates/test_agent_generic.py +155 -0
  24. evalcraft/cli/templates/test_agent_langgraph.py +141 -0
  25. evalcraft/cli/templates/test_agent_openai.py +140 -0
  26. evalcraft/cloud/__init__.py +5 -0
  27. evalcraft/cloud/client.py +361 -0
  28. evalcraft/core/__init__.py +0 -0
  29. evalcraft/core/models.py +329 -0
  30. evalcraft/eval/__init__.py +0 -0
  31. evalcraft/eval/scorers/__init__.py +368 -0
  32. evalcraft/golden/__init__.py +5 -0
  33. evalcraft/golden/manager.py +402 -0
  34. evalcraft/mock/__init__.py +0 -0
  35. evalcraft/mock/llm.py +243 -0
  36. evalcraft/mock/tool.py +213 -0
  37. evalcraft/pytest_plugin/__init__.py +35 -0
  38. evalcraft/pytest_plugin/plugin.py +397 -0
  39. evalcraft/regression/__init__.py +10 -0
  40. evalcraft/regression/detector.py +447 -0
  41. evalcraft/replay/__init__.py +0 -0
  42. evalcraft/replay/engine.py +337 -0
  43. evalcraft/replay/network_guard.py +181 -0
  44. evalcraft/sanitize/__init__.py +5 -0
  45. evalcraft/sanitize/redactor.py +261 -0
  46. evalcraft-0.1.0.dist-info/METADATA +431 -0
  47. evalcraft-0.1.0.dist-info/RECORD +50 -0
  48. evalcraft-0.1.0.dist-info/WHEEL +4 -0
  49. evalcraft-0.1.0.dist-info/entry_points.txt +5 -0
  50. evalcraft-0.1.0.dist-info/licenses/LICENSE +21 -0
evalcraft/__init__.py ADDED
@@ -0,0 +1,50 @@
1
+ """Evalcraft — The pytest for AI agents.
2
+
3
+ Capture, replay, mock, and evaluate agent behavior.
4
+ """
5
+
6
+ __version__ = "0.1.0"
7
+
8
+ from evalcraft.capture.recorder import capture, CaptureContext
9
+ from evalcraft.replay.engine import replay, ReplayEngine
10
+ from evalcraft.mock.llm import MockLLM
11
+ from evalcraft.mock.tool import MockTool
12
+ from evalcraft.eval.scorers import (
13
+ assert_tool_called,
14
+ assert_tool_order,
15
+ assert_no_tool_called,
16
+ assert_output_contains,
17
+ assert_output_matches,
18
+ assert_cost_under,
19
+ assert_latency_under,
20
+ assert_token_count_under,
21
+ )
22
+ from evalcraft.core.models import Span, Cassette, AgentRun, EvalResult
23
+ from evalcraft.golden.manager import GoldenSet
24
+ from evalcraft.regression.detector import RegressionDetector, RegressionReport
25
+ from evalcraft.cloud.client import EvalcraftCloud
26
+
27
+ __all__ = [
28
+ "capture",
29
+ "CaptureContext",
30
+ "replay",
31
+ "ReplayEngine",
32
+ "MockLLM",
33
+ "MockTool",
34
+ "assert_tool_called",
35
+ "assert_tool_order",
36
+ "assert_no_tool_called",
37
+ "assert_output_contains",
38
+ "assert_output_matches",
39
+ "assert_cost_under",
40
+ "assert_latency_under",
41
+ "assert_token_count_under",
42
+ "Span",
43
+ "Cassette",
44
+ "AgentRun",
45
+ "EvalResult",
46
+ "GoldenSet",
47
+ "RegressionDetector",
48
+ "RegressionReport",
49
+ "EvalcraftCloud",
50
+ ]
@@ -0,0 +1,47 @@
1
+ """Framework adapters — auto-capture LLM and agent calls into evalcraft spans.
2
+
3
+ Available adapters:
4
+
5
+ - :class:`OpenAIAdapter` — patches the OpenAI Python SDK to record every
6
+ ``chat.completions.create()`` call (sync and async).
7
+
8
+ - :class:`AnthropicAdapter` — patches the Anthropic Python SDK to record every
9
+ ``messages.create()`` call (sync and async).
10
+
11
+ - :class:`LangGraphAdapter` — injects a LangChain callback handler into a
12
+ compiled LangGraph graph to record node executions, LLM calls, and tool
13
+ calls.
14
+
15
+ - :class:`CrewAIAdapter` — instruments a CrewAI ``Crew`` to capture
16
+ ``kickoff()`` timing, agent tool calls, task completions, and
17
+ inter-agent delegations.
18
+
19
+ - :class:`AutoGenAdapter` — patches AutoGen's ``ConversableAgent`` to capture
20
+ inter-agent messages, LLM responses, and function / tool call executions.
21
+
22
+ - :class:`LlamaIndexAdapter` — hooks into LlamaIndex's callback system to
23
+ capture queries, retrieval, LLM synthesis, and function calls.
24
+
25
+ Usage::
26
+
27
+ from evalcraft.adapters import (
28
+ OpenAIAdapter, AnthropicAdapter, LangGraphAdapter, CrewAIAdapter,
29
+ AutoGenAdapter, LlamaIndexAdapter,
30
+ )
31
+ """
32
+
33
+ from evalcraft.adapters.anthropic_adapter import AnthropicAdapter
34
+ from evalcraft.adapters.autogen_adapter import AutoGenAdapter
35
+ from evalcraft.adapters.crewai_adapter import CrewAIAdapter
36
+ from evalcraft.adapters.langgraph_adapter import LangGraphAdapter
37
+ from evalcraft.adapters.llamaindex_adapter import LlamaIndexAdapter
38
+ from evalcraft.adapters.openai_adapter import OpenAIAdapter
39
+
40
+ __all__ = [
41
+ "OpenAIAdapter",
42
+ "AnthropicAdapter",
43
+ "LangGraphAdapter",
44
+ "CrewAIAdapter",
45
+ "AutoGenAdapter",
46
+ "LlamaIndexAdapter",
47
+ ]
@@ -0,0 +1,298 @@
1
+ """Anthropic SDK adapter — auto-captures LLM calls into evalcraft spans.
2
+
3
+ Monkey-patches ``anthropic.resources.messages.Messages`` so every call to
4
+ ``client.messages.create()`` (sync or async) is automatically recorded into
5
+ the active :class:`~evalcraft.capture.recorder.CaptureContext`.
6
+
7
+ Usage::
8
+
9
+ from evalcraft.adapters import AnthropicAdapter
10
+ from evalcraft import CaptureContext
11
+ import anthropic
12
+
13
+ client = anthropic.Anthropic()
14
+
15
+ with CaptureContext(name="weather_test") as ctx:
16
+ with AnthropicAdapter():
17
+ response = client.messages.create(
18
+ model="claude-3-5-sonnet-20241022",
19
+ max_tokens=1024,
20
+ messages=[{"role": "user", "content": "What's the weather?"}],
21
+ )
22
+
23
+ cassette = ctx.cassette
24
+ print(cassette.total_tokens, cassette.total_cost_usd)
25
+
26
+ The adapter works with any Anthropic client instance because it patches the
27
+ class-level method rather than a specific client instance.
28
+
29
+ Thread / async safety: the adapter is NOT reentrant — don't nest two
30
+ ``AnthropicAdapter`` context managers. It restores the original methods on
31
+ exit even if an exception is raised.
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ import time
37
+ from typing import Any
38
+
39
+ from evalcraft.capture.recorder import get_active_context
40
+ from evalcraft.core.models import Span, SpanKind, TokenUsage
41
+
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Pricing table — approximate cost per 1 M tokens (input_usd, output_usd).
45
+ # Prices reflect Anthropic's public rates as of early 2026; update as needed.
46
+ # ---------------------------------------------------------------------------
47
+ _MODEL_PRICING: dict[str, tuple[float, float]] = {
48
+ # Claude 4.x
49
+ "claude-opus-4-6": (15.00, 75.00),
50
+ "claude-sonnet-4-6": (3.00, 15.00),
51
+ "claude-haiku-4-5-20251001": (0.80, 4.00),
52
+ # Claude 3.5
53
+ "claude-3-5-sonnet-20241022": (3.00, 15.00),
54
+ "claude-3-5-sonnet-20240620": (3.00, 15.00),
55
+ "claude-3-5-haiku-20241022": (0.80, 4.00),
56
+ # Claude 3
57
+ "claude-3-opus-20240229": (15.00, 75.00),
58
+ "claude-3-sonnet-20240229": (3.00, 15.00),
59
+ "claude-3-haiku-20240307": (0.25, 1.25),
60
+ # Claude 2
61
+ "claude-2.1": (8.00, 24.00),
62
+ "claude-2.0": (8.00, 24.00),
63
+ # Claude Instant
64
+ "claude-instant-1.2": (0.80, 2.40),
65
+ }
66
+
67
+ _UNKNOWN_MODEL = "unknown"
68
+
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # Helpers
72
+ # ---------------------------------------------------------------------------
73
+
74
def _estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float | None:
    """Estimate the USD cost of one call from the module pricing table.

    The lookup first tries an exact match on *model*.  If that fails, table
    keys that are a prefix of *model* are considered (this covers dated
    variants that are not listed explicitly) and the longest such key wins,
    so the result does not depend on the table's insertion order.

    Args:
        model: Model identifier as reported by the response or the request.
        input_tokens: Number of prompt tokens billed.
        output_tokens: Number of completion tokens billed.

    Returns:
        The estimated cost in USD, or ``None`` when *model* is not a string
        or no table entry matches it.
    """
    if not isinstance(model, str):
        # A missing or placeholder model cannot be priced; do not raise from
        # what is only a best-effort estimate.
        return None

    pricing = _MODEL_PRICING.get(model)
    if pricing is None:
        matching_keys = [key for key in _MODEL_PRICING if model.startswith(key)]
        if not matching_keys:
            return None
        # Most specific (longest) prefix wins when several keys match.
        pricing = _MODEL_PRICING[max(matching_keys, key=len)]

    input_usd, output_usd = pricing
    # Table prices are quoted per one million tokens.
    return (input_tokens * input_usd + output_tokens * output_usd) / 1_000_000
87
+
88
+
89
def _lookup(obj: Any, name: str, default: Any) -> Any:
    """Read *name* from a dict key or, for any other object, an attribute."""
    if isinstance(obj, dict):
        return obj.get(name, default)
    return getattr(obj, name, default)


def _content_block_text(block: Any) -> str | None:
    """Return the text carried by one content block, or ``None`` to skip it."""
    if isinstance(block, str):
        return block
    block_type = _lookup(block, "type", None)
    if not isinstance(block, dict) and block_type is None:
        # Untyped non-dict value: fall back to its string form.
        return str(block)
    if block_type != "text":
        # Non-text blocks (images, tool use, ...) are left out of the summary.
        return None
    text = _lookup(block, "text", "")
    # Coerce so a missing/None text can never break the join below.
    return "" if text is None else str(text)


def _messages_to_str(messages: list[dict[str, Any]]) -> str:
    """Flatten an Anthropic ``messages`` list into a single readable string.

    Each message becomes one ``"<role>: <content>"`` line.  When the content
    is a list of blocks only the text parts are kept, joined by single
    spaces.  Messages and blocks may be dicts or objects exposing the same
    fields as attributes; both forms are read the same way.

    Args:
        messages: The ``messages`` argument of a ``messages.create()`` call.

    Returns:
        The rendered lines joined with newlines (``""`` for an empty list).
    """
    lines: list[str] = []
    for msg in messages:
        role = _lookup(msg, "role", "unknown")
        content = _lookup(msg, "content", "")
        if isinstance(content, list):
            texts = (_content_block_text(block) for block in content)
            content = " ".join(text for text in texts if text is not None)
        lines.append(f"{role}: {content}")
    return "\n".join(lines)
105
+
106
+
107
def _response_to_str(response: Any) -> str:
    """Render a Message response as plain text.

    Text blocks contribute their text and tool-use blocks contribute a
    ``[tool_use:<name>(<input>)]`` summary; every other block type is
    ignored.  The pieces are joined with single spaces and the result is
    stripped.  If the response does not have the expected shape, its
    ``str()`` form is returned instead.
    """

    def render(item: Any) -> list:
        # Zero or one rendered piece per block, so callers can simply flatten.
        kind = getattr(item, "type", None)
        if kind == "text":
            return [getattr(item, "text", "")]
        if kind == "tool_use":
            name = getattr(item, "name", "")
            tool_input = getattr(item, "input", {})
            return [f"[tool_use:{name}({tool_input})]"]
        return []

    try:
        blocks = response.content
        if not blocks:
            return ""
        pieces = [piece for item in blocks for piece in render(item)]
        return " ".join(pieces).strip()
    except (AttributeError, TypeError):
        # Not a Message-like object (or it holds non-string text).
        return str(response)
125
+
126
+
127
def _get_stop_reason(response: Any) -> str:
    """Return the response's ``stop_reason``, or ``""`` when absent or falsy.

    ``getattr`` with a default already absorbs ``AttributeError``, so no
    surrounding ``try``/``except`` is needed.
    """
    return getattr(response, "stop_reason", "") or ""
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Adapter
136
+ # ---------------------------------------------------------------------------
137
+
138
class AnthropicAdapter:
    """Patches the Anthropic SDK to auto-record every messages.create() call.

    Works as both a **sync** and **async** context manager.  Patches the
    ``Messages`` and ``AsyncMessages`` classes so *all* client instances
    are captured.

    .. code-block:: python

        with AnthropicAdapter():
            response = client.messages.create(...)

        async with AnthropicAdapter():
            response = await client.messages.create(...)

    Spans are silently dropped when no :class:`CaptureContext` is active —
    so the adapter is safe to leave in place during non-test code paths.

    When the wrapped call raises, the exception is recorded as an error span
    and then re-raised unchanged; a failure while recording that span is
    logged rather than allowed to replace the original exception.

    Raises:
        ImportError: if ``anthropic`` is not installed.
    """

    def __init__(self) -> None:
        # Patched classes and their original ``create`` methods; populated by
        # ``_patch`` and used by ``_unpatch`` to restore the SDK.
        self._Messages: Any = None
        self._AsyncMessages: Any = None
        self._original_sync_create: Any = None
        self._original_async_create: Any = None
        self._patched: bool = False

    # -- context manager protocol ------------------------------------------

    def __enter__(self) -> "AnthropicAdapter":
        self._patch()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self._unpatch()

    async def __aenter__(self) -> "AnthropicAdapter":
        self._patch()
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self._unpatch()

    # -- patching -----------------------------------------------------------

    def _patch(self) -> None:
        """Replace ``Messages.create`` / ``AsyncMessages.create`` with recorders.

        Does nothing if this adapter instance is already patched.

        Raises:
            ImportError: if the ``anthropic`` package cannot be imported.
        """
        if self._patched:
            return

        # Function-scope import: only needed while building the wrappers.
        import functools

        try:
            from anthropic.resources.messages import (  # type: ignore[import]
                AsyncMessages,
                Messages,
            )
        except ImportError as exc:
            raise ImportError(
                "The 'anthropic' package is required for AnthropicAdapter. "
                "Install it with: pip install 'evalcraft[anthropic]'"
            ) from exc

        self._Messages = Messages
        self._AsyncMessages = AsyncMessages
        self._original_sync_create = Messages.create
        self._original_async_create = AsyncMessages.create

        # The wrappers are installed on the SDK classes, so their first
        # argument is the Messages instance; capture the adapter by closure.
        adapter = self
        original_sync = self._original_sync_create
        original_async = self._original_async_create

        @functools.wraps(original_sync)
        def patched_sync_create(self_messages: Any, *args: Any, **kwargs: Any) -> Any:
            start = time.monotonic()
            try:
                response = original_sync(self_messages, *args, **kwargs)
            except Exception as exc:
                duration_ms = (time.monotonic() - start) * 1000
                adapter._record_error_safely(kwargs, duration_ms, exc)
                raise
            duration_ms = (time.monotonic() - start) * 1000
            adapter._record_response(kwargs, response, duration_ms)
            return response

        @functools.wraps(original_async)
        async def patched_async_create(self_messages: Any, *args: Any, **kwargs: Any) -> Any:
            start = time.monotonic()
            try:
                response = await original_async(self_messages, *args, **kwargs)
            except Exception as exc:
                duration_ms = (time.monotonic() - start) * 1000
                adapter._record_error_safely(kwargs, duration_ms, exc)
                raise
            duration_ms = (time.monotonic() - start) * 1000
            adapter._record_response(kwargs, response, duration_ms)
            return response

        Messages.create = patched_sync_create  # type: ignore[method-assign]
        AsyncMessages.create = patched_async_create  # type: ignore[method-assign]
        self._patched = True

    def _unpatch(self) -> None:
        """Restore the original ``create`` methods saved by ``_patch``."""
        if not self._patched:
            return
        if self._Messages is not None and self._original_sync_create is not None:
            self._Messages.create = self._original_sync_create  # type: ignore[method-assign]
        if self._AsyncMessages is not None and self._original_async_create is not None:
            self._AsyncMessages.create = self._original_async_create  # type: ignore[method-assign]
        self._patched = False

    # -- recording helpers --------------------------------------------------

    def _record_response(self, kwargs: dict[str, Any], response: Any, duration_ms: float) -> None:
        """Record a successful call on the active capture context, if any.

        Args:
            kwargs: Keyword arguments the caller passed to ``create()``.
            response: The value returned by the original ``create()``.
            duration_ms: Wall-clock duration of the call in milliseconds.
        """
        ctx = get_active_context()
        if ctx is None:
            return

        # Prefer the model reported by the response; an explicit ``model=None``
        # kwarg falls through to the placeholder instead of leaking ``None``.
        model: str = (
            getattr(response, "model", None) or kwargs.get("model") or _UNKNOWN_MODEL
        )
        messages = kwargs.get("messages", [])
        input_str = _messages_to_str(messages) if isinstance(messages, list) else str(messages)
        output_str = _response_to_str(response)

        input_tokens = 0
        output_tokens = 0
        try:
            usage = response.usage
            if usage:
                input_tokens = getattr(usage, "input_tokens", 0) or 0
                output_tokens = getattr(usage, "output_tokens", 0) or 0
        except AttributeError:
            # Response without a ``usage`` attribute: keep zero token counts.
            pass

        cost_usd = _estimate_cost(model, input_tokens, output_tokens)

        ctx.record_llm_call(
            model=model,
            input=input_str,
            output=output_str,
            duration_ms=duration_ms,
            prompt_tokens=input_tokens,
            completion_tokens=output_tokens,
            cost_usd=cost_usd,
            metadata={"stop_reason": _get_stop_reason(response)},
        )

    def _record_error_safely(
        self, kwargs: dict[str, Any], duration_ms: float, exc: BaseException
    ) -> None:
        """Record a failed call without ever masking the caller's exception.

        Runs inside the wrapper's ``except`` block: anything raised here would
        replace the API error the caller expects to see, so recording
        failures are logged and suppressed.
        """
        try:
            self._record_error(kwargs, duration_ms, str(exc))
        except Exception:
            import logging

            logging.getLogger(__name__).warning(
                "evalcraft: failed to record Anthropic error span", exc_info=True
            )

    def _record_error(self, kwargs: dict[str, Any], duration_ms: float, error: str) -> None:
        """Record a failed call as an error span on the active context, if any.

        Args:
            kwargs: Keyword arguments the caller passed to ``create()``.
            duration_ms: Time until the call raised, in milliseconds.
            error: String form of the exception that was raised.
        """
        ctx = get_active_context()
        if ctx is None:
            return

        model: str = kwargs.get("model") or _UNKNOWN_MODEL
        messages = kwargs.get("messages", [])
        input_str = _messages_to_str(messages) if isinstance(messages, list) else str(messages)

        span = Span(
            kind=SpanKind.LLM_RESPONSE,
            name=f"llm:{model}",
            duration_ms=duration_ms,
            input=input_str,
            output=None,
            model=model,
            error=error,
        )
        ctx.record_span(span)