prela-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. prela/__init__.py +394 -0
  2. prela/_version.py +3 -0
  3. prela/contrib/CLI.md +431 -0
  4. prela/contrib/README.md +118 -0
  5. prela/contrib/__init__.py +5 -0
  6. prela/contrib/cli.py +1063 -0
  7. prela/contrib/explorer.py +571 -0
  8. prela/core/__init__.py +64 -0
  9. prela/core/clock.py +98 -0
  10. prela/core/context.py +228 -0
  11. prela/core/replay.py +403 -0
  12. prela/core/sampler.py +178 -0
  13. prela/core/span.py +295 -0
  14. prela/core/tracer.py +498 -0
  15. prela/evals/__init__.py +94 -0
  16. prela/evals/assertions/README.md +484 -0
  17. prela/evals/assertions/__init__.py +78 -0
  18. prela/evals/assertions/base.py +90 -0
  19. prela/evals/assertions/multi_agent.py +625 -0
  20. prela/evals/assertions/semantic.py +223 -0
  21. prela/evals/assertions/structural.py +443 -0
  22. prela/evals/assertions/tool.py +380 -0
  23. prela/evals/case.py +370 -0
  24. prela/evals/n8n/__init__.py +69 -0
  25. prela/evals/n8n/assertions.py +450 -0
  26. prela/evals/n8n/runner.py +497 -0
  27. prela/evals/reporters/README.md +184 -0
  28. prela/evals/reporters/__init__.py +32 -0
  29. prela/evals/reporters/console.py +251 -0
  30. prela/evals/reporters/json.py +176 -0
  31. prela/evals/reporters/junit.py +278 -0
  32. prela/evals/runner.py +525 -0
  33. prela/evals/suite.py +316 -0
  34. prela/exporters/__init__.py +27 -0
  35. prela/exporters/base.py +189 -0
  36. prela/exporters/console.py +443 -0
  37. prela/exporters/file.py +322 -0
  38. prela/exporters/http.py +394 -0
  39. prela/exporters/multi.py +154 -0
  40. prela/exporters/otlp.py +388 -0
  41. prela/instrumentation/ANTHROPIC.md +297 -0
  42. prela/instrumentation/LANGCHAIN.md +480 -0
  43. prela/instrumentation/OPENAI.md +59 -0
  44. prela/instrumentation/__init__.py +49 -0
  45. prela/instrumentation/anthropic.py +1436 -0
  46. prela/instrumentation/auto.py +129 -0
  47. prela/instrumentation/base.py +436 -0
  48. prela/instrumentation/langchain.py +959 -0
  49. prela/instrumentation/llamaindex.py +719 -0
  50. prela/instrumentation/multi_agent/__init__.py +48 -0
  51. prela/instrumentation/multi_agent/autogen.py +357 -0
  52. prela/instrumentation/multi_agent/crewai.py +404 -0
  53. prela/instrumentation/multi_agent/langgraph.py +299 -0
  54. prela/instrumentation/multi_agent/models.py +203 -0
  55. prela/instrumentation/multi_agent/swarm.py +231 -0
  56. prela/instrumentation/n8n/__init__.py +68 -0
  57. prela/instrumentation/n8n/code_node.py +534 -0
  58. prela/instrumentation/n8n/models.py +336 -0
  59. prela/instrumentation/n8n/webhook.py +489 -0
  60. prela/instrumentation/openai.py +1198 -0
  61. prela/license.py +245 -0
  62. prela/replay/__init__.py +31 -0
  63. prela/replay/comparison.py +390 -0
  64. prela/replay/engine.py +1227 -0
  65. prela/replay/loader.py +231 -0
  66. prela/replay/result.py +196 -0
  67. prela-0.1.0.dist-info/METADATA +399 -0
  68. prela-0.1.0.dist-info/RECORD +71 -0
  69. prela-0.1.0.dist-info/WHEEL +4 -0
  70. prela-0.1.0.dist-info/entry_points.txt +2 -0
  71. prela-0.1.0.dist-info/licenses/LICENSE +190 -0
prela/core/replay.py ADDED
@@ -0,0 +1,403 @@
+ """Replay capture for deterministic re-execution of AI agent workflows."""
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ from datetime import datetime, timezone
+ from typing import Any
+
+ logger = logging.getLogger(__name__)
+
+ # Storage size warning threshold (100 KB)
+ REPLAY_SIZE_WARNING_THRESHOLD = 100 * 1024
+
+
+ class ReplaySnapshot:
+     """Complete replay-enabling data for a span.
+
+     This class holds all information needed to deterministically replay
+     an agent execution, including full request/response data, tool I/O,
+     retrieval results, and agent state.
+
+     Different span types populate different fields:
+     - LLM spans: llm_request, llm_response, llm_streaming_chunks, model_info
+     - Tool spans: tool_name, tool_input, tool_output, has_side_effects
+     - Retrieval spans: retrieval_query, retrieved_documents, retrieval_metadata
+     - Agent spans: system_prompt, available_tools, agent_memory, agent_config
+
+     Memory efficiency: Uses __slots__ to minimize per-instance overhead.
+     """
+
+     __slots__ = (
+         # LLM fields
+         "llm_request",
+         "llm_response",
+         "llm_streaming_chunks",
+         "model_info",
+         "request_timestamp",
+         # Tool fields
+         "tool_name",
+         "tool_description",
+         "tool_input",
+         "tool_output",
+         "has_side_effects",
+         # Retrieval fields
+         "retrieval_query",
+         "retrieved_documents",
+         "retrieval_scores",
+         "retrieval_metadata",
+         # Agent fields
+         "system_prompt",
+         "available_tools",
+         "agent_memory",
+         "agent_config",
+     )
+
+     def __init__(
+         self,
+         # LLM fields
+         llm_request: dict[str, Any] | None = None,
+         llm_response: dict[str, Any] | None = None,
+         llm_streaming_chunks: list[dict[str, Any]] | None = None,
+         model_info: dict[str, Any] | None = None,
+         request_timestamp: datetime | None = None,
+         # Tool fields
+         tool_name: str | None = None,
+         tool_description: str | None = None,
+         tool_input: dict[str, Any] | str | None = None,
+         tool_output: Any = None,
+         has_side_effects: bool = True,
+         # Retrieval fields
+         retrieval_query: str | None = None,
+         retrieved_documents: list[dict[str, Any]] | None = None,
+         retrieval_scores: list[float] | None = None,
+         retrieval_metadata: dict[str, Any] | None = None,
+         # Agent fields
+         system_prompt: str | None = None,
+         available_tools: list[dict[str, Any]] | None = None,
+         agent_memory: dict[str, Any] | None = None,
+         agent_config: dict[str, Any] | None = None,
+     ) -> None:
+         """Initialize replay snapshot with optional fields."""
+         self.llm_request = llm_request
+         self.llm_response = llm_response
+         self.llm_streaming_chunks = llm_streaming_chunks
+         self.model_info = model_info
+         self.request_timestamp = request_timestamp
+         self.tool_name = tool_name
+         self.tool_description = tool_description
+         self.tool_input = tool_input
+         self.tool_output = tool_output
+         self.has_side_effects = has_side_effects
+         self.retrieval_query = retrieval_query
+         self.retrieved_documents = retrieved_documents
+         self.retrieval_scores = retrieval_scores
+         self.retrieval_metadata = retrieval_metadata
+         self.system_prompt = system_prompt
+         self.available_tools = available_tools
+         self.agent_memory = agent_memory
+         self.agent_config = agent_config
+
+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to JSON-compatible dict.
+
+         Returns:
+             Dictionary containing all non-None fields
+         """
+         result = {}
+
+         for field_name in self.__slots__:
+             value = getattr(self, field_name)
+             if value is not None:
+                 # Handle datetime serialization
+                 if isinstance(value, datetime):
+                     result[field_name] = value.isoformat()
+                 else:
+                     result[field_name] = value
+
+         return result
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> ReplaySnapshot:
+         """Deserialize from dict.
+
+         Args:
+             data: Dictionary from to_dict()
+
+         Returns:
+             ReplaySnapshot instance
+         """
+         # Convert ISO timestamp back to datetime
+         if "request_timestamp" in data and isinstance(data["request_timestamp"], str):
+             data["request_timestamp"] = datetime.fromisoformat(data["request_timestamp"])
+
+         return cls(**data)
+
+     def estimate_size_bytes(self) -> int:
+         """Estimate storage size in bytes.
+
+         This is an approximation based on JSON serialization size.
+         Useful for monitoring storage costs.
+
+         Logs a warning if size exceeds 100 KB threshold.
+
+         Returns:
+             Estimated size in bytes
+         """
+         serialized = json.dumps(self.to_dict())
+         size_bytes = len(serialized.encode("utf-8"))
+
+         # Warn if exceeds threshold
+         if size_bytes > REPLAY_SIZE_WARNING_THRESHOLD:
+             logger.warning(
+                 f"Replay snapshot size ({size_bytes / 1024:.1f} KB) exceeds "
+                 f"recommended threshold ({REPLAY_SIZE_WARNING_THRESHOLD / 1024:.0f} KB). "
+                 f"Consider reducing captured data or increasing storage budget."
+             )
+
+         return size_bytes
+
+
+ class ReplayCapture:
+     """Helper for building ReplaySnapshot during span execution.
+
+     This class provides a builder-style API for incrementally capturing
+     replay data as a span executes.
+
+     Example:
+         ```python
+         capture = ReplayCapture()
+         capture.set_llm_request(model="gpt-4", messages=[...])
+         capture.set_llm_response(text="...", tokens=100)
+         snapshot = capture.build()
+         ```
+     """
+
+     def __init__(self) -> None:
+         """Initialize empty capture."""
+         self._snapshot = ReplaySnapshot()
+
+     # LLM capture methods
+     def set_llm_request(
+         self,
+         model: str,
+         messages: list[dict[str, Any]] | None = None,
+         prompt: str | None = None,
+         temperature: float | None = None,
+         max_tokens: int | None = None,
+         **kwargs: Any,
+     ) -> None:
+         """Capture LLM request details.
+
+         Args:
+             model: Model identifier (e.g., "gpt-4", "claude-sonnet-4")
+             messages: Chat messages (OpenAI/Anthropic format)
+             prompt: Single prompt string (legacy completions)
+             temperature: Sampling temperature
+             max_tokens: Maximum tokens to generate
+             **kwargs: Additional provider-specific parameters
+         """
+         request: dict[str, Any] = {"model": model}
+
+         if messages is not None:
+             request["messages"] = messages
+         if prompt is not None:
+             request["prompt"] = prompt
+         if temperature is not None:
+             request["temperature"] = temperature
+         if max_tokens is not None:
+             request["max_tokens"] = max_tokens
+
+         # Capture all other kwargs (top_p, frequency_penalty, etc.)
+         request.update(kwargs)
+
+         self._snapshot.llm_request = request
+         self._snapshot.request_timestamp = datetime.now(timezone.utc)
+
+     def set_llm_response(
+         self,
+         text: str,
+         finish_reason: str | None = None,
+         model: str | None = None,
+         prompt_tokens: int | None = None,
+         completion_tokens: int | None = None,
+         **kwargs: Any,
+     ) -> None:
+         """Capture LLM response details.
+
+         Args:
+             text: Complete response text
+             finish_reason: Why generation stopped (stop, length, tool_calls)
+             model: Actual model used (may differ from requested)
+             prompt_tokens: Tokens in prompt
+             completion_tokens: Tokens in completion
+             **kwargs: Additional response metadata
+         """
+         response: dict[str, Any] = {"text": text}
+
+         if finish_reason is not None:
+             response["finish_reason"] = finish_reason
+         if model is not None:
+             response["model"] = model
+         if prompt_tokens is not None:
+             response["prompt_tokens"] = prompt_tokens
+         if completion_tokens is not None:
+             response["completion_tokens"] = completion_tokens
+
+         response.update(kwargs)
+         self._snapshot.llm_response = response
+
+     def add_streaming_chunk(
+         self,
+         chunk: dict[str, Any],
+     ) -> None:
+         """Add a streaming chunk to the replay data.
+
+         For streaming LLM responses, each delta/chunk is captured separately
+         to enable exact replay of streaming behavior.
+
+         Args:
+             chunk: Chunk data (provider-specific format)
+         """
+         if self._snapshot.llm_streaming_chunks is None:
+             self._snapshot.llm_streaming_chunks = []
+
+         self._snapshot.llm_streaming_chunks.append(chunk)
+
+     def set_model_info(self, **info: Any) -> None:
+         """Capture model version/endpoint info.
+
+         Args:
+             **info: Model metadata (version, endpoint, created timestamp, etc.)
+         """
+         self._snapshot.model_info = info
+
+     # Tool capture methods
+     def set_tool_call(
+         self,
+         name: str,
+         description: str | None = None,
+         input_args: dict[str, Any] | str | None = None,
+         output: Any = None,
+         has_side_effects: bool = True,  # SAFE DEFAULT
+     ) -> None:
+         """Capture tool call details.
+
+         Args:
+             name: Tool name
+             description: Tool description
+             input_args: Input arguments (dict or JSON string)
+             output: Tool output/return value
+             has_side_effects: Whether tool modifies external state (default: True)
+         """
+         self._snapshot.tool_name = name
+         self._snapshot.tool_description = description
+         self._snapshot.tool_input = input_args
+         self._snapshot.tool_output = output
+         self._snapshot.has_side_effects = has_side_effects
+
+     # Retrieval capture methods
+     def set_retrieval(
+         self,
+         query: str,
+         documents: list[dict[str, Any]],
+         scores: list[float] | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> None:
+         """Capture retrieval operation details.
+
+         Args:
+             query: Query text
+             documents: Retrieved documents (full content)
+             scores: Similarity scores for each document
+             metadata: Retrieval metadata (index name, collection, etc.)
+         """
+         self._snapshot.retrieval_query = query
+         self._snapshot.retrieved_documents = documents
+         self._snapshot.retrieval_scores = scores
+         self._snapshot.retrieval_metadata = metadata
+
+     # Agent capture methods
+     def set_agent_context(
+         self,
+         system_prompt: str | None = None,
+         available_tools: list[dict[str, Any]] | None = None,
+         memory: dict[str, Any] | None = None,
+         config: dict[str, Any] | None = None,
+     ) -> None:
+         """Capture agent context and configuration.
+
+         Args:
+             system_prompt: System/instruction prompt
+             available_tools: List of tools with schemas
+             memory: Agent memory/context state
+             config: Agent configuration
+         """
+         if system_prompt is not None:
+             self._snapshot.system_prompt = system_prompt
+         if available_tools is not None:
+             self._snapshot.available_tools = available_tools
+         if memory is not None:
+             self._snapshot.agent_memory = memory
+         if config is not None:
+             self._snapshot.agent_config = config
+
+     def build(self) -> ReplaySnapshot:
+         """Return the completed snapshot.
+
+         Returns:
+             ReplaySnapshot with all captured data
+         """
+         return self._snapshot
+
+
+ def estimate_replay_storage(
+     span: Any,  # Span type (avoid circular import)
+     replay_snapshot: ReplaySnapshot | None = None,
+ ) -> int:
+     """Estimate total storage size for span with replay data.
+
+     Args:
+         span: The span to estimate (must have .to_dict() method)
+         replay_snapshot: Optional replay snapshot (if not attached to span)
+
+     Returns:
+         Estimated size in bytes
+     """
+     # Base span size
+     span_dict = span.to_dict()
+     base_size = len(json.dumps(span_dict).encode("utf-8"))
+
+     # Replay data size
+     replay_size = 0
+     if replay_snapshot is not None:
+         replay_size = replay_snapshot.estimate_size_bytes()
+     elif hasattr(span, "replay_snapshot") and span.replay_snapshot is not None:
+         replay_size = span.replay_snapshot.estimate_size_bytes()
+
+     return base_size + replay_size
+
+
+ def serialize_replay_data(value: Any) -> Any:
+     """Serialize arbitrary Python values for replay storage.
+
+     Handles common types that may appear in tool I/O or agent state.
+
+     Args:
+         value: Value to serialize
+
+     Returns:
+         JSON-compatible value
+     """
+     if isinstance(value, (str, int, float, bool, type(None))):
+         return value
+     elif isinstance(value, datetime):
+         return value.isoformat()
+     elif isinstance(value, (list, tuple)):
+         return [serialize_replay_data(item) for item in value]
+     elif isinstance(value, dict):
+         return {k: serialize_replay_data(v) for k, v in value.items()}
+     else:
+         # Fallback: convert to string representation
+         return str(value)
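Taken together, `ReplayCapture` accumulates data during a span and `build()` returns a `ReplaySnapshot` that round-trips through `to_dict()`/`from_dict()` for storage. The following is a minimal usage sketch assuming only the APIs visible in this diff and that the module imports as `prela.core.replay`; it has not been run against the released wheel, and the tool name and arguments are hypothetical:

```python
import json

from prela.core.replay import ReplayCapture, ReplaySnapshot

# Build up replay data incrementally during span execution.
capture = ReplayCapture()
capture.set_llm_request(
    model="gpt-4",
    messages=[{"role": "user", "content": "Summarize the report."}],
    temperature=0.0,
)
capture.set_llm_response(
    text="The report covers...",
    finish_reason="stop",
    prompt_tokens=42,
    completion_tokens=17,
)
capture.set_tool_call(
    name="fetch_report",                 # hypothetical tool
    input_args={"report_id": "q3"},      # hypothetical arguments
    output={"status": "ok"},
    has_side_effects=False,              # explicitly marked safe to re-execute
)
snapshot = capture.build()

# Persist and restore: request_timestamp survives as an ISO-8601 string.
payload = json.dumps(snapshot.to_dict())
restored = ReplaySnapshot.from_dict(json.loads(payload))

# Emits a logger warning once the snapshot crosses the 100 KB threshold.
print(snapshot.estimate_size_bytes())
```

Note the conservative default: `has_side_effects` is `True` unless a tool is explicitly marked safe, which presumably lets a replay engine decline to re-execute anything that might mutate external state.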
prela/core/sampler.py ADDED
@@ -0,0 +1,178 @@
+ """Sampling strategies for trace collection.
+
+ This module provides different sampling strategies to control which traces
+ are collected and exported. Sampling helps reduce overhead and costs while
+ still providing useful observability data.
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ import time
+ from abc import ABC, abstractmethod
+ from threading import Lock
+
+
+ class BaseSampler(ABC):
+     """Abstract base class for trace samplers.
+
+     Samplers determine whether a trace should be collected based on
+     the trace ID and potentially other factors.
+     """
+
+     @abstractmethod
+     def should_sample(self, trace_id: str) -> bool:
+         """Determine if a trace should be sampled.
+
+         Args:
+             trace_id: The trace ID to make a sampling decision for
+
+         Returns:
+             True if the trace should be sampled, False otherwise
+         """
+         pass
+
+
+ class AlwaysOnSampler(BaseSampler):
+     """Sampler that always samples every trace.
+
+     Use this in development or when you need complete trace coverage.
+     Be aware this may generate high data volumes in production.
+     """
+
+     def should_sample(self, trace_id: str) -> bool:
+         """Always return True.
+
+         Args:
+             trace_id: The trace ID (unused)
+
+         Returns:
+             Always True
+         """
+         return True
+
+
+ class AlwaysOffSampler(BaseSampler):
+     """Sampler that never samples any traces.
+
+     Use this to completely disable tracing, for example during
+     maintenance windows or in testing environments.
+     """
+
+     def should_sample(self, trace_id: str) -> bool:
+         """Always return False.
+
+         Args:
+             trace_id: The trace ID (unused)
+
+         Returns:
+             Always False
+         """
+         return False
+
+
+ class ProbabilitySampler(BaseSampler):
+     """Sampler that samples traces with a fixed probability.
+
+     This sampler uses a deterministic hash-based approach to ensure
+     consistent sampling decisions for the same trace ID across
+     different services and processes.
+     """
+
+     def __init__(self, rate: float) -> None:
+         """Initialize the probability sampler.
+
+         Args:
+             rate: Sampling rate between 0.0 and 1.0 (inclusive)
+
+         Raises:
+             ValueError: If rate is not between 0.0 and 1.0
+         """
+         if not 0.0 <= rate <= 1.0:
+             raise ValueError(f"Sampling rate must be between 0.0 and 1.0, got {rate}")
+         self.rate = rate
+
+     def should_sample(self, trace_id: str) -> bool:
+         """Sample based on trace ID hash.
+
+         Uses MD5 hash of trace_id to make a deterministic sampling decision.
+         This ensures the same trace_id always gets the same sampling decision
+         across different processes and services.
+
+         Args:
+             trace_id: The trace ID to make a sampling decision for
+
+         Returns:
+             True if the trace should be sampled, False otherwise
+         """
+         if self.rate == 0.0:
+             return False
+         if self.rate == 1.0:
+             return True
+
+         # Use MD5 hash to get a deterministic value between 0 and 1
+         hash_bytes = hashlib.md5(trace_id.encode()).digest()
+         # Take first 8 bytes and convert to int, then normalize to [0, 1]
+         hash_value = int.from_bytes(hash_bytes[:8], byteorder="big")
+         probability = hash_value / (2**64 - 1)
+
+         return probability < self.rate
+
+
+ class RateLimitingSampler(BaseSampler):
+     """Sampler that limits the number of traces sampled per second.
+
+     This sampler uses a token bucket algorithm to enforce a maximum
+     rate of sampled traces per second. Useful for controlling costs
+     and backend load.
+     """
+
+     def __init__(self, traces_per_second: float) -> None:
+         """Initialize the rate limiting sampler.
+
+         Args:
+             traces_per_second: Maximum number of traces to sample per second
+
+         Raises:
+             ValueError: If traces_per_second is negative
+         """
+         if traces_per_second < 0:
+             raise ValueError(f"traces_per_second must be non-negative, got {traces_per_second}")
+
+         self.traces_per_second = traces_per_second
+         self._tokens = traces_per_second
+         self._last_update = time.perf_counter()
+         self._lock = Lock()
+
+     def should_sample(self, trace_id: str) -> bool:
+         """Sample if tokens are available.
+
+         Uses a token bucket algorithm: tokens regenerate at the configured
+         rate, and each sampling decision consumes one token.
+
+         Args:
+             trace_id: The trace ID (unused)
+
+         Returns:
+             True if a token is available, False otherwise
+         """
+         if self.traces_per_second == 0:
+             return False
+
+         with self._lock:
+             now = time.perf_counter()
+             elapsed = now - self._last_update
+
+             # Refill tokens based on elapsed time
+             self._tokens = min(
+                 self.traces_per_second,
+                 self._tokens + (elapsed * self.traces_per_second),
+             )
+             self._last_update = now
+
+             # Try to consume a token
+             if self._tokens >= 1.0:
+                 self._tokens -= 1.0
+                 return True
+
+             return False
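`ProbabilitySampler` maps the first 8 bytes of the trace ID's MD5 digest onto [0, 1] and compares the result against the rate, so every process reaches the same verdict for a given trace, while `RateLimitingSampler` consumes from a token bucket that refills at `traces_per_second`. A short behavioral sketch, assuming the module imports as `prela.core.sampler` (illustrative only, not verified against the released wheel):

```python
from prela.core.sampler import ProbabilitySampler, RateLimitingSampler

prob = ProbabilitySampler(rate=0.1)

# Deterministic: repeated calls with the same trace_id always agree,
# so distributed services stay consistent without coordination.
assert prob.should_sample("trace-abc") == prob.should_sample("trace-abc")

# Token bucket: the bucket starts full (2 tokens), so in a tight burst
# the first two traces pass and the third is dropped until tokens refill.
rate = RateLimitingSampler(traces_per_second=2)
print([rate.should_sample(f"trace-{i}") for i in range(3)])
# expected when called back-to-back: [True, True, False]
```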