agentreplay 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,202 @@
1
+ # Copyright 2025 Sushanth (https://github.com/sushanthpy)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Bootstrap module for zero-code auto-instrumentation.
16
+
17
+ This module is called by the .pth file on Python startup when AGENTREPLAY_ENABLED=true.
18
+ It initializes OpenTelemetry instrumentation with minimal overhead.
19
+
20
+ Environment Variables:
21
+ AGENTREPLAY_ENABLED: Set to 'true' to enable auto-instrumentation
22
+ AGENTREPLAY_SERVICE_NAME: Service name for traces (default: 'agentreplay-app')
23
+ AGENTREPLAY_OTLP_ENDPOINT: OTLP gRPC endpoint (default: 'localhost:47117')
24
+ AGENTREPLAY_PROJECT_ID: Project ID for traces
25
+ AGENTREPLAY_TENANT_ID: Tenant ID for traces (default: 1)
26
+ AGENTREPLAY_DEBUG: Enable debug logging (default: false)
27
+ AGENTREPLAY_CAPTURE_CONTENT: Capture LLM request/response content (default: true)
28
+ OTEL_EXPORTER_OTLP_ENDPOINT: Standard OTEL endpoint override
29
+
30
+ Example:
31
+ # Option 1: Automatic via .pth file
32
+ $ export AGENTREPLAY_ENABLED=true
33
+ $ export AGENTREPLAY_PROJECT_ID=27986
34
+ $ python my_app.py # Auto-instrumented!
35
+
36
+ # Option 2: Manual initialization
37
+ >>> from agentreplay.bootstrap import init_otel_instrumentation
38
+ >>> init_otel_instrumentation()
39
+ """
40
+
41
+ import os
42
+ import logging
43
+ from typing import Optional
44
+
45
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Process-wide guard: flipped to True once instrumentation setup succeeds so
# that later calls to init_otel_instrumentation() return early.
_initialized = False
49
+
50
+
51
def _env_int(name: str, default: int) -> int:
    """Read environment variable *name* as an integer.

    Returns *default* when the variable is unset. When the value cannot be
    parsed as an integer, a warning is logged and *default* is returned.
    """
    raw = os.getenv(name, str(default))
    try:
        return int(raw)
    except ValueError:
        logger.warning(f"Invalid {name}: {raw}, using {default}")
        return default


def _env_flag(name: str, default: str) -> bool:
    """Read environment variable *name* as a boolean flag.

    The value (or *default* when unset) is truthy when, ignoring case and
    surrounding whitespace, it is one of ``1``, ``true`` or ``yes``.
    """
    return os.getenv(name, default).strip().lower() in {"1", "true", "yes"}


def init_otel_instrumentation(
    service_name: Optional[str] = None,
    otlp_endpoint: Optional[str] = None,
    project_id: Optional[int] = None,
    tenant_id: Optional[int] = None,
    capture_content: Optional[bool] = None,
    debug: Optional[bool] = None,
) -> bool:
    """Initialize Agentreplay instrumentation once per process.

    Resolves every setting (explicit argument first, then environment
    variable, then built-in default) and hands the result to
    ``agentreplay.auto_instrument.setup_instrumentation``. The function
    never raises: any error during setup is logged and reported through the
    return value so the host application keeps running.

    Args:
        service_name: Service name. Falls back to AGENTREPLAY_SERVICE_NAME,
            then 'agentreplay-app'.
        otlp_endpoint: OTLP endpoint. Falls back to
            AGENTREPLAY_OTLP_ENDPOINT, then OTEL_EXPORTER_OTLP_ENDPOINT,
            then 'localhost:47117'.
        project_id: Project ID. Falls back to AGENTREPLAY_PROJECT_ID, then 0.
        tenant_id: Tenant ID. Falls back to AGENTREPLAY_TENANT_ID, then 1.
        capture_content: Capture LLM content. Falls back to
            AGENTREPLAY_CAPTURE_CONTENT, then True.
        debug: Enable debug logging. Falls back to AGENTREPLAY_DEBUG, then
            False. When true, ``logging.basicConfig(level=DEBUG)`` is called,
            which affects the root logger.

    Returns:
        True if initialization succeeded. False if instrumentation was
        already initialized, or if setup failed (the failure is logged).

    Example:
        >>> from agentreplay.bootstrap import init_otel_instrumentation
        >>> init_otel_instrumentation(
        ...     service_name="my-agent",
        ...     project_id=27986
        ... )
    """
    global _initialized

    if _initialized:
        logger.debug("Agentreplay already initialized, skipping")
        return False

    # Chain with `or` so that an env var set to the empty string is treated
    # like an unset one instead of producing an empty name/endpoint.
    service_name = (
        service_name
        or os.getenv("AGENTREPLAY_SERVICE_NAME")
        or "agentreplay-app"
    )
    otlp_endpoint = (
        otlp_endpoint
        or os.getenv("AGENTREPLAY_OTLP_ENDPOINT")
        or os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT")
        or "localhost:47117"
    )

    # Project/tenant IDs
    if project_id is None:
        project_id = _env_int("AGENTREPLAY_PROJECT_ID", 0)
    if tenant_id is None:
        tenant_id = _env_int("AGENTREPLAY_TENANT_ID", 1)

    # Flags
    if capture_content is None:
        capture_content = _env_flag("AGENTREPLAY_CAPTURE_CONTENT", "true")
    if debug is None:
        debug = _env_flag("AGENTREPLAY_DEBUG", "false")

    if debug:
        logging.basicConfig(level=logging.DEBUG)
        logger.setLevel(logging.DEBUG)

    try:
        # Import here to avoid loading OTEL on every Python startup
        from agentreplay.auto_instrument import setup_instrumentation

        logger.info(f"🚀 Initializing Agentreplay for service: {service_name}")
        logger.debug(f" OTLP Endpoint: {otlp_endpoint}")
        logger.debug(f" Project ID: {project_id}")
        logger.debug(f" Tenant ID: {tenant_id}")
        logger.debug(f" Capture Content: {capture_content}")

        setup_instrumentation(
            service_name=service_name,
            otlp_endpoint=otlp_endpoint,
            tenant_id=tenant_id,
            project_id=project_id,
            capture_content=capture_content,
            debug=debug,
        )
    except Exception as e:
        # Don't crash the user's app - fail open. The traceback is only
        # attached to the log record when debug logging was requested.
        logger.error(f"❌ Failed to initialize Agentreplay: {e}", exc_info=debug)
        return False

    # Only mark as initialized after setup completed without raising, so a
    # failed attempt can be retried.
    _initialized = True
    logger.info("✅ Agentreplay initialization complete")
    return True
152
+
153
+
154
def _auto_init():
    """Entry point invoked by the .pth file on Python startup.

    Steps:
      1. If a ``.env`` file exists in the current working directory and
         python-dotenv is importable, load it without overriding variables
         that are already set.
      2. Return silently unless AGENTREPLAY_ENABLED is ``1``, ``true`` or
         ``yes`` (case-insensitive, surrounding whitespace ignored).
      3. Call ``init_otel_instrumentation()`` with no arguments so that
         every setting, including AGENTREPLAY_DEBUG, is taken from the
         environment.

    This function never raises; problems are reported on stderr (or at debug
    log level for .env loading) so the host application is not affected.
    """
    # Try to load .env file first (if python-dotenv is available)
    if os.path.exists(".env"):
        try:
            from dotenv import load_dotenv

            load_dotenv(".env", override=False)  # Don't override existing env vars
        except ImportError:
            pass  # python-dotenv not installed, no problem
        except Exception:
            # Best effort only: a broken .env must not break interpreter
            # startup, but leave a trace for anyone debugging it.
            logger.debug("Agentreplay could not load .env file", exc_info=True)

    enabled = os.getenv("AGENTREPLAY_ENABLED", "").strip().lower()
    if enabled not in {"1", "true", "yes"}:
        # Not enabled, skip silently
        return

    try:
        # No forced debug flag: AGENTREPLAY_DEBUG decides, so enabling the SDK
        # does not silently switch the application's root logger to DEBUG.
        init_otel_instrumentation()
    except Exception as e:
        # Fail open - don't break user's app if SDK has issues
        import sys

        print(f"Agentreplay auto-init failed: {e}", file=sys.stderr)
183
+
184
+
185
def is_initialized() -> bool:
    """Report the current value of the module's initialization flag.

    Returns:
        True once init_otel_instrumentation() has completed successfully
        (and reset_initialization() has not been called since), else False.
    """
    return _initialized
192
+
193
+
194
def reset_initialization():
    """Clear the initialization flag so initialization can run again.

    Warning:
        Intended for tests only. Nothing is torn down here: the function
        merely sets the module-level flag back to False.
    """
    global _initialized
    _initialized = False
@@ -0,0 +1,300 @@
1
+ # Copyright 2025 Sushanth (https://github.com/sushanthpy)
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ """Circuit Breaker pattern for Agentreplay backend resilience.
16
+
17
+ This module implements a circuit breaker to prevent cascading failures when
18
+ the Agentreplay backend is unavailable. The circuit breaker has three states:
19
+
20
+ - CLOSED: Normal operation, requests pass through
21
+ - OPEN: Backend is failing, requests are rejected immediately
22
+ - HALF_OPEN: Testing recovery, requests pass through again; any failure reopens the circuit
23
+
24
+ The circuit breaker helps maintain application responsiveness during backend
25
+ outages by failing fast rather than blocking on retries.
26
+
27
+ Usage:
28
+ >>> from agentreplay.circuit_breaker import CircuitBreaker, CircuitBreakerOpen
29
+ >>>
30
+ >>> breaker = CircuitBreaker()
31
+ >>>
32
+ >>> try:
33
+ ... with breaker:
34
+ ... send_spans_to_backend()
35
+ ... except CircuitBreakerOpen:
36
+ ... logger.warning("Agentreplay backend unavailable, dropping spans")
37
+ """
38
+
39
+ import time
40
+ import threading
41
+ import logging
42
+ from enum import Enum
43
+ from typing import Optional, Callable, Any
44
+ from functools import wraps
45
+
46
# Module-level logger, named after this module; used for state-change messages.
logger = logging.getLogger(__name__)
47
+
48
+
49
class CircuitState(Enum):
    """Circuit breaker states."""

    CLOSED = "closed"  # Normal operation
    OPEN = "open"  # Failing, reject requests
    HALF_OPEN = "half_open"  # Testing recovery


class CircuitBreakerOpen(Exception):
    """Exception raised when circuit breaker is open."""


class CircuitBreaker:
    """Circuit breaker for backend resilience; state is guarded by a lock.

    State machine:
      - CLOSED -> OPEN when ``failure_threshold`` failures are recorded
        within the last ``failure_window`` seconds.
      - OPEN -> HALF_OPEN once ``recovery_timeout`` seconds have elapsed
        (evaluated lazily when the state is queried).
      - HALF_OPEN -> CLOSED after ``success_threshold`` recorded successes.
      - HALF_OPEN -> OPEN on any recorded failure.

    Args:
        failure_threshold: Number of failures before opening circuit (default: 5)
        recovery_timeout: Seconds before attempting recovery (default: 30)
        success_threshold: Successes needed to close circuit from half-open (default: 3)
        failure_window: Time window in seconds for counting failures (default: 60)

    Example:
        >>> breaker = CircuitBreaker(failure_threshold=3, recovery_timeout=60)
        >>>
        >>> @breaker.protect
        ... def send_to_backend():
        ...     response = requests.post(...)
        ...     response.raise_for_status()
        ...     return response
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        recovery_timeout: float = 30.0,
        success_threshold: int = 3,
        failure_window: float = 60.0,
    ):
        self.failure_threshold = failure_threshold
        self.recovery_timeout = recovery_timeout
        self.success_threshold = success_threshold
        self.failure_window = failure_window

        self._state = CircuitState.CLOSED
        self._failure_count = 0
        self._success_count = 0
        self._last_failure_time: Optional[float] = None
        self._last_state_change: float = time.time()
        self._lock = threading.Lock()

        # Track failure times for windowed counting
        self._failure_times: list[float] = []

    @property
    def state(self) -> CircuitState:
        """Get current circuit state, applying any due transition first."""
        with self._lock:
            self._check_state_transition()
            return self._state

    @property
    def is_closed(self) -> bool:
        """Check if circuit is closed (normal operation)."""
        return self.state == CircuitState.CLOSED

    @property
    def is_open(self) -> bool:
        """Check if circuit is open (rejecting requests)."""
        return self.state == CircuitState.OPEN

    def _prune_failures(self, now: float) -> None:
        """Drop failures older than the window (called with lock held)."""
        cutoff = now - self.failure_window
        self._failure_times = [t for t in self._failure_times if t > cutoff]
        self._failure_count = len(self._failure_times)

    def _check_state_transition(self) -> None:
        """Check if state should transition (called with lock held)."""
        now = time.time()

        if self._state == CircuitState.OPEN:
            # Check if recovery timeout has passed
            if now - self._last_state_change >= self.recovery_timeout:
                self._transition_to(CircuitState.HALF_OPEN)
        elif self._state == CircuitState.CLOSED:
            # Clean up old failures outside the window
            self._prune_failures(now)

    def _transition_to(self, new_state: CircuitState) -> None:
        """Transition to a new state (called with lock held)."""
        old_state = self._state
        self._state = new_state
        self._last_state_change = time.time()

        if new_state == CircuitState.CLOSED:
            self._failure_count = 0
            self._failure_times.clear()
            self._success_count = 0
        elif new_state == CircuitState.HALF_OPEN:
            self._success_count = 0

        logger.info(
            f"Circuit breaker state change: {old_state.value} -> {new_state.value}"
        )

    def record_success(self) -> None:
        """Record a successful request.

        Only counts while HALF_OPEN; reaching ``success_threshold`` closes
        the circuit. In other states this is a no-op.
        """
        with self._lock:
            if self._state == CircuitState.HALF_OPEN:
                self._success_count += 1
                if self._success_count >= self.success_threshold:
                    self._transition_to(CircuitState.CLOSED)

    def record_failure(self, error: Optional[Exception] = None) -> None:
        """Record a failed request.

        Args:
            error: The exception that caused the failure. Accepted for
                caller convenience; it is not inspected.
        """
        now = time.time()

        with self._lock:
            self._last_failure_time = now

            if self._state == CircuitState.CLOSED:
                # Expire stale failures first so only failures inside the
                # window count toward the threshold, even when callers use
                # record_failure() directly without allow_request().
                self._prune_failures(now)
                self._failure_times.append(now)
                self._failure_count = len(self._failure_times)

                if self._failure_count >= self.failure_threshold:
                    self._transition_to(CircuitState.OPEN)
                    logger.warning(
                        f"Circuit breaker opened after {self._failure_count} failures. "
                        f"Will retry after {self.recovery_timeout}s."
                    )

            elif self._state == CircuitState.HALF_OPEN:
                # Any failure in half-open state reopens the circuit
                self._transition_to(CircuitState.OPEN)
                logger.warning("Circuit breaker reopened after recovery test failure.")

    def allow_request(self) -> bool:
        """Check if a request should be allowed through.

        Returns:
            True if request is allowed (CLOSED or HALF_OPEN), False if
            circuit is open
        """
        with self._lock:
            self._check_state_transition()
            # CLOSED passes normally; HALF_OPEN lets test requests through.
            return self._state != CircuitState.OPEN

    def __enter__(self) -> "CircuitBreaker":
        """Context manager entry - check if request is allowed.

        Raises:
            CircuitBreakerOpen: If the circuit is currently open.
        """
        if not self.allow_request():
            with self._lock:
                elapsed = time.time() - self._last_state_change
            # Clamp at zero: another thread may have changed state between
            # the check above and this read.
            remaining = max(0.0, self.recovery_timeout - elapsed)
            raise CircuitBreakerOpen(
                f"Circuit breaker is open. Recovery in {remaining:.1f}s"
            )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Context manager exit - record success or failure."""
        if exc_type is None:
            self.record_success()
        elif issubclass(exc_type, Exception):
            self.record_failure(exc_val)
        # Non-Exception BaseExceptions (KeyboardInterrupt, SystemExit,
        # GeneratorExit, task cancellation) say nothing about backend health,
        # so they are recorded as neither success nor failure.
        return False  # Don't suppress exceptions

    def protect(self, func: Callable) -> Callable:
        """Decorator to protect a function with the circuit breaker.

        Args:
            func: Function to protect

        Returns:
            Wrapped function that respects circuit breaker state

        Example:
            >>> @breaker.protect
            ... def send_spans():
            ...     pass
        """
        @wraps(func)
        def wrapper(*args, **kwargs) -> Any:
            with self:
                return func(*args, **kwargs)
        return wrapper

    def reset(self) -> None:
        """Manually reset the circuit breaker to closed state."""
        with self._lock:
            self._transition_to(CircuitState.CLOSED)
            logger.info("Circuit breaker manually reset")

    def stats(self) -> dict:
        """Get circuit breaker statistics.

        Returns:
            Dictionary with current state and counters. No pending state
            transition is applied before the snapshot is taken.
        """
        with self._lock:
            return {
                "state": self._state.value,
                "failure_count": self._failure_count,
                "success_count": self._success_count,
                "last_failure_time": self._last_failure_time,
                "last_state_change": self._last_state_change,
                "seconds_in_state": time.time() - self._last_state_change,
            }
256
+
257
+
258
# Shared default breaker for the Agentreplay backend. It stays None until
# get_circuit_breaker() or configure_circuit_breaker() assigns an instance.
_default_breaker: Optional[CircuitBreaker] = None
260
+
261
+
262
def get_circuit_breaker() -> CircuitBreaker:
    """Return the shared default circuit breaker.

    On first use, when no default exists yet, a CircuitBreaker with default
    settings is created and stored; later calls return that same instance.

    Returns:
        Default CircuitBreaker instance
    """
    global _default_breaker
    breaker = _default_breaker
    if breaker is None:
        breaker = CircuitBreaker()
        _default_breaker = breaker
    return breaker
274
+
275
+
276
def configure_circuit_breaker(
    failure_threshold: int = 5,
    recovery_timeout: float = 30.0,
    success_threshold: int = 3,
    failure_window: float = 60.0,
) -> CircuitBreaker:
    """Replace the shared default circuit breaker with a newly built one.

    Any previously stored default instance is discarded, so its state and
    counters do not carry over.

    Args:
        failure_threshold: Number of failures before opening circuit
        recovery_timeout: Seconds before attempting recovery
        success_threshold: Successes needed to close circuit from half-open
        failure_window: Time window in seconds for counting failures

    Returns:
        The newly created CircuitBreaker, now installed as the default
    """
    global _default_breaker
    settings = {
        "failure_threshold": failure_threshold,
        "recovery_timeout": recovery_timeout,
        "success_threshold": success_threshold,
        "failure_window": failure_window,
    }
    replacement = CircuitBreaker(**settings)
    _default_breaker = replacement
    return replacement