agent-tool-resilience 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,215 @@
+ """
+ Retry policies with exponential backoff, jitter, and configurable conditions.
+ """
+
+ import asyncio
+ import random
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any, Callable, Optional, Sequence, Type, Union
+
+
+ class RetryError(Exception):
+     """Raised when all retry attempts have been exhausted."""
+
+     def __init__(self, message: str, attempts: int, last_exception: Optional[Exception] = None):
+         super().__init__(message)
+         self.attempts = attempts
+         self.last_exception = last_exception
+
+
+ @dataclass
+ class RetryPolicy:
+     """
+     Configurable retry policy with various backoff strategies.
+
+     Attributes:
+         max_attempts: Maximum number of attempts (including initial call)
+         backoff: Backoff strategy - "constant", "linear", "exponential"
+         base_delay: Base delay in seconds
+         max_delay: Maximum delay cap in seconds
+         jitter: Whether to add random jitter to delays
+         jitter_range: Range for jitter as (min_factor, max_factor)
+         retry_on: Exception types to retry on
+         retry_if: Optional predicate function for custom retry conditions
+         on_retry: Optional callback called before each retry
+     """
+     max_attempts: int = 3
+     backoff: str = "exponential"
+     base_delay: float = 1.0
+     max_delay: float = 60.0
+     jitter: bool = True
+     jitter_range: tuple[float, float] = (0.5, 1.5)
+     retry_on: Sequence[Type[Exception]] = field(default_factory=lambda: [Exception])
+     retry_if: Optional[Callable[[Exception], bool]] = None
+     on_retry: Optional[Callable[[int, Exception, float], None]] = None
+
+     def calculate_delay(self, attempt: int) -> float:
+         """Calculate delay for the given attempt number (0-indexed)."""
+         if self.backoff == "constant":
+             delay = self.base_delay
+         elif self.backoff == "linear":
+             delay = self.base_delay * (attempt + 1)
+         elif self.backoff == "exponential":
+             delay = self.base_delay * (2 ** attempt)
+         else:
+             raise ValueError(f"Unknown backoff strategy: {self.backoff}")
+
+         # Apply max delay cap
+         delay = min(delay, self.max_delay)
+
+         # Apply jitter
+         if self.jitter:
+             min_factor, max_factor = self.jitter_range
+             delay *= random.uniform(min_factor, max_factor)
+
+         return delay
+
+     def should_retry(self, exception: Exception) -> bool:
+         """Check if the exception should trigger a retry."""
+         # Check exception type
+         if not isinstance(exception, tuple(self.retry_on)):
+             return False
+
+         # Check custom predicate
+         if self.retry_if is not None:
+             return self.retry_if(exception)
+
+         return True
+
+     def execute(
+         self,
+         func: Callable[..., Any],
+         *args: Any,
+         **kwargs: Any
+     ) -> Any:
+         """
+         Execute a function with retry logic.
+
+         Args:
+             func: Function to execute
+             *args: Positional arguments for the function
+             **kwargs: Keyword arguments for the function
+
+         Returns:
+             The function's return value
+
+         Raises:
+             RetryError: If all retry attempts are exhausted
+         """
+         last_exception: Optional[Exception] = None
+
+         for attempt in range(self.max_attempts):
+             try:
+                 return func(*args, **kwargs)
+             except Exception as e:
+                 last_exception = e
+
+                 # Check if we should retry
+                 if attempt >= self.max_attempts - 1:
+                     break
+
+                 if not self.should_retry(e):
+                     raise
+
+                 # Calculate and apply delay
+                 delay = self.calculate_delay(attempt)
+
+                 # Call retry callback if provided
+                 if self.on_retry:
+                     self.on_retry(attempt + 1, e, delay)
+
+                 time.sleep(delay)
+
+         raise RetryError(
+             f"All {self.max_attempts} retry attempts exhausted",
+             attempts=self.max_attempts,
+             last_exception=last_exception
+         )
+
+     async def execute_async(
+         self,
+         func: Callable[..., Any],
+         *args: Any,
+         **kwargs: Any
+     ) -> Any:
+         """
+         Execute an async function with retry logic.
+
+         Args:
+             func: Async function to execute
+             *args: Positional arguments for the function
+             **kwargs: Keyword arguments for the function
+
+         Returns:
+             The function's return value
+
+         Raises:
+             RetryError: If all retry attempts are exhausted
+         """
+         last_exception: Optional[Exception] = None
+
+         for attempt in range(self.max_attempts):
+             try:
+                 return await func(*args, **kwargs)
+             except Exception as e:
+                 last_exception = e
+
+                 # Check if we should retry
+                 if attempt >= self.max_attempts - 1:
+                     break
+
+                 if not self.should_retry(e):
+                     raise
+
+                 # Calculate and apply delay
+                 delay = self.calculate_delay(attempt)
+
+                 # Call retry callback if provided
+                 if self.on_retry:
+                     self.on_retry(attempt + 1, e, delay)
+
+                 await asyncio.sleep(delay)
+
+         raise RetryError(
+             f"All {self.max_attempts} retry attempts exhausted",
+             attempts=self.max_attempts,
+             last_exception=last_exception
+         )
+
+
+ # Convenience functions for common retry policies
+ def no_retry() -> RetryPolicy:
+     """Create a policy that doesn't retry."""
+     return RetryPolicy(max_attempts=1)
+
+
+ def retry_with_backoff(
+     max_attempts: int = 3,
+     base_delay: float = 1.0,
+     max_delay: float = 60.0
+ ) -> RetryPolicy:
+     """Create an exponential backoff retry policy."""
+     return RetryPolicy(
+         max_attempts=max_attempts,
+         backoff="exponential",
+         base_delay=base_delay,
+         max_delay=max_delay,
+         jitter=True
+     )
+
+
+ def retry_on_network_errors(max_attempts: int = 5) -> RetryPolicy:
+     """Create a retry policy for common network errors."""
+     return RetryPolicy(
+         max_attempts=max_attempts,
+         backoff="exponential",
+         base_delay=1.0,
+         max_delay=30.0,
+         jitter=True,
+         retry_on=[
+             ConnectionError,
+             TimeoutError,
+             OSError,
+         ]
+     )
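
For orientation, here is a minimal usage sketch of the retry policy added above. It is illustrative only: the import path `agent_tool_resilience.retry` and the `flaky_call` helper are assumptions, since the diff does not show file names.

    # Illustrative sketch; the module path and flaky_call are assumed, not taken from the package.
    import random

    from agent_tool_resilience.retry import RetryError, RetryPolicy

    def flaky_call() -> str:
        # Stand-in for a tool call that fails transiently about half the time.
        if random.random() < 0.5:
            raise ConnectionError("simulated transient failure")
        return "ok"

    policy = RetryPolicy(
        max_attempts=4,
        backoff="exponential",
        base_delay=0.5,
        max_delay=5.0,
        retry_on=[ConnectionError],
        on_retry=lambda attempt, exc, delay: print(f"retry {attempt} after {exc!r}; sleeping {delay:.2f}s"),
    )

    try:
        print(policy.execute(flaky_call))
    except RetryError as err:
        print(f"gave up after {err.attempts} attempts: {err.last_exception!r}")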
@@ -0,0 +1,319 @@
+ """
+ Execution tracing for observability.
+ """
+
+ import json
+ import threading
+ import time
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from enum import Enum
+ from typing import Any, Callable, Optional
+
+
+ class ExecutionStatus(Enum):
+     """Status of a tool execution."""
+     PENDING = "pending"
+     RUNNING = "running"
+     SUCCESS = "success"
+     FAILED = "failed"
+     RETRYING = "retrying"
+     CIRCUIT_OPEN = "circuit_open"
+     FALLBACK = "fallback"
+
+
+ @dataclass
+ class ExecutionEvent:
+     """
+     A single execution event in the trace.
+
+     Attributes:
+         tool_name: Name of the tool being executed
+         attempt: Attempt number (1-indexed)
+         status: Status of this execution
+         timestamp: When this event occurred
+         duration_ms: Duration in milliseconds (if completed)
+         error: Error message if failed
+         error_type: Type of exception if failed
+         result_summary: Brief summary of result (if successful)
+         metadata: Additional context
+     """
+     tool_name: str
+     attempt: int
+     status: ExecutionStatus
+     timestamp: datetime = field(default_factory=datetime.now)
+     duration_ms: Optional[float] = None
+     error: Optional[str] = None
+     error_type: Optional[str] = None
+     result_summary: Optional[str] = None
+     metadata: dict = field(default_factory=dict)
+
+     def to_dict(self) -> dict:
+         """Convert to dictionary for serialization."""
+         return {
+             "tool_name": self.tool_name,
+             "attempt": self.attempt,
+             "status": self.status.value,
+             "timestamp": self.timestamp.isoformat(),
+             "duration_ms": self.duration_ms,
+             "error": self.error,
+             "error_type": self.error_type,
+             "result_summary": self.result_summary,
+             "metadata": self.metadata,
+         }
+
+
+ @dataclass
+ class ToolExecutionTracer:
+     """
+     Traces tool executions for observability.
+
+     Provides full visibility into what happened during tool execution,
+     including retries, fallbacks, and errors.
+
+     Attributes:
+         max_events: Maximum events to keep in memory
+         summarize_results: Whether to capture result summaries
+         result_max_length: Max length for result summaries
+         on_event: Callback for each event
+     """
+     max_events: int = 1000
+     summarize_results: bool = True
+     result_max_length: int = 200
+     on_event: Optional[Callable[[ExecutionEvent], None]] = None
+
+     _events: list[ExecutionEvent] = field(default_factory=list, init=False)
+     _lock: threading.Lock = field(default_factory=threading.Lock, init=False)
+     _active_executions: dict[str, tuple[int, float]] = field(
+         default_factory=dict, init=False
+     )
+
+     def _add_event(self, event: ExecutionEvent) -> None:
+         """Add an event to the trace."""
+         with self._lock:
+             self._events.append(event)
+
+             # Trim if over max
+             if len(self._events) > self.max_events:
+                 self._events = self._events[-self.max_events:]
+
+         if self.on_event:
+             self.on_event(event)
+
+     def _summarize_result(self, result: Any) -> Optional[str]:
+         """Create a brief summary of a result."""
+         if not self.summarize_results:
+             return None
+
+         try:
+             if result is None:
+                 return "None"
+             elif isinstance(result, (str, int, float, bool)):
+                 summary = str(result)
+             elif isinstance(result, dict):
+                 summary = f"dict({len(result)} keys)"
+             elif isinstance(result, (list, tuple)):
+                 summary = f"list({len(result)} items)"
+             else:
+                 summary = f"{type(result).__name__}"
+
+             if len(summary) > self.result_max_length:
+                 summary = summary[:self.result_max_length - 3] + "..."
+
+             return summary
+         except Exception:
+             return "unknown"
+
+     def start_execution(
+         self,
+         tool_name: str,
+         attempt: int = 1,
+         metadata: Optional[dict] = None
+     ) -> str:
+         """
+         Record the start of a tool execution.
+
+         Returns an execution ID for tracking.
+         """
+         exec_id = f"{tool_name}_{id(self)}_{time.time()}"
+
+         with self._lock:
+             self._active_executions[exec_id] = (attempt, time.time())
+
+         event = ExecutionEvent(
+             tool_name=tool_name,
+             attempt=attempt,
+             status=ExecutionStatus.RUNNING,
+             metadata=metadata or {},
+         )
+         self._add_event(event)
+
+         return exec_id
+
+     def record_success(
+         self,
+         exec_id: str,
+         tool_name: str,
+         attempt: int,
+         result: Any,
+         metadata: Optional[dict] = None
+     ) -> None:
+         """Record a successful execution."""
+         duration_ms = None
+
+         with self._lock:
+             if exec_id in self._active_executions:
+                 _, start_time = self._active_executions.pop(exec_id)
+                 duration_ms = (time.time() - start_time) * 1000
+
+         event = ExecutionEvent(
+             tool_name=tool_name,
+             attempt=attempt,
+             status=ExecutionStatus.SUCCESS,
+             duration_ms=duration_ms,
+             result_summary=self._summarize_result(result),
+             metadata=metadata or {},
+         )
+         self._add_event(event)
+
+     def record_failure(
+         self,
+         exec_id: str,
+         tool_name: str,
+         attempt: int,
+         error: Exception,
+         will_retry: bool = False,
+         metadata: Optional[dict] = None
+     ) -> None:
+         """Record a failed execution."""
+         duration_ms = None
+
+         with self._lock:
+             if exec_id in self._active_executions:
+                 _, start_time = self._active_executions.pop(exec_id)
+                 duration_ms = (time.time() - start_time) * 1000
+
+         status = ExecutionStatus.RETRYING if will_retry else ExecutionStatus.FAILED
+
+         event = ExecutionEvent(
+             tool_name=tool_name,
+             attempt=attempt,
+             status=status,
+             duration_ms=duration_ms,
+             error=str(error),
+             error_type=type(error).__name__,
+             metadata=metadata or {},
+         )
+         self._add_event(event)
+
+     def record_circuit_open(
+         self,
+         tool_name: str,
+         metadata: Optional[dict] = None
+     ) -> None:
+         """Record that execution was blocked by circuit breaker."""
+         event = ExecutionEvent(
+             tool_name=tool_name,
+             attempt=0,
+             status=ExecutionStatus.CIRCUIT_OPEN,
+             metadata=metadata or {},
+         )
+         self._add_event(event)
+
+     def record_fallback(
+         self,
+         tool_name: str,
+         fallback_name: str,
+         result: Any,
+         metadata: Optional[dict] = None
+     ) -> None:
+         """Record that a fallback was used."""
+         event = ExecutionEvent(
+             tool_name=tool_name,
+             attempt=0,
+             status=ExecutionStatus.FALLBACK,
+             result_summary=self._summarize_result(result),
+             metadata={"fallback_name": fallback_name, **(metadata or {})},
+         )
+         self._add_event(event)
+
+     def get_events(
+         self,
+         tool_name: Optional[str] = None,
+         status: Optional[ExecutionStatus] = None,
+         limit: Optional[int] = None
+     ) -> list[ExecutionEvent]:
+         """Get events with optional filtering."""
+         with self._lock:
+             events = self._events.copy()
+
+         if tool_name:
+             events = [e for e in events if e.tool_name == tool_name]
+
+         if status:
+             events = [e for e in events if e.status == status]
+
+         if limit:
+             events = events[-limit:]
+
+         return events
+
+     def get_execution_log(
+         self,
+         tool_name: Optional[str] = None,
+         limit: Optional[int] = None
+     ) -> list[dict]:
+         """Get execution log as list of dicts."""
+         events = self.get_events(tool_name=tool_name, limit=limit)
+         return [e.to_dict() for e in events]
+
+     def get_stats(self, tool_name: Optional[str] = None) -> dict:
+         """Get execution statistics."""
+         events = self.get_events(tool_name=tool_name)
+
+         if not events:
+             return {
+                 "total_executions": 0,
+                 "success_count": 0,
+                 "failure_count": 0,
+                 "retry_count": 0,
+                 "circuit_open_count": 0,
+                 "fallback_count": 0,
+                 "success_rate": 0.0,
+                 "avg_duration_ms": 0.0,
+             }
+
+         success_count = len([e for e in events if e.status == ExecutionStatus.SUCCESS])
+         failure_count = len([e for e in events if e.status == ExecutionStatus.FAILED])
+         retry_count = len([e for e in events if e.status == ExecutionStatus.RETRYING])
+         circuit_open_count = len([e for e in events if e.status == ExecutionStatus.CIRCUIT_OPEN])
+         fallback_count = len([e for e in events if e.status == ExecutionStatus.FALLBACK])
+
+         durations = [e.duration_ms for e in events if e.duration_ms is not None]
+         avg_duration = sum(durations) / len(durations) if durations else 0.0
+
+         total = success_count + failure_count
+         success_rate = success_count / total if total > 0 else 0.0
+
+         return {
+             "total_executions": total,
+             "success_count": success_count,
+             "failure_count": failure_count,
+             "retry_count": retry_count,
+             "circuit_open_count": circuit_open_count,
+             "fallback_count": fallback_count,
+             "success_rate": success_rate,
+             "avg_duration_ms": avg_duration,
+         }
+
+     def clear(self) -> None:
+         """Clear all events."""
+         with self._lock:
+             self._events.clear()
+             self._active_executions.clear()
+
+     def export_json(self, filepath: str) -> None:
+         """Export events to a JSON file."""
+         log = self.get_execution_log()
+         with open(filepath, "w") as f:
+             json.dump(log, f, indent=2)
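
Similarly, a minimal usage sketch of the tracer added above. The import path `agent_tool_resilience.tracing` and the `search_web` tool name are assumptions for illustration; only the methods shown in the diff are used.

    # Illustrative sketch; the module path and tool name are assumed, not taken from the package.
    from agent_tool_resilience.tracing import ToolExecutionTracer

    tracer = ToolExecutionTracer(on_event=lambda event: print(event.to_dict()))

    exec_id = tracer.start_execution("search_web", attempt=1)
    try:
        result = {"hits": 3}  # stand-in for the real tool call
        tracer.record_success(exec_id, "search_web", attempt=1, result=result)
    except Exception as err:
        tracer.record_failure(exec_id, "search_web", attempt=1, error=err, will_retry=False)

    print(tracer.get_stats("search_web"))  # aggregate counts and average duration
    tracer.export_json("trace.json")       # dump the full event log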