asap-protocol 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
asap/transport/client.py CHANGED
@@ -20,15 +20,30 @@ Example:
20
20
  ... print(response.payload_type)
21
21
  """
22
22
 
23
+ import asyncio
24
+ import itertools
25
+ import random
23
26
  import time
24
- from typing import Any
27
+ from dataclasses import dataclass
28
+ from email.utils import parsedate_to_datetime
29
+ from typing import Any, Optional
30
+ from urllib.parse import ParseResult
25
31
 
26
32
  import httpx
27
33
 
34
+ from asap.errors import CircuitOpenError
35
+ from asap.models.constants import (
36
+ DEFAULT_BASE_DELAY,
37
+ DEFAULT_CIRCUIT_BREAKER_THRESHOLD,
38
+ DEFAULT_CIRCUIT_BREAKER_TIMEOUT,
39
+ DEFAULT_MAX_DELAY,
40
+ )
28
41
  from asap.models.envelope import Envelope
29
42
  from asap.models.ids import generate_id
30
43
  from asap.observability import get_logger
44
+ from asap.transport.circuit_breaker import CircuitBreaker, CircuitState, get_registry
31
45
  from asap.transport.jsonrpc import ASAP_METHOD
46
+ from asap.utils.sanitization import sanitize_url
32
47
 
33
48
  # Module logger
34
49
  logger = get_logger(__name__)
@@ -40,6 +55,32 @@ DEFAULT_TIMEOUT = 60.0
40
55
  DEFAULT_MAX_RETRIES = 3
41
56
 
42
57
 
58
+ @dataclass
59
+ class RetryConfig:
60
+ """Configuration for retry logic and circuit breaker.
61
+
62
+ Groups retry and circuit breaker parameters to simplify client initialization
63
+ and avoid boolean trap issues.
64
+
65
+ Attributes:
66
+ max_retries: Maximum retry attempts for transient failures (default: 3)
67
+ base_delay: Base delay in seconds for exponential backoff (default: 1.0)
68
+ max_delay: Maximum delay in seconds for exponential backoff (default: 60.0)
69
+ jitter: Whether to add random jitter to backoff delays (default: True)
70
+ circuit_breaker_enabled: Enable circuit breaker pattern (default: False)
71
+ circuit_breaker_threshold: Number of consecutive failures before opening circuit (default: 5)
72
+ circuit_breaker_timeout: Seconds before transitioning OPEN -> HALF_OPEN (default: 60.0)
73
+ """
74
+
75
+ max_retries: int = DEFAULT_MAX_RETRIES
76
+ base_delay: float = DEFAULT_BASE_DELAY
77
+ max_delay: float = DEFAULT_MAX_DELAY
78
+ jitter: bool = True
79
+ circuit_breaker_enabled: bool = False
80
+ circuit_breaker_threshold: int = DEFAULT_CIRCUIT_BREAKER_THRESHOLD
81
+ circuit_breaker_timeout: float = DEFAULT_CIRCUIT_BREAKER_TIMEOUT
82
+
83
+
43
84
  class ASAPConnectionError(Exception):
44
85
  """Raised when connection to remote agent fails.
45
86
 
@@ -47,20 +88,36 @@ class ASAPConnectionError(Exception):
47
88
  or when the remote server returns an HTTP error status.
48
89
 
49
90
  Attributes:
50
- message: Error description
91
+ message: Error description with troubleshooting suggestions
51
92
  cause: Original exception that caused this error
93
+ url: URL that failed to connect (if available)
52
94
  """
53
95
 
54
- def __init__(self, message: str, cause: Exception | None = None) -> None:
96
+ def __init__(
97
+ self, message: str, cause: Exception | None = None, url: str | None = None
98
+ ) -> None:
55
99
  """Initialize connection error.
56
100
 
57
101
  Args:
58
102
  message: Error description
59
103
  cause: Original exception that caused this error
104
+ url: URL that failed to connect (for better error messages)
60
105
  """
61
- super().__init__(message)
62
- self.message = message
106
+ # Enhance message with troubleshooting suggestions if URL is provided
107
+ if url and "Verify" not in message and "troubleshooting" not in message.lower():
108
+ enhanced_message = (
109
+ f"{message}\n"
110
+ f"Troubleshooting: Connection failed to {url}. "
111
+ "Verify the agent is running and accessible. "
112
+ "Check the URL format, network connectivity, and firewall settings."
113
+ )
114
+ else:
115
+ enhanced_message = message
116
+
117
+ super().__init__(enhanced_message)
118
+ self.message = enhanced_message
63
119
  self.cause = cause
120
+ self.url = url
64
121
 
65
122
 
66
123
  class ASAPTimeoutError(Exception):
@@ -125,28 +182,101 @@ class ASAPClient:
125
182
  base_url: Base URL of the remote agent
126
183
  timeout: Request timeout in seconds
127
184
  max_retries: Maximum retry attempts for transient failures
185
+ require_https: Whether HTTPS is required for non-localhost connections
128
186
  is_connected: Whether the client has an active connection
187
+ _circuit_breaker: Optional circuit breaker instance
129
188
 
130
189
  Example:
131
190
  >>> async with ASAPClient("http://localhost:8000") as client:
132
191
  ... response = await client.send(envelope)
133
192
  """
134
193
 
194
+ _circuit_breaker: Optional[CircuitBreaker]
195
+
135
196
  def __init__(
136
197
  self,
137
198
  base_url: str,
138
199
  timeout: float = DEFAULT_TIMEOUT,
139
- max_retries: int = DEFAULT_MAX_RETRIES,
140
- transport: httpx.AsyncBaseTransport | httpx.BaseTransport | None = None,
200
+ transport: httpx.AsyncBaseTransport | None = None,
201
+ require_https: bool = True,
202
+ retry_config: Optional[RetryConfig] = None,
203
+ # Individual retry parameters (for backward compatibility)
204
+ # If retry_config is provided, these are ignored
205
+ max_retries: int | None = None,
206
+ base_delay: float | None = None,
207
+ max_delay: float | None = None,
208
+ jitter: bool | None = None,
209
+ circuit_breaker_enabled: bool | None = None,
210
+ circuit_breaker_threshold: int | None = None,
211
+ circuit_breaker_timeout: float | None = None,
141
212
  ) -> None:
142
213
  """Initialize ASAP client.
143
214
 
144
215
  Args:
145
216
  base_url: Base URL of the remote agent (e.g., "http://localhost:8000")
146
217
  timeout: Request timeout in seconds (default: 60)
147
- max_retries: Maximum retry attempts for transient failures (default: 3)
148
- transport: Optional custom transport (for testing). Can be sync or async.
218
+ transport: Optional custom async transport (for testing). Must be an instance
219
+ of httpx.AsyncBaseTransport (e.g., httpx.MockTransport).
220
+ require_https: If True, enforces HTTPS for non-localhost connections (default: True).
221
+ HTTP connections to localhost are allowed with a warning for development.
222
+ retry_config: Optional RetryConfig dataclass to group retry and circuit breaker parameters.
223
+ If provided, individual retry parameters are ignored.
224
+ max_retries: Maximum retry attempts for transient failures (default: 3).
225
+ Ignored if retry_config is provided.
226
+ base_delay: Base delay in seconds for exponential backoff (default: 1.0).
227
+ Ignored if retry_config is provided.
228
+ max_delay: Maximum delay in seconds for exponential backoff (default: 60.0).
229
+ Ignored if retry_config is provided.
230
+ jitter: Whether to add random jitter to backoff delays (default: True).
231
+ Ignored if retry_config is provided.
232
+ circuit_breaker_enabled: Enable circuit breaker pattern (default: False).
233
+ Ignored if retry_config is provided.
234
+ circuit_breaker_threshold: Number of consecutive failures before opening circuit (default: 5).
235
+ Ignored if retry_config is provided.
236
+ circuit_breaker_timeout: Seconds before transitioning OPEN -> HALF_OPEN (default: 60.0).
237
+ Ignored if retry_config is provided.
238
+
239
+ Raises:
240
+ ValueError: If URL format is invalid, scheme is not HTTP/HTTPS, or HTTPS is
241
+ required but URL uses HTTP for non-localhost connections.
242
+
243
+ Example:
244
+ >>> # Using individual parameters (backward compatible)
245
+ >>> client = ASAPClient("http://localhost:8000", max_retries=5)
246
+ >>>
247
+ >>> # Using RetryConfig (recommended)
248
+ >>> config = RetryConfig(max_retries=5, circuit_breaker_enabled=True)
249
+ >>> client = ASAPClient("http://localhost:8000", retry_config=config)
149
250
  """
251
+ # Extract retry config values
252
+ if retry_config is not None:
253
+ # Use retry_config values
254
+ max_retries_val = retry_config.max_retries
255
+ base_delay_val = retry_config.base_delay
256
+ max_delay_val = retry_config.max_delay
257
+ jitter_val = retry_config.jitter
258
+ circuit_breaker_enabled_val = retry_config.circuit_breaker_enabled
259
+ circuit_breaker_threshold_val = retry_config.circuit_breaker_threshold
260
+ circuit_breaker_timeout_val = retry_config.circuit_breaker_timeout
261
+ else:
262
+ # Use individual parameters with defaults
263
+ max_retries_val = max_retries if max_retries is not None else DEFAULT_MAX_RETRIES
264
+ base_delay_val = base_delay if base_delay is not None else DEFAULT_BASE_DELAY
265
+ max_delay_val = max_delay if max_delay is not None else DEFAULT_MAX_DELAY
266
+ jitter_val = jitter if jitter is not None else True
267
+ circuit_breaker_enabled_val = (
268
+ circuit_breaker_enabled if circuit_breaker_enabled is not None else False
269
+ )
270
+ circuit_breaker_threshold_val = (
271
+ circuit_breaker_threshold
272
+ if circuit_breaker_threshold is not None
273
+ else DEFAULT_CIRCUIT_BREAKER_THRESHOLD
274
+ )
275
+ circuit_breaker_timeout_val = (
276
+ circuit_breaker_timeout
277
+ if circuit_breaker_timeout is not None
278
+ else DEFAULT_CIRCUIT_BREAKER_TIMEOUT
279
+ )
150
280
  # Validate URL format and scheme
151
281
  from urllib.parse import urlparse
152
282
 
@@ -163,12 +293,184 @@ class ASAPClient:
163
293
  f"Received: {base_url}"
164
294
  )
165
295
 
296
+ # Validate HTTPS requirement
297
+ is_https = parsed.scheme.lower() == "https"
298
+ is_local = self._is_localhost(parsed)
299
+
300
+ if require_https and not is_https:
301
+ if is_local:
302
+ # Allow HTTP for localhost with warning
303
+ logger.warning(
304
+ "asap.client.http_localhost",
305
+ url=base_url,
306
+ message=(
307
+ "Using HTTP for localhost connection. "
308
+ "For production, use HTTPS. "
309
+ "To disable this warning, set require_https=False."
310
+ ),
311
+ )
312
+ else:
313
+ # Reject HTTP for non-localhost
314
+ raise ValueError(
315
+ f"HTTPS is required for non-localhost connections. "
316
+ f"Received HTTP URL: {base_url}. "
317
+ f"Please use HTTPS or set require_https=False to override "
318
+ f"(not recommended for production)."
319
+ )
320
+
166
321
  self.base_url = base_url.rstrip("/")
167
322
  self.timeout = timeout
168
- self.max_retries = max_retries
323
+ self.max_retries = max_retries_val
324
+ self.require_https = require_https
325
+ self.base_delay = base_delay_val
326
+ self.max_delay = max_delay_val
327
+ self.jitter = jitter_val
328
+ self.circuit_breaker_enabled = circuit_breaker_enabled_val
169
329
  self._transport = transport
170
330
  self._client: httpx.AsyncClient | None = None
171
- self._request_counter = 0
331
+ # Thread-safe counter using itertools.count
332
+ self._request_counter = itertools.count(1)
333
+
334
+ # Initialize circuit breaker if enabled
335
+ # Use registry to ensure state is shared across multiple client instances
336
+ # for the same base_url
337
+ if circuit_breaker_enabled_val:
338
+ registry = get_registry()
339
+ self._circuit_breaker = registry.get_or_create(
340
+ base_url=sanitize_url(self.base_url),
341
+ threshold=circuit_breaker_threshold_val,
342
+ timeout=circuit_breaker_timeout_val,
343
+ )
344
+ else:
345
+ self._circuit_breaker = None
346
+
347
+ @staticmethod
348
+ def _is_localhost(parsed_url: ParseResult) -> bool:
349
+ """Check if URL points to localhost.
350
+
351
+ Detects localhost, 127.0.0.1, and ::1 (IPv6 localhost).
352
+
353
+ Args:
354
+ parsed_url: Parsed URL from urlparse
355
+
356
+ Returns:
357
+ True if URL points to localhost, False otherwise
358
+ """
359
+ hostname = parsed_url.hostname
360
+ if not hostname:
361
+ return False
362
+
363
+ hostname_lower = hostname.lower()
364
+ # Handle both ::1 and [::1] (bracket notation from URL parsing)
365
+ return hostname_lower in ("localhost", "127.0.0.1", "::1", "[::1]")
366
+
367
+ def _calculate_backoff(self, attempt: int) -> float:
368
+ """Calculate exponential backoff delay for retry attempt.
369
+
370
+ Implements exponential backoff with optional jitter:
371
+ delay = base_delay * (2 ** attempt) + jitter
372
+
373
+ The delay is capped at max_delay to prevent excessively long waits.
374
+
375
+ Args:
376
+ attempt: Zero-based attempt number (0 = first retry)
377
+
378
+ Returns:
379
+ Delay in seconds before next retry attempt
380
+ """
381
+ # Calculate exponential delay: base_delay * (2 ** attempt)
382
+ delay = self.base_delay * (2**attempt)
383
+
384
+ # Cap at max_delay
385
+ delay = min(delay, self.max_delay)
386
+
387
+ # Add jitter if enabled (random value between 0 and 10% of delay)
388
+ # Note: random.uniform is appropriate here - jitter for retry backoff
389
+ # does not require cryptographic security, only statistical distribution
390
+ if self.jitter:
391
+ jitter_amount: float = random.uniform(0, delay * 0.1) # nosec B311
392
+ delay += jitter_amount
393
+
394
+ return float(delay)
395
+
396
+ async def _validate_connection(self) -> bool:
397
+ """Validate that the agent endpoint is accessible.
398
+
399
+ Performs a pre-flight check by attempting to access the agent's
400
+ manifest endpoint. This can be used to detect connection issues
401
+ before sending actual requests.
402
+
403
+ Note: This is an optional validation step that can be disabled
404
+ for performance reasons in production environments.
405
+
406
+ Returns:
407
+ True if connection is valid, False otherwise
408
+
409
+ Raises:
410
+ ASAPConnectionError: If connection validation fails
411
+ """
412
+ if not self._client:
413
+ raise ASAPConnectionError(
414
+ "Client not connected. Use 'async with' context.",
415
+ url=sanitize_url(self.base_url),
416
+ )
417
+
418
+ try:
419
+ # Try to access a lightweight endpoint (manifest or health check)
420
+ # Using HEAD request to minimize bandwidth
421
+ response = await self._client.head(
422
+ f"{self.base_url}/.well-known/asap/manifest.json",
423
+ timeout=min(self.timeout, 5.0), # Shorter timeout for validation
424
+ )
425
+ # Any 2xx or 3xx response indicates the server is reachable
426
+ is_valid = 200 <= response.status_code < 400
427
+ if not is_valid:
428
+ logger.warning(
429
+ "asap.client.connection_validation_failed",
430
+ target_url=sanitize_url(self.base_url),
431
+ status_code=response.status_code,
432
+ message=(
433
+ f"Connection validation failed for {self.base_url}. "
434
+ f"Server returned status {response.status_code}. "
435
+ f"Verify the agent is running and the URL is correct."
436
+ ),
437
+ )
438
+ return is_valid
439
+ except httpx.ConnectError as e:
440
+ logger.warning(
441
+ "asap.client.connection_validation_failed",
442
+ target_url=sanitize_url(self.base_url),
443
+ error=str(e),
444
+ message=(
445
+ f"Connection validation failed for {self.base_url}. "
446
+ f"Cannot reach the agent. Verify the agent is running and accessible. "
447
+ f"Error: {str(e)[:200]}"
448
+ ),
449
+ )
450
+ return False
451
+ except httpx.TimeoutException:
452
+ logger.warning(
453
+ "asap.client.connection_validation_timeout",
454
+ target_url=sanitize_url(self.base_url),
455
+ timeout=self.timeout,
456
+ message=(
457
+ f"Connection validation timed out for {self.base_url}. "
458
+ f"Check network connectivity and firewall settings."
459
+ ),
460
+ )
461
+ return False
462
+ except Exception as e:
463
+ logger.warning(
464
+ "asap.client.connection_validation_error",
465
+ target_url=sanitize_url(self.base_url),
466
+ error=str(e),
467
+ error_type=type(e).__name__,
468
+ message=(
469
+ f"Connection validation encountered an error for {self.base_url}: {e}. "
470
+ f"Verify the agent is running and accessible."
471
+ ),
472
+ )
473
+ return False
172
474
 
173
475
  @property
174
476
  def is_connected(self) -> bool:
@@ -179,10 +481,8 @@ class ASAPClient:
179
481
  """Enter async context and open connection."""
180
482
  # Create the async client
181
483
  if self._transport:
182
- # MockTransport works for both sync and async, so we cast it
183
- # This is safe because httpx.MockTransport is compatible with async usage
184
484
  self._client = httpx.AsyncClient(
185
- transport=self._transport, # type: ignore[arg-type]
485
+ transport=self._transport,
186
486
  timeout=self.timeout,
187
487
  )
188
488
  else:
@@ -215,35 +515,56 @@ class ASAPClient:
215
515
  Response envelope from the remote agent
216
516
 
217
517
  Raises:
518
+ ValueError: If envelope is None
218
519
  ASAPConnectionError: If connection fails or HTTP error occurs
219
520
  ASAPTimeoutError: If request times out
220
521
  ASAPRemoteError: If remote agent returns JSON-RPC error
522
+ CircuitOpenError: If circuit breaker is open and request is rejected
221
523
 
222
524
  Example:
223
525
  >>> async with ASAPClient("http://localhost:8000") as client:
224
526
  ... response = await client.send(envelope)
225
527
  ... response.payload_type
226
528
  """
529
+ if envelope is None:
530
+ raise ValueError("envelope cannot be None")
531
+
227
532
  if not self._client:
228
- raise ASAPConnectionError("Client not connected. Use 'async with' context.")
533
+ raise ASAPConnectionError(
534
+ "Client not connected. Use 'async with' context.",
535
+ url=sanitize_url(self.base_url),
536
+ )
537
+
538
+ # Check circuit breaker state before attempting request
539
+ if self._circuit_breaker is not None and not self._circuit_breaker.can_attempt():
540
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
541
+ raise CircuitOpenError(
542
+ base_url=sanitize_url(self.base_url),
543
+ consecutive_failures=consecutive_failures,
544
+ )
229
545
 
230
546
  start_time = time.perf_counter()
231
547
 
232
548
  # Generate idempotency key for retries
233
549
  idempotency_key = generate_id()
234
550
 
235
- # Increment request counter for JSON-RPC id
236
- self._request_counter += 1
237
- request_id = f"req-{self._request_counter}"
551
+ # Get next request counter value (thread-safe)
552
+ request_id = f"req-{next(self._request_counter)}"
238
553
 
239
- # Log send attempt
554
+ # Log send attempt with context (sanitize URL to hide credentials)
555
+ sanitized_url = sanitize_url(self.base_url)
240
556
  logger.info(
241
557
  "asap.client.send",
242
- target_url=self.base_url,
558
+ target_url=sanitized_url,
243
559
  envelope_id=envelope.id,
244
560
  trace_id=envelope.trace_id,
245
561
  payload_type=envelope.payload_type,
246
562
  idempotency_key=idempotency_key,
563
+ max_retries=self.max_retries,
564
+ message=(
565
+ f"Sending envelope {envelope.id} to {sanitized_url} "
566
+ f"(payload: {envelope.payload_type}, max_retries: {self.max_retries})"
567
+ ),
247
568
  )
248
569
 
249
570
  # Build JSON-RPC request
@@ -273,24 +594,166 @@ class ASAPClient:
273
594
  # Check HTTP status
274
595
  if response.status_code >= 500:
275
596
  # Server errors (5xx) are retriable
597
+ error_msg = (
598
+ f"HTTP server error {response.status_code} from {self.base_url}. "
599
+ f"Server returned: {response.text[:200]}"
600
+ )
276
601
  if attempt < self.max_retries - 1:
602
+ delay = self._calculate_backoff(attempt)
277
603
  logger.warning(
278
604
  "asap.client.server_error",
279
605
  status_code=response.status_code,
280
606
  attempt=attempt + 1,
281
607
  max_retries=self.max_retries,
608
+ delay_seconds=round(delay, 2),
609
+ target_url=sanitize_url(self.base_url),
610
+ message=f"Server error {response.status_code}, retrying in {delay:.2f}s (attempt {attempt + 1}/{self.max_retries})",
611
+ )
612
+ logger.info(
613
+ "asap.client.retry",
614
+ target_url=sanitize_url(self.base_url),
615
+ envelope_id=envelope.id,
616
+ attempt=attempt + 1,
617
+ max_retries=self.max_retries,
618
+ delay_seconds=round(delay, 2),
619
+ )
620
+ await asyncio.sleep(delay)
621
+ last_exception = ASAPConnectionError(error_msg, url=self.base_url)
622
+ continue
623
+ # All retries exhausted, record failure in circuit breaker
624
+ if self._circuit_breaker is not None:
625
+ previous_state = self._circuit_breaker.get_state()
626
+ self._circuit_breaker.record_failure()
627
+ current_state = self._circuit_breaker.get_state()
628
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
629
+ # Log state change if circuit opened
630
+ if previous_state != current_state and current_state == CircuitState.OPEN:
631
+ logger.warning(
632
+ "asap.client.circuit_opened",
633
+ target_url=sanitize_url(self.base_url),
634
+ consecutive_failures=consecutive_failures,
635
+ threshold=self._circuit_breaker.threshold,
636
+ message=f"Circuit breaker opened after {consecutive_failures} consecutive failures",
637
+ )
638
+ raise ASAPConnectionError(error_msg, url=self.base_url)
639
+ if response.status_code == 429:
640
+ # Rate limit (429) is retriable, respect Retry-After header
641
+ if attempt < self.max_retries - 1:
642
+ # Check for Retry-After header
643
+ retry_after = response.headers.get("Retry-After")
644
+ if retry_after:
645
+ retry_delay: Optional[float] = None
646
+ # Retry-After can be seconds (int/float) or HTTP date
647
+ # First, try to parse as seconds (numeric)
648
+ if retry_after.replace(".", "", 1).isdigit():
649
+ try:
650
+ retry_delay = float(retry_after)
651
+ logger.info(
652
+ "asap.client.retry_after",
653
+ target_url=sanitize_url(self.base_url),
654
+ envelope_id=envelope.id,
655
+ attempt=attempt + 1,
656
+ retry_after_seconds=retry_delay,
657
+ message=f"Respecting server Retry-After: {retry_delay}s",
658
+ )
659
+ except ValueError:
660
+ pass # Fall through to date parsing
661
+ else:
662
+ # Try to parse as HTTP date
663
+ try:
664
+ retry_date = parsedate_to_datetime(retry_after)
665
+ if retry_date:
666
+ # Calculate delay in seconds from now until retry_date
667
+ now_timestamp = time.time()
668
+ retry_timestamp = retry_date.timestamp()
669
+ calculated_delay = retry_timestamp - now_timestamp
670
+ # If date is in the past or delay is invalid, fall back to calculated backoff
671
+ if calculated_delay <= 0:
672
+ retry_delay = None # Will trigger fallback
673
+ else:
674
+ retry_delay = calculated_delay
675
+ logger.info(
676
+ "asap.client.retry_after",
677
+ target_url=sanitize_url(self.base_url),
678
+ envelope_id=envelope.id,
679
+ attempt=attempt + 1,
680
+ retry_after_seconds=round(retry_delay, 2),
681
+ retry_after_date=retry_after,
682
+ message=f"Respecting server Retry-After date: {retry_after} ({retry_delay:.2f}s)",
683
+ )
684
+ except (ValueError, TypeError, AttributeError, OSError):
685
+ # Invalid date format or timestamp conversion error, fall back to calculated backoff
686
+ pass
687
+
688
+ # If parsing failed or delay is invalid (None or <= 0), use calculated backoff
689
+ if retry_delay is None or retry_delay <= 0:
690
+ retry_delay = self._calculate_backoff(attempt)
691
+ logger.warning(
692
+ "asap.client.retry_after_invalid",
693
+ target_url=sanitize_url(self.base_url),
694
+ envelope_id=envelope.id,
695
+ retry_after_header=retry_after,
696
+ fallback_delay=round(retry_delay, 2),
697
+ message="Invalid Retry-After format, using calculated backoff",
698
+ )
699
+ delay = retry_delay
700
+ else:
701
+ # No Retry-After header, use calculated backoff
702
+ delay = self._calculate_backoff(attempt)
703
+ logger.warning(
704
+ "asap.client.rate_limited",
705
+ status_code=429,
706
+ attempt=attempt + 1,
707
+ max_retries=self.max_retries,
708
+ delay_seconds=round(delay, 2),
709
+ )
710
+ logger.info(
711
+ "asap.client.retry",
712
+ target_url=sanitize_url(self.base_url),
713
+ envelope_id=envelope.id,
714
+ attempt=attempt + 1,
715
+ max_retries=self.max_retries,
716
+ delay_seconds=round(delay, 2),
282
717
  )
718
+ await asyncio.sleep(delay)
283
719
  last_exception = ASAPConnectionError(
284
- f"HTTP server error {response.status_code}: {response.text}"
720
+ f"HTTP rate limit error 429 from {self.base_url}. "
721
+ f"Server response: {response.text[:200]}",
722
+ url=sanitize_url(self.base_url),
285
723
  )
286
724
  continue
725
+ # All retries exhausted, record failure in circuit breaker
726
+ if self._circuit_breaker is not None:
727
+ previous_state = self._circuit_breaker.get_state()
728
+ self._circuit_breaker.record_failure()
729
+ current_state = self._circuit_breaker.get_state()
730
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
731
+ # Log state change if circuit opened
732
+ if previous_state != current_state and current_state == CircuitState.OPEN:
733
+ logger.warning(
734
+ "asap.client.circuit_opened",
735
+ target_url=sanitize_url(self.base_url),
736
+ consecutive_failures=consecutive_failures,
737
+ threshold=self._circuit_breaker.threshold,
738
+ message=f"Circuit breaker opened after {consecutive_failures} consecutive failures (rate limited)",
739
+ )
287
740
  raise ASAPConnectionError(
288
- f"HTTP server error {response.status_code}: {response.text}"
741
+ f"HTTP rate limit error 429 from {self.base_url} after {self.max_retries} attempts. "
742
+ f"Server response: {response.text[:200]}",
743
+ url=sanitize_url(self.base_url),
289
744
  )
290
745
  if response.status_code >= 400:
291
- # Client errors (4xx) are not retriable
746
+ # Client errors (4xx) are not retriable (except 429 handled above)
747
+ # We record a failure in the circuit breaker here because persistent 4xx
748
+ # (like 401/403) can indicate an unhealthy configuration or system state.
749
+ if self._circuit_breaker is not None:
750
+ self._circuit_breaker.record_failure()
751
+
292
752
  raise ASAPConnectionError(
293
- f"HTTP client error {response.status_code}: {response.text}"
753
+ f"HTTP client error {response.status_code} from {self.base_url}. "
754
+ f"This indicates a problem with the request. "
755
+ f"Server response: {response.text[:200]}",
756
+ url=sanitize_url(self.base_url),
294
757
  )
295
758
 
296
759
  # Parse JSON response
@@ -301,6 +764,11 @@ class ASAPClient:
301
764
 
302
765
  # Check for JSON-RPC error
303
766
  if "error" in json_response:
767
+ # Record success pattern (service is reachable)
768
+ # A valid JSON-RPC error means the connection and transport are healthy
769
+ if self._circuit_breaker is not None:
770
+ self._circuit_breaker.record_success()
771
+
304
772
  error = json_response["error"]
305
773
  raise ASAPRemoteError(
306
774
  error.get("code", -32603),
@@ -316,11 +784,24 @@ class ASAPClient:
316
784
 
317
785
  response_envelope = Envelope(**envelope_data)
318
786
 
787
+ # Record success in circuit breaker
788
+ if self._circuit_breaker is not None:
789
+ previous_state = self._circuit_breaker.get_state()
790
+ self._circuit_breaker.record_success()
791
+ current_state = self._circuit_breaker.get_state()
792
+ # Log state change if circuit was closed
793
+ if previous_state != current_state and current_state == CircuitState.CLOSED:
794
+ logger.info(
795
+ "asap.client.circuit_closed",
796
+ target_url=sanitize_url(self.base_url),
797
+ message="Circuit breaker closed after successful request",
798
+ )
799
+
319
800
  # Calculate duration and log success
320
801
  duration_ms = (time.perf_counter() - start_time) * 1000
321
802
  logger.info(
322
803
  "asap.client.response",
323
- target_url=self.base_url,
804
+ target_url=sanitize_url(self.base_url),
324
805
  envelope_id=envelope.id,
325
806
  response_id=response_envelope.id,
326
807
  trace_id=envelope.trace_id,
@@ -330,70 +811,124 @@ class ASAPClient:
330
811
 
331
812
  return response_envelope
332
813
 
333
- except httpx.ConnectError as e:
334
- last_exception = ASAPConnectionError(f"Connection error: {e}", cause=e)
814
+ except (httpx.ConnectError, httpx.TimeoutException) as e:
815
+ is_timeout = isinstance(e, httpx.TimeoutException)
816
+ error_type = "Timeout" if is_timeout else "Connection error"
817
+ error_msg = (
818
+ f"{error_type} to {self.base_url}: {e}. "
819
+ f"Verify the agent is running and accessible."
820
+ )
821
+ if is_timeout:
822
+ last_exception = ASAPTimeoutError(
823
+ f"Request timeout after {self.timeout}s", timeout=self.timeout
824
+ )
825
+ else:
826
+ last_exception = ASAPConnectionError(error_msg, cause=e, url=self.base_url)
827
+
335
828
  # Log retry attempt
336
829
  if attempt < self.max_retries - 1:
830
+ delay = self._calculate_backoff(attempt)
337
831
  logger.warning(
338
832
  "asap.client.retry",
339
- target_url=self.base_url,
833
+ target_url=sanitize_url(self.base_url),
340
834
  envelope_id=envelope.id,
341
835
  attempt=attempt + 1,
342
836
  max_retries=self.max_retries,
343
837
  error=str(e),
838
+ delay_seconds=round(delay, 2),
839
+ message=(
840
+ f"{error_type} to {self.base_url} (attempt {attempt + 1}/{self.max_retries}). "
841
+ f"Retrying in {delay:.2f}s. "
842
+ f"Error: {str(e)[:100]}"
843
+ ),
344
844
  )
845
+ await asyncio.sleep(delay)
345
846
  continue
346
- # Log final failure
347
- duration_ms = (time.perf_counter() - start_time) * 1000
348
- logger.error(
349
- "asap.client.error",
350
- target_url=self.base_url,
351
- envelope_id=envelope.id,
352
- error="Connection failed after retries",
353
- error_type="ASAPConnectionError",
354
- duration_ms=round(duration_ms, 2),
355
- attempts=attempt + 1,
356
- )
357
- raise last_exception from e
358
847
 
359
- except httpx.TimeoutException as e:
848
+ # All retries exhausted, record failure in circuit breaker
849
+ if self._circuit_breaker is not None:
850
+ previous_state = self._circuit_breaker.get_state()
851
+ self._circuit_breaker.record_failure()
852
+ current_state = self._circuit_breaker.get_state()
853
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
854
+ # Log state change if circuit opened
855
+ if previous_state != current_state and current_state == CircuitState.OPEN:
856
+ logger.warning(
857
+ "asap.client.circuit_opened",
858
+ target_url=sanitize_url(self.base_url),
859
+ consecutive_failures=consecutive_failures,
860
+ threshold=self._circuit_breaker.threshold,
861
+ message=f"Circuit breaker opened after {consecutive_failures} consecutive failures",
862
+ )
863
+
864
+ # Log final failure with detailed context
360
865
  duration_ms = (time.perf_counter() - start_time) * 1000
361
- last_exception = ASAPTimeoutError(
362
- f"Request timeout after {self.timeout}s", timeout=self.timeout
363
- )
364
- # Log timeout (don't retry)
866
+ error_type_name = "ASAPTimeoutError" if is_timeout else "ASAPConnectionError"
365
867
  logger.error(
366
868
  "asap.client.error",
367
- target_url=self.base_url,
869
+ target_url=sanitize_url(self.base_url),
368
870
  envelope_id=envelope.id,
369
- error="Request timeout",
370
- error_type="ASAPTimeoutError",
371
- timeout=self.timeout,
871
+ error=f"{error_type} after retries",
872
+ error_type=error_type_name,
372
873
  duration_ms=round(duration_ms, 2),
874
+ attempts=attempt + 1,
875
+ max_retries=self.max_retries,
876
+ timeout=self.timeout if is_timeout else None,
877
+ message=(
878
+ f"{error_type} to {self.base_url} failed after {attempt + 1} attempts. "
879
+ f"Total duration: {duration_ms:.2f}ms. "
880
+ f"Troubleshooting: Verify the agent is running, check network connectivity, "
881
+ f"and ensure the URL is correct. Original error: {str(e)[:200]}"
882
+ ),
373
883
  )
374
884
  raise last_exception from e
375
885
 
376
886
  except (ASAPConnectionError, ASAPRemoteError, ASAPTimeoutError):
377
- # Re-raise our custom errors
887
+ # Re-raise our custom errors without recording failure again
888
+ # (failures are already recorded before these exceptions are raised)
378
889
  raise
379
890
 
380
891
  except Exception as e:
892
+ # Record failure in circuit breaker
893
+ if self._circuit_breaker is not None:
894
+ previous_state = self._circuit_breaker.get_state()
895
+ self._circuit_breaker.record_failure()
896
+ current_state = self._circuit_breaker.get_state()
897
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
898
+ # Log state change if circuit opened
899
+ if previous_state != current_state and current_state == CircuitState.OPEN:
900
+ logger.warning(
901
+ "asap.client.circuit_opened",
902
+ target_url=sanitize_url(self.base_url),
903
+ consecutive_failures=consecutive_failures,
904
+ threshold=self._circuit_breaker.threshold,
905
+ message=f"Circuit breaker opened after {consecutive_failures} consecutive failures",
906
+ )
381
907
  # Log unexpected error
382
908
  duration_ms = (time.perf_counter() - start_time) * 1000
383
909
  logger.exception(
384
910
  "asap.client.error",
385
- target_url=self.base_url,
911
+ target_url=sanitize_url(self.base_url),
386
912
  envelope_id=envelope.id,
387
913
  error=str(e),
388
914
  error_type=type(e).__name__,
389
915
  duration_ms=round(duration_ms, 2),
390
916
  )
391
917
  # Wrap unexpected errors
392
- raise ASAPConnectionError(f"Unexpected error: {e}", cause=e) from e
918
+ raise ASAPConnectionError(
919
+ f"Unexpected error connecting to {self.base_url}: {e}. "
920
+ f"Verify the agent is running and accessible.",
921
+ cause=e,
922
+ url=sanitize_url(self.base_url),
923
+ ) from e
393
924
 
394
925
  # Defensive code: This should never be reached because the loop above
395
926
  # always either returns successfully or raises an exception.
396
927
  # Kept as a safety net for future code changes.
397
928
  if last_exception: # pragma: no cover
398
929
  raise last_exception
399
- raise ASAPConnectionError("Max retries exceeded") # pragma: no cover
930
+ raise ASAPConnectionError(
931
+ f"Max retries ({self.max_retries}) exceeded for {self.base_url}. "
932
+ f"Verify the agent is running and accessible.",
933
+ url=sanitize_url(self.base_url),
934
+ ) # pragma: no cover