asap-protocol 0.3.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. asap/__init__.py +1 -1
  2. asap/cli.py +137 -2
  3. asap/errors.py +167 -0
  4. asap/examples/README.md +81 -10
  5. asap/examples/auth_patterns.py +212 -0
  6. asap/examples/error_recovery.py +248 -0
  7. asap/examples/long_running.py +287 -0
  8. asap/examples/mcp_integration.py +240 -0
  9. asap/examples/multi_step_workflow.py +134 -0
  10. asap/examples/orchestration.py +293 -0
  11. asap/examples/rate_limiting.py +137 -0
  12. asap/examples/run_demo.py +9 -4
  13. asap/examples/secure_handler.py +84 -0
  14. asap/examples/state_migration.py +240 -0
  15. asap/examples/streaming_response.py +108 -0
  16. asap/examples/websocket_concept.py +129 -0
  17. asap/mcp/__init__.py +43 -0
  18. asap/mcp/client.py +224 -0
  19. asap/mcp/protocol.py +179 -0
  20. asap/mcp/server.py +333 -0
  21. asap/mcp/server_runner.py +40 -0
  22. asap/models/__init__.py +4 -0
  23. asap/models/base.py +0 -3
  24. asap/models/constants.py +76 -1
  25. asap/models/entities.py +58 -7
  26. asap/models/envelope.py +14 -1
  27. asap/models/ids.py +8 -4
  28. asap/models/parts.py +33 -3
  29. asap/models/validators.py +16 -0
  30. asap/observability/__init__.py +6 -0
  31. asap/observability/dashboards/README.md +24 -0
  32. asap/observability/dashboards/asap-detailed.json +131 -0
  33. asap/observability/dashboards/asap-red.json +129 -0
  34. asap/observability/logging.py +81 -1
  35. asap/observability/metrics.py +15 -1
  36. asap/observability/trace_parser.py +238 -0
  37. asap/observability/trace_ui.py +218 -0
  38. asap/observability/tracing.py +293 -0
  39. asap/state/machine.py +15 -2
  40. asap/state/snapshot.py +0 -9
  41. asap/testing/__init__.py +31 -0
  42. asap/testing/assertions.py +108 -0
  43. asap/testing/fixtures.py +113 -0
  44. asap/testing/mocks.py +152 -0
  45. asap/transport/__init__.py +31 -0
  46. asap/transport/cache.py +180 -0
  47. asap/transport/circuit_breaker.py +194 -0
  48. asap/transport/client.py +989 -72
  49. asap/transport/compression.py +389 -0
  50. asap/transport/handlers.py +106 -53
  51. asap/transport/middleware.py +64 -39
  52. asap/transport/server.py +461 -94
  53. asap/transport/validators.py +320 -0
  54. asap/utils/__init__.py +7 -0
  55. asap/utils/sanitization.py +134 -0
  56. asap_protocol-1.0.0.dist-info/METADATA +264 -0
  57. asap_protocol-1.0.0.dist-info/RECORD +70 -0
  58. asap_protocol-0.3.0.dist-info/METADATA +0 -227
  59. asap_protocol-0.3.0.dist-info/RECORD +0 -37
  60. {asap_protocol-0.3.0.dist-info → asap_protocol-1.0.0.dist-info}/WHEEL +0 -0
  61. {asap_protocol-0.3.0.dist-info → asap_protocol-1.0.0.dist-info}/entry_points.txt +0 -0
  62. {asap_protocol-0.3.0.dist-info → asap_protocol-1.0.0.dist-info}/licenses/LICENSE +0 -0
asap/transport/client.py CHANGED
@@ -10,6 +10,7 @@ The ASAPClient provides:
10
10
  - Retry logic with idempotency keys
11
11
  - Proper error handling and timeouts
12
12
  - Structured logging for observability
13
+ - Compression support (gzip/brotli) for bandwidth reduction
13
14
 
14
15
  Example:
15
16
  >>> from asap.transport.client import ASAPClient
@@ -18,17 +19,45 @@ Example:
18
19
  >>> async with ASAPClient("http://agent.example.com") as client:
19
20
  ... response = await client.send(request_envelope)
20
21
  ... print(response.payload_type)
22
+ >>>
23
+ >>> # With compression enabled (default for payloads > 1KB)
24
+ >>> async with ASAPClient("http://agent.example.com", compression=True) as client:
25
+ ... response = await client.send(large_envelope) # Compressed automatically
21
26
  """
22
27
 
28
+ import asyncio
29
+ import itertools
30
+ import json
31
+ import random
23
32
  import time
24
- from typing import Any
33
+ from dataclasses import dataclass
34
+ from email.utils import parsedate_to_datetime
35
+ from typing import Any, Optional
36
+ from urllib.parse import ParseResult
25
37
 
26
38
  import httpx
27
39
 
40
+ from asap.errors import CircuitOpenError
41
+ from asap.models.constants import (
42
+ DEFAULT_BASE_DELAY,
43
+ DEFAULT_CIRCUIT_BREAKER_THRESHOLD,
44
+ DEFAULT_CIRCUIT_BREAKER_TIMEOUT,
45
+ DEFAULT_MAX_DELAY,
46
+ )
47
+ from asap.models.entities import Manifest
28
48
  from asap.models.envelope import Envelope
29
49
  from asap.models.ids import generate_id
30
- from asap.observability import get_logger
50
+ from asap.observability import get_logger, get_metrics
51
+ from asap.transport.cache import DEFAULT_MAX_SIZE, ManifestCache
52
+ from asap.transport.circuit_breaker import CircuitBreaker, CircuitState, get_registry
53
+ from asap.transport.compression import (
54
+ COMPRESSION_THRESHOLD,
55
+ CompressionAlgorithm,
56
+ compress_payload,
57
+ get_accept_encoding_header,
58
+ )
31
59
  from asap.transport.jsonrpc import ASAP_METHOD
60
+ from asap.utils.sanitization import sanitize_url
32
61
 
33
62
  # Module logger
34
63
  logger = get_logger(__name__)
@@ -39,6 +68,56 @@ DEFAULT_TIMEOUT = 60.0
39
68
  # Default maximum retries
40
69
  DEFAULT_MAX_RETRIES = 3
41
70
 
71
+ # Connection pool defaults (support 1000+ concurrent via reuse)
72
+ DEFAULT_POOL_CONNECTIONS = 100
73
+ DEFAULT_POOL_MAXSIZE = 100
74
+ # Timeout for acquiring a connection from the pool (distinct from request timeout)
75
+ DEFAULT_POOL_TIMEOUT = 5.0
76
+ # Maximum time to wait for manifest retrieval
77
+ MANIFEST_REQUEST_TIMEOUT = 10.0
78
+
79
+
80
+ def _record_send_error_metrics(start_time: float, error: BaseException) -> None:
81
+ """Record transport send error metrics (status=error, duration, reason)."""
82
+ duration_seconds = time.perf_counter() - start_time
83
+ metrics = get_metrics()
84
+ metrics.increment_counter("asap_transport_send_total", {"status": "error"})
85
+ metrics.increment_counter(
86
+ "asap_transport_send_errors_total",
87
+ {"reason": type(error).__name__},
88
+ )
89
+ metrics.observe_histogram(
90
+ "asap_transport_send_duration_seconds",
91
+ duration_seconds,
92
+ {"status": "error"},
93
+ )
94
+
95
+
96
+ @dataclass
97
+ class RetryConfig:
98
+ """Configuration for retry logic and circuit breaker.
99
+
100
+ Groups retry and circuit breaker parameters to simplify client initialization
101
+ and avoid boolean trap issues.
102
+
103
+ Attributes:
104
+ max_retries: Maximum retry attempts for transient failures (default: 3)
105
+ base_delay: Base delay in seconds for exponential backoff (default: 1.0)
106
+ max_delay: Maximum delay in seconds for exponential backoff (default: 60.0)
107
+ jitter: Whether to add random jitter to backoff delays (default: True)
108
+ circuit_breaker_enabled: Enable circuit breaker pattern (default: False)
109
+ circuit_breaker_threshold: Number of consecutive failures before opening circuit (default: 5)
110
+ circuit_breaker_timeout: Seconds before transitioning OPEN -> HALF_OPEN (default: 60.0)
111
+ """
112
+
113
+ max_retries: int = DEFAULT_MAX_RETRIES
114
+ base_delay: float = DEFAULT_BASE_DELAY
115
+ max_delay: float = DEFAULT_MAX_DELAY
116
+ jitter: bool = True
117
+ circuit_breaker_enabled: bool = False
118
+ circuit_breaker_threshold: int = DEFAULT_CIRCUIT_BREAKER_THRESHOLD
119
+ circuit_breaker_timeout: float = DEFAULT_CIRCUIT_BREAKER_TIMEOUT
120
+
42
121
 
43
122
  class ASAPConnectionError(Exception):
44
123
  """Raised when connection to remote agent fails.
@@ -47,20 +126,36 @@ class ASAPConnectionError(Exception):
47
126
  or when the remote server returns an HTTP error status.
48
127
 
49
128
  Attributes:
50
- message: Error description
129
+ message: Error description with troubleshooting suggestions
51
130
  cause: Original exception that caused this error
131
+ url: URL that failed to connect (if available)
52
132
  """
53
133
 
54
- def __init__(self, message: str, cause: Exception | None = None) -> None:
134
+ def __init__(
135
+ self, message: str, cause: Exception | None = None, url: str | None = None
136
+ ) -> None:
55
137
  """Initialize connection error.
56
138
 
57
139
  Args:
58
140
  message: Error description
59
141
  cause: Original exception that caused this error
142
+ url: URL that failed to connect (for better error messages)
60
143
  """
61
- super().__init__(message)
62
- self.message = message
144
+ # Enhance message with troubleshooting suggestions if URL is provided
145
+ if url and "Verify" not in message and "troubleshooting" not in message.lower():
146
+ enhanced_message = (
147
+ f"{message}\n"
148
+ f"Troubleshooting: Connection failed to {url}. "
149
+ "Verify the agent is running and accessible. "
150
+ "Check the URL format, network connectivity, and firewall settings."
151
+ )
152
+ else:
153
+ enhanced_message = message
154
+
155
+ super().__init__(enhanced_message)
156
+ self.message = enhanced_message
63
157
  self.cause = cause
158
+ self.url = url
64
159
 
65
160
 
66
161
  class ASAPTimeoutError(Exception):
@@ -121,33 +216,174 @@ class ASAPClient:
121
216
  The client should be used as an async context manager to ensure
122
217
  proper connection lifecycle management.
123
218
 
219
+ Features:
220
+ - HTTP/2 multiplexing (enabled by default) for improved batch performance
221
+ - Connection pooling supporting 1000+ concurrent requests
222
+ - Automatic retry with exponential backoff
223
+ - Circuit breaker pattern for fault tolerance
224
+ - Batch operations via send_batch() method
225
+ - Compression support (gzip/brotli) for bandwidth reduction
226
+
124
227
  Attributes:
125
228
  base_url: Base URL of the remote agent
126
229
  timeout: Request timeout in seconds
127
230
  max_retries: Maximum retry attempts for transient failures
231
+ require_https: Whether HTTPS is required for non-localhost connections
128
232
  is_connected: Whether the client has an active connection
233
+ compression: Whether compression is enabled for requests
234
+ compression_threshold: Minimum payload size to trigger compression
235
+ _circuit_breaker: Optional circuit breaker instance
236
+
237
+ Pool sizing (pool_connections / pool_maxsize):
238
+ Single-agent: 100 (default). Small cluster: 200–500. Large cluster: 500–1000.
239
+ Supports 1000+ concurrent requests via connection reuse when pool_maxsize < concurrency.
240
+
241
+ HTTP/2 Multiplexing:
242
+ HTTP/2 is enabled by default (http2=True) and provides request multiplexing over
243
+ a single TCP connection, reducing latency for batch operations. If the server
244
+ doesn't support HTTP/2, the client automatically falls back to HTTP/1.1.
245
+
246
+ Compression:
247
+ Compression is enabled by default (compression=True) for payloads exceeding
248
+ 1KB. Supports gzip (standard) and brotli (optional, requires brotli package).
249
+ Brotli provides ~20% better compression than gzip for JSON payloads.
129
250
 
130
251
  Example:
131
252
  >>> async with ASAPClient("http://localhost:8000") as client:
132
253
  ... response = await client.send(envelope)
254
+ >>>
255
+ >>> # Batch operations with HTTP/2 multiplexing
256
+ >>> async with ASAPClient("https://agent.example.com") as client:
257
+ ... responses = await client.send_batch([env1, env2, env3])
258
+ >>>
259
+ >>> # Disable compression for specific client
260
+ >>> async with ASAPClient("http://localhost:8000", compression=False) as client:
261
+ ... response = await client.send(envelope) # No compression
133
262
  """
134
263
 
264
+ _circuit_breaker: Optional[CircuitBreaker]
265
+
135
266
  def __init__(
136
267
  self,
137
268
  base_url: str,
138
269
  timeout: float = DEFAULT_TIMEOUT,
139
- max_retries: int = DEFAULT_MAX_RETRIES,
140
- transport: httpx.AsyncBaseTransport | httpx.BaseTransport | None = None,
270
+ transport: httpx.AsyncBaseTransport | None = None,
271
+ require_https: bool = True,
272
+ retry_config: Optional[RetryConfig] = None,
273
+ # Connection pool (httpx.Limits); enables 1000+ concurrent via reuse
274
+ pool_connections: int | None = None,
275
+ pool_maxsize: int | None = None,
276
+ pool_timeout: float | None = None,
277
+ # HTTP/2 multiplexing for improved batch performance
278
+ http2: bool = True,
279
+ # Compression settings for bandwidth reduction
280
+ compression: bool = True,
281
+ compression_threshold: int = COMPRESSION_THRESHOLD,
282
+ # Individual retry parameters (for backward compatibility)
283
+ # If retry_config is provided, these are ignored
284
+ max_retries: int | None = None,
285
+ base_delay: float | None = None,
286
+ max_delay: float | None = None,
287
+ jitter: bool | None = None,
288
+ circuit_breaker_enabled: bool | None = None,
289
+ circuit_breaker_threshold: int | None = None,
290
+ circuit_breaker_timeout: float | None = None,
291
+ manifest_cache_size: int | None = None,
141
292
  ) -> None:
142
293
  """Initialize ASAP client.
143
294
 
144
295
  Args:
145
296
  base_url: Base URL of the remote agent (e.g., "http://localhost:8000")
146
297
  timeout: Request timeout in seconds (default: 60)
147
- max_retries: Maximum retry attempts for transient failures (default: 3)
148
- transport: Optional custom transport (for testing). Can be sync or async.
298
+ transport: Optional custom async transport (for testing). Must be an instance
299
+ of httpx.AsyncBaseTransport (e.g., httpx.MockTransport).
300
+ require_https: If True, enforces HTTPS for non-localhost connections (default: True).
301
+ pool_connections: Max keep-alive connections in pool. Default: DEFAULT_POOL_CONNECTIONS (100).
302
+ Controls how many idle connections are kept open.
303
+ pool_maxsize: Max total connections in pool. Default: DEFAULT_POOL_MAXSIZE (100).
304
+ Controls maximum number of concurrent connections.
305
+ Tuning:
306
+ - Single agent: 100 (default)
307
+ - Small cluster: 200-500
308
+ - Large cluster: 500-1000
309
+ Safe to increase if OS file descriptor limits allow.
310
+ pool_timeout: Seconds to wait for connection from pool. Default: DEFAULT_POOL_TIMEOUT (5.0).
311
+ Increase if you see PoolTimeout exceptions under high load.
312
+ HTTP connections to localhost are allowed with a warning for development.
313
+ http2: Enable HTTP/2 multiplexing for improved batch performance (default: True).
314
+ HTTP/2 allows multiple concurrent requests over a single TCP connection,
315
+ reducing latency for batch operations. Falls back to HTTP/1.1 if server
316
+ doesn't support HTTP/2.
317
+ compression: Enable request compression for bandwidth reduction (default: True).
318
+ When enabled, payloads exceeding compression_threshold are compressed
319
+ using gzip or brotli (if available). The server must support the
320
+ Content-Encoding header to decompress requests.
321
+ compression_threshold: Minimum payload size in bytes to trigger compression
322
+ (default: 1024 = 1KB). Payloads smaller than this are sent uncompressed.
323
+ retry_config: Optional RetryConfig dataclass to group retry and circuit breaker parameters.
324
+ If provided, individual retry parameters are ignored.
325
+ max_retries: Maximum retry attempts for transient failures (default: 3).
326
+ Ignored if retry_config is provided.
327
+ base_delay: Base delay in seconds for exponential backoff (default: 1.0).
328
+ Ignored if retry_config is provided.
329
+ max_delay: Maximum delay in seconds for exponential backoff (default: 60.0).
330
+ Ignored if retry_config is provided.
331
+ jitter: Whether to add random jitter to backoff delays (default: True).
332
+ Ignored if retry_config is provided.
333
+ circuit_breaker_enabled: Enable circuit breaker pattern (default: False).
334
+ Ignored if retry_config is provided.
335
+ circuit_breaker_threshold: Number of consecutive failures before opening circuit (default: 5).
336
+ Ignored if retry_config is provided.
337
+ circuit_breaker_timeout: Seconds before transitioning OPEN -> HALF_OPEN (default: 60.0).
338
+ Ignored if retry_config is provided.
339
+ manifest_cache_size: Maximum number of manifests to cache (default: 1000).
340
+ Increase for high-cardinality environments (e.g. thousands of agents).
341
+ Set to 0 for unlimited. See ManifestCache for cleanup latency notes.
342
+
343
+ Raises:
344
+ ValueError: If URL format is invalid, scheme is not HTTP/HTTPS, or HTTPS is
345
+ required but URL uses HTTP for non-localhost connections.
346
+
347
+ Example:
348
+ >>> # Using individual parameters (backward compatible)
349
+ >>> client = ASAPClient("http://localhost:8000", max_retries=5)
350
+ >>>
351
+ >>> # Using RetryConfig (recommended)
352
+ >>> config = RetryConfig(max_retries=5, circuit_breaker_enabled=True)
353
+ >>> client = ASAPClient("http://localhost:8000", retry_config=config)
354
+ >>>
355
+ >>> # With compression disabled
356
+ >>> client = ASAPClient("http://localhost:8000", compression=False)
149
357
  """
150
- # Validate URL format and scheme
358
+ # Extract retry config values
359
+ if retry_config is not None:
360
+ # Use retry_config values
361
+ max_retries_val = retry_config.max_retries
362
+ base_delay_val = retry_config.base_delay
363
+ max_delay_val = retry_config.max_delay
364
+ jitter_val = retry_config.jitter
365
+ circuit_breaker_enabled_val = retry_config.circuit_breaker_enabled
366
+ circuit_breaker_threshold_val = retry_config.circuit_breaker_threshold
367
+ circuit_breaker_timeout_val = retry_config.circuit_breaker_timeout
368
+ else:
369
+ # Use individual parameters with defaults
370
+ max_retries_val = max_retries if max_retries is not None else DEFAULT_MAX_RETRIES
371
+ base_delay_val = base_delay if base_delay is not None else DEFAULT_BASE_DELAY
372
+ max_delay_val = max_delay if max_delay is not None else DEFAULT_MAX_DELAY
373
+ jitter_val = jitter if jitter is not None else True
374
+ circuit_breaker_enabled_val = (
375
+ circuit_breaker_enabled if circuit_breaker_enabled is not None else False
376
+ )
377
+ circuit_breaker_threshold_val = (
378
+ circuit_breaker_threshold
379
+ if circuit_breaker_threshold is not None
380
+ else DEFAULT_CIRCUIT_BREAKER_THRESHOLD
381
+ )
382
+ circuit_breaker_timeout_val = (
383
+ circuit_breaker_timeout
384
+ if circuit_breaker_timeout is not None
385
+ else DEFAULT_CIRCUIT_BREAKER_TIMEOUT
386
+ )
151
387
  from urllib.parse import urlparse
152
388
 
153
389
  parsed = urlparse(base_url)
@@ -163,12 +399,192 @@ class ASAPClient:
163
399
  f"Received: {base_url}"
164
400
  )
165
401
 
402
+ is_https = parsed.scheme.lower() == "https"
403
+ is_local = self._is_localhost(parsed)
404
+
405
+ if require_https and not is_https:
406
+ if is_local:
407
+ # Allow HTTP for localhost with warning
408
+ logger.warning(
409
+ "asap.client.http_localhost",
410
+ url=base_url,
411
+ message=(
412
+ "Using HTTP for localhost connection. "
413
+ "For production, use HTTPS. "
414
+ "To disable this warning, set require_https=False."
415
+ ),
416
+ )
417
+ else:
418
+ # Reject HTTP for non-localhost
419
+ raise ValueError(
420
+ f"HTTPS is required for non-localhost connections. "
421
+ f"Received HTTP URL: {base_url}. "
422
+ f"Please use HTTPS or set require_https=False to override "
423
+ f"(not recommended for production)."
424
+ )
425
+
166
426
  self.base_url = base_url.rstrip("/")
167
427
  self.timeout = timeout
168
- self.max_retries = max_retries
428
+ self._pool_connections = (
429
+ pool_connections if pool_connections is not None else DEFAULT_POOL_CONNECTIONS
430
+ )
431
+ self._pool_maxsize = pool_maxsize if pool_maxsize is not None else DEFAULT_POOL_MAXSIZE
432
+ self._pool_timeout = pool_timeout if pool_timeout is not None else DEFAULT_POOL_TIMEOUT
433
+ self.max_retries = max_retries_val
434
+ self.require_https = require_https
435
+ self.base_delay = base_delay_val
436
+ self.max_delay = max_delay_val
437
+ self.jitter = jitter_val
438
+ self.circuit_breaker_enabled = circuit_breaker_enabled_val
169
439
  self._transport = transport
440
+ self._http2 = http2
441
+ self._compression = compression
442
+ self._compression_threshold = compression_threshold
170
443
  self._client: httpx.AsyncClient | None = None
171
- self._request_counter = 0
444
+ # Thread-safe counter using itertools.count
445
+ self._request_counter = itertools.count(1)
446
+
447
+ # Initialize circuit breaker if enabled
448
+ # Use registry to ensure state is shared across multiple client instances
449
+ # for the same base_url
450
+ if circuit_breaker_enabled_val:
451
+ registry = get_registry()
452
+ self._circuit_breaker = registry.get_or_create(
453
+ base_url=sanitize_url(self.base_url),
454
+ threshold=circuit_breaker_threshold_val,
455
+ timeout=circuit_breaker_timeout_val,
456
+ )
457
+ else:
458
+ self._circuit_breaker = None
459
+
460
+ # Per-client manifest cache (not shared like circuit breaker).
461
+ cache_max = manifest_cache_size if manifest_cache_size is not None else DEFAULT_MAX_SIZE
462
+ self._manifest_cache = ManifestCache(max_size=cache_max)
463
+
464
+ @staticmethod
465
+ def _is_localhost(parsed_url: ParseResult) -> bool:
466
+ """Check if URL points to localhost.
467
+
468
+ Detects localhost, 127.0.0.1, and ::1 (IPv6 localhost).
469
+
470
+ Args:
471
+ parsed_url: Parsed URL from urlparse
472
+
473
+ Returns:
474
+ True if URL points to localhost, False otherwise
475
+ """
476
+ hostname = parsed_url.hostname
477
+ if not hostname:
478
+ return False
479
+
480
+ hostname_lower = hostname.lower()
481
+ # Handle both ::1 and [::1] (bracket notation from URL parsing)
482
+ return hostname_lower in ("localhost", "127.0.0.1", "::1", "[::1]")
483
+
484
+ def _calculate_backoff(self, attempt: int) -> float:
485
+ """Calculate exponential backoff delay for retry attempt.
486
+
487
+ Implements exponential backoff with optional jitter:
488
+ delay = base_delay * (2 ** attempt) + jitter
489
+
490
+ The delay is capped at max_delay to prevent excessively long waits.
491
+
492
+ Args:
493
+ attempt: Zero-based attempt number (0 = first retry)
494
+
495
+ Returns:
496
+ Delay in seconds before next retry attempt
497
+ """
498
+ # Calculate exponential delay: base_delay * (2 ** attempt)
499
+ delay = self.base_delay * (2**attempt)
500
+
501
+ # Cap at max_delay
502
+ delay = min(delay, self.max_delay)
503
+
504
+ if self.jitter:
505
+ jitter_amount: float = random.uniform(0, delay * 0.1) # nosec B311
506
+ delay += jitter_amount
507
+
508
+ return float(delay)
509
+
510
+ async def _validate_connection(self) -> bool:
511
+ """Validate that the agent endpoint is accessible.
512
+
513
+ Performs a pre-flight check by attempting to access the agent's
514
+ manifest endpoint. This can be used to detect connection issues
515
+ before sending actual requests.
516
+
517
+ Note: This is an optional validation step that can be disabled
518
+ for performance reasons in production environments.
519
+
520
+ Returns:
521
+ True if connection is valid, False otherwise
522
+
523
+ Raises:
524
+ ASAPConnectionError: If connection validation fails
525
+ """
526
+ if not self._client:
527
+ raise ASAPConnectionError(
528
+ "Client not connected. Use 'async with' context.",
529
+ url=sanitize_url(self.base_url),
530
+ )
531
+
532
+ try:
533
+ # Try to access a lightweight endpoint (manifest or health check)
534
+ # Using HEAD request to minimize bandwidth
535
+ response = await self._client.head(
536
+ f"{self.base_url}/.well-known/asap/manifest.json",
537
+ timeout=min(self.timeout, 5.0), # Shorter timeout for validation
538
+ )
539
+ # Any 2xx or 3xx response indicates the server is reachable
540
+ is_valid = 200 <= response.status_code < 400
541
+ if not is_valid:
542
+ logger.warning(
543
+ "asap.client.connection_validation_failed",
544
+ target_url=sanitize_url(self.base_url),
545
+ status_code=response.status_code,
546
+ message=(
547
+ f"Connection validation failed for {self.base_url}. "
548
+ f"Server returned status {response.status_code}. "
549
+ f"Verify the agent is running and the URL is correct."
550
+ ),
551
+ )
552
+ return is_valid
553
+ except httpx.ConnectError as e:
554
+ logger.warning(
555
+ "asap.client.connection_validation_failed",
556
+ target_url=sanitize_url(self.base_url),
557
+ error=str(e),
558
+ message=(
559
+ f"Connection validation failed for {self.base_url}. "
560
+ f"Cannot reach the agent. Verify the agent is running and accessible. "
561
+ f"Error: {str(e)[:200]}"
562
+ ),
563
+ )
564
+ return False
565
+ except httpx.TimeoutException:
566
+ logger.warning(
567
+ "asap.client.connection_validation_timeout",
568
+ target_url=sanitize_url(self.base_url),
569
+ timeout=self.timeout,
570
+ message=(
571
+ f"Connection validation timed out for {self.base_url}. "
572
+ f"Check network connectivity and firewall settings."
573
+ ),
574
+ )
575
+ return False
576
+ except Exception as e:
577
+ logger.warning(
578
+ "asap.client.connection_validation_error",
579
+ target_url=sanitize_url(self.base_url),
580
+ error=str(e),
581
+ error_type=type(e).__name__,
582
+ message=(
583
+ f"Connection validation encountered an error for {self.base_url}: {e}. "
584
+ f"Verify the agent is running and accessible."
585
+ ),
586
+ )
587
+ return False
172
588
 
173
589
  @property
174
590
  def is_connected(self) -> bool:
@@ -176,18 +592,30 @@ class ASAPClient:
176
592
  return self._client is not None
177
593
 
178
594
  async def __aenter__(self) -> "ASAPClient":
179
- """Enter async context and open connection."""
180
- # Create the async client
595
+ """Enter async context and open connection.
596
+
597
+ Creates an httpx.AsyncClient with configured pool limits and HTTP/2 support.
598
+ HTTP/2 enables multiplexing for improved batch performance.
599
+ """
600
+ limits = httpx.Limits(
601
+ max_keepalive_connections=self._pool_connections,
602
+ max_connections=self._pool_maxsize,
603
+ keepalive_expiry=DEFAULT_POOL_TIMEOUT,
604
+ )
605
+ timeout_config = httpx.Timeout(self.timeout, pool=self._pool_timeout)
181
606
  if self._transport:
182
- # MockTransport works for both sync and async, so we cast it
183
- # This is safe because httpx.MockTransport is compatible with async usage
607
+ # Custom transport (for testing) - http2 not applicable with mock transports
184
608
  self._client = httpx.AsyncClient(
185
- transport=self._transport, # type: ignore[arg-type]
186
- timeout=self.timeout,
609
+ transport=self._transport,
610
+ timeout=timeout_config,
611
+ limits=limits,
187
612
  )
188
613
  else:
614
+ # Production client with HTTP/2 multiplexing support
189
615
  self._client = httpx.AsyncClient(
190
- timeout=self.timeout,
616
+ timeout=timeout_config,
617
+ limits=limits,
618
+ http2=self._http2,
191
619
  )
192
620
  return self
193
621
 
@@ -215,35 +643,55 @@ class ASAPClient:
215
643
  Response envelope from the remote agent
216
644
 
217
645
  Raises:
646
+ ValueError: If envelope is None
218
647
  ASAPConnectionError: If connection fails or HTTP error occurs
219
648
  ASAPTimeoutError: If request times out
220
649
  ASAPRemoteError: If remote agent returns JSON-RPC error
650
+ CircuitOpenError: If circuit breaker is open and request is rejected
221
651
 
222
652
  Example:
223
653
  >>> async with ASAPClient("http://localhost:8000") as client:
224
654
  ... response = await client.send(envelope)
225
655
  ... response.payload_type
226
656
  """
657
+ if envelope is None:
658
+ raise ValueError("envelope cannot be None")
659
+
227
660
  if not self._client:
228
- raise ASAPConnectionError("Client not connected. Use 'async with' context.")
661
+ raise ASAPConnectionError(
662
+ "Client not connected. Use 'async with' context.",
663
+ url=sanitize_url(self.base_url),
664
+ )
665
+
666
+ if self._circuit_breaker is not None and not self._circuit_breaker.can_attempt():
667
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
668
+ raise CircuitOpenError(
669
+ base_url=sanitize_url(self.base_url),
670
+ consecutive_failures=consecutive_failures,
671
+ )
229
672
 
230
673
  start_time = time.perf_counter()
231
674
 
232
675
  # Generate idempotency key for retries
233
676
  idempotency_key = generate_id()
234
677
 
235
- # Increment request counter for JSON-RPC id
236
- self._request_counter += 1
237
- request_id = f"req-{self._request_counter}"
678
+ # Get next request counter value (thread-safe)
679
+ request_id = f"req-{next(self._request_counter)}"
238
680
 
239
- # Log send attempt
681
+ # Log send attempt with context (sanitize URL to hide credentials)
682
+ sanitized_url = sanitize_url(self.base_url)
240
683
  logger.info(
241
684
  "asap.client.send",
242
- target_url=self.base_url,
685
+ target_url=sanitized_url,
243
686
  envelope_id=envelope.id,
244
687
  trace_id=envelope.trace_id,
245
688
  payload_type=envelope.payload_type,
246
689
  idempotency_key=idempotency_key,
690
+ max_retries=self.max_retries,
691
+ message=(
692
+ f"Sending envelope {envelope.id} to {sanitized_url} "
693
+ f"(payload: {envelope.payload_type}, max_retries: {self.max_retries})"
694
+ ),
247
695
  )
248
696
 
249
697
  # Build JSON-RPC request
@@ -257,40 +705,216 @@ class ASAPClient:
257
705
  "id": request_id,
258
706
  }
259
707
 
708
+ # Serialize to bytes for compression
709
+ request_body = json.dumps(json_rpc_request).encode("utf-8")
710
+
711
+ # Apply compression if enabled and payload exceeds threshold
712
+ content_encoding: str | None = None
713
+ if self._compression:
714
+ compressed_body, algorithm = compress_payload(
715
+ request_body,
716
+ threshold=self._compression_threshold,
717
+ )
718
+ if algorithm != CompressionAlgorithm.IDENTITY:
719
+ request_body = compressed_body
720
+ content_encoding = algorithm.value
721
+ logger.debug(
722
+ "asap.client.compression_applied",
723
+ target_url=sanitized_url,
724
+ envelope_id=envelope.id,
725
+ algorithm=content_encoding,
726
+ original_size=len(json.dumps(json_rpc_request).encode("utf-8")),
727
+ compressed_size=len(request_body),
728
+ )
729
+
260
730
  # Attempt with retries
261
731
  last_exception: Exception | None = None
262
732
  for attempt in range(self.max_retries):
733
+ if attempt > 0:
734
+ get_metrics().increment_counter("asap_transport_retries_total")
263
735
  try:
736
+ # Build headers
737
+ headers = {
738
+ "Content-Type": "application/json",
739
+ "X-Idempotency-Key": idempotency_key,
740
+ "Accept-Encoding": get_accept_encoding_header(),
741
+ }
742
+ if content_encoding:
743
+ headers["Content-Encoding"] = content_encoding
744
+
264
745
  response = await self._client.post(
265
746
  f"{self.base_url}/asap",
266
- json=json_rpc_request,
267
- headers={
268
- "Content-Type": "application/json",
269
- "X-Idempotency-Key": idempotency_key,
270
- },
747
+ headers=headers,
748
+ content=request_body,
271
749
  )
272
750
 
273
- # Check HTTP status
751
+ # Log HTTP protocol version for debugging fallback behavior
752
+ if self._http2 and response.http_version != "HTTP/2":
753
+ logger.debug(
754
+ "asap.client.http_fallback",
755
+ target_url=sanitize_url(self.base_url),
756
+ requested="HTTP/2",
757
+ actual=response.http_version,
758
+ message=f"HTTP/2 requested but used {response.http_version}",
759
+ )
760
+
274
761
  if response.status_code >= 500:
275
762
  # Server errors (5xx) are retriable
763
+ error_msg = (
764
+ f"HTTP server error {response.status_code} from {self.base_url}. "
765
+ f"Server returned: {response.text[:200]}"
766
+ )
276
767
  if attempt < self.max_retries - 1:
768
+ delay = self._calculate_backoff(attempt)
277
769
  logger.warning(
278
- "asap.client.server_error",
770
+ "asap.client.retry_server_error",
279
771
  status_code=response.status_code,
280
772
  attempt=attempt + 1,
281
773
  max_retries=self.max_retries,
774
+ delay_seconds=round(delay, 2),
775
+ target_url=sanitize_url(self.base_url),
776
+ envelope_id=envelope.id,
777
+ message=(
778
+ f"Server error {response.status_code}, "
779
+ f"retrying in {delay:.2f}s "
780
+ f"(attempt {attempt + 1}/{self.max_retries})"
781
+ ),
782
+ )
783
+ await asyncio.sleep(delay)
784
+ last_exception = ASAPConnectionError(error_msg, url=self.base_url)
785
+ continue
786
+ # All retries exhausted, record failure in circuit breaker
787
+ if self._circuit_breaker is not None:
788
+ previous_state = self._circuit_breaker.get_state()
789
+ self._circuit_breaker.record_failure()
790
+ current_state = self._circuit_breaker.get_state()
791
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
792
+ # Log state change if circuit opened
793
+ if previous_state != current_state and current_state == CircuitState.OPEN:
794
+ logger.warning(
795
+ "asap.client.circuit_opened",
796
+ target_url=sanitize_url(self.base_url),
797
+ consecutive_failures=consecutive_failures,
798
+ threshold=self._circuit_breaker.threshold,
799
+ message=f"Circuit breaker opened after {consecutive_failures} consecutive failures",
800
+ )
801
+ raise ASAPConnectionError(error_msg, url=self.base_url)
802
+ if response.status_code == 429:
803
+ if attempt < self.max_retries - 1:
804
+ retry_after = response.headers.get("Retry-After")
805
+ if retry_after:
806
+ retry_delay: Optional[float] = None
807
+ # Retry-After can be seconds (int/float) or HTTP date
808
+ # First, try to parse as seconds (numeric)
809
+ if retry_after.replace(".", "", 1).isdigit():
810
+ try:
811
+ retry_delay = float(retry_after)
812
+ logger.info(
813
+ "asap.client.retry_after",
814
+ target_url=sanitize_url(self.base_url),
815
+ envelope_id=envelope.id,
816
+ attempt=attempt + 1,
817
+ retry_after_seconds=retry_delay,
818
+ message=f"Respecting server Retry-After: {retry_delay}s",
819
+ )
820
+ except ValueError:
821
+ pass # Fall through to date parsing
822
+ else:
823
+ # Try to parse as HTTP date
824
+ try:
825
+ retry_date = parsedate_to_datetime(retry_after)
826
+ if retry_date:
827
+ # Calculate delay in seconds from now until retry_date
828
+ now_timestamp = time.time()
829
+ retry_timestamp = retry_date.timestamp()
830
+ calculated_delay = retry_timestamp - now_timestamp
831
+ # If date is in the past or delay is invalid, fall back to calculated backoff
832
+ if calculated_delay <= 0:
833
+ retry_delay = None # Will trigger fallback
834
+ else:
835
+ retry_delay = calculated_delay
836
+ logger.info(
837
+ "asap.client.retry_after",
838
+ target_url=sanitize_url(self.base_url),
839
+ envelope_id=envelope.id,
840
+ attempt=attempt + 1,
841
+ retry_after_seconds=round(retry_delay, 2),
842
+ retry_after_date=retry_after,
843
+ message=f"Respecting server Retry-After date: {retry_after} ({retry_delay:.2f}s)",
844
+ )
845
+ except (ValueError, TypeError, AttributeError, OSError):
846
+ # Invalid date format or timestamp conversion error, fall back to calculated backoff
847
+ pass
848
+
849
+ # If parsing failed or delay is invalid (None or <= 0), use calculated backoff
850
+ if retry_delay is None or retry_delay <= 0:
851
+ retry_delay = self._calculate_backoff(attempt)
852
+ logger.warning(
853
+ "asap.client.retry_after_invalid",
854
+ target_url=sanitize_url(self.base_url),
855
+ envelope_id=envelope.id,
856
+ retry_after_header=retry_after,
857
+ fallback_delay=round(retry_delay, 2),
858
+ message="Invalid Retry-After format, using calculated backoff",
859
+ )
860
+ delay = retry_delay
861
+ else:
862
+ # No Retry-After header, use calculated backoff
863
+ delay = self._calculate_backoff(attempt)
864
+ logger.warning(
865
+ "asap.client.rate_limited",
866
+ status_code=429,
867
+ attempt=attempt + 1,
868
+ max_retries=self.max_retries,
869
+ delay_seconds=round(delay, 2),
870
+ )
871
+ logger.info(
872
+ "asap.client.retry",
873
+ target_url=sanitize_url(self.base_url),
874
+ envelope_id=envelope.id,
875
+ attempt=attempt + 1,
876
+ max_retries=self.max_retries,
877
+ delay_seconds=round(delay, 2),
282
878
  )
879
+ await asyncio.sleep(delay)
283
880
  last_exception = ASAPConnectionError(
284
- f"HTTP server error {response.status_code}: {response.text}"
881
+ f"HTTP rate limit error 429 from {self.base_url}. "
882
+ f"Server response: {response.text[:200]}",
883
+ url=sanitize_url(self.base_url),
285
884
  )
286
885
  continue
886
+ # All retries exhausted, record failure in circuit breaker
887
+ if self._circuit_breaker is not None:
888
+ previous_state = self._circuit_breaker.get_state()
889
+ self._circuit_breaker.record_failure()
890
+ current_state = self._circuit_breaker.get_state()
891
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
892
+ # Log state change if circuit opened
893
+ if previous_state != current_state and current_state == CircuitState.OPEN:
894
+ logger.warning(
895
+ "asap.client.circuit_opened",
896
+ target_url=sanitize_url(self.base_url),
897
+ consecutive_failures=consecutive_failures,
898
+ threshold=self._circuit_breaker.threshold,
899
+ message=f"Circuit breaker opened after {consecutive_failures} consecutive failures (rate limited)",
900
+ )
287
901
  raise ASAPConnectionError(
288
- f"HTTP server error {response.status_code}: {response.text}"
902
+ f"HTTP rate limit error 429 from {self.base_url} after {self.max_retries} attempts. "
903
+ f"Server response: {response.text[:200]}",
904
+ url=sanitize_url(self.base_url),
289
905
  )
290
906
  if response.status_code >= 400:
291
- # Client errors (4xx) are not retriable
907
+ # Client errors (4xx) are not retriable (except 429 handled above)
908
+ # We record a failure in the circuit breaker here because persistent 4xx
909
+ # (like 401/403) can indicate an unhealthy configuration or system state.
910
+ if self._circuit_breaker is not None:
911
+ self._circuit_breaker.record_failure()
912
+
292
913
  raise ASAPConnectionError(
293
- f"HTTP client error {response.status_code}: {response.text}"
914
+ f"HTTP client error {response.status_code} from {self.base_url}. "
915
+ f"This indicates a problem with the request. "
916
+ f"Server response: {response.text[:200]}",
917
+ url=sanitize_url(self.base_url),
294
918
  )
295
919
 
296
920
  # Parse JSON response
@@ -299,8 +923,10 @@ class ASAPClient:
299
923
  except Exception as e:
300
924
  raise ASAPRemoteError(-32700, f"Invalid JSON response: {e}") from e
301
925
 
302
- # Check for JSON-RPC error
303
926
  if "error" in json_response:
927
+ if self._circuit_breaker is not None:
928
+ self._circuit_breaker.record_success()
929
+
304
930
  error = json_response["error"]
305
931
  raise ASAPRemoteError(
306
932
  error.get("code", -32603),
@@ -316,84 +942,375 @@ class ASAPClient:
316
942
 
317
943
  response_envelope = Envelope(**envelope_data)
318
944
 
945
+ # Record success in circuit breaker
946
+ if self._circuit_breaker is not None:
947
+ previous_state = self._circuit_breaker.get_state()
948
+ self._circuit_breaker.record_success()
949
+ current_state = self._circuit_breaker.get_state()
950
+ # Log state change if circuit was closed
951
+ if previous_state != current_state and current_state == CircuitState.CLOSED:
952
+ logger.info(
953
+ "asap.client.circuit_closed",
954
+ target_url=sanitize_url(self.base_url),
955
+ message="Circuit breaker closed after successful request",
956
+ )
957
+
319
958
  # Calculate duration and log success
320
- duration_ms = (time.perf_counter() - start_time) * 1000
959
+ duration_seconds = time.perf_counter() - start_time
960
+ duration_ms = duration_seconds * 1000
321
961
  logger.info(
322
962
  "asap.client.response",
323
- target_url=self.base_url,
963
+ target_url=sanitize_url(self.base_url),
324
964
  envelope_id=envelope.id,
325
965
  response_id=response_envelope.id,
326
966
  trace_id=envelope.trace_id,
327
967
  duration_ms=round(duration_ms, 2),
328
968
  attempts=attempt + 1,
329
969
  )
330
-
970
+ metrics = get_metrics()
971
+ metrics.increment_counter("asap_transport_send_total", {"status": "success"})
972
+ metrics.observe_histogram(
973
+ "asap_transport_send_duration_seconds",
974
+ duration_seconds,
975
+ {"status": "success"},
976
+ )
331
977
  return response_envelope
332
978
 
333
- except httpx.ConnectError as e:
334
- last_exception = ASAPConnectionError(f"Connection error: {e}", cause=e)
979
+ except (httpx.ConnectError, httpx.TimeoutException) as e:
980
+ is_timeout = isinstance(e, httpx.TimeoutException)
981
+ error_type = "Timeout" if is_timeout else "Connection error"
982
+ error_msg = (
983
+ f"{error_type} to {self.base_url}: {e}. "
984
+ f"Verify the agent is running and accessible."
985
+ )
986
+ if is_timeout:
987
+ last_exception = ASAPTimeoutError(
988
+ f"Request timeout after {self.timeout}s", timeout=self.timeout
989
+ )
990
+ else:
991
+ last_exception = ASAPConnectionError(error_msg, cause=e, url=self.base_url)
992
+
335
993
  # Log retry attempt
336
994
  if attempt < self.max_retries - 1:
995
+ delay = self._calculate_backoff(attempt)
337
996
  logger.warning(
338
997
  "asap.client.retry",
339
- target_url=self.base_url,
998
+ target_url=sanitize_url(self.base_url),
340
999
  envelope_id=envelope.id,
341
1000
  attempt=attempt + 1,
342
1001
  max_retries=self.max_retries,
343
1002
  error=str(e),
1003
+ delay_seconds=round(delay, 2),
1004
+ message=(
1005
+ f"{error_type} to {self.base_url} (attempt {attempt + 1}/{self.max_retries}). "
1006
+ f"Retrying in {delay:.2f}s. "
1007
+ f"Error: {str(e)[:100]}"
1008
+ ),
344
1009
  )
1010
+ await asyncio.sleep(delay)
345
1011
  continue
346
- # Log final failure
347
- duration_ms = (time.perf_counter() - start_time) * 1000
348
- logger.error(
349
- "asap.client.error",
350
- target_url=self.base_url,
351
- envelope_id=envelope.id,
352
- error="Connection failed after retries",
353
- error_type="ASAPConnectionError",
354
- duration_ms=round(duration_ms, 2),
355
- attempts=attempt + 1,
356
- )
357
- raise last_exception from e
358
1012
 
359
- except httpx.TimeoutException as e:
1013
+ # All retries exhausted, record failure in circuit breaker
1014
+ if self._circuit_breaker is not None:
1015
+ previous_state = self._circuit_breaker.get_state()
1016
+ self._circuit_breaker.record_failure()
1017
+ current_state = self._circuit_breaker.get_state()
1018
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
1019
+ # Log state change if circuit opened
1020
+ if previous_state != current_state and current_state == CircuitState.OPEN:
1021
+ logger.warning(
1022
+ "asap.client.circuit_opened",
1023
+ target_url=sanitize_url(self.base_url),
1024
+ consecutive_failures=consecutive_failures,
1025
+ threshold=self._circuit_breaker.threshold,
1026
+ message=f"Circuit breaker opened after {consecutive_failures} consecutive failures",
1027
+ )
1028
+
1029
+ # Log final failure with detailed context
360
1030
  duration_ms = (time.perf_counter() - start_time) * 1000
361
- last_exception = ASAPTimeoutError(
362
- f"Request timeout after {self.timeout}s", timeout=self.timeout
363
- )
364
- # Log timeout (don't retry)
1031
+ error_type_name = "ASAPTimeoutError" if is_timeout else "ASAPConnectionError"
365
1032
  logger.error(
366
1033
  "asap.client.error",
367
- target_url=self.base_url,
1034
+ target_url=sanitize_url(self.base_url),
368
1035
  envelope_id=envelope.id,
369
- error="Request timeout",
370
- error_type="ASAPTimeoutError",
371
- timeout=self.timeout,
1036
+ error=f"{error_type} after retries",
1037
+ error_type=error_type_name,
372
1038
  duration_ms=round(duration_ms, 2),
1039
+ attempts=attempt + 1,
1040
+ max_retries=self.max_retries,
1041
+ timeout=self.timeout if is_timeout else None,
1042
+ message=(
1043
+ f"{error_type} to {self.base_url} failed after {attempt + 1} attempts. "
1044
+ f"Total duration: {duration_ms:.2f}ms. "
1045
+ f"Troubleshooting: Verify the agent is running, check network connectivity, "
1046
+ f"and ensure the URL is correct. Original error: {str(e)[:200]}"
1047
+ ),
373
1048
  )
374
1049
  raise last_exception from e
375
1050
 
376
1051
  except (ASAPConnectionError, ASAPRemoteError, ASAPTimeoutError):
377
- # Re-raise our custom errors
1052
+ # Re-raise our custom errors without recording failure again
1053
+ # (failures are already recorded before these exceptions are raised)
378
1054
  raise
379
1055
 
380
1056
  except Exception as e:
1057
+ # Record failure in circuit breaker
1058
+ if self._circuit_breaker is not None:
1059
+ previous_state = self._circuit_breaker.get_state()
1060
+ self._circuit_breaker.record_failure()
1061
+ current_state = self._circuit_breaker.get_state()
1062
+ consecutive_failures = self._circuit_breaker.get_consecutive_failures()
1063
+ # Log state change if circuit opened
1064
+ if previous_state != current_state and current_state == CircuitState.OPEN:
1065
+ logger.warning(
1066
+ "asap.client.circuit_opened",
1067
+ target_url=sanitize_url(self.base_url),
1068
+ consecutive_failures=consecutive_failures,
1069
+ threshold=self._circuit_breaker.threshold,
1070
+ message=f"Circuit breaker opened after {consecutive_failures} consecutive failures",
1071
+ )
381
1072
  # Log unexpected error
382
1073
  duration_ms = (time.perf_counter() - start_time) * 1000
383
1074
  logger.exception(
384
1075
  "asap.client.error",
385
- target_url=self.base_url,
1076
+ target_url=sanitize_url(self.base_url),
386
1077
  envelope_id=envelope.id,
387
1078
  error=str(e),
388
1079
  error_type=type(e).__name__,
389
1080
  duration_ms=round(duration_ms, 2),
390
1081
  )
1082
+ _record_send_error_metrics(start_time, e)
391
1083
  # Wrap unexpected errors
392
- raise ASAPConnectionError(f"Unexpected error: {e}", cause=e) from e
1084
+ raise ASAPConnectionError(
1085
+ f"Unexpected error connecting to {self.base_url}: {e}. "
1086
+ f"Verify the agent is running and accessible.",
1087
+ cause=e,
1088
+ url=sanitize_url(self.base_url),
1089
+ ) from e
393
1090
 
394
- # Defensive code: This should never be reached because the loop above
395
- # always either returns successfully or raises an exception.
396
- # Kept as a safety net for future code changes.
397
1091
  if last_exception: # pragma: no cover
1092
+ _record_send_error_metrics(start_time, last_exception)
398
1093
  raise last_exception
399
- raise ASAPConnectionError("Max retries exceeded") # pragma: no cover
1094
+ raise ASAPConnectionError(
1095
+ f"Max retries ({self.max_retries}) exceeded for {self.base_url}. "
1096
+ f"Verify the agent is running and accessible.",
1097
+ url=sanitize_url(self.base_url),
1098
+ ) # pragma: no cover
1099
+
1100
async def get_manifest(self, url: str | None = None) -> Manifest:
    """Return the agent manifest, using the manifest cache when possible.

    The cache is consulted first. On a miss the manifest is fetched over
    HTTP, parsed into a ``Manifest`` and stored in the cache (entry expiry,
    if any, is handled by the cache object itself). Every failure path
    invalidates the cache entry for ``url`` before raising.

    Args:
        url: Manifest URL. Defaults to
            ``{base_url}/.well-known/asap/manifest.json``.

    Returns:
        The cached or freshly fetched ``Manifest``.

    Raises:
        ASAPConnectionError: If the client is not connected, the server
            answers with a status code >= 400, the connection fails, or an
            unexpected error occurs during the request.
        ASAPTimeoutError: If the request times out. The reported timeout is
            the effective per-request value (capped by
            ``MANIFEST_REQUEST_TIMEOUT``), not the client-wide timeout.
        ValueError: If the response body is not valid JSON or cannot be
            turned into a ``Manifest``.

    Example:
        >>> async with ASAPClient("http://agent.example.com") as client:
        ...     manifest = await client.get_manifest()
        ...     print(manifest.id, manifest.name)
    """
    if url is None:
        url = f"{self.base_url}/.well-known/asap/manifest.json"

    # Sanitize once and reuse for every log line and error message so that
    # credentials embedded in the URL never end up in logs or exception text.
    safe_url = sanitize_url(url)

    if not self._client:
        raise ASAPConnectionError(
            "Client not connected. Use 'async with' context.",
            url=safe_url,
        )

    cached = self._manifest_cache.get(url)
    if cached is not None:
        logger.debug(
            "asap.client.manifest_cache_hit",
            url=safe_url,
            manifest_id=cached.id,
            message=f"Manifest cache hit for {safe_url}",
        )
        return cached

    # Cache miss - fetch from HTTP
    logger.debug(
        "asap.client.manifest_cache_miss",
        url=safe_url,
        message=f"Manifest cache miss for {safe_url}, fetching from HTTP",
    )

    # The manifest request is capped; remember the effective value so a
    # timeout error reports the limit that was actually applied.
    request_timeout = min(self.timeout, MANIFEST_REQUEST_TIMEOUT)

    try:
        response = await self._client.get(url, timeout=request_timeout)

        if response.status_code >= 400:
            # HTTP error - invalidate cache if entry exists
            self._manifest_cache.invalidate(url)
            raise ASAPConnectionError(
                f"HTTP error {response.status_code} fetching manifest from {safe_url}. "
                f"Server response: {response.text[:200]}",
                url=safe_url,
            )

        # Parse JSON response
        try:
            manifest_data = response.json()
        except Exception as e:
            self._manifest_cache.invalidate(url)
            raise ValueError(f"Invalid JSON in manifest response: {e}") from e

        # Build the Manifest object; any construction failure is reported
        # as a ValueError so callers see a single "bad manifest" error type.
        try:
            manifest = Manifest(**manifest_data)
        except Exception as e:
            self._manifest_cache.invalidate(url)
            raise ValueError(f"Invalid manifest format: {e}") from e

        # Cache successful response
        self._manifest_cache.set(url, manifest)
        logger.info(
            "asap.client.manifest_fetched",
            url=safe_url,
            manifest_id=manifest.id,
            message=f"Manifest fetched and cached for {safe_url}",
        )

        return manifest

    except httpx.TimeoutException as e:
        self._manifest_cache.invalidate(url)
        raise ASAPTimeoutError(
            f"Manifest request timeout after {request_timeout}s",
            timeout=request_timeout,
        ) from e
    except httpx.ConnectError as e:
        self._manifest_cache.invalidate(url)
        raise ASAPConnectionError(
            f"Connection error fetching manifest from {safe_url}: {e}. "
            f"Verify the agent is running and accessible.",
            cause=e,
            url=safe_url,
        ) from e
    except (ASAPConnectionError, ASAPTimeoutError, ValueError):
        # Errors raised inside the try body have already invalidated the
        # cache entry; re-raise them untouched instead of wrapping them.
        raise
    except Exception as e:
        # Unexpected error - invalidate cache
        self._manifest_cache.invalidate(url)
        logger.exception(
            "asap.client.manifest_error",
            url=safe_url,
            error=str(e),
            error_type=type(e).__name__,
            message=f"Unexpected error fetching manifest from {safe_url}: {e}",
        )
        raise ASAPConnectionError(
            f"Unexpected error fetching manifest from {safe_url}: {e}. "
            f"Verify the agent is running and accessible.",
            cause=e,
            url=safe_url,
        ) from e
1221
+
1222
async def send_batch(
    self,
    envelopes: list[Envelope],
    return_exceptions: bool = False,
) -> list[Envelope | BaseException]:
    """Send several envelopes concurrently and collect their responses.

    One ``self.send`` coroutine is created per envelope and all of them are
    awaited together through ``asyncio.gather``, so the result list has the
    same order as ``envelopes``.

    Args:
        envelopes: Envelopes to send; must not be empty.
        return_exceptions: Forwarded to ``asyncio.gather``. When True, a
            failed send contributes its exception object to the result list
            instead of raising. When False (default), the first exception
            propagates to the caller and no completion log entry is written.

    Returns:
        The list produced by ``asyncio.gather``: one response envelope per
        input envelope, or the exception for that position when
        ``return_exceptions`` is True and the send failed.

    Raises:
        ValueError: If ``envelopes`` is empty.
        ASAPConnectionError: If the client is not connected; also whatever
            ``send`` raises when ``return_exceptions`` is False.

    Example:
        >>> async with ASAPClient("http://localhost:8000") as client:
        ...     results = await client.send_batch(envelopes, return_exceptions=True)
        ...     for i, result in enumerate(results):
        ...         if isinstance(result, BaseException):
        ...             print(f"Envelope {i} failed: {result}")
        ...         else:
        ...             print(f"Envelope {i} succeeded: {result.id}")
    """
    if not envelopes:
        raise ValueError("envelopes list cannot be empty")

    if not self._client:
        raise ASAPConnectionError(
            "Client not connected. Use 'async with' context.",
            url=sanitize_url(self.base_url),
        )

    total = len(envelopes)
    target = sanitize_url(self.base_url)
    logger.info(
        "asap.client.send_batch",
        target_url=target,
        batch_size=total,
        message=f"Sending batch of {total} envelopes to {target}",
    )

    started = time.perf_counter()

    # All coroutines are created up front (argument unpacking consumes the
    # generator) and then run concurrently by gather.
    outcomes = await asyncio.gather(
        *(self.send(item) for item in envelopes),
        return_exceptions=return_exceptions,
    )

    elapsed_ms = (time.perf_counter() - started) * 1000

    # Without return_exceptions, reaching this point means every send
    # succeeded; otherwise count the positions that hold an Envelope.
    succeeded = total
    if return_exceptions:
        succeeded = sum(isinstance(outcome, Envelope) for outcome in outcomes)
    failed = total - succeeded

    if elapsed_ms > 0:
        throughput = round(total / (elapsed_ms / 1000), 2)
    else:
        throughput = 0

    logger.info(
        "asap.client.send_batch_complete",
        target_url=sanitize_url(self.base_url),
        batch_size=total,
        success_count=succeeded,
        failure_count=failed,
        duration_ms=round(elapsed_ms, 2),
        throughput_per_second=throughput,
        message=(
            f"Batch of {total} envelopes completed in {elapsed_ms:.2f}ms "
            f"({succeeded} succeeded, {failed} failed)"
        ),
    )

    return outcomes