dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,7 @@ from typing import Any, TYPE_CHECKING
14
14
  from dory.types import StateBackend
15
15
  from dory.migration.serialization import StateSerializer
16
16
  from dory.migration.configmap import ConfigMapStore
17
+ from dory.migration.s3_store import S3Store, S3Config
17
18
  from dory.utils.errors import DoryStateError
18
19
 
19
20
  if TYPE_CHECKING:
@@ -55,6 +56,7 @@ class StateManager:
55
56
  self._config = config
56
57
  self._serializer = StateSerializer()
57
58
  self._configmap_store: ConfigMapStore | None = None
59
+ self._s3_store: S3Store | None = None
58
60
 
59
61
  # Get namespace from environment
60
62
  self._namespace = os.environ.get("POD_NAMESPACE", "default")
@@ -285,17 +287,73 @@ class StateManager:
285
287
  raise DoryStateError(f"Failed to delete state from PVC {path}: {e}", cause=e)
286
288
 
287
289
  # =========================================================================
288
- # S3 Backend (placeholder - would need boto3)
290
+ # S3 Backend
289
291
  # =========================================================================
290
292
 
293
+ def _get_s3_store(self) -> S3Store:
294
+ """Get or create S3 store instance."""
295
+ if self._s3_store is None:
296
+ # Try to get S3 config from DoryConfig if available
297
+ s3_config = None
298
+ if self._config and hasattr(self._config, "s3_config"):
299
+ s3_config = self._config.s3_config
300
+
301
+ self._s3_store = S3Store(config=s3_config)
302
+
303
+ return self._s3_store
304
+
291
305
  async def _save_to_s3(self, processor_id: str, state_json: str) -> None:
292
- """Save state to S3."""
293
- raise DoryStateError("S3 backend not yet implemented")
306
+ """Save state to S3 with offline buffering support."""
307
+ store = self._get_s3_store()
308
+ await store.save(
309
+ processor_id,
310
+ state_json,
311
+ metadata={
312
+ "pod-name": self._pod_name,
313
+ "namespace": self._namespace,
314
+ },
315
+ )
294
316
 
295
317
  async def _load_from_s3(self, processor_id: str) -> str | None:
296
- """Load state from S3."""
297
- raise DoryStateError("S3 backend not yet implemented")
318
+ """Load state from S3 (falls back to local buffer if unavailable)."""
319
+ store = self._get_s3_store()
320
+ return await store.load(processor_id)
298
321
 
299
322
  async def _delete_from_s3(self, processor_id: str) -> bool:
300
323
  """Delete state from S3."""
301
- raise DoryStateError("S3 backend not yet implemented")
324
+ store = self._get_s3_store()
325
+ return await store.delete(processor_id)
326
+
327
+ async def sync_s3_buffer(self) -> int:
328
+ """
329
+ Sync locally buffered states to S3.
330
+
331
+ Call this periodically on edge nodes to upload states
332
+ that were buffered during connectivity issues.
333
+
334
+ Returns:
335
+ Number of states synced
336
+ """
337
+ if self._backend != StateBackend.S3:
338
+ return 0
339
+
340
+ store = self._get_s3_store()
341
+ return await store.sync_buffer()
342
+
343
+ async def start_s3_background_sync(self, interval_seconds: float = 60.0) -> None:
344
+ """
345
+ Start background S3 sync for edge nodes.
346
+
347
+ Args:
348
+ interval_seconds: Interval between sync attempts
349
+ """
350
+ if self._backend != StateBackend.S3:
351
+ return
352
+
353
+ store = self._get_s3_store()
354
+ await store.start_background_sync(interval_seconds)
355
+
356
+ async def stop_s3_background_sync(self) -> None:
357
+ """Stop background S3 sync."""
358
+ if self._s3_store:
359
+ await self._s3_store.stop_background_sync()
@@ -0,0 +1,382 @@
1
+ """
2
+ State transfer utilities with timeout and size validation.
3
+
4
+ Provides utilities for safe state capture and restore operations
5
+ that align with Orchestrator timeout expectations.
6
+ """
7
+
8
+ import asyncio
9
+ import functools
10
+ import logging
11
+ import time
12
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
13
+ from dataclasses import dataclass
14
+ from typing import Any, Callable, TypeVar
15
+
16
+ from dory.utils.errors import DoryStateError
17
+
18
logger = logging.getLogger(__name__)

# Type variable for generic return type
T = TypeVar("T")

# Orchestrator constants (from transfer.go)
ORCHESTRATOR_STATE_TIMEOUT_SEC = 30  # DefaultHTTPTimeout in transfer.go
ORCHESTRATOR_MAX_STATE_SIZE = 10 * 1024 * 1024  # MaxResponseBodySize in transfer.go (10MB)

# Default SDK limits (with safety margin)
DEFAULT_CAPTURE_TIMEOUT_SEC = 25  # 5s buffer before Orchestrator timeout
DEFAULT_RESTORE_TIMEOUT_SEC = 25
DEFAULT_MAX_STATE_SIZE = 8 * 1024 * 1024  # 8MB, 2MB buffer before Orchestrator limit
DEFAULT_SIZE_WARN_THRESHOLD = 0.75  # Warn at 75% of max


@dataclass
class TransferConfig:
    """Configuration for state transfer operations.

    Values that exceed the Orchestrator's hard limits are clamped (with a
    warning) in ``__post_init__`` so a transfer is never doomed from the
    start by a misconfigured SDK.
    """

    capture_timeout_sec: float = DEFAULT_CAPTURE_TIMEOUT_SEC
    restore_timeout_sec: float = DEFAULT_RESTORE_TIMEOUT_SEC
    max_size_bytes: int = DEFAULT_MAX_STATE_SIZE
    size_warn_threshold: float = DEFAULT_SIZE_WARN_THRESHOLD

    def __post_init__(self) -> None:
        """Validate configuration against Orchestrator limits."""
        if self.capture_timeout_sec >= ORCHESTRATOR_STATE_TIMEOUT_SEC:
            # BUG FIX: the last message segment was a plain string (missing
            # the f-prefix), so the placeholder was logged literally.
            logger.warning(
                f"state_capture_timeout_sec ({self.capture_timeout_sec}s) >= "
                f"Orchestrator timeout ({ORCHESTRATOR_STATE_TIMEOUT_SEC}s). "
                f"Reducing to {ORCHESTRATOR_STATE_TIMEOUT_SEC - 5}s."
            )
            self.capture_timeout_sec = ORCHESTRATOR_STATE_TIMEOUT_SEC - 5

        if self.restore_timeout_sec >= ORCHESTRATOR_STATE_TIMEOUT_SEC:
            logger.warning(
                f"state_restore_timeout_sec ({self.restore_timeout_sec}s) >= "
                f"Orchestrator timeout ({ORCHESTRATOR_STATE_TIMEOUT_SEC}s). "
                f"Reducing to {ORCHESTRATOR_STATE_TIMEOUT_SEC - 5}s."
            )
            self.restore_timeout_sec = ORCHESTRATOR_STATE_TIMEOUT_SEC - 5

        if self.max_size_bytes > ORCHESTRATOR_MAX_STATE_SIZE:
            logger.warning(
                f"state_max_size_bytes ({self.max_size_bytes}) > "
                f"Orchestrator limit ({ORCHESTRATOR_MAX_STATE_SIZE}). "
                f"Reducing to {ORCHESTRATOR_MAX_STATE_SIZE}."
            )
            self.max_size_bytes = ORCHESTRATOR_MAX_STATE_SIZE
+
69
+
70
@dataclass
class TransferMetrics:
    """Metrics from a state transfer operation."""

    # Wall-clock duration of the operation, in seconds.
    duration_sec: float
    # Serialized state size in bytes.
    size_bytes: int
    # size / max_size (0 when no size check was performed).
    size_ratio: float
    # True when the operation hit its timeout.
    timed_out: bool
    # True when size_bytes exceeded the configured maximum.
    size_exceeded: bool
+
80
+
81
class StateTransferError(DoryStateError):
    """Error during state transfer operation."""

    def __init__(
        self,
        message: str,
        metrics: TransferMetrics | None = None,
        cause: Exception | None = None,
    ):
        super().__init__(message, cause=cause)
        # Metrics captured at the point of failure, when available.
        self.metrics = metrics


class StateTransferTimeout(StateTransferError):
    """State transfer operation timed out."""


class StateSizeExceeded(StateTransferError):
    """State size exceeds configured maximum."""
102
+
103
+
104
def validate_state_size(
    state_json: str,
    max_size: int = DEFAULT_MAX_STATE_SIZE,
    warn_threshold: float = DEFAULT_SIZE_WARN_THRESHOLD,
) -> TransferMetrics:
    """
    Validate state size against limits.

    Args:
        state_json: Serialized state JSON string
        max_size: Maximum allowed size in bytes
        warn_threshold: Fraction of max_size that triggers a warning log

    Returns:
        TransferMetrics with size information

    Raises:
        StateSizeExceeded: If state exceeds max_size
    """
    # Size is measured on the UTF-8 encoding, matching what goes on the wire.
    encoded_size = len(state_json.encode("utf-8"))
    ratio = encoded_size / max_size if max_size > 0 else 0
    too_big = encoded_size > max_size

    metrics = TransferMetrics(
        duration_sec=0,
        size_bytes=encoded_size,
        size_ratio=ratio,
        timed_out=False,
        size_exceeded=too_big,
    )

    if too_big:
        raise StateSizeExceeded(
            f"State size ({encoded_size:,} bytes) exceeds maximum "
            f"({max_size:,} bytes). Orchestrator will reject this state.",
            metrics=metrics,
        )

    if ratio >= warn_threshold:
        logger.warning(
            f"State size ({encoded_size:,} bytes) is {ratio:.1%} of maximum "
            f"({max_size:,} bytes). Consider reducing state size to avoid "
            "transfer failures."
        )

    return metrics
149
+
150
+
151
def with_timeout(
    timeout_sec: float,
    operation_name: str = "operation",
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """
    Decorator to add timeout to synchronous functions.

    Runs the function in a single-worker thread pool executor and waits at
    most ``timeout_sec`` for the result.

    Args:
        timeout_sec: Timeout in seconds
        operation_name: Name for error messages

    Returns:
        Decorated function with timeout; the wrapper raises
        StateTransferTimeout when the call does not finish in time.
    """
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> T:
            start_time = time.monotonic()

            # BUG FIX: the previous implementation used the executor as a
            # context manager. Its __exit__ calls shutdown(wait=True), which
            # blocks until the still-running function returns — so on timeout
            # the caller hung for the full duration of the work anyway.
            # Shut down without waiting instead. NOTE: the worker thread
            # cannot be killed; the function keeps running in the background.
            executor = ThreadPoolExecutor(max_workers=1)
            future = executor.submit(func, *args, **kwargs)
            try:
                result = future.result(timeout=timeout_sec)
            except FuturesTimeoutError:
                duration = time.monotonic() - start_time
                metrics = TransferMetrics(
                    duration_sec=duration,
                    size_bytes=0,
                    size_ratio=0,
                    timed_out=True,
                    size_exceeded=False,
                )
                raise StateTransferTimeout(
                    f"{operation_name} timed out after {timeout_sec}s. "
                    "Consider reducing state size or optimizing get_state().",
                    metrics=metrics,
                )
            finally:
                executor.shutdown(wait=False, cancel_futures=True)

            duration = time.monotonic() - start_time
            # Log if operation took significant time (over half the budget)
            if duration > timeout_sec * 0.5:
                logger.warning(
                    f"{operation_name} took {duration:.2f}s "
                    f"({duration/timeout_sec:.1%} of {timeout_sec}s timeout)"
                )
            return result

        return wrapper
    return decorator
205
+
206
+
207
async def async_with_timeout(
    coro: Any,
    timeout_sec: float,
    operation_name: str = "operation",
) -> Any:
    """
    Execute coroutine with timeout.

    Args:
        coro: Coroutine to execute
        timeout_sec: Timeout in seconds
        operation_name: Name for error messages

    Returns:
        Result of the coroutine

    Raises:
        StateTransferTimeout: If operation times out
    """
    started = time.monotonic()

    try:
        result = await asyncio.wait_for(coro, timeout=timeout_sec)
    except asyncio.TimeoutError:
        elapsed = time.monotonic() - started
        raise StateTransferTimeout(
            f"{operation_name} timed out after {timeout_sec}s. "
            "Consider reducing state size or optimizing the operation.",
            metrics=TransferMetrics(
                duration_sec=elapsed,
                size_bytes=0,
                size_ratio=0,
                timed_out=True,
                size_exceeded=False,
            ),
        )

    elapsed = time.monotonic() - started
    # Warn when the coroutine consumed more than half of its budget.
    if elapsed > timeout_sec * 0.5:
        logger.warning(
            f"{operation_name} took {elapsed:.2f}s "
            f"({elapsed/timeout_sec:.1%} of {timeout_sec}s timeout)"
        )
    return result
255
+
256
+
257
class StateCaptureGuard:
    """
    Context manager for safe state capture with timeout and size validation.

    Usage:
        config = TransferConfig(capture_timeout_sec=25, max_size_bytes=8*1024*1024)

        with StateCaptureGuard(config) as guard:
            state = processor.get_state()
            state_json = json.dumps(state)
            guard.validate(state_json)
    """

    def __init__(self, config: TransferConfig | None = None):
        """
        Initialize capture guard.

        Args:
            config: Transfer configuration (defaults are used when omitted)
        """
        self._config = config or TransferConfig()
        self._start_time: float = 0
        self._metrics: TransferMetrics | None = None

    def __enter__(self) -> "StateCaptureGuard":
        # Mark the start of the capture window.
        self._start_time = time.monotonic()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> bool:
        elapsed = time.monotonic() - self._start_time
        if elapsed > self._config.capture_timeout_sec:
            logger.error(
                f"State capture took {elapsed:.2f}s, exceeding "
                f"{self._config.capture_timeout_sec}s timeout"
            )
        # Never suppress exceptions raised inside the with-block.
        return False

    def validate(self, state_json: str) -> TransferMetrics:
        """
        Validate captured state.

        Args:
            state_json: Serialized state JSON

        Returns:
            TransferMetrics with capture information

        Raises:
            StateSizeExceeded: If state exceeds max size
            StateTransferTimeout: If capture exceeded timeout
        """
        elapsed = time.monotonic() - self._start_time

        # The timeout check runs before the size check.
        if elapsed > self._config.capture_timeout_sec:
            self._metrics = TransferMetrics(
                duration_sec=elapsed,
                size_bytes=len(state_json.encode("utf-8")),
                size_ratio=0,
                timed_out=True,
                size_exceeded=False,
            )
            raise StateTransferTimeout(
                f"State capture took {elapsed:.2f}s, exceeding "
                f"{self._config.capture_timeout_sec}s timeout",
                metrics=self._metrics,
            )

        # Delegate size validation; raises StateSizeExceeded on violation.
        size_metrics = validate_state_size(
            state_json,
            max_size=self._config.max_size_bytes,
            warn_threshold=self._config.size_warn_threshold,
        )

        self._metrics = TransferMetrics(
            duration_sec=elapsed,
            size_bytes=size_metrics.size_bytes,
            size_ratio=size_metrics.size_ratio,
            timed_out=False,
            size_exceeded=size_metrics.size_exceeded,
        )
        return self._metrics

    @property
    def metrics(self) -> TransferMetrics | None:
        """Metrics from the most recent validate() call, if any."""
        return self._metrics
348
+
349
+
350
def log_transfer_summary(
    operation: str,
    metrics: TransferMetrics,
    config: TransferConfig,
) -> None:
    """
    Log a summary of the transfer operation.

    Args:
        operation: Operation name (e.g., "capture", "restore")
        metrics: Transfer metrics
        config: Transfer configuration
    """
    level = logging.INFO
    status = "completed"

    if metrics.timed_out:
        level = logging.ERROR
        status = "TIMED OUT"
    elif metrics.size_exceeded:
        level = logging.ERROR
        status = "SIZE EXCEEDED"
    elif metrics.size_ratio >= config.size_warn_threshold:
        level = logging.WARNING
        status = "completed (size warning)"

    # BUG FIX: the capture timeout was reported for every operation, so a
    # restore summary showed the wrong budget. Pick the matching timeout.
    timeout_sec = (
        config.restore_timeout_sec if operation == "restore" else config.capture_timeout_sec
    )

    logger.log(
        level,
        f"State {operation} {status}: "
        f"duration={metrics.duration_sec:.2f}s/{timeout_sec}s, "
        f"size={metrics.size_bytes:,}B/{config.max_size_bytes:,}B "
        f"({metrics.size_ratio:.1%})",
    )