dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dory/edge/adaptive.py ADDED
@@ -0,0 +1,648 @@
1
+ """Adaptive behavior for edge vs cloud environments.
2
+
3
+ Provides location-aware processing that automatically adjusts behavior
4
+ based on whether the workload is running on edge or cloud nodes.
5
+ """
6
+
7
+ import asyncio
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from enum import Enum
11
+ from typing import Any, Callable, Coroutine
12
+
13
+ from dory.edge.detector import (
14
+ WorkloadContext,
15
+ WorkloadDetector,
16
+ NodeType,
17
+ get_workload_context,
18
+ )
19
+ from dory.edge.heartbeat import (
20
+ HeartbeatConfig,
21
+ HeartbeatManager,
22
+ ConnectivityStatus,
23
+ )
24
+ from dory.edge.fencing import FencingConfig, FencingManager
25
+ from dory.edge.role import RoleManager, ProcessorRole
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class OperationMode(Enum):
    """Operating mode based on location and connectivity.

    AdaptiveProcessor derives the mode from the detected workload context:
    edge nodes are EDGE_CONNECTED or EDGE_OFFLINE depending on heartbeat
    connectivity, cloud nodes are CLOUD_FAILOVER when the workload is
    migrated and CLOUD_NORMAL otherwise, and DEGRADED is used while no
    context has been detected yet.
    """

    EDGE_CONNECTED = "edge_connected"  # Edge node, orchestrator reachable
    EDGE_OFFLINE = "edge_offline"  # Edge node, orchestrator unreachable
    CLOUD_NORMAL = "cloud_normal"  # Cloud node, normal operation
    CLOUD_FAILOVER = "cloud_failover"  # Cloud node, handling edge failover
    DEGRADED = "degraded"  # Unknown state, conservative mode
38
+
39
+
40
@dataclass
class EdgeConfig:
    """Configuration for edge-specific behavior.

    Despite the name, this holds paired ``edge_*`` / ``cloud_*`` values;
    AdaptiveProcessor picks one of each pair depending on whether the
    detected workload context reports an edge node.
    """

    # Heartbeat settings for edge (more aggressive).
    # Passed to HeartbeatConfig as interval_sec / timeout_sec / missed_threshold.
    edge_heartbeat_interval_sec: float = 5.0
    edge_heartbeat_timeout_sec: float = 10.0
    edge_max_missed_heartbeats: int = 3

    # Heartbeat settings for cloud (standard)
    cloud_heartbeat_interval_sec: float = 15.0
    cloud_heartbeat_timeout_sec: float = 30.0
    cloud_max_missed_heartbeats: int = 2

    # Offline buffer settings (edge only)
    # offline_buffer_enabled gates the periodic flush task on edge nodes.
    offline_buffer_enabled: bool = True
    # NOTE(review): the size in MB is not enforced by AdaptiveProcessor's
    # buffering in this file, which uses a fixed item-count limit instead.
    offline_buffer_max_size_mb: int = 100
    offline_buffer_flush_interval_sec: float = 30.0

    # State checkpoint settings
    edge_checkpoint_interval_sec: float = 10.0  # More frequent on edge
    cloud_checkpoint_interval_sec: float = 60.0  # Less frequent on cloud

    # Retry settings (exposed via AdaptiveProcessor.get_retry_config())
    edge_max_retries: int = 5
    edge_retry_backoff_sec: float = 1.0
    cloud_max_retries: int = 3
    cloud_retry_backoff_sec: float = 2.0

    # Resource constraints
    edge_max_batch_size: int = 100  # Smaller batches on edge
    cloud_max_batch_size: int = 1000  # Larger batches on cloud
    edge_max_concurrent: int = 2  # Limited concurrency on edge
    cloud_max_concurrent: int = 10  # Higher concurrency on cloud
74
+
75
+
76
@dataclass
class AdaptiveConfig:
    """Full configuration for adaptive processor."""

    # Application name; handed to the FencingManager.
    app_name: str
    # Identifier of this processor; handed to the role and heartbeat managers.
    processor_id: str
    # Heartbeating is only set up when a URL is provided.
    orchestrator_url: str | None = None
    # Edge/cloud tuning values; a fresh EdgeConfig per instance by default.
    edge_config: EdgeConfig = field(default_factory=EdgeConfig)
    # When None, AdaptiveProcessor.start() falls back to FencingConfig().
    fencing_config: FencingConfig | None = None
    # When None, AdaptiveProcessor creates its own WorkloadDetector().
    custom_detector: WorkloadDetector | None = None
86
+
87
+
88
class AdaptiveProcessor:
    """Processor that adapts behavior based on edge/cloud location.

    Automatically adjusts:
    - Heartbeat intervals and retry behavior
    - State checkpoint frequency
    - Batch sizes and concurrency
    - Offline buffering (edge only)
    - Failover handling

    The failover callback is invoked from ``start()``, so register callbacks
    before starting the processor. Callbacks registered with
    ``on_connectivity_change`` and ``on_failback`` are stored but are not
    invoked anywhere in this class yet.

    Usage:
        processor = AdaptiveProcessor(AdaptiveConfig(
            app_name="my-app",
            processor_id="processor-1",
            orchestrator_url="http://orchestrator:8080",
        ))

        await processor.start()

        # Processor automatically adapts based on location
        if processor.is_edge:
            # Running on edge with edge-optimized settings
            pass

        if processor.is_offline:
            # Edge node offline, using local buffer
            pass
    """

    # Rough cap on the number of state snapshots kept while offline.
    # NOTE: EdgeConfig.offline_buffer_max_size_mb is not enforced yet; doing
    # so needs a way to measure the serialized size of a snapshot.
    _MAX_BUFFERED_ITEMS = 10000

    def __init__(self, config: AdaptiveConfig):
        """Create a processor; nothing is detected or started until ``start()``.

        Args:
            config: Processor configuration. ``config.custom_detector`` is
                used for workload detection when provided.
        """
        self.config = config
        self._detector = config.custom_detector or WorkloadDetector()
        self._context: WorkloadContext | None = None
        self._mode = OperationMode.DEGRADED
        self._heartbeat: HeartbeatManager | None = None
        self._fencing: FencingManager | None = None
        self._role_manager: RoleManager | None = None
        self._offline_buffer: list[dict[str, Any]] = []
        self._state: dict[str, Any] = {}
        self._running = False
        self._checkpoint_task: asyncio.Task | None = None
        self._flush_task: asyncio.Task | None = None

        # Callbacks
        self._on_mode_change: Callable[[OperationMode, OperationMode], Coroutine] | None = None
        self._on_connectivity_change: Callable[[ConnectivityStatus], Coroutine] | None = None
        self._on_failover: Callable[[str], Coroutine] | None = None  # original_node
        self._on_failback: Callable[[], Coroutine] | None = None

    # =========================================================================
    # Properties
    # =========================================================================

    @property
    def context(self) -> WorkloadContext | None:
        """Get current workload context (None until ``start()`` has run)."""
        return self._context

    @property
    def mode(self) -> OperationMode:
        """Get current operation mode."""
        return self._mode

    @property
    def is_edge(self) -> bool:
        """Check if running on edge node (False while the context is unknown)."""
        return self._context.is_edge if self._context else False

    @property
    def is_cloud(self) -> bool:
        """Check if running on cloud node.

        Defined as ``not is_edge``, so this is also True before ``start()``
        has detected a context.
        """
        return not self.is_edge

    @property
    def is_migrated(self) -> bool:
        """Check if this is a migrated (failover) workload."""
        return self._context.is_migrated if self._context else False

    @property
    def is_offline(self) -> bool:
        """Check if currently offline (edge only)."""
        return self._mode == OperationMode.EDGE_OFFLINE

    @property
    def is_connected(self) -> bool:
        """Check if connected to orchestrator.

        Always False when no heartbeat manager exists, i.e. before
        ``start()`` or when no orchestrator URL was configured.
        """
        if self._heartbeat:
            return self._heartbeat.is_connected()
        return False

    @property
    def connectivity_status(self) -> ConnectivityStatus:
        """Get current connectivity status (UNKNOWN without a heartbeat)."""
        if self._heartbeat:
            return self._heartbeat.get_status()
        return ConnectivityStatus.UNKNOWN

    @property
    def role(self) -> ProcessorRole:
        """Get current processor role (INITIALIZING until started)."""
        if self._role_manager:
            return self._role_manager.role
        return ProcessorRole.INITIALIZING

    # =========================================================================
    # Configuration Helpers
    # =========================================================================

    def get_heartbeat_config(self) -> HeartbeatConfig:
        """Get heartbeat config based on location."""
        ec = self.config.edge_config

        if self.is_edge:
            return HeartbeatConfig(
                interval_sec=ec.edge_heartbeat_interval_sec,
                timeout_sec=ec.edge_heartbeat_timeout_sec,
                missed_threshold=ec.edge_max_missed_heartbeats,
            )
        return HeartbeatConfig(
            interval_sec=ec.cloud_heartbeat_interval_sec,
            timeout_sec=ec.cloud_heartbeat_timeout_sec,
            missed_threshold=ec.cloud_max_missed_heartbeats,
        )

    def get_checkpoint_interval(self) -> float:
        """Get checkpoint interval based on location."""
        ec = self.config.edge_config
        return ec.edge_checkpoint_interval_sec if self.is_edge else ec.cloud_checkpoint_interval_sec

    def get_max_batch_size(self) -> int:
        """Get max batch size based on location."""
        ec = self.config.edge_config
        return ec.edge_max_batch_size if self.is_edge else ec.cloud_max_batch_size

    def get_max_concurrent(self) -> int:
        """Get max concurrency based on location."""
        ec = self.config.edge_config
        return ec.edge_max_concurrent if self.is_edge else ec.cloud_max_concurrent

    def get_retry_config(self) -> tuple[int, float]:
        """Get retry config (max_retries, backoff_sec) based on location."""
        ec = self.config.edge_config
        if self.is_edge:
            return ec.edge_max_retries, ec.edge_retry_backoff_sec
        return ec.cloud_max_retries, ec.cloud_retry_backoff_sec

    # =========================================================================
    # Lifecycle
    # =========================================================================

    async def start(self) -> OperationMode:
        """Start the adaptive processor.

        Detects the workload context, creates the fencing, role and
        (optionally) heartbeat managers, fires the failover callback for a
        migrated workload, and launches the background tasks.

        Calling ``start()`` on a processor that is already running is a
        no-op that returns the current mode; starting again would otherwise
        orphan the existing background tasks and managers.

        Returns:
            Initial operation mode
        """
        if self._running:
            logger.warning("Adaptive processor already started; ignoring start()")
            return self._mode

        logger.info(f"Starting adaptive processor: {self.config.processor_id}")

        # Detect workload context
        self._context = self._detector.detect()
        logger.info(
            f"Workload context: node_type={self._context.node_type.value}, "
            f"is_edge={self._context.is_edge}, is_migrated={self._context.is_migrated}"
        )

        # Initialize fencing
        fencing_config = self.config.fencing_config or FencingConfig()
        self._fencing = FencingManager(
            app_name=self.config.app_name,
            config=fencing_config,
        )

        # Initialize role manager
        self._role_manager = RoleManager(
            processor_id=self.config.processor_id,
            fencing_manager=self._fencing,
        )

        # Initialize heartbeat if orchestrator URL provided
        if self.config.orchestrator_url:
            self._heartbeat = HeartbeatManager(
                processor_id=self.config.processor_id,
                orchestrator_url=self.config.orchestrator_url,
                config=self.get_heartbeat_config(),
            )
            # NOTE: the connectivity-change callback is not wired up here;
            # HeartbeatManager would need to support registering it.

        # Handle migrated workload
        if self._context.is_migrated:
            logger.info(f"Migrated workload from: {self._context.original_node}")
            if self._on_failover:
                await self._on_failover(self._context.original_node or "unknown")

        # Start role manager (acquires fencing token)
        await self._role_manager.start()

        # Start heartbeat
        if self._heartbeat:
            await self._heartbeat.start()

        # Determine initial mode
        self._mode = self._determine_mode()
        logger.info(f"Initial operation mode: {self._mode.value}")

        # Start background tasks
        self._running = True
        self._checkpoint_task = asyncio.create_task(self._checkpoint_loop())

        if self.is_edge and self.config.edge_config.offline_buffer_enabled:
            self._flush_task = asyncio.create_task(self._flush_loop())

        return self._mode

    async def stop(self, reason: str = "shutdown") -> None:
        """Stop the adaptive processor.

        Cancels the background tasks, attempts a final buffer flush and
        state checkpoint, then stops the heartbeat and the role manager.
        The heartbeat and role manager are stopped even if the final flush
        or checkpoint raises; the exception still propagates afterwards.

        Args:
            reason: Reason for stopping
        """
        logger.info(f"Stopping adaptive processor: {reason}")
        self._running = False

        # Cancel background tasks and drop the references so a repeated
        # stop() does not touch finished tasks again.
        await self._cancel_task(self._checkpoint_task)
        self._checkpoint_task = None
        await self._cancel_task(self._flush_task)
        self._flush_task = None

        try:
            # Flush any remaining offline buffer. The heartbeat is still
            # running here because flushing depends on is_connected.
            if self._offline_buffer:
                await self._flush_offline_buffer()

            # Final state checkpoint
            await self._save_state()

            if self._offline_buffer:
                logger.warning(
                    f"Stopping with {len(self._offline_buffer)} unflushed buffered items"
                )
        finally:
            try:
                # Stop heartbeat
                if self._heartbeat:
                    await self._heartbeat.stop()
            finally:
                # Release fencing
                if self._role_manager:
                    await self._role_manager.stop(reason)

        logger.info("Adaptive processor stopped")

    @staticmethod
    async def _cancel_task(task: asyncio.Task | None) -> None:
        """Cancel a background task and wait for it to finish."""
        if task is None:
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    # =========================================================================
    # Mode Management
    # =========================================================================

    def _determine_mode(self) -> OperationMode:
        """Determine current operation mode from context and connectivity."""
        if not self._context:
            return OperationMode.DEGRADED

        if self._context.is_edge:
            # Edge node: mode follows orchestrator reachability
            if self.is_connected:
                return OperationMode.EDGE_CONNECTED
            return OperationMode.EDGE_OFFLINE

        # Cloud node: a migrated workload means we are covering for an edge node
        if self._context.is_migrated:
            return OperationMode.CLOUD_FAILOVER
        return OperationMode.CLOUD_NORMAL

    async def _update_mode(self) -> None:
        """Update operation mode and trigger callbacks if changed."""
        old_mode = self._mode
        new_mode = self._determine_mode()

        if old_mode == new_mode:
            return

        logger.info(f"Mode change: {old_mode.value} -> {new_mode.value}")
        self._mode = new_mode

        if self._on_mode_change:
            await self._on_mode_change(old_mode, new_mode)

        # Handle specific transitions
        if old_mode == OperationMode.EDGE_OFFLINE and new_mode == OperationMode.EDGE_CONNECTED:
            # Came back online - flush buffer
            await self._flush_offline_buffer()

    # =========================================================================
    # State Management
    # =========================================================================

    def get_state(self) -> dict[str, Any]:
        """Get a shallow copy of the current processor state."""
        return self._state.copy()

    def set_state(self, state: dict[str, Any]) -> None:
        """Set processor state (stores a shallow copy)."""
        self._state = state.copy()

    async def restore_state(self, state: dict[str, Any]) -> None:
        """Restore state (called during failover recovery)."""
        self._state = state.copy()
        logger.info(f"State restored: {len(state)} keys")

    async def _save_state(self) -> None:
        """Save state to appropriate storage."""
        if self.is_offline:
            # Buffer locally when offline
            self._buffer_state(self._state)
        else:
            # Save to remote storage
            await self._save_state_remote(self._state)

    async def _save_state_remote(self, state: dict[str, Any]) -> None:
        """Save state to remote storage (orchestrator/S3).

        Placeholder: currently only logs. This would integrate with
        StateManager.
        """
        logger.debug(f"Saving state remotely: {len(state)} keys")

    def _buffer_state(self, state: dict[str, Any]) -> None:
        """Buffer state locally (edge offline mode).

        A shallow copy of ``state`` is appended unless the buffer already
        holds ``_MAX_BUFFERED_ITEMS`` snapshots, in which case the snapshot
        is dropped and a warning is logged.
        """
        if len(self._offline_buffer) >= self._MAX_BUFFERED_ITEMS:
            logger.warning(
                f"Offline buffer full ({len(self._offline_buffer)} items); "
                "dropping state snapshot"
            )
            return

        self._offline_buffer.append(state.copy())
        logger.debug(f"State buffered locally: {len(self._offline_buffer)} items")

    async def _flush_offline_buffer(self) -> None:
        """Flush offline buffer to remote storage.

        Items are sent oldest first while the processor stays connected.
        On the first failure the item is put back at the front and the
        flush stops, leaving the rest for a later attempt.
        """
        if not self._offline_buffer:
            return

        logger.info(f"Flushing offline buffer: {len(self._offline_buffer)} items")

        # In real implementation, this would batch-upload to S3/orchestrator
        flushed = 0
        while self._offline_buffer and self.is_connected:
            # Take the item out before awaiting: both the flush loop and a
            # mode change can start a flush, and once the remote save does
            # real I/O they must not pick up the same snapshot.
            state = self._offline_buffer.pop(0)
            try:
                await self._save_state_remote(state)
                flushed += 1
            except Exception as e:
                # Put back and retry later
                self._offline_buffer.insert(0, state)
                logger.warning(f"Buffer flush failed: {e}")
                break

        logger.info(f"Flushed {flushed} items from offline buffer")

    # =========================================================================
    # Background Tasks
    # =========================================================================

    async def _checkpoint_loop(self) -> None:
        """Periodic state checkpoint followed by a mode re-evaluation.

        The interval is read once when the loop starts. Errors are logged
        and the loop keeps running; cancellation ends it.
        """
        interval = self.get_checkpoint_interval()

        while self._running:
            try:
                await asyncio.sleep(interval)
                await self._save_state()
                await self._update_mode()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Checkpoint error: {e}")

    async def _flush_loop(self) -> None:
        """Periodic offline buffer flush (edge only)."""
        interval = self.config.edge_config.offline_buffer_flush_interval_sec

        while self._running:
            try:
                await asyncio.sleep(interval)
                if self.is_connected and self._offline_buffer:
                    await self._flush_offline_buffer()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Flush error: {e}")

    # =========================================================================
    # Callbacks
    # =========================================================================

    def on_mode_change(
        self, callback: Callable[[OperationMode, OperationMode], Coroutine]
    ) -> None:
        """Register callback for mode changes (receives old_mode, new_mode)."""
        self._on_mode_change = callback

    def on_connectivity_change(
        self, callback: Callable[[ConnectivityStatus], Coroutine]
    ) -> None:
        """Register callback for connectivity changes."""
        self._on_connectivity_change = callback

    def on_failover(self, callback: Callable[[str], Coroutine]) -> None:
        """Register callback for failover (receives original_node)."""
        self._on_failover = callback

    def on_failback(self, callback: Callable[[], Coroutine]) -> None:
        """Register callback for failback to edge."""
        self._on_failback = callback

    # =========================================================================
    # Processing Helpers
    # =========================================================================

    @staticmethod
    def _fence_violation(message: str) -> Exception:
        """Build a FenceViolation, importing it only when one is needed."""
        from dory.edge.fencing import FenceViolation

        return FenceViolation(message)

    async def process_with_fencing(
        self,
        operation: Callable[[], Coroutine[Any, Any, Any]],
    ) -> Any:
        """Execute operation with fencing validation.

        Validates fencing token before and after operation to ensure
        we're still the primary processor.

        Args:
            operation: Async operation to execute

        Returns:
            Operation result

        Raises:
            RuntimeError: If the processor has not been started
            FenceViolation: If fencing is violated
        """
        if not self._role_manager:
            raise RuntimeError("Processor not started")

        # Validate before
        if not await self._role_manager.validate_fencing_or_fence():
            raise self._fence_violation("Fencing validation failed before operation")

        # Execute operation
        result = await operation()

        # Validate after; the result is discarded if fencing was lost meanwhile
        if not await self._role_manager.validate_fencing_or_fence():
            raise self._fence_violation("Fencing validation failed after operation")

        return result

    async def process_batch(
        self,
        items: list[Any],
        processor: Callable[[Any], Coroutine[Any, Any, Any]],
    ) -> list[Any]:
        """Process items in batches appropriate for current location.

        Automatically adjusts batch size and concurrency based on
        whether running on edge or cloud.

        Args:
            items: Items to process
            processor: Async function to process each item

        Returns:
            List of results in input order. An item whose processing raised
            contributes the exception object instead of a result.
        """
        # Clamp to 1: a batch size of 0 would make range() raise ValueError
        # and a concurrency of 0 would create a semaphore nobody can acquire.
        batch_size = max(1, self.get_max_batch_size())
        max_concurrent = max(1, self.get_max_concurrent())

        results: list[Any] = []
        semaphore = asyncio.Semaphore(max_concurrent)

        async def process_with_semaphore(item: Any) -> Any:
            async with semaphore:
                return await processor(item)

        # Process in batches
        for i in range(0, len(items), batch_size):
            batch = items[i:i + batch_size]
            batch_results = await asyncio.gather(
                *[process_with_semaphore(item) for item in batch],
                return_exceptions=True,
            )
            results.extend(batch_results)

            # Checkpoint after each batch
            if self.is_edge:
                await self._save_state()

        return results
585
+
586
+
587
+ # =============================================================================
588
+ # Convenience Functions
589
+ # =============================================================================
590
+
591
def create_adaptive_processor(
    app_name: str,
    processor_id: str,
    orchestrator_url: str | None = None,
    **kwargs: Any,
) -> AdaptiveProcessor:
    """Build an AdaptiveProcessor from a few basic settings.

    Args:
        app_name: Application name
        processor_id: Unique processor identifier
        orchestrator_url: URL of the orchestrator service
        **kwargs: EdgeConfig field overrides; fields not given keep their
            defaults

    Returns:
        AdaptiveProcessor wrapping the assembled configuration
    """
    # With no overrides this is simply EdgeConfig() with all defaults.
    overrides = EdgeConfig(**kwargs)

    return AdaptiveProcessor(
        AdaptiveConfig(
            app_name=app_name,
            processor_id=processor_id,
            orchestrator_url=orchestrator_url,
            edge_config=overrides,
        )
    )
618
+
619
+
620
def get_location_aware_settings() -> dict[str, Any]:
    """Return default settings matching where the workload is running.

    The location comes from ``get_workload_context()``; the values come
    from a default-constructed EdgeConfig.

    Returns:
        Dictionary of settings using the edge values on an edge node and
        the cloud values otherwise. Offline buffering is always reported
        as disabled off the edge.
    """
    on_edge = get_workload_context().is_edge
    defaults = EdgeConfig()

    return {
        "heartbeat_interval_sec": (
            defaults.edge_heartbeat_interval_sec
            if on_edge
            else defaults.cloud_heartbeat_interval_sec
        ),
        "checkpoint_interval_sec": (
            defaults.edge_checkpoint_interval_sec
            if on_edge
            else defaults.cloud_checkpoint_interval_sec
        ),
        "max_batch_size": (
            defaults.edge_max_batch_size if on_edge else defaults.cloud_max_batch_size
        ),
        "max_concurrent": (
            defaults.edge_max_concurrent if on_edge else defaults.cloud_max_concurrent
        ),
        "max_retries": (
            defaults.edge_max_retries if on_edge else defaults.cloud_max_retries
        ),
        "retry_backoff_sec": (
            defaults.edge_retry_backoff_sec
            if on_edge
            else defaults.cloud_retry_backoff_sec
        ),
        "offline_buffer_enabled": defaults.offline_buffer_enabled if on_edge else False,
    }