dory-sdk 2.1.0-py3-none-any.whl → 2.1.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +32 -1
- dory/config/defaults.py +6 -0
- dory/config/schema.py +26 -0
- dory/edge/__init__.py +88 -0
- dory/edge/adaptive.py +648 -0
- dory/edge/detector.py +546 -0
- dory/edge/fencing.py +488 -0
- dory/edge/heartbeat.py +598 -0
- dory/edge/role.py +416 -0
- dory/health/server.py +283 -9
- dory/k8s/__init__.py +69 -0
- dory/k8s/labels.py +505 -0
- dory/migration/__init__.py +49 -0
- dory/migration/s3_store.py +656 -0
- dory/migration/state_manager.py +64 -6
- dory/migration/transfer.py +382 -0
- dory/migration/versioning.py +749 -0
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/METADATA +37 -32
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/RECORD +22 -15
- dory_sdk-2.1.4.dist-info/entry_points.txt +2 -0
- dory/sidecar/__init__.py +0 -6
- dory/sidecar/main.py +0 -75
- dory/sidecar/server.py +0 -329
- dory_sdk-2.1.0.dist-info/entry_points.txt +0 -3
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/WHEEL +0 -0
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/top_level.txt +0 -0
dory/edge/adaptive.py
ADDED
@@ -0,0 +1,648 @@
"""Adaptive behavior for edge vs cloud environments.

Provides location-aware processing that automatically adjusts behavior
based on whether the workload is running on edge or cloud nodes.
"""

import asyncio
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Callable, Coroutine

from dory.edge.detector import (
    WorkloadContext,
    WorkloadDetector,
    NodeType,
    get_workload_context,
)
from dory.edge.heartbeat import (
    HeartbeatConfig,
    HeartbeatManager,
    ConnectivityStatus,
)
from dory.edge.fencing import FencingConfig, FencingManager
from dory.edge.role import RoleManager, ProcessorRole

logger = logging.getLogger(__name__)


class OperationMode(Enum):
    """Operating mode based on location and connectivity."""

    EDGE_CONNECTED = "edge_connected"    # Edge node, orchestrator reachable
    EDGE_OFFLINE = "edge_offline"        # Edge node, orchestrator unreachable
    CLOUD_NORMAL = "cloud_normal"        # Cloud node, normal operation
    CLOUD_FAILOVER = "cloud_failover"    # Cloud node, handling edge failover
    DEGRADED = "degraded"                # Unknown state, conservative mode


@dataclass
class EdgeConfig:
    """Configuration for edge-specific behavior."""

    # Heartbeat settings for edge (more aggressive)
    edge_heartbeat_interval_sec: float = 5.0
    edge_heartbeat_timeout_sec: float = 10.0
    edge_max_missed_heartbeats: int = 3

    # Heartbeat settings for cloud (standard)
    cloud_heartbeat_interval_sec: float = 15.0
    cloud_heartbeat_timeout_sec: float = 30.0
    cloud_max_missed_heartbeats: int = 2

    # Offline buffer settings (edge only)
    offline_buffer_enabled: bool = True
    offline_buffer_max_size_mb: int = 100
    offline_buffer_flush_interval_sec: float = 30.0

    # State checkpoint settings
    edge_checkpoint_interval_sec: float = 10.0   # More frequent on edge
    cloud_checkpoint_interval_sec: float = 60.0  # Less frequent on cloud

    # Retry settings
    edge_max_retries: int = 5
    edge_retry_backoff_sec: float = 1.0
    cloud_max_retries: int = 3
    cloud_retry_backoff_sec: float = 2.0

    # Resource constraints
    edge_max_batch_size: int = 100    # Smaller batches on edge
    cloud_max_batch_size: int = 1000  # Larger batches on cloud
    edge_max_concurrent: int = 2      # Limited concurrency on edge
    cloud_max_concurrent: int = 10    # Higher concurrency on cloud


@dataclass
class AdaptiveConfig:
    """Full configuration for adaptive processor."""

    app_name: str
    processor_id: str
    orchestrator_url: str | None = None
    edge_config: EdgeConfig = field(default_factory=EdgeConfig)
    fencing_config: FencingConfig | None = None
    custom_detector: WorkloadDetector | None = None


class AdaptiveProcessor:
    """Processor that adapts behavior based on edge/cloud location.

    Automatically adjusts:
    - Heartbeat intervals and retry behavior
    - State checkpoint frequency
    - Batch sizes and concurrency
    - Offline buffering (edge only)
    - Failover handling

    Usage:
        processor = AdaptiveProcessor(AdaptiveConfig(
            app_name="my-app",
            processor_id="processor-1",
            orchestrator_url="http://orchestrator:8080",
        ))

        await processor.start()

        # Processor automatically adapts based on location
        if processor.is_edge:
            # Running on edge with edge-optimized settings
            pass

        if processor.is_offline:
            # Edge node offline, using local buffer
            pass
    """

    def __init__(self, config: AdaptiveConfig):
        self.config = config
        self._detector = config.custom_detector or WorkloadDetector()
        self._context: WorkloadContext | None = None
        self._mode = OperationMode.DEGRADED
        self._heartbeat: HeartbeatManager | None = None
        self._fencing: FencingManager | None = None
        self._role_manager: RoleManager | None = None
        self._offline_buffer: list[dict[str, Any]] = []
        self._state: dict[str, Any] = {}
        self._running = False
        self._checkpoint_task: asyncio.Task | None = None
        self._flush_task: asyncio.Task | None = None

        # Callbacks
        self._on_mode_change: Callable[[OperationMode, OperationMode], Coroutine] | None = None
        self._on_connectivity_change: Callable[[ConnectivityStatus], Coroutine] | None = None
        self._on_failover: Callable[[str], Coroutine] | None = None  # original_node
        self._on_failback: Callable[[], Coroutine] | None = None

    # =========================================================================
    # Properties
    # =========================================================================

    @property
    def context(self) -> WorkloadContext | None:
        """Get current workload context."""
        return self._context

    @property
    def mode(self) -> OperationMode:
        """Get current operation mode."""
        return self._mode

    @property
    def is_edge(self) -> bool:
        """Check if running on edge node."""
        return self._context.is_edge if self._context else False

    @property
    def is_cloud(self) -> bool:
        """Check if running on cloud node."""
        return not self.is_edge

    @property
    def is_migrated(self) -> bool:
        """Check if this is a migrated (failover) workload."""
        return self._context.is_migrated if self._context else False

    @property
    def is_offline(self) -> bool:
        """Check if currently offline (edge only)."""
        return self._mode == OperationMode.EDGE_OFFLINE

    @property
    def is_connected(self) -> bool:
        """Check if connected to orchestrator."""
        if self._heartbeat:
            return self._heartbeat.is_connected()
        return False

    @property
    def connectivity_status(self) -> ConnectivityStatus:
        """Get current connectivity status."""
        if self._heartbeat:
            return self._heartbeat.get_status()
        return ConnectivityStatus.UNKNOWN

    @property
    def role(self) -> ProcessorRole:
        """Get current processor role."""
        if self._role_manager:
            return self._role_manager.role
        return ProcessorRole.INITIALIZING

    # =========================================================================
    # Configuration Helpers
    # =========================================================================

    def get_heartbeat_config(self) -> HeartbeatConfig:
        """Get heartbeat config based on location."""
        ec = self.config.edge_config

        if self.is_edge:
            return HeartbeatConfig(
                interval_sec=ec.edge_heartbeat_interval_sec,
                timeout_sec=ec.edge_heartbeat_timeout_sec,
                missed_threshold=ec.edge_max_missed_heartbeats,
            )
        else:
            return HeartbeatConfig(
                interval_sec=ec.cloud_heartbeat_interval_sec,
                timeout_sec=ec.cloud_heartbeat_timeout_sec,
                missed_threshold=ec.cloud_max_missed_heartbeats,
            )

    def get_checkpoint_interval(self) -> float:
        """Get checkpoint interval based on location."""
        ec = self.config.edge_config
        return ec.edge_checkpoint_interval_sec if self.is_edge else ec.cloud_checkpoint_interval_sec

    def get_max_batch_size(self) -> int:
        """Get max batch size based on location."""
        ec = self.config.edge_config
        return ec.edge_max_batch_size if self.is_edge else ec.cloud_max_batch_size

    def get_max_concurrent(self) -> int:
        """Get max concurrency based on location."""
        ec = self.config.edge_config
        return ec.edge_max_concurrent if self.is_edge else ec.cloud_max_concurrent

    def get_retry_config(self) -> tuple[int, float]:
        """Get retry config (max_retries, backoff_sec) based on location."""
        ec = self.config.edge_config
        if self.is_edge:
            return ec.edge_max_retries, ec.edge_retry_backoff_sec
        return ec.cloud_max_retries, ec.cloud_retry_backoff_sec

    # =========================================================================
    # Lifecycle
    # =========================================================================

    async def start(self) -> OperationMode:
        """Start the adaptive processor.

        Returns:
            Initial operation mode
        """
        logger.info(f"Starting adaptive processor: {self.config.processor_id}")

        # Detect workload context
        self._context = self._detector.detect()
        logger.info(
            f"Workload context: node_type={self._context.node_type.value}, "
            f"is_edge={self._context.is_edge}, is_migrated={self._context.is_migrated}"
        )

        # Initialize fencing
        fencing_config = self.config.fencing_config or FencingConfig()
        self._fencing = FencingManager(
            app_name=self.config.app_name,
            config=fencing_config,
        )

        # Initialize role manager
        self._role_manager = RoleManager(
            processor_id=self.config.processor_id,
            fencing_manager=self._fencing,
        )

        # Initialize heartbeat if orchestrator URL provided
        if self.config.orchestrator_url:
            heartbeat_config = self.get_heartbeat_config()
            self._heartbeat = HeartbeatManager(
                processor_id=self.config.processor_id,
                orchestrator_url=self.config.orchestrator_url,
                config=heartbeat_config,
            )

            # Register connectivity callback
            if self._on_connectivity_change:
                # HeartbeatManager would need to support this
                pass

        # Handle migrated workload
        if self._context.is_migrated:
            logger.info(f"Migrated workload from: {self._context.original_node}")
            if self._on_failover:
                await self._on_failover(self._context.original_node or "unknown")

        # Start role manager (acquires fencing token)
        await self._role_manager.start()

        # Start heartbeat
        if self._heartbeat:
            await self._heartbeat.start()

        # Determine initial mode
        self._mode = self._determine_mode()
        logger.info(f"Initial operation mode: {self._mode.value}")

        # Start background tasks
        self._running = True
        self._checkpoint_task = asyncio.create_task(self._checkpoint_loop())

        if self.is_edge and self.config.edge_config.offline_buffer_enabled:
            self._flush_task = asyncio.create_task(self._flush_loop())

        return self._mode

    async def stop(self, reason: str = "shutdown") -> None:
        """Stop the adaptive processor.

        Args:
            reason: Reason for stopping
        """
        logger.info(f"Stopping adaptive processor: {reason}")
        self._running = False

        # Cancel background tasks
        if self._checkpoint_task:
            self._checkpoint_task.cancel()
            try:
                await self._checkpoint_task
            except asyncio.CancelledError:
                pass

        if self._flush_task:
            self._flush_task.cancel()
            try:
                await self._flush_task
            except asyncio.CancelledError:
                pass

        # Flush any remaining offline buffer
        if self._offline_buffer:
            await self._flush_offline_buffer()

        # Final state checkpoint
        await self._save_state()

        # Stop heartbeat
        if self._heartbeat:
            await self._heartbeat.stop()

        # Release fencing
        if self._role_manager:
            await self._role_manager.stop(reason)

        logger.info("Adaptive processor stopped")

    # =========================================================================
    # Mode Management
    # =========================================================================

    def _determine_mode(self) -> OperationMode:
        """Determine current operation mode."""
        if not self._context:
            return OperationMode.DEGRADED

        if self._context.is_edge:
            # Edge node
            if self.is_connected:
                return OperationMode.EDGE_CONNECTED
            else:
                return OperationMode.EDGE_OFFLINE
        else:
            # Cloud node
            if self._context.is_migrated:
                return OperationMode.CLOUD_FAILOVER
            else:
                return OperationMode.CLOUD_NORMAL

    async def _update_mode(self) -> None:
        """Update operation mode and trigger callbacks if changed."""
        old_mode = self._mode
        new_mode = self._determine_mode()

        if old_mode != new_mode:
            logger.info(f"Mode change: {old_mode.value} -> {new_mode.value}")
            self._mode = new_mode

            if self._on_mode_change:
                await self._on_mode_change(old_mode, new_mode)

            # Handle specific transitions
            if old_mode == OperationMode.EDGE_OFFLINE and new_mode == OperationMode.EDGE_CONNECTED:
                # Came back online - flush buffer
                await self._flush_offline_buffer()

    # =========================================================================
    # State Management
    # =========================================================================

    def get_state(self) -> dict[str, Any]:
        """Get current processor state."""
        return self._state.copy()

    def set_state(self, state: dict[str, Any]) -> None:
        """Set processor state."""
        self._state = state.copy()

    async def restore_state(self, state: dict[str, Any]) -> None:
        """Restore state (called during failover recovery)."""
        self._state = state.copy()
        logger.info(f"State restored: {len(state)} keys")

    async def _save_state(self) -> None:
        """Save state to appropriate storage."""
        if self.is_offline:
            # Buffer locally when offline
            self._buffer_state(self._state)
        else:
            # Save to remote storage
            await self._save_state_remote(self._state)

    async def _save_state_remote(self, state: dict[str, Any]) -> None:
        """Save state to remote storage (orchestrator/S3)."""
        # This would integrate with StateManager
        logger.debug(f"Saving state remotely: {len(state)} keys")

    def _buffer_state(self, state: dict[str, Any]) -> None:
        """Buffer state locally (edge offline mode)."""
        max_size = self.config.edge_config.offline_buffer_max_size_mb * 1024 * 1024
        # Simplified size check
        if len(self._offline_buffer) < 10000:  # Rough limit
            self._offline_buffer.append(state.copy())
            logger.debug(f"State buffered locally: {len(self._offline_buffer)} items")

    async def _flush_offline_buffer(self) -> None:
        """Flush offline buffer to remote storage."""
        if not self._offline_buffer:
            return

        logger.info(f"Flushing offline buffer: {len(self._offline_buffer)} items")

        # In real implementation, this would batch-upload to S3/orchestrator
        flushed = 0
        while self._offline_buffer and self.is_connected:
            state = self._offline_buffer.pop(0)
            try:
                await self._save_state_remote(state)
                flushed += 1
            except Exception as e:
                # Put back and retry later
                self._offline_buffer.insert(0, state)
                logger.warning(f"Buffer flush failed: {e}")
                break

        logger.info(f"Flushed {flushed} items from offline buffer")

    # =========================================================================
    # Background Tasks
    # =========================================================================

    async def _checkpoint_loop(self) -> None:
        """Periodic state checkpoint."""
        interval = self.get_checkpoint_interval()

        while self._running:
            try:
                await asyncio.sleep(interval)
                await self._save_state()
                await self._update_mode()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Checkpoint error: {e}")

    async def _flush_loop(self) -> None:
        """Periodic offline buffer flush (edge only)."""
        interval = self.config.edge_config.offline_buffer_flush_interval_sec

        while self._running:
            try:
                await asyncio.sleep(interval)
                if self.is_connected and self._offline_buffer:
                    await self._flush_offline_buffer()
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Flush error: {e}")

    # =========================================================================
    # Callbacks
    # =========================================================================

    def on_mode_change(
        self, callback: Callable[[OperationMode, OperationMode], Coroutine]
    ) -> None:
        """Register callback for mode changes."""
        self._on_mode_change = callback

    def on_connectivity_change(
        self, callback: Callable[[ConnectivityStatus], Coroutine]
    ) -> None:
        """Register callback for connectivity changes."""
        self._on_connectivity_change = callback

    def on_failover(self, callback: Callable[[str], Coroutine]) -> None:
        """Register callback for failover (receives original_node)."""
        self._on_failover = callback

    def on_failback(self, callback: Callable[[], Coroutine]) -> None:
        """Register callback for failback to edge."""
        self._on_failback = callback

    # =========================================================================
    # Processing Helpers
    # =========================================================================

    async def process_with_fencing(
        self,
        operation: Callable[[], Coroutine[Any, Any, Any]],
    ) -> Any:
        """Execute operation with fencing validation.

        Validates fencing token before and after operation to ensure
        we're still the primary processor.

        Args:
            operation: Async operation to execute

        Returns:
            Operation result

        Raises:
            FenceViolation: If fencing is violated
        """
        if not self._role_manager:
            raise RuntimeError("Processor not started")

        # Validate before
        if not await self._role_manager.validate_fencing_or_fence():
            from dory.edge.fencing import FenceViolation
            raise FenceViolation("Fencing validation failed before operation")

        # Execute operation
        result = await operation()

        # Validate after
        if not await self._role_manager.validate_fencing_or_fence():
            from dory.edge.fencing import FenceViolation
            raise FenceViolation("Fencing validation failed after operation")

        return result

    async def process_batch(
        self,
        items: list[Any],
        processor: Callable[[Any], Coroutine[Any, Any, Any]],
    ) -> list[Any]:
        """Process items in batches appropriate for current location.

        Automatically adjusts batch size and concurrency based on
        whether running on edge or cloud.

        Args:
            items: Items to process
            processor: Async function to process each item

        Returns:
            List of results
        """
        batch_size = self.get_max_batch_size()
        max_concurrent = self.get_max_concurrent()

        results = []
        semaphore = asyncio.Semaphore(max_concurrent)

        async def process_with_semaphore(item: Any) -> Any:
            async with semaphore:
                return await processor(item)

        # Process in batches
        for i in range(0, len(items), batch_size):
            batch = items[i:i + batch_size]
            batch_results = await asyncio.gather(
                *[process_with_semaphore(item) for item in batch],
                return_exceptions=True,
            )
            results.extend(batch_results)

            # Checkpoint after each batch
            if self.is_edge:
                await self._save_state()

        return results


# =============================================================================
# Convenience Functions
# =============================================================================

def create_adaptive_processor(
    app_name: str,
    processor_id: str,
    orchestrator_url: str | None = None,
    **kwargs: Any,
) -> AdaptiveProcessor:
    """Create an adaptive processor with sensible defaults.

    Args:
        app_name: Application name
        processor_id: Unique processor identifier
        orchestrator_url: URL of the orchestrator service
        **kwargs: Additional EdgeConfig parameters

    Returns:
        Configured AdaptiveProcessor
    """
    edge_config = EdgeConfig(**kwargs) if kwargs else EdgeConfig()

    config = AdaptiveConfig(
        app_name=app_name,
        processor_id=processor_id,
        orchestrator_url=orchestrator_url,
        edge_config=edge_config,
    )

    return AdaptiveProcessor(config)


def get_location_aware_settings() -> dict[str, Any]:
    """Get settings appropriate for current location.

    Returns:
        Dictionary of settings adjusted for edge or cloud
    """
    context = get_workload_context()
    config = EdgeConfig()

    if context.is_edge:
        return {
            "heartbeat_interval_sec": config.edge_heartbeat_interval_sec,
            "checkpoint_interval_sec": config.edge_checkpoint_interval_sec,
            "max_batch_size": config.edge_max_batch_size,
            "max_concurrent": config.edge_max_concurrent,
            "max_retries": config.edge_max_retries,
            "retry_backoff_sec": config.edge_retry_backoff_sec,
            "offline_buffer_enabled": config.offline_buffer_enabled,
        }
    else:
        return {
            "heartbeat_interval_sec": config.cloud_heartbeat_interval_sec,
            "checkpoint_interval_sec": config.cloud_checkpoint_interval_sec,
            "max_batch_size": config.cloud_max_batch_size,
            "max_concurrent": config.cloud_max_concurrent,
            "max_retries": config.cloud_max_retries,
            "retry_backoff_sec": config.cloud_retry_backoff_sec,
            "offline_buffer_enabled": False,
        }
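
Usage note (not part of the published diff): the sketch below shows one way the APIs added in dory/edge/adaptive.py could be wired together, based only on the signatures visible above. The orchestrator URL and the double() helper are hypothetical placeholders.

import asyncio

from dory.edge.adaptive import OperationMode, create_adaptive_processor


async def handle_mode_change(old: OperationMode, new: OperationMode) -> None:
    # Placeholder reaction to transitions such as EDGE_CONNECTED -> EDGE_OFFLINE.
    print(f"mode changed: {old.value} -> {new.value}")


async def double(item: int) -> int:
    # Hypothetical per-item work; stands in for real processing.
    return item * 2


async def main() -> None:
    processor = create_adaptive_processor(
        app_name="my-app",
        processor_id="processor-1",
        orchestrator_url="http://orchestrator:8080",  # placeholder endpoint
    )
    processor.on_mode_change(handle_mode_change)

    await processor.start()
    try:
        # Batch size and concurrency are chosen from EdgeConfig based on
        # whether the detector classifies the node as edge or cloud.
        results = await processor.process_batch(items=list(range(10)), processor=double)
        print(results)
    finally:
        await processor.stop()


if __name__ == "__main__":
    asyncio.run(main())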