dory-sdk 2.1.0__py3-none-any.whl → 2.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dory/__init__.py +32 -1
- dory/config/defaults.py +6 -0
- dory/config/schema.py +26 -0
- dory/edge/__init__.py +88 -0
- dory/edge/adaptive.py +648 -0
- dory/edge/detector.py +546 -0
- dory/edge/fencing.py +488 -0
- dory/edge/heartbeat.py +598 -0
- dory/edge/role.py +416 -0
- dory/health/server.py +283 -9
- dory/k8s/__init__.py +69 -0
- dory/k8s/labels.py +505 -0
- dory/migration/__init__.py +49 -0
- dory/migration/s3_store.py +656 -0
- dory/migration/state_manager.py +64 -6
- dory/migration/transfer.py +382 -0
- dory/migration/versioning.py +749 -0
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/METADATA +37 -32
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/RECORD +22 -15
- dory_sdk-2.1.4.dist-info/entry_points.txt +2 -0
- dory/sidecar/__init__.py +0 -6
- dory/sidecar/main.py +0 -75
- dory/sidecar/server.py +0 -329
- dory_sdk-2.1.0.dist-info/entry_points.txt +0 -3
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/WHEEL +0 -0
- {dory_sdk-2.1.0.dist-info → dory_sdk-2.1.4.dist-info}/top_level.txt +0 -0
dory/migration/s3_store.py
@@ -0,0 +1,656 @@
+"""
+S3 storage backend for state persistence.
+
+Provides S3-based state storage with local buffering for edge nodes
+that may have intermittent connectivity.
+
+Features:
+- S3 upload/download with retry logic
+- Local SQLite buffer for offline scenarios
+- Multiple credential options (IAM role, env vars, STS)
+- Automatic sync when connectivity is restored
+"""
+
+import asyncio
+import json
+import logging
+import os
+import sqlite3
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from dory.utils.errors import DoryStateError
+
+logger = logging.getLogger(__name__)
+
+# Optional boto3 import - gracefully handle if not available
+try:
+    import boto3
+    from botocore.exceptions import ClientError, NoCredentialsError, BotoCoreError
+    BOTO3_AVAILABLE = True
+except ImportError:
+    BOTO3_AVAILABLE = False
+    boto3 = None
+    ClientError = Exception
+    NoCredentialsError = Exception
+    BotoCoreError = Exception
+
+
+@dataclass
+class S3Config:
+    """Configuration for S3 state backend."""
+
+    bucket: str
+    prefix: str = "dory-state"
+    region: str | None = None
+    endpoint_url: str | None = None  # For S3-compatible services (MinIO, LocalStack)
+
+    # Credential options
+    access_key_id: str | None = None
+    secret_access_key: str | None = None
+    session_token: str | None = None  # For STS temporary credentials
+    role_arn: str | None = None  # For assuming a role
+    credential_broker_url: str | None = None  # For edge credential broker
+
+    # Offline buffering
+    enable_offline_buffer: bool = True
+    buffer_path: str = "/data/dory-state-buffer.db"
+    buffer_path_fallback: str = "/tmp/dory-state-buffer.db"
+    max_buffer_age_seconds: int = 86400  # 24 hours
+
+    # Retry settings
+    max_retries: int = 3
+    retry_delay_seconds: float = 1.0
+    retry_backoff_multiplier: float = 2.0
+
+    @classmethod
+    def from_env(cls) -> "S3Config":
+        """Create config from environment variables."""
+        bucket = os.environ.get("DORY_S3_BUCKET")
+        if not bucket:
+            raise DoryStateError(
+                "DORY_S3_BUCKET environment variable is required for S3 backend"
+            )
+
+        return cls(
+            bucket=bucket,
+            prefix=os.environ.get("DORY_S3_PREFIX", "dory-state"),
+            region=os.environ.get("DORY_S3_REGION", os.environ.get("AWS_REGION")),
+            endpoint_url=os.environ.get("DORY_S3_ENDPOINT_URL"),
+            access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
+            secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
+            session_token=os.environ.get("AWS_SESSION_TOKEN"),
+            role_arn=os.environ.get("DORY_S3_ROLE_ARN"),
+            credential_broker_url=os.environ.get("DORY_CREDENTIAL_BROKER_URL"),
+            enable_offline_buffer=os.environ.get(
+                "DORY_S3_ENABLE_OFFLINE_BUFFER", "true"
+            ).lower() == "true",
+            buffer_path=os.environ.get(
+                "DORY_S3_BUFFER_PATH", "/data/dory-state-buffer.db"
+            ),
+        )
+
+
+class OfflineBuffer:
+    """
+    SQLite-based local buffer for offline state persistence.
+
+    Stores state locally when S3 is unreachable and syncs when
+    connectivity is restored.
+    """
+
+    def __init__(self, db_path: str, fallback_path: str):
+        """
+        Initialize offline buffer.
+
+        Args:
+            db_path: Primary path for SQLite database
+            fallback_path: Fallback path if primary is not writable
+        """
+        self._db_path = self._resolve_path(db_path, fallback_path)
+        self._conn: sqlite3.Connection | None = None
+        self._initialized = False
+
+    def _resolve_path(self, primary: str, fallback: str) -> str:
+        """Resolve which path to use for the database."""
+        primary_dir = Path(primary).parent
+        if primary_dir.exists() and os.access(primary_dir, os.W_OK):
+            return primary
+
+        fallback_dir = Path(fallback).parent
+        fallback_dir.mkdir(parents=True, exist_ok=True)
+        logger.info(f"Using fallback buffer path: {fallback}")
+        return fallback
+
+    def _ensure_initialized(self) -> None:
+        """Initialize database if not already done."""
+        if self._initialized:
+            return
+
+        self._conn = sqlite3.connect(self._db_path)
+        self._conn.execute("""
+            CREATE TABLE IF NOT EXISTS state_buffer (
+                processor_id TEXT PRIMARY KEY,
+                state_json TEXT NOT NULL,
+                created_at REAL NOT NULL,
+                synced_at REAL,
+                sync_attempts INTEGER DEFAULT 0
+            )
+        """)
+        self._conn.execute("""
+            CREATE INDEX IF NOT EXISTS idx_synced
+            ON state_buffer(synced_at)
+        """)
+        self._conn.commit()
+        self._initialized = True
+        logger.debug(f"Offline buffer initialized at {self._db_path}")
+
+    def save(self, processor_id: str, state_json: str) -> None:
+        """Save state to local buffer."""
+        self._ensure_initialized()
+
+        self._conn.execute("""
+            INSERT OR REPLACE INTO state_buffer
+            (processor_id, state_json, created_at, synced_at, sync_attempts)
+            VALUES (?, ?, ?, NULL, 0)
+        """, (processor_id, state_json, time.time()))
+        self._conn.commit()
+        logger.debug(f"State buffered locally for {processor_id}")
+
+    def load(self, processor_id: str) -> str | None:
+        """Load state from local buffer."""
+        self._ensure_initialized()
+
+        cursor = self._conn.execute("""
+            SELECT state_json FROM state_buffer
+            WHERE processor_id = ?
+        """, (processor_id,))
+        row = cursor.fetchone()
+        return row[0] if row else None
+
+    def mark_synced(self, processor_id: str) -> None:
+        """Mark state as synced to S3."""
+        self._ensure_initialized()
+
+        self._conn.execute("""
+            UPDATE state_buffer
+            SET synced_at = ?
+            WHERE processor_id = ?
+        """, (time.time(), processor_id))
+        self._conn.commit()
+
+    def get_unsynced(self) -> list[tuple[str, str]]:
+        """Get all unsynced states."""
+        self._ensure_initialized()
+
+        cursor = self._conn.execute("""
+            SELECT processor_id, state_json
+            FROM state_buffer
+            WHERE synced_at IS NULL
+            ORDER BY created_at ASC
+        """)
+        return cursor.fetchall()
+
+    def increment_sync_attempts(self, processor_id: str) -> None:
+        """Increment sync attempt counter."""
+        self._ensure_initialized()
+
+        self._conn.execute("""
+            UPDATE state_buffer
+            SET sync_attempts = sync_attempts + 1
+            WHERE processor_id = ?
+        """, (processor_id,))
+        self._conn.commit()
+
+    def delete(self, processor_id: str) -> bool:
+        """Delete state from buffer."""
+        self._ensure_initialized()
+
+        cursor = self._conn.execute("""
+            DELETE FROM state_buffer WHERE processor_id = ?
+        """, (processor_id,))
+        self._conn.commit()
+        return cursor.rowcount > 0
+
+    def cleanup_old(self, max_age_seconds: int) -> int:
+        """Remove entries older than max_age_seconds."""
+        self._ensure_initialized()
+
+        cutoff = time.time() - max_age_seconds
+        cursor = self._conn.execute("""
+            DELETE FROM state_buffer
+            WHERE created_at < ? AND synced_at IS NOT NULL
+        """, (cutoff,))
+        self._conn.commit()
+        return cursor.rowcount
+
+    def close(self) -> None:
+        """Close database connection."""
+        if self._conn:
+            self._conn.close()
+            self._conn = None
+        self._initialized = False
+
+
+class S3Store:
+    """
+    Store and retrieve state from AWS S3.
+
+    Supports offline buffering for edge nodes with intermittent connectivity.
+
+    S3 key format: {prefix}/{processor_id}/state.json
+    """
+
+    def __init__(self, config: S3Config | None = None):
+        """
+        Initialize S3 store.
+
+        Args:
+            config: S3 configuration (defaults to from_env())
+        """
+        if not BOTO3_AVAILABLE:
+            raise DoryStateError(
+                "boto3 is required for S3 backend. "
+                "Install with: pip install boto3"
+            )
+
+        self._config = config or S3Config.from_env()
+        self._client: Any = None
+        self._initialized = False
+
+        # Offline buffer
+        self._buffer: OfflineBuffer | None = None
+        if self._config.enable_offline_buffer:
+            self._buffer = OfflineBuffer(
+                self._config.buffer_path,
+                self._config.buffer_path_fallback,
+            )
+
+        # Background sync task
+        self._sync_task: asyncio.Task | None = None
+
+    def _ensure_initialized(self) -> None:
+        """Initialize S3 client if not already done."""
+        if self._initialized:
+            return
+
+        try:
+            session_kwargs = {}
+            client_kwargs = {}
+
+            # Region
+            if self._config.region:
+                session_kwargs["region_name"] = self._config.region
+
+            # Explicit credentials
+            if self._config.access_key_id and self._config.secret_access_key:
+                session_kwargs["aws_access_key_id"] = self._config.access_key_id
+                session_kwargs["aws_secret_access_key"] = self._config.secret_access_key
+                if self._config.session_token:
+                    session_kwargs["aws_session_token"] = self._config.session_token
+
+            # Custom endpoint (MinIO, LocalStack)
+            if self._config.endpoint_url:
+                client_kwargs["endpoint_url"] = self._config.endpoint_url
+
+            # Create session and client
+            session = boto3.Session(**session_kwargs)
+
+            # Assume role if specified
+            if self._config.role_arn:
+                sts = session.client("sts")
+                assumed = sts.assume_role(
+                    RoleArn=self._config.role_arn,
+                    RoleSessionName="dory-state-manager",
+                )
+                credentials = assumed["Credentials"]
+                session = boto3.Session(
+                    aws_access_key_id=credentials["AccessKeyId"],
+                    aws_secret_access_key=credentials["SecretAccessKey"],
+                    aws_session_token=credentials["SessionToken"],
+                    region_name=self._config.region,
+                )
+
+            self._client = session.client("s3", **client_kwargs)
+            self._initialized = True
+            logger.debug(f"S3 client initialized for bucket {self._config.bucket}")
+
+        except NoCredentialsError as e:
+            raise DoryStateError(
+                "No AWS credentials found. Configure via environment variables, "
+                "IAM role, or credential broker.",
+                cause=e,
+            )
+        except Exception as e:
+            raise DoryStateError(f"Failed to initialize S3 client: {e}", cause=e)
+
+    def _s3_key(self, processor_id: str) -> str:
+        """Generate S3 key for processor state."""
+        return f"{self._config.prefix}/{processor_id}/state.json"
+
+    async def _retry_operation(
+        self,
+        operation: str,
+        func: Any,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Any:
+        """Execute operation with retry logic."""
+        last_error = None
+        delay = self._config.retry_delay_seconds
+
+        for attempt in range(1, self._config.max_retries + 1):
+            try:
+                # Run synchronous boto3 call in executor
+                loop = asyncio.get_event_loop()
+                result = await loop.run_in_executor(None, lambda: func(*args, **kwargs))
+                return result
+
+            except (ClientError, BotoCoreError) as e:
+                last_error = e
+                if attempt < self._config.max_retries:
+                    logger.warning(
+                        f"S3 {operation} attempt {attempt} failed: {e}. "
+                        f"Retrying in {delay:.1f}s..."
+                    )
+                    await asyncio.sleep(delay)
+                    delay *= self._config.retry_backoff_multiplier
+
+        raise DoryStateError(
+            f"S3 {operation} failed after {self._config.max_retries} attempts: {last_error}",
+            cause=last_error,
+        )
+
+    async def save(
+        self,
+        processor_id: str,
+        state_json: str,
+        metadata: dict[str, str] | None = None,
+    ) -> None:
+        """
+        Save state to S3.
+
+        If S3 is unreachable and offline buffering is enabled,
+        state is saved to local buffer for later sync.
+
+        Args:
+            processor_id: Processor ID
+            state_json: JSON-serialized state
+            metadata: Optional metadata to store with object
+
+        Raises:
+            DoryStateError: If save fails and no buffer available
+        """
+        self._ensure_initialized()
+
+        s3_key = self._s3_key(processor_id)
+        s3_metadata = metadata or {}
+        s3_metadata["processor-id"] = processor_id
+        s3_metadata["saved-at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+
+        try:
+            await self._retry_operation(
+                "put_object",
+                self._client.put_object,
+                Bucket=self._config.bucket,
+                Key=s3_key,
+                Body=state_json.encode("utf-8"),
+                ContentType="application/json",
+                Metadata=s3_metadata,
+            )
+            logger.debug(f"State saved to S3: s3://{self._config.bucket}/{s3_key}")
+
+            # If buffer exists, mark as synced
+            if self._buffer:
+                self._buffer.save(processor_id, state_json)
+                self._buffer.mark_synced(processor_id)
+
+        except DoryStateError:
+            # S3 failed - try to buffer locally
+            if self._buffer:
+                self._buffer.save(processor_id, state_json)
+                logger.warning(
+                    f"S3 unavailable, state buffered locally for {processor_id}"
+                )
+            else:
+                raise
+
+    async def load(self, processor_id: str) -> str | None:
+        """
+        Load state from S3.
+
+        Falls back to local buffer if S3 is unreachable.
+
+        Args:
+            processor_id: Processor ID
+
+        Returns:
+            JSON-serialized state, or None if not found
+
+        Raises:
+            DoryStateError: If load fails
+        """
+        self._ensure_initialized()
+
+        s3_key = self._s3_key(processor_id)
+
+        try:
+            response = await self._retry_operation(
+                "get_object",
+                self._client.get_object,
+                Bucket=self._config.bucket,
+                Key=s3_key,
+            )
+
+            # Read body in executor
+            loop = asyncio.get_event_loop()
+            body = await loop.run_in_executor(
+                None,
+                lambda: response["Body"].read().decode("utf-8"),
+            )
+
+            logger.debug(f"State loaded from S3: s3://{self._config.bucket}/{s3_key}")
+            return body
+
+        except DoryStateError as e:
+            # Check if it's a 404
+            if "NoSuchKey" in str(e) or "404" in str(e):
+                logger.debug(f"State not found in S3: {s3_key}")
+
+                # Try local buffer as fallback
+                if self._buffer:
+                    buffered = self._buffer.load(processor_id)
+                    if buffered:
+                        logger.info(
+                            f"Using buffered state for {processor_id} (not yet synced to S3)"
+                        )
+                        return buffered
+
+                return None
+
+            # S3 error - try local buffer
+            if self._buffer:
+                buffered = self._buffer.load(processor_id)
+                if buffered:
+                    logger.warning(
+                        f"S3 unavailable, using buffered state for {processor_id}"
+                    )
+                    return buffered
+
+            raise
+
+        except ClientError as e:
+            if e.response.get("Error", {}).get("Code") == "NoSuchKey":
+                logger.debug(f"State not found in S3: {s3_key}")
+                return None
+            raise DoryStateError(f"Failed to load state from S3: {e}", cause=e)
+
+    async def delete(self, processor_id: str) -> bool:
+        """
+        Delete state from S3.
+
+        Args:
+            processor_id: Processor ID
+
+        Returns:
+            True if deleted, False if not found
+
+        Raises:
+            DoryStateError: If delete fails
+        """
+        self._ensure_initialized()
+
+        s3_key = self._s3_key(processor_id)
+
+        try:
+            # Check if exists first
+            try:
+                await self._retry_operation(
+                    "head_object",
+                    self._client.head_object,
+                    Bucket=self._config.bucket,
+                    Key=s3_key,
+                )
+            except DoryStateError:
+                # Not found
+                if self._buffer:
+                    self._buffer.delete(processor_id)
+                return False
+
+            # Delete from S3
+            await self._retry_operation(
+                "delete_object",
+                self._client.delete_object,
+                Bucket=self._config.bucket,
+                Key=s3_key,
+            )
+            logger.debug(f"State deleted from S3: s3://{self._config.bucket}/{s3_key}")
+
+            # Delete from buffer too
+            if self._buffer:
+                self._buffer.delete(processor_id)
+
+            return True
+
+        except ClientError as e:
+            if e.response.get("Error", {}).get("Code") == "NoSuchKey":
+                return False
+            raise DoryStateError(f"Failed to delete state from S3: {e}", cause=e)
+
+    async def exists(self, processor_id: str) -> bool:
+        """
+        Check if state exists in S3.
+
+        Args:
+            processor_id: Processor ID
+
+        Returns:
+            True if state exists
+        """
+        self._ensure_initialized()
+
+        s3_key = self._s3_key(processor_id)
+
+        try:
+            await self._retry_operation(
+                "head_object",
+                self._client.head_object,
+                Bucket=self._config.bucket,
+                Key=s3_key,
+            )
+            return True
+        except (DoryStateError, ClientError):
+            return False
+
+    async def sync_buffer(self) -> int:
+        """
+        Sync buffered states to S3.
+
+        Call periodically to upload states that were buffered
+        during connectivity issues.
+
+        Returns:
+            Number of states synced
+        """
+        if not self._buffer:
+            return 0
+
+        unsynced = self._buffer.get_unsynced()
+        synced_count = 0
+
+        for processor_id, state_json in unsynced:
+            try:
+                s3_key = self._s3_key(processor_id)
+
+                await self._retry_operation(
+                    "put_object",
+                    self._client.put_object,
+                    Bucket=self._config.bucket,
+                    Key=s3_key,
+                    Body=state_json.encode("utf-8"),
+                    ContentType="application/json",
+                    Metadata={
+                        "processor-id": processor_id,
+                        "synced-from-buffer": "true",
+                        "synced-at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+                    },
+                )
+
+                self._buffer.mark_synced(processor_id)
+                synced_count += 1
+                logger.info(f"Synced buffered state for {processor_id} to S3")
+
+            except DoryStateError as e:
+                self._buffer.increment_sync_attempts(processor_id)
+                logger.warning(f"Failed to sync buffered state for {processor_id}: {e}")
+
+        # Cleanup old synced entries
+        if synced_count > 0:
+            cleaned = self._buffer.cleanup_old(self._config.max_buffer_age_seconds)
+            if cleaned > 0:
+                logger.debug(f"Cleaned up {cleaned} old buffer entries")
+
+        return synced_count
+
+    async def start_background_sync(self, interval_seconds: float = 60.0) -> None:
+        """
+        Start background sync task.
+
+        Args:
+            interval_seconds: Interval between sync attempts
+        """
+        if self._sync_task is not None:
+            return
+
+        async def sync_loop():
+            while True:
+                try:
+                    await asyncio.sleep(interval_seconds)
+                    synced = await self.sync_buffer()
+                    if synced > 0:
+                        logger.debug(f"Background sync: {synced} states synced")
+                except asyncio.CancelledError:
+                    break
+                except Exception as e:
+                    logger.error(f"Background sync error: {e}")
+
+        self._sync_task = asyncio.create_task(sync_loop())
+        logger.info(f"Started background S3 sync (interval: {interval_seconds}s)")
+
+    async def stop_background_sync(self) -> None:
+        """Stop background sync task."""
+        if self._sync_task:
+            self._sync_task.cancel()
+            try:
+                await self._sync_task
+            except asyncio.CancelledError:
+                pass
+            self._sync_task = None
+            logger.info("Stopped background S3 sync")
+
+    def close(self) -> None:
+        """Close resources."""
+        if self._buffer:
+            self._buffer.close()