flatmachines 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flatmachines/__init__.py +136 -0
- flatmachines/actions.py +408 -0
- flatmachines/adapters/__init__.py +38 -0
- flatmachines/adapters/flatagent.py +86 -0
- flatmachines/adapters/pi_agent_bridge.py +127 -0
- flatmachines/adapters/pi_agent_runner.mjs +99 -0
- flatmachines/adapters/smolagents.py +125 -0
- flatmachines/agents.py +144 -0
- flatmachines/assets/MACHINES.md +141 -0
- flatmachines/assets/README.md +11 -0
- flatmachines/assets/__init__.py +0 -0
- flatmachines/assets/flatagent.d.ts +219 -0
- flatmachines/assets/flatagent.schema.json +271 -0
- flatmachines/assets/flatagent.slim.d.ts +58 -0
- flatmachines/assets/flatagents-runtime.d.ts +523 -0
- flatmachines/assets/flatagents-runtime.schema.json +281 -0
- flatmachines/assets/flatagents-runtime.slim.d.ts +187 -0
- flatmachines/assets/flatmachine.d.ts +403 -0
- flatmachines/assets/flatmachine.schema.json +620 -0
- flatmachines/assets/flatmachine.slim.d.ts +106 -0
- flatmachines/assets/profiles.d.ts +140 -0
- flatmachines/assets/profiles.schema.json +93 -0
- flatmachines/assets/profiles.slim.d.ts +26 -0
- flatmachines/backends.py +222 -0
- flatmachines/distributed.py +835 -0
- flatmachines/distributed_hooks.py +351 -0
- flatmachines/execution.py +638 -0
- flatmachines/expressions/__init__.py +60 -0
- flatmachines/expressions/cel.py +101 -0
- flatmachines/expressions/simple.py +166 -0
- flatmachines/flatmachine.py +1263 -0
- flatmachines/hooks.py +381 -0
- flatmachines/locking.py +69 -0
- flatmachines/monitoring.py +505 -0
- flatmachines/persistence.py +213 -0
- flatmachines/run.py +117 -0
- flatmachines/utils.py +166 -0
- flatmachines/validation.py +79 -0
- flatmachines-1.0.0.dist-info/METADATA +390 -0
- flatmachines-1.0.0.dist-info/RECORD +41 -0
- flatmachines-1.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,835 @@
+"""
+Distributed backends for FlatAgents worker orchestration.
+
+This module provides backends for work distribution and worker lifecycle management
+across ephemeral machines. These backends enable decentralized autoscaling patterns
+where workers claim jobs, process them, and exit.
+
+Backends:
+- RegistrationBackend: Worker lifecycle (register, heartbeat, status)
+- WorkBackend: Work distribution via named pools with atomic claiming
+
+Implementations:
+- SQLite: Single-file database, suitable for local/container deployments
+- Memory: In-memory storage, suitable for testing and single-process scenarios
+"""
+
+import asyncio
+import json
+import logging
+import sqlite3
+import uuid
+from dataclasses import dataclass, asdict, field
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
+
+logger = logging.getLogger(__name__)
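
A minimal end-to-end sketch of the lifecycle the module docstring describes, using the in-memory backends defined further down (illustrative only; it assumes these names are importable from flatmachines.distributed, and the `...` stands in for user processing code):

    import asyncio
    from flatmachines.distributed import (
        MemoryRegistrationBackend,
        MemoryWorkBackend,
        WorkerRegistration,
    )

    async def run_worker(worker_id: str) -> None:
        registry = MemoryRegistrationBackend()
        pool = MemoryWorkBackend().pool("demo")
        await registry.register(WorkerRegistration(worker_id=worker_id))
        await pool.push({"job": 1})
        # Claim until the pool drains, then exit (the ephemeral-worker pattern).
        while (item := await pool.claim(worker_id)) is not None:
            try:
                ...  # process item.data here
                await pool.complete(item.id)
            except Exception as exc:
                await pool.fail(item.id, error=str(exc))
            await registry.heartbeat(worker_id)
        await registry.update_status(worker_id, "terminated")

    asyncio.run(run_worker("worker-1"))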
+
+
+# =============================================================================
+# Types
+# =============================================================================
+
+@dataclass
+class WorkerRegistration:
+    """Information for registering a new worker."""
+    worker_id: str
+    host: Optional[str] = None
+    pid: Optional[int] = None
+    capabilities: Optional[List[str]] = None
+    started_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+@dataclass
+class WorkerRecord:
+    """Complete worker record including status and heartbeat."""
+    worker_id: str
+    status: str  # "active", "terminating", "terminated", "lost"
+    last_heartbeat: str
+    host: Optional[str] = None
+    pid: Optional[int] = None
+    capabilities: Optional[List[str]] = None
+    started_at: Optional[str] = None
+    current_task_id: Optional[str] = None
+    metadata: Optional[Dict[str, Any]] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "WorkerRecord":
+        return cls(**data)
+
+
+@dataclass
+class WorkerFilter:
+    """Filter criteria for listing workers."""
+    status: Optional[str] = None
+    capability: Optional[str] = None
+    stale_threshold_seconds: Optional[int] = None
+
+
+@dataclass
+class WorkItem:
+    """A claimed work item from a pool."""
+    id: str
+    data: Any
+    claimed_by: Optional[str] = None
+    attempts: int = 0
+    max_retries: int = 3
+    status: str = "pending"  # "pending", "claimed", "completed", "failed", "poisoned"
+    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    claimed_at: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+# =============================================================================
+# Registration Backend Protocol & Implementations
+# =============================================================================
+
+@runtime_checkable
+class RegistrationBackend(Protocol):
+    """
+    Protocol for worker lifecycle management.
+
+    Workers register themselves, send periodic heartbeats, and update status.
+    The backend tracks worker liveness for stale detection.
+    """
+
+    async def register(self, worker: WorkerRegistration) -> WorkerRecord:
+        """Register a new worker.
+
+        Args:
+            worker: Worker registration information
+
+        Returns:
+            Complete worker record with initial status
+        """
+        ...
+
+    async def heartbeat(
+        self,
+        worker_id: str,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> None:
+        """Update the worker's last-heartbeat timestamp.
+
+        Args:
+            worker_id: ID of the worker
+            metadata: Optional metadata to merge into the worker record
+        """
+        ...
+
+    async def update_status(self, worker_id: str, status: str) -> None:
+        """Update worker status.
+
+        Args:
+            worker_id: ID of the worker
+            status: New status ("active", "terminating", "terminated", "lost")
+        """
+        ...
+
+    async def get(self, worker_id: str) -> Optional[WorkerRecord]:
+        """Get a worker by ID.
+
+        Args:
+            worker_id: ID of the worker
+
+        Returns:
+            Worker record, or None if not found
+        """
+        ...
+
+    async def list(self, filter: Optional[WorkerFilter] = None) -> List[WorkerRecord]:
+        """List workers matching a filter.
+
+        Args:
+            filter: Optional filter criteria
+
+        Returns:
+            List of matching worker records
+        """
+        ...
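
Any object exposing these five coroutine methods satisfies the runtime-checkable protocol. A hedged sketch of the periodic heartbeat task a worker might run alongside its main loop (the helper and the 10-second interval are assumptions, not package defaults):

    async def heartbeat_loop(
        registry: RegistrationBackend,
        worker_id: str,
        interval_s: float = 10.0,
    ) -> None:
        # Hypothetical helper: heartbeat on a fixed interval until cancelled.
        try:
            while True:
                await registry.heartbeat(worker_id)
                await asyncio.sleep(interval_s)
        except asyncio.CancelledError:
            pass  # shutdown path: caller then sets status to "terminating"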
+
+
+class MemoryRegistrationBackend:
+    """In-memory registration backend for testing and single-process scenarios."""
+
+    def __init__(self):
+        self._workers: Dict[str, WorkerRecord] = {}
+        self._lock = asyncio.Lock()
+
+    async def register(self, worker: WorkerRegistration) -> WorkerRecord:
+        async with self._lock:
+            now = datetime.now(timezone.utc).isoformat()
+            record = WorkerRecord(
+                worker_id=worker.worker_id,
+                status="active",
+                last_heartbeat=now,
+                host=worker.host,
+                pid=worker.pid,
+                capabilities=worker.capabilities,
+                started_at=worker.started_at,
+            )
+            self._workers[worker.worker_id] = record
+            logger.debug(f"RegistrationBackend: registered worker {worker.worker_id}")
+            return record
+
+    async def heartbeat(
+        self,
+        worker_id: str,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> None:
+        async with self._lock:
+            if worker_id not in self._workers:
+                raise KeyError(f"Worker {worker_id} not found")
+            record = self._workers[worker_id]
+            record.last_heartbeat = datetime.now(timezone.utc).isoformat()
+            if metadata:
+                record.metadata = {**(record.metadata or {}), **metadata}
+            logger.debug(f"RegistrationBackend: heartbeat for {worker_id}")
+
+    async def update_status(self, worker_id: str, status: str) -> None:
+        async with self._lock:
+            if worker_id not in self._workers:
+                raise KeyError(f"Worker {worker_id} not found")
+            self._workers[worker_id].status = status
+            logger.debug(f"RegistrationBackend: {worker_id} status -> {status}")
+
+    async def get(self, worker_id: str) -> Optional[WorkerRecord]:
+        return self._workers.get(worker_id)
+
+    async def list(self, filter: Optional[WorkerFilter] = None) -> List[WorkerRecord]:
+        workers = list(self._workers.values())
+
+        if filter:
+            if filter.status:
+                workers = [w for w in workers if w.status == filter.status]
+            if filter.capability:
+                workers = [
+                    w for w in workers
+                    if w.capabilities and filter.capability in w.capabilities
+                ]
+            if filter.stale_threshold_seconds:
+                cutoff = datetime.now(timezone.utc) - timedelta(
+                    seconds=filter.stale_threshold_seconds
+                )
+                workers = [
+                    w for w in workers
+                    if datetime.fromisoformat(w.last_heartbeat.replace('Z', '+00:00')) < cutoff
+                ]
+
+        return workers
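
A quick check of the filter semantics against this backend (sketch; assumes the same flatmachines.distributed imports as the first example, plus WorkerFilter):

    async def demo_filters() -> None:
        reg = MemoryRegistrationBackend()
        await reg.register(WorkerRegistration(worker_id="w1", capabilities=["gpu"]))
        await reg.register(WorkerRegistration(worker_id="w2"))
        gpu = await reg.list(WorkerFilter(capability="gpu"))
        assert [w.worker_id for w in gpu] == ["w1"]
        # stale_threshold_seconds selects workers whose last heartbeat is
        # OLDER than the cutoff; fresh registrations therefore do not match.
        assert await reg.list(WorkerFilter(stale_threshold_seconds=60)) == []

    asyncio.run(demo_filters())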
+
+
+class SQLiteRegistrationBackend:
+    """SQLite-based registration backend for local/container deployments."""
+
+    SCHEMA = """
+    CREATE TABLE IF NOT EXISTS worker_registry (
+        worker_id TEXT PRIMARY KEY,
+        status TEXT NOT NULL DEFAULT 'active',
+        last_heartbeat TEXT NOT NULL,
+        host TEXT,
+        pid INTEGER,
+        capabilities TEXT,  -- JSON array
+        started_at TEXT,
+        current_task_id TEXT,
+        metadata TEXT  -- JSON object
+    );
+
+    CREATE INDEX IF NOT EXISTS idx_worker_status ON worker_registry(status);
+    CREATE INDEX IF NOT EXISTS idx_worker_heartbeat ON worker_registry(last_heartbeat);
+    """
+
+    def __init__(self, db_path: str = "workers.sqlite"):
+        self.db_path = Path(db_path)
+        self._lock = asyncio.Lock()
+        self._init_db()
+
+    def _init_db(self) -> None:
+        """Initialize database schema."""
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+        with sqlite3.connect(self.db_path) as conn:
+            conn.executescript(self.SCHEMA)
+
+    def _get_conn(self) -> sqlite3.Connection:
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def _row_to_record(self, row: sqlite3.Row) -> WorkerRecord:
+        capabilities = json.loads(row["capabilities"]) if row["capabilities"] else None
+        metadata = json.loads(row["metadata"]) if row["metadata"] else None
+        return WorkerRecord(
+            worker_id=row["worker_id"],
+            status=row["status"],
+            last_heartbeat=row["last_heartbeat"],
+            host=row["host"],
+            pid=row["pid"],
+            capabilities=capabilities,
+            started_at=row["started_at"],
+            current_task_id=row["current_task_id"],
+            metadata=metadata,
+        )
+
+    async def register(self, worker: WorkerRegistration) -> WorkerRecord:
+        async with self._lock:
+            now = datetime.now(timezone.utc).isoformat()
+            capabilities_json = json.dumps(worker.capabilities) if worker.capabilities else None
+
+            with self._get_conn() as conn:
+                conn.execute(
+                    """
+                    INSERT OR REPLACE INTO worker_registry
+                    (worker_id, status, last_heartbeat, host, pid, capabilities, started_at)
+                    VALUES (?, 'active', ?, ?, ?, ?, ?)
+                    """,
+                    (worker.worker_id, now, worker.host, worker.pid,
+                     capabilities_json, worker.started_at)
+                )
+
+            logger.debug(f"RegistrationBackend: registered worker {worker.worker_id}")
+            return WorkerRecord(
+                worker_id=worker.worker_id,
+                status="active",
+                last_heartbeat=now,
+                host=worker.host,
+                pid=worker.pid,
+                capabilities=worker.capabilities,
+                started_at=worker.started_at,
+            )
+
+    async def heartbeat(
+        self,
+        worker_id: str,
+        metadata: Optional[Dict[str, Any]] = None
+    ) -> None:
+        async with self._lock:
+            now = datetime.now(timezone.utc).isoformat()
+            with self._get_conn() as conn:
+                if metadata:
+                    # Merge metadata
+                    cursor = conn.execute(
+                        "SELECT metadata FROM worker_registry WHERE worker_id = ?",
+                        (worker_id,)
+                    )
+                    row = cursor.fetchone()
+                    if not row:
+                        raise KeyError(f"Worker {worker_id} not found")
+                    existing = json.loads(row["metadata"]) if row["metadata"] else {}
+                    merged = {**existing, **metadata}
+                    conn.execute(
+                        """
+                        UPDATE worker_registry
+                        SET last_heartbeat = ?, metadata = ?
+                        WHERE worker_id = ?
+                        """,
+                        (now, json.dumps(merged), worker_id)
+                    )
+                else:
+                    result = conn.execute(
+                        "UPDATE worker_registry SET last_heartbeat = ? WHERE worker_id = ?",
+                        (now, worker_id)
+                    )
+                    if result.rowcount == 0:
+                        raise KeyError(f"Worker {worker_id} not found")
+
+            logger.debug(f"RegistrationBackend: heartbeat for {worker_id}")
+
+    async def update_status(self, worker_id: str, status: str) -> None:
+        async with self._lock:
+            with self._get_conn() as conn:
+                result = conn.execute(
+                    "UPDATE worker_registry SET status = ? WHERE worker_id = ?",
+                    (status, worker_id)
+                )
+                if result.rowcount == 0:
+                    raise KeyError(f"Worker {worker_id} not found")
+
+            logger.debug(f"RegistrationBackend: {worker_id} status -> {status}")
+
+    async def get(self, worker_id: str) -> Optional[WorkerRecord]:
+        with self._get_conn() as conn:
+            cursor = conn.execute(
+                "SELECT * FROM worker_registry WHERE worker_id = ?",
+                (worker_id,)
+            )
+            row = cursor.fetchone()
+            return self._row_to_record(row) if row else None
+
+    async def list(self, filter: Optional[WorkerFilter] = None) -> List[WorkerRecord]:
+        query = "SELECT * FROM worker_registry WHERE 1=1"
+        params: List[Any] = []
+
+        if filter:
+            if filter.status:
+                query += " AND status = ?"
+                params.append(filter.status)
+            if filter.capability:
+                # JSON contains check
+                query += " AND capabilities LIKE ?"
+                params.append(f'%"{filter.capability}"%')
+            if filter.stale_threshold_seconds:
+                cutoff = (
+                    datetime.now(timezone.utc) -
+                    timedelta(seconds=filter.stale_threshold_seconds)
+                ).isoformat()
+                query += " AND last_heartbeat < ?"
+                params.append(cutoff)
+
+        with self._get_conn() as conn:
+            cursor = conn.execute(query, params)
+            return [self._row_to_record(row) for row in cursor.fetchall()]
+
+
+# =============================================================================
+# Work Backend Protocol & Implementations
+# =============================================================================
+
+@runtime_checkable
+class WorkPool(Protocol):
+    """
+    Protocol for a named work pool with atomic claiming.
+
+    Workers claim items atomically, process them, and mark them complete or failed.
+    """
+
+    async def push(self, item: Any, options: Optional[Dict[str, Any]] = None) -> str:
+        """Add a work item to the pool.
+
+        Args:
+            item: Work item data (must be JSON-serializable)
+            options: Optional settings such as max_retries
+
+        Returns:
+            Generated item ID
+        """
+        ...
+
+    async def claim(self, worker_id: str) -> Optional[WorkItem]:
+        """Atomically claim the next available item.
+
+        Args:
+            worker_id: ID of the claiming worker
+
+        Returns:
+            Claimed work item, or None if the pool is empty
+        """
+        ...
+
+    async def complete(self, item_id: str, result: Optional[Any] = None) -> None:
+        """Mark an item as completed and remove it from the pool.
+
+        Args:
+            item_id: ID of the work item
+            result: Optional result data
+        """
+        ...
+
+    async def fail(self, item_id: str, error: Optional[str] = None) -> None:
+        """Mark an item as failed: return it to the pool for retry, or mark it poisoned.
+
+        Args:
+            item_id: ID of the work item
+            error: Optional error message
+        """
+        ...
+
+    async def size(self) -> int:
+        """Get the number of unclaimed items in the pool."""
+        ...
+
+    async def release_by_worker(self, worker_id: str) -> int:
+        """Release all items claimed by a worker (for stale-worker cleanup).
+
+        Args:
+            worker_id: ID of the worker whose items to release
+
+        Returns:
+            Number of items released
+        """
+        ...
+
+
+@runtime_checkable
+class WorkBackend(Protocol):
+    """Protocol for work distribution across named pools."""
+
+    def pool(self, name: str) -> WorkPool:
+        """Get a named work pool.
+
+        Args:
+            name: Pool name (e.g., "paper_analysis", "image_processing")
+
+        Returns:
+            WorkPool instance for the named pool
+        """
+        ...
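
The retry contract implied by claim/fail: every claim increments attempts, and fail() re-queues the item until attempts reaches max_retries, after which it is marked poisoned and is no longer claimable. A quick illustration against the in-memory implementation below (sketch):

    async def demo_poisoning() -> None:
        pool = MemoryWorkBackend().pool("demo")
        item_id = await pool.push({"x": 1}, options={"max_retries": 2})
        for _ in range(2):
            item = await pool.claim("w1")           # attempts -> 1, then 2
            await pool.fail(item.id, error="boom")  # re-queued, then poisoned
        assert await pool.size() == 0               # poisoned items are not pending

    asyncio.run(demo_poisoning())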
+
+
+class MemoryWorkPool:
+    """In-memory work pool implementation."""
+
+    def __init__(self, name: str):
+        self.name = name
+        self._items: Dict[str, WorkItem] = {}
+        self._lock = asyncio.Lock()
+
+    async def push(self, item: Any, options: Optional[Dict[str, Any]] = None) -> str:
+        async with self._lock:
+            item_id = str(uuid.uuid4())
+            max_retries = (options or {}).get("max_retries", 3)
+            work_item = WorkItem(
+                id=item_id,
+                data=item,
+                max_retries=max_retries,
+            )
+            self._items[item_id] = work_item
+            logger.debug(f"WorkPool[{self.name}]: pushed item {item_id}")
+            return item_id
+
+    async def claim(self, worker_id: str) -> Optional[WorkItem]:
+        async with self._lock:
+            # Find first pending item
+            for item in self._items.values():
+                if item.status == "pending":
+                    item.status = "claimed"
+                    item.claimed_by = worker_id
+                    item.claimed_at = datetime.now(timezone.utc).isoformat()
+                    item.attempts += 1
+                    logger.debug(f"WorkPool[{self.name}]: {worker_id} claimed {item.id}")
+                    return item
+            return None
+
+    async def complete(self, item_id: str, result: Optional[Any] = None) -> None:
+        async with self._lock:
+            if item_id not in self._items:
+                raise KeyError(f"Work item {item_id} not found")
+            # Remove completed items
+            del self._items[item_id]
+            logger.debug(f"WorkPool[{self.name}]: completed {item_id}")
+
+    async def fail(self, item_id: str, error: Optional[str] = None) -> None:
+        async with self._lock:
+            if item_id not in self._items:
+                raise KeyError(f"Work item {item_id} not found")
+            item = self._items[item_id]
+
+            if item.attempts >= item.max_retries:
+                item.status = "poisoned"
+                logger.warning(
+                    f"WorkPool[{self.name}]: {item_id} poisoned after {item.attempts} attempts"
+                )
+            else:
+                item.status = "pending"
+                item.claimed_by = None
+                item.claimed_at = None
+                logger.debug(
+                    f"WorkPool[{self.name}]: {item_id} failed, returning to pool "
+                    f"(attempt {item.attempts}/{item.max_retries})"
+                )
+
+    async def size(self) -> int:
+        return sum(1 for item in self._items.values() if item.status == "pending")
+
+    async def release_by_worker(self, worker_id: str) -> int:
+        async with self._lock:
+            released = 0
+            for item in self._items.values():
+                if item.claimed_by == worker_id and item.status == "claimed":
+                    item.status = "pending"
+                    item.claimed_by = None
+                    item.claimed_at = None
+                    released += 1
+            logger.debug(f"WorkPool[{self.name}]: released {released} items from {worker_id}")
+            return released
+
+
+class MemoryWorkBackend:
+    """In-memory work backend with named pools."""
+
+    def __init__(self):
+        self._pools: Dict[str, MemoryWorkPool] = {}
+
+    def pool(self, name: str) -> MemoryWorkPool:
+        if name not in self._pools:
+            self._pools[name] = MemoryWorkPool(name)
+        return self._pools[name]
+
+
+class SQLiteWorkPool:
+    """SQLite-based work pool implementation."""
+
+    def __init__(self, name: str, db_path: Path, lock: asyncio.Lock):
+        self.name = name
+        self.db_path = db_path
+        self._lock = lock
+
+    def _get_conn(self) -> sqlite3.Connection:
+        conn = sqlite3.connect(self.db_path)
+        conn.row_factory = sqlite3.Row
+        return conn
+
+    def _row_to_item(self, row: sqlite3.Row) -> WorkItem:
+        return WorkItem(
+            id=row["item_id"],
+            data=json.loads(row["data"]),
+            claimed_by=row["claimed_by"],
+            attempts=row["attempts"],
+            max_retries=row["max_retries"],
+            status=row["status"],
+            created_at=row["created_at"],
+            claimed_at=row["claimed_at"],
+        )
+
+    async def push(self, item: Any, options: Optional[Dict[str, Any]] = None) -> str:
+        async with self._lock:
+            item_id = str(uuid.uuid4())
+            max_retries = (options or {}).get("max_retries", 3)
+            now = datetime.now(timezone.utc).isoformat()
+
+            with self._get_conn() as conn:
+                conn.execute(
+                    """
+                    INSERT INTO work_pool
+                    (item_id, pool_name, data, status, attempts, max_retries, created_at)
+                    VALUES (?, ?, ?, 'pending', 0, ?, ?)
+                    """,
+                    (item_id, self.name, json.dumps(item), max_retries, now)
+                )
+
+            logger.debug(f"WorkPool[{self.name}]: pushed item {item_id}")
+            return item_id
+
+    async def claim(self, worker_id: str) -> Optional[WorkItem]:
+        async with self._lock:
+            now = datetime.now(timezone.utc).isoformat()
+
+            with self._get_conn() as conn:
+                # Atomic claim: select and update in one transaction
+                cursor = conn.execute(
+                    """
+                    SELECT item_id FROM work_pool
+                    WHERE pool_name = ? AND status = 'pending'
+                    ORDER BY created_at ASC
+                    LIMIT 1
+                    """,
+                    (self.name,)
+                )
+                row = cursor.fetchone()
+                if not row:
+                    return None
+
+                item_id = row["item_id"]
+                conn.execute(
+                    """
+                    UPDATE work_pool
+                    SET status = 'claimed', claimed_by = ?, claimed_at = ?,
+                        attempts = attempts + 1
+                    WHERE item_id = ?
+                    """,
+                    (worker_id, now, item_id)
+                )
+
+                # Fetch the updated item
+                cursor = conn.execute(
+                    "SELECT * FROM work_pool WHERE item_id = ?",
+                    (item_id,)
+                )
+                row = cursor.fetchone()
+
+                logger.debug(f"WorkPool[{self.name}]: {worker_id} claimed {item_id}")
+                return self._row_to_item(row) if row else None
+
+    async def complete(self, item_id: str, result: Optional[Any] = None) -> None:
+        async with self._lock:
+            with self._get_conn() as conn:
+                # Completed items are simply removed; the optional result
+                # payload is not persisted by this backend.
+                deleted = conn.execute(
+                    "DELETE FROM work_pool WHERE item_id = ?",
+                    (item_id,)
+                )
+                if deleted.rowcount == 0:
+                    raise KeyError(f"Work item {item_id} not found")
+
+            logger.debug(f"WorkPool[{self.name}]: completed {item_id}")
+
+    async def fail(self, item_id: str, error: Optional[str] = None) -> None:
+        async with self._lock:
+            with self._get_conn() as conn:
+                cursor = conn.execute(
+                    "SELECT attempts, max_retries FROM work_pool WHERE item_id = ?",
+                    (item_id,)
+                )
+                row = cursor.fetchone()
+                if not row:
+                    raise KeyError(f"Work item {item_id} not found")
+
+                attempts = row["attempts"]
+                max_retries = row["max_retries"]
+
+                # Note: `error` is accepted for API parity but is not currently
+                # written to the table's error column.
+                if attempts >= max_retries:
+                    conn.execute(
+                        "UPDATE work_pool SET status = 'poisoned' WHERE item_id = ?",
+                        (item_id,)
+                    )
+                    logger.warning(
+                        f"WorkPool[{self.name}]: {item_id} poisoned after {attempts} attempts"
+                    )
+                else:
+                    conn.execute(
+                        """
+                        UPDATE work_pool
+                        SET status = 'pending', claimed_by = NULL, claimed_at = NULL
+                        WHERE item_id = ?
+                        """,
+                        (item_id,)
+                    )
+                    logger.debug(
+                        f"WorkPool[{self.name}]: {item_id} failed, returning to pool "
+                        f"(attempt {attempts}/{max_retries})"
+                    )
+
+    async def size(self) -> int:
+        with self._get_conn() as conn:
+            cursor = conn.execute(
+                "SELECT COUNT(*) as cnt FROM work_pool WHERE pool_name = ? AND status = 'pending'",
+                (self.name,)
+            )
+            return cursor.fetchone()["cnt"]
+
+    async def release_by_worker(self, worker_id: str) -> int:
+        async with self._lock:
+            with self._get_conn() as conn:
+                result = conn.execute(
+                    """
+                    UPDATE work_pool
+                    SET status = 'pending', claimed_by = NULL, claimed_at = NULL
+                    WHERE pool_name = ? AND claimed_by = ? AND status = 'claimed'
+                    """,
+                    (self.name, worker_id)
+                )
+                released = result.rowcount
+
+            logger.debug(f"WorkPool[{self.name}]: released {released} items from {worker_id}")
+            return released
+
+
+class SQLiteWorkBackend:
+    """SQLite-based work backend with named pools."""
+
+    SCHEMA = """
+    CREATE TABLE IF NOT EXISTS work_pool (
+        item_id TEXT PRIMARY KEY,
+        pool_name TEXT NOT NULL,
+        data TEXT NOT NULL,  -- JSON
+        status TEXT NOT NULL DEFAULT 'pending',  -- pending, claimed, completed, failed, poisoned
+        claimed_by TEXT,
+        claimed_at TEXT,
+        attempts INTEGER NOT NULL DEFAULT 0,
+        max_retries INTEGER NOT NULL DEFAULT 3,
+        created_at TEXT NOT NULL,
+        error TEXT
+    );
+
+    CREATE INDEX IF NOT EXISTS idx_work_pool_name ON work_pool(pool_name);
+    CREATE INDEX IF NOT EXISTS idx_work_status ON work_pool(status);
+    CREATE INDEX IF NOT EXISTS idx_work_claimed_by ON work_pool(claimed_by);
+    """
+
+    def __init__(self, db_path: str = "workers.sqlite"):
+        self.db_path = Path(db_path)
+        self._pools: Dict[str, SQLiteWorkPool] = {}
+        self._lock = asyncio.Lock()
+        self._init_db()
+
+    def _init_db(self) -> None:
+        """Initialize database schema."""
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+        with sqlite3.connect(self.db_path) as conn:
+            conn.executescript(self.SCHEMA)
+
+    def pool(self, name: str) -> SQLiteWorkPool:
+        if name not in self._pools:
+            self._pools[name] = SQLiteWorkPool(name, self.db_path, self._lock)
+        return self._pools[name]
+
+
+# =============================================================================
+# Factory Functions
+# =============================================================================
+
+def create_registration_backend(
+    backend_type: str = "memory",
+    **kwargs: Any
+) -> RegistrationBackend:
+    """Create a registration backend by type.
+
+    Args:
+        backend_type: "memory" or "sqlite"
+        **kwargs: Backend-specific options (e.g., db_path for sqlite)
+
+    Returns:
+        RegistrationBackend instance
+    """
+    if backend_type == "memory":
+        return MemoryRegistrationBackend()
+    elif backend_type == "sqlite":
+        db_path = kwargs.get("db_path", "workers.sqlite")
+        return SQLiteRegistrationBackend(db_path=db_path)
+    else:
+        raise ValueError(f"Unknown registration backend type: {backend_type}")
+
+
+def create_work_backend(
+    backend_type: str = "memory",
+    **kwargs: Any
+) -> WorkBackend:
+    """Create a work backend by type.
+
+    Args:
+        backend_type: "memory" or "sqlite"
+        **kwargs: Backend-specific options (e.g., db_path for sqlite)
+
+    Returns:
+        WorkBackend instance
+    """
+    if backend_type == "memory":
+        return MemoryWorkBackend()
+    elif backend_type == "sqlite":
+        db_path = kwargs.get("db_path", "workers.sqlite")
+        return SQLiteWorkBackend(db_path=db_path)
+    else:
+        raise ValueError(f"Unknown work backend type: {backend_type}")
+
+
+__all__ = [
+    # Types
+    "WorkerRegistration",
+    "WorkerRecord",
+    "WorkerFilter",
+    "WorkItem",
+    # Protocols
+    "RegistrationBackend",
+    "WorkPool",
+    "WorkBackend",
+    # Memory implementations
+    "MemoryRegistrationBackend",
+    "MemoryWorkPool",
+    "MemoryWorkBackend",
+    # SQLite implementations
+    "SQLiteRegistrationBackend",
+    "SQLiteWorkPool",
+    "SQLiteWorkBackend",
+    # Factory functions
+    "create_registration_backend",
+    "create_work_backend",
+]
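
Putting the two backends together: release_by_worker() exists "for stale worker cleanup", and combined with WorkerFilter.stale_threshold_seconds it supports a supervisor that reaps silent workers and re-queues their items. A hedged sketch of such a routine (hypothetical helper, not shipped by the package; the 120-second threshold is an assumption):

    async def reap_stale_workers(
        registry: RegistrationBackend,
        pool: WorkPool,
        threshold_s: int = 120,
    ) -> None:
        # Mark workers with no heartbeat for threshold_s seconds as lost,
        # then return any items they still hold to the pending queue.
        stale = await registry.list(WorkerFilter(stale_threshold_seconds=threshold_s))
        for w in stale:
            await registry.update_status(w.worker_id, "lost")
            await pool.release_by_worker(w.worker_id)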