flatmachines 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. flatmachines/__init__.py +136 -0
  2. flatmachines/actions.py +408 -0
  3. flatmachines/adapters/__init__.py +38 -0
  4. flatmachines/adapters/flatagent.py +86 -0
  5. flatmachines/adapters/pi_agent_bridge.py +127 -0
  6. flatmachines/adapters/pi_agent_runner.mjs +99 -0
  7. flatmachines/adapters/smolagents.py +125 -0
  8. flatmachines/agents.py +144 -0
  9. flatmachines/assets/MACHINES.md +141 -0
  10. flatmachines/assets/README.md +11 -0
  11. flatmachines/assets/__init__.py +0 -0
  12. flatmachines/assets/flatagent.d.ts +219 -0
  13. flatmachines/assets/flatagent.schema.json +271 -0
  14. flatmachines/assets/flatagent.slim.d.ts +58 -0
  15. flatmachines/assets/flatagents-runtime.d.ts +523 -0
  16. flatmachines/assets/flatagents-runtime.schema.json +281 -0
  17. flatmachines/assets/flatagents-runtime.slim.d.ts +187 -0
  18. flatmachines/assets/flatmachine.d.ts +403 -0
  19. flatmachines/assets/flatmachine.schema.json +620 -0
  20. flatmachines/assets/flatmachine.slim.d.ts +106 -0
  21. flatmachines/assets/profiles.d.ts +140 -0
  22. flatmachines/assets/profiles.schema.json +93 -0
  23. flatmachines/assets/profiles.slim.d.ts +26 -0
  24. flatmachines/backends.py +222 -0
  25. flatmachines/distributed.py +835 -0
  26. flatmachines/distributed_hooks.py +351 -0
  27. flatmachines/execution.py +638 -0
  28. flatmachines/expressions/__init__.py +60 -0
  29. flatmachines/expressions/cel.py +101 -0
  30. flatmachines/expressions/simple.py +166 -0
  31. flatmachines/flatmachine.py +1263 -0
  32. flatmachines/hooks.py +381 -0
  33. flatmachines/locking.py +69 -0
  34. flatmachines/monitoring.py +505 -0
  35. flatmachines/persistence.py +213 -0
  36. flatmachines/run.py +117 -0
  37. flatmachines/utils.py +166 -0
  38. flatmachines/validation.py +79 -0
  39. flatmachines-1.0.0.dist-info/METADATA +390 -0
  40. flatmachines-1.0.0.dist-info/RECORD +41 -0
  41. flatmachines-1.0.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,835 @@
1
+ """
2
+ Distributed backends for FlatAgents worker orchestration.
3
+
4
+ This module provides backends for work distribution and worker lifecycle management
5
+ across ephemeral machines. These backends enable decentralized autoscaling patterns
6
+ where workers claim jobs, process them, and exit.
7
+
8
+ Backends:
9
+ - RegistrationBackend: Worker lifecycle (register, heartbeat, status)
10
+ - WorkBackend: Work distribution via named pools with atomic claiming
11
+
12
+ Implementations:
13
+ - SQLite: Single-file database, suitable for local/container deployments
14
+ - Memory: In-memory storage, suitable for testing and single-process scenarios
15
+ """
16
+
17
+ import asyncio
18
+ import json
19
+ import logging
20
+ import sqlite3
21
+ import uuid
22
+ from abc import ABC, abstractmethod
23
+ from dataclasses import dataclass, asdict, field
24
+ from datetime import datetime, timezone, timedelta
25
+ from pathlib import Path
26
+ from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ # =============================================================================
32
+ # Types
33
+ # =============================================================================
34
+
35
@dataclass
class WorkerRegistration:
    """Payload a worker submits when joining the registry.

    ``started_at`` defaults to the current UTC time in ISO-8601 form, so
    callers typically supply only ``worker_id``.
    """
    worker_id: str                              # unique worker identifier
    host: Optional[str] = None                  # hostname, when known
    pid: Optional[int] = None                   # OS process id, when known
    capabilities: Optional[List[str]] = None    # capability tags for routing
    started_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this registration to a plain (JSON-friendly) dict."""
        return asdict(self)
46
+
47
+
48
@dataclass
class WorkerRecord:
    """Full registry entry for a worker: identity, liveness, and status."""
    worker_id: str
    status: str                                 # "active", "terminating", "terminated", "lost"
    last_heartbeat: str                         # ISO-8601 UTC timestamp
    host: Optional[str] = None
    pid: Optional[int] = None
    capabilities: Optional[List[str]] = None
    started_at: Optional[str] = None
    current_task_id: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this record to a plain (JSON-friendly) dict."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "WorkerRecord":
        """Rebuild a record from a dict produced by :meth:`to_dict`."""
        return cls(**data)
67
+
68
+
69
@dataclass
class WorkerFilter:
    """Criteria for narrowing worker listings.

    Every field is optional; ``None`` means "do not filter on this field".
    """
    status: Optional[str] = None                    # exact status match
    capability: Optional[str] = None                # worker must advertise this capability
    stale_threshold_seconds: Optional[int] = None   # heartbeat older than this many seconds
75
+
76
+
77
@dataclass
class WorkItem:
    """One unit of work tracked by a pool, including claim/retry bookkeeping."""
    id: str                              # unique item id
    data: Any                            # JSON-serializable payload
    claimed_by: Optional[str] = None     # worker currently holding the claim
    attempts: int = 0                    # number of claims so far
    max_retries: int = 3                 # attempts allowed before poisoning
    status: str = "pending"  # "pending", "claimed", "completed", "failed", "poisoned"
    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    claimed_at: Optional[str] = None     # ISO timestamp of the most recent claim

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this item to a plain (JSON-friendly) dict."""
        return asdict(self)
91
+
92
+
93
+ # =============================================================================
94
+ # Registration Backend Protocol & Implementations
95
+ # =============================================================================
96
+
97
@runtime_checkable
class RegistrationBackend(Protocol):
    """
    Structural interface for worker lifecycle management.

    Implementations persist worker records, refresh heartbeats, and expose
    the queries used for stale-worker detection.
    """

    async def register(self, worker: WorkerRegistration) -> WorkerRecord:
        """Add a worker to the registry.

        Args:
            worker: Registration payload for the new worker

        Returns:
            The stored record, with its status initialized
        """
        ...

    async def heartbeat(
        self,
        worker_id: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> None:
        """Refresh a worker's liveness timestamp.

        Args:
            worker_id: ID of the worker
            metadata: Optional metadata to merge into the record
        """
        ...

    async def update_status(self, worker_id: str, status: str) -> None:
        """Change a worker's lifecycle status.

        Args:
            worker_id: ID of the worker
            status: One of "active", "terminating", "terminated", "lost"
        """
        ...

    async def get(self, worker_id: str) -> Optional[WorkerRecord]:
        """Look up a single worker.

        Args:
            worker_id: ID of the worker

        Returns:
            The matching record, or None when unknown
        """
        ...

    async def list(self, filter: Optional[WorkerFilter] = None) -> List[WorkerRecord]:
        """Enumerate workers, optionally narrowed by a filter.

        Args:
            filter: Criteria to apply; None returns every worker

        Returns:
            The matching worker records
        """
        ...
160
+
161
+
162
class MemoryRegistrationBackend:
    """In-memory registration backend for testing and single-process scenarios.

    All state lives in a dict guarded by an asyncio lock; nothing survives
    process exit.
    """

    def __init__(self):
        # worker_id -> WorkerRecord
        self._workers: Dict[str, WorkerRecord] = {}
        # Guards every access to _workers — readers included, so get/list
        # never observe a half-applied update from a concurrent coroutine.
        self._lock = asyncio.Lock()

    async def register(self, worker: WorkerRegistration) -> WorkerRecord:
        """Insert (or overwrite) a worker record with status "active"."""
        async with self._lock:
            now = datetime.now(timezone.utc).isoformat()
            record = WorkerRecord(
                worker_id=worker.worker_id,
                status="active",
                last_heartbeat=now,
                host=worker.host,
                pid=worker.pid,
                capabilities=worker.capabilities,
                started_at=worker.started_at,
            )
            self._workers[worker.worker_id] = record
            logger.debug(f"RegistrationBackend: registered worker {worker.worker_id}")
            return record

    async def heartbeat(
        self,
        worker_id: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> None:
        """Refresh last_heartbeat; optionally merge metadata into the record.

        Raises:
            KeyError: If the worker is not registered.
        """
        async with self._lock:
            if worker_id not in self._workers:
                raise KeyError(f"Worker {worker_id} not found")
            record = self._workers[worker_id]
            record.last_heartbeat = datetime.now(timezone.utc).isoformat()
            if metadata:
                record.metadata = {**(record.metadata or {}), **metadata}
            logger.debug(f"RegistrationBackend: heartbeat for {worker_id}")

    async def update_status(self, worker_id: str, status: str) -> None:
        """Set the worker's status field.

        Raises:
            KeyError: If the worker is not registered.
        """
        async with self._lock:
            if worker_id not in self._workers:
                raise KeyError(f"Worker {worker_id} not found")
            self._workers[worker_id].status = status
            logger.debug(f"RegistrationBackend: {worker_id} status -> {status}")

    async def get(self, worker_id: str) -> Optional[WorkerRecord]:
        """Return the record for worker_id, or None if unknown."""
        # Fix: take the lock, consistent with the mutating methods.
        async with self._lock:
            return self._workers.get(worker_id)

    async def list(self, filter: Optional[WorkerFilter] = None) -> List[WorkerRecord]:
        """Return workers matching the filter (all workers when filter is None)."""
        # Snapshot under the lock, then filter the snapshot lock-free.
        async with self._lock:
            workers = list(self._workers.values())

        if filter:
            if filter.status:
                workers = [w for w in workers if w.status == filter.status]
            if filter.capability:
                workers = [
                    w for w in workers
                    if w.capabilities and filter.capability in w.capabilities
                ]
            # Fix: "is not None" so a threshold of 0 is honoured rather than skipped.
            if filter.stale_threshold_seconds is not None:
                cutoff = datetime.now(timezone.utc) - timedelta(
                    seconds=filter.stale_threshold_seconds
                )
                workers = [
                    w for w in workers
                    if datetime.fromisoformat(w.last_heartbeat.replace('Z', '+00:00')) < cutoff
                ]

        return workers
230
+
231
+
232
class SQLiteRegistrationBackend:
    """SQLite-based registration backend for local/container deployments.

    Each operation opens a short-lived connection and closes it when done.
    Note that ``with conn:`` on a sqlite3 connection manages only the
    transaction (commit/rollback) — it does NOT close the connection —
    so closing is done explicitly in ``finally`` blocks.
    """

    SCHEMA = """
    CREATE TABLE IF NOT EXISTS worker_registry (
        worker_id TEXT PRIMARY KEY,
        status TEXT NOT NULL DEFAULT 'active',
        last_heartbeat TEXT NOT NULL,
        host TEXT,
        pid INTEGER,
        capabilities TEXT, -- JSON array
        started_at TEXT,
        current_task_id TEXT,
        metadata TEXT -- JSON object
    );

    CREATE INDEX IF NOT EXISTS idx_worker_status ON worker_registry(status);
    CREATE INDEX IF NOT EXISTS idx_worker_heartbeat ON worker_registry(last_heartbeat);
    """

    def __init__(self, db_path: str = "workers.sqlite"):
        self.db_path = Path(db_path)
        self._lock = asyncio.Lock()
        self._init_db()

    def _init_db(self) -> None:
        """Create parent directories and the schema (idempotent)."""
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:  # transaction scope only
                conn.executescript(self.SCHEMA)
        finally:
            conn.close()  # fix: the context manager above does not close

    def _get_conn(self) -> sqlite3.Connection:
        """Open a new connection with row access by column name."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        return conn

    def _row_to_record(self, row: sqlite3.Row) -> WorkerRecord:
        """Convert a worker_registry row into a WorkerRecord."""
        capabilities = json.loads(row["capabilities"]) if row["capabilities"] else None
        metadata = json.loads(row["metadata"]) if row["metadata"] else None
        return WorkerRecord(
            worker_id=row["worker_id"],
            status=row["status"],
            last_heartbeat=row["last_heartbeat"],
            host=row["host"],
            pid=row["pid"],
            capabilities=capabilities,
            started_at=row["started_at"],
            current_task_id=row["current_task_id"],
            metadata=metadata,
        )

    async def register(self, worker: WorkerRegistration) -> WorkerRecord:
        """Insert (or overwrite) a worker row with status 'active'."""
        async with self._lock:
            now = datetime.now(timezone.utc).isoformat()
            capabilities_json = json.dumps(worker.capabilities) if worker.capabilities else None

            conn = self._get_conn()
            try:
                with conn:
                    conn.execute(
                        """
                        INSERT OR REPLACE INTO worker_registry
                        (worker_id, status, last_heartbeat, host, pid, capabilities, started_at)
                        VALUES (?, 'active', ?, ?, ?, ?, ?)
                        """,
                        (worker.worker_id, now, worker.host, worker.pid,
                         capabilities_json, worker.started_at)
                    )
            finally:
                conn.close()

            logger.debug(f"RegistrationBackend: registered worker {worker.worker_id}")
            return WorkerRecord(
                worker_id=worker.worker_id,
                status="active",
                last_heartbeat=now,
                host=worker.host,
                pid=worker.pid,
                capabilities=worker.capabilities,
                started_at=worker.started_at,
            )

    async def heartbeat(
        self,
        worker_id: str,
        metadata: Optional[Dict[str, Any]] = None
    ) -> None:
        """Refresh last_heartbeat; optionally merge metadata JSON.

        Raises:
            KeyError: If the worker is not registered.
        """
        async with self._lock:
            now = datetime.now(timezone.utc).isoformat()
            conn = self._get_conn()
            try:
                with conn:
                    if metadata:
                        # Read-modify-write to merge the metadata object.
                        cursor = conn.execute(
                            "SELECT metadata FROM worker_registry WHERE worker_id = ?",
                            (worker_id,)
                        )
                        row = cursor.fetchone()
                        if not row:
                            raise KeyError(f"Worker {worker_id} not found")
                        existing = json.loads(row["metadata"]) if row["metadata"] else {}
                        merged = {**existing, **metadata}
                        conn.execute(
                            """
                            UPDATE worker_registry
                            SET last_heartbeat = ?, metadata = ?
                            WHERE worker_id = ?
                            """,
                            (now, json.dumps(merged), worker_id)
                        )
                    else:
                        result = conn.execute(
                            "UPDATE worker_registry SET last_heartbeat = ? WHERE worker_id = ?",
                            (now, worker_id)
                        )
                        if result.rowcount == 0:
                            raise KeyError(f"Worker {worker_id} not found")
            finally:
                conn.close()

            logger.debug(f"RegistrationBackend: heartbeat for {worker_id}")

    async def update_status(self, worker_id: str, status: str) -> None:
        """Set the worker's status column.

        Raises:
            KeyError: If the worker is not registered.
        """
        async with self._lock:
            conn = self._get_conn()
            try:
                with conn:
                    result = conn.execute(
                        "UPDATE worker_registry SET status = ? WHERE worker_id = ?",
                        (status, worker_id)
                    )
                    if result.rowcount == 0:
                        raise KeyError(f"Worker {worker_id} not found")
            finally:
                conn.close()

            logger.debug(f"RegistrationBackend: {worker_id} status -> {status}")

    async def get(self, worker_id: str) -> Optional[WorkerRecord]:
        """Return the record for worker_id, or None if unknown."""
        conn = self._get_conn()
        try:
            cursor = conn.execute(
                "SELECT * FROM worker_registry WHERE worker_id = ?",
                (worker_id,)
            )
            row = cursor.fetchone()
            return self._row_to_record(row) if row else None
        finally:
            conn.close()

    async def list(self, filter: Optional[WorkerFilter] = None) -> List[WorkerRecord]:
        """Return workers matching the filter (all workers when filter is None)."""
        query = "SELECT * FROM worker_registry WHERE 1=1"
        params: List[Any] = []

        if filter:
            if filter.status:
                query += " AND status = ?"
                params.append(filter.status)
            if filter.capability:
                # Substring match against the JSON-encoded capability array.
                query += " AND capabilities LIKE ?"
                params.append(f'%"{filter.capability}"%')
            # Fix: "is not None" so a threshold of 0 is honoured rather than skipped.
            if filter.stale_threshold_seconds is not None:
                cutoff = (
                    datetime.now(timezone.utc) -
                    timedelta(seconds=filter.stale_threshold_seconds)
                ).isoformat()
                query += " AND last_heartbeat < ?"
                params.append(cutoff)

        conn = self._get_conn()
        try:
            cursor = conn.execute(query, params)
            return [self._row_to_record(row) for row in cursor.fetchall()]
        finally:
            conn.close()
391
+
392
+
393
+ # =============================================================================
394
+ # Work Backend Protocol & Implementations
395
+ # =============================================================================
396
+
397
@runtime_checkable
class WorkPool(Protocol):
    """
    Structural interface for a named work pool with atomic claiming.

    A worker claims an item, processes it, then reports it completed or
    failed; failed items are retried until poisoned.
    """

    async def push(self, item: Any, options: Optional[Dict[str, Any]] = None) -> str:
        """Enqueue a work item.

        Args:
            item: Work item data (must be JSON-serializable)
            options: Optional settings such as max_retries

        Returns:
            The generated item ID
        """
        ...

    async def claim(self, worker_id: str) -> Optional[WorkItem]:
        """Atomically take the next available item.

        Args:
            worker_id: ID of the claiming worker

        Returns:
            The claimed work item, or None when nothing is pending
        """
        ...

    async def complete(self, item_id: str, result: Optional[Any] = None) -> None:
        """Mark an item finished and drop it from the pool.

        Args:
            item_id: ID of the work item
            result: Optional result data
        """
        ...

    async def fail(self, item_id: str, error: Optional[str] = None) -> None:
        """Report an item as failed; it is retried or, eventually, poisoned.

        Args:
            item_id: ID of the work item
            error: Optional error message
        """
        ...

    async def size(self) -> int:
        """Return how many items are currently unclaimed."""
        ...

    async def release_by_worker(self, worker_id: str) -> int:
        """Return all of a worker's claimed items to the pool (stale cleanup).

        Args:
            worker_id: ID of the worker whose items to release

        Returns:
            The number of items released
        """
        ...
460
+
461
+
462
@runtime_checkable
class WorkBackend(Protocol):
    """Structural interface for distributing work across named pools."""

    def pool(self, name: str) -> WorkPool:
        """Resolve a pool by name.

        Args:
            name: Pool name (e.g., "paper_analysis", "image_processing")

        Returns:
            The WorkPool instance for that name
        """
        ...
476
+
477
+
478
class MemoryWorkPool:
    """In-memory work pool implementation.

    Items live in an insertion-ordered dict guarded by an asyncio lock,
    so claim() hands out the oldest pending item first.
    """

    def __init__(self, name: str):
        self.name = name
        # item_id -> WorkItem
        self._items: Dict[str, WorkItem] = {}
        self._lock = asyncio.Lock()

    async def push(self, item: Any, options: Optional[Dict[str, Any]] = None) -> str:
        """Enqueue a work item and return its generated id."""
        async with self._lock:
            item_id = str(uuid.uuid4())
            max_retries = (options or {}).get("max_retries", 3)
            work_item = WorkItem(
                id=item_id,
                data=item,
                max_retries=max_retries,
            )
            self._items[item_id] = work_item
            logger.debug(f"WorkPool[{self.name}]: pushed item {item_id}")
            return item_id

    async def claim(self, worker_id: str) -> Optional[WorkItem]:
        """Claim the oldest pending item, or return None when none is pending."""
        async with self._lock:
            item = next(
                (i for i in self._items.values() if i.status == "pending"),
                None,
            )
            if item is None:
                return None
            item.status = "claimed"
            item.claimed_by = worker_id
            item.claimed_at = datetime.now(timezone.utc).isoformat()
            item.attempts += 1
            logger.debug(f"WorkPool[{self.name}]: {worker_id} claimed {item.id}")
            return item

    async def complete(self, item_id: str, result: Optional[Any] = None) -> None:
        """Drop a finished item from the pool.

        Args:
            item_id: ID of the work item
            result: Accepted for protocol compatibility; not stored here.

        Raises:
            KeyError: If the item is not in the pool.
        """
        async with self._lock:
            if item_id not in self._items:
                raise KeyError(f"Work item {item_id} not found")
            del self._items[item_id]
            logger.debug(f"WorkPool[{self.name}]: completed {item_id}")

    async def fail(self, item_id: str, error: Optional[str] = None) -> None:
        """Return a failed item to the pool, or poison it after max retries.

        Raises:
            KeyError: If the item is not in the pool.
        """
        async with self._lock:
            if item_id not in self._items:
                raise KeyError(f"Work item {item_id} not found")
            item = self._items[item_id]

            if item.attempts >= item.max_retries:
                # Exhausted retries: keep the item for inspection but never
                # hand it out again.
                item.status = "poisoned"
                logger.warning(
                    f"WorkPool[{self.name}]: {item_id} poisoned after {item.attempts} attempts"
                )
            else:
                item.status = "pending"
                item.claimed_by = None
                item.claimed_at = None
                logger.debug(
                    f"WorkPool[{self.name}]: {item_id} failed, returning to pool "
                    f"(attempt {item.attempts}/{item.max_retries})"
                )

    async def size(self) -> int:
        """Return the number of unclaimed (pending) items."""
        # Fix: hold the lock so the count is consistent with concurrent
        # mutations, matching every other method of this class.
        async with self._lock:
            return sum(1 for item in self._items.values() if item.status == "pending")

    async def release_by_worker(self, worker_id: str) -> int:
        """Release every item claimed by worker_id back to pending.

        Returns:
            Number of items released.
        """
        async with self._lock:
            released = 0
            for item in self._items.values():
                if item.claimed_by == worker_id and item.status == "claimed":
                    item.status = "pending"
                    item.claimed_by = None
                    item.claimed_at = None
                    released += 1
            logger.debug(f"WorkPool[{self.name}]: released {released} items from {worker_id}")
            return released
554
+
555
+
556
class MemoryWorkBackend:
    """In-memory work backend that lazily creates one pool per name."""

    def __init__(self):
        # name -> pool; pools are created on first access and reused after.
        self._pools: Dict[str, MemoryWorkPool] = {}

    def pool(self, name: str) -> MemoryWorkPool:
        """Return the pool registered under *name*, creating it on first use."""
        try:
            return self._pools[name]
        except KeyError:
            created = MemoryWorkPool(name)
            self._pools[name] = created
            return created
566
+
567
+
568
class SQLiteWorkPool:
    """SQLite-based work pool implementation.

    All mutating operations run under an asyncio lock shared with the owning
    backend, so select-then-update sequences are atomic within this process.
    Connections are short-lived and closed explicitly in ``finally`` blocks,
    because ``with conn:`` on sqlite3 manages only the transaction.
    """

    def __init__(self, name: str, db_path: Path, lock: asyncio.Lock):
        self.name = name
        self.db_path = db_path
        self._lock = lock  # shared across pools of the same backend

    def _get_conn(self) -> sqlite3.Connection:
        """Open a new connection with row access by column name."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        return conn

    def _row_to_item(self, row: sqlite3.Row) -> WorkItem:
        """Convert a work_pool row into a WorkItem."""
        return WorkItem(
            id=row["item_id"],
            data=json.loads(row["data"]),
            claimed_by=row["claimed_by"],
            attempts=row["attempts"],
            max_retries=row["max_retries"],
            status=row["status"],
            created_at=row["created_at"],
            claimed_at=row["claimed_at"],
        )

    async def push(self, item: Any, options: Optional[Dict[str, Any]] = None) -> str:
        """Insert a pending work item and return its generated id."""
        async with self._lock:
            item_id = str(uuid.uuid4())
            max_retries = (options or {}).get("max_retries", 3)
            now = datetime.now(timezone.utc).isoformat()

            conn = self._get_conn()
            try:
                with conn:
                    conn.execute(
                        """
                        INSERT INTO work_pool
                        (item_id, pool_name, data, status, attempts, max_retries, created_at)
                        VALUES (?, ?, ?, 'pending', 0, ?, ?)
                        """,
                        (item_id, self.name, json.dumps(item), max_retries, now)
                    )
            finally:
                conn.close()

            logger.debug(f"WorkPool[{self.name}]: pushed item {item_id}")
            return item_id

    async def claim(self, worker_id: str) -> Optional[WorkItem]:
        """Claim the oldest pending item, or return None when the pool is empty.

        The select/update pair is serialized by the shared asyncio lock, so it
        is atomic within this process (not across processes).
        """
        async with self._lock:
            now = datetime.now(timezone.utc).isoformat()

            conn = self._get_conn()
            try:
                with conn:
                    cursor = conn.execute(
                        """
                        SELECT item_id FROM work_pool
                        WHERE pool_name = ? AND status = 'pending'
                        ORDER BY created_at ASC
                        LIMIT 1
                        """,
                        (self.name,)
                    )
                    row = cursor.fetchone()
                    if not row:
                        return None

                    item_id = row["item_id"]
                    conn.execute(
                        """
                        UPDATE work_pool
                        SET status = 'claimed', claimed_by = ?, claimed_at = ?,
                            attempts = attempts + 1
                        WHERE item_id = ?
                        """,
                        (worker_id, now, item_id)
                    )

                    # Re-read the row so the returned item reflects the update.
                    cursor = conn.execute(
                        "SELECT * FROM work_pool WHERE item_id = ?",
                        (item_id,)
                    )
                    row = cursor.fetchone()
            finally:
                conn.close()

            logger.debug(f"WorkPool[{self.name}]: {worker_id} claimed {item_id}")
            return self._row_to_item(row) if row else None

    async def complete(self, item_id: str, result: Optional[Any] = None) -> None:
        """Delete a finished item from the pool.

        Args:
            item_id: ID of the work item
            result: Accepted for protocol compatibility; not persisted.

        Raises:
            KeyError: If the item does not exist.
        """
        async with self._lock:
            conn = self._get_conn()
            try:
                with conn:
                    # Fix: use a distinct name — the original rebound `result`,
                    # shadowing the parameter of the same name.
                    cursor = conn.execute(
                        "DELETE FROM work_pool WHERE item_id = ?",
                        (item_id,)
                    )
                    if cursor.rowcount == 0:
                        raise KeyError(f"Work item {item_id} not found")
            finally:
                conn.close()

            logger.debug(f"WorkPool[{self.name}]: completed {item_id}")

    async def fail(self, item_id: str, error: Optional[str] = None) -> None:
        """Return a failed item to the pool, or poison it after max retries.

        The error message (when given) is persisted to the schema's `error`
        column, which the original implementation left unused.

        Raises:
            KeyError: If the item does not exist.
        """
        async with self._lock:
            conn = self._get_conn()
            try:
                with conn:
                    cursor = conn.execute(
                        "SELECT attempts, max_retries FROM work_pool WHERE item_id = ?",
                        (item_id,)
                    )
                    row = cursor.fetchone()
                    if not row:
                        raise KeyError(f"Work item {item_id} not found")

                    attempts = row["attempts"]
                    max_retries = row["max_retries"]

                    if attempts >= max_retries:
                        conn.execute(
                            "UPDATE work_pool SET status = 'poisoned', error = ? WHERE item_id = ?",
                            (error, item_id)
                        )
                        logger.warning(
                            f"WorkPool[{self.name}]: {item_id} poisoned after {attempts} attempts"
                        )
                    else:
                        conn.execute(
                            """
                            UPDATE work_pool
                            SET status = 'pending', claimed_by = NULL, claimed_at = NULL,
                                error = ?
                            WHERE item_id = ?
                            """,
                            (error, item_id)
                        )
                        logger.debug(
                            f"WorkPool[{self.name}]: {item_id} failed, returning to pool "
                            f"(attempt {attempts}/{max_retries})"
                        )
            finally:
                conn.close()

    async def size(self) -> int:
        """Return the number of pending items in this pool."""
        conn = self._get_conn()
        try:
            cursor = conn.execute(
                "SELECT COUNT(*) as cnt FROM work_pool WHERE pool_name = ? AND status = 'pending'",
                (self.name,)
            )
            return cursor.fetchone()["cnt"]
        finally:
            conn.close()

    async def release_by_worker(self, worker_id: str) -> int:
        """Release every claimed item held by worker_id back to pending.

        Returns:
            Number of items released.
        """
        async with self._lock:
            conn = self._get_conn()
            try:
                with conn:
                    cursor = conn.execute(
                        """
                        UPDATE work_pool
                        SET status = 'pending', claimed_by = NULL, claimed_at = NULL
                        WHERE pool_name = ? AND claimed_by = ? AND status = 'claimed'
                        """,
                        (self.name, worker_id)
                    )
                    released = cursor.rowcount
            finally:
                conn.close()

            logger.debug(f"WorkPool[{self.name}]: released {released} items from {worker_id}")
            return released
724
+
725
+
726
class SQLiteWorkBackend:
    """SQLite-based work backend with named pools sharing one database file.

    Every pool shares the same asyncio lock, so database access from this
    process is serialized across pools.
    """

    SCHEMA = """
    CREATE TABLE IF NOT EXISTS work_pool (
        item_id TEXT PRIMARY KEY,
        pool_name TEXT NOT NULL,
        data TEXT NOT NULL, -- JSON
        status TEXT NOT NULL DEFAULT 'pending', -- pending, claimed, completed, failed, poisoned
        claimed_by TEXT,
        claimed_at TEXT,
        attempts INTEGER NOT NULL DEFAULT 0,
        max_retries INTEGER NOT NULL DEFAULT 3,
        created_at TEXT NOT NULL,
        error TEXT
    );

    CREATE INDEX IF NOT EXISTS idx_work_pool_name ON work_pool(pool_name);
    CREATE INDEX IF NOT EXISTS idx_work_status ON work_pool(status);
    CREATE INDEX IF NOT EXISTS idx_work_claimed_by ON work_pool(claimed_by);
    """

    def __init__(self, db_path: str = "workers.sqlite"):
        self.db_path = Path(db_path)
        # name -> pool, created lazily in pool().
        self._pools: Dict[str, SQLiteWorkPool] = {}
        # One lock handed to every pool for in-process serialization.
        self._lock = asyncio.Lock()
        self._init_db()

    def _init_db(self) -> None:
        """Create parent directories and the schema (idempotent)."""
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(self.db_path)
        try:
            with conn:  # transaction scope only
                conn.executescript(self.SCHEMA)
        finally:
            conn.close()  # fix: sqlite3's context manager does not close

    def pool(self, name: str) -> SQLiteWorkPool:
        """Return the pool registered under *name*, creating it on first use."""
        if name not in self._pools:
            self._pools[name] = SQLiteWorkPool(name, self.db_path, self._lock)
        return self._pools[name]
764
+
765
+
766
+ # =============================================================================
767
+ # Factory Functions
768
+ # =============================================================================
769
+
770
def create_registration_backend(
    backend_type: str = "memory",
    **kwargs: Any
) -> RegistrationBackend:
    """Build a registration backend from a type name.

    Args:
        backend_type: "memory" or "sqlite"
        **kwargs: Backend-specific options (e.g., db_path for sqlite)

    Returns:
        RegistrationBackend instance

    Raises:
        ValueError: For an unrecognized backend_type.
    """
    if backend_type == "sqlite":
        return SQLiteRegistrationBackend(db_path=kwargs.get("db_path", "workers.sqlite"))
    if backend_type == "memory":
        return MemoryRegistrationBackend()
    raise ValueError(f"Unknown registration backend type: {backend_type}")
790
+
791
+
792
def create_work_backend(
    backend_type: str = "memory",
    **kwargs: Any
) -> WorkBackend:
    """Build a work backend from a type name.

    Args:
        backend_type: "memory" or "sqlite"
        **kwargs: Backend-specific options (e.g., db_path for sqlite)

    Returns:
        WorkBackend instance

    Raises:
        ValueError: For an unrecognized backend_type.
    """
    if backend_type == "sqlite":
        return SQLiteWorkBackend(db_path=kwargs.get("db_path", "workers.sqlite"))
    if backend_type == "memory":
        return MemoryWorkBackend()
    raise ValueError(f"Unknown work backend type: {backend_type}")
812
+
813
+
814
# Public API of this module — the names exported by `from ... import *`.
__all__ = [
    # Types
    "WorkerRegistration",
    "WorkerRecord",
    "WorkerFilter",
    "WorkItem",
    # Protocols
    "RegistrationBackend",
    "WorkPool",
    "WorkBackend",
    # Memory implementations
    "MemoryRegistrationBackend",
    "MemoryWorkPool",
    "MemoryWorkBackend",
    # SQLite implementations
    "SQLiteRegistrationBackend",
    "SQLiteWorkPool",
    "SQLiteWorkBackend",
    # Factory functions
    "create_registration_backend",
    "create_work_backend",
]