emergent-translator 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,738 @@
1
+ """
2
+ Distributed Chunk Coordinator for Emergent Language Processing
3
+
4
+ Enables distributed processing of large files across multiple instances
5
+ using emergent-language θ symbols as the wire protocol, targeting roughly 87% bandwidth reduction on structured data.
6
+
7
+ Architecture:
8
+ ┌─────────────┐     ┌─────────────┐     ┌─────────────┐
9
+ │   Ingest    │────▶│   Chunk +   │────▶│ Distribute  │
10
+ │   (10GB)    │     │ θ-compress  │     │   (1.3GB)   │
11
+ └─────────────┘     └─────────────┘     └──────┬──────┘
12
+
13
+      ┌─────────────────────────────────────────┼─────────────────────────────────────────┐
14
+      ▼                                         ▼                                         ▼
15
+ ┌─────────┐                               ┌─────────┐                               ┌─────────┐
16
+ │ Worker 1│                               │Worker 50│                               │Worker100│
17
+ │ Process │                               │ Process │                               │ Process │
18
+ └────┬────┘                               └────┬────┘                               └────┬────┘
19
+      │                                         │                                         │
20
+      └─────────────────────────────────────────┼─────────────────────────────────────────┘
21
+
22
+                                        ┌───────▼───────┐
23
+                                        │  Reassemble   │
24
+                                        │ θ-decompress  │
25
+                                        └───────────────┘
26
+
27
+ Usage:
28
+ coordinator = ChunkCoordinator(
29
+ workers=["https://instance1.fly.dev", "https://instance2.fly.dev"],
30
+ chunk_size_mb=100
31
+ )
32
+
33
+ job = await coordinator.submit(
34
+ data=large_file_bytes,
35
+ operation="transform"
36
+ )
37
+
38
+ result = await coordinator.wait(job.id)
39
+ """
40
+
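+ # The headline 10 GB → 1.3 GB figure in the sketch above is illustrative rather than a
+ # measured benchmark: it corresponds to a compression ratio of 1.3 / 10 = 0.13, i.e.
+ # (1 - 0.13) * 100 ≈ 87% bandwidth savings, which is exactly how
+ # JobStats.compression_ratio and JobStats.bandwidth_savings_percent are defined below.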
41
+ import asyncio
42
+ import hashlib
43
+ import time
44
+ import uuid
45
+ import base64
46
+ import json
47
+ import logging
48
+ from dataclasses import dataclass, field
49
+ from enum import Enum
50
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
51
+ from datetime import datetime
52
+ import struct
53
+
54
+ import httpx
55
+
56
+ logger = logging.getLogger(__name__)
57
+
58
+
59
+ class ChunkStatus(Enum):
60
+ """Status of a chunk in the processing pipeline."""
61
+ PENDING = "pending"
62
+ COMPRESSING = "compressing"
63
+ QUEUED = "queued"
64
+ DISPATCHED = "dispatched"
65
+ PROCESSING = "processing"
66
+ COMPLETED = "completed"
67
+ FAILED = "failed"
68
+ RETRYING = "retrying"
69
+
70
+
71
+ class JobStatus(Enum):
72
+ """Status of a distributed job."""
73
+ CREATED = "created"
74
+ CHUNKING = "chunking"
75
+ DISTRIBUTING = "distributing"
76
+ PROCESSING = "processing"
77
+ COLLECTING = "collecting"
78
+ REASSEMBLING = "reassembling"
79
+ COMPLETED = "completed"
80
+ FAILED = "failed"
81
+ CANCELLED = "cancelled"
82
+
83
+
84
+ @dataclass
85
+ class ChunkMetadata:
86
+ """Metadata for tracking a chunk through the pipeline."""
87
+ chunk_id: str
88
+ job_id: str
89
+ sequence: int # Order in original file
90
+ offset: int # Byte offset in original
91
+ size_raw: int # Size before compression
92
+ size_compressed: int = 0 # Size after θ-compression
93
+ checksum_raw: str = "" # Truncated SHA-256 (first 16 hex chars) of raw data
94
+ checksum_compressed: str = "" # Truncated SHA-256 (first 16 hex chars) of compressed data
95
+ status: ChunkStatus = ChunkStatus.PENDING
96
+ worker_url: Optional[str] = None
97
+ dispatch_time: Optional[float] = None
98
+ complete_time: Optional[float] = None
99
+ retries: int = 0
100
+ error: Optional[str] = None
101
+
102
+
103
+ @dataclass
104
+ class Chunk:
105
+ """A chunk of data ready for distributed processing."""
106
+ metadata: ChunkMetadata
107
+ data_raw: Optional[bytes] = None # Original data (cleared after compression)
108
+ data_compressed: Optional[bytes] = None # θ-compressed data
109
+ result_compressed: Optional[bytes] = None # θ-compressed result from worker
110
+ result_raw: Optional[bytes] = None # Decompressed result
111
+
112
+ @property
113
+ def compression_ratio(self) -> float:
114
+ if self.metadata.size_raw == 0:
115
+ return 0.0
116
+ return self.metadata.size_compressed / self.metadata.size_raw
117
+
118
+
119
+ @dataclass
120
+ class JobStats:
121
+ """Statistics for a distributed job."""
122
+ total_chunks: int = 0
123
+ completed_chunks: int = 0
124
+ failed_chunks: int = 0
125
+ total_bytes_raw: int = 0
126
+ total_bytes_compressed: int = 0
127
+ total_bytes_transferred: int = 0
128
+ start_time: Optional[float] = None
129
+ end_time: Optional[float] = None
130
+
131
+ @property
132
+ def compression_ratio(self) -> float:
133
+ if self.total_bytes_raw == 0:
134
+ return 0.0
135
+ return self.total_bytes_compressed / self.total_bytes_raw
136
+
137
+ @property
138
+ def bandwidth_savings_percent(self) -> float:
139
+ return (1 - self.compression_ratio) * 100
140
+
141
+ @property
142
+ def duration_seconds(self) -> float:
143
+ if not self.start_time or not self.end_time:
144
+ return 0.0
145
+ return self.end_time - self.start_time
146
+
147
+ @property
148
+ def throughput_mbps(self) -> float:
+ # Raw input processed per second in megabytes (MB/s), despite the "mbps" name.
149
+ if self.duration_seconds == 0:
150
+ return 0.0
151
+ return (self.total_bytes_raw / 1024 / 1024) / self.duration_seconds
152
+
153
+
154
+ @dataclass
155
+ class DistributedJob:
156
+ """A distributed processing job."""
157
+ id: str
158
+ operation: str
159
+ status: JobStatus = JobStatus.CREATED
160
+ chunks: List[Chunk] = field(default_factory=list)
161
+ stats: JobStats = field(default_factory=JobStats)
162
+ created_at: datetime = field(default_factory=datetime.now)
163
+ metadata: Dict[str, Any] = field(default_factory=dict)
164
+ result: Optional[bytes] = None
165
+ errors: List[str] = field(default_factory=list)
166
+
167
+ def get_chunk(self, chunk_id: str) -> Optional[Chunk]:
168
+ """Get chunk by ID."""
169
+ for chunk in self.chunks:
170
+ if chunk.metadata.chunk_id == chunk_id:
171
+ return chunk
172
+ return None
173
+
174
+
175
+ class WorkerPool:
176
+ """Manages a pool of worker instances."""
177
+
178
+ def __init__(self, workers: List[str], auth_token: Optional[str] = None):
179
+ self.workers = workers
180
+ self.auth_token = auth_token or "eudaimonia-translator-demo"
181
+ self._worker_stats: Dict[str, Dict] = {
182
+ url: {"active": 0, "completed": 0, "failed": 0, "avg_latency_ms": 0}
183
+ for url in workers
184
+ }
185
+ self._lock = asyncio.Lock()
186
+
187
+ async def get_best_worker(self) -> str:
188
+ """Select the best available worker (least loaded, best performance)."""
189
+ async with self._lock:
190
+ # Simple strategy: least active connections
191
+ best = min(
192
+ self.workers,
193
+ key=lambda w: (
194
+ self._worker_stats[w]["active"],
195
+ -self._worker_stats[w]["completed"],
196
+ self._worker_stats[w]["avg_latency_ms"]
197
+ )
198
+ )
199
+ self._worker_stats[best]["active"] += 1
200
+ return best
201
+
202
+ async def release_worker(self, url: str, success: bool, latency_ms: float):
203
+ """Release a worker after task completion."""
204
+ async with self._lock:
205
+ stats = self._worker_stats[url]
206
+ stats["active"] = max(0, stats["active"] - 1)
207
+ if success:
208
+ stats["completed"] += 1
209
+ # Rolling average latency
210
+ n = stats["completed"]
211
+ stats["avg_latency_ms"] = (
212
+ stats["avg_latency_ms"] * (n - 1) + latency_ms
213
+ ) / n
214
+ else:
215
+ stats["failed"] += 1
216
+
217
+ def get_stats(self) -> Dict[str, Dict]:
218
+ """Get worker pool statistics."""
219
+ return dict(self._worker_stats)
220
+
221
+
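+ # Illustrative sketch (not part of the public API): how the least-loaded policy in
+ # get_best_worker plays out. The URLs are placeholders; WorkerPool itself only does
+ # bookkeeping and never opens connections.
+ async def _worker_pool_selection_demo() -> None:
+     pool = WorkerPool(["https://a.example", "https://b.example"])
+     first = await pool.get_best_worker()   # either URL; both start idle
+     second = await pool.get_best_worker()  # the other URL, since `first` is now active
+     assert first != second
+     await pool.release_worker(first, success=True, latency_ms=42.0)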
222
+ class ChunkCoordinator:
223
+ """
224
+ Coordinates distributed processing of large data across multiple instances.
225
+
226
+ Uses emergent language θ-compression for efficient wire protocol,
227
+ achieving ~87% bandwidth reduction on structured data.
228
+ """
229
+
230
+ def __init__(
231
+ self,
232
+ workers: List[str],
233
+ chunk_size_mb: int = 100,
234
+ max_concurrent: int = 50,
235
+ auth_token: Optional[str] = None,
236
+ compress_endpoint: str = "/translate",
237
+ process_endpoint: str = "/process",
238
+ timeout_seconds: float = 60.0,
239
+ max_retries: int = 3
240
+ ):
241
+ """
242
+ Initialize the chunk coordinator.
243
+
244
+ Args:
245
+ workers: List of worker instance URLs
246
+ chunk_size_mb: Target chunk size in MB
247
+ max_concurrent: Maximum concurrent chunk dispatches
248
+ auth_token: Authentication token for workers
249
+ compress_endpoint: Endpoint for θ-compression
250
+ process_endpoint: Endpoint for processing
251
+ timeout_seconds: Request timeout
252
+ max_retries: Max retries per chunk
253
+ """
254
+ self.worker_pool = WorkerPool(workers, auth_token)
255
+ self.chunk_size_bytes = int(chunk_size_mb * 1024 * 1024)
256
+ self.max_concurrent = max_concurrent
257
+ self.auth_token = auth_token or "eudaimonia-translator-demo"
258
+ self.compress_endpoint = compress_endpoint
259
+ self.process_endpoint = process_endpoint
260
+ self.timeout = timeout_seconds
261
+ self.max_retries = max_retries # note: per-chunk retry logic is not yet wired into _dispatch_chunk
262
+
263
+ # Active jobs
264
+ self._jobs: Dict[str, DistributedJob] = {}
265
+ self._semaphore = asyncio.Semaphore(max_concurrent)
+ self._background_tasks: set = set() # keep references to in-flight job tasks so they are not garbage-collected
266
+
267
+ def _split_into_chunks(self, data: bytes, job_id: str) -> List[Chunk]:
268
+ """Split data into chunks for distribution."""
269
+ chunks = []
270
+ offset = 0
271
+ sequence = 0
272
+
273
+ while offset < len(data):
274
+ chunk_data = data[offset:offset + self.chunk_size_bytes]
275
+ chunk_size = len(chunk_data)
276
+
277
+ metadata = ChunkMetadata(
278
+ chunk_id=f"{job_id}-{sequence:04d}",
279
+ job_id=job_id,
280
+ sequence=sequence,
281
+ offset=offset,
282
+ size_raw=chunk_size,
283
+ checksum_raw=hashlib.sha256(chunk_data).hexdigest()[:16]
284
+ )
285
+
286
+ chunks.append(Chunk(metadata=metadata, data_raw=chunk_data))
287
+
288
+ offset += chunk_size
289
+ sequence += 1
290
+
291
+ return chunks
292
+
293
+ async def _compress_chunk(self, chunk: Chunk, client: httpx.AsyncClient) -> Chunk:
294
+ """Compress a chunk using θ-protocol via local or remote translator."""
295
+ chunk.metadata.status = ChunkStatus.COMPRESSING
296
+
297
+ # For the prototype, use the first worker for compression.
298
+ # In production, a dedicated compression service could handle this.
299
+ worker = self.worker_pool.workers[0]
300
+
301
+ try:
302
+ # Encode raw data as base64 for JSON transport
303
+ data_b64 = base64.b64encode(chunk.data_raw).decode('utf-8')
304
+
305
+ response = await client.post(
306
+ f"{worker}{self.compress_endpoint}",
307
+ json={
308
+ "data": f"CHUNK:{chunk.metadata.chunk_id}:{data_b64}",
309
+ "source_format": "text",
310
+ "target_format": "emergent"
311
+ },
312
+ headers={"Authorization": f"Bearer {self.auth_token}"},
313
+ timeout=self.timeout
314
+ )
315
+
316
+ if response.status_code == 200:
317
+ result = response.json()
318
+ compressed_b64 = result.get("translated_data", "")
319
+ compressed_bytes = base64.b64decode(compressed_b64)
320
+
321
+ chunk.data_compressed = compressed_bytes
322
+ chunk.metadata.size_compressed = len(compressed_bytes)
323
+ chunk.metadata.checksum_compressed = hashlib.sha256(compressed_bytes).hexdigest()[:16]
324
+ chunk.metadata.status = ChunkStatus.QUEUED
325
+
326
+ # Clear raw data to free memory
327
+ chunk.data_raw = None
328
+
329
+ logger.debug(
330
+ f"Chunk {chunk.metadata.chunk_id} compressed: "
331
+ f"{chunk.metadata.size_raw} → {chunk.metadata.size_compressed} "
332
+ f"({chunk.compression_ratio:.2%})"
333
+ )
334
+ else:
335
+ chunk.metadata.status = ChunkStatus.FAILED
336
+ chunk.metadata.error = f"Compression failed: HTTP {response.status_code}"
337
+
338
+ except Exception as e:
339
+ chunk.metadata.status = ChunkStatus.FAILED
340
+ chunk.metadata.error = f"Compression error: {str(e)[:100]}"
341
+ logger.error(f"Chunk {chunk.metadata.chunk_id} compression failed: {e}")
342
+
343
+ return chunk
344
+
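+ # Shape of the /translate exchange that _compress_chunk assumes (field names are taken
+ # from the calls above; the values are illustrative only):
+ #
+ #   request:  {"data": "CHUNK:<chunk_id>:<base64 raw bytes>",
+ #              "source_format": "text",
+ #              "target_format": "emergent"}
+ #   response: {"translated_data": "<base64 θ-compressed bytes>"}   (HTTP 200)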
345
+ async def _dispatch_chunk(
346
+ self,
347
+ chunk: Chunk,
348
+ operation: str,
349
+ client: httpx.AsyncClient,
350
+ operation_params: Optional[Dict] = None
351
+ ) -> Chunk:
352
+ """Dispatch a chunk to a worker for processing."""
353
+ async with self._semaphore:
354
+ worker_url = await self.worker_pool.get_best_worker()
355
+ chunk.metadata.worker_url = worker_url
356
+ chunk.metadata.dispatch_time = time.time()
357
+ chunk.metadata.status = ChunkStatus.DISPATCHED
358
+
359
+ start_time = time.perf_counter()
360
+ success = False
361
+
362
+ try:
363
+ chunk.metadata.status = ChunkStatus.PROCESSING
364
+
365
+ # Send compressed chunk to worker
366
+ payload = {
367
+ "chunk_id": chunk.metadata.chunk_id,
368
+ "operation": operation,
369
+ "data": base64.b64encode(chunk.data_compressed).decode('utf-8'),
370
+ "params": operation_params or {}
371
+ }
372
+
373
+ response = await client.post(
374
+ f"{worker_url}{self.process_endpoint}",
375
+ json=payload,
376
+ headers={"Authorization": f"Bearer {self.auth_token}"},
377
+ timeout=self.timeout
378
+ )
379
+
380
+ latency_ms = (time.perf_counter() - start_time) * 1000
381
+
382
+ if response.status_code == 200:
383
+ result = response.json()
384
+ if result.get("success"):
385
+ result_b64 = result.get("result", "")
386
+ chunk.result_compressed = base64.b64decode(result_b64)
387
+ chunk.metadata.status = ChunkStatus.COMPLETED
388
+ chunk.metadata.complete_time = time.time()
389
+ success = True
390
+ else:
391
+ chunk.metadata.status = ChunkStatus.FAILED
392
+ chunk.metadata.error = result.get("error", "Unknown processing error")
393
+ else:
394
+ chunk.metadata.status = ChunkStatus.FAILED
395
+ chunk.metadata.error = f"HTTP {response.status_code}"
396
+
397
+ except (asyncio.TimeoutError, httpx.TimeoutException):
398
+ chunk.metadata.status = ChunkStatus.FAILED
399
+ chunk.metadata.error = "Timeout"
400
+ latency_ms = self.timeout * 1000
401
+
402
+ except Exception as e:
403
+ chunk.metadata.status = ChunkStatus.FAILED
404
+ chunk.metadata.error = str(e)[:100]
405
+ latency_ms = (time.perf_counter() - start_time) * 1000
406
+
407
+ finally:
408
+ await self.worker_pool.release_worker(worker_url, success, latency_ms)
409
+
410
+ return chunk
411
+
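+ # Shape of the /process exchange assumed by _dispatch_chunk (field names mirror the
+ # payload construction and response handling above; values are illustrative only):
+ #
+ #   request:  {"chunk_id": "<job>-0001", "operation": "transform",
+ #              "data": "<base64 θ-compressed chunk>", "params": {}}
+ #   response: {"success": true, "result": "<base64 θ-compressed result>"}
+ #             or {"success": false, "error": "<reason>"}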
412
+ async def _decompress_result(self, chunk: Chunk, client: httpx.AsyncClient) -> Chunk:
413
+ """Decompress a chunk result back to original format."""
414
+ if not chunk.result_compressed:
415
+ return chunk
416
+
417
+ worker = self.worker_pool.workers[0]
418
+
419
+ try:
420
+ response = await client.post(
421
+ f"{worker}/decompress",
422
+ json={
423
+ "compressed_data": base64.b64encode(chunk.result_compressed).decode('utf-8')
424
+ },
425
+ headers={"Authorization": f"Bearer {self.auth_token}"},
426
+ timeout=self.timeout
427
+ )
428
+
429
+ if response.status_code == 200:
430
+ result = response.json()
431
+ if result.get("success"):
432
+ decompressed = result.get("decompressed_data", "")
433
+ # Handle CHUNK: prefix if present
434
+ if isinstance(decompressed, str) and decompressed.startswith("CHUNK:"):
435
+ parts = decompressed.split(":", 2)
436
+ if len(parts) >= 3:
437
+ decompressed = parts[2]
438
+
439
+ if isinstance(decompressed, str):
440
+ chunk.result_raw = base64.b64decode(decompressed)
441
+ else:
442
+ chunk.result_raw = decompressed
443
+
444
+ except Exception as e:
445
+ logger.error(f"Chunk {chunk.metadata.chunk_id} decompression failed: {e}")
446
+ # Keep compressed result, let reassembly handle it
447
+
448
+ return chunk
449
+
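+ # Shape of the /decompress exchange assumed above (inferred from _decompress_result;
+ # values are illustrative only):
+ #
+ #   request:  {"compressed_data": "<base64 θ-compressed result>"}
+ #   response: {"success": true,
+ #              "decompressed_data": "CHUNK:<chunk_id>:<base64 raw result>"}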
450
+ def _reassemble_results(self, chunks: List[Chunk]) -> bytes:
451
+ """Reassemble chunk results in order."""
452
+ # Sort by sequence number
453
+ sorted_chunks = sorted(chunks, key=lambda c: c.metadata.sequence)
454
+
455
+ result_parts = []
456
+ for chunk in sorted_chunks:
457
+ if chunk.result_raw:
458
+ result_parts.append(chunk.result_raw)
459
+ elif chunk.result_compressed:
460
+ # Fallback: include compressed if decompression failed
461
+ result_parts.append(chunk.result_compressed)
462
+ else:
463
+ logger.warning(f"Chunk {chunk.metadata.chunk_id} has no result")
464
+
465
+ return b''.join(result_parts)
466
+
467
+ async def submit(
468
+ self,
469
+ data: bytes,
470
+ operation: str,
471
+ operation_params: Optional[Dict] = None,
472
+ metadata: Optional[Dict] = None
473
+ ) -> DistributedJob:
474
+ """
475
+ Submit data for distributed processing.
476
+
477
+ Args:
478
+ data: Raw data bytes to process
479
+ operation: Operation name to perform on chunks
480
+ operation_params: Optional parameters for the operation
481
+ metadata: Optional job metadata
482
+
483
+ Returns:
484
+ DistributedJob with job ID for tracking
485
+ """
486
+ job_id = str(uuid.uuid4())[:8]
487
+
488
+ job = DistributedJob(
489
+ id=job_id,
490
+ operation=operation,
491
+ status=JobStatus.CHUNKING,
492
+ metadata=metadata or {}
493
+ )
494
+ job.stats.start_time = time.time()
495
+ job.stats.total_bytes_raw = len(data)
496
+
497
+ logger.info(f"Job {job_id}: Starting chunking of {len(data)} bytes")
498
+
499
+ # Split into chunks
500
+ job.chunks = self._split_into_chunks(data, job_id)
501
+ job.stats.total_chunks = len(job.chunks)
502
+
503
+ logger.info(f"Job {job_id}: Created {len(job.chunks)} chunks")
504
+
505
+ self._jobs[job_id] = job
506
+
507
+ # Start async processing; keep a reference so the task is not garbage-collected mid-run
508
+ task = asyncio.create_task(self._process_job(job, operation_params))
+ self._background_tasks.add(task)
+ task.add_done_callback(self._background_tasks.discard)
509
+
510
+ return job
511
+
512
+ async def _process_job(self, job: DistributedJob, operation_params: Optional[Dict] = None):
513
+ """Process a job through the full pipeline."""
514
+ try:
515
+ async with httpx.AsyncClient() as client:
516
+ # Phase 1: Compress all chunks
517
+ job.status = JobStatus.DISTRIBUTING
518
+ logger.info(f"Job {job.id}: Compressing {len(job.chunks)} chunks")
519
+
520
+ compress_tasks = [
521
+ self._compress_chunk(chunk, client)
522
+ for chunk in job.chunks
523
+ ]
524
+ await asyncio.gather(*compress_tasks)
525
+
526
+ # Calculate compression stats
527
+ job.stats.total_bytes_compressed = sum(
528
+ c.metadata.size_compressed for c in job.chunks
529
+ )
530
+
531
+ logger.info(
532
+ f"Job {job.id}: Compression complete - "
533
+ f"{job.stats.total_bytes_raw} → {job.stats.total_bytes_compressed} "
534
+ f"({job.stats.bandwidth_savings_percent:.1f}% savings)"
535
+ )
536
+
537
+ # Phase 2: Dispatch to workers
538
+ job.status = JobStatus.PROCESSING
539
+ logger.info(f"Job {job.id}: Dispatching to workers")
540
+
541
+ # Filter chunks that compressed successfully
542
+ ready_chunks = [
543
+ c for c in job.chunks
544
+ if c.metadata.status == ChunkStatus.QUEUED
545
+ ]
546
+
547
+ dispatch_tasks = [
548
+ self._dispatch_chunk(chunk, job.operation, client, operation_params)
549
+ for chunk in ready_chunks
550
+ ]
551
+ await asyncio.gather(*dispatch_tasks)
552
+
553
+ # Track bytes transferred
554
+ job.stats.total_bytes_transferred = (
555
+ job.stats.total_bytes_compressed * 2 # Round trip
556
+ )
557
+
558
+ # Phase 3: Collect and decompress results
559
+ job.status = JobStatus.COLLECTING
560
+ logger.info(f"Job {job.id}: Collecting results")
561
+
562
+ completed_chunks = [
563
+ c for c in job.chunks
564
+ if c.metadata.status == ChunkStatus.COMPLETED
565
+ ]
566
+
567
+ decompress_tasks = [
568
+ self._decompress_result(chunk, client)
569
+ for chunk in completed_chunks
570
+ ]
571
+ await asyncio.gather(*decompress_tasks)
572
+
573
+ # Phase 4: Reassemble
574
+ job.status = JobStatus.REASSEMBLING
575
+ logger.info(f"Job {job.id}: Reassembling {len(completed_chunks)} chunks")
576
+
577
+ job.result = self._reassemble_results(completed_chunks)
578
+
579
+ # Final stats
580
+ job.stats.completed_chunks = len(completed_chunks)
581
+ job.stats.failed_chunks = len(job.chunks) - len(completed_chunks)
582
+ job.stats.end_time = time.time()
583
+
584
+ if job.stats.failed_chunks == 0:
585
+ job.status = JobStatus.COMPLETED
586
+ logger.info(
587
+ f"Job {job.id}: Completed successfully - "
588
+ f"{job.stats.throughput_mbps:.2f} MB/s, "
589
+ f"{job.stats.bandwidth_savings_percent:.1f}% bandwidth saved"
590
+ )
591
+ else:
592
+ job.status = JobStatus.COMPLETED # Partial success
593
+ job.errors.append(f"{job.stats.failed_chunks} chunks failed")
594
+ logger.warning(
595
+ f"Job {job.id}: Completed with {job.stats.failed_chunks} failed chunks"
596
+ )
597
+
598
+ except Exception as e:
599
+ job.status = JobStatus.FAILED
600
+ job.errors.append(str(e))
601
+ job.stats.end_time = time.time()
602
+ logger.error(f"Job {job.id}: Failed - {e}")
603
+
604
+ async def wait(self, job_id: str, poll_interval: float = 0.5) -> DistributedJob:
605
+ """Wait for a job to complete."""
606
+ while True:
607
+ job = self._jobs.get(job_id)
608
+ if not job:
609
+ raise ValueError(f"Job {job_id} not found")
610
+
611
+ if job.status in (JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.CANCELLED):
612
+ return job
613
+
614
+ await asyncio.sleep(poll_interval)
615
+
616
+ def get_job(self, job_id: str) -> Optional[DistributedJob]:
617
+ """Get job by ID."""
618
+ return self._jobs.get(job_id)
619
+
620
+ def get_job_status(self, job_id: str) -> Dict[str, Any]:
621
+ """Get detailed job status."""
622
+ job = self._jobs.get(job_id)
623
+ if not job:
624
+ return {"error": f"Job {job_id} not found"}
625
+
626
+ chunk_statuses = {}
627
+ for chunk in job.chunks:
628
+ status = chunk.metadata.status.value
629
+ chunk_statuses[status] = chunk_statuses.get(status, 0) + 1
630
+
631
+ return {
632
+ "job_id": job.id,
633
+ "status": job.status.value,
634
+ "operation": job.operation,
635
+ "chunks": {
636
+ "total": job.stats.total_chunks,
637
+ "completed": job.stats.completed_chunks,
638
+ "failed": job.stats.failed_chunks,
639
+ "by_status": chunk_statuses
640
+ },
641
+ "bytes": {
642
+ "raw": job.stats.total_bytes_raw,
643
+ "compressed": job.stats.total_bytes_compressed,
644
+ "transferred": job.stats.total_bytes_transferred,
645
+ "compression_ratio": f"{job.stats.compression_ratio:.2%}",
646
+ "bandwidth_saved": f"{job.stats.bandwidth_savings_percent:.1f}%"
647
+ },
648
+ "timing": {
649
+ "duration_seconds": job.stats.duration_seconds,
650
+ "throughput_mbps": job.stats.throughput_mbps
651
+ },
652
+ "errors": job.errors
653
+ }
654
+
655
+ def list_jobs(self) -> List[Dict[str, Any]]:
656
+ """List all jobs with summary info."""
657
+ return [
658
+ {
659
+ "id": job.id,
660
+ "status": job.status.value,
661
+ "operation": job.operation,
662
+ "chunks": f"{job.stats.completed_chunks}/{job.stats.total_chunks}",
663
+ "created": job.created_at.isoformat()
664
+ }
665
+ for job in self._jobs.values()
666
+ ]
667
+
668
+
669
+ # Convenience function for simple usage
670
+ async def distributed_process(
671
+ data: bytes,
672
+ workers: List[str],
673
+ operation: str = "passthrough",
674
+ chunk_size_mb: int = 100,
675
+ **kwargs
676
+ ) -> Tuple[bytes, Dict[str, Any]]:
677
+ """
678
+ Simple interface for distributed processing.
679
+
680
+ Args:
681
+ data: Data to process
682
+ workers: List of worker URLs
683
+ operation: Operation to perform
684
+ chunk_size_mb: Chunk size
685
+ **kwargs: Additional parameters
686
+
687
+ Returns:
688
+ Tuple of (result_bytes, stats_dict)
689
+ """
690
+ coordinator = ChunkCoordinator(
691
+ workers=workers,
692
+ chunk_size_mb=chunk_size_mb
693
+ )
694
+
695
+ job = await coordinator.submit(data, operation, kwargs.get("params"))
696
+ job = await coordinator.wait(job.id)
697
+
698
+ return job.result or b'', coordinator.get_job_status(job.id)
699
+
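+ # Typical call pattern for the wrapper above (the worker URLs are placeholders and
+ # "transform" stands for whatever operation the workers' /process endpoint accepts):
+ #
+ #   result_bytes, stats = await distributed_process(
+ #       data=payload_bytes,
+ #       workers=["https://worker-1.example", "https://worker-2.example"],
+ #       operation="transform",
+ #       chunk_size_mb=50,
+ #       params={"mode": "fast"},  # forwarded to every chunk as operation params
+ #   )
+ #   print(stats["bytes"]["bandwidth_saved"])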
700
+
701
+ if __name__ == "__main__":
702
+ # Quick test
703
+ async def test():
704
+ # Create coordinator with single test endpoint
705
+ coordinator = ChunkCoordinator(
706
+ workers=["https://emergent-language.fly.dev"],
707
+ chunk_size_mb=1 # Small chunks for testing
708
+ )
709
+
710
+ # Create test data (1MB of JSON-like content)
711
+ test_data = json.dumps({
712
+ "records": [
713
+ {"id": i, "data": f"record_{i}" * 100}
714
+ for i in range(1000)
715
+ ]
716
+ }).encode('utf-8')
717
+
718
+ print(f"Test data size: {len(test_data)} bytes")
719
+
720
+ # Submit job
721
+ job = await coordinator.submit(
722
+ data=test_data,
723
+ operation="passthrough"
724
+ )
725
+
726
+ print(f"Job submitted: {job.id}")
727
+
728
+ # Wait for completion
729
+ job = await coordinator.wait(job.id)
730
+
731
+ # Print results
732
+ status = coordinator.get_job_status(job.id)
733
+ print(f"\nJob Status: {json.dumps(status, indent=2)}")
734
+
735
+ if job.result:
736
+ print(f"\nResult size: {len(job.result)} bytes")
737
+
738
+ asyncio.run(test())