emergent_translator-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,978 @@
1
+ """
2
+ Chunk Collector - High-Performance Distributed File Processing
3
+
4
+ Combines the sidecar collector pattern with chunk coordination for
5
+ handling many files concurrently with configurable batching.
6
+
7
+ Architecture:
8
+ ┌─────────┐ ┌─────────┐ ┌─────────┐
9
+ │ File 1 │ │ File 2 │ │ File N │
10
+ └────┬────┘ └────┬────┘ └────┬────┘
11
+ │ │ │
12
+ └──────────┼──────────┘
13
+
14
+ ┌───────────────────────────────────┐
15
+ │ CHUNK COLLECTOR │
16
+ │ ┌─────────────────────────────┐ │
17
+ │ │ Chunking Queue │ │
18
+ │ │ (split files into chunks) │ │
19
+ │ └──────────────┬──────────────┘ │
20
+ │ ▼ │
21
+ │ ┌─────────────────────────────┐ │
22
+ │ │ Compression Queue │ │
23
+ │ │ (batch chunks → θ) │ │
24
+ │ └──────────────┬──────────────┘ │
25
+ │ ▼ │
26
+ │ ┌─────────────────────────────┐ │
27
+ │ │ Process Queue │ │
28
+ │ │ (dispatch to workers) │ │
29
+ │ └──────────────┬──────────────┘ │
30
+ │ ▼ │
31
+ │ ┌─────────────────────────────┐ │
32
+ │ │ Result Collector │ │
33
+ │ │ (reassemble per job) │ │
34
+ │ └─────────────────────────────┘ │
35
+ └───────────────────────────────────┘
36
+
37
+ Performance targets:
38
+ - 10,000+ chunks/second throughput
39
+ - Configurable batching (default: 10-chunk compression batches)
40
+ - Zero-copy where possible
41
+ - Backpressure handling
42
+
43
+ Usage:
44
+ collector = ChunkCollector(workers=[...])
45
+ await collector.start()
46
+
47
+ # Submit files (returns immediately)
48
+ job_id = await collector.submit_file(data, "transform")
49
+
50
+ # Check status
51
+ status = collector.get_job_status(job_id)
52
+
53
+ # Get result when ready
54
+ result = await collector.get_result(job_id)
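+
+ # Shut down the pipeline when finished
+ await collector.stop()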
55
+ """
56
+
57
+ import asyncio
58
+ import hashlib
59
+ import time
60
+ import uuid
61
+ import base64
62
+ import json
63
+ import logging
64
+ from dataclasses import dataclass, field
65
+ from enum import Enum
66
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple
67
+ from datetime import datetime
68
+ from collections import defaultdict
69
+ import struct
70
+
71
+ import httpx
72
+
73
+ logger = logging.getLogger(__name__)
74
+
75
+
76
+ # =============================================================================
77
+ # Configuration
78
+ # =============================================================================
79
+
80
+ DEFAULT_COMPRESSION_BATCH_SIZE = 10 # Smaller batches for compression (avoid 413)
81
+ DEFAULT_DISPATCH_BATCH_SIZE = 50 # Larger batches for dispatch
82
+ DEFAULT_CHUNK_SIZE_MB = 1 # 1MB chunks
83
+ DEFAULT_NUM_COMPRESSORS = 4 # Parallel compression workers
84
+ DEFAULT_NUM_DISPATCHERS = 8 # Parallel dispatch workers
85
+ DEFAULT_BATCH_TIMEOUT_MS = 25 # Max wait before sending partial batch
86
+ DEFAULT_MAX_QUEUE_SIZE = 10000 # Backpressure threshold
87
+
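+ # Illustrative sketch: the defaults above can be overridden per deployment when
+ # constructing the collector; the worker URLs below are placeholders.
+ #
+ # collector = ChunkCollector(
+ #     workers=["http://worker-1:8000", "http://worker-2:8000"],
+ #     compression_batch_size=20,  # larger batches if the workers accept them
+ #     chunk_size_mb=2.0,          # bigger chunks mean fewer requests per file
+ #     max_queue_size=5000,        # tighter backpressure threshold
+ # )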
88
+
89
+ # =============================================================================
90
+ # Data Structures
91
+ # =============================================================================
92
+
93
+ class JobStatus(Enum):
94
+ QUEUED = "queued"
95
+ CHUNKING = "chunking"
96
+ COMPRESSING = "compressing"
97
+ PROCESSING = "processing"
98
+ COLLECTING = "collecting"
99
+ COMPLETED = "completed"
100
+ FAILED = "failed"
101
+
102
+
103
+ @dataclass
104
+ class ChunkMeta:
105
+ """Lightweight chunk metadata for queue processing."""
106
+ chunk_id: str
107
+ job_id: str
108
+ sequence: int
109
+ offset: int
110
+ size: int
111
+ checksum: str
112
+
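+ # Worked example (assumed sizes): a 3.5 MB file split with the default 1 MB
+ # chunk size yields four chunks with sequences 0-3 and offsets 0, 1, 2 and 3 MB;
+ # the last chunk carries the remaining 0.5 MB, and checksum is the first 16 hex
+ # characters of each chunk's SHA-256 digest, as computed in _chunking_worker.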
113
+
114
+ @dataclass
115
+ class QueuedChunk:
116
+ """A chunk waiting in the compression queue."""
117
+ meta: ChunkMeta
118
+ data: bytes
119
+ timestamp: float = field(default_factory=time.time)
120
+
121
+
122
+ @dataclass
123
+ class CompressedChunk:
124
+ """A compressed chunk ready for dispatch."""
125
+ meta: ChunkMeta
126
+ compressed_data: bytes
127
+ compression_ratio: float
128
+ timestamp: float = field(default_factory=time.time)
129
+
130
+
131
+ @dataclass
132
+ class ProcessedChunk:
133
+ """A chunk that has been processed by a worker."""
134
+ meta: ChunkMeta
135
+ result_data: bytes
136
+ success: bool
137
+ error: Optional[str] = None
138
+ latency_ms: float = 0.0
139
+
140
+
141
+ @dataclass
142
+ class JobState:
143
+ """State tracking for a distributed job."""
144
+ job_id: str
145
+ operation: str
146
+ status: JobStatus = JobStatus.QUEUED
147
+ total_chunks: int = 0
148
+ chunks_compressed: int = 0
149
+ chunks_dispatched: int = 0
150
+ chunks_completed: int = 0
151
+ chunks_failed: int = 0
152
+ bytes_raw: int = 0
153
+ bytes_compressed: int = 0
154
+ created_at: float = field(default_factory=time.time)
155
+ completed_at: Optional[float] = None
156
+ results: Dict[int, bytes] = field(default_factory=dict) # sequence -> result
157
+ errors: List[str] = field(default_factory=list)
158
+ operation_params: Dict[str, Any] = field(default_factory=dict)
159
+
160
+
161
+ @dataclass
162
+ class CollectorStats:
163
+ """Global collector statistics."""
164
+ files_received: int = 0
165
+ chunks_created: int = 0
166
+ chunks_compressed: int = 0
167
+ chunks_dispatched: int = 0
168
+ chunks_completed: int = 0
169
+ batches_sent: int = 0
170
+ bytes_in: int = 0
171
+ bytes_compressed: int = 0
172
+ bytes_out: int = 0
173
+ errors: int = 0
174
+ start_time: float = field(default_factory=time.time)
175
+
176
+ @property
177
+ def compression_ratio(self) -> float:
178
+ if self.bytes_in == 0:
179
+ return 0.0
180
+ return self.bytes_compressed / self.bytes_in
181
+
182
+ @property
183
+ def bandwidth_saved_pct(self) -> float:
184
+ return (1 - self.compression_ratio) * 100
185
+
186
+ @property
187
+ def throughput_chunks_per_sec(self) -> float:
188
+ elapsed = time.time() - self.start_time
189
+ if elapsed == 0:
190
+ return 0.0
191
+ return self.chunks_completed / elapsed
192
+
193
+ @property
194
+ def throughput_mb_per_sec(self) -> float:
195
+ elapsed = time.time() - self.start_time
196
+ if elapsed == 0:
197
+ return 0.0
198
+ return (self.bytes_in / 1024 / 1024) / elapsed
199
+
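+ # Worked example (assumed numbers): with bytes_in = 10 MB and
+ # bytes_compressed = 4 MB, compression_ratio is 0.40 and bandwidth_saved_pct
+ # is 60.0; if 5,000 chunks complete within 2.0 s of start_time,
+ # throughput_chunks_per_sec is 2,500.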
200
+
201
+ # =============================================================================
202
+ # Chunk Collector
203
+ # =============================================================================
204
+
205
+ class ChunkCollector:
206
+ """
207
+ High-performance collector for distributed chunk processing.
208
+
209
+ Handles multiple concurrent files with optimal batching at every stage.
210
+ """
211
+
212
+ def __init__(
213
+ self,
214
+ workers: List[str],
215
+ compression_batch_size: int = DEFAULT_COMPRESSION_BATCH_SIZE,
216
+ chunk_size_mb: float = DEFAULT_CHUNK_SIZE_MB,
217
+ num_compressors: int = DEFAULT_NUM_COMPRESSORS,
218
+ num_dispatchers: int = DEFAULT_NUM_DISPATCHERS,
219
+ batch_timeout_ms: float = DEFAULT_BATCH_TIMEOUT_MS,
220
+ max_queue_size: int = DEFAULT_MAX_QUEUE_SIZE,
221
+ auth_token: Optional[str] = None
222
+ ):
223
+ self.workers = workers
224
+ self.compression_batch_size = compression_batch_size
225
+ self.chunk_size_bytes = int(chunk_size_mb * 1024 * 1024)
226
+ self.num_compressors = num_compressors
227
+ self.num_dispatchers = num_dispatchers
228
+ self.batch_timeout_ms = batch_timeout_ms
229
+ self.max_queue_size = max_queue_size
230
+ self.auth_token = auth_token or "eudaimonia-translator-demo"
231
+
232
+ # Queues
233
+ self.chunking_queue: asyncio.Queue[Tuple[str, bytes, str, Dict]] = asyncio.Queue()
234
+ self.compression_queue: asyncio.Queue[QueuedChunk] = asyncio.Queue(maxsize=max_queue_size)
235
+ self.dispatch_queue: asyncio.Queue[CompressedChunk] = asyncio.Queue(maxsize=max_queue_size)
236
+ self.result_queue: asyncio.Queue[ProcessedChunk] = asyncio.Queue()
237
+
238
+ # State
239
+ self.jobs: Dict[str, JobState] = {}
240
+ self.stats = CollectorStats()
241
+ self._running = False
242
+ self._tasks: List[asyncio.Task] = []
243
+ self._http_client: Optional[httpx.AsyncClient] = None
244
+
245
+ # Worker selection (round-robin with tracking)
246
+ self._worker_idx = 0
247
+ self._worker_active: Dict[str, int] = {w: 0 for w in workers}
248
+
249
+ async def start(self):
250
+ """Start all collector workers."""
251
+ self._running = True
252
+ self._http_client = httpx.AsyncClient(timeout=60.0)
253
+
254
+ # Start chunking worker (single - CPU bound)
255
+ self._tasks.append(asyncio.create_task(self._chunking_worker()))
256
+
257
+ # Start compression workers (parallel batchers)
258
+ for i in range(self.num_compressors):
259
+ self._tasks.append(asyncio.create_task(self._compression_worker(i)))
260
+
261
+ # Start dispatch workers (parallel)
262
+ for i in range(self.num_dispatchers):
263
+ self._tasks.append(asyncio.create_task(self._dispatch_worker(i)))
264
+
265
+ # Start result collector
266
+ self._tasks.append(asyncio.create_task(self._result_collector()))
267
+
268
+ logger.info(
269
+ f"ChunkCollector started: {self.num_compressors} compressors, "
270
+ f"{self.num_dispatchers} dispatchers, {len(self.workers)} workers"
271
+ )
272
+
273
+ async def stop(self):
274
+ """Stop all workers gracefully."""
275
+ self._running = False
276
+
277
+ # Cancel all tasks
278
+ for task in self._tasks:
279
+ task.cancel()
280
+
281
+ # Wait for tasks to finish
282
+ await asyncio.gather(*self._tasks, return_exceptions=True)
283
+
284
+ # Close HTTP client
285
+ if self._http_client:
286
+ await self._http_client.aclose()
287
+
288
+ logger.info("ChunkCollector stopped")
289
+
290
+ async def submit_file(
291
+ self,
292
+ data: bytes,
293
+ operation: str,
294
+ operation_params: Optional[Dict] = None,
295
+ metadata: Optional[Dict] = None
296
+ ) -> str:
297
+ """
298
+ Submit a file for distributed processing.
299
+
300
+ Returns immediately with job_id. Use get_job_status() to track progress.
301
+
302
+ Args:
303
+ data: Raw file bytes
304
+ operation: Operation to perform ("transform", "analyze", etc.)
305
+ operation_params: Parameters for the operation
306
+ metadata: Optional metadata to attach to job
307
+
308
+ Returns:
309
+ job_id for tracking
310
+ """
311
+ job_id = str(uuid.uuid4())[:8]
312
+
313
+ # Create job state
314
+ job = JobState(
315
+ job_id=job_id,
316
+ operation=operation,
317
+ bytes_raw=len(data),
318
+ operation_params=operation_params or {}
319
+ )
320
+ self.jobs[job_id] = job
321
+ self.stats.files_received += 1
322
+ self.stats.bytes_in += len(data)
323
+
324
+ # Queue for chunking
325
+ await self.chunking_queue.put((job_id, data, operation, operation_params or {}))
326
+
327
+ logger.info(f"Job {job_id} submitted: {len(data):,} bytes, operation={operation}")
328
+ return job_id
329
+
330
+ async def submit_files(
331
+ self,
332
+ files: List[Tuple[bytes, str, Optional[Dict]]],
333
+ ) -> List[str]:
334
+ """
335
+ Submit multiple files at once.
336
+
337
+ Args:
338
+ files: List of (data, operation, params) tuples
339
+
340
+ Returns:
341
+ List of job_ids
342
+ """
343
+ job_ids = []
344
+ for data, operation, params in files:
345
+ job_id = await self.submit_file(data, operation, params)
346
+ job_ids.append(job_id)
347
+ return job_ids
348
+
349
+ def get_job_status(self, job_id: str) -> Dict[str, Any]:
350
+ """Get detailed status for a job."""
351
+ job = self.jobs.get(job_id)
352
+ if not job:
353
+ return {"error": f"Job {job_id} not found"}
354
+
355
+ elapsed = time.time() - job.created_at
356
+
357
+ return {
358
+ "job_id": job.job_id,
359
+ "status": job.status.value,
360
+ "operation": job.operation,
361
+ "progress": {
362
+ "total_chunks": job.total_chunks,
363
+ "compressed": job.chunks_compressed,
364
+ "dispatched": job.chunks_dispatched,
365
+ "completed": job.chunks_completed,
366
+ "failed": job.chunks_failed,
367
+ "pct_complete": (job.chunks_completed / job.total_chunks * 100) if job.total_chunks > 0 else 0
368
+ },
369
+ "bytes": {
370
+ "raw": job.bytes_raw,
371
+ "compressed": job.bytes_compressed,
372
+ "compression_ratio": f"{job.bytes_compressed / job.bytes_raw:.2%}" if job.bytes_raw > 0 else "0%"
373
+ },
374
+ "timing": {
375
+ "elapsed_seconds": elapsed,
376
+ "throughput_mbps": (job.bytes_raw / 1024 / 1024) / elapsed if elapsed > 0 else 0
377
+ },
378
+ "errors": job.errors
379
+ }
380
+
381
+ async def get_result(self, job_id: str, timeout: float = 300.0) -> Optional[bytes]:
382
+ """
383
+ Wait for and return job result.
384
+
385
+ Args:
386
+ job_id: Job to wait for
387
+ timeout: Maximum seconds to wait
388
+
389
+ Returns:
390
+ Reassembled result bytes, or None if failed/timeout
391
+ """
392
+ start = time.time()
393
+
394
+ while time.time() - start < timeout:
395
+ job = self.jobs.get(job_id)
396
+ if not job:
397
+ return None
398
+
399
+ if job.status == JobStatus.COMPLETED:
400
+ # Reassemble results in order
401
+ result_parts = []
402
+ for seq in sorted(job.results.keys()):
403
+ result_parts.append(job.results[seq])
404
+ return b''.join(result_parts)
405
+
406
+ if job.status == JobStatus.FAILED:
407
+ return None
408
+
409
+ await asyncio.sleep(0.1)
410
+
411
+ return None
412
+
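+ # Illustrative usage sketch (assumes a started collector and a bytes payload):
+ #
+ # job_id = await collector.submit_file(payload, "transform")
+ # while collector.get_job_status(job_id)["status"] not in ("completed", "failed"):
+ #     await asyncio.sleep(0.5)
+ # result = await collector.get_result(job_id, timeout=30.0)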
413
+ def get_stats(self) -> Dict[str, Any]:
414
+ """Get global collector statistics."""
415
+ s = self.stats
416
+ return {
417
+ "files_received": s.files_received,
418
+ "chunks": {
419
+ "created": s.chunks_created,
420
+ "compressed": s.chunks_compressed,
421
+ "dispatched": s.chunks_dispatched,
422
+ "completed": s.chunks_completed
423
+ },
424
+ "batches_sent": s.batches_sent,
425
+ "bytes": {
426
+ "in": s.bytes_in,
427
+ "compressed": s.bytes_compressed,
428
+ "out": s.bytes_out,
429
+ "compression_ratio": f"{s.compression_ratio:.2%}",
430
+ "bandwidth_saved": f"{s.bandwidth_saved_pct:.1f}%"
431
+ },
432
+ "throughput": {
433
+ "chunks_per_sec": f"{s.throughput_chunks_per_sec:.0f}",
434
+ "mb_per_sec": f"{s.throughput_mb_per_sec:.2f}"
435
+ },
436
+ "errors": s.errors,
437
+ "queues": {
438
+ "chunking": self.chunking_queue.qsize(),
439
+ "compression": self.compression_queue.qsize(),
440
+ "dispatch": self.dispatch_queue.qsize(),
441
+ "results": self.result_queue.qsize()
442
+ },
443
+ "workers": {
444
+ "total": len(self.workers),
445
+ "active_connections": dict(self._worker_active)
446
+ },
447
+ "uptime_seconds": time.time() - s.start_time
448
+ }
449
+
450
+ # =========================================================================
451
+ # Internal Workers
452
+ # =========================================================================
453
+
454
+ async def _chunking_worker(self):
455
+ """Worker that splits files into chunks."""
456
+ while self._running:
457
+ try:
458
+ job_id, data, operation, params = await asyncio.wait_for(
459
+ self.chunking_queue.get(),
460
+ timeout=1.0
461
+ )
462
+
463
+ job = self.jobs.get(job_id)
464
+ if not job:
465
+ continue
466
+
467
+ job.status = JobStatus.CHUNKING
468
+
469
+ # Split into chunks
470
+ offset = 0
471
+ sequence = 0
472
+
473
+ while offset < len(data):
474
+ chunk_data = data[offset:offset + self.chunk_size_bytes]
475
+ chunk_size = len(chunk_data)
476
+
477
+ meta = ChunkMeta(
478
+ chunk_id=f"{job_id}-{sequence:04d}",
479
+ job_id=job_id,
480
+ sequence=sequence,
481
+ offset=offset,
482
+ size=chunk_size,
483
+ checksum=hashlib.sha256(chunk_data).hexdigest()[:16]
484
+ )
485
+
486
+ # Queue for compression
487
+ await self.compression_queue.put(QueuedChunk(meta=meta, data=chunk_data))
488
+ self.stats.chunks_created += 1
489
+
490
+ offset += chunk_size
491
+ sequence += 1
492
+
493
+ job.total_chunks = sequence
494
+ job.status = JobStatus.COMPRESSING
495
+
496
+ logger.debug(f"Job {job_id}: created {sequence} chunks")
497
+
498
+ except asyncio.TimeoutError:
499
+ continue
500
+ except asyncio.CancelledError:
501
+ break
502
+ except Exception as e:
503
+ logger.error(f"Chunking worker error: {e}")
504
+ self.stats.errors += 1
505
+
506
+ async def _compression_worker(self, worker_id: int):
507
+ """Worker that batches and compresses chunks."""
508
+ batch: List[QueuedChunk] = []
509
+ last_flush = time.time()
510
+
511
+ while self._running:
512
+ try:
513
+ # Try to fill batch
514
+ try:
515
+ chunk = await asyncio.wait_for(
516
+ self.compression_queue.get(),
517
+ timeout=self.batch_timeout_ms / 1000
518
+ )
519
+ batch.append(chunk)
520
+ except asyncio.TimeoutError:
521
+ pass
522
+
523
+ # Check if we should flush
524
+ should_flush = (
525
+ len(batch) >= self.compression_batch_size or
526
+ (batch and (time.time() - last_flush) * 1000 >= self.batch_timeout_ms)
527
+ )
528
+
529
+ if should_flush and batch:
530
+ await self._compress_batch(batch)
531
+ batch = []
532
+ last_flush = time.time()
533
+
534
+ except asyncio.CancelledError:
535
+ # Flush remaining
536
+ if batch:
537
+ await self._compress_batch(batch)
538
+ break
539
+ except Exception as e:
540
+ logger.error(f"Compression worker {worker_id} error: {e}")
541
+ self.stats.errors += 1
542
+
543
+ async def _compress_batch(self, batch: List[QueuedChunk]):
544
+ """Compress chunks via individual API calls (parallel within batch)."""
545
+ if not batch:
546
+ return
547
+
548
+ # Compress all chunks in parallel
549
+ tasks = [self._compress_single(chunk) for chunk in batch]
550
+ await asyncio.gather(*tasks, return_exceptions=True)
551
+ self.stats.batches_sent += 1
552
+
553
+ async def _compress_single(self, chunk: QueuedChunk, retry: int = 0):
554
+ """Compress a single chunk via the /compress endpoint (supports large files)."""
555
+ max_retries = 3
556
+ # Round-robin worker selection for compression
557
+ worker_url = self.workers[self._worker_idx % len(self.workers)]
558
+ self._worker_idx += 1
559
+
560
+ try:
561
+ # Use multipart form upload for larger chunks
562
+ # Prepend chunk ID to data for tracking
563
+ chunk_data_with_id = f"CHUNK:{chunk.meta.chunk_id}:".encode('utf-8') + chunk.data
564
+
565
+ # Create multipart form data
566
+ files = {
567
+ 'file': (f'{chunk.meta.chunk_id}.bin', chunk_data_with_id, 'application/octet-stream')
568
+ }
569
+
570
+ response = await self._http_client.post(
571
+ f"{worker_url}/compress",
572
+ files=files
573
+ )
574
+
575
+ # Handle rate limiting with exponential backoff
576
+ if response.status_code == 429 and retry < max_retries:
577
+ wait_time = (2 ** retry) + (retry * 0.5) # 1s, 2.5s, 5s
578
+ logger.debug(f"Rate limited, retrying {chunk.meta.chunk_id} in {wait_time}s")
579
+ await asyncio.sleep(wait_time)
580
+ return await self._compress_single(chunk, retry + 1)
581
+
582
+ if response.status_code == 200:
583
+ result = response.json()
584
+ if result.get("success"):
585
+ # /compress returns compressed_data in base64
586
+ compressed_b64 = result.get("compressed_data", "")
587
+ if compressed_b64:
588
+ compressed_bytes = base64.b64decode(compressed_b64)
589
+ else:
590
+ compressed_bytes = b''
591
+
592
+ compressed = CompressedChunk(
593
+ meta=chunk.meta,
594
+ compressed_data=compressed_bytes,
595
+ compression_ratio=result.get("compression_ratio", 0)
596
+ )
597
+
598
+ # Update job state
599
+ job = self.jobs.get(chunk.meta.job_id)
600
+ if job:
601
+ job.chunks_compressed += 1
602
+ job.bytes_compressed += len(compressed_bytes)
603
+
604
+ # Queue for dispatch
605
+ await self.dispatch_queue.put(compressed)
606
+ self.stats.chunks_compressed += 1
607
+ self.stats.bytes_compressed += len(compressed_bytes)
608
+ return
609
+ else:
610
+ error = result.get("errors", ["Unknown error"])
611
+ logger.error(f"Compression failed for {chunk.meta.chunk_id}: {error}")
612
+ else:
613
+ # Log the actual error response
614
+ try:
615
+ error_detail = response.json().get("detail", response.text[:200])
616
+ except Exception:
617
+ error_detail = response.text[:200]
618
+ logger.error(f"Compression HTTP {response.status_code} for {chunk.meta.chunk_id}: {error_detail}")
619
+
620
+ # Mark as failed
621
+ job = self.jobs.get(chunk.meta.job_id)
622
+ if job:
623
+ job.chunks_failed += 1
624
+ job.errors.append(f"Compression failed for chunk {chunk.meta.chunk_id}")
625
+ self.stats.errors += 1
626
+
627
+ except Exception as e:
628
+ logger.error(f"Compression error for {chunk.meta.chunk_id}: {e}")
629
+ job = self.jobs.get(chunk.meta.job_id)
630
+ if job:
631
+ job.chunks_failed += 1
632
+ self.stats.errors += 1
633
+
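+ # Response shape consumed by _compress_single above (inferred from this client
+ # code, not an authoritative spec for the worker's /compress endpoint):
+ # {"success": true, "compressed_data": "<base64>", "compression_ratio": 0.42},
+ # with an optional "errors" list when "success" is false.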
634
+ async def _dispatch_worker(self, worker_id: int):
635
+ """Worker that dispatches compressed chunks to processing workers."""
636
+ while self._running:
637
+ try:
638
+ chunk = await asyncio.wait_for(
639
+ self.dispatch_queue.get(),
640
+ timeout=0.5
641
+ )
642
+
643
+ # Get job for operation info
644
+ job = self.jobs.get(chunk.meta.job_id)
645
+ if not job:
646
+ continue
647
+
648
+ if job.status == JobStatus.COMPRESSING:
649
+ job.status = JobStatus.PROCESSING
650
+
651
+ # Select worker (round-robin with least connections)
652
+ worker_url = self._select_worker()
653
+
654
+ # Dispatch to worker
655
+ start_time = time.perf_counter()
656
+
657
+ try:
658
+ self._worker_active[worker_url] += 1
659
+
660
+ response = await self._http_client.post(
661
+ f"{worker_url}/process",
662
+ json={
663
+ "chunk_id": chunk.meta.chunk_id,
664
+ "operation": job.operation,
665
+ "data": base64.b64encode(chunk.compressed_data).decode('utf-8'),
666
+ "params": job.operation_params
667
+ },
668
+ headers={"Authorization": f"Bearer {self.auth_token}"}
669
+ )
670
+
671
+ latency_ms = (time.perf_counter() - start_time) * 1000
672
+
673
+ if response.status_code == 200:
674
+ result = response.json()
675
+ if result.get("success"):
676
+ result_data = base64.b64decode(result.get("result", ""))
677
+
678
+ processed = ProcessedChunk(
679
+ meta=chunk.meta,
680
+ result_data=result_data,
681
+ success=True,
682
+ latency_ms=latency_ms
683
+ )
684
+ await self.result_queue.put(processed)
685
+
686
+ job.chunks_dispatched += 1
687
+ self.stats.chunks_dispatched += 1
688
+ else:
689
+ error = result.get("error", "Unknown error")
690
+ processed = ProcessedChunk(
691
+ meta=chunk.meta,
692
+ result_data=b'',
693
+ success=False,
694
+ error=error,
695
+ latency_ms=latency_ms
696
+ )
697
+ await self.result_queue.put(processed)
698
+ self.stats.errors += 1
699
+ else:
700
+ processed = ProcessedChunk(
701
+ meta=chunk.meta,
702
+ result_data=b'',
703
+ success=False,
704
+ error=f"HTTP {response.status_code}",
705
+ latency_ms=latency_ms
706
+ )
707
+ await self.result_queue.put(processed)
708
+ self.stats.errors += 1
709
+
710
+ finally:
711
+ self._worker_active[worker_url] -= 1
712
+
713
+ except asyncio.TimeoutError:
714
+ continue
715
+ except asyncio.CancelledError:
716
+ break
717
+ except Exception as e:
718
+ logger.error(f"Dispatch worker {worker_id} error: {e}")
719
+ self.stats.errors += 1
720
+
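+ # Request/response exchange used above (inferred from this client code, not a
+ # published contract for the worker's /process endpoint):
+ # request: {"chunk_id": "...", "operation": "...", "data": "<base64>", "params": {...}}
+ # response: {"success": true, "result": "<base64>"} or {"success": false, "error": "..."}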
721
+ async def _result_collector(self):
722
+ """Collects processed chunks and updates job state."""
723
+ while self._running:
724
+ try:
725
+ result = await asyncio.wait_for(
726
+ self.result_queue.get(),
727
+ timeout=1.0
728
+ )
729
+
730
+ job = self.jobs.get(result.meta.job_id)
731
+ if not job:
732
+ continue
733
+
734
+ if result.success:
735
+ job.results[result.meta.sequence] = result.result_data
736
+ job.chunks_completed += 1
737
+ self.stats.chunks_completed += 1
738
+ self.stats.bytes_out += len(result.result_data)
739
+ else:
740
+ job.chunks_failed += 1
741
+ job.errors.append(f"Chunk {result.meta.chunk_id}: {result.error}")
742
+
743
+ # Check if job is complete
744
+ total_processed = job.chunks_completed + job.chunks_failed
745
+ if total_processed >= job.total_chunks and job.total_chunks > 0:
746
+ if job.chunks_completed > 0:
747
+ job.status = JobStatus.COMPLETED # Full or partial success
748
+ else:
749
+ job.status = JobStatus.FAILED # Every chunk failed
750
+ job.completed_at = time.time()
751
+
752
+ elapsed = job.completed_at - job.created_at
753
+ logger.info(
754
+ f"Job {job.job_id} completed: {job.chunks_completed}/{job.total_chunks} chunks, "
755
+ f"{elapsed:.2f}s, {job.bytes_raw/1024/1024/elapsed:.2f} MB/s"
756
+ )
757
+
758
+ except asyncio.TimeoutError:
759
+ continue
760
+ except asyncio.CancelledError:
761
+ break
762
+ except Exception as e:
763
+ logger.error(f"Result collector error: {e}")
764
+
765
+ def _select_worker(self) -> str:
766
+ """Select best worker using least-connections."""
767
+ # Find worker with fewest active connections
768
+ min_active = min(self._worker_active.values())
769
+ candidates = [w for w, a in self._worker_active.items() if a == min_active]
770
+
771
+ # Round-robin among candidates
772
+ self._worker_idx = (self._worker_idx + 1) % len(candidates)
773
+ return candidates[self._worker_idx]
774
+
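+ # Worked example (assumed counts): with _worker_active == {"A": 2, "B": 0, "C": 0},
+ # min_active is 0, candidates is ["B", "C"], and the round-robin step returns
+ # either B or C, skipping the busier worker A.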
775
+
776
+ # =============================================================================
777
+ # FastAPI Application
778
+ # =============================================================================
779
+
780
+ def create_collector_app(
781
+ workers: List[str],
782
+ **kwargs
783
+ ) -> "FastAPI":
784
+ """Create a FastAPI app for the chunk collector."""
785
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form
786
+ from fastapi.middleware.cors import CORSMiddleware
787
+ from pydantic import BaseModel
788
+ from contextlib import asynccontextmanager
789
+
790
+ collector = ChunkCollector(workers=workers, **kwargs)
791
+
792
+ @asynccontextmanager
793
+ async def lifespan(app: FastAPI):
794
+ await collector.start()
795
+ yield
796
+ await collector.stop()
797
+
798
+ app = FastAPI(
799
+ title="Emergent Language Chunk Collector",
800
+ description="High-performance distributed file processing with θ-compression",
801
+ version="1.0.0",
802
+ lifespan=lifespan
803
+ )
804
+
805
+ app.add_middleware(
806
+ CORSMiddleware,
807
+ allow_origins=["*"],
808
+ allow_methods=["*"],
809
+ allow_headers=["*"],
810
+ )
811
+
812
+ class SubmitRequest(BaseModel):
813
+ data: str # Base64 encoded
814
+ operation: str
815
+ params: Dict[str, Any] = {}
816
+
817
+ class BulkSubmitRequest(BaseModel):
818
+ files: List[SubmitRequest]
819
+
820
+ @app.get("/")
821
+ async def root():
822
+ return {
823
+ "service": "chunk-collector",
824
+ "version": "1.0.0",
825
+ "endpoints": {
826
+ "/submit": "Submit file for processing (POST)",
827
+ "/submit/upload": "Upload file for processing (POST multipart)",
828
+ "/submit/bulk": "Submit multiple files (POST)",
829
+ "/status/{job_id}": "Get job status (GET)",
830
+ "/result/{job_id}": "Get job result (GET)",
831
+ "/stats": "Collector statistics (GET)",
832
+ "/health": "Health check (GET)"
833
+ }
834
+ }
835
+
836
+ @app.get("/health")
837
+ async def health():
838
+ stats = collector.get_stats()
839
+ return {
840
+ "status": "healthy",
841
+ "queues": stats["queues"],
842
+ "workers": stats["workers"],
843
+ "uptime": stats["uptime_seconds"]
844
+ }
845
+
846
+ @app.get("/stats")
847
+ async def stats():
848
+ return collector.get_stats()
849
+
850
+ @app.post("/submit")
851
+ async def submit(request: SubmitRequest):
852
+ """Submit base64-encoded file for processing."""
853
+ try:
854
+ data = base64.b64decode(request.data)
855
+ except Exception:
856
+ raise HTTPException(400, "Invalid base64 data")
857
+
858
+ job_id = await collector.submit_file(
859
+ data=data,
860
+ operation=request.operation,
861
+ operation_params=request.params
862
+ )
863
+
864
+ return {"job_id": job_id, "status": "queued"}
865
+
866
+ @app.post("/submit/upload")
867
+ async def submit_upload(
868
+ file: UploadFile = File(...),
869
+ operation: str = Form(default="passthrough"),
870
+ params: str = Form(default="{}")
871
+ ):
872
+ """Upload file for processing."""
873
+ data = await file.read()
874
+
875
+ try:
876
+ operation_params = json.loads(params)
877
+ except Exception:
878
+ operation_params = {}
879
+
880
+ job_id = await collector.submit_file(
881
+ data=data,
882
+ operation=operation,
883
+ operation_params=operation_params
884
+ )
885
+
886
+ return {
887
+ "job_id": job_id,
888
+ "status": "queued",
889
+ "file_name": file.filename,
890
+ "file_size": len(data)
891
+ }
892
+
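+ # Example request (illustrative; the host, port and file name are assumptions
+ # matching the CLI defaults below):
+ # curl -F "file=@report.bin" -F "operation=transform" \
+ #      http://localhost:8080/submit/upload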
893
+ @app.post("/submit/bulk")
894
+ async def submit_bulk(request: BulkSubmitRequest):
895
+ """Submit multiple files at once."""
896
+ job_ids = []
897
+ for f in request.files:
898
+ try:
899
+ data = base64.b64decode(f.data)
900
+ job_id = await collector.submit_file(
901
+ data=data,
902
+ operation=f.operation,
903
+ operation_params=f.params
904
+ )
905
+ job_ids.append({"job_id": job_id, "status": "queued"})
906
+ except Exception as e:
907
+ job_ids.append({"error": str(e)})
908
+
909
+ return {"jobs": job_ids}
910
+
911
+ @app.get("/status/{job_id}")
912
+ async def status(job_id: str):
913
+ return collector.get_job_status(job_id)
914
+
915
+ @app.get("/result/{job_id}")
916
+ async def result(job_id: str, timeout: float = 60.0):
917
+ """Get job result (waits for completion up to timeout)."""
918
+ result_bytes = await collector.get_result(job_id, timeout=timeout)
919
+
920
+ if result_bytes is None:
921
+ job = collector.jobs.get(job_id)
922
+ if job and job.status == JobStatus.FAILED:
923
+ raise HTTPException(500, f"Job failed: {job.errors}")
924
+ raise HTTPException(408, "Timeout waiting for result")
925
+
926
+ return {
927
+ "job_id": job_id,
928
+ "status": "completed",
929
+ "result": base64.b64encode(result_bytes).decode('utf-8'),
930
+ "result_size": len(result_bytes)
931
+ }
932
+
933
+ return app
934
+
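+ # Example HTTP client (illustrative sketch; host and port are assumptions that
+ # match the CLI default below, and the payload is a stand-in):
+ #
+ # import asyncio, base64, httpx
+ #
+ # async def _example_submit():
+ #     async with httpx.AsyncClient() as client:
+ #         payload = base64.b64encode(b"example payload").decode()
+ #         r = await client.post("http://localhost:8080/submit",
+ #                               json={"data": payload, "operation": "passthrough"})
+ #         job_id = r.json()["job_id"]
+ #         status = await client.get(f"http://localhost:8080/status/{job_id}")
+ #         print(status.json())
+ #
+ # asyncio.run(_example_submit())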
935
+
936
+ # =============================================================================
937
+ # CLI Entry Point
938
+ # =============================================================================
939
+
940
+ if __name__ == "__main__":
941
+ import argparse
942
+ import uvicorn
943
+
944
+ parser = argparse.ArgumentParser(description="Chunk Collector Service")
945
+ parser.add_argument("--port", "-p", type=int, default=8080, help="Port to listen on")
946
+ parser.add_argument("--workers", "-w", type=str, default="https://emergent-language.fly.dev",
947
+ help="Comma-separated worker URLs")
948
+ parser.add_argument("--batch-size", "-b", type=int, default=10, help="Compression batch size")
949
+ parser.add_argument("--chunk-size", "-c", type=float, default=1.0, help="Chunk size in MB")
950
+ parser.add_argument("--compressors", type=int, default=4, help="Number of compression workers")
951
+ parser.add_argument("--dispatchers", type=int, default=8, help="Number of dispatch workers")
952
+
953
+ args = parser.parse_args()
954
+
955
+ workers = [w.strip() for w in args.workers.split(",")]
956
+
957
+ app = create_collector_app(
958
+ workers=workers,
959
+ compression_batch_size=args.batch_size,
960
+ chunk_size_mb=args.chunk_size,
961
+ num_compressors=args.compressors,
962
+ num_dispatchers=args.dispatchers
963
+ )
964
+
965
+ print(f"""
966
+ ╔═══════════════════════════════════════════════════════════════════╗
967
+ ║ Emergent Language Chunk Collector ║
968
+ ╠═══════════════════════════════════════════════════════════════════╣
969
+ ║ Port: {args.port:<6} ║
970
+ ║ Workers: {len(workers):<6} ║
971
+ ║ Batch size: {args.batch_size:<6} ║
972
+ ║ Chunk size: {args.chunk_size}MB ║
973
+ ║ Compressors: {args.compressors:<6} ║
974
+ ║ Dispatchers: {args.dispatchers:<6} ║
975
+ ╚═══════════════════════════════════════════════════════════════════╝
976
+ """)
977
+
978
+ uvicorn.run(app, host="0.0.0.0", port=args.port)