emergent_translator-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,555 @@
+ #!/usr/bin/env python3
+ """
+ Batch Endpoint for Emergent Language
+
+ Encodes multiple messages in a single request for enterprise use cases.
+
+ Benefits:
+ - Amortizes HTTP overhead across many messages
+ - Better compression (larger inputs give better zlib ratios)
+ - Reduced connection churn
+ - Atomic batch processing
+
+ Usage:
+     from batch_encoder import BatchEncoder
+
+     encoder = BatchEncoder()
+     result = encoder.encode_batch([msg1, msg2, msg3, ...])
+
+     # result.payload           - the batch payload bytes
+     # result.compressed_bytes  - encoded size in bytes
+     # result.compression_ratio - compressed/original size ratio
+     # result.messages_encoded  - message count
+
+     # On the receiving side:
+     decoded = encoder.decode_batch(result.payload)
+ """
+
+ import json
+ import zlib
+ import struct
+ import time
+ from dataclasses import dataclass
+ from typing import List, Dict, Any, Tuple
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # Emergent Symbol Encoder v2 (same as before)
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ COMMON_KEYS = {
+     "task": 0x01, "type": 0x02, "data": 0x03, "id": 0x04,
+     "agent": 0x05, "agent_id": 0x06, "task_type": 0x07,
+     "params": 0x08, "parameters": 0x08, "context": 0x09,
+     "priority": 0x0A, "status": 0x0B, "result": 0x0C,
+     "error": 0x0D, "message": 0x0E, "timestamp": 0x0F,
+     "action": 0x10, "target": 0x11, "source": 0x12, "value": 0x13,
+     "name": 0x14, "depth": 0x15, "level": 0x16, "mode": 0x17,
+     "config": 0x18, "settings": 0x19, "options": 0x1A,
+     "request": 0x1B, "response": 0x1C, "callback": 0x1D,
+     "coordination": 0x1E, "agents": 0x1F,
+     "version": 0x20, "v": 0x20, "capabilities": 0x21, "role": 0x22,
+     "task_id": 0x30, "request_id": 0x40, "initiator": 0x41,
+     "participants": 0x42, "task_distribution": 0x43, "consensus": 0x44,
+     "metrics": 0x50, "summary": 0x53, "confidence": 0x58,
+     "recommendations": 0x56, "timeframe": 0x59, "market": 0x60,
+     "sentiment": 0x68, "volatility": 0x69, "risk": 0x6A,
+     "order": 0x71, "symbol": 0x61, "side": 0x74, "quantity": 0x73, "limit": 0x75,
+ }
+
+ COMMON_VALUES = {
+     "analyze": 0x01, "optimize": 0x02, "execute": 0x03, "query": 0x04,
+     "high": 0x10, "medium": 0x11, "low": 0x12, "comprehensive": 0x18,
+     "pending": 0x20, "running": 0x21, "complete": 0x22, "completed": 0x22,
+     "failed": 0x23, "success": 0x26, "analysis": 0x30, "optimization": 0x31,
+     "execution": 0x32, "bullish": 0x50, "bearish": 0x51, "volatile": 0x53,
+     "buy": 0x56, "sell": 0x57, "hold": 0x58, "market": 0x60, "data": 0x61,
+     "trends": 0x62, "moderate": 0x66, "24h": 0x72, "quorum": 0x44,
+ }
+
+ # Reverse lookups for decoding
+ COMMON_KEYS_REV = {v: k for k, v in COMMON_KEYS.items()}
+ COMMON_VALUES_REV = {v: k for k, v in COMMON_VALUES.items()}
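+
+ # Note: some aliases share an id ("params"/"parameters", "version"/"v",
+ # "complete"/"completed"); the comprehension keeps the last alias, so
+ # decoding canonicalizes those strings rather than round-tripping them
+ # exactly. Key ids must stay <= 0x7F because encode_dict() marks table
+ # keys by setting the high bit.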
+
+
+ def encode_dict(d: Dict, depth: int = 0, codebook=None) -> bytes:
+     if depth > 10 or not d:
+         return b'\x60'
+     keys_map = codebook.keys if codebook is not None else COMMON_KEYS
+     parts = [bytes([0x61, min(len(d), 255)])]
+     for k, v in list(d.items())[:255]:
+         kl = k.lower()
+         if kl in keys_map:
+             parts.append(bytes([0x80 | keys_map[kl]]))
+         else:
+             kb = k.encode()[:63]
+             parts.append(bytes([len(kb)]) + kb)
+         parts.append(encode_value(v, depth + 1, codebook=codebook))
+     return b''.join(parts)
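+
+ # Worked example: encode_dict({"task": "analyze"}) returns
+ # b'\x61\x01\x81\x20\x01' -- dict marker 0x61, entry count 1, key id
+ # 0x01 with the high bit set, then common-value marker 0x20 and value
+ # id 0x01: five bytes versus 18 bytes of compact JSON.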
+
+
+ def encode_value(v: Any, depth: int = 0, codebook=None) -> bytes:
+     if v is None:
+         return b'\x00'
+     if isinstance(v, bool):
+         return bytes([0x01 if v else 0x02])
+     if isinstance(v, int):
+         if 0 <= v <= 127:
+             return bytes([0x10, v])
+         if -128 <= v <= 127:
+             return bytes([0x11, v & 0xFF])
+         if 0 <= v <= 65535:
+             return bytes([0x12]) + struct.pack('>H', v)
+         if -2**31 <= v <= 2**31 - 1:
+             return bytes([0x13]) + struct.pack('>i', v)
+         # No 64-bit marker in the format: fall back to a string encoding
+         # rather than letting struct.pack raise on out-of-range ints.
+         return encode_value(str(v), depth, codebook=codebook)
+     if isinstance(v, float):
+         # is_integer() avoids int(v) raising on inf/nan
+         if v.is_integer() and 0 <= v <= 65535:
+             return encode_value(int(v), depth, codebook=codebook)
+         return bytes([0x18]) + struct.pack('>f', v)
+     if isinstance(v, str):
+         vl = v.lower()
+         values_map = codebook.values if codebook is not None else COMMON_VALUES
+         if vl in values_map:
+             return bytes([0x20, values_map[vl]])
+         vb = v.encode()[:255]
+         if len(vb) < 16:
+             return bytes([0x30 | len(vb)]) + vb
+         return bytes([0x40, len(vb)]) + vb
+     if isinstance(v, list):
+         if not v:
+             return b'\x50'
+         parts = [bytes([0x51, min(len(v), 255)])]
+         for item in v[:255]:
+             parts.append(encode_value(item, depth + 1, codebook=codebook))
+         return b''.join(parts)
+     if isinstance(v, dict):
+         return encode_dict(v, depth + 1, codebook=codebook)
+     return encode_value(str(v), depth, codebook=codebook)
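+
+ # Worked examples: encode_value(42) -> b'\x10\x2a';
+ # encode_value(-5) -> b'\x11\xfb'; encode_value(300) -> b'\x12\x01\x2c';
+ # encode_value("hello") -> b'\x35hello' (short-string marker 0x30 | len).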
+
+
+ def encode_single(data: Dict) -> Tuple[bytes, int, int]:
+     """Encode single message. Returns (bytes, original_size, compressed_size)."""
+     original = json.dumps(data, separators=(',', ':'))
+     original_size = len(original.encode())
+     raw = encode_dict(data)
+     compressed = zlib.compress(raw, 9)
+     if len(compressed) < len(raw) - 4:
+         payload = b'\x01' + compressed
+     else:
+         payload = b'\x00' + raw
+     final = b'\xE7\x02' + payload
+     return final, original_size, len(final)
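+
+ # Worked example: encode_single({"task": "analyze"}) yields the 8-byte
+ # message E7 02 00 61 01 81 20 01 -- magic E7 02, flag 0x00 (zlib cannot
+ # shrink a 5-byte body by the required 4 bytes), then the encoded dict.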
+
+
+ def decode_value(data: bytes, offset: int = 0, codebook=None) -> Tuple[Any, int]:
+     """Decode a single value from bytes. Returns (value, bytes_consumed)."""
+     marker = data[offset]
+     values_rev_map = codebook.values_rev if codebook is not None else COMMON_VALUES_REV
+
+     if marker == 0x00:
+         return None, 1
+     if marker == 0x01:
+         return True, 1
+     if marker == 0x02:
+         return False, 1
+
+     # Integers
+     if marker == 0x10:
+         return data[offset + 1], 2
+     if marker == 0x11:
+         v = data[offset + 1]
+         if v > 127:
+             v -= 256
+         return v, 2
+     if marker == 0x12:
+         return struct.unpack('>H', data[offset + 1:offset + 3])[0], 3
+     if marker == 0x13:
+         return struct.unpack('>i', data[offset + 1:offset + 5])[0], 5
+
+     # Float
+     if marker == 0x18:
+         return struct.unpack('>f', data[offset + 1:offset + 5])[0], 5
+
+     # Common value
+     if marker == 0x20:
+         vid = data[offset + 1]
+         return values_rev_map.get(vid, f"<unknown_value:{vid:#x}>"), 2
+
+     # Short string (0x30-0x3F)
+     if 0x30 <= marker <= 0x3F:
+         slen = marker & 0x0F
+         return data[offset + 1:offset + 1 + slen].decode('utf-8', errors='replace'), 1 + slen
+
+     # Medium string
+     if marker == 0x40:
+         slen = data[offset + 1]
+         return data[offset + 2:offset + 2 + slen].decode('utf-8', errors='replace'), 2 + slen
+
+     # Empty list
+     if marker == 0x50:
+         return [], 1
+
+     # List with count
+     if marker == 0x51:
+         count = data[offset + 1]
+         pos = 2
+         items = []
+         for _ in range(count):
+             val, consumed = decode_value(data, offset + pos, codebook=codebook)
+             items.append(val)
+             pos += consumed
+         return items, pos
+
+     # Dict
+     if marker == 0x60:
+         return {}, 1
+     if marker == 0x61:
+         val, consumed = decode_dict(data, offset, codebook=codebook)
+         return val, consumed
+
+     raise ValueError(f"Unknown marker {marker:#x} at offset {offset}")
+
+
+ def decode_dict(data: bytes, offset: int = 0, codebook=None) -> Tuple[Dict, int]:
+     """Decode a dictionary from bytes. Returns (dict, bytes_consumed)."""
+     marker = data[offset]
+     if marker == 0x60:
+         return {}, 1
+
+     if marker != 0x61:
+         raise ValueError(f"Expected dict marker 0x60/0x61, got {marker:#x} at offset {offset}")
+
+     keys_rev_map = codebook.keys_rev if codebook is not None else COMMON_KEYS_REV
+
+     count = data[offset + 1]
+     pos = 2
+     result = {}
+
+     for _ in range(count):
+         # Decode key: high bit set = table id, otherwise inline length + bytes
+         kb = data[offset + pos]
+         if kb & 0x80:
+             key_id = kb & 0x7F
+             key = keys_rev_map.get(key_id, f"<unknown_key:{key_id:#x}>")
+             pos += 1
+         else:
+             klen = kb
+             key = data[offset + pos + 1:offset + pos + 1 + klen].decode('utf-8', errors='replace')
+             pos += 1 + klen
+
+         # Decode value
+         val, consumed = decode_value(data, offset + pos, codebook=codebook)
+         result[key] = val
+         pos += consumed
+
+     return result, pos
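+
+ # Round trip: decode_dict(b'\x61\x01\x81\x20\x01') returns
+ # ({"task": "analyze"}, 5), inverting the encode_dict example above.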
+
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # Batch Encoder
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ @dataclass
+ class BatchResult:
+     """Result of batch encoding."""
+     messages_encoded: int
+     original_bytes: int
+     compressed_bytes: int
+     compression_ratio: float
+     bandwidth_saved_pct: float
+     payload: bytes
+     encode_time_ms: float
+
+     # Comparison with individual encoding
+     individual_bytes: int = 0
+     batch_advantage_pct: float = 0.0
+
+
+ class BatchEncoder:
+     """
+     Batch encoder for multiple messages.
+
+     Format (v1/v2):
+         MAGIC (2 bytes): 0xE7 0xB0 (θ batch)
+         VERSION (1 byte): 0x01 (the decoder also accepts 0x02)
+         COUNT (2 bytes): number of messages (big-endian)
+         COMPRESSED (1 byte): 0x00 = raw, 0x01 = zlib
+         PAYLOAD: concatenated encoded messages (each prefixed with 2-byte length)
+         CHECKSUM (4 bytes): CRC32
+
+     Format (v3 -- adaptive codebook):
+         MAGIC (2) + VERSION=0x03 (1) + COUNT (2) + FLAGS (1)
+         + CB_VERSION (2) + CB_LEN (2) + EMBEDDED_CB + PAYLOAD + CRC (4)
+     """
+
+     MAGIC = b'\xE7\xB0'  # θ batch
+     VERSION = 0x01
+     VERSION_ADAPTIVE = 0x03
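+
+     # Worked example: a zlib-compressed v1 batch of three messages starts
+     # E7 B0 01 00 03 01 (magic, version, count=3 big-endian, flag) and
+     # ends with a 4-byte big-endian CRC32 over everything before it.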
+
+     def __init__(self, codebook=None):
+         self._codebook = codebook
+
+     def encode_batch(self, messages: List[Dict], embed_codebook: bool = True) -> BatchResult:
+         """Encode multiple messages into a single batch payload.
+
+         Args:
+             messages: List of dicts to encode.
+             embed_codebook: If True (default), embed the codebook in the v3
+                 header so payloads are self-contained. Set to False when both
+                 sides already share the codebook to avoid per-batch overhead.
+         """
+         start = time.perf_counter()
+
+         # Auto-observe messages for codebook learning
+         if self._codebook is not None:
+             self._codebook.observe(messages)
+
+         # Capture active codebook snapshot
+         active_cb = self._codebook.get_active() if self._codebook is not None else None
+
+         # Calculate original size (JSON)
+         original_json = [json.dumps(m, separators=(',', ':')) for m in messages]
+         original_bytes = sum(len(j.encode()) for j in original_json)
+
+         # Encode each message individually (for comparison)
+         individual_encoded = []
+         for msg in messages:
+             encoded, _, _ = encode_single(msg)
+             individual_encoded.append(encoded)
+         individual_bytes = sum(len(e) for e in individual_encoded)
+
+         # Batch encode: concatenate raw encodings, then compress together
+         raw_messages = []
+         for msg in messages:
+             raw = encode_dict(msg, codebook=active_cb)
+             # Prefix each with 2-byte length
+             raw_messages.append(struct.pack('>H', len(raw)) + raw)
+
+         combined_raw = b''.join(raw_messages)
+
+         # Compress the combined payload
+         compressed = zlib.compress(combined_raw, 9)
+
+         # Use compressed if smaller
+         if len(compressed) < len(combined_raw):
+             payload_data = compressed
+             compression_flag = 0x01
+         else:
+             payload_data = combined_raw
+             compression_flag = 0x00
+
+         # Build batch header
+         if active_cb is not None:
+             # V3 header -- optionally embed the codebook
+             if embed_codebook:
+                 flags = compression_flag | 0x02  # bit 1 = codebook embedded
+                 cb_serialized = active_cb.serialize()
+             else:
+                 flags = compression_flag  # bit 1 clear = no embedded codebook
+                 cb_serialized = b''
+             header = (
+                 self.MAGIC +
+                 bytes([self.VERSION_ADAPTIVE]) +
+                 struct.pack('>H', len(messages)) +
+                 bytes([flags]) +
+                 struct.pack('>H', active_cb.version) +
+                 struct.pack('>H', len(cb_serialized)) +
+                 cb_serialized
+             )
+         else:
+             header = (
+                 self.MAGIC +
+                 bytes([self.VERSION]) +
+                 struct.pack('>H', len(messages)) +
+                 bytes([compression_flag])
+             )
+
+         # Final payload
+         payload = header + payload_data
+
+         # Add checksum
+         checksum = zlib.crc32(payload) & 0xFFFFFFFF
+         final_payload = payload + struct.pack('>I', checksum)
+
+         encode_time = (time.perf_counter() - start) * 1000
+
+         compressed_bytes = len(final_payload)
+         compression_ratio = compressed_bytes / original_bytes if original_bytes > 0 else 1.0
+         bandwidth_saved = (1 - compression_ratio) * 100
+
+         # Batch advantage over individual encoding
+         batch_advantage = (1 - compressed_bytes / individual_bytes) * 100 if individual_bytes > 0 else 0.0
+
+         return BatchResult(
+             messages_encoded=len(messages),
+             original_bytes=original_bytes,
+             compressed_bytes=compressed_bytes,
+             compression_ratio=compression_ratio,
+             bandwidth_saved_pct=bandwidth_saved,
+             payload=final_payload,
+             encode_time_ms=encode_time,
+             individual_bytes=individual_bytes,
+             batch_advantage_pct=batch_advantage,
+         )
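+
+     # Fixed cost per batch: a 6-byte v1 header (10 bytes plus codebook for
+     # v3) and a 4-byte CRC, plus a 2-byte length prefix per message --
+     # versus 3 bytes of header per message with encode_single(), and one
+     # shared zlib stream instead of one per message.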
+
+     def decode_batch_header(self, payload: bytes) -> dict:
+         """Decode batch header (for inspection)."""
+         if len(payload) < 10:
+             return {"error": "payload too short"}
+
+         magic = payload[:2]
+         if magic != self.MAGIC:
+             return {"error": f"invalid magic: {magic.hex()}"}
+
+         version = payload[2]
+         count = struct.unpack('>H', payload[3:5])[0]
+         flags = payload[5]
+
+         info = {
+             "magic": magic.hex(),
+             "version": version,
+             "message_count": count,
+             "compressed": bool(flags & 0x01),
+             "payload_size": len(payload),
+         }
+
+         if version == 0x03:
+             info["codebook_embedded"] = bool(flags & 0x02)
+             info["codebook_version"] = struct.unpack('>H', payload[6:8])[0]
+
+         return info
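+
+     # Example: for a v1 batch of three messages this returns
+     # {"magic": "e7b0", "version": 1, "message_count": 3,
+     #  "compressed": ..., "payload_size": ...}, where "compressed"
+     # reflects whether zlib actually shrank the combined payload.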
+
+     def decode_batch(self, payload: bytes, codebook=None) -> List[Dict]:
+         """Decode a batch payload back into a list of message dicts.
+
+         Args:
+             payload: The binary batch payload.
+             codebook: Optional external CodebookVersion for v3 payloads
+                 encoded with ``embed_codebook=False``. Ignored when the
+                 payload already contains an embedded codebook.
+         """
+         if len(payload) < 10:
+             raise ValueError("Payload too short")
+
+         # Verify checksum (last 4 bytes)
+         stored_crc = struct.unpack('>I', payload[-4:])[0]
+         computed_crc = zlib.crc32(payload[:-4]) & 0xFFFFFFFF
+         if stored_crc != computed_crc:
+             raise ValueError(f"CRC32 mismatch: stored={stored_crc:#x}, computed={computed_crc:#x}")
+
+         # Parse header
+         magic = payload[:2]
+         if magic != self.MAGIC:
+             raise ValueError(f"Invalid magic: {magic.hex()}")
+
+         version = payload[2]
+         if version not in (0x01, 0x02, 0x03):
+             raise ValueError(f"Unsupported version: {version:#x}")
+
+         count = struct.unpack('>H', payload[3:5])[0]
+         flags = payload[5]
+
+         cb = codebook  # external codebook (may be None)
+         if version == 0x03:
+             # V3: parse FLAGS, CB_VERSION, embedded codebook
+             compression_flag = flags & 0x01
+             cb_embedded = bool(flags & 0x02)
+             cb_version_num = struct.unpack('>H', payload[6:8])[0]  # informational
+             cb_len = struct.unpack('>H', payload[8:10])[0]
+             data_offset = 10
+
+             if cb_embedded:
+                 # Imported lazily so v1/v2 decoding works without the module
+                 from .adaptive_codebook import CodebookVersion
+                 cb, _ = CodebookVersion.deserialize(payload, data_offset)
+                 data_offset += cb_len
+             # Note: without a matching codebook, learned ids fall back to the
+             # default tables and may decode as <unknown_key>/<unknown_value>.
+
+             raw_data = payload[data_offset:-4]
+         else:
+             # V1/V2: simple header
+             compression_flag = flags
+             raw_data = payload[6:-4]
+
+         # Decompress if needed
+         if compression_flag & 0x01:
+             raw_data = zlib.decompress(raw_data)
+
+         # Split messages by 2-byte length prefixes
+         messages = []
+         pos = 0
+         for _ in range(count):
+             msg_len = struct.unpack('>H', raw_data[pos:pos + 2])[0]
+             pos += 2
+             msg_bytes = raw_data[pos:pos + msg_len]
+             msg, _ = decode_dict(msg_bytes, 0, codebook=cb)
+             messages.append(msg)
+             pos += msg_len
+
+         return messages
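+
+     # Round-trip example (default codebook):
+     #   encoder = BatchEncoder()
+     #   result = encoder.encode_batch([{"task": "analyze"}] * 3)
+     #   assert encoder.decode_batch(result.payload) == [{"task": "analyze"}] * 3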
+
+
+ # ═══════════════════════════════════════════════════════════════════════════════
+ # Demo
+ # ═══════════════════════════════════════════════════════════════════════════════
+
+ SAMPLE_MESSAGES = [
+     {"task": "analyze", "data": "market trends", "priority": "high"},
+     {"agent_id": "agent_001", "task_type": "analysis", "status": "pending"},
+     {"agent_id": "agent_002", "task_type": "execution", "status": "running"},
+     {"coordination": {"request_id": "coord_001", "participants": ["a1", "a2", "a3"]}},
+     {"result": {"summary": "bullish", "confidence": 0.85, "recommendations": ["buy"]}},
+     {"task_id": "task_001", "status": "complete", "metrics": {"latency": 45, "success": True}},
+     {"agent": {"id": "trader", "version": "2.0"}, "context": {"market": "volatile"}},
+     {"order": {"symbol": "BTC", "side": "buy", "quantity": 100, "status": "pending"}},
+     {"task_type": "optimization", "priority": "high", "context": {"risk": "low"}},
+     {"status": "complete", "result": {"value": 42, "confidence": 0.95}},
+ ]
+
+
+ def main():
+     import random
+
+     print("\n" + "═" * 70)
+     print(" 🎯 BATCH ENCODER DEMO")
+     print("═" * 70 + "\n")
+
+     encoder = BatchEncoder()
+
+     # Test different batch sizes
+     for batch_size in [10, 50, 100, 500, 1000]:
+         # Generate batch
+         messages = [random.choice(SAMPLE_MESSAGES) for _ in range(batch_size)]
+
+         result = encoder.encode_batch(messages)
+
+         print(f" Batch size: {batch_size:,} messages")
+         print(f" Original (JSON): {result.original_bytes:,} bytes")
+         print(f" Individual encoded: {result.individual_bytes:,} bytes")
+         print(f" Batch encoded: {result.compressed_bytes:,} bytes")
+         print(f" Compression: {result.bandwidth_saved_pct:.1f}% saved")
+         print(f" Batch advantage: {result.batch_advantage_pct:.1f}% smaller than individual")
+         print(f" Encode time: {result.encode_time_ms:.2f}ms")
+         print()
+
+     print("═" * 70)
+     print(" 📊 BATCH vs INDIVIDUAL COMPARISON")
+     print("═" * 70 + "\n")
+
+     # Detailed comparison for 100 messages
+     messages = [random.choice(SAMPLE_MESSAGES) for _ in range(100)]
+     result = encoder.encode_batch(messages)
+
+     print(" 100 agent messages:")
+     print(" ┌────────────────┬───────────┬─────────┐")
+     print(" │ Method         │      Size │ Savings │")
+     print(" ├────────────────┼───────────┼─────────┤")
+     print(f" │ Raw JSON       │ {result.original_bytes:>7,} B │       - │")
+     print(f" │ Individual θ   │ {result.individual_bytes:>7,} B │ {(1 - result.individual_bytes / result.original_bytes) * 100:>6.1f}% │")
+     print(f" │ Batch θ        │ {result.compressed_bytes:>7,} B │ {result.bandwidth_saved_pct:>6.1f}% │")
+     print(" └────────────────┴───────────┴─────────┘")
+     print()
+     print(f" Batch is {result.batch_advantage_pct:.1f}% smaller than individual encoding!")
+     print(f" Total savings vs JSON: {result.bandwidth_saved_pct:.1f}%")
+     print()
+
+     # Show header info
+     header = encoder.decode_batch_header(result.payload)
+     print(f" Batch header: {header}")
+     print()
+
+
+ if __name__ == "__main__":
+     main()