emergent-translator 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ """
2
+ Emergent Language Translator
3
+
4
+ A high-performance API for translating between traditional AI communication formats
5
+ and emergent language symbols with 60x compression efficiency.
6
+
7
+ Example:
8
+ >>> from emergent_translator import BatchEncoder
9
+ >>> encoder = BatchEncoder()
10
+ >>> result = encoder.encode_batch([{"role": "user", "content": "hello"}])
11
+ """
12
+
13
+ # --- Always-available imports (no heavy deps) ---
14
+ from .batch_encoder import (
15
+ BatchEncoder,
16
+ BatchResult,
17
+ COMMON_KEYS,
18
+ COMMON_VALUES,
19
+ )
20
+
21
+ from .adaptive_codebook import (
22
+ AdaptiveCodebook,
23
+ CodebookVersion,
24
+ FrequencyTracker,
25
+ )
26
+
27
+ from .format_handlers import (
28
+ detect_format,
29
+ get_handler,
30
+ is_binary_format,
31
+ )
32
+
33
+ from .emergent_symbols import EmergentSymbolEncoder
34
+
35
+ __version__ = "1.1.0"
36
+ __author__ = "Emergent Language Team"
37
+ __email__ = "hello@emergentlanguage.ai"
38
+ __description__ = "60x compression efficiency for AI communication"
39
+ __url__ = "https://github.com/maco144/emergent-language"
40
+
41
+ __all__ = [
42
+ # Batch encoder
43
+ "BatchEncoder",
44
+ "BatchResult",
45
+ "COMMON_KEYS",
46
+ "COMMON_VALUES",
47
+
48
+ # Adaptive codebook
49
+ "AdaptiveCodebook",
50
+ "CodebookVersion",
51
+ "FrequencyTracker",
52
+
53
+ # Format handlers
54
+ "detect_format",
55
+ "get_handler",
56
+ "is_binary_format",
57
+
58
+ # Symbols
59
+ "EmergentSymbolEncoder",
60
+
61
+ # GPU (lazy-loaded)
62
+ "GPUBatchEncoder",
63
+
64
+ # SDK / server components (lazy-loaded, need extra deps)
65
+ "TranslatorSDK",
66
+ "EmergentTranslatorClient",
67
+ "SyncEmergentTranslatorClient",
68
+ "EmergentLanguageTranslator",
69
+ "ChunkCoordinator",
70
+ "ChunkCollector",
71
+
72
+ # Metadata
73
+ "__version__",
74
+ "__author__",
75
+ "__email__",
76
+ "__description__",
77
+ "__url__",
78
+ ]
79
+
80
# --- Lazy imports for modules with heavy/optional dependencies ---
# Maps a public attribute name -> (relative module path, attribute inside
# that module). Entries are resolved on first access by the module-level
# __getattr__ below (PEP 562), so importing this package stays cheap and
# does not pull in the optional dependencies these modules need.
_LAZY_IMPORTS = {
    # GPU encoder (cupy detection prints at import time)
    "GPUBatchEncoder": (".gpu_batch_encoder", "GPUBatchEncoder"),
    # SDK / client
    "TranslatorSDK": (".client_sdk", "TranslatorSDK"),
    "EmergentTranslatorClient": (".client_sdk", "EmergentTranslatorClient"),
    "SyncEmergentTranslatorClient": (".client_sdk", "SyncEmergentTranslatorClient"),
    # Core (needs eudaimonia)
    "EmergentLanguageTranslator": (".core", "EmergentLanguageTranslator"),
    "TranslationFormat": (".core", "TranslationFormat"),
    "TranslationDirection": (".core", "TranslationDirection"),
    "TranslationResult": (".core", "TranslationResult"),
    "TranslationStats": (".core", "TranslationStats"),
    # Distributed processing
    "ChunkCoordinator": (".chunk_coordinator", "ChunkCoordinator"),
    "DistributedJob": (".chunk_coordinator", "DistributedJob"),
    "Chunk": (".chunk_coordinator", "Chunk"),
    "ChunkStatus": (".chunk_coordinator", "ChunkStatus"),
    "JobStatus": (".chunk_coordinator", "JobStatus"),
    "distributed_process": (".chunk_coordinator", "distributed_process"),
    # Chunk collector
    "ChunkCollector": (".chunk_collector", "ChunkCollector"),
    "create_collector_app": (".chunk_collector", "create_collector_app"),
    "CollectorStats": (".chunk_collector", "CollectorStats"),
    # Claude compression (note: public names are prefixed text_* while the
    # backing module exports plain compress/decompress)
    "TextCodebook": (".claude_compression", "TextCodebook"),
    "ClaudeCompressor": (".claude_compression", "ClaudeCompressor"),
    "text_compress": (".claude_compression", "compress"),
    "text_decompress": (".claude_compression", "decompress"),
    "estimate_tokens": (".claude_compression", "estimate_tokens"),
    # Code skeleton
    "skeletonize": (".code_skeleton", "skeletonize"),
    "skeletonize_file": (".code_skeleton", "skeletonize_file"),
    "skeletonize_dir": (".code_skeleton", "skeletonize_dir"),
    "SkeletonResult": (".code_skeleton", "SkeletonResult"),
    "CodeSkeleton": (".code_skeleton", "CodeSkeleton"),
}
118
+
119
+
120
def __getattr__(name):
    """Resolve lazily-exported attributes on first access (PEP 562).

    Looks *name* up in ``_LAZY_IMPORTS``, imports the backing module, and
    caches the resolved object in the module globals so that subsequent
    accesses bypass this function entirely (the original re-ran the import
    machinery on every access).

    Raises:
        AttributeError: if *name* is not a known lazy export.
    """
    if name in _LAZY_IMPORTS:
        module_path, attr = _LAZY_IMPORTS[name]
        import importlib
        mod = importlib.import_module(module_path, __name__)
        value = getattr(mod, attr)
        # Cache: after this assignment, __getattr__ is never hit for `name`.
        globals()[name] = value
        return value
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    """Include lazy exports in dir(package) for discoverability (PEP 562)."""
    return sorted(set(globals()) | set(_LAZY_IMPORTS))
@@ -0,0 +1,342 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Adaptive Learned Codebook for Compression
4
+
5
+ Replaces hardcoded COMMON_KEYS/COMMON_VALUES with a codebook that learns
6
+ from observed traffic. Pure frequency-based, no governance.
7
+
8
+ Architecture:
9
+ AdaptiveCodebook
10
+ ├── FrequencyTracker # Counts key/value occurrences (thread-safe)
11
+ ├── CodebookVersion # Immutable snapshot (frozen dataclass)
12
+ ├── versions: Dict[int, CodebookVersion]
13
+ ├── active: CodebookVersion
14
+ ├── rebuild(min_freq) # Create new version from accumulated data
15
+ └── save()/load() # JSON persistence
16
+ """
17
+
18
+ import json
19
+ import struct
20
+ import threading
21
+ import time
22
+ from dataclasses import dataclass, field
23
+ from typing import Any, Dict, List, Optional, Tuple
24
+
25
+ # Import static dictionaries for baseline codebook (version 0)
26
+ from .batch_encoder import COMMON_KEYS, COMMON_VALUES
27
+
28
+
29
+ # =============================================================================
30
+ # FrequencyTracker
31
+ # =============================================================================
32
+
33
class FrequencyTracker:
    """Thread-safe frequency counter for keys and string values.

    Keys and short string values are lowercased before counting, so the
    statistics are case-insensitive. All public methods serialize access
    through a single internal lock.
    """

    def __init__(self):
        self._lock = threading.Lock()
        self._key_counts: Dict[str, int] = {}
        self._value_counts: Dict[str, int] = {}
        self._total_messages: int = 0

    def observe(self, msg: Dict[str, Any]) -> None:
        """Record keys and string values from a single message."""
        self.observe_batch([msg])

    def observe_batch(self, messages: List[Dict[str, Any]]) -> None:
        """Record keys and string values from multiple messages."""
        with self._lock:
            for message in messages:
                self._total_messages += 1
                self._extract(message, depth=0)

    def _extract(self, node: Any, depth: int) -> None:
        """Recursively tally lowercased keys and short string values.

        Must be called while holding self._lock.
        """
        if depth > 10:  # cap recursion on pathologically nested payloads
            return
        if isinstance(node, dict):
            for key, child in node.items():
                lowered = key.lower()
                self._key_counts[lowered] = self._key_counts.get(lowered, 0) + 1
                self._extract(child, depth + 1)
        elif isinstance(node, list):
            for child in node:
                self._extract(child, depth + 1)
        elif isinstance(node, str):
            lowered = node.lower()
            # Long strings are almost certainly free text, not repeated
            # values worth a codebook slot — skip them.
            if len(lowered) <= 64:
                self._value_counts[lowered] = self._value_counts.get(lowered, 0) + 1

    def get_top_keys(self, n: int) -> List[Tuple[str, int]]:
        """Return the n most frequent keys as (key, count) pairs."""
        with self._lock:
            ranked = sorted(self._key_counts.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[:n]

    def get_top_values(self, n: int) -> List[Tuple[str, int]]:
        """Return the n most frequent values as (value, count) pairs."""
        with self._lock:
            ranked = sorted(self._value_counts.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[:n]

    @property
    def total_messages(self) -> int:
        """Number of messages observed since construction or last reset."""
        with self._lock:
            return self._total_messages

    def reset(self) -> None:
        """Clear all counts."""
        with self._lock:
            self._key_counts.clear()
            self._value_counts.clear()
            self._total_messages = 0
98
+
99
+
100
+ # =============================================================================
101
+ # CodebookVersion
102
+ # =============================================================================
103
+
104
@dataclass(frozen=True)
class CodebookVersion:
    """Immutable snapshot of a codebook mapping.

    Holds forward (str -> byte id) and reverse (byte id -> str) tables for
    both keys and values, plus training provenance. Instances are frozen so
    a published version can never change under an encoder using it.
    """
    version: int
    keys: Dict[str, int]        # str → byte_id (0x01-0x7F)
    values: Dict[str, int]      # str → byte_id (0x01-0x7F)
    keys_rev: Dict[int, str]    # byte_id → str
    values_rev: Dict[int, str]  # byte_id → str
    trained_on: int = 0         # number of messages used to train
    created_at: float = 0.0     # timestamp (epoch seconds)

    @staticmethod
    def _encode_name(name: str) -> bytes:
        """UTF-8 encode *name*, truncated to at most 255 bytes.

        Truncation must land on a character boundary: a naive byte slice
        (the previous `encode('utf-8')[:255]`) can split a multi-byte UTF-8
        sequence, making deserialize() raise UnicodeDecodeError. Decoding
        the slice with errors='ignore' drops a trailing partial sequence.
        """
        raw = name.encode('utf-8')
        if len(raw) <= 255:
            return raw
        return raw[:255].decode('utf-8', errors='ignore').encode('utf-8')

    def serialize(self) -> bytes:
        """Compact binary serialization for batch headers.

        Format:
            KEY_COUNT(1) + [ID(1) + LEN(1) + BYTES]...
            + VAL_COUNT(1) + [ID(1) + LEN(1) + BYTES]...

        Entries are emitted in ascending byte-id order. Each table count
        must fit in one byte; rebuild() caps tables at 127 entries.
        """
        parts = []
        # Keys
        key_items = sorted(self.keys.items(), key=lambda x: x[1])
        parts.append(bytes([len(key_items)]))
        for name, byte_id in key_items:
            name_bytes = self._encode_name(name)
            parts.append(bytes([byte_id, len(name_bytes)]) + name_bytes)
        # Values
        val_items = sorted(self.values.items(), key=lambda x: x[1])
        parts.append(bytes([len(val_items)]))
        for name, byte_id in val_items:
            name_bytes = self._encode_name(name)
            parts.append(bytes([byte_id, len(name_bytes)]) + name_bytes)
        return b''.join(parts)

    @staticmethod
    def deserialize(data: bytes, offset: int = 0) -> Tuple['CodebookVersion', int]:
        """Reconstruct a CodebookVersion from bytes produced by serialize().

        The wire format carries no version number, so the result has
        version=0; the caller is expected to set/track the version.

        Returns:
            (CodebookVersion, bytes_consumed).
        """
        pos = offset
        # Keys
        key_count = data[pos]; pos += 1
        keys = {}
        for _ in range(key_count):
            byte_id = data[pos]; pos += 1
            name_len = data[pos]; pos += 1
            name = data[pos:pos + name_len].decode('utf-8')
            pos += name_len
            keys[name] = byte_id
        # Values
        val_count = data[pos]; pos += 1
        values = {}
        for _ in range(val_count):
            byte_id = data[pos]; pos += 1
            name_len = data[pos]; pos += 1
            name = data[pos:pos + name_len].decode('utf-8')
            pos += name_len
            values[name] = byte_id

        keys_rev = {v: k for k, v in keys.items()}
        values_rev = {v: k for k, v in values.items()}

        cb = CodebookVersion(
            version=0,  # Will be set by caller if needed
            keys=keys,
            values=values,
            keys_rev=keys_rev,
            values_rev=values_rev,
        )
        return cb, pos - offset

    def to_dict(self) -> Dict[str, Any]:
        """JSON-serializable representation (reverse maps are derivable)."""
        return {
            "version": self.version,
            "keys": self.keys,
            "values": self.values,
            "trained_on": self.trained_on,
            "created_at": self.created_at,
        }

    @staticmethod
    def from_dict(d: Dict[str, Any]) -> 'CodebookVersion':
        """Reconstruct from a to_dict() payload, rebuilding reverse maps."""
        keys = dict(d["keys"])
        values = dict(d["values"])
        keys_rev = {v: k for k, v in keys.items()}
        values_rev = {v: k for k, v in values.items()}
        return CodebookVersion(
            version=d["version"],
            keys=keys,
            values=values,
            keys_rev=keys_rev,
            values_rev=values_rev,
            trained_on=d.get("trained_on", 0),
            created_at=d.get("created_at", 0.0),
        )
201
+
202
+
203
+ # =============================================================================
204
+ # Baseline codebook (version 0) from static dictionaries
205
+ # =============================================================================
206
+
207
def make_baseline_codebook() -> CodebookVersion:
    """Build the version-0 codebook from the static dictionaries.

    The baseline mirrors COMMON_KEYS/COMMON_VALUES from batch_encoder, so
    encoders that have observed no traffic still share a usable codebook.
    """
    forward_keys = dict(COMMON_KEYS)
    forward_values = dict(COMMON_VALUES)
    return CodebookVersion(
        version=0,
        keys=forward_keys,
        values=forward_values,
        keys_rev={byte_id: name for name, byte_id in forward_keys.items()},
        values_rev={byte_id: name for name, byte_id in forward_values.items()},
        trained_on=0,
        created_at=time.time(),
    )
220
+
221
+
222
+ # =============================================================================
223
+ # AdaptiveCodebook
224
+ # =============================================================================
225
+
226
class AdaptiveCodebook:
    """Manages learned codebook versions with frequency tracking.

    Version 0 is always the static baseline built from
    COMMON_KEYS/COMMON_VALUES; rebuild() appends new learned versions and
    makes the newest one active. Old versions are kept so batches encoded
    against them can still be decoded.
    """

    def __init__(self, persist_path: Optional[str] = None):
        """
        Args:
            persist_path: Optional JSON file path used by save()/load().
        """
        self._persist_path = persist_path
        self._tracker = FrequencyTracker()
        self._versions: Dict[int, CodebookVersion] = {}
        self._active: CodebookVersion = make_baseline_codebook()
        self._versions[0] = self._active
        self._next_version = 1

    @property
    def tracker(self) -> FrequencyTracker:
        """The underlying (thread-safe) frequency tracker."""
        return self._tracker

    def observe(self, messages: List[Dict[str, Any]]) -> None:
        """Feed messages to the frequency tracker."""
        self._tracker.observe_batch(messages)

    def rebuild(
        self,
        min_frequency: int = 10,
        max_keys: int = 127,
        max_values: int = 127,
    ) -> CodebookVersion:
        """Build a new codebook version from accumulated frequency data.

        Keys/values with count >= min_frequency are assigned IDs starting
        at 0x01 in descending frequency order. The new version is stored
        and becomes active.

        Args:
            min_frequency: Minimum occurrence count to earn a codebook slot.
            max_keys: Cap on key-table size (IDs must stay <= 0x7F).
            max_values: Cap on value-table size (IDs must stay <= 0x7F).

        Returns:
            The new CodebookVersion (also set as active).
        """
        top_keys = self._tracker.get_top_keys(max_keys)
        top_values = self._tracker.get_top_values(max_values)

        # Filter by minimum frequency
        top_keys = [(k, c) for k, c in top_keys if c >= min_frequency]
        top_values = [(v, c) for v, c in top_values if c >= min_frequency]

        # Assign IDs starting at 0x01 (0x00 is reserved)
        keys = {k: i + 1 for i, (k, _) in enumerate(top_keys)}
        values = {v: i + 1 for i, (v, _) in enumerate(top_values)}

        keys_rev = {v: k for k, v in keys.items()}
        values_rev = {v: k for k, v in values.items()}

        version_num = self._next_version
        self._next_version += 1

        cb = CodebookVersion(
            version=version_num,
            keys=keys,
            values=values,
            keys_rev=keys_rev,
            values_rev=values_rev,
            trained_on=self._tracker.total_messages,
            created_at=time.time(),
        )

        self._versions[version_num] = cb
        self._active = cb
        return cb

    def get_active(self) -> CodebookVersion:
        """Return the current active codebook version."""
        return self._active

    def get_version(self, v: int) -> Optional[CodebookVersion]:
        """Return a specific codebook version, or None if not found."""
        return self._versions.get(v)

    def get_stats(self) -> Dict[str, Any]:
        """Return statistics about the codebook and tracker state."""
        return {
            "active_version": self._active.version,
            "total_versions": len(self._versions),
            "tracked_messages": self._tracker.total_messages,
            # Tracker exposes no size API, so count via an oversized top-n.
            "tracked_keys": len(self._tracker.get_top_keys(999999)),
            "tracked_values": len(self._tracker.get_top_values(999999)),
            "active_keys": len(self._active.keys),
            "active_values": len(self._active.values),
        }

    def save(self) -> None:
        """Persist all codebook versions to the configured JSON file.

        Raises:
            ValueError: if no persist_path was configured.
        """
        path = self._persist_path
        if path is None:
            raise ValueError("No persist_path configured")
        data = {
            "next_version": self._next_version,
            "active_version": self._active.version,
            "versions": {
                str(v): cb.to_dict() for v, cb in self._versions.items()
            },
        }
        # Explicit encoding: don't depend on the platform/locale default.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def load(self) -> None:
        """Load codebook versions from the configured JSON file.

        Replaces all in-memory versions (including the baseline, if the
        file does not contain a version 0).

        Raises:
            ValueError: if no persist_path was configured.
            KeyError: if the file's active_version is not among its versions.
        """
        path = self._persist_path
        if path is None:
            raise ValueError("No persist_path configured")
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self._next_version = data["next_version"]
        self._versions.clear()
        for cb_dict in data["versions"].values():
            cb = CodebookVersion.from_dict(cb_dict)
            self._versions[cb.version] = cb

        active_v = data["active_version"]
        try:
            self._active = self._versions[active_v]
        except KeyError:
            # Same exception type as before, but now diagnosable.
            raise KeyError(
                f"active_version {active_v} missing from persisted versions"
            ) from None