emergent-translator 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emergent_translator/__init__.py +126 -0
- emergent_translator/adaptive_codebook.py +342 -0
- emergent_translator/api_server.py +4988 -0
- emergent_translator/batch_encoder.py +555 -0
- emergent_translator/chunk_collector.py +978 -0
- emergent_translator/chunk_coordinator.py +738 -0
- emergent_translator/claude_compression.py +375 -0
- emergent_translator/cli.py +413 -0
- emergent_translator/client_sdk.py +903 -0
- emergent_translator/code_skeleton.py +448 -0
- emergent_translator/core.py +1081 -0
- emergent_translator/emergent_symbols.py +690 -0
- emergent_translator/format_handlers.py +901 -0
- emergent_translator/gpu_batch_encoder.py +848 -0
- emergent_translator/intelligent_router.py +509 -0
- emergent_translator/metrics.py +436 -0
- emergent_translator/py.typed +0 -0
- emergent_translator-1.1.0.dist-info/METADATA +568 -0
- emergent_translator-1.1.0.dist-info/RECORD +23 -0
- emergent_translator-1.1.0.dist-info/WHEEL +5 -0
- emergent_translator-1.1.0.dist-info/entry_points.txt +2 -0
- emergent_translator-1.1.0.dist-info/licenses/LICENSE +82 -0
- emergent_translator-1.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Claude API Text-Level Compression
|
|
3
|
+
|
|
4
|
+
Reuses the existing COMMON_KEYS/COMMON_VALUES and AdaptiveCodebook to generate
|
|
5
|
+
text short codes that save tokens in Claude API conversations.
|
|
6
|
+
|
|
7
|
+
Before: {"task_type": "analyze", "priority": "high", "status": "pending"} (~18 tokens)
|
|
8
|
+
After: {"tt": "ANL", "pr": "HGH", "st": "PND"} (~12 tokens) + ~250 token legend (one-time)
|
|
9
|
+
|
|
10
|
+
The legend pays for itself after ~40 compressed field occurrences.
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
from emergent_translator.claude_compression import (
|
|
14
|
+
TextCodebook, ClaudeCompressor, compress, decompress,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
cb = TextCodebook.from_common_tables()
|
|
18
|
+
compressed = compress({"task_type": "analyze", "priority": "high"}, cb)
|
|
19
|
+
original = decompress(compressed, cb)
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Private helpers
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
def _generate_candidate(word: str) -> str:
|
|
32
|
+
"""Produce a 1-3 char short code candidate for *word*.
|
|
33
|
+
|
|
34
|
+
Strategy:
|
|
35
|
+
- underscore words -> initials (task_type -> tt)
|
|
36
|
+
- short words (<=3 chars) -> pass through
|
|
37
|
+
- longer words -> first char + consonants (trimmed to 3)
|
|
38
|
+
"""
|
|
39
|
+
if "_" in word:
|
|
40
|
+
return "".join(part[0] for part in word.split("_") if part)
|
|
41
|
+
if len(word) <= 3:
|
|
42
|
+
return word
|
|
43
|
+
consonants = [ch for ch in word[1:] if ch not in "aeiou"]
|
|
44
|
+
if consonants:
|
|
45
|
+
return (word[0] + "".join(consonants))[:3]
|
|
46
|
+
return word[:3]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _build_mapping(names: List[str], is_key: bool) -> Dict[str, str]:
|
|
50
|
+
"""Generate unique short codes for a list of names.
|
|
51
|
+
|
|
52
|
+
Keys -> lowercase codes, values -> UPPERCASE codes (namespace separation).
|
|
53
|
+
Deduplicates with fallback strategies.
|
|
54
|
+
"""
|
|
55
|
+
mapping: Dict[str, str] = {}
|
|
56
|
+
used_codes: set = set()
|
|
57
|
+
|
|
58
|
+
for name in names:
|
|
59
|
+
candidate = _generate_candidate(name.lower())
|
|
60
|
+
|
|
61
|
+
# Apply case convention
|
|
62
|
+
if is_key:
|
|
63
|
+
candidate = candidate.lower()
|
|
64
|
+
else:
|
|
65
|
+
candidate = candidate.upper()
|
|
66
|
+
|
|
67
|
+
# Deduplication fallbacks
|
|
68
|
+
if candidate in used_codes:
|
|
69
|
+
# Fallback 1: first 2 chars
|
|
70
|
+
fb = name.lower()[:2]
|
|
71
|
+
fb = fb.lower() if is_key else fb.upper()
|
|
72
|
+
if fb not in used_codes and fb != candidate:
|
|
73
|
+
candidate = fb
|
|
74
|
+
else:
|
|
75
|
+
# Fallback 2: first + last consonant
|
|
76
|
+
consonants = [ch for ch in name.lower() if ch not in "aeiou_"]
|
|
77
|
+
if len(consonants) >= 2:
|
|
78
|
+
fb2 = consonants[0] + consonants[-1]
|
|
79
|
+
fb2 = fb2.lower() if is_key else fb2.upper()
|
|
80
|
+
if fb2 not in used_codes:
|
|
81
|
+
candidate = fb2
|
|
82
|
+
# fall through to numeric if still colliding
|
|
83
|
+
|
|
84
|
+
# Fallback 3: numeric suffix
|
|
85
|
+
if candidate in used_codes:
|
|
86
|
+
base = candidate
|
|
87
|
+
for i in range(2, 100):
|
|
88
|
+
attempt = f"{base}{i}"
|
|
89
|
+
if attempt not in used_codes:
|
|
90
|
+
candidate = attempt
|
|
91
|
+
break
|
|
92
|
+
|
|
93
|
+
mapping[name] = candidate
|
|
94
|
+
used_codes.add(candidate)
|
|
95
|
+
|
|
96
|
+
return mapping
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ---------------------------------------------------------------------------
|
|
100
|
+
# TextCodebook
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
|
|
103
|
+
@dataclass
class TextCodebook:
    """Bidirectional mapping between full names and short text codes.

    Attributes:
        keys: full_key -> short_code (lowercase codes).
        values: full_value -> SHORT_CODE (uppercase codes).
        keys_rev: short_code -> full_key; auto-built when not supplied.
        values_rev: SHORT_CODE -> full_value; auto-built when not supplied.
    """

    keys: Dict[str, str]  # full_key -> short_code
    values: Dict[str, str]  # full_value -> SHORT_CODE
    keys_rev: Dict[str, str] = field(default_factory=dict)  # short_code -> full_key
    values_rev: Dict[str, str] = field(default_factory=dict)  # SHORT_CODE -> full_value

    def __post_init__(self):
        # Build the reverse maps only when the caller did not supply them
        # (e.g. when restoring a previously serialized codebook).
        if not self.keys_rev:
            self.keys_rev = {v: k for k, v in self.keys.items()}
        if not self.values_rev:
            self.values_rev = {v: k for k, v in self.values.items()}

    @staticmethod
    def _dedupe_aliases(table: Dict[str, int]) -> List[str]:
        """Collapse aliases that share a byte id, keeping the longest name.

        Both the COMMON_* tables and AdaptiveCodebook versions may map
        several alias spellings to the same byte id; only one readable
        name per id should appear in the text legend.
        """
        seen: Dict[int, str] = {}
        for name, byte_id in table.items():
            if byte_id not in seen or len(name) > len(seen[byte_id]):
                seen[byte_id] = name
        return list(seen.values())

    @classmethod
    def from_common_tables(cls) -> "TextCodebook":
        """Build from existing COMMON_KEYS/COMMON_VALUES tables."""
        from .batch_encoder import COMMON_KEYS, COMMON_VALUES

        keys_map = _build_mapping(cls._dedupe_aliases(COMMON_KEYS), is_key=True)
        values_map = _build_mapping(cls._dedupe_aliases(COMMON_VALUES), is_key=False)
        return cls(keys=keys_map, values=values_map)

    @classmethod
    def from_adaptive(cls, codebook) -> "TextCodebook":
        """Build from an AdaptiveCodebook's active version.

        Args:
            codebook: An AdaptiveCodebook instance.
        """
        active = codebook.get_active()
        keys_map = _build_mapping(cls._dedupe_aliases(active.keys), is_key=True)
        values_map = _build_mapping(cls._dedupe_aliases(active.values), is_key=False)
        return cls(keys=keys_map, values=values_map)

    @classmethod
    def from_dicts(cls, keys_map: Dict[str, str], values_map: Dict[str, str]) -> "TextCodebook":
        """Create from explicit mappings (defensive copies are taken)."""
        return cls(keys=dict(keys_map), values=dict(values_map))

    def legend(self) -> str:
        """Compact legend string for inclusion in a system prompt.

        Format: [EL]Keys:tt=task_type st=status...|Vals:ANL=analyze PND=pending...[/EL]
        """
        key_parts = " ".join(f"{code}={name}" for name, code in sorted(self.keys.items()))
        val_parts = " ".join(f"{code}={name}" for name, code in sorted(self.values.items()))
        return f"[EL]Keys:{key_parts}|Vals:{val_parts}[/EL]"
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
# Free functions: compress / decompress
|
|
185
|
+
# ---------------------------------------------------------------------------
|
|
186
|
+
|
|
187
|
+
# Module-level cache for the lazily-constructed default codebook.
_default_codebook: Optional[TextCodebook] = None


def _get_default_codebook() -> TextCodebook:
    """Return the shared default codebook, building it from the COMMON_* tables on first use."""
    global _default_codebook
    if _default_codebook is None:
        _default_codebook = TextCodebook.from_common_tables()
    return _default_codebook
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def compress(data: Any, codebook: Optional[TextCodebook] = None) -> Any:
    """Recursively replace dict keys and string values with short codes.

    - Dict keys are looked up in codebook.keys (case-insensitive).
    - String values are looked up in codebook.values (case-insensitive).
    - Ints, floats, bools, None pass through unchanged.
    - Unknown keys/values pass through unchanged.
    """
    active = _get_default_codebook() if codebook is None else codebook
    return _compress_recursive(data, active)


def _compress_recursive(data: Any, cb: TextCodebook) -> Any:
    if isinstance(data, dict):
        shortened = {}
        for key, val in data.items():
            short_key = cb.keys.get(key.lower(), key) if isinstance(key, str) else key
            shortened[short_key] = _compress_recursive(val, cb)
        return shortened
    if isinstance(data, list):
        return [_compress_recursive(entry, cb) for entry in data]
    if isinstance(data, str):
        return cb.values.get(data.lower(), data)
    # Scalars (int, float, bool, None) are left untouched.
    return data
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def decompress(data: Any, codebook: Optional[TextCodebook] = None) -> Any:
    """Reverse the short-code mapping back to full names.

    Unknown codes pass through unchanged.
    """
    active = _get_default_codebook() if codebook is None else codebook
    return _decompress_recursive(data, active)


def _decompress_recursive(data: Any, cb: TextCodebook) -> Any:
    if isinstance(data, dict):
        restored = {}
        for key, val in data.items():
            full_key = cb.keys_rev.get(key, key) if isinstance(key, str) else key
            restored[full_key] = _decompress_recursive(val, cb)
        return restored
    if isinstance(data, list):
        return [_decompress_recursive(entry, cb) for entry in data]
    if isinstance(data, str):
        return cb.values_rev.get(data, data)
    # Scalars (int, float, bool, None) are left untouched.
    return data
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
# ---------------------------------------------------------------------------
|
|
248
|
+
# Token estimation
|
|
249
|
+
# ---------------------------------------------------------------------------
|
|
250
|
+
|
|
251
|
+
def estimate_tokens(text: str) -> int:
    """Estimate token count for *text*.

    Uses tiktoken cl100k_base if available, otherwise chars//4 heuristic.
    """
    try:
        import tiktoken

        encoder = tiktoken.get_encoding("cl100k_base")
        return len(encoder.encode(text))
    except Exception:
        # tiktoken missing or failed -> rough heuristic, floored at 1.
        return max(1, len(text) // 4)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
# ---------------------------------------------------------------------------
|
|
265
|
+
# ClaudeCompressor
|
|
266
|
+
# ---------------------------------------------------------------------------
|
|
267
|
+
|
|
268
|
+
class ClaudeCompressor:
    """High-level wrapper for compressing Claude API messages.

    Applies the module-level compress/decompress helpers to the
    ``content`` field of Claude-style message dicts, using one codebook
    for the lifetime of the instance.
    """

    def __init__(self, codebook: Optional[TextCodebook] = None):
        # Default to a codebook derived from the shared COMMON_* tables.
        self._codebook = TextCodebook.from_common_tables() if codebook is None else codebook

    @property
    def codebook(self) -> TextCodebook:
        return self._codebook

    def _map_content(self, messages: List[Dict[str, Any]], transform) -> List[Dict[str, Any]]:
        """Apply *transform* to each message's ``content``; other fields are copied as-is."""
        mapped = []
        for msg in messages:
            clone = dict(msg)
            if "content" in clone:
                clone["content"] = transform(clone["content"], self._codebook)
            mapped.append(clone)
        return mapped

    def compress_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Compress content in Claude API message format, preserving ``role``."""
        return self._map_content(messages, compress)

    def decompress_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Decompress content in Claude API message format."""
        return self._map_content(messages, decompress)

    def system_prompt_prefix(self) -> str:
        """Return legend + instruction text suitable for prepending to a system prompt."""
        instruction = (
            "When responding with structured data (JSON/dicts), "
            "use the [EL] short codes defined above for all keys and values that have mappings. "
            "Pass through any keys or values not in the legend unchanged."
        )
        return f"{self._codebook.legend()}\n{instruction}"

    def wrap_tool_result(self, result: Any) -> Any:
        """Compress a single tool result (dict, list, or string)."""
        return compress(result, self._codebook)

    def estimate_savings(self, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Estimate token savings from compression.

        Returns dict with original_tokens, compressed_tokens, legend_tokens,
        net_savings, savings_pct.
        """
        import json as _json

        original_tokens = estimate_tokens(_json.dumps(messages))
        compressed_tokens = estimate_tokens(_json.dumps(self.compress_messages(messages)))
        legend_tokens = estimate_tokens(self.system_prompt_prefix())

        net_savings = original_tokens - compressed_tokens - legend_tokens
        savings_pct = (net_savings / original_tokens * 100) if original_tokens > 0 else 0.0

        return {
            "original_tokens": original_tokens,
            "compressed_tokens": compressed_tokens,
            "legend_tokens": legend_tokens,
            "net_savings": net_savings,
            "savings_pct": savings_pct,
        }
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
# ---------------------------------------------------------------------------
|
|
342
|
+
# wrap_api_call (optional convenience)
|
|
343
|
+
# ---------------------------------------------------------------------------
|
|
344
|
+
|
|
345
|
+
def wrap_api_call(
    messages: List[Dict[str, Any]],
    system: str = "",
    codebook: Optional[TextCodebook] = None,
    **kwargs,
) -> Any:
    """Compress messages, prepend legend to system prompt, call Claude API.

    Lazy-imports the ``anthropic`` SDK. All extra ``**kwargs`` are forwarded
    to ``client.messages.create()``.

    Args:
        messages: Claude API messages (``role``/``content`` dicts).
        system: Optional caller system prompt, appended after the legend block.
        codebook: Codebook to use; defaults to the COMMON_* tables.

    Returns the raw API response.
    """
    import anthropic  # noqa: lazy import

    compressor = ClaudeCompressor(codebook)
    compressed = compressor.compress_messages(messages)

    # BUG FIX: system_prompt_prefix() already contains the usage instruction
    # text; the previous version appended a second, duplicate copy of the
    # same instruction, wasting the very tokens this module exists to save.
    prefix = compressor.system_prompt_prefix()
    full_system = f"{prefix}\n\n{system}" if system else prefix

    client = anthropic.Anthropic()
    return client.messages.create(
        messages=compressed,
        system=full_system,
        **kwargs,
    )
|