emergent-translator 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,375 @@
1
+ """
2
+ Claude API Text-Level Compression
3
+
4
+ Reuses the existing COMMON_KEYS/COMMON_VALUES and AdaptiveCodebook to generate
5
+ text short codes that save tokens in Claude API conversations.
6
+
7
+ Before: {"task_type": "analyze", "priority": "high", "status": "pending"} (~18 tokens)
8
+ After: {"tt": "ANL", "pr": "HGH", "st": "PND"} (~12 tokens) + ~250 token legend (one-time)
9
+
10
+ The legend pays for itself after ~40 compressed field occurrences.
11
+
12
+ Usage:
13
+ from emergent_translator.claude_compression import (
14
+ TextCodebook, ClaudeCompressor, compress, decompress,
15
+ )
16
+
17
+ cb = TextCodebook.from_common_tables()
18
+ compressed = compress({"task_type": "analyze", "priority": "high"}, cb)
19
+ original = decompress(compressed, cb)
20
+ """
21
+
22
+ import re
23
+ from dataclasses import dataclass, field
24
+ from typing import Any, Dict, List, Optional, Tuple
25
+
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Private helpers
29
+ # ---------------------------------------------------------------------------
30
+
31
def _generate_candidate(word: str) -> str:
    """Produce a 1-3 char short code candidate for *word*.

    Strategy:
      - underscore words -> initials, trimmed to 3 (task_type -> tt)
      - short words (<=3 chars) -> pass through
      - longer words -> first char + consonants (trimmed to 3)

    Note: "y" counts as a consonant here, and non-letter characters pass
    through the consonant filter unchanged.
    """
    if "_" in word:
        # Bug fix: initials of many-part names (e.g. "a_b_c_d" -> "abcd")
        # previously exceeded the documented 3-char maximum; trim to 3
        # like the other branches. Collisions are handled by the caller.
        return "".join(part[0] for part in word.split("_") if part)[:3]
    if len(word) <= 3:
        return word
    # Drop vowels after the first character to build a recognizable stub.
    consonants = [ch for ch in word[1:] if ch not in "aeiou"]
    if consonants:
        return (word[0] + "".join(consonants))[:3]
    return word[:3]
47
+
48
+
49
def _build_mapping(names: List[str], is_key: bool) -> Dict[str, str]:
    """Generate unique short codes for a list of names.

    Keys -> lowercase codes, values -> UPPERCASE codes (namespace separation).
    Deduplicates with fallback strategies: first two chars, then first + last
    consonant, then a numeric suffix.
    """

    def cased(code: str) -> str:
        # Keys use lowercase codes, values uppercase (namespace separation).
        return code.lower() if is_key else code.upper()

    assigned: Dict[str, str] = {}
    taken: set = set()

    for full_name in names:
        lowered = full_name.lower()
        code = cased(_generate_candidate(lowered))

        if code in taken:
            # Fallback 1: first two characters of the name.
            alt = cased(lowered[:2])
            if alt not in taken and alt != code:
                code = alt
            else:
                # Fallback 2: first + last consonant of the name.
                stripped = [ch for ch in lowered if ch not in "aeiou_"]
                if len(stripped) >= 2:
                    alt2 = cased(stripped[0] + stripped[-1])
                    if alt2 not in taken:
                        code = alt2
                # Otherwise fall through to the numeric fallback below.

        if code in taken:
            # Fallback 3: numeric suffix (code2, code3, ...), first free wins.
            for suffix in range(2, 100):
                numbered = f"{code}{suffix}"
                if numbered not in taken:
                    code = numbered
                    break

        assigned[full_name] = code
        taken.add(code)

    return assigned
97
+
98
+
99
+ # ---------------------------------------------------------------------------
100
+ # TextCodebook
101
+ # ---------------------------------------------------------------------------
102
+
103
@dataclass
class TextCodebook:
    """Bidirectional mapping between full names and short text codes.

    Attributes:
        keys: full_key -> short_code (lowercase codes).
        values: full_value -> SHORT_CODE (uppercase codes).
        keys_rev: short_code -> full_key, auto-built when left empty.
        values_rev: SHORT_CODE -> full_value, auto-built when left empty.
    """

    keys: Dict[str, str]  # full_key -> short_code
    values: Dict[str, str]  # full_value -> SHORT_CODE
    keys_rev: Dict[str, str] = field(default_factory=dict)  # short_code -> full_key
    values_rev: Dict[str, str] = field(default_factory=dict)  # SHORT_CODE -> full_value

    def __post_init__(self):
        # Build the reverse lookups only when the caller did not supply them.
        if not self.keys_rev:
            self.keys_rev = {v: k for k, v in self.keys.items()}
        if not self.values_rev:
            self.values_rev = {v: k for k, v in self.values.items()}

    @staticmethod
    def _longest_alias_per_id(table: Dict[str, int]) -> List[str]:
        """Collapse alias names that share a byte id, keeping the longest name.

        *table* maps name -> byte id; several aliases may map to the same id.
        Returns one name per id, in first-seen id order. (Shared helper: this
        logic was previously duplicated in from_common_tables/from_adaptive.)
        """
        chosen: Dict[int, str] = {}
        for name, byte_id in table.items():
            if byte_id not in chosen or len(name) > len(chosen[byte_id]):
                chosen[byte_id] = name
        return list(chosen.values())

    @classmethod
    def from_common_tables(cls) -> "TextCodebook":
        """Build from the existing COMMON_KEYS/COMMON_VALUES tables."""
        from .batch_encoder import COMMON_KEYS, COMMON_VALUES

        return cls(
            keys=_build_mapping(cls._longest_alias_per_id(COMMON_KEYS), is_key=True),
            values=_build_mapping(cls._longest_alias_per_id(COMMON_VALUES), is_key=False),
        )

    @classmethod
    def from_adaptive(cls, codebook) -> "TextCodebook":
        """Build from an AdaptiveCodebook's active version.

        Args:
            codebook: An AdaptiveCodebook instance; must expose ``get_active()``
                returning an object with ``keys``/``values`` name -> id dicts.
        """
        active = codebook.get_active()
        return cls(
            keys=_build_mapping(cls._longest_alias_per_id(active.keys), is_key=True),
            values=_build_mapping(cls._longest_alias_per_id(active.values), is_key=False),
        )

    @classmethod
    def from_dicts(cls, keys_map: Dict[str, str], values_map: Dict[str, str]) -> "TextCodebook":
        """Create from explicit name -> code mappings (defensive copies are taken)."""
        return cls(keys=dict(keys_map), values=dict(values_map))

    def legend(self) -> str:
        """Compact legend string for inclusion in a system prompt.

        Format: [EL]Keys:tt=task_type st=status...|Vals:ANL=analyze PND=pending...[/EL]
        Entries are sorted by full name so the output is deterministic.
        """
        key_parts = " ".join(f"{code}={name}" for name, code in sorted(self.keys.items()))
        val_parts = " ".join(f"{code}={name}" for name, code in sorted(self.values.items()))
        return f"[EL]Keys:{key_parts}|Vals:{val_parts}[/EL]"
181
+
182
+
183
+ # ---------------------------------------------------------------------------
184
+ # Free functions: compress / decompress
185
+ # ---------------------------------------------------------------------------
186
+
187
# Lazily-built singleton shared by the module-level compress()/decompress().
_default_codebook: Optional[TextCodebook] = None


def _get_default_codebook() -> TextCodebook:
    """Return the shared codebook, building it from the common tables on first use."""
    global _default_codebook
    if _default_codebook is not None:
        return _default_codebook
    _default_codebook = TextCodebook.from_common_tables()
    return _default_codebook
195
+
196
+
197
def compress(data: Any, codebook: Optional[TextCodebook] = None) -> Any:
    """Recursively replace dict keys and string values with short codes.

    - Dict keys are looked up in codebook.keys (case-insensitive).
    - String values are looked up in codebook.values (case-insensitive).
    - Ints, floats, bools, None pass through unchanged.
    - Unknown keys/values pass through unchanged.

    When *codebook* is omitted, the shared default codebook is used.
    """
    if codebook is None:
        codebook = _get_default_codebook()
    return _compress_recursive(data, codebook)
207
+
208
+
209
def _compress_recursive(data: Any, cb: TextCodebook) -> Any:
    """Walk *data* depth-first, mapping known keys/values to short codes."""
    if isinstance(data, dict):
        compressed = {}
        for key, value in data.items():
            if isinstance(key, str):
                # Case-insensitive key lookup; unknown keys pass through.
                key = cb.keys.get(key.lower(), key)
            compressed[key] = _compress_recursive(value, cb)
        return compressed
    if isinstance(data, list):
        return [_compress_recursive(element, cb) for element in data]
    if isinstance(data, str):
        # Case-insensitive value lookup; unknown strings pass through.
        return cb.values.get(data.lower(), data)
    # int, float, bool, None -- pass through untouched.
    return data
222
+
223
+
224
def decompress(data: Any, codebook: Optional[TextCodebook] = None) -> Any:
    """Reverse the short-code mapping back to full names.

    Unknown codes pass through unchanged. When *codebook* is omitted, the
    shared default codebook is used.
    """
    if codebook is None:
        codebook = _get_default_codebook()
    return _decompress_recursive(data, codebook)
231
+
232
+
233
def _decompress_recursive(data: Any, cb: TextCodebook) -> Any:
    """Walk *data* depth-first, expanding short codes back to full names."""
    if isinstance(data, dict):
        expanded = {}
        for key, value in data.items():
            if isinstance(key, str):
                # Exact-match reverse key lookup; unknown codes pass through.
                key = cb.keys_rev.get(key, key)
            expanded[key] = _decompress_recursive(value, cb)
        return expanded
    if isinstance(data, list):
        return [_decompress_recursive(element, cb) for element in data]
    if isinstance(data, str):
        # Exact-match reverse value lookup; unknown codes pass through.
        return cb.values_rev.get(data, data)
    # Non-container, non-string scalars pass through untouched.
    return data
245
+
246
+
247
+ # ---------------------------------------------------------------------------
248
+ # Token estimation
249
+ # ---------------------------------------------------------------------------
250
+
251
def estimate_tokens(text: str) -> int:
    """Estimate token count for *text*.

    Uses tiktoken's cl100k_base encoding when the optional dependency is
    installed; otherwise falls back to the ``len(text) // 4`` heuristic
    (minimum 1).
    """
    try:
        import tiktoken
    except ImportError:
        # Optional dependency missing: use the cheap character heuristic.
        return max(1, len(text) // 4)
    try:
        enc = tiktoken.get_encoding("cl100k_base")
        return len(enc.encode(text))
    except Exception:
        # Defensive: a broken or partial tiktoken install (missing encoding
        # data, download failure) should degrade to the heuristic, not crash.
        return max(1, len(text) // 4)
262
+
263
+
264
+ # ---------------------------------------------------------------------------
265
+ # ClaudeCompressor
266
+ # ---------------------------------------------------------------------------
267
+
268
class ClaudeCompressor:
    """High-level wrapper for compressing Claude API messages."""

    def __init__(self, codebook: Optional[TextCodebook] = None):
        """Use *codebook* when given, else build one from the common tables."""
        self._codebook = codebook if codebook is not None else TextCodebook.from_common_tables()

    @property
    def codebook(self) -> TextCodebook:
        """The codebook driving compression and decompression."""
        return self._codebook

    def _map_contents(self, messages: List[Dict[str, Any]], transform) -> List[Dict[str, Any]]:
        """Shallow-copy each message and apply *transform* to its ``content``.

        Shared by compress_messages/decompress_messages (previously duplicated).
        All other message fields (``role`` etc.) are preserved as-is.
        """
        result = []
        for msg in messages:
            new_msg = dict(msg)
            if "content" in new_msg:
                new_msg["content"] = transform(new_msg["content"], self._codebook)
            result.append(new_msg)
        return result

    def compress_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Compress content in Claude API message format, preserving ``role``."""
        return self._map_contents(messages, compress)

    def decompress_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Decompress content in Claude API message format."""
        return self._map_contents(messages, decompress)

    def system_prompt_prefix(self) -> str:
        """Return legend + instruction text suitable for prepending to a system prompt."""
        legend = self._codebook.legend()
        instruction = (
            "When responding with structured data (JSON/dicts), "
            "use the [EL] short codes defined above for all keys and values that have mappings. "
            "Pass through any keys or values not in the legend unchanged."
        )
        return f"{legend}\n{instruction}"

    def wrap_tool_result(self, result: Any) -> Any:
        """Compress a single tool result (dict, list, or string)."""
        return compress(result, self._codebook)

    def estimate_savings(self, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Estimate token savings from compression.

        Returns a dict with original_tokens, compressed_tokens, legend_tokens,
        net_savings (can be negative for small payloads, since the legend is a
        fixed cost), and savings_pct.
        """
        import json as _json

        original_text = _json.dumps(messages)
        compressed_text = _json.dumps(self.compress_messages(messages))

        original_tokens = estimate_tokens(original_text)
        compressed_tokens = estimate_tokens(compressed_text)
        legend_tokens = estimate_tokens(self.system_prompt_prefix())

        net_savings = original_tokens - compressed_tokens - legend_tokens
        savings_pct = (net_savings / original_tokens * 100) if original_tokens > 0 else 0.0

        return {
            "original_tokens": original_tokens,
            "compressed_tokens": compressed_tokens,
            "legend_tokens": legend_tokens,
            "net_savings": net_savings,
            "savings_pct": savings_pct,
        }
339
+
340
+
341
+ # ---------------------------------------------------------------------------
342
+ # wrap_api_call (optional convenience)
343
+ # ---------------------------------------------------------------------------
344
+
345
def wrap_api_call(
    messages: List[Dict[str, Any]],
    system: str = "",
    codebook: Optional[TextCodebook] = None,
    **kwargs,
) -> Any:
    """Compress messages, prepend legend to system prompt, call Claude API.

    Lazy-imports the ``anthropic`` SDK. All extra ``**kwargs`` are forwarded
    to ``client.messages.create()``.

    Args:
        messages: Claude API messages (dicts with ``role``/``content``).
        system: Optional caller system prompt, appended after the legend prefix.
        codebook: Optional explicit codebook; defaults to the common tables.

    Returns the raw API response.
    """
    import anthropic  # noqa: lazy import keeps the SDK an optional dependency

    compressor = ClaudeCompressor(codebook)
    compressed = compressor.compress_messages(messages)

    # Bug fix: system_prompt_prefix() already contains the usage instruction;
    # the previous version appended a second, duplicate copy of that
    # instruction text to every request.
    prefix = compressor.system_prompt_prefix()
    full_system = f"{prefix}\n\n{system}" if system else prefix

    client = anthropic.Anthropic()
    return client.messages.create(
        messages=compressed,
        system=full_system,
        **kwargs,
    )