emergent-translator 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emergent_translator/__init__.py +126 -0
- emergent_translator/adaptive_codebook.py +342 -0
- emergent_translator/api_server.py +4988 -0
- emergent_translator/batch_encoder.py +555 -0
- emergent_translator/chunk_collector.py +978 -0
- emergent_translator/chunk_coordinator.py +738 -0
- emergent_translator/claude_compression.py +375 -0
- emergent_translator/cli.py +413 -0
- emergent_translator/client_sdk.py +903 -0
- emergent_translator/code_skeleton.py +448 -0
- emergent_translator/core.py +1081 -0
- emergent_translator/emergent_symbols.py +690 -0
- emergent_translator/format_handlers.py +901 -0
- emergent_translator/gpu_batch_encoder.py +848 -0
- emergent_translator/intelligent_router.py +509 -0
- emergent_translator/metrics.py +436 -0
- emergent_translator/py.typed +0 -0
- emergent_translator-1.1.0.dist-info/METADATA +568 -0
- emergent_translator-1.1.0.dist-info/RECORD +23 -0
- emergent_translator-1.1.0.dist-info/WHEEL +5 -0
- emergent_translator-1.1.0.dist-info/entry_points.txt +2 -0
- emergent_translator-1.1.0.dist-info/licenses/LICENSE +82 -0
- emergent_translator-1.1.0.dist-info/top_level.txt +1 -0

emergent_translator/format_handlers.py
@@ -0,0 +1,901 @@
#!/usr/bin/env python3
"""
Format Handlers for Emergent Language Batch Encoder

Converts CSV, JSONL, YAML, TOML, INI, XML, MessagePack, Protobuf, Parquet,
Arrow/Feather, BSON, CBOR, and XLSX to/from List[Dict] for use with
BatchEncoder.encode_batch() / decode_batch().

Text formats: CSV, JSONL, YAML, TOML, INI, XML
Binary formats: MessagePack, Protobuf, Parquet, Arrow/Feather, BSON, CBOR, XLSX

Usage:
    from format_handlers import csv_to_dicts, dicts_to_csv
    from format_handlers import detect_format, get_handler, is_binary_format
"""

import csv
import io
import json
import os
import xml.etree.ElementTree as ET
from typing import Any, Callable, Dict, List, Tuple


# ═══════════════════════════════════════════════════════════════════════════════
# Type Inference
# ═══════════════════════════════════════════════════════════════════════════════

def infer_type(value: str) -> Any:
    """Infer Python type from a string value.

    Recognizes int, float, bool (true/false), None/null/empty, and falls back
    to str.
    """
    if value == "":
        return None

    lower = value.lower()
    if lower in ("null", "none"):
        return None
    if lower == "true":
        return True
    if lower == "false":
        return False

    # Try int first (must not contain a decimal point or exponent)
    try:
        if "." not in value and "e" not in lower:
            return int(value)
    except ValueError:
        pass

    # Try float
    try:
        return float(value)
    except ValueError:
        pass

    return value
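
# Illustrative doctest-style sketch (not part of the original file): how
# infer_type maps common strings; easy to verify by importing this module.
#
#   >>> infer_type("42"), infer_type("3.14"), infer_type("true")
#   (42, 3.14, True)
#   >>> infer_type(""), infer_type("null"), infer_type("hello")
#   (None, None, 'hello')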


# ═══════════════════════════════════════════════════════════════════════════════
# CSV
# ═══════════════════════════════════════════════════════════════════════════════

def csv_to_dicts(text: str, infer_types: bool = True) -> List[Dict]:
    """Parse CSV text into a list of dicts.

    Args:
        text: CSV string with a header row.
        infer_types: If True, convert values to int/float/bool/None where
            possible. If False, keep everything as strings.

    Returns:
        List of dicts, one per CSV row.
    """
    reader = csv.DictReader(io.StringIO(text))
    rows = []
    for row in reader:
        if infer_types:
            row = {k: infer_type(v) for k, v in row.items()}
        rows.append(dict(row))
    return rows


def dicts_to_csv(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to CSV text.

    Nested values (dict/list) are JSON-stringified. None becomes empty string.
    Field order is the union of all keys, preserving first-seen order.
    """
    if not dicts:
        return ""

    # Collect all keys preserving insertion order
    fieldnames = []
    seen = set()
    for d in dicts:
        for k in d:
            if k not in seen:
                fieldnames.append(k)
                seen.add(k)

    output = io.StringIO()
    writer = csv.DictWriter(output, fieldnames=fieldnames, extrasaction="ignore")
    writer.writeheader()
    for d in dicts:
        row = {}
        for k in fieldnames:
            v = d.get(k)
            if v is None:
                row[k] = ""
            elif isinstance(v, bool):
                row[k] = str(v).lower()
            elif isinstance(v, (dict, list)):
                row[k] = json.dumps(v, separators=(",", ":"))
            else:
                row[k] = v
        writer.writerow(row)
    return output.getvalue()
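
# Illustrative round-trip sketch (not part of the original file). Flat rows
# survive csv_to_dicts(dicts_to_csv(...)) intact because infer_type restores
# ints/floats/bools/None; nested dict/list values come back as JSON strings.
#
#   >>> rows = [{"name": "ada", "age": 36, "active": True}]
#   >>> csv_to_dicts(dicts_to_csv(rows)) == rows
#   True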


# ═══════════════════════════════════════════════════════════════════════════════
# JSONL / NDJSON
# ═══════════════════════════════════════════════════════════════════════════════

def jsonl_to_dicts(text: str) -> List[Dict]:
    """Parse newline-delimited JSON (JSONL/NDJSON) into a list of dicts.

    Each non-empty line is parsed as a separate JSON object.
    """
    result = []
    for lineno, line in enumerate(text.splitlines(), 1):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON on line {lineno}: {e}") from e
        if not isinstance(obj, dict):
            raise ValueError(
                f"Expected dict on line {lineno}, got {type(obj).__name__}"
            )
        result.append(obj)
    return result


def dicts_to_jsonl(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to JSONL (one JSON object per line)."""
    if not dicts:
        return ""
    return "\n".join(json.dumps(d, separators=(",", ":")) for d in dicts) + "\n"


# ═══════════════════════════════════════════════════════════════════════════════
# YAML
# ═══════════════════════════════════════════════════════════════════════════════

def _import_yaml():
    """Import PyYAML with a helpful error message."""
    try:
        import yaml
        return yaml
    except ImportError:
        raise ImportError(
            "PyYAML is required for YAML support. "
            "Install it with: pip install pyyaml "
            "or: pip install emergent-translator[formats]"
        )


def yaml_to_dicts(text: str) -> List[Dict]:
    """Parse YAML text into a list of dicts.

    Handles:
    - A YAML list of dicts (most common)
    - A single YAML dict (wrapped in a list)
    - Multi-document YAML (--- separators)

    Raises ValueError for scalars, empty docs, and non-dict list items.
    """
    yaml = _import_yaml()

    # Try multi-document first
    docs = list(yaml.safe_load_all(text))

    # Filter out None docs (empty documents between ---)
    docs = [d for d in docs if d is not None]

    if not docs:
        raise ValueError("YAML document is empty")

    # Single document
    if len(docs) == 1:
        data = docs[0]
        if isinstance(data, list):
            for item in data:
                if not isinstance(item, dict):
                    raise ValueError(
                        f"Expected list of dicts, got list containing {type(item).__name__}"
                    )
            return data
        if isinstance(data, dict):
            return [data]
        raise ValueError(
            f"Expected dict or list of dicts, got {type(data).__name__}"
        )

    # Multiple documents — each should be a dict or a list of dicts
    result = []
    for doc in docs:
        if isinstance(doc, dict):
            result.append(doc)
        elif isinstance(doc, list):
            for item in doc:
                if not isinstance(item, dict):
                    raise ValueError(
                        f"Expected dicts in multi-doc YAML, got {type(item).__name__}"
                    )
                result.append(item)
        else:
            raise ValueError(
                f"Expected dict in multi-doc YAML, got {type(doc).__name__}"
            )
    return result


def dicts_to_yaml(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to YAML text."""
    yaml = _import_yaml()
    return yaml.dump(dicts, default_flow_style=False, sort_keys=False)
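
# Illustrative sketch (not part of the original file; requires PyYAML):
# both list-style and multi-document YAML normalize to the same records.
#
#   >>> yaml_to_dicts("- a: 1\n- a: 2\n")
#   [{'a': 1}, {'a': 2}]
#   >>> yaml_to_dicts("a: 1\n---\na: 2\n")
#   [{'a': 1}, {'a': 2}]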


# ═══════════════════════════════════════════════════════════════════════════════
# TOML
# ═══════════════════════════════════════════════════════════════════════════════

def _import_tomllib():
    """Import tomllib (3.11+) or tomli backport for reading TOML."""
    try:
        import tomllib
        return tomllib
    except ImportError:
        pass
    try:
        import tomli as tomllib
        return tomllib
    except ImportError:
        raise ImportError(
            "TOML reading requires Python 3.11+ or the 'tomli' package. "
            "Install it with: pip install tomli "
            "or: pip install emergent-translator[formats]"
        )


def _import_tomli_w():
    """Import tomli_w for writing TOML."""
    try:
        import tomli_w
        return tomli_w
    except ImportError:
        raise ImportError(
            "TOML writing requires the 'tomli_w' package. "
            "Install it with: pip install tomli_w "
            "or: pip install emergent-translator[formats]"
        )


def toml_to_dicts(text: str) -> List[Dict]:
    """Parse TOML text into a list of dicts.

    If the TOML has an ``items``, ``records``, ``entries``, or ``data`` key
    whose value is a list of tables, that list is returned directly.
    Otherwise the root table is returned wrapped in a list.
    """
    tomllib = _import_tomllib()
    data = tomllib.loads(text)

    # Look for a conventional list-of-tables key
    for key in ("items", "records", "entries", "data"):
        if key in data and isinstance(data[key], list):
            return data[key]

    return [data]


def dicts_to_toml(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to TOML text.

    A single dict is written as a flat TOML document. Multiple dicts are
    written as an ``[[items]]`` array of tables. None values are omitted
    (TOML has no null type).
    """
    tomli_w = _import_tomli_w()
    cleaned = [_toml_clean(d) for d in dicts]
    if len(cleaned) == 1:
        return tomli_w.dumps(cleaned[0])
    return tomli_w.dumps({"items": cleaned})


def _toml_clean(obj: Any) -> Any:
    """Remove None values from nested structures for TOML compatibility."""
    if isinstance(obj, dict):
        return {k: _toml_clean(v) for k, v in obj.items() if v is not None}
    if isinstance(obj, list):
        return [_toml_clean(item) for item in obj]
    return obj
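
# Illustrative sketch (not part of the original file; writing requires
# tomli_w): multiple records round-trip through the [[items]] convention.
#
#   >>> toml_to_dicts('[[items]]\na = 1\n\n[[items]]\na = 2\n')
#   [{'a': 1}, {'a': 2}]
#   >>> dicts_to_toml([{'a': 1}])
#   'a = 1\n'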


# ═══════════════════════════════════════════════════════════════════════════════
# XML
# ═══════════════════════════════════════════════════════════════════════════════

def xml_to_dicts(text: str) -> List[Dict]:
    """Parse XML text into a list of dicts.

    Expected structure:
        <root>
            <item attr="val">
                <key>value</key>
                ...
            </item>
            ...
        </root>

    Conventions:
    - XML attributes become ``@attr`` keys.
    - Text content of an element with children/attributes becomes ``#text``.
    - Repeated sibling tags become lists.
    - Leaf text is type-inferred (int/float/bool/None).
    """
    root = ET.fromstring(text)
    result = []
    for child in root:
        result.append(_xml_element_to_dict(child))
    return result


def _xml_element_to_dict(elem: ET.Element) -> Dict:
    """Recursively convert an XML element to a dict."""
    d = {}

    # Attributes → @attr keys
    for attr_name, attr_val in elem.attrib.items():
        d["@" + attr_name] = infer_type(attr_val)

    # Group children by tag to detect repeated siblings
    children_by_tag = {}
    for child in elem:
        children_by_tag.setdefault(child.tag, []).append(child)

    for tag, children in children_by_tag.items():
        if len(children) == 1:
            child = children[0]
            if len(child) == 0 and not child.attrib:
                # Leaf element
                d[tag] = infer_type(child.text or "")
            else:
                d[tag] = _xml_element_to_dict(child)
        else:
            # Repeated tag → list
            items = []
            for child in children:
                if len(child) == 0 and not child.attrib:
                    items.append(infer_type(child.text or ""))
                else:
                    items.append(_xml_element_to_dict(child))
            d[tag] = items

    # Text content: kept raw when the element also has attributes or
    # children, type-inferred when it is the element's only payload
    text = (elem.text or "").strip()
    if text:
        if d:
            d["#text"] = text
        else:
            d["#text"] = infer_type(text)
    return d


def dicts_to_xml(dicts: List[Dict], root_tag: str = "root",
                 item_tag: str = "item") -> str:
    """Serialize a list of dicts to XML text.

    Args:
        dicts: Data to serialize.
        root_tag: Name of the root element.
        item_tag: Name of each item element.

    Returns:
        XML string with declaration.
    """
    root = ET.Element(root_tag)
    for d in dicts:
        item = ET.SubElement(root, item_tag)
        _dict_to_xml_element(d, item)

    # Pretty-print (ET.indent is 3.9+)
    try:
        ET.indent(root, space=" ")
    except AttributeError:
        pass

    return ET.tostring(root, encoding="unicode", xml_declaration=True)


def _dict_to_xml_element(d: Dict, parent: ET.Element) -> None:
    """Recursively convert a dict to XML sub-elements under *parent*."""
    for key, value in d.items():
        if key.startswith("@"):
            parent.set(key[1:], _xml_value_str(value))
        elif key == "#text":
            parent.text = _xml_value_str(value)
        elif isinstance(value, dict):
            child = ET.SubElement(parent, key)
            _dict_to_xml_element(value, child)
        elif isinstance(value, list):
            for item in value:
                child = ET.SubElement(parent, key)
                if isinstance(item, dict):
                    _dict_to_xml_element(item, child)
                else:
                    child.text = _xml_value_str(item)
        else:
            child = ET.SubElement(parent, key)
            child.text = _xml_value_str(value)


def _xml_value_str(value: Any) -> str:
    """Convert a Python value to its XML text representation."""
    if value is None:
        return ""
    if isinstance(value, bool):
        return str(value).lower()
    return str(value)
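
# Illustrative round-trip sketch (not part of the original file): attributes
# map to "@" keys and repeated tags come back as lists.
#
#   >>> xml = dicts_to_xml([{"@id": 7, "tag": ["x", "y"]}])
#   >>> xml_to_dicts(xml)
#   [{'@id': 7, 'tag': ['x', 'y']}]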


# ═══════════════════════════════════════════════════════════════════════════════
# MessagePack (requires: pip install msgpack)
# ═══════════════════════════════════════════════════════════════════════════════

def _import_msgpack():
    """Import msgpack with a helpful error message."""
    try:
        import msgpack
        return msgpack
    except ImportError:
        raise ImportError(
            "msgpack is required for MessagePack support. "
            "Install it with: pip install msgpack "
            "or: pip install emergent-translator[formats]"
        )


def msgpack_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize MessagePack bytes into a list of dicts.

    Accepts a packed list of dicts or a single packed dict (wrapped in a list).
    """
    msgpack = _import_msgpack()
    unpacked = msgpack.unpackb(data, raw=False)
    if isinstance(unpacked, list):
        return unpacked
    if isinstance(unpacked, dict):
        return [unpacked]
    raise ValueError(
        f"Expected list or dict in MessagePack data, got {type(unpacked).__name__}"
    )


def dicts_to_msgpack(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to MessagePack bytes."""
    msgpack = _import_msgpack()
    return msgpack.packb(dicts, use_bin_type=True)
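
# Illustrative sketch (not part of the original file; requires msgpack):
# JSON-compatible values round-trip exactly, including int/float distinction.
#
#   >>> msgpack_to_dicts(dicts_to_msgpack([{"n": 42, "x": 1.5}]))
#   [{'n': 42, 'x': 1.5}]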


# ═══════════════════════════════════════════════════════════════════════════════
# Protobuf (requires: pip install protobuf)
# ═══════════════════════════════════════════════════════════════════════════════

def _import_protobuf():
    """Import google.protobuf struct utilities with a helpful error message."""
    try:
        from google.protobuf import struct_pb2, json_format
        return struct_pb2, json_format
    except ImportError:
        raise ImportError(
            "protobuf is required for Protobuf support. "
            "Install it with: pip install protobuf "
            "or: pip install emergent-translator[formats]"
        )


def protobuf_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize Protobuf bytes (google.protobuf.Struct wrapper) to dicts.

    Uses ``google.protobuf.struct_pb2.Struct`` as a schema-less container.
    Note: all numeric values are stored as doubles in Protobuf Struct, so
    integers will round-trip as floats (e.g. 42 -> 42.0).
    """
    struct_pb2, json_format = _import_protobuf()
    wrapper = struct_pb2.Struct()
    wrapper.ParseFromString(data)
    raw = json_format.MessageToDict(wrapper)
    items = raw.get("items", [raw])
    return items


def dicts_to_protobuf(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to Protobuf bytes using Struct wrapper.

    Wraps the list under an ``items`` key in a ``google.protobuf.Struct``.
    """
    struct_pb2, json_format = _import_protobuf()
    wrapper = struct_pb2.Struct()
    json_format.ParseDict({"items": dicts}, wrapper)
    return wrapper.SerializeToString()
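
# Illustrative sketch (not part of the original file; requires protobuf):
# the documented Struct caveat in action — integers come back as floats.
#
#   >>> protobuf_to_dicts(dicts_to_protobuf([{"n": 42}]))
#   [{'n': 42.0}]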


# ═══════════════════════════════════════════════════════════════════════════════
# Parquet (requires: pip install pyarrow)
# ═══════════════════════════════════════════════════════════════════════════════

def _import_pyarrow():
    """Import pyarrow with a helpful error message."""
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
        return pa, pq
    except ImportError:
        raise ImportError(
            "pyarrow is required for Parquet support. "
            "Install it with: pip install pyarrow "
            "or: pip install emergent-translator[formats]"
        )


def parquet_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize Parquet bytes into a list of dicts.

    Best suited for flat/tabular data. Nested structures depend on Arrow's
    schema inference.
    """
    pa, pq = _import_pyarrow()
    buf = pa.BufferReader(data)
    table = pq.read_table(buf)
    return table.to_pylist()


def dicts_to_parquet(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to Parquet bytes.

    Best suited for flat/tabular data with uniform schemas.
    """
    pa, pq = _import_pyarrow()
    table = pa.Table.from_pylist(dicts)
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    return sink.getvalue().to_pybytes()
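
# Illustrative sketch (not part of the original file; requires pyarrow):
# uniform flat records round-trip through an in-memory Parquet buffer.
#
#   >>> parquet_to_dicts(dicts_to_parquet([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}]))
#   [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]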


# ═══════════════════════════════════════════════════════════════════════════════
# Arrow IPC / Feather (requires: pip install pyarrow — shared with Parquet)
# ═══════════════════════════════════════════════════════════════════════════════

def arrow_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize Arrow IPC (Feather v2) bytes into a list of dicts."""
    pa, _ = _import_pyarrow()
    reader = pa.ipc.open_file(pa.BufferReader(data))
    table = reader.read_all()
    return table.to_pylist()


def dicts_to_arrow(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to Arrow IPC (Feather v2) bytes."""
    pa, _ = _import_pyarrow()
    table = pa.Table.from_pylist(dicts)
    sink = pa.BufferOutputStream()
    writer = pa.ipc.new_file(sink, table.schema)
    writer.write_table(table)
    writer.close()
    return sink.getvalue().to_pybytes()
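
# Illustrative sketch (not part of the original file; requires pyarrow):
# Arrow IPC gives the same round-trip guarantee as Parquet above.
#
#   >>> arrow_to_dicts(dicts_to_arrow([{"a": 1}]))
#   [{'a': 1}]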


# ═══════════════════════════════════════════════════════════════════════════════
# BSON (requires: pip install pymongo)
# ═══════════════════════════════════════════════════════════════════════════════

def _import_bson():
    """Import bson (from pymongo) with a helpful error message."""
    try:
        import bson
        return bson
    except ImportError:
        raise ImportError(
            "pymongo is required for BSON support. "
            "Install it with: pip install pymongo "
            "or: pip install emergent-translator[formats]"
        )


def bson_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize BSON bytes into a list of dicts.

    Expects a sequence of concatenated BSON documents (standard MongoDB
    wire format). ObjectId values are converted to strings.
    """
    import struct as _struct

    bson_mod = _import_bson()
    result = []
    offset = 0
    while offset < len(data):
        # Each BSON doc starts with a 4-byte little-endian length
        if offset + 4 > len(data):
            break
        doc_len = _struct.unpack_from("<i", data, offset)[0]
        doc_bytes = data[offset:offset + doc_len]
        doc = bson_mod.decode(doc_bytes)
        # Convert ObjectId to string for JSON compatibility
        if "_id" in doc:
            doc["_id"] = str(doc["_id"])
        result.append(doc)
        offset += doc_len
    return result


def dicts_to_bson(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to concatenated BSON bytes."""
    bson_mod = _import_bson()
    parts = []
    for d in dicts:
        parts.append(bson_mod.encode(d))
    return b"".join(parts)


# ═══════════════════════════════════════════════════════════════════════════════
# CBOR (requires: pip install cbor2)
# ═══════════════════════════════════════════════════════════════════════════════

def _import_cbor2():
    """Import cbor2 with a helpful error message."""
    try:
        import cbor2
        return cbor2
    except ImportError:
        raise ImportError(
            "cbor2 is required for CBOR support. "
            "Install it with: pip install cbor2 "
            "or: pip install emergent-translator[formats]"
        )


def cbor_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize CBOR bytes into a list of dicts.

    Accepts a CBOR-encoded list of maps or a single map (wrapped in a list).
    """
    cbor2 = _import_cbor2()
    decoded = cbor2.loads(data)
    if isinstance(decoded, list):
        return decoded
    if isinstance(decoded, dict):
        return [decoded]
    raise ValueError(
        f"Expected list or dict in CBOR data, got {type(decoded).__name__}"
    )


def dicts_to_cbor(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to CBOR bytes."""
    cbor2 = _import_cbor2()
    return cbor2.dumps(dicts)
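
# Illustrative sketch (not part of the original file; requires cbor2):
# like MessagePack, CBOR round-trips JSON-compatible values exactly.
#
#   >>> cbor_to_dicts(dicts_to_cbor([{"a": 1}]))
#   [{'a': 1}]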


# ═══════════════════════════════════════════════════════════════════════════════
# INI (stdlib configparser)
# ═══════════════════════════════════════════════════════════════════════════════

def ini_to_dicts(text: str) -> List[Dict]:
    """Parse INI text into a list of dicts, one per section.

    Each section becomes a dict with all its key-value pairs plus a
    ``_section`` key holding the section name. Values are type-inferred.
    The ``[DEFAULT]`` section values are merged into every section (standard
    configparser behaviour).
    """
    import configparser
    parser = configparser.ConfigParser()
    parser.read_string(text)
    result = []
    for section in parser.sections():
        d = {"_section": section}
        for key, val in parser.items(section):
            d[key] = infer_type(val)
        result.append(d)
    return result


def dicts_to_ini(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to INI text.

    Each dict becomes a section. The section name is taken from a
    ``_section`` key (defaulting to ``section_N``). None values are
    written as empty strings.
    """
    import configparser
    parser = configparser.ConfigParser()
    for i, d in enumerate(dicts):
        section = str(d.get("_section", f"section_{i}"))
        parser.add_section(section)
        for key, val in d.items():
            if key == "_section":
                continue
            if val is None:
                parser.set(section, key, "")
            elif isinstance(val, bool):
                parser.set(section, key, str(val).lower())
            elif isinstance(val, (dict, list)):
                parser.set(section, key, json.dumps(val, separators=(",", ":")))
            else:
                parser.set(section, key, str(val))
    output = io.StringIO()
    parser.write(output)
    return output.getvalue()
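
# Illustrative sketch (not part of the original file): sections become
# records tagged with _section, and values pass through infer_type.
#
#   >>> ini_to_dicts("[db]\nhost = localhost\nport = 5432\n")
#   [{'_section': 'db', 'host': 'localhost', 'port': 5432}]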


# ═══════════════════════════════════════════════════════════════════════════════
# Excel / XLSX (requires: pip install openpyxl)
# ═══════════════════════════════════════════════════════════════════════════════

def _import_openpyxl():
    """Import openpyxl with a helpful error message."""
    try:
        import openpyxl
        return openpyxl
    except ImportError:
        raise ImportError(
            "openpyxl is required for Excel/XLSX support. "
            "Install it with: pip install openpyxl "
            "or: pip install emergent-translator[formats]"
        )


def xlsx_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize Excel XLSX bytes into a list of dicts.

    Reads the active sheet. The first row is treated as headers.
    Cell values keep the native types openpyxl provides (numbers, strings,
    booleans, None), so no string-based inference is needed.
    """
    openpyxl = _import_openpyxl()
    wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True, data_only=True)
    ws = wb.active
    rows = list(ws.iter_rows(values_only=True))
    wb.close()
    if not rows:
        return []
    headers = [str(h) if h is not None else f"col_{i}" for i, h in enumerate(rows[0])]
    result = []
    for row in rows[1:]:
        d = {}
        for key, val in zip(headers, row):
            d[key] = val
        result.append(d)
    return result


def dicts_to_xlsx(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to Excel XLSX bytes.

    Nested values (dict/list) are JSON-stringified.
    """
    openpyxl = _import_openpyxl()
    wb = openpyxl.Workbook()
    ws = wb.active

    if not dicts:
        buf = io.BytesIO()
        wb.save(buf)
        return buf.getvalue()

    # Collect headers
    fieldnames = []
    seen = set()
    for d in dicts:
        for k in d:
            if k not in seen:
                fieldnames.append(k)
                seen.add(k)

    ws.append(fieldnames)
    for d in dicts:
        row = []
        for k in fieldnames:
            v = d.get(k)
            if isinstance(v, (dict, list)):
                row.append(json.dumps(v, separators=(",", ":")))
            else:
                row.append(v)
        ws.append(row)

    buf = io.BytesIO()
    wb.save(buf)
    return buf.getvalue()
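
# Illustrative sketch (not part of the original file; requires openpyxl):
# a workbook round-trip through an in-memory buffer.
#
#   >>> xlsx_to_dicts(dicts_to_xlsx([{"a": 1, "b": "x"}]))
#   [{'a': 1, 'b': 'x'}]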


# ═══════════════════════════════════════════════════════════════════════════════
# Format Registry
# ═══════════════════════════════════════════════════════════════════════════════

# Maps format name → (parse_fn, serialize_fn)
_HANDLERS: Dict[str, Tuple[Callable, Callable]] = {
    "csv": (csv_to_dicts, dicts_to_csv),
    "jsonl": (jsonl_to_dicts, dicts_to_jsonl),
    "ndjson": (jsonl_to_dicts, dicts_to_jsonl),
    "yaml": (yaml_to_dicts, dicts_to_yaml),
    "yml": (yaml_to_dicts, dicts_to_yaml),
    "toml": (toml_to_dicts, dicts_to_toml),
    "ini": (ini_to_dicts, dicts_to_ini),
    "xml": (xml_to_dicts, dicts_to_xml),
    "msgpack": (msgpack_to_dicts, dicts_to_msgpack),
    "protobuf": (protobuf_to_dicts, dicts_to_protobuf),
    "parquet": (parquet_to_dicts, dicts_to_parquet),
    "arrow": (arrow_to_dicts, dicts_to_arrow),
    "feather": (arrow_to_dicts, dicts_to_arrow),
    "bson": (bson_to_dicts, dicts_to_bson),
    "cbor": (cbor_to_dicts, dicts_to_cbor),
    "xlsx": (xlsx_to_dicts, dicts_to_xlsx),
    "excel": (xlsx_to_dicts, dicts_to_xlsx),
}

_EXTENSION_MAP: Dict[str, str] = {
    ".csv": "csv",
    ".jsonl": "jsonl",
    ".ndjson": "jsonl",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".toml": "toml",
    ".ini": "ini",
    ".cfg": "ini",
    ".xml": "xml",
    # Note: "json" has no _HANDLERS entry; callers of detect_format() are
    # expected to handle plain JSON themselves.
    ".json": "json",
    ".msgpack": "msgpack",
    ".mpk": "msgpack",
    ".pb": "protobuf",
    ".parquet": "parquet",
    ".arrow": "arrow",
    ".feather": "arrow",
    ".ipc": "arrow",
    ".bson": "bson",
    ".cbor": "cbor",
    ".xlsx": "xlsx",
    # Note: openpyxl cannot read the legacy binary .xls format; this mapping
    # only works for files that are actually XLSX.
    ".xls": "xlsx",
}

_BINARY_FORMATS = frozenset({
    "msgpack", "protobuf", "parquet", "arrow", "feather", "bson", "cbor",
    "xlsx", "excel",
})


def is_binary_format(name: str) -> bool:
    """Return True if the named format uses binary (bytes) I/O."""
    return name.lower() in _BINARY_FORMATS


def detect_format(filepath: str) -> str:
    """Detect format from file extension.

    Returns a format name string (e.g. 'csv', 'yaml', 'json', 'msgpack').
    Raises ValueError for unknown extensions.
    """
    ext = os.path.splitext(filepath)[1].lower()
    fmt = _EXTENSION_MAP.get(ext)
    if fmt is None:
        raise ValueError(f"Unknown file extension: {ext!r}")
    return fmt


def get_handler(name: str) -> Tuple[Callable, Callable]:
    """Get (parse_fn, serialize_fn) for a format name.

    Args:
        name: Format name ('csv', 'jsonl', 'ndjson', 'yaml', 'yml', 'toml',
            'ini', 'xml', 'msgpack', 'protobuf', 'parquet', 'arrow',
            'feather', 'bson', 'cbor', 'xlsx', or 'excel').

    Returns:
        Tuple of (parse_fn, serialize_fn).

    Raises:
        ValueError: If format name is not recognized.
    """
    name = name.lower()
    handler = _HANDLERS.get(name)
    if handler is None:
        raise ValueError(
            f"Unknown format: {name!r}. "
            f"Supported formats: {', '.join(sorted(set(_HANDLERS.keys())))}"
        )
    return handler
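
# Illustrative end-to-end sketch (not part of the original file; the path
# "records.msgpack" is hypothetical): converting any supported file to JSONL
# via the registry, choosing text or bytes I/O with is_binary_format().
#
#   fmt = detect_format("records.msgpack")            # -> "msgpack"
#   parse_fn, _ = get_handler(fmt)
#   mode = "rb" if is_binary_format(fmt) else "r"
#   with open("records.msgpack", mode) as f:
#       records = parse_fn(f.read())
#   print(dicts_to_jsonl(records), end="")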