emergent-translator 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,901 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Format Handlers for Emergent Language Batch Encoder
4
+
5
+ Converts CSV, JSONL, YAML, TOML, XML, MessagePack, Protobuf, and Parquet
6
+ to/from List[Dict] for use with BatchEncoder.encode_batch() / decode_batch().
7
+
8
+ Text formats: CSV, JSONL, YAML, TOML, XML
9
+ Binary formats: MessagePack, Protobuf, Parquet
10
+
11
+ Usage:
12
+ from format_handlers import csv_to_dicts, dicts_to_csv
13
+ from format_handlers import detect_format, get_handler, is_binary_format
14
+ """
15
+
16
+ import csv
17
+ import io
18
+ import json
19
+ import os
20
+ import xml.etree.ElementTree as ET
21
+ from typing import Any, Callable, Dict, List, Tuple, Union
22
+
23
+
24
+ # ═══════════════════════════════════════════════════════════════════════════════
25
+ # Type Inference
26
+ # ═══════════════════════════════════════════════════════════════════════════════
27
+
28
def infer_type(value: str) -> Any:
    """Best-effort conversion of a string to a Python scalar.

    Recognizes int, float, bool ("true"/"false"), and None ("null"/"none"
    or the empty string); anything unrecognized is returned unchanged.
    """
    folded = value.lower()
    if value == "" or folded in ("null", "none"):
        return None

    booleans = {"true": True, "false": False}
    if folded in booleans:
        return booleans[folded]

    # Attempt int only when the text carries no decimal point or exponent
    # marker; otherwise fall straight through to float.
    if "." not in value and "e" not in folded:
        try:
            return int(value)
        except ValueError:
            pass
    try:
        return float(value)
    except ValueError:
        return value
59
+
60
+
61
+ # ═══════════════════════════════════════════════════════════════════════════════
62
+ # CSV
63
+ # ═══════════════════════════════════════════════════════════════════════════════
64
+
65
def csv_to_dicts(text: str, infer_types: bool = True) -> List[Dict]:
    """Parse CSV text into a list of dicts.

    Args:
        text: CSV string with a header row.
        infer_types: If True, convert values to int/float/bool/None where
            possible. If False, keep everything as strings.

    Returns:
        List of dicts, one per CSV row. Rows shorter than the header get
        None for the missing fields (csv.DictReader's default restval).
    """
    reader = csv.DictReader(io.StringIO(text))
    rows = []
    for row in reader:
        if infer_types:
            # DictReader fills missing trailing fields with None (and
            # overflow fields with a list under restkey). Only run string
            # inference on actual strings — previously a short row crashed
            # with AttributeError on None.lower().
            row = {
                k: (infer_type(v) if isinstance(v, str) else v)
                for k, v in row.items()
            }
        rows.append(dict(row))
    return rows
83
+
84
+
85
def dicts_to_csv(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to CSV text.

    Nested values (dict/list) are JSON-stringified, None becomes the
    empty string, and booleans are lowercased. The header is the union
    of all keys in first-seen order.
    """
    if not dicts:
        return ""

    # Union of keys across all records; first appearance fixes position.
    header: List[str] = []
    known = set()
    for record in dicts:
        for key in record:
            if key not in known:
                known.add(key)
                header.append(key)

    def cell(v: Any) -> Any:
        # Bool must be tested before the generic fall-through since
        # bool is an int subclass and we want "true"/"false" text.
        if v is None:
            return ""
        if isinstance(v, bool):
            return "true" if v else "false"
        if isinstance(v, (dict, list)):
            return json.dumps(v, separators=(",", ":"))
        return v

    buf = io.StringIO()
    writer = csv.DictWriter(buf, fieldnames=header, extrasaction="ignore")
    writer.writeheader()
    for record in dicts:
        writer.writerow({key: cell(record.get(key)) for key in header})
    return buf.getvalue()
120
+
121
+
122
+ # ═══════════════════════════════════════════════════════════════════════════════
123
+ # JSONL / NDJSON
124
+ # ═══════════════════════════════════════════════════════════════════════════════
125
+
126
def jsonl_to_dicts(text: str) -> List[Dict]:
    """Parse newline-delimited JSON (JSONL/NDJSON) into a list of dicts.

    Blank lines are skipped; every other line must contain exactly one
    JSON object. Raises ValueError for malformed JSON or non-object lines.
    """
    records: List[Dict] = []
    for lineno, raw in enumerate(text.splitlines(), start=1):
        stripped = raw.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError as exc:
            raise ValueError(f"Invalid JSON on line {lineno}: {exc}") from exc
        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected dict on line {lineno}, got {type(parsed).__name__}"
            )
        records.append(parsed)
    return records
146
+
147
+
148
def dicts_to_jsonl(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to JSONL: one compact JSON object per line.

    Output ends with a trailing newline; an empty input yields "".
    """
    if not dicts:
        return ""
    lines = [json.dumps(d, separators=(",", ":")) for d in dicts]
    lines.append("")  # forces the trailing newline when joined
    return "\n".join(lines)
153
+
154
+
155
+ # ═══════════════════════════════════════════════════════════════════════════════
156
+ # YAML
157
+ # ═══════════════════════════════════════════════════════════════════════════════
158
+
159
+ def _import_yaml():
160
+ """Import pyyaml with a helpful error message."""
161
+ try:
162
+ import yaml
163
+ return yaml
164
+ except ImportError:
165
+ raise ImportError(
166
+ "PyYAML is required for YAML support. "
167
+ "Install it with: pip install pyyaml "
168
+ "or: pip install emergent-translator[formats]"
169
+ )
170
+
171
+
172
def yaml_to_dicts(text: str) -> List[Dict]:
    """Parse YAML text into a list of dicts.

    Accepts a YAML list of dicts (most common), a single dict (wrapped in
    a list), or a multi-document stream (``---`` separators). Empty
    documents are ignored. Raises ValueError for scalars, empty input,
    and non-dict list items.
    """
    yaml = _import_yaml()

    # safe_load_all covers both single- and multi-document input; drop
    # the None docs produced by empty sections between --- markers.
    documents = [doc for doc in yaml.safe_load_all(text) if doc is not None]

    if not documents:
        raise ValueError("YAML document is empty")

    if len(documents) == 1:
        only = documents[0]
        if isinstance(only, list):
            for entry in only:
                if not isinstance(entry, dict):
                    raise ValueError(
                        f"Expected list of dicts, got list containing {type(entry).__name__}"
                    )
            return only
        if isinstance(only, dict):
            return [only]
        raise ValueError(
            f"Expected dict or list of dicts, got {type(only).__name__}"
        )

    # Multi-document stream: flatten dicts and lists-of-dicts in order.
    flattened: List[Dict] = []
    for doc in documents:
        if isinstance(doc, dict):
            flattened.append(doc)
        elif isinstance(doc, list):
            for entry in doc:
                if not isinstance(entry, dict):
                    raise ValueError(
                        f"Expected dicts in multi-doc YAML, got {type(entry).__name__}"
                    )
                flattened.append(entry)
        else:
            raise ValueError(
                f"Expected dict in multi-doc YAML, got {type(doc).__name__}"
            )
    return flattened
226
+
227
+
228
def dicts_to_yaml(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to block-style YAML, preserving key order."""
    yaml = _import_yaml()
    return yaml.dump(dicts, default_flow_style=False, sort_keys=False)
232
+
233
+
234
+ # ═══════════════════════════════════════════════════════════════════════════════
235
+ # TOML
236
+ # ═══════════════════════════════════════════════════════════════════════════════
237
+
238
+ def _import_tomllib():
239
+ """Import tomllib (3.11+) or tomli backport for reading TOML."""
240
+ try:
241
+ import tomllib
242
+ return tomllib
243
+ except ImportError:
244
+ pass
245
+ try:
246
+ import tomli as tomllib
247
+ return tomllib
248
+ except ImportError:
249
+ raise ImportError(
250
+ "TOML reading requires Python 3.11+ or the 'tomli' package. "
251
+ "Install it with: pip install tomli "
252
+ "or: pip install emergent-translator[formats]"
253
+ )
254
+
255
+
256
+ def _import_tomli_w():
257
+ """Import tomli_w for writing TOML."""
258
+ try:
259
+ import tomli_w
260
+ return tomli_w
261
+ except ImportError:
262
+ raise ImportError(
263
+ "TOML writing requires the 'tomli_w' package. "
264
+ "Install it with: pip install tomli_w "
265
+ "or: pip install emergent-translator[formats]"
266
+ )
267
+
268
+
269
def toml_to_dicts(text: str) -> List[Dict]:
    """Parse TOML text into a list of dicts.

    If the TOML has an ``items``, ``records``, ``entries``, or ``data`` key
    whose value is a list of tables (dicts), that list is returned directly.
    Otherwise the root table is returned wrapped in a list.
    """
    tomllib = _import_tomllib()
    data = tomllib.loads(text)

    # Look for a conventional list-of-tables key. Only unwrap when every
    # element is a table — a plain array such as ``data = [1, 2]`` would
    # otherwise leak non-dict items to callers expecting List[Dict].
    for key in ("items", "records", "entries", "data"):
        value = data.get(key)
        if isinstance(value, list) and all(isinstance(item, dict) for item in value):
            return value

    return [data]
285
+
286
+
287
def dicts_to_toml(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to TOML text.

    One dict becomes a flat TOML document; any other count becomes an
    ``[[items]]`` array of tables. None values are dropped first, since
    TOML has no null type.
    """
    tomli_w = _import_tomli_w()
    tables = [_toml_clean(d) for d in dicts]
    payload = tables[0] if len(tables) == 1 else {"items": tables}
    return tomli_w.dumps(payload)
299
+
300
+
301
+ def _toml_clean(obj: Any) -> Any:
302
+ """Remove None values from nested structures for TOML compatibility."""
303
+ if isinstance(obj, dict):
304
+ return {k: _toml_clean(v) for k, v in obj.items() if v is not None}
305
+ if isinstance(obj, list):
306
+ return [_toml_clean(item) for item in obj]
307
+ return obj
308
+
309
+
310
+ # ═══════════════════════════════════════════════════════════════════════════════
311
+ # XML
312
+ # ═══════════════════════════════════════════════════════════════════════════════
313
+
314
def xml_to_dicts(text: str) -> List[Dict]:
    """Parse XML text into a list of dicts, one per child of the root.

    Expected structure: a root element whose children are the records.
    Conventions (see _xml_element_to_dict): attributes become ``@attr``
    keys, mixed text content becomes ``#text``, repeated sibling tags
    become lists, and leaf text is type-inferred.
    """
    root = ET.fromstring(text)
    return [_xml_element_to_dict(child) for child in root]
337
+
338
+
339
def _xml_element_to_dict(elem: ET.Element) -> Dict:
    """Recursively convert an XML element to a dict.

    Attributes map to ``@name`` keys (type-inferred), repeated child tags
    collapse into lists, attribute-less childless children are treated as
    type-inferred leaves, and stray element text lands under ``#text``.
    """
    result: Dict = {}

    # Attributes → @attr keys.
    for name, raw in elem.attrib.items():
        result["@" + name] = infer_type(raw)

    # Bucket children by tag so repeated siblings can be detected.
    buckets: Dict[str, list] = {}
    for child in elem:
        buckets.setdefault(child.tag, []).append(child)

    def convert(node: ET.Element) -> Any:
        # No children and no attributes → leaf scalar.
        if len(node) == 0 and not node.attrib:
            return infer_type(node.text or "")
        return _xml_element_to_dict(node)

    for tag, nodes in buckets.items():
        if len(nodes) == 1:
            result[tag] = convert(nodes[0])
        else:
            # Repeated tag → list.
            result[tag] = [convert(node) for node in nodes]

    # Stray text content: stored verbatim when mixed with other keys,
    # otherwise type-inferred (preserves the original's asymmetry).
    stripped = (elem.text or "").strip()
    if stripped:
        result["#text"] = stripped if result else infer_type(stripped)

    return result
380
+
381
+
382
def dicts_to_xml(dicts: List[Dict], root_tag: str = "root",
                 item_tag: str = "item") -> str:
    """Serialize a list of dicts to XML text.

    Args:
        dicts: Data to serialize.
        root_tag: Name of the root element.
        item_tag: Name of each item element.

    Returns:
        XML string including the XML declaration.
    """
    root = ET.Element(root_tag)
    for record in dicts:
        _dict_to_xml_element(record, ET.SubElement(root, item_tag))

    # ET.indent is only available on Python 3.9+; older versions simply
    # emit unindented output.
    if hasattr(ET, "indent"):
        ET.indent(root, space=" ")

    return ET.tostring(root, encoding="unicode", xml_declaration=True)
406
+
407
+
408
def _dict_to_xml_element(d: Dict, parent: ET.Element) -> None:
    """Recursively render dict *d* as attributes/sub-elements of *parent*.

    ``@key`` entries become attributes, ``#text`` becomes the element's
    text, dict values recurse into a new child, and list values emit one
    child element per item.
    """
    for key, value in d.items():
        if key.startswith("@"):
            parent.set(key[1:], _xml_value_str(value))
        elif key == "#text":
            parent.text = _xml_value_str(value)
        elif isinstance(value, dict):
            _dict_to_xml_element(value, ET.SubElement(parent, key))
        elif isinstance(value, list):
            for element in value:
                node = ET.SubElement(parent, key)
                if isinstance(element, dict):
                    _dict_to_xml_element(element, node)
                else:
                    node.text = _xml_value_str(element)
        else:
            ET.SubElement(parent, key).text = _xml_value_str(value)
428
+
429
+
430
+ def _xml_value_str(value: Any) -> str:
431
+ """Convert a Python value to its XML text representation."""
432
+ if value is None:
433
+ return ""
434
+ if isinstance(value, bool):
435
+ return str(value).lower()
436
+ return str(value)
437
+
438
+
439
+ # ═══════════════════════════════════════════════════════════════════════════════
440
+ # MessagePack (requires: pip install msgpack)
441
+ # ═══════════════════════════════════════════════════════════════════════════════
442
+
443
+ def _import_msgpack():
444
+ """Import msgpack with a helpful error message."""
445
+ try:
446
+ import msgpack
447
+ return msgpack
448
+ except ImportError:
449
+ raise ImportError(
450
+ "msgpack is required for MessagePack support. "
451
+ "Install it with: pip install msgpack "
452
+ "or: pip install emergent-translator[formats]"
453
+ )
454
+
455
+
456
def msgpack_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize MessagePack bytes into a list of dicts.

    A packed list is returned as-is; a single packed map is wrapped in a
    list. Anything else raises ValueError.
    """
    msgpack = _import_msgpack()
    decoded = msgpack.unpackb(data, raw=False)
    if isinstance(decoded, dict):
        return [decoded]
    if isinstance(decoded, list):
        return decoded
    raise ValueError(
        f"Expected list or dict in MessagePack data, got {type(decoded).__name__}"
    )
470
+
471
+
472
def dicts_to_msgpack(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to MessagePack bytes (bin type enabled)."""
    return _import_msgpack().packb(dicts, use_bin_type=True)
476
+
477
+
478
+ # ═══════════════════════════════════════════════════════════════════════════════
479
+ # Protobuf (requires: pip install protobuf)
480
+ # ═══════════════════════════════════════════════════════════════════════════════
481
+
482
+ def _import_protobuf():
483
+ """Import google.protobuf struct utilities with a helpful error message."""
484
+ try:
485
+ from google.protobuf import struct_pb2, json_format
486
+ return struct_pb2, json_format
487
+ except ImportError:
488
+ raise ImportError(
489
+ "protobuf is required for Protobuf support. "
490
+ "Install it with: pip install protobuf "
491
+ "or: pip install emergent-translator[formats]"
492
+ )
493
+
494
+
495
def protobuf_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize Protobuf bytes (google.protobuf.Struct wrapper) to dicts.

    Uses ``google.protobuf.struct_pb2.Struct`` as a schema-less container;
    an ``items`` key (written by dicts_to_protobuf) holds the records,
    otherwise the whole struct counts as a single record. Note: Struct
    stores every number as a double, so ints round-trip as floats
    (42 -> 42.0).
    """
    struct_pb2, json_format = _import_protobuf()
    message = struct_pb2.Struct()
    message.ParseFromString(data)
    as_dict = json_format.MessageToDict(message)
    return as_dict.get("items", [as_dict])
508
+
509
+
510
def dicts_to_protobuf(dicts: List[Dict]) -> bytes:
    """Serialize dicts to Protobuf bytes via a Struct ``items`` wrapper."""
    struct_pb2, json_format = _import_protobuf()
    message = struct_pb2.Struct()
    json_format.ParseDict({"items": dicts}, message)
    return message.SerializeToString()
519
+
520
+
521
+ # ═══════════════════════════════════════════════════════════════════════════════
522
+ # Parquet (requires: pip install pyarrow)
523
+ # ═══════════════════════════════════════════════════════════════════════════════
524
+
525
+ def _import_pyarrow():
526
+ """Import pyarrow with a helpful error message."""
527
+ try:
528
+ import pyarrow as pa
529
+ import pyarrow.parquet as pq
530
+ return pa, pq
531
+ except ImportError:
532
+ raise ImportError(
533
+ "pyarrow is required for Parquet support. "
534
+ "Install it with: pip install pyarrow "
535
+ "or: pip install emergent-translator[formats]"
536
+ )
537
+
538
+
539
def parquet_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize Parquet bytes into a list of row dicts.

    Best suited for flat/tabular data; nested structures depend on
    Arrow's schema inference.
    """
    pa, pq = _import_pyarrow()
    table = pq.read_table(pa.BufferReader(data))
    return table.to_pylist()
549
+
550
+
551
def dicts_to_parquet(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to Parquet bytes.

    Best suited for flat/tabular data with a uniform schema.
    """
    pa, pq = _import_pyarrow()
    sink = pa.BufferOutputStream()
    pq.write_table(pa.Table.from_pylist(dicts), sink)
    return sink.getvalue().to_pybytes()
561
+
562
+
563
+ # ═══════════════════════════════════════════════════════════════════════════════
564
+ # Arrow IPC / Feather (requires: pip install pyarrow — shared with Parquet)
565
+ # ═══════════════════════════════════════════════════════════════════════════════
566
+
567
def arrow_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize Arrow IPC (Feather v2) bytes into a list of dicts."""
    pa, _ = _import_pyarrow()
    source = pa.BufferReader(data)
    table = pa.ipc.open_file(source).read_all()
    return table.to_pylist()
573
+
574
+
575
def dicts_to_arrow(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to Arrow IPC (Feather v2) bytes.

    The IPC writer is managed with ``with`` so it is always closed —
    finalizing the file footer and releasing resources — even when
    ``write_table`` raises; previously an exception left the writer open.
    """
    pa, _ = _import_pyarrow()
    table = pa.Table.from_pylist(dicts)
    sink = pa.BufferOutputStream()
    # RecordBatchFileWriter supports the context-manager protocol and
    # closes on exit.
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)
    return sink.getvalue().to_pybytes()
584
+
585
+
586
+ # ═══════════════════════════════════════════════════════════════════════════════
587
+ # BSON (requires: pip install pymongo)
588
+ # ═══════════════════════════════════════════════════════════════════════════════
589
+
590
+ def _import_bson():
591
+ """Import bson (from pymongo) with a helpful error message."""
592
+ try:
593
+ import bson
594
+ return bson
595
+ except ImportError:
596
+ raise ImportError(
597
+ "pymongo is required for BSON support. "
598
+ "Install it with: pip install pymongo "
599
+ "or: pip install emergent-translator[formats]"
600
+ )
601
+
602
+
603
def bson_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize concatenated BSON documents into a list of dicts.

    Expects the standard MongoDB wire format: each document is prefixed
    by its own 4-byte little-endian total length. ObjectId ``_id`` values
    are converted to strings for JSON compatibility.

    Raises:
        ValueError: If a document header declares an invalid length
            (< 5 bytes — which previously caused an infinite loop) or a
            document runs past the end of the input.
    """
    import struct

    bson_mod = _import_bson()
    result = []
    offset = 0
    total = len(data)
    while offset + 4 <= total:
        # Each BSON doc starts with a 4-byte little-endian total length.
        doc_len = struct.unpack_from("<i", data, offset)[0]
        if doc_len < 5:
            # The minimum valid BSON document is 5 bytes (int32 length +
            # trailing NUL); a zero/negative length would loop forever.
            raise ValueError(
                f"Invalid BSON document length {doc_len} at offset {offset}"
            )
        if offset + doc_len > total:
            raise ValueError(f"Truncated BSON document at offset {offset}")
        doc = bson_mod.decode(data[offset:offset + doc_len])
        # Convert ObjectId to string for JSON compatibility.
        if "_id" in doc:
            doc["_id"] = str(doc["_id"])
        result.append(doc)
        offset += doc_len
    return result
626
+
627
+
628
def dicts_to_bson(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to concatenated BSON bytes."""
    bson_mod = _import_bson()
    return b"".join(bson_mod.encode(d) for d in dicts)
635
+
636
+
637
+ # ═══════════════════════════════════════════════════════════════════════════════
638
+ # CBOR (requires: pip install cbor2)
639
+ # ═══════════════════════════════════════════════════════════════════════════════
640
+
641
+ def _import_cbor2():
642
+ """Import cbor2 with a helpful error message."""
643
+ try:
644
+ import cbor2
645
+ return cbor2
646
+ except ImportError:
647
+ raise ImportError(
648
+ "cbor2 is required for CBOR support. "
649
+ "Install it with: pip install cbor2 "
650
+ "or: pip install emergent-translator[formats]"
651
+ )
652
+
653
+
654
def cbor_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize CBOR bytes into a list of dicts.

    A CBOR-encoded list of maps is returned as-is; a single map is
    wrapped in a list. Anything else raises ValueError.
    """
    cbor2 = _import_cbor2()
    decoded = cbor2.loads(data)
    if isinstance(decoded, dict):
        return [decoded]
    if isinstance(decoded, list):
        return decoded
    raise ValueError(
        f"Expected list or dict in CBOR data, got {type(decoded).__name__}"
    )
668
+
669
+
670
def dicts_to_cbor(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to CBOR bytes."""
    return _import_cbor2().dumps(dicts)
674
+
675
+
676
+ # ═══════════════════════════════════════════════════════════════════════════════
677
+ # INI (stdlib configparser)
678
+ # ═══════════════════════════════════════════════════════════════════════════════
679
+
680
def ini_to_dicts(text: str) -> List[Dict]:
    """Parse INI text into one dict per section.

    Each dict carries the section's key/value pairs (type-inferred) plus
    a ``_section`` key holding the section name. ``[DEFAULT]`` values are
    merged into every section, per standard configparser behaviour.
    """
    import configparser

    parser = configparser.ConfigParser()
    parser.read_string(text)
    records = []
    for name in parser.sections():
        record: Dict = {"_section": name}
        for key, raw in parser.items(name):
            record[key] = infer_type(raw)
        records.append(record)
    return records
698
+
699
+
700
def dicts_to_ini(dicts: List[Dict]) -> str:
    """Serialize a list of dicts to INI text, one section per dict.

    The section name comes from a ``_section`` key (defaulting to
    ``section_N``). None values become empty strings, booleans are
    lowercased, and nested dict/list values are JSON-stringified.
    """
    import configparser

    def render(value: Any) -> str:
        # Bool before the generic str() so we emit "true"/"false".
        if value is None:
            return ""
        if isinstance(value, bool):
            return "true" if value else "false"
        if isinstance(value, (dict, list)):
            return json.dumps(value, separators=(",", ":"))
        return str(value)

    parser = configparser.ConfigParser()
    for index, record in enumerate(dicts):
        name = str(record.get("_section", f"section_{index}"))
        parser.add_section(name)
        for key, value in record.items():
            if key != "_section":
                parser.set(name, key, render(value))

    sink = io.StringIO()
    parser.write(sink)
    return sink.getvalue()
726
+
727
+
728
+ # ═══════════════════════════════════════════════════════════════════════════════
729
+ # Excel / XLSX (requires: pip install openpyxl)
730
+ # ═══════════════════════════════════════════════════════════════════════════════
731
+
732
+ def _import_openpyxl():
733
+ """Import openpyxl with a helpful error message."""
734
+ try:
735
+ import openpyxl
736
+ return openpyxl
737
+ except ImportError:
738
+ raise ImportError(
739
+ "openpyxl is required for Excel/XLSX support. "
740
+ "Install it with: pip install openpyxl "
741
+ "or: pip install emergent-translator[formats]"
742
+ )
743
+
744
+
745
def xlsx_to_dicts(data: bytes) -> List[Dict]:
    """Deserialize XLSX bytes (active sheet) into a list of dicts.

    The first row supplies the headers; a missing header cell becomes
    ``col_N``. Cell values are used as-is — Excel already carries typed
    values (numbers, strings, booleans, None).
    """
    openpyxl = _import_openpyxl()
    workbook = openpyxl.load_workbook(io.BytesIO(data), read_only=True, data_only=True)
    sheet = workbook.active
    rows = list(sheet.iter_rows(values_only=True))
    workbook.close()
    if not rows:
        return []
    header = [
        f"col_{i}" if cell is None else str(cell)
        for i, cell in enumerate(rows[0])
    ]
    return [dict(zip(header, row)) for row in rows[1:]]
767
+
768
+
769
def dicts_to_xlsx(dicts: List[Dict]) -> bytes:
    """Serialize a list of dicts to XLSX bytes (single sheet, header row).

    Nested dict/list values are JSON-stringified; all other values are
    written as-is so Excel keeps their native types. An empty input
    produces an empty workbook.
    """
    openpyxl = _import_openpyxl()
    workbook = openpyxl.Workbook()
    sheet = workbook.active

    if dicts:
        # Header = union of keys across all records, first-seen order.
        header: List[str] = []
        seen = set()
        for record in dicts:
            for key in record:
                if key not in seen:
                    seen.add(key)
                    header.append(key)

        sheet.append(header)
        for record in dicts:
            cells = []
            for key in header:
                value = record.get(key)
                if isinstance(value, (dict, list)):
                    cells.append(json.dumps(value, separators=(",", ":")))
                else:
                    cells.append(value)
            sheet.append(cells)

    buffer = io.BytesIO()
    workbook.save(buffer)
    return buffer.getvalue()
806
+
807
+
808
+ # ═══════════════════════════════════════════════════════════════════════════════
809
+ # Format Registry
810
+ # ═══════════════════════════════════════════════════════════════════════════════
811
+
812
# Maps format name → (parse_fn, serialize_fn).
# Parse functions take str for text formats and bytes for the formats listed
# in _BINARY_FORMATS; the serializers return the matching type.
# "ndjson"/"jsonl", "yml"/"yaml", "feather"/"arrow", and "excel"/"xlsx" are
# aliases that share a single handler pair.
_HANDLERS: Dict[str, Tuple[Callable, Callable]] = {
    "csv": (csv_to_dicts, dicts_to_csv),
    "jsonl": (jsonl_to_dicts, dicts_to_jsonl),
    "ndjson": (jsonl_to_dicts, dicts_to_jsonl),
    "yaml": (yaml_to_dicts, dicts_to_yaml),
    "yml": (yaml_to_dicts, dicts_to_yaml),
    "toml": (toml_to_dicts, dicts_to_toml),
    "ini": (ini_to_dicts, dicts_to_ini),
    "xml": (xml_to_dicts, dicts_to_xml),
    "msgpack": (msgpack_to_dicts, dicts_to_msgpack),
    "protobuf": (protobuf_to_dicts, dicts_to_protobuf),
    "parquet": (parquet_to_dicts, dicts_to_parquet),
    "arrow": (arrow_to_dicts, dicts_to_arrow),
    "feather": (arrow_to_dicts, dicts_to_arrow),
    "bson": (bson_to_dicts, dicts_to_bson),
    "cbor": (cbor_to_dicts, dicts_to_cbor),
    "xlsx": (xlsx_to_dicts, dicts_to_xlsx),
    "excel": (xlsx_to_dicts, dicts_to_xlsx),
}
832
+
833
# Maps lowercase file extension → format name returned by detect_format().
# ".json" maps to "json", which has no _HANDLERS entry — NOTE(review):
# callers appear expected to handle native JSON themselves; confirm.
# ".xls" is routed to the XLSX handler — NOTE(review): openpyxl only reads
# the newer XLSX format, so legacy .xls input will likely fail at load time.
_EXTENSION_MAP: Dict[str, str] = {
    ".csv": "csv",
    ".jsonl": "jsonl",
    ".ndjson": "jsonl",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".toml": "toml",
    ".ini": "ini",
    ".cfg": "ini",
    ".xml": "xml",
    ".json": "json",
    ".msgpack": "msgpack",
    ".mpk": "msgpack",
    ".pb": "protobuf",
    ".parquet": "parquet",
    ".arrow": "arrow",
    ".feather": "arrow",
    ".ipc": "arrow",
    ".bson": "bson",
    ".cbor": "cbor",
    ".xlsx": "xlsx",
    ".xls": "xlsx",
}
856
+
857
# Format names whose handlers take and return bytes instead of str
# (queried by is_binary_format()); includes the "feather" and "excel"
# aliases so alias lookups behave like their canonical names.
_BINARY_FORMATS = frozenset({
    "msgpack", "protobuf", "parquet", "arrow", "feather", "bson", "cbor",
    "xlsx", "excel",
})
861
+
862
+
863
def is_binary_format(name: str) -> bool:
    """Return True if the named format uses binary (bytes) I/O.

    The lookup is case-insensitive.
    """
    normalized = name.lower()
    return normalized in _BINARY_FORMATS
866
+
867
+
868
def detect_format(filepath: str) -> str:
    """Detect a format name from a file path's extension.

    Returns a format name string (e.g. 'csv', 'yaml', 'json', 'msgpack').
    Raises ValueError when the extension is not recognized.
    """
    _, ext = os.path.splitext(filepath)
    ext = ext.lower()
    try:
        return _EXTENSION_MAP[ext]
    except KeyError:
        raise ValueError(f"Unknown file extension: {ext!r}") from None
879
+
880
+
881
def get_handler(name: str) -> Tuple[Callable, Callable]:
    """Look up (parse_fn, serialize_fn) for a format name.

    Args:
        name: Format name ('csv', 'jsonl', 'yaml', 'toml', 'xml',
            'msgpack', 'protobuf', 'parquet', ...); case-insensitive.

    Returns:
        Tuple of (parse_fn, serialize_fn).

    Raises:
        ValueError: If the format name is not recognized.
    """
    key = name.lower()
    if key not in _HANDLERS:
        supported = ", ".join(sorted(set(_HANDLERS.keys())))
        raise ValueError(
            f"Unknown format: {key!r}. "
            f"Supported formats: {supported}"
        )
    return _HANDLERS[key]