lokit-python 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. {lokit_python-0.1.2 → lokit_python-0.1.3}/PKG-INFO +1 -1
  2. {lokit_python-0.1.2 → lokit_python-0.1.3}/pyproject.toml +1 -1
  3. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/__init__.py +2 -0
  4. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/tmx.py +131 -48
  5. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/xliff.py +36 -20
  6. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/format_detection.py +6 -3
  7. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/importers.py +35 -9
  8. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/io/__init__.py +4 -1
  9. lokit_python-0.1.3/src/lokit/io/stream_json.py +158 -0
  10. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/logic.py +21 -1
  11. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/base.py +55 -30
  12. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/extraction.py +57 -19
  13. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/props.py +34 -12
  14. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/tags.py +9 -6
  15. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/xml_utils.py +9 -2
  16. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xliff/tags.py +20 -14
  17. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit_python.egg-info/PKG-INFO +1 -1
  18. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit_python.egg-info/SOURCES.txt +1 -0
  19. lokit_python-0.1.3/src/lokit_python.egg-info/top_level.txt +2 -0
  20. {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_performance_safety.py +20 -1
  21. lokit_python-0.1.2/src/lokit_python.egg-info/top_level.txt +0 -2
  22. {lokit_python-0.1.2 → lokit_python-0.1.3}/README.md +0 -0
  23. {lokit_python-0.1.2 → lokit_python-0.1.3}/setup.cfg +0 -0
  24. {lokit_python-0.1.2 → lokit_python-0.1.3}/setup.py +0 -0
  25. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/core/__init__.py +0 -0
  26. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/core/logger.py +0 -0
  27. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/data/__init__.py +0 -0
  28. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/data/lang_codes.py +0 -0
  29. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/data/structure.py +0 -0
  30. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/data/tag_types.py +0 -0
  31. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/__init__.py +0 -0
  32. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/csv.py +0 -0
  33. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/html.py +0 -0
  34. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/idml.py +0 -0
  35. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/json_i18n.py +0 -0
  36. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/po.py +0 -0
  37. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/xlsx.py +0 -0
  38. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/io/atomic.py +0 -0
  39. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/io/json.py +0 -0
  40. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/__init__.py +0 -0
  41. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/async_bridge.py +0 -0
  42. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/csv/__init__.py +0 -0
  43. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/csv/extraction.py +0 -0
  44. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/html/__init__.py +0 -0
  45. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/html/extraction.py +0 -0
  46. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/idml/__init__.py +0 -0
  47. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/idml/extraction.py +0 -0
  48. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/json_i18n/__init__.py +0 -0
  49. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/json_i18n/extraction.py +0 -0
  50. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/po/__init__.py +0 -0
  51. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/po/extraction.py +0 -0
  52. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/__init__.py +0 -0
  53. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/header.py +0 -0
  54. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/helpers.py +0 -0
  55. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/models.py +0 -0
  56. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/parallel.py +0 -0
  57. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xliff/__init__.py +0 -0
  58. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xliff/extraction.py +0 -0
  59. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xlsx/__init__.py +0 -0
  60. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xlsx/extraction.py +0 -0
  61. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/py.typed +0 -0
  62. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit_python.egg-info/dependency_links.txt +0 -0
  63. {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit_python.egg-info/requires.txt +0 -0
  64. {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_csv.py +0 -0
  65. {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_html.py +0 -0
  66. {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_idml.py +0 -0
  67. {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_json_i18n.py +0 -0
  68. {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_po.py +0 -0
  69. {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_xlsx.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lokit-python
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files.
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "lokit-python"
3
- version = "0.1.2"
3
+ version = "0.1.3"
4
4
  description = "A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -65,6 +65,7 @@ from lokit.importers import (
65
65
  import_xlsx_async,
66
66
  )
67
67
  from lokit.io import load_lokit_json, load_lokit_json_bytes
68
+ from lokit.io.stream_json import LokitJsonContext
68
69
  from lokit.logic import Lokit, MatchResult
69
70
  from lokit.parsers.csv.extraction import CsvExtractor
70
71
  from lokit.parsers.xlsx.extraction import XlsxExtractor
@@ -86,6 +87,7 @@ __all__ = [
86
87
  "Data",
87
88
  "Meta",
88
89
  "Lokit",
90
+ "LokitJsonContext",
89
91
  "MatchResult",
90
92
  "Origin",
91
93
  "Plural",
@@ -1,7 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from collections.abc import Iterable
4
+ from dataclasses import dataclass
4
5
  from pathlib import Path
6
+ from typing import Any
5
7
 
6
8
  from lxml import etree
7
9
  from lxml.etree import _Element
@@ -25,6 +27,13 @@ from lokit.io.atomic import atomic_output_path
25
27
  Structure = BaseStructure | StreamingStructure
26
28
 
27
29
 
30
+ @dataclass(slots=True)
31
+ class _CommentSummary:
32
+ creator_id: str | None = None
33
+ project: str | None = None
34
+ system: str | None = None
35
+
36
+
28
37
  def export_tmx(document: Structure, filepath: str | Path) -> None:
29
38
  path = Path(filepath)
30
39
  with atomic_output_path(path, "wb") as stream:
@@ -75,9 +84,9 @@ def _build_tu(unit_id: str, unit: Data, document: BaseStructure) -> _Element:
75
84
  attrs["creationdate"] = unit.meta.created
76
85
  if unit.meta.updated:
77
86
  attrs["changedate"] = unit.meta.updated
78
- creator_id = _first_creator_id(unit)
79
- if creator_id:
80
- attrs["creationid"] = creator_id
87
+ comment_summary = _comment_summary(unit)
88
+ if comment_summary.creator_id:
89
+ attrs["creationid"] = comment_summary.creator_id
81
90
  change_id = unit.meta.extensions.get("change_id")
82
91
  if change_id:
83
92
  attrs["changeid"] = change_id
@@ -85,7 +94,7 @@ def _build_tu(unit_id: str, unit: Data, document: BaseStructure) -> _Element:
85
94
  attrs["usagecount"] = str(unit.meta.usage_count)
86
95
 
87
96
  tu = etree.Element("tu", attrs)
88
- _append_unit_properties(tu, unit)
97
+ _append_unit_properties(tu, unit, comment_summary)
89
98
  _append_comments(tu, unit)
90
99
  tu.append(
91
100
  _build_tuv(
@@ -108,7 +117,7 @@ def _build_tu(unit_id: str, unit: Data, document: BaseStructure) -> _Element:
108
117
 
109
118
 
110
119
  def _write_tu(
111
- xf: etree.xmlfile,
120
+ xf: Any,
112
121
  unit_id: str,
113
122
  unit: Data,
114
123
  document: Structure,
@@ -118,9 +127,9 @@ def _write_tu(
118
127
  attrs["creationdate"] = unit.meta.created
119
128
  if unit.meta.updated:
120
129
  attrs["changedate"] = unit.meta.updated
121
- creator_id = _first_creator_id(unit)
122
- if creator_id:
123
- attrs["creationid"] = creator_id
130
+ comment_summary = _comment_summary(unit)
131
+ if comment_summary.creator_id:
132
+ attrs["creationid"] = comment_summary.creator_id
124
133
  change_id = unit.meta.extensions.get("change_id")
125
134
  if change_id:
126
135
  attrs["changeid"] = change_id
@@ -128,31 +137,30 @@ def _write_tu(
128
137
  attrs["usagecount"] = str(unit.meta.usage_count)
129
138
 
130
139
  with xf.element("tu", attrs):
131
- prop_holder = etree.Element("props")
132
- _append_unit_properties(prop_holder, unit)
133
- _append_comments(prop_holder, unit)
134
- for child in prop_holder:
135
- xf.write(child)
136
- xf.write(
137
- _build_tuv(
138
- document.source_locale,
139
- unit.source,
140
- unit.tags.source_parts if unit.tags else [],
141
- unit.tags.source_tag_map if unit.tags else {},
142
- )
140
+ _write_unit_properties(xf, unit, comment_summary)
141
+ _write_comments(xf, unit)
142
+ _write_tuv(
143
+ xf,
144
+ document.source_locale,
145
+ unit.source,
146
+ unit.tags.source_parts if unit.tags else [],
147
+ unit.tags.source_tag_map if unit.tags else {},
143
148
  )
144
149
  if document.target_locale is not None and unit.target is not None:
145
- xf.write(
146
- _build_tuv(
147
- document.target_locale,
148
- unit.target,
149
- unit.tags.target_parts if unit.tags else [],
150
- unit.tags.target_tag_map if unit.tags else {},
151
- )
150
+ _write_tuv(
151
+ xf,
152
+ document.target_locale,
153
+ unit.target,
154
+ unit.tags.target_parts if unit.tags else [],
155
+ unit.tags.target_tag_map if unit.tags else {},
152
156
  )
153
157
 
154
158
 
155
- def _append_unit_properties(tu: _Element, unit: Data) -> None:
159
+ def _append_unit_properties(
160
+ tu: _Element,
161
+ unit: Data,
162
+ comment_summary: _CommentSummary | None = None,
163
+ ) -> None:
156
164
  if unit.status != TranslationStatus.UNKNOWN:
157
165
  prop = etree.SubElement(tu, "prop", type="x-status")
158
166
  prop.text = unit.status.value
@@ -167,19 +175,47 @@ def _append_unit_properties(tu: _Element, unit: Data) -> None:
167
175
  _append_prop_if_present(tu, "x-next-source-text", unit.next_context.source)
168
176
  _append_prop_if_present(tu, "x-next-target-text", unit.next_context.target)
169
177
 
170
- project = _first_project(unit)
171
- if project:
172
- _append_prop_if_present(tu, "x-project", project)
178
+ summary = comment_summary or _comment_summary(unit)
179
+ if summary.project:
180
+ _append_prop_if_present(tu, "x-project", summary.project)
173
181
 
174
- system = _first_system(unit)
175
- if system:
176
- _append_prop_if_present(tu, "x-system", system)
182
+ if summary.system:
183
+ _append_prop_if_present(tu, "x-system", summary.system)
177
184
 
178
185
  for key, value in unit.extensions.items():
179
186
  if key.startswith("property."):
180
187
  _append_prop_if_present(tu, _property_type(key), value)
181
188
 
182
189
 
190
+ def _write_unit_properties(
191
+ xf: Any,
192
+ unit: Data,
193
+ comment_summary: _CommentSummary,
194
+ ) -> None:
195
+ if unit.status != TranslationStatus.UNKNOWN:
196
+ _write_prop(xf, "x-status", unit.status.value)
197
+
198
+ if unit.previous_context is not None:
199
+ _write_prop_if_present(xf, "x-previous-id", unit.previous_context.unit_id)
200
+ _write_prop_if_present(xf, "x-previous-source-text", unit.previous_context.source)
201
+ _write_prop_if_present(xf, "x-previous-target-text", unit.previous_context.target)
202
+
203
+ if unit.next_context is not None:
204
+ _write_prop_if_present(xf, "x-next-id", unit.next_context.unit_id)
205
+ _write_prop_if_present(xf, "x-next-source-text", unit.next_context.source)
206
+ _write_prop_if_present(xf, "x-next-target-text", unit.next_context.target)
207
+
208
+ if comment_summary.project:
209
+ _write_prop(xf, "x-project", comment_summary.project)
210
+
211
+ if comment_summary.system:
212
+ _write_prop(xf, "x-system", comment_summary.system)
213
+
214
+ for key, value in unit.extensions.items():
215
+ if key.startswith("property."):
216
+ _write_prop_if_present(xf, _property_type(key), value)
217
+
218
+
183
219
  def _append_comments(tu: _Element, unit: Data) -> None:
184
220
  for comment in unit.comments:
185
221
  if not comment.context:
@@ -188,6 +224,13 @@ def _append_comments(tu: _Element, unit: Data) -> None:
188
224
  note.text = comment.context
189
225
 
190
226
 
227
+ def _write_comments(xf: Any, unit: Data) -> None:
228
+ for comment in unit.comments:
229
+ if comment.context:
230
+ with xf.element("note"):
231
+ xf.write(comment.context)
232
+
233
+
191
234
  def _build_tuv(
192
235
  locale: str,
193
236
  text: str,
@@ -199,6 +242,17 @@ def _build_tuv(
199
242
  return tuv
200
243
 
201
244
 
245
+ def _write_tuv(
246
+ xf: Any,
247
+ locale: str,
248
+ text: str,
249
+ parts: list[SegmentPart],
250
+ tag_map: dict[str, TieData],
251
+ ) -> None:
252
+ with xf.element("tuv", lang=locale):
253
+ _write_seg(xf, text, parts, tag_map)
254
+
255
+
202
256
  def _build_seg(
203
257
  text: str,
204
258
  parts: list[SegmentPart],
@@ -224,6 +278,25 @@ def _build_seg(
224
278
  return seg
225
279
 
226
280
 
281
+ def _write_seg(
282
+ xf: Any,
283
+ text: str,
284
+ parts: list[SegmentPart],
285
+ tag_map: dict[str, TieData],
286
+ ) -> None:
287
+ effective_parts = parts if parts else [TextPart(text)]
288
+ pair_numbers = _pair_numbers(tag_map)
289
+ with xf.element("seg"):
290
+ for part in effective_parts:
291
+ if isinstance(part, TextPart):
292
+ xf.write(part.value)
293
+ elif isinstance(part, CodePart):
294
+ code = tag_map.get(part.ref)
295
+ if code is None:
296
+ continue
297
+ xf.write(_build_code_element(code, pair_numbers))
298
+
299
+
227
300
  def _build_code_element(code: TieData, pair_numbers: dict[str, str]) -> _Element:
228
301
  if code.original_name in {"bpt", "ept", "ph", "it", "ut", "hi"}:
229
302
  attrs = dict(code.attributes)
@@ -284,25 +357,35 @@ def _append_prop_if_present(tu: _Element, prop_type: str, value: str | None) ->
284
357
  prop.text = value
285
358
 
286
359
 
287
- def _first_creator_id(unit: Data) -> str | None:
288
- for comment in unit.comments:
289
- if comment.origin is not None and comment.origin.creator_id:
290
- return comment.origin.creator_id
291
- return None
360
+ def _write_prop_if_present(xf: Any, prop_type: str, value: str | None) -> None:
361
+ if value is not None and value != "":
362
+ _write_prop(xf, prop_type, value)
292
363
 
293
364
 
294
- def _first_project(unit: Data) -> str | None:
295
- for comment in unit.comments:
296
- if comment.origin is not None and comment.origin.project:
297
- return comment.origin.project
298
- return None
365
+ def _write_prop(xf: Any, prop_type: str, value: str) -> None:
366
+ with xf.element("prop", type=prop_type):
367
+ xf.write(value)
299
368
 
300
369
 
301
- def _first_system(unit: Data) -> str | None:
370
+ def _comment_summary(unit: Data) -> _CommentSummary:
371
+ summary = _CommentSummary()
302
372
  for comment in unit.comments:
303
- if comment.origin is not None and comment.origin.system:
304
- return comment.origin.system
305
- return None
373
+ origin = comment.origin
374
+ if origin is None:
375
+ continue
376
+ if summary.creator_id is None and origin.creator_id:
377
+ summary.creator_id = origin.creator_id
378
+ if summary.project is None and origin.project:
379
+ summary.project = origin.project
380
+ if summary.system is None and origin.system:
381
+ summary.system = origin.system
382
+ if (
383
+ summary.creator_id is not None
384
+ and summary.project is not None
385
+ and summary.system is not None
386
+ ):
387
+ break
388
+ return summary
306
389
 
307
390
 
308
391
  def _property_type(key: str) -> str:
@@ -90,38 +90,54 @@ def _write_file(
90
90
  with xf.element(f"{{{XLIFF_NS}}}file", attrs):
91
91
  xf.write(etree.Element(f"{{{XLIFF_NS}}}header"))
92
92
  with xf.element(f"{{{XLIFF_NS}}}body"):
93
- xf.write(_build_trans_unit(first_id, first_unit))
93
+ _write_trans_unit(xf, first_id, first_unit)
94
94
  for unit_id, unit in unit_iter:
95
- xf.write(_build_trans_unit(unit_id, unit))
95
+ _write_trans_unit(xf, unit_id, unit)
96
96
 
97
97
 
98
- def _build_trans_unit(unit_id: str, unit: Data) -> _Element:
98
+ def _write_trans_unit(xf: Any, unit_id: str, unit: Data) -> None:
99
99
  attrs = {"id": unit.extensions.get("unit_id", unit_id)}
100
100
  space = unit.extensions.get("space")
101
101
  if space:
102
102
  attrs["{http://www.w3.org/XML/1998/namespace}space"] = space
103
- trans_unit = etree.Element(f"{{{XLIFF_NS}}}trans-unit", attrs)
104
- trans_unit.append(
105
- _build_segment(
103
+ with xf.element(f"{{{XLIFF_NS}}}trans-unit", attrs):
104
+ _write_segment(
105
+ xf,
106
106
  "source",
107
107
  unit.source,
108
108
  unit.tags.source_parts if unit.tags else [],
109
109
  unit.tags.source_tag_map if unit.tags else {},
110
110
  )
111
- )
112
- if unit.target is not None:
113
- target = _build_segment(
114
- "target",
115
- unit.target,
116
- unit.tags.target_parts if unit.tags else [],
117
- unit.tags.target_tag_map if unit.tags else {},
118
- )
119
- trans_unit.append(target)
120
- for comment in unit.comments:
121
- if comment.context:
122
- note = etree.SubElement(trans_unit, f"{{{XLIFF_NS}}}note")
123
- note.text = comment.context
124
- return trans_unit
111
+ if unit.target is not None:
112
+ _write_segment(
113
+ xf,
114
+ "target",
115
+ unit.target,
116
+ unit.tags.target_parts if unit.tags else [],
117
+ unit.tags.target_tag_map if unit.tags else {},
118
+ )
119
+ for comment in unit.comments:
120
+ if comment.context:
121
+ with xf.element(f"{{{XLIFF_NS}}}note"):
122
+ xf.write(comment.context)
123
+
124
+
125
+ def _write_segment(
126
+ xf: Any,
127
+ name: str,
128
+ text: str,
129
+ parts: list[SegmentPart],
130
+ tag_map: dict[str, TieData],
131
+ ) -> None:
132
+ with xf.element(f"{{{XLIFF_NS}}}{name}"):
133
+ effective_parts = parts if parts else [TextPart(text)]
134
+ for part in effective_parts:
135
+ if isinstance(part, TextPart):
136
+ xf.write(part.value)
137
+ elif isinstance(part, CodePart):
138
+ code = tag_map.get(part.ref)
139
+ if code is not None:
140
+ xf.write(_build_code(code))
125
141
 
126
142
 
127
143
  def _build_segment(
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import json
4
+ import re
4
5
  import zipfile
5
6
  from enum import StrEnum
6
7
  from io import BytesIO
@@ -8,6 +9,8 @@ from pathlib import Path
8
9
 
9
10
  from lokit.parsers.tmx.xml_utils import iterparse_safe, local_name
10
11
 
12
+ _JSON_FORMAT_RE = re.compile(r'"(?:format_version|data)"\s*:')
13
+
11
14
 
12
15
  class LokitInputFormat(StrEnum):
13
16
  TMX = "tmx"
@@ -36,9 +39,9 @@ def detect_format(filepath: str | Path) -> LokitInputFormat:
36
39
  return LokitInputFormat.IDML
37
40
  if suffix == ".json":
38
41
  try:
39
- with path.open("r", encoding="utf-8") as f:
40
- data = json.load(f)
41
- if isinstance(data, dict) and ("format_version" in data or "data" in data):
42
+ with path.open("rb") as f:
43
+ data = f.read(4096)
44
+ if _JSON_FORMAT_RE.search(data.decode("utf-8", errors="ignore")):
42
45
  return LokitInputFormat.LOKIT_JSON
43
46
  except Exception:
44
47
  pass
@@ -7,7 +7,7 @@ from time import perf_counter
7
7
  from lokit.data.structure import BaseStructure, Data, StreamingStructure, ConversionStats
8
8
  from lokit.format_detection import LokitInputFormat, detect_format
9
9
  from lokit.exporters import export_csv, export_tmx, export_xliff
10
- from lokit.parsers.tmx.xml_utils import iterparse_safe, local_name
10
+ from lokit.parsers.tmx.xml_utils import local_name
11
11
  from lokit.parsers.csv.extraction import CsvExtractor
12
12
  from lokit.parsers.xlsx.extraction import XlsxExtractor
13
13
  from lokit.parsers.html.extraction import HtmlExtractor
@@ -62,6 +62,7 @@ def import_tmx_parallel(
62
62
  parse_header=not (source_language and target_language),
63
63
  mode=mode,
64
64
  )
65
+ extractor._initialize_from_file()
65
66
  parsed_data: dict[str, Data] = {
66
67
  unit_id: data
67
68
  for unit_id, data in extract_tmx_parallel(
@@ -93,6 +94,7 @@ def stream_tmx_parallel(
93
94
  parse_header=not (source_language and target_language),
94
95
  mode=mode,
95
96
  )
97
+ extractor._initialize_from_file()
96
98
  return StreamingStructure(
97
99
  source_locale=extractor.source_locale or extractor.native_source,
98
100
  target_locale=extractor.target_locale or extractor.native_target or None,
@@ -584,14 +586,38 @@ def _build_idml_structure(
584
586
 
585
587
 
586
588
  def _validate_xml_root(filepath: str, expected: str) -> None:
587
- context = iterparse_safe(filepath, events=("start",))
588
- for _, element in context:
589
- root = local_name(element.tag).lower()
590
- if root != expected:
591
- raise ValueError(
592
- f"Expected {expected.upper()} XML root in {filepath!r}, found {root!r}"
593
- )
594
- return
589
+ with open(filepath, "rb") as f:
590
+ data = f.read(4096)
591
+ root = _peek_xml_root(data)
592
+ if root != expected:
593
+ found = root or "unknown"
594
+ raise ValueError(
595
+ f"Expected {expected.upper()} XML root in {filepath!r}, found {found!r}"
596
+ )
597
+
598
+
599
+ def _peek_xml_root(data: bytes) -> str:
600
+ index = 0
601
+ data_len = len(data)
602
+ while index < data_len:
603
+ start = data.find(b"<", index)
604
+ if start < 0 or start + 1 >= data_len:
605
+ return ""
606
+ marker = data[start + 1 : start + 2]
607
+ if marker in (b"?", b"!"):
608
+ end = data.find(b">", start + 1)
609
+ if end < 0:
610
+ return ""
611
+ index = end + 1
612
+ continue
613
+ end = start + 1
614
+ while end < data_len and data[end] not in b" />\t\r\n":
615
+ end += 1
616
+ raw = data[start + 1 : end].decode("utf-8", errors="ignore")
617
+ if ":" in raw:
618
+ raw = raw.rsplit(":", 1)[-1]
619
+ return local_name(raw).lower()
620
+ return ""
595
621
 
596
622
 
597
623
  def _convert_tmx(
@@ -1,3 +1,6 @@
1
1
  from lokit.io.json import load_lokit_json, load_lokit_json_bytes
2
2
 
3
- __all__ = ["load_lokit_json", "load_lokit_json_bytes"]
3
+ __all__ = [
4
+ "load_lokit_json",
5
+ "load_lokit_json_bytes",
6
+ ]
@@ -0,0 +1,158 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from collections.abc import AsyncIterator, Iterable
5
+ from dataclasses import asdict, is_dataclass
6
+ from enum import StrEnum
7
+ from typing import TextIO
8
+ from pathlib import Path
9
+
10
+ from lokit.data.structure import Data
11
+ from lokit.format_detection import LokitInputFormat, detect_format
12
+ from lokit.io.atomic import atomic_output_path
13
+ from lokit.parsers.tmx.models import TmxParseMode
14
+
15
+
16
+ class LokitJsonContext(StrEnum):
17
+ SOURCE = "source"
18
+ TARGET = "target"
19
+ PLURAL = "plural"
20
+ TAGS = "tags"
21
+ META = "meta"
22
+ STATUS = "status"
23
+ COMMENTS = "comments"
24
+ PREVIOUS_CONTEXT = "previous_context"
25
+ NEXT_CONTEXT = "next_context"
26
+ EXTENSIONS = "extensions"
27
+
28
+
29
+ DEFAULT_JSON_CONTEXT: tuple[LokitJsonContext, LokitJsonContext] = (
30
+ LokitJsonContext.SOURCE,
31
+ LokitJsonContext.TARGET,
32
+ )
33
+
34
+
35
+ async def write_lokit_json_stream(
36
+ filepath: str | Path,
37
+ output: str | Path,
38
+ context: Iterable[LokitJsonContext | str] | None = None,
39
+ ) -> Path:
40
+ input_path = Path(filepath)
41
+ output_path = _resolve_output_path(input_path, Path(output))
42
+ selected = _normalize_context(context)
43
+ output_path.parent.mkdir(parents=True, exist_ok=True)
44
+ input_format = detect_format(input_path)
45
+
46
+ with atomic_output_path(output_path, "w") as f:
47
+ if input_format is LokitInputFormat.TMX:
48
+ from lokit.parsers.tmx.extraction import TmxExtractor
49
+
50
+ for unit_id, data in TmxExtractor(
51
+ str(input_path),
52
+ mode=_tmx_mode(selected),
53
+ ).extract():
54
+ _write_record(f, unit_id, data, selected)
55
+ else:
56
+ async for unit_id, data in _stream_units(input_path):
57
+ _write_record(f, unit_id, data, selected)
58
+ return output_path
59
+
60
+
61
+ def _resolve_output_path(input_path: Path, output: Path) -> Path:
62
+ if output.suffix:
63
+ return output
64
+ return output / f"{input_path.stem}.jsonl"
65
+
66
+
67
+ def _normalize_context(
68
+ context: Iterable[LokitJsonContext | str] | None,
69
+ ) -> tuple[LokitJsonContext, ...]:
70
+ if context is None:
71
+ return DEFAULT_JSON_CONTEXT
72
+ return tuple(_normalize_context_item(item) for item in context)
73
+
74
+
75
+ def _normalize_context_item(item: LokitJsonContext | str) -> LokitJsonContext:
76
+ if isinstance(item, LokitJsonContext):
77
+ return item
78
+ return LokitJsonContext(item)
79
+
80
+
81
+ def _write_record(
82
+ f: TextIO,
83
+ unit_id: str,
84
+ data: Data,
85
+ selected: tuple[LokitJsonContext, ...],
86
+ ) -> None:
87
+ if selected == DEFAULT_JSON_CONTEXT:
88
+ dumps = json.dumps
89
+ f.write(
90
+ '{"id":'
91
+ + dumps(unit_id, ensure_ascii=False, separators=(",", ":"), default=str)
92
+ + ',"source":'
93
+ + dumps(data.source, ensure_ascii=False, separators=(",", ":"), default=str)
94
+ + ',"target":'
95
+ + dumps(data.target, ensure_ascii=False, separators=(",", ":"), default=str)
96
+ + "}\n"
97
+ )
98
+ return
99
+ record: dict[str, object] = {"id": unit_id}
100
+ for key in selected:
101
+ record[key.value] = _json_value(data, key)
102
+ json.dump(record, f, ensure_ascii=False, separators=(",", ":"), default=str)
103
+ f.write("\n")
104
+
105
+
106
+ def _stream_units(input_path: Path) -> AsyncIterator[tuple[str, Data]]:
107
+ from lokit.importers import import_file_async
108
+
109
+ return import_file_async(str(input_path))
110
+
111
+
112
+ def _tmx_mode(selected: tuple[LokitJsonContext, ...]) -> TmxParseMode:
113
+ full_keys = {
114
+ LokitJsonContext.PLURAL,
115
+ LokitJsonContext.TAGS,
116
+ LokitJsonContext.META,
117
+ LokitJsonContext.COMMENTS,
118
+ LokitJsonContext.PREVIOUS_CONTEXT,
119
+ LokitJsonContext.NEXT_CONTEXT,
120
+ LokitJsonContext.EXTENSIONS,
121
+ }
122
+ if any(key in full_keys for key in selected):
123
+ return TmxParseMode.FULL
124
+ if LokitJsonContext.STATUS in selected:
125
+ return TmxParseMode.TEXT_WITH_STATUS
126
+ return TmxParseMode.TEXT
127
+
128
+
129
+ def _json_value(data: Data, key: LokitJsonContext) -> object:
130
+ if key is LokitJsonContext.SOURCE:
131
+ return data.source
132
+ if key is LokitJsonContext.TARGET:
133
+ return data.target
134
+ if key is LokitJsonContext.PLURAL:
135
+ return _to_jsonable(data.plural)
136
+ if key is LokitJsonContext.TAGS:
137
+ return _to_jsonable(data.tags)
138
+ if key is LokitJsonContext.META:
139
+ return _to_jsonable(data.meta)
140
+ if key is LokitJsonContext.STATUS:
141
+ return data.status.value
142
+ if key is LokitJsonContext.COMMENTS:
143
+ return _to_jsonable(data.comments)
144
+ if key is LokitJsonContext.PREVIOUS_CONTEXT:
145
+ return _to_jsonable(data.previous_context)
146
+ if key is LokitJsonContext.NEXT_CONTEXT:
147
+ return _to_jsonable(data.next_context)
148
+ return _to_jsonable(data.extensions)
149
+
150
+
151
+ def _to_jsonable(value: object) -> object:
152
+ if is_dataclass(value) and not isinstance(value, type):
153
+ return asdict(value)
154
+ if isinstance(value, list):
155
+ return [_to_jsonable(item) for item in value]
156
+ if isinstance(value, dict):
157
+ return {str(key): _to_jsonable(item) for key, item in value.items()}
158
+ return value