lokit-python 0.1.2__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lokit_python-0.1.2 → lokit_python-0.1.3}/PKG-INFO +1 -1
- {lokit_python-0.1.2 → lokit_python-0.1.3}/pyproject.toml +1 -1
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/__init__.py +2 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/tmx.py +131 -48
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/xliff.py +36 -20
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/format_detection.py +6 -3
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/importers.py +35 -9
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/io/__init__.py +4 -1
- lokit_python-0.1.3/src/lokit/io/stream_json.py +158 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/logic.py +21 -1
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/base.py +55 -30
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/extraction.py +57 -19
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/props.py +34 -12
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/tags.py +9 -6
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/xml_utils.py +9 -2
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xliff/tags.py +20 -14
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit_python.egg-info/PKG-INFO +1 -1
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit_python.egg-info/SOURCES.txt +1 -0
- lokit_python-0.1.3/src/lokit_python.egg-info/top_level.txt +2 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_performance_safety.py +20 -1
- lokit_python-0.1.2/src/lokit_python.egg-info/top_level.txt +0 -2
- {lokit_python-0.1.2 → lokit_python-0.1.3}/README.md +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/setup.cfg +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/setup.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/core/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/core/logger.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/data/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/data/lang_codes.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/data/structure.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/data/tag_types.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/csv.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/html.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/idml.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/json_i18n.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/po.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/exporters/xlsx.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/io/atomic.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/io/json.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/async_bridge.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/csv/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/csv/extraction.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/html/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/html/extraction.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/idml/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/idml/extraction.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/json_i18n/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/json_i18n/extraction.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/po/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/po/extraction.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/header.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/helpers.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/models.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/tmx/parallel.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xliff/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xliff/extraction.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xlsx/__init__.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/parsers/xlsx/extraction.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit/py.typed +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit_python.egg-info/dependency_links.txt +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/src/lokit_python.egg-info/requires.txt +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_csv.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_html.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_idml.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_json_i18n.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_po.py +0 -0
- {lokit_python-0.1.2 → lokit_python-0.1.3}/tests/test_xlsx.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: lokit-python
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files.
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "lokit-python"
|
|
3
|
-
version = "0.1.2"
|
|
3
|
+
version = "0.1.3"
|
|
4
4
|
description = "A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -65,6 +65,7 @@ from lokit.importers import (
|
|
|
65
65
|
import_xlsx_async,
|
|
66
66
|
)
|
|
67
67
|
from lokit.io import load_lokit_json, load_lokit_json_bytes
|
|
68
|
+
from lokit.io.stream_json import LokitJsonContext
|
|
68
69
|
from lokit.logic import Lokit, MatchResult
|
|
69
70
|
from lokit.parsers.csv.extraction import CsvExtractor
|
|
70
71
|
from lokit.parsers.xlsx.extraction import XlsxExtractor
|
|
@@ -86,6 +87,7 @@ __all__ = [
|
|
|
86
87
|
"Data",
|
|
87
88
|
"Meta",
|
|
88
89
|
"Lokit",
|
|
90
|
+
"LokitJsonContext",
|
|
89
91
|
"MatchResult",
|
|
90
92
|
"Origin",
|
|
91
93
|
"Plural",
|
|
@@ -1,7 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
from collections.abc import Iterable
|
|
4
|
+
from dataclasses import dataclass
|
|
4
5
|
from pathlib import Path
|
|
6
|
+
from typing import Any
|
|
5
7
|
|
|
6
8
|
from lxml import etree
|
|
7
9
|
from lxml.etree import _Element
|
|
@@ -25,6 +27,13 @@ from lokit.io.atomic import atomic_output_path
|
|
|
25
27
|
Structure = BaseStructure | StreamingStructure
|
|
26
28
|
|
|
27
29
|
|
|
30
|
+
@dataclass(slots=True)
|
|
31
|
+
class _CommentSummary:
|
|
32
|
+
creator_id: str | None = None
|
|
33
|
+
project: str | None = None
|
|
34
|
+
system: str | None = None
|
|
35
|
+
|
|
36
|
+
|
|
28
37
|
def export_tmx(document: Structure, filepath: str | Path) -> None:
|
|
29
38
|
path = Path(filepath)
|
|
30
39
|
with atomic_output_path(path, "wb") as stream:
|
|
@@ -75,9 +84,9 @@ def _build_tu(unit_id: str, unit: Data, document: BaseStructure) -> _Element:
|
|
|
75
84
|
attrs["creationdate"] = unit.meta.created
|
|
76
85
|
if unit.meta.updated:
|
|
77
86
|
attrs["changedate"] = unit.meta.updated
|
|
78
|
-
|
|
79
|
-
if creator_id:
|
|
80
|
-
attrs["creationid"] = creator_id
|
|
87
|
+
comment_summary = _comment_summary(unit)
|
|
88
|
+
if comment_summary.creator_id:
|
|
89
|
+
attrs["creationid"] = comment_summary.creator_id
|
|
81
90
|
change_id = unit.meta.extensions.get("change_id")
|
|
82
91
|
if change_id:
|
|
83
92
|
attrs["changeid"] = change_id
|
|
@@ -85,7 +94,7 @@ def _build_tu(unit_id: str, unit: Data, document: BaseStructure) -> _Element:
|
|
|
85
94
|
attrs["usagecount"] = str(unit.meta.usage_count)
|
|
86
95
|
|
|
87
96
|
tu = etree.Element("tu", attrs)
|
|
88
|
-
_append_unit_properties(tu, unit)
|
|
97
|
+
_append_unit_properties(tu, unit, comment_summary)
|
|
89
98
|
_append_comments(tu, unit)
|
|
90
99
|
tu.append(
|
|
91
100
|
_build_tuv(
|
|
@@ -108,7 +117,7 @@ def _build_tu(unit_id: str, unit: Data, document: BaseStructure) -> _Element:
|
|
|
108
117
|
|
|
109
118
|
|
|
110
119
|
def _write_tu(
|
|
111
|
-
xf:
|
|
120
|
+
xf: Any,
|
|
112
121
|
unit_id: str,
|
|
113
122
|
unit: Data,
|
|
114
123
|
document: Structure,
|
|
@@ -118,9 +127,9 @@ def _write_tu(
|
|
|
118
127
|
attrs["creationdate"] = unit.meta.created
|
|
119
128
|
if unit.meta.updated:
|
|
120
129
|
attrs["changedate"] = unit.meta.updated
|
|
121
|
-
|
|
122
|
-
if creator_id:
|
|
123
|
-
attrs["creationid"] = creator_id
|
|
130
|
+
comment_summary = _comment_summary(unit)
|
|
131
|
+
if comment_summary.creator_id:
|
|
132
|
+
attrs["creationid"] = comment_summary.creator_id
|
|
124
133
|
change_id = unit.meta.extensions.get("change_id")
|
|
125
134
|
if change_id:
|
|
126
135
|
attrs["changeid"] = change_id
|
|
@@ -128,31 +137,30 @@ def _write_tu(
|
|
|
128
137
|
attrs["usagecount"] = str(unit.meta.usage_count)
|
|
129
138
|
|
|
130
139
|
with xf.element("tu", attrs):
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
unit.source,
|
|
140
|
-
unit.tags.source_parts if unit.tags else [],
|
|
141
|
-
unit.tags.source_tag_map if unit.tags else {},
|
|
142
|
-
)
|
|
140
|
+
_write_unit_properties(xf, unit, comment_summary)
|
|
141
|
+
_write_comments(xf, unit)
|
|
142
|
+
_write_tuv(
|
|
143
|
+
xf,
|
|
144
|
+
document.source_locale,
|
|
145
|
+
unit.source,
|
|
146
|
+
unit.tags.source_parts if unit.tags else [],
|
|
147
|
+
unit.tags.source_tag_map if unit.tags else {},
|
|
143
148
|
)
|
|
144
149
|
if document.target_locale is not None and unit.target is not None:
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
)
|
|
150
|
+
_write_tuv(
|
|
151
|
+
xf,
|
|
152
|
+
document.target_locale,
|
|
153
|
+
unit.target,
|
|
154
|
+
unit.tags.target_parts if unit.tags else [],
|
|
155
|
+
unit.tags.target_tag_map if unit.tags else {},
|
|
152
156
|
)
|
|
153
157
|
|
|
154
158
|
|
|
155
|
-
def _append_unit_properties(
|
|
159
|
+
def _append_unit_properties(
|
|
160
|
+
tu: _Element,
|
|
161
|
+
unit: Data,
|
|
162
|
+
comment_summary: _CommentSummary | None = None,
|
|
163
|
+
) -> None:
|
|
156
164
|
if unit.status != TranslationStatus.UNKNOWN:
|
|
157
165
|
prop = etree.SubElement(tu, "prop", type="x-status")
|
|
158
166
|
prop.text = unit.status.value
|
|
@@ -167,19 +175,47 @@ def _append_unit_properties(tu: _Element, unit: Data) -> None:
|
|
|
167
175
|
_append_prop_if_present(tu, "x-next-source-text", unit.next_context.source)
|
|
168
176
|
_append_prop_if_present(tu, "x-next-target-text", unit.next_context.target)
|
|
169
177
|
|
|
170
|
-
|
|
171
|
-
if project:
|
|
172
|
-
_append_prop_if_present(tu, "x-project", project)
|
|
178
|
+
summary = comment_summary or _comment_summary(unit)
|
|
179
|
+
if summary.project:
|
|
180
|
+
_append_prop_if_present(tu, "x-project", summary.project)
|
|
173
181
|
|
|
174
|
-
system
|
|
175
|
-
|
|
176
|
-
_append_prop_if_present(tu, "x-system", system)
|
|
182
|
+
if summary.system:
|
|
183
|
+
_append_prop_if_present(tu, "x-system", summary.system)
|
|
177
184
|
|
|
178
185
|
for key, value in unit.extensions.items():
|
|
179
186
|
if key.startswith("property."):
|
|
180
187
|
_append_prop_if_present(tu, _property_type(key), value)
|
|
181
188
|
|
|
182
189
|
|
|
190
|
+
def _write_unit_properties(
|
|
191
|
+
xf: Any,
|
|
192
|
+
unit: Data,
|
|
193
|
+
comment_summary: _CommentSummary,
|
|
194
|
+
) -> None:
|
|
195
|
+
if unit.status != TranslationStatus.UNKNOWN:
|
|
196
|
+
_write_prop(xf, "x-status", unit.status.value)
|
|
197
|
+
|
|
198
|
+
if unit.previous_context is not None:
|
|
199
|
+
_write_prop_if_present(xf, "x-previous-id", unit.previous_context.unit_id)
|
|
200
|
+
_write_prop_if_present(xf, "x-previous-source-text", unit.previous_context.source)
|
|
201
|
+
_write_prop_if_present(xf, "x-previous-target-text", unit.previous_context.target)
|
|
202
|
+
|
|
203
|
+
if unit.next_context is not None:
|
|
204
|
+
_write_prop_if_present(xf, "x-next-id", unit.next_context.unit_id)
|
|
205
|
+
_write_prop_if_present(xf, "x-next-source-text", unit.next_context.source)
|
|
206
|
+
_write_prop_if_present(xf, "x-next-target-text", unit.next_context.target)
|
|
207
|
+
|
|
208
|
+
if comment_summary.project:
|
|
209
|
+
_write_prop(xf, "x-project", comment_summary.project)
|
|
210
|
+
|
|
211
|
+
if comment_summary.system:
|
|
212
|
+
_write_prop(xf, "x-system", comment_summary.system)
|
|
213
|
+
|
|
214
|
+
for key, value in unit.extensions.items():
|
|
215
|
+
if key.startswith("property."):
|
|
216
|
+
_write_prop_if_present(xf, _property_type(key), value)
|
|
217
|
+
|
|
218
|
+
|
|
183
219
|
def _append_comments(tu: _Element, unit: Data) -> None:
|
|
184
220
|
for comment in unit.comments:
|
|
185
221
|
if not comment.context:
|
|
@@ -188,6 +224,13 @@ def _append_comments(tu: _Element, unit: Data) -> None:
|
|
|
188
224
|
note.text = comment.context
|
|
189
225
|
|
|
190
226
|
|
|
227
|
+
def _write_comments(xf: Any, unit: Data) -> None:
|
|
228
|
+
for comment in unit.comments:
|
|
229
|
+
if comment.context:
|
|
230
|
+
with xf.element("note"):
|
|
231
|
+
xf.write(comment.context)
|
|
232
|
+
|
|
233
|
+
|
|
191
234
|
def _build_tuv(
|
|
192
235
|
locale: str,
|
|
193
236
|
text: str,
|
|
@@ -199,6 +242,17 @@ def _build_tuv(
|
|
|
199
242
|
return tuv
|
|
200
243
|
|
|
201
244
|
|
|
245
|
+
def _write_tuv(
|
|
246
|
+
xf: Any,
|
|
247
|
+
locale: str,
|
|
248
|
+
text: str,
|
|
249
|
+
parts: list[SegmentPart],
|
|
250
|
+
tag_map: dict[str, TieData],
|
|
251
|
+
) -> None:
|
|
252
|
+
with xf.element("tuv", lang=locale):
|
|
253
|
+
_write_seg(xf, text, parts, tag_map)
|
|
254
|
+
|
|
255
|
+
|
|
202
256
|
def _build_seg(
|
|
203
257
|
text: str,
|
|
204
258
|
parts: list[SegmentPart],
|
|
@@ -224,6 +278,25 @@ def _build_seg(
|
|
|
224
278
|
return seg
|
|
225
279
|
|
|
226
280
|
|
|
281
|
+
def _write_seg(
|
|
282
|
+
xf: Any,
|
|
283
|
+
text: str,
|
|
284
|
+
parts: list[SegmentPart],
|
|
285
|
+
tag_map: dict[str, TieData],
|
|
286
|
+
) -> None:
|
|
287
|
+
effective_parts = parts if parts else [TextPart(text)]
|
|
288
|
+
pair_numbers = _pair_numbers(tag_map)
|
|
289
|
+
with xf.element("seg"):
|
|
290
|
+
for part in effective_parts:
|
|
291
|
+
if isinstance(part, TextPart):
|
|
292
|
+
xf.write(part.value)
|
|
293
|
+
elif isinstance(part, CodePart):
|
|
294
|
+
code = tag_map.get(part.ref)
|
|
295
|
+
if code is None:
|
|
296
|
+
continue
|
|
297
|
+
xf.write(_build_code_element(code, pair_numbers))
|
|
298
|
+
|
|
299
|
+
|
|
227
300
|
def _build_code_element(code: TieData, pair_numbers: dict[str, str]) -> _Element:
|
|
228
301
|
if code.original_name in {"bpt", "ept", "ph", "it", "ut", "hi"}:
|
|
229
302
|
attrs = dict(code.attributes)
|
|
@@ -284,25 +357,35 @@ def _append_prop_if_present(tu: _Element, prop_type: str, value: str | None) ->
|
|
|
284
357
|
prop.text = value
|
|
285
358
|
|
|
286
359
|
|
|
287
|
-
def
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
return comment.origin.creator_id
|
|
291
|
-
return None
|
|
360
|
+
def _write_prop_if_present(xf: Any, prop_type: str, value: str | None) -> None:
|
|
361
|
+
if value is not None and value != "":
|
|
362
|
+
_write_prop(xf, prop_type, value)
|
|
292
363
|
|
|
293
364
|
|
|
294
|
-
def
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
return comment.origin.project
|
|
298
|
-
return None
|
|
365
|
+
def _write_prop(xf: Any, prop_type: str, value: str) -> None:
|
|
366
|
+
with xf.element("prop", type=prop_type):
|
|
367
|
+
xf.write(value)
|
|
299
368
|
|
|
300
369
|
|
|
301
|
-
def
|
|
370
|
+
def _comment_summary(unit: Data) -> _CommentSummary:
|
|
371
|
+
summary = _CommentSummary()
|
|
302
372
|
for comment in unit.comments:
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
373
|
+
origin = comment.origin
|
|
374
|
+
if origin is None:
|
|
375
|
+
continue
|
|
376
|
+
if summary.creator_id is None and origin.creator_id:
|
|
377
|
+
summary.creator_id = origin.creator_id
|
|
378
|
+
if summary.project is None and origin.project:
|
|
379
|
+
summary.project = origin.project
|
|
380
|
+
if summary.system is None and origin.system:
|
|
381
|
+
summary.system = origin.system
|
|
382
|
+
if (
|
|
383
|
+
summary.creator_id is not None
|
|
384
|
+
and summary.project is not None
|
|
385
|
+
and summary.system is not None
|
|
386
|
+
):
|
|
387
|
+
break
|
|
388
|
+
return summary
|
|
306
389
|
|
|
307
390
|
|
|
308
391
|
def _property_type(key: str) -> str:
|
|
@@ -90,38 +90,54 @@ def _write_file(
|
|
|
90
90
|
with xf.element(f"{{{XLIFF_NS}}}file", attrs):
|
|
91
91
|
xf.write(etree.Element(f"{{{XLIFF_NS}}}header"))
|
|
92
92
|
with xf.element(f"{{{XLIFF_NS}}}body"):
|
|
93
|
-
xf
|
|
93
|
+
_write_trans_unit(xf, first_id, first_unit)
|
|
94
94
|
for unit_id, unit in unit_iter:
|
|
95
|
-
xf
|
|
95
|
+
_write_trans_unit(xf, unit_id, unit)
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
def
|
|
98
|
+
def _write_trans_unit(xf: Any, unit_id: str, unit: Data) -> None:
|
|
99
99
|
attrs = {"id": unit.extensions.get("unit_id", unit_id)}
|
|
100
100
|
space = unit.extensions.get("space")
|
|
101
101
|
if space:
|
|
102
102
|
attrs["{http://www.w3.org/XML/1998/namespace}space"] = space
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
103
|
+
with xf.element(f"{{{XLIFF_NS}}}trans-unit", attrs):
|
|
104
|
+
_write_segment(
|
|
105
|
+
xf,
|
|
106
106
|
"source",
|
|
107
107
|
unit.source,
|
|
108
108
|
unit.tags.source_parts if unit.tags else [],
|
|
109
109
|
unit.tags.source_tag_map if unit.tags else {},
|
|
110
110
|
)
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
111
|
+
if unit.target is not None:
|
|
112
|
+
_write_segment(
|
|
113
|
+
xf,
|
|
114
|
+
"target",
|
|
115
|
+
unit.target,
|
|
116
|
+
unit.tags.target_parts if unit.tags else [],
|
|
117
|
+
unit.tags.target_tag_map if unit.tags else {},
|
|
118
|
+
)
|
|
119
|
+
for comment in unit.comments:
|
|
120
|
+
if comment.context:
|
|
121
|
+
with xf.element(f"{{{XLIFF_NS}}}note"):
|
|
122
|
+
xf.write(comment.context)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _write_segment(
|
|
126
|
+
xf: Any,
|
|
127
|
+
name: str,
|
|
128
|
+
text: str,
|
|
129
|
+
parts: list[SegmentPart],
|
|
130
|
+
tag_map: dict[str, TieData],
|
|
131
|
+
) -> None:
|
|
132
|
+
with xf.element(f"{{{XLIFF_NS}}}{name}"):
|
|
133
|
+
effective_parts = parts if parts else [TextPart(text)]
|
|
134
|
+
for part in effective_parts:
|
|
135
|
+
if isinstance(part, TextPart):
|
|
136
|
+
xf.write(part.value)
|
|
137
|
+
elif isinstance(part, CodePart):
|
|
138
|
+
code = tag_map.get(part.ref)
|
|
139
|
+
if code is not None:
|
|
140
|
+
xf.write(_build_code(code))
|
|
125
141
|
|
|
126
142
|
|
|
127
143
|
def _build_segment(
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import json
|
|
4
|
+
import re
|
|
4
5
|
import zipfile
|
|
5
6
|
from enum import StrEnum
|
|
6
7
|
from io import BytesIO
|
|
@@ -8,6 +9,8 @@ from pathlib import Path
|
|
|
8
9
|
|
|
9
10
|
from lokit.parsers.tmx.xml_utils import iterparse_safe, local_name
|
|
10
11
|
|
|
12
|
+
_JSON_FORMAT_RE = re.compile(r'"(?:format_version|data)"\s*:')
|
|
13
|
+
|
|
11
14
|
|
|
12
15
|
class LokitInputFormat(StrEnum):
|
|
13
16
|
TMX = "tmx"
|
|
@@ -36,9 +39,9 @@ def detect_format(filepath: str | Path) -> LokitInputFormat:
|
|
|
36
39
|
return LokitInputFormat.IDML
|
|
37
40
|
if suffix == ".json":
|
|
38
41
|
try:
|
|
39
|
-
with path.open("
|
|
40
|
-
data =
|
|
41
|
-
if
|
|
42
|
+
with path.open("rb") as f:
|
|
43
|
+
data = f.read(4096)
|
|
44
|
+
if _JSON_FORMAT_RE.search(data.decode("utf-8", errors="ignore")):
|
|
42
45
|
return LokitInputFormat.LOKIT_JSON
|
|
43
46
|
except Exception:
|
|
44
47
|
pass
|
|
@@ -7,7 +7,7 @@ from time import perf_counter
|
|
|
7
7
|
from lokit.data.structure import BaseStructure, Data, StreamingStructure, ConversionStats
|
|
8
8
|
from lokit.format_detection import LokitInputFormat, detect_format
|
|
9
9
|
from lokit.exporters import export_csv, export_tmx, export_xliff
|
|
10
|
-
from lokit.parsers.tmx.xml_utils import
|
|
10
|
+
from lokit.parsers.tmx.xml_utils import local_name
|
|
11
11
|
from lokit.parsers.csv.extraction import CsvExtractor
|
|
12
12
|
from lokit.parsers.xlsx.extraction import XlsxExtractor
|
|
13
13
|
from lokit.parsers.html.extraction import HtmlExtractor
|
|
@@ -62,6 +62,7 @@ def import_tmx_parallel(
|
|
|
62
62
|
parse_header=not (source_language and target_language),
|
|
63
63
|
mode=mode,
|
|
64
64
|
)
|
|
65
|
+
extractor._initialize_from_file()
|
|
65
66
|
parsed_data: dict[str, Data] = {
|
|
66
67
|
unit_id: data
|
|
67
68
|
for unit_id, data in extract_tmx_parallel(
|
|
@@ -93,6 +94,7 @@ def stream_tmx_parallel(
|
|
|
93
94
|
parse_header=not (source_language and target_language),
|
|
94
95
|
mode=mode,
|
|
95
96
|
)
|
|
97
|
+
extractor._initialize_from_file()
|
|
96
98
|
return StreamingStructure(
|
|
97
99
|
source_locale=extractor.source_locale or extractor.native_source,
|
|
98
100
|
target_locale=extractor.target_locale or extractor.native_target or None,
|
|
@@ -584,14 +586,38 @@ def _build_idml_structure(
|
|
|
584
586
|
|
|
585
587
|
|
|
586
588
|
def _validate_xml_root(filepath: str, expected: str) -> None:
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
)
|
|
594
|
-
|
|
589
|
+
with open(filepath, "rb") as f:
|
|
590
|
+
data = f.read(4096)
|
|
591
|
+
root = _peek_xml_root(data)
|
|
592
|
+
if root != expected:
|
|
593
|
+
found = root or "unknown"
|
|
594
|
+
raise ValueError(
|
|
595
|
+
f"Expected {expected.upper()} XML root in {filepath!r}, found {found!r}"
|
|
596
|
+
)
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
def _peek_xml_root(data: bytes) -> str:
|
|
600
|
+
index = 0
|
|
601
|
+
data_len = len(data)
|
|
602
|
+
while index < data_len:
|
|
603
|
+
start = data.find(b"<", index)
|
|
604
|
+
if start < 0 or start + 1 >= data_len:
|
|
605
|
+
return ""
|
|
606
|
+
marker = data[start + 1 : start + 2]
|
|
607
|
+
if marker in (b"?", b"!"):
|
|
608
|
+
end = data.find(b">", start + 1)
|
|
609
|
+
if end < 0:
|
|
610
|
+
return ""
|
|
611
|
+
index = end + 1
|
|
612
|
+
continue
|
|
613
|
+
end = start + 1
|
|
614
|
+
while end < data_len and data[end] not in b" />\t\r\n":
|
|
615
|
+
end += 1
|
|
616
|
+
raw = data[start + 1 : end].decode("utf-8", errors="ignore")
|
|
617
|
+
if ":" in raw:
|
|
618
|
+
raw = raw.rsplit(":", 1)[-1]
|
|
619
|
+
return local_name(raw).lower()
|
|
620
|
+
return ""
|
|
595
621
|
|
|
596
622
|
|
|
597
623
|
def _convert_tmx(
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from collections.abc import AsyncIterator, Iterable
|
|
5
|
+
from dataclasses import asdict, is_dataclass
|
|
6
|
+
from enum import StrEnum
|
|
7
|
+
from typing import TextIO
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from lokit.data.structure import Data
|
|
11
|
+
from lokit.format_detection import LokitInputFormat, detect_format
|
|
12
|
+
from lokit.io.atomic import atomic_output_path
|
|
13
|
+
from lokit.parsers.tmx.models import TmxParseMode
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LokitJsonContext(StrEnum):
|
|
17
|
+
SOURCE = "source"
|
|
18
|
+
TARGET = "target"
|
|
19
|
+
PLURAL = "plural"
|
|
20
|
+
TAGS = "tags"
|
|
21
|
+
META = "meta"
|
|
22
|
+
STATUS = "status"
|
|
23
|
+
COMMENTS = "comments"
|
|
24
|
+
PREVIOUS_CONTEXT = "previous_context"
|
|
25
|
+
NEXT_CONTEXT = "next_context"
|
|
26
|
+
EXTENSIONS = "extensions"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
DEFAULT_JSON_CONTEXT: tuple[LokitJsonContext, LokitJsonContext] = (
|
|
30
|
+
LokitJsonContext.SOURCE,
|
|
31
|
+
LokitJsonContext.TARGET,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def write_lokit_json_stream(
|
|
36
|
+
filepath: str | Path,
|
|
37
|
+
output: str | Path,
|
|
38
|
+
context: Iterable[LokitJsonContext | str] | None = None,
|
|
39
|
+
) -> Path:
|
|
40
|
+
input_path = Path(filepath)
|
|
41
|
+
output_path = _resolve_output_path(input_path, Path(output))
|
|
42
|
+
selected = _normalize_context(context)
|
|
43
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
44
|
+
input_format = detect_format(input_path)
|
|
45
|
+
|
|
46
|
+
with atomic_output_path(output_path, "w") as f:
|
|
47
|
+
if input_format is LokitInputFormat.TMX:
|
|
48
|
+
from lokit.parsers.tmx.extraction import TmxExtractor
|
|
49
|
+
|
|
50
|
+
for unit_id, data in TmxExtractor(
|
|
51
|
+
str(input_path),
|
|
52
|
+
mode=_tmx_mode(selected),
|
|
53
|
+
).extract():
|
|
54
|
+
_write_record(f, unit_id, data, selected)
|
|
55
|
+
else:
|
|
56
|
+
async for unit_id, data in _stream_units(input_path):
|
|
57
|
+
_write_record(f, unit_id, data, selected)
|
|
58
|
+
return output_path
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _resolve_output_path(input_path: Path, output: Path) -> Path:
|
|
62
|
+
if output.suffix:
|
|
63
|
+
return output
|
|
64
|
+
return output / f"{input_path.stem}.jsonl"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _normalize_context(
|
|
68
|
+
context: Iterable[LokitJsonContext | str] | None,
|
|
69
|
+
) -> tuple[LokitJsonContext, ...]:
|
|
70
|
+
if context is None:
|
|
71
|
+
return DEFAULT_JSON_CONTEXT
|
|
72
|
+
return tuple(_normalize_context_item(item) for item in context)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _normalize_context_item(item: LokitJsonContext | str) -> LokitJsonContext:
|
|
76
|
+
if isinstance(item, LokitJsonContext):
|
|
77
|
+
return item
|
|
78
|
+
return LokitJsonContext(item)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _write_record(
|
|
82
|
+
f: TextIO,
|
|
83
|
+
unit_id: str,
|
|
84
|
+
data: Data,
|
|
85
|
+
selected: tuple[LokitJsonContext, ...],
|
|
86
|
+
) -> None:
|
|
87
|
+
if selected == DEFAULT_JSON_CONTEXT:
|
|
88
|
+
dumps = json.dumps
|
|
89
|
+
f.write(
|
|
90
|
+
'{"id":'
|
|
91
|
+
+ dumps(unit_id, ensure_ascii=False, separators=(",", ":"), default=str)
|
|
92
|
+
+ ',"source":'
|
|
93
|
+
+ dumps(data.source, ensure_ascii=False, separators=(",", ":"), default=str)
|
|
94
|
+
+ ',"target":'
|
|
95
|
+
+ dumps(data.target, ensure_ascii=False, separators=(",", ":"), default=str)
|
|
96
|
+
+ "}\n"
|
|
97
|
+
)
|
|
98
|
+
return
|
|
99
|
+
record: dict[str, object] = {"id": unit_id}
|
|
100
|
+
for key in selected:
|
|
101
|
+
record[key.value] = _json_value(data, key)
|
|
102
|
+
json.dump(record, f, ensure_ascii=False, separators=(",", ":"), default=str)
|
|
103
|
+
f.write("\n")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _stream_units(input_path: Path) -> AsyncIterator[tuple[str, Data]]:
|
|
107
|
+
from lokit.importers import import_file_async
|
|
108
|
+
|
|
109
|
+
return import_file_async(str(input_path))
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _tmx_mode(selected: tuple[LokitJsonContext, ...]) -> TmxParseMode:
|
|
113
|
+
full_keys = {
|
|
114
|
+
LokitJsonContext.PLURAL,
|
|
115
|
+
LokitJsonContext.TAGS,
|
|
116
|
+
LokitJsonContext.META,
|
|
117
|
+
LokitJsonContext.COMMENTS,
|
|
118
|
+
LokitJsonContext.PREVIOUS_CONTEXT,
|
|
119
|
+
LokitJsonContext.NEXT_CONTEXT,
|
|
120
|
+
LokitJsonContext.EXTENSIONS,
|
|
121
|
+
}
|
|
122
|
+
if any(key in full_keys for key in selected):
|
|
123
|
+
return TmxParseMode.FULL
|
|
124
|
+
if LokitJsonContext.STATUS in selected:
|
|
125
|
+
return TmxParseMode.TEXT_WITH_STATUS
|
|
126
|
+
return TmxParseMode.TEXT
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _json_value(data: Data, key: LokitJsonContext) -> object:
|
|
130
|
+
if key is LokitJsonContext.SOURCE:
|
|
131
|
+
return data.source
|
|
132
|
+
if key is LokitJsonContext.TARGET:
|
|
133
|
+
return data.target
|
|
134
|
+
if key is LokitJsonContext.PLURAL:
|
|
135
|
+
return _to_jsonable(data.plural)
|
|
136
|
+
if key is LokitJsonContext.TAGS:
|
|
137
|
+
return _to_jsonable(data.tags)
|
|
138
|
+
if key is LokitJsonContext.META:
|
|
139
|
+
return _to_jsonable(data.meta)
|
|
140
|
+
if key is LokitJsonContext.STATUS:
|
|
141
|
+
return data.status.value
|
|
142
|
+
if key is LokitJsonContext.COMMENTS:
|
|
143
|
+
return _to_jsonable(data.comments)
|
|
144
|
+
if key is LokitJsonContext.PREVIOUS_CONTEXT:
|
|
145
|
+
return _to_jsonable(data.previous_context)
|
|
146
|
+
if key is LokitJsonContext.NEXT_CONTEXT:
|
|
147
|
+
return _to_jsonable(data.next_context)
|
|
148
|
+
return _to_jsonable(data.extensions)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _to_jsonable(value: object) -> object:
|
|
152
|
+
if is_dataclass(value) and not isinstance(value, type):
|
|
153
|
+
return asdict(value)
|
|
154
|
+
if isinstance(value, list):
|
|
155
|
+
return [_to_jsonable(item) for item in value]
|
|
156
|
+
if isinstance(value, dict):
|
|
157
|
+
return {str(key): _to_jsonable(item) for key, item in value.items()}
|
|
158
|
+
return value
|