lokit-python 0.1.1__tar.gz → 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lokit_python-0.1.1 → lokit_python-0.1.2}/PKG-INFO +1 -1
- {lokit_python-0.1.1 → lokit_python-0.1.2}/pyproject.toml +1 -1
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/__init__.py +12 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/importers.py +140 -3
- lokit_python-0.1.2/src/lokit/io/atomic.py +90 -0
- lokit_python-0.1.2/src/lokit/parsers/async_bridge.py +107 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/base.py +24 -4
- lokit_python-0.1.2/src/lokit/parsers/tmx/extraction.py +124 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/models.py +7 -0
- lokit_python-0.1.2/src/lokit/parsers/tmx/parallel.py +154 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/tags.py +7 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/xml_utils.py +5 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit_python.egg-info/PKG-INFO +1 -1
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit_python.egg-info/SOURCES.txt +1 -0
- lokit_python-0.1.2/src/lokit_python.egg-info/top_level.txt +2 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_performance_safety.py +71 -5
- lokit_python-0.1.1/src/lokit/io/atomic.py +0 -39
- lokit_python-0.1.1/src/lokit/parsers/async_bridge.py +0 -81
- lokit_python-0.1.1/src/lokit/parsers/tmx/extraction.py +0 -107
- lokit_python-0.1.1/src/lokit_python.egg-info/top_level.txt +0 -2
- {lokit_python-0.1.1 → lokit_python-0.1.2}/README.md +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/setup.cfg +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/setup.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/core/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/core/logger.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/data/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/data/lang_codes.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/data/structure.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/data/tag_types.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/csv.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/html.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/idml.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/json_i18n.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/po.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/tmx.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/xliff.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/xlsx.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/format_detection.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/io/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/io/json.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/logic.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/csv/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/csv/extraction.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/html/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/html/extraction.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/idml/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/idml/extraction.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/json_i18n/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/json_i18n/extraction.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/po/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/po/extraction.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/header.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/helpers.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/props.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xliff/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xliff/extraction.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xliff/tags.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xlsx/__init__.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xlsx/extraction.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/py.typed +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit_python.egg-info/dependency_links.txt +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit_python.egg-info/requires.txt +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_csv.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_html.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_idml.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_json_i18n.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_po.py +0 -0
- {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_xlsx.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: lokit-python
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files.
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "lokit-python"
|
|
3
|
-
version = "0.1.1"
|
|
3
|
+
version = "0.1.2"
|
|
4
4
|
description = "A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.12"
|
|
@@ -51,7 +51,11 @@ from lokit.importers import (
|
|
|
51
51
|
import_po_async,
|
|
52
52
|
import_tmx,
|
|
53
53
|
import_tmx_async,
|
|
54
|
+
import_tmx_batches_async,
|
|
55
|
+
import_tmx_parallel,
|
|
56
|
+
process_tmx_async,
|
|
54
57
|
stream_tmx,
|
|
58
|
+
stream_tmx_parallel,
|
|
55
59
|
convert_tmx_to_csv,
|
|
56
60
|
convert_tmx_to_tmx,
|
|
57
61
|
convert_tmx_to_xliff,
|
|
@@ -69,6 +73,8 @@ from lokit.parsers.po.extraction import PoExtractor
|
|
|
69
73
|
from lokit.parsers.json_i18n.extraction import JsonI18nExtractor
|
|
70
74
|
from lokit.parsers.idml.extraction import IdmlExtractor
|
|
71
75
|
from lokit.parsers.tmx.extraction import TmxExtractor
|
|
76
|
+
from lokit.parsers.tmx.models import TmxParseMode
|
|
77
|
+
from lokit.parsers.tmx.parallel import TmxParallelOptions
|
|
72
78
|
from lokit.parsers.xliff.extraction import XliffExtractor
|
|
73
79
|
|
|
74
80
|
__all__ = [
|
|
@@ -91,6 +97,8 @@ __all__ = [
|
|
|
91
97
|
"TieData",
|
|
92
98
|
"TieType",
|
|
93
99
|
"TmxExtractor",
|
|
100
|
+
"TmxParseMode",
|
|
101
|
+
"TmxParallelOptions",
|
|
94
102
|
"TranslationStatus",
|
|
95
103
|
"XliffExtractor",
|
|
96
104
|
"CsvExtractor",
|
|
@@ -131,7 +139,11 @@ __all__ = [
|
|
|
131
139
|
"import_po_async",
|
|
132
140
|
"import_tmx",
|
|
133
141
|
"import_tmx_async",
|
|
142
|
+
"import_tmx_batches_async",
|
|
143
|
+
"import_tmx_parallel",
|
|
144
|
+
"process_tmx_async",
|
|
134
145
|
"stream_tmx",
|
|
146
|
+
"stream_tmx_parallel",
|
|
135
147
|
"convert_tmx_to_csv",
|
|
136
148
|
"convert_tmx_to_tmx",
|
|
137
149
|
"convert_tmx_to_xliff",
|
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from collections.abc import AsyncIterator, Callable, Iterable, Iterator
|
|
3
|
+
from collections.abc import AsyncIterator, Awaitable, Callable, Iterable, Iterator
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
from time import perf_counter
|
|
6
|
-
from typing import Any
|
|
7
6
|
|
|
8
7
|
from lokit.data.structure import BaseStructure, Data, StreamingStructure, ConversionStats
|
|
9
8
|
from lokit.format_detection import LokitInputFormat, detect_format
|
|
@@ -15,15 +14,21 @@ from lokit.parsers.html.extraction import HtmlExtractor
|
|
|
15
14
|
from lokit.parsers.po.extraction import PoExtractor
|
|
16
15
|
from lokit.parsers.json_i18n.extraction import JsonI18nExtractor
|
|
17
16
|
from lokit.parsers.idml.extraction import IdmlExtractor
|
|
17
|
+
from lokit.parsers.async_bridge import AsyncExtractionBridge
|
|
18
18
|
from lokit.parsers.tmx.extraction import TmxExtractor
|
|
19
|
+
from lokit.parsers.tmx.models import TmxParseMode
|
|
20
|
+
from lokit.parsers.tmx.parallel import TmxParallelOptions, extract_tmx_parallel
|
|
19
21
|
from lokit.parsers.xliff.extraction import XliffExtractor
|
|
20
22
|
|
|
23
|
+
TmxBatch = list[tuple[str, Data]]
|
|
24
|
+
|
|
21
25
|
|
|
22
26
|
def import_tmx(
|
|
23
27
|
filepath: str,
|
|
24
28
|
source_language: str | None = None,
|
|
25
29
|
target_language: str | None = None,
|
|
26
30
|
domain: str | None = None,
|
|
31
|
+
mode: TmxParseMode = TmxParseMode.FULL,
|
|
27
32
|
) -> BaseStructure:
|
|
28
33
|
_validate_xml_root(filepath, "tmx")
|
|
29
34
|
extractor = TmxExtractor(
|
|
@@ -32,6 +37,7 @@ def import_tmx(
|
|
|
32
37
|
target_language=target_language,
|
|
33
38
|
domain=domain,
|
|
34
39
|
parse_header=not (source_language and target_language),
|
|
40
|
+
mode=mode,
|
|
35
41
|
)
|
|
36
42
|
parsed_data: dict[str, Data] = {
|
|
37
43
|
unit_id: data for unit_id, data in extractor.extract()
|
|
@@ -39,11 +45,77 @@ def import_tmx(
|
|
|
39
45
|
return _build_tmx_structure(extractor, parsed_data)
|
|
40
46
|
|
|
41
47
|
|
|
48
|
+
def import_tmx_parallel(
|
|
49
|
+
filepath: str,
|
|
50
|
+
source_language: str | None = None,
|
|
51
|
+
target_language: str | None = None,
|
|
52
|
+
domain: str | None = None,
|
|
53
|
+
mode: TmxParseMode = TmxParseMode.FULL,
|
|
54
|
+
options: TmxParallelOptions | None = None,
|
|
55
|
+
) -> BaseStructure:
|
|
56
|
+
_validate_xml_root(filepath, "tmx")
|
|
57
|
+
extractor = TmxExtractor(
|
|
58
|
+
filepath=filepath,
|
|
59
|
+
source_language=source_language,
|
|
60
|
+
target_language=target_language,
|
|
61
|
+
domain=domain,
|
|
62
|
+
parse_header=not (source_language and target_language),
|
|
63
|
+
mode=mode,
|
|
64
|
+
)
|
|
65
|
+
parsed_data: dict[str, Data] = {
|
|
66
|
+
unit_id: data
|
|
67
|
+
for unit_id, data in extract_tmx_parallel(
|
|
68
|
+
filepath=filepath,
|
|
69
|
+
source_language=extractor.native_source,
|
|
70
|
+
target_language=extractor.native_target,
|
|
71
|
+
domain=domain,
|
|
72
|
+
mode=mode,
|
|
73
|
+
options=options,
|
|
74
|
+
)
|
|
75
|
+
}
|
|
76
|
+
return _build_tmx_structure(extractor, parsed_data)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def stream_tmx_parallel(
|
|
80
|
+
filepath: str,
|
|
81
|
+
source_language: str | None = None,
|
|
82
|
+
target_language: str | None = None,
|
|
83
|
+
domain: str | None = None,
|
|
84
|
+
mode: TmxParseMode = TmxParseMode.FULL,
|
|
85
|
+
options: TmxParallelOptions | None = None,
|
|
86
|
+
) -> StreamingStructure:
|
|
87
|
+
_validate_xml_root(filepath, "tmx")
|
|
88
|
+
extractor = TmxExtractor(
|
|
89
|
+
filepath=filepath,
|
|
90
|
+
source_language=source_language,
|
|
91
|
+
target_language=target_language,
|
|
92
|
+
domain=domain,
|
|
93
|
+
parse_header=not (source_language and target_language),
|
|
94
|
+
mode=mode,
|
|
95
|
+
)
|
|
96
|
+
return StreamingStructure(
|
|
97
|
+
source_locale=extractor.source_locale or extractor.native_source,
|
|
98
|
+
target_locale=extractor.target_locale or extractor.native_target or None,
|
|
99
|
+
items=extract_tmx_parallel(
|
|
100
|
+
filepath=filepath,
|
|
101
|
+
source_language=extractor.native_source,
|
|
102
|
+
target_language=extractor.native_target,
|
|
103
|
+
domain=domain,
|
|
104
|
+
mode=mode,
|
|
105
|
+
options=options,
|
|
106
|
+
),
|
|
107
|
+
source_language=extractor.source_language,
|
|
108
|
+
target_language=extractor.target_language,
|
|
109
|
+
extensions=extractor.extensions,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
|
|
42
113
|
async def import_tmx_async(
|
|
43
114
|
filepath: str,
|
|
44
115
|
source_language: str | None = None,
|
|
45
116
|
target_language: str | None = None,
|
|
46
117
|
domain: str | None = None,
|
|
118
|
+
mode: TmxParseMode = TmxParseMode.FULL,
|
|
47
119
|
) -> AsyncIterator[tuple[str, Data]]:
|
|
48
120
|
_validate_xml_root(filepath, "tmx")
|
|
49
121
|
extractor = TmxExtractor(
|
|
@@ -52,11 +124,74 @@ async def import_tmx_async(
|
|
|
52
124
|
target_language=target_language,
|
|
53
125
|
domain=domain,
|
|
54
126
|
parse_header=not (source_language and target_language),
|
|
127
|
+
mode=mode,
|
|
55
128
|
)
|
|
56
129
|
async for unit_id, data in extractor.extract_async():
|
|
57
130
|
yield unit_id, data
|
|
58
131
|
|
|
59
132
|
|
|
133
|
+
async def import_tmx_batches_async(
|
|
134
|
+
filepath: str,
|
|
135
|
+
source_language: str | None = None,
|
|
136
|
+
target_language: str | None = None,
|
|
137
|
+
domain: str | None = None,
|
|
138
|
+
*,
|
|
139
|
+
batch_size: int = 1000,
|
|
140
|
+
mode: TmxParseMode = TmxParseMode.FULL,
|
|
141
|
+
) -> AsyncIterator[TmxBatch]:
|
|
142
|
+
_validate_xml_root(filepath, "tmx")
|
|
143
|
+
extractor = TmxExtractor(
|
|
144
|
+
filepath=filepath,
|
|
145
|
+
source_language=source_language,
|
|
146
|
+
target_language=target_language,
|
|
147
|
+
domain=domain,
|
|
148
|
+
parse_header=not (source_language and target_language),
|
|
149
|
+
mode=mode,
|
|
150
|
+
)
|
|
151
|
+
async for batch in AsyncExtractionBridge(
|
|
152
|
+
lambda: _iter_batches(extractor.extract(), batch_size),
|
|
153
|
+
batch_size=1,
|
|
154
|
+
):
|
|
155
|
+
yield batch
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _iter_batches(
|
|
159
|
+
items: Iterator[tuple[str, Data]],
|
|
160
|
+
batch_size: int,
|
|
161
|
+
) -> Iterator[TmxBatch]:
|
|
162
|
+
if batch_size < 1:
|
|
163
|
+
raise ValueError("batch_size must be at least 1")
|
|
164
|
+
batch: TmxBatch = []
|
|
165
|
+
for item in items:
|
|
166
|
+
batch.append(item)
|
|
167
|
+
if len(batch) >= batch_size:
|
|
168
|
+
yield batch
|
|
169
|
+
batch = []
|
|
170
|
+
if batch:
|
|
171
|
+
yield batch
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
async def process_tmx_async(
|
|
175
|
+
filepath: str,
|
|
176
|
+
callback: Callable[[TmxBatch], Awaitable[None]],
|
|
177
|
+
source_language: str | None = None,
|
|
178
|
+
target_language: str | None = None,
|
|
179
|
+
domain: str | None = None,
|
|
180
|
+
*,
|
|
181
|
+
batch_size: int = 1000,
|
|
182
|
+
mode: TmxParseMode = TmxParseMode.FULL,
|
|
183
|
+
) -> None:
|
|
184
|
+
async for batch in import_tmx_batches_async(
|
|
185
|
+
filepath,
|
|
186
|
+
source_language=source_language,
|
|
187
|
+
target_language=target_language,
|
|
188
|
+
domain=domain,
|
|
189
|
+
batch_size=batch_size,
|
|
190
|
+
mode=mode,
|
|
191
|
+
):
|
|
192
|
+
await callback(batch)
|
|
193
|
+
|
|
194
|
+
|
|
60
195
|
def import_xliff(filepath: str) -> BaseStructure:
|
|
61
196
|
_validate_xml_root(filepath, "xliff")
|
|
62
197
|
extractor = XliffExtractor(filepath)
|
|
@@ -138,6 +273,7 @@ def stream_tmx(
|
|
|
138
273
|
filepath: str,
|
|
139
274
|
source_language: str | None = None,
|
|
140
275
|
target_language: str | None = None,
|
|
276
|
+
mode: TmxParseMode = TmxParseMode.FULL,
|
|
141
277
|
) -> StreamingStructure:
|
|
142
278
|
_validate_xml_root(filepath, "tmx")
|
|
143
279
|
extractor = TmxExtractor(
|
|
@@ -145,6 +281,7 @@ def stream_tmx(
|
|
|
145
281
|
source_language=source_language,
|
|
146
282
|
target_language=target_language,
|
|
147
283
|
parse_header=not (source_language and target_language),
|
|
284
|
+
mode=mode,
|
|
148
285
|
)
|
|
149
286
|
return StreamingStructure(
|
|
150
287
|
source_locale=extractor.source_locale or extractor.native_source,
|
|
@@ -460,7 +597,7 @@ def _validate_xml_root(filepath: str, expected: str) -> None:
|
|
|
460
597
|
def _convert_tmx(
|
|
461
598
|
source_path: str,
|
|
462
599
|
target_path: str,
|
|
463
|
-
exporter: Callable[[
|
|
600
|
+
exporter: Callable[[StreamingStructure, str], None],
|
|
464
601
|
source_language: str | None,
|
|
465
602
|
target_language: str | None,
|
|
466
603
|
) -> ConversionStats:
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
from collections.abc import Iterator
|
|
7
|
+
from contextlib import AbstractContextManager, contextmanager
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import BinaryIO, Literal, TextIO, cast, overload
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@overload
|
|
13
|
+
def atomic_output_path(
|
|
14
|
+
path: Path,
|
|
15
|
+
mode: Literal[
|
|
16
|
+
"w",
|
|
17
|
+
"wt",
|
|
18
|
+
"w+",
|
|
19
|
+
"wt+",
|
|
20
|
+
"a",
|
|
21
|
+
"at",
|
|
22
|
+
"a+",
|
|
23
|
+
"at+",
|
|
24
|
+
"x",
|
|
25
|
+
"xt",
|
|
26
|
+
"x+",
|
|
27
|
+
"xt+",
|
|
28
|
+
],
|
|
29
|
+
) -> AbstractContextManager[TextIO]: ...
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@overload
|
|
33
|
+
def atomic_output_path(
|
|
34
|
+
path: Path,
|
|
35
|
+
mode: Literal[
|
|
36
|
+
"wb",
|
|
37
|
+
"w+b",
|
|
38
|
+
"wb+",
|
|
39
|
+
"ab",
|
|
40
|
+
"a+b",
|
|
41
|
+
"ab+",
|
|
42
|
+
"xb",
|
|
43
|
+
"x+b",
|
|
44
|
+
"xb+",
|
|
45
|
+
] = "wb",
|
|
46
|
+
) -> AbstractContextManager[BinaryIO]: ...
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@overload
|
|
50
|
+
def atomic_output_path(
|
|
51
|
+
path: Path,
|
|
52
|
+
mode: str,
|
|
53
|
+
) -> AbstractContextManager[BinaryIO | TextIO]: ...
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def atomic_output_path(
|
|
57
|
+
path: Path,
|
|
58
|
+
mode: str = "wb",
|
|
59
|
+
) -> AbstractContextManager[BinaryIO | TextIO]:
|
|
60
|
+
return _atomic_output_path(path, mode)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@contextmanager
|
|
64
|
+
def _atomic_output_path(path: Path, mode: str) -> Iterator[BinaryIO | TextIO]:
|
|
65
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
66
|
+
tmp = tempfile.NamedTemporaryFile(
|
|
67
|
+
mode=mode,
|
|
68
|
+
dir=path.parent,
|
|
69
|
+
prefix=f".{path.name}.",
|
|
70
|
+
suffix=".tmp",
|
|
71
|
+
delete=False,
|
|
72
|
+
)
|
|
73
|
+
tmp_path = Path(tmp.name)
|
|
74
|
+
try:
|
|
75
|
+
with tmp:
|
|
76
|
+
yield cast(BinaryIO | TextIO, tmp)
|
|
77
|
+
tmp.flush()
|
|
78
|
+
os.fsync(tmp.fileno())
|
|
79
|
+
os.replace(tmp_path, path)
|
|
80
|
+
directory_flag = getattr(os, "O_DIRECTORY", None)
|
|
81
|
+
if directory_flag is not None:
|
|
82
|
+
dir_fd = os.open(path.parent, directory_flag)
|
|
83
|
+
try:
|
|
84
|
+
os.fsync(dir_fd)
|
|
85
|
+
finally:
|
|
86
|
+
os.close(dir_fd)
|
|
87
|
+
except BaseException:
|
|
88
|
+
with contextlib.suppress(FileNotFoundError):
|
|
89
|
+
tmp_path.unlink()
|
|
90
|
+
raise
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import threading
|
|
5
|
+
from collections.abc import Callable, Iterator
|
|
6
|
+
from typing import Generic, TypeVar
|
|
7
|
+
|
|
8
|
+
T = TypeVar("T")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AsyncExtractionBatch(Generic[T]):
|
|
12
|
+
__slots__ = ("done", "error", "items")
|
|
13
|
+
|
|
14
|
+
def __init__(
|
|
15
|
+
self,
|
|
16
|
+
items: list[T] | None = None,
|
|
17
|
+
error: BaseException | None = None,
|
|
18
|
+
done: bool = False,
|
|
19
|
+
) -> None:
|
|
20
|
+
self.items = items
|
|
21
|
+
self.error = error
|
|
22
|
+
self.done = done
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AsyncExtractionBridge(Generic[T]):
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
iterator_factory: Callable[[], Iterator[T]],
|
|
29
|
+
maxsize: int = 4,
|
|
30
|
+
batch_size: int = 1000,
|
|
31
|
+
) -> None:
|
|
32
|
+
if maxsize < 1:
|
|
33
|
+
raise ValueError("maxsize must be at least 1")
|
|
34
|
+
if batch_size < 1:
|
|
35
|
+
raise ValueError("batch_size must be at least 1")
|
|
36
|
+
self._iterator_factory = iterator_factory
|
|
37
|
+
self._queue: asyncio.Queue[AsyncExtractionBatch[T]] = asyncio.Queue(
|
|
38
|
+
maxsize=maxsize
|
|
39
|
+
)
|
|
40
|
+
self._batch_size = batch_size
|
|
41
|
+
self._current_batch: list[T] = []
|
|
42
|
+
self._batch_index = 0
|
|
43
|
+
self._stop = threading.Event()
|
|
44
|
+
self._producer: asyncio.Task[None] | None = None
|
|
45
|
+
|
|
46
|
+
def __aiter__(self) -> AsyncExtractionBridge[T]:
|
|
47
|
+
return self
|
|
48
|
+
|
|
49
|
+
async def __anext__(self) -> T:
|
|
50
|
+
if self._producer is None:
|
|
51
|
+
self._start()
|
|
52
|
+
|
|
53
|
+
while self._batch_index >= len(self._current_batch):
|
|
54
|
+
result = await self._queue.get()
|
|
55
|
+
if result.done:
|
|
56
|
+
await self.aclose()
|
|
57
|
+
raise StopAsyncIteration
|
|
58
|
+
if result.error is not None:
|
|
59
|
+
await self.aclose()
|
|
60
|
+
raise result.error
|
|
61
|
+
if result.items is None:
|
|
62
|
+
await self.aclose()
|
|
63
|
+
raise StopAsyncIteration
|
|
64
|
+
self._current_batch = result.items
|
|
65
|
+
self._batch_index = 0
|
|
66
|
+
|
|
67
|
+
item = self._current_batch[self._batch_index]
|
|
68
|
+
self._batch_index += 1
|
|
69
|
+
return item
|
|
70
|
+
|
|
71
|
+
async def aclose(self) -> None:
|
|
72
|
+
self._stop.set()
|
|
73
|
+
if self._producer is not None:
|
|
74
|
+
await self._producer
|
|
75
|
+
self._producer = None
|
|
76
|
+
|
|
77
|
+
def _start(self) -> None:
|
|
78
|
+
loop = asyncio.get_running_loop()
|
|
79
|
+
|
|
80
|
+
def produce() -> None:
|
|
81
|
+
try:
|
|
82
|
+
batch: list[T] = []
|
|
83
|
+
for item in self._iterator_factory():
|
|
84
|
+
if self._stop.is_set():
|
|
85
|
+
break
|
|
86
|
+
batch.append(item)
|
|
87
|
+
if len(batch) >= self._batch_size:
|
|
88
|
+
self._put(loop, AsyncExtractionBatch(items=batch))
|
|
89
|
+
batch = []
|
|
90
|
+
if batch:
|
|
91
|
+
self._put(loop, AsyncExtractionBatch(items=batch))
|
|
92
|
+
except BaseException as exc:
|
|
93
|
+
self._put(loop, AsyncExtractionBatch(error=exc))
|
|
94
|
+
finally:
|
|
95
|
+
self._put(loop, AsyncExtractionBatch(done=True))
|
|
96
|
+
|
|
97
|
+
self._producer = asyncio.create_task(asyncio.to_thread(produce))
|
|
98
|
+
|
|
99
|
+
def _put(
|
|
100
|
+
self,
|
|
101
|
+
loop: asyncio.AbstractEventLoop,
|
|
102
|
+
result: AsyncExtractionBatch[T],
|
|
103
|
+
) -> None:
|
|
104
|
+
if self._stop.is_set() and not result.done:
|
|
105
|
+
return
|
|
106
|
+
future = asyncio.run_coroutine_threadsafe(self._queue.put(result), loop)
|
|
107
|
+
future.result()
|
|
@@ -5,7 +5,12 @@ from lxml import etree
|
|
|
5
5
|
from lokit.core.logger import logger
|
|
6
6
|
from lokit.parsers.tmx.header import TmxHeaderParser
|
|
7
7
|
from lokit.parsers.tmx.models import HeaderData
|
|
8
|
-
from lokit.parsers.tmx.xml_utils import
|
|
8
|
+
from lokit.parsers.tmx.xml_utils import (
|
|
9
|
+
clear_element,
|
|
10
|
+
element_children,
|
|
11
|
+
iterparse_safe,
|
|
12
|
+
local_name,
|
|
13
|
+
)
|
|
9
14
|
|
|
10
15
|
|
|
11
16
|
class TmxParser:
|
|
@@ -37,6 +42,9 @@ class TmxParser:
|
|
|
37
42
|
if parse_header:
|
|
38
43
|
self._initialize_from_file()
|
|
39
44
|
self._validate_and_set_languages()
|
|
45
|
+
self.native_source_base: str = self._base_lang(self.native_source)
|
|
46
|
+
self.native_target_base: str = self._base_lang(self.native_target)
|
|
47
|
+
self._lang_base_cache: dict[str, str] = {}
|
|
40
48
|
|
|
41
49
|
def _initialize_from_file(self) -> None:
|
|
42
50
|
context = iterparse_safe(self.filepath, events=("end",))
|
|
@@ -82,9 +90,21 @@ class TmxParser:
|
|
|
82
90
|
def _compare_base_lang(self, lang1: str, lang2: str) -> bool:
|
|
83
91
|
if not lang1 or not lang2:
|
|
84
92
|
return False
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
93
|
+
return self._base_lang(lang1) == self._base_lang(lang2)
|
|
94
|
+
|
|
95
|
+
def _base_lang(self, lang: str) -> str:
|
|
96
|
+
if not lang:
|
|
97
|
+
return ""
|
|
98
|
+
normalized = lang.replace("_", "-")
|
|
99
|
+
return normalized.split("-", 1)[0].lower()
|
|
100
|
+
|
|
101
|
+
def _cached_base_lang(self, lang: str) -> str:
|
|
102
|
+
cached = self._lang_base_cache.get(lang)
|
|
103
|
+
if cached is not None:
|
|
104
|
+
return cached
|
|
105
|
+
base_lang = self._base_lang(lang)
|
|
106
|
+
self._lang_base_cache[lang] = base_lang
|
|
107
|
+
return base_lang
|
|
88
108
|
|
|
89
109
|
def _initialize_missing_languages_from_tu(self, element: etree._Element) -> None:
|
|
90
110
|
langs: list[str] = []
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
4
|
+
from uuid import uuid4
|
|
5
|
+
|
|
6
|
+
from lxml.etree import _Element
|
|
7
|
+
|
|
8
|
+
from lokit.data.structure import Data, Meta, SegmentPart, Tags, TranslationStatus
|
|
9
|
+
from lokit.data.tag_types import TieData
|
|
10
|
+
from lokit.parsers.async_bridge import AsyncExtractionBridge
|
|
11
|
+
from lokit.parsers.tmx.base import TmxParser
|
|
12
|
+
from lokit.parsers.tmx.models import TmxParseMode
|
|
13
|
+
from lokit.parsers.tmx.props import ParsedTmxProps, TmxProps
|
|
14
|
+
from lokit.parsers.tmx.tags import TmxTagParser
|
|
15
|
+
from lokit.parsers.tmx.xml_utils import (
|
|
16
|
+
clear_element,
|
|
17
|
+
is_tag,
|
|
18
|
+
iterparse_safe,
|
|
19
|
+
local_name,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
ExtractItem = tuple[str, Data]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TmxExtractor(TmxParser):
|
|
26
|
+
def __init__(
|
|
27
|
+
self,
|
|
28
|
+
filepath: str,
|
|
29
|
+
source_language: Optional[str] = None,
|
|
30
|
+
target_language: Optional[str] = None,
|
|
31
|
+
domain: Optional[str] = None,
|
|
32
|
+
parse_header: bool = True,
|
|
33
|
+
mode: TmxParseMode = TmxParseMode.FULL,
|
|
34
|
+
) -> None:
|
|
35
|
+
super().__init__(
|
|
36
|
+
tmx_file_path=filepath,
|
|
37
|
+
source_language=source_language,
|
|
38
|
+
target_language=target_language,
|
|
39
|
+
domain=domain,
|
|
40
|
+
parse_header=parse_header,
|
|
41
|
+
)
|
|
42
|
+
self.tag_parser: TmxTagParser = TmxTagParser()
|
|
43
|
+
self.prop_parser: TmxProps = TmxProps()
|
|
44
|
+
self.namespace: str = "{http://www.w3.org/XML/1998/namespace}"
|
|
45
|
+
self.mode = mode
|
|
46
|
+
|
|
47
|
+
def extract(self) -> Iterator[tuple[str, Data]]:
|
|
48
|
+
with open(self.filepath, "rb") as stream:
|
|
49
|
+
context = iterparse_safe(stream, events=("end",))
|
|
50
|
+
|
|
51
|
+
for _, elem in context:
|
|
52
|
+
if local_name(elem.tag) != "tu":
|
|
53
|
+
continue
|
|
54
|
+
|
|
55
|
+
yield self.extract_element(elem)
|
|
56
|
+
|
|
57
|
+
clear_element(elem)
|
|
58
|
+
|
|
59
|
+
def extract_element(self, elem: _Element) -> tuple[str, Data]:
|
|
60
|
+
unit_id: str = elem.attrib.get("tuid") or str(uuid4())
|
|
61
|
+
|
|
62
|
+
props: ParsedTmxProps | None = None
|
|
63
|
+
status = TranslationStatus.UNKNOWN
|
|
64
|
+
if self.mode is TmxParseMode.FULL:
|
|
65
|
+
props = self.prop_parser.parse_all(elem)
|
|
66
|
+
status = props.status
|
|
67
|
+
elif self.mode is TmxParseMode.TEXT_WITH_STATUS:
|
|
68
|
+
status = self.prop_parser.parse_status(elem)
|
|
69
|
+
|
|
70
|
+
source_text: str = ""
|
|
71
|
+
target_text: str = ""
|
|
72
|
+
source_tags: dict[str, TieData] | None = None
|
|
73
|
+
target_tags: dict[str, TieData] | None = None
|
|
74
|
+
source_parts: list[SegmentPart] | None = None
|
|
75
|
+
target_parts: list[SegmentPart] | None = None
|
|
76
|
+
|
|
77
|
+
for tuv in elem:
|
|
78
|
+
if not is_tag(tuv, "tuv"):
|
|
79
|
+
continue
|
|
80
|
+
lang: str = tuv.get(f"{self.namespace}lang") or tuv.get("lang") or ""
|
|
81
|
+
seg: _Element | None = None
|
|
82
|
+
for tuv_child in tuv:
|
|
83
|
+
if is_tag(tuv_child, "seg"):
|
|
84
|
+
seg = tuv_child
|
|
85
|
+
break
|
|
86
|
+
|
|
87
|
+
if seg is not None:
|
|
88
|
+
text, tags, parts = self.tag_parser.parse_fast(seg)
|
|
89
|
+
|
|
90
|
+
if self._cached_base_lang(lang) == self.native_source_base:
|
|
91
|
+
source_text = text
|
|
92
|
+
source_tags = tags
|
|
93
|
+
source_parts = parts
|
|
94
|
+
else:
|
|
95
|
+
target_text = text
|
|
96
|
+
target_tags = tags
|
|
97
|
+
target_parts = parts
|
|
98
|
+
|
|
99
|
+
tags_obj: Tags | None = None
|
|
100
|
+
if source_tags is not None or target_tags is not None:
|
|
101
|
+
tags_obj = Tags(
|
|
102
|
+
source_tag_map=source_tags or {},
|
|
103
|
+
target_tag_map=target_tags or {},
|
|
104
|
+
source_parts=source_parts or [],
|
|
105
|
+
target_parts=target_parts or [],
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
data_obj = Data(
|
|
109
|
+
source=source_text,
|
|
110
|
+
target=target_text if target_text else None,
|
|
111
|
+
plural=None,
|
|
112
|
+
tags=tags_obj,
|
|
113
|
+
meta=props.meta if props is not None else Meta(),
|
|
114
|
+
status=status,
|
|
115
|
+
comments=props.comments if props is not None else [],
|
|
116
|
+
previous_context=(props.previous_context if props is not None else None),
|
|
117
|
+
next_context=props.next_context if props is not None else None,
|
|
118
|
+
extensions=props.extensions if props is not None else {},
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
return unit_id, data_obj
|
|
122
|
+
|
|
123
|
+
def extract_async(self) -> AsyncIterator[ExtractItem]:
|
|
124
|
+
return AsyncExtractionBridge(self.extract)
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from dataclasses import dataclass
|
|
2
|
+
from enum import StrEnum
|
|
2
3
|
|
|
3
4
|
|
|
4
5
|
@dataclass
|
|
@@ -8,3 +9,9 @@ class HeaderData:
|
|
|
8
9
|
srclang: str
|
|
9
10
|
tgtlang: str
|
|
10
11
|
extensions: dict[str, str]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TmxParseMode(StrEnum):
|
|
15
|
+
FULL = "full"
|
|
16
|
+
TEXT = "text"
|
|
17
|
+
TEXT_WITH_STATUS = "text_status"
|