lokit-python 0.1.1__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. {lokit_python-0.1.1 → lokit_python-0.1.2}/PKG-INFO +1 -1
  2. {lokit_python-0.1.1 → lokit_python-0.1.2}/pyproject.toml +1 -1
  3. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/__init__.py +12 -0
  4. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/importers.py +140 -3
  5. lokit_python-0.1.2/src/lokit/io/atomic.py +90 -0
  6. lokit_python-0.1.2/src/lokit/parsers/async_bridge.py +107 -0
  7. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/base.py +24 -4
  8. lokit_python-0.1.2/src/lokit/parsers/tmx/extraction.py +124 -0
  9. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/models.py +7 -0
  10. lokit_python-0.1.2/src/lokit/parsers/tmx/parallel.py +154 -0
  11. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/tags.py +7 -0
  12. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/xml_utils.py +5 -0
  13. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit_python.egg-info/PKG-INFO +1 -1
  14. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit_python.egg-info/SOURCES.txt +1 -0
  15. lokit_python-0.1.2/src/lokit_python.egg-info/top_level.txt +2 -0
  16. {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_performance_safety.py +71 -5
  17. lokit_python-0.1.1/src/lokit/io/atomic.py +0 -39
  18. lokit_python-0.1.1/src/lokit/parsers/async_bridge.py +0 -81
  19. lokit_python-0.1.1/src/lokit/parsers/tmx/extraction.py +0 -107
  20. lokit_python-0.1.1/src/lokit_python.egg-info/top_level.txt +0 -2
  21. {lokit_python-0.1.1 → lokit_python-0.1.2}/README.md +0 -0
  22. {lokit_python-0.1.1 → lokit_python-0.1.2}/setup.cfg +0 -0
  23. {lokit_python-0.1.1 → lokit_python-0.1.2}/setup.py +0 -0
  24. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/core/__init__.py +0 -0
  25. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/core/logger.py +0 -0
  26. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/data/__init__.py +0 -0
  27. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/data/lang_codes.py +0 -0
  28. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/data/structure.py +0 -0
  29. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/data/tag_types.py +0 -0
  30. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/__init__.py +0 -0
  31. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/csv.py +0 -0
  32. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/html.py +0 -0
  33. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/idml.py +0 -0
  34. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/json_i18n.py +0 -0
  35. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/po.py +0 -0
  36. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/tmx.py +0 -0
  37. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/xliff.py +0 -0
  38. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/exporters/xlsx.py +0 -0
  39. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/format_detection.py +0 -0
  40. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/io/__init__.py +0 -0
  41. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/io/json.py +0 -0
  42. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/logic.py +0 -0
  43. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/__init__.py +0 -0
  44. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/csv/__init__.py +0 -0
  45. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/csv/extraction.py +0 -0
  46. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/html/__init__.py +0 -0
  47. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/html/extraction.py +0 -0
  48. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/idml/__init__.py +0 -0
  49. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/idml/extraction.py +0 -0
  50. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/json_i18n/__init__.py +0 -0
  51. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/json_i18n/extraction.py +0 -0
  52. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/po/__init__.py +0 -0
  53. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/po/extraction.py +0 -0
  54. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/__init__.py +0 -0
  55. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/header.py +0 -0
  56. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/helpers.py +0 -0
  57. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/tmx/props.py +0 -0
  58. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xliff/__init__.py +0 -0
  59. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xliff/extraction.py +0 -0
  60. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xliff/tags.py +0 -0
  61. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xlsx/__init__.py +0 -0
  62. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/parsers/xlsx/extraction.py +0 -0
  63. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit/py.typed +0 -0
  64. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit_python.egg-info/dependency_links.txt +0 -0
  65. {lokit_python-0.1.1 → lokit_python-0.1.2}/src/lokit_python.egg-info/requires.txt +0 -0
  66. {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_csv.py +0 -0
  67. {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_html.py +0 -0
  68. {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_idml.py +0 -0
  69. {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_json_i18n.py +0 -0
  70. {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_po.py +0 -0
  71. {lokit_python-0.1.1 → lokit_python-0.1.2}/tests/test_xlsx.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lokit-python
3
- Version: 0.1.1
3
+ Version: 0.1.2
4
4
  Summary: A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files.
5
5
  Requires-Python: >=3.12
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "lokit-python"
3
- version = "0.1.1"
3
+ version = "0.1.2"
4
4
  description = "A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -51,7 +51,11 @@ from lokit.importers import (
51
51
  import_po_async,
52
52
  import_tmx,
53
53
  import_tmx_async,
54
+ import_tmx_batches_async,
55
+ import_tmx_parallel,
56
+ process_tmx_async,
54
57
  stream_tmx,
58
+ stream_tmx_parallel,
55
59
  convert_tmx_to_csv,
56
60
  convert_tmx_to_tmx,
57
61
  convert_tmx_to_xliff,
@@ -69,6 +73,8 @@ from lokit.parsers.po.extraction import PoExtractor
69
73
  from lokit.parsers.json_i18n.extraction import JsonI18nExtractor
70
74
  from lokit.parsers.idml.extraction import IdmlExtractor
71
75
  from lokit.parsers.tmx.extraction import TmxExtractor
76
+ from lokit.parsers.tmx.models import TmxParseMode
77
+ from lokit.parsers.tmx.parallel import TmxParallelOptions
72
78
  from lokit.parsers.xliff.extraction import XliffExtractor
73
79
 
74
80
  __all__ = [
@@ -91,6 +97,8 @@ __all__ = [
91
97
  "TieData",
92
98
  "TieType",
93
99
  "TmxExtractor",
100
+ "TmxParseMode",
101
+ "TmxParallelOptions",
94
102
  "TranslationStatus",
95
103
  "XliffExtractor",
96
104
  "CsvExtractor",
@@ -131,7 +139,11 @@ __all__ = [
131
139
  "import_po_async",
132
140
  "import_tmx",
133
141
  "import_tmx_async",
142
+ "import_tmx_batches_async",
143
+ "import_tmx_parallel",
144
+ "process_tmx_async",
134
145
  "stream_tmx",
146
+ "stream_tmx_parallel",
135
147
  "convert_tmx_to_csv",
136
148
  "convert_tmx_to_tmx",
137
149
  "convert_tmx_to_xliff",
@@ -1,9 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
- from collections.abc import AsyncIterator, Callable, Iterable, Iterator
3
+ from collections.abc import AsyncIterator, Awaitable, Callable, Iterable, Iterator
4
4
  from pathlib import Path
5
5
  from time import perf_counter
6
- from typing import Any
7
6
 
8
7
  from lokit.data.structure import BaseStructure, Data, StreamingStructure, ConversionStats
9
8
  from lokit.format_detection import LokitInputFormat, detect_format
@@ -15,15 +14,21 @@ from lokit.parsers.html.extraction import HtmlExtractor
15
14
  from lokit.parsers.po.extraction import PoExtractor
16
15
  from lokit.parsers.json_i18n.extraction import JsonI18nExtractor
17
16
  from lokit.parsers.idml.extraction import IdmlExtractor
17
+ from lokit.parsers.async_bridge import AsyncExtractionBridge
18
18
  from lokit.parsers.tmx.extraction import TmxExtractor
19
+ from lokit.parsers.tmx.models import TmxParseMode
20
+ from lokit.parsers.tmx.parallel import TmxParallelOptions, extract_tmx_parallel
19
21
  from lokit.parsers.xliff.extraction import XliffExtractor
20
22
 
23
+ TmxBatch = list[tuple[str, Data]]
24
+
21
25
 
22
26
  def import_tmx(
23
27
  filepath: str,
24
28
  source_language: str | None = None,
25
29
  target_language: str | None = None,
26
30
  domain: str | None = None,
31
+ mode: TmxParseMode = TmxParseMode.FULL,
27
32
  ) -> BaseStructure:
28
33
  _validate_xml_root(filepath, "tmx")
29
34
  extractor = TmxExtractor(
@@ -32,6 +37,7 @@ def import_tmx(
32
37
  target_language=target_language,
33
38
  domain=domain,
34
39
  parse_header=not (source_language and target_language),
40
+ mode=mode,
35
41
  )
36
42
  parsed_data: dict[str, Data] = {
37
43
  unit_id: data for unit_id, data in extractor.extract()
@@ -39,11 +45,77 @@ def import_tmx(
39
45
  return _build_tmx_structure(extractor, parsed_data)
40
46
 
41
47
 
48
def import_tmx_parallel(
    filepath: str,
    source_language: str | None = None,
    target_language: str | None = None,
    domain: str | None = None,
    mode: TmxParseMode = TmxParseMode.FULL,
    options: TmxParallelOptions | None = None,
) -> BaseStructure:
    """Import a TMX file into a ``BaseStructure`` via ``extract_tmx_parallel``.

    A ``TmxExtractor`` is constructed first; its ``native_source`` and
    ``native_target`` are forwarded to ``extract_tmx_parallel`` and the
    extractor itself is handed to ``_build_tmx_structure`` together with the
    collected units.

    Args:
        filepath: Path of the TMX file.
        source_language: Optional source language override.
        target_language: Optional target language override.
        domain: Optional domain forwarded to the extractor and the parallel
            extraction.
        mode: How much of each translation unit is parsed.
        options: Settings forwarded unchanged to ``extract_tmx_parallel``.

    Returns:
        The structure built from every ``(unit_id, data)`` pair produced by
        the parallel extraction; a repeated unit id keeps the last pair.
    """
    _validate_xml_root(filepath, "tmx")

    # parse_header is switched off only when the caller supplied both languages.
    languages_given = bool(source_language and target_language)
    extractor = TmxExtractor(
        filepath=filepath,
        source_language=source_language,
        target_language=target_language,
        domain=domain,
        parse_header=not languages_given,
        mode=mode,
    )

    units = extract_tmx_parallel(
        filepath=filepath,
        source_language=extractor.native_source,
        target_language=extractor.native_target,
        domain=domain,
        mode=mode,
        options=options,
    )
    parsed_data: dict[str, Data] = dict(units)
    return _build_tmx_structure(extractor, parsed_data)
77
+
78
+
79
def stream_tmx_parallel(
    filepath: str,
    source_language: str | None = None,
    target_language: str | None = None,
    domain: str | None = None,
    mode: TmxParseMode = TmxParseMode.FULL,
    options: TmxParallelOptions | None = None,
) -> StreamingStructure:
    """Return a ``StreamingStructure`` whose items come from ``extract_tmx_parallel``.

    A ``TmxExtractor`` is built to obtain the language and extension metadata;
    the units themselves are whatever ``extract_tmx_parallel`` returns for the
    extractor's ``native_source`` / ``native_target``.

    Args:
        filepath: Path of the TMX file.
        source_language: Optional source language override.
        target_language: Optional target language override.
        domain: Optional domain forwarded to the extractor and the parallel
            extraction.
        mode: How much of each translation unit is parsed.
        options: Settings forwarded unchanged to ``extract_tmx_parallel``.
    """
    _validate_xml_root(filepath, "tmx")

    # parse_header is switched off only when the caller supplied both languages.
    languages_given = bool(source_language and target_language)
    extractor = TmxExtractor(
        filepath=filepath,
        source_language=source_language,
        target_language=target_language,
        domain=domain,
        parse_header=not languages_given,
        mode=mode,
    )

    # Fall back to the native language codes when no locale is available;
    # an empty target collapses to None.
    source_locale = extractor.source_locale or extractor.native_source
    target_locale = extractor.target_locale or extractor.native_target or None

    items = extract_tmx_parallel(
        filepath=filepath,
        source_language=extractor.native_source,
        target_language=extractor.native_target,
        domain=domain,
        mode=mode,
        options=options,
    )
    return StreamingStructure(
        source_locale=source_locale,
        target_locale=target_locale,
        items=items,
        source_language=extractor.source_language,
        target_language=extractor.target_language,
        extensions=extractor.extensions,
    )
111
+
112
+
42
113
  async def import_tmx_async(
43
114
  filepath: str,
44
115
  source_language: str | None = None,
45
116
  target_language: str | None = None,
46
117
  domain: str | None = None,
118
+ mode: TmxParseMode = TmxParseMode.FULL,
47
119
  ) -> AsyncIterator[tuple[str, Data]]:
48
120
  _validate_xml_root(filepath, "tmx")
49
121
  extractor = TmxExtractor(
@@ -52,11 +124,74 @@ async def import_tmx_async(
52
124
  target_language=target_language,
53
125
  domain=domain,
54
126
  parse_header=not (source_language and target_language),
127
+ mode=mode,
55
128
  )
56
129
  async for unit_id, data in extractor.extract_async():
57
130
  yield unit_id, data
58
131
 
59
132
 
133
async def import_tmx_batches_async(
    filepath: str,
    source_language: str | None = None,
    target_language: str | None = None,
    domain: str | None = None,
    *,
    batch_size: int = 1000,
    mode: TmxParseMode = TmxParseMode.FULL,
) -> AsyncIterator[TmxBatch]:
    """Asynchronously yield the units of a TMX file in lists of ``batch_size``.

    The extractor's synchronous ``extract()`` output is grouped by
    ``_iter_batches`` and handed to an ``AsyncExtractionBridge``. The bridge is
    created with ``batch_size=1`` so that every queue message carries exactly
    one already-formed batch.

    Args:
        filepath: Path of the TMX file.
        source_language: Optional source language override.
        target_language: Optional target language override.
        domain: Optional domain forwarded to the extractor.
        batch_size: Maximum number of units per yielded list.
        mode: How much of each translation unit is parsed.

    Yields:
        Lists of ``(unit_id, data)`` pairs; the final list may be shorter.
    """
    _validate_xml_root(filepath, "tmx")

    # parse_header is switched off only when the caller supplied both languages.
    languages_given = bool(source_language and target_language)
    extractor = TmxExtractor(
        filepath=filepath,
        source_language=source_language,
        target_language=target_language,
        domain=domain,
        parse_header=not languages_given,
        mode=mode,
    )

    def batched_units() -> Iterator[TmxBatch]:
        # Invoked by the bridge, so extract() is only called once the bridge
        # starts producing.
        return _iter_batches(extractor.extract(), batch_size)

    bridge = AsyncExtractionBridge(batched_units, batch_size=1)
    async for batch in bridge:
        yield batch
156
+
157
+
158
def _iter_batches(
    items: Iterator[tuple[str, Data]],
    batch_size: int,
) -> Iterator[TmxBatch]:
    """Group ``items`` into consecutive lists of at most ``batch_size`` entries.

    This is a generator, so the ``batch_size`` check runs when iteration
    starts rather than when the function is called.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    pending: TmxBatch = []
    for entry in items:
        pending.append(entry)
        if len(pending) < batch_size:
            continue
        yield pending
        # Start a fresh list so the one just handed out is never mutated.
        pending = []

    # Flush the trailing, partially filled batch.
    if pending:
        yield pending
172
+
173
+
174
async def process_tmx_async(
    filepath: str,
    callback: Callable[[TmxBatch], Awaitable[None]],
    source_language: str | None = None,
    target_language: str | None = None,
    domain: str | None = None,
    *,
    batch_size: int = 1000,
    mode: TmxParseMode = TmxParseMode.FULL,
) -> None:
    """Feed every batch from ``import_tmx_batches_async`` to ``callback``.

    Batches are handled one at a time: the next batch is requested only after
    the awaited ``callback`` call for the previous one has returned.

    Args:
        filepath: Path of the TMX file.
        callback: Coroutine function awaited once per batch.
        source_language: Optional source language override.
        target_language: Optional target language override.
        domain: Optional domain forwarded to the importer.
        batch_size: Maximum number of units per batch.
        mode: How much of each translation unit is parsed.
    """
    batches = import_tmx_batches_async(
        filepath,
        source_language=source_language,
        target_language=target_language,
        domain=domain,
        batch_size=batch_size,
        mode=mode,
    )
    async for chunk in batches:
        await callback(chunk)
193
+
194
+
60
195
  def import_xliff(filepath: str) -> BaseStructure:
61
196
  _validate_xml_root(filepath, "xliff")
62
197
  extractor = XliffExtractor(filepath)
@@ -138,6 +273,7 @@ def stream_tmx(
138
273
  filepath: str,
139
274
  source_language: str | None = None,
140
275
  target_language: str | None = None,
276
+ mode: TmxParseMode = TmxParseMode.FULL,
141
277
  ) -> StreamingStructure:
142
278
  _validate_xml_root(filepath, "tmx")
143
279
  extractor = TmxExtractor(
@@ -145,6 +281,7 @@ def stream_tmx(
145
281
  source_language=source_language,
146
282
  target_language=target_language,
147
283
  parse_header=not (source_language and target_language),
284
+ mode=mode,
148
285
  )
149
286
  return StreamingStructure(
150
287
  source_locale=extractor.source_locale or extractor.native_source,
@@ -460,7 +597,7 @@ def _validate_xml_root(filepath: str, expected: str) -> None:
460
597
  def _convert_tmx(
461
598
  source_path: str,
462
599
  target_path: str,
463
- exporter: Callable[[Any, str], None],
600
+ exporter: Callable[[StreamingStructure, str], None],
464
601
  source_language: str | None,
465
602
  target_language: str | None,
466
603
  ) -> ConversionStats:
@@ -0,0 +1,90 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ import os
5
+ import tempfile
6
+ from collections.abc import Iterator
7
+ from contextlib import AbstractContextManager, contextmanager
8
+ from pathlib import Path
9
+ from typing import BinaryIO, Literal, TextIO, cast, overload
10
+
11
+
12
# Mode strings that make the yielded handle a text stream.
_TextMode = Literal[
    "w", "wt", "w+", "wt+",
    "a", "at", "a+", "at+",
    "x", "xt", "x+", "xt+",
]

# Mode strings that make the yielded handle a binary stream.
_BinaryMode = Literal[
    "wb", "w+b", "wb+",
    "ab", "a+b", "ab+",
    "xb", "x+b", "xb+",
]


@overload
def atomic_output_path(
    path: Path,
    mode: _TextMode,
) -> AbstractContextManager[TextIO]: ...


@overload
def atomic_output_path(
    path: Path,
    mode: _BinaryMode = "wb",
) -> AbstractContextManager[BinaryIO]: ...


@overload
def atomic_output_path(
    path: Path,
    mode: str,
) -> AbstractContextManager[BinaryIO | TextIO]: ...


def atomic_output_path(
    path: Path,
    mode: str = "wb",
) -> AbstractContextManager[BinaryIO | TextIO]:
    """Return a context manager for writing ``path`` through a temporary file.

    The overloads above only refine the static type of the yielded handle
    (text or binary) from the literal ``mode``; at runtime every call is
    delegated unchanged to ``_atomic_output_path``.

    Args:
        path: Final destination of the written data.
        mode: File mode used to open the temporary file; defaults to ``"wb"``.
    """
    return _atomic_output_path(path, mode)
+
62
+
63
+ @contextmanager
64
+ def _atomic_output_path(path: Path, mode: str) -> Iterator[BinaryIO | TextIO]:
65
+ path.parent.mkdir(parents=True, exist_ok=True)
66
+ tmp = tempfile.NamedTemporaryFile(
67
+ mode=mode,
68
+ dir=path.parent,
69
+ prefix=f".{path.name}.",
70
+ suffix=".tmp",
71
+ delete=False,
72
+ )
73
+ tmp_path = Path(tmp.name)
74
+ try:
75
+ with tmp:
76
+ yield cast(BinaryIO | TextIO, tmp)
77
+ tmp.flush()
78
+ os.fsync(tmp.fileno())
79
+ os.replace(tmp_path, path)
80
+ directory_flag = getattr(os, "O_DIRECTORY", None)
81
+ if directory_flag is not None:
82
+ dir_fd = os.open(path.parent, directory_flag)
83
+ try:
84
+ os.fsync(dir_fd)
85
+ finally:
86
+ os.close(dir_fd)
87
+ except BaseException:
88
+ with contextlib.suppress(FileNotFoundError):
89
+ tmp_path.unlink()
90
+ raise
@@ -0,0 +1,107 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import threading
5
+ from collections.abc import Callable, Iterator
6
+ from typing import Generic, TypeVar
7
+
8
T = TypeVar("T")


class AsyncExtractionBatch(Generic[T]):
    """One message sent from the producer thread to the async consumer.

    A message either carries ``items`` (a list of extracted values), an
    ``error`` raised by the source iterator, or ``done=True`` to mark the end
    of the stream.
    """

    __slots__ = ("done", "error", "items")

    def __init__(
        self,
        items: list[T] | None = None,
        error: BaseException | None = None,
        done: bool = False,
    ) -> None:
        self.items = items
        self.error = error
        self.done = done


class AsyncExtractionBridge(Generic[T]):
    """Expose a blocking iterator as an async iterator.

    The iterator returned by ``iterator_factory`` is consumed in a worker
    thread (``asyncio.to_thread``). Its items are grouped into lists of
    ``batch_size`` and passed to the event loop through a bounded
    ``asyncio.Queue`` holding at most ``maxsize`` messages, so the producer
    blocks once it is that far ahead of the consumer.

    ``async for`` does not call ``aclose()`` when the loop is left early;
    a consumer that abandons iteration should ``await bridge.aclose()`` itself
    so the worker thread can finish.

    Args:
        iterator_factory: Called in the worker thread to create the iterator.
        maxsize: Capacity of the hand-over queue, in messages.
        batch_size: Number of items per queue message.

    Raises:
        ValueError: If ``maxsize`` or ``batch_size`` is smaller than 1.
    """

    def __init__(
        self,
        iterator_factory: Callable[[], Iterator[T]],
        maxsize: int = 4,
        batch_size: int = 1000,
    ) -> None:
        if maxsize < 1:
            raise ValueError("maxsize must be at least 1")
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self._iterator_factory = iterator_factory
        self._queue: asyncio.Queue[AsyncExtractionBatch[T]] = asyncio.Queue(
            maxsize=maxsize
        )
        self._batch_size = batch_size
        self._current_batch: list[T] = []
        self._batch_index = 0
        self._stop = threading.Event()
        self._producer: asyncio.Task[None] | None = None
        # Set once aclose() has been requested; iteration never restarts.
        self._closed = False
        # Set once the producer's final ``done`` message has been received.
        self._exhausted = False

    def __aiter__(self) -> AsyncExtractionBridge[T]:
        return self

    async def __anext__(self) -> T:
        """Return the next item, starting the producer on first use.

        Raises:
            StopAsyncIteration: When the source is exhausted or the bridge
                has been closed.
            BaseException: Whatever the source iterator raised in the worker
                thread is re-raised here.
        """
        if self._closed:
            # Without this guard a finished bridge would spawn a new producer
            # and call the iterator factory again.
            raise StopAsyncIteration
        if self._producer is None:
            self._start()

        while self._batch_index >= len(self._current_batch):
            result = await self._queue.get()
            if result.done:
                self._exhausted = True
                await self.aclose()
                raise StopAsyncIteration
            if result.error is not None:
                await self.aclose()
                raise result.error
            if result.items is None:
                await self.aclose()
                raise StopAsyncIteration
            self._current_batch = result.items
            self._batch_index = 0

        item = self._current_batch[self._batch_index]
        self._batch_index += 1
        return item

    async def aclose(self) -> None:
        """Stop the producer and wait until its thread has finished.

        Messages still queued are discarded. The queue is drained up to the
        producer's final ``done`` message because the producer may be blocked
        on a full queue; merely awaiting it would then never return.
        """
        self._stop.set()
        self._closed = True
        self._current_batch = []
        self._batch_index = 0

        # Detach first so a concurrent aclose() call returns immediately
        # instead of competing for the single ``done`` message.
        producer, self._producer = self._producer, None
        if producer is None:
            return
        try:
            while not self._exhausted:
                self._exhausted = (await self._queue.get()).done
            await producer
        except BaseException:
            # Interrupted (e.g. cancelled) mid-drain: re-attach the producer so
            # a later aclose() can finish the job.
            self._producer = producer
            raise

    def _start(self) -> None:
        """Launch the worker thread that feeds the queue."""
        loop = asyncio.get_running_loop()

        def produce() -> None:
            try:
                batch: list[T] = []
                for item in self._iterator_factory():
                    if self._stop.is_set():
                        break
                    batch.append(item)
                    if len(batch) >= self._batch_size:
                        self._put(loop, AsyncExtractionBatch(items=batch))
                        batch = []
                if batch:
                    self._put(loop, AsyncExtractionBatch(items=batch))
            except BaseException as exc:
                self._put(loop, AsyncExtractionBatch(error=exc))
            finally:
                # Always sent, even after a stop request, so the consumer and
                # aclose() have a reliable end marker.
                self._put(loop, AsyncExtractionBatch(done=True))

        self._producer = asyncio.create_task(asyncio.to_thread(produce))

    def _put(
        self,
        loop: asyncio.AbstractEventLoop,
        result: AsyncExtractionBatch[T],
    ) -> None:
        """Enqueue ``result`` from the worker thread, blocking until accepted.

        After a stop request only the ``done`` marker is still delivered.
        """
        if self._stop.is_set() and not result.done:
            return
        future = asyncio.run_coroutine_threadsafe(self._queue.put(result), loop)
        future.result()
@@ -5,7 +5,12 @@ from lxml import etree
5
5
  from lokit.core.logger import logger
6
6
  from lokit.parsers.tmx.header import TmxHeaderParser
7
7
  from lokit.parsers.tmx.models import HeaderData
8
- from lokit.parsers.tmx.xml_utils import clear_element, element_children, iterparse_safe, local_name
8
+ from lokit.parsers.tmx.xml_utils import (
9
+ clear_element,
10
+ element_children,
11
+ iterparse_safe,
12
+ local_name,
13
+ )
9
14
 
10
15
 
11
16
  class TmxParser:
@@ -37,6 +42,9 @@ class TmxParser:
37
42
  if parse_header:
38
43
  self._initialize_from_file()
39
44
  self._validate_and_set_languages()
45
+ self.native_source_base: str = self._base_lang(self.native_source)
46
+ self.native_target_base: str = self._base_lang(self.native_target)
47
+ self._lang_base_cache: dict[str, str] = {}
40
48
 
41
49
  def _initialize_from_file(self) -> None:
42
50
  context = iterparse_safe(self.filepath, events=("end",))
@@ -82,9 +90,21 @@ class TmxParser:
82
90
  def _compare_base_lang(self, lang1: str, lang2: str) -> bool:
83
91
  if not lang1 or not lang2:
84
92
  return False
85
- l1 = lang1.replace("_", "-").split("-")[0].lower()
86
- l2 = lang2.replace("_", "-").split("-")[0].lower()
87
- return l1 == l2
93
+ return self._base_lang(lang1) == self._base_lang(lang2)
94
+
95
def _base_lang(self, lang: str) -> str:
    """Return the lower-cased primary subtag of ``lang``.

    Underscores are treated like hyphens, so ``"en_US"`` and ``"en-US"`` both
    give ``"en"``. An empty ``lang`` gives ``""``.
    """
    if not lang:
        return ""
    primary, _, _ = lang.replace("_", "-").partition("-")
    return primary.lower()
100
+
101
def _cached_base_lang(self, lang: str) -> str:
    """Return ``self._base_lang(lang)``, memoised in ``self._lang_base_cache``."""
    if (hit := self._lang_base_cache.get(lang)) is not None:
        return hit
    # First time this exact string is seen: compute once and remember it.
    computed = self._base_lang(lang)
    self._lang_base_cache[lang] = computed
    return computed
88
108
 
89
109
  def _initialize_missing_languages_from_tu(self, element: etree._Element) -> None:
90
110
  langs: list[str] = []
@@ -0,0 +1,124 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import AsyncIterator, Iterator, Optional
4
+ from uuid import uuid4
5
+
6
+ from lxml.etree import _Element
7
+
8
+ from lokit.data.structure import Data, Meta, SegmentPart, Tags, TranslationStatus
9
+ from lokit.data.tag_types import TieData
10
+ from lokit.parsers.async_bridge import AsyncExtractionBridge
11
+ from lokit.parsers.tmx.base import TmxParser
12
+ from lokit.parsers.tmx.models import TmxParseMode
13
+ from lokit.parsers.tmx.props import ParsedTmxProps, TmxProps
14
+ from lokit.parsers.tmx.tags import TmxTagParser
15
+ from lokit.parsers.tmx.xml_utils import (
16
+ clear_element,
17
+ is_tag,
18
+ iterparse_safe,
19
+ local_name,
20
+ )
21
+
22
+ ExtractItem = tuple[str, Data]
23
+
24
+
25
class TmxExtractor(TmxParser):
    """Extract translation units from a TMX file as ``(unit_id, Data)`` pairs.

    ``mode`` controls how much of each ``<tu>`` is parsed: ``FULL`` reads all
    properties through ``TmxProps.parse_all``, ``TEXT_WITH_STATUS`` reads only
    the status, and any other mode leaves the status at
    ``TranslationStatus.UNKNOWN`` with default metadata.
    """

    def __init__(
        self,
        filepath: str,
        source_language: Optional[str] = None,
        target_language: Optional[str] = None,
        domain: Optional[str] = None,
        parse_header: bool = True,
        mode: TmxParseMode = TmxParseMode.FULL,
    ) -> None:
        super().__init__(
            tmx_file_path=filepath,
            source_language=source_language,
            target_language=target_language,
            domain=domain,
            parse_header=parse_header,
        )
        self.tag_parser: TmxTagParser = TmxTagParser()
        self.prop_parser: TmxProps = TmxProps()
        # Clark-notation prefix of the XML namespace, used to read xml:lang.
        self.namespace: str = "{http://www.w3.org/XML/1998/namespace}"
        self.mode = mode

    def extract(self) -> Iterator[tuple[str, Data]]:
        """Yield one ``(unit_id, Data)`` pair per ``<tu>`` element in the file."""
        with open(self.filepath, "rb") as stream:
            for _event, node in iterparse_safe(stream, events=("end",)):
                if local_name(node.tag) == "tu":
                    yield self.extract_element(node)
                    # Clear the element only after the consumer has resumed us.
                    clear_element(node)

    def extract_element(self, elem: _Element) -> tuple[str, Data]:
        """Convert a single ``<tu>`` element into ``(unit_id, Data)``.

        The unit id is the ``tuid`` attribute, or a fresh UUID4 string when the
        attribute is missing or empty. A ``<tuv>`` whose base language equals
        ``native_source_base`` provides the source side; every other ``<tuv>``
        is treated as the target, with later ones overwriting earlier ones.
        ``<tuv>`` elements without a ``<seg>`` child are ignored.
        """
        unit_id: str = elem.attrib.get("tuid") or str(uuid4())

        props: ParsedTmxProps | None = None
        status = TranslationStatus.UNKNOWN
        if self.mode is TmxParseMode.FULL:
            props = self.prop_parser.parse_all(elem)
            status = props.status
        elif self.mode is TmxParseMode.TEXT_WITH_STATUS:
            status = self.prop_parser.parse_status(elem)

        # Each side is tracked as a (text, tag map, parts) triple.
        nothing: tuple[str, dict[str, TieData] | None, list[SegmentPart] | None] = (
            "",
            None,
            None,
        )
        source_side = nothing
        target_side = nothing

        for tuv in elem:
            if not is_tag(tuv, "tuv"):
                continue
            seg = next((child for child in tuv if is_tag(child, "seg")), None)
            if seg is None:
                continue

            # Prefer xml:lang, fall back to a plain lang attribute.
            lang: str = tuv.get(f"{self.namespace}lang") or tuv.get("lang") or ""
            text, tag_map, parts = self.tag_parser.parse_fast(seg)
            if self._cached_base_lang(lang) == self.native_source_base:
                source_side = (text, tag_map, parts)
            else:
                target_side = (text, tag_map, parts)

        source_text, source_tags, source_parts = source_side
        target_text, target_tags, target_parts = target_side

        tags_obj: Tags | None = None
        if not (source_tags is None and target_tags is None):
            tags_obj = Tags(
                source_tag_map=source_tags or {},
                target_tag_map=target_tags or {},
                source_parts=source_parts or [],
                target_parts=target_parts or [],
            )

        # Without parsed props every property-derived field gets its default.
        if props is None:
            meta = Meta()
            comments = []
            previous_context = None
            next_context = None
            extensions = {}
        else:
            meta = props.meta
            comments = props.comments
            previous_context = props.previous_context
            next_context = props.next_context
            extensions = props.extensions

        data_obj = Data(
            source=source_text,
            target=target_text or None,
            plural=None,
            tags=tags_obj,
            meta=meta,
            status=status,
            comments=comments,
            previous_context=previous_context,
            next_context=next_context,
            extensions=extensions,
        )
        return unit_id, data_obj

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator over ``extract()`` via ``AsyncExtractionBridge``."""
        return AsyncExtractionBridge(self.extract)
@@ -1,4 +1,5 @@
1
1
  from dataclasses import dataclass
2
+ from enum import StrEnum
2
3
 
3
4
 
4
5
  @dataclass
@@ -8,3 +9,9 @@ class HeaderData:
8
9
  srclang: str
9
10
  tgtlang: str
10
11
  extensions: dict[str, str]
12
+
13
+
14
+ class TmxParseMode(StrEnum):
15
+ FULL = "full"
16
+ TEXT = "text"
17
+ TEXT_WITH_STATUS = "text_status"