lokit-python 0.1.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- 821d8b73c2a02cb7980f__mypyc.cp313-win_amd64.pyd +0 -0
- lokit/__init__.cp313-win_amd64.pyd +0 -0
- lokit/__init__.py +128 -0
- lokit/core/__init__.cp313-win_amd64.pyd +0 -0
- lokit/core/__init__.py +0 -0
- lokit/core/logger.cp313-win_amd64.pyd +0 -0
- lokit/core/logger.py +20 -0
- lokit/data/__init__.cp313-win_amd64.pyd +0 -0
- lokit/data/__init__.py +0 -0
- lokit/data/lang_codes.cp313-win_amd64.pyd +0 -0
- lokit/data/lang_codes.py +455 -0
- lokit/data/structure.cp313-win_amd64.pyd +0 -0
- lokit/data/structure.py +118 -0
- lokit/data/tag_types.cp313-win_amd64.pyd +0 -0
- lokit/data/tag_types.py +78 -0
- lokit/exporters/__init__.cp313-win_amd64.pyd +0 -0
- lokit/exporters/__init__.py +34 -0
- lokit/exporters/csv.cp313-win_amd64.pyd +0 -0
- lokit/exporters/csv.py +32 -0
- lokit/exporters/html.cp313-win_amd64.pyd +0 -0
- lokit/exporters/html.py +217 -0
- lokit/exporters/idml.cp313-win_amd64.pyd +0 -0
- lokit/exporters/idml.py +178 -0
- lokit/exporters/json_i18n.cp313-win_amd64.pyd +0 -0
- lokit/exporters/json_i18n.py +47 -0
- lokit/exporters/po.cp313-win_amd64.pyd +0 -0
- lokit/exporters/po.py +162 -0
- lokit/exporters/tmx.cp313-win_amd64.pyd +0 -0
- lokit/exporters/tmx.py +247 -0
- lokit/exporters/xliff.cp313-win_amd64.pyd +0 -0
- lokit/exporters/xliff.py +152 -0
- lokit/exporters/xlsx.cp313-win_amd64.pyd +0 -0
- lokit/exporters/xlsx.py +39 -0
- lokit/format_detection.cp313-win_amd64.pyd +0 -0
- lokit/format_detection.py +115 -0
- lokit/importers.py +321 -0
- lokit/io/__init__.cp313-win_amd64.pyd +0 -0
- lokit/io/__init__.py +3 -0
- lokit/io/json.cp313-win_amd64.pyd +0 -0
- lokit/io/json.py +194 -0
- lokit/logic.cp313-win_amd64.pyd +0 -0
- lokit/logic.py +324 -0
- lokit/parsers/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/__init__.py +1 -0
- lokit/parsers/csv/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/csv/__init__.py +1 -0
- lokit/parsers/csv/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/csv/extraction.py +164 -0
- lokit/parsers/html/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/html/__init__.py +3 -0
- lokit/parsers/html/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/html/extraction.py +365 -0
- lokit/parsers/idml/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/idml/__init__.py +3 -0
- lokit/parsers/idml/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/idml/extraction.py +264 -0
- lokit/parsers/json_i18n/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/json_i18n/__init__.py +3 -0
- lokit/parsers/json_i18n/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/json_i18n/extraction.py +163 -0
- lokit/parsers/po/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/po/__init__.py +3 -0
- lokit/parsers/po/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/po/extraction.py +236 -0
- lokit/parsers/tmx/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/__init__.py +0 -0
- lokit/parsers/tmx/base.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/base.py +145 -0
- lokit/parsers/tmx/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/extraction.py +170 -0
- lokit/parsers/tmx/header.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/header.py +55 -0
- lokit/parsers/tmx/helpers.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/helpers.py +9 -0
- lokit/parsers/tmx/models.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/models.py +10 -0
- lokit/parsers/tmx/props.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/props.py +201 -0
- lokit/parsers/tmx/tags.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/tags.py +59 -0
- lokit/parsers/tmx/xml_utils.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/xml_utils.py +46 -0
- lokit/parsers/xliff/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/__init__.py +3 -0
- lokit/parsers/xliff/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/extraction.py +229 -0
- lokit/parsers/xliff/tags.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/tags.py +128 -0
- lokit/parsers/xlsx/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xlsx/__init__.py +1 -0
- lokit/parsers/xlsx/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xlsx/extraction.py +198 -0
- lokit/py.typed +1 -0
- lokit_python-0.1.0.dist-info/METADATA +149 -0
- lokit_python-0.1.0.dist-info/RECORD +97 -0
- lokit_python-0.1.0.dist-info/WHEEL +5 -0
- lokit_python-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
6
|
+
|
|
7
|
+
from lxml.etree import _Element
|
|
8
|
+
|
|
9
|
+
from lokit.data.structure import Comment, Data, Meta, SegmentPart, Tags, TranslationStatus
|
|
10
|
+
from lokit.data.tag_types import TieData
|
|
11
|
+
from lokit.parsers.tmx.xml_utils import (
|
|
12
|
+
clear_element,
|
|
13
|
+
element_children,
|
|
14
|
+
find_child,
|
|
15
|
+
iterparse_safe,
|
|
16
|
+
local_name,
|
|
17
|
+
)
|
|
18
|
+
from lokit.parsers.xliff.tags import XliffTagParser
|
|
19
|
+
|
|
20
|
+
ExtractItem = tuple[str, Data]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(slots=True)
|
|
24
|
+
class XliffFileContext:
|
|
25
|
+
index: int
|
|
26
|
+
original: str
|
|
27
|
+
source_locale: str
|
|
28
|
+
target_locale: str | None
|
|
29
|
+
data_type: str
|
|
30
|
+
tool_name: str | None = None
|
|
31
|
+
tool_version: str | None = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(slots=True)
|
|
35
|
+
class _AsyncExtractionResult:
|
|
36
|
+
item: Optional[ExtractItem] = None
|
|
37
|
+
error: Optional[BaseException] = None
|
|
38
|
+
done: bool = False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class AsyncXliffExtraction:
|
|
42
|
+
def __init__(self, extractor: XliffExtractor) -> None:
|
|
43
|
+
self._extractor = extractor
|
|
44
|
+
self._queue: asyncio.Queue[_AsyncExtractionResult] = asyncio.Queue()
|
|
45
|
+
self._producer: asyncio.Task[None] | None = None
|
|
46
|
+
|
|
47
|
+
def __aiter__(self) -> AsyncXliffExtraction:
|
|
48
|
+
return self
|
|
49
|
+
|
|
50
|
+
async def __anext__(self) -> ExtractItem:
|
|
51
|
+
if self._producer is None:
|
|
52
|
+
self._start()
|
|
53
|
+
result = await self._queue.get()
|
|
54
|
+
if result.done:
|
|
55
|
+
await self._finish()
|
|
56
|
+
raise StopAsyncIteration
|
|
57
|
+
if result.error is not None:
|
|
58
|
+
await self._finish()
|
|
59
|
+
raise result.error
|
|
60
|
+
if result.item is None:
|
|
61
|
+
await self._finish()
|
|
62
|
+
raise StopAsyncIteration
|
|
63
|
+
return result.item
|
|
64
|
+
|
|
65
|
+
def _start(self) -> None:
|
|
66
|
+
loop = asyncio.get_running_loop()
|
|
67
|
+
|
|
68
|
+
def produce() -> None:
|
|
69
|
+
try:
|
|
70
|
+
for item in self._extractor.extract():
|
|
71
|
+
loop.call_soon_threadsafe(
|
|
72
|
+
self._queue.put_nowait,
|
|
73
|
+
_AsyncExtractionResult(item=item),
|
|
74
|
+
)
|
|
75
|
+
except BaseException as exc:
|
|
76
|
+
loop.call_soon_threadsafe(
|
|
77
|
+
self._queue.put_nowait,
|
|
78
|
+
_AsyncExtractionResult(error=exc),
|
|
79
|
+
)
|
|
80
|
+
finally:
|
|
81
|
+
loop.call_soon_threadsafe(
|
|
82
|
+
self._queue.put_nowait,
|
|
83
|
+
_AsyncExtractionResult(done=True),
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
self._producer = asyncio.create_task(asyncio.to_thread(produce))
|
|
87
|
+
|
|
88
|
+
async def _finish(self) -> None:
|
|
89
|
+
if self._producer is not None:
|
|
90
|
+
await self._producer
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class XliffExtractor:
    """Streaming extractor that turns XLIFF ``<trans-unit>`` elements into
    ``(stable_id, Data)`` pairs.

    Document-level facts discovered while parsing (XLIFF version, first
    source/target locale seen) are stored on the instance, so they are only
    complete after ``extract()`` has been consumed.
    """

    def __init__(self, filepath: str) -> None:
        self.filepath = filepath
        # Default used until an <xliff version="..."> attribute is read.
        self.version = "1.2"
        self.source_locale: str | None = None
        self.target_locale: str | None = None
        self.source_language: str | None = None
        self.target_language: str | None = None
        self.export_origin = ""
        self.export_timestamp = ""
        self.extensions: dict[str, str] = {"input_format": "xliff"}
        self.tag_parser = XliffTagParser()

    def extract(self) -> Iterator[ExtractItem]:
        """Yield one ``(stable_id, Data)`` pair per ``<trans-unit>``.

        Units are only emitted while inside a ``<file>`` element; the
        enclosing file's context is attached to each unit.
        """
        context = iterparse_safe(self.filepath, events=("start", "end"))
        file_stack: list[XliffFileContext] = []
        file_index = 0

        for event, elem in context:
            name = local_name(elem.tag)
            if event == "start" and name == "xliff":
                self.version = elem.attrib.get("version", "1.2")
                self.extensions["xliff_version"] = self.version
            elif event == "start" and name == "file":
                current = self._file_context(elem, file_index)
                file_index += 1
                file_stack.append(current)
                self._set_document_languages(current)
            elif event == "end" and name == "file":
                if file_stack:
                    file_stack.pop()
                # presumably releases the parsed subtree — confirm in xml_utils
                clear_element(elem)
            elif event == "end" and name == "trans-unit" and file_stack:
                current_file = file_stack[-1]
                yield self._parse_unit(elem, current_file)
                # Only cleared after the consumer resumed the generator.
                clear_element(elem)

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator over the same items as ``extract()``."""
        return AsyncXliffExtraction(self)

    def _file_context(self, element: _Element, index: int) -> XliffFileContext:
        """Build the per-file context from a ``<file>`` element's attributes."""
        original = element.attrib.get("original", "")
        source_locale = element.attrib.get("source-language", "")
        target_locale = element.attrib.get("target-language")
        data_type = element.attrib.get("datatype", "")
        return XliffFileContext(
            index=index,
            original=original,
            source_locale=source_locale,
            target_locale=target_locale,
            data_type=data_type,
        )

    def _set_document_languages(self, context: XliffFileContext) -> None:
        """Adopt the file's locales as document locales, first one wins."""
        if self.source_locale is None and context.source_locale:
            self.source_locale = context.source_locale
            self.source_language = self._base_language(context.source_locale)
        if self.target_locale is None and context.target_locale:
            self.target_locale = context.target_locale
            self.target_language = self._base_language(context.target_locale)

    def _parse_unit(
        self,
        element: _Element,
        file_context: XliffFileContext,
    ) -> ExtractItem:
        """Convert one ``<trans-unit>`` element into ``(stable_id, Data)``.

        The stable id is ``"<file index>:<unit id>"``, or just the file index
        when the unit has no ``id`` attribute.
        """
        source = find_child(element, "source")
        target = find_child(element, "target")
        source_text, source_tags, source_parts = self._parse_segment(source)
        target_text, target_tags, target_parts = self._parse_segment(target)
        unit_id = element.attrib.get("id", "")
        # NOTE(review): every id-less unit of a file gets the same stable id
        # (the bare file index); confirm callers do not key on it.
        stable_id = f"{file_context.index}:{unit_id}" if unit_id else f"{file_context.index}"
        tags = Tags(
            source_tag_map=source_tags,
            target_tag_map=target_tags,
            source_parts=source_parts,
            target_parts=target_parts,
        )
        data = Data(
            source=source_text,
            # An absent <target> is reported as None, an empty one as "".
            target=target_text if target is not None else None,
            tags=tags if source_tags or target_tags else None,
            meta=Meta(),
            status=self._status(target),
            comments=self._comments(element),
            extensions=self._extensions(element, file_context, unit_id),
        )
        return stable_id, data

    def _parse_segment(
        self, element: _Element | None
    ) -> tuple[str, dict[str, TieData], list[SegmentPart]]:
        """Parse a ``<source>``/``<target>`` element; empty result for None."""
        if element is None:
            return "", {}, []
        return self.tag_parser.parse(element)

    def _status(self, target: _Element | None) -> TranslationStatus:
        """Map the ``state`` attribute of ``<target>`` to a TranslationStatus.

        Missing target -> NEW; ``final``/``signed-off`` -> APPROVED;
        ``translated``/``needs-review-translation`` -> TRANSLATED;
        ``needs-review-adaptation``/``needs-review-l10n`` -> REVIEWED;
        ``new``/``needs-translation`` -> NEW; anything else (including a
        missing ``state``) -> UNKNOWN. Matching is case-insensitive.
        """
        if target is None:
            return TranslationStatus.NEW
        state = (target.attrib.get("state") or "").lower()
        if state in ("final", "signed-off"):
            return TranslationStatus.APPROVED
        if state in ("translated", "needs-review-translation"):
            return TranslationStatus.TRANSLATED
        if state in ("needs-review-adaptation", "needs-review-l10n"):
            return TranslationStatus.REVIEWED
        if state in ("new", "needs-translation"):
            return TranslationStatus.NEW
        return TranslationStatus.UNKNOWN

    def _comments(self, element: _Element) -> list[Comment]:
        """Collect the non-blank ``<note>`` texts of a unit as comments."""
        comments: list[Comment] = []
        for child in element_children(element, "note"):
            # Strip before testing so a whitespace-only note does not turn
            # into an empty comment.
            text = (child.text or "").strip()
            if text:
                comments.append(Comment(context=text))
        return comments

    def _extensions(
        self,
        element: _Element,
        file_context: XliffFileContext,
        unit_id: str,
    ) -> dict[str, str]:
        """Build the per-unit extension dict.

        Always contains ``resource``, ``resource_index`` and ``unit_id``;
        ``data_type`` and ``space`` (the unit's ``xml:space``) are added only
        when they have a non-empty value.
        """
        extensions = {
            "resource": file_context.original,
            "resource_index": str(file_context.index),
            "unit_id": unit_id,
        }
        if file_context.data_type:
            extensions["data_type"] = file_context.data_type
        xml_space = element.attrib.get("{http://www.w3.org/XML/1998/namespace}space")
        if xml_space:
            extensions["space"] = xml_space
        return extensions

    def _base_language(self, locale: str) -> str:
        """Return the lower-cased part of *locale* before the first ``-``/``_``."""
        return locale.replace("_", "-").split("-")[0].lower()
|
|
Binary file
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from lxml.etree import _Element
|
|
4
|
+
|
|
5
|
+
from lokit.data.structure import CodePart, SegmentPart, TextPart
|
|
6
|
+
from lokit.data.tag_types import TieData, TieType
|
|
7
|
+
from lokit.parsers.tmx.xml_utils import element_children, local_name
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class XliffTagParser:
    """Flattens the inline markup of an XLIFF segment element.

    The result of ``parse`` is the plain text of the segment, a map of the
    inline codes found (keyed ``c0``, ``c1``, ... in document order) and the
    ordered list of text/code parts.
    """

    def parse(
        self, element: _Element
    ) -> tuple[str, dict[str, TieData], list[SegmentPart]]:
        """Parse *element* and return ``(raw_text, tag_map, parts)``.

        The element itself contributes no code; only its descendants do.
        """
        codes: dict[str, TieData] = {}
        segments: list[SegmentPart] = []
        text, _next_order = self._append_content(
            element,
            "",
            segments,
            codes,
            {},
            0,
            include_element_code=False,
        )
        return text, codes, segments

    def _append_content(
        self,
        element: _Element,
        raw_text: str,
        parts: list[SegmentPart],
        tag_map: dict[str, TieData],
        pair_ids: dict[str, str],
        order: int,
        include_element_code: bool,
    ) -> tuple[str, int]:
        """Append the content of *element* to the running result.

        ``parts``, ``tag_map`` and ``pair_ids`` are extended in place; the
        grown text and the next free code number are returned. When
        ``include_element_code`` is true, *element* is wrapped in an opening
        and a closing code.
        """
        if include_element_code:
            order = self._record_code(
                element, self._open_type(element), len(raw_text),
                parts, tag_map, pair_ids, order,
            )

        leading = element.text
        if leading:
            raw_text = raw_text + leading
            parts.append(TextPart(leading))

        for node in element_children(element):
            if local_name(node.tag) in ("g", "mrk", "sub"):
                # Containers: recurse so their text stays part of the segment.
                raw_text, order = self._append_content(
                    node,
                    raw_text,
                    parts,
                    tag_map,
                    pair_ids,
                    order,
                    include_element_code=True,
                )
            else:
                # Every other child becomes a single code; its own content
                # is not added to the text.
                order = self._record_code(
                    node, self._inline_type(node), len(raw_text),
                    parts, tag_map, pair_ids, order,
                )

            trailing = node.tail
            if trailing:
                raw_text = raw_text + trailing
                parts.append(TextPart(trailing))

        if include_element_code:
            order = self._record_code(
                element, self._close_type(element), len(raw_text),
                parts, tag_map, pair_ids, order,
            )

        return raw_text, order

    def _record_code(
        self,
        node: _Element,
        kind: TieType,
        offset: int,
        parts: list[SegmentPart],
        tag_map: dict[str, TieData],
        pair_ids: dict[str, str],
        order: int,
    ) -> int:
        """Register one code for *node* at text offset *offset*.

        Returns the next free code number.
        """
        code_id = f"c{order}"
        tag_map[code_id] = TieData(
            id=code_id,
            type=kind,
            position=offset,
            order=order,
            pair_id=self._pair_id(node, pair_ids),
        )
        parts.append(CodePart(code_id))
        return order + 1

    def _pair_id(self, element: _Element, pair_ids: dict[str, str]) -> str | None:
        """Return the normalized pair id (``p0``, ``p1``, ...) for *element*.

        The key is the first non-empty of the ``rid``, ``id``, ``xid`` and
        ``ctype`` attributes. Elements sharing a key share a pair id; the
        mapping is remembered in *pair_ids*. Returns None when no key exists.
        """
        key = None
        for attribute in ("rid", "id", "xid", "ctype"):
            key = element.attrib.get(attribute)
            if key:
                break
        if key is None:
            return None
        if key not in pair_ids:
            pair_ids[key] = f"p{len(pair_ids)}"
        return pair_ids[key]

    def _inline_type(self, element: _Element) -> TieType:
        """Classify a non-container inline element by its tag name."""
        tag = local_name(element.tag)
        if tag == "bpt" or tag == "bx":
            return TieType.CUSTOM_OPEN
        if tag == "ept" or tag == "ex":
            return TieType.CUSTOM_CLOSE
        return TieType.CUSTOM_STANDALONE

    def _open_type(self, element: _Element) -> TieType:
        """Type used for the opening code of a container element."""
        return TieType.CUSTOM_OPEN

    def _close_type(self, element: _Element) -> TieType:
        """Type used for the closing code of a container element."""
        return TieType.CUSTOM_CLOSE
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
Binary file
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
7
|
+
|
|
8
|
+
from openpyxl import load_workbook
|
|
9
|
+
from openpyxl.cell.cell import Cell, MergedCell
|
|
10
|
+
|
|
11
|
+
from lokit.data.structure import Comment, Data, TranslationStatus
|
|
12
|
+
|
|
13
|
+
ExtractItem = tuple[str, Data]
|
|
14
|
+
|
|
15
|
+
_KNOWN_COLUMNS = frozenset({"id", "source", "target", "status", "comment"})
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _parse_base_lang(locale: str) -> str:
|
|
19
|
+
return locale.replace("_", "-").split("-")[0].lower()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _parse_status(value: str) -> TranslationStatus:
    """Look up the TranslationStatus whose value equals *value*.

    Surrounding whitespace and letter case are ignored; text that matches no
    member gives ``TranslationStatus.UNKNOWN``.
    """
    key = value.strip().lower()
    try:
        status = TranslationStatus(key)
    except ValueError:
        status = TranslationStatus.UNKNOWN
    return status
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _cell_str(cell: Cell | MergedCell) -> str:
|
|
31
|
+
if cell.value is None:
|
|
32
|
+
return ""
|
|
33
|
+
return str(cell.value)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _infer_locales_from_filename(filepath: str) -> tuple[str, str | None]:
|
|
37
|
+
stem = Path(filepath).stem
|
|
38
|
+
if "-" in stem:
|
|
39
|
+
parts = stem.split("-")
|
|
40
|
+
if len(parts) == 2:
|
|
41
|
+
return parts[0], parts[1]
|
|
42
|
+
if len(parts) == 4:
|
|
43
|
+
return f"{parts[0]}-{parts[1]}", f"{parts[2]}-{parts[3]}"
|
|
44
|
+
if "_" in stem:
|
|
45
|
+
parts = stem.split("_")
|
|
46
|
+
if len(parts) == 2:
|
|
47
|
+
return parts[0], parts[1]
|
|
48
|
+
if len(parts) == 4:
|
|
49
|
+
return f"{parts[0]}_{parts[1]}", f"{parts[2]}_{parts[3]}"
|
|
50
|
+
return "", None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(slots=True)
|
|
54
|
+
class _AsyncExtractionResult:
|
|
55
|
+
item: Optional[ExtractItem] = None
|
|
56
|
+
error: Optional[BaseException] = None
|
|
57
|
+
done: bool = False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class AsyncXlsxExtraction:
|
|
61
|
+
def __init__(self, extractor: XlsxExtractor) -> None:
|
|
62
|
+
self._extractor = extractor
|
|
63
|
+
self._queue: asyncio.Queue[_AsyncExtractionResult] = asyncio.Queue()
|
|
64
|
+
self._producer: asyncio.Task[None] | None = None
|
|
65
|
+
|
|
66
|
+
def __aiter__(self) -> AsyncXlsxExtraction:
|
|
67
|
+
return self
|
|
68
|
+
|
|
69
|
+
async def __anext__(self) -> ExtractItem:
|
|
70
|
+
if self._producer is None:
|
|
71
|
+
self._start()
|
|
72
|
+
|
|
73
|
+
result = await self._queue.get()
|
|
74
|
+
if result.done:
|
|
75
|
+
await self._finish()
|
|
76
|
+
raise StopAsyncIteration
|
|
77
|
+
if result.error is not None:
|
|
78
|
+
await self._finish()
|
|
79
|
+
raise result.error
|
|
80
|
+
if result.item is None:
|
|
81
|
+
await self._finish()
|
|
82
|
+
raise StopAsyncIteration
|
|
83
|
+
return result.item
|
|
84
|
+
|
|
85
|
+
def _start(self) -> None:
|
|
86
|
+
loop = asyncio.get_running_loop()
|
|
87
|
+
|
|
88
|
+
def produce() -> None:
|
|
89
|
+
try:
|
|
90
|
+
for item in self._extractor.extract():
|
|
91
|
+
loop.call_soon_threadsafe(
|
|
92
|
+
self._queue.put_nowait,
|
|
93
|
+
_AsyncExtractionResult(item=item),
|
|
94
|
+
)
|
|
95
|
+
except BaseException as exc:
|
|
96
|
+
loop.call_soon_threadsafe(
|
|
97
|
+
self._queue.put_nowait,
|
|
98
|
+
_AsyncExtractionResult(error=exc),
|
|
99
|
+
)
|
|
100
|
+
finally:
|
|
101
|
+
loop.call_soon_threadsafe(
|
|
102
|
+
self._queue.put_nowait,
|
|
103
|
+
_AsyncExtractionResult(done=True),
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
self._producer = asyncio.create_task(asyncio.to_thread(produce))
|
|
107
|
+
|
|
108
|
+
async def _finish(self) -> None:
|
|
109
|
+
if self._producer is not None:
|
|
110
|
+
await self._producer
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class XlsxExtractor:
    """Extracts translation units from the active sheet of an XLSX workbook.

    The first row is the header. The columns ``id``, ``source``, ``target``,
    ``status`` and ``comment`` (matched case-insensitively) are mapped to the
    corresponding ``Data`` fields; every other named column is copied into
    the unit's ``extensions``.
    """

    def __init__(
        self,
        filepath: str,
        source_locale: str = "",
        target_locale: str | None = None,
    ) -> None:
        """Store the path and resolve the locales.

        Args:
            filepath: Path of the workbook to read.
            source_locale: Source locale; when empty, both locales are
                guessed from the file name.
            target_locale: Target locale; when given it takes precedence over
                a guessed one.
        """
        self.filepath: str = filepath

        if source_locale:
            self.source_locale: str = source_locale
            self.target_locale: str | None = target_locale
        else:
            inferred_source, inferred_target = _infer_locales_from_filename(filepath)
            self.source_locale = inferred_source
            self.target_locale = target_locale or inferred_target

        self.source_language: str | None = (
            _parse_base_lang(self.source_locale) if self.source_locale else None
        )
        self.target_language: str | None = (
            _parse_base_lang(self.target_locale) if self.target_locale else None
        )

        self.export_origin: str = ""
        self.export_timestamp: str = ""
        self.extensions: dict[str, str] = {"input_format": "xlsx"}

    def extract(self) -> Iterator[ExtractItem]:
        """Yield one ``(unit_id, Data)`` pair per non-empty data row.

        Rows whose cells are all empty are skipped; read-only worksheets can
        report such rows after the real data. Skipped rows still count for
        the ``xlsx:<index>`` fallback id, so ids of real rows do not depend
        on how many blank rows precede them. The workbook is closed when the
        generator finishes or is closed.
        """
        wb = load_workbook(self.filepath, read_only=True, data_only=True)
        try:
            ws = wb.active
            if ws is None:
                return

            rows = ws.iter_rows()
            header_row = next(rows, None)
            if header_row is None:
                return

            headers: list[str] = [_cell_str(c).strip().lower() for c in header_row]
            # For duplicate header names the right-most column wins.
            col_map: dict[str, int] = {name: i for i, name in enumerate(headers) if name}
            extra_columns = [h for h in headers if h and h not in _KNOWN_COLUMNS]

            def get(cells: list, col: str) -> str:
                # "" for unknown columns and for rows shorter than the header.
                idx = col_map.get(col)
                if idx is None or idx >= len(cells):
                    return ""
                return _cell_str(cells[idx])

            for index, row in enumerate(rows):
                cells = list(row)
                if not any(_cell_str(cell) for cell in cells):
                    continue

                unit_id = get(cells, "id") or f"xlsx:{index}"

                source = get(cells, "source")
                target = get(cells, "target") or None
                raw_status = get(cells, "status")
                status = _parse_status(raw_status) if raw_status else TranslationStatus.UNKNOWN

                comments: list[Comment] = []
                comment_text = get(cells, "comment").strip()
                if comment_text:
                    comments.append(Comment(context=comment_text))

                extensions: dict[str, str] = {}
                for col in extra_columns:
                    val = get(cells, col)
                    if val:
                        extensions[col] = val

                yield unit_id, Data(
                    source=source,
                    target=target,
                    status=status,
                    comments=comments,
                    extensions=extensions,
                )
        finally:
            wb.close()

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator over the same items as ``extract()``."""
        return AsyncXlsxExtraction(self)
|
lokit/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|