lokit-python 0.1.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- 821d8b73c2a02cb7980f__mypyc.cp313-win_amd64.pyd +0 -0
- lokit/__init__.cp313-win_amd64.pyd +0 -0
- lokit/__init__.py +128 -0
- lokit/core/__init__.cp313-win_amd64.pyd +0 -0
- lokit/core/__init__.py +0 -0
- lokit/core/logger.cp313-win_amd64.pyd +0 -0
- lokit/core/logger.py +20 -0
- lokit/data/__init__.cp313-win_amd64.pyd +0 -0
- lokit/data/__init__.py +0 -0
- lokit/data/lang_codes.cp313-win_amd64.pyd +0 -0
- lokit/data/lang_codes.py +455 -0
- lokit/data/structure.cp313-win_amd64.pyd +0 -0
- lokit/data/structure.py +118 -0
- lokit/data/tag_types.cp313-win_amd64.pyd +0 -0
- lokit/data/tag_types.py +78 -0
- lokit/exporters/__init__.cp313-win_amd64.pyd +0 -0
- lokit/exporters/__init__.py +34 -0
- lokit/exporters/csv.cp313-win_amd64.pyd +0 -0
- lokit/exporters/csv.py +32 -0
- lokit/exporters/html.cp313-win_amd64.pyd +0 -0
- lokit/exporters/html.py +217 -0
- lokit/exporters/idml.cp313-win_amd64.pyd +0 -0
- lokit/exporters/idml.py +178 -0
- lokit/exporters/json_i18n.cp313-win_amd64.pyd +0 -0
- lokit/exporters/json_i18n.py +47 -0
- lokit/exporters/po.cp313-win_amd64.pyd +0 -0
- lokit/exporters/po.py +162 -0
- lokit/exporters/tmx.cp313-win_amd64.pyd +0 -0
- lokit/exporters/tmx.py +247 -0
- lokit/exporters/xliff.cp313-win_amd64.pyd +0 -0
- lokit/exporters/xliff.py +152 -0
- lokit/exporters/xlsx.cp313-win_amd64.pyd +0 -0
- lokit/exporters/xlsx.py +39 -0
- lokit/format_detection.cp313-win_amd64.pyd +0 -0
- lokit/format_detection.py +115 -0
- lokit/importers.py +321 -0
- lokit/io/__init__.cp313-win_amd64.pyd +0 -0
- lokit/io/__init__.py +3 -0
- lokit/io/json.cp313-win_amd64.pyd +0 -0
- lokit/io/json.py +194 -0
- lokit/logic.cp313-win_amd64.pyd +0 -0
- lokit/logic.py +324 -0
- lokit/parsers/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/__init__.py +1 -0
- lokit/parsers/csv/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/csv/__init__.py +1 -0
- lokit/parsers/csv/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/csv/extraction.py +164 -0
- lokit/parsers/html/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/html/__init__.py +3 -0
- lokit/parsers/html/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/html/extraction.py +365 -0
- lokit/parsers/idml/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/idml/__init__.py +3 -0
- lokit/parsers/idml/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/idml/extraction.py +264 -0
- lokit/parsers/json_i18n/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/json_i18n/__init__.py +3 -0
- lokit/parsers/json_i18n/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/json_i18n/extraction.py +163 -0
- lokit/parsers/po/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/po/__init__.py +3 -0
- lokit/parsers/po/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/po/extraction.py +236 -0
- lokit/parsers/tmx/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/__init__.py +0 -0
- lokit/parsers/tmx/base.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/base.py +145 -0
- lokit/parsers/tmx/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/extraction.py +170 -0
- lokit/parsers/tmx/header.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/header.py +55 -0
- lokit/parsers/tmx/helpers.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/helpers.py +9 -0
- lokit/parsers/tmx/models.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/models.py +10 -0
- lokit/parsers/tmx/props.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/props.py +201 -0
- lokit/parsers/tmx/tags.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/tags.py +59 -0
- lokit/parsers/tmx/xml_utils.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/xml_utils.py +46 -0
- lokit/parsers/xliff/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/__init__.py +3 -0
- lokit/parsers/xliff/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/extraction.py +229 -0
- lokit/parsers/xliff/tags.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/tags.py +128 -0
- lokit/parsers/xlsx/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xlsx/__init__.py +1 -0
- lokit/parsers/xlsx/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xlsx/extraction.py +198 -0
- lokit/py.typed +1 -0
- lokit_python-0.1.0.dist-info/METADATA +149 -0
- lokit_python-0.1.0.dist-info/RECORD +97 -0
- lokit_python-0.1.0.dist-info/WHEEL +5 -0
- lokit_python-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
6
|
+
|
|
7
|
+
from lxml import html as lxml_html
|
|
8
|
+
from lxml.html import HtmlElement
|
|
9
|
+
|
|
10
|
+
from lokit.data.structure import CodePart, Data, Meta, Tags, TextPart, TranslationStatus
|
|
11
|
+
from lokit.data.tag_types import TieData, TieType
|
|
12
|
+
|
|
13
|
+
# One extracted unit as yielded by HtmlExtractor.extract(): (unit_id, Data).
ExtractItem = tuple[str, Data]
|
|
14
|
+
|
|
15
|
+
_BLOCK_TAGS: frozenset[str] = frozenset({
|
|
16
|
+
"p", "h1", "h2", "h3", "h4", "h5", "h6",
|
|
17
|
+
"li", "td", "th", "dt", "dd", "caption",
|
|
18
|
+
"figcaption", "blockquote", "label", "option", "title",
|
|
19
|
+
})
|
|
20
|
+
|
|
21
|
+
_INLINE_TAGS: frozenset[str] = frozenset({
|
|
22
|
+
"b", "i", "em", "strong", "a", "span", "u", "s",
|
|
23
|
+
"small", "mark", "code", "sub", "sup", "abbr", "q",
|
|
24
|
+
"cite", "dfn", "kbd", "samp", "var", "br", "img", "wbr",
|
|
25
|
+
})
|
|
26
|
+
|
|
27
|
+
_SKIP_TAGS: frozenset[str] = frozenset({"script", "style"})
|
|
28
|
+
|
|
29
|
+
_STANDALONE_TAGS: frozenset[str] = frozenset({"br", "img", "wbr"})
|
|
30
|
+
|
|
31
|
+
# Maps a lower-case HTML tag name to its (opening, closing) tie types.
# The closing slot is None for elements that have no closing tag
# ("br", "img", "wbr").
_TAG_TYPE_MAP: dict[str, tuple[TieType, TieType | None]] = {
    "a": (TieType.A_OPEN, TieType.A_CLOSE),
    "abbr": (TieType.ABBR_OPEN, TieType.ABBR_CLOSE),
    "b": (TieType.B_OPEN, TieType.B_CLOSE),
    "bdi": (TieType.BDI_OPEN, TieType.BDI_CLOSE),
    "bdo": (TieType.BDO_OPEN, TieType.BDO_CLOSE),
    "br": (TieType.BR, None),  # standalone
    "cite": (TieType.CITE_OPEN, TieType.CITE_CLOSE),
    "code": (TieType.CODE_OPEN, TieType.CODE_CLOSE),
    "dfn": (TieType.DFN_OPEN, TieType.DFN_CLOSE),
    "em": (TieType.EM_OPEN, TieType.EM_CLOSE),
    "i": (TieType.I_OPEN, TieType.I_CLOSE),
    "img": (TieType.IMG, None),  # standalone
    "kbd": (TieType.KBD_OPEN, TieType.KBD_CLOSE),
    "mark": (TieType.MARK_OPEN, TieType.MARK_CLOSE),
    "q": (TieType.Q_OPEN, TieType.Q_CLOSE),
    "s": (TieType.S_OPEN, TieType.S_CLOSE),
    "samp": (TieType.SAMP_OPEN, TieType.SAMP_CLOSE),
    "small": (TieType.SMALL_OPEN, TieType.SMALL_CLOSE),
    "span": (TieType.SPAN_OPEN, TieType.SPAN_CLOSE),
    "strong": (TieType.STRONG_OPEN, TieType.STRONG_CLOSE),
    "sub": (TieType.SUB_OPEN, TieType.SUB_CLOSE),
    "sup": (TieType.SUP_OPEN, TieType.SUP_CLOSE),
    "u": (TieType.U_OPEN, TieType.U_CLOSE),
    "var": (TieType.VAR_OPEN, TieType.VAR_CLOSE),
    "wbr": (TieType.WBR, None),  # standalone
}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(slots=True)
|
|
61
|
+
class _AsyncResult:
|
|
62
|
+
item: Optional[ExtractItem] = None
|
|
63
|
+
error: Optional[BaseException] = None
|
|
64
|
+
done: bool = False
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class _AsyncHtmlExtraction:
|
|
68
|
+
def __init__(self, extractor: HtmlExtractor) -> None:
|
|
69
|
+
self._extractor = extractor
|
|
70
|
+
self._queue: asyncio.Queue[_AsyncResult] = asyncio.Queue()
|
|
71
|
+
self._producer: asyncio.Task[None] | None = None
|
|
72
|
+
|
|
73
|
+
def __aiter__(self) -> _AsyncHtmlExtraction:
|
|
74
|
+
return self
|
|
75
|
+
|
|
76
|
+
async def __anext__(self) -> ExtractItem:
|
|
77
|
+
if self._producer is None:
|
|
78
|
+
self._start()
|
|
79
|
+
result = await self._queue.get()
|
|
80
|
+
if result.done:
|
|
81
|
+
await self._finish()
|
|
82
|
+
raise StopAsyncIteration
|
|
83
|
+
if result.error is not None:
|
|
84
|
+
await self._finish()
|
|
85
|
+
raise result.error
|
|
86
|
+
if result.item is None:
|
|
87
|
+
await self._finish()
|
|
88
|
+
raise StopAsyncIteration
|
|
89
|
+
return result.item
|
|
90
|
+
|
|
91
|
+
def _start(self) -> None:
|
|
92
|
+
loop = asyncio.get_running_loop()
|
|
93
|
+
|
|
94
|
+
def produce() -> None:
|
|
95
|
+
try:
|
|
96
|
+
for item in self._extractor.extract():
|
|
97
|
+
loop.call_soon_threadsafe(
|
|
98
|
+
self._queue.put_nowait,
|
|
99
|
+
_AsyncResult(item=item),
|
|
100
|
+
)
|
|
101
|
+
except BaseException as exc:
|
|
102
|
+
loop.call_soon_threadsafe(
|
|
103
|
+
self._queue.put_nowait,
|
|
104
|
+
_AsyncResult(error=exc),
|
|
105
|
+
)
|
|
106
|
+
finally:
|
|
107
|
+
loop.call_soon_threadsafe(
|
|
108
|
+
self._queue.put_nowait,
|
|
109
|
+
_AsyncResult(done=True),
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
self._producer = asyncio.create_task(asyncio.to_thread(produce))
|
|
113
|
+
|
|
114
|
+
async def _finish(self) -> None:
|
|
115
|
+
if self._producer is not None:
|
|
116
|
+
await self._producer
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class HtmlExtractor:
    """Extract translatable units from an HTML file.

    Units are yielded as ``(unit_id, Data)`` pairs and come from three places:
    ``<meta name="description|keywords">`` content, the text of elements
    listed in ``_BLOCK_TAGS``, and non-empty ``alt`` attributes of ``<img>``.
    Inline markup inside a block is kept as ``CodePart`` placeholders whose
    details are stored as ``TieData`` in the unit's tag map.
    """

    def __init__(
        self,
        filepath: str,
        source_locale: str = "",
        target_locale: str | None = None,
    ) -> None:
        """Store the input path and locale hints; the file is not read here."""
        self.filepath = filepath
        self.source_locale = source_locale
        self.target_locale = target_locale
        # Base languages are filled in by extract().
        self.source_language: str | None = None
        self.target_language: str | None = None
        self.export_origin = ""
        self.export_timestamp = ""
        self.extensions: dict[str, str] = {"input_format": "html"}

    def extract(self) -> Iterator[ExtractItem]:
        """Parse the file and lazily yield every translatable unit.

        When no source locale was supplied, the root element's ``lang``
        attribute is adopted. ``source_language`` and ``target_language`` are
        derived from the locales as a side effect.
        """
        root = lxml_html.parse(self.filepath).getroot()
        if root is None:
            return

        lang = root.get("lang")
        if lang and not self.source_locale:
            self.source_locale = lang
            self.source_language = self._base_language(lang)
        if self.source_locale and self.source_language is None:
            self.source_language = self._base_language(self.source_locale)
        if self.target_locale and self.target_language is None:
            self.target_language = self._base_language(self.target_locale)

        # Meta units come first; the walk continues the same index sequence.
        index = 0
        for item in self._extract_meta(root, index):
            yield item
            index += 1
        yield from self._walk(root, index)

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator that runs ``extract()`` in a worker thread."""
        return _AsyncHtmlExtraction(self)

    def _extract_meta(
        self, root: HtmlElement, start_index: int
    ) -> Iterator[ExtractItem]:
        """Yield units for non-empty description/keywords ``<meta>`` elements."""
        head = root.find(".//head")
        if head is None:
            return
        index = start_index
        for meta_el in head.iterfind(".//meta"):
            name = (meta_el.get("name") or "").lower()
            content = (meta_el.get("content") or "").strip()
            if name not in ("description", "keywords") or not content:
                continue
            yield f"html:meta.{name}:{index}", Data(
                source=content,
                meta=Meta(),
                status=TranslationStatus.UNKNOWN,
                extensions={"meta_name": name},
            )
            index += 1

    def _walk(
        self, element: HtmlElement, start_index: int
    ) -> Iterator[ExtractItem]:
        """Yield block and ``img.alt`` units in document order.

        ``index`` advances only when a unit is actually yielded.
        """
        index = start_index
        for node in element.iter():
            tag = self._tag_name(node)
            if tag in _SKIP_TAGS:
                continue

            if tag in _BLOCK_TAGS:
                unit = self._extract_block(node, index)
                if unit is not None:
                    yield unit
                    index += 1
            elif tag == "img":
                alt = (node.get("alt") or "").strip()
                if alt:
                    yield f"html:img.alt:{index}", Data(
                        source=alt,
                        meta=Meta(),
                        status=TranslationStatus.UNKNOWN,
                    )
                    index += 1

    def _extract_block(
        self, element: HtmlElement, index: int
    ) -> ExtractItem | None:
        """Build the unit for one block element, or None when it has no text."""
        tag = self._tag_name(element)
        if self._has_inline_children(element):
            return self._extract_with_tags(element, tag, index)

        text = self._get_direct_text(element)
        if not text:
            return None
        return f"html:{tag}:{index}", Data(
            source=text,
            meta=Meta(),
            status=TranslationStatus.UNKNOWN,
        )

    def _has_inline_children(self, element: HtmlElement) -> bool:
        """Return True when a direct child is an inline element."""
        return any(self._tag_name(child) in _INLINE_TAGS for child in element)

    def _extract_with_tags(
        self, element: HtmlElement, tag: str, index: int
    ) -> ExtractItem | None:
        """Build a unit whose inline markup is preserved as tag placeholders."""
        parts: list[TextPart | CodePart] = []
        tag_map: dict[str, TieData] = {}

        full_text = self._build_parts(element, parts, tag_map, 0, 0)
        if not full_text.strip():
            return None

        tags = Tags(
            source_tag_map=tag_map,
            target_tag_map={},
            source_parts=parts,
            target_parts=[],
        )
        return f"html:{tag}:{index}", Data(
            source=full_text.strip(),
            tags=tags,
            meta=Meta(),
            status=TranslationStatus.UNKNOWN,
        )

    def _build_parts(
        self,
        element: HtmlElement,
        parts: list[TextPart | CodePart],
        tag_map: dict[str, TieData],
        tag_order: int,
        pair_counter: int,
    ) -> str:
        """Fill ``parts`` and ``tag_map`` for ``element``; return its plain text.

        ``tag_order`` and ``pair_counter`` are the first tag number and pair
        number to hand out. Numbering is shared across all nesting levels, so
        every placeholder id (``t<N>``) and pair id (``pair<N>``) is unique
        within the unit.
        """
        counters = [tag_order, pair_counter]
        chunks: list[str] = []
        self._append_inline_parts(element, parts, tag_map, counters, chunks)
        return "".join(chunks)

    def _append_inline_parts(
        self,
        element: HtmlElement,
        parts: list[TextPart | CodePart],
        tag_map: dict[str, TieData],
        counters: list[int],
        chunks: list[str],
    ) -> None:
        """Append parts for ``element``'s text and inline children, recursively.

        ``counters`` is ``[next_tag_order, next_pair_number]``. It is a mutable
        list so that nested calls advance the same sequence; passing plain
        ints would let an enclosing close tag reuse an id already handed out
        inside the nested markup.
        """

        def add_text(value: str | None) -> None:
            if value:
                parts.append(TextPart(value=value))
                chunks.append(value)

        add_text(element.text)
        for child in element:
            child_tag = self._tag_name(child)
            if child_tag in _INLINE_TAGS:
                type_info = _TAG_TYPE_MAP.get(child_tag)
                attrs = dict(child.attrib)

                if child_tag in _STANDALONE_TAGS:
                    order = counters[0]
                    counters[0] += 1
                    ref_id = f"t{order}"
                    tag_map[ref_id] = TieData(
                        id=ref_id,
                        type=type_info[0] if type_info else TieType.CUSTOM_STANDALONE,
                        attributes=attrs,
                        position=order,
                        order=order,
                        original_name=child_tag,
                    )
                    parts.append(CodePart(ref=ref_id))
                else:
                    pair_id = f"pair{counters[1]}"
                    counters[1] += 1

                    order = counters[0]
                    counters[0] += 1
                    open_id = f"t{order}"
                    tag_map[open_id] = TieData(
                        id=open_id,
                        type=type_info[0] if type_info else TieType.CUSTOM_OPEN,
                        attributes=attrs,
                        position=order,
                        order=order,
                        pair_id=pair_id,
                        original_name=child_tag,
                    )
                    parts.append(CodePart(ref=open_id))

                    # Recurse on the child itself so that nested inline
                    # elements get their own tags and keep their tail text.
                    self._append_inline_parts(
                        child, parts, tag_map, counters, chunks
                    )

                    order = counters[0]
                    counters[0] += 1
                    close_id = f"t{order}"
                    tag_map[close_id] = TieData(
                        id=close_id,
                        type=(
                            type_info[1]
                            if type_info and type_info[1]
                            else TieType.CUSTOM_CLOSE
                        ),
                        position=order,
                        order=order,
                        pair_id=pair_id,
                        original_name=child_tag,
                    )
                    parts.append(CodePart(ref=close_id))

            # The tail belongs to the enclosing element, so it is kept even
            # when the child itself is skipped (nested block, comment, ...),
            # in line with _get_direct_text().
            add_text(child.tail)

    def _get_direct_text(self, element: HtmlElement) -> str:
        """Return the element's own text plus its children's tails, stripped."""
        pieces = [element.text or ""]
        pieces.extend(child.tail or "" for child in element)
        return "".join(pieces).strip()

    def _tag_name(self, element: HtmlElement) -> str:
        """Return the lower-cased tag name, or "" when the tag is not a string."""
        tag = element.tag
        return tag.lower() if isinstance(tag, str) else ""

    def _base_language(self, locale: str) -> str:
        """Return the lower-cased language subtag, e.g. ``en_US`` -> ``en``."""
        return locale.replace("_", "-").split("-")[0].lower()
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import zipfile
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
7
|
+
|
|
8
|
+
from lxml import etree
|
|
9
|
+
from lxml.etree import _Element
|
|
10
|
+
|
|
11
|
+
from lokit.data.structure import CodePart, Data, Meta, Tags, TextPart, TranslationStatus
|
|
12
|
+
from lokit.data.tag_types import TieData, TieType
|
|
13
|
+
|
|
14
|
+
# One extracted unit as yielded by IdmlExtractor.extract(): (unit_id, Data).
ExtractItem = tuple[str, Data]
|
|
15
|
+
|
|
16
|
+
# Namespace URI associated with the "idPkg" prefix.
# NOTE(review): IDML package parts normally bind idPkg to
# ".../AdobeInDesign/idml/1.0/packaging"; confirm this "idms" URI is intended.
IDML_NS = "http://ns.adobe.com/AdobeInDesign/idms/1.0/"

# Prefix -> namespace mapping for namespace-aware XML lookups.
IDML_NSMAP: dict[str, str] = dict(idPkg=IDML_NS)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(slots=True)
|
|
21
|
+
class _AsyncResult:
|
|
22
|
+
item: Optional[ExtractItem] = None
|
|
23
|
+
error: Optional[BaseException] = None
|
|
24
|
+
done: bool = False
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class _AsyncIdmlExtraction:
|
|
28
|
+
def __init__(self, extractor: IdmlExtractor) -> None:
|
|
29
|
+
self._extractor = extractor
|
|
30
|
+
self._queue: asyncio.Queue[_AsyncResult] = asyncio.Queue()
|
|
31
|
+
self._producer: asyncio.Task[None] | None = None
|
|
32
|
+
|
|
33
|
+
def __aiter__(self) -> _AsyncIdmlExtraction:
|
|
34
|
+
return self
|
|
35
|
+
|
|
36
|
+
async def __anext__(self) -> ExtractItem:
|
|
37
|
+
if self._producer is None:
|
|
38
|
+
self._start()
|
|
39
|
+
result = await self._queue.get()
|
|
40
|
+
if result.done:
|
|
41
|
+
await self._finish()
|
|
42
|
+
raise StopAsyncIteration
|
|
43
|
+
if result.error is not None:
|
|
44
|
+
await self._finish()
|
|
45
|
+
raise result.error
|
|
46
|
+
if result.item is None:
|
|
47
|
+
await self._finish()
|
|
48
|
+
raise StopAsyncIteration
|
|
49
|
+
return result.item
|
|
50
|
+
|
|
51
|
+
def _start(self) -> None:
|
|
52
|
+
loop = asyncio.get_running_loop()
|
|
53
|
+
|
|
54
|
+
def produce() -> None:
|
|
55
|
+
try:
|
|
56
|
+
for item in self._extractor.extract():
|
|
57
|
+
loop.call_soon_threadsafe(
|
|
58
|
+
self._queue.put_nowait,
|
|
59
|
+
_AsyncResult(item=item),
|
|
60
|
+
)
|
|
61
|
+
except BaseException as exc:
|
|
62
|
+
loop.call_soon_threadsafe(
|
|
63
|
+
self._queue.put_nowait,
|
|
64
|
+
_AsyncResult(error=exc),
|
|
65
|
+
)
|
|
66
|
+
finally:
|
|
67
|
+
loop.call_soon_threadsafe(
|
|
68
|
+
self._queue.put_nowait,
|
|
69
|
+
_AsyncResult(done=True),
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
self._producer = asyncio.create_task(asyncio.to_thread(produce))
|
|
73
|
+
|
|
74
|
+
async def _finish(self) -> None:
|
|
75
|
+
if self._producer is not None:
|
|
76
|
+
await self._producer
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class IdmlExtractor:
    """Extract translatable paragraphs from the story files of an IDML package.

    The package is opened as a zip archive. Every ``Stories/Story_*.xml``
    member is parsed and each ``ParagraphStyleRange`` that contains text is
    yielded as one ``(unit_id, Data)`` pair, with
    ``unit_id = "<story name>:p<index>"``.
    """

    def __init__(
        self,
        filepath: str,
        source_locale: str = "",
        target_locale: str | None = None,
    ) -> None:
        """Store the input path and locale hints; the file is not read here."""
        self.filepath = filepath
        self.source_locale = source_locale
        self.target_locale = target_locale
        # Base languages are filled in by extract().
        self.source_language: str | None = None
        self.target_language: str | None = None
        self.export_origin = ""
        self.export_timestamp = ""
        self.extensions: dict[str, str] = {"input_format": "idml"}

    def extract(self) -> Iterator[ExtractItem]:
        """Lazily yield the units of every story, in sorted member-name order."""
        if self.source_locale and self.source_language is None:
            self.source_language = self._base_language(self.source_locale)
        if self.target_locale and self.target_language is None:
            self.target_language = self._base_language(self.target_locale)

        with zipfile.ZipFile(self.filepath, "r") as zf:
            story_files = sorted(
                name
                for name in zf.namelist()
                if name.startswith("Stories/Story_") and name.endswith(".xml")
            )
            for story_file in story_files:
                story_name = _story_name(story_file)
                with zf.open(story_file) as stream:
                    root = etree.parse(stream).getroot()
                yield from self._extract_story(root, story_name, story_file)

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator that runs ``extract()`` in a worker thread."""
        return _AsyncIdmlExtraction(self)

    def _extract_story(
        self,
        root: _Element,
        story_name: str,
        story_file: str,
    ) -> Iterator[ExtractItem]:
        """Yield one unit per non-empty ``ParagraphStyleRange`` in the story.

        The paragraph index advances only for paragraphs that produce a unit.
        """
        paragraph_index = 0
        for node in root.iter():
            # Comment and processing-instruction nodes expose a callable
            # ``tag``; they are not elements and must not reach _local_name().
            if not isinstance(node.tag, (str, bytes)):
                continue
            if _local_name(node.tag) != "ParagraphStyleRange":
                continue

            unit = self._extract_paragraph(
                node, story_name, story_file, paragraph_index
            )
            if unit is not None:
                yield unit
                paragraph_index += 1

    def _extract_paragraph(
        self,
        psr: _Element,
        story_name: str,
        story_file: str,
        paragraph_index: int,
    ) -> ExtractItem | None:
        """Build the unit for one paragraph, or None when it holds no text.

        A paragraph with a single ``CharacterStyleRange`` becomes plain text;
        several ranges are handled by ``_extract_styled_paragraph``.
        """
        char_ranges: list[_Element] = [
            el
            for el in psr
            if isinstance(el.tag, (str, bytes))
            and _local_name(el.tag) == "CharacterStyleRange"
        ]
        if not char_ranges:
            return None
        if len(char_ranges) > 1:
            return self._extract_styled_paragraph(
                char_ranges, story_name, story_file, paragraph_index
            )

        text = _collect_content_text(char_ranges[0]).strip()
        if not text:
            return None
        return f"{story_name}:p{paragraph_index}", Data(
            source=text,
            meta=Meta(),
            status=TranslationStatus.UNKNOWN,
            extensions={"story": story_file, "input_format": "idml"},
        )

    def _extract_styled_paragraph(
        self,
        char_ranges: list[_Element],
        story_name: str,
        story_file: str,
        paragraph_index: int,
    ) -> ExtractItem | None:
        """Build a unit in which styled ranges are wrapped in open/close tags.

        A range counts as styled when ``AppliedCharacterStyle`` is set and is
        not the "[No character style]" default. Ranges without text are
        skipped.
        """
        parts: list[TextPart | CodePart] = []
        tag_map: dict[str, TieData] = {}
        text_chunks: list[str] = []
        tag_order = 0
        pair_counter = 0

        for csr in char_ranges:
            style = csr.get("AppliedCharacterStyle") or ""
            text = _collect_content_text(csr)
            if not text:
                continue

            styled = bool(style) and style != "CharacterStyle/$ID/[No character style]"
            if not styled:
                parts.append(TextPart(value=text))
                text_chunks.append(text)
                continue

            pair_id = f"pair{pair_counter}"
            pair_counter += 1

            open_id = f"t{tag_order}"
            tag_map[open_id] = TieData(
                id=open_id,
                type=TieType.CUSTOM_OPEN,
                attributes={"style": style},
                position=tag_order,
                order=tag_order,
                pair_id=pair_id,
                original_name="CharacterStyleRange",
            )
            parts.append(CodePart(ref=open_id))
            tag_order += 1

            parts.append(TextPart(value=text))
            text_chunks.append(text)

            close_id = f"t{tag_order}"
            tag_map[close_id] = TieData(
                id=close_id,
                type=TieType.CUSTOM_CLOSE,
                position=tag_order,
                order=tag_order,
                pair_id=pair_id,
                original_name="CharacterStyleRange",
            )
            parts.append(CodePart(ref=close_id))
            tag_order += 1

        full_text = "".join(text_chunks)
        if not full_text.strip():
            return None

        tags = Tags(
            source_tag_map=tag_map,
            target_tag_map={},
            source_parts=parts,
            target_parts=[],
        )
        return f"{story_name}:p{paragraph_index}", Data(
            source=full_text.strip(),
            # Tag data is attached only when at least one range was styled.
            tags=tags if tag_map else None,
            meta=Meta(),
            status=TranslationStatus.UNKNOWN,
            extensions={"story": story_file, "input_format": "idml"},
        )

    def _base_language(self, locale: str) -> str:
        """Return the lower-cased language subtag, e.g. ``en_US`` -> ``en``."""
        return locale.replace("_", "-").split("-")[0].lower()
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _local_name(tag: str | bytes) -> str:
|
|
242
|
+
name = tag if isinstance(tag, str) else tag.decode("utf-8")
|
|
243
|
+
if "}" in name:
|
|
244
|
+
return name.split("}", 1)[1]
|
|
245
|
+
return name
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def _story_name(story_file: str) -> str:
|
|
249
|
+
name = story_file
|
|
250
|
+
if name.startswith("Stories/"):
|
|
251
|
+
name = name[len("Stories/"):]
|
|
252
|
+
if name.endswith(".xml"):
|
|
253
|
+
name = name[: -len(".xml")]
|
|
254
|
+
return name
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _collect_content_text(element: _Element) -> str:
    """Return the text of every ``Content`` descendant, with ``Br`` as "\\n".

    ``element`` itself is included in the walk. Comment and
    processing-instruction nodes are skipped, but text that follows such a
    node inside a ``Content`` element is kept: the parser stores it as that
    node's tail, not as part of ``Content.text``.
    """
    pieces: list[str] = []
    for node in element.iter():
        tag = node.tag
        if not isinstance(tag, (str, bytes)):
            # Not an element (its tag is a callable, not a name).
            continue
        name = _local_name(tag)
        if name == "Content":
            if node.text:
                pieces.append(node.text)
            pieces.extend(
                sub.tail
                for sub in node
                if sub.tail and not isinstance(sub.tag, (str, bytes))
            )
        elif name == "Br":
            pieces.append("\n")
    return "".join(pieces)
|
|
Binary file
|
|
Binary file
|