lokit-python 0.1.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. 821d8b73c2a02cb7980f__mypyc.cp313-win_amd64.pyd +0 -0
  2. lokit/__init__.cp313-win_amd64.pyd +0 -0
  3. lokit/__init__.py +128 -0
  4. lokit/core/__init__.cp313-win_amd64.pyd +0 -0
  5. lokit/core/__init__.py +0 -0
  6. lokit/core/logger.cp313-win_amd64.pyd +0 -0
  7. lokit/core/logger.py +20 -0
  8. lokit/data/__init__.cp313-win_amd64.pyd +0 -0
  9. lokit/data/__init__.py +0 -0
  10. lokit/data/lang_codes.cp313-win_amd64.pyd +0 -0
  11. lokit/data/lang_codes.py +455 -0
  12. lokit/data/structure.cp313-win_amd64.pyd +0 -0
  13. lokit/data/structure.py +118 -0
  14. lokit/data/tag_types.cp313-win_amd64.pyd +0 -0
  15. lokit/data/tag_types.py +78 -0
  16. lokit/exporters/__init__.cp313-win_amd64.pyd +0 -0
  17. lokit/exporters/__init__.py +34 -0
  18. lokit/exporters/csv.cp313-win_amd64.pyd +0 -0
  19. lokit/exporters/csv.py +32 -0
  20. lokit/exporters/html.cp313-win_amd64.pyd +0 -0
  21. lokit/exporters/html.py +217 -0
  22. lokit/exporters/idml.cp313-win_amd64.pyd +0 -0
  23. lokit/exporters/idml.py +178 -0
  24. lokit/exporters/json_i18n.cp313-win_amd64.pyd +0 -0
  25. lokit/exporters/json_i18n.py +47 -0
  26. lokit/exporters/po.cp313-win_amd64.pyd +0 -0
  27. lokit/exporters/po.py +162 -0
  28. lokit/exporters/tmx.cp313-win_amd64.pyd +0 -0
  29. lokit/exporters/tmx.py +247 -0
  30. lokit/exporters/xliff.cp313-win_amd64.pyd +0 -0
  31. lokit/exporters/xliff.py +152 -0
  32. lokit/exporters/xlsx.cp313-win_amd64.pyd +0 -0
  33. lokit/exporters/xlsx.py +39 -0
  34. lokit/format_detection.cp313-win_amd64.pyd +0 -0
  35. lokit/format_detection.py +115 -0
  36. lokit/importers.py +321 -0
  37. lokit/io/__init__.cp313-win_amd64.pyd +0 -0
  38. lokit/io/__init__.py +3 -0
  39. lokit/io/json.cp313-win_amd64.pyd +0 -0
  40. lokit/io/json.py +194 -0
  41. lokit/logic.cp313-win_amd64.pyd +0 -0
  42. lokit/logic.py +324 -0
  43. lokit/parsers/__init__.cp313-win_amd64.pyd +0 -0
  44. lokit/parsers/__init__.py +1 -0
  45. lokit/parsers/csv/__init__.cp313-win_amd64.pyd +0 -0
  46. lokit/parsers/csv/__init__.py +1 -0
  47. lokit/parsers/csv/extraction.cp313-win_amd64.pyd +0 -0
  48. lokit/parsers/csv/extraction.py +164 -0
  49. lokit/parsers/html/__init__.cp313-win_amd64.pyd +0 -0
  50. lokit/parsers/html/__init__.py +3 -0
  51. lokit/parsers/html/extraction.cp313-win_amd64.pyd +0 -0
  52. lokit/parsers/html/extraction.py +365 -0
  53. lokit/parsers/idml/__init__.cp313-win_amd64.pyd +0 -0
  54. lokit/parsers/idml/__init__.py +3 -0
  55. lokit/parsers/idml/extraction.cp313-win_amd64.pyd +0 -0
  56. lokit/parsers/idml/extraction.py +264 -0
  57. lokit/parsers/json_i18n/__init__.cp313-win_amd64.pyd +0 -0
  58. lokit/parsers/json_i18n/__init__.py +3 -0
  59. lokit/parsers/json_i18n/extraction.cp313-win_amd64.pyd +0 -0
  60. lokit/parsers/json_i18n/extraction.py +163 -0
  61. lokit/parsers/po/__init__.cp313-win_amd64.pyd +0 -0
  62. lokit/parsers/po/__init__.py +3 -0
  63. lokit/parsers/po/extraction.cp313-win_amd64.pyd +0 -0
  64. lokit/parsers/po/extraction.py +236 -0
  65. lokit/parsers/tmx/__init__.cp313-win_amd64.pyd +0 -0
  66. lokit/parsers/tmx/__init__.py +0 -0
  67. lokit/parsers/tmx/base.cp313-win_amd64.pyd +0 -0
  68. lokit/parsers/tmx/base.py +145 -0
  69. lokit/parsers/tmx/extraction.cp313-win_amd64.pyd +0 -0
  70. lokit/parsers/tmx/extraction.py +170 -0
  71. lokit/parsers/tmx/header.cp313-win_amd64.pyd +0 -0
  72. lokit/parsers/tmx/header.py +55 -0
  73. lokit/parsers/tmx/helpers.cp313-win_amd64.pyd +0 -0
  74. lokit/parsers/tmx/helpers.py +9 -0
  75. lokit/parsers/tmx/models.cp313-win_amd64.pyd +0 -0
  76. lokit/parsers/tmx/models.py +10 -0
  77. lokit/parsers/tmx/props.cp313-win_amd64.pyd +0 -0
  78. lokit/parsers/tmx/props.py +201 -0
  79. lokit/parsers/tmx/tags.cp313-win_amd64.pyd +0 -0
  80. lokit/parsers/tmx/tags.py +59 -0
  81. lokit/parsers/tmx/xml_utils.cp313-win_amd64.pyd +0 -0
  82. lokit/parsers/tmx/xml_utils.py +46 -0
  83. lokit/parsers/xliff/__init__.cp313-win_amd64.pyd +0 -0
  84. lokit/parsers/xliff/__init__.py +3 -0
  85. lokit/parsers/xliff/extraction.cp313-win_amd64.pyd +0 -0
  86. lokit/parsers/xliff/extraction.py +229 -0
  87. lokit/parsers/xliff/tags.cp313-win_amd64.pyd +0 -0
  88. lokit/parsers/xliff/tags.py +128 -0
  89. lokit/parsers/xlsx/__init__.cp313-win_amd64.pyd +0 -0
  90. lokit/parsers/xlsx/__init__.py +1 -0
  91. lokit/parsers/xlsx/extraction.cp313-win_amd64.pyd +0 -0
  92. lokit/parsers/xlsx/extraction.py +198 -0
  93. lokit/py.typed +1 -0
  94. lokit_python-0.1.0.dist-info/METADATA +149 -0
  95. lokit_python-0.1.0.dist-info/RECORD +97 -0
  96. lokit_python-0.1.0.dist-info/WHEEL +5 -0
  97. lokit_python-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,365 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from dataclasses import dataclass
5
+ from typing import AsyncIterator, Iterator, Optional
6
+
7
+ from lxml import html as lxml_html
8
+ from lxml.html import HtmlElement
9
+
10
+ from lokit.data.structure import CodePart, Data, Meta, Tags, TextPart, TranslationStatus
11
+ from lokit.data.tag_types import TieData, TieType
12
+
13
+ ExtractItem = tuple[str, Data]  # (unit_id, Data) pair, the shape yielded by HtmlExtractor.extract()
14
+
15
+ _BLOCK_TAGS: frozenset[str] = frozenset({
16
+ "p", "h1", "h2", "h3", "h4", "h5", "h6",
17
+ "li", "td", "th", "dt", "dd", "caption",
18
+ "figcaption", "blockquote", "label", "option", "title",
19
+ })
20
+
21
+ _INLINE_TAGS: frozenset[str] = frozenset({
22
+ "b", "i", "em", "strong", "a", "span", "u", "s",
23
+ "small", "mark", "code", "sub", "sup", "abbr", "q",
24
+ "cite", "dfn", "kbd", "samp", "var", "br", "img", "wbr",
25
+ })
26
+
27
+ _SKIP_TAGS: frozenset[str] = frozenset({"script", "style"})
28
+
29
+ _STANDALONE_TAGS: frozenset[str] = frozenset({"br", "img", "wbr"})
30
+
31
+ _TAG_TYPE_MAP: dict[str, tuple[TieType, TieType | None]] = {  # lowercase tag name -> (open-or-standalone type, close type or None)
32
+ "a": (TieType.A_OPEN, TieType.A_CLOSE),
33
+ "abbr": (TieType.ABBR_OPEN, TieType.ABBR_CLOSE),
34
+ "b": (TieType.B_OPEN, TieType.B_CLOSE),
35
+ "bdi": (TieType.BDI_OPEN, TieType.BDI_CLOSE),
36
+ "bdo": (TieType.BDO_OPEN, TieType.BDO_CLOSE),
37
+ "br": (TieType.BR, None),  # standalone tag: no close type
38
+ "cite": (TieType.CITE_OPEN, TieType.CITE_CLOSE),
39
+ "code": (TieType.CODE_OPEN, TieType.CODE_CLOSE),
40
+ "dfn": (TieType.DFN_OPEN, TieType.DFN_CLOSE),
41
+ "em": (TieType.EM_OPEN, TieType.EM_CLOSE),
42
+ "i": (TieType.I_OPEN, TieType.I_CLOSE),
43
+ "img": (TieType.IMG, None),  # standalone tag: no close type
44
+ "kbd": (TieType.KBD_OPEN, TieType.KBD_CLOSE),
45
+ "mark": (TieType.MARK_OPEN, TieType.MARK_CLOSE),
46
+ "q": (TieType.Q_OPEN, TieType.Q_CLOSE),
47
+ "s": (TieType.S_OPEN, TieType.S_CLOSE),
48
+ "samp": (TieType.SAMP_OPEN, TieType.SAMP_CLOSE),
49
+ "small": (TieType.SMALL_OPEN, TieType.SMALL_CLOSE),
50
+ "span": (TieType.SPAN_OPEN, TieType.SPAN_CLOSE),
51
+ "strong": (TieType.STRONG_OPEN, TieType.STRONG_CLOSE),
52
+ "sub": (TieType.SUB_OPEN, TieType.SUB_CLOSE),
53
+ "sup": (TieType.SUP_OPEN, TieType.SUP_CLOSE),
54
+ "u": (TieType.U_OPEN, TieType.U_CLOSE),
55
+ "var": (TieType.VAR_OPEN, TieType.VAR_CLOSE),
56
+ "wbr": (TieType.WBR, None),  # standalone tag: no close type
57
+ }
58
+
59
+
60
+ @dataclass(slots=True)
61
+ class _AsyncResult:
62
+ item: Optional[ExtractItem] = None
63
+ error: Optional[BaseException] = None
64
+ done: bool = False
65
+
66
+
67
+ class _AsyncHtmlExtraction:
68
+ def __init__(self, extractor: HtmlExtractor) -> None:
69
+ self._extractor = extractor
70
+ self._queue: asyncio.Queue[_AsyncResult] = asyncio.Queue()
71
+ self._producer: asyncio.Task[None] | None = None
72
+
73
+ def __aiter__(self) -> _AsyncHtmlExtraction:
74
+ return self
75
+
76
+ async def __anext__(self) -> ExtractItem:
77
+ if self._producer is None:
78
+ self._start()
79
+ result = await self._queue.get()
80
+ if result.done:
81
+ await self._finish()
82
+ raise StopAsyncIteration
83
+ if result.error is not None:
84
+ await self._finish()
85
+ raise result.error
86
+ if result.item is None:
87
+ await self._finish()
88
+ raise StopAsyncIteration
89
+ return result.item
90
+
91
+ def _start(self) -> None:
92
+ loop = asyncio.get_running_loop()
93
+
94
+ def produce() -> None:
95
+ try:
96
+ for item in self._extractor.extract():
97
+ loop.call_soon_threadsafe(
98
+ self._queue.put_nowait,
99
+ _AsyncResult(item=item),
100
+ )
101
+ except BaseException as exc:
102
+ loop.call_soon_threadsafe(
103
+ self._queue.put_nowait,
104
+ _AsyncResult(error=exc),
105
+ )
106
+ finally:
107
+ loop.call_soon_threadsafe(
108
+ self._queue.put_nowait,
109
+ _AsyncResult(done=True),
110
+ )
111
+
112
+ self._producer = asyncio.create_task(asyncio.to_thread(produce))
113
+
114
+ async def _finish(self) -> None:
115
+ if self._producer is not None:
116
+ await self._producer
117
+
118
+
119
class HtmlExtractor:
    """Extract translation units from an HTML file.

    ``extract()`` yields ``(unit_id, Data)`` pairs for selected ``<meta>``
    contents, for every element listed in ``_BLOCK_TAGS`` and for ``<img>``
    ``alt`` texts. Inline markup inside a block is recorded as tag parts.
    """

    def __init__(
        self,
        filepath: str,
        source_locale: str = "",
        target_locale: str | None = None,
    ) -> None:
        self.filepath = filepath
        self.source_locale = source_locale
        self.target_locale = target_locale
        # Base languages are derived from the locales when extract() runs.
        self.source_language: str | None = None
        self.target_language: str | None = None
        self.export_origin = ""
        self.export_timestamp = ""
        self.extensions: dict[str, str] = {"input_format": "html"}

    def extract(self) -> Iterator[ExtractItem]:
        """Parse ``self.filepath`` and yield its units in document order.

        If no source locale was given, the root element's ``lang`` attribute
        is used. Yields nothing when the parsed document has no root.
        """
        doc = lxml_html.parse(self.filepath)
        root = doc.getroot()
        if root is None:
            return

        lang = root.get("lang")
        if lang and not self.source_locale:
            self.source_locale = lang
            self.source_language = self._base_language(lang)
        if self.source_locale and self.source_language is None:
            self.source_language = self._base_language(self.source_locale)
        if self.target_locale and self.target_language is None:
            self.target_language = self._base_language(self.target_locale)

        # The running index is shared so meta units and body units never
        # reuse the same number.
        index = 0
        for item in self._extract_meta(root, index):
            yield item
            index += 1

        yield from self._walk(root, index)

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator over the same units as ``extract()``."""
        return _AsyncHtmlExtraction(self)

    def _extract_meta(
        self, root: HtmlElement, start_index: int
    ) -> Iterator[ExtractItem]:
        """Yield units for non-empty ``description``/``keywords`` metas."""
        index = start_index
        head = root.find(".//head")
        if head is None:
            return
        for meta_el in head.iterfind(".//meta"):
            name = (meta_el.get("name") or "").lower()
            content = (meta_el.get("content") or "").strip()
            if name not in ("description", "keywords") or not content:
                continue
            yield f"html:meta.{name}:{index}", Data(
                source=content,
                meta=Meta(),
                status=TranslationStatus.UNKNOWN,
                extensions={"meta_name": name},
            )
            index += 1

    def _walk(
        self, element: HtmlElement, start_index: int
    ) -> Iterator[ExtractItem]:
        """Yield block units and ``img`` alt units below ``element``."""
        index = start_index
        for child in element.iter():
            tag = self._tag_name(child)
            if tag in _SKIP_TAGS:
                continue

            if tag in _BLOCK_TAGS:
                result = self._extract_block(child, index)
                if result is not None:
                    yield result
                    index += 1

            if tag == "img":
                alt = (child.get("alt") or "").strip()
                if alt:
                    yield f"html:img.alt:{index}", Data(
                        source=alt,
                        meta=Meta(),
                        status=TranslationStatus.UNKNOWN,
                    )
                    index += 1

    def _extract_block(
        self, element: HtmlElement, index: int
    ) -> ExtractItem | None:
        """Build the unit for one block element, or ``None`` if it is empty."""
        tag = self._tag_name(element)
        if self._has_inline_children(element):
            return self._extract_with_tags(element, tag, index)

        text = self._get_direct_text(element)
        if not text:
            return None

        return f"html:{tag}:{index}", Data(
            source=text,
            meta=Meta(),
            status=TranslationStatus.UNKNOWN,
        )

    def _has_inline_children(self, element: HtmlElement) -> bool:
        """Return True if any direct child is listed in ``_INLINE_TAGS``."""
        return any(self._tag_name(child) in _INLINE_TAGS for child in element)

    def _extract_with_tags(
        self, element: HtmlElement, tag: str, index: int
    ) -> ExtractItem | None:
        """Build a unit whose inline markup is captured as tag parts."""
        parts: list[TextPart | CodePart] = []
        tag_map: dict[str, TieData] = {}

        full_text = self._build_parts(element, parts, tag_map, 0, 0)
        if not full_text.strip():
            return None

        tags = Tags(
            source_tag_map=tag_map,
            target_tag_map={},
            source_parts=parts,
            target_parts=[],
        )
        return f"html:{tag}:{index}", Data(
            source=full_text.strip(),
            tags=tags,
            meta=Meta(),
            status=TranslationStatus.UNKNOWN,
        )

    def _build_parts(
        self,
        element: HtmlElement,
        parts: list[TextPart | CodePart],
        tag_map: dict[str, TieData],
        tag_order: int,
        pair_counter: int,
    ) -> str:
        """Fill ``parts``/``tag_map`` from ``element`` and return its text.

        ``tag_order`` and ``pair_counter`` are the first free tag number and
        pair number. They are kept in one mutable list shared by the whole
        recursion, so ids created for nested inline elements never collide
        with ids created by an outer level.
        """
        counters = [tag_order, pair_counter]
        chunks: list[str] = []
        self._append_content(element, parts, tag_map, counters, chunks)
        return "".join(chunks)

    def _append_content(
        self,
        element: HtmlElement,
        parts: list[TextPart | CodePart],
        tag_map: dict[str, TieData],
        counters: list[int],
        chunks: list[str],
    ) -> None:
        """Append ``element``'s leading text and each child with its tail."""
        if element.text:
            parts.append(TextPart(value=element.text))
            chunks.append(element.text)

        for child in element:
            child_tag = self._tag_name(child)
            if child_tag in _INLINE_TAGS:
                self._append_inline(
                    child, child_tag, parts, tag_map, counters, chunks
                )
            # The tail is text of *this* element that follows the child, so
            # it is kept even when the child itself (a comment, a non-inline
            # element) is skipped.
            if child.tail:
                parts.append(TextPart(value=child.tail))
                chunks.append(child.tail)

    def _append_inline(
        self,
        child: HtmlElement,
        child_tag: str,
        parts: list[TextPart | CodePart],
        tag_map: dict[str, TieData],
        counters: list[int],
        chunks: list[str],
    ) -> None:
        """Append one inline element: a standalone tag or an open/close pair."""
        type_info = _TAG_TYPE_MAP.get(child_tag)
        attrs = dict(child.attrib)

        if child_tag in _STANDALONE_TAGS:
            order = counters[0]
            counters[0] += 1
            ref_id = f"t{order}"
            tag_map[ref_id] = TieData(
                id=ref_id,
                type=type_info[0] if type_info else TieType.CUSTOM_STANDALONE,
                attributes=attrs,
                position=order,
                order=order,
                original_name=child_tag,
            )
            parts.append(CodePart(ref=ref_id))
            return

        pair_id = f"pair{counters[1]}"
        counters[1] += 1

        open_order = counters[0]
        counters[0] += 1
        open_id = f"t{open_order}"
        tag_map[open_id] = TieData(
            id=open_id,
            type=type_info[0] if type_info else TieType.CUSTOM_OPEN,
            attributes=attrs,
            position=open_order,
            order=open_order,
            pair_id=pair_id,
            original_name=child_tag,
        )
        parts.append(CodePart(ref=open_id))

        # Recurse so nested inline elements get their own tags and tails.
        self._append_content(child, parts, tag_map, counters, chunks)

        close_order = counters[0]
        counters[0] += 1
        close_id = f"t{close_order}"
        tag_map[close_id] = TieData(
            id=close_id,
            type=(
                type_info[1]
                if type_info and type_info[1]
                else TieType.CUSTOM_CLOSE
            ),
            position=close_order,
            order=close_order,
            pair_id=pair_id,
            original_name=child_tag,
        )
        parts.append(CodePart(ref=close_id))

    def _get_direct_text(self, element: HtmlElement) -> str:
        """Return the element's own text plus its children's tails, stripped."""
        pieces: list[str] = []
        if element.text:
            pieces.append(element.text)
        pieces.extend(child.tail for child in element if child.tail)
        return "".join(pieces).strip()

    def _tag_name(self, element: HtmlElement) -> str:
        """Return the lowercase tag name, or ``""`` when the tag is not a str."""
        tag = element.tag
        return tag.lower() if isinstance(tag, str) else ""

    def _base_language(self, locale: str) -> str:
        """Return the lowercase part of ``locale`` before the first ``-``/``_``."""
        return locale.replace("_", "-").split("-")[0].lower()
@@ -0,0 +1,3 @@
1
+ from lokit.parsers.idml.extraction import IdmlExtractor
2
+
3
+ __all__ = ["IdmlExtractor"]
@@ -0,0 +1,264 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import zipfile
5
+ from dataclasses import dataclass
6
+ from typing import AsyncIterator, Iterator, Optional
7
+
8
+ from lxml import etree
9
+ from lxml.etree import _Element
10
+
11
+ from lokit.data.structure import CodePart, Data, Meta, Tags, TextPart, TranslationStatus
12
+ from lokit.data.tag_types import TieData, TieType
13
+
14
+ ExtractItem = tuple[str, Data]  # (unit_id, Data) pair, the shape yielded by IdmlExtractor.extract()
15
+
16
+ IDML_NS = "http://ns.adobe.com/AdobeInDesign/idms/1.0/"  # NOTE(review): confirm this URI against the IDML specification; the visible code matches elements by local name only
17
+ IDML_NSMAP: dict[str, str] = {"idPkg": IDML_NS}  # prefix -> namespace map; not referenced by the code visible in this file
18
+
19
+
20
+ @dataclass(slots=True)
21
+ class _AsyncResult:
22
+ item: Optional[ExtractItem] = None
23
+ error: Optional[BaseException] = None
24
+ done: bool = False
25
+
26
+
27
+ class _AsyncIdmlExtraction:
28
+ def __init__(self, extractor: IdmlExtractor) -> None:
29
+ self._extractor = extractor
30
+ self._queue: asyncio.Queue[_AsyncResult] = asyncio.Queue()
31
+ self._producer: asyncio.Task[None] | None = None
32
+
33
+ def __aiter__(self) -> _AsyncIdmlExtraction:
34
+ return self
35
+
36
+ async def __anext__(self) -> ExtractItem:
37
+ if self._producer is None:
38
+ self._start()
39
+ result = await self._queue.get()
40
+ if result.done:
41
+ await self._finish()
42
+ raise StopAsyncIteration
43
+ if result.error is not None:
44
+ await self._finish()
45
+ raise result.error
46
+ if result.item is None:
47
+ await self._finish()
48
+ raise StopAsyncIteration
49
+ return result.item
50
+
51
+ def _start(self) -> None:
52
+ loop = asyncio.get_running_loop()
53
+
54
+ def produce() -> None:
55
+ try:
56
+ for item in self._extractor.extract():
57
+ loop.call_soon_threadsafe(
58
+ self._queue.put_nowait,
59
+ _AsyncResult(item=item),
60
+ )
61
+ except BaseException as exc:
62
+ loop.call_soon_threadsafe(
63
+ self._queue.put_nowait,
64
+ _AsyncResult(error=exc),
65
+ )
66
+ finally:
67
+ loop.call_soon_threadsafe(
68
+ self._queue.put_nowait,
69
+ _AsyncResult(done=True),
70
+ )
71
+
72
+ self._producer = asyncio.create_task(asyncio.to_thread(produce))
73
+
74
+ async def _finish(self) -> None:
75
+ if self._producer is not None:
76
+ await self._producer
77
+
78
+
79
class IdmlExtractor:
    """Extract translation units from the story files of an IDML archive.

    ``extract()`` opens ``filepath`` as a zip file, reads every
    ``Stories/Story_*.xml`` member in sorted name order and yields one
    ``(unit_id, Data)`` pair per non-empty ``ParagraphStyleRange``.
    """

    def __init__(
        self,
        filepath: str,
        source_locale: str = "",
        target_locale: str | None = None,
    ) -> None:
        self.filepath = filepath
        self.source_locale = source_locale
        self.target_locale = target_locale
        # Base languages are derived from the locales when extract() runs.
        self.source_language: str | None = None
        self.target_language: str | None = None
        self.export_origin = ""
        self.export_timestamp = ""
        self.extensions: dict[str, str] = {"input_format": "idml"}

    def extract(self) -> Iterator[ExtractItem]:
        """Yield the units of every story file, story by story."""
        if self.source_locale and self.source_language is None:
            self.source_language = self._base_language(self.source_locale)
        if self.target_locale and self.target_language is None:
            self.target_language = self._base_language(self.target_locale)

        with zipfile.ZipFile(self.filepath, "r") as zf:
            story_files = sorted(
                name
                for name in zf.namelist()
                if name.startswith("Stories/Story_") and name.endswith(".xml")
            )
            for story_file in story_files:
                story_name = _story_name(story_file)
                with zf.open(story_file) as stream:
                    root = etree.parse(stream).getroot()
                yield from self._extract_story(root, story_name, story_file)

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator over the same units as ``extract()``."""
        return _AsyncIdmlExtraction(self)

    def _extract_story(
        self,
        root: _Element,
        story_name: str,
        story_file: str,
    ) -> Iterator[ExtractItem]:
        """Yield one unit per non-empty ``ParagraphStyleRange`` under ``root``.

        The paragraph index only advances for paragraphs that produced a
        unit, so unit ids are numbered without gaps.
        """
        paragraph_index = 0
        for psr in root.iter():
            tag = psr.tag
            # Comments and processing instructions have a callable ``tag``;
            # skip them instead of handing them to ``_local_name``.
            if not isinstance(tag, (str, bytes)):
                continue
            if _local_name(tag) != "ParagraphStyleRange":
                continue

            result = self._extract_paragraph(
                psr, story_name, story_file, paragraph_index
            )
            if result is not None:
                yield result
                paragraph_index += 1

    def _extract_paragraph(
        self,
        psr: _Element,
        story_name: str,
        story_file: str,
        paragraph_index: int,
    ) -> ExtractItem | None:
        """Build the unit for one paragraph, or ``None`` if it has no text.

        A paragraph with a single ``CharacterStyleRange`` child becomes a
        plain-text unit; several ranges are delegated to
        ``_extract_styled_paragraph``.
        """
        char_ranges: list[_Element] = [
            el
            for el in psr
            if isinstance(el.tag, (str, bytes))
            and _local_name(el.tag) == "CharacterStyleRange"
        ]
        if not char_ranges:
            return None

        if len(char_ranges) > 1:
            return self._extract_styled_paragraph(
                char_ranges, story_name, story_file, paragraph_index
            )

        text = _collect_content_text(char_ranges[0]).strip()
        if not text:
            return None
        return f"{story_name}:p{paragraph_index}", Data(
            source=text,
            meta=Meta(),
            status=TranslationStatus.UNKNOWN,
            extensions={"story": story_file, "input_format": "idml"},
        )

    def _extract_styled_paragraph(
        self,
        char_ranges: list[_Element],
        story_name: str,
        story_file: str,
        paragraph_index: int,
    ) -> ExtractItem | None:
        """Build a unit from several character ranges.

        Ranges with an applied character style other than
        ``CharacterStyle/$ID/[No character style]`` are wrapped in a
        ``CUSTOM_OPEN``/``CUSTOM_CLOSE`` tag pair; other ranges contribute
        plain text. Ranges without text are ignored.
        """
        parts: list[TextPart | CodePart] = []
        tag_map: dict[str, TieData] = {}
        full_text_parts: list[str] = []
        tag_order = 0
        pair_counter = 0

        for csr in char_ranges:
            style = csr.get("AppliedCharacterStyle") or ""
            text = _collect_content_text(csr)
            if not text:
                continue

            styled = bool(style) and style != "CharacterStyle/$ID/[No character style]"
            if not styled:
                parts.append(TextPart(value=text))
                full_text_parts.append(text)
                continue

            pair_id = f"pair{pair_counter}"
            pair_counter += 1

            open_id = f"t{tag_order}"
            tag_map[open_id] = TieData(
                id=open_id,
                type=TieType.CUSTOM_OPEN,
                attributes={"style": style},
                position=tag_order,
                order=tag_order,
                pair_id=pair_id,
                original_name="CharacterStyleRange",
            )
            parts.append(CodePart(ref=open_id))
            tag_order += 1

            parts.append(TextPart(value=text))
            full_text_parts.append(text)

            close_id = f"t{tag_order}"
            tag_map[close_id] = TieData(
                id=close_id,
                type=TieType.CUSTOM_CLOSE,
                position=tag_order,
                order=tag_order,
                pair_id=pair_id,
                original_name="CharacterStyleRange",
            )
            parts.append(CodePart(ref=close_id))
            tag_order += 1

        full_text = "".join(full_text_parts)
        if not full_text.strip():
            return None

        tags = Tags(
            source_tag_map=tag_map,
            target_tag_map={},
            source_parts=parts,
            target_parts=[],
        )
        return f"{story_name}:p{paragraph_index}", Data(
            source=full_text.strip(),
            # No styled range means no tags worth attaching.
            tags=tags if tag_map else None,
            meta=Meta(),
            status=TranslationStatus.UNKNOWN,
            extensions={"story": story_file, "input_format": "idml"},
        )

    def _base_language(self, locale: str) -> str:
        """Return the lowercase part of ``locale`` before the first ``-``/``_``."""
        return locale.replace("_", "-").split("-")[0].lower()
239
+
240
+
241
+ def _local_name(tag: str | bytes) -> str:
242
+ name = tag if isinstance(tag, str) else tag.decode("utf-8")
243
+ if "}" in name:
244
+ return name.split("}", 1)[1]
245
+ return name
246
+
247
+
248
+ def _story_name(story_file: str) -> str:
249
+ name = story_file
250
+ if name.startswith("Stories/"):
251
+ name = name[len("Stories/"):]
252
+ if name.endswith(".xml"):
253
+ name = name[: -len(".xml")]
254
+ return name
255
+
256
+
257
def _collect_content_text(element: _Element) -> str:
    """Concatenate the text of all ``Content`` nodes under ``element``.

    Every ``Br`` node contributes a newline. Nodes are visited in document
    order, including ``element`` itself.
    """
    parts: list[str] = []
    for node in element.iter():
        tag = node.tag
        # Comments and processing instructions have a callable ``tag``;
        # skip them rather than passing them to ``_local_name``.
        if not isinstance(tag, (str, bytes)):
            continue
        name = _local_name(tag)
        if name == "Content":
            if node.text:
                parts.append(node.text)
            # Text that follows a processing instruction or comment inside
            # <Content> is stored as that child's tail, not in node.text.
            parts.extend(sub.tail for sub in node if sub.tail)
        elif name == "Br":
            parts.append("\n")
    return "".join(parts)
@@ -0,0 +1,3 @@
1
+ from lokit.parsers.json_i18n.extraction import JsonI18nExtractor
2
+
3
+ __all__ = ["JsonI18nExtractor"]