lokit-python 0.1.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. 821d8b73c2a02cb7980f__mypyc.cp313-win_amd64.pyd +0 -0
  2. lokit/__init__.cp313-win_amd64.pyd +0 -0
  3. lokit/__init__.py +128 -0
  4. lokit/core/__init__.cp313-win_amd64.pyd +0 -0
  5. lokit/core/__init__.py +0 -0
  6. lokit/core/logger.cp313-win_amd64.pyd +0 -0
  7. lokit/core/logger.py +20 -0
  8. lokit/data/__init__.cp313-win_amd64.pyd +0 -0
  9. lokit/data/__init__.py +0 -0
  10. lokit/data/lang_codes.cp313-win_amd64.pyd +0 -0
  11. lokit/data/lang_codes.py +455 -0
  12. lokit/data/structure.cp313-win_amd64.pyd +0 -0
  13. lokit/data/structure.py +118 -0
  14. lokit/data/tag_types.cp313-win_amd64.pyd +0 -0
  15. lokit/data/tag_types.py +78 -0
  16. lokit/exporters/__init__.cp313-win_amd64.pyd +0 -0
  17. lokit/exporters/__init__.py +34 -0
  18. lokit/exporters/csv.cp313-win_amd64.pyd +0 -0
  19. lokit/exporters/csv.py +32 -0
  20. lokit/exporters/html.cp313-win_amd64.pyd +0 -0
  21. lokit/exporters/html.py +217 -0
  22. lokit/exporters/idml.cp313-win_amd64.pyd +0 -0
  23. lokit/exporters/idml.py +178 -0
  24. lokit/exporters/json_i18n.cp313-win_amd64.pyd +0 -0
  25. lokit/exporters/json_i18n.py +47 -0
  26. lokit/exporters/po.cp313-win_amd64.pyd +0 -0
  27. lokit/exporters/po.py +162 -0
  28. lokit/exporters/tmx.cp313-win_amd64.pyd +0 -0
  29. lokit/exporters/tmx.py +247 -0
  30. lokit/exporters/xliff.cp313-win_amd64.pyd +0 -0
  31. lokit/exporters/xliff.py +152 -0
  32. lokit/exporters/xlsx.cp313-win_amd64.pyd +0 -0
  33. lokit/exporters/xlsx.py +39 -0
  34. lokit/format_detection.cp313-win_amd64.pyd +0 -0
  35. lokit/format_detection.py +115 -0
  36. lokit/importers.py +321 -0
  37. lokit/io/__init__.cp313-win_amd64.pyd +0 -0
  38. lokit/io/__init__.py +3 -0
  39. lokit/io/json.cp313-win_amd64.pyd +0 -0
  40. lokit/io/json.py +194 -0
  41. lokit/logic.cp313-win_amd64.pyd +0 -0
  42. lokit/logic.py +324 -0
  43. lokit/parsers/__init__.cp313-win_amd64.pyd +0 -0
  44. lokit/parsers/__init__.py +1 -0
  45. lokit/parsers/csv/__init__.cp313-win_amd64.pyd +0 -0
  46. lokit/parsers/csv/__init__.py +1 -0
  47. lokit/parsers/csv/extraction.cp313-win_amd64.pyd +0 -0
  48. lokit/parsers/csv/extraction.py +164 -0
  49. lokit/parsers/html/__init__.cp313-win_amd64.pyd +0 -0
  50. lokit/parsers/html/__init__.py +3 -0
  51. lokit/parsers/html/extraction.cp313-win_amd64.pyd +0 -0
  52. lokit/parsers/html/extraction.py +365 -0
  53. lokit/parsers/idml/__init__.cp313-win_amd64.pyd +0 -0
  54. lokit/parsers/idml/__init__.py +3 -0
  55. lokit/parsers/idml/extraction.cp313-win_amd64.pyd +0 -0
  56. lokit/parsers/idml/extraction.py +264 -0
  57. lokit/parsers/json_i18n/__init__.cp313-win_amd64.pyd +0 -0
  58. lokit/parsers/json_i18n/__init__.py +3 -0
  59. lokit/parsers/json_i18n/extraction.cp313-win_amd64.pyd +0 -0
  60. lokit/parsers/json_i18n/extraction.py +163 -0
  61. lokit/parsers/po/__init__.cp313-win_amd64.pyd +0 -0
  62. lokit/parsers/po/__init__.py +3 -0
  63. lokit/parsers/po/extraction.cp313-win_amd64.pyd +0 -0
  64. lokit/parsers/po/extraction.py +236 -0
  65. lokit/parsers/tmx/__init__.cp313-win_amd64.pyd +0 -0
  66. lokit/parsers/tmx/__init__.py +0 -0
  67. lokit/parsers/tmx/base.cp313-win_amd64.pyd +0 -0
  68. lokit/parsers/tmx/base.py +145 -0
  69. lokit/parsers/tmx/extraction.cp313-win_amd64.pyd +0 -0
  70. lokit/parsers/tmx/extraction.py +170 -0
  71. lokit/parsers/tmx/header.cp313-win_amd64.pyd +0 -0
  72. lokit/parsers/tmx/header.py +55 -0
  73. lokit/parsers/tmx/helpers.cp313-win_amd64.pyd +0 -0
  74. lokit/parsers/tmx/helpers.py +9 -0
  75. lokit/parsers/tmx/models.cp313-win_amd64.pyd +0 -0
  76. lokit/parsers/tmx/models.py +10 -0
  77. lokit/parsers/tmx/props.cp313-win_amd64.pyd +0 -0
  78. lokit/parsers/tmx/props.py +201 -0
  79. lokit/parsers/tmx/tags.cp313-win_amd64.pyd +0 -0
  80. lokit/parsers/tmx/tags.py +59 -0
  81. lokit/parsers/tmx/xml_utils.cp313-win_amd64.pyd +0 -0
  82. lokit/parsers/tmx/xml_utils.py +46 -0
  83. lokit/parsers/xliff/__init__.cp313-win_amd64.pyd +0 -0
  84. lokit/parsers/xliff/__init__.py +3 -0
  85. lokit/parsers/xliff/extraction.cp313-win_amd64.pyd +0 -0
  86. lokit/parsers/xliff/extraction.py +229 -0
  87. lokit/parsers/xliff/tags.cp313-win_amd64.pyd +0 -0
  88. lokit/parsers/xliff/tags.py +128 -0
  89. lokit/parsers/xlsx/__init__.cp313-win_amd64.pyd +0 -0
  90. lokit/parsers/xlsx/__init__.py +1 -0
  91. lokit/parsers/xlsx/extraction.cp313-win_amd64.pyd +0 -0
  92. lokit/parsers/xlsx/extraction.py +198 -0
  93. lokit/py.typed +1 -0
  94. lokit_python-0.1.0.dist-info/METADATA +149 -0
  95. lokit_python-0.1.0.dist-info/RECORD +97 -0
  96. lokit_python-0.1.0.dist-info/WHEEL +5 -0
  97. lokit_python-0.1.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,170 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from dataclasses import dataclass
5
+ from typing import AsyncIterator, Iterator, Optional
6
+ from uuid import uuid4
7
+
8
+ from lxml.etree import _Element
9
+
10
+ from lokit.data.structure import Data, SegmentPart, Tags
11
+ from lokit.data.tag_types import TieData
12
+ from lokit.parsers.tmx.base import TmxParser
13
+ from lokit.parsers.tmx.props import TmxProps
14
+ from lokit.parsers.tmx.tags import TmxTagParser
15
+ from lokit.parsers.tmx.xml_utils import (
16
+ clear_element,
17
+ element_children,
18
+ find_child,
19
+ iterparse_safe,
20
+ local_name,
21
+ )
22
+
23
# One extracted translation unit as yielded by the extractor:
# (unit id, parsed unit data).
ExtractItem = tuple[str, Data]
24
+
25
+
26
@dataclass(slots=True)
class _AsyncExtractionResult:
    """Message handed from the extraction worker to the async consumer.

    A message carries an extracted item, an error raised by the worker,
    or the ``done`` marker that signals the end of the stream.
    """

    # The extracted (unit id, data) pair, when this message carries one.
    item: Optional[ExtractItem] = None
    # Exception raised by the worker, to be re-raised on the consumer side.
    error: Optional[BaseException] = None
    # True for the final message the worker posts before it exits.
    done: bool = False
31
+
32
+
33
class AsyncTmxExtraction:
    """Async iterator over the units produced by ``extractor.extract()``.

    The synchronous extractor runs in a worker thread (``asyncio.to_thread``)
    and hands every result to the event loop through an ``asyncio.Queue``.
    The worker is started lazily by the first ``__anext__`` call.

    Once the stream has ended, whether normally or with an error, every
    further ``__anext__`` call raises ``StopAsyncIteration`` immediately
    instead of waiting on a queue that will never be fed again.
    """

    def __init__(self, extractor: TmxExtractor) -> None:
        self._extractor = extractor
        # No maxsize is given, so the queue is unbounded: the worker never
        # waits for the consumer.
        self._queue: asyncio.Queue[_AsyncExtractionResult] = asyncio.Queue()
        self._producer: asyncio.Task[None] | None = None
        # Set once the end marker or an error has been delivered.
        self._exhausted: bool = False

    def __aiter__(self) -> AsyncTmxExtraction:
        return self

    async def __anext__(self) -> ExtractItem:
        """Return the next extracted item.

        Raises:
            StopAsyncIteration: when the worker has finished, and on every
                call made after the stream has ended.
            BaseException: whatever the worker raised, re-raised here once.
        """
        if self._exhausted:
            # The worker is gone; awaiting the queue again would block forever.
            raise StopAsyncIteration
        if self._producer is None:
            self._start()

        result = await self._queue.get()
        if result.error is not None:
            self._exhausted = True
            await self._finish()
            raise result.error
        if result.done or result.item is None:
            self._exhausted = True
            await self._finish()
            raise StopAsyncIteration
        return result.item

    def _start(self) -> None:
        """Launch the worker thread that feeds the queue."""
        loop = asyncio.get_running_loop()

        def produce() -> None:
            # Runs in the worker thread, so every queue operation is
            # marshalled onto the loop with call_soon_threadsafe.
            try:
                for item in self._extractor.extract():
                    loop.call_soon_threadsafe(
                        self._queue.put_nowait,
                        _AsyncExtractionResult(item=item),
                    )
            except BaseException as exc:
                # Forward the failure to the consumer instead of losing it
                # inside the thread.
                loop.call_soon_threadsafe(
                    self._queue.put_nowait,
                    _AsyncExtractionResult(error=exc),
                )
            finally:
                # Always post the end marker so the consumer can stop.
                loop.call_soon_threadsafe(
                    self._queue.put_nowait,
                    _AsyncExtractionResult(done=True),
                )

        self._producer = asyncio.create_task(asyncio.to_thread(produce))

    async def _finish(self) -> None:
        """Wait for the worker task to complete, if one was started."""
        if self._producer is not None:
            await self._producer
84
+
85
+
86
class TmxExtractor(TmxParser):
    """Streams translation units out of a TMX file.

    ``self.filepath``, ``self.native_source`` and ``_compare_base_lang`` are
    expected to come from the ``TmxParser`` base class (not shown here).
    """

    def __init__(
        self,
        filepath: str,
        source_language: Optional[str] = None,
        target_language: Optional[str] = None,
        domain: Optional[str] = None,
    ) -> None:
        super().__init__(
            tmx_file_path=filepath,
            source_language=source_language,
            target_language=target_language,
            domain=domain,
        )
        self.tag_parser: TmxTagParser = TmxTagParser()
        self.prop_parser: TmxProps = TmxProps()
        # Clark-notation prefix used to read the ``xml:lang`` attribute.
        self.namespace: str = "{http://www.w3.org/XML/1998/namespace}"

    def extract(self) -> Iterator[tuple[str, Data]]:
        """Yield ``(unit id, Data)`` for every ``<tu>`` element in the file.

        The unit id is the ``tuid`` attribute, or a fresh UUID4 string when
        that attribute is missing or empty. A ``<tuv>`` whose language
        matches ``self.native_source`` supplies the source side; any other
        ``<tuv>`` supplies the target side, and the last one seen wins.
        Variants without a ``<seg>`` child are ignored.
        """
        with open(self.filepath, "rb") as handle:
            for _event, unit in iterparse_safe(handle, events=("end",)):
                if local_name(unit.tag) != "tu":
                    continue

                identifier: str = unit.attrib.get("tuid") or str(uuid4())

                meta = self.prop_parser.parse_meta(unit)
                comments = self.prop_parser.parse_comments(unit)
                previous, following = self.prop_parser.parse_adjacent_context(unit)
                status = self.prop_parser.parse_status(unit)
                extensions = self.prop_parser.parse_extensions(unit)

                # (text, tag map, parts) per side; the empty defaults stay in
                # place when no matching <tuv> carries a <seg>.
                src: tuple[str, dict[str, TieData], list[SegmentPart]] = ("", {}, [])
                tgt: tuple[str, dict[str, TieData], list[SegmentPart]] = ("", {}, [])

                for variant in element_children(unit, "tuv"):
                    lang: str = (
                        variant.get(f"{self.namespace}lang")
                        or variant.get("lang")
                        or ""
                    )
                    segment: _Element | None = find_child(variant, "seg")
                    if segment is None:
                        continue

                    parsed = self.tag_parser.parse(segment)
                    if self._compare_base_lang(lang, self.native_source):
                        src = parsed
                    else:
                        tgt = parsed

                src_text, src_tags, src_parts = src
                tgt_text, tgt_tags, tgt_parts = tgt

                tag_bundle = Tags(
                    source_tag_map=src_tags,
                    target_tag_map=tgt_tags,
                    source_parts=src_parts,
                    target_parts=tgt_parts,
                )
                has_codes = bool(src_tags or tgt_tags)

                yield identifier, Data(
                    source=src_text,
                    target=tgt_text or None,
                    plural=None,
                    tags=tag_bundle if has_codes else None,
                    meta=meta,
                    status=status,
                    comments=comments,
                    previous_context=previous,
                    next_context=following,
                    extensions=extensions,
                )

                # Release the processed unit so the parsed tree does not
                # keep growing while the file is streamed.
                clear_element(unit)

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator that runs ``extract`` in a worker thread."""
        return AsyncTmxExtraction(self)
@@ -0,0 +1,55 @@
1
+ from lxml.etree import _Element
2
+
3
+ from lokit.parsers.tmx.models import HeaderData
4
+ from lokit.parsers.tmx.xml_utils import element_children, local_name
5
+
6
+
7
class TmxHeaderParser:
    """Parses a TMX ``<header>`` element into a ``HeaderData`` record."""

    def __init__(self) -> None:
        pass

    def parse(self, element: _Element) -> HeaderData:
        """Build ``HeaderData`` from the header's attributes and children.

        Args:
            element: the ``<header>`` element.

        Returns:
            ``HeaderData`` whose ``extensions`` holds the optional header
            attributes plus one ``property.<name>`` entry per ``<prop>``
            child and per other child element that has text.
        """
        creation_tool: str = element.attrib.get("creationtool") or "unknown_origin"
        tool_version: str = element.attrib.get("creationtoolversion") or ""
        origin: str = f"{creation_tool} {tool_version}".strip()
        timestamp: str = element.attrib.get("creationdate") or ""
        extensions: dict[str, str] = {
            "input_format": "tmx",
        }
        self._add_if_present(extensions, "admin_locale", element.attrib.get("adminlang"))
        self._add_if_present(extensions, "data_type", element.attrib.get("datatype"))
        self._add_if_present(extensions, "segmentation", element.attrib.get("segtype"))
        self._add_if_present(extensions, "translation_memory_format", element.attrib.get("o-tmf"))
        self._add_if_present(extensions, "tool_name", creation_tool)
        self._add_if_present(extensions, "tool_version", tool_version)

        srclang: str = element.attrib.get("srclang") or ""
        if srclang == "*all*":
            # "*all*" names no single source language; report it as unset.
            srclang = ""

        tgtlang: str = element.attrib.get("tgtlang") or ""

        for child in element_children(element):
            child_name = local_name(child.tag)
            if not child_name:
                # local_name() returns "" for nodes whose tag is not a string
                # (XML comments and processing instructions in lxml). Their
                # text is not header data and would otherwise be stored
                # under the meaningless key "property.".
                continue
            if child_name == "prop":
                prop_type = child.attrib.get("type") or "unknown"
                extensions[f"property.{self._normalize_key(prop_type)}"] = child.text or ""
            elif child.text:
                extensions[f"property.{self._normalize_key(child_name)}"] = child.text

        return HeaderData(
            origin=origin,
            timestamp=timestamp,
            srclang=srclang,
            tgtlang=tgtlang,
            extensions=extensions,
        )

    def _add_if_present(
        self, extensions: dict[str, str], key: str, value: str | None
    ) -> None:
        """Store ``value`` under ``key`` unless it is ``None`` or empty."""
        if value:
            extensions[key] = value

    def _normalize_key(self, value: str) -> str:
        """Lower-case ``value`` and replace spaces and hyphens with ``_``."""
        return value.lower().replace(" ", "_").replace("-", "_")
@@ -0,0 +1,9 @@
1
+ from lokit.data.tag_types import TieType as tt
2
+
3
# Tie type recorded for each TMX inline element name: ``bpt``/``ept`` are the
# opening/closing halves of a pair, the remaining ones are standalone codes.
TMX_TAG_MAP: dict[str, tt] = {
    "bpt": tt.CUSTOM_OPEN,
    "ept": tt.CUSTOM_CLOSE,
    **dict.fromkeys(("ph", "it", "ut"), tt.CUSTOM_STANDALONE),
}
@@ -0,0 +1,10 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class HeaderData:
    """Values read from a TMX ``<header>`` element.

    The field notes below describe how ``TmxHeaderParser.parse`` in
    ``lokit/parsers/tmx/header.py`` fills this record.
    """

    # Creation tool name and version joined by a space.
    origin: str
    # The header's ``creationdate`` attribute, or "" when absent.
    timestamp: str
    # The ``srclang`` attribute; "" when absent or given as "*all*".
    srclang: str
    # The ``tgtlang`` attribute, or "" when absent.
    tgtlang: str
    # Remaining header attributes and child values, keyed by name.
    extensions: dict[str, str]
@@ -0,0 +1,201 @@
1
+ from typing import Optional
2
+
3
+ from lxml.etree import _Element
4
+
5
+ from lokit.data.structure import AdjacentContext, Comment, Meta, Origin, TranslationStatus
6
+ from lokit.parsers.tmx.xml_utils import element_children, local_name
7
+
8
+
9
class TmxProps:
    """Reads unit-level metadata from a TMX element.

    Every method takes the element (a ``<tu>`` in the extractor) and looks
    at its attributes and its ``<prop>`` / ``<note>`` children.
    """

    def __init__(self) -> None:
        # Prop types already consumed by parse_status, parse_comments and
        # parse_adjacent_context; parse_extensions skips them.
        self._known_props: frozenset[str] = frozenset(
            {
                "status",
                "x-status",
                "x-xtm-status",
                "x-project",
                "x-system",
                "x-domain",
                "x-context",
                "x-key",
                "note",
                "x-note",
                "comment",
                "x-comment",
                "x-previous-id",
                "x-previous-source",
                "x-previous-source-text",
                "x-previous-target",
                "x-previous-target-text",
                "x-next-id",
                "x-next-source",
                "x-next-source-text",
                "x-next-target",
                "x-next-target-text",
            }
        )

    def parse_meta(self, element: _Element) -> Meta:
        """Build ``Meta`` from the element's date and usage attributes.

        ``usage_count`` is ``None`` when ``usagecount`` is missing or is not
        a plain non-negative decimal number; the raw value is still kept in
        the meta extensions.
        """
        creation_date: str = element.attrib.get("creationdate") or ""
        usage_count: Optional[int] = self._parse_usage_count(
            element.attrib.get("usagecount") or ""
        )
        return Meta(
            usage_count=usage_count,
            last_used=element.attrib.get("lastusagedate"),
            first_used=None,
            created=creation_date if creation_date else None,
            updated=element.attrib.get("changedate"),
            max_length=None,
            min_length=None,
            extensions=self.parse_meta_extensions(element),
        )

    def _parse_usage_count(self, raw: str) -> Optional[int]:
        """Convert a ``usagecount`` attribute value to ``int``, or ``None``."""
        # str.isdigit() also accepts characters such as "²" that int()
        # rejects with ValueError; isdecimal() only accepts what int() parses.
        return int(raw) if raw.isdecimal() else None

    def parse_comments(self, element: _Element) -> list[Comment]:
        """Collect note-like children as ``Comment`` objects.

        Returns an empty list when the element has no notes, no origin
        information (project, system, ``creationid``) and no context key.
        When only origin or context information exists, a single comment
        with empty text is returned to carry it.
        """
        project: str = ""
        system: str = ""
        creator: str = element.attrib.get("creationid") or ""
        context_key: str = ""
        comments: list[Comment] = []

        for child in element_children(element):
            tag_name: str = local_name(child.tag)
            if tag_name == "prop":
                prop_type: str = child.attrib.get("type", "").lower()
                text_val: str = child.text or ""
                if prop_type == "x-project":
                    project = text_val
                elif prop_type in ("x-system", "x-domain"):
                    system = text_val
                elif prop_type in ("x-context", "x-key"):
                    context_key = text_val
                elif prop_type in ("note", "x-note", "comment", "x-comment"):
                    comments.append(
                        Comment(
                            context=text_val.strip(),
                            timestamp=element.attrib.get("changedate"),
                        )
                    )

            elif tag_name == "note" and child.text:
                comments.append(
                    Comment(
                        context=child.text.strip(),
                        timestamp=element.attrib.get("changedate"),
                    )
                )

        if not any([project, system, creator, context_key]) and not comments:
            return []

        origin: Origin = Origin(
            system=system if system else None,
            project=project if project else None,
            creator_id=creator if creator else None,
        )

        if not comments:
            # Placeholder comment so origin / context key are not lost.
            comments.append(Comment(context=""))

        for comment in comments:
            comment.origin = origin if any([system, project, creator]) else None
            comment.context_key = context_key if context_key else None
            comment.timestamp = comment.timestamp or element.attrib.get("changedate")

        return comments

    def parse_status(self, element: _Element) -> TranslationStatus:
        """Map the element's status props to a ``TranslationStatus``.

        Status props are checked from last to first; the first recognised
        value wins. ``UNKNOWN`` is returned when none is recognised.
        """
        status_values: list[str] = []

        for child in element_children(element, "prop"):
            prop_type: str = child.attrib.get("type", "").lower()
            if prop_type in ("status", "x-status", "x-xtm-status"):
                status_values.append((child.text or "").strip().lower())

        for value in reversed(status_values):
            if value in ("approved", "signed-off", "final"):
                return TranslationStatus.APPROVED
            if value in ("reviewed", "review"):
                return TranslationStatus.REVIEWED
            if value in ("translated", "complete"):
                return TranslationStatus.TRANSLATED
            if value in ("new",):
                return TranslationStatus.NEW
            if value in ("draft", "notapproved", "not-approved", "unapproved"):
                return TranslationStatus.DRAFT
            if value in ("rejected",):
                return TranslationStatus.REJECTED

        return TranslationStatus.UNKNOWN

    def parse_adjacent_context(
        self, element: _Element
    ) -> tuple[Optional[AdjacentContext], Optional[AdjacentContext]]:
        """Return ``(previous, next)`` context built from ``x-previous-*`` /
        ``x-next-*`` props; a side is ``None`` when it has no non-empty value.
        """
        prev_id: Optional[str] = None
        prev_src: Optional[str] = None
        prev_tgt: Optional[str] = None
        next_id: Optional[str] = None
        next_src: Optional[str] = None
        next_tgt: Optional[str] = None

        for child in element_children(element, "prop"):
            prop_type: str = child.attrib.get("type", "").lower()
            text_val: str = child.text or ""
            if prop_type == "x-previous-id":
                prev_id = text_val
            elif prop_type in ("x-previous-source", "x-previous-source-text"):
                prev_src = text_val
            elif prop_type in ("x-previous-target", "x-previous-target-text"):
                prev_tgt = text_val
            elif prop_type == "x-next-id":
                next_id = text_val
            elif prop_type in ("x-next-source", "x-next-source-text"):
                next_src = text_val
            elif prop_type in ("x-next-target", "x-next-target-text"):
                next_tgt = text_val

        prev_ctx: Optional[AdjacentContext] = (
            AdjacentContext(unit_id=prev_id, source=prev_src, target=prev_tgt)
            if any([prev_id, prev_src, prev_tgt])
            else None
        )

        next_ctx: Optional[AdjacentContext] = (
            AdjacentContext(
                unit_id=next_id,
                source=next_src,
                target=next_tgt,
            )
            if any([next_id, next_src, next_tgt])
            else None
        )

        return prev_ctx, next_ctx

    def parse_meta_extensions(self, element: _Element) -> dict[str, str]:
        """Return the raw ``changeid`` / ``usagecount`` attributes, when set,
        under the keys ``change_id`` and ``usage_count_raw``.
        """
        extensions: dict[str, str] = {}

        change_id = element.attrib.get("changeid")
        if change_id:
            extensions["change_id"] = change_id

        usage_count = element.attrib.get("usagecount")
        if usage_count:
            extensions["usage_count_raw"] = usage_count

        return extensions

    def parse_extensions(self, element: _Element) -> dict[str, str]:
        """Return every ``<prop>`` not handled elsewhere as
        ``property.<normalized type>`` -> text ("" when the prop is empty).

        A prop without a ``type`` attribute is stored under type ``unknown``.
        """
        extensions: dict[str, str] = {}

        for child in element_children(element, "prop"):
            prop_type = (child.attrib.get("type") or "unknown").lower()
            if prop_type in self._known_props:
                continue
            extensions[f"property.{self._normalize_key(prop_type)}"] = child.text or ""

        return extensions

    def _normalize_key(self, value: str) -> str:
        """Lower-case ``value`` and replace spaces and hyphens with ``_``."""
        return value.lower().replace(" ", "_").replace("-", "_")
@@ -0,0 +1,59 @@
1
+ from lxml.etree import _Element
2
+
3
+ from lokit.data.structure import CodePart, SegmentPart, TextPart
4
+ from lokit.data.tag_types import TieData, TieType
5
+ from lokit.parsers.tmx.helpers import TMX_TAG_MAP
6
+ from lokit.parsers.tmx.xml_utils import element_children, local_name
7
+
8
+
9
class TmxTagParser:
    """Flattens a ``<seg>`` element into plain text plus inline-code records."""

    def __init__(self) -> None:
        pass

    def parse(
        self, element: _Element
    ) -> tuple[str, dict[str, TieData], list[SegmentPart]]:
        """Split ``element`` into text and inline codes.

        Returns:
            ``(text, codes, parts)``: ``text`` is the element's own text plus
            the tail text of every child; ``codes`` maps ``c<N>`` ids to the
            ``TieData`` of the N-th child; ``parts`` lists the text and code
            pieces in document order.
        """
        chunks: list[str] = []
        cursor: int = 0  # length of the text gathered so far
        codes: dict[str, TieData] = {}
        sequence: list[SegmentPart] = []
        seen_pairs: dict[str, str] = {}

        leading = element.text
        if leading:
            chunks.append(leading)
            cursor += len(leading)
            sequence.append(TextPart(leading))

        for index, node in enumerate(element_children(element)):
            # Element names missing from the map are treated as standalone.
            kind: TieType = TMX_TAG_MAP.get(
                local_name(node.tag), TieType.CUSTOM_STANDALONE
            )
            raw_pair: str | None = node.attrib.get("i") or node.attrib.get("id")
            pair = self._normalize_pair_id(raw_pair, seen_pairs)
            code_id = f"c{index}"

            codes[code_id] = TieData(
                id=code_id,
                type=kind,
                position=cursor,
                order=index,
                pair_id=pair,
            )
            sequence.append(CodePart(code_id))

            trailing = node.tail
            if trailing:
                chunks.append(trailing)
                cursor += len(trailing)
                sequence.append(TextPart(trailing))

        return "".join(chunks), codes, sequence

    def _normalize_pair_id(
        self, source_pair_id: str | None, pair_ids: dict[str, str]
    ) -> str | None:
        """Translate a source pair id into a stable ``p<N>`` id.

        ``pair_ids`` remembers earlier assignments, so the same source id
        always yields the same result; ``None`` is passed through unchanged.
        """
        if source_pair_id is None:
            return None
        known = pair_ids.get(source_pair_id)
        if known is None:
            known = f"p{len(pair_ids)}"
            pair_ids[source_pair_id] = known
        return known
@@ -0,0 +1,46 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterator
4
+ from typing import BinaryIO
5
+
6
+ from lxml import etree
7
+ from lxml.etree import _Element
8
+
9
+
10
def local_name(tag: object) -> str:
    """Return ``tag`` without its ``{namespace}`` prefix.

    Anything that is not a string yields ``""``.
    """
    if isinstance(tag, str):
        # Keep whatever follows the last "}"; the whole tag when there is none.
        return tag.rpartition("}")[2]
    return ""
14
+
15
+
16
def iterparse_safe(
    source: str | BinaryIO,
    events: tuple[str, ...],
) -> etree.iterparse[etree._Element]:
    """Create an ``lxml`` ``iterparse`` over ``source`` for ``events`` with
    network access disabled and entity resolution turned off.
    """
    hardening = {"no_network": True, "resolve_entities": False}
    return etree.iterparse(source, events=events, **hardening)
26
+
27
+
28
def element_children(element: _Element, name: str | None = None) -> Iterator[_Element]:
    """Yield the direct children of ``element`` in order.

    With ``name`` given, only children whose local (namespace-free) tag
    name equals ``name`` are yielded.
    """
    if name is None:
        yield from element
        return
    yield from (node for node in element if local_name(node.tag) == name)
32
+
33
+
34
def find_child(element: _Element, name: str) -> _Element | None:
    """Return the first direct child whose local tag name is ``name``,
    or ``None`` when there is no such child.
    """
    return next(element_children(element, name), None)
38
+
39
+
40
def clear_element(element: _Element) -> None:
    """Clear ``element`` and delete every sibling that precedes it.

    Siblings are removed from the front of the parent until ``element`` has
    no previous sibling. Nothing is deleted when ``element`` has no parent.
    """
    element.clear()
    container = element.getparent()
    if container is None:
        # Without a parent there is nothing to delete from.
        return
    while element.getprevious() is not None:
        del container[0]
@@ -0,0 +1,3 @@
1
+ from lokit.parsers.xliff.extraction import XliffExtractor
2
+
3
+ __all__ = ["XliffExtractor"]