lokit-python 0.1.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- 821d8b73c2a02cb7980f__mypyc.cp313-win_amd64.pyd +0 -0
- lokit/__init__.cp313-win_amd64.pyd +0 -0
- lokit/__init__.py +128 -0
- lokit/core/__init__.cp313-win_amd64.pyd +0 -0
- lokit/core/__init__.py +0 -0
- lokit/core/logger.cp313-win_amd64.pyd +0 -0
- lokit/core/logger.py +20 -0
- lokit/data/__init__.cp313-win_amd64.pyd +0 -0
- lokit/data/__init__.py +0 -0
- lokit/data/lang_codes.cp313-win_amd64.pyd +0 -0
- lokit/data/lang_codes.py +455 -0
- lokit/data/structure.cp313-win_amd64.pyd +0 -0
- lokit/data/structure.py +118 -0
- lokit/data/tag_types.cp313-win_amd64.pyd +0 -0
- lokit/data/tag_types.py +78 -0
- lokit/exporters/__init__.cp313-win_amd64.pyd +0 -0
- lokit/exporters/__init__.py +34 -0
- lokit/exporters/csv.cp313-win_amd64.pyd +0 -0
- lokit/exporters/csv.py +32 -0
- lokit/exporters/html.cp313-win_amd64.pyd +0 -0
- lokit/exporters/html.py +217 -0
- lokit/exporters/idml.cp313-win_amd64.pyd +0 -0
- lokit/exporters/idml.py +178 -0
- lokit/exporters/json_i18n.cp313-win_amd64.pyd +0 -0
- lokit/exporters/json_i18n.py +47 -0
- lokit/exporters/po.cp313-win_amd64.pyd +0 -0
- lokit/exporters/po.py +162 -0
- lokit/exporters/tmx.cp313-win_amd64.pyd +0 -0
- lokit/exporters/tmx.py +247 -0
- lokit/exporters/xliff.cp313-win_amd64.pyd +0 -0
- lokit/exporters/xliff.py +152 -0
- lokit/exporters/xlsx.cp313-win_amd64.pyd +0 -0
- lokit/exporters/xlsx.py +39 -0
- lokit/format_detection.cp313-win_amd64.pyd +0 -0
- lokit/format_detection.py +115 -0
- lokit/importers.py +321 -0
- lokit/io/__init__.cp313-win_amd64.pyd +0 -0
- lokit/io/__init__.py +3 -0
- lokit/io/json.cp313-win_amd64.pyd +0 -0
- lokit/io/json.py +194 -0
- lokit/logic.cp313-win_amd64.pyd +0 -0
- lokit/logic.py +324 -0
- lokit/parsers/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/__init__.py +1 -0
- lokit/parsers/csv/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/csv/__init__.py +1 -0
- lokit/parsers/csv/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/csv/extraction.py +164 -0
- lokit/parsers/html/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/html/__init__.py +3 -0
- lokit/parsers/html/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/html/extraction.py +365 -0
- lokit/parsers/idml/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/idml/__init__.py +3 -0
- lokit/parsers/idml/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/idml/extraction.py +264 -0
- lokit/parsers/json_i18n/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/json_i18n/__init__.py +3 -0
- lokit/parsers/json_i18n/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/json_i18n/extraction.py +163 -0
- lokit/parsers/po/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/po/__init__.py +3 -0
- lokit/parsers/po/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/po/extraction.py +236 -0
- lokit/parsers/tmx/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/__init__.py +0 -0
- lokit/parsers/tmx/base.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/base.py +145 -0
- lokit/parsers/tmx/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/extraction.py +170 -0
- lokit/parsers/tmx/header.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/header.py +55 -0
- lokit/parsers/tmx/helpers.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/helpers.py +9 -0
- lokit/parsers/tmx/models.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/models.py +10 -0
- lokit/parsers/tmx/props.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/props.py +201 -0
- lokit/parsers/tmx/tags.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/tags.py +59 -0
- lokit/parsers/tmx/xml_utils.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/xml_utils.py +46 -0
- lokit/parsers/xliff/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/__init__.py +3 -0
- lokit/parsers/xliff/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/extraction.py +229 -0
- lokit/parsers/xliff/tags.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/tags.py +128 -0
- lokit/parsers/xlsx/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xlsx/__init__.py +1 -0
- lokit/parsers/xlsx/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xlsx/extraction.py +198 -0
- lokit/py.typed +1 -0
- lokit_python-0.1.0.dist-info/METADATA +149 -0
- lokit_python-0.1.0.dist-info/RECORD +97 -0
- lokit_python-0.1.0.dist-info/WHEEL +5 -0
- lokit_python-0.1.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
from lxml.etree import _Element
|
|
9
|
+
|
|
10
|
+
from lokit.data.structure import Data, SegmentPart, Tags
|
|
11
|
+
from lokit.data.tag_types import TieData
|
|
12
|
+
from lokit.parsers.tmx.base import TmxParser
|
|
13
|
+
from lokit.parsers.tmx.props import TmxProps
|
|
14
|
+
from lokit.parsers.tmx.tags import TmxTagParser
|
|
15
|
+
from lokit.parsers.tmx.xml_utils import (
|
|
16
|
+
clear_element,
|
|
17
|
+
element_children,
|
|
18
|
+
find_child,
|
|
19
|
+
iterparse_safe,
|
|
20
|
+
local_name,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# One extraction result: (translation-unit id, parsed unit data).
ExtractItem = tuple[str, Data]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(slots=True)
|
|
27
|
+
class _AsyncExtractionResult:
|
|
28
|
+
item: Optional[ExtractItem] = None
|
|
29
|
+
error: Optional[BaseException] = None
|
|
30
|
+
done: bool = False
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class AsyncTmxExtraction:
|
|
34
|
+
def __init__(self, extractor: TmxExtractor) -> None:
|
|
35
|
+
self._extractor = extractor
|
|
36
|
+
self._queue: asyncio.Queue[_AsyncExtractionResult] = asyncio.Queue()
|
|
37
|
+
self._producer: asyncio.Task[None] | None = None
|
|
38
|
+
|
|
39
|
+
def __aiter__(self) -> AsyncTmxExtraction:
|
|
40
|
+
return self
|
|
41
|
+
|
|
42
|
+
async def __anext__(self) -> ExtractItem:
|
|
43
|
+
if self._producer is None:
|
|
44
|
+
self._start()
|
|
45
|
+
|
|
46
|
+
result = await self._queue.get()
|
|
47
|
+
if result.done:
|
|
48
|
+
await self._finish()
|
|
49
|
+
raise StopAsyncIteration
|
|
50
|
+
if result.error is not None:
|
|
51
|
+
await self._finish()
|
|
52
|
+
raise result.error
|
|
53
|
+
if result.item is None:
|
|
54
|
+
await self._finish()
|
|
55
|
+
raise StopAsyncIteration
|
|
56
|
+
return result.item
|
|
57
|
+
|
|
58
|
+
def _start(self) -> None:
|
|
59
|
+
loop = asyncio.get_running_loop()
|
|
60
|
+
|
|
61
|
+
def produce() -> None:
|
|
62
|
+
try:
|
|
63
|
+
for item in self._extractor.extract():
|
|
64
|
+
loop.call_soon_threadsafe(
|
|
65
|
+
self._queue.put_nowait,
|
|
66
|
+
_AsyncExtractionResult(item=item),
|
|
67
|
+
)
|
|
68
|
+
except BaseException as exc:
|
|
69
|
+
loop.call_soon_threadsafe(
|
|
70
|
+
self._queue.put_nowait,
|
|
71
|
+
_AsyncExtractionResult(error=exc),
|
|
72
|
+
)
|
|
73
|
+
finally:
|
|
74
|
+
loop.call_soon_threadsafe(
|
|
75
|
+
self._queue.put_nowait,
|
|
76
|
+
_AsyncExtractionResult(done=True),
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
self._producer = asyncio.create_task(asyncio.to_thread(produce))
|
|
80
|
+
|
|
81
|
+
async def _finish(self) -> None:
|
|
82
|
+
if self._producer is not None:
|
|
83
|
+
await self._producer
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class TmxExtractor(TmxParser):
    """Stream the ``<tu>`` elements of a TMX file as ``(unit_id, Data)`` pairs."""

    def __init__(
        self,
        filepath: str,
        source_language: Optional[str] = None,
        target_language: Optional[str] = None,
        domain: Optional[str] = None,
    ) -> None:
        """Forward the file and language settings to ``TmxParser`` and set up the sub-parsers."""
        super().__init__(
            tmx_file_path=filepath,
            source_language=source_language,
            target_language=target_language,
            domain=domain,
        )
        self.tag_parser: TmxTagParser = TmxTagParser()
        self.prop_parser: TmxProps = TmxProps()
        # Clark-notation prefix of the ``xml:`` namespace (for ``xml:lang``).
        self.namespace: str = "{http://www.w3.org/XML/1998/namespace}"

    def extract(self) -> Iterator[tuple[str, Data]]:
        """Yield one ``(unit_id, Data)`` pair per ``<tu>`` element.

        ``unit_id`` is the ``tuid`` attribute, or a random UUID4 string when
        that attribute is missing or empty. Among the ``<tuv>`` children
        that contain a ``<seg>``, the one whose language matches
        ``self.native_source`` (per ``self._compare_base_lang``) supplies the
        source side; any other one supplies the target side, the last such
        ``<tuv>`` winning. Each ``<tu>`` is cleared once the consumer asks
        for the next item.
        """
        lang_attr = f"{self.namespace}lang"
        props = self.prop_parser

        with open(self.filepath, "rb") as stream:
            units = (
                node
                for _, node in iterparse_safe(stream, events=("end",))
                if local_name(node.tag) == "tu"
            )

            for unit in units:
                unit_id: str = unit.attrib.get("tuid") or str(uuid4())

                meta = props.parse_meta(unit)
                comments = props.parse_comments(unit)
                prev_ctx, next_ctx = props.parse_adjacent_context(unit)
                status = props.parse_status(unit)
                extensions = props.parse_extensions(unit)

                src_text: str = ""
                tgt_text: str = ""
                src_tags: dict[str, TieData] = {}
                tgt_tags: dict[str, TieData] = {}
                src_parts: list[SegmentPart] = []
                tgt_parts: list[SegmentPart] = []

                for variant in element_children(unit, "tuv"):
                    lang: str = variant.get(lang_attr) or variant.get("lang") or ""
                    seg: _Element | None = find_child(variant, "seg")
                    if seg is None:
                        continue

                    parsed = self.tag_parser.parse(seg)
                    if self._compare_base_lang(lang, self.native_source):
                        src_text, src_tags, src_parts = parsed
                    else:
                        tgt_text, tgt_tags, tgt_parts = parsed

                tags_obj = Tags(
                    source_tag_map=src_tags,
                    target_tag_map=tgt_tags,
                    source_parts=src_parts,
                    target_parts=tgt_parts,
                )
                has_tags = bool(src_tags or tgt_tags)

                record = Data(
                    source=src_text,
                    target=tgt_text or None,
                    plural=None,
                    tags=tags_obj if has_tags else None,
                    meta=meta,
                    status=status,
                    comments=comments,
                    previous_context=prev_ctx,
                    next_context=next_ctx,
                    extensions=extensions,
                )

                yield unit_id, record

                # Runs when the generator is resumed: release the processed
                # unit so the parsed tree does not keep growing.
                clear_element(unit)

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator producing the same items as ``extract()``."""
        return AsyncTmxExtraction(self)
|
|
Binary file
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from lxml.etree import _Element
|
|
2
|
+
|
|
3
|
+
from lokit.parsers.tmx.models import HeaderData
|
|
4
|
+
from lokit.parsers.tmx.xml_utils import element_children, local_name
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TmxHeaderParser:
    """Convert a TMX ``<header>`` element into a ``HeaderData`` record."""

    def __init__(self) -> None:
        pass

    def parse(self, element: _Element) -> HeaderData:
        """Read the header attributes and child elements of *element*.

        ``origin`` is ``"<creationtool> <creationtoolversion>"`` (stripped),
        with ``"unknown_origin"`` standing in for a missing tool name. A
        ``srclang`` of ``"*all*"`` is reported as an empty string. Optional
        header attributes, every ``<prop>`` child and every other child
        element with text are collected into ``extensions``.
        """
        creation_tool: str = element.attrib.get("creationtool") or "unknown_origin"
        tool_version: str = element.attrib.get("creationtoolversion") or ""
        origin: str = f"{creation_tool} {tool_version}".strip()
        timestamp: str = element.attrib.get("creationdate") or ""
        extensions: dict[str, str] = {
            "input_format": "tmx",
        }
        self._add_if_present(extensions, "admin_locale", element.attrib.get("adminlang"))
        self._add_if_present(extensions, "data_type", element.attrib.get("datatype"))
        self._add_if_present(extensions, "segmentation", element.attrib.get("segtype"))
        self._add_if_present(extensions, "translation_memory_format", element.attrib.get("o-tmf"))
        self._add_if_present(extensions, "tool_name", creation_tool)
        self._add_if_present(extensions, "tool_version", tool_version)

        srclang: str = element.attrib.get("srclang") or ""
        if srclang == "*all*":
            srclang = ""

        tgtlang: str = element.attrib.get("tgtlang") or ""

        for child in element_children(element):
            child_name = local_name(child.tag)
            if not child_name:
                # Comments and processing instructions have a non-string tag,
                # for which local_name() returns "". Their text is not header
                # data and would otherwise land under the key "property.".
                continue
            if child_name == "prop":
                prop_type = child.attrib.get("type") or "unknown"
                extensions[f"property.{self._normalize_key(prop_type)}"] = child.text or ""
            elif child.text:
                extensions[f"property.{self._normalize_key(child_name)}"] = child.text

        return HeaderData(
            origin=origin,
            timestamp=timestamp,
            srclang=srclang,
            tgtlang=tgtlang,
            extensions=extensions,
        )

    def _add_if_present(
        self, extensions: dict[str, str], key: str, value: str | None
    ) -> None:
        """Store *value* under *key* unless it is ``None`` or empty."""
        if value:
            extensions[key] = value

    def _normalize_key(self, value: str) -> str:
        """Lower-case *value* and replace spaces and hyphens with underscores."""
        return value.lower().replace(" ", "_").replace("-", "_")
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from lxml.etree import _Element
|
|
4
|
+
|
|
5
|
+
from lokit.data.structure import AdjacentContext, Comment, Meta, Origin, TranslationStatus
|
|
6
|
+
from lokit.parsers.tmx.xml_utils import element_children, local_name
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TmxProps:
    """Read metadata, comments, status and custom properties from a TMX ``<tu>``."""

    def __init__(self) -> None:
        # Lower-cased ``<prop type=...>`` values that the dedicated parse_*
        # methods consume; parse_extensions() skips them.
        self._known_props: frozenset[str] = frozenset(
            {
                "status",
                "x-status",
                "x-xtm-status",
                "x-project",
                "x-system",
                "x-domain",
                "x-context",
                "x-key",
                "note",
                "x-note",
                "comment",
                "x-comment",
                "x-previous-id",
                "x-previous-source",
                "x-previous-source-text",
                "x-previous-target",
                "x-previous-target-text",
                "x-next-id",
                "x-next-source",
                "x-next-source-text",
                "x-next-target",
                "x-next-target-text",
            }
        )

    @staticmethod
    def _parse_usage_count(raw: str) -> Optional[int]:
        """Return *raw* as an int, or ``None`` when it is not a plain number.

        ``str.isdecimal`` is used instead of ``str.isdigit``: the latter also
        accepts characters such as superscript digits, which ``int()``
        rejects with ``ValueError``.
        """
        return int(raw) if raw.isdecimal() else None

    def parse_meta(self, element: _Element) -> Meta:
        """Build ``Meta`` from the date and usage attributes of *element*.

        A ``usagecount`` that is not a plain non-negative integer yields
        ``usage_count=None``; the raw value is still kept by
        ``parse_meta_extensions``.
        """
        creation_date: str = element.attrib.get("creationdate") or ""
        usage_count: Optional[int] = self._parse_usage_count(
            element.attrib.get("usagecount") or ""
        )
        return Meta(
            usage_count=usage_count,
            last_used=element.attrib.get("lastusagedate"),
            first_used=None,
            created=creation_date if creation_date else None,
            updated=element.attrib.get("changedate"),
            max_length=None,
            min_length=None,
            extensions=self.parse_meta_extensions(element),
        )

    def parse_comments(self, element: _Element) -> list[Comment]:
        """Collect the comments attached to *element*.

        Note-like ``<prop>`` children (``note``, ``x-note``, ``comment``,
        ``x-comment``) and non-empty ``<note>`` children each become a
        ``Comment``. Origin data (``x-project``, ``x-system``/``x-domain``,
        the ``creationid`` attribute) and the context key
        (``x-context``/``x-key``) are attached to every comment. When only
        origin or context data exists, a single comment with an empty
        context is returned to carry it. Returns ``[]`` when none is present.
        """
        project: str = ""
        system: str = ""
        creator: str = element.attrib.get("creationid") or ""
        context_key: str = ""
        comments: list[Comment] = []

        for child in element_children(element):
            tag_name: str = local_name(child.tag)
            if tag_name == "prop":
                prop_type: str = child.attrib.get("type", "").lower()
                text_val: str = child.text or ""
                if prop_type == "x-project":
                    project = text_val
                elif prop_type in ("x-system", "x-domain"):
                    system = text_val
                elif prop_type in ("x-context", "x-key"):
                    context_key = text_val
                elif prop_type in ("note", "x-note", "comment", "x-comment"):
                    comments.append(
                        Comment(
                            context=text_val.strip(),
                            timestamp=element.attrib.get("changedate"),
                        )
                    )

            elif tag_name == "note" and child.text:
                comments.append(
                    Comment(
                        context=child.text.strip(),
                        timestamp=element.attrib.get("changedate"),
                    )
                )

        if not any([project, system, creator, context_key]) and not comments:
            return []

        origin: Origin = Origin(
            system=system if system else None,
            project=project if project else None,
            creator_id=creator if creator else None,
        )

        if not comments:
            # Placeholder comment that only carries origin / context key.
            comments.append(Comment(context=""))

        for comment in comments:
            comment.origin = origin if any([system, project, creator]) else None
            comment.context_key = context_key if context_key else None
            comment.timestamp = comment.timestamp or element.attrib.get("changedate")

        return comments

    def parse_status(self, element: _Element) -> TranslationStatus:
        """Map the status props of *element* to a ``TranslationStatus``.

        The props ``status``, ``x-status`` and ``x-xtm-status`` are read.
        They are examined from last to first, and the first recognised value
        decides the result. ``UNKNOWN`` is returned when none is recognised.
        """
        status_values: list[str] = []

        for child in element_children(element, "prop"):
            prop_type: str = child.attrib.get("type", "").lower()
            if prop_type in ("status", "x-status", "x-xtm-status"):
                status_values.append((child.text or "").strip().lower())

        for value in reversed(status_values):
            if value in ("approved", "signed-off", "final"):
                return TranslationStatus.APPROVED
            if value in ("reviewed", "review"):
                return TranslationStatus.REVIEWED
            if value in ("translated", "complete"):
                return TranslationStatus.TRANSLATED
            if value in ("new",):
                return TranslationStatus.NEW
            if value in ("draft", "notapproved", "not-approved", "unapproved"):
                return TranslationStatus.DRAFT
            if value in ("rejected",):
                return TranslationStatus.REJECTED

        return TranslationStatus.UNKNOWN

    def parse_adjacent_context(
        self, element: _Element
    ) -> tuple[Optional[AdjacentContext], Optional[AdjacentContext]]:
        """Return the ``(previous, next)`` neighbour contexts of *element*.

        Values come from the ``x-previous-*`` and ``x-next-*`` props. A side
        is ``None`` when none of its id/source/target values is non-empty.
        """
        prev_id: Optional[str] = None
        prev_src: Optional[str] = None
        prev_tgt: Optional[str] = None
        next_id: Optional[str] = None
        next_src: Optional[str] = None
        next_tgt: Optional[str] = None

        for child in element_children(element, "prop"):
            prop_type: str = child.attrib.get("type", "").lower()
            text_val: str = child.text or ""
            if prop_type == "x-previous-id":
                prev_id = text_val
            elif prop_type in ("x-previous-source", "x-previous-source-text"):
                prev_src = text_val
            elif prop_type in ("x-previous-target", "x-previous-target-text"):
                prev_tgt = text_val
            elif prop_type == "x-next-id":
                next_id = text_val
            elif prop_type in ("x-next-source", "x-next-source-text"):
                next_src = text_val
            elif prop_type in ("x-next-target", "x-next-target-text"):
                next_tgt = text_val

        prev_ctx: Optional[AdjacentContext] = (
            AdjacentContext(unit_id=prev_id, source=prev_src, target=prev_tgt)
            if any([prev_id, prev_src, prev_tgt])
            else None
        )

        next_ctx: Optional[AdjacentContext] = (
            AdjacentContext(
                unit_id=next_id,
                source=next_src,
                target=next_tgt,
            )
            if any([next_id, next_src, next_tgt])
            else None
        )

        return prev_ctx, next_ctx

    def parse_meta_extensions(self, element: _Element) -> dict[str, str]:
        """Return the non-empty ``changeid`` and raw ``usagecount`` attributes."""
        extensions: dict[str, str] = {}

        change_id = element.attrib.get("changeid")
        if change_id:
            extensions["change_id"] = change_id

        usage_count = element.attrib.get("usagecount")
        if usage_count:
            extensions["usage_count_raw"] = usage_count

        return extensions

    def parse_extensions(self, element: _Element) -> dict[str, str]:
        """Return every ``<prop>`` not handled elsewhere as ``property.<type>``.

        A prop without a ``type`` is stored as ``property.unknown``. When
        several props share a type, the last one wins.
        """
        extensions: dict[str, str] = {}

        for child in element_children(element, "prop"):
            prop_type = (child.attrib.get("type") or "unknown").lower()
            if prop_type in self._known_props:
                continue
            extensions[f"property.{self._normalize_key(prop_type)}"] = child.text or ""

        return extensions

    def _normalize_key(self, value: str) -> str:
        """Lower-case *value* and replace spaces and hyphens with underscores."""
        return value.lower().replace(" ", "_").replace("-", "_")
|
|
Binary file
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
from lxml.etree import _Element
|
|
2
|
+
|
|
3
|
+
from lokit.data.structure import CodePart, SegmentPart, TextPart
|
|
4
|
+
from lokit.data.tag_types import TieData, TieType
|
|
5
|
+
from lokit.parsers.tmx.helpers import TMX_TAG_MAP
|
|
6
|
+
from lokit.parsers.tmx.xml_utils import element_children, local_name
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TmxTagParser:
    """Flatten a TMX ``<seg>`` element into text, inline-code data and parts."""

    def __init__(self) -> None:
        pass

    def parse(
        self, element: _Element
    ) -> tuple[str, dict[str, TieData], list[SegmentPart]]:
        """Split *element* into plain text and inline codes.

        Returns ``(text, tag_map, parts)``. ``text`` is the element's leading
        text plus the tail text after each child; text inside child elements
        is not read. ``tag_map`` maps generated ids (``c0``, ``c1``, ...) to
        ``TieData`` recording the code type, its offset in ``text``, its
        order and a normalised pair id taken from the ``i`` or ``id``
        attribute. ``parts`` lists text and code parts in document order.
        """
        raw_txt: str = ""
        tag_map: dict[str, TieData] = {}
        parts: list[SegmentPart] = []
        order: int = 0
        pair_ids: dict[str, str] = {}

        if element.text:
            raw_txt += element.text
            parts.append(TextPart(element.text))

        for child in element_children(element):
            tag_name: str = local_name(child.tag)

            # Comments and processing instructions have a non-string tag, so
            # local_name() returns "". They are not inline codes and must not
            # produce a placeholder; the text following them is still kept.
            if tag_name:
                tie_type: TieType = TMX_TAG_MAP.get(tag_name, TieType.CUSTOM_STANDALONE)
                source_pair_id: str | None = child.attrib.get("i") or child.attrib.get("id")
                pair_id = self._normalize_pair_id(source_pair_id, pair_ids)
                tie_id = f"c{order}"

                tag_map[tie_id] = TieData(
                    id=tie_id,
                    type=tie_type,
                    position=len(raw_txt),
                    order=order,
                    pair_id=pair_id,
                )
                parts.append(CodePart(tie_id))
                order += 1

            if child.tail:
                raw_txt += child.tail
                parts.append(TextPart(child.tail))

        return raw_txt, tag_map, parts

    def _normalize_pair_id(
        self, source_pair_id: str | None, pair_ids: dict[str, str]
    ) -> str | None:
        """Map a pair id from the file to a stable ``p<N>`` id.

        *pair_ids* remembers earlier assignments, so the same source id
        always gets the same normalised id. ``None`` is passed through.
        """
        if source_pair_id is None:
            return None
        existing = pair_ids.get(source_pair_id)
        if existing is not None:
            return existing
        normalized = f"p{len(pair_ids)}"
        pair_ids[source_pair_id] = normalized
        return normalized
|
|
Binary file
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from typing import BinaryIO
|
|
5
|
+
|
|
6
|
+
from lxml import etree
|
|
7
|
+
from lxml.etree import _Element
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def local_name(tag: object) -> str:
    """Return *tag* without a leading ``{namespace}`` part.

    Anything that is not a string yields ``""``.
    """
    if isinstance(tag, str):
        # Keep whatever follows the last "}"; the whole tag if there is none.
        _, _, bare = tag.rpartition("}")
        return bare
    return ""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def iterparse_safe(
    source: str | BinaryIO,
    events: tuple[str, ...],
) -> etree.iterparse[etree._Element]:
    """Create an ``lxml`` incremental parser with hardened settings.

    Entity resolution (``resolve_entities=False``) and network access
    (``no_network=True``) are both switched off for *source*.
    """
    parser = etree.iterparse(
        source,
        events=events,
        resolve_entities=False,
        no_network=True,
    )
    return parser
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def element_children(element: _Element, name: str | None = None) -> Iterator[_Element]:
    """Yield the direct children of *element*.

    Without *name* every child is yielded; otherwise only the children whose
    local tag name equals *name*.
    """
    if name is None:
        yield from element
        return

    for node in element:
        if local_name(node.tag) == name:
            yield node
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def find_child(element: _Element, name: str) -> _Element | None:
    """Return the first direct child of *element* named *name*, or ``None``."""
    return next(element_children(element, name), None)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def clear_element(element: _Element) -> None:
    """Empty *element* and drop the siblings that precede it.

    After ``element.clear()``, the first child of the parent is deleted
    repeatedly until *element* has no previous sibling left. Nothing is
    deleted when the element has no parent.
    """
    element.clear()
    while True:
        if element.getprevious() is None:
            return
        container = element.getparent()
        if container is None:
            return
        del container[0]
|
|
Binary file
|
|
Binary file
|