lokit-python 0.1.0__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- 821d8b73c2a02cb7980f__mypyc.cp313-win_amd64.pyd +0 -0
- lokit/__init__.cp313-win_amd64.pyd +0 -0
- lokit/__init__.py +128 -0
- lokit/core/__init__.cp313-win_amd64.pyd +0 -0
- lokit/core/__init__.py +0 -0
- lokit/core/logger.cp313-win_amd64.pyd +0 -0
- lokit/core/logger.py +20 -0
- lokit/data/__init__.cp313-win_amd64.pyd +0 -0
- lokit/data/__init__.py +0 -0
- lokit/data/lang_codes.cp313-win_amd64.pyd +0 -0
- lokit/data/lang_codes.py +455 -0
- lokit/data/structure.cp313-win_amd64.pyd +0 -0
- lokit/data/structure.py +118 -0
- lokit/data/tag_types.cp313-win_amd64.pyd +0 -0
- lokit/data/tag_types.py +78 -0
- lokit/exporters/__init__.cp313-win_amd64.pyd +0 -0
- lokit/exporters/__init__.py +34 -0
- lokit/exporters/csv.cp313-win_amd64.pyd +0 -0
- lokit/exporters/csv.py +32 -0
- lokit/exporters/html.cp313-win_amd64.pyd +0 -0
- lokit/exporters/html.py +217 -0
- lokit/exporters/idml.cp313-win_amd64.pyd +0 -0
- lokit/exporters/idml.py +178 -0
- lokit/exporters/json_i18n.cp313-win_amd64.pyd +0 -0
- lokit/exporters/json_i18n.py +47 -0
- lokit/exporters/po.cp313-win_amd64.pyd +0 -0
- lokit/exporters/po.py +162 -0
- lokit/exporters/tmx.cp313-win_amd64.pyd +0 -0
- lokit/exporters/tmx.py +247 -0
- lokit/exporters/xliff.cp313-win_amd64.pyd +0 -0
- lokit/exporters/xliff.py +152 -0
- lokit/exporters/xlsx.cp313-win_amd64.pyd +0 -0
- lokit/exporters/xlsx.py +39 -0
- lokit/format_detection.cp313-win_amd64.pyd +0 -0
- lokit/format_detection.py +115 -0
- lokit/importers.py +321 -0
- lokit/io/__init__.cp313-win_amd64.pyd +0 -0
- lokit/io/__init__.py +3 -0
- lokit/io/json.cp313-win_amd64.pyd +0 -0
- lokit/io/json.py +194 -0
- lokit/logic.cp313-win_amd64.pyd +0 -0
- lokit/logic.py +324 -0
- lokit/parsers/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/__init__.py +1 -0
- lokit/parsers/csv/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/csv/__init__.py +1 -0
- lokit/parsers/csv/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/csv/extraction.py +164 -0
- lokit/parsers/html/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/html/__init__.py +3 -0
- lokit/parsers/html/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/html/extraction.py +365 -0
- lokit/parsers/idml/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/idml/__init__.py +3 -0
- lokit/parsers/idml/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/idml/extraction.py +264 -0
- lokit/parsers/json_i18n/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/json_i18n/__init__.py +3 -0
- lokit/parsers/json_i18n/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/json_i18n/extraction.py +163 -0
- lokit/parsers/po/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/po/__init__.py +3 -0
- lokit/parsers/po/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/po/extraction.py +236 -0
- lokit/parsers/tmx/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/__init__.py +0 -0
- lokit/parsers/tmx/base.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/base.py +145 -0
- lokit/parsers/tmx/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/extraction.py +170 -0
- lokit/parsers/tmx/header.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/header.py +55 -0
- lokit/parsers/tmx/helpers.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/helpers.py +9 -0
- lokit/parsers/tmx/models.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/models.py +10 -0
- lokit/parsers/tmx/props.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/props.py +201 -0
- lokit/parsers/tmx/tags.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/tags.py +59 -0
- lokit/parsers/tmx/xml_utils.cp313-win_amd64.pyd +0 -0
- lokit/parsers/tmx/xml_utils.py +46 -0
- lokit/parsers/xliff/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/__init__.py +3 -0
- lokit/parsers/xliff/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/extraction.py +229 -0
- lokit/parsers/xliff/tags.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xliff/tags.py +128 -0
- lokit/parsers/xlsx/__init__.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xlsx/__init__.py +1 -0
- lokit/parsers/xlsx/extraction.cp313-win_amd64.pyd +0 -0
- lokit/parsers/xlsx/extraction.py +198 -0
- lokit/py.typed +1 -0
- lokit_python-0.1.0.dist-info/METADATA +149 -0
- lokit_python-0.1.0.dist-info/RECORD +97 -0
- lokit_python-0.1.0.dist-info/WHEEL +5 -0
- lokit_python-0.1.0.dist-info/top_level.txt +2 -0
lokit/logic.py
ADDED
|
@@ -0,0 +1,324 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import tempfile
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from collections.abc import Callable, Iterator
|
|
7
|
+
from dataclasses import asdict, dataclass, is_dataclass
|
|
8
|
+
from difflib import SequenceMatcher
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Self, cast
|
|
11
|
+
|
|
12
|
+
from lokit.data.structure import BaseStructure, Data
|
|
13
|
+
from lokit.exporters import (
|
|
14
|
+
export_csv,
|
|
15
|
+
export_idml,
|
|
16
|
+
export_html,
|
|
17
|
+
export_json_i18n,
|
|
18
|
+
export_po,
|
|
19
|
+
export_tmx,
|
|
20
|
+
export_xliff,
|
|
21
|
+
export_xlsx,
|
|
22
|
+
)
|
|
23
|
+
from lokit.format_detection import LokitInputFormat, detect_format, detect_format_from_bytes
|
|
24
|
+
from lokit.importers import (
|
|
25
|
+
import_csv,
|
|
26
|
+
import_idml,
|
|
27
|
+
import_html,
|
|
28
|
+
import_json_i18n,
|
|
29
|
+
import_po,
|
|
30
|
+
import_tmx,
|
|
31
|
+
import_xliff,
|
|
32
|
+
import_xlsx,
|
|
33
|
+
)
|
|
34
|
+
from lokit.io import load_lokit_json, load_lokit_json_bytes
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(slots=True)
|
|
38
|
+
class MatchResult:
|
|
39
|
+
unit_id: str
|
|
40
|
+
score: float
|
|
41
|
+
kind: str
|
|
42
|
+
source_equal: bool
|
|
43
|
+
tags_equal: bool
|
|
44
|
+
previous_equal: bool
|
|
45
|
+
next_equal: bool
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Lokit:
|
|
49
|
+
def __init__(self, document: BaseStructure) -> None:
|
|
50
|
+
self.document = document
|
|
51
|
+
self._ids: list[str] = list(document.data)
|
|
52
|
+
self._positions: dict[str, int] = {
|
|
53
|
+
unit_id: index for index, unit_id in enumerate(self._ids)
|
|
54
|
+
}
|
|
55
|
+
self._source_index: dict[str, list[str]] = defaultdict(list)
|
|
56
|
+
for unit_id, unit in document.data.items():
|
|
57
|
+
self._source_index[_normalize_text(unit.source)].append(unit_id)
|
|
58
|
+
|
|
59
|
+
@classmethod
|
|
60
|
+
def parse(cls, filepath: str | Path) -> Self:
|
|
61
|
+
path = Path(filepath)
|
|
62
|
+
input_format = detect_format(path)
|
|
63
|
+
if input_format == LokitInputFormat.TMX:
|
|
64
|
+
return cls(import_tmx(str(path)))
|
|
65
|
+
if input_format == LokitInputFormat.XLIFF:
|
|
66
|
+
return cls(import_xliff(str(path)))
|
|
67
|
+
if input_format == LokitInputFormat.CSV:
|
|
68
|
+
return cls(import_csv(str(path)))
|
|
69
|
+
if input_format == LokitInputFormat.XLSX:
|
|
70
|
+
return cls(import_xlsx(str(path)))
|
|
71
|
+
if input_format == LokitInputFormat.HTML:
|
|
72
|
+
return cls(import_html(str(path)))
|
|
73
|
+
if input_format == LokitInputFormat.PO:
|
|
74
|
+
return cls(import_po(str(path)))
|
|
75
|
+
if input_format == LokitInputFormat.JSON_I18N:
|
|
76
|
+
return cls(import_json_i18n(str(path)))
|
|
77
|
+
if input_format == LokitInputFormat.IDML:
|
|
78
|
+
return cls(import_idml(str(path)))
|
|
79
|
+
return cls(load_lokit_json(path))
|
|
80
|
+
|
|
81
|
+
@classmethod
|
|
82
|
+
def parse_bytes(cls, data: bytes) -> Self:
|
|
83
|
+
input_format = detect_format_from_bytes(data)
|
|
84
|
+
if input_format == LokitInputFormat.LOKIT_JSON:
|
|
85
|
+
return cls(load_lokit_json_bytes(data))
|
|
86
|
+
suffix_map = {
|
|
87
|
+
LokitInputFormat.TMX: ".tmx",
|
|
88
|
+
LokitInputFormat.XLIFF: ".xliff",
|
|
89
|
+
LokitInputFormat.CSV: ".csv",
|
|
90
|
+
LokitInputFormat.XLSX: ".xlsx",
|
|
91
|
+
LokitInputFormat.HTML: ".html",
|
|
92
|
+
LokitInputFormat.PO: ".po",
|
|
93
|
+
LokitInputFormat.JSON_I18N: ".json",
|
|
94
|
+
LokitInputFormat.IDML: ".idml",
|
|
95
|
+
}
|
|
96
|
+
suffix = suffix_map.get(input_format, ".json")
|
|
97
|
+
with tempfile.NamedTemporaryFile(suffix=suffix) as temp:
|
|
98
|
+
temp.write(data)
|
|
99
|
+
temp.flush()
|
|
100
|
+
return cls.parse(temp.name)
|
|
101
|
+
|
|
102
|
+
@classmethod
|
|
103
|
+
def from_document(cls, document: BaseStructure) -> Self:
|
|
104
|
+
return cls(document)
|
|
105
|
+
|
|
106
|
+
def output(self, filepath: str | Path) -> None:
|
|
107
|
+
path = Path(filepath)
|
|
108
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
109
|
+
suffix = path.suffix.lower()
|
|
110
|
+
if suffix == ".tmx":
|
|
111
|
+
export_tmx(self.document, path)
|
|
112
|
+
elif suffix in (".xlf", ".xliff"):
|
|
113
|
+
export_xliff(self.document, path)
|
|
114
|
+
elif suffix == ".csv":
|
|
115
|
+
export_csv(self.document, path)
|
|
116
|
+
elif suffix == ".xlsx":
|
|
117
|
+
export_xlsx(self.document, path)
|
|
118
|
+
elif suffix in (".html", ".htm"):
|
|
119
|
+
source_html = self.document.extensions.get("source_file") or self.document.extensions.get("source_html")
|
|
120
|
+
export_html(self.document, path, source_html)
|
|
121
|
+
elif suffix == ".po":
|
|
122
|
+
export_po(self.document, path)
|
|
123
|
+
elif suffix == ".json":
|
|
124
|
+
if self.document.extensions.get("input_format") == "json_i18n":
|
|
125
|
+
export_json_i18n(self.document, path)
|
|
126
|
+
else:
|
|
127
|
+
path.write_text(
|
|
128
|
+
json.dumps(asdict(self.document), ensure_ascii=False, indent=2, default=str),
|
|
129
|
+
encoding="utf-8",
|
|
130
|
+
)
|
|
131
|
+
elif suffix == ".idml":
|
|
132
|
+
source_idml = self.document.extensions.get("source_file") or self.document.extensions.get("source_idml")
|
|
133
|
+
if not source_idml:
|
|
134
|
+
raise ValueError("Original IDML file path not found in document extensions. Cannot export IDML without source IDML.")
|
|
135
|
+
export_idml(self.document, path, source_idml)
|
|
136
|
+
else:
|
|
137
|
+
path.write_text(
|
|
138
|
+
json.dumps(asdict(self.document), ensure_ascii=False, indent=2, default=str),
|
|
139
|
+
encoding="utf-8",
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
def unit(self, unit_id: str) -> Data:
|
|
143
|
+
return self.document.data[unit_id]
|
|
144
|
+
|
|
145
|
+
def all(self) -> Iterator[tuple[str, Data]]:
|
|
146
|
+
yield from self.document.data.items()
|
|
147
|
+
|
|
148
|
+
def ids(self) -> list[str]:
|
|
149
|
+
return list(self._ids)
|
|
150
|
+
|
|
151
|
+
def previous(self, unit_id: str) -> tuple[str, Data] | None:
|
|
152
|
+
index = self._positions.get(unit_id)
|
|
153
|
+
if index is None or index == 0:
|
|
154
|
+
return None
|
|
155
|
+
prev_id = self._ids[index - 1]
|
|
156
|
+
return prev_id, self.document.data[prev_id]
|
|
157
|
+
|
|
158
|
+
def next(self, unit_id: str) -> tuple[str, Data] | None:
|
|
159
|
+
index = self._positions.get(unit_id)
|
|
160
|
+
if index is None or index + 1 >= len(self._ids):
|
|
161
|
+
return None
|
|
162
|
+
next_id = self._ids[index + 1]
|
|
163
|
+
return next_id, self.document.data[next_id]
|
|
164
|
+
|
|
165
|
+
def plurals(self) -> Iterator[tuple[str, Data]]:
|
|
166
|
+
for unit_id, unit in self.document.data.items():
|
|
167
|
+
if unit.plural is not None:
|
|
168
|
+
yield unit_id, unit
|
|
169
|
+
|
|
170
|
+
def filter(
|
|
171
|
+
self,
|
|
172
|
+
predicate: Callable[[str, Data], bool],
|
|
173
|
+
) -> list[str]:
|
|
174
|
+
return [
|
|
175
|
+
unit_id
|
|
176
|
+
for unit_id, unit in self.document.data.items()
|
|
177
|
+
if predicate(unit_id, unit)
|
|
178
|
+
]
|
|
179
|
+
|
|
180
|
+
def where(self, key_path: str, value: object) -> list[str]:
|
|
181
|
+
expected = str(value)
|
|
182
|
+
return [
|
|
183
|
+
unit_id
|
|
184
|
+
for unit_id, unit in self.document.data.items()
|
|
185
|
+
if expected in _values_at_path(unit, key_path.split("."))
|
|
186
|
+
]
|
|
187
|
+
|
|
188
|
+
def fuzzy_find(
|
|
189
|
+
self,
|
|
190
|
+
source: str,
|
|
191
|
+
limit: int = 10,
|
|
192
|
+
threshold: float = 0.0,
|
|
193
|
+
) -> list[MatchResult]:
|
|
194
|
+
normalized = _normalize_text(source)
|
|
195
|
+
exact_ids = self._source_index.get(normalized, [])
|
|
196
|
+
exact_results = [
|
|
197
|
+
self._match_against_unit(source, unit_id, require_context=False, require_tags=False)
|
|
198
|
+
for unit_id in exact_ids
|
|
199
|
+
]
|
|
200
|
+
if len(exact_results) >= limit:
|
|
201
|
+
return exact_results[:limit]
|
|
202
|
+
|
|
203
|
+
candidates: list[MatchResult] = exact_results
|
|
204
|
+
exact_set = set(exact_ids)
|
|
205
|
+
for unit_id, unit in self.document.data.items():
|
|
206
|
+
if unit_id in exact_set:
|
|
207
|
+
continue
|
|
208
|
+
score = SequenceMatcher(None, normalized, _normalize_text(unit.source)).ratio()
|
|
209
|
+
if score >= threshold:
|
|
210
|
+
candidates.append(
|
|
211
|
+
MatchResult(
|
|
212
|
+
unit_id=unit_id,
|
|
213
|
+
score=score,
|
|
214
|
+
kind="fuzzy",
|
|
215
|
+
source_equal=False,
|
|
216
|
+
tags_equal=False,
|
|
217
|
+
previous_equal=False,
|
|
218
|
+
next_equal=False,
|
|
219
|
+
)
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
candidates.sort(key=lambda item: item.score, reverse=True)
|
|
223
|
+
return candidates[:limit]
|
|
224
|
+
|
|
225
|
+
def match(
|
|
226
|
+
self,
|
|
227
|
+
source: str,
|
|
228
|
+
target_unit_id: str,
|
|
229
|
+
previous_source: str | None = None,
|
|
230
|
+
next_source: str | None = None,
|
|
231
|
+
tag_signature: tuple[tuple[str, str | None], ...] | None = None,
|
|
232
|
+
require_context: bool = False,
|
|
233
|
+
require_tags: bool = False,
|
|
234
|
+
) -> MatchResult:
|
|
235
|
+
return self._match_against_unit(
|
|
236
|
+
source,
|
|
237
|
+
target_unit_id,
|
|
238
|
+
previous_source=previous_source,
|
|
239
|
+
next_source=next_source,
|
|
240
|
+
tag_signature=tag_signature,
|
|
241
|
+
require_context=require_context,
|
|
242
|
+
require_tags=require_tags,
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
def _match_against_unit(
|
|
246
|
+
self,
|
|
247
|
+
source: str,
|
|
248
|
+
unit_id: str,
|
|
249
|
+
previous_source: str | None = None,
|
|
250
|
+
next_source: str | None = None,
|
|
251
|
+
tag_signature: tuple[tuple[str, str | None], ...] | None = None,
|
|
252
|
+
require_context: bool = False,
|
|
253
|
+
require_tags: bool = False,
|
|
254
|
+
) -> MatchResult:
|
|
255
|
+
unit = self.document.data[unit_id]
|
|
256
|
+
source_equal = _normalize_text(source) == _normalize_text(unit.source)
|
|
257
|
+
score = SequenceMatcher(None, _normalize_text(source), _normalize_text(unit.source)).ratio()
|
|
258
|
+
tags_equal = (not require_tags) or (
|
|
259
|
+
tag_signature is not None and tag_signature == _tags_signature(unit)
|
|
260
|
+
)
|
|
261
|
+
previous_equal = (not require_context) or (
|
|
262
|
+
previous_source is not None
|
|
263
|
+
and _normalize_text(previous_source)
|
|
264
|
+
== _normalize_text(_context_text(unit.previous_context) or "")
|
|
265
|
+
)
|
|
266
|
+
next_equal = (not require_context) or (
|
|
267
|
+
next_source is not None
|
|
268
|
+
and _normalize_text(next_source)
|
|
269
|
+
== _normalize_text(_context_text(unit.next_context) or "")
|
|
270
|
+
)
|
|
271
|
+
checked_ice_context = require_context or require_tags
|
|
272
|
+
is_ice = checked_ice_context and source_equal and tags_equal and previous_equal and next_equal
|
|
273
|
+
return MatchResult(
|
|
274
|
+
unit_id=unit_id,
|
|
275
|
+
score=1.0 if is_ice else score,
|
|
276
|
+
kind="ice" if is_ice else ("exact" if source_equal else "fuzzy"),
|
|
277
|
+
source_equal=source_equal,
|
|
278
|
+
tags_equal=tags_equal,
|
|
279
|
+
previous_equal=previous_equal,
|
|
280
|
+
next_equal=next_equal,
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _normalize_text(value: str) -> str:
|
|
285
|
+
return " ".join(value.casefold().split())
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _values_at_path(root: object, path: list[str]) -> list[str]:
|
|
289
|
+
if not path:
|
|
290
|
+
return [str(root)] if root is not None else []
|
|
291
|
+
head = path[0]
|
|
292
|
+
tail = path[1:]
|
|
293
|
+
|
|
294
|
+
if isinstance(root, list):
|
|
295
|
+
values: list[str] = []
|
|
296
|
+
for item in root:
|
|
297
|
+
values.extend(_values_at_path(item, path))
|
|
298
|
+
return values
|
|
299
|
+
|
|
300
|
+
if isinstance(root, dict):
|
|
301
|
+
if head not in root:
|
|
302
|
+
return []
|
|
303
|
+
return _values_at_path(root[head], tail)
|
|
304
|
+
|
|
305
|
+
if is_dataclass(root):
|
|
306
|
+
if not hasattr(root, head):
|
|
307
|
+
return []
|
|
308
|
+
return _values_at_path(getattr(root, head), tail)
|
|
309
|
+
|
|
310
|
+
return []
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _tags_signature(unit: Data) -> tuple[tuple[str, str | None], ...]:
|
|
314
|
+
if unit.tags is None:
|
|
315
|
+
return ()
|
|
316
|
+
ordered = sorted(unit.tags.source_tag_map.values(), key=lambda item: item.order)
|
|
317
|
+
return tuple((tag.type.value, tag.pair_id) for tag in ordered)
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _context_text(context: object) -> str | None:
|
|
321
|
+
if context is None:
|
|
322
|
+
return None
|
|
323
|
+
source = getattr(context, "source", None)
|
|
324
|
+
return cast(str | None, source)
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
Binary file
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
Binary file
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import csv
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import AsyncIterator, Iterator, Optional
|
|
8
|
+
|
|
9
|
+
from lokit.data.structure import Comment, Data, TranslationStatus
|
|
10
|
+
|
|
11
|
+
ExtractItem = tuple[str, Data]
|
|
12
|
+
|
|
13
|
+
_KNOWN_COLUMNS = frozenset({"id", "source", "target", "status", "comment"})
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _parse_base_lang(locale: str) -> str:
|
|
17
|
+
return locale.replace("_", "-").split("-")[0].lower()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _parse_status(value: str) -> TranslationStatus:
    """Convert *value* to a ``TranslationStatus``.

    The text is stripped and lower-cased before lookup; a value the enum
    rejects maps to ``TranslationStatus.UNKNOWN``.
    """
    try:
        status = TranslationStatus(value.strip().lower())
    except ValueError:
        status = TranslationStatus.UNKNOWN
    return status
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _infer_locales_from_filename(filepath: str) -> tuple[str, str | None]:
|
|
29
|
+
stem = Path(filepath).stem
|
|
30
|
+
if "-" in stem:
|
|
31
|
+
parts = stem.split("-")
|
|
32
|
+
if len(parts) == 2:
|
|
33
|
+
return parts[0], parts[1]
|
|
34
|
+
if len(parts) == 4:
|
|
35
|
+
return f"{parts[0]}-{parts[1]}", f"{parts[2]}-{parts[3]}"
|
|
36
|
+
if "_" in stem:
|
|
37
|
+
parts = stem.split("_")
|
|
38
|
+
if len(parts) == 2:
|
|
39
|
+
return parts[0], parts[1]
|
|
40
|
+
if len(parts) == 4:
|
|
41
|
+
return f"{parts[0]}_{parts[1]}", f"{parts[2]}_{parts[3]}"
|
|
42
|
+
return "", None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(slots=True)
|
|
46
|
+
class _AsyncExtractionResult:
|
|
47
|
+
item: Optional[ExtractItem] = None
|
|
48
|
+
error: Optional[BaseException] = None
|
|
49
|
+
done: bool = False
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class AsyncCsvExtraction:
|
|
53
|
+
def __init__(self, extractor: CsvExtractor) -> None:
|
|
54
|
+
self._extractor = extractor
|
|
55
|
+
self._queue: asyncio.Queue[_AsyncExtractionResult] = asyncio.Queue()
|
|
56
|
+
self._producer: asyncio.Task[None] | None = None
|
|
57
|
+
|
|
58
|
+
def __aiter__(self) -> AsyncCsvExtraction:
|
|
59
|
+
return self
|
|
60
|
+
|
|
61
|
+
async def __anext__(self) -> ExtractItem:
|
|
62
|
+
if self._producer is None:
|
|
63
|
+
self._start()
|
|
64
|
+
|
|
65
|
+
result = await self._queue.get()
|
|
66
|
+
if result.done:
|
|
67
|
+
await self._finish()
|
|
68
|
+
raise StopAsyncIteration
|
|
69
|
+
if result.error is not None:
|
|
70
|
+
await self._finish()
|
|
71
|
+
raise result.error
|
|
72
|
+
if result.item is None:
|
|
73
|
+
await self._finish()
|
|
74
|
+
raise StopAsyncIteration
|
|
75
|
+
return result.item
|
|
76
|
+
|
|
77
|
+
def _start(self) -> None:
|
|
78
|
+
loop = asyncio.get_running_loop()
|
|
79
|
+
|
|
80
|
+
def produce() -> None:
|
|
81
|
+
try:
|
|
82
|
+
for item in self._extractor.extract():
|
|
83
|
+
loop.call_soon_threadsafe(
|
|
84
|
+
self._queue.put_nowait,
|
|
85
|
+
_AsyncExtractionResult(item=item),
|
|
86
|
+
)
|
|
87
|
+
except BaseException as exc:
|
|
88
|
+
loop.call_soon_threadsafe(
|
|
89
|
+
self._queue.put_nowait,
|
|
90
|
+
_AsyncExtractionResult(error=exc),
|
|
91
|
+
)
|
|
92
|
+
finally:
|
|
93
|
+
loop.call_soon_threadsafe(
|
|
94
|
+
self._queue.put_nowait,
|
|
95
|
+
_AsyncExtractionResult(done=True),
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
self._producer = asyncio.create_task(asyncio.to_thread(produce))
|
|
99
|
+
|
|
100
|
+
async def _finish(self) -> None:
|
|
101
|
+
if self._producer is not None:
|
|
102
|
+
await self._producer
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class CsvExtractor:
    """Read translation units from a CSV file with a header row.

    The columns ``id``, ``source``, ``target``, ``status`` and ``comment`` are
    interpreted; any other column is kept in the unit's ``extensions``.
    """

    def __init__(
        self,
        filepath: str,
        source_locale: str = "",
        target_locale: str | None = None,
    ) -> None:
        """Store *filepath* and resolve the source/target locales.

        When *source_locale* is empty, both locales are inferred from the file
        name; an explicit *target_locale* still takes precedence over the
        inferred one.
        """
        self.filepath: str = filepath

        if source_locale:
            self.source_locale: str = source_locale
            self.target_locale: str | None = target_locale
        else:
            inferred_source, inferred_target = _infer_locales_from_filename(filepath)
            self.source_locale = inferred_source
            self.target_locale = target_locale or inferred_target

        self.source_language: str | None = (
            _parse_base_lang(self.source_locale) if self.source_locale else None
        )
        self.target_language: str | None = (
            _parse_base_lang(self.target_locale) if self.target_locale else None
        )

        self.export_origin: str = ""
        self.export_timestamp: str = ""
        self.extensions: dict[str, str] = {"input_format": "csv"}

    def extract(self) -> Iterator[ExtractItem]:
        """Yield ``(unit_id, Data)`` for every data row of the file.

        Rows without an ``id`` value get a positional id of the form
        ``csv:<row index>``.
        """
        # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM, so
        # the first header is not read as "\ufeffid" in BOM-prefixed files.
        with open(self.filepath, newline="", encoding="utf-8-sig") as fh:
            reader = csv.DictReader(fh)
            fieldnames: list[str] = list(reader.fieldnames or [])
            has_id = "id" in fieldnames
            extra_columns = [c for c in fieldnames if c not in _KNOWN_COLUMNS]

            for index, row in enumerate(reader):
                unit_id = row["id"] if has_id and row.get("id") else f"csv:{index}"
                # DictReader fills the missing cells of short rows with None,
                # so a .get() default is not enough to guarantee a string.
                source = row.get("source") or ""
                target = row.get("target") or None
                status = _parse_status(row["status"]) if row.get("status") else TranslationStatus.UNKNOWN

                comments: list[Comment] = []
                comment_text = (row.get("comment") or "").strip()
                if comment_text:
                    comments.append(Comment(context=comment_text))

                extensions: dict[str, str] = {
                    col: row[col] for col in extra_columns if row.get(col)
                }

                yield unit_id, Data(
                    source=source,
                    target=target,
                    status=status,
                    comments=comments,
                    extensions=extensions,
                )

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator over the same items as ``extract``."""
        return AsyncCsvExtraction(self)
|
|
Binary file
|
|
Binary file
|