lokit-python 0.1.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. 821d8b73c2a02cb7980f__mypyc.cp313-win_amd64.pyd +0 -0
  2. lokit/__init__.cp313-win_amd64.pyd +0 -0
  3. lokit/__init__.py +128 -0
  4. lokit/core/__init__.cp313-win_amd64.pyd +0 -0
  5. lokit/core/__init__.py +0 -0
  6. lokit/core/logger.cp313-win_amd64.pyd +0 -0
  7. lokit/core/logger.py +20 -0
  8. lokit/data/__init__.cp313-win_amd64.pyd +0 -0
  9. lokit/data/__init__.py +0 -0
  10. lokit/data/lang_codes.cp313-win_amd64.pyd +0 -0
  11. lokit/data/lang_codes.py +455 -0
  12. lokit/data/structure.cp313-win_amd64.pyd +0 -0
  13. lokit/data/structure.py +118 -0
  14. lokit/data/tag_types.cp313-win_amd64.pyd +0 -0
  15. lokit/data/tag_types.py +78 -0
  16. lokit/exporters/__init__.cp313-win_amd64.pyd +0 -0
  17. lokit/exporters/__init__.py +34 -0
  18. lokit/exporters/csv.cp313-win_amd64.pyd +0 -0
  19. lokit/exporters/csv.py +32 -0
  20. lokit/exporters/html.cp313-win_amd64.pyd +0 -0
  21. lokit/exporters/html.py +217 -0
  22. lokit/exporters/idml.cp313-win_amd64.pyd +0 -0
  23. lokit/exporters/idml.py +178 -0
  24. lokit/exporters/json_i18n.cp313-win_amd64.pyd +0 -0
  25. lokit/exporters/json_i18n.py +47 -0
  26. lokit/exporters/po.cp313-win_amd64.pyd +0 -0
  27. lokit/exporters/po.py +162 -0
  28. lokit/exporters/tmx.cp313-win_amd64.pyd +0 -0
  29. lokit/exporters/tmx.py +247 -0
  30. lokit/exporters/xliff.cp313-win_amd64.pyd +0 -0
  31. lokit/exporters/xliff.py +152 -0
  32. lokit/exporters/xlsx.cp313-win_amd64.pyd +0 -0
  33. lokit/exporters/xlsx.py +39 -0
  34. lokit/format_detection.cp313-win_amd64.pyd +0 -0
  35. lokit/format_detection.py +115 -0
  36. lokit/importers.py +321 -0
  37. lokit/io/__init__.cp313-win_amd64.pyd +0 -0
  38. lokit/io/__init__.py +3 -0
  39. lokit/io/json.cp313-win_amd64.pyd +0 -0
  40. lokit/io/json.py +194 -0
  41. lokit/logic.cp313-win_amd64.pyd +0 -0
  42. lokit/logic.py +324 -0
  43. lokit/parsers/__init__.cp313-win_amd64.pyd +0 -0
  44. lokit/parsers/__init__.py +1 -0
  45. lokit/parsers/csv/__init__.cp313-win_amd64.pyd +0 -0
  46. lokit/parsers/csv/__init__.py +1 -0
  47. lokit/parsers/csv/extraction.cp313-win_amd64.pyd +0 -0
  48. lokit/parsers/csv/extraction.py +164 -0
  49. lokit/parsers/html/__init__.cp313-win_amd64.pyd +0 -0
  50. lokit/parsers/html/__init__.py +3 -0
  51. lokit/parsers/html/extraction.cp313-win_amd64.pyd +0 -0
  52. lokit/parsers/html/extraction.py +365 -0
  53. lokit/parsers/idml/__init__.cp313-win_amd64.pyd +0 -0
  54. lokit/parsers/idml/__init__.py +3 -0
  55. lokit/parsers/idml/extraction.cp313-win_amd64.pyd +0 -0
  56. lokit/parsers/idml/extraction.py +264 -0
  57. lokit/parsers/json_i18n/__init__.cp313-win_amd64.pyd +0 -0
  58. lokit/parsers/json_i18n/__init__.py +3 -0
  59. lokit/parsers/json_i18n/extraction.cp313-win_amd64.pyd +0 -0
  60. lokit/parsers/json_i18n/extraction.py +163 -0
  61. lokit/parsers/po/__init__.cp313-win_amd64.pyd +0 -0
  62. lokit/parsers/po/__init__.py +3 -0
  63. lokit/parsers/po/extraction.cp313-win_amd64.pyd +0 -0
  64. lokit/parsers/po/extraction.py +236 -0
  65. lokit/parsers/tmx/__init__.cp313-win_amd64.pyd +0 -0
  66. lokit/parsers/tmx/__init__.py +0 -0
  67. lokit/parsers/tmx/base.cp313-win_amd64.pyd +0 -0
  68. lokit/parsers/tmx/base.py +145 -0
  69. lokit/parsers/tmx/extraction.cp313-win_amd64.pyd +0 -0
  70. lokit/parsers/tmx/extraction.py +170 -0
  71. lokit/parsers/tmx/header.cp313-win_amd64.pyd +0 -0
  72. lokit/parsers/tmx/header.py +55 -0
  73. lokit/parsers/tmx/helpers.cp313-win_amd64.pyd +0 -0
  74. lokit/parsers/tmx/helpers.py +9 -0
  75. lokit/parsers/tmx/models.cp313-win_amd64.pyd +0 -0
  76. lokit/parsers/tmx/models.py +10 -0
  77. lokit/parsers/tmx/props.cp313-win_amd64.pyd +0 -0
  78. lokit/parsers/tmx/props.py +201 -0
  79. lokit/parsers/tmx/tags.cp313-win_amd64.pyd +0 -0
  80. lokit/parsers/tmx/tags.py +59 -0
  81. lokit/parsers/tmx/xml_utils.cp313-win_amd64.pyd +0 -0
  82. lokit/parsers/tmx/xml_utils.py +46 -0
  83. lokit/parsers/xliff/__init__.cp313-win_amd64.pyd +0 -0
  84. lokit/parsers/xliff/__init__.py +3 -0
  85. lokit/parsers/xliff/extraction.cp313-win_amd64.pyd +0 -0
  86. lokit/parsers/xliff/extraction.py +229 -0
  87. lokit/parsers/xliff/tags.cp313-win_amd64.pyd +0 -0
  88. lokit/parsers/xliff/tags.py +128 -0
  89. lokit/parsers/xlsx/__init__.cp313-win_amd64.pyd +0 -0
  90. lokit/parsers/xlsx/__init__.py +1 -0
  91. lokit/parsers/xlsx/extraction.cp313-win_amd64.pyd +0 -0
  92. lokit/parsers/xlsx/extraction.py +198 -0
  93. lokit/py.typed +1 -0
  94. lokit_python-0.1.0.dist-info/METADATA +149 -0
  95. lokit_python-0.1.0.dist-info/RECORD +97 -0
  96. lokit_python-0.1.0.dist-info/WHEEL +5 -0
  97. lokit_python-0.1.0.dist-info/top_level.txt +2 -0
lokit/logic.py ADDED
@@ -0,0 +1,324 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import tempfile
5
+ from collections import defaultdict
6
+ from collections.abc import Callable, Iterator
7
+ from dataclasses import asdict, dataclass, is_dataclass
8
+ from difflib import SequenceMatcher
9
+ from pathlib import Path
10
+ from typing import Self, cast
11
+
12
+ from lokit.data.structure import BaseStructure, Data
13
+ from lokit.exporters import (
14
+ export_csv,
15
+ export_idml,
16
+ export_html,
17
+ export_json_i18n,
18
+ export_po,
19
+ export_tmx,
20
+ export_xliff,
21
+ export_xlsx,
22
+ )
23
+ from lokit.format_detection import LokitInputFormat, detect_format, detect_format_from_bytes
24
+ from lokit.importers import (
25
+ import_csv,
26
+ import_idml,
27
+ import_html,
28
+ import_json_i18n,
29
+ import_po,
30
+ import_tmx,
31
+ import_xliff,
32
+ import_xlsx,
33
+ )
34
+ from lokit.io import load_lokit_json, load_lokit_json_bytes
35
+
36
+
37
+ @dataclass(slots=True)
38
+ class MatchResult:
39
+ unit_id: str
40
+ score: float
41
+ kind: str
42
+ source_equal: bool
43
+ tags_equal: bool
44
+ previous_equal: bool
45
+ next_equal: bool
46
+
47
+
48
+ class Lokit:
49
+ def __init__(self, document: BaseStructure) -> None:
50
+ self.document = document
51
+ self._ids: list[str] = list(document.data)
52
+ self._positions: dict[str, int] = {
53
+ unit_id: index for index, unit_id in enumerate(self._ids)
54
+ }
55
+ self._source_index: dict[str, list[str]] = defaultdict(list)
56
+ for unit_id, unit in document.data.items():
57
+ self._source_index[_normalize_text(unit.source)].append(unit_id)
58
+
59
+ @classmethod
60
+ def parse(cls, filepath: str | Path) -> Self:
61
+ path = Path(filepath)
62
+ input_format = detect_format(path)
63
+ if input_format == LokitInputFormat.TMX:
64
+ return cls(import_tmx(str(path)))
65
+ if input_format == LokitInputFormat.XLIFF:
66
+ return cls(import_xliff(str(path)))
67
+ if input_format == LokitInputFormat.CSV:
68
+ return cls(import_csv(str(path)))
69
+ if input_format == LokitInputFormat.XLSX:
70
+ return cls(import_xlsx(str(path)))
71
+ if input_format == LokitInputFormat.HTML:
72
+ return cls(import_html(str(path)))
73
+ if input_format == LokitInputFormat.PO:
74
+ return cls(import_po(str(path)))
75
+ if input_format == LokitInputFormat.JSON_I18N:
76
+ return cls(import_json_i18n(str(path)))
77
+ if input_format == LokitInputFormat.IDML:
78
+ return cls(import_idml(str(path)))
79
+ return cls(load_lokit_json(path))
80
+
81
+ @classmethod
82
+ def parse_bytes(cls, data: bytes) -> Self:
83
+ input_format = detect_format_from_bytes(data)
84
+ if input_format == LokitInputFormat.LOKIT_JSON:
85
+ return cls(load_lokit_json_bytes(data))
86
+ suffix_map = {
87
+ LokitInputFormat.TMX: ".tmx",
88
+ LokitInputFormat.XLIFF: ".xliff",
89
+ LokitInputFormat.CSV: ".csv",
90
+ LokitInputFormat.XLSX: ".xlsx",
91
+ LokitInputFormat.HTML: ".html",
92
+ LokitInputFormat.PO: ".po",
93
+ LokitInputFormat.JSON_I18N: ".json",
94
+ LokitInputFormat.IDML: ".idml",
95
+ }
96
+ suffix = suffix_map.get(input_format, ".json")
97
+ with tempfile.NamedTemporaryFile(suffix=suffix) as temp:
98
+ temp.write(data)
99
+ temp.flush()
100
+ return cls.parse(temp.name)
101
+
102
+ @classmethod
103
+ def from_document(cls, document: BaseStructure) -> Self:
104
+ return cls(document)
105
+
106
+ def output(self, filepath: str | Path) -> None:
107
+ path = Path(filepath)
108
+ path.parent.mkdir(parents=True, exist_ok=True)
109
+ suffix = path.suffix.lower()
110
+ if suffix == ".tmx":
111
+ export_tmx(self.document, path)
112
+ elif suffix in (".xlf", ".xliff"):
113
+ export_xliff(self.document, path)
114
+ elif suffix == ".csv":
115
+ export_csv(self.document, path)
116
+ elif suffix == ".xlsx":
117
+ export_xlsx(self.document, path)
118
+ elif suffix in (".html", ".htm"):
119
+ source_html = self.document.extensions.get("source_file") or self.document.extensions.get("source_html")
120
+ export_html(self.document, path, source_html)
121
+ elif suffix == ".po":
122
+ export_po(self.document, path)
123
+ elif suffix == ".json":
124
+ if self.document.extensions.get("input_format") == "json_i18n":
125
+ export_json_i18n(self.document, path)
126
+ else:
127
+ path.write_text(
128
+ json.dumps(asdict(self.document), ensure_ascii=False, indent=2, default=str),
129
+ encoding="utf-8",
130
+ )
131
+ elif suffix == ".idml":
132
+ source_idml = self.document.extensions.get("source_file") or self.document.extensions.get("source_idml")
133
+ if not source_idml:
134
+ raise ValueError("Original IDML file path not found in document extensions. Cannot export IDML without source IDML.")
135
+ export_idml(self.document, path, source_idml)
136
+ else:
137
+ path.write_text(
138
+ json.dumps(asdict(self.document), ensure_ascii=False, indent=2, default=str),
139
+ encoding="utf-8",
140
+ )
141
+
142
+ def unit(self, unit_id: str) -> Data:
143
+ return self.document.data[unit_id]
144
+
145
+ def all(self) -> Iterator[tuple[str, Data]]:
146
+ yield from self.document.data.items()
147
+
148
+ def ids(self) -> list[str]:
149
+ return list(self._ids)
150
+
151
+ def previous(self, unit_id: str) -> tuple[str, Data] | None:
152
+ index = self._positions.get(unit_id)
153
+ if index is None or index == 0:
154
+ return None
155
+ prev_id = self._ids[index - 1]
156
+ return prev_id, self.document.data[prev_id]
157
+
158
+ def next(self, unit_id: str) -> tuple[str, Data] | None:
159
+ index = self._positions.get(unit_id)
160
+ if index is None or index + 1 >= len(self._ids):
161
+ return None
162
+ next_id = self._ids[index + 1]
163
+ return next_id, self.document.data[next_id]
164
+
165
+ def plurals(self) -> Iterator[tuple[str, Data]]:
166
+ for unit_id, unit in self.document.data.items():
167
+ if unit.plural is not None:
168
+ yield unit_id, unit
169
+
170
+ def filter(
171
+ self,
172
+ predicate: Callable[[str, Data], bool],
173
+ ) -> list[str]:
174
+ return [
175
+ unit_id
176
+ for unit_id, unit in self.document.data.items()
177
+ if predicate(unit_id, unit)
178
+ ]
179
+
180
+ def where(self, key_path: str, value: object) -> list[str]:
181
+ expected = str(value)
182
+ return [
183
+ unit_id
184
+ for unit_id, unit in self.document.data.items()
185
+ if expected in _values_at_path(unit, key_path.split("."))
186
+ ]
187
+
188
+ def fuzzy_find(
189
+ self,
190
+ source: str,
191
+ limit: int = 10,
192
+ threshold: float = 0.0,
193
+ ) -> list[MatchResult]:
194
+ normalized = _normalize_text(source)
195
+ exact_ids = self._source_index.get(normalized, [])
196
+ exact_results = [
197
+ self._match_against_unit(source, unit_id, require_context=False, require_tags=False)
198
+ for unit_id in exact_ids
199
+ ]
200
+ if len(exact_results) >= limit:
201
+ return exact_results[:limit]
202
+
203
+ candidates: list[MatchResult] = exact_results
204
+ exact_set = set(exact_ids)
205
+ for unit_id, unit in self.document.data.items():
206
+ if unit_id in exact_set:
207
+ continue
208
+ score = SequenceMatcher(None, normalized, _normalize_text(unit.source)).ratio()
209
+ if score >= threshold:
210
+ candidates.append(
211
+ MatchResult(
212
+ unit_id=unit_id,
213
+ score=score,
214
+ kind="fuzzy",
215
+ source_equal=False,
216
+ tags_equal=False,
217
+ previous_equal=False,
218
+ next_equal=False,
219
+ )
220
+ )
221
+
222
+ candidates.sort(key=lambda item: item.score, reverse=True)
223
+ return candidates[:limit]
224
+
225
+ def match(
226
+ self,
227
+ source: str,
228
+ target_unit_id: str,
229
+ previous_source: str | None = None,
230
+ next_source: str | None = None,
231
+ tag_signature: tuple[tuple[str, str | None], ...] | None = None,
232
+ require_context: bool = False,
233
+ require_tags: bool = False,
234
+ ) -> MatchResult:
235
+ return self._match_against_unit(
236
+ source,
237
+ target_unit_id,
238
+ previous_source=previous_source,
239
+ next_source=next_source,
240
+ tag_signature=tag_signature,
241
+ require_context=require_context,
242
+ require_tags=require_tags,
243
+ )
244
+
245
+ def _match_against_unit(
246
+ self,
247
+ source: str,
248
+ unit_id: str,
249
+ previous_source: str | None = None,
250
+ next_source: str | None = None,
251
+ tag_signature: tuple[tuple[str, str | None], ...] | None = None,
252
+ require_context: bool = False,
253
+ require_tags: bool = False,
254
+ ) -> MatchResult:
255
+ unit = self.document.data[unit_id]
256
+ source_equal = _normalize_text(source) == _normalize_text(unit.source)
257
+ score = SequenceMatcher(None, _normalize_text(source), _normalize_text(unit.source)).ratio()
258
+ tags_equal = (not require_tags) or (
259
+ tag_signature is not None and tag_signature == _tags_signature(unit)
260
+ )
261
+ previous_equal = (not require_context) or (
262
+ previous_source is not None
263
+ and _normalize_text(previous_source)
264
+ == _normalize_text(_context_text(unit.previous_context) or "")
265
+ )
266
+ next_equal = (not require_context) or (
267
+ next_source is not None
268
+ and _normalize_text(next_source)
269
+ == _normalize_text(_context_text(unit.next_context) or "")
270
+ )
271
+ checked_ice_context = require_context or require_tags
272
+ is_ice = checked_ice_context and source_equal and tags_equal and previous_equal and next_equal
273
+ return MatchResult(
274
+ unit_id=unit_id,
275
+ score=1.0 if is_ice else score,
276
+ kind="ice" if is_ice else ("exact" if source_equal else "fuzzy"),
277
+ source_equal=source_equal,
278
+ tags_equal=tags_equal,
279
+ previous_equal=previous_equal,
280
+ next_equal=next_equal,
281
+ )
282
+
283
+
284
+ def _normalize_text(value: str) -> str:
285
+ return " ".join(value.casefold().split())
286
+
287
+
288
+ def _values_at_path(root: object, path: list[str]) -> list[str]:
289
+ if not path:
290
+ return [str(root)] if root is not None else []
291
+ head = path[0]
292
+ tail = path[1:]
293
+
294
+ if isinstance(root, list):
295
+ values: list[str] = []
296
+ for item in root:
297
+ values.extend(_values_at_path(item, path))
298
+ return values
299
+
300
+ if isinstance(root, dict):
301
+ if head not in root:
302
+ return []
303
+ return _values_at_path(root[head], tail)
304
+
305
+ if is_dataclass(root):
306
+ if not hasattr(root, head):
307
+ return []
308
+ return _values_at_path(getattr(root, head), tail)
309
+
310
+ return []
311
+
312
+
313
+ def _tags_signature(unit: Data) -> tuple[tuple[str, str | None], ...]:
314
+ if unit.tags is None:
315
+ return ()
316
+ ordered = sorted(unit.tags.source_tag_map.values(), key=lambda item: item.order)
317
+ return tuple((tag.type.value, tag.pair_id) for tag in ordered)
318
+
319
+
320
+ def _context_text(context: object) -> str | None:
321
+ if context is None:
322
+ return None
323
+ source = getattr(context, "source", None)
324
+ return cast(str | None, source)
@@ -0,0 +1 @@
1
+
@@ -0,0 +1 @@
1
+
@@ -0,0 +1,164 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import csv
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import AsyncIterator, Iterator, Optional
8
+
9
+ from lokit.data.structure import Comment, Data, TranslationStatus
10
+
11
+ ExtractItem = tuple[str, Data]
12
+
13
+ _KNOWN_COLUMNS = frozenset({"id", "source", "target", "status", "comment"})
14
+
15
+
16
+ def _parse_base_lang(locale: str) -> str:
17
+ return locale.replace("_", "-").split("-")[0].lower()
18
+
19
+
20
def _parse_status(value: str) -> TranslationStatus:
    """Convert a raw status cell into a ``TranslationStatus`` member.

    The text is stripped and lowercased before the enum lookup; a value
    the enum rejects with ``ValueError`` maps to
    ``TranslationStatus.UNKNOWN``.
    """
    cleaned = value.strip().lower()
    try:
        status = TranslationStatus(cleaned)
    except ValueError:
        status = TranslationStatus.UNKNOWN
    return status
26
+
27
+
28
+ def _infer_locales_from_filename(filepath: str) -> tuple[str, str | None]:
29
+ stem = Path(filepath).stem
30
+ if "-" in stem:
31
+ parts = stem.split("-")
32
+ if len(parts) == 2:
33
+ return parts[0], parts[1]
34
+ if len(parts) == 4:
35
+ return f"{parts[0]}-{parts[1]}", f"{parts[2]}-{parts[3]}"
36
+ if "_" in stem:
37
+ parts = stem.split("_")
38
+ if len(parts) == 2:
39
+ return parts[0], parts[1]
40
+ if len(parts) == 4:
41
+ return f"{parts[0]}_{parts[1]}", f"{parts[2]}_{parts[3]}"
42
+ return "", None
43
+
44
+
45
+ @dataclass(slots=True)
46
+ class _AsyncExtractionResult:
47
+ item: Optional[ExtractItem] = None
48
+ error: Optional[BaseException] = None
49
+ done: bool = False
50
+
51
+
52
+ class AsyncCsvExtraction:
53
+ def __init__(self, extractor: CsvExtractor) -> None:
54
+ self._extractor = extractor
55
+ self._queue: asyncio.Queue[_AsyncExtractionResult] = asyncio.Queue()
56
+ self._producer: asyncio.Task[None] | None = None
57
+
58
+ def __aiter__(self) -> AsyncCsvExtraction:
59
+ return self
60
+
61
+ async def __anext__(self) -> ExtractItem:
62
+ if self._producer is None:
63
+ self._start()
64
+
65
+ result = await self._queue.get()
66
+ if result.done:
67
+ await self._finish()
68
+ raise StopAsyncIteration
69
+ if result.error is not None:
70
+ await self._finish()
71
+ raise result.error
72
+ if result.item is None:
73
+ await self._finish()
74
+ raise StopAsyncIteration
75
+ return result.item
76
+
77
+ def _start(self) -> None:
78
+ loop = asyncio.get_running_loop()
79
+
80
+ def produce() -> None:
81
+ try:
82
+ for item in self._extractor.extract():
83
+ loop.call_soon_threadsafe(
84
+ self._queue.put_nowait,
85
+ _AsyncExtractionResult(item=item),
86
+ )
87
+ except BaseException as exc:
88
+ loop.call_soon_threadsafe(
89
+ self._queue.put_nowait,
90
+ _AsyncExtractionResult(error=exc),
91
+ )
92
+ finally:
93
+ loop.call_soon_threadsafe(
94
+ self._queue.put_nowait,
95
+ _AsyncExtractionResult(done=True),
96
+ )
97
+
98
+ self._producer = asyncio.create_task(asyncio.to_thread(produce))
99
+
100
+ async def _finish(self) -> None:
101
+ if self._producer is not None:
102
+ await self._producer
103
+
104
+
105
class CsvExtractor:
    """Read translation units from a CSV file with a header row.

    The columns ``id``, ``source``, ``target``, ``status`` and ``comment``
    are mapped onto ``Data`` fields. Every other column with a non-empty
    value is stored in the unit's ``extensions``.
    """

    def __init__(
        self,
        filepath: str,
        source_locale: str = "",
        target_locale: str | None = None,
    ) -> None:
        """Store the path and resolve the locales.

        With an explicit *source_locale* both locales are used as given.
        Otherwise both are inferred from the file name, and an explicit
        *target_locale* still takes precedence over the inferred one.
        """
        self.filepath: str = filepath

        if source_locale:
            self.source_locale: str = source_locale
            self.target_locale: str | None = target_locale
        else:
            inferred_source, inferred_target = _infer_locales_from_filename(filepath)
            self.source_locale = inferred_source
            self.target_locale = target_locale or inferred_target

        # Base languages stay None when the matching locale is empty.
        self.source_language: str | None = (
            _parse_base_lang(self.source_locale) if self.source_locale else None
        )
        self.target_language: str | None = (
            _parse_base_lang(self.target_locale) if self.target_locale else None
        )

        self.export_origin: str = ""
        self.export_timestamp: str = ""
        self.extensions: dict[str, str] = {"input_format": "csv"}

    def extract(self) -> Iterator[ExtractItem]:
        """Yield ``(unit_id, Data)`` for each data row of the file.

        Rows without a usable ``id`` value get the id ``csv:<row index>``,
        counting data rows from zero.
        """
        # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM,
        # which would otherwise become part of the first header name.
        with open(self.filepath, newline="", encoding="utf-8-sig") as fh:
            reader = csv.DictReader(fh)
            fieldnames: list[str] = list(reader.fieldnames or [])
            extra_columns = [c for c in fieldnames if c not in _KNOWN_COLUMNS]

            for index, row in enumerate(reader):
                # DictReader fills the missing cells of a short row with
                # None, so every cell is read with "or" instead of a
                # .get() default.
                unit_id = row.get("id") or f"csv:{index}"
                source = row.get("source") or ""
                target = row.get("target") or None
                raw_status = row.get("status")
                status = _parse_status(raw_status) if raw_status else TranslationStatus.UNKNOWN

                comments: list[Comment] = []
                comment_text = (row.get("comment") or "").strip()
                if comment_text:
                    comments.append(Comment(context=comment_text))

                extensions: dict[str, str] = {
                    col: row[col] for col in extra_columns if row.get(col)
                }

                yield unit_id, Data(
                    source=source,
                    target=target,
                    status=status,
                    comments=comments,
                    extensions=extensions,
                )

    def extract_async(self) -> AsyncIterator[ExtractItem]:
        """Return an async iterator over the same items as :meth:`extract`."""
        return AsyncCsvExtraction(self)
@@ -0,0 +1,3 @@
1
+ from lokit.parsers.html.extraction import HtmlExtractor
2
+
3
+ __all__ = ["HtmlExtractor"]