epub-translator 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. epub_translator/__init__.py +2 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +32 -113
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/segment/__init__.py +26 -0
  15. epub_translator/segment/block_segment.py +124 -0
  16. epub_translator/segment/common.py +29 -0
  17. epub_translator/segment/inline_segment.py +356 -0
  18. epub_translator/{xml_translator → segment}/text_segment.py +8 -8
  19. epub_translator/segment/utils.py +43 -0
  20. epub_translator/translator.py +147 -183
  21. epub_translator/utils.py +33 -0
  22. epub_translator/xml/__init__.py +2 -0
  23. epub_translator/xml/const.py +1 -0
  24. epub_translator/xml/deduplication.py +3 -3
  25. epub_translator/xml/self_closing.py +182 -0
  26. epub_translator/xml/utils.py +42 -0
  27. epub_translator/xml/xml.py +7 -0
  28. epub_translator/xml/xml_like.py +8 -33
  29. epub_translator/xml_interrupter.py +165 -0
  30. epub_translator/xml_translator/__init__.py +1 -2
  31. epub_translator/xml_translator/callbacks.py +34 -0
  32. epub_translator/xml_translator/{const.py → common.py} +0 -1
  33. epub_translator/xml_translator/hill_climbing.py +104 -0
  34. epub_translator/xml_translator/stream_mapper.py +253 -0
  35. epub_translator/xml_translator/submitter.py +26 -72
  36. epub_translator/xml_translator/translator.py +162 -113
  37. epub_translator/xml_translator/validation.py +458 -0
  38. {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
  39. epub_translator-0.1.3.dist-info/RECORD +66 -0
  40. epub_translator/epub/placeholder.py +0 -53
  41. epub_translator/iter_sync.py +0 -24
  42. epub_translator/xml_translator/fill.py +0 -128
  43. epub_translator/xml_translator/format.py +0 -282
  44. epub_translator/xml_translator/fragmented.py +0 -125
  45. epub_translator/xml_translator/group.py +0 -183
  46. epub_translator/xml_translator/progressive_locking.py +0 -256
  47. epub_translator/xml_translator/utils.py +0 -29
  48. epub_translator-0.1.1.dist-info/RECORD +0 -58
  49. {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
  50. {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
@@ -0,0 +1,356 @@
1
+ from collections.abc import Generator, Iterable, Iterator
2
+ from dataclasses import dataclass
3
+ from xml.etree.ElementTree import Element
4
+
5
+ from ..utils import ensure_list, is_the_same, nest
6
+ from ..xml import ID_KEY, append_text_in_element, iter_with_stack, plain_text
7
+ from .common import FoundInvalidIDError, validate_id_in_element
8
+ from .text_segment import TextSegment
9
+ from .utils import IDGenerator, element_fingerprint, id_in_element
10
+
11
+
12
@dataclass
class InlineLostIDError:
    """Validation error: a child element that must carry an ID attribute has none.

    Produced while checking the direct children of an inline segment whose
    tag group was assigned IDs.
    """

    # The child element that is missing its ID attribute.
    element: Element
    # Parent elements locating the error; enclosing segments prepend their
    # own parent, so the outermost parent comes first.
    stack: list[Element]
16
+
17
+
18
@dataclass
class InlineUnexpectedIDError:
    """Validation error: an element carries an ID that was not (or no longer) expected."""

    # The ID value read from the element.
    id: int
    # The element on which the unexpected ID was found.
    element: Element
22
+
23
+
24
@dataclass
class InlineExpectedIDsError:
    """Validation error: some expected IDs never showed up in the validated element."""

    # Maps each missing ID to the original element that owns it.
    id2element: dict[int, Element]
27
+
28
+
29
@dataclass
class InlineWrongTagCountError:
    """Validation error: the number of children with a given tag differs from the expectation."""

    # How many children with this tag the original segment has.
    expected_count: int
    # The children with this tag that were actually found.
    found_elements: list[Element]
    # Parent elements locating the error; enclosing segments prepend their
    # own parent, so the outermost parent comes first.
    stack: list[Element]
34
+
35
+
36
+ InlineError = InlineLostIDError | InlineUnexpectedIDError | InlineExpectedIDsError | InlineWrongTagCountError
37
+
38
+
39
def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["InlineSegment", None, None]:
    """Group a stream of text segments into one inline-segment tree per block.

    Consecutive text segments that share the same ``block_parent`` are folded
    into a single root ``InlineSegment``; a new root is started every time
    the block parent changes.

    Args:
        text_segments: Text segments in document order.

    Yields:
        The root ``InlineSegment`` of each block, in input order.
    """
    # ``levels[i]`` collects the members found at relative depth ``i`` of the
    # block currently being assembled; ``None`` means no block is open.
    levels: list[list[TextSegment | InlineSegment]] | None = None
    current_block: Element | None = None
    base_depth = 0

    for segment in text_segments:
        # A different block parent closes the block that is being assembled.
        if levels is not None and current_block is not segment.block_parent:
            finished = _pop_stack_data((levels, current_block, base_depth))
            levels = None
            if finished:
                yield finished

        if levels is None:
            levels = []
            current_block = segment.block_parent
            base_depth = segment.block_depth

        # ``segment.depth`` is the segment's index inside ``levels``, so the
        # list must be grown or folded until len(levels) == segment.depth + 1.
        # NOTE(review): only the depth is compared here, not the immediate
        # parent element, so consecutive segments at the same depth land in
        # the same level - confirm that this is intended for sibling inline
        # elements.
        wanted_size = segment.depth + 1
        while len(levels) < wanted_size:
            levels.append([])
        while len(levels) > wanted_size:
            _pop_stack(stack=levels, stack_base_depth=base_depth)

        levels[-1].append(segment)

    # Flush the last block, if any input was seen at all.
    if levels is not None:
        finished = _pop_stack_data((levels, current_block, base_depth))
        if finished:
            yield finished
77
+
78
+
79
+ def _pop_stack_data(stack_data: tuple[list[list["TextSegment | InlineSegment"]], Element, int]):
80
+ stack, _, stack_base_depth = stack_data
81
+ inline_segment: InlineSegment | None = None
82
+ while stack:
83
+ inline_segment = _pop_stack(
84
+ stack=stack,
85
+ stack_base_depth=stack_base_depth,
86
+ )
87
+ return inline_segment
88
+
89
+
90
+ def _pop_stack(
91
+ stack: list[list["TextSegment | InlineSegment"]],
92
+ stack_base_depth: int,
93
+ ) -> "InlineSegment | None":
94
+ inline_segment: InlineSegment | None = None
95
+ depth = len(stack) + stack_base_depth - 1
96
+ popped = stack.pop()
97
+ if popped:
98
+ inline_segment = InlineSegment(depth, popped)
99
+ if stack and inline_segment is not None:
100
+ stack[-1].append(inline_segment)
101
+ return inline_segment
102
+
103
+
104
class InlineSegment:
    """A node of the inline tree built from the text segments of one block.

    Its children are text segments and nested inline segments in document
    order. The node corresponds to the element found at index ``depth - 1`` of
    its first child's parent stack.
    """

    def __init__(self, depth: int, children: list["TextSegment | InlineSegment"]) -> None:
        """Create the segment and hand out temporary IDs to ambiguous children.

        Args:
            depth: Number of leading entries of the first child's
                ``parent_stack`` that form this segment's own parent stack;
                must be positive.
            children: Non-empty list of members, in document order.
        """
        assert depth > 0
        self.id: int | None = None
        self._children: list[TextSegment | InlineSegment] = children
        self._parent_stack: list[Element] = children[0].parent_stack[:depth]

        # Every child tag maps to a list of IDs.
        # An empty list means the elements with that tag are structurally
        # identical (same attributes), so no ID is needed to tell them apart.
        # A non-empty list means every element with that tag has an ID.
        # Elements sharing a tag either all have an ID or none of them has.
        self._child_tag2ids: dict[str, list[int]] = {}
        self._child_tag2count: dict[str, int] = {}

        inline_children = [c for c in children if isinstance(c, InlineSegment)]
        grouped = nest((c.parent.tag, c) for c in inline_children)
        temp_id: int = 0

        for tag, same_tag_children in grouped.items():
            self._child_tag2count[tag] = len(same_tag_children)
            fingerprints = (element_fingerprint(c.parent) for c in same_tag_children)
            # IDs are assigned only when same-tag children cannot be told
            # apart otherwise, to keep the number of IDs as small as possible.
            if is_the_same(elements=fingerprints):
                continue
            for c in same_tag_children:
                c.id = temp_id
                temp_id += 1

    @property
    def head(self) -> TextSegment:
        """The first text segment reachable through the first child."""
        first = self._children[0]
        return first if isinstance(first, TextSegment) else first.head

    @property
    def tail(self) -> TextSegment:
        """The last text segment reachable through the last child."""
        last = self._children[-1]
        return last if isinstance(last, TextSegment) else last.tail

    @property
    def children(self) -> list["TextSegment | InlineSegment"]:
        """The direct members of this segment (the internal list itself, not a copy)."""
        return self._children

    @property
    def parent(self) -> Element:
        """The element this segment corresponds to (last entry of the parent stack)."""
        return self._parent_stack[-1]

    @property
    def parent_stack(self) -> list[Element]:
        """The chain of elements leading to, and including, ``parent``."""
        return self._parent_stack

    def __iter__(self) -> Iterator[TextSegment]:
        """Iterate over all text segments of this subtree, depth first, in order."""
        for member in self._children:
            if isinstance(member, TextSegment):
                yield member
            elif isinstance(member, InlineSegment):
                yield from member

    def recreate_ids(self, id_generator: IDGenerator) -> None:
        """Replace existing IDs in this subtree with fresh ones from ``id_generator``.

        Only children that already have an ID receive a new one. The per-tag
        ID lists and counters are rebuilt along the way.
        """
        self._child_tag2count.clear()
        self._child_tag2ids.clear()

        for member in self._children:
            if not isinstance(member, InlineSegment):
                continue
            tag = member.parent.tag
            # Registers the tag even when no ID ends up being stored for it.
            tag_ids = ensure_list(self._child_tag2ids, tag)
            if member.id is not None:
                member.id = id_generator.next_id()
                tag_ids.append(member.id)
            member.recreate_ids(id_generator)
            self._child_tag2count[tag] = self._child_tag2count.get(tag, 0) + 1

    def create_element(self) -> Element:
        """Build a bare element tree mirroring this segment.

        The result keeps tags and text only; the sole attribute written is the
        ID (under ``ID_KEY``) for segments that have one.
        """
        element = Element(self.parent.tag)
        if self.id is not None:
            element.set(ID_KEY, str(self.id))

        latest_child: Element | None = None
        for member in self._children:
            if isinstance(member, InlineSegment):
                latest_child = member.create_element()
                element.append(latest_child)
            elif isinstance(member, TextSegment):
                if latest_child is None:
                    # No child element yet: the text belongs to element.text.
                    element.text = append_text_in_element(
                        origin_text=element.text,
                        append_text=member.text,
                    )
                else:
                    # Otherwise it trails the most recent child element.
                    latest_child.tail = append_text_in_element(
                        origin_text=latest_child.tail,
                        append_text=member.text,
                    )
        return element

    def validate(self, validated_element: Element) -> Generator[InlineError | FoundInvalidIDError, None, None]:
        """Check ``validated_element`` against this segment and yield every problem found.

        First the IDs of all descendants are compared with the IDs expected
        from this subtree, then the per-tag child structure is checked.
        """
        pending: dict[int, Element] = {
            segment.id: segment.parent for segment in self._child_inline_segments() if segment.id is not None
        }

        for _, candidate in iter_with_stack(validated_element):
            if candidate is validated_element:
                continue  # skip the root itself

            found_id = id_in_element(candidate)
            if found_id is None:
                outcome = validate_id_in_element(
                    element=candidate,
                    enable_no_id=True,
                )
                if isinstance(outcome, FoundInvalidIDError):
                    yield outcome
                continue

            if found_id in pending:
                # Each expected ID may be consumed only once.
                del pending[found_id]
            else:
                yield InlineUnexpectedIDError(
                    id=found_id,
                    element=candidate,
                )

        if pending:
            yield InlineExpectedIDsError(
                id2element=pending,
            )

        yield from self._validate_children_structure(validated_element)

    def _child_inline_segments(self) -> Generator["InlineSegment", None, None]:
        """Yield every nested inline segment, each parent before its own descendants."""
        for member in self._children:
            if isinstance(member, InlineSegment):
                yield member
                yield from member._child_inline_segments()  # pylint: disable=protected-access

    def _validate_children_structure(self, validated_element: Element):
        """Yield structural errors for the direct children, then recurse into matched pairs."""
        found_by_tag: dict[str, list[Element]] = {}

        for candidate in validated_element:
            tag_ids = self._child_tag2ids.get(candidate.tag, None)
            if tag_ids:
                # Children of this tag are told apart by ID, so it must exist.
                if candidate.get(ID_KEY, None) is None:
                    yield InlineLostIDError(
                        element=candidate,
                        stack=[self.parent],
                    )
            else:
                ensure_list(found_by_tag, candidate.tag).append(candidate)

        # NOTE(review): only tags that occur in validated_element are counted,
        # so an expected tag that is missing entirely is not reported here -
        # confirm that this is intended.
        for tag, found in found_by_tag.items():
            expected = self._child_tag2count.get(tag, 0)
            if len(found) != expected:
                yield InlineWrongTagCountError(
                    expected_count=expected,
                    found_elements=found,
                    stack=[self.parent],
                )

        for segment, matched in self._match_children(validated_element):
            # pylint: disable=protected-access
            for error in segment._validate_children_structure(matched):
                # Prepend our own parent so the stack runs outermost first.
                error.stack.insert(0, self.parent)
                yield error

    # Even when the errors reported by self.validate(...) have not all been
    # resolved, still produce a reasonably good (best-effort) result.
    def assign_attributes(self, template_element: Element) -> Element:
        """Combine the original tags and attributes with the text of ``template_element``.

        Returns a new element that has the tag and attributes of ``parent``,
        contains the recursively assigned matched children, and carries the
        text taken from ``template_element``.
        """
        result = Element(self.parent.tag, self.parent.attrib)
        leading_text = template_element.text
        if leading_text and leading_text.strip():
            result.text = append_text_in_element(
                origin_text=result.text,
                append_text=leading_text,
            )

        # Matched template children are remembered by object identity.
        matched_identities: set[int] = set()
        for segment, matched in self._match_children(template_element):
            result.append(segment.assign_attributes(matched))
            matched_identities.add(id(matched))

        # Assigned children are consumed in order, one per matched template child.
        remaining_assigned = iter(list(result))
        anchor: Element | None = None

        for template_child in template_element:
            # Only whether template_child is a split point matters, not which
            # child it really corresponds to. In extreme cases the order may
            # be off, so a rough alignment has to do.
            text: str = ""
            if id(template_child) not in matched_identities:
                text = plain_text(template_child)
            else:
                # Keeps the previous anchor once the assigned children run out.
                anchor = next(remaining_assigned, anchor)

            if template_child.tail is not None:
                text += template_child.tail
            if not text.strip():
                continue

            if anchor is None:
                result.text = append_text_in_element(
                    origin_text=result.text,
                    append_text=text,
                )
            else:
                anchor.tail = append_text_in_element(
                    origin_text=anchor.tail,
                    append_text=text,
                )
        return result

    def _match_children(self, element: Element) -> Generator[tuple["InlineSegment", Element], None, None]:
        """Pair nested inline segments with the direct children of ``element``.

        Pairs are formed per tag and yielded in the order of the segments.
        Segments for which no element is left are omitted.
        """
        elements_by_tag = nest((e.tag, e) for e in element)
        inline_children = (m for m in self._children if isinstance(m, InlineSegment))
        segments_by_tag = nest((s.parent.tag, (order, s)) for order, s in enumerate(inline_children))
        seen_ids: set[int] = set()
        pairs: list[tuple[int, InlineSegment, Element]] = []

        for tag, ordered_segments in segments_by_tag.items():
            # Matching by ID comes first; whatever is left is paired in
            # natural order as far as possible.
            tag_ids = self._child_tag2ids.get(tag, [])
            slots: list[Element | None] = [None] * len(ordered_segments)
            leftovers: list[Element] = []

            for candidate in elements_by_tag.get(tag, []):
                slot_index: int | None = None
                candidate_id = id_in_element(candidate)
                if candidate_id is not None and candidate_id not in seen_ids:
                    seen_ids.add(candidate_id)  # an ID may be used only once, to prevent duplicates
                    if candidate_id in tag_ids:
                        slot_index = tag_ids.index(candidate_id)
                if slot_index is None:
                    leftovers.append(candidate)
                else:
                    slots[slot_index] = candidate

            # Fill the still-empty slots with the leftovers, in order.
            spare = iter(leftovers)
            for index, occupant in enumerate(slots):
                if occupant is not None:
                    continue
                filler = next(spare, None)
                if filler is None:
                    break
                slots[index] = filler

            for (order, segment), matched in zip(ordered_segments, slots):
                if matched is not None:
                    pairs.append((order, segment, matched))

        pairs.sort(key=lambda pair: pair[0])
        for _, segment, matched in pairs:
            yield segment, matched
@@ -4,13 +4,13 @@ from enum import Enum, auto
4
4
  from typing import Self
5
5
  from xml.etree.ElementTree import Element
6
6
 
7
- from .utils import expand_left_element_texts, expand_right_element_texts, normalize_text_in_element
7
+ from ..xml import expand_left_element_texts, expand_right_element_texts, normalize_text_in_element
8
8
 
9
9
  # HTML inline-level elements
10
10
  # Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
11
11
  # Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
12
12
  _HTML_INLINE_TAGS = frozenset(
13
- [
13
+ (
14
14
  # Inline text semantics
15
15
  "a",
16
16
  "abbr",
@@ -67,7 +67,7 @@ _HTML_INLINE_TAGS = frozenset(
67
67
  "del",
68
68
  "ins",
69
69
  "slot",
70
- ]
70
+ )
71
71
  )
72
72
 
73
73
 
@@ -79,7 +79,6 @@ class TextPosition(Enum):
79
79
  @dataclass
80
80
  class TextSegment:
81
81
  text: str
82
- index: int # *.text is 0, the first *.tail is 1, and so on
83
82
  parent_stack: list[Element]
84
83
  left_common_depth: int
85
84
  right_common_depth: int
@@ -90,6 +89,10 @@ class TextSegment:
90
89
  def root(self) -> Element:
91
90
  return self.parent_stack[0]
92
91
 
92
+ @property
93
+ def depth(self) -> int:
94
+ return len(self.parent_stack) - self.block_depth
95
+
93
96
  @property
94
97
  def block_parent(self) -> Element:
95
98
  return self.parent_stack[self.block_depth - 1]
@@ -106,7 +109,6 @@ class TextSegment:
106
109
  def clone(self) -> "TextSegment":
107
110
  return TextSegment(
108
111
  text=self.text,
109
- index=self.index,
110
112
  parent_stack=list(self.parent_stack),
111
113
  left_common_depth=self.left_common_depth,
112
114
  right_common_depth=self.right_common_depth,
@@ -171,20 +173,18 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
171
173
  if text is not None:
172
174
  yield TextSegment(
173
175
  text=text,
174
- index=0,
175
176
  parent_stack=next_stack,
176
177
  left_common_depth=0,
177
178
  right_common_depth=0,
178
179
  block_depth=next_block_depth,
179
180
  position=TextPosition.TEXT,
180
181
  )
181
- for i, child_element in enumerate(element):
182
+ for child_element in element:
182
183
  yield from _search_text_segments(next_stack, child_element)
183
184
  child_tail = normalize_text_in_element(child_element.tail)
184
185
  if child_tail is not None:
185
186
  yield TextSegment(
186
187
  text=child_tail,
187
- index=i + 1,
188
188
  parent_stack=next_stack,
189
189
  left_common_depth=0,
190
190
  right_common_depth=0,
@@ -0,0 +1,43 @@
1
+ from xml.etree.ElementTree import Element
2
+
3
+ from ..xml import ID_KEY
4
+
5
+
6
def element_fingerprint(element: Element) -> str:
    """Return a string that identifies an element by its tag and attributes.

    Attributes are rendered as ``key=value`` and sorted, so the result does
    not depend on attribute order. Text and children are not considered.
    """
    pairs = [f"{name}={value}" for name, value in element.attrib.items()]
    pairs.sort()
    joined = " ".join(pairs)
    return f"<{element.tag} {joined}/>"
9
+
10
+
11
def unwrap_parents(element: Element) -> tuple[Element, list[Element]]:
    """Descend through pure wrapper elements and return the innermost one.

    An element is treated as a wrapper when it has exactly one child and no
    text other than whitespace before or after that child.

    Args:
        element: The outermost element. The tree is modified in place: the
            ``tail`` of every element that is descended into is set to None.

    Returns:
        A tuple ``(innermost, parents)`` where ``parents`` lists the wrappers
        that were removed, outermost first. When ``element`` is not a wrapper
        it is returned unchanged together with an empty list.
    """
    parents: list[Element] = []
    while len(element) == 1:
        child = element[0]
        # Real text around the only child means this element carries content
        # of its own; unwrapping it would silently drop that text. (The
        # previous condition was inverted: it stopped on empty text and
        # descended only when text was present.)
        if (element.text or "").strip() or (child.tail or "").strip():
            break
        parents.append(element)
        # Only whitespace can be left here; drop it so the unwrapped element
        # stands on its own.
        child.tail = None
        element = child
    return element, parents
25
+
26
+
27
def id_in_element(element: Element) -> int | None:
    """Read the integer ID stored on ``element`` under ``ID_KEY``.

    Returns:
        The parsed ID, or ``None`` when the attribute is absent or is not a
        valid integer literal.
    """
    raw_id = element.get(ID_KEY)
    try:
        return None if raw_id is None else int(raw_id)
    except ValueError:
        # A malformed ID is treated the same as a missing one.
        return None
35
+
36
+
37
class IDGenerator:
    """Hands out consecutive integer IDs, starting at 1."""

    def __init__(self):
        # The most recently issued ID; 0 means nothing has been issued yet.
        self._previous_id: int = 0

    def next_id(self) -> int:
        """Return the next unused ID and remember it as issued."""
        issued = self._previous_id + 1
        self._previous_id = issued
        return issued
+ return self._previous_id