epub-translator 0.1.1__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. epub_translator/__init__.py +9 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +32 -113
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/punctuation.py +34 -0
  15. epub_translator/segment/__init__.py +26 -0
  16. epub_translator/segment/block_segment.py +124 -0
  17. epub_translator/segment/common.py +29 -0
  18. epub_translator/segment/inline_segment.py +356 -0
  19. epub_translator/{xml_translator → segment}/text_segment.py +7 -72
  20. epub_translator/segment/utils.py +43 -0
  21. epub_translator/translator.py +152 -184
  22. epub_translator/utils.py +33 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/const.py +1 -0
  25. epub_translator/xml/deduplication.py +3 -3
  26. epub_translator/xml/inline.py +67 -0
  27. epub_translator/xml/self_closing.py +182 -0
  28. epub_translator/xml/utils.py +42 -0
  29. epub_translator/xml/xml.py +7 -0
  30. epub_translator/xml/xml_like.py +8 -33
  31. epub_translator/xml_interrupter.py +165 -0
  32. epub_translator/xml_translator/__init__.py +3 -3
  33. epub_translator/xml_translator/callbacks.py +34 -0
  34. epub_translator/xml_translator/{const.py → common.py} +0 -1
  35. epub_translator/xml_translator/hill_climbing.py +104 -0
  36. epub_translator/xml_translator/stream_mapper.py +253 -0
  37. epub_translator/xml_translator/submitter.py +352 -91
  38. epub_translator/xml_translator/translator.py +182 -114
  39. epub_translator/xml_translator/validation.py +458 -0
  40. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/METADATA +134 -21
  41. epub_translator-0.1.4.dist-info/RECORD +68 -0
  42. epub_translator/epub/placeholder.py +0 -53
  43. epub_translator/iter_sync.py +0 -24
  44. epub_translator/xml_translator/fill.py +0 -128
  45. epub_translator/xml_translator/format.py +0 -282
  46. epub_translator/xml_translator/fragmented.py +0 -125
  47. epub_translator/xml_translator/group.py +0 -183
  48. epub_translator/xml_translator/progressive_locking.py +0 -256
  49. epub_translator/xml_translator/utils.py +0 -29
  50. epub_translator-0.1.1.dist-info/RECORD +0 -58
  51. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/LICENSE +0 -0
  52. {epub_translator-0.1.1.dist-info → epub_translator-0.1.4.dist-info}/WHEEL +0 -0
@@ -0,0 +1,356 @@
1
+ from collections.abc import Generator, Iterable, Iterator
2
+ from dataclasses import dataclass
3
+ from xml.etree.ElementTree import Element
4
+
5
+ from ..utils import ensure_list, is_the_same, nest
6
+ from ..xml import ID_KEY, append_text_in_element, iter_with_stack, plain_text
7
+ from .common import FoundInvalidIDError, validate_id_in_element
8
+ from .text_segment import TextSegment
9
+ from .utils import IDGenerator, element_fingerprint, id_in_element
10
+
11
+
12
@dataclass
class InlineLostIDError:
    """Validation error: an element that was expected to carry an id has none.

    Produced by ``InlineSegment._validate_children_structure`` when a child
    element's tag belongs to a group that uses ids but the element has no
    ``ID_KEY`` attribute.
    """

    # The offending element found in the validated tree.
    element: Element
    # Source elements from the outermost validated segment down to the
    # segment that reported the error (ancestors are inserted at the front
    # while the error travels up the recursion).
    stack: list[Element]
16
+
17
+
18
@dataclass
class InlineUnexpectedIDError:
    """Validation error: an element carries an id that is not (or no longer) expected.

    Produced by ``InlineSegment.validate`` when the id parsed from an element
    cannot be taken from the set of still-expected ids, i.e. it is unknown or
    has already been consumed by an earlier element.
    """

    # The id value that was found on the element.
    id: int
    # The element in the validated tree that carries that id.
    element: Element
22
+
23
+
24
@dataclass
class InlineExpectedIDsError:
    """Validation error: some expected ids never showed up in the validated tree.

    Produced by ``InlineSegment.validate`` after the whole tree was walked and
    at least one expected id is still unmatched.
    """

    # Maps each missing id to the source element it was assigned to.
    id2element: dict[int, Element]
27
+
28
+
29
@dataclass
class InlineWrongTagCountError:
    """Validation error: a tag occurs a different number of times than expected.

    Produced by ``InlineSegment._validate_children_structure`` for tags whose
    elements are not distinguished by ids, when the number of direct children
    with that tag differs from the count recorded for the source.
    """

    # How many direct children with this tag the source segment has.
    expected_count: int
    # The direct children with this tag that were actually found.
    found_elements: list[Element]
    # Source elements from the outermost validated segment down to the
    # segment that reported the error (ancestors are inserted at the front
    # while the error travels up the recursion).
    stack: list[Element]
34
+
35
+
36
# Union of every structural error InlineSegment validation can report.
InlineError = (
    InlineLostIDError
    | InlineUnexpectedIDError
    | InlineExpectedIDsError
    | InlineWrongTagCountError
)
37
+
38
+
39
def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["InlineSegment", None, None]:
    """Group a stream of text segments into one ``InlineSegment`` tree per block.

    Consecutive text segments that share the same ``block_parent`` object
    (compared by identity) are collected into a nesting of lists indexed by
    ``text_segment.depth``. Whenever the block parent changes, and once more
    at the end of the input, the collected levels are folded into a single
    root ``InlineSegment`` which is then yielded.

    Args:
        text_segments: Text segments in document order.

    Yields:
        The root ``InlineSegment`` of each run of segments with the same
        block parent.
    """
    # (levels, block parent of the current run, block depth of the current run)
    current: tuple[list[list[TextSegment | InlineSegment]], Element, int] | None = None

    for text_segment in text_segments:
        # A different block parent closes the run collected so far.
        if current is not None and current[1] is not text_segment.block_parent:
            finished = _pop_stack_data(current)
            current = None
            if finished:
                yield finished

        if current is None:
            current = ([], text_segment.block_parent, text_segment.block_depth)

        levels, _, base_depth = current

        # text_segment.depth acts as the segment's index in the level stack,
        # so the stack must be resized until len(levels) == depth + 1.
        wanted_levels = text_segment.depth + 1
        while len(levels) < wanted_levels:
            levels.append([])
        while len(levels) > wanted_levels:
            _pop_stack(stack=levels, stack_base_depth=base_depth)

        levels[-1].append(text_segment)

    # Flush the last run, if any input was seen at all.
    if current is not None:
        finished = _pop_stack_data(current)
        if finished:
            yield finished
77
+
78
+
79
def _pop_stack_data(stack_data: tuple[list[list["TextSegment | InlineSegment"]], Element, int]):
    """Fold every remaining level of a run into its root segment.

    Pops levels from deepest to shallowest via ``_pop_stack`` until the level
    stack is empty.

    Args:
        stack_data: ``(levels, block_parent, base_depth)`` as built by
            ``search_inline_segments``; ``levels`` is emptied in place.

    Returns:
        Whatever the final ``_pop_stack`` call returned (the segment built
        from the shallowest level, or ``None`` if that level was empty), or
        ``None`` when there were no levels at all.
    """
    levels, _, base_depth = stack_data
    outermost: InlineSegment | None = None
    while len(levels) > 0:
        outermost = _pop_stack(stack=levels, stack_base_depth=base_depth)
    return outermost
88
+
89
+
90
def _pop_stack(
    stack: list[list["TextSegment | InlineSegment"]],
    stack_base_depth: int,
) -> "InlineSegment | None":
    """Pop the deepest level and wrap its content in an ``InlineSegment``.

    Args:
        stack: Level stack; its last list is removed in place.
        stack_base_depth: Depth added to the level index to obtain the depth
            passed to ``InlineSegment``.

    Returns:
        The new segment, which has also been appended to the level that is
        now on top (if one remains), or ``None`` when the popped level held
        nothing.
    """
    # Must be computed before popping: it describes the level being removed.
    depth = len(stack) + stack_base_depth - 1
    popped = stack.pop()
    if not popped:
        return None

    inline_segment = InlineSegment(depth, popped)
    if stack:
        stack[-1].append(inline_segment)
    return inline_segment
102
+
103
+
104
class InlineSegment:
    """One element of the inline-markup tree found beneath a block element.

    An instance stands for a single source element (``parent``) and keeps, in
    document order, the text segments directly inside it together with the
    nested ``InlineSegment`` objects of its child elements. It can render a
    bare skeleton element (``create_element``), check a returned element
    against the source structure (``validate``) and build an element with the
    source attributes from a returned element (``assign_attributes``).
    """

    def __init__(self, depth: int, children: list["TextSegment | InlineSegment"]) -> None:
        """Build the segment and hand out temporary ids to ambiguous children.

        Args:
            depth: Number of leading entries of the first child's
                ``parent_stack`` that form this segment's own parent stack;
                must be positive.
            children: Non-empty, ordered content of this segment. The list is
                stored as is, not copied.
        """
        assert depth > 0
        self.id: int | None = None
        self._children: list[TextSegment | InlineSegment] = children
        self._parent_stack: list[Element] = children[0].parent_stack[:depth]

        # Every child tag maps to a list of ids.
        # An empty list means all elements with that tag have an identical
        # attribute structure, so no ids are needed to tell them apart.
        # A non-empty list means every element with that tag carries an id.
        # Note: elements sharing a tag either all have an id or none has one.
        self._child_tag2ids: dict[str, list[int]] = {}
        self._child_tag2count: dict[str, int] = {}

        next_temp_id: int = 0
        terms = nest((child.parent.tag, child) for child in children if isinstance(child, InlineSegment))

        for tag, child_terms in terms.items():
            self._child_tag2count[tag] = len(child_terms)
            # Ids are assigned only when same-tag elements cannot be told
            # apart otherwise, to keep the number of ids as small as possible.
            # These values are placeholders; recreate_ids() replaces them.
            if not is_the_same(
                elements=(element_fingerprint(t.parent) for t in child_terms),
            ):
                for child in child_terms:
                    child.id = next_temp_id
                    next_temp_id += 1

    @property
    def head(self) -> TextSegment:
        """The first text segment of this subtree in document order."""
        first_child = self._children[0]
        if isinstance(first_child, TextSegment):
            return first_child
        else:
            return first_child.head

    @property
    def tail(self) -> TextSegment:
        """The last text segment of this subtree in document order."""
        last_child = self._children[-1]
        if isinstance(last_child, TextSegment):
            return last_child
        else:
            return last_child.tail

    @property
    def children(self) -> list["TextSegment | InlineSegment"]:
        """The direct children (the internal list itself, not a copy)."""
        return self._children

    @property
    def parent(self) -> Element:
        """The source element this segment represents."""
        return self._parent_stack[-1]

    @property
    def parent_stack(self) -> list[Element]:
        """Source elements from the root down to ``parent`` (internal list)."""
        return self._parent_stack

    def __iter__(self) -> Iterator[TextSegment]:
        """Iterate over every text segment of the subtree, depth first."""
        for child in self._children:
            if isinstance(child, TextSegment):
                yield child
            elif isinstance(child, InlineSegment):
                yield from child

    def recreate_ids(self, id_generator: IDGenerator) -> None:
        """Replace existing ids in the subtree with fresh ones and rebuild the tables.

        Only children that already have an id receive a new one; children
        without an id stay without. The per-tag id lists and counts are
        rebuilt from scratch.

        Args:
            id_generator: Source of the new ids, shared across the recursion.
        """
        self._child_tag2count.clear()
        self._child_tag2ids.clear()

        for child in self._children:
            if isinstance(child, InlineSegment):
                child_tag = child.parent.tag
                # Called for every inline child, so a tag without ids still
                # gets an (empty) entry in the table.
                ids = ensure_list(self._child_tag2ids, child_tag)
                if child.id is not None:
                    child.id = id_generator.next_id()
                    ids.append(child.id)
                child.recreate_ids(id_generator)
                self._child_tag2count[child_tag] = self._child_tag2count.get(child_tag, 0) + 1

    def create_element(self) -> Element:
        """Render the subtree as a new element tree without source attributes.

        Only tags, text and (where assigned) the ``ID_KEY`` attribute are
        emitted.

        Returns:
            The newly created element.
        """
        element = Element(self.parent.tag)
        previous_element: Element | None = None
        for child in self._children:
            if isinstance(child, InlineSegment):
                previous_element = child.create_element()
                element.append(previous_element)

            elif isinstance(child, TextSegment):
                # Text before the first child element is the element's text;
                # afterwards it is the tail of the preceding child element.
                if previous_element is None:
                    element.text = append_text_in_element(
                        origin_text=element.text,
                        append_text=child.text,
                    )
                else:
                    previous_element.tail = append_text_in_element(
                        origin_text=previous_element.tail,
                        append_text=child.text,
                    )
        if self.id is not None:
            element.set(ID_KEY, str(self.id))
        return element

    def validate(self, validated_element: Element) -> Generator[InlineError | FoundInvalidIDError, None, None]:
        """Check ``validated_element`` against the ids and structure of this segment.

        Args:
            validated_element: Element expected to mirror this segment.

        Yields:
            ``FoundInvalidIDError`` values returned by
            ``validate_id_in_element``, an ``InlineUnexpectedIDError`` per id
            that is unknown or repeated, one ``InlineExpectedIDsError`` if
            expected ids are missing, then the structural errors of
            ``_validate_children_structure``.
        """
        # Every id assigned anywhere below this segment must show up once.
        remain_expected_elements: dict[int, Element] = {}
        for child in self._child_inline_segments():
            if child.id is not None:
                remain_expected_elements[child.id] = child.parent

        for _, child_element in iter_with_stack(validated_element):
            if child_element is validated_element:
                continue  # skip the root self

            element_id = id_in_element(child_element)
            if element_id is None:
                # No usable integer id: let the shared helper decide whether
                # that is acceptable.
                validated_id = validate_id_in_element(
                    element=child_element,
                    enable_no_id=True,
                )
                if isinstance(validated_id, FoundInvalidIDError):
                    yield validated_id
                continue

            remain_expected_element = remain_expected_elements.pop(element_id, None)
            if remain_expected_element is None:
                yield InlineUnexpectedIDError(
                    id=element_id,
                    element=child_element,
                )

        if remain_expected_elements:
            yield InlineExpectedIDsError(
                id2element=remain_expected_elements,
            )

        yield from self._validate_children_structure(validated_element)

    def _child_inline_segments(self) -> Generator["InlineSegment", None, None]:
        """Yield every descendant ``InlineSegment`` in pre-order (self excluded)."""
        for child in self._children:
            if isinstance(child, InlineSegment):
                yield child
                yield from child._child_inline_segments()  # pylint: disable=protected-access

    def _validate_children_structure(
        self,
        validated_element: Element,
    ) -> Generator["InlineLostIDError | InlineWrongTagCountError", None, None]:
        """Check direct children for missing ids and wrong tag counts, recursively.

        Args:
            validated_element: Element compared against this segment.

        Yields:
            ``InlineLostIDError`` for a child whose tag uses ids but which
            has no ``ID_KEY`` attribute, and ``InlineWrongTagCountError`` for
            an id-less tag whose number of occurrences differs from the
            recorded count. Errors from deeper levels get ``self.parent``
            inserted at the front of their ``stack``.
        """
        tag2found_elements: dict[str, list[Element]] = {}

        for child_element in validated_element:
            ids = self._child_tag2ids.get(child_element.tag, None)
            if not ids:
                # Tag without ids (or unknown tag): only the count is checked.
                found_elements = ensure_list(tag2found_elements, child_element.tag)
                found_elements.append(child_element)
            else:
                id_str = child_element.get(ID_KEY, None)
                if id_str is None:
                    yield InlineLostIDError(
                        element=child_element,
                        stack=[self.parent],
                    )

        # NOTE(review): only tags that occur at least once in
        # validated_element are compared; an expected id-less tag that is
        # absent altogether produces no error here. Confirm this is intended.
        for tag, found_elements in tag2found_elements.items():
            expected_count = self._child_tag2count.get(tag, 0)
            if len(found_elements) != expected_count:
                yield InlineWrongTagCountError(
                    expected_count=expected_count,
                    found_elements=found_elements,
                    stack=[self.parent],
                )

        for child, child_element in self._match_children(validated_element):
            # pylint: disable=protected-access
            for error in child._validate_children_structure(child_element):
                error.stack.insert(0, self.parent)
                yield error

    # Even if the errors reported by self.validate(...) were not all
    # resolved, still produce a best-effort result of reasonable quality.
    def assign_attributes(self, template_element: Element) -> Element:
        """Build an element with the source attributes and the template's text.

        Children of ``template_element`` that can be paired with child
        segments are rebuilt recursively; the text of unpaired children is
        flattened with ``plain_text`` and merged into the surrounding text.

        Args:
            template_element: Element providing the text and child layout.

        Returns:
            A new element with ``self.parent``'s tag and attributes.
        """
        assigned_element = Element(self.parent.tag, self.parent.attrib)
        if template_element.text and template_element.text.strip():
            assigned_element.text = append_text_in_element(
                origin_text=assigned_element.text,
                append_text=template_element.text,
            )

        # Identity (id()) of every template child that was paired.
        matched_child_element_ids: set[int] = set()
        for child, child_element in self._match_children(template_element):
            child_assigned_element = child.assign_attributes(child_element)
            assigned_element.append(child_assigned_element)
            matched_child_element_ids.add(id(child_element))

        # Reversed so that pop() hands the assigned children out in order.
        assigned_child_element_stack = list(assigned_element)
        assigned_child_element_stack.reverse()

        previous_assigned_child_element: Element | None = None
        for child_element in template_element:
            # Only whether child_element is a split point matters, not which
            # assigned child it really corresponds to. In extreme cases the
            # order may be off; a rough alignment has to do.
            child_text: str = ""
            if id(child_element) not in matched_child_element_ids:
                child_text = plain_text(child_element)
            elif assigned_child_element_stack:
                previous_assigned_child_element = assigned_child_element_stack.pop()
            if child_element.tail is not None:
                child_text += child_element.tail
            if not child_text.strip():
                continue
            if previous_assigned_child_element is None:
                assigned_element.text = append_text_in_element(
                    origin_text=assigned_element.text,
                    append_text=child_text,
                )
            else:
                previous_assigned_child_element.tail = append_text_in_element(
                    origin_text=previous_assigned_child_element.tail,
                    append_text=child_text,
                )
        return assigned_element

    def _match_children(self, element: Element) -> Generator[tuple["InlineSegment", Element], None, None]:
        """Pair child segments with the direct children of ``element``.

        Pairing happens per tag: elements whose id is listed for that tag
        take the slot at the id's position, the remaining elements fill the
        still-empty slots in their natural order.

        Args:
            element: Element whose direct children are matched.

        Yields:
            ``(child_segment, child_element)`` pairs, ordered by the position
            of the child segment among this segment's inline children.
        """
        tag2elements = nest((c.tag, c) for c in element)
        tag2children = nest(
            (c.parent.tag, (i, c)) for i, c in enumerate(c for c in self._children if isinstance(c, InlineSegment))
        )
        used_ids: set[int] = set()
        children_and_elements: list[tuple[int, InlineSegment, Element]] = []

        for tag, orders_and_children in tag2children.items():
            # Prefer matching by id; whatever is left is matched in natural order.
            ids = self._child_tag2ids.get(tag, [])
            matched_children_elements: list[Element | None] = [None] * len(orders_and_children)
            not_matched_elements: list[Element] = []

            for child_element in tag2elements.get(tag, []):
                id_order: int | None = None
                child_id = id_in_element(child_element)
                if child_id is not None and child_id not in used_ids:
                    used_ids.add(child_id)  # an id may be used only once, to prevent duplicates
                    try:
                        id_order = ids.index(child_id)
                    except ValueError:
                        # Id not known for this tag: fall back to order matching.
                        pass
                if id_order is None:
                    not_matched_elements.append(child_element)
                else:
                    matched_children_elements[id_order] = child_element

            # Reversed so that pop() returns the leftovers in document order.
            not_matched_elements.reverse()
            for i, matched_element in enumerate(matched_children_elements):
                if not not_matched_elements:
                    break
                if matched_element is None:
                    matched_children_elements[i] = not_matched_elements.pop()

            for (order, child), child_element in zip(orders_and_children, matched_children_elements):
                if child_element is not None:
                    children_and_elements.append((order, child, child_element))

        for _, child, child_element in sorted(children_and_elements, key=lambda x: x[0]):
            yield child, child_element
@@ -4,71 +4,7 @@ from enum import Enum, auto
4
4
  from typing import Self
5
5
  from xml.etree.ElementTree import Element
6
6
 
7
- from .utils import expand_left_element_texts, expand_right_element_texts, normalize_text_in_element
8
-
9
- # HTML inline-level elements
10
- # Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
11
- # Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
12
- _HTML_INLINE_TAGS = frozenset(
13
- [
14
- # Inline text semantics
15
- "a",
16
- "abbr",
17
- "b",
18
- "bdi",
19
- "bdo",
20
- "br",
21
- "cite",
22
- "code",
23
- "data",
24
- "dfn",
25
- "em",
26
- "i",
27
- "kbd",
28
- "mark",
29
- "q",
30
- "rp",
31
- "rt",
32
- "ruby",
33
- "s",
34
- "samp",
35
- "small",
36
- "span",
37
- "strong",
38
- "sub",
39
- "sup",
40
- "time",
41
- "u",
42
- "var",
43
- "wbr",
44
- # Image and multimedia
45
- "img",
46
- "svg",
47
- "canvas",
48
- "audio",
49
- "video",
50
- "map",
51
- "area",
52
- # Form elements
53
- "input",
54
- "button",
55
- "select",
56
- "textarea",
57
- "label",
58
- "output",
59
- "progress",
60
- "meter",
61
- # Embedded content
62
- "iframe",
63
- "embed",
64
- "object",
65
- # Other inline elements
66
- "script",
67
- "del",
68
- "ins",
69
- "slot",
70
- ]
71
- )
7
+ from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_tag, normalize_text_in_element
72
8
 
73
9
 
74
10
  class TextPosition(Enum):
@@ -79,7 +15,6 @@ class TextPosition(Enum):
79
15
  @dataclass
80
16
  class TextSegment:
81
17
  text: str
82
- index: int # *.text is 0, the first *.tail is 1, and so on
83
18
  parent_stack: list[Element]
84
19
  left_common_depth: int
85
20
  right_common_depth: int
@@ -90,6 +25,10 @@ class TextSegment:
90
25
  def root(self) -> Element:
91
26
  return self.parent_stack[0]
92
27
 
28
+ @property
29
+ def depth(self) -> int:
30
+ return len(self.parent_stack) - self.block_depth
31
+
93
32
  @property
94
33
  def block_parent(self) -> Element:
95
34
  return self.parent_stack[self.block_depth - 1]
@@ -106,7 +45,6 @@ class TextSegment:
106
45
  def clone(self) -> "TextSegment":
107
46
  return TextSegment(
108
47
  text=self.text,
109
- index=self.index,
110
48
  parent_stack=list(self.parent_stack),
111
49
  left_common_depth=self.left_common_depth,
112
50
  right_common_depth=self.right_common_depth,
@@ -171,20 +109,18 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
171
109
  if text is not None:
172
110
  yield TextSegment(
173
111
  text=text,
174
- index=0,
175
112
  parent_stack=next_stack,
176
113
  left_common_depth=0,
177
114
  right_common_depth=0,
178
115
  block_depth=next_block_depth,
179
116
  position=TextPosition.TEXT,
180
117
  )
181
- for i, child_element in enumerate(element):
118
+ for child_element in element:
182
119
  yield from _search_text_segments(next_stack, child_element)
183
120
  child_tail = normalize_text_in_element(child_element.tail)
184
121
  if child_tail is not None:
185
122
  yield TextSegment(
186
123
  text=child_tail,
187
- index=i + 1,
188
124
  parent_stack=next_stack,
189
125
  left_common_depth=0,
190
126
  right_common_depth=0,
@@ -196,8 +132,7 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
196
132
  def _find_block_depth(parent_stack: list[Element]) -> int:
197
133
  index: int = 0
198
134
  for i in range(len(parent_stack) - 1, -1, -1):
199
- checked_tag = parent_stack[i].tag.lower()
200
- if checked_tag not in _HTML_INLINE_TAGS:
135
+ if not is_inline_tag(parent_stack[i].tag):
201
136
  index = i
202
137
  break
203
138
  return index + 1 # depth is a count not index
@@ -0,0 +1,43 @@
1
+ from xml.etree.ElementTree import Element
2
+
3
+ from ..xml import ID_KEY
4
+
5
+
6
def element_fingerprint(element: Element) -> str:
    """Return a string identifying an element's tag and attribute set.

    Two elements get the same fingerprint exactly when they have the same
    tag and the same attributes, regardless of attribute order. Text, tail
    and child elements are ignored.

    Args:
        element: Element to describe.

    Returns:
        A string of the form ``<tag key='value' .../>`` with the attributes
        sorted.
    """
    # Values are quoted with repr() so a value containing spaces or "=" can
    # never be mistaken for additional attributes: {"class": "x y=z"} and
    # {"class": "x", "y": "z"} must not share a fingerprint.
    attrs = sorted(f"{key}={value!r}" for key, value in element.attrib.items())
    return f"<{element.tag} {' '.join(attrs)}/>"
9
+
10
+
11
def unwrap_parents(element: Element) -> tuple[Element, list[Element]]:
    """Descend through a chain of single-child elements.

    Starting at ``element``, step into the only child for as long as the
    current element has exactly one child, a non-empty ``text`` and that
    child has a non-empty ``tail``. The tail of every child stepped into is
    reset to ``None`` (the input tree is modified).

    NOTE(review): the walk continues only while text/tail are present and
    then discards the tail; given the function name one might expect the
    opposite conditions. Behaviour is kept as is; confirm the intent.

    Args:
        element: Element at which the descent starts.

    Returns:
        ``(innermost, parents)``: the element where the descent stopped and
        the elements that were stepped through, outermost first.
    """
    parents: list[Element] = []
    current = element
    while len(current) == 1 and current.text and current[0].tail:
        only_child = current[0]
        parents.append(current)
        only_child.tail = None
        current = only_child
    return current, parents
25
+
26
+
27
def id_in_element(element: Element) -> int | None:
    """Read the integer id stored in the element's ``ID_KEY`` attribute.

    Args:
        element: Element to inspect.

    Returns:
        The attribute parsed with ``int()``, or ``None`` when the attribute
        is absent or is not a valid integer literal.
    """
    raw_id = element.get(ID_KEY, None)
    if raw_id is not None:
        try:
            return int(raw_id)
        except ValueError:
            # Malformed id: reported to the caller the same way as "no id".
            pass
    return None
35
+
36
+
37
class IDGenerator:
    """Hands out consecutive integer ids, starting at 1."""

    def __init__(self):
        # Last id handed out; 0 means none has been issued yet.
        self._previous_id: int = 0

    def next_id(self) -> int:
        """Return the next unused id (one greater than the previous one)."""
        issued = self._previous_id + 1
        self._previous_id = issued
        return issued