epub-translator 0.0.7__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. epub_translator/__init__.py +4 -2
  2. epub_translator/data/fill.jinja +66 -0
  3. epub_translator/data/mmltex/README.md +67 -0
  4. epub_translator/data/mmltex/cmarkup.xsl +1106 -0
  5. epub_translator/data/mmltex/entities.xsl +459 -0
  6. epub_translator/data/mmltex/glayout.xsl +222 -0
  7. epub_translator/data/mmltex/mmltex.xsl +36 -0
  8. epub_translator/data/mmltex/scripts.xsl +375 -0
  9. epub_translator/data/mmltex/tables.xsl +130 -0
  10. epub_translator/data/mmltex/tokens.xsl +328 -0
  11. epub_translator/data/translate.jinja +15 -12
  12. epub_translator/epub/__init__.py +4 -2
  13. epub_translator/epub/common.py +43 -0
  14. epub_translator/epub/math.py +193 -0
  15. epub_translator/epub/placeholder.py +53 -0
  16. epub_translator/epub/spines.py +42 -0
  17. epub_translator/epub/toc.py +505 -0
  18. epub_translator/epub/zip.py +67 -0
  19. epub_translator/iter_sync.py +24 -0
  20. epub_translator/language.py +23 -0
  21. epub_translator/llm/__init__.py +2 -1
  22. epub_translator/llm/core.py +233 -0
  23. epub_translator/llm/error.py +38 -35
  24. epub_translator/llm/executor.py +159 -136
  25. epub_translator/llm/increasable.py +28 -28
  26. epub_translator/llm/types.py +17 -0
  27. epub_translator/serial/__init__.py +2 -0
  28. epub_translator/serial/chunk.py +52 -0
  29. epub_translator/serial/segment.py +17 -0
  30. epub_translator/serial/splitter.py +50 -0
  31. epub_translator/template.py +35 -33
  32. epub_translator/translator.py +208 -178
  33. epub_translator/utils.py +7 -0
  34. epub_translator/xml/__init__.py +4 -3
  35. epub_translator/xml/deduplication.py +38 -0
  36. epub_translator/xml/firendly/__init__.py +2 -0
  37. epub_translator/xml/firendly/decoder.py +75 -0
  38. epub_translator/xml/firendly/encoder.py +84 -0
  39. epub_translator/xml/firendly/parser.py +177 -0
  40. epub_translator/xml/firendly/tag.py +118 -0
  41. epub_translator/xml/firendly/transform.py +36 -0
  42. epub_translator/xml/xml.py +52 -0
  43. epub_translator/xml/xml_like.py +231 -0
  44. epub_translator/xml_translator/__init__.py +3 -0
  45. epub_translator/xml_translator/const.py +2 -0
  46. epub_translator/xml_translator/fill.py +128 -0
  47. epub_translator/xml_translator/format.py +282 -0
  48. epub_translator/xml_translator/fragmented.py +125 -0
  49. epub_translator/xml_translator/group.py +183 -0
  50. epub_translator/xml_translator/progressive_locking.py +256 -0
  51. epub_translator/xml_translator/submitter.py +102 -0
  52. epub_translator/xml_translator/text_segment.py +263 -0
  53. epub_translator/xml_translator/translator.py +179 -0
  54. epub_translator/xml_translator/utils.py +29 -0
  55. epub_translator-0.1.1.dist-info/METADATA +283 -0
  56. epub_translator-0.1.1.dist-info/RECORD +58 -0
  57. epub_translator/data/format.jinja +0 -33
  58. epub_translator/epub/content_parser.py +0 -162
  59. epub_translator/epub/html/__init__.py +0 -1
  60. epub_translator/epub/html/dom_operator.py +0 -68
  61. epub_translator/epub/html/empty_tags.py +0 -23
  62. epub_translator/epub/html/file.py +0 -80
  63. epub_translator/epub/html/texts_searcher.py +0 -46
  64. epub_translator/llm/node.py +0 -201
  65. epub_translator/translation/__init__.py +0 -2
  66. epub_translator/translation/chunk.py +0 -118
  67. epub_translator/translation/splitter.py +0 -78
  68. epub_translator/translation/store.py +0 -36
  69. epub_translator/translation/translation.py +0 -231
  70. epub_translator/translation/types.py +0 -45
  71. epub_translator/translation/utils.py +0 -11
  72. epub_translator/xml/decoder.py +0 -71
  73. epub_translator/xml/encoder.py +0 -95
  74. epub_translator/xml/parser.py +0 -172
  75. epub_translator/xml/tag.py +0 -93
  76. epub_translator/xml/transform.py +0 -34
  77. epub_translator/xml/utils.py +0 -12
  78. epub_translator/zip_context.py +0 -74
  79. epub_translator-0.0.7.dist-info/METADATA +0 -170
  80. epub_translator-0.0.7.dist-info/RECORD +0 -36
  81. {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/LICENSE +0 -0
  82. {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/WHEEL +0 -0
epub_translator/xml_translator/group.py
@@ -0,0 +1,183 @@
+ from collections.abc import Generator, Iterable
+ from dataclasses import dataclass
+ from xml.etree.ElementTree import Element
+
+ from resource_segmentation import Resource, Segment, split
+ from tiktoken import Encoding
+
+ from .fragmented import group_fragmented_elements
+ from .text_segment import TextSegment, incision_between, search_text_segments
+
+ _BORDER_INCISION = 0
+ _ELLIPSIS = "..."
+
+
+ @dataclass
+ class XMLGroup:
+     head: list[TextSegment]
+     body: list[TextSegment]
+     tail: list[TextSegment]
+
+     def __iter__(self) -> Generator[TextSegment, None, None]:
+         yield from self.head
+         yield from self.body
+         yield from self.tail
+
+
+ class XMLGroupContext:
+     def __init__(self, encoding: Encoding, max_group_tokens: int) -> None:
+         self._encoding: Encoding = encoding
+         self._max_group_tokens: int = max_group_tokens
+
+     def split_groups(self, elements: Iterable[Element]) -> Generator[XMLGroup, None, None]:
+         for grouped_elements in group_fragmented_elements(
+             encoding=self._encoding,
+             elements=elements,
+             group_max_tokens=self._max_group_tokens,
+         ):
+             for group in split(
+                 resources=self._expand_text_segments(grouped_elements),
+                 max_segment_count=self._max_group_tokens,
+                 border_incision=_BORDER_INCISION,
+             ):
+                 yield XMLGroup(
+                     head=list(
+                         self._truncate_text_segments(
+                             segments=self._expand_text_segments_with_items(group.head),
+                             remain_head=False,
+                             remain_count=group.head_remain_count,
+                         )
+                     ),
+                     body=list(self._expand_text_segments_with_items(group.body)),
+                     tail=list(
+                         self._truncate_text_segments(
+                             segments=self._expand_text_segments_with_items(group.tail),
+                             remain_head=True,
+                             remain_count=group.tail_remain_count,
+                         )
+                     ),
+                 )
+
+     def _expand_text_segments(self, elements: Iterable[Element]):
+         for element in elements:
+             yield from self._expand_text_segments_with_element(element)
+
+     def _expand_text_segments_with_element(self, element: Element) -> Generator[Resource[TextSegment], None, None]:
+         generator = search_text_segments(element)
+         segment = next(generator, None)
+         start_incision = _BORDER_INCISION
+         if segment is None:
+             return
+
+         while True:
+             next_segment = next(generator, None)
+             if next_segment is None:
+                 break
+             incision1, incision2 = incision_between(
+                 segment1=segment,
+                 segment2=next_segment,
+             )
+             yield Resource(
+                 count=len(self._encoding.encode(segment.xml_text)),
+                 start_incision=start_incision,
+                 end_incision=incision1,
+                 payload=segment,
+             )
+             segment = next_segment
+             start_incision = incision2
+
+         yield Resource(
+             count=len(self._encoding.encode(segment.xml_text)),
+             start_incision=start_incision,
+             end_incision=_BORDER_INCISION,
+             payload=segment,
+         )
+
+     def _expand_text_segments_with_items(self, items: list[Resource[TextSegment] | Segment[TextSegment]]):
+         for item in items:
+             if isinstance(item, Resource):
+                 yield item.payload.clone()
+             elif isinstance(item, Segment):
+                 for resource in item.resources:
+                     yield resource.payload.clone()
+
+     def _truncate_text_segments(self, segments: Iterable[TextSegment], remain_head: bool, remain_count: int):
+         if remain_head:
+             yield from self._filter_and_remain_segments(
+                 segments=segments,
+                 remain_head=remain_head,
+                 remain_count=remain_count,
+             )
+         else:
+             yield from reversed(
+                 list(
+                     self._filter_and_remain_segments(
+                         segments=reversed(list(segments)),
+                         remain_head=remain_head,
+                         remain_count=remain_count,
+                     )
+                 )
+             )
+
+     def _filter_and_remain_segments(self, segments: Iterable[TextSegment], remain_head: bool, remain_count: int):
+         for segment in segments:
+             if remain_count <= 0:
+                 break
+             raw_xml_text = segment.xml_text
+             tokens = self._encoding.encode(raw_xml_text)
+             tokens_count = len(tokens)
+
+             if tokens_count > remain_count:
+                 truncated_segment = self._truncate_text_segment(
+                     segment=segment,
+                     tokens=tokens,
+                     raw_xml_text=raw_xml_text,
+                     remain_head=remain_head,
+                     remain_count=remain_count,
+                 )
+                 if truncated_segment is not None:
+                     yield truncated_segment
+                 break
+
+             yield segment
+             remain_count -= tokens_count
+
+     def _truncate_text_segment(
+         self,
+         segment: TextSegment,
+         tokens: list[int],
+         raw_xml_text: str,
+         remain_head: bool,
+         remain_count: int,
+     ) -> TextSegment | None:
+         # A typical xml_text: <tag id="99" data-origin-len="999">Some text</tag>
+         # If the cut point falls inside the leading XML markup, drop the segment entirely.
+         # If the cut point falls inside the trailing XML markup, keep the segment entirely.
+         # Only when the cut lands inside the text body do we actually truncate the text.
+         remain_text: str
+         xml_text_head_length = raw_xml_text.find(segment.text)
+
+         if remain_head:
+             remain_xml_text = self._encoding.decode(tokens[:remain_count])  # remain_count cannot be 0 here
+             if len(remain_xml_text) <= xml_text_head_length:
+                 return
+             if len(remain_xml_text) >= xml_text_head_length + len(segment.text):
+                 return segment
+             remain_text = remain_xml_text[xml_text_head_length:]
+         else:
+             xml_text_tail_length = len(raw_xml_text) - (xml_text_head_length + len(segment.text))
+             remain_xml_text = self._encoding.decode(tokens[-remain_count:])
+             if len(remain_xml_text) <= xml_text_tail_length:
+                 return
+             if len(remain_xml_text) >= xml_text_tail_length + len(segment.text):
+                 return segment
+             remain_text = remain_xml_text[: len(remain_xml_text) - xml_text_tail_length]
+
+         if not remain_text.strip():
+             return
+
+         if remain_head:
+             segment.text = f"{remain_text} {_ELLIPSIS}"
+         else:
+             segment.text = f"{_ELLIPSIS} {remain_text}"
+         return segment
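
The hunk above splits an XML tree into token-bounded groups, each with a truncated head/tail context window around the body it is responsible for. A minimal usage sketch, not taken from the diff: the cl100k_base encoding, the 4096-token budget, and the sample chapter are illustrative assumptions.

    from tiktoken import get_encoding
    from xml.etree.ElementTree import fromstring

    # hypothetical budget; the real caller passes its own limit
    context = XMLGroupContext(encoding=get_encoding("cl100k_base"), max_group_tokens=4096)
    chapter = fromstring("<body><p>First paragraph.</p><p>Second paragraph.</p></body>")

    for group in context.split_groups([chapter]):
        # head/tail hold truncated context (ending or starting with "..."),
        # body holds the segments this group should actually translate
        prompt_segments = [segment.xml_text for segment in group]
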
epub_translator/xml_translator/progressive_locking.py
@@ -0,0 +1,256 @@
+ """Progressive locking validator - guides the LLM to converge step by step on a correct result."""
+ # pylint: disable=protected-access # Intentional access to _ValidationContext internals
+
+ from xml.etree.ElementTree import Element
+
+ from .const import ID_KEY
+ from .format import _ValidationContext
+
+
+ class ProgressiveLockingValidator:
+     """
+     Progressive locking validator:
+     - collects all errors during validation (including already locked regions)
+     - identifies error-free subtrees and locks them
+     - reports only the errors in unlocked regions
+     - guarantees convergence: at least one node is locked per round, so it finishes in at most N rounds
+     """
+
+     def __init__(self):
+         self.locked_ids: set[int] = set()
+         self.no_progress_count: int = 0
+         self.lock_history: list[set[int]] = []  # nodes locked in each round
+
+     def validate_with_locking(
+         self,
+         template_ele: Element,
+         validated_ele: Element,
+         errors_limit: int,
+     ) -> tuple[bool, str | None, set[int]]:
+         """
+         Validate using progressive locking.
+
+         Returns:
+         - is_complete: whether every node has been locked (finished)
+         - error_message: error message for the unlocked regions (None means no errors)
+         - newly_locked: set of node IDs locked for the first time in this round
+         """
+
+         # 1. Run a full validation (including already locked regions)
+         context = _ValidationContext()
+         context.validate(raw_ele=template_ele, validated_ele=validated_ele)
+
+         # 2. Collect all errors (keyed by path)
+         all_errors = context._errors
+
+         # 3. Identify nodes that can be newly locked
+         newly_locked = self._find_lockable_nodes(template_ele, validated_ele, all_errors)
+
+         # 4. Detect a stall and unlock if necessary
+         if not newly_locked and self.locked_ids:
+             self.no_progress_count += 1
+             if self.no_progress_count >= 3:
+                 # Stuck: unlock the 2 most recently locked nodes and retry
+                 self._unlock_recent(count=2)
+                 self.no_progress_count = 0
+         else:
+             self.no_progress_count = 0
+
+         # 5. Update the locked set
+         self.locked_ids.update(newly_locked)
+         self.lock_history.append(newly_locked.copy())
+
+         # 6. Filter errors: keep only those in unlocked regions
+         unlocked_errors = self._filter_unlocked_errors(all_errors)
+
+         # 7. Build the error message
+         error_message = self._format_errors(unlocked_errors, errors_limit, template_ele)
+
+         # 8. Check whether we are done
+         total_nodes = self._count_nodes_with_id(template_ele)
+         is_complete = len(self.locked_ids) == total_nodes and error_message is None
+
+         return is_complete, error_message, newly_locked
+
+     def _find_lockable_nodes(
+         self, template_ele: Element, validated_ele: Element, errors: dict[tuple[int, ...], list[str]]
+     ) -> set[int]:
+         """
+         Find nodes that can be locked (the node and all of its descendants are error-free, and it is not locked yet).
+
+         Strategy: bottom-up, locking leaf nodes first.
+         """
+         lockable = set()
+
+         # Collect all nodes carrying an id, together with their depth
+         nodes_with_depth = []
+         for elem in template_ele.iter():
+             elem_id_str = elem.get(ID_KEY)
+             if elem_id_str is not None:
+                 elem_id = int(elem_id_str)
+                 if elem_id not in self.locked_ids:
+                     depth = self._get_depth(elem, template_ele)
+                     nodes_with_depth.append((depth, elem_id, elem))
+
+         # Check the deepest nodes first
+         nodes_with_depth.sort(reverse=True, key=lambda x: x[0])
+
+         for depth, elem_id, elem in nodes_with_depth:
+             # Check whether this node's subtree is completely error-free
+             if self._subtree_is_error_free(elem, template_ele, errors):
+                 # Verify that a corresponding node also exists in validated_ele
+                 validated_node = self._find_by_id(validated_ele, elem_id)
+                 if validated_node is not None:
+                     lockable.add(elem_id)
+
+         return lockable
+
+     def _subtree_is_error_free(
+         self, root: Element, template_root: Element, errors: dict[tuple[int, ...], list[str]]
+     ) -> bool:
+         """Check whether a subtree is completely error-free."""
+
+         # Get the node's path within the template
+         root_path = self._get_path_to_node(root, template_root)
+         if root_path is None:
+             return False
+
+         # Check whether this path or any of its descendant paths has errors
+         for error_path in errors.keys():
+             # The error path is a descendant of root_path, or equal to it
+             if self._is_descendant_path(error_path, root_path):
+                 return False
+
+         return True
+
+     def _get_path_to_node(self, target: Element, root: Element) -> tuple[int, ...] | None:
+         """Get the path from root to target (expressed as a sequence of ids)."""
+
+         def find_path(current: Element, path: list[int]) -> list[int] | None:
+             if current is target:
+                 return path
+
+             current_id_str = current.get(ID_KEY)
+             if current_id_str is not None:
+                 current_path = path + [int(current_id_str)]
+             else:
+                 current_path = path
+
+             for child in current:
+                 result = find_path(child, current_path)
+                 if result is not None:
+                     return result
+
+             return None
+
+         path = find_path(root, [])
+         return tuple(path) if path is not None else None
+
+     def _is_descendant_path(self, path: tuple[int, ...], ancestor_path: tuple[int, ...]) -> bool:
+         """Check whether path is a descendant of ancestor_path, or equal to it."""
+         if len(path) < len(ancestor_path):
+             return False
+         return path[: len(ancestor_path)] == ancestor_path
+
+     def _get_depth(self, elem: Element, root: Element) -> int:
+         """Get the depth of an element."""
+         path = self._get_path_to_node(elem, root)
+         return len(path) if path else 0
+
+     def _find_by_id(self, root: Element, target_id: int) -> Element | None:
+         """Find the element with the given id in the tree."""
+         for elem in root.iter():
+             elem_id_str = elem.get(ID_KEY)
+             if elem_id_str is not None and int(elem_id_str) == target_id:
+                 return elem
+         return None
+
+     def _filter_unlocked_errors(self, errors: dict[tuple[int, ...], list[str]]) -> dict[tuple[int, ...], list[str]]:
+         """Filter errors: keep only those whose path contains an unlocked node."""
+         unlocked_errors = {}
+
+         for path, error_list in errors.items():
+             # Check whether the path contains any unlocked node
+             has_unlocked = any(node_id not in self.locked_ids for node_id in path)
+             if has_unlocked:
+                 unlocked_errors[path] = error_list
+
+         return unlocked_errors
+
+     def _format_errors(
+         self, errors: dict[tuple[int, ...], list[str]], limit: int, template_ele: Element
+     ) -> str | None:
+         """Format the error message (reusing the existing logic)."""
+         if not errors:
+             return None
+
+         # Reuse the existing error-formatting logic
+         context = _ValidationContext()
+         context._errors = errors
+
+         # Build _tag_text_dict by extracting the real tag information from template_ele
+         id_to_elem: dict[int, Element] = {}
+         for elem in template_ele.iter():
+             elem_id_str = elem.get(ID_KEY)
+             if elem_id_str is not None:
+                 elem_id = int(elem_id_str)
+                 id_to_elem[elem_id] = elem
+
+         # Fill _tag_text_dict
+         for path in errors.keys():
+             for node_id in path:
+                 if node_id not in context._tag_text_dict:
+                     elem = id_to_elem.get(node_id)
+                     if elem is not None:
+                         context._tag_text_dict[node_id] = self._str_tag(elem)
+                     else:
+                         context._tag_text_dict[node_id] = f'<tag id="{node_id}">'
+
+         return context.errors(limit=limit)
+
+     def _str_tag(self, ele: Element) -> str:
+         """Build the string representation of a tag (consistent with the logic in format.py)."""
+         ele_id = ele.get(ID_KEY)
+         content: str
+         if ele_id is not None:
+             content = f'<{ele.tag} id="{ele_id}"'
+         else:
+             content = f"<{ele.tag}"
+         if len(ele) > 0:
+             content += f"> ... </{ele.tag}>"
+         else:
+             content += " />"
+         return content
+
+     def _count_nodes_with_id(self, root: Element) -> int:
+         """Count the nodes that carry an id attribute."""
+         count = 0
+         for elem in root.iter():
+             if elem.get(ID_KEY) is not None:
+                 count += 1
+         return count
+
+     def _unlock_recent(self, count: int):
+         """Unlock the count most recently locked nodes."""
+         if not self.lock_history:
+             return
+
+         unlocked_count = 0
+         # Unlock starting from the most recent rounds
+         for i in range(len(self.lock_history) - 1, -1, -1):
+             if unlocked_count >= count:
+                 break
+
+             locked_in_round = self.lock_history[i]
+             for node_id in locked_in_round:
+                 if unlocked_count >= count:
+                     break
+                 if node_id in self.locked_ids:
+                     self.locked_ids.remove(node_id)
+                     unlocked_count += 1
+
+     def get_progress_summary(self, total_nodes: int) -> str:
+         """Return a progress summary."""
+         locked_count = len(self.locked_ids)
+         percentage = (locked_count / total_nodes * 100) if total_nodes > 0 else 0
+         return f"{locked_count}/{total_nodes} nodes locked ({percentage:.1f}%)"
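
This validator is meant to be called once per LLM round: each call locks any subtrees that came back error-free and reports only the remaining errors, so the feedback sent to the model shrinks monotonically. A sketch of that retry loop under stated assumptions: `template` is the id-annotated source tree, `request_translation` is a hypothetical caller-side helper that queries the LLM and returns a candidate Element, and the round limit is illustrative; none of these appear in this diff.

    validator = ProgressiveLockingValidator()
    candidate = request_translation(feedback=None)  # first attempt (hypothetical helper)

    for _ in range(10):  # illustrative round limit
        done, errors, newly_locked = validator.validate_with_locking(
            template_ele=template,      # id-annotated source tree (assumed)
            validated_ele=candidate,
            errors_limit=20,
        )
        if done:
            break
        # re-query the LLM with only the errors from still-unlocked regions
        candidate = request_translation(feedback=errors)
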
epub_translator/xml_translator/submitter.py
@@ -0,0 +1,102 @@
+ from collections.abc import Iterable
+ from xml.etree.ElementTree import Element
+
+ from ..xml import iter_with_stack
+ from .text_segment import TextPosition, TextSegment, combine_text_segments
+
+
+ def submit_text_segments(element: Element, text_segments: Iterable[TextSegment]):
+     grouped_map = _group_text_segments(text_segments)
+     flatten_text_segments = dict(_extract_flatten_text_segments(element, grouped_map))
+     _append_text_segments(element, grouped_map)
+     _replace_text_segments(element, flatten_text_segments)
+
+
+ def _group_text_segments(text_segments: Iterable[TextSegment]):
+     grouped_map: dict[int, list[TextSegment]] = {}
+     for text_segment in text_segments:
+         parent_id = id(text_segment.block_parent)
+         grouped = grouped_map.get(parent_id, None)
+         if grouped is None:
+             grouped_map[parent_id] = grouped = []
+         grouped_map[parent_id].append(text_segment)
+     return grouped_map
+
+
+ # An overridden block is an incidental case: its child elements already trigger append operations,
+ # so appending to the block itself as well would scramble the reading order.
+ # In that case the translated text can only be attached directly after each of its original text runs.
+ def _extract_flatten_text_segments(element: Element, grouped_map: dict[int, list[TextSegment]]):
+     override_parent_ids: set[int] = set()
+     for parents, child_element in iter_with_stack(element):
+         if id(child_element) not in grouped_map:
+             continue
+         for parent in parents[:-1]:
+             parent_id = id(parent)
+             if parent_id in grouped_map:
+                 override_parent_ids.add(parent_id)
+
+     if id(element) in grouped_map:
+         override_parent_ids.add(id(element))  # the root never appears in parents, so add it separately
+
+     for parent_id in override_parent_ids:
+         yield parent_id, grouped_map.pop(parent_id)
+
+
+ def _replace_text_segments(element: Element, text_segments: dict[int, list[TextSegment]]):
+     for _, child_element in iter_with_stack(element):
+         tail_text_segments: list[TextSegment] = []
+         for text_segment in text_segments.get(id(child_element), ()):
+             if text_segment.position == TextPosition.TEXT:
+                 child_element.text = _append_text(
+                     origin_text=child_element.text,
+                     append_text=text_segment.text,
+                 )
+             elif text_segment.position == TextPosition.TAIL:
+                 tail_text_segments.append(text_segment)
+
+         tail_text_segments.sort(key=lambda t: t.index)
+         tail_text_segments.reverse()
+         for cc_element in child_element:
+             if not tail_text_segments:
+                 break
+             if cc_element.tail is not None:
+                 cc_element.tail = _append_text(
+                     origin_text=cc_element.tail,
+                     append_text=tail_text_segments.pop().text,
+                 )
+
+
+ def _append_text_segments(element: Element, grouped_map: dict[int, list[TextSegment]]):
+     for parents, child_element in iter_with_stack(element):
+         if not parents:
+             continue
+         grouped = grouped_map.get(id(child_element))
+         if not grouped:
+             continue
+         parent = parents[-1]
+         index = _index_of_parent(parent, child_element)
+         combined = next(
+             combine_text_segments(
+                 segments=(t.strip_block_parents() for t in grouped),
+             ),
+             None,
+         )
+         if combined is not None:
+             combined_element, _ = combined
+             parent.insert(index + 1, combined_element)
+             combined_element.tail = child_element.tail
+             child_element.tail = None
+
+
+ def _index_of_parent(parent: Element, checked_element: Element) -> int:
+     for i, child in enumerate(parent):
+         if child == checked_element:
+             return i
+     raise ValueError("Element not found in parent.")
+
+
+ def _append_text(origin_text: str | None, append_text: str) -> str:
+     if origin_text is None:
+         return append_text
+     else:
+         return origin_text + append_text
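
This module writes translated segments back into the original tree: for normal blocks a combined translated element is inserted right after the source block, while blocks whose children were already handled get the translation concatenated onto their own text and tails. A rough usage sketch, assuming `chapter_root` and `translated_segments` (TextSegment objects whose .text already holds the translated strings) come from the search and fill steps in the other new xml_translator modules; the names are illustrative.

    # translated_segments: Iterable[TextSegment] produced elsewhere in the pipeline
    submit_text_segments(chapter_root, translated_segments)
    # chapter_root now interleaves original and translated text in reading order
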