epub-translator 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +2 -2
- epub_translator/data/fill.jinja +143 -38
- epub_translator/epub/__init__.py +1 -1
- epub_translator/epub/metadata.py +122 -0
- epub_translator/epub/spines.py +3 -2
- epub_translator/epub/zip.py +11 -9
- epub_translator/epub_transcode.py +108 -0
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/context.py +109 -0
- epub_translator/llm/core.py +32 -113
- epub_translator/llm/executor.py +25 -31
- epub_translator/llm/increasable.py +1 -1
- epub_translator/llm/types.py +0 -3
- epub_translator/segment/__init__.py +26 -0
- epub_translator/segment/block_segment.py +124 -0
- epub_translator/segment/common.py +29 -0
- epub_translator/segment/inline_segment.py +356 -0
- epub_translator/{xml_translator → segment}/text_segment.py +8 -8
- epub_translator/segment/utils.py +43 -0
- epub_translator/translator.py +147 -183
- epub_translator/utils.py +33 -0
- epub_translator/xml/__init__.py +2 -0
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/deduplication.py +3 -3
- epub_translator/xml/self_closing.py +182 -0
- epub_translator/xml/utils.py +42 -0
- epub_translator/xml/xml.py +7 -0
- epub_translator/xml/xml_like.py +8 -33
- epub_translator/xml_interrupter.py +165 -0
- epub_translator/xml_translator/__init__.py +1 -2
- epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator/xml_translator/{const.py → common.py} +0 -1
- epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator/xml_translator/submitter.py +26 -72
- epub_translator/xml_translator/translator.py +162 -113
- epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
- epub_translator-0.1.3.dist-info/RECORD +66 -0
- epub_translator/epub/placeholder.py +0 -53
- epub_translator/iter_sync.py +0 -24
- epub_translator/xml_translator/fill.py +0 -128
- epub_translator/xml_translator/format.py +0 -282
- epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator/xml_translator/group.py +0 -183
- epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator/xml_translator/utils.py +0 -29
- epub_translator-0.1.1.dist-info/RECORD +0 -58
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
from collections.abc import Generator, Iterable, Iterator
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from xml.etree.ElementTree import Element
|
|
4
|
+
|
|
5
|
+
from ..utils import ensure_list, is_the_same, nest
|
|
6
|
+
from ..xml import ID_KEY, append_text_in_element, iter_with_stack, plain_text
|
|
7
|
+
from .common import FoundInvalidIDError, validate_id_in_element
|
|
8
|
+
from .text_segment import TextSegment
|
|
9
|
+
from .utils import IDGenerator, element_fingerprint, id_in_element
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class InlineLostIDError:
    """Validation finding: a child element that was expected to carry an ID attribute has none.

    Despite its name this is a plain record yielded by the validators, not an exception.
    """

    # The child element in the validated tree that is missing its ID attribute.
    element: Element
    # Expected parent elements leading to the finding; outer parents are
    # inserted at index 0 as the finding bubbles up through nested segments.
    stack: list[Element]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class InlineUnexpectedIDError:
    """Validation finding: an element carries an ID that no remaining expected segment owns.

    Despite its name this is a plain record yielded by the validators, not an exception.
    """

    # The ID value read from the offending element.
    id: int
    # The element in the validated tree that carries the unexpected ID.
    element: Element
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class InlineExpectedIDsError:
    """Validation finding: some expected IDs never showed up in the validated tree.

    Despite its name this is a plain record yielded by the validators, not an exception.
    """

    # Maps each missing ID to the original element that was expected to carry it.
    id2element: dict[int, Element]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class InlineWrongTagCountError:
    """Validation finding: the number of ID-less children with a given tag differs from the expectation.

    Despite its name this is a plain record yielded by the validators, not an exception.
    """

    # How many children with this tag the original segment contains.
    expected_count: int
    # The children with this tag that were actually found in the validated tree.
    found_elements: list[Element]
    # Expected parent elements leading to the finding; outer parents are
    # inserted at index 0 as the finding bubbles up through nested segments.
    stack: list[Element]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Union of the structural findings defined in this module.
InlineError = (
    InlineLostIDError
    | InlineUnexpectedIDError
    | InlineExpectedIDsError
    | InlineWrongTagCountError
)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["InlineSegment", None, None]:
    """Group a flat stream of text segments into one nested ``InlineSegment`` per block parent.

    Consecutive text segments that share the same ``block_parent`` (compared by
    identity) are collected on a depth-indexed stack. Whenever the block parent
    changes, and once more at the end of the input, the stack is collapsed and
    the resulting outermost ``InlineSegment`` (if any) is yielded.

    NOTE(review): consecutive segments with the same ``depth`` land in the same
    stack level without their parent elements being compared - confirm upstream
    never emits adjacent inline siblings with no text in between.
    """
    # (stack of per-depth child lists, current block parent, block depth of that parent)
    stack_data: tuple[list[list[TextSegment | InlineSegment]], Element, int] | None = None
    inline_segment: InlineSegment | None = None

    for text_segment in text_segments:
        if stack_data is not None:
            _, stack_block, _ = stack_data
            if stack_block is not text_segment.block_parent:
                # A new block started: flush everything gathered for the previous one.
                inline_segment = _pop_stack_data(stack_data)
                stack_data = None
                if inline_segment is not None:
                    yield inline_segment

        if stack_data is None:
            stack_data = (
                [],
                text_segment.block_parent,
                text_segment.block_depth,
            )

        stack, _, stack_base_depth = stack_data
        wanted_size = text_segment.depth + 1

        # Going deeper: open empty levels until the segment's level exists.
        while len(stack) < wanted_size:
            stack.append([])

        # Going shallower: close the deeper levels, folding each into its parent level.
        while len(stack) > wanted_size:
            _pop_stack(
                stack=stack,
                stack_base_depth=stack_base_depth,
            )

        # text_segment.depth acts as its index in the stack, so at this point
        # len(stack) == text_segment.depth + 1 must hold.
        stack[-1].append(text_segment)

    if stack_data is not None:
        inline_segment = _pop_stack_data(stack_data)
        if inline_segment is not None:
            yield inline_segment
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _pop_stack_data(stack_data: tuple[list[list["TextSegment | InlineSegment"]], Element, int]):
|
|
80
|
+
stack, _, stack_base_depth = stack_data
|
|
81
|
+
inline_segment: InlineSegment | None = None
|
|
82
|
+
while stack:
|
|
83
|
+
inline_segment = _pop_stack(
|
|
84
|
+
stack=stack,
|
|
85
|
+
stack_base_depth=stack_base_depth,
|
|
86
|
+
)
|
|
87
|
+
return inline_segment
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _pop_stack(
|
|
91
|
+
stack: list[list["TextSegment | InlineSegment"]],
|
|
92
|
+
stack_base_depth: int,
|
|
93
|
+
) -> "InlineSegment | None":
|
|
94
|
+
inline_segment: InlineSegment | None = None
|
|
95
|
+
depth = len(stack) + stack_base_depth - 1
|
|
96
|
+
popped = stack.pop()
|
|
97
|
+
if popped:
|
|
98
|
+
inline_segment = InlineSegment(depth, popped)
|
|
99
|
+
if stack and inline_segment is not None:
|
|
100
|
+
stack[-1].append(inline_segment)
|
|
101
|
+
return inline_segment
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class InlineSegment:
    """A nested group of text segments that share one parent element.

    Children are either ``TextSegment`` leaves or further ``InlineSegment``
    nodes. The class can rebuild a bare element tree from its children
    (``create_element``), check another tree against its expected structure
    (``validate``) and copy the original attributes onto a matching tree
    (``assign_attributes``).
    """

    def __init__(self, depth: int, children: list["TextSegment | InlineSegment"]) -> None:
        """Create a segment whose parent is ``children[0].parent_stack[depth - 1]``.

        ``children`` must be non-empty and ``depth`` positive.
        """
        assert depth > 0
        self.id: int | None = None
        self._children: list[TextSegment | InlineSegment] = children
        self._parent_stack: list[Element] = children[0].parent_stack[:depth]

        # Every tag group maps to a list of ids.
        # An empty list means the elements of that tag have identical attribute
        # structure, so there is no need to assign ids to tell them apart.
        # A non-empty list means every element under that tag has an id attribute.
        # Note: elements sharing a tag either all have ids or none of them do.
        # (This mapping is only filled in by recreate_ids.)
        self._child_tag2ids: dict[str, list[int]] = {}
        self._child_tag2count: dict[str, int] = {}

        next_temp_id: int = 0
        terms = nest((child.parent.tag, child) for child in children if isinstance(child, InlineSegment))

        for tag, child_terms in terms.items():
            self._child_tag2count[tag] = len(child_terms)
            # Temporary ids are only handed out when is_the_same() is false for
            # the fingerprints of the same-tag children, which keeps the number
            # of ids as small as possible.
            if not is_the_same(
                elements=(element_fingerprint(t.parent) for t in child_terms),
            ):
                for child in child_terms:
                    child.id = next_temp_id
                    next_temp_id += 1

    @property
    def head(self) -> TextSegment:
        """The first text segment reachable through the first child."""
        first_child = self._children[0]
        if isinstance(first_child, TextSegment):
            return first_child
        return first_child.head

    @property
    def tail(self) -> TextSegment:
        """The last text segment reachable through the last child."""
        last_child = self._children[-1]
        if isinstance(last_child, TextSegment):
            return last_child
        return last_child.tail

    @property
    def children(self) -> list["TextSegment | InlineSegment"]:
        """The direct children (the internal list itself, not a copy)."""
        return self._children

    @property
    def parent(self) -> Element:
        """The element this segment represents (last entry of the parent stack)."""
        return self._parent_stack[-1]

    @property
    def parent_stack(self) -> list[Element]:
        """Ancestor chain ending with ``parent`` (the internal list itself, not a copy)."""
        return self._parent_stack

    def __iter__(self) -> Iterator[TextSegment]:
        """Yield every text segment below this node, depth first, in child order."""
        for child in self._children:
            if isinstance(child, TextSegment):
                yield child
            elif isinstance(child, InlineSegment):
                yield from child

    def recreate_ids(self, id_generator: IDGenerator) -> None:
        """Replace existing child ids with fresh ones and rebuild the per-tag bookkeeping.

        Only children that already have an id receive a new one; children
        without an id stay without. The call recurses into nested segments.
        """
        self._child_tag2count.clear()
        self._child_tag2ids.clear()

        for child in self._children:
            if not isinstance(child, InlineSegment):
                continue
            child_tag = child.parent.tag
            # Register the tag even when no id is assigned, so the list may stay empty.
            ids = ensure_list(self._child_tag2ids, child_tag)
            if child.id is not None:
                child.id = id_generator.next_id()
                ids.append(child.id)
            child.recreate_ids(id_generator)
            self._child_tag2count[child_tag] = self._child_tag2count.get(child_tag, 0) + 1

    def create_element(self) -> Element:
        """Build a new element tree mirroring this segment's tags, texts and ids.

        Only the tag is taken from the parent element, no attributes. The id,
        when set, is written under ``ID_KEY``.
        """
        element = Element(self.parent.tag)
        previous_element: Element | None = None
        for child in self._children:
            if isinstance(child, InlineSegment):
                previous_element = child.create_element()
                element.append(previous_element)

            elif isinstance(child, TextSegment):
                if previous_element is None:
                    # No child element yet: the text belongs to the element itself.
                    element.text = append_text_in_element(
                        origin_text=element.text,
                        append_text=child.text,
                    )
                else:
                    # Text after a child element is stored as that child's tail.
                    previous_element.tail = append_text_in_element(
                        origin_text=previous_element.tail,
                        append_text=child.text,
                    )
        if self.id is not None:
            element.set(ID_KEY, str(self.id))
        return element

    def validate(self, validated_element: Element) -> Generator[InlineError | FoundInvalidIDError, None, None]:
        """Yield every mismatch between ``validated_element`` and this segment's expected structure.

        First the ids are checked across all descendants, then the tag
        structure is checked recursively.
        """
        remain_expected_elements: dict[int, Element] = {}
        for child in self._child_inline_segments():
            if child.id is not None:
                remain_expected_elements[child.id] = child.parent

        for _, child_element in iter_with_stack(validated_element):
            if child_element is validated_element:
                continue  # skip the root self

            element_id = id_in_element(child_element)
            if element_id is None:
                # No usable id: let the shared validator decide whether that is an error.
                validated_id = validate_id_in_element(
                    element=child_element,
                    enable_no_id=True,
                )
                if isinstance(validated_id, FoundInvalidIDError):
                    yield validated_id
                continue

            remain_expected_element = remain_expected_elements.pop(element_id, None)
            if remain_expected_element is None:
                # Unknown id, or an id that an earlier element already consumed.
                yield InlineUnexpectedIDError(
                    id=element_id,
                    element=child_element,
                )

        if remain_expected_elements:
            yield InlineExpectedIDsError(
                id2element=remain_expected_elements,
            )

        yield from self._validate_children_structure(validated_element)

    def _child_inline_segments(self) -> Generator["InlineSegment", None, None]:
        """Yield every nested ``InlineSegment`` below this node, parents before their descendants."""
        for child in self._children:
            if isinstance(child, InlineSegment):
                yield child
                yield from child._child_inline_segments()  # pylint: disable=protected-access

    def _validate_children_structure(self, validated_element: Element):
        """Yield lost-id and wrong-tag-count findings for the direct children, then recurse.

        NOTE(review): only tags that occur in ``validated_element`` are counted,
        so an expected tag that is missing entirely produces no
        ``InlineWrongTagCountError`` here - confirm that is intended.
        """
        tag2found_elements: dict[str, list[Element]] = {}

        for child_element in validated_element:
            ids = self._child_tag2ids.get(child_element.tag, None)
            if not ids:
                # Tags without ids are only checked by count.
                found_elements = ensure_list(tag2found_elements, child_element.tag)
                found_elements.append(child_element)
            else:
                id_str = child_element.get(ID_KEY, None)
                if id_str is None:
                    yield InlineLostIDError(
                        element=child_element,
                        stack=[self.parent],
                    )

        for tag, found_elements in tag2found_elements.items():
            expected_count = self._child_tag2count.get(tag, 0)
            if len(found_elements) != expected_count:
                yield InlineWrongTagCountError(
                    expected_count=expected_count,
                    found_elements=found_elements,
                    stack=[self.parent],
                )

        for child, child_element in self._match_children(validated_element):
            # pylint: disable=protected-access
            for error in child._validate_children_structure(child_element):
                # Prepend this level so the stack reads from the outermost parent inwards.
                error.stack.insert(0, self.parent)
                yield error

    # Even when the errors reported by self.validate(...) were not fully
    # eliminated, still produce a best-effort version of reasonable quality.
    def assign_attributes(self, template_element: Element) -> Element:
        """Return a new element with the original tags and attributes and the template's text.

        Template children that match one of this segment's child segments are
        rebuilt recursively. Unmatched template children are flattened to their
        plain text, which is attached to the nearest preceding rebuilt child,
        or to the element's own text when there is none.
        """
        assigned_element = Element(self.parent.tag, self.parent.attrib)
        if template_element.text and template_element.text.strip():
            assigned_element.text = append_text_in_element(
                origin_text=assigned_element.text,
                append_text=template_element.text,
            )

        # Identity (id()) of every template child that was matched to a child segment.
        matched_child_element_ids: set[int] = set()
        for child, child_element in self._match_children(template_element):
            child_assigned_element = child.assign_attributes(child_element)
            assigned_element.append(child_assigned_element)
            matched_child_element_ids.add(id(child_element))

        # Reversed so that pop() hands the rebuilt children out in document order.
        assigned_child_element_stack = list(assigned_element)
        assigned_child_element_stack.reverse()

        previous_assigned_child_element: Element | None = None
        for child_element in template_element:
            # Only whether child_element is a split point matters here, not its
            # true counterpart. In extreme cases the order may be off, so a
            # rough alignment has to do.
            child_text: str = ""
            if id(child_element) not in matched_child_element_ids:
                child_text = plain_text(child_element)
            elif assigned_child_element_stack:
                previous_assigned_child_element = assigned_child_element_stack.pop()
            if child_element.tail is not None:
                child_text += child_element.tail
            if not child_text.strip():
                continue
            if previous_assigned_child_element is None:
                assigned_element.text = append_text_in_element(
                    origin_text=assigned_element.text,
                    append_text=child_text,
                )
            else:
                previous_assigned_child_element.tail = append_text_in_element(
                    origin_text=previous_assigned_child_element.tail,
                    append_text=child_text,
                )
        return assigned_element

    def _match_children(self, element: Element) -> Generator[tuple["InlineSegment", Element], None, None]:
        """Pair this segment's child segments with the children of ``element``, tag by tag.

        Pairs are yielded in the order of the child segments. Child segments
        for which no element could be found are left out.
        """
        tag2elements = nest((c.tag, c) for c in element)
        tag2children = nest(
            (c.parent.tag, (i, c)) for i, c in enumerate(c for c in self._children if isinstance(c, InlineSegment))
        )
        used_ids: set[int] = set()
        children_and_elements: list[tuple[int, InlineSegment, Element]] = []

        for tag, orders_and_children in tag2children.items():
            # Prefer matches by id; whatever is left is matched in natural order as far as possible.
            ids = self._child_tag2ids.get(tag, [])
            matched_children_elements: list[Element | None] = [None] * len(orders_and_children)
            not_matched_elements: list[Element] = []

            for child_element in tag2elements.get(tag, []):
                id_order: int | None = None
                child_id = id_in_element(child_element)
                if child_id is not None and child_id not in used_ids:
                    used_ids.add(child_id)  # an id may be used only once, to prevent duplicates
                    try:
                        id_order = ids.index(child_id)
                    except ValueError:
                        # Unknown id for this tag: fall back to positional matching.
                        pass
                if id_order is None:
                    not_matched_elements.append(child_element)
                else:
                    matched_children_elements[id_order] = child_element

            # Reversed so that pop() hands the leftovers out in document order.
            not_matched_elements.reverse()
            for i, matched_element in enumerate(matched_children_elements):
                if not not_matched_elements:
                    break
                if matched_element is None:
                    matched_children_elements[i] = not_matched_elements.pop()

            for (order, child), child_element in zip(orders_and_children, matched_children_elements):
                if child_element is not None:
                    children_and_elements.append((order, child, child_element))

        for _, child, child_element in sorted(children_and_elements, key=lambda x: x[0]):
            yield child, child_element
|
|
@@ -4,13 +4,13 @@ from enum import Enum, auto
|
|
|
4
4
|
from typing import Self
|
|
5
5
|
from xml.etree.ElementTree import Element
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from ..xml import expand_left_element_texts, expand_right_element_texts, normalize_text_in_element
|
|
8
8
|
|
|
9
9
|
# HTML inline-level elements
|
|
10
10
|
# Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
|
|
11
11
|
# Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
|
|
12
12
|
_HTML_INLINE_TAGS = frozenset(
|
|
13
|
-
|
|
13
|
+
(
|
|
14
14
|
# Inline text semantics
|
|
15
15
|
"a",
|
|
16
16
|
"abbr",
|
|
@@ -67,7 +67,7 @@ _HTML_INLINE_TAGS = frozenset(
|
|
|
67
67
|
"del",
|
|
68
68
|
"ins",
|
|
69
69
|
"slot",
|
|
70
|
-
|
|
70
|
+
)
|
|
71
71
|
)
|
|
72
72
|
|
|
73
73
|
|
|
@@ -79,7 +79,6 @@ class TextPosition(Enum):
|
|
|
79
79
|
@dataclass
|
|
80
80
|
class TextSegment:
|
|
81
81
|
text: str
|
|
82
|
-
index: int # *.text is 0, the first *.tail is 1, and so on
|
|
83
82
|
parent_stack: list[Element]
|
|
84
83
|
left_common_depth: int
|
|
85
84
|
right_common_depth: int
|
|
@@ -90,6 +89,10 @@ class TextSegment:
|
|
|
90
89
|
def root(self) -> Element:
|
|
91
90
|
return self.parent_stack[0]
|
|
92
91
|
|
|
92
|
+
@property
|
|
93
|
+
def depth(self) -> int:
|
|
94
|
+
return len(self.parent_stack) - self.block_depth
|
|
95
|
+
|
|
93
96
|
@property
|
|
94
97
|
def block_parent(self) -> Element:
|
|
95
98
|
return self.parent_stack[self.block_depth - 1]
|
|
@@ -106,7 +109,6 @@ class TextSegment:
|
|
|
106
109
|
def clone(self) -> "TextSegment":
|
|
107
110
|
return TextSegment(
|
|
108
111
|
text=self.text,
|
|
109
|
-
index=self.index,
|
|
110
112
|
parent_stack=list(self.parent_stack),
|
|
111
113
|
left_common_depth=self.left_common_depth,
|
|
112
114
|
right_common_depth=self.right_common_depth,
|
|
@@ -171,20 +173,18 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
|
|
|
171
173
|
if text is not None:
|
|
172
174
|
yield TextSegment(
|
|
173
175
|
text=text,
|
|
174
|
-
index=0,
|
|
175
176
|
parent_stack=next_stack,
|
|
176
177
|
left_common_depth=0,
|
|
177
178
|
right_common_depth=0,
|
|
178
179
|
block_depth=next_block_depth,
|
|
179
180
|
position=TextPosition.TEXT,
|
|
180
181
|
)
|
|
181
|
-
for
|
|
182
|
+
for child_element in element:
|
|
182
183
|
yield from _search_text_segments(next_stack, child_element)
|
|
183
184
|
child_tail = normalize_text_in_element(child_element.tail)
|
|
184
185
|
if child_tail is not None:
|
|
185
186
|
yield TextSegment(
|
|
186
187
|
text=child_tail,
|
|
187
|
-
index=i + 1,
|
|
188
188
|
parent_stack=next_stack,
|
|
189
189
|
left_common_depth=0,
|
|
190
190
|
right_common_depth=0,
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from xml.etree.ElementTree import Element
|
|
2
|
+
|
|
3
|
+
from ..xml import ID_KEY
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def element_fingerprint(element: Element) -> str:
    """Return a string built from the element's tag and its sorted ``key=value`` attributes.

    Text and child elements do not take part in the fingerprint. Two elements
    with the same tag and the same attributes give the same string whatever
    their attribute order.
    """
    attrs = sorted(f"{key}={value}" for key, value in element.attrib.items())
    return f"<{element.tag} {' '.join(attrs)}/>"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def unwrap_parents(element: Element) -> tuple[Element, list[Element]]:
    """Descend through single-child wrappers and return the innermost element plus the wrappers passed.

    Descent continues while the current element has exactly one child, a
    non-empty ``text`` and that child has a non-empty ``tail``. Each child that
    is descended into has its ``tail`` reset to ``None`` (the input tree is
    mutated).

    NOTE(review): descent only happens when both ``text`` and ``tail`` are
    non-empty, and the tail is then discarded - confirm these conditions are
    not inverted relative to the intent.
    """
    parents: list[Element] = []
    while len(element) == 1:
        child = element[0]
        if not element.text or not child.tail:
            break
        parents.append(element)
        element = child
        element.tail = None
    return element, parents
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def id_in_element(element: Element) -> int | None:
    """Return the integer stored under ``ID_KEY`` on ``element``.

    ``None`` is returned both when the attribute is absent and when its value
    cannot be parsed by ``int``.
    """
    id_str = element.get(ID_KEY, None)
    if id_str is None:
        return None
    try:
        return int(id_str)
    except ValueError:
        # A malformed id is treated the same as a missing one.
        return None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class IDGenerator:
    """Hands out consecutive integer ids, starting at 1."""

    def __init__(self) -> None:
        # Last id handed out; 0 means none has been issued yet.
        self._previous_id: int = 0

    def next_id(self) -> int:
        """Return the next id (one greater than the previously returned one)."""
        self._previous_id += 1
        return self._previous_id
|