epub-translator 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff compares the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (50)
  1. epub_translator/__init__.py +2 -2
  2. epub_translator/data/fill.jinja +143 -38
  3. epub_translator/epub/__init__.py +1 -1
  4. epub_translator/epub/metadata.py +122 -0
  5. epub_translator/epub/spines.py +3 -2
  6. epub_translator/epub/zip.py +11 -9
  7. epub_translator/epub_transcode.py +108 -0
  8. epub_translator/llm/__init__.py +1 -0
  9. epub_translator/llm/context.py +109 -0
  10. epub_translator/llm/core.py +32 -113
  11. epub_translator/llm/executor.py +25 -31
  12. epub_translator/llm/increasable.py +1 -1
  13. epub_translator/llm/types.py +0 -3
  14. epub_translator/segment/__init__.py +26 -0
  15. epub_translator/segment/block_segment.py +124 -0
  16. epub_translator/segment/common.py +29 -0
  17. epub_translator/segment/inline_segment.py +356 -0
  18. epub_translator/{xml_translator → segment}/text_segment.py +8 -8
  19. epub_translator/segment/utils.py +43 -0
  20. epub_translator/translator.py +147 -183
  21. epub_translator/utils.py +33 -0
  22. epub_translator/xml/__init__.py +2 -0
  23. epub_translator/xml/const.py +1 -0
  24. epub_translator/xml/deduplication.py +3 -3
  25. epub_translator/xml/self_closing.py +182 -0
  26. epub_translator/xml/utils.py +42 -0
  27. epub_translator/xml/xml.py +7 -0
  28. epub_translator/xml/xml_like.py +8 -33
  29. epub_translator/xml_interrupter.py +165 -0
  30. epub_translator/xml_translator/__init__.py +1 -2
  31. epub_translator/xml_translator/callbacks.py +34 -0
  32. epub_translator/xml_translator/{const.py → common.py} +0 -1
  33. epub_translator/xml_translator/hill_climbing.py +104 -0
  34. epub_translator/xml_translator/stream_mapper.py +253 -0
  35. epub_translator/xml_translator/submitter.py +26 -72
  36. epub_translator/xml_translator/translator.py +162 -113
  37. epub_translator/xml_translator/validation.py +458 -0
  38. {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
  39. epub_translator-0.1.3.dist-info/RECORD +66 -0
  40. epub_translator/epub/placeholder.py +0 -53
  41. epub_translator/iter_sync.py +0 -24
  42. epub_translator/xml_translator/fill.py +0 -128
  43. epub_translator/xml_translator/format.py +0 -282
  44. epub_translator/xml_translator/fragmented.py +0 -125
  45. epub_translator/xml_translator/group.py +0 -183
  46. epub_translator/xml_translator/progressive_locking.py +0 -256
  47. epub_translator/xml_translator/utils.py +0 -29
  48. epub_translator-0.1.1.dist-info/RECORD +0 -58
  49. {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
  50. {epub_translator-0.1.1.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
epub_translator/translator.py CHANGED
@@ -1,214 +1,178 @@
  from collections.abc import Callable
+ from dataclasses import dataclass
+ from enum import Enum, auto
+ from importlib.metadata import version as get_package_version
+ from os import PathLike
  from pathlib import Path
- from xml.etree.ElementTree import Element

- from .epub import Placeholder, Zip, is_placeholder_tag, read_toc, search_spine_paths, write_toc
- from .epub.common import find_opf_path
+ from .epub import (
+     Zip,
+     read_metadata,
+     read_toc,
+     search_spine_paths,
+     write_metadata,
+     write_toc,
+ )
+ from .epub_transcode import decode_metadata, decode_toc_list, encode_metadata, encode_toc_list
  from .llm import LLM
- from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first, plain_text
- from .xml_translator import XMLGroupContext, XMLTranslator, submit_text_segments
+ from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
+ from .xml_interrupter import XMLInterrupter
+ from .xml_translator import FillFailedEvent, XMLTranslator
+
+
+ class _ElementType(Enum):
+     TOC = auto()
+     METADATA = auto()
+     CHAPTER = auto()
+
+
+ @dataclass
+ class _ElementContext:
+     element_type: _ElementType
+     chapter_data: tuple[Path, XMLLikeNode] | None = None


  def translate(
-     llm: LLM,
-     source_path: Path,
-     target_path: Path,
+     source_path: PathLike | str,
+     target_path: PathLike | str,
      target_language: str,
      user_prompt: str | None = None,
      max_retries: int = 5,
      max_group_tokens: int = 1200,
+     llm: LLM | None = None,
+     translation_llm: LLM | None = None,
+     fill_llm: LLM | None = None,
      on_progress: Callable[[float], None] | None = None,
+     on_fill_failed: Callable[[FillFailedEvent], None] | None = None,
  ) -> None:
+     translation_llm = translation_llm or llm
+     fill_llm = fill_llm or llm
+     if translation_llm is None:
+         raise ValueError("Either translation_llm or llm must be provided")
+     if fill_llm is None:
+         raise ValueError("Either fill_llm or llm must be provided")
+
      translator = XMLTranslator(
-         llm=llm,
+         translation_llm=translation_llm,
+         fill_llm=fill_llm,
          target_language=target_language,
          user_prompt=user_prompt,
          ignore_translated_error=False,
          max_retries=max_retries,
          max_fill_displaying_errors=10,
-         group_context=XMLGroupContext(
-             encoding=llm.encoding,
-             max_group_tokens=max_group_tokens,
-         ),
+         max_group_tokens=max_group_tokens,
+         cache_seed_content=f"{_get_version()}:{target_language}",
      )
-     with Zip(source_path, target_path) as zip:
-         # Progress distribution: TOC 3%, metadata 2%, chapters 95%
-         TOC_PROGRESS = 0.03
-         METADATA_PROGRESS = 0.02
-         CHAPTERS_PROGRESS = 0.95
-
-         # Count total chapters for progress calculation (lightweight, no content loading)
-         total_chapters = _count_chapters(zip)
-         chapter_progress_step = CHAPTERS_PROGRESS / total_chapters if total_chapters > 0 else 0
-
+     with Zip(
+         source_path=Path(source_path).resolve(),
+         target_path=Path(target_path).resolve(),
+     ) as zip:
+         # mimetype should be the first file in the EPUB ZIP
+         zip.migrate(Path("mimetype"))
+
+         total_chapters = sum(1 for _, _ in search_spine_paths(zip))
+         toc_list = read_toc(zip)
+         metadata_fields = read_metadata(zip)
+
+         # Calculate weights: TOC (5%), Metadata (5%), Chapters (90%)
+         toc_has_items = len(toc_list) > 0
+         metadata_has_items = len(metadata_fields) > 0
+         total_items = (1 if toc_has_items else 0) + (1 if metadata_has_items else 0) + total_chapters
+
+         if total_items == 0:
+             return
+
+         interrupter = XMLInterrupter()
+         element_contexts: dict[int, _ElementContext] = {}
+
+         toc_weight = 0.05 if toc_has_items else 0
+         metadata_weight = 0.05 if metadata_has_items else 0
+         chapters_weight = 1.0 - toc_weight - metadata_weight
+         progress_per_chapter = chapters_weight / total_chapters if total_chapters > 0 else 0
          current_progress = 0.0

-         # Translate TOC
-         _translate_toc(translator, zip)
-         current_progress += TOC_PROGRESS
-         if on_progress:
-             on_progress(current_progress)
-
-         # Translate metadata
-         _translate_metadata(translator, zip)
-         current_progress += METADATA_PROGRESS
-         if on_progress:
-             on_progress(current_progress)
-
-         # Translate chapters
-         processed_chapters = 0
-         for element, text_segments, (chapter_path, xml, placeholder) in translator.translate_to_text_segments(
-             items=_search_chapter_items(zip),
+         for translated_elem in translator.translate_elements(
+             interrupt_source_text_segments=interrupter.interrupt_source_text_segments,
+             interrupt_translated_text_segments=interrupter.interrupt_translated_text_segments,
+             interrupt_block_element=interrupter.interrupt_block_element,
+             on_fill_failed=on_fill_failed,
+             elements=_generate_elements_from_book(
+                 zip=zip,
+                 toc_list=toc_list,
+                 metadata_fields=metadata_fields,
+                 element_contexts=element_contexts,
+             ),
          ):
-             submit_text_segments(
-                 element=element,
-                 text_segments=(
-                     segment
-                     for segment in text_segments
-                     if not any(is_placeholder_tag(e.tag) for e in segment.parent_stack)
-                 ),
-             )
-             placeholder.recover()
-             deduplicate_ids_in_element(xml.element)
-             with zip.replace(chapter_path) as target_file:
-                 xml.save(target_file)
-
-             # Update progress after each chapter
-             processed_chapters += 1
-             current_progress = TOC_PROGRESS + METADATA_PROGRESS + (processed_chapters * chapter_progress_step)
-             if on_progress:
-                 on_progress(current_progress)
-
-
- def _translate_toc(translator: XMLTranslator, zip: Zip):
-     """Translate TOC (Table of Contents) titles."""
-     toc_list = read_toc(zip)
-     if not toc_list:
-         return
-
-     # Collect all titles recursively
-     titles_to_translate: list[str] = []
-
-     def collect_titles(items):
-         for item in items:
-             titles_to_translate.append(item.title)
-             if item.children:
-                 collect_titles(item.children)
-
-     collect_titles(toc_list)
-
-     # Create XML elements for translation
-     elements_to_translate = Element("toc")
-     elements_to_translate.extend(_create_text_element(title) for title in titles_to_translate)
-
-     # Translate all titles at once
-     translated_element = translator.translate_to_element(elements_to_translate)
-
-     # Extract translated texts
-     from builtins import zip as builtin_zip
-
-     translated_titles = [
-         plain_text(elem) if elem is not None else original
-         for elem, original in builtin_zip(translated_element, titles_to_translate)
-     ]
-
-     # Fill back translated titles
-     title_index = 0
-
-     def fill_titles(items):
-         nonlocal title_index
-         for item in items:
-             item.title = translated_titles[title_index]
-             title_index += 1
-             if item.children:
-                 fill_titles(item.children)
-
-     fill_titles(toc_list)
-
-     # Write back the translated TOC
-     write_toc(zip, toc_list)
-
-
- def _translate_metadata(translator: XMLTranslator, zip: Zip):
-     """Translate metadata fields in OPF file."""
-     opf_path = find_opf_path(zip)
-
-     with zip.read(opf_path) as f:
-         xml = XMLLikeNode(f)
-
-     # Find metadata element
-     metadata_elem = None
-     for child in xml.element:
-         if child.tag.endswith("metadata"):
-             metadata_elem = child
-             break
-
-     if metadata_elem is None:
-         return
-
-     # Collect metadata fields to translate
-     # Skip fields that should not be translated
-     skip_fields = {
-         "language",
-         "identifier",
-         "date",
-         "meta",
-         "contributor",  # Usually technical information
-     }
-
-     fields_to_translate: list[tuple[Element, str]] = []
-
-     for elem in metadata_elem:
-         # Get tag name without namespace
-         tag_name = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
-
-         # Check if element has text content and should be translated
-         if elem.text and elem.text.strip() and tag_name not in skip_fields:
-             fields_to_translate.append((elem, elem.text.strip()))
-
-     if not fields_to_translate:
-         return
-
-     # Create XML elements for translation
-     elements_to_translate = Element("metadata")
-     elements_to_translate.extend(_create_text_element(text) for _, text in fields_to_translate)
-
-     # Translate all metadata at once
-     translated_element = translator.translate_to_element(elements_to_translate)
-
-     # Fill back translated texts
-     from builtins import zip as builtin_zip
-
-     for (elem, _), translated_elem in builtin_zip(fields_to_translate, translated_element, strict=True):
-         if translated_elem is not None:
-             translated_text = plain_text(translated_elem)
-             if translated_text:
-                 elem.text = translated_text
-
-     # Write back the modified OPF file
-     with zip.replace(opf_path) as f:
-         xml.save(f)
-
-
- def _count_chapters(zip: Zip) -> int:
-     """Count total chapters without loading content (lightweight)."""
-     return sum(1 for _ in search_spine_paths(zip))
-
-
- def _search_chapter_items(zip: Zip):
-     for chapter_path in search_spine_paths(zip):
+             elem_id = id(translated_elem)
+             context = element_contexts.pop(elem_id, None)
+
+             if context is None:
+                 continue
+
+             if context.element_type == _ElementType.TOC:
+                 decoded_toc = decode_toc_list(translated_elem)
+                 write_toc(zip, decoded_toc)
+
+                 current_progress += toc_weight
+                 if on_progress:
+                     on_progress(current_progress)
+
+             elif context.element_type == _ElementType.METADATA:
+                 decoded_metadata = decode_metadata(translated_elem)
+                 write_metadata(zip, decoded_metadata)
+
+                 current_progress += metadata_weight
+                 if on_progress:
+                     on_progress(current_progress)
+
+             elif context.element_type == _ElementType.CHAPTER:
+                 if context.chapter_data is not None:
+                     chapter_path, xml = context.chapter_data
+                     deduplicate_ids_in_element(xml.element)
+                     with zip.replace(chapter_path) as target_file:
+                         xml.save(target_file)
+
+                 current_progress += progress_per_chapter
+                 if on_progress:
+                     on_progress(current_progress)
+
+
+ def _generate_elements_from_book(
+     zip: Zip,
+     toc_list: list,
+     metadata_fields: list,
+     element_contexts: dict[int, _ElementContext],
+ ):
+     if toc_list:
+         toc_elem = encode_toc_list(toc_list)
+         elem_id = id(toc_elem)
+         element_contexts[elem_id] = _ElementContext(element_type=_ElementType.TOC)
+         yield toc_elem
+
+     if metadata_fields:
+         metadata_elem = encode_metadata(metadata_fields)
+         elem_id = id(metadata_elem)
+         element_contexts[elem_id] = _ElementContext(element_type=_ElementType.METADATA)
+         yield metadata_elem
+
+     for chapter_path, media_type in search_spine_paths(zip):
          with zip.read(chapter_path) as chapter_file:
              xml = XMLLikeNode(
                  file=chapter_file,
-                 is_html_like=chapter_path.suffix.lower() in (".html", ".htm"),
+                 is_html_like=(media_type == "text/html"),
              )
              body_element = find_first(xml.element, "body")
              if body_element is not None:
-                 placeholder = Placeholder(body_element)
-                 yield body_element, (chapter_path, xml, placeholder)
+                 elem_id = id(body_element)
+                 element_contexts[elem_id] = _ElementContext(
+                     element_type=_ElementType.CHAPTER,
+                     chapter_data=(chapter_path, xml),
+                 )
+                 yield body_element


- def _create_text_element(text: str) -> Element:
-     elem = Element("text")
-     elem.text = text
-     return elem
+ def _get_version() -> str:
+     try:
+         return get_package_version("epub-translator")
+     except Exception:
+         return "development"
epub_translator/utils.py CHANGED
@@ -1,7 +1,40 @@
  import re
+ from collections.abc import Iterable
+ from typing import TypeVar
+
+ K = TypeVar("K")
+ T = TypeVar("T")

  _WHITESPACE_PATTERN = re.compile(r"\s+")


  def normalize_whitespace(text: str) -> str:
      return _WHITESPACE_PATTERN.sub(" ", text)
+
+
+ def is_the_same(elements: Iterable[T]) -> bool:
+     iterator = iter(elements)
+     try:
+         first_element = next(iterator)
+     except StopIteration:
+         return True
+
+     for element in iterator:
+         if element != first_element:
+             return False
+     return True
+
+
+ def nest(items: Iterable[tuple[K, T]]) -> dict[K, list[T]]:
+     nested_dict: dict[K, list[T]] = {}
+     for key, value in items:
+         ensure_list(nested_dict, key).append(value)
+     return nested_dict
+
+
+ def ensure_list(target: dict[K, list[T]], key: K) -> list[T]:
+     value = target.get(key, None)
+     if value is None:
+         value = []
+         target[key] = value
+     return value
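
The contracts of the new helpers are easiest to read off from a few examples; this sketch follows directly from the definitions above:

    from epub_translator.utils import is_the_same, nest

    assert is_the_same([1, 1, 1])        # every element equals the first
    assert is_the_same([])               # vacuously true for an empty iterable
    assert not is_the_same(["a", "b"])

    # nest() groups (key, value) pairs into lists, preserving encounter order:
    assert nest([("a", 1), ("b", 2), ("a", 3)]) == {"a": [1, 3], "b": [2]}

ensure_list is the get-or-create step that nest relies on: it returns the existing list for a key, or inserts and returns a fresh one.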
epub_translator/xml/__init__.py CHANGED
@@ -1,4 +1,6 @@
+ from .const import *
  from .deduplication import *
  from .firendly import *
+ from .utils import *
  from .xml import *
  from .xml_like import *
epub_translator/xml/const.py ADDED
@@ -0,0 +1 @@
+ ID_KEY: str = "id"
epub_translator/xml/deduplication.py CHANGED
@@ -1,8 +1,8 @@
  from xml.etree.ElementTree import Element

+ from .const import ID_KEY
  from .xml import iter_with_stack

- _ID_KEY = "id"
  _SUFFIX = "__translated"

@@ -11,9 +11,9 @@ def deduplicate_ids_in_element(element: Element) -> Element:
      original_id_count: dict[str, int] = {}

      for _, sub_element in iter_with_stack(element):
-         if _ID_KEY not in sub_element.attrib:
+         if ID_KEY not in sub_element.attrib:
              continue
-         original_id = sub_element.attrib[_ID_KEY]
+         original_id = sub_element.attrib[ID_KEY]

          if original_id not in seen_ids:
              seen_ids.add(original_id)
epub_translator/xml/self_closing.py ADDED
@@ -0,0 +1,182 @@
+ import re
+
+ # Some non-standard EPUB generators use HTML-style tags without self-closing syntax
+ # We need to convert them to XML-compatible format before parsing
+ # These are HTML5 void elements that must be self-closing in XHTML
+ _VOID_TAGS = (
+     "area",
+     "base",
+     "br",
+     "col",
+     "embed",
+     "hr",
+     "img",
+     "input",
+     "link",
+     "meta",
+     "param",
+     "source",
+     "track",
+     "wbr",
+ )
+
+
+ def self_close_void_elements(xml_content: str) -> str:
+     """
+     Convert void HTML elements to self-closing format for XML parsing.
+
+     This function handles non-standard HTML where void elements are not self-closed.
+     For illegal cases like <meta>content</meta>, the content is removed.
+
+     Args:
+         xml_content: HTML/XHTML content string
+
+     Returns:
+         Content with void elements in self-closing format
+
+     Example:
+         <meta charset="utf-8"> → <meta charset="utf-8" />
+         <br> → <br />
+         <meta>illegal</meta> → <meta />
+     """
+     for tag in _VOID_TAGS:
+         xml_content = _fix_void_element(xml_content, tag)
+     return xml_content
+
+
+ def _fix_void_element(content: str, tag_name: str) -> str:
+     """
+     Fix a specific void element in the content.
+
+     Strategy:
+     1. Find <tag ...> (not already self-closed)
+     2. Check if there's a matching </tag>
+     3. If yes, remove everything between them and make it self-closing
+     4. If no, just make the opening tag self-closing
+     """
+     result = []
+     pos = 0
+
+     while pos < len(content):
+         tag_start = content.find(f"<{tag_name}", pos)
+         if tag_start == -1:
+             result.append(content[pos:])
+             break
+
+         # Verify it's a complete tag match (not a prefix like <br matching <brain>)
+         # The character after tag_name must be >, /, or whitespace
+         check_pos = tag_start + len(f"<{tag_name}")
+         if check_pos < len(content):
+             next_char = content[check_pos]
+             if next_char not in (">", "/", " ", "\t", "\n", "\r"):
+                 result.append(content[pos:check_pos])
+                 pos = check_pos
+                 continue
+
+         result.append(content[pos:tag_start])
+         tag_end = _find_tag_end(content, tag_start)
+         if tag_end == -1:
+             result.append(content[tag_start:])
+             break
+
+         opening_tag = content[tag_start : tag_end + 1]
+
+         if opening_tag.rstrip().endswith("/>"):
+             result.append(opening_tag)
+             pos = tag_end + 1
+             continue
+
+         if not opening_tag.endswith(">"):
+             result.append(opening_tag)
+             pos = tag_end + 1
+             continue
+
+         closing_tag = f"</{tag_name}>"
+         closing_pos = content.find(closing_tag, tag_end + 1)
+
+         if closing_pos != -1:
+             attrs_part = opening_tag[len(f"<{tag_name}") : -1].rstrip()
+             if attrs_part:
+                 result.append(f"<{tag_name}{attrs_part} />")
+             else:
+                 result.append(f"<{tag_name} />")
+             pos = closing_pos + len(closing_tag)
+         else:
+             attrs_part = opening_tag[len(f"<{tag_name}") : -1].rstrip()
+             if attrs_part:
+                 result.append(f"<{tag_name}{attrs_part} />")
+             else:
+                 result.append(f"<{tag_name} />")
+             pos = tag_end + 1
+
+     return "".join(result)
+
+
+ def _find_tag_end(content: str, start_pos: int) -> int:
+     """
+     Find the end of an HTML tag (the position of >).
+
+     Handles quotes: ignores > inside quoted attribute values.
+     """
+     pos = start_pos
+     in_quote = None  # None, '"', or "'"
+
+     while pos < len(content):
+         char = content[pos]
+
+         if in_quote:
+             if char == in_quote:
+                 if pos > 0 and content[pos - 1] == "\\":
+                     pos += 1
+                     continue
+                 else:
+                     in_quote = None
+         else:
+             if char in ('"', "'"):
+                 in_quote = char
+             elif char == ">":
+                 return pos
+
+         pos += 1
+
+     return -1  # Not found
+
+
+ # For saving: match self-closing tags like <br /> or <br/>
+ # Capture tag name and everything between tag name and />
+ _VOID_TAG_CLOSE_PATTERN = re.compile(r"<(" + "|".join(_VOID_TAGS) + r")([^>]*?)\s*/>")
+
+
+ def unclose_void_elements(xml_content: str) -> str:
+     """
+     Convert void elements from self-closing to unclosed format for HTML compatibility.
+
+     Transforms self-closed void elements like <br /> back to <br> for
+     compatibility with HTML parsers that don't support XHTML syntax.
+     Used only for text/html media type files.
+
+     Args:
+         xml_content: HTML/XHTML content string
+
+     Returns:
+         Content with void elements in unclosed format
+
+     Example:
+         <meta charset="utf-8" /> → <meta charset="utf-8">
+         <br /> → <br>
+         <img src="test.png" /> → <img src="test.png">
+     """
+
+     def replacer(m: re.Match):
+         tag_name = m.group(1)
+         attrs = m.group(2).rstrip()  # Remove trailing whitespace
+         if attrs:
+             return f"<{tag_name}{attrs}>"
+         else:
+             return f"<{tag_name}>"
+
+     return re.sub(
+         pattern=_VOID_TAG_CLOSE_PATTERN,
+         repl=replacer,
+         string=xml_content,
+     )
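
A round trip through the two public functions, reusing the cases from the docstrings above:

    from epub_translator.xml.self_closing import (
        self_close_void_elements,
        unclose_void_elements,
    )

    html = '<head><meta charset="utf-8"><br></head>'

    # Before parsing: every void element becomes self-closing, so the
    # document is valid XML.
    xhtml = self_close_void_elements(html)
    assert xhtml == '<head><meta charset="utf-8" /><br /></head>'

    # On save, text/html files get the inverse transformation.
    assert unclose_void_elements(xhtml) == html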
epub_translator/xml/utils.py ADDED
@@ -0,0 +1,42 @@
+ from collections.abc import Generator
+ from xml.etree.ElementTree import Element
+
+ from ..utils import normalize_whitespace
+ from .const import ID_KEY
+
+
+ def normalize_text_in_element(text: str | None) -> str | None:
+     if text is None:
+         return None
+     text = normalize_whitespace(text)
+     if not text.strip():
+         return None
+     return text
+
+
+ def append_text_in_element(origin_text: str | None, append_text: str) -> str:
+     if origin_text is None:
+         return append_text
+     else:
+         return origin_text + append_text
+
+
+ def index_of_parent(parent: Element, checked_element: Element) -> int:
+     for i, child in enumerate(parent):
+         if child == checked_element:
+             return i
+     raise ValueError("Element not found in parent.")
+
+
+ def expand_left_element_texts(element: Element) -> Generator[str, None, None]:
+     yield "<"
+     yield element.tag
+     yield " "
+     yield ID_KEY
+     yield '="99">'
+
+
+ def expand_right_element_texts(element: Element) -> Generator[str, None, None]:
+     yield "</"
+     yield element.tag
+     yield ">"
epub_translator/xml/xml.py CHANGED
@@ -12,6 +12,13 @@ def find_first(element: Element, tag: str) -> Element | None:
      return None


+ def index_in_parent(parent: Element, element: Element) -> int | None:
+     for i, child in enumerate(parent):
+         if child is element:
+             return i
+     return None
+
+
  def iter_with_stack(element: Element) -> Generator[tuple[list[Element], Element], None, None]:
      """Pre-order traversal: yields (parent_path, element)."""
      stack: list[list[Element]] = [[element]]
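
index_in_parent complements index_of_parent from xml/utils.py above: it compares by object identity (is) and returns None for a missing child instead of raising ValueError. A quick sketch:

    from xml.etree.ElementTree import Element, SubElement

    from epub_translator.xml.xml import index_in_parent

    parent = Element("div")
    SubElement(parent, "p")
    second = SubElement(parent, "p")

    assert index_in_parent(parent, second) == 1           # located by identity
    assert index_in_parent(parent, Element("p")) is None  # absent child -> None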