epub-translator 0.1.6__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epub_translator-0.1.6 → epub_translator-0.1.7}/PKG-INFO +9 -8
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/data/translate.jinja +3 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/segment/__init__.py +1 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/segment/text_segment.py +5 -6
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/segment/utils.py +0 -16
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/translation/xml_interrupter.py +52 -28
- epub_translator-0.1.7/epub_translator/xml/inline.py +113 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/submitter.py +5 -5
- {epub_translator-0.1.6 → epub_translator-0.1.7}/pyproject.toml +2 -1
- epub_translator-0.1.6/epub_translator/data/mmltex/README.md +0 -67
- epub_translator-0.1.6/epub_translator/data/mmltex/cmarkup.xsl +0 -1106
- epub_translator-0.1.6/epub_translator/data/mmltex/entities.xsl +0 -459
- epub_translator-0.1.6/epub_translator/data/mmltex/glayout.xsl +0 -222
- epub_translator-0.1.6/epub_translator/data/mmltex/mmltex.xsl +0 -36
- epub_translator-0.1.6/epub_translator/data/mmltex/scripts.xsl +0 -375
- epub_translator-0.1.6/epub_translator/data/mmltex/tables.xsl +0 -130
- epub_translator-0.1.6/epub_translator/data/mmltex/tokens.xsl +0 -328
- epub_translator-0.1.6/epub_translator/xml/inline.py +0 -67
- {epub_translator-0.1.6 → epub_translator-0.1.7}/LICENSE +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/README.md +7 -7
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/__init__.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/data/fill.jinja +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/epub/__init__.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/epub/common.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/epub/math.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/epub/metadata.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/epub/spines.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/epub/toc.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/epub/zip.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/llm/__init__.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/llm/context.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/llm/core.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/llm/error.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/llm/executor.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/llm/increasable.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/llm/types.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/segment/block_segment.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/segment/common.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/segment/inline_segment.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/serial/__init__.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/serial/chunk.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/serial/segment.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/serial/splitter.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/template.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/translation/__init__.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/translation/epub_transcode.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/translation/language.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/translation/punctuation.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/translation/translator.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/utils.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/__init__.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/const.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/deduplication.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/friendly/__init__.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/friendly/decoder.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/friendly/encoder.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/friendly/parser.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/friendly/tag.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/friendly/transform.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/self_closing.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/utils.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/xml.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml/xml_like.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/__init__.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/callbacks.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/common.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/concurrency.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/hill_climbing.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/score.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/stream_mapper.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/translator.py +0 -0
- {epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/xml_translator/validation.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.1.7
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: epub,llm,translation,translator
|
|
@@ -24,6 +24,7 @@ Classifier: Topic :: Software Development :: Localization
|
|
|
24
24
|
Classifier: Topic :: Text Processing :: Markup
|
|
25
25
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
26
26
|
Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
|
|
27
|
+
Requires-Dist: mathml2latex (>=0.2.12,<0.3.0)
|
|
27
28
|
Requires-Dist: openai (>=2.14.0,<3.0.0)
|
|
28
29
|
Requires-Dist: resource-segmentation (>=0.0.7,<0.1.0)
|
|
29
30
|
Requires-Dist: tiktoken (>=0.12.0,<1.0.0)
|
|
@@ -59,6 +60,13 @@ Translate EPUB books using Large Language Models while preserving the original t
|
|
|
59
60
|
- **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
|
|
60
61
|
- **Caching**: Built-in caching for progress recovery when translation fails
|
|
61
62
|
|
|
63
|
+
## Use Cases
|
|
64
|
+
|
|
65
|
+
- **Language Learning**: Read books in their original language with side-by-side translations
|
|
66
|
+
- **Academic Research**: Access foreign literature with bilingual references
|
|
67
|
+
- **Content Localization**: Prepare books for international audiences
|
|
68
|
+
- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
|
|
69
|
+
|
|
62
70
|
## Installation
|
|
63
71
|
|
|
64
72
|
```bash
|
|
@@ -357,13 +365,6 @@ llm = LLM(
|
|
|
357
365
|
)
|
|
358
366
|
```
|
|
359
367
|
|
|
360
|
-
## Use Cases
|
|
361
|
-
|
|
362
|
-
- **Language Learning**: Read books in their original language with side-by-side translations
|
|
363
|
-
- **Academic Research**: Access foreign literature with bilingual references
|
|
364
|
-
- **Content Localization**: Prepare books for international audiences
|
|
365
|
-
- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
|
|
366
|
-
|
|
367
368
|
## Advanced Features
|
|
368
369
|
|
|
369
370
|
### Custom Translation Prompts
|
|
@@ -13,6 +13,9 @@ Translation rules:
|
|
|
13
13
|
{% if user_prompt -%}
|
|
14
14
|
User may provide additional requirements in <rules> tags before the source text. Follow them, but prioritize the rules above if conflicts arise.
|
|
15
15
|
|
|
16
|
+
<rules>
|
|
17
|
+
{{ user_prompt }}
|
|
18
|
+
</rules>
|
|
16
19
|
{% endif -%}
|
|
17
20
|
|
|
18
21
|
Output only the translated text, nothing else.
|
|
@@ -4,7 +4,7 @@ from enum import Enum, auto
|
|
|
4
4
|
from typing import Self
|
|
5
5
|
from xml.etree.ElementTree import Element
|
|
6
6
|
|
|
7
|
-
from ..xml import expand_left_element_texts, expand_right_element_texts,
|
|
7
|
+
from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_element, normalize_text_in_element
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class TextPosition(Enum):
|
|
@@ -100,7 +100,7 @@ def search_text_segments(root: Element) -> Generator[TextSegment, None, None]:
|
|
|
100
100
|
def _search_text_segments(stack: list[Element], element: Element) -> Generator[TextSegment, None, None]:
|
|
101
101
|
text = normalize_text_in_element(element.text)
|
|
102
102
|
next_stack = stack + [element]
|
|
103
|
-
next_block_depth =
|
|
103
|
+
next_block_depth = find_block_depth(next_stack)
|
|
104
104
|
|
|
105
105
|
if text is not None:
|
|
106
106
|
yield TextSegment(
|
|
@@ -125,12 +125,11 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
|
|
|
125
125
|
)
|
|
126
126
|
|
|
127
127
|
|
|
128
|
-
def
|
|
128
|
+
def find_block_depth(parent_stack: list[Element]) -> int:
|
|
129
129
|
index: int = 0
|
|
130
|
-
for i in range(len(parent_stack)
|
|
131
|
-
if not
|
|
130
|
+
for i in range(len(parent_stack)):
|
|
131
|
+
if not is_inline_element(parent_stack[i]):
|
|
132
132
|
index = i
|
|
133
|
-
break
|
|
134
133
|
return index + 1 # depth is a count not index
|
|
135
134
|
|
|
136
135
|
|
|
@@ -8,22 +8,6 @@ def element_fingerprint(element: Element) -> str:
|
|
|
8
8
|
return f"<{element.tag} {' '.join(attrs)}/>"
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def unwrap_parents(element: Element) -> tuple[Element, list[Element]]:
|
|
12
|
-
parents: list[Element] = []
|
|
13
|
-
while True:
|
|
14
|
-
if len(element) != 1:
|
|
15
|
-
break
|
|
16
|
-
child = element[0]
|
|
17
|
-
if not element.text:
|
|
18
|
-
break
|
|
19
|
-
if not child.tail:
|
|
20
|
-
break
|
|
21
|
-
parents.append(element)
|
|
22
|
-
element = child
|
|
23
|
-
element.tail = None
|
|
24
|
-
return element, parents
|
|
25
|
-
|
|
26
|
-
|
|
27
11
|
def id_in_element(element: Element) -> int | None:
|
|
28
12
|
id_str = element.get(ID_KEY, None)
|
|
29
13
|
if id_str is None:
|
{epub_translator-0.1.6 → epub_translator-0.1.7}/epub_translator/translation/xml_interrupter.py
RENAMED
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
from collections.abc import Generator, Iterable
|
|
2
2
|
from typing import cast
|
|
3
|
-
from xml.etree.ElementTree import Element
|
|
3
|
+
from xml.etree.ElementTree import Element, tostring
|
|
4
4
|
|
|
5
|
-
from
|
|
6
|
-
from
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from mathml2latex.mathml import process_mathml
|
|
7
|
+
|
|
8
|
+
from ..segment import TextSegment, combine_text_segments, find_block_depth
|
|
9
|
+
from ..utils import ensure_list
|
|
10
|
+
from ..xml import clone_element
|
|
7
11
|
|
|
8
12
|
_ID_KEY = "__XML_INTERRUPTER_ID"
|
|
9
13
|
_MATH_TAG = "math"
|
|
@@ -37,8 +41,10 @@ class XMLInterrupter:
|
|
|
37
41
|
def interrupt_block_element(self, element: Element) -> Element:
|
|
38
42
|
interrupted_element = self._placeholder2interrupted.pop(id(element), None)
|
|
39
43
|
if interrupted_element is None:
|
|
44
|
+
element.attrib.pop(_ID_KEY, None)
|
|
40
45
|
return element
|
|
41
46
|
else:
|
|
47
|
+
interrupted_element.attrib.pop(_ID_KEY, None)
|
|
42
48
|
return interrupted_element
|
|
43
49
|
|
|
44
50
|
def _expand_source_text_segment(self, text_segment: TextSegment):
|
|
@@ -81,14 +87,18 @@ class XMLInterrupter:
|
|
|
81
87
|
_ID_KEY: cast(str, interrupted_element.get(_ID_KEY)),
|
|
82
88
|
},
|
|
83
89
|
)
|
|
90
|
+
interrupted_display = interrupted_element.get("display", None)
|
|
91
|
+
if interrupted_display is not None:
|
|
92
|
+
placeholder_element.set("display", interrupted_display)
|
|
93
|
+
|
|
84
94
|
raw_parent_stack = text_segment.parent_stack[:interrupted_index]
|
|
85
95
|
parent_stack = raw_parent_stack + [placeholder_element]
|
|
86
96
|
merged_text_segment = TextSegment(
|
|
87
|
-
text=
|
|
97
|
+
text=self._render_latex(text_segments),
|
|
88
98
|
parent_stack=parent_stack,
|
|
89
99
|
left_common_depth=text_segments[0].left_common_depth,
|
|
90
100
|
right_common_depth=text_segments[-1].right_common_depth,
|
|
91
|
-
block_depth=
|
|
101
|
+
block_depth=find_block_depth(parent_stack),
|
|
92
102
|
position=text_segments[0].position,
|
|
93
103
|
)
|
|
94
104
|
self._placeholder2interrupted[id(placeholder_element)] = interrupted_element
|
|
@@ -116,8 +126,8 @@ class XMLInterrupter:
|
|
|
116
126
|
# 原始栈退光,仅留下相对 interrupted 元素的栈,这种格式与 translated 要求一致
|
|
117
127
|
text_segment.left_common_depth = max(0, text_segment.left_common_depth - interrupted_index)
|
|
118
128
|
text_segment.right_common_depth = max(0, text_segment.right_common_depth - interrupted_index)
|
|
119
|
-
text_segment.block_depth = 1
|
|
120
129
|
text_segment.parent_stack = text_segment.parent_stack[interrupted_index:]
|
|
130
|
+
text_segment.block_depth = find_block_depth(text_segment.parent_stack)
|
|
121
131
|
|
|
122
132
|
return merged_text_segment
|
|
123
133
|
|
|
@@ -129,37 +139,51 @@ class XMLInterrupter:
|
|
|
129
139
|
break
|
|
130
140
|
return interrupted_index
|
|
131
141
|
|
|
142
|
+
def _render_latex(self, text_segments: list[TextSegment]) -> str:
|
|
143
|
+
math_element, _ = next(combine_text_segments(text_segments))
|
|
144
|
+
while math_element.tag != _MATH_TAG:
|
|
145
|
+
if len(math_element) == 0:
|
|
146
|
+
return ""
|
|
147
|
+
math_element = math_element[0]
|
|
148
|
+
|
|
149
|
+
math_element = clone_element(math_element)
|
|
150
|
+
math_element.attrib.pop(_ID_KEY, None)
|
|
151
|
+
math_element.tail = None
|
|
152
|
+
latex: str | None = None
|
|
153
|
+
try:
|
|
154
|
+
mathml_str = tostring(math_element, encoding="unicode")
|
|
155
|
+
soup = BeautifulSoup(mathml_str, "html.parser")
|
|
156
|
+
latex = process_mathml(soup)
|
|
157
|
+
except Exception:
|
|
158
|
+
pass
|
|
159
|
+
|
|
160
|
+
if latex is None:
|
|
161
|
+
latex = "".join(t.text for t in text_segments)
|
|
162
|
+
elif math_element.get("display", None) == "inline":
|
|
163
|
+
latex = f"${latex}$"
|
|
164
|
+
else:
|
|
165
|
+
latex = f"$${latex}$$"
|
|
166
|
+
|
|
167
|
+
return f" {latex} "
|
|
168
|
+
|
|
132
169
|
def _expand_translated_text_segment(self, text_segment: TextSegment):
|
|
133
|
-
|
|
170
|
+
parent_element = text_segment.parent_stack[-1]
|
|
171
|
+
interrupted_id = parent_element.attrib.pop(_ID_KEY, None)
|
|
134
172
|
if interrupted_id is None:
|
|
135
173
|
yield text_segment
|
|
136
174
|
return
|
|
137
175
|
|
|
138
|
-
|
|
139
|
-
|
|
176
|
+
if parent_element is text_segment.block_parent:
|
|
177
|
+
# Block-level math, need to be hidden
|
|
140
178
|
return
|
|
141
179
|
|
|
142
|
-
|
|
143
|
-
if not
|
|
180
|
+
raw_text_segments = self._raw_text_segments.pop(interrupted_id, None)
|
|
181
|
+
if not raw_text_segments:
|
|
182
|
+
yield text_segment
|
|
144
183
|
return
|
|
145
184
|
|
|
146
185
|
for raw_text_segment in raw_text_segments:
|
|
186
|
+
text_basic_parent_stack = text_segment.parent_stack[:-1]
|
|
147
187
|
raw_text_segment.block_parent.attrib.pop(_ID_KEY, None)
|
|
188
|
+
raw_text_segment.parent_stack = text_basic_parent_stack + raw_text_segment.parent_stack
|
|
148
189
|
yield raw_text_segment
|
|
149
|
-
|
|
150
|
-
def _has_no_math_texts(self, element: Element):
|
|
151
|
-
if element.tag == _MATH_TAG:
|
|
152
|
-
return True
|
|
153
|
-
if element.text and normalize_whitespace(element.text).strip():
|
|
154
|
-
return False
|
|
155
|
-
for child_element in element:
|
|
156
|
-
if not self._has_no_math_texts(child_element):
|
|
157
|
-
return False
|
|
158
|
-
if child_element.tail and normalize_whitespace(child_element.tail).strip():
|
|
159
|
-
return False
|
|
160
|
-
return True
|
|
161
|
-
|
|
162
|
-
def _is_inline_math(self, element: Element) -> bool:
|
|
163
|
-
if element.tag != _MATH_TAG:
|
|
164
|
-
return False
|
|
165
|
-
return element.get("display", "").lower() != "block"
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from xml.etree.ElementTree import Element
|
|
2
|
+
|
|
3
|
+
# HTML inline-level elements
|
|
4
|
+
# Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
|
|
5
|
+
# Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
|
|
6
|
+
# Reference: https://developer.mozilla.org/en-US/docs/MathML/Element
|
|
7
|
+
_HTML_INLINE_TAGS = frozenset(
|
|
8
|
+
(
|
|
9
|
+
# Inline text semantics
|
|
10
|
+
"a",
|
|
11
|
+
"abbr",
|
|
12
|
+
"b",
|
|
13
|
+
"bdi",
|
|
14
|
+
"bdo",
|
|
15
|
+
"br",
|
|
16
|
+
"cite",
|
|
17
|
+
"code",
|
|
18
|
+
"data",
|
|
19
|
+
"dfn",
|
|
20
|
+
"em",
|
|
21
|
+
"i",
|
|
22
|
+
"kbd",
|
|
23
|
+
"mark",
|
|
24
|
+
"q",
|
|
25
|
+
"rp",
|
|
26
|
+
"rt",
|
|
27
|
+
"ruby",
|
|
28
|
+
"s",
|
|
29
|
+
"samp",
|
|
30
|
+
"small",
|
|
31
|
+
"span",
|
|
32
|
+
"strong",
|
|
33
|
+
"sub",
|
|
34
|
+
"sup",
|
|
35
|
+
"time",
|
|
36
|
+
"u",
|
|
37
|
+
"var",
|
|
38
|
+
"wbr",
|
|
39
|
+
# Image and multimedia
|
|
40
|
+
"img",
|
|
41
|
+
"svg",
|
|
42
|
+
"canvas",
|
|
43
|
+
"audio",
|
|
44
|
+
"video",
|
|
45
|
+
"map",
|
|
46
|
+
"area",
|
|
47
|
+
# Form elements
|
|
48
|
+
"input",
|
|
49
|
+
"button",
|
|
50
|
+
"select",
|
|
51
|
+
"textarea",
|
|
52
|
+
"label",
|
|
53
|
+
"output",
|
|
54
|
+
"progress",
|
|
55
|
+
"meter",
|
|
56
|
+
# Embedded content
|
|
57
|
+
"iframe",
|
|
58
|
+
"embed",
|
|
59
|
+
"object",
|
|
60
|
+
# Other inline elements
|
|
61
|
+
"script",
|
|
62
|
+
"del",
|
|
63
|
+
"ins",
|
|
64
|
+
"slot",
|
|
65
|
+
# MathML elements
|
|
66
|
+
# Token elements
|
|
67
|
+
"mi", # identifier
|
|
68
|
+
"mn", # number
|
|
69
|
+
"mo", # operator
|
|
70
|
+
"ms", # string literal
|
|
71
|
+
"mspace", # space
|
|
72
|
+
"mtext", # text
|
|
73
|
+
# General layout
|
|
74
|
+
"menclose", # enclosed content
|
|
75
|
+
"merror", # syntax error message
|
|
76
|
+
"mfenced", # parentheses (deprecated)
|
|
77
|
+
"mfrac", # fraction
|
|
78
|
+
"mpadded", # space around content
|
|
79
|
+
"mphantom", # invisible content
|
|
80
|
+
"mroot", # radical with index
|
|
81
|
+
"mrow", # grouped sub-expressions
|
|
82
|
+
"msqrt", # square root
|
|
83
|
+
"mstyle", # style change
|
|
84
|
+
# Scripts and limits
|
|
85
|
+
"mmultiscripts", # prescripts and tensor indices
|
|
86
|
+
"mover", # overscript
|
|
87
|
+
"mprescripts", # prescripts separator
|
|
88
|
+
"msub", # subscript
|
|
89
|
+
"msubsup", # subscript-superscript pair
|
|
90
|
+
"msup", # superscript
|
|
91
|
+
"munder", # underscript
|
|
92
|
+
"munderover", # underscript-overscript pair
|
|
93
|
+
# Table math
|
|
94
|
+
"mtable", # table or matrix
|
|
95
|
+
"mtr", # row in table or matrix
|
|
96
|
+
"mtd", # cell in table or matrix
|
|
97
|
+
# Semantic annotations
|
|
98
|
+
"annotation", # data annotation
|
|
99
|
+
"annotation-xml", # XML annotation
|
|
100
|
+
"semantics", # semantic annotation container
|
|
101
|
+
# Other
|
|
102
|
+
"maction", # bind actions to sub-expressions (deprecated)
|
|
103
|
+
)
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def is_inline_element(element: Element) -> bool:
|
|
108
|
+
if element.tag.lower() in _HTML_INLINE_TAGS:
|
|
109
|
+
return True
|
|
110
|
+
display = element.get("display", None)
|
|
111
|
+
if display is not None and display.lower() == "inline":
|
|
112
|
+
return True
|
|
113
|
+
return False
|
|
@@ -4,7 +4,7 @@ from enum import Enum, auto
|
|
|
4
4
|
from xml.etree.ElementTree import Element
|
|
5
5
|
|
|
6
6
|
from ..segment import TextSegment, combine_text_segments
|
|
7
|
-
from ..xml import index_of_parent,
|
|
7
|
+
from ..xml import index_of_parent, is_inline_element, iter_with_stack
|
|
8
8
|
from .stream_mapper import InlineSegmentMapping
|
|
9
9
|
|
|
10
10
|
|
|
@@ -78,7 +78,7 @@ class _Submitter:
|
|
|
78
78
|
preserved_elements: list[Element] = []
|
|
79
79
|
if self._action == SubmitKind.REPLACE:
|
|
80
80
|
for child in list(node.raw_element):
|
|
81
|
-
if not
|
|
81
|
+
if not is_inline_element(child):
|
|
82
82
|
child.tail = None
|
|
83
83
|
preserved_elements.append(child)
|
|
84
84
|
|
|
@@ -87,7 +87,7 @@ class _Submitter:
|
|
|
87
87
|
|
|
88
88
|
if combined is not None:
|
|
89
89
|
# 在 APPEND_BLOCK 模式下,如果是 inline tag,则在文本前面加空格
|
|
90
|
-
if self._action == SubmitKind.APPEND_BLOCK and
|
|
90
|
+
if self._action == SubmitKind.APPEND_BLOCK and is_inline_element(combined) and combined.text:
|
|
91
91
|
combined.text = " " + combined.text
|
|
92
92
|
parent.insert(index + 1, combined)
|
|
93
93
|
index += 1
|
|
@@ -200,7 +200,7 @@ class _Submitter:
|
|
|
200
200
|
preserved_elements: list[Element] = []
|
|
201
201
|
for i in range(start_index, end_index):
|
|
202
202
|
elem = node_element[i]
|
|
203
|
-
if not
|
|
203
|
+
if not is_inline_element(elem):
|
|
204
204
|
elem.tail = None
|
|
205
205
|
preserved_elements.append(elem)
|
|
206
206
|
|
|
@@ -223,7 +223,7 @@ class _Submitter:
|
|
|
223
223
|
|
|
224
224
|
if combined.text:
|
|
225
225
|
will_inject_space = self._action == SubmitKind.APPEND_TEXT or (
|
|
226
|
-
|
|
226
|
+
is_inline_element(combined) and self._action == SubmitKind.APPEND_BLOCK
|
|
227
227
|
)
|
|
228
228
|
if tail_element is not None:
|
|
229
229
|
tail_element.tail = self._append_text_in_element(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "epub-translator"
|
|
3
|
-
version = "0.1.6"
|
|
3
|
+
version = "0.1.7"
|
|
4
4
|
description = "Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text."
|
|
5
5
|
keywords = ["epub", "llm", "translation", "translator"]
|
|
6
6
|
authors = [
|
|
@@ -33,6 +33,7 @@ dependencies = [
|
|
|
33
33
|
"jinja2>=3.1.6,<4.0.0",
|
|
34
34
|
"resource-segmentation>=0.0.7,<0.1.0",
|
|
35
35
|
"openai>=2.14.0,<3.0.0",
|
|
36
|
+
"mathml2latex (>=0.2.12,<0.3.0)",
|
|
36
37
|
]
|
|
37
38
|
|
|
38
39
|
[project.urls]
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
# XSLT MathML Library
|
|
2
|
-
|
|
3
|
-
This directory contains XSLT stylesheets from the **XSLT MathML Library 2.1.2**, a set of XSLT stylesheets to transform MathML 2.0 to LaTeX.
|
|
4
|
-
|
|
5
|
-
**Note**: These files are included for reference purposes. Our project uses a custom Python implementation to convert MathML to LaTeX, but we keep these XSLT files as a reference for understanding MathML element mappings and conversion rules.
|
|
6
|
-
|
|
7
|
-
## File Manifest
|
|
8
|
-
|
|
9
|
-
- `mmltex.xsl` - Main stylesheet
|
|
10
|
-
- `tokens.xsl` - Token elements (mi, mn, mo, etc.)
|
|
11
|
-
- `glayout.xsl` - Layout elements (mfrac, msqrt, etc.)
|
|
12
|
-
- `scripts.xsl` - Script elements (msub, msup, etc.)
|
|
13
|
-
- `tables.xsl` - Table elements (mtable, mtr, mtd)
|
|
14
|
-
- `entities.xsl` - Entity definitions
|
|
15
|
-
- `cmarkup.xsl` - Content markup elements
|
|
16
|
-
|
|
17
|
-
## Original Project Information
|
|
18
|
-
|
|
19
|
-
**Original Author**: Vasil Yaroshevich
|
|
20
|
-
|
|
21
|
-
**Original Website**: http://www.raleigh.ru/MathML/mmltex/
|
|
22
|
-
|
|
23
|
-
**Archived Links**:
|
|
24
|
-
- Sourceforge Project: https://sourceforge.net/projects/xsltml/files/xsltml/
|
|
25
|
-
- Archived Documentation: https://web.archive.org/web/20160109063934/http://www.raleigh.ru/MathML/mmltex/index.php
|
|
26
|
-
- Google Translated (English): https://translate.google.com/translate?sl=ru&tl=en&u=https%3A%2F%2Fweb.archive.org%2Fweb%2F20160114170851%2Fhttp%3A%2F%2Fwww.raleigh.ru%2FMathML%2Fmmltex%2Findex.php
|
|
27
|
-
|
|
28
|
-
---
|
|
29
|
-
|
|
30
|
-
## Copyright
|
|
31
|
-
|
|
32
|
-
Copyright (C) 2001-2003 Vasil Yaroshevich
|
|
33
|
-
|
|
34
|
-
Permission is hereby granted, free of charge, to any person
|
|
35
|
-
obtaining a copy of this software and associated documentation
|
|
36
|
-
files (the "Software"), to deal in the Software without
|
|
37
|
-
restriction, including without limitation the rights to use,
|
|
38
|
-
copy, modify, merge, publish, distribute, sublicense, and/or
|
|
39
|
-
sell copies of the Software, and to permit persons to whom the
|
|
40
|
-
Software is furnished to do so, subject to the following
|
|
41
|
-
conditions:
|
|
42
|
-
|
|
43
|
-
The above copyright notice and this permission notice shall be
|
|
44
|
-
included in all copies or substantial portions of the Software.
|
|
45
|
-
|
|
46
|
-
Except as contained in this notice, the names of individuals
|
|
47
|
-
credited with contribution to this software shall not be used in
|
|
48
|
-
advertising or otherwise to promote the sale, use or other
|
|
49
|
-
dealings in this Software without prior written authorization
|
|
50
|
-
from the individuals in question.
|
|
51
|
-
|
|
52
|
-
Any stylesheet derived from this Software that is publically
|
|
53
|
-
distributed will be identified with a different name and the
|
|
54
|
-
version strings in any derived Software will be changed so that
|
|
55
|
-
no possibility of confusion between the derived package and this
|
|
56
|
-
Software will exist.
|
|
57
|
-
|
|
58
|
-
## Warranty
|
|
59
|
-
|
|
60
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
61
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
62
|
-
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
63
|
-
NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER
|
|
64
|
-
CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
65
|
-
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
66
|
-
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
67
|
-
OTHER DEALINGS IN THE SOFTWARE.
|