epub-translator 0.0.7__py3-none-any.whl → 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +4 -2
- epub_translator/data/fill.jinja +66 -0
- epub_translator/data/mmltex/README.md +67 -0
- epub_translator/data/mmltex/cmarkup.xsl +1106 -0
- epub_translator/data/mmltex/entities.xsl +459 -0
- epub_translator/data/mmltex/glayout.xsl +222 -0
- epub_translator/data/mmltex/mmltex.xsl +36 -0
- epub_translator/data/mmltex/scripts.xsl +375 -0
- epub_translator/data/mmltex/tables.xsl +130 -0
- epub_translator/data/mmltex/tokens.xsl +328 -0
- epub_translator/data/translate.jinja +15 -12
- epub_translator/epub/__init__.py +4 -2
- epub_translator/epub/common.py +43 -0
- epub_translator/epub/math.py +193 -0
- epub_translator/epub/placeholder.py +53 -0
- epub_translator/epub/spines.py +42 -0
- epub_translator/epub/toc.py +505 -0
- epub_translator/epub/zip.py +67 -0
- epub_translator/iter_sync.py +24 -0
- epub_translator/language.py +23 -0
- epub_translator/llm/__init__.py +2 -1
- epub_translator/llm/core.py +175 -0
- epub_translator/llm/error.py +38 -35
- epub_translator/llm/executor.py +159 -136
- epub_translator/llm/increasable.py +28 -28
- epub_translator/llm/types.py +17 -0
- epub_translator/serial/__init__.py +2 -0
- epub_translator/serial/chunk.py +52 -0
- epub_translator/serial/segment.py +17 -0
- epub_translator/serial/splitter.py +50 -0
- epub_translator/template.py +35 -33
- epub_translator/translator.py +205 -178
- epub_translator/utils.py +7 -0
- epub_translator/xml/__init__.py +4 -3
- epub_translator/xml/deduplication.py +38 -0
- epub_translator/xml/firendly/__init__.py +2 -0
- epub_translator/xml/firendly/decoder.py +75 -0
- epub_translator/xml/firendly/encoder.py +84 -0
- epub_translator/xml/firendly/parser.py +177 -0
- epub_translator/xml/firendly/tag.py +118 -0
- epub_translator/xml/firendly/transform.py +36 -0
- epub_translator/xml/xml.py +52 -0
- epub_translator/xml/xml_like.py +176 -0
- epub_translator/xml_translator/__init__.py +3 -0
- epub_translator/xml_translator/const.py +2 -0
- epub_translator/xml_translator/fill.py +128 -0
- epub_translator/xml_translator/format.py +282 -0
- epub_translator/xml_translator/fragmented.py +125 -0
- epub_translator/xml_translator/group.py +183 -0
- epub_translator/xml_translator/progressive_locking.py +256 -0
- epub_translator/xml_translator/submitter.py +102 -0
- epub_translator/xml_translator/text_segment.py +263 -0
- epub_translator/xml_translator/translator.py +178 -0
- epub_translator/xml_translator/utils.py +29 -0
- epub_translator-0.1.0.dist-info/METADATA +283 -0
- epub_translator-0.1.0.dist-info/RECORD +58 -0
- epub_translator/data/format.jinja +0 -33
- epub_translator/epub/content_parser.py +0 -162
- epub_translator/epub/html/__init__.py +0 -1
- epub_translator/epub/html/dom_operator.py +0 -68
- epub_translator/epub/html/empty_tags.py +0 -23
- epub_translator/epub/html/file.py +0 -80
- epub_translator/epub/html/texts_searcher.py +0 -46
- epub_translator/llm/node.py +0 -201
- epub_translator/translation/__init__.py +0 -2
- epub_translator/translation/chunk.py +0 -118
- epub_translator/translation/splitter.py +0 -78
- epub_translator/translation/store.py +0 -36
- epub_translator/translation/translation.py +0 -231
- epub_translator/translation/types.py +0 -45
- epub_translator/translation/utils.py +0 -11
- epub_translator/xml/decoder.py +0 -71
- epub_translator/xml/encoder.py +0 -95
- epub_translator/xml/parser.py +0 -172
- epub_translator/xml/tag.py +0 -93
- epub_translator/xml/transform.py +0 -34
- epub_translator/xml/utils.py +0 -12
- epub_translator/zip_context.py +0 -74
- epub_translator-0.0.7.dist-info/METADATA +0 -170
- epub_translator-0.0.7.dist-info/RECORD +0 -36
- {epub_translator-0.0.7.dist-info → epub_translator-0.1.0.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.7.dist-info → epub_translator-0.1.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
from xml.etree.ElementTree import Element
|
|
2
|
+
|
|
3
|
+
# 运算符映射表
|
|
4
|
+
_OPERATOR_MAP = {
|
|
5
|
+
"→": r"\rightarrow",
|
|
6
|
+
"←": r"\leftarrow",
|
|
7
|
+
"↔": r"\leftrightarrow",
|
|
8
|
+
"×": r"\times",
|
|
9
|
+
"·": r"\cdot",
|
|
10
|
+
"÷": r"\div",
|
|
11
|
+
"±": r"\pm",
|
|
12
|
+
"∓": r"\mp",
|
|
13
|
+
"≤": r"\leq",
|
|
14
|
+
"≥": r"\geq",
|
|
15
|
+
"≠": r"\neq",
|
|
16
|
+
"≈": r"\approx",
|
|
17
|
+
"∞": r"\infty",
|
|
18
|
+
"∫": r"\int",
|
|
19
|
+
"∑": r"\sum",
|
|
20
|
+
"∏": r"\prod",
|
|
21
|
+
"√": r"\sqrt",
|
|
22
|
+
"∂": r"\partial",
|
|
23
|
+
"∇": r"\nabla",
|
|
24
|
+
"∈": r"\in",
|
|
25
|
+
"∉": r"\notin",
|
|
26
|
+
"⊂": r"\subset",
|
|
27
|
+
"⊃": r"\supset",
|
|
28
|
+
"⊆": r"\subseteq",
|
|
29
|
+
"⊇": r"\supseteq",
|
|
30
|
+
"∪": r"\cup",
|
|
31
|
+
"∩": r"\cap",
|
|
32
|
+
"∅": r"\emptyset",
|
|
33
|
+
"∀": r"\forall",
|
|
34
|
+
"∃": r"\exists",
|
|
35
|
+
"¬": r"\neg",
|
|
36
|
+
"∧": r"\land",
|
|
37
|
+
"∨": r"\lor",
|
|
38
|
+
"α": r"\alpha",
|
|
39
|
+
"β": r"\beta",
|
|
40
|
+
"γ": r"\gamma",
|
|
41
|
+
"δ": r"\delta",
|
|
42
|
+
"ε": r"\epsilon",
|
|
43
|
+
"θ": r"\theta",
|
|
44
|
+
"λ": r"\lambda",
|
|
45
|
+
"μ": r"\mu",
|
|
46
|
+
"π": r"\pi",
|
|
47
|
+
"σ": r"\sigma",
|
|
48
|
+
"φ": r"\phi",
|
|
49
|
+
"ω": r"\omega",
|
|
50
|
+
"Δ": r"\Delta",
|
|
51
|
+
"Σ": r"\Sigma",
|
|
52
|
+
"Ω": r"\Omega",
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _local_name(element: Element) -> str:
    """Return the element's tag without a ``{namespace}`` or ``prefix:`` part.

    Comment and processing-instruction nodes carry a non-string tag; they
    yield an empty name so callers treat them as unknown elements.
    """
    tag = element.tag
    if not isinstance(tag, str):
        return ""
    return tag.rsplit("}", 1)[-1].rsplit(":", 1)[-1]


def _ends_with_control_word(latex: str) -> bool:
    r"""Tell whether ``latex`` ends in a control word such as ``\times``."""
    stripped = latex.rstrip("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    return len(stripped) < len(latex) and stripped.endswith("\\")


def _join_latex(parts: list[str]) -> str:
    r"""Concatenate LaTeX fragments without fusing a command into a letter.

    ``\times`` followed directly by ``b`` would read as the undefined command
    ``\timesb``; a single space is inserted in exactly that situation and
    nowhere else.
    """
    pieces: list[str] = []
    previous = ""
    for part in parts:
        if not part:
            continue
        starts_with_letter = part[0].isascii() and part[0].isalpha()
        if starts_with_letter and _ends_with_control_word(previous):
            pieces.append(" ")
        pieces.append(part)
        previous = part
    return "".join(pieces)


def xml_to_latex(element: Element) -> str:
    r"""Convert a MathML element (and its subtree) into a LaTeX string.

    Tags are matched by local name, so namespaced elements are handled the
    same way as bare ones. Unknown elements contribute only the rendering of
    their children.

    Args:
        element: The MathML node to render, typically a ``math`` root.

    Returns:
        The LaTeX source. Constructs that lack their required number of
        children (for example an ``mfrac`` with one child, or an ``mtable``
        without rows) render as an empty string.
    """
    tag = _local_name(element)
    children = list(element)

    # Token elements. MathML ignores surrounding whitespace in identifiers
    # and numbers, so it is trimmed here as well.
    if tag == "mi":
        identifier = (element.text or "").strip()
        # Multi-character identifiers (e.g. function names) are set upright.
        if len(identifier) > 1:
            return f"\\mathrm{{{identifier}}}"
        return identifier

    if tag == "mn":
        return (element.text or "").strip()

    if tag == "mo":
        symbol = (element.text or "").strip()
        return _OPERATOR_MAP.get(symbol, symbol)

    if tag == "mtext":
        text = element.text or ""
        return f"\\text{{{text}}}"

    if tag == "mspace":
        return r"\,"

    if tag == "msqrt":
        radicand = _join_latex([xml_to_latex(child) for child in children])
        return f"\\sqrt{{{radicand}}}"

    # Constructs built from exactly two operands; extra children are ignored.
    if tag in ("mfrac", "msub", "msup", "mroot", "munder", "mover"):
        if len(children) < 2:
            return ""
        first = xml_to_latex(children[0])
        second = xml_to_latex(children[1])
        if tag == "mfrac":
            return f"\\frac{{{first}}}{{{second}}}"
        if tag == "msub":
            return f"{first}_{{{second}}}"
        if tag == "msup":
            return f"{first}^{{{second}}}"
        if tag == "mroot":
            # MathML order is (base, index); LaTeX wants the index first.
            return f"\\sqrt[{second}]{{{first}}}"
        if tag == "munder":
            return f"\\underset{{{second}}}{{{first}}}"
        return f"\\overset{{{second}}}{{{first}}}"

    # Constructs built from a base plus a lower and an upper attachment.
    if tag in ("msubsup", "munderover"):
        if len(children) < 3:
            return ""
        base = xml_to_latex(children[0])
        lower = xml_to_latex(children[1])
        upper = xml_to_latex(children[2])
        # Big operators take their limits with plain sub/superscript syntax.
        if tag == "msubsup" or base.strip() in (r"\sum", r"\int", r"\prod"):
            return f"{base}_{{{lower}}}^{{{upper}}}"
        return f"\\overset{{{upper}}}{{\\underset{{{lower}}}{{{base}}}}}"

    if tag == "mtable":
        rows = [row for row in children if _local_name(row) == "mtr"]
        if not rows:
            return ""
        # Count cells structurally instead of counting "&" in rendered text,
        # which would miscount nested tables or literal ampersands.
        columns = max(
            1,
            max(
                sum(1 for cell in row if _local_name(cell) == "mtd")
                for row in rows
            ),
        )
        column_spec = "c" * columns
        body = "\\\\\n".join(xml_to_latex(row) for row in rows)
        return f"\\begin{{array}}{{{column_spec}}}\n" + body + "\n\\end{array}"

    if tag == "mtr":
        cells = [
            xml_to_latex(cell) for cell in children if _local_name(cell) == "mtd"
        ]
        return " & ".join(cells)

    # math, mrow, mtd and any unknown element: render the children in order.
    return _join_latex([xml_to_latex(child) for child in children])
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from xml.etree.ElementTree import Element
|
|
3
|
+
|
|
4
|
+
from .math import xml_to_latex
|
|
5
|
+
|
|
6
|
+
_MATH_TAG = "math"
|
|
7
|
+
_EXPRESSION_TAG = "expression"
|
|
8
|
+
|
|
9
|
+
_PLACEHOLDER_TAGS = frozenset((_EXPRESSION_TAG,))
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def is_placeholder_tag(tag: str) -> bool:
    """Report whether ``tag`` is one of the placeholder tags."""
    is_known = tag in _PLACEHOLDER_TAGS
    return is_known
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Placeholder:
    """Temporarily swap ``math`` elements for ``expression`` stand-ins.

    Construction rewrites the tree under ``root`` in place: each element whose
    tag is ``math`` is replaced by an ``expression`` element whose text is the
    LaTeX rendering of that math element. ``recover()`` puts the original
    elements back.
    """

    def __init__(self, root: Element):
        """Replace the math elements below ``root`` in place.

        Args:
            root: Tree to rewrite. It must not itself be a ``math`` element,
                because the root cannot be swapped out in place.
        """
        # id(stand-in) -> (stand-in, original). Holding the stand-in in the
        # value keeps it alive, so its id() cannot be handed to an unrelated
        # element if the stand-in is dropped from the tree in the meantime.
        self._raw_elements: dict[int, tuple[Element, Element]] = {}
        self._root: Element = self._replace(
            element=root,
            replace=self._replace_raw,
        )
        assert self._root is root, "root itself must not be a math element"

    def recover(self) -> None:
        """Put every recorded original element back in place of its stand-in."""
        self._replace(
            element=self._root,
            replace=self._recover_to_raw,
        )

    def _replace(self, element: Element, replace: Callable[[Element], Element | None]) -> Element:
        """Apply ``replace`` depth-first and return the resulting element.

        When ``replace`` returns an element, that element is used as-is and
        its subtree is not visited; otherwise the children are processed
        recursively and ``element`` itself is returned.
        """
        replaced = replace(element)
        if replaced is not None:
            return replaced
        if len(element):
            element[:] = [self._replace(child, replace) for child in element]
        return element

    def _replace_raw(self, element: Element) -> Element | None:
        """Build and register a stand-in for a math element, else return None."""
        if element.tag != _MATH_TAG:
            return None
        replaced = Element(_EXPRESSION_TAG)
        replaced.text = xml_to_latex(element)
        # The tail is the text following the element inside its parent; it
        # has to travel with whichever element occupies the slot.
        replaced.tail = element.tail
        self._raw_elements[id(replaced)] = (replaced, element)
        return replaced

    def _recover_to_raw(self, replaced: Element) -> Element | None:
        """Return the original element for a registered stand-in, else None."""
        entry = self._raw_elements.pop(id(replaced), None)
        if entry is None:
            return None
        _, raw = entry
        # The stand-in has been occupying the original's slot, so the tail it
        # carries now is the current text after the expression. Copy it back
        # so changes made in the meantime are not overwritten by the stale
        # tail still stored on the original element.
        raw.tail = replaced.tail
        return raw
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from collections.abc import Generator
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from xml.etree import ElementTree as ET
|
|
4
|
+
|
|
5
|
+
from .common import find_opf_path, strip_namespace
|
|
6
|
+
from .zip import Zip
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def search_spine_paths(zip: Zip) -> Generator[Path, None, None]:
    """Yield the paths of the (X)HTML documents listed in the OPF spine.

    The OPF package document is located with ``find_opf_path``, parsed, and
    its ``manifest`` items are indexed by id. Each spine ``itemref`` is then
    resolved through that index, in spine order.

    Args:
        zip: Archive wrapper used to locate and read the OPF document.

    Yields:
        ``opf_dir / href`` for every spine entry whose manifest media type is
        ``application/xhtml+xml`` or ``text/html``. Nothing is yielded when
        the document has no ``manifest`` or no ``spine`` element.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import unquote

    opf_path = find_opf_path(zip)
    opf_dir = opf_path.parent

    with zip.read(opf_path) as f:
        root = ET.fromstring(f.read())
    # Presumably drops XML namespaces so the unqualified lookups below match;
    # the helper lives in .common and is not visible here.
    strip_namespace(root)

    manifest = root.find(".//manifest")
    if manifest is None:
        return

    # Manifest id -> (href, media type); items lacking an id or href are skipped.
    manifest_items: dict[str, tuple[str, str]] = {}
    for item in manifest.findall("item"):
        item_id = item.get("id")
        item_href = item.get("href")
        if item_id and item_href:
            manifest_items[item_id] = (item_href, item.get("media-type", ""))

    spine = root.find(".//spine")
    if spine is None:
        return

    for itemref in spine.findall("itemref"):
        idref = itemref.get("idref")
        if not idref or idref not in manifest_items:
            continue
        href, media_type = manifest_items[idref]
        if media_type not in ("application/xhtml+xml", "text/html"):
            continue
        # Manifest hrefs are URL references and may be percent-encoded
        # ("chapter%201.xhtml"); decode them to get the actual path.
        yield opf_dir / unquote(href)
|