epub-translator 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/data/translate.jinja +3 -0
- epub_translator/llm/core.py +19 -1
- epub_translator/llm/executor.py +5 -0
- epub_translator/llm/statistics.py +25 -0
- epub_translator/segment/__init__.py +1 -0
- epub_translator/segment/text_segment.py +10 -6
- epub_translator/segment/utils.py +0 -16
- epub_translator/translation/xml_interrupter.py +54 -27
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/inline.py +55 -2
- epub_translator/xml_translator/submitter.py +5 -5
- {epub_translator-0.1.6.dist-info → epub_translator-0.1.8.dist-info}/METADATA +108 -8
- {epub_translator-0.1.6.dist-info → epub_translator-0.1.8.dist-info}/RECORD +15 -22
- epub_translator/data/mmltex/README.md +0 -67
- epub_translator/data/mmltex/cmarkup.xsl +0 -1106
- epub_translator/data/mmltex/entities.xsl +0 -459
- epub_translator/data/mmltex/glayout.xsl +0 -222
- epub_translator/data/mmltex/mmltex.xsl +0 -36
- epub_translator/data/mmltex/scripts.xsl +0 -375
- epub_translator/data/mmltex/tables.xsl +0 -130
- epub_translator/data/mmltex/tokens.xsl +0 -328
- {epub_translator-0.1.6.dist-info → epub_translator-0.1.8.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.6.dist-info → epub_translator-0.1.8.dist-info}/WHEEL +0 -0

epub_translator/data/translate.jinja
CHANGED
@@ -13,6 +13,9 @@ Translation rules:
 {% if user_prompt -%}
 User may provide additional requirements in <rules> tags before the source text. Follow them, but prioritize the rules above if conflicts arise.

+<rules>
+{{ user_prompt }}
+</rules>
 {% endif -%}

 Output only the translated text, nothing else.
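
The change is easiest to see rendered. A minimal sketch (not part of the package) that renders just the added block with jinja2, which the package already depends on:

```python
# Illustrative only: render the new <rules> block in isolation.
from jinja2 import Template

snippet = Template(
    "{% if user_prompt -%}\n"
    "<rules>\n"
    "{{ user_prompt }}\n"
    "</rules>\n"
    "{% endif -%}"
)
print(snippet.render(user_prompt="Keep personal names untranslated."))
# <rules>
# Keep personal names untranslated.
# </rules>
```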
epub_translator/llm/core.py
CHANGED
@@ -13,6 +13,7 @@ from ..template import create_env
 from .context import LLMContext
 from .executor import LLMExecutor
 from .increasable import Increasable
+from .statistics import Statistics
 from .types import Message

 # Global state for logger filename generation
@@ -44,7 +45,7 @@ class LLM:
         self._temperature: Increasable = Increasable(temperature)
         self._cache_path: Path | None = self._ensure_dir_path(cache_path)
         self._logger_save_path: Path | None = self._ensure_dir_path(log_dir_path)
-
+        self._statistics = Statistics()
         self._executor = LLMExecutor(
             url=url,
             model=model,
@@ -53,12 +54,29 @@ class LLM:
             retry_times=retry_times,
             retry_interval_seconds=retry_interval_seconds,
             create_logger=self._create_logger,
+            statistics=self._statistics,
         )

     @property
     def encoding(self) -> Encoding:
         return self._encoding

+    @property
+    def total_tokens(self) -> int:
+        return self._statistics.total_tokens
+
+    @property
+    def input_tokens(self) -> int:
+        return self._statistics.input_tokens
+
+    @property
+    def input_cache_tokens(self) -> int:
+        return self._statistics.input_cache_tokens
+
+    @property
+    def output_tokens(self) -> int:
+        return self._statistics.output_tokens
+
     def context(self, cache_seed_content: str | None = None) -> LLMContext:
         return LLMContext(
             executor=self._executor,
epub_translator/llm/executor.py
CHANGED
@@ -7,6 +7,7 @@ from openai import OpenAI
 from openai.types.chat import ChatCompletionMessageParam

 from .error import is_retry_error
+from .statistics import Statistics
 from .types import Message, MessageRole


@@ -20,12 +21,14 @@ class LLMExecutor:
         retry_times: int,
         retry_interval_seconds: float,
         create_logger: Callable[[], Logger | None],
+        statistics: Statistics,
     ) -> None:
         self._model_name: str = model
         self._timeout: float | None = timeout
         self._retry_times: int = retry_times
         self._retry_interval_seconds: float = retry_interval_seconds
         self._create_logger: Callable[[], Logger | None] = create_logger
+        self._statistics = statistics
         self._client = OpenAI(
             api_key=api_key,
             base_url=url,
@@ -156,6 +159,7 @@ class LLMExecutor:
             model=self._model_name,
             messages=messages,
             stream=True,
+            stream_options={"include_usage": True},
             top_p=top_p,
             temperature=temperature,
             max_tokens=max_tokens,
@@ -164,4 +168,5 @@
         for chunk in stream:
             if chunk.choices and chunk.choices[0].delta.content:
                 buffer.write(chunk.choices[0].delta.content)
+            self._statistics.submit_usage(chunk.usage)
         return buffer.getvalue()

epub_translator/llm/statistics.py
ADDED
@@ -0,0 +1,25 @@
+from threading import Lock
+
+from openai.types import CompletionUsage
+
+
+class Statistics:
+    def __init__(self) -> None:
+        self._lock = Lock()
+        self.total_tokens = 0
+        self.input_tokens = 0
+        self.input_cache_tokens = 0
+        self.output_tokens = 0
+
+    def submit_usage(self, usage: CompletionUsage | None) -> None:
+        if usage is None:
+            return
+        with self._lock:
+            if usage.total_tokens:
+                self.total_tokens += usage.total_tokens
+            if usage.prompt_tokens:
+                self.input_tokens += usage.prompt_tokens
+            if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens:
+                self.input_cache_tokens += usage.prompt_tokens_details.cached_tokens
+            if usage.completion_tokens:
+                self.output_tokens += usage.completion_tokens
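
A minimal sketch (not part of the package, and not a documented public API) of how the new `Statistics` object is fed: `LLMExecutor` now streams with `stream_options={"include_usage": True}`, so the final chunk carries a `CompletionUsage` that is handed to `submit_usage`; earlier chunks report `None` and are ignored.

```python
from openai.types import CompletionUsage

from epub_translator.llm.statistics import Statistics  # import path per the file location above

stats = Statistics()
stats.submit_usage(None)  # most streamed chunks carry no usage payload
stats.submit_usage(
    CompletionUsage(prompt_tokens=120, completion_tokens=45, total_tokens=165)
)
print(stats.total_tokens, stats.input_tokens, stats.output_tokens)  # 165 120 45
```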

epub_translator/segment/text_segment.py
CHANGED
@@ -4,7 +4,12 @@ from enum import Enum, auto
 from typing import Self
 from xml.etree.ElementTree import Element

-from ..xml import
+from ..xml import (
+    expand_left_element_texts,
+    expand_right_element_texts,
+    is_inline_element,
+    normalize_text_in_element,
+)


 class TextPosition(Enum):
@@ -100,7 +105,7 @@ def search_text_segments(root: Element) -> Generator[TextSegment, None, None]:
 def _search_text_segments(stack: list[Element], element: Element) -> Generator[TextSegment, None, None]:
     text = normalize_text_in_element(element.text)
     next_stack = stack + [element]
-    next_block_depth =
+    next_block_depth = find_block_depth(next_stack)

     if text is not None:
         yield TextSegment(
@@ -125,12 +130,11 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
         )


-def
+def find_block_depth(parent_stack: list[Element]) -> int:
     index: int = 0
-    for i in range(len(parent_stack)
-        if not
+    for i in range(len(parent_stack)):
+        if not is_inline_element(parent_stack[i]):
             index = i
-            break
     return index + 1  # depth is a count not index

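
A small sketch (not part of the package) of the rewritten `find_block_depth`: it now scans the whole parent stack instead of stopping at the first block element, and returns the 1-based position of the deepest non-inline ancestor. The import assumes the one-line `segment/__init__.py` change re-exports the helper, as the `xml_interrupter.py` import further down suggests.

```python
from xml.etree.ElementTree import Element

from epub_translator.segment import find_block_depth  # assumed re-export

# body > p > span > b: the deepest block-level ancestor is <p>, so the depth is 2.
stack = [Element("body"), Element("p"), Element("span"), Element("b")]
print(find_block_depth(stack))  # 2
```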
epub_translator/segment/utils.py
CHANGED
@@ -8,22 +8,6 @@ def element_fingerprint(element: Element) -> str:
     return f"<{element.tag} {' '.join(attrs)}/>"


-def unwrap_parents(element: Element) -> tuple[Element, list[Element]]:
-    parents: list[Element] = []
-    while True:
-        if len(element) != 1:
-            break
-        child = element[0]
-        if not element.text:
-            break
-        if not child.tail:
-            break
-        parents.append(element)
-        element = child
-    element.tail = None
-    return element, parents
-
-
 def id_in_element(element: Element) -> int | None:
     id_str = element.get(ID_KEY, None)
     if id_str is None:

epub_translator/translation/xml_interrupter.py
CHANGED
@@ -1,9 +1,13 @@
 from collections.abc import Generator, Iterable
 from typing import cast
-from xml.etree.ElementTree import Element
+from xml.etree.ElementTree import Element, tostring

-from
+from bs4 import BeautifulSoup
+from mathml2latex.mathml import process_mathml
+
+from ..segment import TextSegment, combine_text_segments, find_block_depth
 from ..utils import ensure_list, normalize_whitespace
+from ..xml import DISPLAY_ATTRIBUTE, clone_element, is_inline_element

 _ID_KEY = "__XML_INTERRUPTER_ID"
 _MATH_TAG = "math"
@@ -37,8 +41,10 @@ class XMLInterrupter:
     def interrupt_block_element(self, element: Element) -> Element:
         interrupted_element = self._placeholder2interrupted.pop(id(element), None)
         if interrupted_element is None:
+            element.attrib.pop(_ID_KEY, None)
             return element
         else:
+            interrupted_element.attrib.pop(_ID_KEY, None)
             return interrupted_element

     def _expand_source_text_segment(self, text_segment: TextSegment):
@@ -81,14 +87,18 @@ class XMLInterrupter:
                 _ID_KEY: cast(str, interrupted_element.get(_ID_KEY)),
             },
         )
+        interrupted_display = interrupted_element.get(DISPLAY_ATTRIBUTE, None)
+        if interrupted_display is not None:
+            placeholder_element.set(DISPLAY_ATTRIBUTE, interrupted_display)
+
         raw_parent_stack = text_segment.parent_stack[:interrupted_index]
         parent_stack = raw_parent_stack + [placeholder_element]
         merged_text_segment = TextSegment(
-            text=
+            text=self._render_latex(text_segments),
             parent_stack=parent_stack,
             left_common_depth=text_segments[0].left_common_depth,
            right_common_depth=text_segments[-1].right_common_depth,
-            block_depth=
+            block_depth=find_block_depth(parent_stack),
             position=text_segments[0].position,
         )
         self._placeholder2interrupted[id(placeholder_element)] = interrupted_element
@@ -116,8 +126,8 @@ class XMLInterrupter:
         # The original stack is stripped away, leaving only the stack relative to the interrupted element; this matches the format the translated side expects
         text_segment.left_common_depth = max(0, text_segment.left_common_depth - interrupted_index)
         text_segment.right_common_depth = max(0, text_segment.right_common_depth - interrupted_index)
-        text_segment.block_depth = 1
         text_segment.parent_stack = text_segment.parent_stack[interrupted_index:]
+        text_segment.block_depth = find_block_depth(text_segment.parent_stack)

         return merged_text_segment

@@ -129,37 +139,54 @@
                 break
         return interrupted_index

+    def _render_latex(self, text_segments: list[TextSegment]) -> str:
+        math_element, _ = next(combine_text_segments(text_segments))
+        while math_element.tag != _MATH_TAG:
+            if len(math_element) == 0:
+                return ""
+            math_element = math_element[0]
+
+        math_element = clone_element(math_element)
+        math_element.attrib.pop(_ID_KEY, None)
+        math_element.tail = None
+        latex: str | None = None
+        try:
+            mathml_str = tostring(math_element, encoding="unicode")
+            soup = BeautifulSoup(mathml_str, "html.parser")
+            latex = process_mathml(soup)
+        except Exception:
+            pass
+
+        if latex is None:
+            latex = "".join(t.text for t in text_segments)
+            latex = normalize_whitespace(latex).strip()
+        else:
+            latex = normalize_whitespace(latex).strip()
+            if is_inline_element(math_element):
+                latex = f"${latex}$"
+            else:
+                latex = f"$${latex}$$"
+
+        return f" {latex} "
+
     def _expand_translated_text_segment(self, text_segment: TextSegment):
-
+        parent_element = text_segment.parent_stack[-1]
+        interrupted_id = parent_element.attrib.pop(_ID_KEY, None)
         if interrupted_id is None:
             yield text_segment
             return

-
-
+        if parent_element is text_segment.block_parent:
+            # Block-level math, need to be hidden
             return

-
-        if not
+        raw_text_segments = self._raw_text_segments.pop(interrupted_id, None)
+        if not raw_text_segments:
+            yield text_segment
             return

         for raw_text_segment in raw_text_segments:
+            text_basic_parent_stack = text_segment.parent_stack[:-1]
             raw_text_segment.block_parent.attrib.pop(_ID_KEY, None)
+            raw_text_segment.parent_stack = text_basic_parent_stack + raw_text_segment.parent_stack
             yield raw_text_segment
-
-    def _has_no_math_texts(self, element: Element):
-        if element.tag == _MATH_TAG:
-            return True
-        if element.text and normalize_whitespace(element.text).strip():
-            return False
-        for child_element in element:
-            if not self._has_no_math_texts(child_element):
-                return False
-            if child_element.tail and normalize_whitespace(child_element.tail).strip():
-                return False
-        return True
-
-    def _is_inline_math(self, element: Element) -> bool:
-        if element.tag != _MATH_TAG:
-            return False
-        return element.get("display", "").lower() != "block"
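
A rough sketch (not part of the package) of the MathML-to-LaTeX path `_render_latex` takes above: serialize the `<math>` element, parse it with BeautifulSoup, and pass the soup to `mathml2latex.mathml.process_mathml` (the call is copied from this diff; the exact output format is up to that library). On any exception the method falls back to the segment's plain text.

```python
from bs4 import BeautifulSoup
from mathml2latex.mathml import process_mathml  # new dependency declared in METADATA below

mathml = "<math><mfrac><mi>a</mi><mi>b</mi></mfrac></math>"
try:
    latex = process_mathml(BeautifulSoup(mathml, "html.parser"))
except Exception:
    latex = None  # _render_latex would fall back to the raw segment text here
print(latex)
```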
epub_translator/xml/const.py
CHANGED
epub_translator/xml/inline.py
CHANGED
@@ -1,6 +1,11 @@
+from xml.etree.ElementTree import Element
+
+from .const import DISPLAY_ATTRIBUTE
+
 # HTML inline-level elements
 # Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
 # Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
+# Reference: https://developer.mozilla.org/en-US/docs/MathML/Element
 _HTML_INLINE_TAGS = frozenset(
     (
         # Inline text semantics
@@ -59,9 +64,57 @@ _HTML_INLINE_TAGS = frozenset(
         "del",
         "ins",
         "slot",
+        # MathML elements
+        # Token elements
+        "mi",  # identifier
+        "mn",  # number
+        "mo",  # operator
+        "ms",  # string literal
+        "mspace",  # space
+        "mtext",  # text
+        # General layout
+        "menclose",  # enclosed content
+        "merror",  # syntax error message
+        "mfenced",  # parentheses (deprecated)
+        "mfrac",  # fraction
+        "mpadded",  # space around content
+        "mphantom",  # invisible content
+        "mroot",  # radical with index
+        "mrow",  # grouped sub-expressions
+        "msqrt",  # square root
+        "mstyle",  # style change
+        # Scripts and limits
+        "mmultiscripts",  # prescripts and tensor indices
+        "mover",  # overscript
+        "mprescripts",  # prescripts separator
+        "msub",  # subscript
+        "msubsup",  # subscript-superscript pair
+        "msup",  # superscript
+        "munder",  # underscript
+        "munderover",  # underscript-overscript pair
+        # Table math
+        "mtable",  # table or matrix
+        "mtr",  # row in table or matrix
+        "mtd",  # cell in table or matrix
+        # Semantic annotations
+        "annotation",  # data annotation
+        "annotation-xml",  # XML annotation
+        "semantics",  # semantic annotation container
+        # Other
+        "maction",  # bind actions to sub-expressions (deprecated)
     )
 )


-def
-
+def is_inline_element(element: Element) -> bool:
+    tag = element.tag.lower()
+    if tag in _HTML_INLINE_TAGS:
+        return True
+    display = element.get(DISPLAY_ATTRIBUTE, None)
+    if display is not None:
+        display = display.lower()
+        if display == "inline":
+            return True
+    if tag == "math" and display != "block":
+        return True
+    return False
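
A small sketch (not part of the package) of how the new `is_inline_element` classifies elements, assuming `DISPLAY_ATTRIBUTE` in `const.py` resolves to the literal `display` attribute name:

```python
from xml.etree.ElementTree import Element

from epub_translator.xml.inline import is_inline_element  # module path per this diff

print(is_inline_element(Element("span")))                        # True: HTML inline tag
print(is_inline_element(Element("mi")))                          # True: MathML token element, new in 0.1.8
print(is_inline_element(Element("div")))                         # False: block-level
print(is_inline_element(Element("math")))                        # True: <math> without display="block" is treated as inline
print(is_inline_element(Element("math", {"display": "block"})))  # False: display math stays block-level
```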

epub_translator/xml_translator/submitter.py
CHANGED
@@ -4,7 +4,7 @@ from enum import Enum, auto
 from xml.etree.ElementTree import Element

 from ..segment import TextSegment, combine_text_segments
-from ..xml import index_of_parent,
+from ..xml import index_of_parent, is_inline_element, iter_with_stack
 from .stream_mapper import InlineSegmentMapping


@@ -78,7 +78,7 @@ class _Submitter:
         preserved_elements: list[Element] = []
         if self._action == SubmitKind.REPLACE:
             for child in list(node.raw_element):
-                if not
+                if not is_inline_element(child):
                     child.tail = None
                     preserved_elements.append(child)

@@ -87,7 +87,7 @@ class _Submitter:

         if combined is not None:
             # In APPEND_BLOCK mode, prepend a space to the text if this is an inline tag
-            if self._action == SubmitKind.APPEND_BLOCK and
+            if self._action == SubmitKind.APPEND_BLOCK and is_inline_element(combined) and combined.text:
                 combined.text = " " + combined.text
             parent.insert(index + 1, combined)
             index += 1
@@ -200,7 +200,7 @@ class _Submitter:
         preserved_elements: list[Element] = []
         for i in range(start_index, end_index):
             elem = node_element[i]
-            if not
+            if not is_inline_element(elem):
                 elem.tail = None
                 preserved_elements.append(elem)

@@ -223,7 +223,7 @@ class _Submitter:

         if combined.text:
             will_inject_space = self._action == SubmitKind.APPEND_TEXT or (
-
+                is_inline_element(combined) and self._action == SubmitKind.APPEND_BLOCK
             )
             if tail_element is not None:
                 tail_element.tail = self._append_text_in_element(

{epub_translator-0.1.6.dist-info → epub_translator-0.1.8.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-translator
-Version: 0.1.6
+Version: 0.1.8
 Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
 License: MIT
 Keywords: epub,llm,translation,translator
@@ -24,6 +24,7 @@ Classifier: Topic :: Software Development :: Localization
 Classifier: Topic :: Text Processing :: Markup
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
+Requires-Dist: mathml2latex (>=0.2.12,<0.3.0)
 Requires-Dist: openai (>=2.14.0,<3.0.0)
 Requires-Dist: resource-segmentation (>=0.0.7,<0.1.0)
 Requires-Dist: tiktoken (>=0.12.0,<1.0.0)
@@ -59,6 +60,13 @@ Translate EPUB books using Large Language Models while preserving the original t
 - **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
 - **Caching**: Built-in caching for progress recovery when translation fails

+## Use Cases
+
+- **Language Learning**: Read books in their original language with side-by-side translations
+- **Academic Research**: Access foreign literature with bilingual references
+- **Content Localization**: Prepare books for international audiences
+- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
+
 ## Installation

 ```bash
@@ -357,13 +365,6 @@ llm = LLM(
 )
 ```

-## Use Cases
-
-- **Language Learning**: Read books in their original language with side-by-side translations
-- **Academic Research**: Access foreign literature with bilingual references
-- **Content Localization**: Prepare books for international audiences
-- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
-
 ## Advanced Features

 ### Custom Translation Prompts
@@ -421,6 +422,105 @@ translate(

 When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.

+### Token Usage Monitoring
+
+Track token consumption during translation to monitor API costs and usage:
+
+```python
+from epub_translator import LLM, translate, language, SubmitKind
+
+llm = LLM(
+    key="your-api-key",
+    url="https://api.openai.com/v1",
+    model="gpt-4",
+    token_encoding="o200k_base",
+)
+
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    llm=llm,
+)
+
+# Access token statistics after translation
+print(f"Total tokens: {llm.total_tokens}")
+print(f"Input tokens: {llm.input_tokens}")
+print(f"Input cache tokens: {llm.input_cache_tokens}")
+print(f"Output tokens: {llm.output_tokens}")
+```
+
+**Available Statistics:**
+
+- `total_tokens` - Total number of tokens used (input + output)
+- `input_tokens` - Number of prompt/input tokens
+- `input_cache_tokens` - Number of cached input tokens (when using prompt caching)
+- `output_tokens` - Number of generated/completion tokens
+
+**Real-time Monitoring:**
+
+You can also monitor token usage in real-time during translation:
+
+```python
+from tqdm import tqdm
+import time
+
+with tqdm(total=100, desc="Translating", unit="%") as pbar:
+    last_progress = 0.0
+    start_time = time.time()
+
+    def on_progress(progress: float):
+        nonlocal last_progress
+        increment = (progress - last_progress) * 100
+        pbar.update(increment)
+        last_progress = progress
+
+        # Update token stats in progress bar
+        pbar.set_postfix({
+            'tokens': llm.total_tokens,
+            'cost_est': f'${llm.total_tokens * 0.00001:.4f}'  # Estimate based on your pricing
+        })
+
+    translate(
+        source_path="source.epub",
+        target_path="translated.epub",
+        target_language=language.ENGLISH,
+        submit=SubmitKind.APPEND_BLOCK,
+        llm=llm,
+        on_progress=on_progress,
+    )
+
+    elapsed = time.time() - start_time
+    print(f"\nTranslation completed in {elapsed:.1f}s")
+    print(f"Total tokens used: {llm.total_tokens:,}")
+    print(f"Average tokens/second: {llm.total_tokens/elapsed:.1f}")
+```
+
+**Dual-LLM Token Tracking:**
+
+When using separate LLMs for translation and filling, each LLM tracks its own statistics:
+
+```python
+translation_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+fill_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    translation_llm=translation_llm,
+    fill_llm=fill_llm,
+)
+
+print(f"Translation tokens: {translation_llm.total_tokens}")
+print(f"Fill tokens: {fill_llm.total_tokens}")
+print(f"Combined total: {translation_llm.total_tokens + fill_llm.total_tokens}")
+```
+
+**Note:** Token statistics are cumulative across all API calls made by the LLM instance. The counts only increase and are thread-safe when using concurrent translation.
+
 ## Related Projects

 ### PDF Craft

{epub_translator-0.1.6.dist-info → epub_translator-0.1.8.dist-info}/RECORD
CHANGED
@@ -1,14 +1,6 @@
 epub_translator/__init__.py,sha256=JsiOUPpk5k7q8mXIgnRQWdVVnkJww_KDTg7jXsP7_C4,222
 epub_translator/data/fill.jinja,sha256=zSytA8Vhp2i6YBZ09F1z9iPJq1-jUaiphoXqTNZwnvo,6964
-epub_translator/data/
-epub_translator/data/mmltex/cmarkup.xsl,sha256=DkhimAATM0XSCfVOfY41-qTPoddqzOHjZ00Pynr4zQE,37707
-epub_translator/data/mmltex/entities.xsl,sha256=TYZ5iGg0u9XlDDBBGuZiHL7MsxKc-3OsTIBAVM1GDek,107742
-epub_translator/data/mmltex/glayout.xsl,sha256=Ztc7N1wiHaYZlo9u9iuROrIl3uIIIoo1VFIuojXq7TM,6879
-epub_translator/data/mmltex/mmltex.xsl,sha256=BVXFbApz-9W2qRKKtBTxptK5vxG2bfB8tv9W1MP5iBI,1384
-epub_translator/data/mmltex/scripts.xsl,sha256=f4ei0cDCW3cV-Ra7rC3kC5tRcKdjJxbSpCeQLoohtgo,13697
-epub_translator/data/mmltex/tables.xsl,sha256=RxtNo8qDtVAg8_6BuYsafraB_0z7YDAB9D__fT9gmWs,4327
-epub_translator/data/mmltex/tokens.xsl,sha256=j3JZRcBhAiiY8o5K3640phfLwxO8JVspCFlSttwBzJk,12373
-epub_translator/data/translate.jinja,sha256=93d8kschm5HV-EfXd1kFSIVMObDqTMdoUrwDfce2bhU,820
+epub_translator/data/translate.jinja,sha256=MVAWvgO9kybEFi0zMiZLEWwuRUL3l8PrwJdsoueQeCs,855
 epub_translator/epub/__init__.py,sha256=aZawPakdkEquL4kRRpyCTdoSQ82l7FGqY4Uw6-ndoGA,154
 epub_translator/epub/common.py,sha256=4-SpTe8iot9hMfyXILmlUFvYVNYqPAHL5hn1fr2wgis,1180
 epub_translator/epub/math.py,sha256=-Q2LJQxxjgQZQUe_WlJA9tjzLqgqtw2ZmbGbHsPRp2U,5422
@@ -18,17 +10,18 @@ epub_translator/epub/toc.py,sha256=TKJfyDT4svFkXd6JCNZk2ZEYc9q-5DXnV3zY2UKo8nE,1
 epub_translator/epub/zip.py,sha256=-3LI8f-ksgU8xCy28NjBOKyQPE8PhPEUPqIKZE1p8dw,2364
 epub_translator/llm/__init__.py,sha256=YcFYYnXmXyX0RUyC-PDbj5k7Woygp_XOpTI3vDiNSPM,75
 epub_translator/llm/context.py,sha256=8-0UnrZIaNshR_imy_ed_UpOK7H1a6dOsG-boaYOX8k,4186
-epub_translator/llm/core.py,sha256=
+epub_translator/llm/core.py,sha256=MnToX8Zhr_r4sj9B3s54bclesojQEFarzl0VqHGDKlo,6488
 epub_translator/llm/error.py,sha256=4efAIQL14DFSvAnSTUfgdAbZRqaWBqOfUGsSfvxa5zM,1503
-epub_translator/llm/executor.py,sha256=
+epub_translator/llm/executor.py,sha256=wxgFwWaLmuqAvctO3lcQX4U52aiw7EdaFw9Ut0v-ZzU,5745
 epub_translator/llm/increasable.py,sha256=8XkKeI1hiHlpMHj8dQ4fW0BkViSx4hH8QfbQsy-5SDw,1297
+epub_translator/llm/statistics.py,sha256=BX75qVWJ9aWbMoFtaQzoE8oVCLh7wiHoR06dX-AAl3E,875
 epub_translator/llm/types.py,sha256=c-dMAIvlG4R3la3mUTWEw5xei-sIYKmQeBja7mirxcI,219
-epub_translator/segment/__init__.py,sha256=
+epub_translator/segment/__init__.py,sha256=nCHNaHASElKTbC8HEAQkI1Y12m6kEdX5uJVvVvHKtFg,595
 epub_translator/segment/block_segment.py,sha256=psNKA_HMIcwZtoug8AtnAcV9_mQ2WXLnXqFsekHzt2g,4570
 epub_translator/segment/common.py,sha256=gGWYQaJ0tGnWCuF1me9TOo-Q_DrZVakCu2patyFIOs0,714
 epub_translator/segment/inline_segment.py,sha256=nrRKoJ-vblsNITJeixrCgIOkVQyUXrchMg0XYU_8pLo,14563
-epub_translator/segment/text_segment.py,sha256=
-epub_translator/segment/utils.py,sha256=
+epub_translator/segment/text_segment.py,sha256=E_qgPI09sCV_-PsJtgwcloTa0tpOP3wl0pw5gV9dDNY,6288
+epub_translator/segment/utils.py,sha256=_tlIA1I7rYz9_q-oQ5cPZWPmhTObCXjksQzRtX3beXY,636
 epub_translator/serial/__init__.py,sha256=b3IMVmWcUwEqHKcGmey88b057pyz5ct946CaUZi4LB4,67
 epub_translator/serial/chunk.py,sha256=FrTaHikVOd6bLYumnEriTaAQ_DIDLjHm16gh-wBVR9k,1495
 epub_translator/serial/segment.py,sha256=uEz-ke1KcYrON-68FaUEzMG2CzHlMjvbC11F3ZT4yH0,446
@@ -39,10 +32,10 @@ epub_translator/translation/epub_transcode.py,sha256=_pRzmQgDrlfsibalkUogVi0F0Qy
 epub_translator/translation/language.py,sha256=88osG0JNYxOkxBjg5Pm-P0Mhiyxf6GqdxoPW12HW0PE,493
 epub_translator/translation/punctuation.py,sha256=TPCGjEmlAyN3G11VuXdHn-pvUkuWDwWqbTNzw-ij60E,813
 epub_translator/translation/translator.py,sha256=WC4Yqx-ffhxBhqzMAujE_NQG7BsDwgn95UMNG7OkUSo,6487
-epub_translator/translation/xml_interrupter.py,sha256=
+epub_translator/translation/xml_interrupter.py,sha256=7TRGskn_OxRZT5mvKfjL0VMtU2VCgl1d9ElmfhFG0pM,8628
 epub_translator/utils.py,sha256=BfZWrYjzDNQ4cFrgvRNzd4i1CKLtPxS8Z4LBHhqEV78,914
 epub_translator/xml/__init__.py,sha256=qluFTfZYlPmOie8nR2C5O0tZ3UbCQEoEoR-Fq-__79c,160
-epub_translator/xml/const.py,sha256=
+epub_translator/xml/const.py,sha256=tCdeJfGwH5xgS4uOmR-pXSfyWXGxOHMJyZKE46BVkJU,54
 epub_translator/xml/deduplication.py,sha256=TaMbzeA70VvUQV0X1wcQFVbuMEPJUtj9Hq6iWlUmtAQ,1152
 epub_translator/xml/friendly/__init__.py,sha256=I5jhnhFWoHvojLsYXH4jfR4Gi8lKFZ3yQ56ze5hEe1M,74
 epub_translator/xml/friendly/decoder.py,sha256=xRQ5LnSunmYbba_0oT39oUr86-sLYAHYMUGmlseIu2U,2467
@@ -50,7 +43,7 @@ epub_translator/xml/friendly/encoder.py,sha256=evjvw6oE-oCud44IsJ-YZVHn6dtUzjNYX
 epub_translator/xml/friendly/parser.py,sha256=QlMHA0nfPJbNyx6IwRFrYVw7okuvzDB42NXCauIFV-o,6560
 epub_translator/xml/friendly/tag.py,sha256=ahaGoYttuAlnFxLFFgTV51KUZSpUiHho-COZX14nxN8,3308
 epub_translator/xml/friendly/transform.py,sha256=5tG1MJmzrXIR_Z5gmRxwcoKvXBzJBVH0ELeaRsG-8w0,1201
-epub_translator/xml/inline.py,sha256=
+epub_translator/xml/inline.py,sha256=VcaNEF2ebVl2fogVk2yV3f4vOP4rePsPTV_qU3fJCE0,3108
 epub_translator/xml/self_closing.py,sha256=41ofGUdss9yU51IVwI4It6hKfzh8YcxIR_j-ohD19LE,5240
 epub_translator/xml/utils.py,sha256=7tQ6L5P0_JXhxONeG64hEeeL5mKjA6NKS1H1Q9B1Cac,1062
 epub_translator/xml/xml.py,sha256=qQ5Wk1-KVVHE4TX25zGOR7fINsGkXnoq-qyKKNl5no4,1675
@@ -62,10 +55,10 @@ epub_translator/xml_translator/concurrency.py,sha256=ACwoDHNX3xChL0On5yvUSFT8By7
 epub_translator/xml_translator/hill_climbing.py,sha256=1jvilOkTLzwljJA4Nrel8yU2XGvOXpueUJTK7RAp-XY,4272
 epub_translator/xml_translator/score.py,sha256=TkXDmr-29p8SzuAp68u_vFDE69y1TyId9S20HT1T_xs,5311
 epub_translator/xml_translator/stream_mapper.py,sha256=nk8iRUHAUQA2B35_y-JOCo6il8MSxXikWvyl-WA8WAA,10662
-epub_translator/xml_translator/submitter.py,sha256=
+epub_translator/xml_translator/submitter.py,sha256=_ic2_JBPdEd2nMSu2mtQ5OzqpGv0zGrvYaicVUXAiUQ,14159
 epub_translator/xml_translator/translator.py,sha256=7Ja1jFbmjIgHcmI9V6gg_K0t7qb6in9mhRn54a7qhZ8,9497
 epub_translator/xml_translator/validation.py,sha256=-OKlSZuD__sjAiEpGAO93YQme4ZDSPmoPjRsAMOCEjc,16668
-epub_translator-0.1.
-epub_translator-0.1.
-epub_translator-0.1.
-epub_translator-0.1.
+epub_translator-0.1.8.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
+epub_translator-0.1.8.dist-info/METADATA,sha256=DTipkbLL2pnijg7XIXSHogZXJzI009K7ZTkGUMy06d8,18663
+epub_translator-0.1.8.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+epub_translator-0.1.8.dist-info/RECORD,,