epub-translator 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +2 -2
- epub_translator/data/fill.jinja +143 -38
- epub_translator/epub/__init__.py +1 -1
- epub_translator/epub/metadata.py +122 -0
- epub_translator/epub/spines.py +3 -2
- epub_translator/epub/zip.py +11 -9
- epub_translator/epub_transcode.py +108 -0
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/context.py +109 -0
- epub_translator/llm/core.py +39 -62
- epub_translator/llm/executor.py +25 -31
- epub_translator/llm/increasable.py +1 -1
- epub_translator/llm/types.py +0 -3
- epub_translator/segment/__init__.py +26 -0
- epub_translator/segment/block_segment.py +124 -0
- epub_translator/segment/common.py +29 -0
- epub_translator/segment/inline_segment.py +356 -0
- epub_translator/{xml_translator → segment}/text_segment.py +8 -8
- epub_translator/segment/utils.py +43 -0
- epub_translator/translator.py +150 -183
- epub_translator/utils.py +33 -0
- epub_translator/xml/__init__.py +2 -0
- epub_translator/xml/const.py +1 -0
- epub_translator/xml/deduplication.py +3 -3
- epub_translator/xml/self_closing.py +182 -0
- epub_translator/xml/utils.py +42 -0
- epub_translator/xml/xml.py +7 -0
- epub_translator/xml/xml_like.py +145 -115
- epub_translator/xml_interrupter.py +165 -0
- epub_translator/xml_translator/__init__.py +1 -2
- epub_translator/xml_translator/callbacks.py +34 -0
- epub_translator/xml_translator/{const.py → common.py} +0 -1
- epub_translator/xml_translator/hill_climbing.py +104 -0
- epub_translator/xml_translator/stream_mapper.py +253 -0
- epub_translator/xml_translator/submitter.py +26 -72
- epub_translator/xml_translator/translator.py +157 -107
- epub_translator/xml_translator/validation.py +458 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/METADATA +72 -9
- epub_translator-0.1.3.dist-info/RECORD +66 -0
- epub_translator/epub/placeholder.py +0 -53
- epub_translator/iter_sync.py +0 -24
- epub_translator/xml_translator/fill.py +0 -128
- epub_translator/xml_translator/format.py +0 -282
- epub_translator/xml_translator/fragmented.py +0 -125
- epub_translator/xml_translator/group.py +0 -183
- epub_translator/xml_translator/progressive_locking.py +0 -256
- epub_translator/xml_translator/utils.py +0 -29
- epub_translator-0.1.0.dist-info/RECORD +0 -58
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/LICENSE +0 -0
- {epub_translator-0.1.0.dist-info → epub_translator-0.1.3.dist-info}/WHEEL +0 -0
epub_translator/__init__.py
CHANGED
epub_translator/data/fill.jinja
CHANGED
|
@@ -1,66 +1,171 @@
|
|
|
1
|
-
You
|
|
1
|
+
You are an XML structure validator. Your ONLY task is to preserve the exact XML structure from the template while filling in translated text.
|
|
2
2
|
|
|
3
|
-
CRITICAL:
|
|
3
|
+
CRITICAL RULES:
|
|
4
|
+
|
|
5
|
+
1. Structure Preservation: The output XML MUST have the EXACT SAME structure as the template
|
|
6
|
+
- Same tags in the same order
|
|
7
|
+
- Same nesting hierarchy
|
|
8
|
+
- Same attributes (especially id attributes)
|
|
9
|
+
|
|
10
|
+
IMPORTANT: Translation fluency is SECONDARY to structure preservation.
|
|
11
|
+
If the translated text flows naturally but doesn't match template structure,
|
|
12
|
+
you MUST break the flow to insert required tags.
|
|
13
|
+
|
|
14
|
+
2. ID Handling:
|
|
15
|
+
- Tags WITH id="X": Disambiguation markers for structurally similar elements
|
|
16
|
+
- Tags WITHOUT id: Structurally unique, match by position and tag name
|
|
17
|
+
- NEVER add, remove, or change id attributes
|
|
18
|
+
|
|
19
|
+
3. Text Filling Strategy:
|
|
20
|
+
- Compare source text with translated text
|
|
21
|
+
- Identify how source maps to template structure
|
|
22
|
+
- Apply the same mapping to translated text
|
|
23
|
+
- Preserve paragraph breaks (elements are natural separators)
|
|
24
|
+
- IMPORTANT: Translation may change word order - use SEMANTIC matching, not position
|
|
4
25
|
|
|
5
26
|
---
|
|
6
27
|
|
|
7
|
-
|
|
28
|
+
COMMON ERRORS TO AVOID:
|
|
29
|
+
|
|
30
|
+
Error Type 1: Missing expected blocks
|
|
31
|
+
❌ WRONG: Omitting elements with id attributes
|
|
32
|
+
✓ CORRECT: Every <tag id="X"> in template MUST appear in output
|
|
33
|
+
|
|
34
|
+
Error Type 2: Tag count mismatch for non-id elements
|
|
35
|
+
Example template:
|
|
36
|
+
<p id="1">
|
|
37
|
+
<span>text1</span>
|
|
38
|
+
<span>text2</span>
|
|
39
|
+
</p>
|
|
40
|
+
|
|
41
|
+
❌ WRONG: <p id="1"><span>merged text</span></p> (only 1 span, expected 2)
|
|
42
|
+
✓ CORRECT: <p id="1"><span>text1</span><span>text2</span></p>
|
|
43
|
+
|
|
44
|
+
Error Type 3: Adding unexpected IDs
|
|
45
|
+
❌ WRONG: Adding id="99" to a tag that didn't have an id in template
|
|
46
|
+
✓ CORRECT: If template has <span>text</span>, output should be <span>译文</span> (no id)
|
|
47
|
+
|
|
48
|
+
Error Type 4: Wrong tag names
|
|
49
|
+
❌ WRONG: Changing <em id="5"> to <i id="5">
|
|
50
|
+
✓ CORRECT: Keep exact tag name from template
|
|
51
|
+
|
|
52
|
+
Error Type 5: Missing ID on required elements
|
|
53
|
+
❌ WRONG: <span>text</span> when template has <span id="5">text</span>
|
|
54
|
+
✓ CORRECT: <span id="5">译文</span>
|
|
55
|
+
|
|
56
|
+
Error Type 6: Wrong text mapping when word order changes
|
|
57
|
+
Example 1: Template has "reviewer of <span id="5">Book</span> in <span id="6">Journal</span>"
|
|
58
|
+
Translation: "Journal 上对 Book 的评论者"
|
|
59
|
+
|
|
60
|
+
❌ WRONG: Journal 上对 <span id="5">Book</span> 的评论者<span id="6">Journal</span>
|
|
61
|
+
(appending original text at end)
|
|
62
|
+
✓ CORRECT: <span id="6">Journal</span> 上对 <span id="5">Book</span> 的评论者
|
|
63
|
+
(wrapping semantic equivalents in translated positions)
|
|
64
|
+
|
|
65
|
+
Example 2: Breaking fluent translation to preserve structure
|
|
66
|
+
Template: "published in <span id="5">Book Title</span> in 1990"
|
|
67
|
+
Translation: "于1990年出版的《书名》" (flows naturally, but loses structure)
|
|
68
|
+
|
|
69
|
+
❌ WRONG: 于1990年出版的《书名》 (fluent but missing <span id="5">)
|
|
70
|
+
✓ CORRECT: 于1990年出版的<span id="5">《书名》</span>
|
|
71
|
+
(Break fluency to preserve structure - this is REQUIRED)
|
|
72
|
+
|
|
73
|
+
Error Type 7: Wrong semantic matching when word order changes
|
|
74
|
+
When translation changes word order, match elements by SEMANTIC TYPE, not position.
|
|
8
75
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
3. Translated text (target language)
|
|
76
|
+
Example: Book title and year
|
|
77
|
+
Template: "<span id="3">Book Title</span> in <span id="4"><a>1990</a></span>"
|
|
78
|
+
Translation: "《书名》于1990年出版"
|
|
13
79
|
|
|
14
|
-
|
|
80
|
+
❌ WRONG: 《书名》于<span id="3">1990</span>年出版...
|
|
81
|
+
(Matching by position: "1990" appears after "于", so wrapping it with id="3")
|
|
82
|
+
(WRONG because you matched a YEAR to a slot expecting BOOK TITLE)
|
|
83
|
+
|
|
84
|
+
✓ CORRECT: <span id="3">《书名》</span>于<span id="4"><a>1990</a></span>年出版
|
|
85
|
+
(Matching by semantic type: book title → book title, year → year)
|
|
86
|
+
|
|
87
|
+
KEY PRINCIPLE: Semantic type matching beats position matching!
|
|
88
|
+
- Identify semantic types: book titles, journal names, years (4-digit numbers), person names, etc.
|
|
89
|
+
- Match each to its corresponding slot, regardless of position in translation
|
|
90
|
+
- data-orig-len hints at length: book/journal titles usually longer than years/numbers
|
|
15
91
|
|
|
16
92
|
---
|
|
17
93
|
|
|
18
|
-
|
|
94
|
+
FILLING ALGORITHM:
|
|
19
95
|
|
|
20
|
-
|
|
21
|
-
→ Put ALL remaining text INSIDE that tag
|
|
96
|
+
1. Analyze template structure: count elements at each level, note id attributes
|
|
22
97
|
|
|
23
|
-
|
|
98
|
+
2. Segment source text by elements (elements are natural separators)
|
|
24
99
|
|
|
25
|
-
|
|
26
|
-
```xml
|
|
27
|
-
<a id="1">
|
|
28
|
-
<span id="2">4</span>
|
|
29
|
-
The methodology of
|
|
30
|
-
<span id="3">Robotics</span>
|
|
31
|
-
</a>
|
|
32
|
-
```
|
|
100
|
+
3. Apply to translation - STRICT STRUCTURAL MATCHING:
|
|
33
101
|
|
|
34
|
-
|
|
35
|
-
|
|
102
|
+
A. For elements WITH id:
|
|
103
|
+
- Locate semantic equivalent in translation
|
|
104
|
+
- Wrap with same tag+id
|
|
105
|
+
- If translation merged multiple spans: You MUST still output all original spans separately
|
|
106
|
+
Example: Template has id="1" and id="2", translation merged both
|
|
107
|
+
→ Output BOTH spans, use source text fallback for missing one
|
|
36
108
|
|
|
37
|
-
|
|
38
|
-
-
|
|
39
|
-
-
|
|
109
|
+
B. For elements WITHOUT id:
|
|
110
|
+
- Match by STRUCTURAL POSITION only (template order)
|
|
111
|
+
- Count MUST be exact: 7 spans in template = 7 spans in output
|
|
112
|
+
- Even if content repeats (e.g., 3 instances of "x"), each gets its own span
|
|
113
|
+
- Process sequentially: wrap 1st occurrence with 1st span, 2nd with 2nd span, etc.
|
|
114
|
+
- DO NOT merge, skip, or add extra elements
|
|
40
115
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
116
|
+
CRITICAL for repeated content:
|
|
117
|
+
If template has: "...<span>Word</span>...more text...<span>Word</span>"
|
|
118
|
+
And translation has: "...词...更多文字...词"
|
|
119
|
+
→ Wrap 1st occurrence of "词" with 1st span, 2nd occurrence with 2nd span
|
|
120
|
+
→ Even if the words are identical, treat each span position independently
|
|
45
121
|
|
|
46
|
-
|
|
47
|
-
</a>
|
|
48
|
-
```
|
|
122
|
+
4. Verify: same element counts, all ids preserved, tag names match
|
|
49
123
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
124
|
+
CRITICAL: Template structure is LAW. Translation fluency is secondary.
|
|
125
|
+
|
|
126
|
+
---
|
|
53
127
|
|
|
54
|
-
|
|
55
|
-
|
|
128
|
+
SPECIAL CASES:
|
|
129
|
+
|
|
130
|
+
1. data-orig-len attribute: Token count hint. Longer counts usually = titles/names, shorter = numbers/symbols.
|
|
131
|
+
|
|
132
|
+
2. Name+Number as single unit (e.g., "JournalName42" with NO space):
|
|
133
|
+
- In translation, find the name's equivalent and any adjacent number
|
|
134
|
+
- Wrap them together: <span id="X">《期刊名》42</span> or <span id="X">《期刊名》第42期</span>
|
|
135
|
+
- Key: If template treats them as one span, keep them in one span in translation
|
|
136
|
+
|
|
137
|
+
3. Translation merges adjacent spans:
|
|
138
|
+
Template: "<span id="A">Word1</span> & <span id="B">Word2</span>"
|
|
139
|
+
Translation: "复合词" (one inseparable term)
|
|
140
|
+
|
|
141
|
+
Solution: You MUST output BOTH spans even if translation merged them
|
|
142
|
+
- Try to split translation if possible
|
|
143
|
+
- If truly inseparable: Keep translation for one span, use source text for the other
|
|
144
|
+
- Example: <span id="A">复合词</span>与<span id="B">Word2</span>
|
|
145
|
+
|
|
146
|
+
4. Missing semantic match:
|
|
147
|
+
- Exhaust all possibilities first (synonyms, paraphrases, context)
|
|
148
|
+
- Last resort: Use source text as fallback
|
|
149
|
+
- Mixed language is acceptable to preserve structure
|
|
150
|
+
|
|
151
|
+
WRONG fallback approaches:
|
|
152
|
+
❌ Empty: <span id="2"></span>
|
|
153
|
+
❌ Placeholder: <span id="2">内容</span>
|
|
154
|
+
❌ Duplicate: <span id="2">中文名称</span> (when id="1" has this)
|
|
56
155
|
|
|
57
156
|
---
|
|
58
157
|
|
|
59
158
|
OUTPUT FORMAT:
|
|
60
159
|
```xml
|
|
61
160
|
<xml>
|
|
62
|
-
... your
|
|
161
|
+
... your filled XML here ...
|
|
63
162
|
</xml>
|
|
64
163
|
```
|
|
65
164
|
|
|
165
|
+
CRITICAL:
|
|
166
|
+
- Return ONLY the XML block, no explanations
|
|
167
|
+
- Do NOT include example blocks or alternatives
|
|
168
|
+
- If unsure, make best attempt based on pattern
|
|
169
|
+
- System will provide detailed error messages if corrections needed
|
|
170
|
+
|
|
66
171
|
Begin.
|
epub_translator/epub/__init__.py
CHANGED
|
epub_translator/epub/metadata.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
from .common import find_opf_path
|
|
4
|
+
from .zip import Zip
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class MetadataField:
    """
    A metadata field from an EPUB OPF file.

    - tag_name: tag name (without the namespace)
    - text: text content
    """

    tag_name: str
    text: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Metadata fields that should not be translated
SKIP_FIELDS = {
    "language",
    "identifier",
    "date",
    "meta",
    "contributor",  # Usually technical information
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def read_metadata(zip: Zip) -> list[MetadataField]:
    """
    Read every translatable metadata field from the EPUB's OPF file.

    The first child of the OPF root whose tag ends with "metadata" is scanned.
    Each direct child that has non-blank text and whose namespace-free tag name
    is not listed in SKIP_FIELDS yields one MetadataField carrying that tag name
    and the stripped text. An empty list is returned when no metadata element
    is found.
    """
    from xml.etree import ElementTree as ET

    opf_path = find_opf_path(zip)
    with zip.read(opf_path) as opf_file:
        root = ET.fromstring(opf_file.read())

    metadata_elem = next(
        (node for node in root if node.tag.endswith("metadata")),
        None,
    )
    if metadata_elem is None:
        return []

    collected: list[MetadataField] = []
    for node in metadata_elem:
        # "{namespace}local" -> "local"; tags without a namespace pass through.
        local_name = node.tag.rsplit("}", 1)[-1]
        stripped = (node.text or "").strip()
        if not stripped or local_name in SKIP_FIELDS:
            continue
        collected.append(MetadataField(tag_name=local_name, text=stripped))
    return collected
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def write_metadata(zip: Zip, fields: list[MetadataField]) -> None:
    """
    Write translated metadata fields back into the EPUB's OPF file.

    Elements are matched by namespace-free tag name and by order of appearance:
    the n-th non-blank element with a given tag receives the n-th field that
    carries the same tag_name. Elements with blank text, and elements for which
    no field is left, are kept unchanged. Nothing is written when the OPF has
    no metadata element.

    The namespace prefixes declared in the original document (for example
    ``dc`` and the default OPF namespace) are registered with ElementTree before
    serializing, so the output keeps them instead of auto-generated ``ns0``,
    ``ns1`` prefixes. Note that ``ET.register_namespace`` updates a
    process-wide registry.
    """
    from collections import defaultdict, deque
    from io import BytesIO
    from xml.etree import ElementTree as ET

    opf_path = find_opf_path(zip)
    with zip.read(opf_path) as f:
        content = f.read()

    root = ET.fromstring(content)

    metadata_elem = next(
        (child for child in root if child.tag.endswith("metadata")),
        None,
    )
    if metadata_elem is None:
        return

    # tag_name -> translated texts, consumed in document order.
    pending = defaultdict(deque)
    for field in fields:
        pending[field.tag_name].append(field.text)

    for elem in metadata_elem:
        # "{namespace}local" -> "local"
        tag_name = elem.tag.rsplit("}", 1)[-1]
        queue = pending.get(tag_name)
        if queue and elem.text and elem.text.strip():
            elem.text = queue.popleft()

    # Registering a default namespace would pull namespace-less elements into
    # it on output, so only do that when every element is already qualified.
    has_unqualified = any(
        isinstance(node.tag, str) and not node.tag.startswith("{")
        for node in root.iter()
    )
    for _event, (prefix, uri) in ET.iterparse(BytesIO(content), events=("start-ns",)):
        if prefix == "" and has_unqualified:
            continue
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Prefixes of the form "ns<N>" are reserved by ElementTree.
            continue

    # Write back the modified OPF file
    with zip.replace(opf_path) as f:
        ET.ElementTree(root).write(f, encoding="utf-8", xml_declaration=True)
|
epub_translator/epub/spines.py
CHANGED
|
@@ -6,7 +6,8 @@ from .common import find_opf_path, strip_namespace
|
|
|
6
6
|
from .zip import Zip
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
# yield file_path, media_type
|
|
10
|
+
def search_spine_paths(zip: Zip) -> Generator[tuple[Path, str], None, None]:
|
|
10
11
|
opf_path = find_opf_path(zip)
|
|
11
12
|
opf_dir = opf_path.parent
|
|
12
13
|
|
|
@@ -39,4 +40,4 @@ def search_spine_paths(zip: Zip) -> Generator[Path, None, None]:
|
|
|
39
40
|
if idref in manifest_items:
|
|
40
41
|
href, media_type = manifest_items[idref]
|
|
41
42
|
if media_type in ("application/xhtml+xml", "text/html"):
|
|
42
|
-
yield opf_dir / href
|
|
43
|
+
yield opf_dir / href, media_type
|
epub_translator/epub/zip.py
CHANGED
|
@@ -44,24 +44,26 @@ class Zip:
|
|
|
44
44
|
all_files = self._source_zip.namelist()
|
|
45
45
|
if prefix_path is None:
|
|
46
46
|
return [Path(f) for f in all_files]
|
|
47
|
-
prefix =
|
|
47
|
+
prefix = prefix_path.as_posix()
|
|
48
48
|
if not prefix.endswith("/"):
|
|
49
49
|
prefix += "/"
|
|
50
50
|
return [Path(f) for f in all_files if f.startswith(prefix)]
|
|
51
51
|
|
|
52
52
|
def migrate(self, path: Path):
|
|
53
|
+
path_str = path.as_posix()
|
|
54
|
+
source_info = self._source_zip.getinfo(path_str)
|
|
53
55
|
with self.read(path) as source_file:
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
56
|
+
content = source_file.read()
|
|
57
|
+
self._target_zip.writestr(
|
|
58
|
+
zinfo_or_arcname=source_info,
|
|
59
|
+
data=content,
|
|
60
|
+
compress_type=source_info.compress_type,
|
|
61
|
+
)
|
|
60
62
|
self._processed_files.add(path)
|
|
61
63
|
|
|
62
64
|
def read(self, path: Path) -> IO[bytes]:
|
|
63
|
-
return self._source_zip.open(
|
|
65
|
+
return self._source_zip.open(path.as_posix(), "r")
|
|
64
66
|
|
|
65
67
|
def replace(self, path: Path) -> IO[bytes]:
|
|
66
68
|
self._processed_files.add(path)
|
|
67
|
-
return self._target_zip.open(
|
|
69
|
+
return self._target_zip.open(path.as_posix(), "w")
|
|
epub_translator/epub_transcode.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EPUB 数据结构与 XML 的编码/解码转换
|
|
3
|
+
|
|
4
|
+
将 Toc 和 MetadataField 等数据结构转换为 XML Element,以便进行翻译处理。
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from xml.etree.ElementTree import Element
|
|
8
|
+
|
|
9
|
+
from .epub.metadata import MetadataField
|
|
10
|
+
from .epub.toc import Toc
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def encode_toc(toc: Toc) -> Element:
    """
    Convert a Toc node (and, recursively, its children) into a <toc-item> element.

    The href, fragment and id values become attributes when they are not None.
    The title is stored as the text of a leading <title> child, followed by one
    nested <toc-item> per child entry.
    """
    node = Element("toc-item")
    for attr_name in ("href", "fragment", "id"):
        attr_value = getattr(toc, attr_name)
        if attr_value is not None:
            node.set(attr_name, attr_value)

    title_node = Element("title")
    title_node.text = toc.title
    node.append(title_node)

    node.extend([encode_toc(sub_toc) for sub_toc in toc.children])
    return node
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def decode_toc(elem: Element) -> Toc:
    """
    Rebuild a Toc node from a <toc-item> element.

    Reads the href, fragment and id attributes, the text of the <title> child,
    and recursively decodes every direct <toc-item> child.

    Raises ValueError when the <title> child is absent or has no text.
    """
    title_node = elem.find("title")
    title = None if title_node is None else title_node.text
    if title is None:
        raise ValueError("Missing title element in toc-item")

    return Toc(
        title=title,
        href=elem.get("href"),
        fragment=elem.get("fragment"),
        id=elem.get("id"),
        children=[decode_toc(sub_elem) for sub_elem in elem.findall("toc-item")],
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def encode_toc_list(toc_list: list[Toc]) -> Element:
    """
    Wrap the encoded form of every Toc entry in a single <toc-list> element.

    Entries keep the order they have in toc_list.
    """
    container = Element("toc-list")
    container.extend([encode_toc(entry) for entry in toc_list])
    return container
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def decode_toc_list(elem: Element) -> list[Toc]:
    """
    Decode a <toc-list> element into the list of Toc entries it contains.

    Only direct <toc-item> children are decoded, in document order.

    Raises ValueError when elem is not a <toc-list> element.
    """
    if elem.tag != "toc-list":
        raise ValueError(f"Expected 'toc-list' element, got '{elem.tag}'")
    return [decode_toc(item_elem) for item_elem in elem.findall("toc-item")]
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def encode_metadata(fields: list[MetadataField]) -> Element:
    """
    Encode metadata fields as a <metadata-list> element.

    Each field becomes one <field> child whose "tag" attribute holds the
    field's tag_name and whose text holds the field's text.
    """
    container = Element("metadata-list")
    for item in fields:
        node = Element("field", {"tag": item.tag_name})
        node.text = item.text
        container.append(node)
    return container
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def decode_metadata(elem: Element) -> list[MetadataField]:
    """
    Decode a <metadata-list> element back into MetadataField objects.

    Every direct <field> child must carry a "tag" attribute and text content.

    Raises ValueError when elem is not a <metadata-list> element, or when a
    <field> child lacks the "tag" attribute or has no text.
    """
    if elem.tag != "metadata-list":
        raise ValueError(f"Expected 'metadata-list' element, got '{elem.tag}'")

    def _decode_field(node: Element) -> MetadataField:
        # The attribute is validated before the text, so a field missing both
        # reports the missing attribute.
        tag_name = node.get("tag")
        if tag_name is None:
            raise ValueError("Missing 'tag' attribute in field element")
        if node.text is None:
            raise ValueError(f"Missing text content in field element (tag={tag_name})")
        return MetadataField(tag_name=tag_name, text=node.text)

    return [_decode_field(node) for node in elem.findall("field")]
|
epub_translator/llm/__init__.py
CHANGED
|
epub_translator/llm/context.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import uuid
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Self
|
|
6
|
+
|
|
7
|
+
from .executor import LLMExecutor
|
|
8
|
+
from .increasable import Increasable, Increaser
|
|
9
|
+
from .types import Message, MessageRole
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class LLMContext:
|
|
13
|
+
def __init__(
|
|
14
|
+
self,
|
|
15
|
+
executor: LLMExecutor,
|
|
16
|
+
cache_path: Path | None,
|
|
17
|
+
cache_seed_content: str | None,
|
|
18
|
+
top_p: Increasable,
|
|
19
|
+
temperature: Increasable,
|
|
20
|
+
) -> None:
|
|
21
|
+
self._executor = executor
|
|
22
|
+
self._cache_path = cache_path
|
|
23
|
+
self._cache_seed_content = cache_seed_content
|
|
24
|
+
self._top_p: Increaser = top_p.context()
|
|
25
|
+
self._temperature: Increaser = temperature.context()
|
|
26
|
+
self._context_id = uuid.uuid4().hex[:12]
|
|
27
|
+
self._temp_files: set[Path] = set()
|
|
28
|
+
|
|
29
|
+
def __enter__(self) -> Self:
|
|
30
|
+
return self
|
|
31
|
+
|
|
32
|
+
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
|
|
33
|
+
if exc_type is None:
|
|
34
|
+
# Success: commit all temporary cache files
|
|
35
|
+
self._commit()
|
|
36
|
+
else:
|
|
37
|
+
# Failure: rollback (delete) all temporary cache files
|
|
38
|
+
self._rollback()
|
|
39
|
+
|
|
40
|
+
def request(
|
|
41
|
+
self,
|
|
42
|
+
input: str | list[Message],
|
|
43
|
+
max_tokens: int | None = None,
|
|
44
|
+
temperature: float | None = None,
|
|
45
|
+
top_p: float | None = None,
|
|
46
|
+
) -> str:
|
|
47
|
+
messages: list[Message]
|
|
48
|
+
if isinstance(input, str):
|
|
49
|
+
messages = [Message(role=MessageRole.USER, message=input)]
|
|
50
|
+
else:
|
|
51
|
+
messages = input
|
|
52
|
+
|
|
53
|
+
try:
|
|
54
|
+
cache_key: str | None = None
|
|
55
|
+
if self._cache_path is not None:
|
|
56
|
+
cache_key = self._compute_messages_hash(messages)
|
|
57
|
+
permanent_cache_file = self._cache_path / f"{cache_key}.txt"
|
|
58
|
+
if permanent_cache_file.exists():
|
|
59
|
+
cached_content = permanent_cache_file.read_text(encoding="utf-8")
|
|
60
|
+
return cached_content
|
|
61
|
+
|
|
62
|
+
if temperature is None:
|
|
63
|
+
temperature = self._temperature.current
|
|
64
|
+
if top_p is None:
|
|
65
|
+
top_p = self._top_p.current
|
|
66
|
+
|
|
67
|
+
# Make the actual request
|
|
68
|
+
response = self._executor.request(
|
|
69
|
+
messages=messages,
|
|
70
|
+
max_tokens=max_tokens,
|
|
71
|
+
temperature=temperature,
|
|
72
|
+
top_p=top_p,
|
|
73
|
+
cache_key=cache_key,
|
|
74
|
+
)
|
|
75
|
+
# Save to temporary cache if cache_path is set
|
|
76
|
+
if self._cache_path is not None and cache_key is not None:
|
|
77
|
+
temp_cache_file = self._cache_path / f"{cache_key}.{self._context_id}.txt"
|
|
78
|
+
if temp_cache_file.exists():
|
|
79
|
+
temp_cache_file.unlink()
|
|
80
|
+
temp_cache_file.write_text(response, encoding="utf-8")
|
|
81
|
+
self._temp_files.add(temp_cache_file)
|
|
82
|
+
|
|
83
|
+
return response
|
|
84
|
+
|
|
85
|
+
finally:
|
|
86
|
+
self._temperature.increase()
|
|
87
|
+
self._top_p.increase()
|
|
88
|
+
|
|
89
|
+
def _compute_messages_hash(self, messages: list[Message]) -> str:
|
|
90
|
+
messages_dict = [{"role": msg.role.value, "message": msg.message} for msg in messages]
|
|
91
|
+
hash_data = {
|
|
92
|
+
"messages": messages_dict,
|
|
93
|
+
"cache_seed": self._cache_seed_content,
|
|
94
|
+
}
|
|
95
|
+
hash_json = json.dumps(hash_data, ensure_ascii=False, sort_keys=True)
|
|
96
|
+
return hashlib.sha512(hash_json.encode("utf-8")).hexdigest()
|
|
97
|
+
|
|
98
|
+
def _commit(self) -> None:
|
|
99
|
+
for temp_file in sorted(self._temp_files):
|
|
100
|
+
if temp_file.exists():
|
|
101
|
+
# Remove the .[context-id].txt suffix to get permanent name
|
|
102
|
+
permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
|
|
103
|
+
permanent_file = temp_file.parent / permanent_name
|
|
104
|
+
temp_file.rename(permanent_file)
|
|
105
|
+
|
|
106
|
+
def _rollback(self) -> None:
|
|
107
|
+
for temp_file in self._temp_files:
|
|
108
|
+
if temp_file.exists():
|
|
109
|
+
temp_file.unlink()
|