epub-translator 0.1.0__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. {epub_translator-0.1.0 → epub_translator-0.1.3}/PKG-INFO +72 -9
  2. {epub_translator-0.1.0 → epub_translator-0.1.3}/README.md +71 -8
  3. epub_translator-0.1.3/epub_translator/__init__.py +5 -0
  4. epub_translator-0.1.3/epub_translator/data/fill.jinja +171 -0
  5. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/epub/__init__.py +1 -1
  6. epub_translator-0.1.3/epub_translator/epub/metadata.py +122 -0
  7. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/epub/spines.py +3 -2
  8. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/epub/zip.py +11 -9
  9. epub_translator-0.1.3/epub_translator/epub_transcode.py +108 -0
  10. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/llm/__init__.py +1 -0
  11. epub_translator-0.1.3/epub_translator/llm/context.py +109 -0
  12. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/llm/core.py +39 -62
  13. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/llm/executor.py +25 -31
  14. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/llm/increasable.py +1 -1
  15. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/llm/types.py +0 -3
  16. epub_translator-0.1.3/epub_translator/segment/__init__.py +26 -0
  17. epub_translator-0.1.3/epub_translator/segment/block_segment.py +124 -0
  18. epub_translator-0.1.3/epub_translator/segment/common.py +29 -0
  19. epub_translator-0.1.3/epub_translator/segment/inline_segment.py +356 -0
  20. {epub_translator-0.1.0/epub_translator/xml_translator → epub_translator-0.1.3/epub_translator/segment}/text_segment.py +8 -8
  21. epub_translator-0.1.3/epub_translator/segment/utils.py +43 -0
  22. epub_translator-0.1.3/epub_translator/translator.py +178 -0
  23. epub_translator-0.1.3/epub_translator/utils.py +40 -0
  24. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/__init__.py +2 -0
  25. epub_translator-0.1.3/epub_translator/xml/const.py +1 -0
  26. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/deduplication.py +3 -3
  27. epub_translator-0.1.3/epub_translator/xml/self_closing.py +182 -0
  28. epub_translator-0.1.3/epub_translator/xml/utils.py +42 -0
  29. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/xml.py +7 -0
  30. epub_translator-0.1.3/epub_translator/xml/xml_like.py +206 -0
  31. epub_translator-0.1.3/epub_translator/xml_interrupter.py +165 -0
  32. epub_translator-0.1.3/epub_translator/xml_translator/__init__.py +2 -0
  33. epub_translator-0.1.3/epub_translator/xml_translator/callbacks.py +34 -0
  34. epub_translator-0.1.0/epub_translator/xml_translator/const.py → epub_translator-0.1.3/epub_translator/xml_translator/common.py +0 -1
  35. epub_translator-0.1.3/epub_translator/xml_translator/hill_climbing.py +104 -0
  36. epub_translator-0.1.3/epub_translator/xml_translator/stream_mapper.py +253 -0
  37. epub_translator-0.1.3/epub_translator/xml_translator/submitter.py +56 -0
  38. epub_translator-0.1.3/epub_translator/xml_translator/translator.py +228 -0
  39. epub_translator-0.1.3/epub_translator/xml_translator/validation.py +458 -0
  40. {epub_translator-0.1.0 → epub_translator-0.1.3}/pyproject.toml +1 -1
  41. epub_translator-0.1.0/epub_translator/__init__.py +0 -5
  42. epub_translator-0.1.0/epub_translator/data/fill.jinja +0 -66
  43. epub_translator-0.1.0/epub_translator/epub/placeholder.py +0 -53
  44. epub_translator-0.1.0/epub_translator/iter_sync.py +0 -24
  45. epub_translator-0.1.0/epub_translator/translator.py +0 -211
  46. epub_translator-0.1.0/epub_translator/utils.py +0 -7
  47. epub_translator-0.1.0/epub_translator/xml/xml_like.py +0 -176
  48. epub_translator-0.1.0/epub_translator/xml_translator/__init__.py +0 -3
  49. epub_translator-0.1.0/epub_translator/xml_translator/fill.py +0 -128
  50. epub_translator-0.1.0/epub_translator/xml_translator/format.py +0 -282
  51. epub_translator-0.1.0/epub_translator/xml_translator/fragmented.py +0 -125
  52. epub_translator-0.1.0/epub_translator/xml_translator/group.py +0 -183
  53. epub_translator-0.1.0/epub_translator/xml_translator/progressive_locking.py +0 -256
  54. epub_translator-0.1.0/epub_translator/xml_translator/submitter.py +0 -102
  55. epub_translator-0.1.0/epub_translator/xml_translator/translator.py +0 -178
  56. epub_translator-0.1.0/epub_translator/xml_translator/utils.py +0 -29
  57. {epub_translator-0.1.0 → epub_translator-0.1.3}/LICENSE +0 -0
  58. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/mmltex/README.md +0 -0
  59. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/mmltex/cmarkup.xsl +0 -0
  60. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/mmltex/entities.xsl +0 -0
  61. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/mmltex/glayout.xsl +0 -0
  62. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/mmltex/mmltex.xsl +0 -0
  63. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/mmltex/scripts.xsl +0 -0
  64. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/mmltex/tables.xsl +0 -0
  65. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/mmltex/tokens.xsl +0 -0
  66. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/data/translate.jinja +0 -0
  67. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/epub/common.py +0 -0
  68. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/epub/math.py +0 -0
  69. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/epub/toc.py +0 -0
  70. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/language.py +0 -0
  71. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/llm/error.py +0 -0
  72. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/serial/__init__.py +0 -0
  73. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/serial/chunk.py +0 -0
  74. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/serial/segment.py +0 -0
  75. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/serial/splitter.py +0 -0
  76. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/template.py +0 -0
  77. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/firendly/__init__.py +0 -0
  78. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/firendly/decoder.py +0 -0
  79. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/firendly/encoder.py +0 -0
  80. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/firendly/parser.py +0 -0
  81. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/firendly/tag.py +0 -0
  82. {epub_translator-0.1.0 → epub_translator-0.1.3}/epub_translator/xml/firendly/transform.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: epub-translator
3
- Version: 0.1.0
3
+ Version: 0.1.3
4
4
  Summary: Translate the epub book using an LLM. The translated book will retain the original text and list the translated text side by side with the original text.
5
5
  License: MIT
6
6
  Keywords: epub,llm,translation,translator
@@ -91,10 +91,10 @@ llm = LLM(
91
91
 
92
92
  # Translate EPUB file using language constants
93
93
  translate(
94
- llm=llm,
95
94
  source_path=Path("source.epub"),
96
95
  target_path=Path("translated.epub"),
97
96
  target_language=language.ENGLISH,
97
+ llm=llm,
98
98
  )
99
99
  ```
100
100
 
@@ -113,10 +113,10 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
113
113
  last_progress = progress
114
114
 
115
115
  translate(
116
- llm=llm,
117
116
  source_path=Path("source.epub"),
118
117
  target_path=Path("translated.epub"),
119
118
  target_language="English",
119
+ llm=llm,
120
120
  on_progress=on_progress,
121
121
  )
122
122
  ```
@@ -149,17 +149,22 @@ Translate an EPUB file:
149
149
 
150
150
  ```python
151
151
  translate(
152
- llm: LLM, # LLM instance
153
- source_path: Path, # Source EPUB file path
154
- target_path: Path, # Output EPUB file path
152
+ source_path: PathLike | str, # Source EPUB file path
153
+ target_path: PathLike | str, # Output EPUB file path
155
154
  target_language: str, # Target language (e.g., "English", "Chinese")
156
155
  user_prompt: str | None = None, # Custom translation instructions
157
156
  max_retries: int = 5, # Maximum retries for failed translations
158
157
  max_group_tokens: int = 1200, # Maximum tokens per translation group
158
+ llm: LLM | None = None, # Single LLM instance for both translation and filling
159
+ translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
160
+ fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
159
161
  on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
162
+ on_fill_failed: Callable[[FillFailedEvent], None] | None = None, # Error callback
160
163
  )
161
164
  ```
162
165
 
166
+ **Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
167
+
163
168
  #### Language Constants
164
169
 
165
170
  EPUB Translator provides predefined language constants for convenience. You can use these constants instead of writing language names as strings:
@@ -169,18 +174,76 @@ from epub_translator import language
169
174
 
170
175
  # Usage example:
171
176
  translate(
172
- llm=llm,
173
177
  source_path=Path("source.epub"),
174
178
  target_path=Path("translated.epub"),
175
179
  target_language=language.ENGLISH,
180
+ llm=llm,
176
181
  )
177
182
 
178
183
  # You can also use custom language strings:
179
184
  translate(
180
- llm=llm,
181
185
  source_path=Path("source.epub"),
182
186
  target_path=Path("translated.epub"),
183
187
  target_language="Icelandic", # For languages not in the constants
188
+ llm=llm,
189
+ )
190
+ ```
191
+
192
+ ### Error Handling with `on_fill_failed`
193
+
194
+ Monitor and handle translation errors using the `on_fill_failed` callback:
195
+
196
+ ```python
197
+ from epub_translator import FillFailedEvent
198
+
199
+ def handle_fill_error(event: FillFailedEvent):
200
+ print(f"Translation error (attempt {event.retried_count}):")
201
+ print(f" {event.error_message}")
202
+ if event.over_maximum_retries:
203
+ print(" Maximum retries exceeded!")
204
+
205
+ translate(
206
+ source_path=Path("source.epub"),
207
+ target_path=Path("translated.epub"),
208
+ target_language=language.ENGLISH,
209
+ llm=llm,
210
+ on_fill_failed=handle_fill_error,
211
+ )
212
+ ```
213
+
214
+ The `FillFailedEvent` contains:
215
+ - `error_message: str` - Description of the error
216
+ - `retried_count: int` - Current retry attempt number
217
+ - `over_maximum_retries: bool` - Whether the maximum number of retries has been exceeded
218
+
219
+ ### Dual-LLM Architecture
220
+
221
+ Use separate LLM instances for translation and XML structure filling with different optimization parameters:
222
+
223
+ ```python
224
+ # Create two LLM instances with different temperatures
225
+ translation_llm = LLM(
226
+ key="your-api-key",
227
+ url="https://api.openai.com/v1",
228
+ model="gpt-4",
229
+ token_encoding="o200k_base",
230
+ temperature=0.8, # Higher temperature for creative translation
231
+ )
232
+
233
+ fill_llm = LLM(
234
+ key="your-api-key",
235
+ url="https://api.openai.com/v1",
236
+ model="gpt-4",
237
+ token_encoding="o200k_base",
238
+ temperature=0.3, # Lower temperature for structure preservation
239
+ )
240
+
241
+ translate(
242
+ source_path=Path("source.epub"),
243
+ target_path=Path("translated.epub"),
244
+ target_language=language.ENGLISH,
245
+ translation_llm=translation_llm,
246
+ fill_llm=fill_llm,
184
247
  )
185
248
  ```
186
249
 
@@ -236,10 +299,10 @@ Provide specific translation instructions:
236
299
 
237
300
  ```python
238
301
  translate(
239
- llm=llm,
240
302
  source_path=Path("source.epub"),
241
303
  target_path=Path("translated.epub"),
242
304
  target_language="English",
305
+ llm=llm,
243
306
  user_prompt="Use formal language and preserve technical terminology",
244
307
  )
245
308
  ```
@@ -58,10 +58,10 @@ llm = LLM(
58
58
 
59
59
  # Translate EPUB file using language constants
60
60
  translate(
61
- llm=llm,
62
61
  source_path=Path("source.epub"),
63
62
  target_path=Path("translated.epub"),
64
63
  target_language=language.ENGLISH,
64
+ llm=llm,
65
65
  )
66
66
  ```
67
67
 
@@ -80,10 +80,10 @@ with tqdm(total=100, desc="Translating", unit="%") as pbar:
80
80
  last_progress = progress
81
81
 
82
82
  translate(
83
- llm=llm,
84
83
  source_path=Path("source.epub"),
85
84
  target_path=Path("translated.epub"),
86
85
  target_language="English",
86
+ llm=llm,
87
87
  on_progress=on_progress,
88
88
  )
89
89
  ```
@@ -116,17 +116,22 @@ Translate an EPUB file:
116
116
 
117
117
  ```python
118
118
  translate(
119
- llm: LLM, # LLM instance
120
- source_path: Path, # Source EPUB file path
121
- target_path: Path, # Output EPUB file path
119
+ source_path: PathLike | str, # Source EPUB file path
120
+ target_path: PathLike | str, # Output EPUB file path
122
121
  target_language: str, # Target language (e.g., "English", "Chinese")
123
122
  user_prompt: str | None = None, # Custom translation instructions
124
123
  max_retries: int = 5, # Maximum retries for failed translations
125
124
  max_group_tokens: int = 1200, # Maximum tokens per translation group
125
+ llm: LLM | None = None, # Single LLM instance for both translation and filling
126
+ translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
127
+ fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
126
128
  on_progress: Callable[[float], None] | None = None, # Progress callback (0.0-1.0)
129
+ on_fill_failed: Callable[[FillFailedEvent], None] | None = None, # Error callback
127
130
  )
128
131
  ```
129
132
 
133
+ **Note**: Either `llm` or both `translation_llm` and `fill_llm` must be provided. Using separate LLMs allows for task-specific optimization.
134
+
130
135
  #### Language Constants
131
136
 
132
137
  EPUB Translator provides predefined language constants for convenience. You can use these constants instead of writing language names as strings:
@@ -136,18 +141,76 @@ from epub_translator import language
136
141
 
137
142
  # Usage example:
138
143
  translate(
139
- llm=llm,
140
144
  source_path=Path("source.epub"),
141
145
  target_path=Path("translated.epub"),
142
146
  target_language=language.ENGLISH,
147
+ llm=llm,
143
148
  )
144
149
 
145
150
  # You can also use custom language strings:
146
151
  translate(
147
- llm=llm,
148
152
  source_path=Path("source.epub"),
149
153
  target_path=Path("translated.epub"),
150
154
  target_language="Icelandic", # For languages not in the constants
155
+ llm=llm,
156
+ )
157
+ ```
158
+
159
+ ### Error Handling with `on_fill_failed`
160
+
161
+ Monitor and handle translation errors using the `on_fill_failed` callback:
162
+
163
+ ```python
164
+ from epub_translator import FillFailedEvent
165
+
166
+ def handle_fill_error(event: FillFailedEvent):
167
+ print(f"Translation error (attempt {event.retried_count}):")
168
+ print(f" {event.error_message}")
169
+ if event.over_maximum_retries:
170
+ print(" Maximum retries exceeded!")
171
+
172
+ translate(
173
+ source_path=Path("source.epub"),
174
+ target_path=Path("translated.epub"),
175
+ target_language=language.ENGLISH,
176
+ llm=llm,
177
+ on_fill_failed=handle_fill_error,
178
+ )
179
+ ```
180
+
181
+ The `FillFailedEvent` contains:
182
+ - `error_message: str` - Description of the error
183
+ - `retried_count: int` - Current retry attempt number
184
+ - `over_maximum_retries: bool` - Whether the maximum number of retries has been exceeded
185
+
186
+ ### Dual-LLM Architecture
187
+
188
+ Use separate LLM instances for translation and XML structure filling with different optimization parameters:
189
+
190
+ ```python
191
+ # Create two LLM instances with different temperatures
192
+ translation_llm = LLM(
193
+ key="your-api-key",
194
+ url="https://api.openai.com/v1",
195
+ model="gpt-4",
196
+ token_encoding="o200k_base",
197
+ temperature=0.8, # Higher temperature for creative translation
198
+ )
199
+
200
+ fill_llm = LLM(
201
+ key="your-api-key",
202
+ url="https://api.openai.com/v1",
203
+ model="gpt-4",
204
+ token_encoding="o200k_base",
205
+ temperature=0.3, # Lower temperature for structure preservation
206
+ )
207
+
208
+ translate(
209
+ source_path=Path("source.epub"),
210
+ target_path=Path("translated.epub"),
211
+ target_language=language.ENGLISH,
212
+ translation_llm=translation_llm,
213
+ fill_llm=fill_llm,
151
214
  )
152
215
  ```
153
216
 
@@ -203,10 +266,10 @@ Provide specific translation instructions:
203
266
 
204
267
  ```python
205
268
  translate(
206
- llm=llm,
207
269
  source_path=Path("source.epub"),
208
270
  target_path=Path("translated.epub"),
209
271
  target_language="English",
272
+ llm=llm,
210
273
  user_prompt="Use formal language and preserve technical terminology",
211
274
  )
212
275
  ```
@@ -0,0 +1,5 @@
1
+ from . import language
2
+ from .llm import LLM
3
+ from .translator import FillFailedEvent, translate
4
+
5
+ __all__ = ["LLM", "translate", "language", "FillFailedEvent"]
@@ -0,0 +1,171 @@
1
+ You are an XML structure validator. Your ONLY task is to preserve the exact XML structure from the template while filling in translated text.
2
+
3
+ CRITICAL RULES:
4
+
5
+ 1. Structure Preservation: The output XML MUST have the EXACT SAME structure as the template
6
+ - Same tags in the same order
7
+ - Same nesting hierarchy
8
+ - Same attributes (especially id attributes)
9
+
10
+ IMPORTANT: Translation fluency is SECONDARY to structure preservation.
11
+ If the translated text flows naturally but doesn't match template structure,
12
+ you MUST break the flow to insert required tags.
13
+
14
+ 2. ID Handling:
15
+ - Tags WITH id="X": Disambiguation markers for structurally similar elements
16
+ - Tags WITHOUT id: Structurally unique, match by position and tag name
17
+ - NEVER add, remove, or change id attributes
18
+
19
+ 3. Text Filling Strategy:
20
+ - Compare source text with translated text
21
+ - Identify how source maps to template structure
22
+ - Apply the same mapping to translated text
23
+ - Preserve paragraph breaks (elements are natural separators)
24
+ - IMPORTANT: Translation may change word order - use SEMANTIC matching, not position
25
+
26
+ ---
27
+
28
+ COMMON ERRORS TO AVOID:
29
+
30
+ Error Type 1: Missing expected blocks
31
+ ❌ WRONG: Omitting elements with id attributes
32
+ ✓ CORRECT: Every <tag id="X"> in template MUST appear in output
33
+
34
+ Error Type 2: Tag count mismatch for non-id elements
35
+ Example template:
36
+ <p id="1">
37
+ <span>text1</span>
38
+ <span>text2</span>
39
+ </p>
40
+
41
+ ❌ WRONG: <p id="1"><span>merged text</span></p> (only 1 span, expected 2)
42
+ ✓ CORRECT: <p id="1"><span>text1</span><span>text2</span></p>
43
+
44
+ Error Type 3: Adding unexpected IDs
45
+ ❌ WRONG: Adding id="99" to a tag that didn't have an id in template
46
+ ✓ CORRECT: If template has <span>text</span>, output should be <span>译文</span> (no id)
47
+
48
+ Error Type 4: Wrong tag names
49
+ ❌ WRONG: Changing <em id="5"> to <i id="5">
50
+ ✓ CORRECT: Keep exact tag name from template
51
+
52
+ Error Type 5: Missing ID on required elements
53
+ ❌ WRONG: <span>text</span> when template has <span id="5">text</span>
54
+ ✓ CORRECT: <span id="5">译文</span>
55
+
56
+ Error Type 6: Wrong text mapping when word order changes
57
+ Example 1: Template has "reviewer of <span id="5">Book</span> in <span id="6">Journal</span>"
58
+ Translation: "Journal 上对 Book 的评论者"
59
+
60
+ ❌ WRONG: Journal 上对 <span id="5">Book</span> 的评论者<span id="6">Journal</span>
61
+ (appending original text at end)
62
+ ✓ CORRECT: <span id="6">Journal</span> 上对 <span id="5">Book</span> 的评论者
63
+ (wrapping semantic equivalents in translated positions)
64
+
65
+ Example 2: Breaking fluent translation to preserve structure
66
+ Template: "published in <span id="5">Book Title</span> in 1990"
67
+ Translation: "于1990年出版的《书名》" (flows naturally, but loses structure)
68
+
69
+ ❌ WRONG: 于1990年出版的《书名》 (fluent but missing <span id="5">)
70
+ ✓ CORRECT: 于1990年出版的<span id="5">《书名》</span>
71
+ (Break fluency to preserve structure - this is REQUIRED)
72
+
73
+ Error Type 7: Wrong semantic matching when word order changes
74
+ When translation changes word order, match elements by SEMANTIC TYPE, not position.
75
+
76
+ Example: Book title and year
77
+ Template: "<span id="3">Book Title</span> in <span id="4"><a>1990</a></span>"
78
+ Translation: "《书名》于1990年出版"
79
+
80
+ ❌ WRONG: 《书名》于<span id="3">1990</span>年出版...
81
+ (Matching by position: "1990" appears after "于", so wrapping it with id="3")
82
+ (WRONG because you matched a YEAR to a slot expecting BOOK TITLE)
83
+
84
+ ✓ CORRECT: <span id="3">《书名》</span>于<span id="4"><a>1990</a></span>年出版
85
+ (Matching by semantic type: book title → book title, year → year)
86
+
87
+ KEY PRINCIPLE: Semantic type matching beats position matching!
88
+ - Identify semantic types: book titles, journal names, years (4-digit numbers), person names, etc.
89
+ - Match each to its corresponding slot, regardless of position in translation
90
+ - data-orig-len hints at length: book/journal titles usually longer than years/numbers
91
+
92
+ ---
93
+
94
+ FILLING ALGORITHM:
95
+
96
+ 1. Analyze template structure: count elements at each level, note id attributes
97
+
98
+ 2. Segment source text by elements (elements are natural separators)
99
+
100
+ 3. Apply to translation - STRICT STRUCTURAL MATCHING:
101
+
102
+ A. For elements WITH id:
103
+ - Locate semantic equivalent in translation
104
+ - Wrap with same tag+id
105
+ - If translation merged multiple spans: You MUST still output all original spans separately
106
+ Example: Template has id="1" and id="2", translation merged both
107
+ → Output BOTH spans, use source text fallback for missing one
108
+
109
+ B. For elements WITHOUT id:
110
+ - Match by STRUCTURAL POSITION only (template order)
111
+ - Count MUST be exact: 7 spans in template = 7 spans in output
112
+ - Even if content repeats (e.g., 3 instances of "x"), each gets its own span
113
+ - Process sequentially: wrap 1st occurrence with 1st span, 2nd with 2nd span, etc.
114
+ - DO NOT merge, skip, or add extra elements
115
+
116
+ CRITICAL for repeated content:
117
+ If template has: "...<span>Word</span>...more text...<span>Word</span>"
118
+ And translation has: "...词...更多文字...词"
119
+ → Wrap 1st occurrence of "词" with 1st span, 2nd occurrence with 2nd span
120
+ → Even if the words are identical, treat each span position independently
121
+
122
+ 4. Verify: same element counts, all ids preserved, tag names match
123
+
124
+ CRITICAL: Template structure is LAW. Translation fluency is secondary.
125
+
126
+ ---
127
+
128
+ SPECIAL CASES:
129
+
130
+ 1. data-orig-len attribute: Token count hint. Longer counts usually = titles/names, shorter = numbers/symbols.
131
+
132
+ 2. Name+Number as single unit (e.g., "JournalName42" with NO space):
133
+ - In translation, find the name's equivalent and any adjacent number
134
+ - Wrap them together: <span id="X">《期刊名》42</span> or <span id="X">《期刊名》第42期</span>
135
+ - Key: If template treats them as one span, keep them in one span in translation
136
+
137
+ 3. Translation merges adjacent spans:
138
+ Template: "<span id="A">Word1</span> & <span id="B">Word2</span>"
139
+ Translation: "复合词" (one inseparable term)
140
+
141
+ Solution: You MUST output BOTH spans even if translation merged them
142
+ - Try to split translation if possible
143
+ - If truly inseparable: Keep translation for one span, use source text for the other
144
+ - Example: <span id="A">复合词</span>与<span id="B">Word2</span>
145
+
146
+ 4. Missing semantic match:
147
+ - Exhaust all possibilities first (synonyms, paraphrases, context)
148
+ - Last resort: Use source text as fallback
149
+ - Mixed language is acceptable to preserve structure
150
+
151
+ WRONG fallback approaches:
152
+ ❌ Empty: <span id="2"></span>
153
+ ❌ Placeholder: <span id="2">内容</span>
154
+ ❌ Duplicate: <span id="2">中文名称</span> (when id="1" has this)
155
+
156
+ ---
157
+
158
+ OUTPUT FORMAT:
159
+ ```xml
160
+ <xml>
161
+ ... your filled XML here ...
162
+ </xml>
163
+ ```
164
+
165
+ CRITICAL:
166
+ - Return ONLY the XML block, no explanations
167
+ - Do NOT include example blocks or alternatives
168
+ - If unsure, make best attempt based on pattern
169
+ - System will provide detailed error messages if corrections needed
170
+
171
+ Begin.
@@ -1,4 +1,4 @@
1
- from .placeholder import Placeholder, is_placeholder_tag
1
+ from .metadata import read_metadata, write_metadata
2
2
  from .spines import search_spine_paths
3
3
  from .toc import read_toc, write_toc
4
4
  from .zip import Zip
@@ -0,0 +1,122 @@
1
+ from dataclasses import dataclass
2
+
3
+ from .common import find_opf_path
4
+ from .zip import Zip
5
+
6
+
7
+ @dataclass
8
+ class MetadataField:
9
+ """
10
+ 表示 EPUB OPF 文件中的元数据字段
11
+
12
+ - tag_name: 标签名(不带命名空间)
13
+ - text: 文本内容
14
+ """
15
+
16
+ tag_name: str
17
+ text: str
18
+
19
+
20
+ # 不应该被翻译的元数据字段
21
+ SKIP_FIELDS = {
22
+ "language",
23
+ "identifier",
24
+ "date",
25
+ "meta",
26
+ "contributor", # Usually technical information
27
+ }
28
+
29
+
30
+ def read_metadata(zip: Zip) -> list[MetadataField]:
31
+ """
32
+ 从 EPUB 的 OPF 文件中读取所有可翻译的元数据字段。
33
+
34
+ 返回包含标签名和文本内容的列表。
35
+ 自动过滤掉不应该翻译的字段(language, identifier, date, meta, contributor 等)。
36
+ """
37
+ opf_path = find_opf_path(zip)
38
+
39
+ with zip.read(opf_path) as f:
40
+ content = f.read()
41
+
42
+ from xml.etree import ElementTree as ET
43
+
44
+ root = ET.fromstring(content)
45
+
46
+ # Find metadata element
47
+ metadata_elem = None
48
+ for child in root:
49
+ if child.tag.endswith("metadata"):
50
+ metadata_elem = child
51
+ break
52
+
53
+ if metadata_elem is None:
54
+ return []
55
+
56
+ # Collect metadata fields to translate
57
+ fields: list[MetadataField] = []
58
+
59
+ for elem in metadata_elem:
60
+ # Get tag name without namespace
61
+ tag_name = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
62
+
63
+ # Check if element has text content and should be translated
64
+ if elem.text and elem.text.strip() and tag_name not in SKIP_FIELDS:
65
+ fields.append(MetadataField(tag_name=tag_name, text=elem.text.strip()))
66
+
67
+ return fields
68
+
69
+
70
+ def write_metadata(zip: Zip, fields: list[MetadataField]) -> None:
71
+ """
72
+ 将翻译后的元数据字段写回 EPUB 的 OPF 文件。
73
+
74
+ 根据 tag_name 匹配对应的元素,并更新其文本内容。
75
+ 匹配策略:按照 tag_name 和在文件中出现的顺序依次匹配。
76
+ """
77
+ opf_path = find_opf_path(zip)
78
+
79
+ with zip.read(opf_path) as f:
80
+ content = f.read()
81
+
82
+ from xml.etree import ElementTree as ET
83
+
84
+ root = ET.fromstring(content)
85
+
86
+ # Find metadata element
87
+ metadata_elem = None
88
+ for child in root:
89
+ if child.tag.endswith("metadata"):
90
+ metadata_elem = child
91
+ break
92
+
93
+ if metadata_elem is None:
94
+ return
95
+
96
+ # Build a mapping: tag_name -> list of fields with that tag_name
97
+ fields_by_tag: dict[str, list[str]] = {}
98
+ for field in fields:
99
+ if field.tag_name not in fields_by_tag:
100
+ fields_by_tag[field.tag_name] = []
101
+ fields_by_tag[field.tag_name].append(field.text)
102
+
103
+ # Create a counter for each tag to track which occurrence we're at
104
+ tag_counters: dict[str, int] = {tag: 0 for tag in fields_by_tag}
105
+
106
+ # Update elements in metadata
107
+ for elem in metadata_elem:
108
+ # Get tag name without namespace
109
+ tag_name = elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
110
+
111
+ # Check if this tag has translated text
112
+ if tag_name in fields_by_tag and elem.text and elem.text.strip():
113
+ counter = tag_counters[tag_name]
114
+ if counter < len(fields_by_tag[tag_name]):
115
+ # Update the text with translated version
116
+ elem.text = fields_by_tag[tag_name][counter]
117
+ tag_counters[tag_name] += 1
118
+
119
+ # Write back the modified OPF file
120
+ tree = ET.ElementTree(root)
121
+ with zip.replace(opf_path) as f:
122
+ tree.write(f, encoding="utf-8", xml_declaration=True)
@@ -6,7 +6,8 @@ from .common import find_opf_path, strip_namespace
6
6
  from .zip import Zip
7
7
 
8
8
 
9
- def search_spine_paths(zip: Zip) -> Generator[Path, None, None]:
9
+ # yield file_path, media_type
10
+ def search_spine_paths(zip: Zip) -> Generator[tuple[Path, str], None, None]:
10
11
  opf_path = find_opf_path(zip)
11
12
  opf_dir = opf_path.parent
12
13
 
@@ -39,4 +40,4 @@ def search_spine_paths(zip: Zip) -> Generator[Path, None, None]:
39
40
  if idref in manifest_items:
40
41
  href, media_type = manifest_items[idref]
41
42
  if media_type in ("application/xhtml+xml", "text/html"):
42
- yield opf_dir / href
43
+ yield opf_dir / href, media_type
@@ -44,24 +44,26 @@ class Zip:
44
44
  all_files = self._source_zip.namelist()
45
45
  if prefix_path is None:
46
46
  return [Path(f) for f in all_files]
47
- prefix = str(prefix_path)
47
+ prefix = prefix_path.as_posix()
48
48
  if not prefix.endswith("/"):
49
49
  prefix += "/"
50
50
  return [Path(f) for f in all_files if f.startswith(prefix)]
51
51
 
52
52
  def migrate(self, path: Path):
53
+ path_str = path.as_posix()
54
+ source_info = self._source_zip.getinfo(path_str)
53
55
  with self.read(path) as source_file:
54
- with self._target_zip.open(str(path), "w") as target_file:
55
- while True:
56
- chunk = source_file.read(_BUFFER_SIZE)
57
- if not chunk:
58
- break
59
- target_file.write(chunk)
56
+ content = source_file.read()
57
+ self._target_zip.writestr(
58
+ zinfo_or_arcname=source_info,
59
+ data=content,
60
+ compress_type=source_info.compress_type,
61
+ )
60
62
  self._processed_files.add(path)
61
63
 
62
64
  def read(self, path: Path) -> IO[bytes]:
63
- return self._source_zip.open(str(path), "r")
65
+ return self._source_zip.open(path.as_posix(), "r")
64
66
 
65
67
  def replace(self, path: Path) -> IO[bytes]:
66
68
  self._processed_files.add(path)
67
- return self._target_zip.open(str(path), "w")
69
+ return self._target_zip.open(path.as_posix(), "w")