epub-translator 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epub_translator-0.1.5 → epub_translator-0.1.7}/PKG-INFO +37 -9
- {epub_translator-0.1.5 → epub_translator-0.1.7}/README.md +35 -8
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/__init__.py +1 -2
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/data/translate.jinja +3 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/__init__.py +1 -1
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/context.py +10 -1
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/core.py +30 -3
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/__init__.py +1 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/inline_segment.py +11 -1
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/text_segment.py +5 -10
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/utils.py +0 -16
- epub_translator-0.1.7/epub_translator/translation/__init__.py +2 -0
- {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/epub_transcode.py +2 -2
- {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/punctuation.py +1 -1
- {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/translator.py +8 -6
- {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/xml_interrupter.py +52 -28
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/__init__.py +1 -1
- epub_translator-0.1.7/epub_translator/xml/inline.py +113 -0
- epub_translator-0.1.7/epub_translator/xml_translator/concurrency.py +52 -0
- epub_translator-0.1.7/epub_translator/xml_translator/score.py +164 -0
- epub_translator-0.1.7/epub_translator/xml_translator/stream_mapper.py +284 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/submitter.py +5 -5
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/translator.py +12 -18
- {epub_translator-0.1.5 → epub_translator-0.1.7}/pyproject.toml +2 -1
- epub_translator-0.1.5/epub_translator/data/mmltex/README.md +0 -67
- epub_translator-0.1.5/epub_translator/data/mmltex/cmarkup.xsl +0 -1106
- epub_translator-0.1.5/epub_translator/data/mmltex/entities.xsl +0 -459
- epub_translator-0.1.5/epub_translator/data/mmltex/glayout.xsl +0 -222
- epub_translator-0.1.5/epub_translator/data/mmltex/mmltex.xsl +0 -36
- epub_translator-0.1.5/epub_translator/data/mmltex/scripts.xsl +0 -375
- epub_translator-0.1.5/epub_translator/data/mmltex/tables.xsl +0 -130
- epub_translator-0.1.5/epub_translator/data/mmltex/tokens.xsl +0 -328
- epub_translator-0.1.5/epub_translator/xml/inline.py +0 -67
- epub_translator-0.1.5/epub_translator/xml_translator/stream_mapper.py +0 -253
- {epub_translator-0.1.5 → epub_translator-0.1.7}/LICENSE +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/data/fill.jinja +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/common.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/math.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/metadata.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/spines.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/toc.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/zip.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/__init__.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/error.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/executor.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/increasable.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/types.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/block_segment.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/common.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/serial/__init__.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/serial/chunk.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/serial/segment.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/serial/splitter.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/template.py +0 -0
- {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/language.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/utils.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/const.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/deduplication.py +0 -0
- {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/__init__.py +0 -0
- {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/decoder.py +0 -0
- {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/encoder.py +0 -0
- {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/parser.py +0 -0
- {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/tag.py +0 -0
- {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/transform.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/self_closing.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/utils.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/xml.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/xml_like.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/__init__.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/callbacks.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/common.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/hill_climbing.py +0 -0
- {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/validation.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.1.5
|
|
3
|
+
Version: 0.1.7
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
6
|
Keywords: epub,llm,translation,translator
|
|
@@ -24,6 +24,7 @@ Classifier: Topic :: Software Development :: Localization
|
|
|
24
24
|
Classifier: Topic :: Text Processing :: Markup
|
|
25
25
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
26
26
|
Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
|
|
27
|
+
Requires-Dist: mathml2latex (>=0.2.12,<0.3.0)
|
|
27
28
|
Requires-Dist: openai (>=2.14.0,<3.0.0)
|
|
28
29
|
Requires-Dist: resource-segmentation (>=0.0.7,<0.1.0)
|
|
29
30
|
Requires-Dist: tiktoken (>=0.12.0,<1.0.0)
|
|
@@ -59,6 +60,13 @@ Translate EPUB books using Large Language Models while preserving the original t
|
|
|
59
60
|
- **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
|
|
60
61
|
- **Caching**: Built-in caching for progress recovery when translation fails
|
|
61
62
|
|
|
63
|
+
## Use Cases
|
|
64
|
+
|
|
65
|
+
- **Language Learning**: Read books in their original language with side-by-side translations
|
|
66
|
+
- **Academic Research**: Access foreign literature with bilingual references
|
|
67
|
+
- **Content Localization**: Prepare books for international audiences
|
|
68
|
+
- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
|
|
69
|
+
|
|
62
70
|
## Installation
|
|
63
71
|
|
|
64
72
|
```bash
|
|
@@ -156,7 +164,8 @@ translate(
|
|
|
156
164
|
submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
|
|
157
165
|
user_prompt: str | None = None, # Custom translation instructions
|
|
158
166
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
159
|
-
max_group_tokens: int =
|
|
167
|
+
max_group_tokens: int = 2600, # Maximum tokens per translation group
|
|
168
|
+
concurrency: int = 1, # Number of concurrent translation tasks (default: 1)
|
|
160
169
|
llm: LLM | None = None, # Single LLM instance for both translation and filling
|
|
161
170
|
translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
|
|
162
171
|
fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
|
|
@@ -356,13 +365,6 @@ llm = LLM(
|
|
|
356
365
|
)
|
|
357
366
|
```
|
|
358
367
|
|
|
359
|
-
## Use Cases
|
|
360
|
-
|
|
361
|
-
- **Language Learning**: Read books in their original language with side-by-side translations
|
|
362
|
-
- **Academic Research**: Access foreign literature with bilingual references
|
|
363
|
-
- **Content Localization**: Prepare books for international audiences
|
|
364
|
-
- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
|
|
365
|
-
|
|
366
368
|
## Advanced Features
|
|
367
369
|
|
|
368
370
|
### Custom Translation Prompts
|
|
@@ -394,6 +396,32 @@ llm = LLM(
|
|
|
394
396
|
)
|
|
395
397
|
```
|
|
396
398
|
|
|
399
|
+
### Concurrent Translation
|
|
400
|
+
|
|
401
|
+
Speed up translation by processing multiple text segments concurrently. Use the `concurrency` parameter to control how many translation tasks run in parallel:
|
|
402
|
+
|
|
403
|
+
```python
|
|
404
|
+
translate(
|
|
405
|
+
source_path="source.epub",
|
|
406
|
+
target_path="translated.epub",
|
|
407
|
+
target_language="English",
|
|
408
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
409
|
+
llm=llm,
|
|
410
|
+
concurrency=4, # Process 4 segments concurrently
|
|
411
|
+
)
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
**Performance Tips:**
|
|
415
|
+
|
|
416
|
+
- Start with `concurrency=4` and adjust based on your API rate limits and system resources
|
|
417
|
+
- Higher concurrency values can significantly reduce translation time for large books
|
|
418
|
+
- The translation order is preserved regardless of concurrency settings
|
|
419
|
+
- Monitor your API provider's rate limits to avoid throttling
|
|
420
|
+
|
|
421
|
+
**Thread Safety:**
|
|
422
|
+
|
|
423
|
+
When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
|
|
424
|
+
|
|
397
425
|
## Related Projects
|
|
398
426
|
|
|
399
427
|
### PDF Craft
|
|
@@ -26,6 +26,13 @@ Translate EPUB books using Large Language Models while preserving the original t
|
|
|
26
26
|
- **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
|
|
27
27
|
- **Caching**: Built-in caching for progress recovery when translation fails
|
|
28
28
|
|
|
29
|
+
## Use Cases
|
|
30
|
+
|
|
31
|
+
- **Language Learning**: Read books in their original language with side-by-side translations
|
|
32
|
+
- **Academic Research**: Access foreign literature with bilingual references
|
|
33
|
+
- **Content Localization**: Prepare books for international audiences
|
|
34
|
+
- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
|
|
35
|
+
|
|
29
36
|
## Installation
|
|
30
37
|
|
|
31
38
|
```bash
|
|
@@ -123,7 +130,8 @@ translate(
|
|
|
123
130
|
submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
|
|
124
131
|
user_prompt: str | None = None, # Custom translation instructions
|
|
125
132
|
max_retries: int = 5, # Maximum retries for failed translations
|
|
126
|
-
max_group_tokens: int =
|
|
133
|
+
max_group_tokens: int = 2600, # Maximum tokens per translation group
|
|
134
|
+
concurrency: int = 1, # Number of concurrent translation tasks (default: 1)
|
|
127
135
|
llm: LLM | None = None, # Single LLM instance for both translation and filling
|
|
128
136
|
translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
|
|
129
137
|
fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
|
|
@@ -323,13 +331,6 @@ llm = LLM(
|
|
|
323
331
|
)
|
|
324
332
|
```
|
|
325
333
|
|
|
326
|
-
## Use Cases
|
|
327
|
-
|
|
328
|
-
- **Language Learning**: Read books in their original language with side-by-side translations
|
|
329
|
-
- **Academic Research**: Access foreign literature with bilingual references
|
|
330
|
-
- **Content Localization**: Prepare books for international audiences
|
|
331
|
-
- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
|
|
332
|
-
|
|
333
334
|
## Advanced Features
|
|
334
335
|
|
|
335
336
|
### Custom Translation Prompts
|
|
@@ -361,6 +362,32 @@ llm = LLM(
|
|
|
361
362
|
)
|
|
362
363
|
```
|
|
363
364
|
|
|
365
|
+
### Concurrent Translation
|
|
366
|
+
|
|
367
|
+
Speed up translation by processing multiple text segments concurrently. Use the `concurrency` parameter to control how many translation tasks run in parallel:
|
|
368
|
+
|
|
369
|
+
```python
|
|
370
|
+
translate(
|
|
371
|
+
source_path="source.epub",
|
|
372
|
+
target_path="translated.epub",
|
|
373
|
+
target_language="English",
|
|
374
|
+
submit=SubmitKind.APPEND_BLOCK,
|
|
375
|
+
llm=llm,
|
|
376
|
+
concurrency=4, # Process 4 segments concurrently
|
|
377
|
+
)
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
**Performance Tips:**
|
|
381
|
+
|
|
382
|
+
- Start with `concurrency=4` and adjust based on your API rate limits and system resources
|
|
383
|
+
- Higher concurrency values can significantly reduce translation time for large books
|
|
384
|
+
- The translation order is preserved regardless of concurrency settings
|
|
385
|
+
- Monitor your API provider's rate limits to avoid throttling
|
|
386
|
+
|
|
387
|
+
**Thread Safety:**
|
|
388
|
+
|
|
389
|
+
When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
|
|
390
|
+
|
|
364
391
|
## Related Projects
|
|
365
392
|
|
|
366
393
|
### PDF Craft
|
|
@@ -13,6 +13,9 @@ Translation rules:
|
|
|
13
13
|
{% if user_prompt -%}
|
|
14
14
|
User may provide additional requirements in <rules> tags before the source text. Follow them, but prioritize the rules above if conflicts arise.
|
|
15
15
|
|
|
16
|
+
<rules>
|
|
17
|
+
{{ user_prompt }}
|
|
18
|
+
</rules>
|
|
16
19
|
{% endif -%}
|
|
17
20
|
|
|
18
21
|
Output only the translated text, nothing else.
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
import json
|
|
3
|
+
import threading
|
|
3
4
|
import uuid
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
from typing import Self
|
|
@@ -8,6 +9,9 @@ from .executor import LLMExecutor
|
|
|
8
9
|
from .increasable import Increasable, Increaser
|
|
9
10
|
from .types import Message, MessageRole
|
|
10
11
|
|
|
12
|
+
# Global lock for cache file commit operations
|
|
13
|
+
_CACHE_COMMIT_LOCK = threading.Lock()
|
|
14
|
+
|
|
11
15
|
|
|
12
16
|
class LLMContext:
|
|
13
17
|
def __init__(
|
|
@@ -101,7 +105,12 @@ class LLMContext:
|
|
|
101
105
|
# Remove the .[context-id].txt suffix to get permanent name
|
|
102
106
|
permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
|
|
103
107
|
permanent_file = temp_file.parent / permanent_name
|
|
104
|
-
|
|
108
|
+
|
|
109
|
+
with _CACHE_COMMIT_LOCK: # 多线程下的线程安全
|
|
110
|
+
if permanent_file.exists():
|
|
111
|
+
temp_file.unlink()
|
|
112
|
+
else:
|
|
113
|
+
temp_file.rename(permanent_file)
|
|
105
114
|
|
|
106
115
|
def _rollback(self) -> None:
|
|
107
116
|
for temp_file in self._temp_files:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import datetime
|
|
2
|
+
import threading
|
|
2
3
|
from collections.abc import Generator
|
|
3
4
|
from importlib.resources import files
|
|
4
5
|
from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
|
|
@@ -14,6 +15,11 @@ from .executor import LLMExecutor
|
|
|
14
15
|
from .increasable import Increasable
|
|
15
16
|
from .types import Message
|
|
16
17
|
|
|
18
|
+
# Global state for logger filename generation
|
|
19
|
+
_LOGGER_LOCK = threading.Lock()
|
|
20
|
+
_LAST_TIMESTAMP: str | None = None
|
|
21
|
+
_LOGGER_SUFFIX_ID: int = 1
|
|
22
|
+
|
|
17
23
|
|
|
18
24
|
class LLM:
|
|
19
25
|
def __init__(
|
|
@@ -95,13 +101,34 @@ class LLM:
|
|
|
95
101
|
return dir_path.resolve()
|
|
96
102
|
|
|
97
103
|
def _create_logger(self) -> Logger | None:
|
|
104
|
+
# pylint: disable=global-statement
|
|
105
|
+
global _LAST_TIMESTAMP, _LOGGER_SUFFIX_ID
|
|
106
|
+
|
|
98
107
|
if self._logger_save_path is None:
|
|
99
108
|
return None
|
|
100
109
|
|
|
101
110
|
now = datetime.datetime.now(datetime.UTC)
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
111
|
+
# Use second-level precision for collision detection
|
|
112
|
+
timestamp_key = now.strftime("%Y-%m-%d %H-%M-%S")
|
|
113
|
+
|
|
114
|
+
with _LOGGER_LOCK:
|
|
115
|
+
if _LAST_TIMESTAMP == timestamp_key:
|
|
116
|
+
_LOGGER_SUFFIX_ID += 1
|
|
117
|
+
suffix_id = _LOGGER_SUFFIX_ID
|
|
118
|
+
else:
|
|
119
|
+
_LAST_TIMESTAMP = timestamp_key
|
|
120
|
+
_LOGGER_SUFFIX_ID = 1
|
|
121
|
+
suffix_id = 1
|
|
122
|
+
|
|
123
|
+
if suffix_id == 1:
|
|
124
|
+
file_name = f"request {timestamp_key}.log"
|
|
125
|
+
logger_name = f"LLM Request {timestamp_key}"
|
|
126
|
+
else:
|
|
127
|
+
file_name = f"request {timestamp_key}_{suffix_id}.log"
|
|
128
|
+
logger_name = f"LLM Request {timestamp_key}_{suffix_id}"
|
|
129
|
+
|
|
130
|
+
file_path = self._logger_save_path / file_name
|
|
131
|
+
logger = getLogger(logger_name)
|
|
105
132
|
logger.setLevel(DEBUG)
|
|
106
133
|
handler = FileHandler(file_path, encoding="utf-8")
|
|
107
134
|
handler.setLevel(DEBUG)
|
|
@@ -47,6 +47,7 @@ def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["I
|
|
|
47
47
|
inline_segment = _pop_stack_data(stack_data)
|
|
48
48
|
stack_data = None
|
|
49
49
|
if inline_segment:
|
|
50
|
+
inline_segment.id = 0
|
|
50
51
|
yield inline_segment
|
|
51
52
|
|
|
52
53
|
if stack_data is None:
|
|
@@ -73,6 +74,7 @@ def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["I
|
|
|
73
74
|
if stack_data is not None:
|
|
74
75
|
inline_segment = _pop_stack_data(stack_data)
|
|
75
76
|
if inline_segment:
|
|
77
|
+
inline_segment.id = 0
|
|
76
78
|
yield inline_segment
|
|
77
79
|
|
|
78
80
|
|
|
@@ -115,7 +117,7 @@ class InlineSegment:
|
|
|
115
117
|
self._child_tag2ids: dict[str, list[int]] = {}
|
|
116
118
|
self._child_tag2count: dict[str, int] = {}
|
|
117
119
|
|
|
118
|
-
next_temp_id: int =
|
|
120
|
+
next_temp_id: int = 1
|
|
119
121
|
terms = nest((child.parent.tag, child) for child in children if isinstance(child, InlineSegment))
|
|
120
122
|
|
|
121
123
|
for tag, child_terms in terms.items():
|
|
@@ -162,6 +164,14 @@ class InlineSegment:
|
|
|
162
164
|
elif isinstance(child, InlineSegment):
|
|
163
165
|
yield from child
|
|
164
166
|
|
|
167
|
+
def clone(self) -> "InlineSegment":
|
|
168
|
+
cloned_segment = InlineSegment(
|
|
169
|
+
depth=len(self._parent_stack),
|
|
170
|
+
children=[child.clone() for child in self._children],
|
|
171
|
+
)
|
|
172
|
+
cloned_segment.id = self.id
|
|
173
|
+
return cloned_segment
|
|
174
|
+
|
|
165
175
|
def recreate_ids(self, id_generator: IDGenerator) -> None:
|
|
166
176
|
self._child_tag2count.clear()
|
|
167
177
|
self._child_tag2ids.clear()
|
|
@@ -4,7 +4,7 @@ from enum import Enum, auto
|
|
|
4
4
|
from typing import Self
|
|
5
5
|
from xml.etree.ElementTree import Element
|
|
6
6
|
|
|
7
|
-
from ..xml import expand_left_element_texts, expand_right_element_texts,
|
|
7
|
+
from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_element, normalize_text_in_element
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class TextPosition(Enum):
|
|
@@ -33,10 +33,6 @@ class TextSegment:
|
|
|
33
33
|
def block_parent(self) -> Element:
|
|
34
34
|
return self.parent_stack[self.block_depth - 1]
|
|
35
35
|
|
|
36
|
-
@property
|
|
37
|
-
def xml_text(self) -> str:
|
|
38
|
-
return "".join(_expand_xml_texts(self))
|
|
39
|
-
|
|
40
36
|
def strip_block_parents(self) -> Self:
|
|
41
37
|
self.parent_stack = self.parent_stack[self.block_depth - 1 :]
|
|
42
38
|
self.block_depth = 1
|
|
@@ -104,7 +100,7 @@ def search_text_segments(root: Element) -> Generator[TextSegment, None, None]:
|
|
|
104
100
|
def _search_text_segments(stack: list[Element], element: Element) -> Generator[TextSegment, None, None]:
|
|
105
101
|
text = normalize_text_in_element(element.text)
|
|
106
102
|
next_stack = stack + [element]
|
|
107
|
-
next_block_depth =
|
|
103
|
+
next_block_depth = find_block_depth(next_stack)
|
|
108
104
|
|
|
109
105
|
if text is not None:
|
|
110
106
|
yield TextSegment(
|
|
@@ -129,12 +125,11 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
|
|
|
129
125
|
)
|
|
130
126
|
|
|
131
127
|
|
|
132
|
-
def
|
|
128
|
+
def find_block_depth(parent_stack: list[Element]) -> int:
|
|
133
129
|
index: int = 0
|
|
134
|
-
for i in range(len(parent_stack)
|
|
135
|
-
if not
|
|
130
|
+
for i in range(len(parent_stack)):
|
|
131
|
+
if not is_inline_element(parent_stack[i]):
|
|
136
132
|
index = i
|
|
137
|
-
break
|
|
138
133
|
return index + 1 # depth is a count not index
|
|
139
134
|
|
|
140
135
|
|
|
@@ -8,22 +8,6 @@ def element_fingerprint(element: Element) -> str:
|
|
|
8
8
|
return f"<{element.tag} {' '.join(attrs)}/>"
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def unwrap_parents(element: Element) -> tuple[Element, list[Element]]:
|
|
12
|
-
parents: list[Element] = []
|
|
13
|
-
while True:
|
|
14
|
-
if len(element) != 1:
|
|
15
|
-
break
|
|
16
|
-
child = element[0]
|
|
17
|
-
if not element.text:
|
|
18
|
-
break
|
|
19
|
-
if not child.tail:
|
|
20
|
-
break
|
|
21
|
-
parents.append(element)
|
|
22
|
-
element = child
|
|
23
|
-
element.tail = None
|
|
24
|
-
return element, parents
|
|
25
|
-
|
|
26
|
-
|
|
27
11
|
def id_in_element(element: Element) -> int | None:
|
|
28
12
|
id_str = element.get(ID_KEY, None)
|
|
29
13
|
if id_str is None:
|
|
@@ -5,7 +5,7 @@ from importlib.metadata import version as get_package_version
|
|
|
5
5
|
from os import PathLike
|
|
6
6
|
from pathlib import Path
|
|
7
7
|
|
|
8
|
-
from .epub import (
|
|
8
|
+
from ..epub import (
|
|
9
9
|
Zip,
|
|
10
10
|
read_metadata,
|
|
11
11
|
read_toc,
|
|
@@ -13,12 +13,12 @@ from .epub import (
|
|
|
13
13
|
write_metadata,
|
|
14
14
|
write_toc,
|
|
15
15
|
)
|
|
16
|
+
from ..llm import LLM
|
|
17
|
+
from ..xml import XMLLikeNode, deduplicate_ids_in_element, find_first
|
|
18
|
+
from ..xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
|
|
16
19
|
from .epub_transcode import decode_metadata, decode_toc_list, encode_metadata, encode_toc_list
|
|
17
|
-
from .llm import LLM
|
|
18
20
|
from .punctuation import unwrap_french_quotes
|
|
19
|
-
from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
|
|
20
21
|
from .xml_interrupter import XMLInterrupter
|
|
21
|
-
from .xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class _ElementType(Enum):
|
|
@@ -40,7 +40,8 @@ def translate(
|
|
|
40
40
|
submit: SubmitKind,
|
|
41
41
|
user_prompt: str | None = None,
|
|
42
42
|
max_retries: int = 5,
|
|
43
|
-
max_group_tokens: int =
|
|
43
|
+
max_group_tokens: int = 2600,
|
|
44
|
+
concurrency: int = 1,
|
|
44
45
|
llm: LLM | None = None,
|
|
45
46
|
translation_llm: LLM | None = None,
|
|
46
47
|
fill_llm: LLM | None = None,
|
|
@@ -62,7 +63,7 @@ def translate(
|
|
|
62
63
|
ignore_translated_error=False,
|
|
63
64
|
max_retries=max_retries,
|
|
64
65
|
max_fill_displaying_errors=10,
|
|
65
|
-
|
|
66
|
+
max_group_score=max_group_tokens,
|
|
66
67
|
cache_seed_content=f"{_get_version()}:{target_language}",
|
|
67
68
|
)
|
|
68
69
|
with Zip(
|
|
@@ -92,6 +93,7 @@ def translate(
|
|
|
92
93
|
current_progress = 0.0
|
|
93
94
|
|
|
94
95
|
for translated_elem, context in translator.translate_elements(
|
|
96
|
+
concurrency=concurrency,
|
|
95
97
|
interrupt_source_text_segments=interrupter.interrupt_source_text_segments,
|
|
96
98
|
interrupt_translated_text_segments=interrupter.interrupt_translated_text_segments,
|
|
97
99
|
interrupt_block_element=interrupter.interrupt_block_element,
|
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
from collections.abc import Generator, Iterable
|
|
2
2
|
from typing import cast
|
|
3
|
-
from xml.etree.ElementTree import Element
|
|
3
|
+
from xml.etree.ElementTree import Element, tostring
|
|
4
4
|
|
|
5
|
-
from
|
|
6
|
-
from .
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from mathml2latex.mathml import process_mathml
|
|
7
|
+
|
|
8
|
+
from ..segment import TextSegment, combine_text_segments, find_block_depth
|
|
9
|
+
from ..utils import ensure_list
|
|
10
|
+
from ..xml import clone_element
|
|
7
11
|
|
|
8
12
|
_ID_KEY = "__XML_INTERRUPTER_ID"
|
|
9
13
|
_MATH_TAG = "math"
|
|
@@ -37,8 +41,10 @@ class XMLInterrupter:
|
|
|
37
41
|
def interrupt_block_element(self, element: Element) -> Element:
|
|
38
42
|
interrupted_element = self._placeholder2interrupted.pop(id(element), None)
|
|
39
43
|
if interrupted_element is None:
|
|
44
|
+
element.attrib.pop(_ID_KEY, None)
|
|
40
45
|
return element
|
|
41
46
|
else:
|
|
47
|
+
interrupted_element.attrib.pop(_ID_KEY, None)
|
|
42
48
|
return interrupted_element
|
|
43
49
|
|
|
44
50
|
def _expand_source_text_segment(self, text_segment: TextSegment):
|
|
@@ -81,14 +87,18 @@ class XMLInterrupter:
|
|
|
81
87
|
_ID_KEY: cast(str, interrupted_element.get(_ID_KEY)),
|
|
82
88
|
},
|
|
83
89
|
)
|
|
90
|
+
interrupted_display = interrupted_element.get("display", None)
|
|
91
|
+
if interrupted_display is not None:
|
|
92
|
+
placeholder_element.set("display", interrupted_display)
|
|
93
|
+
|
|
84
94
|
raw_parent_stack = text_segment.parent_stack[:interrupted_index]
|
|
85
95
|
parent_stack = raw_parent_stack + [placeholder_element]
|
|
86
96
|
merged_text_segment = TextSegment(
|
|
87
|
-
text=
|
|
97
|
+
text=self._render_latex(text_segments),
|
|
88
98
|
parent_stack=parent_stack,
|
|
89
99
|
left_common_depth=text_segments[0].left_common_depth,
|
|
90
100
|
right_common_depth=text_segments[-1].right_common_depth,
|
|
91
|
-
block_depth=
|
|
101
|
+
block_depth=find_block_depth(parent_stack),
|
|
92
102
|
position=text_segments[0].position,
|
|
93
103
|
)
|
|
94
104
|
self._placeholder2interrupted[id(placeholder_element)] = interrupted_element
|
|
@@ -116,8 +126,8 @@ class XMLInterrupter:
|
|
|
116
126
|
# 原始栈退光,仅留下相对 interrupted 元素的栈,这种格式与 translated 要求一致
|
|
117
127
|
text_segment.left_common_depth = max(0, text_segment.left_common_depth - interrupted_index)
|
|
118
128
|
text_segment.right_common_depth = max(0, text_segment.right_common_depth - interrupted_index)
|
|
119
|
-
text_segment.block_depth = 1
|
|
120
129
|
text_segment.parent_stack = text_segment.parent_stack[interrupted_index:]
|
|
130
|
+
text_segment.block_depth = find_block_depth(text_segment.parent_stack)
|
|
121
131
|
|
|
122
132
|
return merged_text_segment
|
|
123
133
|
|
|
@@ -129,37 +139,51 @@ class XMLInterrupter:
|
|
|
129
139
|
break
|
|
130
140
|
return interrupted_index
|
|
131
141
|
|
|
142
|
+
def _render_latex(self, text_segments: list[TextSegment]) -> str:
|
|
143
|
+
math_element, _ = next(combine_text_segments(text_segments))
|
|
144
|
+
while math_element.tag != _MATH_TAG:
|
|
145
|
+
if len(math_element) == 0:
|
|
146
|
+
return ""
|
|
147
|
+
math_element = math_element[0]
|
|
148
|
+
|
|
149
|
+
math_element = clone_element(math_element)
|
|
150
|
+
math_element.attrib.pop(_ID_KEY, None)
|
|
151
|
+
math_element.tail = None
|
|
152
|
+
latex: str | None = None
|
|
153
|
+
try:
|
|
154
|
+
mathml_str = tostring(math_element, encoding="unicode")
|
|
155
|
+
soup = BeautifulSoup(mathml_str, "html.parser")
|
|
156
|
+
latex = process_mathml(soup)
|
|
157
|
+
except Exception:
|
|
158
|
+
pass
|
|
159
|
+
|
|
160
|
+
if latex is None:
|
|
161
|
+
latex = "".join(t.text for t in text_segments)
|
|
162
|
+
elif math_element.get("display", None) == "inline":
|
|
163
|
+
latex = f"${latex}$"
|
|
164
|
+
else:
|
|
165
|
+
latex = f"$${latex}$$"
|
|
166
|
+
|
|
167
|
+
return f" {latex} "
|
|
168
|
+
|
|
132
169
|
def _expand_translated_text_segment(self, text_segment: TextSegment):
|
|
133
|
-
|
|
170
|
+
parent_element = text_segment.parent_stack[-1]
|
|
171
|
+
interrupted_id = parent_element.attrib.pop(_ID_KEY, None)
|
|
134
172
|
if interrupted_id is None:
|
|
135
173
|
yield text_segment
|
|
136
174
|
return
|
|
137
175
|
|
|
138
|
-
|
|
139
|
-
|
|
176
|
+
if parent_element is text_segment.block_parent:
|
|
177
|
+
# Block-level math, need to be hidden
|
|
140
178
|
return
|
|
141
179
|
|
|
142
|
-
|
|
143
|
-
if not
|
|
180
|
+
raw_text_segments = self._raw_text_segments.pop(interrupted_id, None)
|
|
181
|
+
if not raw_text_segments:
|
|
182
|
+
yield text_segment
|
|
144
183
|
return
|
|
145
184
|
|
|
146
185
|
for raw_text_segment in raw_text_segments:
|
|
186
|
+
text_basic_parent_stack = text_segment.parent_stack[:-1]
|
|
147
187
|
raw_text_segment.block_parent.attrib.pop(_ID_KEY, None)
|
|
188
|
+
raw_text_segment.parent_stack = text_basic_parent_stack + raw_text_segment.parent_stack
|
|
148
189
|
yield raw_text_segment
|
|
149
|
-
|
|
150
|
-
def _has_no_math_texts(self, element: Element):
|
|
151
|
-
if element.tag == _MATH_TAG:
|
|
152
|
-
return True
|
|
153
|
-
if element.text and normalize_whitespace(element.text).strip():
|
|
154
|
-
return False
|
|
155
|
-
for child_element in element:
|
|
156
|
-
if not self._has_no_math_texts(child_element):
|
|
157
|
-
return False
|
|
158
|
-
if child_element.tail and normalize_whitespace(child_element.tail).strip():
|
|
159
|
-
return False
|
|
160
|
-
return True
|
|
161
|
-
|
|
162
|
-
def _is_inline_math(self, element: Element) -> bool:
|
|
163
|
-
if element.tag != _MATH_TAG:
|
|
164
|
-
return False
|
|
165
|
-
return element.get("display", "").lower() != "block"
|