epub-translator 0.1.4__tar.gz → 0.1.6__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (72)
  1. {epub_translator-0.1.4 → epub_translator-0.1.6}/PKG-INFO +58 -8
  2. {epub_translator-0.1.4 → epub_translator-0.1.6}/README.md +57 -7
  3. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/__init__.py +1 -2
  4. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/epub/__init__.py +1 -1
  5. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/llm/context.py +10 -1
  6. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/llm/core.py +30 -3
  7. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/segment/inline_segment.py +11 -1
  8. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/segment/text_segment.py +0 -4
  9. epub_translator-0.1.6/epub_translator/translation/__init__.py +2 -0
  10. {epub_translator-0.1.4/epub_translator → epub_translator-0.1.6/epub_translator/translation}/epub_transcode.py +2 -2
  11. {epub_translator-0.1.4/epub_translator → epub_translator-0.1.6/epub_translator/translation}/punctuation.py +1 -1
  12. {epub_translator-0.1.4/epub_translator → epub_translator-0.1.6/epub_translator/translation}/translator.py +8 -6
  13. {epub_translator-0.1.4/epub_translator → epub_translator-0.1.6/epub_translator/translation}/xml_interrupter.py +2 -2
  14. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml/__init__.py +1 -1
  15. epub_translator-0.1.6/epub_translator/xml_translator/concurrency.py +52 -0
  16. epub_translator-0.1.6/epub_translator/xml_translator/score.py +164 -0
  17. epub_translator-0.1.6/epub_translator/xml_translator/stream_mapper.py +284 -0
  18. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml_translator/submitter.py +28 -10
  19. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml_translator/translator.py +12 -18
  20. {epub_translator-0.1.4 → epub_translator-0.1.6}/pyproject.toml +1 -1
  21. epub_translator-0.1.4/epub_translator/xml_translator/stream_mapper.py +0 -253
  22. {epub_translator-0.1.4 → epub_translator-0.1.6}/LICENSE +0 -0
  23. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/fill.jinja +0 -0
  24. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/mmltex/README.md +0 -0
  25. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/mmltex/cmarkup.xsl +0 -0
  26. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/mmltex/entities.xsl +0 -0
  27. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/mmltex/glayout.xsl +0 -0
  28. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/mmltex/mmltex.xsl +0 -0
  29. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/mmltex/scripts.xsl +0 -0
  30. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/mmltex/tables.xsl +0 -0
  31. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/mmltex/tokens.xsl +0 -0
  32. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/data/translate.jinja +0 -0
  33. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/epub/common.py +0 -0
  34. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/epub/math.py +0 -0
  35. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/epub/metadata.py +0 -0
  36. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/epub/spines.py +0 -0
  37. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/epub/toc.py +0 -0
  38. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/epub/zip.py +0 -0
  39. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/llm/__init__.py +0 -0
  40. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/llm/error.py +0 -0
  41. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/llm/executor.py +0 -0
  42. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/llm/increasable.py +0 -0
  43. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/llm/types.py +0 -0
  44. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/segment/__init__.py +0 -0
  45. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/segment/block_segment.py +0 -0
  46. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/segment/common.py +0 -0
  47. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/segment/utils.py +0 -0
  48. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/serial/__init__.py +0 -0
  49. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/serial/chunk.py +0 -0
  50. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/serial/segment.py +0 -0
  51. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/serial/splitter.py +0 -0
  52. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/template.py +0 -0
  53. {epub_translator-0.1.4/epub_translator → epub_translator-0.1.6/epub_translator/translation}/language.py +0 -0
  54. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/utils.py +0 -0
  55. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml/const.py +0 -0
  56. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml/deduplication.py +0 -0
  57. {epub_translator-0.1.4/epub_translator/xml/firendly → epub_translator-0.1.6/epub_translator/xml/friendly}/__init__.py +0 -0
  58. {epub_translator-0.1.4/epub_translator/xml/firendly → epub_translator-0.1.6/epub_translator/xml/friendly}/decoder.py +0 -0
  59. {epub_translator-0.1.4/epub_translator/xml/firendly → epub_translator-0.1.6/epub_translator/xml/friendly}/encoder.py +0 -0
  60. {epub_translator-0.1.4/epub_translator/xml/firendly → epub_translator-0.1.6/epub_translator/xml/friendly}/parser.py +0 -0
  61. {epub_translator-0.1.4/epub_translator/xml/firendly → epub_translator-0.1.6/epub_translator/xml/friendly}/tag.py +0 -0
  62. {epub_translator-0.1.4/epub_translator/xml/firendly → epub_translator-0.1.6/epub_translator/xml/friendly}/transform.py +0 -0
  63. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml/inline.py +0 -0
  64. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml/self_closing.py +0 -0
  65. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml/utils.py +0 -0
  66. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml/xml.py +0 -0
  67. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml/xml_like.py +0 -0
  68. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml_translator/__init__.py +0 -0
  69. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml_translator/callbacks.py +0 -0
  70. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml_translator/common.py +0 -0
  71. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml_translator/hill_climbing.py +0 -0
  72. {epub_translator-0.1.4 → epub_translator-0.1.6}/epub_translator/xml_translator/validation.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: epub-translator
3
- Version: 0.1.4
3
+ Version: 0.1.6
4
4
  Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
5
5
  License: MIT
6
6
  Keywords: epub,llm,translation,translator
@@ -156,7 +156,8 @@ translate(
156
156
  submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
157
157
  user_prompt: str | None = None, # Custom translation instructions
158
158
  max_retries: int = 5, # Maximum retries for failed translations
159
- max_group_tokens: int = 1200, # Maximum tokens per translation group
159
+ max_group_tokens: int = 2600, # Maximum tokens per translation group
160
+ concurrency: int = 1, # Number of concurrent translation tasks (default: 1)
160
161
  llm: LLM | None = None, # Single LLM instance for both translation and filling
161
162
  translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
162
163
  fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
@@ -238,16 +239,17 @@ translate(
238
239
 
239
240
  ### Error Handling with `on_fill_failed`
240
241
 
241
- Monitor and handle translation errors using the `on_fill_failed` callback:
242
+ Monitor translation errors using the `on_fill_failed` callback. The system automatically retries failed translations up to `max_retries` times (default: 5). Most errors are recovered during retries and don't affect the final output.
242
243
 
243
244
  ```python
244
245
  from epub_translator import FillFailedEvent
245
246
 
246
247
  def handle_fill_error(event: FillFailedEvent):
247
- print(f"Translation error (attempt {event.retried_count}):")
248
- print(f" {event.error_message}")
248
+ # Only log critical errors that will affect the final EPUB
249
249
  if event.over_maximum_retries:
250
- print(" Maximum retries exceeded!")
250
+ print(f"Critical error after {event.retried_count} attempts:")
251
+ print(f" {event.error_message}")
252
+ print(" This error will be present in the final EPUB file!")
251
253
 
252
254
  translate(
253
255
  source_path="source.epub",
@@ -259,10 +261,32 @@ translate(
259
261
  )
260
262
  ```
261
263
 
264
+ **Understanding Error Severity:**
265
+
262
266
  The `FillFailedEvent` contains:
263
267
  - `error_message: str` - Description of the error
264
- - `retried_count: int` - Current retry attempt number
265
- - `over_maximum_retries: bool` - Whether max retries has been exceeded
268
+ - `retried_count: int` - Current retry attempt number (1 to max_retries)
269
+ - `over_maximum_retries: bool` - Whether the error is critical
270
+
271
+ **Error Categories:**
272
+
273
+ - **Recoverable errors** (`over_maximum_retries=False`): Errors during retry attempts. The system will continue retrying and may resolve these automatically. Safe to ignore in most cases.
274
+
275
+ - **Critical errors** (`over_maximum_retries=True`): Errors that persist after all retry attempts. These will appear in the final EPUB file and should be investigated.
276
+
277
+ **Advanced Usage:**
278
+
279
+ For verbose logging during translation debugging:
280
+
281
+ ```python
282
+ def handle_fill_error(event: FillFailedEvent):
283
+ if event.over_maximum_retries:
284
+ # Critical: affects final output
285
+ print(f"❌ CRITICAL: {event.error_message}")
286
+ else:
287
+ # Informational: system is retrying
288
+ print(f"⚠️ Retry {event.retried_count}: {event.error_message}")
289
+ ```
266
290
 
267
291
  ### Dual-LLM Architecture
268
292
 
@@ -371,6 +395,32 @@ llm = LLM(
371
395
  )
372
396
  ```
373
397
 
398
+ ### Concurrent Translation
399
+
400
+ Speed up translation by processing multiple text segments concurrently. Use the `concurrency` parameter to control how many translation tasks run in parallel:
401
+
402
+ ```python
403
+ translate(
404
+ source_path="source.epub",
405
+ target_path="translated.epub",
406
+ target_language="English",
407
+ submit=SubmitKind.APPEND_BLOCK,
408
+ llm=llm,
409
+ concurrency=4, # Process 4 segments concurrently
410
+ )
411
+ ```
412
+
413
+ **Performance Tips:**
414
+
415
+ - Start with `concurrency=4` and adjust based on your API rate limits and system resources
416
+ - Higher concurrency values can significantly reduce translation time for large books
417
+ - The translation order is preserved regardless of concurrency settings
418
+ - Monitor your API provider's rate limits to avoid throttling
419
+
420
+ **Thread Safety:**
421
+
422
+ When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
423
+
374
424
  ## Related Projects
375
425
 
376
426
  ### PDF Craft
@@ -123,7 +123,8 @@ translate(
123
123
  submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
124
124
  user_prompt: str | None = None, # Custom translation instructions
125
125
  max_retries: int = 5, # Maximum retries for failed translations
126
- max_group_tokens: int = 1200, # Maximum tokens per translation group
126
+ max_group_tokens: int = 2600, # Maximum tokens per translation group
127
+ concurrency: int = 1, # Number of concurrent translation tasks (default: 1)
127
128
  llm: LLM | None = None, # Single LLM instance for both translation and filling
128
129
  translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
129
130
  fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
@@ -205,16 +206,17 @@ translate(
205
206
 
206
207
  ### Error Handling with `on_fill_failed`
207
208
 
208
- Monitor and handle translation errors using the `on_fill_failed` callback:
209
+ Monitor translation errors using the `on_fill_failed` callback. The system automatically retries failed translations up to `max_retries` times (default: 5). Most errors are recovered during retries and don't affect the final output.
209
210
 
210
211
  ```python
211
212
  from epub_translator import FillFailedEvent
212
213
 
213
214
  def handle_fill_error(event: FillFailedEvent):
214
- print(f"Translation error (attempt {event.retried_count}):")
215
- print(f" {event.error_message}")
215
+ # Only log critical errors that will affect the final EPUB
216
216
  if event.over_maximum_retries:
217
- print(" Maximum retries exceeded!")
217
+ print(f"Critical error after {event.retried_count} attempts:")
218
+ print(f" {event.error_message}")
219
+ print(" This error will be present in the final EPUB file!")
218
220
 
219
221
  translate(
220
222
  source_path="source.epub",
@@ -226,10 +228,32 @@ translate(
226
228
  )
227
229
  ```
228
230
 
231
+ **Understanding Error Severity:**
232
+
229
233
  The `FillFailedEvent` contains:
230
234
  - `error_message: str` - Description of the error
231
- - `retried_count: int` - Current retry attempt number
232
- - `over_maximum_retries: bool` - Whether max retries has been exceeded
235
+ - `retried_count: int` - Current retry attempt number (1 to max_retries)
236
+ - `over_maximum_retries: bool` - Whether the error is critical
237
+
238
+ **Error Categories:**
239
+
240
+ - **Recoverable errors** (`over_maximum_retries=False`): Errors during retry attempts. The system will continue retrying and may resolve these automatically. Safe to ignore in most cases.
241
+
242
+ - **Critical errors** (`over_maximum_retries=True`): Errors that persist after all retry attempts. These will appear in the final EPUB file and should be investigated.
243
+
244
+ **Advanced Usage:**
245
+
246
+ For verbose logging during translation debugging:
247
+
248
+ ```python
249
+ def handle_fill_error(event: FillFailedEvent):
250
+ if event.over_maximum_retries:
251
+ # Critical: affects final output
252
+ print(f"❌ CRITICAL: {event.error_message}")
253
+ else:
254
+ # Informational: system is retrying
255
+ print(f"⚠️ Retry {event.retried_count}: {event.error_message}")
256
+ ```
233
257
 
234
258
  ### Dual-LLM Architecture
235
259
 
@@ -338,6 +362,32 @@ llm = LLM(
338
362
  )
339
363
  ```
340
364
 
365
+ ### Concurrent Translation
366
+
367
+ Speed up translation by processing multiple text segments concurrently. Use the `concurrency` parameter to control how many translation tasks run in parallel:
368
+
369
+ ```python
370
+ translate(
371
+ source_path="source.epub",
372
+ target_path="translated.epub",
373
+ target_language="English",
374
+ submit=SubmitKind.APPEND_BLOCK,
375
+ llm=llm,
376
+ concurrency=4, # Process 4 segments concurrently
377
+ )
378
+ ```
379
+
380
+ **Performance Tips:**
381
+
382
+ - Start with `concurrency=4` and adjust based on your API rate limits and system resources
383
+ - Higher concurrency values can significantly reduce translation time for large books
384
+ - The translation order is preserved regardless of concurrency settings
385
+ - Monitor your API provider's rate limits to avoid throttling
386
+
387
+ **Thread Safety:**
388
+
389
+ When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
390
+
341
391
  ## Related Projects
342
392
 
343
393
  ### PDF Craft
@@ -1,6 +1,5 @@
1
- from . import language
2
1
  from .llm import LLM
3
- from .translator import FillFailedEvent, translate
2
+ from .translation import FillFailedEvent, language, translate
4
3
  from .xml_translator import SubmitKind
5
4
 
6
5
  __all__ = [
@@ -1,4 +1,4 @@
1
1
  from .metadata import read_metadata, write_metadata
2
2
  from .spines import search_spine_paths
3
- from .toc import read_toc, write_toc
3
+ from .toc import Toc, read_toc, write_toc
4
4
  from .zip import Zip
@@ -1,5 +1,6 @@
1
1
  import hashlib
2
2
  import json
3
+ import threading
3
4
  import uuid
4
5
  from pathlib import Path
5
6
  from typing import Self
@@ -8,6 +9,9 @@ from .executor import LLMExecutor
8
9
  from .increasable import Increasable, Increaser
9
10
  from .types import Message, MessageRole
10
11
 
12
+ # Global lock for cache file commit operations
13
+ _CACHE_COMMIT_LOCK = threading.Lock()
14
+
11
15
 
12
16
  class LLMContext:
13
17
  def __init__(
@@ -101,7 +105,12 @@ class LLMContext:
101
105
  # Remove the .[context-id].txt suffix to get permanent name
102
106
  permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
103
107
  permanent_file = temp_file.parent / permanent_name
104
- temp_file.rename(permanent_file)
108
+
109
+ with _CACHE_COMMIT_LOCK: # 多线程下的线程安全
110
+ if permanent_file.exists():
111
+ temp_file.unlink()
112
+ else:
113
+ temp_file.rename(permanent_file)
105
114
 
106
115
  def _rollback(self) -> None:
107
116
  for temp_file in self._temp_files:
@@ -1,4 +1,5 @@
1
1
  import datetime
2
+ import threading
2
3
  from collections.abc import Generator
3
4
  from importlib.resources import files
4
5
  from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
@@ -14,6 +15,11 @@ from .executor import LLMExecutor
14
15
  from .increasable import Increasable
15
16
  from .types import Message
16
17
 
18
+ # Global state for logger filename generation
19
+ _LOGGER_LOCK = threading.Lock()
20
+ _LAST_TIMESTAMP: str | None = None
21
+ _LOGGER_SUFFIX_ID: int = 1
22
+
17
23
 
18
24
  class LLM:
19
25
  def __init__(
@@ -95,13 +101,34 @@ class LLM:
95
101
  return dir_path.resolve()
96
102
 
97
103
  def _create_logger(self) -> Logger | None:
104
+ # pylint: disable=global-statement
105
+ global _LAST_TIMESTAMP, _LOGGER_SUFFIX_ID
106
+
98
107
  if self._logger_save_path is None:
99
108
  return None
100
109
 
101
110
  now = datetime.datetime.now(datetime.UTC)
102
- timestamp = now.strftime("%Y-%m-%d %H-%M-%S %f")
103
- file_path = self._logger_save_path / f"request {timestamp}.log"
104
- logger = getLogger(f"LLM Request {timestamp}")
111
+ # Use second-level precision for collision detection
112
+ timestamp_key = now.strftime("%Y-%m-%d %H-%M-%S")
113
+
114
+ with _LOGGER_LOCK:
115
+ if _LAST_TIMESTAMP == timestamp_key:
116
+ _LOGGER_SUFFIX_ID += 1
117
+ suffix_id = _LOGGER_SUFFIX_ID
118
+ else:
119
+ _LAST_TIMESTAMP = timestamp_key
120
+ _LOGGER_SUFFIX_ID = 1
121
+ suffix_id = 1
122
+
123
+ if suffix_id == 1:
124
+ file_name = f"request {timestamp_key}.log"
125
+ logger_name = f"LLM Request {timestamp_key}"
126
+ else:
127
+ file_name = f"request {timestamp_key}_{suffix_id}.log"
128
+ logger_name = f"LLM Request {timestamp_key}_{suffix_id}"
129
+
130
+ file_path = self._logger_save_path / file_name
131
+ logger = getLogger(logger_name)
105
132
  logger.setLevel(DEBUG)
106
133
  handler = FileHandler(file_path, encoding="utf-8")
107
134
  handler.setLevel(DEBUG)
@@ -47,6 +47,7 @@ def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["I
47
47
  inline_segment = _pop_stack_data(stack_data)
48
48
  stack_data = None
49
49
  if inline_segment:
50
+ inline_segment.id = 0
50
51
  yield inline_segment
51
52
 
52
53
  if stack_data is None:
@@ -73,6 +74,7 @@ def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["I
73
74
  if stack_data is not None:
74
75
  inline_segment = _pop_stack_data(stack_data)
75
76
  if inline_segment:
77
+ inline_segment.id = 0
76
78
  yield inline_segment
77
79
 
78
80
 
@@ -115,7 +117,7 @@ class InlineSegment:
115
117
  self._child_tag2ids: dict[str, list[int]] = {}
116
118
  self._child_tag2count: dict[str, int] = {}
117
119
 
118
- next_temp_id: int = 0
120
+ next_temp_id: int = 1
119
121
  terms = nest((child.parent.tag, child) for child in children if isinstance(child, InlineSegment))
120
122
 
121
123
  for tag, child_terms in terms.items():
@@ -162,6 +164,14 @@ class InlineSegment:
162
164
  elif isinstance(child, InlineSegment):
163
165
  yield from child
164
166
 
167
+ def clone(self) -> "InlineSegment":
168
+ cloned_segment = InlineSegment(
169
+ depth=len(self._parent_stack),
170
+ children=[child.clone() for child in self._children],
171
+ )
172
+ cloned_segment.id = self.id
173
+ return cloned_segment
174
+
165
175
  def recreate_ids(self, id_generator: IDGenerator) -> None:
166
176
  self._child_tag2count.clear()
167
177
  self._child_tag2ids.clear()
@@ -33,10 +33,6 @@ class TextSegment:
33
33
  def block_parent(self) -> Element:
34
34
  return self.parent_stack[self.block_depth - 1]
35
35
 
36
- @property
37
- def xml_text(self) -> str:
38
- return "".join(_expand_xml_texts(self))
39
-
40
36
  def strip_block_parents(self) -> Self:
41
37
  self.parent_stack = self.parent_stack[self.block_depth - 1 :]
42
38
  self.block_depth = 1
@@ -0,0 +1,2 @@
1
+ from . import language
2
+ from .translator import FillFailedEvent, translate
@@ -6,8 +6,8 @@ EPUB 数据结构与 XML 的编码/解码转换
6
6
 
7
7
  from xml.etree.ElementTree import Element
8
8
 
9
- from .epub.metadata import MetadataField
10
- from .epub.toc import Toc
9
+ from ..epub import Toc
10
+ from ..epub.metadata import MetadataField
11
11
 
12
12
 
13
13
  def encode_toc(toc: Toc) -> Element:
@@ -1,6 +1,6 @@
1
1
  from xml.etree.ElementTree import Element
2
2
 
3
- from .xml import iter_with_stack
3
+ from ..xml import iter_with_stack
4
4
 
5
5
  _QUOTE_MAPPING = {
6
6
  # 法语引号
@@ -5,7 +5,7 @@ from importlib.metadata import version as get_package_version
5
5
  from os import PathLike
6
6
  from pathlib import Path
7
7
 
8
- from .epub import (
8
+ from ..epub import (
9
9
  Zip,
10
10
  read_metadata,
11
11
  read_toc,
@@ -13,12 +13,12 @@ from .epub import (
13
13
  write_metadata,
14
14
  write_toc,
15
15
  )
16
+ from ..llm import LLM
17
+ from ..xml import XMLLikeNode, deduplicate_ids_in_element, find_first
18
+ from ..xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
16
19
  from .epub_transcode import decode_metadata, decode_toc_list, encode_metadata, encode_toc_list
17
- from .llm import LLM
18
20
  from .punctuation import unwrap_french_quotes
19
- from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
20
21
  from .xml_interrupter import XMLInterrupter
21
- from .xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
22
22
 
23
23
 
24
24
  class _ElementType(Enum):
@@ -40,7 +40,8 @@ def translate(
40
40
  submit: SubmitKind,
41
41
  user_prompt: str | None = None,
42
42
  max_retries: int = 5,
43
- max_group_tokens: int = 1200,
43
+ max_group_tokens: int = 2600,
44
+ concurrency: int = 1,
44
45
  llm: LLM | None = None,
45
46
  translation_llm: LLM | None = None,
46
47
  fill_llm: LLM | None = None,
@@ -62,7 +63,7 @@ def translate(
62
63
  ignore_translated_error=False,
63
64
  max_retries=max_retries,
64
65
  max_fill_displaying_errors=10,
65
- max_group_tokens=max_group_tokens,
66
+ max_group_score=max_group_tokens,
66
67
  cache_seed_content=f"{_get_version()}:{target_language}",
67
68
  )
68
69
  with Zip(
@@ -92,6 +93,7 @@ def translate(
92
93
  current_progress = 0.0
93
94
 
94
95
  for translated_elem, context in translator.translate_elements(
96
+ concurrency=concurrency,
95
97
  interrupt_source_text_segments=interrupter.interrupt_source_text_segments,
96
98
  interrupt_translated_text_segments=interrupter.interrupt_translated_text_segments,
97
99
  interrupt_block_element=interrupter.interrupt_block_element,
@@ -2,8 +2,8 @@ from collections.abc import Generator, Iterable
2
2
  from typing import cast
3
3
  from xml.etree.ElementTree import Element
4
4
 
5
- from .segment import TextSegment
6
- from .utils import ensure_list, normalize_whitespace
5
+ from ..segment import TextSegment
6
+ from ..utils import ensure_list, normalize_whitespace
7
7
 
8
8
  _ID_KEY = "__XML_INTERRUPTER_ID"
9
9
  _MATH_TAG = "math"
@@ -1,6 +1,6 @@
1
1
  from .const import *
2
2
  from .deduplication import *
3
- from .firendly import *
3
+ from .friendly import *
4
4
  from .inline import *
5
5
  from .utils import *
6
6
  from .xml import *
@@ -0,0 +1,52 @@
1
+ from collections import deque
2
+ from collections.abc import Callable, Iterable
3
+ from concurrent.futures import Future, ThreadPoolExecutor
4
+ from typing import TypeVar
5
+
6
+ P = TypeVar("P")
7
+ R = TypeVar("R")
8
+
9
+
10
+ def run_concurrency(
11
+ parameters: Iterable[P],
12
+ execute: Callable[[P], R],
13
+ concurrency: int,
14
+ ) -> Iterable[R]:
15
+ assert concurrency >= 1, "the concurrency must be at least 1"
16
+ # Fast path: concurrency == 1, no thread overhead
17
+ if concurrency == 1:
18
+ for param in parameters:
19
+ yield execute(param)
20
+ return
21
+
22
+ executor = ThreadPoolExecutor(max_workers=concurrency)
23
+ did_shutdown = False
24
+ try:
25
+ futures: deque[Future[R]] = deque()
26
+ params_iter = iter(parameters)
27
+ for _ in range(concurrency):
28
+ try:
29
+ param = next(params_iter)
30
+ future = executor.submit(execute, param)
31
+ futures.append(future)
32
+ except StopIteration:
33
+ break
34
+
35
+ while futures:
36
+ future = futures.popleft()
37
+ yield future.result()
38
+ try:
39
+ param = next(params_iter)
40
+ new_future = executor.submit(execute, param)
41
+ futures.append(new_future)
42
+ except StopIteration:
43
+ pass
44
+
45
+ except KeyboardInterrupt:
46
+ executor.shutdown(wait=False, cancel_futures=True)
47
+ did_shutdown = True
48
+ raise
49
+
50
+ finally:
51
+ if not did_shutdown:
52
+ executor.shutdown(wait=True)
@@ -0,0 +1,164 @@
1
+ from collections.abc import Generator
2
+ from dataclasses import dataclass
3
+ from enum import Enum, auto
4
+
5
+ from tiktoken import Encoding
6
+
7
+ from ..segment import InlineSegment, TextSegment
8
+ from .common import DATA_ORIGIN_LEN_KEY
9
+
10
+ _ID_WEIGHT = 80
11
+ _ELLIPSIS = "..."
12
+
13
+
14
+ @dataclass
15
+ class ScoreSegment:
16
+ text_segment: TextSegment
17
+ left_parents: list[InlineSegment]
18
+ right_parents: list[InlineSegment]
19
+ text_tokens: list[int]
20
+ score: int
21
+
22
+
23
+ def expand_to_score_segments(encoding: Encoding, inline_segment: InlineSegment) -> Generator[ScoreSegment, None, None]:
24
+ for i, score_segment in enumerate(_do_expand_inline_segment(inline_segment)):
25
+ xml_text = "".join(
26
+ _render_score_segment(
27
+ score_segment=score_segment,
28
+ is_first=(i == 0),
29
+ )
30
+ )
31
+ score_segment.text_tokens = encoding.encode(score_segment.text_segment.text)
32
+ score_segment.score = len(encoding.encode(xml_text)) + sum(
33
+ _ID_WEIGHT for parent in score_segment.left_parents if parent.id is not None
34
+ )
35
+ yield score_segment
36
+
37
+
38
+ def truncate_score_segment(
39
+ encoding: Encoding,
40
+ score_segment: ScoreSegment,
41
+ remain_head: bool,
42
+ remain_score: int,
43
+ ):
44
+ fixed_score = score_segment.score - len(score_segment.text_tokens)
45
+ if remain_score <= fixed_score:
46
+ # 裁剪仅能减少 text 部分的 tokens 数。
47
+ # 而 XML 本身头尾占用的 tokens 数,以及 ID 占用加权分属于 fixed_score 部分,无法裁剪
48
+ # 当发现将文字删光后才能达标时,不如直接放弃整段内容
49
+ return None
50
+
51
+ remain_text_tokens_count = remain_score - fixed_score
52
+
53
+ # remain_text_tokens_count cannot be 0 here
54
+ if remain_head:
55
+ remain_text = encoding.decode(score_segment.text_tokens[:remain_text_tokens_count])
56
+ else:
57
+ remain_text = encoding.decode(score_segment.text_tokens[-remain_text_tokens_count:])
58
+
59
+ if not remain_text.strip():
60
+ return None
61
+
62
+ if remain_head:
63
+ remain_text = f"{remain_text} {_ELLIPSIS}"
64
+ else:
65
+ remain_text = f"{_ELLIPSIS} {remain_text}"
66
+
67
+ text_segment = score_segment.text_segment.clone()
68
+ text_segment.text = remain_text
69
+
70
+ return ScoreSegment(
71
+ text_segment=text_segment,
72
+ left_parents=score_segment.left_parents,
73
+ right_parents=score_segment.right_parents,
74
+ text_tokens=encoding.encode(remain_text),
75
+ score=remain_text_tokens_count + fixed_score,
76
+ )
77
+
78
+
79
+ def _render_score_segment(score_segment: ScoreSegment, is_first: bool):
80
+ for i, parent in enumerate(score_segment.left_parents):
81
+ yield "<"
82
+ yield parent.parent.tag
83
+ if parent.id is not None:
84
+ yield ' id="99"'
85
+ if is_first and i == 0:
86
+ yield " "
87
+ yield DATA_ORIGIN_LEN_KEY
88
+ yield '="9999"'
89
+ yield ">"
90
+
91
+ yield score_segment.text_segment.text
92
+
93
+ for parent in reversed(score_segment.right_parents):
94
+ yield "</"
95
+ yield parent.parent.tag
96
+ yield ">"
97
+
98
+
99
+ def _do_expand_inline_segment(inline_segment: InlineSegment):
100
+ text_segment: TextSegment | None = None
101
+ left_parents: list[InlineSegment] = []
102
+ right_parents: list[InlineSegment] = []
103
+
104
+ for item in _expand_as_wrapped(inline_segment):
105
+ if isinstance(item, TextSegment):
106
+ if text_segment is None:
107
+ text_segment = item
108
+ else:
109
+ yield ScoreSegment(
110
+ text_segment=text_segment,
111
+ left_parents=left_parents,
112
+ right_parents=right_parents,
113
+ text_tokens=[],
114
+ score=0,
115
+ )
116
+ text_segment = item
117
+ left_parents = []
118
+ right_parents = []
119
+
120
+ elif isinstance(item, tuple):
121
+ child_inline_segment, orientation = item
122
+ if orientation == _Orientation.UP:
123
+ if text_segment is not None:
124
+ yield ScoreSegment(
125
+ text_segment=text_segment,
126
+ left_parents=left_parents,
127
+ right_parents=right_parents,
128
+ text_tokens=[],
129
+ score=0,
130
+ )
131
+ text_segment = None
132
+ left_parents = []
133
+ right_parents = []
134
+ left_parents.append(child_inline_segment)
135
+
136
+ elif orientation == _Orientation.DOWN:
137
+ if text_segment is None:
138
+ left_parents.clear()
139
+ else:
140
+ right_parents.append(child_inline_segment)
141
+
142
+ if text_segment is not None:
143
+ yield ScoreSegment(
144
+ text_segment=text_segment,
145
+ left_parents=left_parents,
146
+ right_parents=right_parents,
147
+ text_tokens=[],
148
+ score=0,
149
+ )
150
+
151
+
152
+ class _Orientation(Enum):
153
+ DOWN = auto()
154
+ UP = auto()
155
+
156
+
157
+ def _expand_as_wrapped(inline_segment: InlineSegment):
158
+ yield (inline_segment, _Orientation.UP)
159
+ for child in inline_segment.children:
160
+ if isinstance(child, InlineSegment):
161
+ yield from _expand_as_wrapped(child)
162
+ elif isinstance(child, TextSegment):
163
+ yield child
164
+ yield (inline_segment, _Orientation.DOWN)