epub-translator 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. {epub_translator-0.1.5 → epub_translator-0.1.7}/PKG-INFO +37 -9
  2. {epub_translator-0.1.5 → epub_translator-0.1.7}/README.md +35 -8
  3. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/__init__.py +1 -2
  4. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/data/translate.jinja +3 -0
  5. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/__init__.py +1 -1
  6. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/context.py +10 -1
  7. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/core.py +30 -3
  8. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/__init__.py +1 -0
  9. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/inline_segment.py +11 -1
  10. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/text_segment.py +5 -10
  11. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/utils.py +0 -16
  12. epub_translator-0.1.7/epub_translator/translation/__init__.py +2 -0
  13. {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/epub_transcode.py +2 -2
  14. {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/punctuation.py +1 -1
  15. {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/translator.py +8 -6
  16. {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/xml_interrupter.py +52 -28
  17. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/__init__.py +1 -1
  18. epub_translator-0.1.7/epub_translator/xml/inline.py +113 -0
  19. epub_translator-0.1.7/epub_translator/xml_translator/concurrency.py +52 -0
  20. epub_translator-0.1.7/epub_translator/xml_translator/score.py +164 -0
  21. epub_translator-0.1.7/epub_translator/xml_translator/stream_mapper.py +284 -0
  22. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/submitter.py +5 -5
  23. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/translator.py +12 -18
  24. {epub_translator-0.1.5 → epub_translator-0.1.7}/pyproject.toml +2 -1
  25. epub_translator-0.1.5/epub_translator/data/mmltex/README.md +0 -67
  26. epub_translator-0.1.5/epub_translator/data/mmltex/cmarkup.xsl +0 -1106
  27. epub_translator-0.1.5/epub_translator/data/mmltex/entities.xsl +0 -459
  28. epub_translator-0.1.5/epub_translator/data/mmltex/glayout.xsl +0 -222
  29. epub_translator-0.1.5/epub_translator/data/mmltex/mmltex.xsl +0 -36
  30. epub_translator-0.1.5/epub_translator/data/mmltex/scripts.xsl +0 -375
  31. epub_translator-0.1.5/epub_translator/data/mmltex/tables.xsl +0 -130
  32. epub_translator-0.1.5/epub_translator/data/mmltex/tokens.xsl +0 -328
  33. epub_translator-0.1.5/epub_translator/xml/inline.py +0 -67
  34. epub_translator-0.1.5/epub_translator/xml_translator/stream_mapper.py +0 -253
  35. {epub_translator-0.1.5 → epub_translator-0.1.7}/LICENSE +0 -0
  36. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/data/fill.jinja +0 -0
  37. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/common.py +0 -0
  38. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/math.py +0 -0
  39. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/metadata.py +0 -0
  40. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/spines.py +0 -0
  41. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/toc.py +0 -0
  42. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/zip.py +0 -0
  43. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/__init__.py +0 -0
  44. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/error.py +0 -0
  45. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/executor.py +0 -0
  46. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/increasable.py +0 -0
  47. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/types.py +0 -0
  48. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/block_segment.py +0 -0
  49. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/common.py +0 -0
  50. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/serial/__init__.py +0 -0
  51. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/serial/chunk.py +0 -0
  52. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/serial/segment.py +0 -0
  53. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/serial/splitter.py +0 -0
  54. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/template.py +0 -0
  55. {epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/language.py +0 -0
  56. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/utils.py +0 -0
  57. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/const.py +0 -0
  58. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/deduplication.py +0 -0
  59. {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/__init__.py +0 -0
  60. {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/decoder.py +0 -0
  61. {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/encoder.py +0 -0
  62. {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/parser.py +0 -0
  63. {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/tag.py +0 -0
  64. {epub_translator-0.1.5/epub_translator/xml/firendly → epub_translator-0.1.7/epub_translator/xml/friendly}/transform.py +0 -0
  65. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/self_closing.py +0 -0
  66. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/utils.py +0 -0
  67. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/xml.py +0 -0
  68. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/xml_like.py +0 -0
  69. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/__init__.py +0 -0
  70. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/callbacks.py +0 -0
  71. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/common.py +0 -0
  72. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/hill_climbing.py +0 -0
  73. {epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml_translator/validation.py +0 -0
{epub_translator-0.1.5 → epub_translator-0.1.7}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: epub-translator
- Version: 0.1.5
+ Version: 0.1.7
  Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
  License: MIT
  Keywords: epub,llm,translation,translator
@@ -24,6 +24,7 @@ Classifier: Topic :: Software Development :: Localization
  Classifier: Topic :: Text Processing :: Markup
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
+ Requires-Dist: mathml2latex (>=0.2.12,<0.3.0)
  Requires-Dist: openai (>=2.14.0,<3.0.0)
  Requires-Dist: resource-segmentation (>=0.0.7,<0.1.0)
  Requires-Dist: tiktoken (>=0.12.0,<1.0.0)
@@ -59,6 +60,13 @@ Translate EPUB books using Large Language Models while preserving the original t
  - **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
  - **Caching**: Built-in caching for progress recovery when translation fails

+ ## Use Cases
+
+ - **Language Learning**: Read books in their original language with side-by-side translations
+ - **Academic Research**: Access foreign literature with bilingual references
+ - **Content Localization**: Prepare books for international audiences
+ - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
+
  ## Installation

  ```bash
@@ -156,7 +164,8 @@ translate(
      submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
      user_prompt: str | None = None, # Custom translation instructions
      max_retries: int = 5, # Maximum retries for failed translations
-     max_group_tokens: int = 1200, # Maximum tokens per translation group
+     max_group_tokens: int = 2600, # Maximum tokens per translation group
+     concurrency: int = 1, # Number of concurrent translation tasks (default: 1)
      llm: LLM | None = None, # Single LLM instance for both translation and filling
      translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
      fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
@@ -356,13 +365,6 @@ llm = LLM(
  )
  ```

- ## Use Cases
-
- - **Language Learning**: Read books in their original language with side-by-side translations
- - **Academic Research**: Access foreign literature with bilingual references
- - **Content Localization**: Prepare books for international audiences
- - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
-
  ## Advanced Features

  ### Custom Translation Prompts
@@ -394,6 +396,32 @@ llm = LLM(
  )
  ```

+ ### Concurrent Translation
+
+ Speed up translation by processing multiple text segments concurrently. Use the `concurrency` parameter to control how many translation tasks run in parallel:
+
+ ```python
+ translate(
+     source_path="source.epub",
+     target_path="translated.epub",
+     target_language="English",
+     submit=SubmitKind.APPEND_BLOCK,
+     llm=llm,
+     concurrency=4, # Process 4 segments concurrently
+ )
+ ```
+
+ **Performance Tips:**
+
+ - Start with `concurrency=4` and adjust based on your API rate limits and system resources
+ - Higher concurrency values can significantly reduce translation time for large books
+ - The translation order is preserved regardless of concurrency settings
+ - Monitor your API provider's rate limits to avoid throttling
+
+ **Thread Safety:**
+
+ When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
+
  ## Related Projects

  ### PDF Craft
{epub_translator-0.1.5 → epub_translator-0.1.7}/README.md

@@ -26,6 +26,13 @@ Translate EPUB books using Large Language Models while preserving the original t
  - **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
  - **Caching**: Built-in caching for progress recovery when translation fails

+ ## Use Cases
+
+ - **Language Learning**: Read books in their original language with side-by-side translations
+ - **Academic Research**: Access foreign literature with bilingual references
+ - **Content Localization**: Prepare books for international audiences
+ - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
+
  ## Installation

  ```bash
@@ -123,7 +130,8 @@ translate(
      submit: SubmitKind, # How to insert translations (REPLACE, APPEND_TEXT, or APPEND_BLOCK)
      user_prompt: str | None = None, # Custom translation instructions
      max_retries: int = 5, # Maximum retries for failed translations
-     max_group_tokens: int = 1200, # Maximum tokens per translation group
+     max_group_tokens: int = 2600, # Maximum tokens per translation group
+     concurrency: int = 1, # Number of concurrent translation tasks (default: 1)
      llm: LLM | None = None, # Single LLM instance for both translation and filling
      translation_llm: LLM | None = None, # LLM instance for translation (overrides llm)
      fill_llm: LLM | None = None, # LLM instance for XML filling (overrides llm)
@@ -323,13 +331,6 @@ llm = LLM(
  )
  ```

- ## Use Cases
-
- - **Language Learning**: Read books in their original language with side-by-side translations
- - **Academic Research**: Access foreign literature with bilingual references
- - **Content Localization**: Prepare books for international audiences
- - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
-
  ## Advanced Features

  ### Custom Translation Prompts
@@ -361,6 +362,32 @@ llm = LLM(
  )
  ```

+ ### Concurrent Translation
+
+ Speed up translation by processing multiple text segments concurrently. Use the `concurrency` parameter to control how many translation tasks run in parallel:
+
+ ```python
+ translate(
+     source_path="source.epub",
+     target_path="translated.epub",
+     target_language="English",
+     submit=SubmitKind.APPEND_BLOCK,
+     llm=llm,
+     concurrency=4, # Process 4 segments concurrently
+ )
+ ```
+
+ **Performance Tips:**
+
+ - Start with `concurrency=4` and adjust based on your API rate limits and system resources
+ - Higher concurrency values can significantly reduce translation time for large books
+ - The translation order is preserved regardless of concurrency settings
+ - Monitor your API provider's rate limits to avoid throttling
+
+ **Thread Safety:**
+
+ When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
+
  ## Related Projects

  ### PDF Craft
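
The thread-safety note above is straightforward to satisfy with a lock around any shared state. A minimal sketch (illustrative, not from the package; the single-float `on_progress` signature is an assumption) of a thread-safe progress callback:

```python
import threading

# Shared state guarded by a lock so concurrent translation tasks
# cannot interleave their updates.
_progress_lock = threading.Lock()
_last_reported = 0.0

def on_progress(progress: float) -> None:
    """Assumed signature: a completion fraction between 0.0 and 1.0."""
    global _last_reported
    with _progress_lock:
        # Concurrent tasks may report out of order; only move forward.
        if progress > _last_reported:
            _last_reported = progress
            print(f"translated {progress * 100:.1f}%")
```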
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/__init__.py

@@ -1,6 +1,5 @@
- from . import language
  from .llm import LLM
- from .translator import FillFailedEvent, translate
+ from .translation import FillFailedEvent, language, translate
  from .xml_translator import SubmitKind

  __all__ = [
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/data/translate.jinja

@@ -13,6 +13,9 @@ Translation rules:
  {% if user_prompt -%}
  User may provide additional requirements in <rules> tags before the source text. Follow them, but prioritize the rules above if conflicts arise.

+ <rules>
+ {{ user_prompt }}
+ </rules>
  {% endif -%}

  Output only the translated text, nothing else.
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/epub/__init__.py

@@ -1,4 +1,4 @@
  from .metadata import read_metadata, write_metadata
  from .spines import search_spine_paths
- from .toc import read_toc, write_toc
+ from .toc import Toc, read_toc, write_toc
  from .zip import Zip
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/context.py

@@ -1,5 +1,6 @@
  import hashlib
  import json
+ import threading
  import uuid
  from pathlib import Path
  from typing import Self
@@ -8,6 +9,9 @@ from .executor import LLMExecutor
  from .increasable import Increasable, Increaser
  from .types import Message, MessageRole

+ # Global lock for cache file commit operations
+ _CACHE_COMMIT_LOCK = threading.Lock()
+

  class LLMContext:
      def __init__(
@@ -101,7 +105,12 @@ class LLMContext:
          # Remove the .[context-id].txt suffix to get permanent name
          permanent_name = temp_file.name.rsplit(".", 2)[0] + ".txt"
          permanent_file = temp_file.parent / permanent_name
-         temp_file.rename(permanent_file)
+
+         with _CACHE_COMMIT_LOCK:  # thread safety when running multithreaded
+             if permanent_file.exists():
+                 temp_file.unlink()
+             else:
+                 temp_file.rename(permanent_file)

      def _rollback(self) -> None:
          for temp_file in self._temp_files:
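
The hunk above serializes the cache commit so that when two threads finish the same cache entry, the first rename wins and the loser's temp file is discarded. A self-contained sketch of that first-writer-wins pattern (function and variable names illustrative, not the package's API):

```python
import threading
from pathlib import Path

_COMMIT_LOCK = threading.Lock()  # one lock for all commits, as in the diff

def commit(temp_file: Path, permanent_file: Path) -> None:
    # The data was already written to temp_file under a unique name, so
    # the only race is the publish step; the lock makes exists+rename atomic.
    with _COMMIT_LOCK:
        if permanent_file.exists():
            temp_file.unlink()  # another thread already published this entry
        else:
            temp_file.rename(permanent_file)
```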
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/llm/core.py

@@ -1,4 +1,5 @@
  import datetime
+ import threading
  from collections.abc import Generator
  from importlib.resources import files
  from logging import DEBUG, FileHandler, Formatter, Logger, getLogger
@@ -14,6 +15,11 @@ from .executor import LLMExecutor
  from .increasable import Increasable
  from .types import Message

+ # Global state for logger filename generation
+ _LOGGER_LOCK = threading.Lock()
+ _LAST_TIMESTAMP: str | None = None
+ _LOGGER_SUFFIX_ID: int = 1
+

  class LLM:
      def __init__(
@@ -95,13 +101,34 @@ class LLM:
          return dir_path.resolve()

      def _create_logger(self) -> Logger | None:
+         # pylint: disable=global-statement
+         global _LAST_TIMESTAMP, _LOGGER_SUFFIX_ID
+
          if self._logger_save_path is None:
              return None

          now = datetime.datetime.now(datetime.UTC)
-         timestamp = now.strftime("%Y-%m-%d %H-%M-%S %f")
-         file_path = self._logger_save_path / f"request {timestamp}.log"
-         logger = getLogger(f"LLM Request {timestamp}")
+         # Use second-level precision for collision detection
+         timestamp_key = now.strftime("%Y-%m-%d %H-%M-%S")
+
+         with _LOGGER_LOCK:
+             if _LAST_TIMESTAMP == timestamp_key:
+                 _LOGGER_SUFFIX_ID += 1
+                 suffix_id = _LOGGER_SUFFIX_ID
+             else:
+                 _LAST_TIMESTAMP = timestamp_key
+                 _LOGGER_SUFFIX_ID = 1
+                 suffix_id = 1
+
+         if suffix_id == 1:
+             file_name = f"request {timestamp_key}.log"
+             logger_name = f"LLM Request {timestamp_key}"
+         else:
+             file_name = f"request {timestamp_key}_{suffix_id}.log"
+             logger_name = f"LLM Request {timestamp_key}_{suffix_id}"
+
+         file_path = self._logger_save_path / file_name
+         logger = getLogger(logger_name)
          logger.setLevel(DEBUG)
          handler = FileHandler(file_path, encoding="utf-8")
          handler.setLevel(DEBUG)
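
The suffix scheme above matters because `logging.getLogger` returns the same `Logger` object for the same name; two requests that landed on the same timestamp would share one logger and attach a second `FileHandler`, duplicating records. A quick demonstration of that underlying behavior (logger names illustrative):

```python
from logging import getLogger

a = getLogger("LLM Request 2024-01-01 00-00-00")
b = getLogger("LLM Request 2024-01-01 00-00-00")
assert a is b  # same name -> same logger, so handlers would accumulate

c = getLogger("LLM Request 2024-01-01 00-00-00_2")
assert a is not c  # the _2 suffix yields a distinct logger per request
```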
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/__init__.py

@@ -21,6 +21,7 @@ from .text_segment import (
      TextPosition,
      TextSegment,
      combine_text_segments,
+     find_block_depth,
      incision_between,
      search_text_segments,
  )
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/inline_segment.py

@@ -47,6 +47,7 @@ def search_inline_segments(text_segments: Iterable[TextSegment]) -> Generator["I
          inline_segment = _pop_stack_data(stack_data)
          stack_data = None
          if inline_segment:
+             inline_segment.id = 0
              yield inline_segment

      if stack_data is None:
@@ -73,6 +74,7 @@
      if stack_data is not None:
          inline_segment = _pop_stack_data(stack_data)
          if inline_segment:
+             inline_segment.id = 0
              yield inline_segment


@@ -115,7 +117,7 @@ class InlineSegment:
          self._child_tag2ids: dict[str, list[int]] = {}
          self._child_tag2count: dict[str, int] = {}

-         next_temp_id: int = 0
+         next_temp_id: int = 1
          terms = nest((child.parent.tag, child) for child in children if isinstance(child, InlineSegment))

          for tag, child_terms in terms.items():
@@ -162,6 +164,14 @@
          elif isinstance(child, InlineSegment):
              yield from child

+     def clone(self) -> "InlineSegment":
+         cloned_segment = InlineSegment(
+             depth=len(self._parent_stack),
+             children=[child.clone() for child in self._children],
+         )
+         cloned_segment.id = self.id
+         return cloned_segment
+
      def recreate_ids(self, id_generator: IDGenerator) -> None:
          self._child_tag2count.clear()
          self._child_tag2ids.clear()
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/text_segment.py

@@ -4,7 +4,7 @@ from enum import Enum, auto
  from typing import Self
  from xml.etree.ElementTree import Element

- from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_tag, normalize_text_in_element
+ from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_element, normalize_text_in_element


  class TextPosition(Enum):
@@ -33,10 +33,6 @@ class TextSegment:
      def block_parent(self) -> Element:
          return self.parent_stack[self.block_depth - 1]

-     @property
-     def xml_text(self) -> str:
-         return "".join(_expand_xml_texts(self))
-
      def strip_block_parents(self) -> Self:
          self.parent_stack = self.parent_stack[self.block_depth - 1 :]
          self.block_depth = 1
@@ -104,7 +100,7 @@ def search_text_segments(root: Element) -> Generator[TextSegment, None, None]:
  def _search_text_segments(stack: list[Element], element: Element) -> Generator[TextSegment, None, None]:
      text = normalize_text_in_element(element.text)
      next_stack = stack + [element]
-     next_block_depth = _find_block_depth(next_stack)
+     next_block_depth = find_block_depth(next_stack)

      if text is not None:
          yield TextSegment(
@@ -129,12 +125,11 @@ def _search_text_segments(stack: list[Element], element: Element) -> Generator[T
      )


- def _find_block_depth(parent_stack: list[Element]) -> int:
+ def find_block_depth(parent_stack: list[Element]) -> int:
      index: int = 0
-     for i in range(len(parent_stack) - 1, -1, -1):
-         if not is_inline_tag(parent_stack[i].tag):
+     for i in range(len(parent_stack)):
+         if not is_inline_element(parent_stack[i]):
              index = i
-             break
      return index + 1 # depth is a count not index

{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/segment/utils.py

@@ -8,22 +8,6 @@ def element_fingerprint(element: Element) -> str:
      return f"<{element.tag} {' '.join(attrs)}/>"


- def unwrap_parents(element: Element) -> tuple[Element, list[Element]]:
-     parents: list[Element] = []
-     while True:
-         if len(element) != 1:
-             break
-         child = element[0]
-         if not element.text:
-             break
-         if not child.tail:
-             break
-         parents.append(element)
-         element = child
-     element.tail = None
-     return element, parents
-
-
  def id_in_element(element: Element) -> int | None:
      id_str = element.get(ID_KEY, None)
      if id_str is None:
epub_translator-0.1.7/epub_translator/translation/__init__.py (new file)

@@ -0,0 +1,2 @@
+ from . import language
+ from .translator import FillFailedEvent, translate
{epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/epub_transcode.py

@@ -6,8 +6,8 @@ (docstring: encode/decode conversion between EPUB data structures and XML)

  from xml.etree.ElementTree import Element

- from .epub.metadata import MetadataField
- from .epub.toc import Toc
+ from ..epub import Toc
+ from ..epub.metadata import MetadataField


  def encode_toc(toc: Toc) -> Element:
{epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/punctuation.py

@@ -1,6 +1,6 @@
  from xml.etree.ElementTree import Element

- from .xml import iter_with_stack
+ from ..xml import iter_with_stack

  _QUOTE_MAPPING = {
      # French quotation marks
{epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/translator.py

@@ -5,7 +5,7 @@ from importlib.metadata import version as get_package_version
  from os import PathLike
  from pathlib import Path

- from .epub import (
+ from ..epub import (
      Zip,
      read_metadata,
      read_toc,
@@ -13,12 +13,12 @@ from .epub import (
      write_metadata,
      write_toc,
  )
+ from ..llm import LLM
+ from ..xml import XMLLikeNode, deduplicate_ids_in_element, find_first
+ from ..xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator
  from .epub_transcode import decode_metadata, decode_toc_list, encode_metadata, encode_toc_list
- from .llm import LLM
  from .punctuation import unwrap_french_quotes
- from .xml import XMLLikeNode, deduplicate_ids_in_element, find_first
  from .xml_interrupter import XMLInterrupter
- from .xml_translator import FillFailedEvent, SubmitKind, TranslationTask, XMLTranslator


  class _ElementType(Enum):
@@ -40,7 +40,8 @@ def translate(
      submit: SubmitKind,
      user_prompt: str | None = None,
      max_retries: int = 5,
-     max_group_tokens: int = 1200,
+     max_group_tokens: int = 2600,
+     concurrency: int = 1,
      llm: LLM | None = None,
      translation_llm: LLM | None = None,
      fill_llm: LLM | None = None,
@@ -62,7 +63,7 @@
          ignore_translated_error=False,
          max_retries=max_retries,
          max_fill_displaying_errors=10,
-         max_group_tokens=max_group_tokens,
+         max_group_score=max_group_tokens,
          cache_seed_content=f"{_get_version()}:{target_language}",
      )
      with Zip(
@@ -92,6 +93,7 @@
          current_progress = 0.0

          for translated_elem, context in translator.translate_elements(
+             concurrency=concurrency,
              interrupt_source_text_segments=interrupter.interrupt_source_text_segments,
              interrupt_translated_text_segments=interrupter.interrupt_translated_text_segments,
              interrupt_block_element=interrupter.interrupt_block_element,
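
`translate_elements` now accepts a `concurrency` argument; the new `xml_translator/concurrency.py` backing it is not shown in this diff. A minimal sketch of the general technique — fanning segments out to a thread pool while keeping results in input order, which is what the README's ordering guarantee describes — assuming nothing about the package's actual implementation:

```python
from collections.abc import Callable, Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor
from typing import TypeVar

S = TypeVar("S")
T = TypeVar("T")

def map_in_order(fn: Callable[[S], T], items: Iterable[S], concurrency: int) -> Iterator[T]:
    if concurrency <= 1:
        yield from map(fn, items)  # sequential path, matching concurrency=1
        return
    with ThreadPoolExecutor(max_workers=concurrency) as pool:
        # Executor.map yields results in submission order even when workers
        # finish out of order, so downstream consumers see segments in order.
        yield from pool.map(fn, items)
```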
{epub_translator-0.1.5/epub_translator → epub_translator-0.1.7/epub_translator/translation}/xml_interrupter.py

@@ -1,9 +1,13 @@
  from collections.abc import Generator, Iterable
  from typing import cast
- from xml.etree.ElementTree import Element
+ from xml.etree.ElementTree import Element, tostring

- from .segment import TextSegment
- from .utils import ensure_list, normalize_whitespace
+ from bs4 import BeautifulSoup
+ from mathml2latex.mathml import process_mathml
+
+ from ..segment import TextSegment, combine_text_segments, find_block_depth
+ from ..utils import ensure_list
+ from ..xml import clone_element

  _ID_KEY = "__XML_INTERRUPTER_ID"
  _MATH_TAG = "math"
@@ -37,8 +41,10 @@ class XMLInterrupter:
      def interrupt_block_element(self, element: Element) -> Element:
          interrupted_element = self._placeholder2interrupted.pop(id(element), None)
          if interrupted_element is None:
+             element.attrib.pop(_ID_KEY, None)
              return element
          else:
+             interrupted_element.attrib.pop(_ID_KEY, None)
              return interrupted_element

      def _expand_source_text_segment(self, text_segment: TextSegment):
@@ -81,14 +87,18 @@
                  _ID_KEY: cast(str, interrupted_element.get(_ID_KEY)),
              },
          )
+         interrupted_display = interrupted_element.get("display", None)
+         if interrupted_display is not None:
+             placeholder_element.set("display", interrupted_display)
+
          raw_parent_stack = text_segment.parent_stack[:interrupted_index]
          parent_stack = raw_parent_stack + [placeholder_element]
          merged_text_segment = TextSegment(
-             text="".join(t.text for t in text_segments),
+             text=self._render_latex(text_segments),
              parent_stack=parent_stack,
              left_common_depth=text_segments[0].left_common_depth,
              right_common_depth=text_segments[-1].right_common_depth,
-             block_depth=len(parent_stack),
+             block_depth=find_block_depth(parent_stack),
              position=text_segments[0].position,
          )
          self._placeholder2interrupted[id(placeholder_element)] = interrupted_element
@@ -116,8 +126,8 @@
          # The original parent stack is fully popped; only the stack relative to the interrupted element is kept, matching the format expected on the translated side
          text_segment.left_common_depth = max(0, text_segment.left_common_depth - interrupted_index)
          text_segment.right_common_depth = max(0, text_segment.right_common_depth - interrupted_index)
-         text_segment.block_depth = 1
          text_segment.parent_stack = text_segment.parent_stack[interrupted_index:]
+         text_segment.block_depth = find_block_depth(text_segment.parent_stack)

          return merged_text_segment

@@ -129,37 +139,51 @@
                  break
          return interrupted_index

+     def _render_latex(self, text_segments: list[TextSegment]) -> str:
+         math_element, _ = next(combine_text_segments(text_segments))
+         while math_element.tag != _MATH_TAG:
+             if len(math_element) == 0:
+                 return ""
+             math_element = math_element[0]
+
+         math_element = clone_element(math_element)
+         math_element.attrib.pop(_ID_KEY, None)
+         math_element.tail = None
+         latex: str | None = None
+         try:
+             mathml_str = tostring(math_element, encoding="unicode")
+             soup = BeautifulSoup(mathml_str, "html.parser")
+             latex = process_mathml(soup)
+         except Exception:
+             pass
+
+         if latex is None:
+             latex = "".join(t.text for t in text_segments)
+         elif math_element.get("display", None) == "inline":
+             latex = f"${latex}$"
+         else:
+             latex = f"$${latex}$$"
+
+         return f" {latex} "
+
      def _expand_translated_text_segment(self, text_segment: TextSegment):
-         interrupted_id = text_segment.block_parent.attrib.pop(_ID_KEY, None)
+         parent_element = text_segment.parent_stack[-1]
+         interrupted_id = parent_element.attrib.pop(_ID_KEY, None)
          if interrupted_id is None:
              yield text_segment
              return

-         raw_text_segments = self._raw_text_segments.pop(interrupted_id, None)
-         if not raw_text_segments:
+         if parent_element is text_segment.block_parent:
+             # Block-level math needs to be hidden
              return

-         raw_block = raw_text_segments[0].parent_stack[0]
-         if not self._is_inline_math(raw_block):
+         raw_text_segments = self._raw_text_segments.pop(interrupted_id, None)
+         if not raw_text_segments:
+             yield text_segment
              return

          for raw_text_segment in raw_text_segments:
+             text_basic_parent_stack = text_segment.parent_stack[:-1]
              raw_text_segment.block_parent.attrib.pop(_ID_KEY, None)
+             raw_text_segment.parent_stack = text_basic_parent_stack + raw_text_segment.parent_stack
              yield raw_text_segment
-
-     def _has_no_math_texts(self, element: Element):
-         if element.tag == _MATH_TAG:
-             return True
-         if element.text and normalize_whitespace(element.text).strip():
-             return False
-         for child_element in element:
-             if not self._has_no_math_texts(child_element):
-                 return False
-             if child_element.tail and normalize_whitespace(child_element.tail).strip():
-                 return False
-         return True
-
-     def _is_inline_math(self, element: Element) -> bool:
-         if element.tag != _MATH_TAG:
-             return False
-         return element.get("display", "").lower() != "block"
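
For reference, the MathML-to-LaTeX step that `_render_latex` performs can be exercised standalone with the same calls the hunk introduces (`BeautifulSoup` plus `mathml2latex`'s `process_mathml`); the sample markup and the exact LaTeX output are illustrative:

```python
from bs4 import BeautifulSoup
from mathml2latex.mathml import process_mathml

# An inline fraction a/b in MathML, as it might appear in an EPUB chapter.
mathml = '<math display="inline"><mfrac><mi>a</mi><mi>b</mi></mfrac></math>'
soup = BeautifulSoup(mathml, "html.parser")
latex = process_mathml(soup)
print(f"${latex}$")  # display="inline" gets single-dollar delimiters
```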
{epub_translator-0.1.5 → epub_translator-0.1.7}/epub_translator/xml/__init__.py

@@ -1,6 +1,6 @@
  from .const import *
  from .deduplication import *
- from .firendly import *
+ from .friendly import *
  from .inline import *
  from .utils import *
  from .xml import *