epub-translator 0.0.7__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. epub_translator/__init__.py +4 -2
  2. epub_translator/data/fill.jinja +66 -0
  3. epub_translator/data/mmltex/README.md +67 -0
  4. epub_translator/data/mmltex/cmarkup.xsl +1106 -0
  5. epub_translator/data/mmltex/entities.xsl +459 -0
  6. epub_translator/data/mmltex/glayout.xsl +222 -0
  7. epub_translator/data/mmltex/mmltex.xsl +36 -0
  8. epub_translator/data/mmltex/scripts.xsl +375 -0
  9. epub_translator/data/mmltex/tables.xsl +130 -0
  10. epub_translator/data/mmltex/tokens.xsl +328 -0
  11. epub_translator/data/translate.jinja +15 -12
  12. epub_translator/epub/__init__.py +4 -2
  13. epub_translator/epub/common.py +43 -0
  14. epub_translator/epub/math.py +193 -0
  15. epub_translator/epub/placeholder.py +53 -0
  16. epub_translator/epub/spines.py +42 -0
  17. epub_translator/epub/toc.py +505 -0
  18. epub_translator/epub/zip.py +67 -0
  19. epub_translator/iter_sync.py +24 -0
  20. epub_translator/language.py +23 -0
  21. epub_translator/llm/__init__.py +2 -1
  22. epub_translator/llm/core.py +233 -0
  23. epub_translator/llm/error.py +38 -35
  24. epub_translator/llm/executor.py +159 -136
  25. epub_translator/llm/increasable.py +28 -28
  26. epub_translator/llm/types.py +17 -0
  27. epub_translator/serial/__init__.py +2 -0
  28. epub_translator/serial/chunk.py +52 -0
  29. epub_translator/serial/segment.py +17 -0
  30. epub_translator/serial/splitter.py +50 -0
  31. epub_translator/template.py +35 -33
  32. epub_translator/translator.py +208 -178
  33. epub_translator/utils.py +7 -0
  34. epub_translator/xml/__init__.py +4 -3
  35. epub_translator/xml/deduplication.py +38 -0
  36. epub_translator/xml/firendly/__init__.py +2 -0
  37. epub_translator/xml/firendly/decoder.py +75 -0
  38. epub_translator/xml/firendly/encoder.py +84 -0
  39. epub_translator/xml/firendly/parser.py +177 -0
  40. epub_translator/xml/firendly/tag.py +118 -0
  41. epub_translator/xml/firendly/transform.py +36 -0
  42. epub_translator/xml/xml.py +52 -0
  43. epub_translator/xml/xml_like.py +231 -0
  44. epub_translator/xml_translator/__init__.py +3 -0
  45. epub_translator/xml_translator/const.py +2 -0
  46. epub_translator/xml_translator/fill.py +128 -0
  47. epub_translator/xml_translator/format.py +282 -0
  48. epub_translator/xml_translator/fragmented.py +125 -0
  49. epub_translator/xml_translator/group.py +183 -0
  50. epub_translator/xml_translator/progressive_locking.py +256 -0
  51. epub_translator/xml_translator/submitter.py +102 -0
  52. epub_translator/xml_translator/text_segment.py +263 -0
  53. epub_translator/xml_translator/translator.py +179 -0
  54. epub_translator/xml_translator/utils.py +29 -0
  55. epub_translator-0.1.1.dist-info/METADATA +283 -0
  56. epub_translator-0.1.1.dist-info/RECORD +58 -0
  57. epub_translator/data/format.jinja +0 -33
  58. epub_translator/epub/content_parser.py +0 -162
  59. epub_translator/epub/html/__init__.py +0 -1
  60. epub_translator/epub/html/dom_operator.py +0 -68
  61. epub_translator/epub/html/empty_tags.py +0 -23
  62. epub_translator/epub/html/file.py +0 -80
  63. epub_translator/epub/html/texts_searcher.py +0 -46
  64. epub_translator/llm/node.py +0 -201
  65. epub_translator/translation/__init__.py +0 -2
  66. epub_translator/translation/chunk.py +0 -118
  67. epub_translator/translation/splitter.py +0 -78
  68. epub_translator/translation/store.py +0 -36
  69. epub_translator/translation/translation.py +0 -231
  70. epub_translator/translation/types.py +0 -45
  71. epub_translator/translation/utils.py +0 -11
  72. epub_translator/xml/decoder.py +0 -71
  73. epub_translator/xml/encoder.py +0 -95
  74. epub_translator/xml/parser.py +0 -172
  75. epub_translator/xml/tag.py +0 -93
  76. epub_translator/xml/transform.py +0 -34
  77. epub_translator/xml/utils.py +0 -12
  78. epub_translator/zip_context.py +0 -74
  79. epub_translator-0.0.7.dist-info/METADATA +0 -170
  80. epub_translator-0.0.7.dist-info/RECORD +0 -36
  81. {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/LICENSE +0 -0
  82. {epub_translator-0.0.7.dist-info → epub_translator-0.1.1.dist-info}/WHEEL +0 -0
@@ -1,35 +1,35 @@
class Increaser:
    """Holds a value that moves halfway towards the end of its range on each step.

    When constructed with ``None`` there is no range and no value; ``current``
    stays ``None`` and ``increase`` does nothing.
    """

    def __init__(self, value_range: tuple[float, float] | None):
        self._value_range: tuple[float, float] | None = value_range
        # Start from the first element of the range, if a range was given.
        self._current: float | None = None if value_range is None else value_range[0]

    @property
    def current(self) -> float | None:
        """The present value, or ``None`` when no range was configured."""
        return self._current

    def increase(self):
        """Advance the current value by half of the remaining distance to the range end."""
        if self._value_range is None or self._current is None:
            return
        upper = self._value_range[1]
        self._current = self._current + 0.5 * (upper - self._current)
15
15
 
class Increasable:
    """Normalises a number or a ``(begin, end)`` pair into a float range.

    A single number becomes the degenerate range ``(value, value)``; ``None``
    (or any other unsupported type) leaves the range unset.
    """

    def __init__(self, param: float | tuple[float, float] | None):
        self._value_range: tuple[float, float] | None = None

        if isinstance(param, int):
            # Integers are accepted for convenience and stored as floats.
            value = float(param)
            self._value_range = (value, value)
        elif isinstance(param, float):
            self._value_range = (param, param)
        elif isinstance(param, tuple):
            if len(param) != 2:
                raise ValueError(f"Expected a tuple of length 2, got {len(param)}")
            # Only integer bounds are converted; other values are stored as given.
            begin, end = (float(bound) if isinstance(bound, int) else bound for bound in param)
            self._value_range = (begin, end)

    def context(self) -> Increaser:
        """Create a fresh ``Increaser`` over the stored range."""
        return Increaser(self._value_range)
@@ -0,0 +1,17 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum, auto
3
+ from typing import TypeVar
4
+
5
+ R = TypeVar("R")
6
+
7
+
8
@dataclass
class Message:
    """One chat message: the role that produced it and its text."""

    # Forward reference: ``MessageRole`` is declared later in this module.
    role: "MessageRole"
    # Text content of the message.
    message: str
12
+
13
+
14
class MessageRole(Enum):
    """Role attached to a ``Message``; member values are assigned by ``auto()``."""

    SYSTEM = auto()
    USER = auto()
    ASSISTANT = auto()
@@ -0,0 +1,2 @@
1
+ from .segment import ST, S, Segment, T
2
+ from .splitter import split
@@ -0,0 +1,52 @@
1
+ from collections.abc import Generator, Iterable
2
+ from dataclasses import dataclass
3
+ from typing import Generic
4
+
5
+ from resource_segmentation import Resource, Segment, split
6
+
7
+ from .segment import ST
8
+
9
+ _INCISION = 0
10
+
11
+
12
@dataclass
class Chunk(Generic[ST]):
    """One group of segments: the body plus the head and tail segments around it."""

    # Token budgets for the head and tail, taken from the group's
    # ``head_remain_count`` / ``tail_remain_count`` by ``split_into_chunks``.
    head_remain_tokens: int
    tail_remain_tokens: int
    # Segment payloads of the group's head, body and tail, in that order.
    head: list[ST]
    body: list[ST]
    tail: list[ST]
19
+
20
+
21
def split_into_chunks(segments: Iterable[ST], max_group_tokens: int) -> Generator[Chunk[ST], None, None]:
    """Group ``segments`` with ``resource_segmentation.split`` and yield each group as a ``Chunk``.

    Every segment is wrapped in a ``Resource`` whose ``count`` is the segment's
    token count and whose payload is the segment itself; the payloads are
    unwrapped again when the ``Chunk`` is built.
    """
    resources = (
        Resource(
            count=segment.tokens,
            start_incision=_INCISION,
            end_incision=_INCISION,
            payload=segment,
        )
        for segment in segments
    )
    groups = split(
        max_segment_count=max_group_tokens,
        gap_rate=0.07,
        tail_rate=0.5,
        border_incision=_INCISION,
        resources=resources,
    )
    for group in groups:
        head, body, tail = (
            list(_expand_payloads(part))
            for part in (group.head, group.body, group.tail)
        )
        yield Chunk(
            head_remain_tokens=group.head_remain_count,
            tail_remain_tokens=group.tail_remain_count,
            head=head,
            body=body,
            tail=tail,
        )
44
+
45
+
46
def _expand_payloads(target: list[Resource[ST] | Segment[ST]]) -> Generator[ST, None, None]:
    """Yield the payload of every resource in ``target``.

    A ``Resource`` contributes its own payload; a ``Segment`` contributes the
    payload of each of its resources. Items of any other type are ignored.
    """
    for item in target:
        if isinstance(item, Resource):
            yield item.payload
            continue
        if isinstance(item, Segment):
            yield from (resource.payload for resource in item.resources)
@@ -0,0 +1,17 @@
1
+ from typing import Generic, Protocol, Self, TypeVar, runtime_checkable
2
+
3
+ S = TypeVar("S", covariant=True)
4
+ T = TypeVar("T")
5
+ ST = TypeVar("ST", bound="Segment")
6
+
7
+
8
+ @runtime_checkable
9
+ class Segment(Protocol, Generic[S]):
10
+ @property
11
+ def tokens(self) -> int: ...
12
+
13
+ @property
14
+ def payload(self) -> S: ...
15
+
16
+ def truncate_after_head(self, remain_tokens: int) -> Self: ...
17
+ def truncate_before_tail(self, remain_tokens: int) -> Self: ...
@@ -0,0 +1,50 @@
1
+ from collections.abc import Callable, Generator, Iterable
2
+
3
+ from .chunk import split_into_chunks
4
+ from .segment import ST, T
5
+
6
+
7
def split(
    segments: Iterable[ST],
    transform: Callable[[list[ST]], list[T]],
    max_group_tokens: int,
) -> Generator[T, None, None]:
    """Transform ``segments`` chunk by chunk, yielding only the results for each chunk's body.

    For every chunk the head and tail are trimmed to their remaining token
    budgets, ``transform`` is applied to ``head + body + tail``, and the items
    at the head and tail positions of the result are dropped. This slicing
    relies on ``transform`` returning one item per input segment.
    """
    for chunk in split_into_chunks(segments, max_group_tokens):
        head = list(
            _truncate_extra_content(
                segments=chunk.head,
                remain_left=False,
                remain_tokens=chunk.head_remain_tokens,
            )
        )
        tail = list(
            _truncate_extra_content(
                segments=chunk.tail,
                remain_left=True,
                remain_tokens=chunk.tail_remain_tokens,
            )
        )
        transformed = transform(head + chunk.body + tail)

        # A stop of ``-0`` would slice to nothing, so use an open end when there is no tail.
        stop = -len(tail) if tail else None
        yield from transformed[len(head) : stop]
33
+
34
+
35
+ def _truncate_extra_content(segments: list[ST], remain_left: bool, remain_tokens: int):
36
+ tokens_list: list[int] = [segment.tokens for segment in segments]
37
+ segments = list(segments)
38
+ for tokens in tokens_list if remain_left else reversed(tokens_list):
39
+ if remain_tokens <= 0:
40
+ break
41
+ next_segment = segments.pop(0) if remain_left else segments.pop()
42
+ if remain_tokens < tokens:
43
+ if remain_left:
44
+ next_segment = next_segment.truncate_after_head(remain_tokens)
45
+ else:
46
+ next_segment = next_segment.truncate_before_tail(remain_tokens)
47
+ remain_tokens = 0
48
+ else:
49
+ remain_tokens -= tokens
50
+ yield next_segment
@@ -1,50 +1,52 @@
1
1
  import re
2
-
3
- from typing import Tuple, Callable
2
+ from collections.abc import Callable
4
3
  from pathlib import Path
5
- from jinja2 import select_autoescape, Environment, BaseLoader, TemplateNotFound
4
+
5
+ from jinja2 import BaseLoader, Environment, TemplateNotFound, select_autoescape
6
6
 
7
7
 
def create_env(dir_path: Path) -> Environment:
    """Create a Jinja2 ``Environment`` whose templates are loaded from ``dir_path``.

    Autoescaping uses Jinja's ``select_autoescape()`` defaults; block tags have
    their trailing newline trimmed and the template's final newline is kept.
    """
    loader = _DSLoader(dir_path)
    return Environment(
        loader=loader,
        autoescape=select_autoescape(),
        trim_blocks=True,
        keep_trailing_newline=True,
    )
15
+
16
+
17
+ _LoaderResult = tuple[str, str | None, Callable[[], bool] | None]
15
18
 
16
- _LoaderResult = Tuple[str, str | None, Callable[[], bool] | None]
17
19
 
class _DSLoader(BaseLoader):
    """Jinja2 loader that serves ``<name>.jinja`` files from one directory."""

    def __init__(self, dir_path: Path):
        super().__init__()
        self._dir_path: Path = dir_path

    def get_source(self, environment: Environment, template: str) -> _LoaderResult:
        """Return ``(source, filename, uptodate)`` for ``template``.

        Raises:
            TemplateNotFound: if the name is rejected, resolves outside the
                template directory, or no such file exists.
        """
        template = self._norm_template(template)
        root_path = self._dir_path.resolve()
        target_path = (root_path / template).resolve()

        # The prefix check in _norm_template only catches a leading "./" or "../".
        # Names such as "a/../../x" or "//abs/path" would still resolve outside
        # the template directory, so verify containment on the resolved path.
        if not target_path.is_relative_to(root_path):
            raise TemplateNotFound(f"invalid path {template}")

        if not target_path.exists():
            raise TemplateNotFound(f"cannot find {template}")

        return self._get_source_with_path(target_path)

    def _norm_template(self, template: str) -> str:
        """Normalise a template name to a relative ``<name>.jinja`` path.

        Names starting with one or more dots followed by ``/`` are rejected; a
        single leading ``/`` is dropped and the ``.jinja`` suffix is added when
        missing (an existing suffix is matched case-insensitively).
        """
        if bool(re.match(r"^\.+/", template)):
            raise TemplateNotFound(f"invalid path {template}")

        template = re.sub(r"^/", "", template)
        template = re.sub(r"\.jinja$", "", template, flags=re.IGNORECASE)
        template = f"{template}.jinja"

        return template

    def _get_source_with_path(self, path: Path) -> _LoaderResult:
        """Read ``path`` as UTF-8 and build the loader result tuple for it."""
        mtime = path.stat().st_mtime
        with open(path, encoding="utf-8") as f:
            source = f.read()

        def is_updated() -> bool:
            # Jinja calls this to decide whether the cached template is still
            # current: True means "unchanged". A file that can no longer be
            # stat'ed is reported as stale instead of raising from the cache check.
            try:
                return mtime == path.stat().st_mtime
            except OSError:
                return False

        return source, str(path), is_updated