epub-translator 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- epub_translator/__init__.py +3 -2
- epub_translator/data/format.jinja +33 -0
- epub_translator/data/translate.jinja +15 -0
- epub_translator/epub/__init__.py +2 -3
- epub_translator/epub/content_parser.py +2 -2
- epub_translator/epub/html/__init__.py +1 -1
- epub_translator/epub/html/file.py +56 -41
- epub_translator/epub/html/texts_searcher.py +2 -1
- epub_translator/llm/__init__.py +1 -0
- epub_translator/llm/error.py +49 -0
- epub_translator/llm/executor.py +147 -0
- epub_translator/llm/increasable.py +35 -0
- epub_translator/llm/node.py +197 -0
- epub_translator/template.py +50 -0
- epub_translator/translation/__init__.py +2 -0
- epub_translator/translation/chunk.py +120 -0
- epub_translator/translation/splitter.py +77 -0
- epub_translator/translation/store.py +37 -0
- epub_translator/translation/translation.py +192 -0
- epub_translator/translation/types.py +23 -0
- epub_translator/translation/utils.py +11 -0
- epub_translator/translator.py +169 -0
- epub_translator/xml/__init__.py +3 -0
- epub_translator/xml/decoder.py +71 -0
- epub_translator/xml/encoder.py +95 -0
- epub_translator/xml/parser.py +172 -0
- epub_translator/xml/tag.py +93 -0
- epub_translator/xml/transform.py +34 -0
- epub_translator/xml/utils.py +12 -0
- epub_translator/zip_context.py +74 -0
- {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/METADATA +5 -7
- epub_translator-0.0.3.dist-info/RECORD +36 -0
- epub_translator/epub/types.py +0 -4
- epub_translator/file.py +0 -124
- epub_translator/translator/__init__.py +0 -1
- epub_translator/translator/group.py +0 -140
- epub_translator/translator/llm.py +0 -58
- epub_translator/translator/nlp.py +0 -36
- epub_translator/translator/translator.py +0 -159
- epub_translator-0.0.1.dist-info/RECORD +0 -19
- {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/LICENSE +0 -0
- {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from typing import Generator, Iterable
|
|
2
|
+
from xml.etree.ElementTree import Element
|
|
3
|
+
|
|
4
|
+
from .tag import Tag, TagKind
|
|
5
|
+
from .parser import parse_tags
|
|
6
|
+
from .transform import tag_to_element
|
|
7
|
+
from .utils import clone
|
|
8
|
+
|
|
9
|
+
# why implement XML decoding?
|
|
10
|
+
# https://github.com/oomol-lab/pdf-craft/issues/149
|
|
11
|
+
def decode_friendly(chars: Iterable[str], tags: Iterable[str] | str = ()) -> Generator[Element, None, None]:
    """Yield detached copies of the elements found in ``chars``.

    Args:
        chars: Character stream handed to ``_collect_elements``.
        tags: A single tag name or an iterable of tag names to keep. When
            empty (the default) every collected element is yielded.

    Yields:
        A ``clone`` of each collected element whose tag passes the filter.
    """
    wanted = {tags} if isinstance(tags, str) else set(tags)
    for element in _collect_elements(chars):
        # An empty filter means "accept everything".
        if not wanted or element.tag in wanted:
            yield clone(element)
|
|
20
|
+
|
|
21
|
+
def _collect_elements(chars: Iterable[str]) -> Generator[Element, None, None]:
    """Build elements from the tag/text stream and yield each one as it closes.

    An element is yielded when its closing tag (or its own self-closing tag)
    is seen, so nested elements are yielded before their parents. Text that
    follows a closed element becomes that element's tail; text that follows
    an opening tag becomes the text of the innermost open element.
    """
    open_elements: list[Element] = []
    last_closed: Element | None = None

    for cell in parse_tags(chars):
        if not isinstance(cell, Tag):
            # Plain text: attach it to whatever was most recently seen.
            if last_closed is not None:
                _append_to_tail(last_closed, cell)
            elif open_elements:
                open_elements[-1].text = cell
            continue

        element = tag_to_element(cell)

        if cell.kind == TagKind.CLOSING:
            popped = _pop_element(cell.name, open_elements)
            if popped is not None:
                yield popped
                last_closed = popped
            elif last_closed is not None:
                # Closing tag with no matching opener: keep its `proto` text
                # on the tail of the previously closed element.
                _append_to_tail(last_closed, cell.proto)
            continue

        # Opening and self-closing tags both become children of the
        # innermost open element, if there is one.
        if open_elements:
            open_elements[-1].append(element)

        if cell.kind == TagKind.SELF_CLOSING:
            yield element
            last_closed = element
        elif cell.kind == TagKind.OPENING:
            open_elements.append(element)
            last_closed = None
|
|
51
|
+
|
|
52
|
+
def _append_to_tail(element: Element, text: str) -> None:
|
|
53
|
+
if element.tail:
|
|
54
|
+
element.tail += text
|
|
55
|
+
else:
|
|
56
|
+
element.tail = text
|
|
57
|
+
|
|
58
|
+
def _pop_element(tag_name: str, opening_stack: list[Element]) -> Element | None:
|
|
59
|
+
index = -1
|
|
60
|
+
for i in range(len(opening_stack) - 1, -1, -1):
|
|
61
|
+
opening_element = opening_stack[i]
|
|
62
|
+
if tag_name == opening_element.tag:
|
|
63
|
+
index = i
|
|
64
|
+
break
|
|
65
|
+
if index == -1:
|
|
66
|
+
return None
|
|
67
|
+
|
|
68
|
+
popped: Element | None = None
|
|
69
|
+
for _ in range(len(opening_stack) - index):
|
|
70
|
+
popped = opening_stack.pop()
|
|
71
|
+
return popped
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from io import StringIO
|
|
2
|
+
from typing import Callable
|
|
3
|
+
from html import escape as escape_html
|
|
4
|
+
from xml.etree.ElementTree import Element
|
|
5
|
+
|
|
6
|
+
from .tag import Tag, TagKind
|
|
7
|
+
from .parser import parse_tags
|
|
8
|
+
from .transform import element_to_tag
|
|
9
|
+
|
|
10
|
+
# why implement XML encoding?
|
|
11
|
+
# https://github.com/oomol-lab/pdf-craft/issues/149
|
|
12
|
+
def encode_friendly(element: Element, indent: int = 2) -> str:
    """Serialise ``element`` to indented XML text.

    Text content is passed through ``_escape_text``, which escapes only the
    tag-like fragments it finds rather than every special character.

    Args:
        element: Root element to serialise.
        indent: Number of spaces per nesting level.
    """
    out = StringIO()
    _encode_element(out, element, indent, 0, _escape_text)
    return out.getvalue()
|
|
22
|
+
|
|
23
|
+
def _escape_text(text: str) -> str:
    """HTML-escape the tags that ``parse_tags`` finds in ``text``.

    Everything that is not recognised as a tag is copied through unchanged.
    """
    return "".join(
        escape_html(str(cell)) if isinstance(cell, Tag) else cell
        for cell in parse_tags(text)
    )
|
|
30
|
+
|
|
31
|
+
def encode(element: Element, indent: int = 2) -> str:
    """Serialise ``element`` to indented XML text, HTML-escaping all text.

    Args:
        element: Root element to serialise.
        indent: Number of spaces per nesting level.
    """
    out = StringIO()
    _encode_element(out, element, indent, 0, escape_html)
    return out.getvalue()
|
|
41
|
+
|
|
42
|
+
# Leaf elements whose stripped text is at most this long (and has no line
# break) are written on a single line.
_TINY_TEXT_LEN = 35


def _encode_element(
    buffer: StringIO,
    element: Element,
    indent: int,
    depth: int,
    escape: Callable[[str], str],
) -> None:
    """Write ``element`` and its subtree to ``buffer`` as indented XML.

    Args:
        buffer: Destination for the serialised text.
        element: Element to write.
        indent: Number of spaces per nesting level.
        depth: Nesting level of ``element``; the root is level 0.
        escape: Applied to element text and child tails before writing.

    Raises:
        ValueError: Propagated from ``element_to_tag`` when it rejects a name
            or attribute value.
    """
    _write_indent(buffer, indent, depth)

    # No children and no text at all: a single self-closing tag is enough.
    if len(element) == 0 and not element.text:
        buffer.write(str(element_to_tag(element, TagKind.SELF_CLOSING)))
        return

    text = (element.text or "").strip()
    opening_tag = element_to_tag(element, TagKind.OPENING)
    closing_tag = element_to_tag(element, TagKind.CLOSING)
    buffer.write(str(opening_tag))

    child_depth = depth + 1
    fits_on_one_line = (
        len(element) == 0
        and "\n" not in text
        and len(text) <= _TINY_TEXT_LEN
    )

    if text:
        if not fits_on_one_line:
            buffer.write("\n")
            _write_indent(buffer, indent, child_depth)
        buffer.write(escape(text))

    for child in element:
        buffer.write("\n")
        _encode_element(buffer, child, indent, child_depth, escape)
        tail = (child.tail or "").strip()
        if tail:
            # Tail text sits on its own line at the children's depth.
            buffer.write("\n")
            _write_indent(buffer, indent, child_depth)
            buffer.write(escape(tail))

    if not fits_on_one_line:
        buffer.write("\n")
        _write_indent(buffer, indent, depth)

    buffer.write(str(closing_tag))
|
|
92
|
+
|
|
93
|
+
def _write_indent(buffer: StringIO, indent: int, depth: int) -> None:
|
|
94
|
+
for _ in range(indent * depth):
|
|
95
|
+
buffer.write(" ")
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
from io import StringIO
|
|
2
|
+
from typing import Generator, Iterable
|
|
3
|
+
from enum import auto, Enum
|
|
4
|
+
from .tag import is_valid_name_char, is_valid_value_char, Tag, TagKind
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
_SPACES = (" ", "\n")
|
|
8
|
+
|
|
9
|
+
class _Phase(Enum):
|
|
10
|
+
OUTSIDE = auto()
|
|
11
|
+
LEFT_BRACKET = auto()
|
|
12
|
+
LEFT_SLASH = auto()
|
|
13
|
+
TAG_NAME = auto()
|
|
14
|
+
TAG_GAP = auto()
|
|
15
|
+
ATTRIBUTE_NAME = auto()
|
|
16
|
+
ATTRIBUTE_NAME_EQUAL = auto()
|
|
17
|
+
ATTRIBUTE_VALUE = auto()
|
|
18
|
+
MUST_CLOSING_SIGN = auto()
|
|
19
|
+
|
|
20
|
+
class _ParsedResult(Enum):
|
|
21
|
+
Continue = auto()
|
|
22
|
+
Success = auto()
|
|
23
|
+
Failed = auto()
|
|
24
|
+
|
|
25
|
+
def parse_tags(chars: Iterable[str]) -> Generator[str | Tag, None, None]:
    """Split ``chars`` into text runs (``str``) and recognised ``Tag`` objects.

    A fresh parser is used for every call, so calls do not share state.
    """
    parser = _XMLTagsParser()
    yield from parser.do(chars)
|
|
27
|
+
|
|
28
|
+
class _XMLTagsParser:
    """Character-driven state machine that separates text from tags.

    Characters outside any tag accumulate in ``_outside_buffer``. From a "<"
    onwards the raw characters of the tag candidate accumulate in
    ``_tag_buffer`` while ``_tag`` is filled in. A candidate that turns out
    not to be a tag is moved back into the outside text verbatim.
    """

    def __init__(self):
        self._outside_buffer: StringIO = StringIO()
        self._tag_buffer: StringIO = StringIO()
        self._tag: Tag | None = None
        self._phase: _Phase = _Phase.OUTSIDE

    def do(self, chars: Iterable[str]) -> Generator[str | Tag, None, None]:
        """Consume ``chars`` and yield text runs and tags in input order.

        Each item of ``chars`` is compared against single characters such as
        "<", so it is presumably expected to be one character long.
        """
        for char in chars:
            parsed_result = self._parse_char(char)
            if parsed_result == _ParsedResult.Failed and char == "<":
                # The candidate was rejected by a "<", which may itself open
                # a real tag (e.g. "<<b>" or "x <y <b>"), so parsing restarts
                # at this character instead of treating it as plain text.
                self._restart_at_bracket(char)
            else:
                yield from self._generate_by_result(parsed_result)

        # Input ended inside an unfinished candidate: it was text after all.
        self._outside_buffer.write(self._tag_buffer.getvalue())
        outside_text = self._outside_buffer.getvalue()
        if outside_text != "":
            yield outside_text

    def _restart_at_bracket(self, char: str) -> None:
        """Discard the failed candidate and start a new one at ``char`` ("<")."""
        rejected = self._tag_buffer.getvalue()
        # `_parse_char` already appended the failing "<" to the tag buffer;
        # everything before it is ordinary text.
        self._outside_buffer.write(rejected[:-1])
        self._clear_buffer(self._tag_buffer)
        self._phase = _Phase.OUTSIDE
        self._parse_char(char)

    def _parse_char(self, char: str) -> _ParsedResult:
        """Advance the state machine by one character and report the outcome."""
        parsed_result: _ParsedResult = _ParsedResult.Continue

        if self._phase == _Phase.OUTSIDE:
            if char != "<":
                self._outside_buffer.write(char)
            else:
                # Start a new tag candidate; its kind is refined later.
                self._phase = _Phase.LEFT_BRACKET
                self._tag_buffer.write(char)
                self._tag = Tag(
                    kind=TagKind.OPENING,
                    name="",
                    proto="",
                    attributes=[],
                )
        else:
            # Keep the raw text so a failed candidate can be restored.
            self._tag_buffer.write(char)

            if self._phase == _Phase.LEFT_BRACKET:
                if char == "/":
                    self._tag.kind = TagKind.CLOSING
                    self._phase = _Phase.LEFT_SLASH
                elif is_valid_name_char(char):
                    self._tag.name += char
                    self._phase = _Phase.TAG_NAME
                else:
                    parsed_result = _ParsedResult.Failed

            elif self._phase == _Phase.LEFT_SLASH:
                if is_valid_name_char(char):
                    self._tag.name += char
                    self._phase = _Phase.TAG_NAME
                else:
                    parsed_result = _ParsedResult.Failed

            elif self._phase == _Phase.TAG_NAME:
                if char in _SPACES:
                    self._phase = _Phase.TAG_GAP
                elif is_valid_name_char(char):
                    self._tag.name += char
                elif char == ">":
                    parsed_result = _ParsedResult.Success
                elif char == "/" and self._tag.kind == TagKind.OPENING:
                    self._tag.kind = TagKind.SELF_CLOSING
                    self._phase = _Phase.MUST_CLOSING_SIGN
                else:
                    parsed_result = _ParsedResult.Failed

            elif self._phase == _Phase.TAG_GAP:
                if char in _SPACES:
                    pass
                elif is_valid_name_char(char):
                    # First character of a new attribute name.
                    self._tag.attributes.append((char, ""))
                    self._phase = _Phase.ATTRIBUTE_NAME
                elif char == ">":
                    parsed_result = _ParsedResult.Success
                elif char == "/" and self._tag.kind == TagKind.OPENING:
                    self._tag.kind = TagKind.SELF_CLOSING
                    self._phase = _Phase.MUST_CLOSING_SIGN
                else:
                    parsed_result = _ParsedResult.Failed

            elif self._phase == _Phase.ATTRIBUTE_NAME:
                if is_valid_name_char(char):
                    attr_name, attr_value = self._tag.attributes[-1]
                    self._tag.attributes[-1] = (attr_name + char, attr_value)
                elif char == "=":
                    self._phase = _Phase.ATTRIBUTE_NAME_EQUAL
                else:
                    parsed_result = _ParsedResult.Failed

            elif self._phase == _Phase.ATTRIBUTE_NAME_EQUAL:
                # Only double-quoted values directly after "=" are accepted.
                if char == "\"":
                    self._phase = _Phase.ATTRIBUTE_VALUE
                else:
                    parsed_result = _ParsedResult.Failed

            elif self._phase == _Phase.ATTRIBUTE_VALUE:
                if is_valid_value_char(char):
                    attr_name, attr_value = self._tag.attributes[-1]
                    self._tag.attributes[-1] = (attr_name, attr_value + char)
                elif char == "\"":
                    self._phase = _Phase.TAG_GAP
                else:
                    parsed_result = _ParsedResult.Failed

            elif self._phase == _Phase.MUST_CLOSING_SIGN:
                if char == ">":
                    parsed_result = _ParsedResult.Success
                else:
                    parsed_result = _ParsedResult.Failed

        return parsed_result

    def _generate_by_result(self, parsed_result: _ParsedResult) -> Generator[str | Tag, None, None]:
        """Emit pending output for ``parsed_result`` and reset the tag state."""
        if parsed_result == _ParsedResult.Success:
            assert self._tag is not None
            if self._is_tag_valid(self._tag):
                # Flush the text that preceded the tag, then the tag itself.
                outside_text = self._outside_buffer.getvalue()
                self._clear_buffer(self._outside_buffer)
                self._clear_buffer(self._tag_buffer)
                if outside_text != "":
                    yield outside_text
                yield self._tag
            else:
                # Syntactically complete but rejected: keep it as text.
                self._tag.proto = self._tag_buffer.getvalue()
                self._outside_buffer.write(self._tag.proto)
                self._clear_buffer(self._tag_buffer)
            self._tag = None
            self._phase = _Phase.OUTSIDE

        elif parsed_result == _ParsedResult.Failed:
            # The candidate was not a tag; its raw characters are text.
            self._outside_buffer.write(self._tag_buffer.getvalue())
            self._clear_buffer(self._tag_buffer)
            self._phase = _Phase.OUTSIDE

    def _is_tag_valid(self, tag: Tag) -> bool:
        """Return False for closing tags with attributes or for invalid names."""
        if tag.kind == TagKind.CLOSING and len(tag.attributes) > 0:
            return False
        if tag.find_invalid_name() is not None:
            return False
        return True

    def _clear_buffer(self, buffer: StringIO):
        """Empty ``buffer`` and move its write position back to the start."""
        buffer.truncate(0)
        buffer.seek(0)
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from io import StringIO
|
|
2
|
+
from enum import auto, Enum
|
|
3
|
+
from typing import Generator
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TagKind(Enum):
    """Syntactic form of a tag."""

    OPENING = auto()       # <name ...>
    CLOSING = auto()       # </name>
    SELF_CLOSING = auto()  # <name .../>
|
|
11
|
+
|
|
12
|
+
@dataclass
class Tag:
    """A single tag: its kind, name, raw text and ordered attributes."""

    kind: TagKind
    name: str
    # Raw source text of the tag; empty unless a producer fills it in.
    proto: str
    # (name, value) pairs in the order they should be written.
    attributes: list[tuple[str, str]]

    def __str__(self):
        """Render the tag, e.g. ``<a x="1">``, ``</a>`` or ``<a x="1"/>``."""
        prefix = "</" if self.kind == TagKind.CLOSING else "<"
        suffix = "/>" if self.kind == TagKind.SELF_CLOSING else ">"
        attrs = " ".join(
            f'{attr_name}="{attr_value}"'
            for attr_name, attr_value in self.attributes
        )
        if attrs:
            return f"{prefix}{self.name} {attrs}{suffix}"
        return f"{prefix}{self.name}{suffix}"

    def find_invalid_name(self) -> str | None:
        """Return the first invalid tag or attribute name, or ``None``.

        A name is valid when every character satisfies ``is_valid_name_char``
        and it follows this subset of the XML naming rules
        (https://www.w3schools.com/xml/xml_elements.asp):
        - Names must not be empty.
        - Names must start with a letter (a-z, A-Z) or an underscore (_).
        """
        for name in self._iter_tag_names():
            # Names are checked with the *name* alphabet. Value-only
            # characters such as "/", "," and "." must not appear in a name.
            if not all(is_valid_name_char(c) for c in name):
                return name
            if name == "":
                return name
            first = name[0]
            if first == "_" or "a" <= first <= "z" or "A" <= first <= "Z":
                continue
            return name

        return None

    def find_invalid_attr_value(self) -> tuple[str, str] | None:
        """Return the first ``(name, value)`` whose value has an invalid char."""
        for attr_name, attr_value in self.attributes:
            if not all(is_valid_value_char(c) for c in attr_value):
                return attr_name, attr_value
        return None

    def _iter_tag_names(self) -> Generator[str, None, None]:
        """Yield the tag name followed by every attribute name."""
        yield self.name
        for attr_name, _ in self.attributes:
            yield attr_name
|
|
70
|
+
|
|
71
|
+
def is_valid_value_char(char: str) -> bool:
    """Return True if ``char`` may appear in an attribute value.

    Accepts every name character plus ",", "." and "/".
    """
    return is_valid_name_char(char) or char in (",", ".", "/")
|
|
81
|
+
|
|
82
|
+
def is_valid_name_char(char: str) -> bool:
    """Return True if ``char`` is an ASCII letter, a digit, "_" or "-"."""
    return (
        "a" <= char <= "z"
        or "A" <= char <= "Z"
        or "0" <= char <= "9"
        or char == "_"
        or char == "-"
    )
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from xml.etree.ElementTree import Element
|
|
2
|
+
from .tag import Tag, TagKind
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def tag_to_element(tag: Tag) -> Element:
    """Create an ``Element`` carrying the name and attributes of ``tag``.

    The tag's kind and ``proto`` are not transferred.
    """
    return Element(tag.name, dict(tag.attributes))
|
|
10
|
+
|
|
11
|
+
def element_to_tag(element: Element, kind: TagKind, proto: str = "") -> Tag:
    """Build a validated ``Tag`` of the given ``kind`` from ``element``.

    Attributes are copied in sorted name order; closing tags get none.

    Raises:
        ValueError: If the tag name, an attribute name or an attribute value
            is rejected by the ``Tag`` validators.
    """
    attributes: list[tuple[str, str]] = []
    if kind != TagKind.CLOSING:
        attributes = [
            (attr_name, element.get(attr_name, ""))
            for attr_name in sorted(element.keys())
        ]
    tag = Tag(
        kind=kind,
        name=element.tag,
        proto=proto,
        attributes=attributes,
    )

    # To make LLM easier to understand, the naming here is restricted in a more strict way.
    # https://github.com/oomol-lab/pdf-craft/issues/149
    invalid_name = tag.find_invalid_name()
    if invalid_name is not None:
        raise ValueError(f"find invalid tag name or attribute name: {invalid_name}")

    invalid_attr_pair = tag.find_invalid_attr_value()
    if invalid_attr_pair is not None:
        attr_name, attr_value = invalid_attr_pair
        raise ValueError(f"find invalid attribute value: {attr_name}=\"{attr_value}\"")

    return tag
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from xml.etree.ElementTree import Element
|
|
2
|
+
|
|
3
|
+
def clone(element: Element) -> Element:
    """Return a deep copy of ``element`` and its subtree.

    Tag, attributes and text are copied at every level. Tails are copied for
    descendants only; the tail of ``element`` itself is not copied.
    """
    duplicate = Element(element.tag, dict(element.items()))
    duplicate.text = element.text
    for child in element:
        child_copy = clone(child)
        child_copy.tail = child.tail
        duplicate.append(child_copy)
    return duplicate
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import zipfile
|
|
2
|
+
|
|
3
|
+
from typing import Generator, Callable
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from lxml.etree import parse
|
|
6
|
+
from .epub import EpubContent, HTMLFile
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ZipContext:
    """An EPUB unpacked into a working directory, with helpers to edit and repack it."""

    def __init__(self, epub_path: Path, temp_dir: Path):
        """Extract ``epub_path`` into ``temp_dir`` and load its content index.

        Args:
            epub_path: The EPUB (zip) file to unpack.
            temp_dir: Directory that receives the extracted files.
        """
        with zipfile.ZipFile(epub_path, "r") as zip_ref:
            # extractall() sanitises member names (absolute paths and ".."
            # components), so a crafted archive cannot write outside
            # temp_dir, and it streams members instead of loading each one
            # fully into memory.
            zip_ref.extractall(temp_dir)

        self._temp_dir: Path = temp_dir
        self._epub_content: EpubContent = EpubContent(temp_dir)

    def archive(self, saved_path: Path):
        """Pack every file under the working directory into ``saved_path``."""
        mimetype_path = self._temp_dir / "mimetype"
        with zipfile.ZipFile(saved_path, "w") as zip_file:
            # EPUB containers expect "mimetype" to be the first entry and to
            # be stored uncompressed, so it is written before anything else.
            if mimetype_path.is_file():
                zip_file.write(
                    filename=mimetype_path,
                    arcname="mimetype",
                    compress_type=zipfile.ZIP_STORED,
                )
            # Sorted for a reproducible entry order.
            for file_path in sorted(self._temp_dir.rglob("*")):
                if not file_path.is_file() or file_path == mimetype_path:
                    continue
                relative_path = file_path.relative_to(self._temp_dir)
                zip_file.write(
                    filename=file_path,
                    arcname=relative_path.as_posix(),
                )

    def search_spine_paths(self) -> Generator[Path, None, None]:
        """Yield the path of every spine item whose media type is XHTML."""
        for spine in self._epub_content.spines:
            if spine.media_type == "application/xhtml+xml":
                yield Path(spine.path)

    def read_spine_file(self, spine_path: Path) -> HTMLFile:
        """Read ``spine_path`` as UTF-8 text and wrap it in an ``HTMLFile``."""
        with open(spine_path, "r", encoding="utf-8") as file:
            return HTMLFile(file.read())

    def write_spine_file(self, spine_path: Path, file: HTMLFile):
        """Write ``file.file_content`` to ``spine_path`` as UTF-8 text."""
        with open(spine_path, "w", encoding="utf-8") as f:
            f.write(file.file_content)

    def replace_ncx(self, replace: Callable[[list[str]], list[str]]):
        """Rewrite the ``text`` nodes of the NCX file using ``replace``.

        ``replace`` receives the current texts (``None`` becomes "") and
        returns the replacement for each, in the same order. Does nothing
        when the book has no NCX file.
        """
        ncx_path = self._epub_content.ncx_path
        if ncx_path is None:
            return

        tree = parse(ncx_path)
        root = tree.getroot()
        default_ns = root.nsmap.get(None)
        if default_ns is None:
            # No default namespace: a None URI cannot be used for the "ns"
            # prefix, so query the un-namespaced element name instead.
            text_doms = list(root.xpath("//text"))
        else:
            text_doms = list(root.xpath("//ns:text", namespaces={"ns": default_ns}))
        text_list = [text_dom.text or "" for text_dom in text_doms]

        # zip() pairs each node with its own original text and translation
        # and tolerates a `replace` result of a different length.
        for text_dom, origin, target in zip(text_doms, text_list, replace(text_list)):
            text_dom.text = self._link_translated(origin, target)

        # Write real UTF-8 with an XML declaration rather than relying on
        # the serialiser's defaults.
        tree.write(ncx_path, pretty_print=True, encoding="utf-8", xml_declaration=True)

    def _link_translated(self, origin: str, target: str) -> str:
        """Combine ``origin`` and ``target`` as "origin - target".

        When either side is empty, or both are equal, only one text is
        returned so no dangling separator is produced.
        """
        if not origin:
            return target
        if not target or origin == target:
            return origin
        return f"{origin} - {target}"
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: epub-translator
|
|
3
|
-
Version: 0.0.1
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
|
|
5
5
|
License: MIT
|
|
6
|
+
Keywords: epub,llm,translation,translator
|
|
6
7
|
Author: Tao Zeyu
|
|
7
8
|
Author-email: i@taozeyu.com
|
|
8
9
|
Maintainer: Tao Zeyu
|
|
@@ -13,14 +14,11 @@ Classifier: Programming Language :: Python :: 3
|
|
|
13
14
|
Classifier: Programming Language :: Python :: 3.10
|
|
14
15
|
Classifier: Programming Language :: Python :: 3.11
|
|
15
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
-
Requires-Dist:
|
|
17
|
-
Requires-Dist: langchain
|
|
18
|
-
Requires-Dist: langid (>=1.1.6,<2.0.0)
|
|
17
|
+
Requires-Dist: jinja2 (>=3.1.5,<4.0.0)
|
|
18
|
+
Requires-Dist: langchain[openai] (>=0.3.21,<0.4.0)
|
|
19
19
|
Requires-Dist: lxml (>=6.0.0,<7.0.0)
|
|
20
|
-
Requires-Dist: resource-segmentation (==0.0.
|
|
21
|
-
Requires-Dist: spacy (>=3.8.7,<4.0.0)
|
|
20
|
+
Requires-Dist: resource-segmentation (==0.0.2)
|
|
22
21
|
Requires-Dist: tiktoken (>=0.9.0,<0.10.0)
|
|
23
|
-
Requires-Dist: tqdm (>=4.67.1,<5.0.0)
|
|
24
22
|
Project-URL: Homepage, https://hub.oomol.com/package/book-translator
|
|
25
23
|
Project-URL: Repository, https://github.com/oomol-flows/books-translator
|
|
26
24
|
Description-Content-Type: text/markdown
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
epub_translator/__init__.py,sha256=69k-70xbqxLgTvLyJE_MPFyymmPgyUAFg4mCSfEn49k,106
|
|
2
|
+
epub_translator/data/format.jinja,sha256=1dXiMAhznuHWA2vpDx-968pm95FCXCRV3cKVmy3EXYA,3710
|
|
3
|
+
epub_translator/data/translate.jinja,sha256=XWiMgeQS3tKCnh9cC6p1KQs9kqb1GO6m_j_cNeDq8lQ,1768
|
|
4
|
+
epub_translator/epub/__init__.py,sha256=vqD29XWxgcfHw8LFCHQxupQeJufEEoS5_KHNphwfEYU,73
|
|
5
|
+
epub_translator/epub/content_parser.py,sha256=Y-2fEcACCAl_ucA6VwwOoQ95BMHr9f-hryuxnsYNONM,4403
|
|
6
|
+
epub_translator/epub/html/__init__.py,sha256=Yyg0TrWjecolaJP6qLzDnOlhi7SrkW0Is7wh9Fr0-Bg,26
|
|
7
|
+
epub_translator/epub/html/dom_operator.py,sha256=Ryayv6hG0jEXv7RkXrZTbIP54P0fyPTbMVbymMtBUnU,1935
|
|
8
|
+
epub_translator/epub/html/empty_tags.py,sha256=GSSe-CV4YkUhWv4F0fiiRsf2vz0ZBAsC21Ovnqo5oIA,601
|
|
9
|
+
epub_translator/epub/html/file.py,sha256=AqUV-Tmptk5J2EYmw3oRVsLjGSqEMNz5rItnoRbGstc,2477
|
|
10
|
+
epub_translator/epub/html/texts_searcher.py,sha256=vamO99pki6_sX2PeKCJk7mPwHdApZq1sOgSYDTPckx8,1376
|
|
11
|
+
epub_translator/llm/__init__.py,sha256=wMBWLgh5iLNQBioniSOmWC83NS7RLM41hIs1V1uZiWI,21
|
|
12
|
+
epub_translator/llm/error.py,sha256=fG0A3z69YoSNu0MNVWVFMtHCB_4fpOvAEb0Kajn9OHc,1401
|
|
13
|
+
epub_translator/llm/executor.py,sha256=vwHqtlvCDHjDXLcvvKstlcQ5MfAGNPz1RKbq8W6WwKs,4378
|
|
14
|
+
epub_translator/llm/increasable.py,sha256=Dpu5z4JK5h1OtLorZgsOAdRFeTH2LOkdroasgmCWAIo,1136
|
|
15
|
+
epub_translator/llm/node.py,sha256=IKgdWoBwiejHOJ7akv8AiXUpyFlv2U4fGllt7ZLE3M0,5970
|
|
16
|
+
epub_translator/template.py,sha256=GdV3QnypProKFCMH1kBNfdt6wiShygP_-xGnE5EOUwU,1460
|
|
17
|
+
epub_translator/translation/__init__.py,sha256=mudXLDVSIG0XTLoHUIos0-wtQCnL9ZreuHsTHcVKjnE,73
|
|
18
|
+
epub_translator/translation/chunk.py,sha256=obrkx_yCeGMeikinfIx0NRvMo2kQBwXVbCdJbeT-ERA,3576
|
|
19
|
+
epub_translator/translation/splitter.py,sha256=xOaP1p3lqY95CR0vDXdeGUMHYObiqs3y093EUAxJ-jI,2676
|
|
20
|
+
epub_translator/translation/store.py,sha256=1FmksPAUj0mt3tN8Jdb_L1ovaI1p_5OhTWgxbIDl0SI,1133
|
|
21
|
+
epub_translator/translation/translation.py,sha256=_qiw6s_z_Tv4VmIP1U-_YhDYNiKEDNfigHIIGKa41fU,5734
|
|
22
|
+
epub_translator/translation/types.py,sha256=vDW5bVqYwngW_YUgf0SgfZ5zIFWUxcbBGO1U9Dsxc0o,499
|
|
23
|
+
epub_translator/translation/utils.py,sha256=G6Gqq6mot3lgFA-jqUD0UqtDS0GC1wrb9DnK7rTxJNs,223
|
|
24
|
+
epub_translator/translator.py,sha256=oh7PdCijUSGU6f72hzc6doJdMWnAqg6zHRFIj3aeTjc,5332
|
|
25
|
+
epub_translator/xml/__init__.py,sha256=o2_qwUYU_MUcyfmfKkiOQ-cKUQyl4PiRL8YHVzCTAZU,106
|
|
26
|
+
epub_translator/xml/decoder.py,sha256=UlqgmEKQDzxt3lvBeNGHgZP6jznmnq_1HLJuAe5X0C4,2181
|
|
27
|
+
epub_translator/xml/encoder.py,sha256=p4A7GRSOM2i0WOh1lLtEdTTg2gXSQrxDdzMgUqbiV18,2428
|
|
28
|
+
epub_translator/xml/parser.py,sha256=3JuFLFLX4w98pzME72ywdpq0JzSBgzwZMFbH8IfB9T4,5545
|
|
29
|
+
epub_translator/xml/tag.py,sha256=QLZImF0PtYyiASI7swrB8DL_qUwcYpU6cL68jEXDnvg,2353
|
|
30
|
+
epub_translator/xml/transform.py,sha256=vS_a4d_o2Qqf9B6k2CovQVLUknp6TyUi3FyLOu21Vio,1126
|
|
31
|
+
epub_translator/xml/utils.py,sha256=KDNGWHwaIiFKS27sjZF0e-bBSjeTxzceae_aeuj4wzI,384
|
|
32
|
+
epub_translator/zip_context.py,sha256=7_05kycmADb4-vxHkw_DX__vkKOxT4zo9pr2a8F4L_U,2409
|
|
33
|
+
epub_translator-0.0.3.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
|
|
34
|
+
epub_translator-0.0.3.dist-info/METADATA,sha256=HuD7ogzPi96e8ZKjAJyTARhSeR4xmu3vZt0Q3Vmg0Hk,2342
|
|
35
|
+
epub_translator-0.0.3.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
36
|
+
epub_translator-0.0.3.dist-info/RECORD,,
|
epub_translator/epub/types.py
DELETED