epub-translator 0.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ from .translator import Translator
2
+ from .file import translate_epub_file, ProgressReporter
@@ -0,0 +1,3 @@
1
+ from .content_parser import EpubContent
2
+ from .types import Translate, ReportProgress
3
+ from .html import translate_html
@@ -0,0 +1,162 @@
1
+ import os
2
+ import re
3
+
4
+ from lxml.etree import parse, Element, QName
5
+ from html import escape
6
+
7
+
8
+ # TODO replace with XML
9
+ class Spine:
10
+ def __init__(self, folder_path, base_path, item):
11
+ self._folder_path = folder_path
12
+ self._base_path = base_path
13
+ self.href = item.get("href")
14
+ self.media_type = item.get("media-type")
15
+
16
+ @property
17
+ def path(self):
18
+ path = os.path.join(self._base_path, self.href)
19
+ path = os.path.abspath(path)
20
+
21
+ if os.path.exists(path):
22
+ return path
23
+
24
+ path = os.path.join(self._folder_path, self.href)
25
+ path = os.path.abspath(path)
26
+ return path
27
+
28
+ class EpubContent:
29
+ def __init__(self, path: str):
30
+ self.folder_path = path
31
+ self._content_path = self._find_content_path(path)
32
+ self._tree = parse(self._content_path)
33
+ self._namespaces = { "ns": self._tree.getroot().nsmap.get(None) }
34
+ self._spine = self._tree.xpath("//ns:spine", namespaces=self._namespaces)[0]
35
+ self._metadata = self._tree.xpath("//ns:metadata", namespaces=self._namespaces)[0]
36
+ self._manifest = self._tree.xpath("//ns:manifest", namespaces=self._namespaces)[0]
37
+
38
+ def save(self):
39
+ self._tree.write(self._content_path, pretty_print=True)
40
+
41
+ def _find_content_path(self, path: str) -> str:
42
+ root = parse(os.path.join(path, "META-INF", "container.xml")).getroot()
43
+ rootfile = root.xpath(
44
+ "//ns:container/ns:rootfiles/ns:rootfile",
45
+ namespaces={ "ns": root.nsmap.get(None) },
46
+ )[0]
47
+ full_path = rootfile.attrib["full-path"]
48
+ joined_path = os.path.join(path, full_path)
49
+
50
+ return os.path.abspath(joined_path)
51
+
52
+ @property
53
+ def ncx_path(self):
54
+ ncx_dom = self._manifest.find(".//*[@id=\"ncx\"]")
55
+ if ncx_dom is not None:
56
+ href_path = ncx_dom.get("href")
57
+ base_path = os.path.dirname(self._content_path)
58
+ path = os.path.join(base_path, href_path)
59
+ path = os.path.abspath(path)
60
+
61
+ if os.path.exists(path):
62
+ return path
63
+
64
+ path = os.path.join(self.folder_path, href_path)
65
+ path = os.path.abspath(path)
66
+ return path
67
+
68
+ @property
69
+ def spines(self):
70
+ idref_dict = {}
71
+ index = 0
72
+
73
+ for child in self._spine.iterchildren():
74
+ id = child.get("idref")
75
+ idref_dict[id] = index
76
+ index += 1
77
+
78
+ items = [None for _ in range(index)]
79
+ spines = []
80
+
81
+ for child in self._manifest.iterchildren():
82
+ id = child.get("id")
83
+ if id in idref_dict:
84
+ index = idref_dict[id]
85
+ items[index] = child
86
+
87
+ base_path = os.path.dirname(self._content_path)
88
+
89
+ for item in items:
90
+ if item is not None:
91
+ spines.append(Spine(
92
+ folder_path=self.folder_path,
93
+ base_path=base_path,
94
+ item=item,
95
+ ))
96
+
97
+ return spines
98
+
99
+ @property
100
+ def title(self):
101
+ title_dom = self._get_title()
102
+ if title_dom is None:
103
+ return None
104
+ return title_dom.text
105
+
106
+ @title.setter
107
+ def title(self, title: str):
108
+ title_dom = self._get_title()
109
+ if title_dom is not None:
110
+ title_dom.text = _escape_ascii(title)
111
+
112
+ def _get_title(self):
113
+ titles = self._metadata.xpath(
114
+ "./dc:title",
115
+ namespaces={
116
+ "dc": self._metadata.nsmap.get("dc"),
117
+ },
118
+ )
119
+ if len(titles) == 0:
120
+ return None
121
+ return titles[0]
122
+
123
+ @property
124
+ def authors(self) -> list[str]:
125
+ return list(map(lambda x: x.text, self._get_creators()))
126
+
127
+ @authors.setter
128
+ def authors(self, authors):
129
+ creator_doms = self._get_creators()
130
+ if len(creator_doms) == 0:
131
+ return
132
+ parent_dom = creator_doms[0].getparent()
133
+ index_at_parent = parent_dom.index(creator_doms[0])
134
+ ns={
135
+ "dc": self._metadata.nsmap.get("dc"),
136
+ "opf": self._metadata.nsmap.get("opf"),
137
+ }
138
+ for author in reversed(authors):
139
+ creator_dom = Element(QName(ns["dc"], "creator"))
140
+ creator_dom.set(QName(ns["opf"], "file-as"), author)
141
+ creator_dom.set(QName(ns["opf"], "role"), "aut")
142
+ creator_dom.text = _escape_ascii(author)
143
+ parent_dom.insert(index_at_parent, creator_dom)
144
+
145
+ for creator_dom in creator_doms:
146
+ parent_dom.remove(creator_dom)
147
+
148
+ def _get_creators(self):
149
+ return self._metadata.xpath(
150
+ "./dc:creator",
151
+ namespaces={
152
+ "dc": self._metadata.nsmap.get("dc"),
153
+ },
154
+ )
155
+
156
+ def _escape_ascii(content: str) -> str:
157
+ content = escape(content)
158
+ content = re.sub(
159
+ r"\\u([\da-fA-F]{4})",
160
+ lambda x: chr(int(x.group(1), 16)), content,
161
+ )
162
+ return content
@@ -0,0 +1 @@
1
+ from .file import translate_html
@@ -0,0 +1,62 @@
1
+ from io import StringIO
2
+ from typing import cast, Generator, Iterable
3
+ from xml.etree.ElementTree import Element
4
+ from .texts_searcher import search_texts, TextPosition
5
+
6
+
7
+ def read_texts(root: Element) -> Generator[str, None, None]:
8
+ for element, position, _ in search_texts(root):
9
+ if position == TextPosition.WHOLE_DOM:
10
+ yield _plain_text(element)
11
+ elif position == TextPosition.TEXT:
12
+ yield cast(str, element.text)
13
+ elif position == TextPosition.TAIL:
14
+ yield cast(str, element.tail)
15
+
16
+ def append_texts(root: Element, texts: Iterable[str | Iterable[str] | None]):
17
+ zip_list = list(zip(texts, search_texts(root)))
18
+ for text, (element, position, parent) in reversed(zip_list):
19
+ if text is None:
20
+ continue
21
+ if not isinstance(text, str):
22
+ # TODO: implement splitting the text instead of joining it
23
+ text = "".join(text)
24
+ if position == TextPosition.WHOLE_DOM:
25
+ if parent is not None:
26
+ _append_dom(parent, element, text)
27
+ elif position == TextPosition.TEXT:
28
+ element.text = _append_text(element.text, text)
29
+ elif position == TextPosition.TAIL:
30
+ element.tail = _append_text(element.tail, text)
31
+
32
+ def _append_dom(parent: Element, origin: Element, text: str):
33
+ appended = Element(origin.tag, {**origin.attrib})
34
+ for index, child in enumerate(parent):
35
+ if child == origin:
36
+ parent.insert(index + 1, appended)
37
+ break
38
+
39
+ appended.attrib.pop("id", None)
40
+ appended.text = text
41
+ appended.tail = origin.tail
42
+ origin.tail = None
43
+
44
+ def _append_text(left: str | None, right: str) -> str:
45
+ if left is None:
46
+ return right
47
+ else:
48
+ return left + right
49
+
50
+ def _plain_text(target: Element):
51
+ buffer = StringIO()
52
+ for text in _iter_text(target):
53
+ buffer.write(text)
54
+ return buffer.getvalue()
55
+
56
+ def _iter_text(parent: Element):
57
+ if parent.text is not None:
58
+ yield parent.text
59
+ for child in parent:
60
+ yield from _iter_text(child)
61
+ if parent.tail is not None:
62
+ yield parent.tail
@@ -0,0 +1,23 @@
1
+ import re
2
+
3
+ # HTML defines a set of self-closing (void) tags; these need to be rewritten as non-self-closing tags, because the EPUB format does not support the self-closing form.
4
+ # https://www.tutorialspoint.com/which-html-tags-are-self-closing
5
+ _EMPTY_TAGS = (
6
+ "br",
7
+ "hr",
8
+ "input",
9
+ "col",
10
+ "base",
11
+ "meta",
12
+ "area",
13
+ )
14
+
15
+ _EMPTY_TAG_PATTERN = re.compile(
16
+ r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/?>"
17
+ )
18
+
19
+ def to_html(content: str) -> str:
20
+ return re.sub(_EMPTY_TAG_PATTERN, lambda m: f"<{m.group(1)}{m.group(2)}>", content)
21
+
22
+ def to_xml(content: str) -> str:
23
+ return re.sub(_EMPTY_TAG_PATTERN, lambda m: f"<{m.group(1)}{m.group(2)} />", content)
@@ -0,0 +1,65 @@
1
+ import re
2
+
3
+ from xml.etree.ElementTree import fromstring, tostring, Element
4
+ from ..types import Translate, ReportProgress
5
+ from .dom_operator import read_texts, append_texts
6
+ from .empty_tags import to_xml, to_html
7
+
8
+
9
+ _FILE_HEAD_PATTERN = re.compile(r"^<\?xml.*?\?>[\s]*<!DOCTYPE.*?>")
10
+ _XMLNS_IN_TAG = re.compile(r"\{[^}]+\}")
11
+ _BRACES = re.compile(r"(\{|\})")
12
+
13
+ def translate_html(translate: Translate, file_content: str, report_progress: ReportProgress) -> str:
14
+ match = re.match(_FILE_HEAD_PATTERN, file_content)
15
+ head = match.group() if match else None
16
+ xml_content = re.sub(_FILE_HEAD_PATTERN, "", to_xml(file_content))
17
+
18
+ root = fromstring(xml_content)
19
+ root_attrib = {**root.attrib}
20
+ xmlns = _extract_xmlns(root)
21
+
22
+ source_texts = list(read_texts(root))
23
+ target_texts = translate(source_texts, report_progress)
24
+ append_texts(root, target_texts)
25
+
26
+ if xmlns is not None:
27
+ root_attrib["xmlns"] = xmlns
28
+ root.attrib = root_attrib
29
+
30
+ if xmlns is None:
31
+ file_content = tostring(root, encoding="unicode")
32
+ file_content = to_html(file_content)
33
+ else:
34
+ # XHTML output must not use self-closing <tag/> here; give empty elements empty text so they serialize as <tag></tag>
35
+ for element in _all_elements(root):
36
+ if element.text is None:
37
+ element.text = ""
38
+ file_content = tostring(root, encoding="unicode")
39
+
40
+ if head is not None:
41
+ file_content = head + file_content
42
+
43
+ return file_content
44
+
45
+ def _extract_xmlns(root: Element) -> str | None:
46
+ root_xmlns: str | None = None
47
+ for i, element in enumerate(_all_elements(root)):
48
+ need_clean_xmlns = True
49
+ match = re.match(_XMLNS_IN_TAG, element.tag)
50
+
51
+ if match:
52
+ xmlns = re.sub(_BRACES, "", match.group())
53
+ if i == 0:
54
+ root_xmlns = xmlns
55
+ elif root_xmlns != xmlns:
56
+ need_clean_xmlns = False
57
+ if need_clean_xmlns:
58
+ element.tag = re.sub(_XMLNS_IN_TAG, "", element.tag)
59
+
60
+ return root_xmlns
61
+
62
+ def _all_elements(parent: Element):
63
+ yield parent
64
+ for child in parent:
65
+ yield from _all_elements(child)
@@ -0,0 +1,45 @@
1
+ from typing import Generator, TypeGuard
2
+ from enum import auto, Enum
3
+ from xml.etree.ElementTree import Element
4
+
5
+
6
+ class TextPosition(Enum):
7
+ WHOLE_DOM = auto()
8
+ TEXT = auto()
9
+ TAIL = auto()
10
+
11
+ # element, position, parent
12
+ TextDescription = tuple[Element, TextPosition, Element | None]
13
+
14
+ _IGNORE_TAGS = (
15
+ "title", "link", "style", "css", "img", "script", "metadata"
16
+ )
17
+
18
+ _TEXT_LEAF_TAGS = (
19
+ "a", "b", "br", "hr", "span", "em", "strong", "label",
20
+ )
21
+
22
+ def search_texts(element: Element, parent: Element | None = None) -> Generator[TextDescription, None, None]:
23
+ if element.tag in _IGNORE_TAGS:
24
+ return
25
+
26
+ if any(c.tag not in _TEXT_LEAF_TAGS for c in element):
27
+ if _is_not_empty_str(element.text):
28
+ yield element, TextPosition.TEXT, parent
29
+ for child in element:
30
+ if child.tag in _TEXT_LEAF_TAGS:
31
+ yield child, TextPosition.WHOLE_DOM, element
32
+ else:
33
+ yield from search_texts(child, element)
34
+ if _is_not_empty_str(child.tail):
35
+ yield child, TextPosition.TAIL, element
36
+ else:
37
+ yield element, TextPosition.WHOLE_DOM, parent
38
+
39
+ def _is_not_empty_str(text: str | None) -> TypeGuard[str]:
40
+ if text is None:
41
+ return False
42
+ for char in text:
43
+ if char not in (" ", "\n"):
44
+ return True
45
+ return False
@@ -0,0 +1,4 @@
1
+ from typing import Callable
2
+
3
+ ReportProgress = Callable[[float], None]
4
+ Translate = Callable[[list[str], ReportProgress], list[str]]
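As an editorial aside (not part of the package), the following minimal sketch illustrates the `Translate` / `ReportProgress` contract defined above: the callable receives the source strings plus a progress callback and must return a list aligned one-to-one with its input. The name `fake_translate` and its upper-casing behavior are purely illustrative assumptions.

```python
from typing import Callable

ReportProgress = Callable[[float], None]
Translate = Callable[[list[str], ReportProgress], list[str]]

def fake_translate(texts: list[str], report_progress: ReportProgress) -> list[str]:
    # Stand-in "translation": upper-case each string while reporting progress.
    # The result must keep a 1:1 positional correspondence with `texts`.
    results: list[str] = []
    total = max(len(texts), 1)  # avoid division by zero on empty input
    for i, text in enumerate(texts):
        results.append(text.upper())
        report_progress((i + 1) / total)
    return results

if __name__ == "__main__":
    print(fake_translate(["hello", "world"], lambda p: print(f"progress {p:.0%}")))
```

Any callable with this shape (for example `Translator.translate`, defined later in this diff) can be passed wherever a `Translate` is expected.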
@@ -0,0 +1,124 @@
1
+ import io
2
+ import os
3
+ import zipfile
4
+ import tempfile
5
+ import shutil
6
+
7
+ from typing import Callable
8
+ from lxml.etree import parse
9
+ from .epub import translate_html, Translate, EpubContent
10
+
11
+
12
+ ProgressReporter = Callable[[float], None]
13
+
14
+ def translate_epub_file(
15
+ translate: Translate,
16
+ file_path: str,
17
+ book_title: str | None,
18
+ report_progress: ProgressReporter,
19
+ ) -> bytes:
20
+
21
+ unzip_path = tempfile.mkdtemp()
22
+ try:
23
+ with zipfile.ZipFile(file_path, "r") as zip_ref:
24
+ for member in zip_ref.namelist():
25
+ target_path = os.path.join(unzip_path, member)
26
+ if member.endswith("/"):
27
+ os.makedirs(target_path, exist_ok=True)
28
+ else:
29
+ target_dir_path = os.path.dirname(target_path)
30
+ os.makedirs(target_dir_path, exist_ok=True)
31
+ with zip_ref.open(member) as source, open(target_path, "wb") as file:
32
+ file.write(source.read())
33
+
34
+ _translate_folder(
35
+ translate=translate,
36
+ path=unzip_path,
37
+ book_title=book_title,
38
+ report_progress=report_progress,
39
+ )
40
+ in_memory_zip = io.BytesIO()
41
+
42
+ with zipfile.ZipFile(in_memory_zip, "w") as zip_file:
43
+ for root, _, files in os.walk(unzip_path):
44
+ for file in files:
45
+ file_path = os.path.join(root, file)
46
+ relative_path = os.path.relpath(file_path, unzip_path)
47
+ zip_file.write(file_path, arcname=relative_path)
48
+
49
+ in_memory_zip.seek(0)
50
+ zip_data = in_memory_zip.read()
51
+
52
+ return zip_data
53
+
54
+ finally:
55
+ shutil.rmtree(unzip_path)
56
+
57
+ def _translate_folder(
58
+ translate: Translate,
59
+ path: str,
60
+ book_title: str | None,
61
+ report_progress: ProgressReporter,
62
+ ) -> None:
63
+ epub_content = EpubContent(path)
64
+ if book_title is None:
65
+ book_title = epub_content.title
66
+ if book_title is not None:
67
+ book_title = _link_translated(book_title, translate([book_title], lambda _: None)[0])
68
+
69
+ if book_title is not None:
70
+ epub_content.title = book_title
71
+
72
+ authors = epub_content.authors
73
+ to_authors = translate(authors, lambda _: None)
74
+
75
+ for i, author in enumerate(authors):
76
+ authors[i] = _link_translated(author, to_authors[i])
77
+
78
+ epub_content.authors = authors
79
+ epub_content.save()
80
+
81
+ _translate_ncx(epub_content, translate)
82
+ _translate_spines(epub_content, translate, report_progress)
83
+
84
+ def _translate_ncx(epub_content: EpubContent, translate: Translate):
85
+ ncx_path = epub_content.ncx_path
86
+
87
+ if ncx_path is not None:
88
+ tree = parse(ncx_path)
89
+ root = tree.getroot()
90
+ namespaces={ "ns": root.nsmap.get(None) }
91
+ text_doms = []
92
+ text_list = []
93
+
94
+ for text_dom in root.xpath("//ns:text", namespaces=namespaces):
95
+ text_doms.append(text_dom)
96
+ text_list.append(text_dom.text or "")
97
+
98
+ for index, text in enumerate(translate(text_list, lambda _: None)):
99
+ text_dom = text_doms[index]
100
+ text_dom.text = _link_translated(text_dom.text, text)
101
+
102
+ tree.write(ncx_path, pretty_print=True)
103
+
104
+ def _translate_spines(epub_content: EpubContent, translate: Translate, report_progress: ProgressReporter):
105
+ spines = epub_content.spines
106
+ for index, spine in enumerate(spines):
107
+ if spine.media_type == "application/xhtml+xml":
108
+ file_path = spine.path
109
+ with open(file_path, "r", encoding="utf-8") as file:
110
+ content = translate_html(
111
+ translate=translate,
112
+ file_content=file.read(),
113
+ report_progress=lambda p, i=index: report_progress((float(i) + p) / len(spines)),
114
+ )
115
+ with open(file_path, "w", encoding="utf-8") as file:
116
+ file.write(content)
117
+
118
+ report_progress(float(index + 1) / len(spines))
119
+
120
+ def _link_translated(origin: str, target: str) -> str:
121
+ if origin == target:
122
+ return origin
123
+ else:
124
+ return f"{origin} - {target}"
@@ -0,0 +1 @@
1
+ from .translator import Translator
@@ -0,0 +1,140 @@
1
+ import tiktoken
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Generator, Iterable
5
+ from resource_segmentation import split, Segment, Resource, Incision
6
+ from .nlp import NLP
7
+
8
+
9
+ @dataclass
10
+ class Fragment:
11
+ id: int
12
+ origin: str
13
+ target: str
14
+ tokens: int
15
+ index: int
16
+
17
+ @dataclass
18
+ class _Sentence:
19
+ index: int
20
+ tokens: list[int]
21
+ text: str
22
+
23
+ class Group:
24
+ def __init__(self, group_max_tokens: int, gap_rate: float) -> None:
25
+ self._encoder: tiktoken.Encoding = tiktoken.get_encoding("o200k_base")
26
+ self._nlp: NLP = NLP()
27
+ self._next_id: int = 0
28
+ self._group_max_tokens: int = group_max_tokens
29
+ self._gap_rate: float = gap_rate
30
+
31
+ def split(self, texts: Iterable[str]) -> Generator[tuple[list[Fragment], list[Fragment], list[Fragment]], Any, None]:
32
+ for group in split(
33
+ max_segment_count=self._group_max_tokens,
34
+ gap_rate=self._gap_rate,
35
+ resources=self._gen_resources(texts),
36
+ ):
37
+ head_fragments = self._handle_gap_sentences(
38
+ sentences_iter=self._extract_sentences(group.head),
39
+ remain_tokens=group.head_remain_count,
40
+ clip_head=True,
41
+ )
42
+ body_fragments = self._extract_sentences(group.body)
43
+ tail_fragments = self._handle_gap_sentences(
44
+ sentences_iter=self._extract_sentences(group.tail),
45
+ remain_tokens=group.tail_remain_count,
46
+ clip_head=False,
47
+ )
48
+ yield (
49
+ list(self._to_fragments(head_fragments)),
50
+ list(self._to_fragments(body_fragments)),
51
+ list(self._to_fragments(tail_fragments)),
52
+ )
53
+
54
+ def _gen_resources(self, texts: Iterable[str]) -> Generator[Resource[_Sentence], None, None]:
55
+ for index, text in enumerate(texts):
56
+ sentences = self._nlp.split_into_sents(text)
57
+ for i, text in enumerate(sentences):
58
+ sentence = _Sentence(
59
+ text=text,
60
+ index=index,
61
+ tokens=self._encoder.encode(text)
62
+ )
63
+ start_incision: Incision = Incision.MOST_LIKELY
64
+ end_incision: Incision = Incision.MOST_LIKELY
65
+
66
+ if i == 0:
67
+ start_incision = Incision.IMPOSSIBLE
68
+ if i == len(sentences) - 1:
69
+ end_incision = Incision.IMPOSSIBLE
70
+
71
+ yield Resource(
72
+ count=len(sentence.tokens),
73
+ payload=sentence,
74
+ start_incision=start_incision,
75
+ end_incision=end_incision,
76
+ )
77
+
78
+ def _extract_sentences(self, items: list[Resource[_Sentence] | Segment[_Sentence]]) -> Generator[_Sentence, None, None]:
79
+ for item in items:
80
+ if isinstance(item, Resource):
81
+ yield item.payload
82
+ elif isinstance(item, Segment):
83
+ for resource in item.resources:
84
+ yield resource.payload
85
+
86
+ def _handle_gap_sentences(
87
+ self,
88
+ sentences_iter: Iterable[_Sentence],
89
+ remain_tokens: int,
90
+ clip_head: bool,
91
+ ) -> Generator[_Sentence, None, None]:
92
+
93
+ sentences = list(sentences_iter)
94
+
95
+ if self._need_clip(sentences, remain_tokens):
96
+ sentence = sentences[0]
97
+ if clip_head:
98
+ tokens = sentence.tokens[len(sentence.tokens) - remain_tokens:]
99
+ else:
100
+ tokens: list[int] = sentence.tokens[:remain_tokens]
101
+
102
+ yield _Sentence(
103
+ index=sentence.index,
104
+ tokens=tokens,
105
+ text=self._encoder.decode(tokens),
106
+ )
107
+ else:
108
+ yield from sentences
109
+
110
+ def _need_clip(self, sentences: list[_Sentence], remain_tokens: int) -> bool:
111
+ if len(sentences) == 1:
112
+ sentence = sentences[0]
113
+ if len(sentence.tokens) > remain_tokens:
114
+ return True
115
+ return False
116
+
117
+ def _to_fragments(self, sentences: Iterable[_Sentence]):
118
+ fragment: Fragment | None = None
119
+ for sentence in sentences:
120
+ if fragment is None:
121
+ fragment = self._create_fragment(sentence)
122
+ elif fragment.index != sentence.index:
123
+ yield fragment
124
+ fragment = self._create_fragment(sentence)
125
+ else:
126
+ fragment.origin += sentence.text
127
+ fragment.tokens += len(sentence.tokens)
128
+ if fragment is not None:
129
+ yield fragment
130
+
131
+ def _create_fragment(self, sentence: _Sentence) -> Fragment:
132
+ fragment = Fragment(
133
+ id=self._next_id,
134
+ index=sentence.index,
135
+ origin=sentence.text,
136
+ target="",
137
+ tokens=len(sentence.tokens),
138
+ )
139
+ self._next_id += 1
140
+ return fragment
@@ -0,0 +1,58 @@
1
+ from typing import Generator, cast
2
+ from io import StringIO
3
+ from pydantic import SecretStr
4
+ from langchain_core.messages import SystemMessage, HumanMessage, BaseMessageChunk
5
+ from langchain_openai import ChatOpenAI
6
+
7
+
8
+ class LLM:
9
+ def __init__(
10
+ self,
11
+ key: str | None,
12
+ url: str | None,
13
+ model: str,
14
+ temperature: float,
15
+ timeout: float | None,
16
+ ) -> None:
17
+ self._timeout: float | None = timeout
18
+ self._model: ChatOpenAI = ChatOpenAI(
19
+ api_key=cast(SecretStr, key),
20
+ base_url=url,
21
+ model=model,
22
+ temperature=temperature,
23
+ )
24
+
25
+ def invoke(self, system: str, human: str) -> str:
26
+ resp = self._model.invoke(
27
+ timeout=self._timeout,
28
+ input=[
29
+ SystemMessage(content=system),
30
+ HumanMessage(content=human),
31
+ ],
32
+ )
33
+ return str(resp.content)
34
+
35
+ def invoke_response_lines(self, system: str, human: str) -> Generator[str, None, None]:
36
+ stream = self._model.stream(
37
+ timeout=self._timeout,
38
+ input=[
39
+ SystemMessage(content=system),
40
+ HumanMessage(content=human),
41
+ ],
42
+ )
43
+ line_buffer = StringIO()
44
+ aggregate: BaseMessageChunk | None = None
45
+
46
+ for chunk in stream:
47
+ fragment = str(chunk.content)
48
+ aggregate = chunk if aggregate is None else aggregate + chunk
49
+ lines = fragment.split("\n")
50
+ if len(lines) > 0:
51
+ line_buffer.write(lines[0])
52
+ for line in lines[1:]:
53
+ yield line_buffer.getvalue()
54
+ line_buffer = StringIO()
55
+ line_buffer.write(line)
56
+
57
+ # TODO: aggregate.usage_metadata
58
+ yield line_buffer.getvalue()
@@ -0,0 +1,36 @@
1
+ import re
2
+ import spacy
3
+ import langid
4
+ import threading
5
+
6
+ from spacy.language import Language
7
+
8
+ _lan2model: dict = {
9
+ "en": "en_core_web_sm",
10
+ "zh": "zh_core_web_sm",
11
+ "fr": "fr_core_news_sm",
12
+ "ru": "ru_core_news_sm",
13
+ "de": "de_core_news_sm",
14
+ }
15
+
16
+ class NLP:
17
+ def __init__(self) -> None:
18
+ self._lock: threading.Lock = threading.Lock()
19
+ self._nlp_dict: dict[str, Language] = {}
20
+
21
+ def split_into_sents(self, text: str) -> list[str]:
22
+ lan, _ = langid.classify(text)
23
+ with self._lock:
24
+ nlp = self._nlp_dict.get(lan, None)
25
+ if nlp is None:
26
+ model_id = _lan2model.get(lan, None)
27
+ if model_id is None:
28
+ return self._split_into_sents(text)
29
+ nlp = spacy.load(model_id)
30
+ self._nlp_dict[lan] = nlp
31
+
32
+ return [s.text for s in nlp(text).sents]
33
+
34
+ def _split_into_sents(self, text: str) -> list[str]:
35
+ cells: list[str] = re.split(r"(\.|!|\?|;|。|!|?|;)", text)
36
+ return [cells[i] + cells[i+1] for i in range(0, len(cells)-1, 2)] + ([cells[-1]] if cells and cells[-1] else [])  # keep trailing text that has no terminal punctuation
@@ -0,0 +1,159 @@
1
+ import re
2
+ import os
3
+
4
+ from typing import Callable, Iterable
5
+ from hashlib import sha256
6
+
7
+ from .group import Group, Fragment
8
+ from json import loads, dumps
9
+ from .llm import LLM
10
+
11
+
12
+ _LAN_FULL_NAMES: dict[str, str] = {
13
+ "en": "English",
14
+ "cn": "simplified Chinese",
15
+ "ja": "Japanese",
16
+ "fr": "French",
17
+ "ru": "Russian",
18
+ "de": "German",
19
+ }
20
+
21
+ class Translator:
22
+ def __init__(
23
+ self,
24
+ group_max_tokens: int,
25
+ cache_path: str,
26
+ key: str | None,
27
+ url: str | None,
28
+ model: str,
29
+ temperature: float,
30
+ timeout: float | None,
31
+ source_lan: str,
32
+ target_lan: str,
33
+ streaming: bool) -> None:
34
+
35
+ self._streaming: bool = streaming
36
+ self._group: Group = Group(
37
+ group_max_tokens=group_max_tokens,
38
+ gap_rate=0.1,
39
+ )
40
+ self._cache_path: str = cache_path
41
+ self._llm = LLM(
42
+ key=key,
43
+ url=url,
44
+ model=model,
45
+ temperature=temperature,
46
+ timeout=timeout,
47
+ )
48
+ self._admin_prompt: str = _gen_admin_prompt(
49
+ source_lan=self._lan_full_name(source_lan),
50
+ target_lan=self._lan_full_name(target_lan),
51
+ )
52
+
53
+ def translate(self, source_texts: list[str], report_progress: Callable[[float], None]) -> list[str]:
54
+ body_fragments: list[Fragment] = []
55
+ target_texts: list[str] = [""] * len(source_texts)
56
+ splitted = list(self._group.split(source_texts))
57
+
58
+ for i, (head, body, tail) in enumerate(splitted):
59
+ body_fragments.extend(body)
60
+ self._translate_fragments(
61
+ fragments=head + body + tail,
62
+ report_progress=lambda p, i=i: report_progress(
63
+ (float(i) + p) / len(splitted),
64
+ ),
65
+ )
66
+ for fragment in body_fragments:
67
+ target_texts[fragment.index] += fragment.target
68
+
69
+ return target_texts
70
+
71
+ def _translate_fragments(self, fragments: list[Fragment], report_progress: Callable[[float], None]) -> list[Fragment]:
72
+ texts: list[str] = []
73
+ translated_texts: list[str] = []
74
+ indexes: list[int] = []
75
+ for index, fragment in enumerate(fragments):
76
+ text = fragment.origin.strip()
77
+ if text != "":
78
+ texts.append(text)
79
+ indexes.append(index)
80
+
81
+ if len(texts) > 0:
82
+ for i, text in enumerate(self._translate_text_by_text(texts)):
83
+ report_progress(min(1.0, float(i) / float(len(texts))))
84
+ translated_texts.append(text)
85
+ report_progress(1.0)
86
+
87
+ for index, text in zip(indexes, translated_texts):
88
+ fragments[index].target = text
89
+ return fragments
90
+
91
+ def _translate_text_by_text(self, texts: list[str]):
92
+ hash = self._to_hash(texts)
93
+ cache_file_path = os.path.join(self._cache_path, f"{hash}.json")
94
+ if os.path.exists(cache_file_path):
95
+ with open(cache_file_path, "r", encoding="utf-8") as cache_file:
96
+ for translated_text in loads(cache_file.read()):
97
+ yield translated_text
98
+ else:
99
+ system=self._admin_prompt
100
+ human="\n".join([f"{i+1}: {t}" for i, t in enumerate(texts)])
101
+ translated_texts: list[str] = []
102
+ iter_lines: Iterable[str]
103
+
104
+ if self._streaming:
105
+ iter_lines = self._llm.invoke_response_lines(system, human)
106
+ else:
107
+ iter_lines = self._llm.invoke(system, human).split("\n")
108
+ for line in iter_lines:
109
+ match = re.search(r"^\d+\:", line)
110
+ if match:
111
+ translated_text = re.sub(r"^\d+\:\s*", "", line)
112
+ yield translated_text
113
+ translated_texts.append(translated_text)
114
+
115
+ with open(cache_file_path, "w", encoding="utf-8") as cache_file:
116
+ cache_file.write(dumps(
117
+ obj=translated_texts,
118
+ ensure_ascii=False,
119
+ indent=2,
120
+ ))
121
+
122
+
123
+ def _lan_full_name(self, name: str) -> str:
124
+ full_name = _LAN_FULL_NAMES.get(name, None)
125
+ if full_name is None:
126
+ full_name = _LAN_FULL_NAMES["en"]
127
+ return full_name
128
+
129
+ def _to_hash(self, texts: list[str]) -> str:
130
+ hash = sha256()
131
+ for text in texts:
132
+ data = text.encode(encoding="utf-8")
133
+ hash.update(data)
134
+ hash.update(b"\x03") # ETX means string's end
135
+ return hash.hexdigest()
136
+
137
+ def _gen_admin_prompt(target_lan: str, source_lan: str) -> str:
138
+ return f"""
139
+ You are a translator and need to translate the user's {source_lan} text into {target_lan}.
140
+ I want you to replace simplified A0-level words and sentences with more beautiful and elegant, higher-level {target_lan} words and sentences. Keep the meaning the same, but make them more literary.
141
+ I want you to reply with only the translation and nothing else; do not write explanations.
142
+ A number and a colon are added at the start of each line of text entered by the user. This number is only there to align your translation with the original and has no meaning in itself. Ignore the number when reading the user's original text.
143
+ Your translation must be split into lines, and the number of lines must equal the number of lines in the user's original text. Each line must correspond to the matching line of the user's original text.
144
+ All user-submitted text must be translated. Lines must not be missing, added, misplaced, or reordered; they must correspond exactly to the user's original text.
145
+
146
+ Here is an example. First, the user submits the original text in English (this is just an example):
147
+ 1: IV
148
+ 2: This true without lying, certain & most true:
149
+ 3: That which is below is like that which is above and that which is above is like that which is below to do ye miracles of one only thing.
150
+ 4: .+
151
+ 5: And as all things have been and arose from one by ye mediation of one: so all things have their birth from this one thing by adaptation.
152
+
153
+ If you are asked to translate into Chinese, you need to submit the translated content in the following format:
154
+ 1: 四
155
+ 2: 这是真的,没有任何虚妄,是确定的,最真实的:
156
+ 3: 上如其下,下如其上,以此来展现“一”的奇迹。
157
+ 4: .+
158
+ 5: 万物皆来自“一”的沉思,万物在“一”的安排下诞生。
159
+ """
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 OOMOL Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.3
2
+ Name: epub-translator
3
+ Version: 0.0.1
4
+ Summary: Translate EPUB books using an LLM. The translated book retains the original text and places the translation side by side with it.
5
+ License: MIT
6
+ Author: Tao Zeyu
7
+ Author-email: i@taozeyu.com
8
+ Maintainer: Tao Zeyu
9
+ Maintainer-email: i@taozeyu.com
10
+ Requires-Python: >=3.10,<3.13
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Requires-Dist: langchain (==0.3.23)
17
+ Requires-Dist: langchain-openai (==0.3.13)
18
+ Requires-Dist: langid (>=1.1.6,<2.0.0)
19
+ Requires-Dist: lxml (>=6.0.0,<7.0.0)
20
+ Requires-Dist: resource-segmentation (==0.0.1)
21
+ Requires-Dist: spacy (>=3.8.7,<4.0.0)
22
+ Requires-Dist: tiktoken (>=0.9.0,<0.10.0)
23
+ Requires-Dist: tqdm (>=4.67.1,<5.0.0)
24
+ Project-URL: Homepage, https://hub.oomol.com/package/book-translator
25
+ Project-URL: Repository, https://github.com/oomol-flows/books-translator
26
+ Description-Content-Type: text/markdown
27
+
28
+ # epub-translator
29
+
30
+ Translate EPUB books using an LLM. The translated book retains the original text and places the translation side by side with it.
31
+
32
+ ## Field Description
33
+
34
+ - `file`: the epub file to be translated.
35
+ - `title`: the title of the book to be translated (original language)
36
+ - `max_translating_group`: the maximum amount of text submitted for translation in each request. The book is split into chunks during translation, and this value limits the maximum size of each chunk.
37
+ - `max_translating_group_unit`: the unit used by `max_translating_group`.
38
+ - `source`: the language of the book to be translated.
39
+ - `target`: the target language you want to translate it into.
40
+ - `llm_api`: the LLM API format used for translation.
41
+ - `model`: the model used for translation
42
+ - `url`: the base URL of the LLM API.
43
+ - `api_key`: the API key for the LLM.
44
+ - `temperature`: the temperature of the LLM, which controls the randomness of the generated text: lower values make the output more deterministic and conservative, while higher values make it more random and diverse.
45
+ - `timeout`: the request timeout, in seconds.
46
+ - `binary`: the translated target epub file content.
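As an editorial aside, the fields above map onto the library API shown earlier in this diff (`Translator.__init__` in `translator/translator.py` and `translate_epub_file` in `file.py`). Below is a hedged usage sketch assembled from those signatures; the model name, URL, paths, and numeric values are illustrative assumptions, not documented defaults.

```python
import os
from epub_translator import Translator, translate_epub_file

cache_path = "/tmp/epub-translator-cache"
os.makedirs(cache_path, exist_ok=True)  # the translator writes its JSON cache files here

translator = Translator(
    group_max_tokens=1200,            # rough cap on tokens per chunk sent to the LLM
    cache_path=cache_path,
    key="sk-...",                     # placeholder API key
    url="https://api.openai.com/v1",  # placeholder base URL
    model="gpt-4o-mini",              # placeholder model name
    temperature=0.3,
    timeout=60.0,
    source_lan="en",
    target_lan="cn",                  # language keys used by the package: "en", "cn", "ja", "fr", "ru", "de"
    streaming=True,
)

epub_bytes = translate_epub_file(
    translate=translator.translate,
    file_path="input.epub",
    book_title=None,                  # None falls back to the title stored in the EPUB
    report_progress=lambda p: print(f"{p:.1%}"),
)

with open("output.epub", "wb") as f:
    f.write(epub_bytes)
```

Note that the spaCy models listed in `nlp.py` (for example `en_core_web_sm`) must be downloaded separately; `spacy.load` raises an error if a required model is missing.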
@@ -0,0 +1,19 @@
1
+ epub_translator/__init__.py,sha256=_R15M7icijpfTrXeDTsJ_LCCBeKn83eZPE8FFTL9AAM,90
2
+ epub_translator/epub/__init__.py,sha256=GWng1nNmf-ugEmN-VPeRBgYtGGGv_SxEz0649RcK43A,117
3
+ epub_translator/epub/content_parser.py,sha256=Ju94SanlYv5fpG71P1M1tg8d1mP47RHJ38RYu5P7h0k,4381
4
+ epub_translator/epub/html/__init__.py,sha256=Am-3WLD0d4eHLo4gW41rCC2ooYa-ZJ_Kua2OrZ9CVoE,32
5
+ epub_translator/epub/html/dom_operator.py,sha256=Ryayv6hG0jEXv7RkXrZTbIP54P0fyPTbMVbymMtBUnU,1935
6
+ epub_translator/epub/html/empty_tags.py,sha256=GSSe-CV4YkUhWv4F0fiiRsf2vz0ZBAsC21Ovnqo5oIA,601
7
+ epub_translator/epub/html/file.py,sha256=KfuJ3QD74VIId9tLNK4JYSbQCjpE8XWvzN6T3tamM60,1966
8
+ epub_translator/epub/html/texts_searcher.py,sha256=Gs1n38CzfpM3G5XeZrW12Mw_JPixaQOyQEc7ew4B1Vs,1251
9
+ epub_translator/epub/types.py,sha256=PlEwlXWeX_S4HkFr4GheZgoR1a0qKby1z-_dzpcntG4,128
10
+ epub_translator/file.py,sha256=tUxDwqCNIeXYqzU_GmjbyKptLF_nBtb8JjVpjgTK4OI,3728
11
+ epub_translator/translator/__init__.py,sha256=qJhlcRMR3t1aEp-vFpJFb_6pUTEWPMTohXaJFDPE5SU,34
12
+ epub_translator/translator/group.py,sha256=TNGgPPjt3ir3v_ODECpRxhvuBatNMo3vqs4YF-Q9mjQ,4243
13
+ epub_translator/translator/llm.py,sha256=eEJEkuzTJlS3-bcLk988LxK8Ttl9JOlSBPKbaOoxY6g,1598
14
+ epub_translator/translator/nlp.py,sha256=5LLHL93873gddS8QJks1qKrvKLMnd9voq358-2FHNqE,990
15
+ epub_translator/translator/translator.py,sha256=hNM-baEqsEIKkZqEQEUKMG6wnYXaSy9ZtiOqree8zQ0,5968
16
+ epub_translator-0.0.1.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
17
+ epub_translator-0.0.1.dist-info/METADATA,sha256=CkVZ-sRTf4yylk2_3gFuTCK2hKPG8iKDkgp53q0yIOw,2404
18
+ epub_translator-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
19
+ epub_translator-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.3
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any