epub-translator 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. epub_translator/__init__.py +3 -2
  2. epub_translator/data/format.jinja +33 -0
  3. epub_translator/data/translate.jinja +15 -0
  4. epub_translator/epub/__init__.py +2 -3
  5. epub_translator/epub/content_parser.py +2 -2
  6. epub_translator/epub/html/__init__.py +1 -1
  7. epub_translator/epub/html/file.py +56 -41
  8. epub_translator/epub/html/texts_searcher.py +2 -1
  9. epub_translator/llm/__init__.py +1 -0
  10. epub_translator/llm/error.py +49 -0
  11. epub_translator/llm/executor.py +147 -0
  12. epub_translator/llm/increasable.py +35 -0
  13. epub_translator/llm/node.py +197 -0
  14. epub_translator/template.py +50 -0
  15. epub_translator/translation/__init__.py +2 -0
  16. epub_translator/translation/chunk.py +120 -0
  17. epub_translator/translation/splitter.py +77 -0
  18. epub_translator/translation/store.py +37 -0
  19. epub_translator/translation/translation.py +192 -0
  20. epub_translator/translation/types.py +23 -0
  21. epub_translator/translation/utils.py +11 -0
  22. epub_translator/translator.py +169 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/decoder.py +71 -0
  25. epub_translator/xml/encoder.py +95 -0
  26. epub_translator/xml/parser.py +172 -0
  27. epub_translator/xml/tag.py +93 -0
  28. epub_translator/xml/transform.py +34 -0
  29. epub_translator/xml/utils.py +12 -0
  30. epub_translator/zip_context.py +74 -0
  31. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/METADATA +5 -7
  32. epub_translator-0.0.3.dist-info/RECORD +36 -0
  33. epub_translator/epub/types.py +0 -4
  34. epub_translator/file.py +0 -124
  35. epub_translator/translator/__init__.py +0 -1
  36. epub_translator/translator/group.py +0 -140
  37. epub_translator/translator/llm.py +0 -58
  38. epub_translator/translator/nlp.py +0 -36
  39. epub_translator/translator/translator.py +0 -159
  40. epub_translator-0.0.1.dist-info/RECORD +0 -19
  41. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/LICENSE +0 -0
  42. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/WHEEL +0 -0
epub_translator/file.py DELETED
@@ -1,124 +0,0 @@
- import io
- import os
- import zipfile
- import tempfile
- import shutil
-
- from typing import Callable
- from lxml.etree import parse
- from .epub import translate_html, Translate, EpubContent
-
-
- ProgressReporter = Callable[[float], None]
-
- def translate_epub_file(
-   translate: Translate,
-   file_path: str,
-   book_title: str | None,
-   report_progress: ProgressReporter,
- ) -> bytes:
-
-   unzip_path = tempfile.mkdtemp()
-   try:
-     with zipfile.ZipFile(file_path, "r") as zip_ref:
-       for member in zip_ref.namelist():
-         target_path = os.path.join(unzip_path, member)
-         if member.endswith("/"):
-           os.makedirs(target_path, exist_ok=True)
-         else:
-           target_dir_path = os.path.dirname(target_path)
-           os.makedirs(target_dir_path, exist_ok=True)
-           with zip_ref.open(member) as source, open(target_path, "wb") as file:
-             file.write(source.read())
-
-     _translate_folder(
-       translate=translate,
-       path=unzip_path,
-       book_title=book_title,
-       report_progress=report_progress,
-     )
-     in_memory_zip = io.BytesIO()
-
-     with zipfile.ZipFile(in_memory_zip, "w") as zip_file:
-       for root, _, files in os.walk(unzip_path):
-         for file in files:
-           file_path = os.path.join(root, file)
-           relative_path = os.path.relpath(file_path, unzip_path)
-           zip_file.write(file_path, arcname=relative_path)
-
-     in_memory_zip.seek(0)
-     zip_data = in_memory_zip.read()
-
-     return zip_data
-
-   finally:
-     shutil.rmtree(unzip_path)
-
- def _translate_folder(
-   translate: Translate,
-   path: str,
-   book_title: str | None,
-   report_progress: ProgressReporter,
- ) -> None:
-   epub_content = EpubContent(path)
-   if book_title is None:
-     book_title = epub_content.title
-     if book_title is not None:
-       book_title = _link_translated(book_title, translate([book_title], lambda _: None)[0])
-
-   if book_title is not None:
-     epub_content.title = book_title
-
-   authors = epub_content.authors
-   to_authors = translate(authors, lambda _: None)
-
-   for i, author in enumerate(authors):
-     authors[i] = _link_translated(author, to_authors[i])
-
-   epub_content.authors = authors
-   epub_content.save()
-
-   _translate_ncx(epub_content, translate)
-   _translate_spines(epub_content, translate, report_progress)
-
- def _translate_ncx(epub_content: EpubContent, translate: Translate):
-   ncx_path = epub_content.ncx_path
-
-   if ncx_path is not None:
-     tree = parse(ncx_path)
-     root = tree.getroot()
-     namespaces={ "ns": root.nsmap.get(None) }
-     text_doms = []
-     text_list = []
-
-     for text_dom in root.xpath("//ns:text", namespaces=namespaces):
-       text_doms.append(text_dom)
-       text_list.append(text_dom.text or "")
-
-     for index, text in enumerate(translate(text_list, lambda _: None)):
-       text_dom = text_doms[index]
-       text_dom.text = _link_translated(text_dom.text, text)
-
-     tree.write(ncx_path, pretty_print=True)
-
- def _translate_spines(epub_content: EpubContent, translate: Translate, report_progress: ProgressReporter):
-   spines = epub_content.spines
-   for index, spine in enumerate(spines):
-     if spine.media_type == "application/xhtml+xml":
-       file_path = spine.path
-       with open(file_path, "r", encoding="utf-8") as file:
-         content = translate_html(
-           translate=translate,
-           file_content=file.read(),
-           report_progress=lambda p, i=index: report_progress((float(i) + p) / len(spines)),
-         )
-       with open(file_path, "w", encoding="utf-8") as file:
-         file.write(content)
-
-     report_progress(float(index + 1) / len(spines))
-
- def _link_translated(origin: str, target: str) -> str:
-   if origin == target:
-     return origin
-   else:
-     return f"{origin} - {target}"
epub_translator/translator/__init__.py DELETED
@@ -1 +0,0 @@
- from .translator import Translator
epub_translator/translator/group.py DELETED
@@ -1,140 +0,0 @@
- import tiktoken
-
- from dataclasses import dataclass
- from typing import Any, Generator, Iterable
- from resource_segmentation import split, Segment, Resource, Incision
- from .nlp import NLP
-
-
- @dataclass
- class Fragment:
-   id: int
-   origin: str
-   target: str
-   tokens: int
-   index: int
-
- @dataclass
- class _Sentence:
-   index: int
-   tokens: list[int]
-   text: str
-
- class Group:
-   def __init__(self, group_max_tokens: int, gap_rate: float) -> None:
-     self._encoder: tiktoken.Encoding = tiktoken.get_encoding("o200k_base")
-     self._nlp: NLP = NLP()
-     self._next_id: int = 0
-     self._group_max_tokens: int = group_max_tokens
-     self._gap_rate: float = gap_rate
-
-   def split(self, texts: Iterable[str]) -> Generator[tuple[list[Fragment], list[Fragment], list[Fragment]], Any, None]:
-     for group in split(
-       max_segment_count=self._group_max_tokens,
-       gap_rate=self._gap_rate,
-       resources=self._gen_resources(texts),
-     ):
-       head_fragments = self._handle_gap_sentences(
-         sentences_iter=self._extract_sentences(group.head),
-         remain_tokens=group.head_remain_count,
-         clip_head=True,
-       )
-       body_fragments = self._extract_sentences(group.body)
-       tail_fragments = self._handle_gap_sentences(
-         sentences_iter=self._extract_sentences(group.tail),
-         remain_tokens=group.tail_remain_count,
-         clip_head=False,
-       )
-       yield (
-         list(self._to_fragments(head_fragments)),
-         list(self._to_fragments(body_fragments)),
-         list(self._to_fragments(tail_fragments)),
-       )
-
-   def _gen_resources(self, texts: Iterable[str]) -> Generator[Resource[_Sentence], None, None]:
-     for index, text in enumerate(texts):
-       sentences = self._nlp.split_into_sents(text)
-       for i, text in enumerate(sentences):
-         sentence = _Sentence(
-           text=text,
-           index=index,
-           tokens=self._encoder.encode(text)
-         )
-         start_incision: Incision = Incision.MOST_LIKELY
-         end_incision: Incision = Incision.MOST_LIKELY
-
-         if i == 0:
-           start_incision = Incision.IMPOSSIBLE
-         if i == len(sentences) - 1:
-           end_incision = Incision.IMPOSSIBLE
-
-         yield Resource(
-           count=len(sentence.tokens),
-           payload=sentence,
-           start_incision=start_incision,
-           end_incision=end_incision,
-         )
-
-   def _extract_sentences(self, items: list[Resource[_Sentence] | Segment[_Sentence]]) -> Generator[_Sentence, None, None]:
-     for item in items:
-       if isinstance(item, Resource):
-         yield item.payload
-       elif isinstance(item, Segment):
-         for resource in item.resources:
-           yield resource.payload
-
-   def _handle_gap_sentences(
-     self,
-     sentences_iter: Iterable[_Sentence],
-     remain_tokens: int,
-     clip_head: bool,
-   ) -> Generator[_Sentence, None, None]:
-
-     sentences = list(sentences_iter)
-
-     if self._need_clip(sentences, remain_tokens):
-       sentence = sentences[0]
-       if clip_head:
-         tokens = sentence.tokens[len(sentence.tokens) - remain_tokens:]
-       else:
-         tokens: list[int] = sentence.tokens[:remain_tokens]
-
-       yield _Sentence(
-         index=sentence.index,
-         tokens=tokens,
-         text=self._encoder.decode(tokens),
-       )
-     else:
-       yield from sentences
-
-   def _need_clip(self, sentences: list[_Sentence], remain_tokens: int) -> bool:
-     if len(sentences) == 1:
-       sentence = sentences[0]
-       if len(sentence.tokens) > remain_tokens:
-         return True
-     return False
-
-   def _to_fragments(self, sentences: Iterable[_Sentence]):
-     fragment: Fragment | None = None
-     for sentence in sentences:
-       if fragment is None:
-         fragment = self._create_fragment(sentence)
-       elif fragment.index != sentence.index:
-         yield fragment
-         fragment = self._create_fragment(sentence)
-       else:
-         fragment.origin += sentence.text
-         fragment.tokens += len(sentence.tokens)
-     if fragment is not None:
-       yield fragment
-
-   def _create_fragment(self, sentence: _Sentence) -> Fragment:
-     fragment = Fragment(
-       id=self._next_id,
-       index=sentence.index,
-       origin=sentence.text,
-       target="",
-       tokens=len(sentence.tokens),
-     )
-     self._next_id += 1
-     return fragment
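A hypothetical sketch of how the removed Group splitter was driven (not part of the diff). It assumes the 0.0.1 wheel and its resource_segmentation, tiktoken, langid and spaCy model dependencies are available; the token budget and sample texts are illustrative.

from epub_translator.translator.group import Group

group = Group(group_max_tokens=1024, gap_rate=0.1)
texts = [
  "First paragraph of a chapter.",
  "Second paragraph, long enough to be split into several sentences.",
]
for head, body, tail in group.split(texts):
  # head/tail are clipped context windows around each group; only the body
  # fragments are written back, keyed by Fragment.index into the original list
  print(len(head), len(body), len(tail))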
epub_translator/translator/llm.py DELETED
@@ -1,58 +0,0 @@
- from typing import Generator, cast
- from io import StringIO
- from pydantic import SecretStr
- from langchain_core.messages import SystemMessage, HumanMessage, BaseMessageChunk
- from langchain_openai import ChatOpenAI
-
-
- class LLM:
-   def __init__(
-     self,
-     key: str | None,
-     url: str | None,
-     model: str,
-     temperature: float,
-     timeout: float | None,
-   ) -> None:
-     self._timeout: float | None = timeout
-     self._model: ChatOpenAI = ChatOpenAI(
-       api_key=cast(SecretStr, key),
-       base_url=url,
-       model=model,
-       temperature=temperature,
-     )
-
-   def invoke(self, system: str, human: str) -> str:
-     resp = self._model.invoke(
-       timeout=self._timeout,
-       input=[
-         SystemMessage(content=system),
-         HumanMessage(content=human),
-       ],
-     )
-     return str(resp.content)
-
-   def invoke_response_lines(self, system: str, human: str) -> Generator[str, None, None]:
-     stream = self._model.stream(
-       timeout=self._timeout,
-       input=[
-         SystemMessage(content=system),
-         HumanMessage(content=human),
-       ],
-     )
-     line_buffer = StringIO()
-     aggregate: BaseMessageChunk | None = None
-
-     for chunk in stream:
-       fragment = str(chunk.content)
-       aggregate = chunk if aggregate is None else aggregate + chunk
-       lines = fragment.split("\n")
-       if len(lines) > 0:
-         line_buffer.write(lines[0])
-       for line in lines[1:]:
-         yield line_buffer.getvalue()
-         line_buffer = StringIO()
-         line_buffer.write(line)
-
-     # TODO: aggregate.usage_metadata
-     yield line_buffer.getvalue()
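A hypothetical sketch of the removed LLM wrapper in use (not part of the diff). It assumes the 0.0.1 wheel plus langchain-openai and a reachable OpenAI-compatible endpoint; the key and model name are placeholders.

from epub_translator.translator.llm import LLM

llm = LLM(
  key="sk-...",          # placeholder API key
  url=None,              # None means the default OpenAI base URL
  model="gpt-4o-mini",   # illustrative model name
  temperature=0.7,
  timeout=60.0,
)
# invoke_response_lines yields each completed line as chunks stream in, so the
# caller can parse "N: translation" lines before the whole response arrives
for line in llm.invoke_response_lines("You are a translator.", "1: Hello\n2: World"):
  print(line)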
epub_translator/translator/nlp.py DELETED
@@ -1,36 +0,0 @@
- import re
- import spacy
- import langid
- import threading
-
- from spacy.language import Language
-
- _lan2model: dict = {
-   "en": "en_core_web_sm",
-   "zh": "zh_core_web_sm",
-   "fr": "fr_core_news_sm",
-   "ru": "ru_core_news_sm",
-   "de": "de_core_news_sm",
- }
-
- class NLP:
-   def __init__(self) -> None:
-     self._lock: threading.Lock = threading.Lock()
-     self._nlp_dict: dict[str, Language] = {}
-
-   def split_into_sents(self, text: str) -> list[str]:
-     lan, _ = langid.classify(text)
-     with self._lock:
-       nlp = self._nlp_dict.get(lan, None)
-       if nlp is None:
-         model_id = _lan2model.get(lan, None)
-         if model_id is None:
-           return self._split_into_sents(text)
-         nlp = spacy.load(model_id)
-         self._nlp_dict[lan] = nlp
-
-     return [s.text for s in nlp(text).sents]
-
-   def _split_into_sents(self, text: str) -> list[str]:
-     cells: list[str] = re.split(r"(\.|!|\?|;|。|!|?|;)", text)
-     return [cells[i] + cells[i+1] for i in range(0, len(cells)-1, 2)]
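For clarity, the regex fallback used when no spaCy model matches the detected language can be reproduced standalone (a sketch, not part of the diff): it splits on sentence-ending punctuation while keeping each terminator attached to its sentence.

import re

def split_into_sents(text: str) -> list[str]:
  # split on sentence terminators, keeping each delimiter via the capturing group,
  # then glue every sentence back together with its own terminator
  cells = re.split(r"(\.|!|\?|;|。|!|?|;)", text)
  return [cells[i] + cells[i + 1] for i in range(0, len(cells) - 1, 2)]

print(split_into_sents("One. Two! Three?"))
# ['One.', ' Two!', ' Three?']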
epub_translator/translator/translator.py DELETED
@@ -1,159 +0,0 @@
- import re
- import os
-
- from typing import Callable, Iterable
- from hashlib import sha256
-
- from .group import Group, Fragment
- from json import loads, dumps
- from .llm import LLM
-
-
- _LAN_FULL_NAMES: dict[str, str] = {
-   "en": "English",
-   "cn": "simplified Chinese",
-   "ja": "Japanese",
-   "fr": "French",
-   "ru": "Russian",
-   "de": "German",
- }
-
- class Translator:
-   def __init__(
-     self,
-     group_max_tokens: int,
-     cache_path: str,
-     key: str | None,
-     url: str | None,
-     model: str,
-     temperature: float,
-     timeout: float | None,
-     source_lan: str,
-     target_lan: str,
-     streaming: bool) -> None:
-
-     self._streaming: bool = streaming
-     self._group: Group = Group(
-       group_max_tokens=group_max_tokens,
-       gap_rate=0.1,
-     )
-     self._cache_path: str = cache_path
-     self._llm = LLM(
-       key=key,
-       url=url,
-       model=model,
-       temperature=temperature,
-       timeout=timeout,
-     )
-     self._admin_prompt: str = _gen_admin_prompt(
-       source_lan=self._lan_full_name(source_lan),
-       target_lan=self._lan_full_name(target_lan),
-     )
-
-   def translate(self, source_texts: list[str], report_progress: Callable[[float], None]) -> list[str]:
-     body_fragments: list[Fragment] = []
-     target_texts: list[str] = [""] * len(source_texts)
-     splitted = list(self._group.split(source_texts))
-
-     for i, (head, body, tail) in enumerate(splitted):
-       body_fragments.extend(body)
-       self._translate_fragments(
-         fragments=head + body + tail,
-         report_progress=lambda p, i=i: report_progress(
-           (float(i) + p) / len(splitted),
-         ),
-       )
-     for fragment in body_fragments:
-       target_texts[fragment.index] += fragment.target
-
-     return target_texts
-
-   def _translate_fragments(self, fragments: list[Fragment], report_progress: Callable[[float], None]) -> list[Fragment]:
-     texts: list[str] = []
-     translated_texts: list[str] = []
-     indexes: list[int] = []
-     for index, fragment in enumerate(fragments):
-       text = fragment.origin.strip()
-       if text != "":
-         texts.append(text)
-         indexes.append(index)
-
-     if len(texts) > 0:
-       for i, text in enumerate(self._translate_text_by_text(texts)):
-         report_progress(min(1.0, float(i) / float(len(texts))))
-         translated_texts.append(text)
-       report_progress(1.0)
-
-     for index, text in zip(indexes, translated_texts):
-       fragments[index].target = text
-     return fragments
-
-   def _translate_text_by_text(self, texts: list[str]):
-     hash = self._to_hash(texts)
-     cache_file_path = os.path.join(self._cache_path, f"{hash}.json")
-     if os.path.exists(cache_file_path):
-       with open(cache_file_path, "r", encoding="utf-8") as cache_file:
-         for translated_text in loads(cache_file.read()):
-           yield translated_text
-     else:
-       system=self._admin_prompt
-       human="\n".join([f"{i+1}: {t}" for i, t in enumerate(texts)])
-       translated_texts: list[str] = []
-       iter_lines: Iterable[str]
-
-       if self._streaming:
-         iter_lines = self._llm.invoke_response_lines(system, human)
-       else:
-         iter_lines = self._llm.invoke(system, human).split("\n")
-       for line in iter_lines:
-         match = re.search(r"^\d+\:", line)
-         if match:
-           translated_text = re.sub(r"^\d+\:\s*", "", line)
-           yield translated_text
-           translated_texts.append(translated_text)
-
-       with open(cache_file_path, "w", encoding="utf-8") as cache_file:
-         cache_file.write(dumps(
-           obj=translated_texts,
-           ensure_ascii=False,
-           indent=2,
-         ))
-
-
-   def _lan_full_name(self, name: str) -> str:
-     full_name = _LAN_FULL_NAMES.get(name, None)
-     if full_name is None:
-       full_name = _LAN_FULL_NAMES["en"]
-     return full_name
-
-   def _to_hash(self, texts: list[str]) -> str:
-     hash = sha256()
-     for text in texts:
-       data = text.encode(encoding="utf-8")
-       hash.update(data)
-       hash.update(b"\x03") # ETX means string's end
-     return hash.hexdigest()
-
- def _gen_admin_prompt(target_lan: str, source_lan: str) -> str:
-   return f"""
- You are a translator and need to translate the user's {source_lan} text into {target_lan}.
- I want you to replace simplified A0-level words and sentences with more beautiful and elegant, upper level {target_lan} words and sentences. Keep the meaning same, but make them more literary.
- I want you to only reply the translation and nothing else, do not write explanations.
- A number and colon are added to the top of each line of text entered by the user. This number is only used to align the translation text for you and has no meaning in itself. You should delete the number in your mind to understand the user's original text.
- Your translation results should be split into a number of lines, the number of lines is equal to the number of lines in the user's original text. The content of each line should correspond to the corresponding line of the user's original text.
- All user submitted text must be translated. The translated lines must not be missing, added, misplaced, or have their order changed. They must correspond exactly to the original text of the user.
-
- Here is an example. First, the user submits the original text in English (this is just an example):
- 1: IV
- 2: This true without lying, certain & most true:
- 3: That which is below is like that which is above and that which is above is like that which is below to do ye miracles of one only thing.
- 4: .+
- 5: And as all things have been and arose from one by ye mediation of one: so all things have their birth from this one thing by adaptation.
-
- If you are asked to translate into Chinese, you need to submit the translated content in the following format:
- 1: 四
- 2: 这是真的,没有任何虚妄,是确定的,最真实的:
- 3: 上如其下,下如其上,以此来展现“一”的奇迹。
- 4: .+
- 5: 万物皆来自“一”的沉思,万物在“一”的安排下诞生。
- """
epub_translator-0.0.1.dist-info/RECORD DELETED
@@ -1,19 +0,0 @@
- epub_translator/__init__.py,sha256=_R15M7icijpfTrXeDTsJ_LCCBeKn83eZPE8FFTL9AAM,90
- epub_translator/epub/__init__.py,sha256=GWng1nNmf-ugEmN-VPeRBgYtGGGv_SxEz0649RcK43A,117
- epub_translator/epub/content_parser.py,sha256=Ju94SanlYv5fpG71P1M1tg8d1mP47RHJ38RYu5P7h0k,4381
- epub_translator/epub/html/__init__.py,sha256=Am-3WLD0d4eHLo4gW41rCC2ooYa-ZJ_Kua2OrZ9CVoE,32
- epub_translator/epub/html/dom_operator.py,sha256=Ryayv6hG0jEXv7RkXrZTbIP54P0fyPTbMVbymMtBUnU,1935
- epub_translator/epub/html/empty_tags.py,sha256=GSSe-CV4YkUhWv4F0fiiRsf2vz0ZBAsC21Ovnqo5oIA,601
- epub_translator/epub/html/file.py,sha256=KfuJ3QD74VIId9tLNK4JYSbQCjpE8XWvzN6T3tamM60,1966
- epub_translator/epub/html/texts_searcher.py,sha256=Gs1n38CzfpM3G5XeZrW12Mw_JPixaQOyQEc7ew4B1Vs,1251
- epub_translator/epub/types.py,sha256=PlEwlXWeX_S4HkFr4GheZgoR1a0qKby1z-_dzpcntG4,128
- epub_translator/file.py,sha256=tUxDwqCNIeXYqzU_GmjbyKptLF_nBtb8JjVpjgTK4OI,3728
- epub_translator/translator/__init__.py,sha256=qJhlcRMR3t1aEp-vFpJFb_6pUTEWPMTohXaJFDPE5SU,34
- epub_translator/translator/group.py,sha256=TNGgPPjt3ir3v_ODECpRxhvuBatNMo3vqs4YF-Q9mjQ,4243
- epub_translator/translator/llm.py,sha256=eEJEkuzTJlS3-bcLk988LxK8Ttl9JOlSBPKbaOoxY6g,1598
- epub_translator/translator/nlp.py,sha256=5LLHL93873gddS8QJks1qKrvKLMnd9voq358-2FHNqE,990
- epub_translator/translator/translator.py,sha256=hNM-baEqsEIKkZqEQEUKMG6wnYXaSy9ZtiOqree8zQ0,5968
- epub_translator-0.0.1.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
- epub_translator-0.0.1.dist-info/METADATA,sha256=CkVZ-sRTf4yylk2_3gFuTCK2hKPG8iKDkgp53q0yIOw,2404
- epub_translator-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- epub_translator-0.0.1.dist-info/RECORD,,