epub-translator 0.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ from .translator import Translator
2
+ from .file import translate_epub_file, ProgressReporter
@@ -0,0 +1,3 @@
1
+ from .content_parser import EpubContent
2
+ from .types import Translate, ReportProgress
3
+ from .html import translate_html
@@ -0,0 +1,162 @@
1
+ import os
2
+ import re
3
+
4
+ from lxml.etree import parse, Element, QName
5
+ from html import escape
6
+
7
+
8
+ # TODO replace with XML
9
+ class Spine:
10
+ def __init__(self, folder_path, base_path, item):
11
+ self._folder_path = folder_path
12
+ self._base_path = base_path
13
+ self.href = item.get("href")
14
+ self.media_type = item.get("media-type")
15
+
16
+ @property
17
+ def path(self):
18
+ path = os.path.join(self._base_path, self.href)
19
+ path = os.path.abspath(path)
20
+
21
+ if os.path.exists(path):
22
+ return path
23
+
24
+ path = os.path.join(self._folder_path, self.href)
25
+ path = os.path.abspath(path)
26
+ return path
27
+
28
+ class EpubContent:
29
+ def __init__(self, path: str):
30
+ self.folder_path = path
31
+ self._content_path = self._find_content_path(path)
32
+ self._tree = parse(self._content_path)
33
+ self._namespaces = { "ns": self._tree.getroot().nsmap.get(None) }
34
+ self._spine = self._tree.xpath("//ns:spine", namespaces=self._namespaces)[0]
35
+ self._metadata = self._tree.xpath("//ns:metadata", namespaces=self._namespaces)[0]
36
+ self._manifest = self._tree.xpath("//ns:manifest", namespaces=self._namespaces)[0]
37
+
38
+ def save(self):
39
+ self._tree.write(self._content_path, pretty_print=True)
40
+
41
+ def _find_content_path(self, path: str) -> str:
42
+ root = parse(os.path.join(path, "META-INF", "container.xml")).getroot()
43
+ rootfile = root.xpath(
44
+ "//ns:container/ns:rootfiles/ns:rootfile",
45
+ namespaces={ "ns": root.nsmap.get(None) },
46
+ )[0]
47
+ full_path = rootfile.attrib["full-path"]
48
+ joined_path = os.path.join(path, full_path)
49
+
50
+ return os.path.abspath(joined_path)
51
+
52
+ @property
53
+ def ncx_path(self):
54
+ ncx_dom = self._manifest.find(".//*[@id=\"ncx\"]")
55
+ if ncx_dom is not None:
56
+ href_path = ncx_dom.get("href")
57
+ base_path = os.path.dirname(self._content_path)
58
+ path = os.path.join(base_path, href_path)
59
+ path = os.path.abspath(path)
60
+
61
+ if os.path.exists(path):
62
+ return path
63
+
64
+ path = os.path.join(self.folder_path, href_path)
65
+ path = os.path.abspath(path)
66
+ return path
67
+
68
+ @property
69
+ def spines(self):
70
+ idref_dict = {}
71
+ index = 0
72
+
73
+ for child in self._spine.iterchildren():
74
+ id = child.get("idref")
75
+ idref_dict[id] = index
76
+ index += 1
77
+
78
+ items = [None for _ in range(index)]
79
+ spines = []
80
+
81
+ for child in self._manifest.iterchildren():
82
+ id = child.get("id")
83
+ if id in idref_dict:
84
+ index = idref_dict[id]
85
+ items[index] = child
86
+
87
+ base_path = os.path.dirname(self._content_path)
88
+
89
+ for item in items:
90
+ if item is not None:
91
+ spines.append(Spine(
92
+ folder_path=self.folder_path,
93
+ base_path=base_path,
94
+ item=item,
95
+ ))
96
+
97
+ return spines
98
+
99
+ @property
100
+ def title(self):
101
+ title_dom = self._get_title()
102
+ if title_dom is None:
103
+ return None
104
+ return title_dom.text
105
+
106
+ @title.setter
107
+ def title(self, title: str):
108
+ title_dom = self._get_title()
109
+ if title_dom is not None:
110
+ title_dom.text = _escape_ascii(title)
111
+
112
+ def _get_title(self):
113
+ titles = self._metadata.xpath(
114
+ "./dc:title",
115
+ namespaces={
116
+ "dc": self._metadata.nsmap.get("dc"),
117
+ },
118
+ )
119
+ if len(titles) == 0:
120
+ return None
121
+ return titles[0]
122
+
123
+ @property
124
+ def authors(self) -> list[str]:
125
+ return list(map(lambda x: x.text, self._get_creators()))
126
+
127
+ @authors.setter
128
+ def authors(self, authors):
129
+ creator_doms = self._get_creators()
130
+ if len(creator_doms) == 0:
131
+ return
132
+ parent_dom = creator_doms[0].getparent()
133
+ index_at_parent = parent_dom.index(creator_doms[0])
134
+ ns={
135
+ "dc": self._metadata.nsmap.get("dc"),
136
+ "opf": self._metadata.nsmap.get("opf"),
137
+ }
138
+ for author in reversed(authors):
139
+ creator_dom = Element(QName(ns["dc"], "creator"))
140
+ creator_dom.set(QName(ns["opf"], "file-as"), author)
141
+ creator_dom.set(QName(ns["opf"], "role"), "aut")
142
+ creator_dom.text = _escape_ascii(author)
143
+ parent_dom.insert(index_at_parent, creator_dom)
144
+
145
+ for creator_dom in creator_doms:
146
+ parent_dom.remove(creator_dom)
147
+
148
+ def _get_creators(self):
149
+ return self._metadata.xpath(
150
+ "./dc:creator",
151
+ namespaces={
152
+ "dc": self._metadata.nsmap.get("dc"),
153
+ },
154
+ )
155
+
156
+ def _escape_ascii(content: str) -> str:
157
+ content = escape(content)
158
+ content = re.sub(
159
+ r"\\u([\da-fA-F]{4})",
160
+ lambda x: chr(int(x.group(1), 16)), content,
161
+ )
162
+ return content
@@ -0,0 +1 @@
1
+ from .file import translate_html
@@ -0,0 +1,62 @@
1
+ from io import StringIO
2
+ from typing import cast, Generator, Iterable
3
+ from xml.etree.ElementTree import Element
4
+ from .texts_searcher import search_texts, TextPosition
5
+
6
+
7
+ def read_texts(root: Element) -> Generator[str, None, None]:
8
+ for element, position, _ in search_texts(root):
9
+ if position == TextPosition.WHOLE_DOM:
10
+ yield _plain_text(element)
11
+ elif position == TextPosition.TEXT:
12
+ yield cast(str, element.text)
13
+ elif position == TextPosition.TAIL:
14
+ yield cast(str, element.tail)
15
+
16
+ def append_texts(root: Element, texts: Iterable[str | Iterable[str] | None]):
17
+ zip_list = list(zip(texts, search_texts(root)))
18
+ for text, (element, position, parent) in reversed(zip_list):
19
+ if text is None:
20
+ continue
21
+ if not isinstance(text, str):
22
+ # TODO: implement splitting the text instead of joining it
23
+ text = "".join(text)
24
+ if position == TextPosition.WHOLE_DOM:
25
+ if parent is not None:
26
+ _append_dom(parent, element, text)
27
+ elif position == TextPosition.TEXT:
28
+ element.text = _append_text(element.text, text)
29
+ elif position == TextPosition.TAIL:
30
+ element.tail = _append_text(element.tail, text)
31
+
32
+ def _append_dom(parent: Element, origin: Element, text: str):
33
+ appended = Element(origin.tag, {**origin.attrib})
34
+ for index, child in enumerate(parent):
35
+ if child == origin:
36
+ parent.insert(index + 1, appended)
37
+ break
38
+
39
+ appended.attrib.pop("id", None)
40
+ appended.text = text
41
+ appended.tail = origin.tail
42
+ origin.tail = None
43
+
44
+ def _append_text(left: str | None, right: str) -> str:
45
+ if left is None:
46
+ return right
47
+ else:
48
+ return left + right
49
+
50
+ def _plain_text(target: Element):
51
+ buffer = StringIO()
52
+ for text in _iter_text(target):
53
+ buffer.write(text)
54
+ return buffer.getvalue()
55
+
56
+ def _iter_text(parent: Element):
57
+ if parent.text is not None:
58
+ yield parent.text
59
+ for child in parent:
60
+ yield from _iter_text(child)
61
+ if parent.tail is not None:
62
+ yield parent.tail
@@ -0,0 +1,23 @@
1
+ import re
2
+
3
+ # HTML defines a set of self-closing (void) tags; these need to be rewritten as non-self-closing tags, because the EPUB format does not support the self-closing form.
4
+ # https://www.tutorialspoint.com/which-html-tags-are-self-closing
5
+ _EMPTY_TAGS = (
6
+ "br",
7
+ "hr",
8
+ "input",
9
+ "col",
10
+ "base",
11
+ "meta",
12
+ "area",
13
+ )
14
+
15
+ _EMPTY_TAG_PATTERN = re.compile(
16
+ r"<(" + "|".join(_EMPTY_TAGS) + r")(\s[^>]*?)\s*/?>"
17
+ )
18
+
19
+ def to_html(content: str) -> str:
20
+ return re.sub(_EMPTY_TAG_PATTERN, lambda m: f"<{m.group(1)}{m.group(2)}>", content)
21
+
22
+ def to_xml(content: str) -> str:
23
+ return re.sub(_EMPTY_TAG_PATTERN, lambda m: f"<{m.group(1)}{m.group(2)} />", content)
@@ -0,0 +1,65 @@
1
+ import re
2
+
3
+ from xml.etree.ElementTree import fromstring, tostring, Element
4
+ from ..types import Translate, ReportProgress
5
+ from .dom_operator import read_texts, append_texts
6
+ from .empty_tags import to_xml, to_html
7
+
8
+
9
+ _FILE_HEAD_PATTERN = re.compile(r"^<\?xml.*?\?>[\s]*<!DOCTYPE.*?>")
10
+ _XMLNS_IN_TAG = re.compile(r"\{[^}]+\}")
11
+ _BRACES = re.compile(r"(\{|\})")
12
+
13
+ def translate_html(translate: Translate, file_content: str, report_progress: ReportProgress) -> str:
14
+ match = re.match(_FILE_HEAD_PATTERN, file_content)
15
+ head = match.group() if match else None
16
+ xml_content = re.sub(_FILE_HEAD_PATTERN, "", to_xml(file_content))
17
+
18
+ root = fromstring(xml_content)
19
+ root_attrib = {**root.attrib}
20
+ xmlns = _extract_xmlns(root)
21
+
22
+ source_texts = list(read_texts(root))
23
+ target_texts = translate(source_texts, report_progress)
24
+ append_texts(root, target_texts)
25
+
26
+ if xmlns is not None:
27
+ root_attrib["xmlns"] = xmlns
28
+ root.attrib = root_attrib
29
+
30
+ if xmlns is None:
31
+ file_content = tostring(root, encoding="unicode")
32
+ file_content = to_html(file_content)
33
+ else:
34
+ # XHTML output must not use self-closing <tag/> here; give empty elements empty text so they serialize as <tag></tag>
35
+ for element in _all_elements(root):
36
+ if element.text is None:
37
+ element.text = ""
38
+ file_content = tostring(root, encoding="unicode")
39
+
40
+ if head is not None:
41
+ file_content = head + file_content
42
+
43
+ return file_content
44
+
45
+ def _extract_xmlns(root: Element) -> str | None:
46
+ root_xmlns: str | None = None
47
+ for i, element in enumerate(_all_elements(root)):
48
+ need_clean_xmlns = True
49
+ match = re.match(_XMLNS_IN_TAG, element.tag)
50
+
51
+ if match:
52
+ xmlns = re.sub(_BRACES, "", match.group())
53
+ if i == 0:
54
+ root_xmlns = xmlns
55
+ elif root_xmlns != xmlns:
56
+ need_clean_xmlns = False
57
+ if need_clean_xmlns:
58
+ element.tag = re.sub(_XMLNS_IN_TAG, "", element.tag)
59
+
60
+ return root_xmlns
61
+
62
+ def _all_elements(parent: Element):
63
+ yield parent
64
+ for child in parent:
65
+ yield from _all_elements(child)
@@ -0,0 +1,45 @@
1
+ from typing import Generator, TypeGuard
2
+ from enum import auto, Enum
3
+ from xml.etree.ElementTree import Element
4
+
5
+
6
+ class TextPosition(Enum):
7
+ WHOLE_DOM = auto()
8
+ TEXT = auto()
9
+ TAIL = auto()
10
+
11
+ # element, position, parent
12
+ TextDescription = tuple[Element, TextPosition, Element | None]
13
+
14
+ _IGNORE_TAGS = (
15
+ "title", "link", "style", "css", "img", "script", "metadata"
16
+ )
17
+
18
+ _TEXT_LEAF_TAGS = (
19
+ "a", "b", "br", "hr", "span", "em", "strong", "label",
20
+ )
21
+
22
+ def search_texts(element: Element, parent: Element | None = None) -> Generator[TextDescription, None, None]:
23
+ if element.tag in _IGNORE_TAGS:
24
+ return
25
+
26
+ if any(c.tag not in _TEXT_LEAF_TAGS for c in element):
27
+ if _is_not_empty_str(element.text):
28
+ yield element, TextPosition.TEXT, parent
29
+ for child in element:
30
+ if child.tag in _TEXT_LEAF_TAGS:
31
+ yield child, TextPosition.WHOLE_DOM, element
32
+ else:
33
+ yield from search_texts(child, element)
34
+ if _is_not_empty_str(child.tail):
35
+ yield child, TextPosition.TAIL, element
36
+ else:
37
+ yield element, TextPosition.WHOLE_DOM, parent
38
+
39
+ def _is_not_empty_str(text: str | None) -> TypeGuard[str]:
40
+ if text is None:
41
+ return False
42
+ for char in text:
43
+ if char not in (" ", "\n"):
44
+ return True
45
+ return False
@@ -0,0 +1,4 @@
1
+ from typing import Callable
2
+
3
+ ReportProgress = Callable[[float], None]
4
+ Translate = Callable[[list[str], ReportProgress], list[str]]
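As an editorial aside (not part of the package), the following minimal sketch illustrates the `Translate` / `ReportProgress` contract defined above: the callable receives the source strings plus a progress callback and must return a list aligned one-to-one with its input. The name `fake_translate` and its upper-casing behavior are purely illustrative assumptions.

```python
from typing import Callable

ReportProgress = Callable[[float], None]
Translate = Callable[[list[str], ReportProgress], list[str]]

def fake_translate(texts: list[str], report_progress: ReportProgress) -> list[str]:
    # Stand-in "translation": upper-case each string while reporting progress.
    # The result must keep a 1:1 positional correspondence with `texts`.
    results: list[str] = []
    total = max(len(texts), 1)  # avoid division by zero on empty input
    for i, text in enumerate(texts):
        results.append(text.upper())
        report_progress((i + 1) / total)
    return results

if __name__ == "__main__":
    print(fake_translate(["hello", "world"], lambda p: print(f"progress {p:.0%}")))
```

Any callable with this shape (for example `Translator.translate`, defined later in this diff) can be passed wherever a `Translate` is expected.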
@@ -0,0 +1,124 @@
1
+ import io
2
+ import os
3
+ import zipfile
4
+ import tempfile
5
+ import shutil
6
+
7
+ from typing import Callable
8
+ from lxml.etree import parse
9
+ from .epub import translate_html, Translate, EpubContent
10
+
11
+
12
+ ProgressReporter = Callable[[float], None]
13
+
14
+ def translate_epub_file(
15
+ translate: Translate,
16
+ file_path: str,
17
+ book_title: str | None,
18
+ report_progress: ProgressReporter,
19
+ ) -> bytes:
20
+
21
+ unzip_path = tempfile.mkdtemp()
22
+ try:
23
+ with zipfile.ZipFile(file_path, "r") as zip_ref:
24
+ for member in zip_ref.namelist():
25
+ target_path = os.path.join(unzip_path, member)
26
+ if member.endswith("/"):
27
+ os.makedirs(target_path, exist_ok=True)
28
+ else:
29
+ target_dir_path = os.path.dirname(target_path)
30
+ os.makedirs(target_dir_path, exist_ok=True)
31
+ with zip_ref.open(member) as source, open(target_path, "wb") as file:
32
+ file.write(source.read())
33
+
34
+ _translate_folder(
35
+ translate=translate,
36
+ path=unzip_path,
37
+ book_title=book_title,
38
+ report_progress=report_progress,
39
+ )
40
+ in_memory_zip = io.BytesIO()
41
+
42
+ with zipfile.ZipFile(in_memory_zip, "w") as zip_file:
43
+ for root, _, files in os.walk(unzip_path):
44
+ for file in files:
45
+ file_path = os.path.join(root, file)
46
+ relative_path = os.path.relpath(file_path, unzip_path)
47
+ zip_file.write(file_path, arcname=relative_path)
48
+
49
+ in_memory_zip.seek(0)
50
+ zip_data = in_memory_zip.read()
51
+
52
+ return zip_data
53
+
54
+ finally:
55
+ shutil.rmtree(unzip_path)
56
+
57
+ def _translate_folder(
58
+ translate: Translate,
59
+ path: str,
60
+ book_title: str | None,
61
+ report_progress: ProgressReporter,
62
+ ) -> None:
63
+ epub_content = EpubContent(path)
64
+ if book_title is None:
65
+ book_title = epub_content.title
66
+ if book_title is not None:
67
+ book_title = _link_translated(book_title, translate([book_title], lambda _: None)[0])
68
+
69
+ if book_title is not None:
70
+ epub_content.title = book_title
71
+
72
+ authors = epub_content.authors
73
+ to_authors = translate(authors, lambda _: None)
74
+
75
+ for i, author in enumerate(authors):
76
+ authors[i] = _link_translated(author, to_authors[i])
77
+
78
+ epub_content.authors = authors
79
+ epub_content.save()
80
+
81
+ _translate_ncx(epub_content, translate)
82
+ _translate_spines(epub_content, translate, report_progress)
83
+
84
+ def _translate_ncx(epub_content: EpubContent, translate: Translate):
85
+ ncx_path = epub_content.ncx_path
86
+
87
+ if ncx_path is not None:
88
+ tree = parse(ncx_path)
89
+ root = tree.getroot()
90
+ namespaces={ "ns": root.nsmap.get(None) }
91
+ text_doms = []
92
+ text_list = []
93
+
94
+ for text_dom in root.xpath("//ns:text", namespaces=namespaces):
95
+ text_doms.append(text_dom)
96
+ text_list.append(text_dom.text or "")
97
+
98
+ for index, text in enumerate(translate(text_list, lambda _: None)):
99
+ text_dom = text_doms[index]
100
+ text_dom.text = _link_translated(text_dom.text, text)
101
+
102
+ tree.write(ncx_path, pretty_print=True)
103
+
104
+ def _translate_spines(epub_content: EpubContent, translate: Translate, report_progress: ProgressReporter):
105
+ spines = epub_content.spines
106
+ for index, spine in enumerate(spines):
107
+ if spine.media_type == "application/xhtml+xml":
108
+ file_path = spine.path
109
+ with open(file_path, "r", encoding="utf-8") as file:
110
+ content = translate_html(
111
+ translate=translate,
112
+ file_content=file.read(),
113
+ report_progress=lambda p, i=index: report_progress((float(i) + p) / len(spines)),
114
+ )
115
+ with open(file_path, "w", encoding="utf-8") as file:
116
+ file.write(content)
117
+
118
+ report_progress(float(index + 1) / len(spines))
119
+
120
+ def _link_translated(origin: str, target: str) -> str:
121
+ if origin == target:
122
+ return origin
123
+ else:
124
+ return f"{origin} - {target}"
@@ -0,0 +1 @@
1
+ from .translator import Translator
@@ -0,0 +1,140 @@
1
+ import tiktoken
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Generator, Iterable
5
+ from resource_segmentation import split, Segment, Resource, Incision
6
+ from .nlp import NLP
7
+
8
+
9
+ @dataclass
10
+ class Fragment:
11
+ id: int
12
+ origin: str
13
+ target: str
14
+ tokens: int
15
+ index: int
16
+
17
+ @dataclass
18
+ class _Sentence:
19
+ index: int
20
+ tokens: list[int]
21
+ text: str
22
+
23
+ class Group:
24
+ def __init__(self, group_max_tokens: int, gap_rate: float) -> None:
25
+ self._encoder: tiktoken.Encoding = tiktoken.get_encoding("o200k_base")
26
+ self._nlp: NLP = NLP()
27
+ self._next_id: int = 0
28
+ self._group_max_tokens: int = group_max_tokens
29
+ self._gap_rate: float = gap_rate
30
+
31
+ def split(self, texts: Iterable[str]) -> Generator[tuple[list[Fragment], list[Fragment], list[Fragment]], Any, None]:
32
+ for group in split(
33
+ max_segment_count=self._group_max_tokens,
34
+ gap_rate=self._gap_rate,
35
+ resources=self._gen_resources(texts),
36
+ ):
37
+ head_fragments = self._handle_gap_sentences(
38
+ sentences_iter=self._extract_sentences(group.head),
39
+ remain_tokens=group.head_remain_count,
40
+ clip_head=True,
41
+ )
42
+ body_fragments = self._extract_sentences(group.body)
43
+ tail_fragments = self._handle_gap_sentences(
44
+ sentences_iter=self._extract_sentences(group.tail),
45
+ remain_tokens=group.tail_remain_count,
46
+ clip_head=False,
47
+ )
48
+ yield (
49
+ list(self._to_fragments(head_fragments)),
50
+ list(self._to_fragments(body_fragments)),
51
+ list(self._to_fragments(tail_fragments)),
52
+ )
53
+
54
+ def _gen_resources(self, texts: Iterable[str]) -> Generator[Resource[_Sentence], None, None]:
55
+ for index, text in enumerate(texts):
56
+ sentences = self._nlp.split_into_sents(text)
57
+ for i, text in enumerate(sentences):
58
+ sentence = _Sentence(
59
+ text=text,
60
+ index=index,
61
+ tokens=self._encoder.encode(text)
62
+ )
63
+ start_incision: Incision = Incision.MOST_LIKELY
64
+ end_incision: Incision = Incision.MOST_LIKELY
65
+
66
+ if i == 0:
67
+ start_incision = Incision.IMPOSSIBLE
68
+ if i == len(sentences) - 1:
69
+ end_incision = Incision.IMPOSSIBLE
70
+
71
+ yield Resource(
72
+ count=len(sentence.tokens),
73
+ payload=sentence,
74
+ start_incision=start_incision,
75
+ end_incision=end_incision,
76
+ )
77
+
78
+ def _extract_sentences(self, items: list[Resource[_Sentence] | Segment[_Sentence]]) -> Generator[_Sentence, None, None]:
79
+ for item in items:
80
+ if isinstance(item, Resource):
81
+ yield item.payload
82
+ elif isinstance(item, Segment):
83
+ for resource in item.resources:
84
+ yield resource.payload
85
+
86
+ def _handle_gap_sentences(
87
+ self,
88
+ sentences_iter: Iterable[_Sentence],
89
+ remain_tokens: int,
90
+ clip_head: bool,
91
+ ) -> Generator[_Sentence, None, None]:
92
+
93
+ sentences = list(sentences_iter)
94
+
95
+ if self._need_clip(sentences, remain_tokens):
96
+ sentence = sentences[0]
97
+ if clip_head:
98
+ tokens = sentence.tokens[len(sentence.tokens) - remain_tokens:]
99
+ else:
100
+ tokens: list[int] = sentence.tokens[:remain_tokens]
101
+
102
+ yield _Sentence(
103
+ index=sentence.index,
104
+ tokens=tokens,
105
+ text=self._encoder.decode(tokens),
106
+ )
107
+ else:
108
+ yield from sentences
109
+
110
+ def _need_clip(self, sentences: list[_Sentence], remain_tokens: int) -> bool:
111
+ if len(sentences) == 1:
112
+ sentence = sentences[0]
113
+ if len(sentence.tokens) > remain_tokens:
114
+ return True
115
+ return False
116
+
117
+ def _to_fragments(self, sentences: Iterable[_Sentence]):
118
+ fragment: Fragment | None = None
119
+ for sentence in sentences:
120
+ if fragment is None:
121
+ fragment = self._create_fragment(sentence)
122
+ elif fragment.index != sentence.index:
123
+ yield fragment
124
+ fragment = self._create_fragment(sentence)
125
+ else:
126
+ fragment.origin += sentence.text
127
+ fragment.tokens += len(sentence.tokens)
128
+ if fragment is not None:
129
+ yield fragment
130
+
131
+ def _create_fragment(self, sentence: _Sentence) -> Fragment:
132
+ fragment = Fragment(
133
+ id=self._next_id,
134
+ index=sentence.index,
135
+ origin=sentence.text,
136
+ target="",
137
+ tokens=len(sentence.tokens),
138
+ )
139
+ self._next_id += 1
140
+ return fragment
@@ -0,0 +1,58 @@
1
+ from typing import Generator, cast
2
+ from io import StringIO
3
+ from pydantic import SecretStr
4
+ from langchain_core.messages import SystemMessage, HumanMessage, BaseMessageChunk
5
+ from langchain_openai import ChatOpenAI
6
+
7
+
8
+ class LLM:
9
+ def __init__(
10
+ self,
11
+ key: str | None,
12
+ url: str | None,
13
+ model: str,
14
+ temperature: float,
15
+ timeout: float | None,
16
+ ) -> None:
17
+ self._timeout: float | None = timeout
18
+ self._model: ChatOpenAI = ChatOpenAI(
19
+ api_key=cast(SecretStr, key),
20
+ base_url=url,
21
+ model=model,
22
+ temperature=temperature,
23
+ )
24
+
25
+ def invoke(self, system: str, human: str) -> str:
26
+ resp = self._model.invoke(
27
+ timeout=self._timeout,
28
+ input=[
29
+ SystemMessage(content=system),
30
+ HumanMessage(content=human),
31
+ ],
32
+ )
33
+ return str(resp.content)
34
+
35
+ def invoke_response_lines(self, system: str, human: str) -> Generator[str, None, None]:
36
+ stream = self._model.stream(
37
+ timeout=self._timeout,
38
+ input=[
39
+ SystemMessage(content=system),
40
+ HumanMessage(content=human),
41
+ ],
42
+ )
43
+ line_buffer = StringIO()
44
+ aggregate: BaseMessageChunk | None = None
45
+
46
+ for chunk in stream:
47
+ fragment = str(chunk.content)
48
+ aggregate = chunk if aggregate is None else aggregate + chunk
49
+ lines = fragment.split("\n")
50
+ if len(lines) > 0:
51
+ line_buffer.write(lines[0])
52
+ for line in lines[1:]:
53
+ yield line_buffer.getvalue()
54
+ line_buffer = StringIO()
55
+ line_buffer.write(line)
56
+
57
+ # TODO: aggregate.usage_metadata
58
+ yield line_buffer.getvalue()
@@ -0,0 +1,36 @@
1
+ import re
2
+ import spacy
3
+ import langid
4
+ import threading
5
+
6
+ from spacy.language import Language
7
+
8
+ _lan2model: dict = {
9
+ "en": "en_core_web_sm",
10
+ "zh": "zh_core_web_sm",
11
+ "fr": "fr_core_news_sm",
12
+ "ru": "ru_core_news_sm",
13
+ "de": "de_core_news_sm",
14
+ }
15
+
16
+ class NLP:
17
+ def __init__(self) -> None:
18
+ self._lock: threading.Lock = threading.Lock()
19
+ self._nlp_dict: dict[str, Language] = {}
20
+
21
+ def split_into_sents(self, text: str) -> list[str]:
22
+ lan, _ = langid.classify(text)
23
+ with self._lock:
24
+ nlp = self._nlp_dict.get(lan, None)
25
+ if nlp is None:
26
+ model_id = _lan2model.get(lan, None)
27
+ if model_id is None:
28
+ return self._split_into_sents(text)
29
+ nlp = spacy.load(model_id)
30
+ self._nlp_dict[lan] = nlp
31
+
32
+ return [s.text for s in nlp(text).sents]
33
+
34
+ def _split_into_sents(self, text: str) -> list[str]:
35
+ cells: list[str] = re.split(r"(\.|!|\?|;|。|!|?|;)", text)
36
+ return [cells[i] + cells[i+1] for i in range(0, len(cells)-1, 2)] + ([cells[-1]] if cells and cells[-1] else [])  # keep trailing text that has no terminal punctuation
@@ -0,0 +1,159 @@
1
+ import re
2
+ import os
3
+
4
+ from typing import Callable, Iterable
5
+ from hashlib import sha256
6
+
7
+ from .group import Group, Fragment
8
+ from json import loads, dumps
9
+ from .llm import LLM
10
+
11
+
12
+ _LAN_FULL_NAMES: dict[str, str] = {
13
+ "en": "English",
14
+ "cn": "simplified Chinese",
15
+ "ja": "Japanese",
16
+ "fr": "French",
17
+ "ru": "Russian",
18
+ "de": "German",
19
+ }
20
+
21
+ class Translator:
22
+ def __init__(
23
+ self,
24
+ group_max_tokens: int,
25
+ cache_path: str,
26
+ key: str | None,
27
+ url: str | None,
28
+ model: str,
29
+ temperature: float,
30
+ timeout: float | None,
31
+ source_lan: str,
32
+ target_lan: str,
33
+ streaming: bool) -> None:
34
+
35
+ self._streaming: bool = streaming
36
+ self._group: Group = Group(
37
+ group_max_tokens=group_max_tokens,
38
+ gap_rate=0.1,
39
+ )
40
+ self._cache_path: str = cache_path
41
+ self._llm = LLM(
42
+ key=key,
43
+ url=url,
44
+ model=model,
45
+ temperature=temperature,
46
+ timeout=timeout,
47
+ )
48
+ self._admin_prompt: str = _gen_admin_prompt(
49
+ source_lan=self._lan_full_name(source_lan),
50
+ target_lan=self._lan_full_name(target_lan),
51
+ )
52
+
53
+ def translate(self, source_texts: list[str], report_progress: Callable[[float], None]) -> list[str]:
54
+ body_fragments: list[Fragment] = []
55
+ target_texts: list[str] = [""] * len(source_texts)
56
+ splitted = list(self._group.split(source_texts))
57
+
58
+ for i, (head, body, tail) in enumerate(splitted):
59
+ body_fragments.extend(body)
60
+ self._translate_fragments(
61
+ fragments=head + body + tail,
62
+ report_progress=lambda p, i=i: report_progress(
63
+ (float(i) + p) / len(splitted),
64
+ ),
65
+ )
66
+ for fragment in body_fragments:
67
+ target_texts[fragment.index] += fragment.target
68
+
69
+ return target_texts
70
+
71
+ def _translate_fragments(self, fragments: list[Fragment], report_progress: Callable[[float], None]) -> list[Fragment]:
72
+ texts: list[str] = []
73
+ translated_texts: list[str] = []
74
+ indexes: list[int] = []
75
+ for index, fragment in enumerate(fragments):
76
+ text = fragment.origin.strip()
77
+ if text != "":
78
+ texts.append(text)
79
+ indexes.append(index)
80
+
81
+ if len(texts) > 0:
82
+ for i, text in enumerate(self._translate_text_by_text(texts)):
83
+ report_progress(min(1.0, float(i) / float(len(texts))))
84
+ translated_texts.append(text)
85
+ report_progress(1.0)
86
+
87
+ for index, text in zip(indexes, translated_texts):
88
+ fragments[index].target = text
89
+ return fragments
90
+
91
+ def _translate_text_by_text(self, texts: list[str]):
92
+ hash = self._to_hash(texts)
93
+ cache_file_path = os.path.join(self._cache_path, f"{hash}.json")
94
+ if os.path.exists(cache_file_path):
95
+ with open(cache_file_path, "r", encoding="utf-8") as cache_file:
96
+ for translated_text in loads(cache_file.read()):
97
+ yield translated_text
98
+ else:
99
+ system=self._admin_prompt
100
+ human="\n".join([f"{i+1}: {t}" for i, t in enumerate(texts)])
101
+ translated_texts: list[str] = []
102
+ iter_lines: Iterable[str]
103
+
104
+ if self._streaming:
105
+ iter_lines = self._llm.invoke_response_lines(system, human)
106
+ else:
107
+ iter_lines = self._llm.invoke(system, human).split("\n")
108
+ for line in iter_lines:
109
+ match = re.search(r"^\d+\:", line)
110
+ if match:
111
+ translated_text = re.sub(r"^\d+\:\s*", "", line)
112
+ yield translated_text
113
+ translated_texts.append(translated_text)
114
+
115
+ with open(cache_file_path, "w", encoding="utf-8") as cache_file:
116
+ cache_file.write(dumps(
117
+ obj=translated_texts,
118
+ ensure_ascii=False,
119
+ indent=2,
120
+ ))
121
+
122
+
123
+ def _lan_full_name(self, name: str) -> str:
124
+ full_name = _LAN_FULL_NAMES.get(name, None)
125
+ if full_name is None:
126
+ full_name = _LAN_FULL_NAMES["en"]
127
+ return full_name
128
+
129
+ def _to_hash(self, texts: list[str]) -> str:
130
+ hash = sha256()
131
+ for text in texts:
132
+ data = text.encode(encoding="utf-8")
133
+ hash.update(data)
134
+ hash.update(b"\x03") # ETX means string's end
135
+ return hash.hexdigest()
136
+
137
+ def _gen_admin_prompt(target_lan: str, source_lan: str) -> str:
138
+ return f"""
139
+ You are a translator and need to translate the user's {source_lan} text into {target_lan}.
140
+ I want you to replace simplified A0-level words and sentences with more beautiful and elegant, higher-level {target_lan} words and sentences. Keep the meaning the same, but make them more literary.
141
+ I want you to reply with only the translation and nothing else; do not write explanations.
142
+ A number and a colon are added at the start of each line of text entered by the user. This number is only there to align your translation with the original and has no meaning in itself. Ignore the number when reading the user's original text.
143
+ Your translation must be split into lines, and the number of lines must equal the number of lines in the user's original text. Each line must correspond to the matching line of the user's original text.
144
+ All user-submitted text must be translated. Lines must not be missing, added, misplaced, or reordered; they must correspond exactly to the user's original text.
145
+
146
+ Here is an example. First, the user submits the original text in English (this is just an example):
147
+ 1: IV
148
+ 2: This true without lying, certain & most true:
149
+ 3: That which is below is like that which is above and that which is above is like that which is below to do ye miracles of one only thing.
150
+ 4: .+
151
+ 5: And as all things have been and arose from one by ye mediation of one: so all things have their birth from this one thing by adaptation.
152
+
153
+ If you are asked to translate into Chinese, you need to submit the translated content in the following format:
154
+ 1: 四
155
+ 2: 这是真的,没有任何虚妄,是确定的,最真实的:
156
+ 3: 上如其下,下如其上,以此来展现“一”的奇迹。
157
+ 4: .+
158
+ 5: 万物皆来自“一”的沉思,万物在“一”的安排下诞生。
159
+ """
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 OOMOL Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.3
2
+ Name: epub-translator
3
+ Version: 0.0.1
4
+ Summary: Translate EPUB books using an LLM. The translated book retains the original text and places the translation side by side with it.
5
+ License: MIT
6
+ Author: Tao Zeyu
7
+ Author-email: i@taozeyu.com
8
+ Maintainer: Tao Zeyu
9
+ Maintainer-email: i@taozeyu.com
10
+ Requires-Python: >=3.10,<3.13
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Requires-Dist: langchain (==0.3.23)
17
+ Requires-Dist: langchain-openai (==0.3.13)
18
+ Requires-Dist: langid (>=1.1.6,<2.0.0)
19
+ Requires-Dist: lxml (>=6.0.0,<7.0.0)
20
+ Requires-Dist: resource-segmentation (==0.0.1)
21
+ Requires-Dist: spacy (>=3.8.7,<4.0.0)
22
+ Requires-Dist: tiktoken (>=0.9.0,<0.10.0)
23
+ Requires-Dist: tqdm (>=4.67.1,<5.0.0)
24
+ Project-URL: Homepage, https://hub.oomol.com/package/book-translator
25
+ Project-URL: Repository, https://github.com/oomol-flows/books-translator
26
+ Description-Content-Type: text/markdown
27
+
28
+ # epub-translator
29
+
30
+ Translate EPUB books using an LLM. The translated book retains the original text and places the translation side by side with it.
31
+
32
+ ## Field Description
33
+
34
+ - `file`: the epub file to be translated.
35
+ - `title`: the title of the book to be translated (original language)
36
+ - `max_translating_group`: the maximum amount of text submitted for translation in each request. The book is split into chunks during translation, and this value limits the maximum size of each chunk.
37
+ - `max_translating_group_unit`: the unit used by `max_translating_group`.
38
+ - `source`: the language of the book to be translated.
39
+ - `target`: the target language you want to translate it into.
40
+ - `llm_api`: the LLM API format used for translation.
41
+ - `model`: the model used for translation
42
+ - `url`: the base URL of the LLM API.
43
+ - `api_key`: the API key for the LLM.
44
+ - `temperature`: the temperature of the LLM, which controls the randomness of the generated text: lower values make the output more deterministic and conservative, while higher values make it more random and diverse.
45
+ - `timeout`: the request timeout, in seconds.
46
+ - `binary`: the translated target epub file content.
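As an editorial aside, the fields above map onto the library API shown earlier in this diff (`Translator.__init__` in `translator/translator.py` and `translate_epub_file` in `file.py`). Below is a hedged usage sketch assembled from those signatures; the model name, URL, paths, and numeric values are illustrative assumptions, not documented defaults.

```python
import os
from epub_translator import Translator, translate_epub_file

cache_path = "/tmp/epub-translator-cache"
os.makedirs(cache_path, exist_ok=True)  # the translator writes its JSON cache files here

translator = Translator(
    group_max_tokens=1200,            # rough cap on tokens per chunk sent to the LLM
    cache_path=cache_path,
    key="sk-...",                     # placeholder API key
    url="https://api.openai.com/v1",  # placeholder base URL
    model="gpt-4o-mini",              # placeholder model name
    temperature=0.3,
    timeout=60.0,
    source_lan="en",
    target_lan="cn",                  # language keys used by the package: "en", "cn", "ja", "fr", "ru", "de"
    streaming=True,
)

epub_bytes = translate_epub_file(
    translate=translator.translate,
    file_path="input.epub",
    book_title=None,                  # None falls back to the title stored in the EPUB
    report_progress=lambda p: print(f"{p:.1%}"),
)

with open("output.epub", "wb") as f:
    f.write(epub_bytes)
```

Note that the spaCy models listed in `nlp.py` (for example `en_core_web_sm`) must be downloaded separately; `spacy.load` raises an error if a required model is missing.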
@@ -0,0 +1,19 @@
1
+ epub_translator/__init__.py,sha256=_R15M7icijpfTrXeDTsJ_LCCBeKn83eZPE8FFTL9AAM,90
2
+ epub_translator/epub/__init__.py,sha256=GWng1nNmf-ugEmN-VPeRBgYtGGGv_SxEz0649RcK43A,117
3
+ epub_translator/epub/content_parser.py,sha256=Ju94SanlYv5fpG71P1M1tg8d1mP47RHJ38RYu5P7h0k,4381
4
+ epub_translator/epub/html/__init__.py,sha256=Am-3WLD0d4eHLo4gW41rCC2ooYa-ZJ_Kua2OrZ9CVoE,32
5
+ epub_translator/epub/html/dom_operator.py,sha256=Ryayv6hG0jEXv7RkXrZTbIP54P0fyPTbMVbymMtBUnU,1935
6
+ epub_translator/epub/html/empty_tags.py,sha256=GSSe-CV4YkUhWv4F0fiiRsf2vz0ZBAsC21Ovnqo5oIA,601
7
+ epub_translator/epub/html/file.py,sha256=KfuJ3QD74VIId9tLNK4JYSbQCjpE8XWvzN6T3tamM60,1966
8
+ epub_translator/epub/html/texts_searcher.py,sha256=Gs1n38CzfpM3G5XeZrW12Mw_JPixaQOyQEc7ew4B1Vs,1251
9
+ epub_translator/epub/types.py,sha256=PlEwlXWeX_S4HkFr4GheZgoR1a0qKby1z-_dzpcntG4,128
10
+ epub_translator/file.py,sha256=tUxDwqCNIeXYqzU_GmjbyKptLF_nBtb8JjVpjgTK4OI,3728
11
+ epub_translator/translator/__init__.py,sha256=qJhlcRMR3t1aEp-vFpJFb_6pUTEWPMTohXaJFDPE5SU,34
12
+ epub_translator/translator/group.py,sha256=TNGgPPjt3ir3v_ODECpRxhvuBatNMo3vqs4YF-Q9mjQ,4243
13
+ epub_translator/translator/llm.py,sha256=eEJEkuzTJlS3-bcLk988LxK8Ttl9JOlSBPKbaOoxY6g,1598
14
+ epub_translator/translator/nlp.py,sha256=5LLHL93873gddS8QJks1qKrvKLMnd9voq358-2FHNqE,990
15
+ epub_translator/translator/translator.py,sha256=hNM-baEqsEIKkZqEQEUKMG6wnYXaSy9ZtiOqree8zQ0,5968
16
+ epub_translator-0.0.1.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
17
+ epub_translator-0.0.1.dist-info/METADATA,sha256=CkVZ-sRTf4yylk2_3gFuTCK2hKPG8iKDkgp53q0yIOw,2404
18
+ epub_translator-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
19
+ epub_translator-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 2.1.3
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any