epub-translator 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. epub_translator/__init__.py +3 -2
  2. epub_translator/data/format.jinja +33 -0
  3. epub_translator/data/translate.jinja +15 -0
  4. epub_translator/epub/__init__.py +2 -3
  5. epub_translator/epub/content_parser.py +2 -2
  6. epub_translator/epub/html/__init__.py +1 -1
  7. epub_translator/epub/html/file.py +56 -41
  8. epub_translator/epub/html/texts_searcher.py +2 -1
  9. epub_translator/llm/__init__.py +1 -0
  10. epub_translator/llm/error.py +49 -0
  11. epub_translator/llm/executor.py +147 -0
  12. epub_translator/llm/increasable.py +35 -0
  13. epub_translator/llm/node.py +197 -0
  14. epub_translator/template.py +50 -0
  15. epub_translator/translation/__init__.py +2 -0
  16. epub_translator/translation/chunk.py +120 -0
  17. epub_translator/translation/splitter.py +77 -0
  18. epub_translator/translation/store.py +37 -0
  19. epub_translator/translation/translation.py +192 -0
  20. epub_translator/translation/types.py +23 -0
  21. epub_translator/translation/utils.py +11 -0
  22. epub_translator/translator.py +169 -0
  23. epub_translator/xml/__init__.py +3 -0
  24. epub_translator/xml/decoder.py +71 -0
  25. epub_translator/xml/encoder.py +95 -0
  26. epub_translator/xml/parser.py +172 -0
  27. epub_translator/xml/tag.py +93 -0
  28. epub_translator/xml/transform.py +34 -0
  29. epub_translator/xml/utils.py +12 -0
  30. epub_translator/zip_context.py +74 -0
  31. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/METADATA +5 -7
  32. epub_translator-0.0.3.dist-info/RECORD +36 -0
  33. epub_translator/epub/types.py +0 -4
  34. epub_translator/file.py +0 -124
  35. epub_translator/translator/__init__.py +0 -1
  36. epub_translator/translator/group.py +0 -140
  37. epub_translator/translator/llm.py +0 -58
  38. epub_translator/translator/nlp.py +0 -36
  39. epub_translator/translator/translator.py +0 -159
  40. epub_translator-0.0.1.dist-info/RECORD +0 -19
  41. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/LICENSE +0 -0
  42. {epub_translator-0.0.1.dist-info → epub_translator-0.0.3.dist-info}/WHEEL +0 -0
epub_translator/file.py DELETED
@@ -1,124 +0,0 @@
- import io
- import os
- import zipfile
- import tempfile
- import shutil
-
- from typing import Callable
- from lxml.etree import parse
- from .epub import translate_html, Translate, EpubContent
-
-
- ProgressReporter = Callable[[float], None]
-
- def translate_epub_file(
-   translate: Translate,
-   file_path: str,
-   book_title: str | None,
-   report_progress: ProgressReporter,
- ) -> bytes:
-
-   unzip_path = tempfile.mkdtemp()
-   try:
-     with zipfile.ZipFile(file_path, "r") as zip_ref:
-       for member in zip_ref.namelist():
-         target_path = os.path.join(unzip_path, member)
-         if member.endswith("/"):
-           os.makedirs(target_path, exist_ok=True)
-         else:
-           target_dir_path = os.path.dirname(target_path)
-           os.makedirs(target_dir_path, exist_ok=True)
-           with zip_ref.open(member) as source, open(target_path, "wb") as file:
-             file.write(source.read())
-
-     _translate_folder(
-       translate=translate,
-       path=unzip_path,
-       book_title=book_title,
-       report_progress=report_progress,
-     )
-     in_memory_zip = io.BytesIO()
-
-     with zipfile.ZipFile(in_memory_zip, "w") as zip_file:
-       for root, _, files in os.walk(unzip_path):
-         for file in files:
-           file_path = os.path.join(root, file)
-           relative_path = os.path.relpath(file_path, unzip_path)
-           zip_file.write(file_path, arcname=relative_path)
-
-     in_memory_zip.seek(0)
-     zip_data = in_memory_zip.read()
-
-     return zip_data
-
-   finally:
-     shutil.rmtree(unzip_path)
-
- def _translate_folder(
-   translate: Translate,
-   path: str,
-   book_title: str | None,
-   report_progress: ProgressReporter,
- ) -> None:
-   epub_content = EpubContent(path)
-   if book_title is None:
-     book_title = epub_content.title
-     if book_title is not None:
-       book_title = _link_translated(book_title, translate([book_title], lambda _: None)[0])
-
-   if book_title is not None:
-     epub_content.title = book_title
-
-   authors = epub_content.authors
-   to_authors = translate(authors, lambda _: None)
-
-   for i, author in enumerate(authors):
-     authors[i] = _link_translated(author, to_authors[i])
-
-   epub_content.authors = authors
-   epub_content.save()
-
-   _translate_ncx(epub_content, translate)
-   _translate_spines(epub_content, translate, report_progress)
-
- def _translate_ncx(epub_content: EpubContent, translate: Translate):
-   ncx_path = epub_content.ncx_path
-
-   if ncx_path is not None:
-     tree = parse(ncx_path)
-     root = tree.getroot()
-     namespaces={ "ns": root.nsmap.get(None) }
-     text_doms = []
-     text_list = []
-
-     for text_dom in root.xpath("//ns:text", namespaces=namespaces):
-       text_doms.append(text_dom)
-       text_list.append(text_dom.text or "")
-
-     for index, text in enumerate(translate(text_list, lambda _: None)):
-       text_dom = text_doms[index]
-       text_dom.text = _link_translated(text_dom.text, text)
-
-     tree.write(ncx_path, pretty_print=True)
-
- def _translate_spines(epub_content: EpubContent, translate: Translate, report_progress: ProgressReporter):
-   spines = epub_content.spines
-   for index, spine in enumerate(spines):
-     if spine.media_type == "application/xhtml+xml":
-       file_path = spine.path
-       with open(file_path, "r", encoding="utf-8") as file:
-         content = translate_html(
-           translate=translate,
-           file_content=file.read(),
-           report_progress=lambda p, i=index: report_progress((float(i) + p) / len(spines)),
-         )
-       with open(file_path, "w", encoding="utf-8") as file:
-         file.write(content)
-
-     report_progress(float(index + 1) / len(spines))
-
- def _link_translated(origin: str, target: str) -> str:
-   if origin == target:
-     return origin
-   else:
-     return f"{origin} - {target}"
epub_translator/translator/__init__.py DELETED
@@ -1 +0,0 @@
- from .translator import Translator
epub_translator/translator/group.py DELETED
@@ -1,140 +0,0 @@
- import tiktoken
-
- from dataclasses import dataclass
- from typing import Any, Generator, Iterable
- from resource_segmentation import split, Segment, Resource, Incision
- from .nlp import NLP
-
-
- @dataclass
- class Fragment:
-   id: int
-   origin: str
-   target: str
-   tokens: int
-   index: int
-
- @dataclass
- class _Sentence:
-   index: int
-   tokens: list[int]
-   text: str
-
- class Group:
-   def __init__(self, group_max_tokens: int, gap_rate: float) -> None:
-     self._encoder: tiktoken.Encoding = tiktoken.get_encoding("o200k_base")
-     self._nlp: NLP = NLP()
-     self._next_id: int = 0
-     self._group_max_tokens: int = group_max_tokens
-     self._gap_rate: float = gap_rate
-
-   def split(self, texts: Iterable[str]) -> Generator[tuple[list[Fragment], list[Fragment], list[Fragment]], Any, None]:
-     for group in split(
-       max_segment_count=self._group_max_tokens,
-       gap_rate=self._gap_rate,
-       resources=self._gen_resources(texts),
-     ):
-       head_fragments = self._handle_gap_sentences(
-         sentences_iter=self._extract_sentences(group.head),
-         remain_tokens=group.head_remain_count,
-         clip_head=True,
-       )
-       body_fragments = self._extract_sentences(group.body)
-       tail_fragments = self._handle_gap_sentences(
-         sentences_iter=self._extract_sentences(group.tail),
-         remain_tokens=group.tail_remain_count,
-         clip_head=False,
-       )
-       yield (
-         list(self._to_fragments(head_fragments)),
-         list(self._to_fragments(body_fragments)),
-         list(self._to_fragments(tail_fragments)),
-       )
-
-   def _gen_resources(self, texts: Iterable[str]) -> Generator[Resource[_Sentence], None, None]:
-     for index, text in enumerate(texts):
-       sentences = self._nlp.split_into_sents(text)
-       for i, text in enumerate(sentences):
-         sentence = _Sentence(
-           text=text,
-           index=index,
-           tokens=self._encoder.encode(text)
-         )
-         start_incision: Incision = Incision.MOST_LIKELY
-         end_incision: Incision = Incision.MOST_LIKELY
-
-         if i == 0:
-           start_incision = Incision.IMPOSSIBLE
-         if i == len(sentences) - 1:
-           end_incision = Incision.IMPOSSIBLE
-
-         yield Resource(
-           count=len(sentence.tokens),
-           payload=sentence,
-           start_incision=start_incision,
-           end_incision=end_incision,
-         )
-
-   def _extract_sentences(self, items: list[Resource[_Sentence] | Segment[_Sentence]]) -> Generator[_Sentence, None, None]:
-     for item in items:
-       if isinstance(item, Resource):
-         yield item.payload
-       elif isinstance(item, Segment):
-         for resource in item.resources:
-           yield resource.payload
-
-   def _handle_gap_sentences(
-     self,
-     sentences_iter: Iterable[_Sentence],
-     remain_tokens: int,
-     clip_head: bool,
-   ) -> Generator[_Sentence, None, None]:
-
-     sentences = list(sentences_iter)
-
-     if self._need_clip(sentences, remain_tokens):
-       sentence = sentences[0]
-       if clip_head:
-         tokens = sentence.tokens[len(sentence.tokens) - remain_tokens:]
-       else:
-         tokens: list[int] = sentence.tokens[:remain_tokens]
-
-       yield _Sentence(
-         index=sentence.index,
-         tokens=tokens,
-         text=self._encoder.decode(tokens),
-       )
-     else:
-       yield from sentences
-
-   def _need_clip(self, sentences: list[_Sentence], remain_tokens: int) -> bool:
-     if len(sentences) == 1:
-       sentence = sentences[0]
-       if len(sentence.tokens) > remain_tokens:
-         return True
-     return False
-
-   def _to_fragments(self, sentences: Iterable[_Sentence]):
-     fragment: Fragment | None = None
-     for sentence in sentences:
-       if fragment is None:
-         fragment = self._create_fragment(sentence)
-       elif fragment.index != sentence.index:
-         yield fragment
-         fragment = self._create_fragment(sentence)
-       else:
-         fragment.origin += sentence.text
-         fragment.tokens += len(sentence.tokens)
-     if fragment is not None:
-       yield fragment
-
-   def _create_fragment(self, sentence: _Sentence) -> Fragment:
-     fragment = Fragment(
-       id=self._next_id,
-       index=sentence.index,
-       origin=sentence.text,
-       target="",
-       tokens=len(sentence.tokens),
-     )
-     self._next_id += 1
-     return fragment
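A hypothetical sketch of how the removed Group splitter was driven (not part of the diff). It assumes the 0.0.1 wheel and its resource_segmentation, tiktoken, langid and spaCy model dependencies are available; the token budget and sample texts are illustrative.

from epub_translator.translator.group import Group

group = Group(group_max_tokens=1024, gap_rate=0.1)
texts = [
  "First paragraph of a chapter.",
  "Second paragraph, long enough to be split into several sentences.",
]
for head, body, tail in group.split(texts):
  # head/tail are clipped context windows around each group; only the body
  # fragments are written back, keyed by Fragment.index into the original list
  print(len(head), len(body), len(tail))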
epub_translator/translator/llm.py DELETED
@@ -1,58 +0,0 @@
- from typing import Generator, cast
- from io import StringIO
- from pydantic import SecretStr
- from langchain_core.messages import SystemMessage, HumanMessage, BaseMessageChunk
- from langchain_openai import ChatOpenAI
-
-
- class LLM:
-   def __init__(
-     self,
-     key: str | None,
-     url: str | None,
-     model: str,
-     temperature: float,
-     timeout: float | None,
-   ) -> None:
-     self._timeout: float | None = timeout
-     self._model: ChatOpenAI = ChatOpenAI(
-       api_key=cast(SecretStr, key),
-       base_url=url,
-       model=model,
-       temperature=temperature,
-     )
-
-   def invoke(self, system: str, human: str) -> str:
-     resp = self._model.invoke(
-       timeout=self._timeout,
-       input=[
-         SystemMessage(content=system),
-         HumanMessage(content=human),
-       ],
-     )
-     return str(resp.content)
-
-   def invoke_response_lines(self, system: str, human: str) -> Generator[str, None, None]:
-     stream = self._model.stream(
-       timeout=self._timeout,
-       input=[
-         SystemMessage(content=system),
-         HumanMessage(content=human),
-       ],
-     )
-     line_buffer = StringIO()
-     aggregate: BaseMessageChunk | None = None
-
-     for chunk in stream:
-       fragment = str(chunk.content)
-       aggregate = chunk if aggregate is None else aggregate + chunk
-       lines = fragment.split("\n")
-       if len(lines) > 0:
-         line_buffer.write(lines[0])
-       for line in lines[1:]:
-         yield line_buffer.getvalue()
-         line_buffer = StringIO()
-         line_buffer.write(line)
-
-     # TODO: aggregate.usage_metadata
-     yield line_buffer.getvalue()
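A hypothetical sketch of the removed LLM wrapper in use (not part of the diff). It assumes the 0.0.1 wheel plus langchain-openai and a reachable OpenAI-compatible endpoint; the key and model name are placeholders.

from epub_translator.translator.llm import LLM

llm = LLM(
  key="sk-...",          # placeholder API key
  url=None,              # None means the default OpenAI base URL
  model="gpt-4o-mini",   # illustrative model name
  temperature=0.7,
  timeout=60.0,
)
# invoke_response_lines yields each completed line as chunks stream in, so the
# caller can parse "N: translation" lines before the whole response arrives
for line in llm.invoke_response_lines("You are a translator.", "1: Hello\n2: World"):
  print(line)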
epub_translator/translator/nlp.py DELETED
@@ -1,36 +0,0 @@
- import re
- import spacy
- import langid
- import threading
-
- from spacy.language import Language
-
- _lan2model: dict = {
-   "en": "en_core_web_sm",
-   "zh": "zh_core_web_sm",
-   "fr": "fr_core_news_sm",
-   "ru": "ru_core_news_sm",
-   "de": "de_core_news_sm",
- }
-
- class NLP:
-   def __init__(self) -> None:
-     self._lock: threading.Lock = threading.Lock()
-     self._nlp_dict: dict[str, Language] = {}
-
-   def split_into_sents(self, text: str) -> list[str]:
-     lan, _ = langid.classify(text)
-     with self._lock:
-       nlp = self._nlp_dict.get(lan, None)
-       if nlp is None:
-         model_id = _lan2model.get(lan, None)
-         if model_id is None:
-           return self._split_into_sents(text)
-         nlp = spacy.load(model_id)
-         self._nlp_dict[lan] = nlp
-
-     return [s.text for s in nlp(text).sents]
-
-   def _split_into_sents(self, text: str) -> list[str]:
-     cells: list[str] = re.split(r"(\.|!|\?|;|。|!|?|;)", text)
-     return [cells[i] + cells[i+1] for i in range(0, len(cells)-1, 2)]
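For clarity, the regex fallback used when no spaCy model matches the detected language can be reproduced standalone (a sketch, not part of the diff): it splits on sentence-ending punctuation while keeping each terminator attached to its sentence.

import re

def split_into_sents(text: str) -> list[str]:
  # split on sentence terminators, keeping each delimiter via the capturing group,
  # then glue every sentence back together with its own terminator
  cells = re.split(r"(\.|!|\?|;|。|!|?|;)", text)
  return [cells[i] + cells[i + 1] for i in range(0, len(cells) - 1, 2)]

print(split_into_sents("One. Two! Three?"))
# ['One.', ' Two!', ' Three?']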
epub_translator/translator/translator.py DELETED
@@ -1,159 +0,0 @@
- import re
- import os
-
- from typing import Callable, Iterable
- from hashlib import sha256
-
- from .group import Group, Fragment
- from json import loads, dumps
- from .llm import LLM
-
-
- _LAN_FULL_NAMES: dict[str, str] = {
-   "en": "English",
-   "cn": "simplified Chinese",
-   "ja": "Japanese",
-   "fr": "French",
-   "ru": "Russian",
-   "de": "German",
- }
-
- class Translator:
-   def __init__(
-     self,
-     group_max_tokens: int,
-     cache_path: str,
-     key: str | None,
-     url: str | None,
-     model: str,
-     temperature: float,
-     timeout: float | None,
-     source_lan: str,
-     target_lan: str,
-     streaming: bool) -> None:
-
-     self._streaming: bool = streaming
-     self._group: Group = Group(
-       group_max_tokens=group_max_tokens,
-       gap_rate=0.1,
-     )
-     self._cache_path: str = cache_path
-     self._llm = LLM(
-       key=key,
-       url=url,
-       model=model,
-       temperature=temperature,
-       timeout=timeout,
-     )
-     self._admin_prompt: str = _gen_admin_prompt(
-       source_lan=self._lan_full_name(source_lan),
-       target_lan=self._lan_full_name(target_lan),
-     )
-
-   def translate(self, source_texts: list[str], report_progress: Callable[[float], None]) -> list[str]:
-     body_fragments: list[Fragment] = []
-     target_texts: list[str] = [""] * len(source_texts)
-     splitted = list(self._group.split(source_texts))
-
-     for i, (head, body, tail) in enumerate(splitted):
-       body_fragments.extend(body)
-       self._translate_fragments(
-         fragments=head + body + tail,
-         report_progress=lambda p, i=i: report_progress(
-           (float(i) + p) / len(splitted),
-         ),
-       )
-     for fragment in body_fragments:
-       target_texts[fragment.index] += fragment.target
-
-     return target_texts
-
-   def _translate_fragments(self, fragments: list[Fragment], report_progress: Callable[[float], None]) -> list[Fragment]:
-     texts: list[str] = []
-     translated_texts: list[str] = []
-     indexes: list[int] = []
-     for index, fragment in enumerate(fragments):
-       text = fragment.origin.strip()
-       if text != "":
-         texts.append(text)
-         indexes.append(index)
-
-     if len(texts) > 0:
-       for i, text in enumerate(self._translate_text_by_text(texts)):
-         report_progress(min(1.0, float(i) / float(len(texts))))
-         translated_texts.append(text)
-       report_progress(1.0)
-
-     for index, text in zip(indexes, translated_texts):
-       fragments[index].target = text
-     return fragments
-
-   def _translate_text_by_text(self, texts: list[str]):
-     hash = self._to_hash(texts)
-     cache_file_path = os.path.join(self._cache_path, f"{hash}.json")
-     if os.path.exists(cache_file_path):
-       with open(cache_file_path, "r", encoding="utf-8") as cache_file:
-         for translated_text in loads(cache_file.read()):
-           yield translated_text
-     else:
-       system=self._admin_prompt
-       human="\n".join([f"{i+1}: {t}" for i, t in enumerate(texts)])
-       translated_texts: list[str] = []
-       iter_lines: Iterable[str]
-
-       if self._streaming:
-         iter_lines = self._llm.invoke_response_lines(system, human)
-       else:
-         iter_lines = self._llm.invoke(system, human).split("\n")
-       for line in iter_lines:
-         match = re.search(r"^\d+\:", line)
-         if match:
-           translated_text = re.sub(r"^\d+\:\s*", "", line)
-           yield translated_text
-           translated_texts.append(translated_text)
-
-       with open(cache_file_path, "w", encoding="utf-8") as cache_file:
-         cache_file.write(dumps(
-           obj=translated_texts,
-           ensure_ascii=False,
-           indent=2,
-         ))
-
-
-   def _lan_full_name(self, name: str) -> str:
-     full_name = _LAN_FULL_NAMES.get(name, None)
-     if full_name is None:
-       full_name = _LAN_FULL_NAMES["en"]
-     return full_name
-
-   def _to_hash(self, texts: list[str]) -> str:
-     hash = sha256()
-     for text in texts:
-       data = text.encode(encoding="utf-8")
-       hash.update(data)
-       hash.update(b"\x03") # ETX means string's end
-     return hash.hexdigest()
-
- def _gen_admin_prompt(target_lan: str, source_lan: str) -> str:
-   return f"""
- You are a translator and need to translate the user's {source_lan} text into {target_lan}.
- I want you to replace simplified A0-level words and sentences with more beautiful and elegant, upper level {target_lan} words and sentences. Keep the meaning same, but make them more literary.
- I want you to only reply the translation and nothing else, do not write explanations.
- A number and colon are added to the top of each line of text entered by the user. This number is only used to align the translation text for you and has no meaning in itself. You should delete the number in your mind to understand the user's original text.
- Your translation results should be split into a number of lines, the number of lines is equal to the number of lines in the user's original text. The content of each line should correspond to the corresponding line of the user's original text.
- All user submitted text must be translated. The translated lines must not be missing, added, misplaced, or have their order changed. They must correspond exactly to the original text of the user.
-
- Here is an example. First, the user submits the original text in English (this is just an example):
- 1: IV
- 2: This true without lying, certain & most true:
- 3: That which is below is like that which is above and that which is above is like that which is below to do ye miracles of one only thing.
- 4: .+
- 5: And as all things have been and arose from one by ye mediation of one: so all things have their birth from this one thing by adaptation.
-
- If you are asked to translate into Chinese, you need to submit the translated content in the following format:
- 1: 四
- 2: 这是真的,没有任何虚妄,是确定的,最真实的:
- 3: 上如其下,下如其上,以此来展现“一”的奇迹。
- 4: .+
- 5: 万物皆来自“一”的沉思,万物在“一”的安排下诞生。
- """
epub_translator-0.0.1.dist-info/RECORD DELETED
@@ -1,19 +0,0 @@
- epub_translator/__init__.py,sha256=_R15M7icijpfTrXeDTsJ_LCCBeKn83eZPE8FFTL9AAM,90
- epub_translator/epub/__init__.py,sha256=GWng1nNmf-ugEmN-VPeRBgYtGGGv_SxEz0649RcK43A,117
- epub_translator/epub/content_parser.py,sha256=Ju94SanlYv5fpG71P1M1tg8d1mP47RHJ38RYu5P7h0k,4381
- epub_translator/epub/html/__init__.py,sha256=Am-3WLD0d4eHLo4gW41rCC2ooYa-ZJ_Kua2OrZ9CVoE,32
- epub_translator/epub/html/dom_operator.py,sha256=Ryayv6hG0jEXv7RkXrZTbIP54P0fyPTbMVbymMtBUnU,1935
- epub_translator/epub/html/empty_tags.py,sha256=GSSe-CV4YkUhWv4F0fiiRsf2vz0ZBAsC21Ovnqo5oIA,601
- epub_translator/epub/html/file.py,sha256=KfuJ3QD74VIId9tLNK4JYSbQCjpE8XWvzN6T3tamM60,1966
- epub_translator/epub/html/texts_searcher.py,sha256=Gs1n38CzfpM3G5XeZrW12Mw_JPixaQOyQEc7ew4B1Vs,1251
- epub_translator/epub/types.py,sha256=PlEwlXWeX_S4HkFr4GheZgoR1a0qKby1z-_dzpcntG4,128
- epub_translator/file.py,sha256=tUxDwqCNIeXYqzU_GmjbyKptLF_nBtb8JjVpjgTK4OI,3728
- epub_translator/translator/__init__.py,sha256=qJhlcRMR3t1aEp-vFpJFb_6pUTEWPMTohXaJFDPE5SU,34
- epub_translator/translator/group.py,sha256=TNGgPPjt3ir3v_ODECpRxhvuBatNMo3vqs4YF-Q9mjQ,4243
- epub_translator/translator/llm.py,sha256=eEJEkuzTJlS3-bcLk988LxK8Ttl9JOlSBPKbaOoxY6g,1598
- epub_translator/translator/nlp.py,sha256=5LLHL93873gddS8QJks1qKrvKLMnd9voq358-2FHNqE,990
- epub_translator/translator/translator.py,sha256=hNM-baEqsEIKkZqEQEUKMG6wnYXaSy9ZtiOqree8zQ0,5968
- epub_translator-0.0.1.dist-info/LICENSE,sha256=5RF32sL3LtMOJIErdDKp1ZEYPGXS8WPpsiSz_jMBnGI,1066
- epub_translator-0.0.1.dist-info/METADATA,sha256=CkVZ-sRTf4yylk2_3gFuTCK2hKPG8iKDkgp53q0yIOw,2404
- epub_translator-0.0.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- epub_translator-0.0.1.dist-info/RECORD,,