PyPI - epub-translator - Versions diffs - 0.1.7__tar.gz → 0.1.8__tar.gz - Mend

epub-translator 0.1.7__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{epub_translator-0.1.7 → epub_translator-0.1.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-translator
-Version: 0.1.7
+Version: 0.1.8
 Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
 License: MIT
 Keywords: epub,llm,translation,translator
@@ -422,6 +422,105 @@ translate(
 When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
+### Token Usage Monitoring
+Track token consumption during translation to monitor API costs and usage:
+```python
+from epub_translator import LLM, translate, language, SubmitKind
+llm = LLM(
+    key="your-api-key",
+    url="https://api.openai.com/v1",
+    model="gpt-4",
+    token_encoding="o200k_base",
+)
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    llm=llm,
+)
+# Access token statistics after translation
+print(f"Total tokens: {llm.total_tokens}")
+print(f"Input tokens: {llm.input_tokens}")
+print(f"Input cache tokens: {llm.input_cache_tokens}")
+print(f"Output tokens: {llm.output_tokens}")
+```
+**Available Statistics:**
+- `total_tokens` - Total number of tokens used (input + output)
+- `input_tokens` - Number of prompt/input tokens
+- `input_cache_tokens` - Number of cached input tokens (when using prompt caching)
+- `output_tokens` - Number of generated/completion tokens
+**Real-time Monitoring:**
+You can also monitor token usage in real-time during translation:
+```python
+from tqdm import tqdm
+import time
+with tqdm(total=100, desc="Translating", unit="%") as pbar:
+    last_progress = 0.0
+    start_time = time.time()
+    def on_progress(progress: float):
+        nonlocal last_progress
+        increment = (progress - last_progress) * 100
+        pbar.update(increment)
+        last_progress = progress
+        # Update token stats in progress bar
+        pbar.set_postfix({
+            'tokens': llm.total_tokens,
+            'cost_est': f'${llm.total_tokens * 0.00001:.4f}'  # Estimate based on your pricing
+        })
+    translate(
+        source_path="source.epub",
+        target_path="translated.epub",
+        target_language=language.ENGLISH,
+        submit=SubmitKind.APPEND_BLOCK,
+        llm=llm,
+        on_progress=on_progress,
+    )
+    elapsed = time.time() - start_time
+    print(f"\nTranslation completed in {elapsed:.1f}s")
+    print(f"Total tokens used: {llm.total_tokens:,}")
+    print(f"Average tokens/second: {llm.total_tokens/elapsed:.1f}")
+```
+**Dual-LLM Token Tracking:**
+When using separate LLMs for translation and filling, each LLM tracks its own statistics:
+```python
+translation_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+fill_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    translation_llm=translation_llm,
+    fill_llm=fill_llm,
+)
+print(f"Translation tokens: {translation_llm.total_tokens}")
+print(f"Fill tokens: {fill_llm.total_tokens}")
+print(f"Combined total: {translation_llm.total_tokens + fill_llm.total_tokens}")
+```
+**Note:** Token statistics are cumulative across all API calls made by the LLM instance. The counts only increase and are thread-safe when using concurrent translation.
 ## Related Projects
 ### PDF Craft

{epub_translator-0.1.7 → epub_translator-0.1.8}/README.md RENAMED Viewed

@@ -388,6 +388,105 @@ translate(
 When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.
+### Token Usage Monitoring
+Track token consumption during translation to monitor API costs and usage:
+```python
+from epub_translator import LLM, translate, language, SubmitKind
+llm = LLM(
+    key="your-api-key",
+    url="https://api.openai.com/v1",
+    model="gpt-4",
+    token_encoding="o200k_base",
+)
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    llm=llm,
+)
+# Access token statistics after translation
+print(f"Total tokens: {llm.total_tokens}")
+print(f"Input tokens: {llm.input_tokens}")
+print(f"Input cache tokens: {llm.input_cache_tokens}")
+print(f"Output tokens: {llm.output_tokens}")
+```
+**Available Statistics:**
+- `total_tokens` - Total number of tokens used (input + output)
+- `input_tokens` - Number of prompt/input tokens
+- `input_cache_tokens` - Number of cached input tokens (when using prompt caching)
+- `output_tokens` - Number of generated/completion tokens
+**Real-time Monitoring:**
+You can also monitor token usage in real-time during translation:
+```python
+from tqdm import tqdm
+import time
+with tqdm(total=100, desc="Translating", unit="%") as pbar:
+    last_progress = 0.0
+    start_time = time.time()
+    def on_progress(progress: float):
+        nonlocal last_progress
+        increment = (progress - last_progress) * 100
+        pbar.update(increment)
+        last_progress = progress
+        # Update token stats in progress bar
+        pbar.set_postfix({
+            'tokens': llm.total_tokens,
+            'cost_est': f'${llm.total_tokens * 0.00001:.4f}'  # Estimate based on your pricing
+        })
+    translate(
+        source_path="source.epub",
+        target_path="translated.epub",
+        target_language=language.ENGLISH,
+        submit=SubmitKind.APPEND_BLOCK,
+        llm=llm,
+        on_progress=on_progress,
+    )
+    elapsed = time.time() - start_time
+    print(f"\nTranslation completed in {elapsed:.1f}s")
+    print(f"Total tokens used: {llm.total_tokens:,}")
+    print(f"Average tokens/second: {llm.total_tokens/elapsed:.1f}")
+```
+**Dual-LLM Token Tracking:**
+When using separate LLMs for translation and filling, each LLM tracks its own statistics:
+```python
+translation_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+fill_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    translation_llm=translation_llm,
+    fill_llm=fill_llm,
+)
+print(f"Translation tokens: {translation_llm.total_tokens}")
+print(f"Fill tokens: {fill_llm.total_tokens}")
+print(f"Combined total: {translation_llm.total_tokens + fill_llm.total_tokens}")
+```
+**Note:** Token statistics are cumulative across all API calls made by the LLM instance. The counts only increase and are thread-safe when using concurrent translation.
 ## Related Projects
 ### PDF Craft

{epub_translator-0.1.7 → epub_translator-0.1.8}/epub_translator/llm/core.py RENAMED Viewed

@@ -13,6 +13,7 @@ from ..template import create_env
 from .context import LLMContext
 from .executor import LLMExecutor
 from .increasable import Increasable
+from .statistics import Statistics
 from .types import Message
 # Global state for logger filename generation
@@ -44,7 +45,7 @@ class LLM:
         self._temperature: Increasable = Increasable(temperature)
         self._cache_path: Path | None = self._ensure_dir_path(cache_path)
         self._logger_save_path: Path | None = self._ensure_dir_path(log_dir_path)
+        self._statistics = Statistics()
         self._executor = LLMExecutor(
             url=url,
             model=model,
@@ -53,12 +54,29 @@ class LLM:
             retry_times=retry_times,
             retry_interval_seconds=retry_interval_seconds,
             create_logger=self._create_logger,
+            statistics=self._statistics,
         )
     @property
     def encoding(self) -> Encoding:
         return self._encoding
+    @property
+    def total_tokens(self) -> int:
+        return self._statistics.total_tokens
+    @property
+    def input_tokens(self) -> int:
+        return self._statistics.input_tokens
+    @property
+    def input_cache_tokens(self) -> int:
+        return self._statistics.input_cache_tokens
+    @property
+    def output_tokens(self) -> int:
+        return self._statistics.output_tokens
     def context(self, cache_seed_content: str | None = None) -> LLMContext:
         return LLMContext(
             executor=self._executor,

{epub_translator-0.1.7 → epub_translator-0.1.8}/epub_translator/llm/executor.py RENAMED Viewed

@@ -7,6 +7,7 @@ from openai import OpenAI
 from openai.types.chat import ChatCompletionMessageParam
 from .error import is_retry_error
+from .statistics import Statistics
 from .types import Message, MessageRole
@@ -20,12 +21,14 @@ class LLMExecutor:
         retry_times: int,
         retry_interval_seconds: float,
         create_logger: Callable[[], Logger | None],
+        statistics: Statistics,
     ) -> None:
         self._model_name: str = model
         self._timeout: float | None = timeout
         self._retry_times: int = retry_times
         self._retry_interval_seconds: float = retry_interval_seconds
         self._create_logger: Callable[[], Logger | None] = create_logger
+        self._statistics = statistics
         self._client = OpenAI(
             api_key=api_key,
             base_url=url,
@@ -156,6 +159,7 @@ class LLMExecutor:
             model=self._model_name,
             messages=messages,
             stream=True,
+            stream_options={"include_usage": True},
             top_p=top_p,
             temperature=temperature,
             max_tokens=max_tokens,
@@ -164,4 +168,5 @@ class LLMExecutor:
         for chunk in stream:
             if chunk.choices and chunk.choices[0].delta.content:
                 buffer.write(chunk.choices[0].delta.content)
+            self._statistics.submit_usage(chunk.usage)
         return buffer.getvalue()

epub_translator-0.1.8/epub_translator/llm/statistics.py ADDED Viewed

@@ -0,0 +1,25 @@
+from threading import Lock
+from openai.types import CompletionUsage
+class Statistics:
+    def __init__(self) -> None:
+        self._lock = Lock()
+        self.total_tokens = 0
+        self.input_tokens = 0
+        self.input_cache_tokens = 0
+        self.output_tokens = 0
+    def submit_usage(self, usage: CompletionUsage | None) -> None:
+        if usage is None:
+            return
+        with self._lock:
+            if usage.total_tokens:
+                self.total_tokens += usage.total_tokens
+            if usage.prompt_tokens:
+                self.input_tokens += usage.prompt_tokens
+            if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens:
+                self.input_cache_tokens += usage.prompt_tokens_details.cached_tokens
+            if usage.completion_tokens:
+                self.output_tokens += usage.completion_tokens

{epub_translator-0.1.7 → epub_translator-0.1.8}/epub_translator/segment/text_segment.py RENAMED Viewed

@@ -4,7 +4,12 @@ from enum import Enum, auto
 from typing import Self
 from xml.etree.ElementTree import Element
-from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_element, normalize_text_in_element
+from ..xml import (
+    expand_left_element_texts,
+    expand_right_element_texts,
+    is_inline_element,
+    normalize_text_in_element,
+)
 class TextPosition(Enum):

{epub_translator-0.1.7 → epub_translator-0.1.8}/epub_translator/translation/xml_interrupter.py RENAMED Viewed

@@ -6,8 +6,8 @@ from bs4 import BeautifulSoup
 from mathml2latex.mathml import process_mathml
 from ..segment import TextSegment, combine_text_segments, find_block_depth
-from ..utils import ensure_list
-from ..xml import clone_element
+from ..utils import ensure_list, normalize_whitespace
+from ..xml import DISPLAY_ATTRIBUTE, clone_element, is_inline_element
 _ID_KEY = "__XML_INTERRUPTER_ID"
 _MATH_TAG = "math"
@@ -87,9 +87,9 @@ class XMLInterrupter:
                     _ID_KEY: cast(str, interrupted_element.get(_ID_KEY)),
                 },
             )
-            interrupted_display = interrupted_element.get("display", None)
+            interrupted_display = interrupted_element.get(DISPLAY_ATTRIBUTE, None)
             if interrupted_display is not None:
-                placeholder_element.set("display", interrupted_display)
+                placeholder_element.set(DISPLAY_ATTRIBUTE, interrupted_display)
             raw_parent_stack = text_segment.parent_stack[:interrupted_index]
             parent_stack = raw_parent_stack + [placeholder_element]
@@ -159,10 +159,13 @@ class XMLInterrupter:
         if latex is None:
             latex = "".join(t.text for t in text_segments)
-        elif math_element.get("display", None) == "inline":
-            latex = f"${latex}$"
+            latex = normalize_whitespace(latex).strip()
         else:
-            latex = f"$${latex}$$"
+            latex = normalize_whitespace(latex).strip()
+            if is_inline_element(math_element):
+                latex = f"${latex}$"
+            else:
+                latex = f"$${latex}$$"
         return f" {latex} "

epub_translator-0.1.8/epub_translator/xml/const.py ADDED Viewed

@@ -0,0 +1,2 @@
+ID_KEY: str = "id"
+DISPLAY_ATTRIBUTE: str = "display"

{epub_translator-0.1.7 → epub_translator-0.1.8}/epub_translator/xml/inline.py RENAMED Viewed

@@ -1,5 +1,7 @@
 from xml.etree.ElementTree import Element
+from .const import DISPLAY_ATTRIBUTE
 # HTML inline-level elements
 # Reference: https://developer.mozilla.org/en-US/docs/Web/HTML/Inline_elements
 # Reference: https://developer.mozilla.org/en-US/docs/Glossary/Inline-level_content
@@ -105,9 +107,14 @@ _HTML_INLINE_TAGS = frozenset(
 def is_inline_element(element: Element) -> bool:
-    if element.tag.lower() in _HTML_INLINE_TAGS:
+    tag = element.tag.lower()
+    if tag in _HTML_INLINE_TAGS:
         return True
-    display = element.get("display", None)
-    if display is not None and display.lower() == "inline":
+    display = element.get(DISPLAY_ATTRIBUTE, None)
+    if display is not None:
+        display = display.lower()
+        if display == "inline":
+            return True
+    if tag == "math" and display != "block":
         return True
     return False

{epub_translator-0.1.7 → epub_translator-0.1.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "epub-translator"
-version = "0.1.7"
+version = "0.1.8"
 description = "Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text."
 keywords = ["epub", "llm", "translation", "translator"]
 authors = [