epub-translator 0.1.6__tar.gz → 0.1.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. {epub_translator-0.1.6 → epub_translator-0.1.8}/PKG-INFO +108 -8
  2. {epub_translator-0.1.6 → epub_translator-0.1.8}/README.md +106 -7
  3. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/data/translate.jinja +3 -0
  4. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/core.py +19 -1
  5. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/executor.py +5 -0
  6. epub_translator-0.1.8/epub_translator/llm/statistics.py +25 -0
  7. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/__init__.py +1 -0
  8. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/text_segment.py +10 -6
  9. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/utils.py +0 -16
  10. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/translation/xml_interrupter.py +54 -27
  11. epub_translator-0.1.8/epub_translator/xml/const.py +2 -0
  12. epub_translator-0.1.8/epub_translator/xml/inline.py +120 -0
  13. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/submitter.py +5 -5
  14. {epub_translator-0.1.6 → epub_translator-0.1.8}/pyproject.toml +2 -1
  15. epub_translator-0.1.6/epub_translator/data/mmltex/README.md +0 -67
  16. epub_translator-0.1.6/epub_translator/data/mmltex/cmarkup.xsl +0 -1106
  17. epub_translator-0.1.6/epub_translator/data/mmltex/entities.xsl +0 -459
  18. epub_translator-0.1.6/epub_translator/data/mmltex/glayout.xsl +0 -222
  19. epub_translator-0.1.6/epub_translator/data/mmltex/mmltex.xsl +0 -36
  20. epub_translator-0.1.6/epub_translator/data/mmltex/scripts.xsl +0 -375
  21. epub_translator-0.1.6/epub_translator/data/mmltex/tables.xsl +0 -130
  22. epub_translator-0.1.6/epub_translator/data/mmltex/tokens.xsl +0 -328
  23. epub_translator-0.1.6/epub_translator/xml/const.py +0 -1
  24. epub_translator-0.1.6/epub_translator/xml/inline.py +0 -67
  25. {epub_translator-0.1.6 → epub_translator-0.1.8}/LICENSE +0 -0
  26. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/__init__.py +0 -0
  27. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/data/fill.jinja +0 -0
  28. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/epub/__init__.py +0 -0
  29. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/epub/common.py +0 -0
  30. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/epub/math.py +0 -0
  31. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/epub/metadata.py +0 -0
  32. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/epub/spines.py +0 -0
  33. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/epub/toc.py +0 -0
  34. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/epub/zip.py +0 -0
  35. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/__init__.py +0 -0
  36. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/context.py +0 -0
  37. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/error.py +0 -0
  38. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/increasable.py +0 -0
  39. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/types.py +0 -0
  40. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/block_segment.py +0 -0
  41. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/common.py +0 -0
  42. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/inline_segment.py +0 -0
  43. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/serial/__init__.py +0 -0
  44. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/serial/chunk.py +0 -0
  45. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/serial/segment.py +0 -0
  46. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/serial/splitter.py +0 -0
  47. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/template.py +0 -0
  48. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/translation/__init__.py +0 -0
  49. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/translation/epub_transcode.py +0 -0
  50. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/translation/language.py +0 -0
  51. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/translation/punctuation.py +0 -0
  52. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/translation/translator.py +0 -0
  53. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/utils.py +0 -0
  54. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/__init__.py +0 -0
  55. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/deduplication.py +0 -0
  56. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/friendly/__init__.py +0 -0
  57. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/friendly/decoder.py +0 -0
  58. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/friendly/encoder.py +0 -0
  59. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/friendly/parser.py +0 -0
  60. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/friendly/tag.py +0 -0
  61. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/friendly/transform.py +0 -0
  62. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/self_closing.py +0 -0
  63. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/utils.py +0 -0
  64. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/xml.py +0 -0
  65. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml/xml_like.py +0 -0
  66. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/__init__.py +0 -0
  67. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/callbacks.py +0 -0
  68. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/common.py +0 -0
  69. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/concurrency.py +0 -0
  70. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/hill_climbing.py +0 -0
  71. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/score.py +0 -0
  72. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/stream_mapper.py +0 -0
  73. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/translator.py +0 -0
  74. {epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/xml_translator/validation.py +0 -0

{epub_translator-0.1.6 → epub_translator-0.1.8}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: epub-translator
- Version: 0.1.6
+ Version: 0.1.8
  Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
  License: MIT
  Keywords: epub,llm,translation,translator
@@ -24,6 +24,7 @@ Classifier: Topic :: Software Development :: Localization
  Classifier: Topic :: Text Processing :: Markup
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Dist: jinja2 (>=3.1.6,<4.0.0)
+ Requires-Dist: mathml2latex (>=0.2.12,<0.3.0)
  Requires-Dist: openai (>=2.14.0,<3.0.0)
  Requires-Dist: resource-segmentation (>=0.0.7,<0.1.0)
  Requires-Dist: tiktoken (>=0.12.0,<1.0.0)
@@ -59,6 +60,13 @@ Translate EPUB books using Large Language Models while preserving the original t
  - **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
  - **Caching**: Built-in caching for progress recovery when translation fails

+ ## Use Cases
+
+ - **Language Learning**: Read books in their original language with side-by-side translations
+ - **Academic Research**: Access foreign literature with bilingual references
+ - **Content Localization**: Prepare books for international audiences
+ - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
+
  ## Installation

  ```bash
@@ -357,13 +365,6 @@ llm = LLM(
  )
  ```

- ## Use Cases
-
- - **Language Learning**: Read books in their original language with side-by-side translations
- - **Academic Research**: Access foreign literature with bilingual references
- - **Content Localization**: Prepare books for international audiences
- - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
-
  ## Advanced Features

  ### Custom Translation Prompts
@@ -421,6 +422,105 @@ translate(

  When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.

+ ### Token Usage Monitoring
+
+ Track token consumption during translation to monitor API costs and usage:
+
+ ```python
+ from epub_translator import LLM, translate, language, SubmitKind
+
+ llm = LLM(
+   key="your-api-key",
+   url="https://api.openai.com/v1",
+   model="gpt-4",
+   token_encoding="o200k_base",
+ )
+
+ translate(
+   source_path="source.epub",
+   target_path="translated.epub",
+   target_language=language.ENGLISH,
+   submit=SubmitKind.APPEND_BLOCK,
+   llm=llm,
+ )
+
+ # Access token statistics after translation
+ print(f"Total tokens: {llm.total_tokens}")
+ print(f"Input tokens: {llm.input_tokens}")
+ print(f"Input cache tokens: {llm.input_cache_tokens}")
+ print(f"Output tokens: {llm.output_tokens}")
+ ```
+
+ **Available Statistics:**
+
+ - `total_tokens` - Total number of tokens used (input + output)
+ - `input_tokens` - Number of prompt/input tokens
+ - `input_cache_tokens` - Number of cached input tokens (when using prompt caching)
+ - `output_tokens` - Number of generated/completion tokens
+
+ **Real-time Monitoring:**
+
+ You can also monitor token usage in real-time during translation:
+
+ ```python
+ from tqdm import tqdm
+ import time
+
+ with tqdm(total=100, desc="Translating", unit="%") as pbar:
+   last_progress = 0.0
+   start_time = time.time()
+
+   def on_progress(progress: float):
+     nonlocal last_progress
+     increment = (progress - last_progress) * 100
+     pbar.update(increment)
+     last_progress = progress
+
+     # Update token stats in progress bar
+     pbar.set_postfix({
+       'tokens': llm.total_tokens,
+       'cost_est': f'${llm.total_tokens * 0.00001:.4f}'  # Estimate based on your pricing
+     })
+
+   translate(
+     source_path="source.epub",
+     target_path="translated.epub",
+     target_language=language.ENGLISH,
+     submit=SubmitKind.APPEND_BLOCK,
+     llm=llm,
+     on_progress=on_progress,
+   )
+
+   elapsed = time.time() - start_time
+   print(f"\nTranslation completed in {elapsed:.1f}s")
+   print(f"Total tokens used: {llm.total_tokens:,}")
+   print(f"Average tokens/second: {llm.total_tokens/elapsed:.1f}")
+ ```
+
+ **Dual-LLM Token Tracking:**
+
+ When using separate LLMs for translation and filling, each LLM tracks its own statistics:
+
+ ```python
+ translation_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+ fill_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+
+ translate(
+   source_path="source.epub",
+   target_path="translated.epub",
+   target_language=language.ENGLISH,
+   submit=SubmitKind.APPEND_BLOCK,
+   translation_llm=translation_llm,
+   fill_llm=fill_llm,
+ )
+
+ print(f"Translation tokens: {translation_llm.total_tokens}")
+ print(f"Fill tokens: {fill_llm.total_tokens}")
+ print(f"Combined total: {translation_llm.total_tokens + fill_llm.total_tokens}")
+ ```
+
+ **Note:** Token statistics are cumulative across all API calls made by the LLM instance. The counts only increase and are thread-safe when using concurrent translation.
+
  ## Related Projects

  ### PDF Craft

{epub_translator-0.1.6 → epub_translator-0.1.8}/README.md

@@ -26,6 +26,13 @@ Translate EPUB books using Large Language Models while preserving the original t
  - **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
  - **Caching**: Built-in caching for progress recovery when translation fails

+ ## Use Cases
+
+ - **Language Learning**: Read books in their original language with side-by-side translations
+ - **Academic Research**: Access foreign literature with bilingual references
+ - **Content Localization**: Prepare books for international audiences
+ - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
+
  ## Installation

  ```bash
@@ -324,13 +331,6 @@ llm = LLM(
  )
  ```

- ## Use Cases
-
- - **Language Learning**: Read books in their original language with side-by-side translations
- - **Academic Research**: Access foreign literature with bilingual references
- - **Content Localization**: Prepare books for international audiences
- - **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
-
  ## Advanced Features

  ### Custom Translation Prompts
@@ -388,6 +388,105 @@ translate(

  When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.

+ ### Token Usage Monitoring
+
+ Track token consumption during translation to monitor API costs and usage:
+
+ ```python
+ from epub_translator import LLM, translate, language, SubmitKind
+
+ llm = LLM(
+   key="your-api-key",
+   url="https://api.openai.com/v1",
+   model="gpt-4",
+   token_encoding="o200k_base",
+ )
+
+ translate(
+   source_path="source.epub",
+   target_path="translated.epub",
+   target_language=language.ENGLISH,
+   submit=SubmitKind.APPEND_BLOCK,
+   llm=llm,
+ )
+
+ # Access token statistics after translation
+ print(f"Total tokens: {llm.total_tokens}")
+ print(f"Input tokens: {llm.input_tokens}")
+ print(f"Input cache tokens: {llm.input_cache_tokens}")
+ print(f"Output tokens: {llm.output_tokens}")
+ ```
+
+ **Available Statistics:**
+
+ - `total_tokens` - Total number of tokens used (input + output)
+ - `input_tokens` - Number of prompt/input tokens
+ - `input_cache_tokens` - Number of cached input tokens (when using prompt caching)
+ - `output_tokens` - Number of generated/completion tokens
+
+ **Real-time Monitoring:**
+
+ You can also monitor token usage in real-time during translation:
+
+ ```python
+ from tqdm import tqdm
+ import time
+
+ with tqdm(total=100, desc="Translating", unit="%") as pbar:
+   last_progress = 0.0
+   start_time = time.time()
+
+   def on_progress(progress: float):
+     nonlocal last_progress
+     increment = (progress - last_progress) * 100
+     pbar.update(increment)
+     last_progress = progress
+
+     # Update token stats in progress bar
+     pbar.set_postfix({
+       'tokens': llm.total_tokens,
+       'cost_est': f'${llm.total_tokens * 0.00001:.4f}'  # Estimate based on your pricing
+     })
+
+   translate(
+     source_path="source.epub",
+     target_path="translated.epub",
+     target_language=language.ENGLISH,
+     submit=SubmitKind.APPEND_BLOCK,
+     llm=llm,
+     on_progress=on_progress,
+   )
+
+   elapsed = time.time() - start_time
+   print(f"\nTranslation completed in {elapsed:.1f}s")
+   print(f"Total tokens used: {llm.total_tokens:,}")
+   print(f"Average tokens/second: {llm.total_tokens/elapsed:.1f}")
+ ```
+
+ **Dual-LLM Token Tracking:**
+
+ When using separate LLMs for translation and filling, each LLM tracks its own statistics:
+
+ ```python
+ translation_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+ fill_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+
+ translate(
+   source_path="source.epub",
+   target_path="translated.epub",
+   target_language=language.ENGLISH,
+   submit=SubmitKind.APPEND_BLOCK,
+   translation_llm=translation_llm,
+   fill_llm=fill_llm,
+ )
+
+ print(f"Translation tokens: {translation_llm.total_tokens}")
+ print(f"Fill tokens: {fill_llm.total_tokens}")
+ print(f"Combined total: {translation_llm.total_tokens + fill_llm.total_tokens}")
+ ```
+
+ **Note:** Token statistics are cumulative across all API calls made by the LLM instance. The counts only increase and are thread-safe when using concurrent translation.
+
  ## Related Projects

  ### PDF Craft

{epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/data/translate.jinja

@@ -13,6 +13,9 @@ Translation rules:
  {% if user_prompt -%}
  User may provide additional requirements in <rules> tags before the source text. Follow them, but prioritize the rules above if conflicts arise.

+ <rules>
+ {{ user_prompt }}
+ </rules>
  {% endif -%}

  Output only the translated text, nothing else.
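
The template change above is a bug fix: the surrounding text already told the model that extra requirements may arrive in `<rules>` tags, but the template never actually interpolated `user_prompt`, so custom prompts were silently dropped before 0.1.8. A minimal standalone sketch (not the package's actual rendering path) of what the fixed fragment now produces:

```python
# Hypothetical, self-contained illustration of the translate.jinja fix:
# render just the patched fragment with jinja2 and a sample user prompt.
from jinja2 import Template

fragment = Template(
  "{% if user_prompt -%}\n"
  "<rules>\n"
  "{{ user_prompt }}\n"
  "</rules>\n"
  "{% endif -%}"
)

print(fragment.render(user_prompt="Keep personal names untranslated."))
# <rules>
# Keep personal names untranslated.
# </rules>
```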

{epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/core.py

@@ -13,6 +13,7 @@ from ..template import create_env
  from .context import LLMContext
  from .executor import LLMExecutor
  from .increasable import Increasable
+ from .statistics import Statistics
  from .types import Message

  # Global state for logger filename generation
@@ -44,7 +45,7 @@ class LLM:
      self._temperature: Increasable = Increasable(temperature)
      self._cache_path: Path | None = self._ensure_dir_path(cache_path)
      self._logger_save_path: Path | None = self._ensure_dir_path(log_dir_path)
-
+     self._statistics = Statistics()
      self._executor = LLMExecutor(
        url=url,
        model=model,
@@ -53,12 +54,29 @@
        retry_times=retry_times,
        retry_interval_seconds=retry_interval_seconds,
        create_logger=self._create_logger,
+       statistics=self._statistics,
      )

    @property
    def encoding(self) -> Encoding:
      return self._encoding

+   @property
+   def total_tokens(self) -> int:
+     return self._statistics.total_tokens
+
+   @property
+   def input_tokens(self) -> int:
+     return self._statistics.input_tokens
+
+   @property
+   def input_cache_tokens(self) -> int:
+     return self._statistics.input_cache_tokens
+
+   @property
+   def output_tokens(self) -> int:
+     return self._statistics.output_tokens
+
    def context(self, cache_seed_content: str | None = None) -> LLMContext:
      return LLMContext(
        executor=self._executor,

{epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/llm/executor.py

@@ -7,6 +7,7 @@ from openai import OpenAI
  from openai.types.chat import ChatCompletionMessageParam

  from .error import is_retry_error
+ from .statistics import Statistics
  from .types import Message, MessageRole


@@ -20,12 +21,14 @@ class LLMExecutor:
      retry_times: int,
      retry_interval_seconds: float,
      create_logger: Callable[[], Logger | None],
+     statistics: Statistics,
    ) -> None:
      self._model_name: str = model
      self._timeout: float | None = timeout
      self._retry_times: int = retry_times
      self._retry_interval_seconds: float = retry_interval_seconds
      self._create_logger: Callable[[], Logger | None] = create_logger
+     self._statistics = statistics
      self._client = OpenAI(
        api_key=api_key,
        base_url=url,
@@ -156,6 +159,7 @@
        model=self._model_name,
        messages=messages,
        stream=True,
+       stream_options={"include_usage": True},
        top_p=top_p,
        temperature=temperature,
        max_tokens=max_tokens,
@@ -164,4 +168,5 @@
      for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content:
          buffer.write(chunk.choices[0].delta.content)
+       self._statistics.submit_usage(chunk.usage)
      return buffer.getvalue()
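
For context on the two executor changes above: with the OpenAI streaming API, `stream_options={"include_usage": True}` requests one extra final chunk whose `choices` list is empty and whose `usage` field carries the token counts for the whole request; on every earlier chunk `chunk.usage` is `None`, which is why `submit_usage` must tolerate `None`. A hedged sketch of that interplay, assuming a reachable OpenAI-compatible endpoint and a placeholder model name:

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment
stream = client.chat.completions.create(
  model="gpt-4o-mini",  # placeholder model
  messages=[{"role": "user", "content": "Say hi."}],
  stream=True,
  stream_options={"include_usage": True},
)
for chunk in stream:
  if chunk.choices and chunk.choices[0].delta.content:
    print(chunk.choices[0].delta.content, end="")
  if chunk.usage is not None:
    # Only the final chunk carries usage; its choices list is empty.
    print(f"\nprompt={chunk.usage.prompt_tokens}, completion={chunk.usage.completion_tokens}")
```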

epub_translator-0.1.8/epub_translator/llm/statistics.py

@@ -0,0 +1,25 @@
+ from threading import Lock
+
+ from openai.types import CompletionUsage
+
+
+ class Statistics:
+   def __init__(self) -> None:
+     self._lock = Lock()
+     self.total_tokens = 0
+     self.input_tokens = 0
+     self.input_cache_tokens = 0
+     self.output_tokens = 0
+
+   def submit_usage(self, usage: CompletionUsage | None) -> None:
+     if usage is None:
+       return
+     with self._lock:
+       if usage.total_tokens:
+         self.total_tokens += usage.total_tokens
+       if usage.prompt_tokens:
+         self.input_tokens += usage.prompt_tokens
+       if usage.prompt_tokens_details and usage.prompt_tokens_details.cached_tokens:
+         self.input_cache_tokens += usage.prompt_tokens_details.cached_tokens
+       if usage.completion_tokens:
+         self.output_tokens += usage.completion_tokens
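
Because `submit_usage` is called from streaming workers when `concurrency > 1`, the counters are guarded by a lock. A small illustration of the thread-safe accumulation (the `CompletionUsage` values here are hand-built and purely hypothetical):

```python
# Illustrative only: hammer Statistics from several threads and check the totals.
from concurrent.futures import ThreadPoolExecutor

from openai.types import CompletionUsage

from epub_translator.llm.statistics import Statistics  # module added in 0.1.8

stats = Statistics()
usage = CompletionUsage(prompt_tokens=100, completion_tokens=20, total_tokens=120)

with ThreadPoolExecutor(max_workers=8) as pool:
  for _ in range(1000):
    pool.submit(stats.submit_usage, usage)
# the with-block waits for all submitted tasks to finish

assert stats.input_tokens == 100_000
assert stats.output_tokens == 20_000
assert stats.total_tokens == 120_000
```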

{epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/__init__.py

@@ -21,6 +21,7 @@ from .text_segment import (
    TextPosition,
    TextSegment,
    combine_text_segments,
+   find_block_depth,
    incision_between,
    search_text_segments,
  )

{epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/text_segment.py

@@ -4,7 +4,12 @@ from enum import Enum, auto
  from typing import Self
  from xml.etree.ElementTree import Element

- from ..xml import expand_left_element_texts, expand_right_element_texts, is_inline_tag, normalize_text_in_element
+ from ..xml import (
+   expand_left_element_texts,
+   expand_right_element_texts,
+   is_inline_element,
+   normalize_text_in_element,
+ )


  class TextPosition(Enum):
@@ -100,7 +105,7 @@ def search_text_segments(root: Element) -> Generator[TextSegment, None, None]:
  def _search_text_segments(stack: list[Element], element: Element) -> Generator[TextSegment, None, None]:
    text = normalize_text_in_element(element.text)
    next_stack = stack + [element]
-   next_block_depth = _find_block_depth(next_stack)
+   next_block_depth = find_block_depth(next_stack)

    if text is not None:
      yield TextSegment(
@@ -125,12 +130,11 @@
      )


- def _find_block_depth(parent_stack: list[Element]) -> int:
+ def find_block_depth(parent_stack: list[Element]) -> int:
    index: int = 0
-   for i in range(len(parent_stack) - 1, -1, -1):
-     if not is_inline_tag(parent_stack[i].tag):
+   for i in range(len(parent_stack)):
+     if not is_inline_element(parent_stack[i]):
        index = i
-       break
    return index + 1  # depth is a count not index

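Both the old and the new loop return the index of the deepest non-inline ancestor plus one; the visible change is that the scan now runs forward without an early `break`, and the inline test moved from the tag name (`is_inline_tag`) to the whole element (`is_inline_element`), so it can consider attributes and not just tags. A toy illustration of the contract, with a stand-in predicate:

```python
# Stand-in is_inline_element; the package's real predicate is richer.
from xml.etree.ElementTree import Element

INLINE_TAGS = {"span", "em", "strong"}

def is_inline_element(element: Element) -> bool:
  return element.tag in INLINE_TAGS

def find_block_depth(parent_stack: list[Element]) -> int:
  index = 0
  for i in range(len(parent_stack)):
    if not is_inline_element(parent_stack[i]):
      index = i  # keep the deepest non-inline ancestor
  return index + 1  # depth is a count, not an index

stack = [Element("div"), Element("p"), Element("span"), Element("em")]
print(find_block_depth(stack))  # 2 — the <p> at index 1 is the deepest block
```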

{epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/segment/utils.py

@@ -8,22 +8,6 @@ def element_fingerprint(element: Element) -> str:
    return f"<{element.tag} {' '.join(attrs)}/>"


- def unwrap_parents(element: Element) -> tuple[Element, list[Element]]:
-   parents: list[Element] = []
-   while True:
-     if len(element) != 1:
-       break
-     child = element[0]
-     if not element.text:
-       break
-     if not child.tail:
-       break
-     parents.append(element)
-     element = child
-   element.tail = None
-   return element, parents
-
-
  def id_in_element(element: Element) -> int | None:
    id_str = element.get(ID_KEY, None)
    if id_str is None:

{epub_translator-0.1.6 → epub_translator-0.1.8}/epub_translator/translation/xml_interrupter.py

@@ -1,9 +1,13 @@
  from collections.abc import Generator, Iterable
  from typing import cast
- from xml.etree.ElementTree import Element
+ from xml.etree.ElementTree import Element, tostring

- from ..segment import TextSegment
+ from bs4 import BeautifulSoup
+ from mathml2latex.mathml import process_mathml
+
+ from ..segment import TextSegment, combine_text_segments, find_block_depth
  from ..utils import ensure_list, normalize_whitespace
+ from ..xml import DISPLAY_ATTRIBUTE, clone_element, is_inline_element

  _ID_KEY = "__XML_INTERRUPTER_ID"
  _MATH_TAG = "math"
@@ -37,8 +41,10 @@ class XMLInterrupter:
    def interrupt_block_element(self, element: Element) -> Element:
      interrupted_element = self._placeholder2interrupted.pop(id(element), None)
      if interrupted_element is None:
+       element.attrib.pop(_ID_KEY, None)
        return element
      else:
+       interrupted_element.attrib.pop(_ID_KEY, None)
        return interrupted_element

    def _expand_source_text_segment(self, text_segment: TextSegment):
@@ -81,14 +87,18 @@ class XMLInterrupter:
          _ID_KEY: cast(str, interrupted_element.get(_ID_KEY)),
        },
      )
+     interrupted_display = interrupted_element.get(DISPLAY_ATTRIBUTE, None)
+     if interrupted_display is not None:
+       placeholder_element.set(DISPLAY_ATTRIBUTE, interrupted_display)
+
      raw_parent_stack = text_segment.parent_stack[:interrupted_index]
      parent_stack = raw_parent_stack + [placeholder_element]
      merged_text_segment = TextSegment(
-       text="".join(t.text for t in text_segments),
+       text=self._render_latex(text_segments),
        parent_stack=parent_stack,
        left_common_depth=text_segments[0].left_common_depth,
        right_common_depth=text_segments[-1].right_common_depth,
-       block_depth=len(parent_stack),
+       block_depth=find_block_depth(parent_stack),
        position=text_segments[0].position,
      )
      self._placeholder2interrupted[id(placeholder_element)] = interrupted_element
@@ -116,8 +126,8 @@
      # The original stack has been fully unwound; only the stack relative to the interrupted element remains, matching the format expected for translated segments
      text_segment.left_common_depth = max(0, text_segment.left_common_depth - interrupted_index)
      text_segment.right_common_depth = max(0, text_segment.right_common_depth - interrupted_index)
-     text_segment.block_depth = 1
      text_segment.parent_stack = text_segment.parent_stack[interrupted_index:]
+     text_segment.block_depth = find_block_depth(text_segment.parent_stack)

      return merged_text_segment
@@ -129,37 +139,54 @@
          break
      return interrupted_index

+   def _render_latex(self, text_segments: list[TextSegment]) -> str:
+     math_element, _ = next(combine_text_segments(text_segments))
+     while math_element.tag != _MATH_TAG:
+       if len(math_element) == 0:
+         return ""
+       math_element = math_element[0]
+
+     math_element = clone_element(math_element)
+     math_element.attrib.pop(_ID_KEY, None)
+     math_element.tail = None
+     latex: str | None = None
+     try:
+       mathml_str = tostring(math_element, encoding="unicode")
+       soup = BeautifulSoup(mathml_str, "html.parser")
+       latex = process_mathml(soup)
+     except Exception:
+       pass
+
+     if latex is None:
+       latex = "".join(t.text for t in text_segments)
+       latex = normalize_whitespace(latex).strip()
+     else:
+       latex = normalize_whitespace(latex).strip()
+       if is_inline_element(math_element):
+         latex = f"${latex}$"
+       else:
+         latex = f"$${latex}$$"
+
+     return f" {latex} "
+
    def _expand_translated_text_segment(self, text_segment: TextSegment):
-     interrupted_id = text_segment.block_parent.attrib.pop(_ID_KEY, None)
+     parent_element = text_segment.parent_stack[-1]
+     interrupted_id = parent_element.attrib.pop(_ID_KEY, None)
      if interrupted_id is None:
        yield text_segment
        return

-     raw_text_segments = self._raw_text_segments.pop(interrupted_id, None)
-     if not raw_text_segments:
+     if parent_element is text_segment.block_parent:
+       # Block-level math, need to be hidden
        return

-     raw_block = raw_text_segments[0].parent_stack[0]
-     if not self._is_inline_math(raw_block):
+     raw_text_segments = self._raw_text_segments.pop(interrupted_id, None)
+     if not raw_text_segments:
+       yield text_segment
        return

      for raw_text_segment in raw_text_segments:
+       text_basic_parent_stack = text_segment.parent_stack[:-1]
        raw_text_segment.block_parent.attrib.pop(_ID_KEY, None)
+       raw_text_segment.parent_stack = text_basic_parent_stack + raw_text_segment.parent_stack
        yield raw_text_segment
-
-   def _has_no_math_texts(self, element: Element):
-     if element.tag == _MATH_TAG:
-       return True
-     if element.text and normalize_whitespace(element.text).strip():
-       return False
-     for child_element in element:
-       if not self._has_no_math_texts(child_element):
-         return False
-       if child_element.tail and normalize_whitespace(child_element.tail).strip():
-         return False
-     return True
-
-   def _is_inline_math(self, element: Element) -> bool:
-     if element.tag != _MATH_TAG:
-       return False
-     return element.get("display", "").lower() != "block"
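
The new `_render_latex` converts an interrupted `<math>` island to LaTeX via the newly added mathml2latex dependency, wraps the result in `$…$` or `$$…$$` depending on whether the element is inline, and falls back to the segment's plain text when conversion fails. A sketch isolating just the conversion path, mirroring the calls shown in the diff (the sample MathML string is made up):

```python
from bs4 import BeautifulSoup
from mathml2latex.mathml import process_mathml

mathml = (
  '<math xmlns="http://www.w3.org/1998/Math/MathML">'
  "<mi>x</mi><mo>+</mo><mn>1</mn></math>"
)

latex = None
try:
  latex = process_mathml(BeautifulSoup(mathml, "html.parser"))
except Exception:
  pass  # the real code falls back to the segment's plain text

if latex is not None:
  print(f"${latex.strip()}$")  # inline form; display="block" math gets $$ … $$
```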

epub_translator-0.1.8/epub_translator/xml/const.py

@@ -0,0 +1,2 @@
+ ID_KEY: str = "id"
+ DISPLAY_ATTRIBUTE: str = "display"