epub-translator 0.1.7__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {epub_translator-0.1.7 → epub_translator-0.1.9}/PKG-INFO +106 -16
- {epub_translator-0.1.7 → epub_translator-0.1.9}/README.md +105 -15
- epub_translator-0.1.9/epub_translator/epub/__init__.py +4 -0
- epub_translator-0.1.9/epub_translator/epub/metadata.py +85 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/epub/toc.py +76 -94
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/llm/core.py +19 -1
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/llm/executor.py +5 -0
- epub_translator-0.1.9/epub_translator/llm/statistics.py +25 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/segment/text_segment.py +6 -1
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/translation/translator.py +16 -6
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/translation/xml_interrupter.py +10 -7
- epub_translator-0.1.9/epub_translator/xml/const.py +2 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/inline.py +10 -3
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/self_closing.py +5 -4
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/xml_like.py +23 -1
- {epub_translator-0.1.7 → epub_translator-0.1.9}/pyproject.toml +1 -1
- epub_translator-0.1.7/epub_translator/epub/__init__.py +0 -4
- epub_translator-0.1.7/epub_translator/epub/metadata.py +0 -122
- epub_translator-0.1.7/epub_translator/xml/const.py +0 -1
- {epub_translator-0.1.7 → epub_translator-0.1.9}/LICENSE +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/__init__.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/data/fill.jinja +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/data/translate.jinja +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/epub/common.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/epub/math.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/epub/spines.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/epub/zip.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/llm/__init__.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/llm/context.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/llm/error.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/llm/increasable.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/llm/types.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/segment/__init__.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/segment/block_segment.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/segment/common.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/segment/inline_segment.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/segment/utils.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/serial/__init__.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/serial/chunk.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/serial/segment.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/serial/splitter.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/template.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/translation/__init__.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/translation/epub_transcode.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/translation/language.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/translation/punctuation.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/utils.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/__init__.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/deduplication.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/friendly/__init__.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/friendly/decoder.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/friendly/encoder.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/friendly/parser.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/friendly/tag.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/friendly/transform.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/utils.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml/xml.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/__init__.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/callbacks.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/common.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/concurrency.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/hill_climbing.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/score.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/stream_mapper.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/submitter.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/translator.py +0 -0
- {epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/xml_translator/validation.py +0 -0
{epub_translator-0.1.7 → epub_translator-0.1.9}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: epub-translator
-Version: 0.1.7
+Version: 0.1.9
 Summary: Translate the epub book using LLM. The translated book will retain the original text and list the translated text side by side with the original text.
 License: MIT
 Keywords: epub,llm,translation,translator
@@ -46,26 +46,17 @@ Description-Content-Type: text/markdown
 </div>


-
+Want to read a book in a foreign language without losing the original context? EPUB Translator transforms any EPUB into a bilingual edition with AI-powered translations displayed side-by-side with the original text.

-
+Whether you're learning a new language, conducting academic research, or simply enjoying foreign literature, you get both versions in one book - preserving all formatting, images, and structure.

-
+

-
-- **LLM-Powered**: Leverages large language models for high-quality, context-aware translations
-- **Format Preservation**: Maintains EPUB structure, styles, images, and formatting
-- **Complete Translation**: Translates chapter content, table of contents, and metadata
-- **Progress Tracking**: Monitor translation progress with built-in callbacks
-- **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
-- **Caching**: Built-in caching for progress recovery when translation fails
+### Online Demo

-
+We provide an [online demo platform](https://hub.oomol.com/package/books-translator) where you can try EPUB Translator's bilingual translation capabilities without any installation. Simply upload your EPUB file and get a translated bilingual edition.

-
-- **Academic Research**: Access foreign literature with bilingual references
-- **Content Localization**: Prepare books for international audiences
-- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
+[](https://hub.oomol.com/package/books-translator)

 ## Installation

@@ -422,6 +413,105 @@ translate(

 When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.

+### Token Usage Monitoring
+
+Track token consumption during translation to monitor API costs and usage:
+
+```python
+from epub_translator import LLM, translate, language, SubmitKind
+
+llm = LLM(
+    key="your-api-key",
+    url="https://api.openai.com/v1",
+    model="gpt-4",
+    token_encoding="o200k_base",
+)
+
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    llm=llm,
+)
+
+# Access token statistics after translation
+print(f"Total tokens: {llm.total_tokens}")
+print(f"Input tokens: {llm.input_tokens}")
+print(f"Input cache tokens: {llm.input_cache_tokens}")
+print(f"Output tokens: {llm.output_tokens}")
+```
+
+**Available Statistics:**
+
+- `total_tokens` - Total number of tokens used (input + output)
+- `input_tokens` - Number of prompt/input tokens
+- `input_cache_tokens` - Number of cached input tokens (when using prompt caching)
+- `output_tokens` - Number of generated/completion tokens
+
+**Real-time Monitoring:**
+
+You can also monitor token usage in real-time during translation:
+
+```python
+from tqdm import tqdm
+import time
+
+with tqdm(total=100, desc="Translating", unit="%") as pbar:
+    last_progress = 0.0
+    start_time = time.time()
+
+    def on_progress(progress: float):
+        nonlocal last_progress
+        increment = (progress - last_progress) * 100
+        pbar.update(increment)
+        last_progress = progress
+
+        # Update token stats in progress bar
+        pbar.set_postfix({
+            'tokens': llm.total_tokens,
+            'cost_est': f'${llm.total_tokens * 0.00001:.4f}'  # Estimate based on your pricing
+        })
+
+    translate(
+        source_path="source.epub",
+        target_path="translated.epub",
+        target_language=language.ENGLISH,
+        submit=SubmitKind.APPEND_BLOCK,
+        llm=llm,
+        on_progress=on_progress,
+    )
+
+    elapsed = time.time() - start_time
+    print(f"\nTranslation completed in {elapsed:.1f}s")
+    print(f"Total tokens used: {llm.total_tokens:,}")
+    print(f"Average tokens/second: {llm.total_tokens/elapsed:.1f}")
+```
+
+**Dual-LLM Token Tracking:**
+
+When using separate LLMs for translation and filling, each LLM tracks its own statistics:
+
+```python
+translation_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+fill_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    translation_llm=translation_llm,
+    fill_llm=fill_llm,
+)
+
+print(f"Translation tokens: {translation_llm.total_tokens}")
+print(f"Fill tokens: {fill_llm.total_tokens}")
+print(f"Combined total: {translation_llm.total_tokens + fill_llm.total_tokens}")
+```
+
+**Note:** Token statistics are cumulative across all API calls made by the LLM instance. The counts only increase and are thread-safe when using concurrent translation.
+
 ## Related Projects

 ### PDF Craft
{epub_translator-0.1.7 → epub_translator-0.1.9}/README.md

@@ -12,26 +12,17 @@
 </div>


-
+Want to read a book in a foreign language without losing the original context? EPUB Translator transforms any EPUB into a bilingual edition with AI-powered translations displayed side-by-side with the original text.

-
+Whether you're learning a new language, conducting academic research, or simply enjoying foreign literature, you get both versions in one book - preserving all formatting, images, and structure.

-
+

-
-- **LLM-Powered**: Leverages large language models for high-quality, context-aware translations
-- **Format Preservation**: Maintains EPUB structure, styles, images, and formatting
-- **Complete Translation**: Translates chapter content, table of contents, and metadata
-- **Progress Tracking**: Monitor translation progress with built-in callbacks
-- **Flexible LLM Support**: Works with any OpenAI-compatible API endpoint
-- **Caching**: Built-in caching for progress recovery when translation fails
+### Online Demo

-
+We provide an [online demo platform](https://hub.oomol.com/package/books-translator) where you can try EPUB Translator's bilingual translation capabilities without any installation. Simply upload your EPUB file and get a translated bilingual edition.

-
-- **Academic Research**: Access foreign literature with bilingual references
-- **Content Localization**: Prepare books for international audiences
-- **Cross-Cultural Reading**: Enjoy literature while understanding cultural nuances
+[](https://hub.oomol.com/package/books-translator)

 ## Installation

@@ -388,6 +379,105 @@ translate(

 When using `concurrency > 1`, ensure that any custom callback functions (`on_progress`, `on_fill_failed`) are thread-safe. Built-in callbacks are thread-safe by default.

+### Token Usage Monitoring
+
+Track token consumption during translation to monitor API costs and usage:
+
+```python
+from epub_translator import LLM, translate, language, SubmitKind
+
+llm = LLM(
+    key="your-api-key",
+    url="https://api.openai.com/v1",
+    model="gpt-4",
+    token_encoding="o200k_base",
+)
+
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    llm=llm,
+)
+
+# Access token statistics after translation
+print(f"Total tokens: {llm.total_tokens}")
+print(f"Input tokens: {llm.input_tokens}")
+print(f"Input cache tokens: {llm.input_cache_tokens}")
+print(f"Output tokens: {llm.output_tokens}")
+```
+
+**Available Statistics:**
+
+- `total_tokens` - Total number of tokens used (input + output)
+- `input_tokens` - Number of prompt/input tokens
+- `input_cache_tokens` - Number of cached input tokens (when using prompt caching)
+- `output_tokens` - Number of generated/completion tokens
+
+**Real-time Monitoring:**
+
+You can also monitor token usage in real-time during translation:
+
+```python
+from tqdm import tqdm
+import time
+
+with tqdm(total=100, desc="Translating", unit="%") as pbar:
+    last_progress = 0.0
+    start_time = time.time()
+
+    def on_progress(progress: float):
+        nonlocal last_progress
+        increment = (progress - last_progress) * 100
+        pbar.update(increment)
+        last_progress = progress
+
+        # Update token stats in progress bar
+        pbar.set_postfix({
+            'tokens': llm.total_tokens,
+            'cost_est': f'${llm.total_tokens * 0.00001:.4f}'  # Estimate based on your pricing
+        })
+
+    translate(
+        source_path="source.epub",
+        target_path="translated.epub",
+        target_language=language.ENGLISH,
+        submit=SubmitKind.APPEND_BLOCK,
+        llm=llm,
+        on_progress=on_progress,
+    )
+
+    elapsed = time.time() - start_time
+    print(f"\nTranslation completed in {elapsed:.1f}s")
+    print(f"Total tokens used: {llm.total_tokens:,}")
+    print(f"Average tokens/second: {llm.total_tokens/elapsed:.1f}")
+```
+
+**Dual-LLM Token Tracking:**
+
+When using separate LLMs for translation and filling, each LLM tracks its own statistics:
+
+```python
+translation_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+fill_llm = LLM(key="...", url="...", model="gpt-4", token_encoding="o200k_base")
+
+translate(
+    source_path="source.epub",
+    target_path="translated.epub",
+    target_language=language.ENGLISH,
+    submit=SubmitKind.APPEND_BLOCK,
+    translation_llm=translation_llm,
+    fill_llm=fill_llm,
+)
+
+print(f"Translation tokens: {translation_llm.total_tokens}")
+print(f"Fill tokens: {fill_llm.total_tokens}")
+print(f"Combined total: {translation_llm.total_tokens + fill_llm.total_tokens}")
+```
+
+**Note:** Token statistics are cumulative across all API calls made by the LLM instance. The counts only increase and are thread-safe when using concurrent translation.
+
 ## Related Projects

 ### PDF Craft
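A side note on the quoted "Real-time Monitoring" example above: `nonlocal last_progress` appears at module scope, where Python raises a SyntaxError (`nonlocal` requires an enclosing function). A minimal working variant, as a sketch rather than code from the package, wraps the loop in a function:

```python
from tqdm import tqdm

from epub_translator import LLM, translate


def run_with_progress(llm: LLM, **translate_kwargs) -> None:
    # Sketch only: wrapping the README's monitoring loop in a function gives
    # `nonlocal` a real enclosing scope to bind to.
    with tqdm(total=100, desc="Translating", unit="%") as pbar:
        last_progress = 0.0

        def on_progress(progress: float) -> None:
            nonlocal last_progress  # valid here: enclosing function scope
            pbar.update((progress - last_progress) * 100)
            last_progress = progress
            pbar.set_postfix({"tokens": llm.total_tokens})

        translate(llm=llm, on_progress=on_progress, **translate_kwargs)
```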
epub_translator-0.1.9/epub_translator/epub/metadata.py

@@ -0,0 +1,85 @@
+from dataclasses import dataclass
+from pathlib import Path
+
+from ..xml import XMLLikeNode
+from .common import find_opf_path
+from .zip import Zip
+
+
+@dataclass
+class MetadataField:
+    tag_name: str
+    text: str
+
+
+@dataclass
+class MetadataContext:
+    opf_path: Path  # path of the OPF file
+    xml_node: XMLLikeNode  # XMLLikeNode object; preserves the original file information
+
+
+SKIP_FIELDS = frozenset(
+    (
+        "language",
+        "identifier",
+        "date",
+        "meta",
+        "contributor",  # Usually technical information
+    )
+)
+
+
+def read_metadata(zip: Zip) -> tuple[list[MetadataField], MetadataContext]:
+    opf_path = find_opf_path(zip)
+
+    with zip.read(opf_path) as f:
+        xml_node = XMLLikeNode(f, is_html_like=False)
+
+    metadata_elem = None
+    for child in xml_node.element:
+        if child.tag.endswith("metadata"):
+            metadata_elem = child
+            break
+
+    if metadata_elem is None:
+        context = MetadataContext(opf_path=opf_path, xml_node=xml_node)
+        return [], context
+
+    fields: list[MetadataField] = []
+    for elem in metadata_elem:
+        tag_name = elem.tag
+        if elem.text and elem.text.strip() and tag_name not in SKIP_FIELDS:
+            fields.append(MetadataField(tag_name=tag_name, text=elem.text.strip()))
+
+    context = MetadataContext(opf_path=opf_path, xml_node=xml_node)
+    return fields, context
+
+
+def write_metadata(zip: Zip, fields: list[MetadataField], context: MetadataContext) -> None:
+    metadata_elem = None
+    for child in context.xml_node.element:
+        if child.tag.endswith("metadata"):
+            metadata_elem = child
+            break
+
+    if metadata_elem is None:
+        return
+
+    fields_by_tag: dict[str, list[str]] = {}
+    for field in fields:
+        if field.tag_name not in fields_by_tag:
+            fields_by_tag[field.tag_name] = []
+        fields_by_tag[field.tag_name].append(field.text)
+
+    tag_counters: dict[str, int] = {tag: 0 for tag in fields_by_tag}
+
+    for elem in metadata_elem:
+        tag_name = elem.tag
+        if tag_name in fields_by_tag and elem.text and elem.text.strip():
+            counter = tag_counters[tag_name]
+            if counter < len(fields_by_tag[tag_name]):
+                elem.text = fields_by_tag[tag_name][counter]
+                tag_counters[tag_name] += 1
+
+    with zip.replace(context.opf_path) as f:
+        context.xml_node.save(f)
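For orientation, a minimal usage sketch of the new metadata round-trip (not from the diff; `zip` is assumed to be a `Zip` already opened on an EPUB archive, whose constructor is not shown here):

```python
from epub_translator.epub.metadata import read_metadata, write_metadata


def upper_case_text(text: str) -> str:
    # Stand-in transform for illustration; a real caller would translate here.
    return text.upper()


# `zip` is an assumption: a Zip instance opened on the EPUB archive.
fields, context = read_metadata(zip)
for field in fields:
    field.text = upper_case_text(field.text)
write_metadata(zip, fields, context)
```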
{epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/epub/toc.py

@@ -3,8 +3,8 @@ from pathlib import Path
 from xml.etree import ElementTree as ET
 from xml.etree.ElementTree import Element

-from ..xml
-from .common import
+from ..xml import XMLLikeNode, plain_text
+from .common import find_opf_path, strip_namespace
 from .zip import Zip


@@ -41,30 +41,40 @@ class Toc:
         return self.href


-
+@dataclass
+class TocContext:
+    version: int
+    toc_path: Path
+    xml_node: XMLLikeNode
+
+
+def read_toc(zip: Zip) -> tuple[list[Toc], TocContext]:
     version = _detect_epub_version(zip)
     toc_path = _find_toc_path(zip, version)

     if toc_path is None:
-
+        raise ValueError("Cannot find TOC file in EPUB")

-
-
-    else:
-        return _read_nav_toc(zip, toc_path)
+    with zip.read(toc_path) as f:
+        xml_node = XMLLikeNode(f, is_html_like=False)

+    if version == 3:
+        toc_list = _read_nav_toc(xml_node.element)
+    else:
+        toc_list = _read_ncx_toc(xml_node.element)

-
-
-    toc_path = _find_toc_path(zip, version)
+    context = TocContext(version=version, toc_path=toc_path, xml_node=xml_node)
+    return toc_list, context

-    if toc_path is None:
-        raise ValueError("Cannot find TOC file in EPUB")

-
-
+def write_toc(zip: Zip, toc: list[Toc], context: TocContext) -> None:
+    if context.version == 2:
+        _update_ncx_toc(context.xml_node.element, toc)
     else:
-
+        _update_nav_toc(context.xml_node.element, toc)
+
+    with zip.replace(context.toc_path) as f:
+        context.xml_node.save(f)


 def _detect_epub_version(zip: Zip) -> int:
@@ -72,8 +82,6 @@ def _detect_epub_version(zip: Zip) -> int:
     with zip.read(opf_path) as f:
         content = f.read()
     root = ET.fromstring(content)
-
-    # Check the version attribute of the package element
     version_str = root.get("version", "2.0")

     if version_str.startswith("3"):
@@ -89,7 +97,7 @@ def _find_toc_path(zip: Zip, version: int) -> Path | None:
     with zip.read(opf_path) as f:
         content = f.read()
     root = ET.fromstring(content)
-    strip_namespace(root)
+    strip_namespace(root)

     manifest = root.find(".//manifest")
     if manifest is None:
@@ -115,23 +123,18 @@ def _find_toc_path(zip: Zip, version: int) -> Path | None:
     return None


-def _read_ncx_toc(
-
-
-
-    strip_namespace(root)  # strip namespace prefixes to simplify XPath
-
-    nav_map = root.find(".//navMap")
-    if nav_map is None:
-        return []
+def _read_ncx_toc(root: Element) -> list[Toc]:
+    nav_map = root.find(".//navMap")
+    if nav_map is None:
+        return []

-
-
-
-
-
+    result = []
+    for nav_point in nav_map.findall("navPoint"):
+        toc_item = _parse_nav_point(nav_point)
+        if toc_item:
+            result.append(toc_item)

-
+    return result


 def _parse_nav_point(nav_point: Element) -> Toc | None:
@@ -172,18 +175,11 @@ def _parse_nav_point(nav_point: Element) -> Toc | None:
     )


-def
-
-
-
-
-    nav_map = root.find(f".//{{{ns}}}navMap" if ns else ".//navMap")
-    if nav_map is None:
-        raise ValueError("Cannot find navMap in NCX file")
-    _update_nav_points(nav_map, toc_list, ns)
-    tree = ET.ElementTree(root)
-    with zip.replace(ncx_path) as out:
-        tree.write(out, encoding="utf-8", xml_declaration=True)
+def _update_ncx_toc(root: Element, toc_list: list[Toc]) -> None:
+    nav_map = root.find(".//navMap")
+    if nav_map is None:
+        raise ValueError("Cannot find navMap in NCX file")
+    _update_nav_points(nav_map, toc_list, None)


 def _update_nav_points(parent: Element, toc_list: list[Toc], ns: str | None, start_play_order: int = 1) -> int:
@@ -255,34 +251,28 @@ def _create_nav_point(toc: Toc, ns: str | None, play_order: int) -> Element:
     return nav_point


-def _read_nav_toc(
-
-
-
-
-
-
-    nav_elem = None
-    for nav in root.findall(".//nav"):
-        epub_type = nav.get("{http://www.idpf.org/2007/ops}type") or nav.get("type")
-        if epub_type == "toc":
-            nav_elem = nav
-            break
+def _read_nav_toc(root: Element) -> list[Toc]:
+    nav_elem = None
+    for nav in root.findall(".//nav"):
+        epub_type = nav.get("type")
+        if epub_type == "toc":
+            nav_elem = nav
+            break

-
-
+    if nav_elem is None:
+        return []

-
-
-
+    ol = nav_elem.find(".//ol")
+    if ol is None:
+        return []

-
-
-
-
-
+    result = []
+    for li in ol.findall("li"):
+        toc_item = _parse_nav_li(li)
+        if toc_item:
+            result.append(toc_item)

-
+    return result


 def _parse_nav_li(li: Element) -> Toc | None:
@@ -331,30 +321,22 @@ def _parse_nav_li(li: Element) -> Toc | None:
     )


-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if ol is None:
-        raise ValueError("Cannot find ol in nav element")
-
-    _update_nav_lis(ol, toc_list, ns)
-
-    tree = ET.ElementTree(root)
-    with zip.replace(nav_path) as out:
-        tree.write(out, encoding="utf-8", xml_declaration=True)
+def _update_nav_toc(root: Element, toc_list: list[Toc]) -> None:
+    nav_elem = None
+    for nav in root.findall(".//nav"):
+        epub_type = nav.get("type")
+        if epub_type == "toc":
+            nav_elem = nav
+            break
+
+    if nav_elem is None:
+        raise ValueError("Cannot find nav element with type='toc'")
+
+    ol = nav_elem.find(".//ol")
+    if ol is None:
+        raise ValueError("Cannot find ol in nav element")
+
+    _update_nav_lis(ol, toc_list, None)


 def _update_nav_lis(ol: Element, toc_list: list[Toc], ns: str | None) -> None:
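The net effect of the toc.py refactor: parsing moves into `read_toc`, which returns the parsed `XMLLikeNode` inside a `TocContext`, and `write_toc` edits that same tree in place instead of re-reading the file and re-serializing it with ElementTree. A minimal round-trip sketch under the same assumptions as above (`zip` opened on an EPUB; `title` as a hypothetical field name on `Toc`, which this diff does not confirm):

```python
from epub_translator.epub.toc import read_toc, write_toc

# Hypothetical sketch, not from the diff: read, transform, write back.
toc_list, context = read_toc(zip)  # `zip`: a Zip opened on the EPUB
for toc in toc_list:
    toc.title = toc.title.upper()  # `title` is a stand-in attribute name
write_toc(zip, toc_list, context)
```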
{epub_translator-0.1.7 → epub_translator-0.1.9}/epub_translator/llm/core.py

@@ -13,6 +13,7 @@ from ..template import create_env
 from .context import LLMContext
 from .executor import LLMExecutor
 from .increasable import Increasable
+from .statistics import Statistics
 from .types import Message

 # Global state for logger filename generation
@@ -44,7 +45,7 @@ class LLM:
         self._temperature: Increasable = Increasable(temperature)
         self._cache_path: Path | None = self._ensure_dir_path(cache_path)
         self._logger_save_path: Path | None = self._ensure_dir_path(log_dir_path)
-
+        self._statistics = Statistics()
         self._executor = LLMExecutor(
             url=url,
             model=model,
@@ -53,12 +54,29 @@ class LLM:
             retry_times=retry_times,
             retry_interval_seconds=retry_interval_seconds,
             create_logger=self._create_logger,
+            statistics=self._statistics,
         )

     @property
     def encoding(self) -> Encoding:
         return self._encoding

+    @property
+    def total_tokens(self) -> int:
+        return self._statistics.total_tokens
+
+    @property
+    def input_tokens(self) -> int:
+        return self._statistics.input_tokens
+
+    @property
+    def input_cache_tokens(self) -> int:
+        return self._statistics.input_cache_tokens
+
+    @property
+    def output_tokens(self) -> int:
+        return self._statistics.output_tokens
+
     def context(self, cache_seed_content: str | None = None) -> LLMContext:
         return LLMContext(
             executor=self._executor,