bookdatamaker 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bookdatamaker-0.2.3/src/bookdatamaker.egg-info → bookdatamaker-0.2.4}/PKG-INFO +48 -23
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/README.md +47 -22
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/pyproject.toml +1 -1
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/cli.py +7 -2
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/ocr/extractor.py +14 -3
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4/src/bookdatamaker.egg-info}/PKG-INFO +48 -23
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/LICENSE +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/setup.cfg +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/__init__.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/dataset/__init__.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/dataset/builder.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/dataset/dataset_manager.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/llm/__init__.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/llm/parallel_generator.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/mcp/__init__.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/mcp/server.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/ocr/__init__.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/ocr/document_parser.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/utils/__init__.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/utils/page_manager.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker/utils/status.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker.egg-info/SOURCES.txt +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker.egg-info/dependency_links.txt +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker.egg-info/entry_points.txt +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker.egg-info/requires.txt +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/src/bookdatamaker.egg-info/top_level.txt +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/tests/test_dataset.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/tests/test_mcp.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/tests/test_ocr.py +0 -0
- {bookdatamaker-0.2.3 → bookdatamaker-0.2.4}/tests/test_paragraph_indexing.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bookdatamaker
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: CLI tool for extracting text with DeepSeek OCR and generating datasets
|
|
5
5
|
Author-email: zwh20081 <zwh20081@solart.pro>
|
|
6
6
|
License: MIT
|
|
@@ -89,6 +89,49 @@ A powerful CLI tool for extracting text from documents using DeepSeek OCR and ge
|
|
|
89
89
|
- 🌐 **Flexible Modes**: API or self-hosted for both stages
|
|
90
90
|
- 📈 **Progress Tracking**: Real-time progress bars
|
|
91
91
|
|
|
92
|
+
## Installation
|
|
93
|
+
|
|
94
|
+
### From PyPI (Recommended)
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install bookdatamaker
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### From Source
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
git clone https://github.com/yourusername/bookdatamaker.git
|
|
104
|
+
cd bookdatamaker
|
|
105
|
+
pip install -r requirements.txt
|
|
106
|
+
pip install -e .
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Optional: Local Inference Support
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# For self-hosted OCR and LLM generation
|
|
113
|
+
pip install bookdatamaker[local] # From PyPI
|
|
114
|
+
# OR
|
|
115
|
+
pip install -e ".[local]" # From source - installs transformers==4.46.3, torch, flash-attn, etc.
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
**Note**: The project requires `transformers==4.46.3` for optimal compatibility with DeepSeek-OCR. A warning will be displayed if a different version is detected.
|
|
119
|
+
|
|
120
|
+
### System Requirements
|
|
121
|
+
|
|
122
|
+
**For API Mode:**
|
|
123
|
+
- Python 3.10+
|
|
124
|
+
- API keys (OpenAI, DeepSeek, etc.)
|
|
125
|
+
|
|
126
|
+
**For Local Mode:**
|
|
127
|
+
- Python 3.10-3.12 (3.13 not supported due to vLLM compatibility)
|
|
128
|
+
- NVIDIA GPU with CUDA support (or CPU, though slower)
|
|
129
|
+
- 16GB+ VRAM recommended for GPU
|
|
130
|
+
- transformers==4.46.3
|
|
131
|
+
- Linux or WSL2 (recommended)
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
92
135
|
## Quick Start
|
|
93
136
|
|
|
94
137
|
### Prerequisites
|
|
@@ -103,7 +146,7 @@ export DEEPSEEK_API_KEY=your_deepseek_key # For API OCR mode
|
|
|
103
146
|
|
|
104
147
|
```bash
|
|
105
148
|
# 1. Install
|
|
106
|
-
pip install
|
|
149
|
+
pip install bookdatamaker
|
|
107
150
|
|
|
108
151
|
# 2. Extract → Generate → Export
|
|
109
152
|
bookdatamaker extract book.pdf -o ./extracted
|
|
@@ -115,7 +158,7 @@ bookdatamaker export-dataset dataset.db -o output.parquet
|
|
|
115
158
|
|
|
116
159
|
```bash
|
|
117
160
|
# 1. Install with local dependencies
|
|
118
|
-
pip install
|
|
161
|
+
pip install bookdatamaker[local]
|
|
119
162
|
|
|
120
163
|
# 2. Extract with local OCR
|
|
121
164
|
bookdatamaker extract book.pdf --mode local --batch-size 8 -o ./extracted
|
|
@@ -131,27 +174,9 @@ bookdatamaker generate ./extracted \
|
|
|
131
174
|
bookdatamaker export-dataset dataset.db -o output.parquet
|
|
132
175
|
```
|
|
133
176
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
### Basic Installation
|
|
137
|
-
|
|
138
|
-
```bash
|
|
139
|
-
git clone https://github.com/yourusername/bookdatamaker.git
|
|
140
|
-
cd bookdatamaker
|
|
141
|
-
pip install -r requirements.txt
|
|
142
|
-
pip install -e .
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
### Optional: Local Inference Support
|
|
146
|
-
|
|
147
|
-
```bash
|
|
148
|
-
# For self-hosted OCR and LLM generation
|
|
149
|
-
pip install -e ".[local]" # Installs transformers==4.46.3, torch, flash-attn, etc.
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
**Note**: The project requires `transformers==4.46.3` for optimal compatibility with DeepSeek-OCR. A warning will be displayed if a different version is detected.
|
|
177
|
+
---
|
|
153
178
|
|
|
154
|
-
|
|
179
|
+
## System Requirements
|
|
155
180
|
|
|
156
181
|
**For API Mode:**
|
|
157
182
|
- Python 3.10+
|
|
@@ -37,6 +37,49 @@ A powerful CLI tool for extracting text from documents using DeepSeek OCR and ge
|
|
|
37
37
|
- 🌐 **Flexible Modes**: API or self-hosted for both stages
|
|
38
38
|
- 📈 **Progress Tracking**: Real-time progress bars
|
|
39
39
|
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
### From PyPI (Recommended)
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install bookdatamaker
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### From Source
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
git clone https://github.com/yourusername/bookdatamaker.git
|
|
52
|
+
cd bookdatamaker
|
|
53
|
+
pip install -r requirements.txt
|
|
54
|
+
pip install -e .
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Optional: Local Inference Support
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# For self-hosted OCR and LLM generation
|
|
61
|
+
pip install bookdatamaker[local] # From PyPI
|
|
62
|
+
# OR
|
|
63
|
+
pip install -e ".[local]" # From source - installs transformers==4.46.3, torch, flash-attn, etc.
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
**Note**: The project requires `transformers==4.46.3` for optimal compatibility with DeepSeek-OCR. A warning will be displayed if a different version is detected.
|
|
67
|
+
|
|
68
|
+
### System Requirements
|
|
69
|
+
|
|
70
|
+
**For API Mode:**
|
|
71
|
+
- Python 3.10+
|
|
72
|
+
- API keys (OpenAI, DeepSeek, etc.)
|
|
73
|
+
|
|
74
|
+
**For Local Mode:**
|
|
75
|
+
- Python 3.10-3.12 (3.13 not supported due to vLLM compatibility)
|
|
76
|
+
- NVIDIA GPU with CUDA support (or CPU, though slower)
|
|
77
|
+
- 16GB+ VRAM recommended for GPU
|
|
78
|
+
- transformers==4.46.3
|
|
79
|
+
- Linux or WSL2 (recommended)
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
40
83
|
## Quick Start
|
|
41
84
|
|
|
42
85
|
### Prerequisites
|
|
@@ -51,7 +94,7 @@ export DEEPSEEK_API_KEY=your_deepseek_key # For API OCR mode
|
|
|
51
94
|
|
|
52
95
|
```bash
|
|
53
96
|
# 1. Install
|
|
54
|
-
pip install
|
|
97
|
+
pip install bookdatamaker
|
|
55
98
|
|
|
56
99
|
# 2. Extract → Generate → Export
|
|
57
100
|
bookdatamaker extract book.pdf -o ./extracted
|
|
@@ -63,7 +106,7 @@ bookdatamaker export-dataset dataset.db -o output.parquet
|
|
|
63
106
|
|
|
64
107
|
```bash
|
|
65
108
|
# 1. Install with local dependencies
|
|
66
|
-
pip install
|
|
109
|
+
pip install bookdatamaker[local]
|
|
67
110
|
|
|
68
111
|
# 2. Extract with local OCR
|
|
69
112
|
bookdatamaker extract book.pdf --mode local --batch-size 8 -o ./extracted
|
|
@@ -79,27 +122,9 @@ bookdatamaker generate ./extracted \
|
|
|
79
122
|
bookdatamaker export-dataset dataset.db -o output.parquet
|
|
80
123
|
```
|
|
81
124
|
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
### Basic Installation
|
|
85
|
-
|
|
86
|
-
```bash
|
|
87
|
-
git clone https://github.com/yourusername/bookdatamaker.git
|
|
88
|
-
cd bookdatamaker
|
|
89
|
-
pip install -r requirements.txt
|
|
90
|
-
pip install -e .
|
|
91
|
-
```
|
|
92
|
-
|
|
93
|
-
### Optional: Local Inference Support
|
|
94
|
-
|
|
95
|
-
```bash
|
|
96
|
-
# For self-hosted OCR and LLM generation
|
|
97
|
-
pip install -e ".[local]" # Installs transformers==4.46.3, torch, flash-attn, etc.
|
|
98
|
-
```
|
|
99
|
-
|
|
100
|
-
**Note**: The project requires `transformers==4.46.3` for optimal compatibility with DeepSeek-OCR. A warning will be displayed if a different version is detected.
|
|
125
|
+
---
|
|
101
126
|
|
|
102
|
-
|
|
127
|
+
## System Requirements
|
|
103
128
|
|
|
104
129
|
**For API Mode:**
|
|
105
130
|
- Python 3.10+
|
|
@@ -14,11 +14,16 @@ from bookdatamaker.ocr import OCRExtractor
|
|
|
14
14
|
from bookdatamaker.utils import PageManager
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
@click.group()
|
|
17
|
+
@click.group(invoke_without_command=True)
|
|
18
18
|
@click.version_option(version="0.1.0")
|
|
19
|
-
|
|
19
|
+
@click.pass_context
|
|
20
|
+
def cli(ctx: click.Context) -> None:
|
|
20
21
|
"""Book Data Maker - Extract text and generate datasets."""
|
|
21
22
|
load_dotenv()
|
|
23
|
+
|
|
24
|
+
# Show help if no command provided
|
|
25
|
+
if ctx.invoked_subcommand is None:
|
|
26
|
+
click.echo(ctx.get_help())
|
|
22
27
|
|
|
23
28
|
|
|
24
29
|
@cli.command()
|
|
@@ -8,7 +8,6 @@ import os
|
|
|
8
8
|
|
|
9
9
|
import httpx
|
|
10
10
|
from PIL import Image
|
|
11
|
-
import torch
|
|
12
11
|
|
|
13
12
|
|
|
14
13
|
class OCRExtractor:
|
|
@@ -75,6 +74,18 @@ class OCRExtractor:
|
|
|
75
74
|
"""Initialize local transformers model."""
|
|
76
75
|
import importlib.metadata
|
|
77
76
|
|
|
77
|
+
# Import torch here (only when needed for local mode)
|
|
78
|
+
try:
|
|
79
|
+
import torch
|
|
80
|
+
except ImportError:
|
|
81
|
+
raise ImportError(
|
|
82
|
+
"PyTorch not found. Please install it for local mode:\n"
|
|
83
|
+
" pip install bookdatamaker[local]"
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# Store torch reference for later use
|
|
87
|
+
self.torch = torch
|
|
88
|
+
|
|
78
89
|
# Check transformers version
|
|
79
90
|
try:
|
|
80
91
|
transformers_version = importlib.metadata.version("transformers")
|
|
@@ -217,10 +228,10 @@ class OCRExtractor:
|
|
|
217
228
|
)
|
|
218
229
|
|
|
219
230
|
# Move inputs to GPU
|
|
220
|
-
inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
|
|
231
|
+
inputs = {k: v.cuda() if isinstance(v, self.torch.Tensor) else v
|
|
221
232
|
for k, v in inputs.items()}
|
|
222
233
|
|
|
223
|
-
with torch.no_grad():
|
|
234
|
+
with self.torch.no_grad():
|
|
224
235
|
outputs = self.model.generate(**inputs, max_new_tokens=8192)
|
|
225
236
|
|
|
226
237
|
# Decode output
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bookdatamaker
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: CLI tool for extracting text with DeepSeek OCR and generating datasets
|
|
5
5
|
Author-email: zwh20081 <zwh20081@solart.pro>
|
|
6
6
|
License: MIT
|
|
@@ -89,6 +89,49 @@ A powerful CLI tool for extracting text from documents using DeepSeek OCR and ge
|
|
|
89
89
|
- 🌐 **Flexible Modes**: API or self-hosted for both stages
|
|
90
90
|
- 📈 **Progress Tracking**: Real-time progress bars
|
|
91
91
|
|
|
92
|
+
## Installation
|
|
93
|
+
|
|
94
|
+
### From PyPI (Recommended)
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install bookdatamaker
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### From Source
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
git clone https://github.com/yourusername/bookdatamaker.git
|
|
104
|
+
cd bookdatamaker
|
|
105
|
+
pip install -r requirements.txt
|
|
106
|
+
pip install -e .
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Optional: Local Inference Support
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# For self-hosted OCR and LLM generation
|
|
113
|
+
pip install bookdatamaker[local] # From PyPI
|
|
114
|
+
# OR
|
|
115
|
+
pip install -e ".[local]" # From source - installs transformers==4.46.3, torch, flash-attn, etc.
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
**Note**: The project requires `transformers==4.46.3` for optimal compatibility with DeepSeek-OCR. A warning will be displayed if a different version is detected.
|
|
119
|
+
|
|
120
|
+
### System Requirements
|
|
121
|
+
|
|
122
|
+
**For API Mode:**
|
|
123
|
+
- Python 3.10+
|
|
124
|
+
- API keys (OpenAI, DeepSeek, etc.)
|
|
125
|
+
|
|
126
|
+
**For Local Mode:**
|
|
127
|
+
- Python 3.10-3.12 (3.13 not supported due to vLLM compatibility)
|
|
128
|
+
- NVIDIA GPU with CUDA support (or CPU, though slower)
|
|
129
|
+
- 16GB+ VRAM recommended for GPU
|
|
130
|
+
- transformers==4.46.3
|
|
131
|
+
- Linux or WSL2 (recommended)
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
92
135
|
## Quick Start
|
|
93
136
|
|
|
94
137
|
### Prerequisites
|
|
@@ -103,7 +146,7 @@ export DEEPSEEK_API_KEY=your_deepseek_key # For API OCR mode
|
|
|
103
146
|
|
|
104
147
|
```bash
|
|
105
148
|
# 1. Install
|
|
106
|
-
pip install
|
|
149
|
+
pip install bookdatamaker
|
|
107
150
|
|
|
108
151
|
# 2. Extract → Generate → Export
|
|
109
152
|
bookdatamaker extract book.pdf -o ./extracted
|
|
@@ -115,7 +158,7 @@ bookdatamaker export-dataset dataset.db -o output.parquet
|
|
|
115
158
|
|
|
116
159
|
```bash
|
|
117
160
|
# 1. Install with local dependencies
|
|
118
|
-
pip install
|
|
161
|
+
pip install bookdatamaker[local]
|
|
119
162
|
|
|
120
163
|
# 2. Extract with local OCR
|
|
121
164
|
bookdatamaker extract book.pdf --mode local --batch-size 8 -o ./extracted
|
|
@@ -131,27 +174,9 @@ bookdatamaker generate ./extracted \
|
|
|
131
174
|
bookdatamaker export-dataset dataset.db -o output.parquet
|
|
132
175
|
```
|
|
133
176
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
### Basic Installation
|
|
137
|
-
|
|
138
|
-
```bash
|
|
139
|
-
git clone https://github.com/yourusername/bookdatamaker.git
|
|
140
|
-
cd bookdatamaker
|
|
141
|
-
pip install -r requirements.txt
|
|
142
|
-
pip install -e .
|
|
143
|
-
```
|
|
144
|
-
|
|
145
|
-
### Optional: Local Inference Support
|
|
146
|
-
|
|
147
|
-
```bash
|
|
148
|
-
# For self-hosted OCR and LLM generation
|
|
149
|
-
pip install -e ".[local]" # Installs transformers==4.46.3, torch, flash-attn, etc.
|
|
150
|
-
```
|
|
151
|
-
|
|
152
|
-
**Note**: The project requires `transformers==4.46.3` for optimal compatibility with DeepSeek-OCR. A warning will be displayed if a different version is detected.
|
|
177
|
+
---
|
|
153
178
|
|
|
154
|
-
|
|
179
|
+
## System Requirements
|
|
155
180
|
|
|
156
181
|
**For API Mode:**
|
|
157
182
|
- Python 3.10+
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|