bookdatamaker 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {bookdatamaker-0.2.0/src/bookdatamaker.egg-info → bookdatamaker-0.2.2}/PKG-INFO +50 -35
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/README.md +48 -33
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/pyproject.toml +2 -2
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/cli.py +90 -72
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/llm/parallel_generator.py +141 -74
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/mcp/server.py +102 -1
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/ocr/extractor.py +90 -51
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/utils/page_manager.py +49 -9
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2/src/bookdatamaker.egg-info}/PKG-INFO +50 -35
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/LICENSE +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/setup.cfg +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/__init__.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/dataset/__init__.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/dataset/builder.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/dataset/dataset_manager.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/llm/__init__.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/mcp/__init__.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/ocr/__init__.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/ocr/document_parser.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/utils/__init__.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/utils/status.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/SOURCES.txt +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/dependency_links.txt +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/entry_points.txt +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/requires.txt +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/top_level.txt +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/tests/test_dataset.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/tests/test_mcp.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/tests/test_ocr.py +0 -0
- {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/tests/test_paragraph_indexing.py +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: bookdatamaker
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: CLI tool for extracting text with DeepSeek OCR and generating datasets
|
|
5
|
-
Author-email:
|
|
5
|
+
Author-email: zwh20081 <zwh20081@solart.pro>
|
|
6
6
|
License: MIT
|
|
7
7
|
Requires-Python: <3.13,>=3.10
|
|
8
8
|
Description-Content-Type: text/markdown
|
|
@@ -108,7 +108,7 @@ pip install -r requirements.txt && pip install -e .
|
|
|
108
108
|
|
|
109
109
|
# 2. Extract → Generate → Export
|
|
110
110
|
bookdatamaker extract book.pdf -o ./extracted
|
|
111
|
-
bookdatamaker generate ./extracted
|
|
111
|
+
bookdatamaker generate ./extracted -d dataset.db --distribution "10,10,20,30,20,10"
|
|
112
112
|
bookdatamaker export-dataset dataset.db -o output.parquet
|
|
113
113
|
```
|
|
114
114
|
|
|
@@ -122,7 +122,7 @@ pip install -r requirements.txt && pip install -e ".[local]"
|
|
|
122
122
|
bookdatamaker extract book.pdf --mode local --batch-size 8 -o ./extracted
|
|
123
123
|
|
|
124
124
|
# 3. Generate with vLLM
|
|
125
|
-
bookdatamaker generate ./extracted
|
|
125
|
+
bookdatamaker generate ./extracted \
|
|
126
126
|
--mode vllm \
|
|
127
127
|
--vllm-model-path meta-llama/Llama-3-8B-Instruct \
|
|
128
128
|
--distribution "25,25,25,25" \
|
|
@@ -225,42 +225,57 @@ bookdatamaker extract ./images/ --mode local -o ./extracted
|
|
|
225
225
|
|
|
226
226
|
```
|
|
227
227
|
./extracted/
|
|
228
|
-
├── page_001
|
|
229
|
-
├──
|
|
230
|
-
|
|
231
|
-
|
|
228
|
+
├── page_001/
|
|
229
|
+
│ ├── page_001.png # Page image
|
|
230
|
+
│ └── result.mmd # Extracted text in markdown
|
|
231
|
+
├── page_002/
|
|
232
|
+
│ ├── page_002.png
|
|
233
|
+
│ └── result.mmd
|
|
234
|
+
└── ...
|
|
232
235
|
```
|
|
233
236
|
|
|
237
|
+
**Note**: Each page is stored in its own subdirectory with the extracted text in `result.mmd` format.
|
|
238
|
+
|
|
234
239
|
---
|
|
235
240
|
|
|
236
241
|
## Generate Dataset (Stage 2)
|
|
237
242
|
|
|
238
|
-
Generate Q&A datasets using parallel LLM threads
|
|
243
|
+
Generate Q&A datasets using parallel LLM threads with **page-based navigation**.
|
|
244
|
+
|
|
245
|
+
### Navigation Model
|
|
246
|
+
|
|
247
|
+
The system now uses **page navigation** instead of paragraph navigation:
|
|
248
|
+
- LLM threads navigate through document pages
|
|
249
|
+
- Tools available: `get_current_page`, `next_page`, `previous_page`, `jump_to_page`, `get_page_context`
|
|
250
|
+
- Each thread starts at a specific page based on distribution
|
|
251
|
+
- Threads can move forward/backward through pages to explore content
|
|
239
252
|
|
|
240
253
|
### Basic Usage
|
|
241
254
|
|
|
242
255
|
```bash
|
|
243
256
|
# 6 threads (from distribution), 20 Q&A pairs per thread
|
|
244
|
-
bookdatamaker generate
|
|
257
|
+
bookdatamaker generate ./extracted \
|
|
245
258
|
-d dataset.db \
|
|
246
259
|
--distribution "10,10,20,30,20,10" \
|
|
247
260
|
--datasets-per-thread 20
|
|
248
261
|
```
|
|
249
262
|
|
|
263
|
+
**Note**: The `generate` command now accepts the extracted directory (containing page_XXX/ subdirectories) instead of a combined text file.
|
|
264
|
+
|
|
250
265
|
**Key Concept**: Thread count is determined by the number of comma-separated values in `--distribution`.
|
|
251
266
|
|
|
252
267
|
### API Mode Examples
|
|
253
268
|
|
|
254
269
|
```bash
|
|
255
270
|
# OpenAI/Azure
|
|
256
|
-
bookdatamaker generate
|
|
271
|
+
bookdatamaker generate ./extracted \
|
|
257
272
|
-d dataset.db \
|
|
258
273
|
--openai-api-url https://api.openai.com/v1 \
|
|
259
274
|
--model gpt-4 \
|
|
260
275
|
--distribution "10,10,20,30,20,10"
|
|
261
276
|
|
|
262
277
|
# Custom API endpoint
|
|
263
|
-
bookdatamaker generate
|
|
278
|
+
bookdatamaker generate ./extracted \
|
|
264
279
|
--openai-api-url http://localhost:8000/v1 \
|
|
265
280
|
--model your-model-name \
|
|
266
281
|
--distribution "25,25,25,25"
|
|
@@ -272,14 +287,14 @@ Use vLLM directly without API server:
|
|
|
272
287
|
|
|
273
288
|
```bash
|
|
274
289
|
# Single GPU
|
|
275
|
-
bookdatamaker generate
|
|
290
|
+
bookdatamaker generate ./extracted \
|
|
276
291
|
--mode vllm \
|
|
277
292
|
--vllm-model-path meta-llama/Llama-3-8B-Instruct \
|
|
278
293
|
--distribution "25,25,25,25" \
|
|
279
294
|
-d dataset.db
|
|
280
295
|
|
|
281
296
|
# Multi-GPU (4 GPUs, 6 threads)
|
|
282
|
-
bookdatamaker generate
|
|
297
|
+
bookdatamaker generate ./extracted \
|
|
283
298
|
--mode vllm \
|
|
284
299
|
--vllm-model-path meta-llama/Llama-3-70B-Instruct \
|
|
285
300
|
--tensor-parallel-size 4 \
|
|
@@ -300,15 +315,15 @@ Add specific instructions to guide LLM behavior:
|
|
|
300
315
|
|
|
301
316
|
```bash
|
|
302
317
|
# Language specification
|
|
303
|
-
bookdatamaker generate
|
|
318
|
+
bookdatamaker generate ./extracted \
|
|
304
319
|
--custom-prompt "Generate all Q&A in Chinese with simplified characters"
|
|
305
320
|
|
|
306
321
|
# Format specification
|
|
307
|
-
bookdatamaker generate
|
|
322
|
+
bookdatamaker generate ./extracted \
|
|
308
323
|
--custom-prompt "Questions should be multiple-choice with 4 options"
|
|
309
324
|
|
|
310
325
|
# Multiple requirements
|
|
311
|
-
bookdatamaker generate
|
|
326
|
+
bookdatamaker generate ./extracted \
|
|
312
327
|
--custom-prompt "Requirements:
|
|
313
328
|
1. Generate questions in English
|
|
314
329
|
2. Focus on practical applications
|
|
@@ -355,15 +370,15 @@ Control where threads start in the document using distribution percentages.
|
|
|
355
370
|
### How It Works
|
|
356
371
|
|
|
357
372
|
```
|
|
358
|
-
Document:
|
|
373
|
+
Document: 100 pages
|
|
359
374
|
Distribution: "10,10,20,30,20,10" (6 threads)
|
|
360
375
|
|
|
361
|
-
Thread 0: Start at 0% →
|
|
362
|
-
Thread 1: Start at 10% →
|
|
363
|
-
Thread 2: Start at 20% →
|
|
364
|
-
Thread 3: Start at 50% →
|
|
365
|
-
Thread 4: Start at 70% →
|
|
366
|
-
Thread 5: Start at 80% →
|
|
376
|
+
Thread 0: Start at 0% → Page 1
|
|
377
|
+
Thread 1: Start at 10% → Page 10
|
|
378
|
+
Thread 2: Start at 20% → Page 20
|
|
379
|
+
Thread 3: Start at 50% → Page 50
|
|
380
|
+
Thread 4: Start at 70% → Page 70
|
|
381
|
+
Thread 5: Start at 80% → Page 80
|
|
367
382
|
```
|
|
368
383
|
|
|
369
384
|
### Distribution Strategies
|
|
@@ -387,9 +402,9 @@ Thread 5: Start at 80% → Paragraph 400
|
|
|
387
402
|
|
|
388
403
|
### Thread Count Guidelines
|
|
389
404
|
|
|
390
|
-
- **Small documents** (<
|
|
391
|
-
- **Medium documents** (
|
|
392
|
-
- **Large documents** (>
|
|
405
|
+
- **Small documents** (<50 pages): 2-4 threads
|
|
406
|
+
- **Medium documents** (50-200 pages): 4-8 threads
|
|
407
|
+
- **Large documents** (>200 pages): 8-16 threads
|
|
393
408
|
|
|
394
409
|
---
|
|
395
410
|
|
|
@@ -460,25 +475,25 @@ Chat with an LLM that can access your document through MCP tools. Perfect for ex
|
|
|
460
475
|
|
|
461
476
|
```bash
|
|
462
477
|
# Basic chat with GPT-4
|
|
463
|
-
bookdatamaker chat
|
|
478
|
+
bookdatamaker chat ./extracted
|
|
464
479
|
|
|
465
480
|
# With vLLM server
|
|
466
|
-
bookdatamaker chat
|
|
481
|
+
bookdatamaker chat ./extracted \
|
|
467
482
|
--openai-api-url http://localhost:8000/v1 \
|
|
468
483
|
--model Qwen/Qwen3-4B-Thinking-2507
|
|
469
484
|
|
|
470
485
|
# With custom database
|
|
471
|
-
bookdatamaker chat
|
|
486
|
+
bookdatamaker chat ./extracted --db my_dataset.db
|
|
472
487
|
```
|
|
473
488
|
|
|
474
489
|
### Example Interaction
|
|
475
490
|
|
|
476
491
|
```
|
|
477
|
-
📚 Document:
|
|
478
|
-
📊
|
|
492
|
+
📚 Document: ./extracted
|
|
493
|
+
📊 Pages: 50
|
|
479
494
|
🤖 Model: gpt-4
|
|
480
495
|
|
|
481
|
-
You: What's
|
|
496
|
+
You: What's on page 10?
|
|
482
497
|
- `-f, --format`: Format: `jsonl`, `parquet`, `csv`, `json` (default: `parquet`)
|
|
483
498
|
- `--include-metadata`: Include timestamps
|
|
484
499
|
|
|
@@ -501,7 +516,7 @@ You: What's in paragraph 100?
|
|
|
501
516
|
|
|
502
517
|
| Parameter | Type | Default | Description |
|
|
503
518
|
|-----------|------|---------|-------------|
|
|
504
|
-
| `
|
|
519
|
+
| `extracted_dir` | required | - | Directory containing page subdirectories (page_XXX/) |
|
|
505
520
|
| `--db` | optional | `dataset.db` | Database file path |
|
|
506
521
|
| `--mode` | optional | `api` | LLM mode: `api` or `vllm` |
|
|
507
522
|
| `--distribution` | optional | `10,10,20,30,20,10` | Position distribution (determines threads) |
|
|
@@ -553,7 +568,7 @@ Set environment variable for verbose logging:
|
|
|
553
568
|
|
|
554
569
|
```bash
|
|
555
570
|
export LOG_LEVEL=DEBUG
|
|
556
|
-
bookdatamaker generate
|
|
571
|
+
bookdatamaker generate ./extracted -d dataset.db
|
|
557
572
|
```
|
|
558
573
|
|
|
559
574
|
---
|
|
@@ -56,7 +56,7 @@ pip install -r requirements.txt && pip install -e .
|
|
|
56
56
|
|
|
57
57
|
# 2. Extract → Generate → Export
|
|
58
58
|
bookdatamaker extract book.pdf -o ./extracted
|
|
59
|
-
bookdatamaker generate ./extracted
|
|
59
|
+
bookdatamaker generate ./extracted -d dataset.db --distribution "10,10,20,30,20,10"
|
|
60
60
|
bookdatamaker export-dataset dataset.db -o output.parquet
|
|
61
61
|
```
|
|
62
62
|
|
|
@@ -70,7 +70,7 @@ pip install -r requirements.txt && pip install -e ".[local]"
|
|
|
70
70
|
bookdatamaker extract book.pdf --mode local --batch-size 8 -o ./extracted
|
|
71
71
|
|
|
72
72
|
# 3. Generate with vLLM
|
|
73
|
-
bookdatamaker generate ./extracted
|
|
73
|
+
bookdatamaker generate ./extracted \
|
|
74
74
|
--mode vllm \
|
|
75
75
|
--vllm-model-path meta-llama/Llama-3-8B-Instruct \
|
|
76
76
|
--distribution "25,25,25,25" \
|
|
@@ -173,42 +173,57 @@ bookdatamaker extract ./images/ --mode local -o ./extracted
|
|
|
173
173
|
|
|
174
174
|
```
|
|
175
175
|
./extracted/
|
|
176
|
-
├── page_001
|
|
177
|
-
├──
|
|
178
|
-
|
|
179
|
-
|
|
176
|
+
├── page_001/
|
|
177
|
+
│ ├── page_001.png # Page image
|
|
178
|
+
│ └── result.mmd # Extracted text in markdown
|
|
179
|
+
├── page_002/
|
|
180
|
+
│ ├── page_002.png
|
|
181
|
+
│ └── result.mmd
|
|
182
|
+
└── ...
|
|
180
183
|
```
|
|
181
184
|
|
|
185
|
+
**Note**: Each page is stored in its own subdirectory with the extracted text in `result.mmd` format.
|
|
186
|
+
|
|
182
187
|
---
|
|
183
188
|
|
|
184
189
|
## Generate Dataset (Stage 2)
|
|
185
190
|
|
|
186
|
-
Generate Q&A datasets using parallel LLM threads
|
|
191
|
+
Generate Q&A datasets using parallel LLM threads with **page-based navigation**.
|
|
192
|
+
|
|
193
|
+
### Navigation Model
|
|
194
|
+
|
|
195
|
+
The system now uses **page navigation** instead of paragraph navigation:
|
|
196
|
+
- LLM threads navigate through document pages
|
|
197
|
+
- Tools available: `get_current_page`, `next_page`, `previous_page`, `jump_to_page`, `get_page_context`
|
|
198
|
+
- Each thread starts at a specific page based on distribution
|
|
199
|
+
- Threads can move forward/backward through pages to explore content
|
|
187
200
|
|
|
188
201
|
### Basic Usage
|
|
189
202
|
|
|
190
203
|
```bash
|
|
191
204
|
# 6 threads (from distribution), 20 Q&A pairs per thread
|
|
192
|
-
bookdatamaker generate
|
|
205
|
+
bookdatamaker generate ./extracted \
|
|
193
206
|
-d dataset.db \
|
|
194
207
|
--distribution "10,10,20,30,20,10" \
|
|
195
208
|
--datasets-per-thread 20
|
|
196
209
|
```
|
|
197
210
|
|
|
211
|
+
**Note**: The `generate` command now accepts the extracted directory (containing page_XXX/ subdirectories) instead of a combined text file.
|
|
212
|
+
|
|
198
213
|
**Key Concept**: Thread count is determined by the number of comma-separated values in `--distribution`.
|
|
199
214
|
|
|
200
215
|
### API Mode Examples
|
|
201
216
|
|
|
202
217
|
```bash
|
|
203
218
|
# OpenAI/Azure
|
|
204
|
-
bookdatamaker generate
|
|
219
|
+
bookdatamaker generate ./extracted \
|
|
205
220
|
-d dataset.db \
|
|
206
221
|
--openai-api-url https://api.openai.com/v1 \
|
|
207
222
|
--model gpt-4 \
|
|
208
223
|
--distribution "10,10,20,30,20,10"
|
|
209
224
|
|
|
210
225
|
# Custom API endpoint
|
|
211
|
-
bookdatamaker generate
|
|
226
|
+
bookdatamaker generate ./extracted \
|
|
212
227
|
--openai-api-url http://localhost:8000/v1 \
|
|
213
228
|
--model your-model-name \
|
|
214
229
|
--distribution "25,25,25,25"
|
|
@@ -220,14 +235,14 @@ Use vLLM directly without API server:
|
|
|
220
235
|
|
|
221
236
|
```bash
|
|
222
237
|
# Single GPU
|
|
223
|
-
bookdatamaker generate
|
|
238
|
+
bookdatamaker generate ./extracted \
|
|
224
239
|
--mode vllm \
|
|
225
240
|
--vllm-model-path meta-llama/Llama-3-8B-Instruct \
|
|
226
241
|
--distribution "25,25,25,25" \
|
|
227
242
|
-d dataset.db
|
|
228
243
|
|
|
229
244
|
# Multi-GPU (4 GPUs, 6 threads)
|
|
230
|
-
bookdatamaker generate
|
|
245
|
+
bookdatamaker generate ./extracted \
|
|
231
246
|
--mode vllm \
|
|
232
247
|
--vllm-model-path meta-llama/Llama-3-70B-Instruct \
|
|
233
248
|
--tensor-parallel-size 4 \
|
|
@@ -248,15 +263,15 @@ Add specific instructions to guide LLM behavior:
|
|
|
248
263
|
|
|
249
264
|
```bash
|
|
250
265
|
# Language specification
|
|
251
|
-
bookdatamaker generate
|
|
266
|
+
bookdatamaker generate ./extracted \
|
|
252
267
|
--custom-prompt "Generate all Q&A in Chinese with simplified characters"
|
|
253
268
|
|
|
254
269
|
# Format specification
|
|
255
|
-
bookdatamaker generate
|
|
270
|
+
bookdatamaker generate ./extracted \
|
|
256
271
|
--custom-prompt "Questions should be multiple-choice with 4 options"
|
|
257
272
|
|
|
258
273
|
# Multiple requirements
|
|
259
|
-
bookdatamaker generate
|
|
274
|
+
bookdatamaker generate ./extracted \
|
|
260
275
|
--custom-prompt "Requirements:
|
|
261
276
|
1. Generate questions in English
|
|
262
277
|
2. Focus on practical applications
|
|
@@ -303,15 +318,15 @@ Control where threads start in the document using distribution percentages.
|
|
|
303
318
|
### How It Works
|
|
304
319
|
|
|
305
320
|
```
|
|
306
|
-
Document:
|
|
321
|
+
Document: 100 pages
|
|
307
322
|
Distribution: "10,10,20,30,20,10" (6 threads)
|
|
308
323
|
|
|
309
|
-
Thread 0: Start at 0% →
|
|
310
|
-
Thread 1: Start at 10% →
|
|
311
|
-
Thread 2: Start at 20% →
|
|
312
|
-
Thread 3: Start at 50% →
|
|
313
|
-
Thread 4: Start at 70% →
|
|
314
|
-
Thread 5: Start at 80% →
|
|
324
|
+
Thread 0: Start at 0% → Page 1
|
|
325
|
+
Thread 1: Start at 10% → Page 10
|
|
326
|
+
Thread 2: Start at 20% → Page 20
|
|
327
|
+
Thread 3: Start at 50% → Page 50
|
|
328
|
+
Thread 4: Start at 70% → Page 70
|
|
329
|
+
Thread 5: Start at 80% → Page 80
|
|
315
330
|
```
|
|
316
331
|
|
|
317
332
|
### Distribution Strategies
|
|
@@ -335,9 +350,9 @@ Thread 5: Start at 80% → Paragraph 400
|
|
|
335
350
|
|
|
336
351
|
### Thread Count Guidelines
|
|
337
352
|
|
|
338
|
-
- **Small documents** (<
|
|
339
|
-
- **Medium documents** (
|
|
340
|
-
- **Large documents** (>
|
|
353
|
+
- **Small documents** (<50 pages): 2-4 threads
|
|
354
|
+
- **Medium documents** (50-200 pages): 4-8 threads
|
|
355
|
+
- **Large documents** (>200 pages): 8-16 threads
|
|
341
356
|
|
|
342
357
|
---
|
|
343
358
|
|
|
@@ -408,25 +423,25 @@ Chat with an LLM that can access your document through MCP tools. Perfect for ex
|
|
|
408
423
|
|
|
409
424
|
```bash
|
|
410
425
|
# Basic chat with GPT-4
|
|
411
|
-
bookdatamaker chat
|
|
426
|
+
bookdatamaker chat ./extracted
|
|
412
427
|
|
|
413
428
|
# With vLLM server
|
|
414
|
-
bookdatamaker chat
|
|
429
|
+
bookdatamaker chat ./extracted \
|
|
415
430
|
--openai-api-url http://localhost:8000/v1 \
|
|
416
431
|
--model Qwen/Qwen3-4B-Thinking-2507
|
|
417
432
|
|
|
418
433
|
# With custom database
|
|
419
|
-
bookdatamaker chat
|
|
434
|
+
bookdatamaker chat ./extracted --db my_dataset.db
|
|
420
435
|
```
|
|
421
436
|
|
|
422
437
|
### Example Interaction
|
|
423
438
|
|
|
424
439
|
```
|
|
425
|
-
📚 Document:
|
|
426
|
-
📊
|
|
440
|
+
📚 Document: ./extracted
|
|
441
|
+
📊 Pages: 50
|
|
427
442
|
🤖 Model: gpt-4
|
|
428
443
|
|
|
429
|
-
You: What's
|
|
444
|
+
You: What's on page 10?
|
|
430
445
|
- `-f, --format`: Format: `jsonl`, `parquet`, `csv`, `json` (default: `parquet`)
|
|
431
446
|
- `--include-metadata`: Include timestamps
|
|
432
447
|
|
|
@@ -449,7 +464,7 @@ You: What's in paragraph 100?
|
|
|
449
464
|
|
|
450
465
|
| Parameter | Type | Default | Description |
|
|
451
466
|
|-----------|------|---------|-------------|
|
|
452
|
-
| `
|
|
467
|
+
| `extracted_dir` | required | - | Directory containing page subdirectories (page_XXX/) |
|
|
453
468
|
| `--db` | optional | `dataset.db` | Database file path |
|
|
454
469
|
| `--mode` | optional | `api` | LLM mode: `api` or `vllm` |
|
|
455
470
|
| `--distribution` | optional | `10,10,20,30,20,10` | Position distribution (determines threads) |
|
|
@@ -501,7 +516,7 @@ Set environment variable for verbose logging:
|
|
|
501
516
|
|
|
502
517
|
```bash
|
|
503
518
|
export LOG_LEVEL=DEBUG
|
|
504
|
-
bookdatamaker generate
|
|
519
|
+
bookdatamaker generate ./extracted -d dataset.db
|
|
505
520
|
```
|
|
506
521
|
|
|
507
522
|
---
|
|
@@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "bookdatamaker"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.2"
|
|
8
8
|
description = "CLI tool for extracting text with DeepSeek OCR and generating datasets"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10,<3.13"
|
|
11
11
|
license = {text = "MIT"}
|
|
12
12
|
authors = [
|
|
13
|
-
{name = "
|
|
13
|
+
{name = "zwh20081", email = "zwh20081@solart.pro"}
|
|
14
14
|
]
|
|
15
15
|
dependencies = [
|
|
16
16
|
"click",
|