bookdatamaker 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {bookdatamaker-0.2.0/src/bookdatamaker.egg-info → bookdatamaker-0.2.2}/PKG-INFO +50 -35
  2. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/README.md +48 -33
  3. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/pyproject.toml +2 -2
  4. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/cli.py +90 -72
  5. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/llm/parallel_generator.py +141 -74
  6. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/mcp/server.py +102 -1
  7. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/ocr/extractor.py +90 -51
  8. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/utils/page_manager.py +49 -9
  9. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2/src/bookdatamaker.egg-info}/PKG-INFO +50 -35
  10. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/LICENSE +0 -0
  11. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/setup.cfg +0 -0
  12. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/__init__.py +0 -0
  13. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/dataset/__init__.py +0 -0
  14. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/dataset/builder.py +0 -0
  15. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/dataset/dataset_manager.py +0 -0
  16. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/llm/__init__.py +0 -0
  17. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/mcp/__init__.py +0 -0
  18. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/ocr/__init__.py +0 -0
  19. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/ocr/document_parser.py +0 -0
  20. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/utils/__init__.py +0 -0
  21. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker/utils/status.py +0 -0
  22. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/SOURCES.txt +0 -0
  23. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/dependency_links.txt +0 -0
  24. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/entry_points.txt +0 -0
  25. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/requires.txt +0 -0
  26. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/src/bookdatamaker.egg-info/top_level.txt +0 -0
  27. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/tests/test_dataset.py +0 -0
  28. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/tests/test_mcp.py +0 -0
  29. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/tests/test_ocr.py +0 -0
  30. {bookdatamaker-0.2.0 → bookdatamaker-0.2.2}/tests/test_paragraph_indexing.py +0 -0
@@ -1,8 +1,8 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bookdatamaker
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: CLI tool for extracting text with DeepSeek OCR and generating datasets
5
- Author-email: Book Data Maker <contact@example.com>
5
+ Author-email: zwh20081 <zwh20081@solart.pro>
6
6
  License: MIT
7
7
  Requires-Python: <3.13,>=3.10
8
8
  Description-Content-Type: text/markdown
@@ -108,7 +108,7 @@ pip install -r requirements.txt && pip install -e .
108
108
 
109
109
  # 2. Extract → Generate → Export
110
110
  bookdatamaker extract book.pdf -o ./extracted
111
- bookdatamaker generate ./extracted/combined.txt -d dataset.db --distribution "10,10,20,30,20,10"
111
+ bookdatamaker generate ./extracted -d dataset.db --distribution "10,10,20,30,20,10"
112
112
  bookdatamaker export-dataset dataset.db -o output.parquet
113
113
  ```
114
114
 
@@ -122,7 +122,7 @@ pip install -r requirements.txt && pip install -e ".[local]"
122
122
  bookdatamaker extract book.pdf --mode local --batch-size 8 -o ./extracted
123
123
 
124
124
  # 3. Generate with vLLM
125
- bookdatamaker generate ./extracted/combined.txt \
125
+ bookdatamaker generate ./extracted \
126
126
  --mode vllm \
127
127
  --vllm-model-path meta-llama/Llama-3-8B-Instruct \
128
128
  --distribution "25,25,25,25" \
@@ -225,42 +225,57 @@ bookdatamaker extract ./images/ --mode local -o ./extracted
225
225
 
226
226
  ```
227
227
  ./extracted/
228
- ├── page_001.txt
229
- ├── page_002.txt
230
- ├── ...
231
- └── combined.txt # All pages with [PAGE_XXX] markers
228
+ ├── page_001/
229
+ │ ├── page_001.png # Page image
230
+ │ └── result.mmd # Extracted text in markdown
231
+ ├── page_002/
232
+ │ ├── page_002.png
233
+ │ └── result.mmd
234
+ └── ...
232
235
  ```
233
236
 
237
+ **Note**: Each page is stored in its own subdirectory with the extracted text in `result.mmd` format.
238
+
234
239
  ---
235
240
 
236
241
  ## Generate Dataset (Stage 2)
237
242
 
238
- Generate Q&A datasets using parallel LLM threads.
243
+ Generate Q&A datasets using parallel LLM threads with **page-based navigation**.
244
+
245
+ ### Navigation Model
246
+
247
+ The system now uses **page navigation** instead of paragraph navigation:
248
+ - LLM threads navigate through document pages
249
+ - Tools available: `get_current_page`, `next_page`, `previous_page`, `jump_to_page`, `get_page_context`
250
+ - Each thread starts at a specific page based on distribution
251
+ - Threads can move forward/backward through pages to explore content
239
252
 
240
253
  ### Basic Usage
241
254
 
242
255
  ```bash
243
256
  # 6 threads (from distribution), 20 Q&A pairs per thread
244
- bookdatamaker generate combined.txt \
257
+ bookdatamaker generate ./extracted \
245
258
  -d dataset.db \
246
259
  --distribution "10,10,20,30,20,10" \
247
260
  --datasets-per-thread 20
248
261
  ```
249
262
 
263
+ **Note**: The `generate` command now accepts the extracted directory (containing page_XXX/ subdirectories) instead of a combined text file.
264
+
250
265
  **Key Concept**: Thread count is determined by the number of comma-separated values in `--distribution`.
251
266
 
252
267
  ### API Mode Examples
253
268
 
254
269
  ```bash
255
270
  # OpenAI/Azure
256
- bookdatamaker generate combined.txt \
271
+ bookdatamaker generate ./extracted \
257
272
  -d dataset.db \
258
273
  --openai-api-url https://api.openai.com/v1 \
259
274
  --model gpt-4 \
260
275
  --distribution "10,10,20,30,20,10"
261
276
 
262
277
  # Custom API endpoint
263
- bookdatamaker generate combined.txt \
278
+ bookdatamaker generate ./extracted \
264
279
  --openai-api-url http://localhost:8000/v1 \
265
280
  --model your-model-name \
266
281
  --distribution "25,25,25,25"
@@ -272,14 +287,14 @@ Use vLLM directly without API server:
272
287
 
273
288
  ```bash
274
289
  # Single GPU
275
- bookdatamaker generate combined.txt \
290
+ bookdatamaker generate ./extracted \
276
291
  --mode vllm \
277
292
  --vllm-model-path meta-llama/Llama-3-8B-Instruct \
278
293
  --distribution "25,25,25,25" \
279
294
  -d dataset.db
280
295
 
281
296
  # Multi-GPU (4 GPUs, 6 threads)
282
- bookdatamaker generate combined.txt \
297
+ bookdatamaker generate ./extracted \
283
298
  --mode vllm \
284
299
  --vllm-model-path meta-llama/Llama-3-70B-Instruct \
285
300
  --tensor-parallel-size 4 \
@@ -300,15 +315,15 @@ Add specific instructions to guide LLM behavior:
300
315
 
301
316
  ```bash
302
317
  # Language specification
303
- bookdatamaker generate combined.txt \
318
+ bookdatamaker generate ./extracted \
304
319
  --custom-prompt "Generate all Q&A in Chinese with simplified characters"
305
320
 
306
321
  # Format specification
307
- bookdatamaker generate combined.txt \
322
+ bookdatamaker generate ./extracted \
308
323
  --custom-prompt "Questions should be multiple-choice with 4 options"
309
324
 
310
325
  # Multiple requirements
311
- bookdatamaker generate combined.txt \
326
+ bookdatamaker generate ./extracted \
312
327
  --custom-prompt "Requirements:
313
328
  1. Generate questions in English
314
329
  2. Focus on practical applications
@@ -355,15 +370,15 @@ Control where threads start in the document using distribution percentages.
355
370
  ### How It Works
356
371
 
357
372
  ```
358
- Document: 500 paragraphs
373
+ Document: 100 pages
359
374
  Distribution: "10,10,20,30,20,10" (6 threads)
360
375
 
361
- Thread 0: Start at 0% → Paragraph 1
362
- Thread 1: Start at 10% → Paragraph 50
363
- Thread 2: Start at 20% → Paragraph 100
364
- Thread 3: Start at 50% → Paragraph 250
365
- Thread 4: Start at 70% → Paragraph 350
366
- Thread 5: Start at 80% → Paragraph 400
376
+ Thread 0: Start at 0% → Page 1
377
+ Thread 1: Start at 10% → Page 10
378
+ Thread 2: Start at 20% → Page 20
379
+ Thread 3: Start at 50% → Page 50
380
+ Thread 4: Start at 70% → Page 70
381
+ Thread 5: Start at 80% → Page 80
367
382
  ```
368
383
 
369
384
  ### Distribution Strategies
@@ -387,9 +402,9 @@ Thread 5: Start at 80% → Paragraph 400
387
402
 
388
403
  ### Thread Count Guidelines
389
404
 
390
- - **Small documents** (<100 paragraphs): 2-4 threads
391
- - **Medium documents** (100-500 paragraphs): 4-8 threads
392
- - **Large documents** (>500 paragraphs): 8-16 threads
405
+ - **Small documents** (<50 pages): 2-4 threads
406
+ - **Medium documents** (50-200 pages): 4-8 threads
407
+ - **Large documents** (>200 pages): 8-16 threads
393
408
 
394
409
  ---
395
410
 
@@ -460,25 +475,25 @@ Chat with an LLM that can access your document through MCP tools. Perfect for ex
460
475
 
461
476
  ```bash
462
477
  # Basic chat with GPT-4
463
- bookdatamaker chat combined.txt
478
+ bookdatamaker chat ./extracted
464
479
 
465
480
  # With vLLM server
466
- bookdatamaker chat combined.txt \
481
+ bookdatamaker chat ./extracted \
467
482
  --openai-api-url http://localhost:8000/v1 \
468
483
  --model Qwen/Qwen3-4B-Thinking-2507
469
484
 
470
485
  # With custom database
471
- bookdatamaker chat combined.txt --db my_dataset.db
486
+ bookdatamaker chat ./extracted --db my_dataset.db
472
487
  ```
473
488
 
474
489
  ### Example Interaction
475
490
 
476
491
  ```
477
- 📚 Document: combined.txt
478
- 📊 Paragraphs: 578
492
+ 📚 Document: ./extracted
493
+ 📊 Pages: 50
479
494
  🤖 Model: gpt-4
480
495
 
481
- You: What's in paragraph 100?
496
+ You: What's on page 10?
482
497
  - `-f, --format`: Format: `jsonl`, `parquet`, `csv`, `json` (default: `parquet`)
483
498
  - `--include-metadata`: Include timestamps
484
499
 
@@ -501,7 +516,7 @@ You: What's in paragraph 100?
501
516
 
502
517
  | Parameter | Type | Default | Description |
503
518
  |-----------|------|---------|-------------|
504
- | `text_file` | required | - | Combined text file |
519
+ | `extracted_dir` | required | - | Directory containing page subdirectories (page_XXX/) |
505
520
  | `--db` | optional | `dataset.db` | Database file path |
506
521
  | `--mode` | optional | `api` | LLM mode: `api` or `vllm` |
507
522
  | `--distribution` | optional | `10,10,20,30,20,10` | Position distribution (determines threads) |
@@ -553,7 +568,7 @@ Set environment variable for verbose logging:
553
568
 
554
569
  ```bash
555
570
  export LOG_LEVEL=DEBUG
556
- bookdatamaker generate combined.txt -d dataset.db
571
+ bookdatamaker generate ./extracted -d dataset.db
557
572
  ```
558
573
 
559
574
  ---
@@ -56,7 +56,7 @@ pip install -r requirements.txt && pip install -e .
56
56
 
57
57
  # 2. Extract → Generate → Export
58
58
  bookdatamaker extract book.pdf -o ./extracted
59
- bookdatamaker generate ./extracted/combined.txt -d dataset.db --distribution "10,10,20,30,20,10"
59
+ bookdatamaker generate ./extracted -d dataset.db --distribution "10,10,20,30,20,10"
60
60
  bookdatamaker export-dataset dataset.db -o output.parquet
61
61
  ```
62
62
 
@@ -70,7 +70,7 @@ pip install -r requirements.txt && pip install -e ".[local]"
70
70
  bookdatamaker extract book.pdf --mode local --batch-size 8 -o ./extracted
71
71
 
72
72
  # 3. Generate with vLLM
73
- bookdatamaker generate ./extracted/combined.txt \
73
+ bookdatamaker generate ./extracted \
74
74
  --mode vllm \
75
75
  --vllm-model-path meta-llama/Llama-3-8B-Instruct \
76
76
  --distribution "25,25,25,25" \
@@ -173,42 +173,57 @@ bookdatamaker extract ./images/ --mode local -o ./extracted
173
173
 
174
174
  ```
175
175
  ./extracted/
176
- ├── page_001.txt
177
- ├── page_002.txt
178
- ├── ...
179
- └── combined.txt # All pages with [PAGE_XXX] markers
176
+ ├── page_001/
177
+ │ ├── page_001.png # Page image
178
+ │ └── result.mmd # Extracted text in markdown
179
+ ├── page_002/
180
+ │ ├── page_002.png
181
+ │ └── result.mmd
182
+ └── ...
180
183
  ```
181
184
 
185
+ **Note**: Each page is stored in its own subdirectory with the extracted text in `result.mmd` format.
186
+
182
187
  ---
183
188
 
184
189
  ## Generate Dataset (Stage 2)
185
190
 
186
- Generate Q&A datasets using parallel LLM threads.
191
+ Generate Q&A datasets using parallel LLM threads with **page-based navigation**.
192
+
193
+ ### Navigation Model
194
+
195
+ The system now uses **page navigation** instead of paragraph navigation:
196
+ - LLM threads navigate through document pages
197
+ - Tools available: `get_current_page`, `next_page`, `previous_page`, `jump_to_page`, `get_page_context`
198
+ - Each thread starts at a specific page based on distribution
199
+ - Threads can move forward/backward through pages to explore content
187
200
 
188
201
  ### Basic Usage
189
202
 
190
203
  ```bash
191
204
  # 6 threads (from distribution), 20 Q&A pairs per thread
192
- bookdatamaker generate combined.txt \
205
+ bookdatamaker generate ./extracted \
193
206
  -d dataset.db \
194
207
  --distribution "10,10,20,30,20,10" \
195
208
  --datasets-per-thread 20
196
209
  ```
197
210
 
211
+ **Note**: The `generate` command now accepts the extracted directory (containing page_XXX/ subdirectories) instead of a combined text file.
212
+
198
213
  **Key Concept**: Thread count is determined by the number of comma-separated values in `--distribution`.
199
214
 
200
215
  ### API Mode Examples
201
216
 
202
217
  ```bash
203
218
  # OpenAI/Azure
204
- bookdatamaker generate combined.txt \
219
+ bookdatamaker generate ./extracted \
205
220
  -d dataset.db \
206
221
  --openai-api-url https://api.openai.com/v1 \
207
222
  --model gpt-4 \
208
223
  --distribution "10,10,20,30,20,10"
209
224
 
210
225
  # Custom API endpoint
211
- bookdatamaker generate combined.txt \
226
+ bookdatamaker generate ./extracted \
212
227
  --openai-api-url http://localhost:8000/v1 \
213
228
  --model your-model-name \
214
229
  --distribution "25,25,25,25"
@@ -220,14 +235,14 @@ Use vLLM directly without API server:
220
235
 
221
236
  ```bash
222
237
  # Single GPU
223
- bookdatamaker generate combined.txt \
238
+ bookdatamaker generate ./extracted \
224
239
  --mode vllm \
225
240
  --vllm-model-path meta-llama/Llama-3-8B-Instruct \
226
241
  --distribution "25,25,25,25" \
227
242
  -d dataset.db
228
243
 
229
244
  # Multi-GPU (4 GPUs, 6 threads)
230
- bookdatamaker generate combined.txt \
245
+ bookdatamaker generate ./extracted \
231
246
  --mode vllm \
232
247
  --vllm-model-path meta-llama/Llama-3-70B-Instruct \
233
248
  --tensor-parallel-size 4 \
@@ -248,15 +263,15 @@ Add specific instructions to guide LLM behavior:
248
263
 
249
264
  ```bash
250
265
  # Language specification
251
- bookdatamaker generate combined.txt \
266
+ bookdatamaker generate ./extracted \
252
267
  --custom-prompt "Generate all Q&A in Chinese with simplified characters"
253
268
 
254
269
  # Format specification
255
- bookdatamaker generate combined.txt \
270
+ bookdatamaker generate ./extracted \
256
271
  --custom-prompt "Questions should be multiple-choice with 4 options"
257
272
 
258
273
  # Multiple requirements
259
- bookdatamaker generate combined.txt \
274
+ bookdatamaker generate ./extracted \
260
275
  --custom-prompt "Requirements:
261
276
  1. Generate questions in English
262
277
  2. Focus on practical applications
@@ -303,15 +318,15 @@ Control where threads start in the document using distribution percentages.
303
318
  ### How It Works
304
319
 
305
320
  ```
306
- Document: 500 paragraphs
321
+ Document: 100 pages
307
322
  Distribution: "10,10,20,30,20,10" (6 threads)
308
323
 
309
- Thread 0: Start at 0% → Paragraph 1
310
- Thread 1: Start at 10% → Paragraph 50
311
- Thread 2: Start at 20% → Paragraph 100
312
- Thread 3: Start at 50% → Paragraph 250
313
- Thread 4: Start at 70% → Paragraph 350
314
- Thread 5: Start at 80% → Paragraph 400
324
+ Thread 0: Start at 0% → Page 1
325
+ Thread 1: Start at 10% → Page 10
326
+ Thread 2: Start at 20% → Page 20
327
+ Thread 3: Start at 50% → Page 50
328
+ Thread 4: Start at 70% → Page 70
329
+ Thread 5: Start at 80% → Page 80
315
330
  ```
316
331
 
317
332
  ### Distribution Strategies
@@ -335,9 +350,9 @@ Thread 5: Start at 80% → Paragraph 400
335
350
 
336
351
  ### Thread Count Guidelines
337
352
 
338
- - **Small documents** (<100 paragraphs): 2-4 threads
339
- - **Medium documents** (100-500 paragraphs): 4-8 threads
340
- - **Large documents** (>500 paragraphs): 8-16 threads
353
+ - **Small documents** (<50 pages): 2-4 threads
354
+ - **Medium documents** (50-200 pages): 4-8 threads
355
+ - **Large documents** (>200 pages): 8-16 threads
341
356
 
342
357
  ---
343
358
 
@@ -408,25 +423,25 @@ Chat with an LLM that can access your document through MCP tools. Perfect for ex
408
423
 
409
424
  ```bash
410
425
  # Basic chat with GPT-4
411
- bookdatamaker chat combined.txt
426
+ bookdatamaker chat ./extracted
412
427
 
413
428
  # With vLLM server
414
- bookdatamaker chat combined.txt \
429
+ bookdatamaker chat ./extracted \
415
430
  --openai-api-url http://localhost:8000/v1 \
416
431
  --model Qwen/Qwen3-4B-Thinking-2507
417
432
 
418
433
  # With custom database
419
- bookdatamaker chat combined.txt --db my_dataset.db
434
+ bookdatamaker chat ./extracted --db my_dataset.db
420
435
  ```
421
436
 
422
437
  ### Example Interaction
423
438
 
424
439
  ```
425
- 📚 Document: combined.txt
426
- 📊 Paragraphs: 578
440
+ 📚 Document: ./extracted
441
+ 📊 Pages: 50
427
442
  🤖 Model: gpt-4
428
443
 
429
- You: What's in paragraph 100?
444
+ You: What's on page 10?
430
445
  - `-f, --format`: Format: `jsonl`, `parquet`, `csv`, `json` (default: `parquet`)
431
446
  - `--include-metadata`: Include timestamps
432
447
 
@@ -449,7 +464,7 @@ You: What's in paragraph 100?
449
464
 
450
465
  | Parameter | Type | Default | Description |
451
466
  |-----------|------|---------|-------------|
452
- | `text_file` | required | - | Combined text file |
467
+ | `extracted_dir` | required | - | Directory containing page subdirectories (page_XXX/) |
453
468
  | `--db` | optional | `dataset.db` | Database file path |
454
469
  | `--mode` | optional | `api` | LLM mode: `api` or `vllm` |
455
470
  | `--distribution` | optional | `10,10,20,30,20,10` | Position distribution (determines threads) |
@@ -501,7 +516,7 @@ Set environment variable for verbose logging:
501
516
 
502
517
  ```bash
503
518
  export LOG_LEVEL=DEBUG
504
- bookdatamaker generate combined.txt -d dataset.db
519
+ bookdatamaker generate ./extracted -d dataset.db
505
520
  ```
506
521
 
507
522
  ---
@@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "bookdatamaker"
7
- version = "0.2.0"
7
+ version = "0.2.2"
8
8
  description = "CLI tool for extracting text with DeepSeek OCR and generating datasets"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.10,<3.13"
11
11
  license = {text = "MIT"}
12
12
  authors = [
13
- {name = "Book Data Maker", email = "contact@example.com"}
13
+ {name = "zwh20081", email = "zwh20081@solart.pro"}
14
14
  ]
15
15
  dependencies = [
16
16
  "click",