preocr 1.2.2__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {preocr-1.2.2 → preocr-1.3.0}/PKG-INFO +128 -54
- {preocr-1.2.2 → preocr-1.3.0}/README.md +127 -53
- {preocr-1.2.2 → preocr-1.3.0}/preocr/__init__.py +2 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/constants.py +23 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/core/decision.py +101 -5
- {preocr-1.2.2 → preocr-1.3.0}/preocr/core/detector.py +70 -5
- {preocr-1.2.2 → preocr-1.3.0}/preocr/core/signals.py +3 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/schemas.py +4 -3
- preocr-1.3.0/preocr/planner/__init__.py +15 -0
- preocr-1.3.0/preocr/planner/_extract.py +79 -0
- preocr-1.3.0/preocr/planner/config.py +99 -0
- preocr-1.3.0/preocr/planner/decision.py +126 -0
- preocr-1.3.0/preocr/planner/intent.py +131 -0
- preocr-1.3.0/preocr/planner/models.py +101 -0
- preocr-1.3.0/preocr/planner/planner.py +231 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/version.py +1 -1
- {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/PKG-INFO +128 -54
- {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/SOURCES.txt +9 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_batch.py +18 -19
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_config_thresholds.py +20 -20
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_decision.py +30 -9
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_layout_aware_needs_ocr.py +26 -25
- preocr-1.3.0/tests/test_planner.py +102 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_signals.py +26 -1
- preocr-1.3.0/tests/test_skip_opencv_heuristics.py +77 -0
- {preocr-1.2.2 → preocr-1.3.0}/LICENSE +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/analysis/__init__.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/analysis/layout_analyzer.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/analysis/opencv_layout.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/analysis/page_detection.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/core/__init__.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/core/extractor.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/exceptions.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/__init__.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/base.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/formatters.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/office_extractor.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/pdf_extractor.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/text_extractor.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/__init__.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/image_probe.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/office_probe.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/pdf_probe.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/text_probe.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/py.typed +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/reason_codes.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/__init__.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/batch.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/cache.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/filetype.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/logger.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/dependency_links.txt +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/requires.txt +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/top_level.txt +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/pyproject.toml +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/setup.cfg +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_detector.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_filetype.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_hybrid_pipeline.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_image_probe.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_integration.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_layout_analyzer.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_office_probe.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_opencv_layout.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_page_detection.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_pdf_probe.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_reason_codes.py +0 -0
- {preocr-1.2.2 → preocr-1.3.0}/tests/test_text_probe.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: preocr
|
|
3
|
-
Version: 1.2.2
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: A fast, CPU-only library that intelligently detects whether files need OCR processing before expensive OCR operations. Uses a hybrid adaptive pipeline for 92-95% accuracy.
|
|
5
5
|
Author: PreOCR Contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -45,11 +45,11 @@ Provides-Extra: batch
|
|
|
45
45
|
Requires-Dist: tqdm>=4.65.0; extra == "batch"
|
|
46
46
|
Dynamic: license-file
|
|
47
47
|
|
|
48
|
-
# PreOCR
|
|
48
|
+
# PreOCR – Python OCR Detection Library | Skip OCR for Digital PDFs
|
|
49
49
|
|
|
50
50
|
<div align="center">
|
|
51
51
|
|
|
52
|
-
**
|
|
52
|
+
**Open-source Python library for OCR detection and document extraction. Detect if PDFs need OCR before expensive processing—save 50–70% on OCR costs.**
|
|
53
53
|
|
|
54
54
|
[](https://www.python.org/downloads/)
|
|
55
55
|
[](LICENSE)
|
|
@@ -57,32 +57,53 @@ Dynamic: license-file
|
|
|
57
57
|
[](https://pepy.tech/project/preocr)
|
|
58
58
|
[](https://github.com/psf/black)
|
|
59
59
|
|
|
60
|
-
*
|
|
60
|
+
*2–10× faster than alternatives • 100% accuracy on benchmark • CPU-only, no GPU required*
|
|
61
61
|
|
|
62
|
-
**🌐
|
|
62
|
+
**🌐 [preocr.io](https://preocr.io)** • [Installation](#-installation) • [Quick Start](#-quick-start) • [API Reference](#-api-reference) • [Examples](#-usage-examples) • [Performance](#-performance)
|
|
63
63
|
|
|
64
64
|
</div>
|
|
65
65
|
|
|
66
66
|
---
|
|
67
67
|
|
|
68
|
-
|
|
68
|
+
### ⚡ TL;DR
|
|
69
69
|
|
|
70
|
-
|
|
70
|
+
| Metric | Result |
|
|
71
|
+
|--------|--------|
|
|
72
|
+
| **Accuracy** | 100% on a 10-PDF validation benchmark (TP=1, FP=0, TN=9, FN=0) |
|
|
73
|
+
| **Latency** | ~2.7s mean, ~1.9s median (≤1MB PDFs) |
|
|
74
|
+
| **Office docs** | ~7ms |
|
|
75
|
+
| **Focus** | Zero false positives. Zero missed scans. |
|
|
71
76
|
|
|
72
|
-
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## What is PreOCR? Python OCR Detection & Document Processing
|
|
80
|
+
|
|
81
|
+
**PreOCR** is an open-source **Python OCR detection library** that determines whether documents need OCR before you run expensive processing. It analyzes **PDFs**, **Office documents** (DOCX, PPTX, XLSX), **images**, and text files to detect if they're already machine-readable—helping you **skip OCR** for 50–70% of documents and cut costs.
|
|
82
|
+
|
|
83
|
+
Use PreOCR to filter documents before Tesseract, AWS Textract, Google Vision, Azure Document Intelligence, or MinerU. Works offline, CPU-only, with 100% accuracy on validation benchmarks.
|
|
84
|
+
|
|
85
|
+
**🌐 [preocr.io](https://preocr.io)**
|
|
73
86
|
|
|
74
87
|
### Key Benefits
|
|
75
88
|
|
|
76
|
-
- ⚡ **Fast**: CPU-only
|
|
77
|
-
- 🎯 **Accurate**: 92
|
|
78
|
-
- 💰 **Cost-Effective**: Skip OCR for 50
|
|
79
|
-
- 📊 **Structured Extraction**:
|
|
89
|
+
- ⚡ **Fast**: CPU-only, typically < 1 second per file—no GPU needed
|
|
90
|
+
- 🎯 **Accurate**: 92–95% accuracy (100% on validation benchmark)
|
|
91
|
+
- 💰 **Cost-Effective**: Skip OCR for 50–70% of documents
|
|
92
|
+
- 📊 **Structured Extraction**: Tables, forms, images, semantic data—Pydantic models, JSON, or Markdown
|
|
80
93
|
- 🔒 **Type-Safe**: Full Pydantic models with IDE autocomplete
|
|
81
|
-
- 🚀 **Production-Ready**:
|
|
94
|
+
- 🚀 **Offline & Production-Ready**: No API keys; battle-tested error handling
|
|
95
|
+
|
|
96
|
+
### Use Cases: When to Use PreOCR
|
|
97
|
+
|
|
98
|
+
- **Document pipelines**: Filter PDFs before OCR (Tesseract, AWS Textract, Google Vision)
|
|
99
|
+
- **RAG / LLM ingestion**: Decide which documents need OCR vs. native text extraction
|
|
100
|
+
- **Batch processing**: Process thousands of PDFs with page-level OCR decisions
|
|
101
|
+
- **Cost optimization**: Reduce cloud OCR API costs by skipping digital documents
|
|
102
|
+
- **Medical / legal**: Intent-aware planner for prescriptions, discharge summaries, lab reports
|
|
82
103
|
|
|
83
104
|
---
|
|
84
105
|
|
|
85
|
-
##
|
|
106
|
+
## Quick Comparison: PreOCR vs. Alternatives
|
|
86
107
|
|
|
87
108
|
| Feature | PreOCR 🏆 | Unstructured.io | Docugami |
|
|
88
109
|
|---------|-----------|-----------------|----------|
|
|
@@ -157,6 +178,17 @@ results.print_summary()
|
|
|
157
178
|
- **Page-Level Granularity**: Analyze PDFs page-by-page for precise detection
|
|
158
179
|
- **Confidence Scores**: Per-decision confidence with reason codes
|
|
159
180
|
- **Hybrid Pipeline**: Fast heuristics + OpenCV refinement for edge cases
|
|
181
|
+
- **OpenCV Skip Heuristics**: Skips OpenCV for clearly digital documents (file size, page count, text coverage) to improve performance
|
|
182
|
+
- **Digital/Table Bias**: Reduces false positives on high-text PDFs (product manuals, marketing docs) via configurable rules
|
|
183
|
+
|
|
184
|
+
### Intent-Aware OCR Planner (`plan_ocr_for_document`)
|
|
185
|
+
|
|
186
|
+
- **Medical Domain**: Terminal overrides for prescriptions, diagnoses, discharge summaries, lab reports
|
|
187
|
+
- **Weighted Scoring**: Configurable threshold with safety/balanced/cost modes
|
|
188
|
+
- **Explainability**: Per-page score breakdown (intent, image_dominance, text_weakness)
|
|
189
|
+
- **Evaluation**: Threshold sweep and confusion matrix for calibration
|
|
190
|
+
|
|
191
|
+
See [docs/OCR_DECISION_MODEL.md](docs/OCR_DECISION_MODEL.md) for the full specification.
|
|
160
192
|
|
|
161
193
|
### Document Extraction (`extract_native_data`)
|
|
162
194
|
|
|
@@ -222,6 +254,18 @@ print(f"Confidence: {result['confidence']:.2f}")
|
|
|
222
254
|
print(f"Reason: {result['reason']}")
|
|
223
255
|
```
|
|
224
256
|
|
|
257
|
+
#### Intent-Aware Planner (Medical/Domain-Specific)
|
|
258
|
+
|
|
259
|
+
```python
|
|
260
|
+
from preocr import plan_ocr_for_document
|
|
261
|
+
|
|
262
|
+
result = plan_ocr_for_document("hospital_discharge.pdf")
|
|
263
|
+
print(f"Needs OCR (any page): {result['needs_ocr_any']}")
|
|
264
|
+
for page in result["pages"]:
|
|
265
|
+
print(f" Page {page['page_number']}: needs_ocr={page['needs_ocr']} "
|
|
266
|
+
f"type={page['decision_type']} score={page['debug']['score']:.2f}")
|
|
267
|
+
```
|
|
268
|
+
|
|
225
269
|
#### Layout-Aware Detection
|
|
226
270
|
|
|
227
271
|
```python
|
|
@@ -352,8 +396,6 @@ PreOCR supports **20+ file formats** for OCR detection and extraction:
|
|
|
352
396
|
| **Text** | ✅ Yes | ✅ Yes | TXT, CSV, HTML |
|
|
353
397
|
| **Structured** | ✅ Yes | ✅ Yes | JSON, XML |
|
|
354
398
|
|
|
355
|
-
See [Supported Formats](SUPPORTED_FORMATS.md) for complete list.
|
|
356
|
-
|
|
357
399
|
---
|
|
358
400
|
|
|
359
401
|
## ⚙️ Configuration
|
|
@@ -377,6 +419,10 @@ result = needs_ocr("document.pdf", config=config)
|
|
|
377
419
|
- `min_text_length`: Minimum text length (default: 50)
|
|
378
420
|
- `min_office_text_length`: Minimum office text length (default: 100)
|
|
379
421
|
- `layout_refinement_threshold`: OpenCV trigger threshold (default: 0.9)
|
|
422
|
+
- `skip_opencv_if_file_size_mb`: Skip OpenCV when file size ≥ N MB (default: None)
|
|
423
|
+
- `skip_opencv_if_page_count`: Skip OpenCV when page count ≥ N (default: None)
|
|
424
|
+
- `digital_bias_text_coverage_min`: Force no-OCR when text_coverage ≥ this and image_coverage is low (default: 65)
|
|
425
|
+
- `table_bias_text_density_min`: For mixed layout, treat as digital when text_density ≥ this (default: 1.5)
|
|
380
426
|
|
|
381
427
|
---
|
|
382
428
|
|
|
@@ -415,15 +461,34 @@ if result["reason_code"] == "PDF_MIXED":
|
|
|
415
461
|
|----------|------|----------|
|
|
416
462
|
| Fast Path (Heuristics) | < 150ms | ~99% |
|
|
417
463
|
| OpenCV Refinement | 150-300ms | 92-96% |
|
|
418
|
-
| **
|
|
464
|
+
| **Typical (single file)** | **< 1 second** | **94-97%** |
|
|
465
|
+
|
|
466
|
+
*Typical: most PDFs finish in under 1 second. Heuristics-only files: 120–180ms avg. Large or mixed documents may take 1–3s with OpenCV.*
|
|
467
|
+
|
|
468
|
+
### Benchmark Results (≤1MB Dataset)
|
|
469
|
+
|
|
470
|
+
<p align="center">
|
|
471
|
+
<img src="docs/benchmarks/avg-time-by-type.png" alt="Average processing time by file type" width="500">
|
|
472
|
+
<br><em>Average Processing Time by File Type</em>
|
|
473
|
+
</p>
|
|
474
|
+
|
|
475
|
+
<p align="center">
|
|
476
|
+
<img src="docs/benchmarks/latency-summary.png" alt="Latency summary for PDFs" width="500">
|
|
477
|
+
<br><em>Latency Summary (Mean, Median, P95)</em>
|
|
478
|
+
</p>
|
|
419
479
|
|
|
420
480
|
### Accuracy Metrics
|
|
421
481
|
|
|
422
|
-
- **Overall Accuracy**: 92-95% (100% on
|
|
482
|
+
- **Overall Accuracy**: 92-95% (100% on validation benchmark)
|
|
423
483
|
- **Precision**: 100% (all flagged files actually need OCR)
|
|
424
484
|
- **Recall**: 100% (all OCR-needed files detected)
|
|
425
485
|
- **F1-Score**: 100%
|
|
426
486
|
|
|
487
|
+
<p align="center">
|
|
488
|
+
<img src="docs/benchmarks/confusion-matrix.png" alt="Confusion matrix - 100% accuracy" width="500">
|
|
489
|
+
<br><em>Confusion Matrix (TP:1, FP:0, TN:9, FN:0)</em>
|
|
490
|
+
</p>
|
|
491
|
+
|
|
427
492
|
### Performance Factors
|
|
428
493
|
|
|
429
494
|
- **File size**: Larger files take longer
|
|
@@ -528,12 +593,14 @@ Batch processor for multiple files with parallel processing.
|
|
|
528
593
|
### When to Choose PreOCR
|
|
529
594
|
|
|
530
595
|
✅ **Choose PreOCR when:**
|
|
531
|
-
- You
|
|
532
|
-
- You
|
|
533
|
-
- You
|
|
534
|
-
- You
|
|
535
|
-
|
|
536
|
-
|
|
596
|
+
- You're building **document ingestion pipelines** or **RAG/LLM systems**—decide which files need OCR vs. native extraction
|
|
597
|
+
- You need **speed** (< 1 second per file) and **cost optimization** (skip OCR for 50–70% of documents)
|
|
598
|
+
- You want **page-level granularity** (which pages need OCR in mixed PDFs)
|
|
599
|
+
- You prefer **type safety** (Pydantic models) and **edge deployment** (CPU-only, no GPU)
|
|
600
|
+
|
|
601
|
+
### Switching from Unstructured.io or another library?
|
|
602
|
+
|
|
603
|
+
PreOCR focuses on **OCR routing**—it doesn't perform extraction by default. Use it as a pre-filter: call `needs_ocr()` first, then route to your OCR engine or to `extract_native_data()` for digital documents. The API is simple: `needs_ocr(path)`, `extract_native_data(path)`, `BatchProcessor`.
|
|
537
604
|
|
|
538
605
|
---
|
|
539
606
|
|
|
@@ -560,22 +627,25 @@ Batch processor for multiple files with parallel processing.
|
|
|
560
627
|
|
|
561
628
|
---
|
|
562
629
|
|
|
563
|
-
##
|
|
630
|
+
## Frequently Asked Questions (FAQ)
|
|
564
631
|
|
|
565
|
-
**
|
|
566
|
-
|
|
632
|
+
**Does PreOCR perform OCR?**
|
|
633
|
+
No. PreOCR is an **OCR detection** library—it analyzes files to determine if OCR is needed. It does not run OCR itself. Use it to decide whether to call Tesseract, Textract, or another OCR engine.
|
|
567
634
|
|
|
568
|
-
**
|
|
569
|
-
|
|
635
|
+
**How accurate is PreOCR for PDF OCR detection?**
|
|
636
|
+
PreOCR achieves 92–95% accuracy with the hybrid pipeline. Validation on benchmark datasets reached 100% accuracy (10/10 PDFs correct).
|
|
570
637
|
|
|
571
|
-
**
|
|
572
|
-
|
|
638
|
+
**Can I use PreOCR with AWS Textract, Google Vision, or Azure Document Intelligence?**
|
|
639
|
+
Yes. PreOCR is ideal for filtering documents before sending them to cloud OCR APIs. Skip OCR for digital PDFs to reduce API costs.
|
|
573
640
|
|
|
574
|
-
**
|
|
575
|
-
|
|
641
|
+
**Does PreOCR work offline?**
|
|
642
|
+
Yes. PreOCR is CPU-only and runs fully offline—no API keys or internet required.
|
|
576
643
|
|
|
577
|
-
**
|
|
578
|
-
|
|
644
|
+
**How do I customize OCR detection thresholds?**
|
|
645
|
+
Use the `Config` class or pass threshold parameters to `BatchProcessor`. See [Configuration](#-configuration).
|
|
646
|
+
|
|
647
|
+
**Is there an HTTP/REST API?**
|
|
648
|
+
PreOCR is a Python library. For HTTP APIs, wrap it in FastAPI or Flask—see [preocr.io](https://preocr.io) for hosted options.
|
|
579
649
|
|
|
580
650
|
---
|
|
581
651
|
|
|
@@ -592,6 +662,10 @@ pip install -e ".[dev]"
|
|
|
592
662
|
# Run tests
|
|
593
663
|
pytest
|
|
594
664
|
|
|
665
|
+
# Run benchmarks (add PDFs to datasets/ for testing)
|
|
666
|
+
python scripts/benchmark_accuracy.py datasets -g scripts/ground_truth_data_source_formats.json --layout-aware --page-level
|
|
667
|
+
python scripts/benchmark_planner.py datasets
|
|
668
|
+
|
|
595
669
|
# Run linting
|
|
596
670
|
ruff check preocr/
|
|
597
671
|
black --check preocr/
|
|
@@ -605,20 +679,20 @@ See [CHANGELOG.md](docs/CHANGELOG.md) for complete version history.
|
|
|
605
679
|
|
|
606
680
|
### Recent Updates
|
|
607
681
|
|
|
608
|
-
**
|
|
609
|
-
- ✅ **
|
|
610
|
-
- ✅ **
|
|
611
|
-
- ✅ **
|
|
612
|
-
- ✅ **
|
|
613
|
-
- ✅ **
|
|
614
|
-
|
|
615
|
-
|
|
682
|
+
**v1.3.0** - Accuracy & Performance (Latest)
|
|
683
|
+
- ✅ **100% Accuracy**: Fixed false positives on digital PDFs; benchmark validation at 100%
|
|
684
|
+
- ✅ **OpenCV Skip Heuristics**: Skip OpenCV for clearly digital documents (configurable by file size, page count)
|
|
685
|
+
- ✅ **Digital/Table Bias Rules**: New config options to reduce false positives on product manuals, marketing PDFs
|
|
686
|
+
- ✅ **Unified Datasets**: Consolidated `benchmarkdata` and `data-source-formats` into `datasets/` directory
|
|
687
|
+
- ✅ **Page Count in Signals**: PDF analysis includes page count for smarter heuristics
|
|
688
|
+
|
|
689
|
+
**v1.1.0** - Invoice Intelligence & Advanced Extraction
|
|
690
|
+
- ✅ Semantic deduplication, invoice intelligence, text merging
|
|
691
|
+
- ✅ Table stitching, finance validation, reversed text detection
|
|
616
692
|
|
|
617
693
|
**v1.0.0** - Structured Data Extraction
|
|
618
|
-
- ✅ Comprehensive extraction
|
|
619
|
-
- ✅ Element classification
|
|
620
|
-
- ✅ Table, form, and image extraction
|
|
621
|
-
- ✅ Multiple output formats (Pydantic, JSON, Markdown)
|
|
694
|
+
- ✅ Comprehensive extraction for PDFs, Office docs, text files
|
|
695
|
+
- ✅ Element classification, table/form/image extraction
|
|
622
696
|
|
|
623
697
|
---
|
|
624
698
|
|
|
@@ -634,19 +708,19 @@ Apache License 2.0 - see [LICENSE](LICENSE) for details.
|
|
|
634
708
|
|
|
635
709
|
---
|
|
636
710
|
|
|
637
|
-
##
|
|
711
|
+
## Links & Resources
|
|
638
712
|
|
|
639
|
-
-
|
|
640
|
-
- **
|
|
641
|
-
- **
|
|
642
|
-
- **
|
|
713
|
+
- **Website**: [preocr.io](https://preocr.io) – Python OCR detection and document processing
|
|
714
|
+
- **PyPI**: [pypi.org/project/preocr](https://pypi.org/project/preocr) – Install with `pip install preocr`
|
|
715
|
+
- **GitHub**: [github.com/yuvaraj3855/preocr](https://github.com/yuvaraj3855/preocr) – Source code and issues
|
|
716
|
+
- **Documentation**: [CHANGELOG](docs/CHANGELOG.md) • [OCR Decision Model](docs/OCR_DECISION_MODEL.md) • [Contributing](docs/CONTRIBUTING.md)
|
|
643
717
|
|
|
644
718
|
---
|
|
645
719
|
|
|
646
720
|
<div align="center">
|
|
647
721
|
|
|
648
|
-
**
|
|
722
|
+
**PreOCR – Python OCR detection library. Skip OCR for digital PDFs. Save time and money.**
|
|
649
723
|
|
|
650
|
-
[
|
|
724
|
+
[Website](https://preocr.io) · [GitHub](https://github.com/yuvaraj3855/preocr) · [PyPI](https://pypi.org/project/preocr) · [Report Issue](https://github.com/yuvaraj3855/preocr/issues)
|
|
651
725
|
|
|
652
726
|
</div>
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
# PreOCR
|
|
1
|
+
# PreOCR – Python OCR Detection Library | Skip OCR for Digital PDFs
|
|
2
2
|
|
|
3
3
|
<div align="center">
|
|
4
4
|
|
|
5
|
-
**
|
|
5
|
+
**Open-source Python library for OCR detection and document extraction. Detect if PDFs need OCR before expensive processing—save 50–70% on OCR costs.**
|
|
6
6
|
|
|
7
7
|
[](https://www.python.org/downloads/)
|
|
8
8
|
[](LICENSE)
|
|
@@ -10,32 +10,53 @@
|
|
|
10
10
|
[](https://pepy.tech/project/preocr)
|
|
11
11
|
[](https://github.com/psf/black)
|
|
12
12
|
|
|
13
|
-
*
|
|
13
|
+
*2–10× faster than alternatives • 100% accuracy on benchmark • CPU-only, no GPU required*
|
|
14
14
|
|
|
15
|
-
**🌐
|
|
15
|
+
**🌐 [preocr.io](https://preocr.io)** • [Installation](#-installation) • [Quick Start](#-quick-start) • [API Reference](#-api-reference) • [Examples](#-usage-examples) • [Performance](#-performance)
|
|
16
16
|
|
|
17
17
|
</div>
|
|
18
18
|
|
|
19
19
|
---
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
### ⚡ TL;DR
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
| Metric | Result |
|
|
24
|
+
|--------|--------|
|
|
25
|
+
| **Accuracy** | 100% on a 10-PDF validation benchmark (TP=1, FP=0, TN=9, FN=0) |
|
|
26
|
+
| **Latency** | ~2.7s mean, ~1.9s median (≤1MB PDFs) |
|
|
27
|
+
| **Office docs** | ~7ms |
|
|
28
|
+
| **Focus** | Zero false positives. Zero missed scans. |
|
|
24
29
|
|
|
25
|
-
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## What is PreOCR? Python OCR Detection & Document Processing
|
|
33
|
+
|
|
34
|
+
**PreOCR** is an open-source **Python OCR detection library** that determines whether documents need OCR before you run expensive processing. It analyzes **PDFs**, **Office documents** (DOCX, PPTX, XLSX), **images**, and text files to detect if they're already machine-readable—helping you **skip OCR** for 50–70% of documents and cut costs.
|
|
35
|
+
|
|
36
|
+
Use PreOCR to filter documents before Tesseract, AWS Textract, Google Vision, Azure Document Intelligence, or MinerU. Works offline, CPU-only, with 100% accuracy on validation benchmarks.
|
|
37
|
+
|
|
38
|
+
**🌐 [preocr.io](https://preocr.io)**
|
|
26
39
|
|
|
27
40
|
### Key Benefits
|
|
28
41
|
|
|
29
|
-
- ⚡ **Fast**: CPU-only
|
|
30
|
-
- 🎯 **Accurate**: 92
|
|
31
|
-
- 💰 **Cost-Effective**: Skip OCR for 50
|
|
32
|
-
- 📊 **Structured Extraction**:
|
|
42
|
+
- ⚡ **Fast**: CPU-only, typically < 1 second per file—no GPU needed
|
|
43
|
+
- 🎯 **Accurate**: 92–95% accuracy (100% on validation benchmark)
|
|
44
|
+
- 💰 **Cost-Effective**: Skip OCR for 50–70% of documents
|
|
45
|
+
- 📊 **Structured Extraction**: Tables, forms, images, semantic data—Pydantic models, JSON, or Markdown
|
|
33
46
|
- 🔒 **Type-Safe**: Full Pydantic models with IDE autocomplete
|
|
34
|
-
- 🚀 **Production-Ready**:
|
|
47
|
+
- 🚀 **Offline & Production-Ready**: No API keys; battle-tested error handling
|
|
48
|
+
|
|
49
|
+
### Use Cases: When to Use PreOCR
|
|
50
|
+
|
|
51
|
+
- **Document pipelines**: Filter PDFs before OCR (Tesseract, AWS Textract, Google Vision)
|
|
52
|
+
- **RAG / LLM ingestion**: Decide which documents need OCR vs. native text extraction
|
|
53
|
+
- **Batch processing**: Process thousands of PDFs with page-level OCR decisions
|
|
54
|
+
- **Cost optimization**: Reduce cloud OCR API costs by skipping digital documents
|
|
55
|
+
- **Medical / legal**: Intent-aware planner for prescriptions, discharge summaries, lab reports
|
|
35
56
|
|
|
36
57
|
---
|
|
37
58
|
|
|
38
|
-
##
|
|
59
|
+
## Quick Comparison: PreOCR vs. Alternatives
|
|
39
60
|
|
|
40
61
|
| Feature | PreOCR 🏆 | Unstructured.io | Docugami |
|
|
41
62
|
|---------|-----------|-----------------|----------|
|
|
@@ -110,6 +131,17 @@ results.print_summary()
|
|
|
110
131
|
- **Page-Level Granularity**: Analyze PDFs page-by-page for precise detection
|
|
111
132
|
- **Confidence Scores**: Per-decision confidence with reason codes
|
|
112
133
|
- **Hybrid Pipeline**: Fast heuristics + OpenCV refinement for edge cases
|
|
134
|
+
- **OpenCV Skip Heuristics**: Skips OpenCV for clearly digital documents (file size, page count, text coverage) to improve performance
|
|
135
|
+
- **Digital/Table Bias**: Reduces false positives on high-text PDFs (product manuals, marketing docs) via configurable rules
|
|
136
|
+
|
|
137
|
+
### Intent-Aware OCR Planner (`plan_ocr_for_document`)
|
|
138
|
+
|
|
139
|
+
- **Medical Domain**: Terminal overrides for prescriptions, diagnoses, discharge summaries, lab reports
|
|
140
|
+
- **Weighted Scoring**: Configurable threshold with safety/balanced/cost modes
|
|
141
|
+
- **Explainability**: Per-page score breakdown (intent, image_dominance, text_weakness)
|
|
142
|
+
- **Evaluation**: Threshold sweep and confusion matrix for calibration
|
|
143
|
+
|
|
144
|
+
See [docs/OCR_DECISION_MODEL.md](docs/OCR_DECISION_MODEL.md) for the full specification.
|
|
113
145
|
|
|
114
146
|
### Document Extraction (`extract_native_data`)
|
|
115
147
|
|
|
@@ -175,6 +207,18 @@ print(f"Confidence: {result['confidence']:.2f}")
|
|
|
175
207
|
print(f"Reason: {result['reason']}")
|
|
176
208
|
```
|
|
177
209
|
|
|
210
|
+
#### Intent-Aware Planner (Medical/Domain-Specific)
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from preocr import plan_ocr_for_document
|
|
214
|
+
|
|
215
|
+
result = plan_ocr_for_document("hospital_discharge.pdf")
|
|
216
|
+
print(f"Needs OCR (any page): {result['needs_ocr_any']}")
|
|
217
|
+
for page in result["pages"]:
|
|
218
|
+
print(f" Page {page['page_number']}: needs_ocr={page['needs_ocr']} "
|
|
219
|
+
f"type={page['decision_type']} score={page['debug']['score']:.2f}")
|
|
220
|
+
```
|
|
221
|
+
|
|
178
222
|
#### Layout-Aware Detection
|
|
179
223
|
|
|
180
224
|
```python
|
|
@@ -305,8 +349,6 @@ PreOCR supports **20+ file formats** for OCR detection and extraction:
|
|
|
305
349
|
| **Text** | ✅ Yes | ✅ Yes | TXT, CSV, HTML |
|
|
306
350
|
| **Structured** | ✅ Yes | ✅ Yes | JSON, XML |
|
|
307
351
|
|
|
308
|
-
See [Supported Formats](SUPPORTED_FORMATS.md) for complete list.
|
|
309
|
-
|
|
310
352
|
---
|
|
311
353
|
|
|
312
354
|
## ⚙️ Configuration
|
|
@@ -330,6 +372,10 @@ result = needs_ocr("document.pdf", config=config)
|
|
|
330
372
|
- `min_text_length`: Minimum text length (default: 50)
|
|
331
373
|
- `min_office_text_length`: Minimum office text length (default: 100)
|
|
332
374
|
- `layout_refinement_threshold`: OpenCV trigger threshold (default: 0.9)
|
|
375
|
+
- `skip_opencv_if_file_size_mb`: Skip OpenCV when file size ≥ N MB (default: None)
|
|
376
|
+
- `skip_opencv_if_page_count`: Skip OpenCV when page count ≥ N (default: None)
|
|
377
|
+
- `digital_bias_text_coverage_min`: Force no-OCR when text_coverage ≥ this and image_coverage is low (default: 65)
|
|
378
|
+
- `table_bias_text_density_min`: For mixed layout, treat as digital when text_density ≥ this (default: 1.5)
|
|
333
379
|
|
|
334
380
|
---
|
|
335
381
|
|
|
@@ -368,15 +414,34 @@ if result["reason_code"] == "PDF_MIXED":
|
|
|
368
414
|
|----------|------|----------|
|
|
369
415
|
| Fast Path (Heuristics) | < 150ms | ~99% |
|
|
370
416
|
| OpenCV Refinement | 150-300ms | 92-96% |
|
|
371
|
-
| **
|
|
417
|
+
| **Typical (single file)** | **< 1 second** | **94-97%** |
|
|
418
|
+
|
|
419
|
+
*Typical: most PDFs finish in under 1 second. Heuristics-only files: 120–180ms avg. Large or mixed documents may take 1–3s with OpenCV.*
|
|
420
|
+
|
|
421
|
+
### Benchmark Results (≤1MB Dataset)
|
|
422
|
+
|
|
423
|
+
<p align="center">
|
|
424
|
+
<img src="docs/benchmarks/avg-time-by-type.png" alt="Average processing time by file type" width="500">
|
|
425
|
+
<br><em>Average Processing Time by File Type</em>
|
|
426
|
+
</p>
|
|
427
|
+
|
|
428
|
+
<p align="center">
|
|
429
|
+
<img src="docs/benchmarks/latency-summary.png" alt="Latency summary for PDFs" width="500">
|
|
430
|
+
<br><em>Latency Summary (Mean, Median, P95)</em>
|
|
431
|
+
</p>
|
|
372
432
|
|
|
373
433
|
### Accuracy Metrics
|
|
374
434
|
|
|
375
|
-
- **Overall Accuracy**: 92-95% (100% on
|
|
435
|
+
- **Overall Accuracy**: 92-95% (100% on validation benchmark)
|
|
376
436
|
- **Precision**: 100% (all flagged files actually need OCR)
|
|
377
437
|
- **Recall**: 100% (all OCR-needed files detected)
|
|
378
438
|
- **F1-Score**: 100%
|
|
379
439
|
|
|
440
|
+
<p align="center">
|
|
441
|
+
<img src="docs/benchmarks/confusion-matrix.png" alt="Confusion matrix - 100% accuracy" width="500">
|
|
442
|
+
<br><em>Confusion Matrix (TP:1, FP:0, TN:9, FN:0)</em>
|
|
443
|
+
</p>
|
|
444
|
+
|
|
380
445
|
### Performance Factors
|
|
381
446
|
|
|
382
447
|
- **File size**: Larger files take longer
|
|
@@ -481,12 +546,14 @@ Batch processor for multiple files with parallel processing.
|
|
|
481
546
|
### When to Choose PreOCR
|
|
482
547
|
|
|
483
548
|
✅ **Choose PreOCR when:**
|
|
484
|
-
- You
|
|
485
|
-
- You
|
|
486
|
-
- You
|
|
487
|
-
- You
|
|
488
|
-
|
|
489
|
-
|
|
549
|
+
- You're building **document ingestion pipelines** or **RAG/LLM systems**—decide which files need OCR vs. native extraction
|
|
550
|
+
- You need **speed** (< 1 second per file) and **cost optimization** (skip OCR for 50–70% of documents)
|
|
551
|
+
- You want **page-level granularity** (which pages need OCR in mixed PDFs)
|
|
552
|
+
- You prefer **type safety** (Pydantic models) and **edge deployment** (CPU-only, no GPU)
|
|
553
|
+
|
|
554
|
+
### Switching from Unstructured.io or another library?
|
|
555
|
+
|
|
556
|
+
PreOCR focuses on **OCR routing**—it doesn't perform extraction by default. Use it as a pre-filter: call `needs_ocr()` first, then route to your OCR engine or to `extract_native_data()` for digital documents. The API is simple: `needs_ocr(path)`, `extract_native_data(path)`, `BatchProcessor`.
|
|
490
557
|
|
|
491
558
|
---
|
|
492
559
|
|
|
@@ -513,22 +580,25 @@ Batch processor for multiple files with parallel processing.
|
|
|
513
580
|
|
|
514
581
|
---
|
|
515
582
|
|
|
516
|
-
##
|
|
583
|
+
## Frequently Asked Questions (FAQ)
|
|
517
584
|
|
|
518
|
-
**
|
|
519
|
-
|
|
585
|
+
**Does PreOCR perform OCR?**
|
|
586
|
+
No. PreOCR is an **OCR detection** library—it analyzes files to determine if OCR is needed. It does not run OCR itself. Use it to decide whether to call Tesseract, Textract, or another OCR engine.
|
|
520
587
|
|
|
521
|
-
**
|
|
522
|
-
|
|
588
|
+
**How accurate is PreOCR for PDF OCR detection?**
|
|
589
|
+
PreOCR achieves 92–95% accuracy with the hybrid pipeline. Validation on benchmark datasets reached 100% accuracy (10/10 PDFs correct).
|
|
523
590
|
|
|
524
|
-
**
|
|
525
|
-
|
|
591
|
+
**Can I use PreOCR with AWS Textract, Google Vision, or Azure Document Intelligence?**
|
|
592
|
+
Yes. PreOCR is ideal for filtering documents before sending them to cloud OCR APIs. Skip OCR for digital PDFs to reduce API costs.
|
|
526
593
|
|
|
527
|
-
**
|
|
528
|
-
|
|
594
|
+
**Does PreOCR work offline?**
|
|
595
|
+
Yes. PreOCR is CPU-only and runs fully offline—no API keys or internet required.
|
|
529
596
|
|
|
530
|
-
**
|
|
531
|
-
|
|
597
|
+
**How do I customize OCR detection thresholds?**
|
|
598
|
+
Use the `Config` class or pass threshold parameters to `BatchProcessor`. See [Configuration](#-configuration).
|
|
599
|
+
|
|
600
|
+
**Is there an HTTP/REST API?**
|
|
601
|
+
PreOCR is a Python library. For HTTP APIs, wrap it in FastAPI or Flask—see [preocr.io](https://preocr.io) for hosted options.
|
|
532
602
|
|
|
533
603
|
---
|
|
534
604
|
|
|
@@ -545,6 +615,10 @@ pip install -e ".[dev]"
|
|
|
545
615
|
# Run tests
|
|
546
616
|
pytest
|
|
547
617
|
|
|
618
|
+
# Run benchmarks (add PDFs to datasets/ for testing)
|
|
619
|
+
python scripts/benchmark_accuracy.py datasets -g scripts/ground_truth_data_source_formats.json --layout-aware --page-level
|
|
620
|
+
python scripts/benchmark_planner.py datasets
|
|
621
|
+
|
|
548
622
|
# Run linting
|
|
549
623
|
ruff check preocr/
|
|
550
624
|
black --check preocr/
|
|
@@ -558,20 +632,20 @@ See [CHANGELOG.md](docs/CHANGELOG.md) for complete version history.
|
|
|
558
632
|
|
|
559
633
|
### Recent Updates
|
|
560
634
|
|
|
561
|
-
**
|
|
562
|
-
- ✅ **
|
|
563
|
-
- ✅ **
|
|
564
|
-
- ✅ **
|
|
565
|
-
- ✅ **
|
|
566
|
-
- ✅ **
|
|
567
|
-
|
|
568
|
-
|
|
635
|
+
**v1.3.0** - Accuracy & Performance (Latest)
|
|
636
|
+
- ✅ **100% Accuracy**: Fixed false positives on digital PDFs; benchmark validation at 100%
|
|
637
|
+
- ✅ **OpenCV Skip Heuristics**: Skip OpenCV for clearly digital documents (configurable by file size, page count)
|
|
638
|
+
- ✅ **Digital/Table Bias Rules**: New config options to reduce false positives on product manuals, marketing PDFs
|
|
639
|
+
- ✅ **Unified Datasets**: Consolidated `benchmarkdata` and `data-source-formats` into `datasets/` directory
|
|
640
|
+
- ✅ **Page Count in Signals**: PDF analysis includes page count for smarter heuristics
|
|
641
|
+
|
|
642
|
+
**v1.1.0** - Invoice Intelligence & Advanced Extraction
|
|
643
|
+
- ✅ Semantic deduplication, invoice intelligence, text merging
|
|
644
|
+
- ✅ Table stitching, finance validation, reversed text detection
|
|
569
645
|
|
|
570
646
|
**v1.0.0** - Structured Data Extraction
|
|
571
|
-
- ✅ Comprehensive extraction
|
|
572
|
-
- ✅ Element classification
|
|
573
|
-
- ✅ Table, form, and image extraction
|
|
574
|
-
- ✅ Multiple output formats (Pydantic, JSON, Markdown)
|
|
647
|
+
- ✅ Comprehensive extraction for PDFs, Office docs, text files
|
|
648
|
+
- ✅ Element classification, table/form/image extraction
|
|
575
649
|
|
|
576
650
|
---
|
|
577
651
|
|
|
@@ -587,19 +661,19 @@ Apache License 2.0 - see [LICENSE](LICENSE) for details.
|
|
|
587
661
|
|
|
588
662
|
---
|
|
589
663
|
|
|
590
|
-
##
|
|
664
|
+
## Links & Resources
|
|
591
665
|
|
|
592
|
-
-
|
|
593
|
-
- **
|
|
594
|
-
- **
|
|
595
|
-
- **
|
|
666
|
+
- **Website**: [preocr.io](https://preocr.io) – Python OCR detection and document processing
|
|
667
|
+
- **PyPI**: [pypi.org/project/preocr](https://pypi.org/project/preocr) – Install with `pip install preocr`
|
|
668
|
+
- **GitHub**: [github.com/yuvaraj3855/preocr](https://github.com/yuvaraj3855/preocr) – Source code and issues
|
|
669
|
+
- **Documentation**: [CHANGELOG](docs/CHANGELOG.md) • [OCR Decision Model](docs/OCR_DECISION_MODEL.md) • [Contributing](docs/CONTRIBUTING.md)
|
|
596
670
|
|
|
597
671
|
---
|
|
598
672
|
|
|
599
673
|
<div align="center">
|
|
600
674
|
|
|
601
|
-
**
|
|
675
|
+
**PreOCR – Python OCR detection library. Skip OCR for digital PDFs. Save time and money.**
|
|
602
676
|
|
|
603
|
-
[
|
|
677
|
+
[Website](https://preocr.io) · [GitHub](https://github.com/yuvaraj3855/preocr) · [PyPI](https://pypi.org/project/preocr) · [Report Issue](https://github.com/yuvaraj3855/preocr/issues)
|
|
604
678
|
|
|
605
679
|
</div>
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from .core.detector import needs_ocr
|
|
4
4
|
from .core.extractor import extract_native_data
|
|
5
|
+
from .planner import plan_ocr_for_document
|
|
5
6
|
from .utils.batch import BatchProcessor, BatchResults
|
|
6
7
|
from .version import __version__
|
|
7
8
|
|
|
@@ -20,6 +21,7 @@ __all__ = [
|
|
|
20
21
|
# Main API
|
|
21
22
|
"needs_ocr",
|
|
22
23
|
"extract_native_data",
|
|
24
|
+
"plan_ocr_for_document",
|
|
23
25
|
"__version__",
|
|
24
26
|
"BatchProcessor",
|
|
25
27
|
"BatchResults",
|