preocr 1.2.2__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {preocr-1.2.2 → preocr-1.3.0}/PKG-INFO +128 -54
  2. {preocr-1.2.2 → preocr-1.3.0}/README.md +127 -53
  3. {preocr-1.2.2 → preocr-1.3.0}/preocr/__init__.py +2 -0
  4. {preocr-1.2.2 → preocr-1.3.0}/preocr/constants.py +23 -0
  5. {preocr-1.2.2 → preocr-1.3.0}/preocr/core/decision.py +101 -5
  6. {preocr-1.2.2 → preocr-1.3.0}/preocr/core/detector.py +70 -5
  7. {preocr-1.2.2 → preocr-1.3.0}/preocr/core/signals.py +3 -0
  8. {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/schemas.py +4 -3
  9. preocr-1.3.0/preocr/planner/__init__.py +15 -0
  10. preocr-1.3.0/preocr/planner/_extract.py +79 -0
  11. preocr-1.3.0/preocr/planner/config.py +99 -0
  12. preocr-1.3.0/preocr/planner/decision.py +126 -0
  13. preocr-1.3.0/preocr/planner/intent.py +131 -0
  14. preocr-1.3.0/preocr/planner/models.py +101 -0
  15. preocr-1.3.0/preocr/planner/planner.py +231 -0
  16. {preocr-1.2.2 → preocr-1.3.0}/preocr/version.py +1 -1
  17. {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/PKG-INFO +128 -54
  18. {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/SOURCES.txt +9 -0
  19. {preocr-1.2.2 → preocr-1.3.0}/tests/test_batch.py +18 -19
  20. {preocr-1.2.2 → preocr-1.3.0}/tests/test_config_thresholds.py +20 -20
  21. {preocr-1.2.2 → preocr-1.3.0}/tests/test_decision.py +30 -9
  22. {preocr-1.2.2 → preocr-1.3.0}/tests/test_layout_aware_needs_ocr.py +26 -25
  23. preocr-1.3.0/tests/test_planner.py +102 -0
  24. {preocr-1.2.2 → preocr-1.3.0}/tests/test_signals.py +26 -1
  25. preocr-1.3.0/tests/test_skip_opencv_heuristics.py +77 -0
  26. {preocr-1.2.2 → preocr-1.3.0}/LICENSE +0 -0
  27. {preocr-1.2.2 → preocr-1.3.0}/preocr/analysis/__init__.py +0 -0
  28. {preocr-1.2.2 → preocr-1.3.0}/preocr/analysis/layout_analyzer.py +0 -0
  29. {preocr-1.2.2 → preocr-1.3.0}/preocr/analysis/opencv_layout.py +0 -0
  30. {preocr-1.2.2 → preocr-1.3.0}/preocr/analysis/page_detection.py +0 -0
  31. {preocr-1.2.2 → preocr-1.3.0}/preocr/core/__init__.py +0 -0
  32. {preocr-1.2.2 → preocr-1.3.0}/preocr/core/extractor.py +0 -0
  33. {preocr-1.2.2 → preocr-1.3.0}/preocr/exceptions.py +0 -0
  34. {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/__init__.py +0 -0
  35. {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/base.py +0 -0
  36. {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/formatters.py +0 -0
  37. {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/office_extractor.py +0 -0
  38. {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/pdf_extractor.py +0 -0
  39. {preocr-1.2.2 → preocr-1.3.0}/preocr/extraction/text_extractor.py +0 -0
  40. {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/__init__.py +0 -0
  41. {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/image_probe.py +0 -0
  42. {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/office_probe.py +0 -0
  43. {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/pdf_probe.py +0 -0
  44. {preocr-1.2.2 → preocr-1.3.0}/preocr/probes/text_probe.py +0 -0
  45. {preocr-1.2.2 → preocr-1.3.0}/preocr/py.typed +0 -0
  46. {preocr-1.2.2 → preocr-1.3.0}/preocr/reason_codes.py +0 -0
  47. {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/__init__.py +0 -0
  48. {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/batch.py +0 -0
  49. {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/cache.py +0 -0
  50. {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/filetype.py +0 -0
  51. {preocr-1.2.2 → preocr-1.3.0}/preocr/utils/logger.py +0 -0
  52. {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/dependency_links.txt +0 -0
  53. {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/requires.txt +0 -0
  54. {preocr-1.2.2 → preocr-1.3.0}/preocr.egg-info/top_level.txt +0 -0
  55. {preocr-1.2.2 → preocr-1.3.0}/pyproject.toml +0 -0
  56. {preocr-1.2.2 → preocr-1.3.0}/setup.cfg +0 -0
  57. {preocr-1.2.2 → preocr-1.3.0}/tests/test_detector.py +0 -0
  58. {preocr-1.2.2 → preocr-1.3.0}/tests/test_filetype.py +0 -0
  59. {preocr-1.2.2 → preocr-1.3.0}/tests/test_hybrid_pipeline.py +0 -0
  60. {preocr-1.2.2 → preocr-1.3.0}/tests/test_image_probe.py +0 -0
  61. {preocr-1.2.2 → preocr-1.3.0}/tests/test_integration.py +0 -0
  62. {preocr-1.2.2 → preocr-1.3.0}/tests/test_layout_analyzer.py +0 -0
  63. {preocr-1.2.2 → preocr-1.3.0}/tests/test_office_probe.py +0 -0
  64. {preocr-1.2.2 → preocr-1.3.0}/tests/test_opencv_layout.py +0 -0
  65. {preocr-1.2.2 → preocr-1.3.0}/tests/test_page_detection.py +0 -0
  66. {preocr-1.2.2 → preocr-1.3.0}/tests/test_pdf_probe.py +0 -0
  67. {preocr-1.2.2 → preocr-1.3.0}/tests/test_reason_codes.py +0 -0
  68. {preocr-1.2.2 → preocr-1.3.0}/tests/test_text_probe.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: preocr
3
- Version: 1.2.2
3
+ Version: 1.3.0
4
4
  Summary: A fast, CPU-only library that intelligently detects whether files need OCR processing before expensive OCR operations. Uses hybrid adaptive pipeline for 92-95% accuracy.
5
5
  Author: PreOCR Contributors
6
6
  License-Expression: Apache-2.0
@@ -45,11 +45,11 @@ Provides-Extra: batch
45
45
  Requires-Dist: tqdm>=4.65.0; extra == "batch"
46
46
  Dynamic: license-file
47
47
 
48
- # PreOCR - Fast OCR Detection & Document Extraction Library
48
+ # PreOCR Python OCR Detection Library | Skip OCR for Digital PDFs
49
49
 
50
50
  <div align="center">
51
51
 
52
- **Intelligent OCR detection and structured document extraction - 2-10x faster than competitors**
52
+ **Open-source Python library for OCR detection and document extraction. Detect if PDFs need OCR before expensive processing—save 50–70% on OCR costs.**
53
53
 
54
54
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
55
55
  [![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
@@ -57,32 +57,53 @@ Dynamic: license-file
57
57
  [![Downloads](https://pepy.tech/badge/preocr)](https://pepy.tech/project/preocr)
58
58
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
59
59
 
60
- *Save time and money by skipping OCR for files that are already machine-readable*
60
+ *2–10× faster than alternatives • 100% accuracy on benchmark • CPU-only, no GPU required*
61
61
 
62
- **🌐 Website**: [preocr.io](https://preocr.io) • **[Installation](#-installation)****[Quick Start](#-quick-start)****[Documentation](#-api-reference)****[Examples](#-usage-examples)****[Benchmarks](#-performance)**
62
+ **🌐 [preocr.io](https://preocr.io)** • [Installation](#-installation) • [Quick Start](#-quick-start) • [API Reference](#-api-reference) • [Examples](#-usage-examples) • [Performance](#-performance)
63
63
 
64
64
  </div>
65
65
 
66
66
  ---
67
67
 
68
- ## 🎯 What is PreOCR?
68
+ ### TL;DR
69
69
 
70
- **PreOCR** is a Python library for **OCR detection** and **document extraction** that intelligently determines whether files need OCR processing before expensive operations. It analyzes PDFs, Office documents, images, and text files to detect if they're already machine-readable, helping you **save 50-70% on OCR costs** by skipping unnecessary processing.
70
+ | Metric | Result |
71
+ |--------|--------|
72
+ | **Accuracy** | 100% (TP=1, FP=0, TN=9, FN=0) |
73
+ | **Latency** | ~2.7s mean, ~1.9s median (≤1MB PDFs) |
74
+ | **Office docs** | ~7ms |
75
+ | **Focus** | Zero false positives. Zero missed scans. |
71
76
 
72
- **🌐 Learn more at [preocr.io](https://preocr.io)**
77
+ ---
78
+
79
+ ## What is PreOCR? Python OCR Detection & Document Processing
80
+
81
+ **PreOCR** is an open-source **Python OCR detection library** that determines whether documents need OCR before you run expensive processing. It analyzes **PDFs**, **Office documents** (DOCX, PPTX, XLSX), **images**, and text files to detect if they're already machine-readable—helping you **skip OCR** for 50–70% of documents and cut costs.
82
+
83
+ Use PreOCR to filter documents before Tesseract, AWS Textract, Google Vision, Azure Document Intelligence, or MinerU. Works offline, CPU-only, with 100% accuracy on validation benchmarks.
84
+
85
+ **🌐 [preocr.io](https://preocr.io)**
73
86
 
74
87
  ### Key Benefits
75
88
 
76
- - ⚡ **Fast**: CPU-only processing, typically < 1 second per file
77
- - 🎯 **Accurate**: 92-95% accuracy (100% on recent validation dataset)
78
- - 💰 **Cost-Effective**: Skip OCR for 50-70% of documents
79
- - 📊 **Structured Extraction**: Extract tables, forms, images, and semantic data
89
+ - ⚡ **Fast**: CPU-only, typically < 1 second per file—no GPU needed
90
+ - 🎯 **Accurate**: 92–95% accuracy (100% on validation benchmark)
91
+ - 💰 **Cost-Effective**: Skip OCR for 50–70% of documents
92
+ - 📊 **Structured Extraction**: Tables, forms, images, semantic data—Pydantic models, JSON, or Markdown
80
93
  - 🔒 **Type-Safe**: Full Pydantic models with IDE autocomplete
81
- - 🚀 **Production-Ready**: Battle-tested with comprehensive error handling
94
+ - 🚀 **Offline & Production-Ready**: No API keys; battle-tested error handling
95
+
96
+ ### Use Cases: When to Use PreOCR
97
+
98
+ - **Document pipelines**: Filter PDFs before OCR (Tesseract, AWS Textract, Google Vision)
99
+ - **RAG / LLM ingestion**: Decide which documents need OCR vs. native text extraction
100
+ - **Batch processing**: Process thousands of PDFs with page-level OCR decisions
101
+ - **Cost optimization**: Reduce cloud OCR API costs by skipping digital documents
102
+ - **Medical / legal**: Intent-aware planner for prescriptions, discharge summaries, lab reports
82
103
 
83
104
  ---
84
105
 
85
- ## Quick Comparison
106
+ ## Quick Comparison: PreOCR vs. Alternatives
86
107
 
87
108
  | Feature | PreOCR 🏆 | Unstructured.io | Docugami |
88
109
  |---------|-----------|-----------------|----------|
@@ -157,6 +178,17 @@ results.print_summary()
157
178
  - **Page-Level Granularity**: Analyze PDFs page-by-page for precise detection
158
179
  - **Confidence Scores**: Per-decision confidence with reason codes
159
180
  - **Hybrid Pipeline**: Fast heuristics + OpenCV refinement for edge cases
181
+ - **OpenCV Skip Heuristics**: Skips OpenCV for clearly digital documents (file size, page count, text coverage) to improve performance
182
+ - **Digital/Table Bias**: Reduces false positives on high-text PDFs (product manuals, marketing docs) via configurable rules
183
+
184
+ ### Intent-Aware OCR Planner (`plan_ocr_for_document`)
185
+
186
+ - **Medical Domain**: Terminal overrides for prescriptions, diagnosis, discharge summaries, lab reports
187
+ - **Weighted Scoring**: Configurable threshold with safety/balanced/cost modes
188
+ - **Explainability**: Per-page score breakdown (intent, image_dominance, text_weakness)
189
+ - **Evaluation**: Threshold sweep and confusion matrix for calibration
190
+
191
+ See [docs/OCR_DECISION_MODEL.md](docs/OCR_DECISION_MODEL.md) for the full specification.
160
192
 
161
193
  ### Document Extraction (`extract_native_data`)
162
194
 
@@ -222,6 +254,18 @@ print(f"Confidence: {result['confidence']:.2f}")
222
254
  print(f"Reason: {result['reason']}")
223
255
  ```
224
256
 
257
+ #### Intent-Aware Planner (Medical/Domain-Specific)
258
+
259
+ ```python
260
+ from preocr import plan_ocr_for_document
261
+
262
+ result = plan_ocr_for_document("hospital_discharge.pdf")
263
+ print(f"Needs OCR (any page): {result['needs_ocr_any']}")
264
+ for page in result["pages"]:
265
+     print(f" Page {page['page_number']}: needs_ocr={page['needs_ocr']} "
266
+           f"type={page['decision_type']} score={page['debug']['score']:.2f}")
267
+ ```
268
+
225
269
  #### Layout-Aware Detection
226
270
 
227
271
  ```python
@@ -352,8 +396,6 @@ PreOCR supports **20+ file formats** for OCR detection and extraction:
352
396
  | **Text** | ✅ Yes | ✅ Yes | TXT, CSV, HTML |
353
397
  | **Structured** | ✅ Yes | ✅ Yes | JSON, XML |
354
398
 
355
- See [Supported Formats](SUPPORTED_FORMATS.md) for complete list.
356
-
357
399
  ---
358
400
 
359
401
  ## ⚙️ Configuration
@@ -377,6 +419,10 @@ result = needs_ocr("document.pdf", config=config)
377
419
  - `min_text_length`: Minimum text length (default: 50)
378
420
  - `min_office_text_length`: Minimum office text length (default: 100)
379
421
  - `layout_refinement_threshold`: OpenCV trigger threshold (default: 0.9)
422
+ - `skip_opencv_if_file_size_mb`: Skip OpenCV when file size ≥ N MB (default: None)
423
+ - `skip_opencv_if_page_count`: Skip OpenCV when page count ≥ N (default: None)
424
+ - `digital_bias_text_coverage_min`: Force no-OCR when text_coverage ≥ this and image_coverage is low (default: 65)
425
+ - `table_bias_text_density_min`: For mixed layout, treat as digital when text_density ≥ this (default: 1.5)
380
426
 
381
427
  ---
382
428
 
@@ -415,15 +461,34 @@ if result["reason_code"] == "PDF_MIXED":
415
461
  |----------|------|----------|
416
462
  | Fast Path (Heuristics) | < 150ms | ~99% |
417
463
  | OpenCV Refinement | 150-300ms | 92-96% |
418
- | **Average** | **120-180ms** | **94-97%** |
464
+ | **Typical (single file)** | **< 1 second** | **94-97%** |
465
+
466
+ *Typical: most PDFs finish in under 1 second. Heuristics-only files: 120–180ms avg. Large or mixed documents may take 1–3s with OpenCV.*
467
+
468
+ ### Benchmark Results (≤1MB Dataset)
469
+
470
+ <p align="center">
471
+ <img src="docs/benchmarks/avg-time-by-type.png" alt="Average processing time by file type" width="500">
472
+ <br><em>Average Processing Time by File Type</em>
473
+ </p>
474
+
475
+ <p align="center">
476
+ <img src="docs/benchmarks/latency-summary.png" alt="Latency summary for PDFs" width="500">
477
+ <br><em>Latency Summary (Mean, Median, P95)</em>
478
+ </p>
419
479
 
420
480
  ### Accuracy Metrics
421
481
 
422
- - **Overall Accuracy**: 92-95% (100% on recent validation)
482
+ - **Overall Accuracy**: 92-95% (100% on validation benchmark)
423
483
  - **Precision**: 100% (all flagged files actually need OCR)
424
484
  - **Recall**: 100% (all OCR-needed files detected)
425
485
  - **F1-Score**: 100%
426
486
 
487
+ <p align="center">
488
+ <img src="docs/benchmarks/confusion-matrix.png" alt="Confusion matrix - 100% accuracy" width="500">
489
+ <br><em>Confusion Matrix (TP:1, FP:0, TN:9, FN:0)</em>
490
+ </p>
491
+
427
492
  ### Performance Factors
428
493
 
429
494
  - **File size**: Larger files take longer
@@ -528,12 +593,14 @@ Batch processor for multiple files with parallel processing.
528
593
  ### When to Choose PreOCR
529
594
 
530
595
  ✅ **Choose PreOCR when:**
531
- - You need **speed** (< 1 second processing)
532
- - You want **cost optimization** (skip OCR for 50-70% of documents)
533
- - You need **page-level granularity**
534
- - You want **type safety** (Pydantic models)
535
- - You're building **LLM/RAG pipelines**
536
- - You need **edge deployment** (CPU-only)
596
+ - You're building **document ingestion pipelines** or **RAG/LLM systems**—decide which files need OCR vs. native extraction
597
+ - You need **speed** (< 1 second per file) and **cost optimization** (skip OCR for 50–70% of documents)
598
+ - You want **page-level granularity** (which pages need OCR in mixed PDFs)
599
+ - You prefer **type safety** (Pydantic models) and **edge deployment** (CPU-only, no GPU)
600
+
601
+ ### Switched from Unstructured.io or another library?
602
+
603
+ PreOCR focuses on **OCR routing**—it doesn't perform extraction by default. Use it as a pre-filter: call `needs_ocr()` first, then route to your OCR engine or to `extract_native_data()` for digital documents. The API is simple: `needs_ocr(path)`, `extract_native_data(path)`, `BatchProcessor`.
537
604
 
538
605
  ---
539
606
 
@@ -560,22 +627,25 @@ Batch processor for multiple files with parallel processing.
560
627
 
561
628
  ---
562
629
 
563
- ## Frequently Asked Questions
630
+ ## Frequently Asked Questions (FAQ)
564
631
 
565
- **Q: Does PreOCR perform OCR?**
566
- A: No, PreOCR never performs OCR. It only analyzes files to determine if OCR is needed.
632
+ **Does PreOCR perform OCR?**
633
+ No. PreOCR is an **OCR detection** library—it analyzes files to determine if OCR is needed. It does not run OCR itself. Use it to decide whether to call Tesseract, Textract, or another OCR engine.
567
634
 
568
- **Q: How accurate is PreOCR?**
569
- A: PreOCR achieves 92-95% accuracy with the hybrid pipeline. Recent validation on 27 files achieved 100% accuracy.
635
+ **How accurate is PreOCR for PDF OCR detection?**
636
+ PreOCR achieves 92–95% accuracy with the hybrid pipeline. Validation on benchmark datasets reached 100% accuracy (10/10 PDFs correct).
570
637
 
571
- **Q: Can I use PreOCR with cloud OCR services?**
572
- A: Yes! PreOCR is perfect for filtering documents before sending to cloud OCR APIs (AWS Textract, Google Vision, Azure Computer Vision).
638
+ **Can I use PreOCR with AWS Textract, Google Vision, or Azure Document Intelligence?**
639
+ Yes. PreOCR is ideal for filtering documents before sending them to cloud OCR APIs. Skip OCR for digital PDFs to reduce API costs.
573
640
 
574
- **Q: Does PreOCR work offline?**
575
- A: Yes! PreOCR is CPU-only and works completely offline.
641
+ **Does PreOCR work offline?**
642
+ Yes. PreOCR is CPU-only and runs fully offline—no API keys or internet required.
576
643
 
577
- **Q: Can I customize decision thresholds?**
578
- A: Yes! Use the `Config` class or pass threshold parameters to `BatchProcessor`.
644
+ **How do I customize OCR detection thresholds?**
645
+ Use the `Config` class or pass threshold parameters to `BatchProcessor`. See [Configuration](#-configuration).
646
+
647
+ **Is there an HTTP/REST API?**
648
+ PreOCR is a Python library. For HTTP APIs, wrap it in FastAPI or Flask—see [preocr.io](https://preocr.io) for hosted options.
579
649
 
580
650
  ---
581
651
 
@@ -592,6 +662,10 @@ pip install -e ".[dev]"
592
662
  # Run tests
593
663
  pytest
594
664
 
665
+ # Run benchmarks (add PDFs to datasets/ for testing)
666
+ python scripts/benchmark_accuracy.py datasets -g scripts/ground_truth_data_source_formats.json --layout-aware --page-level
667
+ python scripts/benchmark_planner.py datasets
668
+
595
669
  # Run linting
596
670
  ruff check preocr/
597
671
  black --check preocr/
@@ -605,20 +679,20 @@ See [CHANGELOG.md](docs/CHANGELOG.md) for complete version history.
605
679
 
606
680
  ### Recent Updates
607
681
 
608
- **v1.1.0** - Invoice Intelligence & Advanced Extraction (Latest)
609
- - ✅ **Semantic Deduplication**: Intelligent line item deduplication for invoices
610
- - ✅ **Invoice Intelligence**: Semantic extraction with finance validation
611
- - ✅ **Text Merging**: Geometry-aware character-to-word merging improvements
612
- - ✅ **Table Stitching**: Merges fragmented tables across pages
613
- - ✅ **Finance Validation**: Validates invoice totals (subtotal + tax = total)
614
- - ✅ **Reversed Text Detection**: Detects and corrects rotated/mirrored text
615
- - ✅ **Footer Exclusion**: Removes footer from reading order
682
+ **v2.0.0** - Accuracy & Performance (Latest)
683
+ - ✅ **100% Accuracy**: Fixed false positives on digital PDFs; benchmark validation at 100%
684
+ - ✅ **OpenCV Skip Heuristics**: Skip OpenCV for clearly digital documents (configurable by file size, page count)
685
+ - ✅ **Digital/Table Bias Rules**: New config options to reduce false positives on product manuals, marketing PDFs
686
+ - ✅ **Unified Datasets**: Consolidated `benchmarkdata` and `data-source-formats` into `datasets/` directory
687
+ - ✅ **Page Count in Signals**: PDF analysis includes page count for smarter heuristics
688
+
689
+ **v1.1.0** - Invoice Intelligence & Advanced Extraction
690
+ - ✅ Semantic deduplication, invoice intelligence, text merging
691
+ - ✅ Table stitching, finance validation, reversed text detection
616
692
 
617
693
  **v1.0.0** - Structured Data Extraction
618
- - ✅ Comprehensive extraction system for PDFs, Office docs, and text files
619
- - ✅ Element classification (11+ types)
620
- - ✅ Table, form, and image extraction
621
- - ✅ Multiple output formats (Pydantic, JSON, Markdown)
694
+ - ✅ Comprehensive extraction for PDFs, Office docs, text files
695
+ - ✅ Element classification, table/form/image extraction
622
696
 
623
697
  ---
624
698
 
@@ -634,19 +708,19 @@ Apache License 2.0 - see [LICENSE](LICENSE) for details.
634
708
 
635
709
  ---
636
710
 
637
- ## 🔗 Links
711
+ ## Links & Resources
638
712
 
639
- - **🌐 Website**: [preocr.io](https://preocr.io)
640
- - **GitHub**: [https://github.com/yuvaraj3855/preocr](https://github.com/yuvaraj3855/preocr)
641
- - **PyPI**: [https://pypi.org/project/preocr](https://pypi.org/project/preocr)
642
- - **Issues**: [https://github.com/yuvaraj3855/preocr/issues](https://github.com/yuvaraj3855/preocr/issues)
713
+ - **Website**: [preocr.io](https://preocr.io) – Python OCR detection and document processing
714
+ - **PyPI**: [pypi.org/project/preocr](https://pypi.org/project/preocr) – Install with `pip install preocr`
715
+ - **GitHub**: [github.com/yuvaraj3855/preocr](https://github.com/yuvaraj3855/preocr) – Source code and issues
716
+ - **Documentation**: [CHANGELOG](docs/CHANGELOG.md) • [OCR Decision Model](docs/OCR_DECISION_MODEL.md) • [Contributing](docs/CONTRIBUTING.md)
643
717
 
644
718
  ---
645
719
 
646
720
  <div align="center">
647
721
 
648
- **Made with ❤️ for efficient document processing**
722
+ **PreOCR Python OCR detection library. Skip OCR for digital PDFs. Save time and money.**
649
723
 
650
- [🌐 Website](https://preocr.io) | [⭐ Star on GitHub](https://github.com/yuvaraj3855/preocr) | [📖 Documentation](https://github.com/yuvaraj3855/preocr#readme) | [🐛 Report Issue](https://github.com/yuvaraj3855/preocr/issues)
724
+ [Website](https://preocr.io) · [GitHub](https://github.com/yuvaraj3855/preocr) · [PyPI](https://pypi.org/project/preocr) · [Report Issue](https://github.com/yuvaraj3855/preocr/issues)
651
725
 
652
726
  </div>
@@ -1,8 +1,8 @@
1
- # PreOCR - Fast OCR Detection & Document Extraction Library
1
+ # PreOCR Python OCR Detection Library | Skip OCR for Digital PDFs
2
2
 
3
3
  <div align="center">
4
4
 
5
- **Intelligent OCR detection and structured document extraction - 2-10x faster than competitors**
5
+ **Open-source Python library for OCR detection and document extraction. Detect if PDFs need OCR before expensive processing—save 50–70% on OCR costs.**
6
6
 
7
7
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
8
8
  [![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
@@ -10,32 +10,53 @@
10
10
  [![Downloads](https://pepy.tech/badge/preocr)](https://pepy.tech/project/preocr)
11
11
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
12
12
 
13
- *Save time and money by skipping OCR for files that are already machine-readable*
13
+ *2–10× faster than alternatives • 100% accuracy on benchmark • CPU-only, no GPU required*
14
14
 
15
- **🌐 Website**: [preocr.io](https://preocr.io) • **[Installation](#-installation)****[Quick Start](#-quick-start)****[Documentation](#-api-reference)****[Examples](#-usage-examples)****[Benchmarks](#-performance)**
15
+ **🌐 [preocr.io](https://preocr.io)** • [Installation](#-installation) • [Quick Start](#-quick-start) • [API Reference](#-api-reference) • [Examples](#-usage-examples) • [Performance](#-performance)
16
16
 
17
17
  </div>
18
18
 
19
19
  ---
20
20
 
21
- ## 🎯 What is PreOCR?
21
+ ### TL;DR
22
22
 
23
- **PreOCR** is a Python library for **OCR detection** and **document extraction** that intelligently determines whether files need OCR processing before expensive operations. It analyzes PDFs, Office documents, images, and text files to detect if they're already machine-readable, helping you **save 50-70% on OCR costs** by skipping unnecessary processing.
23
+ | Metric | Result |
24
+ |--------|--------|
25
+ | **Accuracy** | 100% (TP=1, FP=0, TN=9, FN=0) |
26
+ | **Latency** | ~2.7s mean, ~1.9s median (≤1MB PDFs) |
27
+ | **Office docs** | ~7ms |
28
+ | **Focus** | Zero false positives. Zero missed scans. |
24
29
 
25
- **🌐 Learn more at [preocr.io](https://preocr.io)**
30
+ ---
31
+
32
+ ## What is PreOCR? Python OCR Detection & Document Processing
33
+
34
+ **PreOCR** is an open-source **Python OCR detection library** that determines whether documents need OCR before you run expensive processing. It analyzes **PDFs**, **Office documents** (DOCX, PPTX, XLSX), **images**, and text files to detect if they're already machine-readable—helping you **skip OCR** for 50–70% of documents and cut costs.
35
+
36
+ Use PreOCR to filter documents before Tesseract, AWS Textract, Google Vision, Azure Document Intelligence, or MinerU. Works offline, CPU-only, with 100% accuracy on validation benchmarks.
37
+
38
+ **🌐 [preocr.io](https://preocr.io)**
26
39
 
27
40
  ### Key Benefits
28
41
 
29
- - ⚡ **Fast**: CPU-only processing, typically < 1 second per file
30
- - 🎯 **Accurate**: 92-95% accuracy (100% on recent validation dataset)
31
- - 💰 **Cost-Effective**: Skip OCR for 50-70% of documents
32
- - 📊 **Structured Extraction**: Extract tables, forms, images, and semantic data
42
+ - ⚡ **Fast**: CPU-only, typically < 1 second per file—no GPU needed
43
+ - 🎯 **Accurate**: 92–95% accuracy (100% on validation benchmark)
44
+ - 💰 **Cost-Effective**: Skip OCR for 50–70% of documents
45
+ - 📊 **Structured Extraction**: Tables, forms, images, semantic data—Pydantic models, JSON, or Markdown
33
46
  - 🔒 **Type-Safe**: Full Pydantic models with IDE autocomplete
34
- - 🚀 **Production-Ready**: Battle-tested with comprehensive error handling
47
+ - 🚀 **Offline & Production-Ready**: No API keys; battle-tested error handling
48
+
49
+ ### Use Cases: When to Use PreOCR
50
+
51
+ - **Document pipelines**: Filter PDFs before OCR (Tesseract, AWS Textract, Google Vision)
52
+ - **RAG / LLM ingestion**: Decide which documents need OCR vs. native text extraction
53
+ - **Batch processing**: Process thousands of PDFs with page-level OCR decisions
54
+ - **Cost optimization**: Reduce cloud OCR API costs by skipping digital documents
55
+ - **Medical / legal**: Intent-aware planner for prescriptions, discharge summaries, lab reports
35
56
 
36
57
  ---
37
58
 
38
- ## Quick Comparison
59
+ ## Quick Comparison: PreOCR vs. Alternatives
39
60
 
40
61
  | Feature | PreOCR 🏆 | Unstructured.io | Docugami |
41
62
  |---------|-----------|-----------------|----------|
@@ -110,6 +131,17 @@ results.print_summary()
110
131
  - **Page-Level Granularity**: Analyze PDFs page-by-page for precise detection
111
132
  - **Confidence Scores**: Per-decision confidence with reason codes
112
133
  - **Hybrid Pipeline**: Fast heuristics + OpenCV refinement for edge cases
134
+ - **OpenCV Skip Heuristics**: Skips OpenCV for clearly digital documents (file size, page count, text coverage) to improve performance
135
+ - **Digital/Table Bias**: Reduces false positives on high-text PDFs (product manuals, marketing docs) via configurable rules
136
+
137
+ ### Intent-Aware OCR Planner (`plan_ocr_for_document`)
138
+
139
+ - **Medical Domain**: Terminal overrides for prescriptions, diagnosis, discharge summaries, lab reports
140
+ - **Weighted Scoring**: Configurable threshold with safety/balanced/cost modes
141
+ - **Explainability**: Per-page score breakdown (intent, image_dominance, text_weakness)
142
+ - **Evaluation**: Threshold sweep and confusion matrix for calibration
143
+
144
+ See [docs/OCR_DECISION_MODEL.md](docs/OCR_DECISION_MODEL.md) for the full specification.
113
145
 
114
146
  ### Document Extraction (`extract_native_data`)
115
147
 
@@ -175,6 +207,18 @@ print(f"Confidence: {result['confidence']:.2f}")
175
207
  print(f"Reason: {result['reason']}")
176
208
  ```
177
209
 
210
+ #### Intent-Aware Planner (Medical/Domain-Specific)
211
+
212
+ ```python
213
+ from preocr import plan_ocr_for_document
214
+
215
+ result = plan_ocr_for_document("hospital_discharge.pdf")
216
+ print(f"Needs OCR (any page): {result['needs_ocr_any']}")
217
+ for page in result["pages"]:
218
+     print(f" Page {page['page_number']}: needs_ocr={page['needs_ocr']} "
219
+           f"type={page['decision_type']} score={page['debug']['score']:.2f}")
220
+ ```
221
+
178
222
  #### Layout-Aware Detection
179
223
 
180
224
  ```python
@@ -305,8 +349,6 @@ PreOCR supports **20+ file formats** for OCR detection and extraction:
305
349
  | **Text** | ✅ Yes | ✅ Yes | TXT, CSV, HTML |
306
350
  | **Structured** | ✅ Yes | ✅ Yes | JSON, XML |
307
351
 
308
- See [Supported Formats](SUPPORTED_FORMATS.md) for complete list.
309
-
310
352
  ---
311
353
 
312
354
  ## ⚙️ Configuration
@@ -330,6 +372,10 @@ result = needs_ocr("document.pdf", config=config)
330
372
  - `min_text_length`: Minimum text length (default: 50)
331
373
  - `min_office_text_length`: Minimum office text length (default: 100)
332
374
  - `layout_refinement_threshold`: OpenCV trigger threshold (default: 0.9)
375
+ - `skip_opencv_if_file_size_mb`: Skip OpenCV when file size ≥ N MB (default: None)
376
+ - `skip_opencv_if_page_count`: Skip OpenCV when page count ≥ N (default: None)
377
+ - `digital_bias_text_coverage_min`: Force no-OCR when text_coverage ≥ this and image_coverage is low (default: 65)
378
+ - `table_bias_text_density_min`: For mixed layout, treat as digital when text_density ≥ this (default: 1.5)
333
379
 
334
380
  ---
335
381
 
@@ -368,15 +414,34 @@ if result["reason_code"] == "PDF_MIXED":
368
414
  |----------|------|----------|
369
415
  | Fast Path (Heuristics) | < 150ms | ~99% |
370
416
  | OpenCV Refinement | 150-300ms | 92-96% |
371
- | **Average** | **120-180ms** | **94-97%** |
417
+ | **Typical (single file)** | **< 1 second** | **94-97%** |
418
+
419
+ *Typical: most PDFs finish in under 1 second. Heuristics-only files: 120–180ms avg. Large or mixed documents may take 1–3s with OpenCV.*
420
+
421
+ ### Benchmark Results (≤1MB Dataset)
422
+
423
+ <p align="center">
424
+ <img src="docs/benchmarks/avg-time-by-type.png" alt="Average processing time by file type" width="500">
425
+ <br><em>Average Processing Time by File Type</em>
426
+ </p>
427
+
428
+ <p align="center">
429
+ <img src="docs/benchmarks/latency-summary.png" alt="Latency summary for PDFs" width="500">
430
+ <br><em>Latency Summary (Mean, Median, P95)</em>
431
+ </p>
372
432
 
373
433
  ### Accuracy Metrics
374
434
 
375
- - **Overall Accuracy**: 92-95% (100% on recent validation)
435
+ - **Overall Accuracy**: 92-95% (100% on validation benchmark)
376
436
  - **Precision**: 100% (all flagged files actually need OCR)
377
437
  - **Recall**: 100% (all OCR-needed files detected)
378
438
  - **F1-Score**: 100%
379
439
 
440
+ <p align="center">
441
+ <img src="docs/benchmarks/confusion-matrix.png" alt="Confusion matrix - 100% accuracy" width="500">
442
+ <br><em>Confusion Matrix (TP:1, FP:0, TN:9, FN:0)</em>
443
+ </p>
444
+
380
445
  ### Performance Factors
381
446
 
382
447
  - **File size**: Larger files take longer
@@ -481,12 +546,14 @@ Batch processor for multiple files with parallel processing.
481
546
  ### When to Choose PreOCR
482
547
 
483
548
  ✅ **Choose PreOCR when:**
484
- - You need **speed** (< 1 second processing)
485
- - You want **cost optimization** (skip OCR for 50-70% of documents)
486
- - You need **page-level granularity**
487
- - You want **type safety** (Pydantic models)
488
- - You're building **LLM/RAG pipelines**
489
- - You need **edge deployment** (CPU-only)
549
+ - You're building **document ingestion pipelines** or **RAG/LLM systems**—decide which files need OCR vs. native extraction
550
+ - You need **speed** (< 1 second per file) and **cost optimization** (skip OCR for 50–70% of documents)
551
+ - You want **page-level granularity** (which pages need OCR in mixed PDFs)
552
+ - You prefer **type safety** (Pydantic models) and **edge deployment** (CPU-only, no GPU)
553
+
554
+ ### Switched from Unstructured.io or another library?
555
+
556
+ PreOCR focuses on **OCR routing**—it doesn't perform extraction by default. Use it as a pre-filter: call `needs_ocr()` first, then route to your OCR engine or to `extract_native_data()` for digital documents. The API is simple: `needs_ocr(path)`, `extract_native_data(path)`, `BatchProcessor`.
490
557
 
491
558
  ---
492
559
 
@@ -513,22 +580,25 @@ Batch processor for multiple files with parallel processing.
513
580
 
514
581
  ---
515
582
 
516
- ## Frequently Asked Questions
583
+ ## Frequently Asked Questions (FAQ)
517
584
 
518
- **Q: Does PreOCR perform OCR?**
519
- A: No, PreOCR never performs OCR. It only analyzes files to determine if OCR is needed.
585
+ **Does PreOCR perform OCR?**
586
+ No. PreOCR is an **OCR detection** library—it analyzes files to determine if OCR is needed. It does not run OCR itself. Use it to decide whether to call Tesseract, Textract, or another OCR engine.
520
587
 
521
- **Q: How accurate is PreOCR?**
522
- A: PreOCR achieves 92-95% accuracy with the hybrid pipeline. Recent validation on 27 files achieved 100% accuracy.
588
+ **How accurate is PreOCR for PDF OCR detection?**
589
+ PreOCR achieves 92-95% accuracy with the hybrid pipeline. Validation on benchmark datasets reached 100% accuracy (10/10 PDFs correct).
523
590
 
524
- **Q: Can I use PreOCR with cloud OCR services?**
525
- A: Yes! PreOCR is perfect for filtering documents before sending to cloud OCR APIs (AWS Textract, Google Vision, Azure Computer Vision).
591
+ **Can I use PreOCR with AWS Textract, Google Vision, or Azure Document Intelligence?**
592
+ Yes. PreOCR is ideal for filtering documents before sending them to cloud OCR APIs. Skip OCR for digital PDFs to reduce API costs.
526
593
 
527
- **Q: Does PreOCR work offline?**
528
- A: Yes! PreOCR is CPU-only and works completely offline.
594
+ **Does PreOCR work offline?**
595
+ Yes. PreOCR is CPU-only and runs fully offline—no API keys or internet required.
529
596
 
530
- **Q: Can I customize decision thresholds?**
531
- A: Yes! Use the `Config` class or pass threshold parameters to `BatchProcessor`.
597
+ **How do I customize OCR detection thresholds?**
598
+ Use the `Config` class or pass threshold parameters to `BatchProcessor`. See [Configuration](#-configuration).
599
+
600
+ **Is there an HTTP/REST API?**
601
+ PreOCR is a Python library. For HTTP APIs, wrap it in FastAPI or Flask—see [preocr.io](https://preocr.io) for hosted options.
532
602
 
533
603
  ---
534
604
 
@@ -545,6 +615,10 @@ pip install -e ".[dev]"
545
615
  # Run tests
546
616
  pytest
547
617
 
618
+ # Run benchmarks (add PDFs to datasets/ for testing)
619
+ python scripts/benchmark_accuracy.py datasets -g scripts/ground_truth_data_source_formats.json --layout-aware --page-level
620
+ python scripts/benchmark_planner.py datasets
621
+
548
622
  # Run linting
549
623
  ruff check preocr/
550
624
  black --check preocr/
@@ -558,20 +632,20 @@ See [CHANGELOG.md](docs/CHANGELOG.md) for complete version history.
558
632
 
559
633
  ### Recent Updates
560
634
 
561
- **v1.1.0** - Invoice Intelligence & Advanced Extraction (Latest)
562
- - ✅ **Semantic Deduplication**: Intelligent line item deduplication for invoices
563
- - ✅ **Invoice Intelligence**: Semantic extraction with finance validation
564
- - ✅ **Text Merging**: Geometry-aware character-to-word merging improvements
565
- - ✅ **Table Stitching**: Merges fragmented tables across pages
566
- - ✅ **Finance Validation**: Validates invoice totals (subtotal + tax = total)
567
- - ✅ **Reversed Text Detection**: Detects and corrects rotated/mirrored text
568
- - ✅ **Footer Exclusion**: Removes footer from reading order
635
+ **v2.0.0** - Accuracy & Performance (Latest)
636
+ - ✅ **100% Accuracy**: Fixed false positives on digital PDFs; benchmark validation at 100%
637
+ - ✅ **OpenCV Skip Heuristics**: Skip OpenCV for clearly digital documents (configurable by file size, page count)
638
+ - ✅ **Digital/Table Bias Rules**: New config options to reduce false positives on product manuals, marketing PDFs
639
+ - ✅ **Unified Datasets**: Consolidated `benchmarkdata` and `data-source-formats` into `datasets/` directory
640
+ - ✅ **Page Count in Signals**: PDF analysis includes page count for smarter heuristics
641
+
642
+ **v1.1.0** - Invoice Intelligence & Advanced Extraction
643
+ - ✅ Semantic deduplication, invoice intelligence, text merging
644
+ - ✅ Table stitching, finance validation, reversed text detection
569
645
 
570
646
  **v1.0.0** - Structured Data Extraction
571
- - ✅ Comprehensive extraction system for PDFs, Office docs, and text files
572
- - ✅ Element classification (11+ types)
573
- - ✅ Table, form, and image extraction
574
- - ✅ Multiple output formats (Pydantic, JSON, Markdown)
647
+ - ✅ Comprehensive extraction for PDFs, Office docs, text files
648
+ - ✅ Element classification, table/form/image extraction
575
649
 
576
650
  ---
577
651
 
@@ -587,19 +661,19 @@ Apache License 2.0 - see [LICENSE](LICENSE) for details.
587
661
 
588
662
  ---
589
663
 
590
- ## 🔗 Links
664
+ ## Links & Resources
591
665
 
592
- - **🌐 Website**: [preocr.io](https://preocr.io)
593
- - **GitHub**: [https://github.com/yuvaraj3855/preocr](https://github.com/yuvaraj3855/preocr)
594
- - **PyPI**: [https://pypi.org/project/preocr](https://pypi.org/project/preocr)
595
- - **Issues**: [https://github.com/yuvaraj3855/preocr/issues](https://github.com/yuvaraj3855/preocr/issues)
666
+ - **Website**: [preocr.io](https://preocr.io) – Python OCR detection and document processing
667
+ - **PyPI**: [pypi.org/project/preocr](https://pypi.org/project/preocr) – Install with `pip install preocr`
668
+ - **GitHub**: [github.com/yuvaraj3855/preocr](https://github.com/yuvaraj3855/preocr) – Source code and issues
669
+ - **Documentation**: [CHANGELOG](docs/CHANGELOG.md) • [OCR Decision Model](docs/OCR_DECISION_MODEL.md) • [Contributing](docs/CONTRIBUTING.md)
596
670
 
597
671
  ---
598
672
 
599
673
  <div align="center">
600
674
 
601
- **Made with ❤️ for efficient document processing**
675
+ **PreOCR – Python OCR detection library. Skip OCR for digital PDFs. Save time and money.**
602
676
 
603
- [🌐 Website](https://preocr.io) | [⭐ Star on GitHub](https://github.com/yuvaraj3855/preocr) | [📖 Documentation](https://github.com/yuvaraj3855/preocr#readme) | [🐛 Report Issue](https://github.com/yuvaraj3855/preocr/issues)
677
+ [Website](https://preocr.io) · [GitHub](https://github.com/yuvaraj3855/preocr) · [PyPI](https://pypi.org/project/preocr) · [Report Issue](https://github.com/yuvaraj3855/preocr/issues)
604
678
 
605
679
  </div>
@@ -2,6 +2,7 @@
2
2
 
3
3
  from .core.detector import needs_ocr
4
4
  from .core.extractor import extract_native_data
5
+ from .planner import plan_ocr_for_document
5
6
  from .utils.batch import BatchProcessor, BatchResults
6
7
  from .version import __version__
7
8
 
@@ -20,6 +21,7 @@ __all__ = [
20
21
  # Main API
21
22
  "needs_ocr",
22
23
  "extract_native_data",
24
+ "plan_ocr_for_document",
23
25
  "__version__",
24
26
  "BatchProcessor",
25
27
  "BatchResults",