PyPI - preocr - Versions diffs - 0.1.0__py3-none-any.whl - Mend

preocr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

preocr/__init__.py +7 -0
preocr/constants.py +17 -0
preocr/decision.py +129 -0
preocr/detector.py +100 -0
preocr/filetype.py +90 -0
preocr/image_probe.py +139 -0
preocr/office_probe.py +161 -0
preocr/pdf_probe.py +101 -0
preocr/signals.py +52 -0
preocr/text_probe.py +110 -0
preocr/version.py +4 -0
preocr-0.1.0.dist-info/METADATA +256 -0
preocr-0.1.0.dist-info/RECORD +16 -0
preocr-0.1.0.dist-info/WHEEL +5 -0
preocr-0.1.0.dist-info/licenses/LICENSE +192 -0
preocr-0.1.0.dist-info/top_level.txt +1 -0

preocr/pdf_probe.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""PDF text extraction probe."""
+from pathlib import Path
+from typing import Dict, Optional
+try:
+    import pdfplumber
+except ImportError:
+    pdfplumber = None
+try:
+    import fitz  # PyMuPDF
+except ImportError:
+    fitz = None
+def extract_pdf_text(file_path: str) -> Dict[str, any]:
+    """
+    Extract text from PDF file.
+    Tries pdfplumber first (better text extraction), falls back to PyMuPDF.
+    Args:
+        file_path: Path to the PDF file
+    Returns:
+        Dictionary with keys:
+            - text_length: Number of characters in extracted text
+            - text: Extracted text (may be truncated for large files)
+            - page_count: Number of pages in PDF
+            - method: Extraction method used ("pdfplumber" or "pymupdf")
+    """
+    path = Path(file_path)
+    # Try pdfplumber first
+    if pdfplumber:
+        try:
+            return _extract_with_pdfplumber(path)
+        except Exception:
+            pass
+    # Fallback to PyMuPDF
+    if fitz:
+        try:
+            return _extract_with_pymupdf(path)
+        except Exception:
+            pass
+    # No extractors available or both failed
+    return {
+        "text_length": 0,
+        "text": "",
+        "page_count": 0,
+        "method": None,
+    }
+def _extract_with_pdfplumber(path: Path) -> Dict[str, any]:
+    """Extract text using pdfplumber."""
+    text_parts = []
+    page_count = 0
+    with pdfplumber.open(path) as pdf:
+        page_count = len(pdf.pages)
+        for page in pdf.pages:
+            page_text = page.extract_text()
+            if page_text:
+                text_parts.append(page_text)
+    full_text = "\n".join(text_parts)
+    return {
+        "text_length": len(full_text),
+        "text": full_text[:1000] if len(full_text) > 1000 else full_text,
+        "page_count": page_count,
+        "method": "pdfplumber",
+    }
+def _extract_with_pymupdf(path: Path) -> Dict[str, any]:
+    """Extract text using PyMuPDF."""
+    doc = fitz.open(path)
+    text_parts = []
+    page_count = len(doc)
+    for page_num in range(page_count):
+        page = doc[page_num]
+        page_text = page.get_text()
+        if page_text:
+            text_parts.append(page_text)
+    doc.close()
+    full_text = "\n".join(text_parts)
+    return {
+        "text_length": len(full_text),
+        "text": full_text[:1000] if len(full_text) > 1000 else full_text,
+        "page_count": page_count,
+        "method": "pymupdf",
+    }

preocr/signals.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""Signal collection and aggregation for OCR detection."""
+from pathlib import Path
+from typing import Any, Dict, Optional
+def collect_signals(
+    file_path: str,
+    file_info: Dict[str, str],
+    text_result: Optional[Dict[str, Any]] = None,
+    image_result: Optional[Dict[str, Any]] = None,
+) -> Dict[str, Any]:
+    """
+    Collect and aggregate all detection signals.
+    Args:
+        file_path: Path to the file being analyzed
+        file_info: File type information from filetype.detect_file_type()
+        text_result: Text extraction result (from text_probe, pdf_probe, or office_probe)
+        image_result: Image analysis result (from image_probe)
+    Returns:
+        Dictionary containing all collected signals:
+            - mime: MIME type
+            - extension: File extension
+            - is_binary: Whether file is binary
+            - text_length: Length of extracted text (0 if none)
+            - image_entropy: Image entropy (if image)
+            - file_size: File size in bytes
+            - has_text: Boolean indicating if meaningful text was found
+    """
+    path = Path(file_path)
+    file_size = path.stat().st_size if path.exists() else 0
+    text_length = 0
+    if text_result:
+        text_length = text_result.get("text_length", 0)
+    image_entropy = None
+    if image_result:
+        image_entropy = image_result.get("entropy")
+    return {
+        "mime": file_info.get("mime", "application/octet-stream"),
+        "extension": file_info.get("extension", ""),
+        "is_binary": file_info.get("is_binary", True),
+        "text_length": text_length,
+        "image_entropy": image_entropy,
+        "file_size": file_size,
+        "has_text": text_length > 0,
+    }

preocr/text_probe.py ADDED Viewed

@@ -0,0 +1,110 @@
+"""Text extraction for plain text files and HTML."""
+import codecs
+from pathlib import Path
+from typing import Dict, Optional
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    BeautifulSoup = None
+def extract_text_from_file(file_path: str, mime_type: str) -> Dict[str, any]:
+    """
+    Extract text from plain text files and HTML.
+    Args:
+        file_path: Path to the file
+        mime_type: MIME type of the file
+    Returns:
+        Dictionary with keys:
+            - text_length: Number of characters in extracted text
+            - text: Extracted text (may be truncated for large files)
+            - encoding: Detected encoding (for text files)
+    """
+    path = Path(file_path)
+    if mime_type.startswith("text/html") or mime_type == "application/xhtml+xml":
+        return _extract_html_text(path)
+    elif mime_type.startswith("text/"):
+        return _extract_plain_text(path)
+    else:
+        return {"text_length": 0, "text": "", "encoding": None}
+def _extract_plain_text(path: Path) -> Dict[str, any]:
+    """Extract text from plain text files."""
+    encodings = ["utf-8", "latin-1", "cp1252", "iso-8859-1"]
+    text = ""
+    encoding = None
+    for enc in encodings:
+        try:
+            with open(path, "r", encoding=enc) as f:
+                text = f.read()
+                encoding = enc
+                break
+        except (UnicodeDecodeError, UnicodeError):
+            continue
+    if not text:
+        # Last resort: try binary read and decode
+        try:
+            with open(path, "rb") as f:
+                raw = f.read()
+                text = raw.decode("utf-8", errors="ignore")
+                encoding = "utf-8"
+        except Exception:
+            pass
+    return {
+        "text_length": len(text),
+        "text": text[:1000] if len(text) > 1000 else text,  # Truncate for large files
+        "encoding": encoding,
+    }
+def _extract_html_text(path: Path) -> Dict[str, any]:
+    """Extract text from HTML files."""
+    if not BeautifulSoup:
+        # Fallback: basic HTML tag removal
+        return _extract_plain_text(path)
+    try:
+        with open(path, "r", encoding="utf-8", errors="ignore") as f:
+            content = f.read()
+        soup = BeautifulSoup(content, "html.parser")
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
+        text = soup.get_text(separator=" ", strip=True)
+        return {
+            "text_length": len(text),
+            "text": text[:1000] if len(text) > 1000 else text,
+            "encoding": "utf-8",
+        }
+    except Exception:
+        # Fallback to plain text extraction
+        return _extract_plain_text(path)
+def has_meaningful_text(text: str, min_chars: int = 50) -> bool:
+    """
+    Check if text has meaningful content.
+    Args:
+        text: Text to check
+        min_chars: Minimum number of characters to consider meaningful
+    Returns:
+        True if text has meaningful content, False otherwise
+    """
+    if not text:
+        return False
+    return len(text.strip()) >= min_chars

preocr/version.py ADDED Viewed

@@ -0,0 +1,4 @@
+"""Version information for preocr package."""
+# Package version string; it matches the "Version: 0.1.0" field in this
+# wheel's METADATA.
+__version__ = "0.1.0"

preocr-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,256 @@
+Metadata-Version: 2.4
+Name: preocr
+Version: 0.1.0
+Summary: A fast, CPU-only library that detects whether files need OCR processing before expensive OCR operations
+Author: PreOCR Contributors
+License: Apache-2.0
+Project-URL: Homepage, https://github.com/yourusername/preocr
+Project-URL: Documentation, https://github.com/yourusername/preocr#readme
+Project-URL: Repository, https://github.com/yourusername/preocr
+Project-URL: Issues, https://github.com/yourusername/preocr/issues
+Keywords: ocr,document,detection,preprocessing,file-analysis
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Markup
+Classifier: Topic :: Scientific/Engineering :: Image Recognition
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: python-magic>=0.4.27
+Requires-Dist: pdfplumber>=0.10.0
+Requires-Dist: python-docx>=1.1.0
+Requires-Dist: python-pptx>=0.6.23
+Requires-Dist: openpyxl>=3.1.0
+Requires-Dist: Pillow>=10.0.0
+Requires-Dist: beautifulsoup4>=4.12.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.4.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: ruff>=0.1.0; extra == "dev"
+Requires-Dist: mypy>=1.5.0; extra == "dev"
+Dynamic: license-file
+# PreOCR
+A fast, CPU-only, deterministic library that detects whether files need OCR processing before expensive OCR operations.
+## Overview
+PreOCR acts as a **universal document gatekeeper** that analyzes any file type and determines:
+> **"Is this file already machine-readable, or do I need OCR?"**
+Instead of running OCR just to find out whether OCR is needed, PreOCR analyzes the file in four inexpensive stages:
+1. **File type detection** (MIME types, extensions)
+2. **Text extraction probes** (PDF, Office docs, plain text)
+3. **Visual/binary analysis** (images, entropy)
+4. **Decision engine** (rule-based logic)
+## Features
+- ✅ **Fast**: CPU-only, no OCR required
+- ✅ **Deterministic**: Same input → same output
+- ✅ **OCR-free**: Never runs OCR to decide whether OCR is needed
+- ✅ **Extensible**: Easy to add new file type handlers
+- ✅ **Conservative**: When uncertain, defaults to "needs OCR"
+## Supported File Types
+- **PDFs**: Digital PDFs (no OCR) vs Scanned PDFs (needs OCR)
+- **Images**: PNG, JPG, TIFF, etc. (always need OCR)
+- **Office Documents**: DOCX, PPTX, XLSX (extracts text if available)
+- **Text Files**: TXT, CSV, HTML (no OCR needed)
+- **Structured Data**: JSON, XML (no OCR needed)
+- **Unknown Binaries**: Defaults to needing OCR (conservative)
+## Installation
+```bash
+pip install preocr
+```
+## Quick Start
+```python
+from preocr import needs_ocr
+# Check if a file needs OCR
+result = needs_ocr("document.pdf")
+if result["needs_ocr"]:
+    print(f"File needs OCR: {result['reason']}")
+    # Run your OCR here (e.g., MinerU)
+else:
+    print(f"File is already machine-readable: {result['reason']}")
+```
+## API Reference
+### `needs_ocr(file_path)`
+Main API function that determines if a file needs OCR.
+**Parameters:**
+- `file_path` (str or Path): Path to the file to analyze
+**Returns:**
+Dictionary with the following keys:
+- `needs_ocr` (bool): Whether OCR is needed
+- `file_type` (str): File type category ("pdf", "image", "office", "text", etc.)
+- `category` (str): "structured" (no OCR) or "unstructured" (needs OCR)
+- `confidence` (float): Confidence score (0.0-1.0)
+- `reason` (str): Human-readable reason for the decision
+- `signals` (dict): All collected detection signals (for debugging)
+**Example:**
+```python
+result = needs_ocr("document.pdf")
+print(result)
+# {
+#     "needs_ocr": False,
+#     "file_type": "pdf",
+#     "category": "structured",
+#     "confidence": 0.9,
+#     "reason": "digital PDF with 1234 characters of extractable text",
+#     "signals": {
+#         "mime": "application/pdf",
+#         "extension": "pdf",
+#         "text_length": 1234,
+#         "has_text": True,
+#         ...
+#     }
+# }
+```
+## Usage Examples
+### Basic Usage
+```python
+from preocr import needs_ocr
+result = needs_ocr("my_document.pdf")
+if result["needs_ocr"]:
+    print("This file needs OCR processing")
+    # Your OCR code here
+else:
+    print("This file is already machine-readable")
+    print(f"Reason: {result['reason']}")
+```
+### Batch Processing
+```python
+from pathlib import Path
+from preocr import needs_ocr
+files = Path("documents").glob("*.pdf")
+for file_path in files:
+    result = needs_ocr(file_path)
+    status = "NEEDS OCR" if result["needs_ocr"] else "READY"
+    print(f"{file_path.name}: {status} ({result['reason']})")
+```
+### Integration with MinerU
+```python
+from preocr import needs_ocr
+# Assuming you have MinerU OCR available
+# from mineru import ocr
+def process_document(file_path):
+    result = needs_ocr(file_path)
+    if result["needs_ocr"]:
+        # Only run expensive OCR if needed
+        # ocr_result = ocr(file_path)
+        print(f"Running OCR on {file_path}")
+    else:
+        # Use existing text extraction
+        print(f"Using existing text from {file_path}")
+```
+## Architecture
+```
+Any File
+  ↓
+File Type Detector
+  ↓
+Text Extractability Probe
+  ↓
+Visual / Binary Probe
+  ↓
+Decision Engine
+  ↓
+Result (needs_ocr: bool)
+```
+## Decision Logic
+PreOCR uses rule-based logic to make decisions:
+1. **Plain text formats** → NO OCR
+2. **Office docs with text** → NO OCR
+3. **PDFs with extractable text** → NO OCR
+4. **PDFs without text** → YES OCR (likely scanned)
+5. **Images** → YES OCR (always)
+6. **Unknown binaries** → YES OCR (conservative default)
+## Requirements
+- Python 3.9+
+- See `pyproject.toml` for full dependency list
+## Development
+```bash
+# Clone the repository
+git clone https://github.com/yourusername/preocr.git
+cd preocr
+# Install in development mode
+pip install -e ".[dev]"
+# Run tests
+pytest
+# Run with coverage
+pytest --cov=preocr --cov-report=html
+```
+## Contributing
+Contributions are welcome! Please feel free to submit a Pull Request.
+## License
+Apache License 2.0 - see [LICENSE](LICENSE) file for details.
+## Versioning
+PreOCR follows [Semantic Versioning](https://semver.org/):
+- **MAJOR**: Breaking API changes
+- **MINOR**: New features (backward-compatible)
+- **PATCH**: Bug fixes (backward-compatible)
+## Changelog
+See [CHANGELOG.md](CHANGELOG.md) for version history and changes.
+## Support
+- **Issues**: [GitHub Issues](https://github.com/yourusername/preocr/issues)
+- **Documentation**: [GitHub README](https://github.com/yourusername/preocr#readme)

preocr-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,16 @@
+preocr/__init__.py,sha256=qY1nuleiyM1J2mnCTdmUbjV78MQ6d-XpzztZnBIsPM8,195
+preocr/constants.py,sha256=TAjLZTNeT6La_4Ssf7CHYeOCay6ZO-2x1JLU38JmtC4,411
+preocr/decision.py,sha256=RGIZ_Jj1huqwr5Yy09Pk9VJNu79AJy4-9SozRqBvEuQ,3866
+preocr/detector.py,sha256=SdHAM3Qpcu7YhiCDmRAnpJNMOEbrCaQ10gRJmQmeE2Q,3530
+preocr/filetype.py,sha256=vngtWF2v6tkPZ0f5EgLlxf2jcJhPTmetNNp2gWseH9c,2756
+preocr/image_probe.py,sha256=6udIWR48V6sjIdb3FMsgUCMHdgeZmdV4JL9f6nn3xQA,3838
+preocr/office_probe.py,sha256=nSceMNOCSd2wc6idJkrLS6Q6A_zqqDmjZ-57EEMdnj0,4438
+preocr/pdf_probe.py,sha256=KcsmVQJ65Mp8WFOSzjJRnMsCTNYeWN_vN7zI38bzSBA,2494
+preocr/signals.py,sha256=UBoP0CUxgD4kW6l5k2NGFJd-50AVZE9IdFVz73qScMA,1741
+preocr/text_probe.py,sha256=xSE0NZIfOezHY8gCPO3ks0Z7594amBEGJc8BcKaYPfk,3177
+preocr/version.py,sha256=ceHpGxrQwgwp0y2ZecPUziMziTNpVP1BvKunP1JZAus,70
+preocr-0.1.0.dist-info/licenses/LICENSE,sha256=tSEUrFBMfq5_wYGUqNHIHsAT2avQLNusPHfhDbLZ1K8,10301
+preocr-0.1.0.dist-info/METADATA,sha256=eGgUYLc_AYpMgSgdlsyh5DTyTjVCE250thJ4V9vgKDQ,6910
+preocr-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+preocr-0.1.0.dist-info/top_level.txt,sha256=q3NK_rx1PuYHeeK3I5MnmBKXD7aG2ZwArJ1t2-R_cRw,7
+preocr-0.1.0.dist-info/RECORD,,

preocr-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (80.9.0)
+Root-Is-Purelib: true
+Tag: py3-none-any