preocr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
preocr/pdf_probe.py ADDED
@@ -0,0 +1,101 @@
1
+ """PDF text extraction probe."""
2
+
3
+ from pathlib import Path
4
+ from typing import Dict, Optional
5
+
6
+ try:
7
+ import pdfplumber
8
+ except ImportError:
9
+ pdfplumber = None
10
+
11
+ try:
12
+ import fitz # PyMuPDF
13
+ except ImportError:
14
+ fitz = None
15
+
16
+
17
def extract_pdf_text(file_path: str) -> Dict[str, any]:
    """
    Extract text from a PDF file.

    Tries pdfplumber first and falls back to PyMuPDF. A backend is skipped
    when its import failed at module load (its module-level name is falsy).
    A backend that raises is treated as a failed attempt: the error is
    logged at DEBUG level and the next backend is tried.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with keys:
            - text_length: Number of characters in extracted text
            - text: Extracted text (may be truncated for large files)
            - page_count: Number of pages in PDF
            - method: Extraction method used ("pdfplumber" or "pymupdf"),
              or None when no backend was available or every backend failed
    """
    # Function-scope import: the module header does not import logging.
    import logging

    logger = logging.getLogger(__name__)
    path = Path(file_path)

    # Ordered by preference: (label, imported module or None, extractor).
    backends = (
        ("pdfplumber", pdfplumber, _extract_with_pdfplumber),
        ("pymupdf", fitz, _extract_with_pymupdf),
    )

    for label, module, extractor in backends:
        if not module:
            continue
        try:
            return extractor(path)
        except Exception:
            # Deliberately best-effort: never propagate, but leave a trace so
            # a crashing backend can be told apart from a missing one.
            logger.debug(
                "%s could not extract text from %s", label, path, exc_info=True
            )

    # No extractors available or all of them failed
    return {
        "text_length": 0,
        "text": "",
        "page_count": 0,
        "method": None,
    }
56
+
57
+
58
def _extract_with_pdfplumber(path: Path) -> Dict[str, any]:
    """
    Extract text using pdfplumber.

    Opens the PDF, collects the text of every page that yields a non-empty
    result and joins those page texts with newlines.

    Args:
        path: Path to the PDF file

    Returns:
        Dictionary with text_length (length of the full joined text), text
        (at most the first 1000 characters), page_count and
        method ("pdfplumber").
    """
    with pdfplumber.open(path) as document:
        total_pages = len(document.pages)
        # Pages whose extraction result is falsy (None or "") are skipped.
        chunks = [
            chunk
            for chunk in (page.extract_text() for page in document.pages)
            if chunk
        ]

    combined = "\n".join(chunks)

    # Slicing is a no-op when the text is already within the 1000-char cap.
    preview = combined[:1000]

    return {
        "text_length": len(combined),
        "text": preview,
        "page_count": total_pages,
        "method": "pdfplumber",
    }
78
+
79
+
80
def _extract_with_pymupdf(path: Path) -> Dict[str, any]:
    """
    Extract text using PyMuPDF.

    Opens the document, collects the text of every page that yields a
    non-empty result and joins those page texts with newlines. The document
    is closed even when page extraction raises.

    Args:
        path: Path to the PDF file

    Returns:
        Dictionary with text_length (length of the full joined text), text
        (at most the first 1000 characters), page_count and
        method ("pymupdf").
    """
    doc = fitz.open(path)
    try:
        page_count = len(doc)
        # Pages whose extraction result is empty are skipped.
        text_parts = [
            page_text
            for page_text in (page.get_text() for page in doc)
            if page_text
        ]
    finally:
        # Release the handle on every path, including extraction errors.
        doc.close()

    full_text = "\n".join(text_parts)

    return {
        "text_length": len(full_text),
        # Slicing is a no-op when the text is already within the cap.
        "text": full_text[:1000],
        "page_count": page_count,
        "method": "pymupdf",
    }
101
+
preocr/signals.py ADDED
@@ -0,0 +1,52 @@
1
+ """Signal collection and aggregation for OCR detection."""
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Dict, Optional
5
+
6
+
7
def collect_signals(
    file_path: str,
    file_info: Dict[str, str],
    text_result: Optional[Dict[str, Any]] = None,
    image_result: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Collect and aggregate all detection signals.

    Args:
        file_path: Path to the file being analyzed
        file_info: File type information; the keys "mime", "extension" and
            "is_binary" are read, each with a fallback default
        text_result: Text extraction result; only "text_length" is read
        image_result: Image analysis result; only "entropy" is read

    Returns:
        Dictionary containing all collected signals:
            - mime: MIME type ("application/octet-stream" if missing)
            - extension: File extension ("" if missing)
            - is_binary: Whether file is binary (True if missing)
            - text_length: Length of extracted text (0 if none)
            - image_entropy: Image entropy (None if no image result)
            - file_size: File size in bytes (0 if the file cannot be stat'ed)
            - has_text: True when text_length is greater than zero
    """
    # Ask for the size directly instead of exists()-then-stat(): the file can
    # vanish between the two calls, and an unreadable path should also be 0.
    try:
        file_size = Path(file_path).stat().st_size
    except (OSError, ValueError):
        file_size = 0

    # "or 0" also covers a probe that reported text_length as None, which
    # would otherwise break the "> 0" comparison below.
    text_length = (text_result.get("text_length") or 0) if text_result else 0

    image_entropy = image_result.get("entropy") if image_result else None

    return {
        "mime": file_info.get("mime", "application/octet-stream"),
        "extension": file_info.get("extension", ""),
        "is_binary": file_info.get("is_binary", True),
        "text_length": text_length,
        "image_entropy": image_entropy,
        "file_size": file_size,
        "has_text": text_length > 0,
    }
52
+
preocr/text_probe.py ADDED
@@ -0,0 +1,110 @@
1
+ """Text extraction for plain text files and HTML."""
2
+
3
+ import codecs
4
+ from pathlib import Path
5
+ from typing import Dict, Optional
6
+
7
+ try:
8
+ from bs4 import BeautifulSoup
9
+ except ImportError:
10
+ BeautifulSoup = None
11
+
12
+
13
def extract_text_from_file(file_path: str, mime_type: str) -> Dict[str, any]:
    """
    Extract text from plain text files and HTML.

    HTML ("text/html..." or "application/xhtml+xml") goes to the HTML
    extractor, any other "text/..." type goes to the plain-text extractor,
    and every other MIME type yields an empty result without reading the file.

    Args:
        file_path: Path to the file
        mime_type: MIME type of the file

    Returns:
        Dictionary with keys:
            - text_length: Number of characters in extracted text
            - text: Extracted text (may be truncated for large files)
            - encoding: Encoding used to decode the file (None when the
              MIME type is not handled here)
    """
    target = Path(file_path)

    is_html = mime_type.startswith("text/html") or mime_type == "application/xhtml+xml"
    if is_html:
        return _extract_html_text(target)

    # Checked after the HTML test so "text/html" never lands here.
    if mime_type.startswith("text/"):
        return _extract_plain_text(target)

    return {"text_length": 0, "text": "", "encoding": None}
35
+
36
+
37
def _extract_plain_text(path: Path) -> Dict[str, any]:
    """
    Extract text from plain text files.

    The file is opened in text mode with each candidate encoding in turn,
    and the first one that decodes without error wins.

    Args:
        path: Path to the text file

    Returns:
        Dictionary with keys:
            - text_length: Number of characters in the decoded text
            - text: At most the first 1000 characters of the decoded text
            - encoding: Name of the encoding that decoded the file

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # Order matters. cp1252 rejects a handful of undefined bytes, so it must
    # come before latin-1, which maps every byte and therefore never fails.
    # Anything listed after latin-1 would be unreachable, so it goes last and
    # no further "last resort" decode is needed.
    candidates = ("utf-8", "cp1252", "latin-1")
    text = ""
    encoding = None

    for candidate in candidates:
        try:
            with open(path, "r", encoding=candidate) as handle:
                text = handle.read()
        except UnicodeError:  # includes UnicodeDecodeError
            continue
        encoding = candidate
        break

    return {
        "text_length": len(text),
        "text": text[:1000],  # Truncate for large files
        "encoding": encoding,
    }
67
+
68
+
69
def _extract_html_text(path: Path) -> Dict[str, any]:
    """
    Extract text from HTML files.

    With BeautifulSoup available, the file is read as UTF-8 (undecodable
    bytes ignored), <script> and <style> elements are removed, and the
    remaining text is joined with single spaces. Without BeautifulSoup, or
    if anything in that path raises, the file is handed to the plain-text
    extractor.

    Args:
        path: Path to the HTML file

    Returns:
        Dictionary with text_length, text (at most the first 1000
        characters) and encoding.
    """
    if not BeautifulSoup:
        # No HTML parser available: the raw markup is treated as plain text.
        # Note that no tags are stripped on this path.
        return _extract_plain_text(path)

    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            markup = handle.read()

        tree = BeautifulSoup(markup, "html.parser")

        # Drop elements whose content is code, not document text.
        for node in tree(["script", "style"]):
            node.decompose()

        visible = tree.get_text(separator=" ", strip=True)
    except Exception:
        # Reading or parsing failed: fall back to plain text extraction
        return _extract_plain_text(path)

    return {
        "text_length": len(visible),
        "text": visible[:1000],
        "encoding": "utf-8",
    }
94
+
95
+
96
def has_meaningful_text(text: str, min_chars: int = 50) -> bool:
    """
    Decide whether text carries enough content to count as meaningful.

    Args:
        text: Text to check
        min_chars: Minimum number of characters, after stripping leading and
            trailing whitespace, for the text to count as meaningful

    Returns:
        False for empty or missing text; otherwise True when the stripped
        text is at least min_chars long
    """
    # Empty/None short-circuits to False before any stripping is attempted.
    return bool(text) and len(text.strip()) >= min_chars
110
+
preocr/version.py ADDED
@@ -0,0 +1,4 @@
1
+ """Version information for preocr package."""
2
+
3
+ __version__ = "0.1.0"
4
+
@@ -0,0 +1,256 @@
1
+ Metadata-Version: 2.4
2
+ Name: preocr
3
+ Version: 0.1.0
4
+ Summary: A fast, CPU-only library that detects whether files need OCR processing before expensive OCR operations
5
+ Author: PreOCR Contributors
6
+ License: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/yourusername/preocr
8
+ Project-URL: Documentation, https://github.com/yourusername/preocr#readme
9
+ Project-URL: Repository, https://github.com/yourusername/preocr
10
+ Project-URL: Issues, https://github.com/yourusername/preocr/issues
11
+ Keywords: ocr,document,detection,preprocessing,file-analysis
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: Apache Software License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing :: Markup
22
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
23
+ Requires-Python: >=3.9
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: python-magic>=0.4.27
27
+ Requires-Dist: pdfplumber>=0.10.0
28
+ Requires-Dist: python-docx>=1.1.0
29
+ Requires-Dist: python-pptx>=0.6.23
30
+ Requires-Dist: openpyxl>=3.1.0
31
+ Requires-Dist: Pillow>=10.0.0
32
+ Requires-Dist: beautifulsoup4>=4.12.0
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=7.4.0; extra == "dev"
35
+ Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
36
+ Requires-Dist: black>=23.0.0; extra == "dev"
37
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
38
+ Requires-Dist: mypy>=1.5.0; extra == "dev"
39
+ Dynamic: license-file
40
+
41
+ # PreOCR
42
+
43
+ A fast, CPU-only, deterministic library that detects whether files need OCR processing before expensive OCR operations.
44
+
45
+ ## Overview
46
+
47
+ PreOCR acts as a **universal document gatekeeper** that analyzes any file type and determines:
48
+
49
+ > **"Is this file already machine-readable, or do I need OCR?"**
50
+
51
+ Instead of performing OCR to detect OCR, PreOCR uses intelligent file analysis:
52
+ 1. **File type detection** (MIME types, extensions)
53
+ 2. **Text extraction probes** (PDF, Office docs, plain text)
54
+ 3. **Visual/binary analysis** (images, entropy)
55
+ 4. **Decision engine** (rule-based logic)
56
+
57
+ ## Features
58
+
59
+ - ✅ **Fast**: CPU-only, no OCR required
60
+ - ✅ **Deterministic**: Same input → same output
61
+ - ✅ **OCR-free**: Never performs OCR to detect OCR
62
+ - ✅ **Extensible**: Easy to add new file type handlers
63
+ - ✅ **Conservative**: When uncertain, defaults to "needs OCR"
64
+
65
+ ## Supported File Types
66
+
67
+ - **PDFs**: Digital PDFs (no OCR) vs Scanned PDFs (needs OCR)
68
+ - **Images**: PNG, JPG, TIFF, etc. (always need OCR)
69
+ - **Office Documents**: DOCX, PPTX, XLSX (extracts text if available)
70
+ - **Text Files**: TXT, CSV, HTML (no OCR needed)
71
+ - **Structured Data**: JSON, XML (no OCR needed)
72
+ - **Unknown Binaries**: Defaults to needing OCR (conservative)
73
+
74
+ ## Installation
75
+
76
+ ```bash
77
+ pip install preocr
78
+ ```
79
+
80
+ ## Quick Start
81
+
82
+ ```python
83
+ from preocr import needs_ocr
84
+
85
+ # Check if a file needs OCR
86
+ result = needs_ocr("document.pdf")
87
+
88
+ if result["needs_ocr"]:
89
+ print(f"File needs OCR: {result['reason']}")
90
+ # Run your OCR here (e.g., MinerU)
91
+ else:
92
+ print(f"File is already machine-readable: {result['reason']}")
93
+ ```
94
+
95
+ ## API Reference
96
+
97
+ ### `needs_ocr(file_path)`
98
+
99
+ Main API function that determines if a file needs OCR.
100
+
101
+ **Parameters:**
102
+ - `file_path` (str or Path): Path to the file to analyze
103
+
104
+ **Returns:**
105
+ Dictionary with the following keys:
106
+ - `needs_ocr` (bool): Whether OCR is needed
107
+ - `file_type` (str): File type category ("pdf", "image", "office", "text", etc.)
108
+ - `category` (str): "structured" (no OCR) or "unstructured" (needs OCR)
109
+ - `confidence` (float): Confidence score (0.0-1.0)
110
+ - `reason` (str): Human-readable reason for the decision
111
+ - `signals` (dict): All collected detection signals (for debugging)
112
+
113
+ **Example:**
114
+
115
+ ```python
116
+ result = needs_ocr("document.pdf")
117
+ print(result)
118
+ # {
119
+ # "needs_ocr": False,
120
+ # "file_type": "pdf",
121
+ # "category": "structured",
122
+ # "confidence": 0.9,
123
+ # "reason": "digital PDF with 1234 characters of extractable text",
124
+ # "signals": {
125
+ # "mime": "application/pdf",
126
+ # "extension": "pdf",
127
+ # "text_length": 1234,
128
+ # "has_text": True,
129
+ # ...
130
+ # }
131
+ # }
132
+ ```
133
+
134
+ ## Usage Examples
135
+
136
+ ### Basic Usage
137
+
138
+ ```python
139
+ from preocr import needs_ocr
140
+
141
+ result = needs_ocr("my_document.pdf")
142
+
143
+ if result["needs_ocr"]:
144
+ print("This file needs OCR processing")
145
+ # Your OCR code here
146
+ else:
147
+ print("This file is already machine-readable")
148
+ print(f"Reason: {result['reason']}")
149
+ ```
150
+
151
+ ### Batch Processing
152
+
153
+ ```python
154
+ from pathlib import Path
155
+ from preocr import needs_ocr
156
+
157
+ files = Path("documents").glob("*.pdf")
158
+
159
+ for file_path in files:
160
+ result = needs_ocr(file_path)
161
+ status = "NEEDS OCR" if result["needs_ocr"] else "READY"
162
+ print(f"{file_path.name}: {status} ({result['reason']})")
163
+ ```
164
+
165
+ ### Integration with MinerU
166
+
167
+ ```python
168
+ from preocr import needs_ocr
169
+ # Assuming you have MinerU OCR available
170
+ # from mineru import ocr
171
+
172
+ def process_document(file_path):
173
+ result = needs_ocr(file_path)
174
+
175
+ if result["needs_ocr"]:
176
+ # Only run expensive OCR if needed
177
+ # ocr_result = ocr(file_path)
178
+ print(f"Running OCR on {file_path}")
179
+ else:
180
+ # Use existing text extraction
181
+ print(f"Using existing text from {file_path}")
182
+ ```
183
+
184
+ ## Architecture
185
+
186
+ ```
187
+ Any File
188
+    ↓
189
+ File Type Detector
190
+    ↓
191
+ Text Extractability Probe
192
+    ↓
193
+ Visual / Binary Probe
194
+    ↓
195
+ Decision Engine
196
+    ↓
197
+ Result (needs_ocr: bool)
198
+ ```
199
+
200
+ ## Decision Logic
201
+
202
+ PreOCR uses rule-based logic to make decisions:
203
+
204
+ 1. **Plain text formats** → NO OCR
205
+ 2. **Office docs with text** → NO OCR
206
+ 3. **PDFs with extractable text** → NO OCR
207
+ 4. **PDFs without text** → YES OCR (likely scanned)
208
+ 5. **Images** → YES OCR (always)
209
+ 6. **Unknown binaries** → YES OCR (conservative default)
210
+
211
+ ## Requirements
212
+
213
+ - Python 3.9+
214
+ - See `pyproject.toml` for the full dependency list
215
+
216
+ ## Development
217
+
218
+ ```bash
219
+ # Clone the repository
220
+ git clone https://github.com/yourusername/preocr.git
221
+ cd preocr
222
+
223
+ # Install in development mode
224
+ pip install -e ".[dev]"
225
+
226
+ # Run tests
227
+ pytest
228
+
229
+ # Run with coverage
230
+ pytest --cov=preocr --cov-report=html
231
+ ```
232
+
233
+ ## Contributing
234
+
235
+ Contributions are welcome! Please feel free to submit a Pull Request.
236
+
237
+ ## License
238
+
239
+ Apache License 2.0 - see [LICENSE](LICENSE) file for details.
240
+
241
+ ## Versioning
242
+
243
+ PreOCR follows [Semantic Versioning](https://semver.org/):
244
+ - **MAJOR**: Breaking API changes
245
+ - **MINOR**: New features (backward-compatible)
246
+ - **PATCH**: Bug fixes (backward-compatible)
247
+
248
+ ## Changelog
249
+
250
+ See [CHANGELOG.md](CHANGELOG.md) for version history and changes.
251
+
252
+ ## Support
253
+
254
+ - **Issues**: [GitHub Issues](https://github.com/yourusername/preocr/issues)
255
+ - **Documentation**: [GitHub README](https://github.com/yourusername/preocr#readme)
256
+
@@ -0,0 +1,16 @@
1
+ preocr/__init__.py,sha256=qY1nuleiyM1J2mnCTdmUbjV78MQ6d-XpzztZnBIsPM8,195
2
+ preocr/constants.py,sha256=TAjLZTNeT6La_4Ssf7CHYeOCay6ZO-2x1JLU38JmtC4,411
3
+ preocr/decision.py,sha256=RGIZ_Jj1huqwr5Yy09Pk9VJNu79AJy4-9SozRqBvEuQ,3866
4
+ preocr/detector.py,sha256=SdHAM3Qpcu7YhiCDmRAnpJNMOEbrCaQ10gRJmQmeE2Q,3530
5
+ preocr/filetype.py,sha256=vngtWF2v6tkPZ0f5EgLlxf2jcJhPTmetNNp2gWseH9c,2756
6
+ preocr/image_probe.py,sha256=6udIWR48V6sjIdb3FMsgUCMHdgeZmdV4JL9f6nn3xQA,3838
7
+ preocr/office_probe.py,sha256=nSceMNOCSd2wc6idJkrLS6Q6A_zqqDmjZ-57EEMdnj0,4438
8
+ preocr/pdf_probe.py,sha256=KcsmVQJ65Mp8WFOSzjJRnMsCTNYeWN_vN7zI38bzSBA,2494
9
+ preocr/signals.py,sha256=UBoP0CUxgD4kW6l5k2NGFJd-50AVZE9IdFVz73qScMA,1741
10
+ preocr/text_probe.py,sha256=xSE0NZIfOezHY8gCPO3ks0Z7594amBEGJc8BcKaYPfk,3177
11
+ preocr/version.py,sha256=ceHpGxrQwgwp0y2ZecPUziMziTNpVP1BvKunP1JZAus,70
12
+ preocr-0.1.0.dist-info/licenses/LICENSE,sha256=tSEUrFBMfq5_wYGUqNHIHsAT2avQLNusPHfhDbLZ1K8,10301
13
+ preocr-0.1.0.dist-info/METADATA,sha256=eGgUYLc_AYpMgSgdlsyh5DTyTjVCE250thJ4V9vgKDQ,6910
14
+ preocr-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
15
+ preocr-0.1.0.dist-info/top_level.txt,sha256=q3NK_rx1PuYHeeK3I5MnmBKXD7aG2ZwArJ1t2-R_cRw,7
16
+ preocr-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+