preocr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
preocr/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """PreOCR - A fast, CPU-only library that detects whether files need OCR processing."""
2
+
3
+ from .version import __version__
4
+ from .detector import needs_ocr
5
+
6
+ __all__ = ["needs_ocr", "__version__"]
7
+
preocr/constants.py ADDED
@@ -0,0 +1,17 @@
1
+ """Constants and configuration for preocr."""
2
+
3
# Minimum number of extracted characters for a file to count as having
# meaningful text; the decision engine compares PDF and HTML text lengths
# against this value.
MIN_TEXT_LENGTH = 50

# Minimum number of extracted characters for an office document
# (DOCX/PPTX/XLSX) to skip OCR.
MIN_OFFICE_TEXT_LENGTH = 100

# File type categories: "structured" is reported when no OCR is needed,
# "unstructured" when OCR is needed.
CATEGORY_STRUCTURED = "structured"
CATEGORY_UNSTRUCTURED = "unstructured"

# Confidence scores (0.0-1.0) that the decision engine attaches to its result.
HIGH_CONFIDENCE = 0.9
MEDIUM_CONFIDENCE = 0.7
LOW_CONFIDENCE = 0.5
17
+
preocr/decision.py ADDED
@@ -0,0 +1,129 @@
1
+ """Decision engine to determine if OCR is needed."""
2
+
3
+ from typing import Dict, Tuple
4
+
5
+ from .constants import (
6
+ CATEGORY_STRUCTURED,
7
+ CATEGORY_UNSTRUCTURED,
8
+ HIGH_CONFIDENCE,
9
+ LOW_CONFIDENCE,
10
+ MEDIUM_CONFIDENCE,
11
+ MIN_OFFICE_TEXT_LENGTH,
12
+ MIN_TEXT_LENGTH,
13
+ )
14
+
15
+
16
def decide(signals: Dict[str, object]) -> Tuple[bool, str, float, str]:
    """
    Decide if a file needs OCR based on collected signals.

    The rules are evaluated in order and the first match wins, so rule order
    is part of the behavior (for example, any ``text/*`` MIME type is settled
    by Rule 1 before the HTML rule is reached).

    Args:
        signals: Dictionary of signals from signals.collect_signals(). The
            keys read here are "mime", "text_length", "extension" and
            "is_binary"; missing keys fall back to "", 0, "" and True.

    Returns:
        Tuple of:
        - needs_ocr: Boolean indicating if OCR is needed
        - reason: Human-readable reason for the decision
        - confidence: Confidence score (0.0-1.0)
        - category: "structured" or "unstructured"
    """
    # Normalize defensively: dict.get() only applies its default when the key
    # is absent, so a key that is present but None would otherwise break the
    # string and numeric comparisons below.
    # NOTE(review): whether collect_signals() can emit None is not visible
    # from this module - confirm against the producer.
    mime = str(signals.get("mime") or "")
    extension = str(signals.get("extension") or "")
    raw_length = signals.get("text_length")
    text_length = raw_length if isinstance(raw_length, (int, float)) else 0
    is_binary = signals.get("is_binary", True)

    # Rule 1: Plain text formats - NO OCR
    if mime.startswith("text/"):
        return (
            False,
            "text file with extractable content",
            HIGH_CONFIDENCE,
            CATEGORY_STRUCTURED,
        )

    # Rule 2: Office documents - OCR only when too little text was extracted
    if "officedocument" in mime or extension in ("docx", "pptx", "xlsx"):
        if text_length >= MIN_OFFICE_TEXT_LENGTH:
            return (
                False,
                f"office document with {text_length} characters of text",
                HIGH_CONFIDENCE,
                CATEGORY_STRUCTURED,
            )
        return (
            True,
            f"office document with insufficient text ({text_length} chars)",
            MEDIUM_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Rule 3: Images - YES OCR (always)
    if mime.startswith("image/"):
        return (
            True,
            "image file (no text extraction possible)",
            HIGH_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Rule 4: PDFs - digital PDFs carry a text layer, scanned ones do not
    if mime == "application/pdf" or extension == "pdf":
        if text_length >= MIN_TEXT_LENGTH:
            return (
                False,
                f"digital PDF with {text_length} characters of extractable text",
                HIGH_CONFIDENCE,
                CATEGORY_STRUCTURED,
            )
        return (
            True,
            f"PDF without extractable text ({text_length} chars) - likely scanned",
            MEDIUM_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Rule 5: JSON/XML - NO OCR
    if mime in ("application/json", "application/xml") or extension in ("json", "xml"):
        return (
            False,
            "structured data file (JSON/XML)",
            HIGH_CONFIDENCE,
            CATEGORY_STRUCTURED,
        )

    # Rule 6: HTML - NO OCR when text can be extracted.
    # "text/html" is not tested here because Rule 1 already returned for every
    # "text/*" MIME type; only XHTML and extension-identified HTML reach this.
    if mime == "application/xhtml+xml" or extension in ("html", "htm"):
        if text_length >= MIN_TEXT_LENGTH:
            return (
                False,
                f"HTML file with {text_length} characters of text",
                HIGH_CONFIDENCE,
                CATEGORY_STRUCTURED,
            )
        return (
            True,
            "HTML file with minimal content",
            LOW_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Rule 7: Unknown binaries - YES OCR (conservative default)
    if is_binary:
        return (
            True,
            "unknown binary file type",
            LOW_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Fallback: default to needing OCR
    return (
        True,
        "unrecognized file type",
        LOW_CONFIDENCE,
        CATEGORY_UNSTRUCTURED,
    )
129
+
preocr/detector.py ADDED
@@ -0,0 +1,100 @@
1
+ """Main API for OCR detection."""
2
+
3
+ from pathlib import Path
4
+ from typing import Any, Dict, Union
5
+
6
+ from . import decision
7
+ from . import filetype
8
+ from . import image_probe
9
+ from . import office_probe
10
+ from . import pdf_probe
11
+ from . import signals
12
+ from . import text_probe
13
+
14
+
15
def needs_ocr(file_path: Union[str, Path]) -> Dict[str, Any]:
    """
    Analyze a file and report whether it should be sent through OCR.

    This is the package's main entry point. It detects the file type, runs
    the probe that matches that type (PDF, office, text/HTML or image),
    gathers the resulting signals and hands them to the decision engine.

    Args:
        file_path: Path to the file to analyze (string or Path object)

    Returns:
        Dictionary with keys:
        - needs_ocr: Boolean indicating if OCR is needed
        - file_type: Detected file type category (e.g., "image", "pdf", "office")
        - category: "structured" (no OCR) or "unstructured" (needs OCR)
        - confidence: Confidence score (0.0-1.0)
        - reason: Human-readable reason for the decision
        - signals: Dictionary of all collected signals (for debugging)

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.

    Example:
        >>> result = needs_ocr("document.pdf")
        >>> if result["needs_ocr"]:
        ...     run_ocr("document.pdf")
    """
    target = Path(file_path)
    if not target.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Every probe takes a plain string path, so convert once.
    location = str(target)

    # Step 1: identify the file.
    file_info = filetype.detect_file_type(location)
    mime = file_info["mime"]

    # Step 2: run at most one probe, chosen by the detected type.
    text_result = None
    image_result = None
    if mime == "application/pdf":
        text_result = pdf_probe.extract_pdf_text(location)
    elif "officedocument" in mime or file_info["extension"] in ("docx", "pptx", "xlsx"):
        text_result = office_probe.extract_office_text(location, mime)
    elif mime.startswith("text/") or mime == "application/xhtml+xml":
        # "text/html" is already covered by the "text/" prefix test.
        text_result = text_probe.extract_text_from_file(location, mime)
    elif mime.startswith("image/"):
        image_result = image_probe.analyze_image(location)

    # Step 3: merge everything known about the file into one signal dict.
    collected_signals = signals.collect_signals(
        location, file_info, text_result, image_result
    )

    # Step 4: let the decision engine rule on the signals.
    verdict = decision.decide(collected_signals)
    needs_ocr_flag, reason, confidence, category = verdict

    # Step 5: assemble the public result, adding a user-friendly type label.
    return {
        "needs_ocr": needs_ocr_flag,
        "file_type": _get_file_type_category(mime, file_info["extension"]),
        "category": category,
        "confidence": confidence,
        "reason": reason,
        "signals": collected_signals,
    }
84
+
85
+
86
+ def _get_file_type_category(mime: str, extension: str) -> str:
87
+ """Get a user-friendly file type category."""
88
+ if mime.startswith("image/"):
89
+ return "image"
90
+ elif mime == "application/pdf" or extension == "pdf":
91
+ return "pdf"
92
+ elif "officedocument" in mime or extension in ["docx", "pptx", "xlsx", "doc", "ppt", "xls"]:
93
+ return "office"
94
+ elif mime.startswith("text/") or extension in ["txt", "csv", "html", "htm"]:
95
+ return "text"
96
+ elif mime in ["application/json", "application/xml"] or extension in ["json", "xml"]:
97
+ return "structured"
98
+ else:
99
+ return "unknown"
100
+
preocr/filetype.py ADDED
@@ -0,0 +1,90 @@
1
+ """File type detection using MIME types and extensions."""
2
+
3
+ import mimetypes
4
+ from pathlib import Path
5
+ from typing import Dict, Optional
6
+
7
+ try:
8
+ import magic
9
+ except ImportError:
10
+ magic = None
11
+
12
+
13
def detect_file_type(file_path: str) -> Dict[str, str]:
    """
    Detect file type using MIME detection and extension fallback.

    Detection is attempted in order: content sniffing through the optional
    ``magic`` module, the standard-library ``mimetypes`` guess, the built-in
    extension table, and finally the generic "application/octet-stream".

    Args:
        file_path: Path to the file to analyze

    Returns:
        Dictionary with keys:
        - mime: MIME type string (e.g., "application/pdf")
        - extension: Lower-cased file extension without dot (e.g., "pdf")
        - is_binary: Boolean indicating if file is binary (True for every
          type other than "text/*", "application/json" and "application/xml")

        NOTE: ``is_binary`` is a bool even though the return annotation says
        ``Dict[str, str]``; the annotation is left untouched so the signature
        stays exactly as callers know it.
    """
    path = Path(file_path)
    extension = path.suffix.lower().lstrip(".")

    # Try the magic module first (content-based, more reliable).
    mime_type = None
    if magic:
        # Resolve the library-specific exception up front. Referencing
        # ``magic.MagicException`` directly inside the ``except`` clause would
        # itself raise AttributeError, in the middle of error handling, if the
        # installed module does not define it.
        magic_error = getattr(magic, "MagicException", OSError)
        try:
            mime_type = magic.from_file(str(path), mime=True)
        except (OSError, AttributeError, magic_error):
            # Best effort: an unreadable file, or a ``magic`` module without
            # the expected ``from_file`` API (presumably a different package
            # sharing the module name - verify), falls through to the guesses
            # below.
            mime_type = None

    # Remaining fallbacks, first non-empty answer wins.
    mime_type = (
        mime_type
        or mimetypes.guess_type(str(path))[0]
        or _guess_mime_from_extension(extension)
        or "application/octet-stream"
    )

    # Determine if binary (non-text types)
    is_text = mime_type.startswith("text/") or mime_type in (
        "application/json",
        "application/xml",
    )

    return {
        "mime": mime_type,
        "extension": extension,
        "is_binary": not is_text,
    }
61
+
62
+
63
+ def _guess_mime_from_extension(extension: str) -> Optional[str]:
64
+ """Guess MIME type from file extension."""
65
+ extension_map = {
66
+ "pdf": "application/pdf",
67
+ "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
68
+ "doc": "application/msword",
69
+ "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
70
+ "ppt": "application/vnd.ms-powerpoint",
71
+ "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
72
+ "xls": "application/vnd.ms-excel",
73
+ "png": "image/png",
74
+ "jpg": "image/jpeg",
75
+ "jpeg": "image/jpeg",
76
+ "gif": "image/gif",
77
+ "tiff": "image/tiff",
78
+ "tif": "image/tiff",
79
+ "bmp": "image/bmp",
80
+ "webp": "image/webp",
81
+ "txt": "text/plain",
82
+ "csv": "text/csv",
83
+ "html": "text/html",
84
+ "htm": "text/html",
85
+ "json": "application/json",
86
+ "xml": "application/xml",
87
+ "eml": "message/rfc822",
88
+ }
89
+ return extension_map.get(extension.lower())
90
+
preocr/image_probe.py ADDED
@@ -0,0 +1,139 @@
1
+ """Image analysis and entropy calculation."""
2
+
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Any, Dict, Optional
5
+
6
+ if TYPE_CHECKING:
7
+ from PIL import Image as PILImage
8
+
9
+ try:
10
+ from PIL import Image
11
+ import numpy as np
12
+ except ImportError:
13
+ Image = None
14
+ np = None
15
+
16
+
17
def is_image_file(mime_type: str) -> bool:
    """
    Report whether a MIME type belongs to the ``image/`` family.

    Args:
        mime_type: MIME type string

    Returns:
        True if the string starts with "image/", False otherwise
    """
    image_prefix = "image/"
    return mime_type.startswith(image_prefix)
28
+
29
+
30
def analyze_image(file_path: str) -> Dict[str, Any]:
    """
    Analyze image file and calculate entropy.

    The analysis is best-effort: when Pillow is not installed, or the file
    cannot be opened or processed, every measurement is reported as None
    instead of raising.

    Args:
        file_path: Path to the image file

    Returns:
        Dictionary with keys:
        - entropy: Entropy of the grayscale version of the image
          (0-8, higher = more complex), or None if unavailable
        - width: Image width in pixels, or None if unavailable
        - height: Image height in pixels, or None if unavailable
        - mode: Mode of the image as opened (RGB, L, etc.), or None
        - is_image: Always True
    """
    # Single definition of the "could not measure" answer used by both
    # failure paths below.
    unavailable = {
        "entropy": None,
        "width": None,
        "height": None,
        "mode": None,
        "is_image": True,
    }

    if not Image:
        return unavailable

    path = Path(file_path)

    try:
        with Image.open(path) as img:
            # Entropy is computed on a grayscale copy; the reported mode and
            # size are those of the image as it was opened.
            entropy = _calculate_entropy(img.convert("L"))

            return {
                "entropy": entropy,
                "width": img.width,
                "height": img.height,
                "mode": img.mode,
                "is_image": True,
            }
    except Exception:
        # Deliberately broad: a corrupt or unsupported image must not abort
        # the overall analysis.
        return unavailable
79
+
80
+
81
+ def _calculate_entropy(image: Any) -> float:
82
+ """
83
+ Calculate entropy of an image.
84
+
85
+ Entropy measures the randomness/complexity of pixel values.
86
+ Low entropy (0-4): Simple images, likely scanned text
87
+ High entropy (4-8): Complex images, photos
88
+
89
+ Args:
90
+ image: PIL Image object (should be grayscale)
91
+
92
+ Returns:
93
+ Entropy value between 0 and 8
94
+ """
95
+ if not np:
96
+ # Fallback: simple histogram-based entropy
97
+ histogram = image.histogram()
98
+ histogram = [h for h in histogram if h > 0] # Remove zeros
99
+ total_pixels = sum(histogram)
100
+
101
+ if total_pixels == 0:
102
+ return 0.0
103
+
104
+ entropy = 0.0
105
+ for count in histogram:
106
+ probability = count / total_pixels
107
+ if probability > 0:
108
+ entropy -= probability * (probability.bit_length() - 1) # Approximate log2
109
+ return entropy
110
+
111
+ # NumPy-based calculation (more accurate)
112
+ try:
113
+ img_array = np.array(image)
114
+ histogram, _ = np.histogram(img_array.flatten(), bins=256, range=(0, 256))
115
+ histogram = histogram[histogram > 0] # Remove zeros
116
+ total_pixels = histogram.sum()
117
+
118
+ if total_pixels == 0:
119
+ return 0.0
120
+
121
+ probabilities = histogram / total_pixels
122
+ entropy = -np.sum(probabilities * np.log2(probabilities))
123
+ return float(entropy)
124
+ except Exception:
125
+ # Fallback to simple calculation
126
+ histogram = image.histogram()
127
+ histogram = [h for h in histogram if h > 0]
128
+ total_pixels = sum(histogram)
129
+
130
+ if total_pixels == 0:
131
+ return 0.0
132
+
133
+ entropy = 0.0
134
+ for count in histogram:
135
+ probability = count / total_pixels
136
+ if probability > 0:
137
+ entropy -= probability * (probability.bit_length() - 1)
138
+ return entropy
139
+
preocr/office_probe.py ADDED
@@ -0,0 +1,161 @@
1
+ """Office document text extraction (DOCX, PPTX, XLSX)."""
2
+
3
+ from pathlib import Path
4
+ from typing import Dict
5
+
6
+ try:
7
+ from docx import Document
8
+ except ImportError:
9
+ Document = None
10
+
11
+ try:
12
+ from pptx import Presentation
13
+ except ImportError:
14
+ Presentation = None
15
+
16
+ try:
17
+ from openpyxl import load_workbook
18
+ except ImportError:
19
+ load_workbook = None
20
+
21
+
22
def extract_office_text(file_path: str, mime_type: str) -> Dict[str, any]:
    """
    Route an Office document to the extractor for its format.

    The format is recognized from a marker in the MIME type or, failing
    that, from the file suffix. Word is checked first, then PowerPoint,
    then Excel.

    Args:
        file_path: Path to the office document
        mime_type: MIME type of the file

    Returns:
        Dictionary with keys:
        - text_length: Number of characters in extracted text
        - text: Extracted text (may be truncated for large files)
        - document_type: Type of document ("docx", "pptx", "xlsx"), or None
          when the file matches none of the supported formats
    """
    path = Path(file_path)
    suffix = path.suffix.lower()

    if "wordprocessingml" in mime_type or suffix == ".docx":
        return _extract_docx(path)

    if "presentationml" in mime_type or suffix == ".pptx":
        return _extract_pptx(path)

    if "spreadsheetml" in mime_type or suffix == ".xlsx":
        return _extract_xlsx(path)

    # Not a supported Office format: report "no text" rather than failing.
    return {
        "text_length": 0,
        "text": "",
        "document_type": None,
    }
50
+
51
+
52
def _extract_docx(path: Path) -> Dict[str, any]:
    """Collect paragraph and table-cell text from a DOCX file.

    Returns a dict with the full text length, the text capped at 1000
    characters, and the document type "docx". If python-docx is missing or
    the file cannot be read, the length is 0 and the text is empty.
    """
    no_text = {
        "text_length": 0,
        "text": "",
        "document_type": "docx",
    }

    if not Document:
        return no_text

    try:
        document = Document(path)

        # Body paragraphs first, skipping whitespace-only ones.
        collected = [
            paragraph.text
            for paragraph in document.paragraphs
            if paragraph.text.strip()
        ]

        # Then every non-blank table cell, in table/row/cell order.
        collected.extend(
            cell.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        )

        full_text = "\n".join(collected)

        return {
            "text_length": len(full_text),
            # Slicing is a no-op for text of 1000 characters or fewer.
            "text": full_text[:1000],
            "document_type": "docx",
        }
    except Exception:
        return no_text
90
+
91
+
92
def _extract_pptx(path: Path) -> Dict[str, any]:
    """Collect the text of every text-bearing shape in a PPTX file.

    Returns a dict with the full text length, the text capped at 1000
    characters, and the document type "pptx". If python-pptx is missing or
    the file cannot be read, the length is 0 and the text is empty.
    """
    no_text = {
        "text_length": 0,
        "text": "",
        "document_type": "pptx",
    }

    if not Presentation:
        return no_text

    try:
        presentation = Presentation(path)

        # Only shapes exposing a non-blank ``text`` attribute contribute.
        collected = [
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]

        full_text = "\n".join(collected)

        return {
            "text_length": len(full_text),
            # Slicing is a no-op for text of 1000 characters or fewer.
            "text": full_text[:1000],
            "document_type": "pptx",
        }
    except Exception:
        return no_text
124
+
125
+
126
def _extract_xlsx(path: Path) -> Dict[str, any]:
    """Collect cell values from every sheet of an XLSX file as text.

    Each row becomes one line made of its non-empty cell values joined by
    spaces. Returns a dict with the full text length, the text capped at
    1000 characters, and the document type "xlsx". If openpyxl is missing or
    the file cannot be read, the length is 0 and the text is empty.
    """
    no_text = {
        "text_length": 0,
        "text": "",
        "document_type": "xlsx",
    }

    if not load_workbook:
        return no_text

    try:
        workbook = load_workbook(path, data_only=True)
        lines = []

        for name in workbook.sheetnames:
            worksheet = workbook[name]
            for values in worksheet.iter_rows(values_only=True):
                present = [str(value) for value in values if value is not None]
                line = " ".join(present)
                if line.strip():
                    lines.append(line)

        workbook.close()
        full_text = "\n".join(lines)

        return {
            "text_length": len(full_text),
            # Slicing is a no-op for text of 1000 characters or fewer.
            "text": full_text[:1000],
            "document_type": "xlsx",
        }
    except Exception:
        return no_text
161
+