preocr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- preocr/__init__.py +7 -0
- preocr/constants.py +17 -0
- preocr/decision.py +129 -0
- preocr/detector.py +100 -0
- preocr/filetype.py +90 -0
- preocr/image_probe.py +139 -0
- preocr/office_probe.py +161 -0
- preocr/pdf_probe.py +101 -0
- preocr/signals.py +52 -0
- preocr/text_probe.py +110 -0
- preocr/version.py +4 -0
- preocr-0.1.0.dist-info/METADATA +256 -0
- preocr-0.1.0.dist-info/RECORD +16 -0
- preocr-0.1.0.dist-info/WHEEL +5 -0
- preocr-0.1.0.dist-info/licenses/LICENSE +192 -0
- preocr-0.1.0.dist-info/top_level.txt +1 -0
preocr/__init__.py
ADDED
preocr/constants.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Constants and configuration for preocr."""
|
|
2
|
+
|
|
3
|
+
# --- Text-length thresholds (character counts) ---

# Below this many extracted characters a file is not considered to have
# meaningful text.
MIN_TEXT_LENGTH = 50

# Office documents must yield at least this many characters to skip OCR.
MIN_OFFICE_TEXT_LENGTH = 100

# --- File type categories ---
CATEGORY_STRUCTURED = "structured"
CATEGORY_UNSTRUCTURED = "unstructured"

# --- Confidence thresholds, highest first ---
HIGH_CONFIDENCE = 0.9
MEDIUM_CONFIDENCE = 0.7
LOW_CONFIDENCE = 0.5
|
|
17
|
+
|
preocr/decision.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Decision engine to determine if OCR is needed."""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Tuple
|
|
4
|
+
|
|
5
|
+
from .constants import (
|
|
6
|
+
CATEGORY_STRUCTURED,
|
|
7
|
+
CATEGORY_UNSTRUCTURED,
|
|
8
|
+
HIGH_CONFIDENCE,
|
|
9
|
+
LOW_CONFIDENCE,
|
|
10
|
+
MEDIUM_CONFIDENCE,
|
|
11
|
+
MIN_OFFICE_TEXT_LENGTH,
|
|
12
|
+
MIN_TEXT_LENGTH,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def decide(signals: Dict[str, Any]) -> Tuple[bool, str, float, str]:
    """
    Decide if a file needs OCR based on collected signals.

    Rules are checked in order and the first one that matches returns, so an
    earlier rule shadows any later rule that would also match.

    Args:
        signals: Dictionary of signals from signals.collect_signals(). Only the
            "mime", "text_length", "extension" and "is_binary" keys are read.
            A missing or None "mime"/"extension" is treated as "", a missing
            or None "text_length" as 0, and a missing "is_binary" as True.

    Returns:
        Tuple of:
        - needs_ocr: Boolean indicating if OCR is needed
        - reason: Human-readable reason for the decision
        - confidence: Confidence score (0.0-1.0)
        - category: "structured" or "unstructured"
    """
    # ``or`` instead of a .get() default: a key that is present but holds None
    # would otherwise break the string checks and length comparisons below.
    mime = signals.get("mime") or ""
    text_length = signals.get("text_length") or 0
    extension = signals.get("extension") or ""
    is_binary = signals.get("is_binary", True)

    # Rule 1: Plain text formats - NO OCR
    if mime.startswith("text/"):
        return (
            False,
            "text file with extractable content",
            HIGH_CONFIDENCE,
            CATEGORY_STRUCTURED,
        )

    # Rule 2: Office documents - OCR only when too little text was extracted
    if "officedocument" in mime or extension in ("docx", "pptx", "xlsx"):
        if text_length >= MIN_OFFICE_TEXT_LENGTH:
            return (
                False,
                f"office document with {text_length} characters of text",
                HIGH_CONFIDENCE,
                CATEGORY_STRUCTURED,
            )
        return (
            True,
            f"office document with insufficient text ({text_length} chars)",
            MEDIUM_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Rule 3: Images - YES OCR (always)
    if mime.startswith("image/"):
        return (
            True,
            "image file (no text extraction possible)",
            HIGH_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Rule 4: PDFs - OCR only when too little text was extracted
    if mime == "application/pdf" or extension == "pdf":
        if text_length >= MIN_TEXT_LENGTH:
            return (
                False,
                f"digital PDF with {text_length} characters of extractable text",
                HIGH_CONFIDENCE,
                CATEGORY_STRUCTURED,
            )
        return (
            True,
            f"PDF without extractable text ({text_length} chars) - likely scanned",
            MEDIUM_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Rule 5: JSON/XML - NO OCR
    if mime in ("application/json", "application/xml") or extension in ("json", "xml"):
        return (
            False,
            "structured data file (JSON/XML)",
            HIGH_CONFIDENCE,
            CATEGORY_STRUCTURED,
        )

    # Rule 6: HTML - decided by how much text was extracted.
    # NOTE: a "text/html" MIME type never gets here, because Rule 1 already
    # returned for every text/* type. This rule only applies to
    # application/xhtml+xml, or to .html/.htm files that fell through the
    # rules above.
    if mime in ("text/html", "application/xhtml+xml") or extension in ("html", "htm"):
        if text_length >= MIN_TEXT_LENGTH:
            return (
                False,
                f"HTML file with {text_length} characters of text",
                HIGH_CONFIDENCE,
                CATEGORY_STRUCTURED,
            )
        return (
            True,
            "HTML file with minimal content",
            LOW_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Rule 7: Unknown binaries - YES OCR (conservative default)
    if is_binary:
        return (
            True,
            "unknown binary file type",
            LOW_CONFIDENCE,
            CATEGORY_UNSTRUCTURED,
        )

    # Fallback: default to needing OCR
    return (
        True,
        "unrecognized file type",
        LOW_CONFIDENCE,
        CATEGORY_UNSTRUCTURED,
    )
|
|
129
|
+
|
preocr/detector.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Main API for OCR detection."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Any, Dict, Union
|
|
5
|
+
|
|
6
|
+
from . import decision
|
|
7
|
+
from . import filetype
|
|
8
|
+
from . import image_probe
|
|
9
|
+
from . import office_probe
|
|
10
|
+
from . import pdf_probe
|
|
11
|
+
from . import signals
|
|
12
|
+
from . import text_probe
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def needs_ocr(file_path: Union[str, Path]) -> Dict[str, Any]:
    """
    Determine if a file needs OCR processing.

    Main entry point of the package: detects the file type, runs the probe
    that matches it, gathers the signals and hands them to the decision
    engine.

    Args:
        file_path: Path to the file to analyze (string or Path object)

    Returns:
        Dictionary with keys:
        - needs_ocr: Boolean indicating if OCR is needed
        - file_type: Detected file type category (e.g., "image", "pdf", "office")
        - category: "structured" (no OCR) or "unstructured" (needs OCR)
        - confidence: Confidence score (0.0-1.0)
        - reason: Human-readable reason for the decision
        - signals: Dictionary of all collected signals (for debugging)

    Raises:
        FileNotFoundError: If nothing exists at file_path.

    Example:
        >>> result = needs_ocr("document.pdf")
        >>> if result["needs_ocr"]:
        ...     run_ocr("document.pdf")
    """
    target = Path(file_path)
    if not target.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    location = str(target)

    # Step 1: Detect file type
    file_info = filetype.detect_file_type(location)
    mime = file_info["mime"]

    # Step 2: Run at most one probe, chosen by file type
    text_result = None
    image_result = None

    if mime == "application/pdf":
        text_result = pdf_probe.extract_pdf_text(location)
    elif "officedocument" in mime or file_info["extension"] in ["docx", "pptx", "xlsx"]:
        text_result = office_probe.extract_office_text(location, mime)
    elif mime.startswith("text/") or mime == "application/xhtml+xml":
        # Plain text and HTML/XHTML share the same text probe.
        text_result = text_probe.extract_text_from_file(location, mime)
    elif mime.startswith("image/"):
        # Images are analyzed, not text-extracted.
        image_result = image_probe.analyze_image(location)

    # Step 3: Collect all signals
    collected_signals = signals.collect_signals(
        location, file_info, text_result, image_result
    )

    # Step 4: Make decision
    verdict = decision.decide(collected_signals)
    needs_ocr_flag, reason, confidence, category = verdict

    # Step 5: Assemble the user-facing result
    result = {
        "needs_ocr": needs_ocr_flag,
        "file_type": _get_file_type_category(mime, file_info["extension"]),
        "category": category,
        "confidence": confidence,
        "reason": reason,
        "signals": collected_signals,
    }
    return result
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _get_file_type_category(mime: str, extension: str) -> str:
|
|
87
|
+
"""Get a user-friendly file type category."""
|
|
88
|
+
if mime.startswith("image/"):
|
|
89
|
+
return "image"
|
|
90
|
+
elif mime == "application/pdf" or extension == "pdf":
|
|
91
|
+
return "pdf"
|
|
92
|
+
elif "officedocument" in mime or extension in ["docx", "pptx", "xlsx", "doc", "ppt", "xls"]:
|
|
93
|
+
return "office"
|
|
94
|
+
elif mime.startswith("text/") or extension in ["txt", "csv", "html", "htm"]:
|
|
95
|
+
return "text"
|
|
96
|
+
elif mime in ["application/json", "application/xml"] or extension in ["json", "xml"]:
|
|
97
|
+
return "structured"
|
|
98
|
+
else:
|
|
99
|
+
return "unknown"
|
|
100
|
+
|
preocr/filetype.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""File type detection using MIME types and extensions."""
|
|
2
|
+
|
|
3
|
+
import mimetypes
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Optional
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
import magic
|
|
9
|
+
except ImportError:
|
|
10
|
+
magic = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def detect_file_type(file_path: str) -> Dict[str, str]:
    """
    Detect file type using MIME detection and extension fallback.

    Detection sources are tried in order and the first non-empty answer wins:
    the optional ``magic`` module (when it imported successfully), the standard
    ``mimetypes`` module, the extension table in
    ``_guess_mime_from_extension``, and finally "application/octet-stream".

    Args:
        file_path: Path to the file to analyze

    Returns:
        Dictionary with keys:
        - mime: MIME type string (e.g., "application/pdf")
        - extension: Lower-cased file extension without dot (e.g., "pdf")
        - is_binary: Boolean, False only for text/* types, application/json
          and application/xml
    """
    target = Path(file_path)
    extension = target.suffix.lower().lstrip(".")

    # Content sniffing is preferred; a failure here just means the later
    # sources get their turn.
    sniffed = None
    if magic:
        try:
            sniffed = magic.from_file(str(target), mime=True)
        except (OSError, magic.MagicException):
            sniffed = None

    # Short-circuiting ``or`` keeps each fallback lazy: it is only consulted
    # when every source before it produced nothing.
    mime_type = (
        sniffed
        or mimetypes.guess_type(str(target))[0]
        or _guess_mime_from_extension(extension)
        or "application/octet-stream"
    )

    textual_application_types = ["application/json", "application/xml"]
    is_binary = (
        not mime_type.startswith("text/")
        and mime_type not in textual_application_types
    )

    return {
        "mime": mime_type,
        "extension": extension,
        "is_binary": is_binary,
    }
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _guess_mime_from_extension(extension: str) -> Optional[str]:
|
|
64
|
+
"""Guess MIME type from file extension."""
|
|
65
|
+
extension_map = {
|
|
66
|
+
"pdf": "application/pdf",
|
|
67
|
+
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
68
|
+
"doc": "application/msword",
|
|
69
|
+
"pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
70
|
+
"ppt": "application/vnd.ms-powerpoint",
|
|
71
|
+
"xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
72
|
+
"xls": "application/vnd.ms-excel",
|
|
73
|
+
"png": "image/png",
|
|
74
|
+
"jpg": "image/jpeg",
|
|
75
|
+
"jpeg": "image/jpeg",
|
|
76
|
+
"gif": "image/gif",
|
|
77
|
+
"tiff": "image/tiff",
|
|
78
|
+
"tif": "image/tiff",
|
|
79
|
+
"bmp": "image/bmp",
|
|
80
|
+
"webp": "image/webp",
|
|
81
|
+
"txt": "text/plain",
|
|
82
|
+
"csv": "text/csv",
|
|
83
|
+
"html": "text/html",
|
|
84
|
+
"htm": "text/html",
|
|
85
|
+
"json": "application/json",
|
|
86
|
+
"xml": "application/xml",
|
|
87
|
+
"eml": "message/rfc822",
|
|
88
|
+
}
|
|
89
|
+
return extension_map.get(extension.lower())
|
|
90
|
+
|
preocr/image_probe.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Image analysis and entropy calculation."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import TYPE_CHECKING, Any, Dict, Optional
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from PIL import Image as PILImage
|
|
8
|
+
|
|
9
|
+
try:
|
|
10
|
+
from PIL import Image
|
|
11
|
+
import numpy as np
|
|
12
|
+
except ImportError:
|
|
13
|
+
Image = None
|
|
14
|
+
np = None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def is_image_file(mime_type: str) -> bool:
    """
    Tell whether a MIME type belongs to the image/* family.

    Args:
        mime_type: MIME type string

    Returns:
        True if the string starts with "image/", False otherwise
    """
    has_image_prefix = mime_type.startswith("image/")
    return has_image_prefix
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def analyze_image(file_path: str) -> Dict[str, Any]:
    """
    Analyze image file and calculate entropy.

    Args:
        file_path: Path to the image file

    Returns:
        Dictionary with keys:
        - entropy: Entropy of the grayscale image, as computed by
          _calculate_entropy
        - width: Image width in pixels
        - height: Image height in pixels
        - mode: Mode of the image as opened (RGB, L, etc.), not of the
          grayscale copy used for the entropy
        - is_image: Always True

        entropy, width, height and mode are all None when the imaging
        library is not available or the file cannot be opened or analyzed.
    """
    # Single definition of the "could not inspect" result, shared by the
    # missing-dependency path and the error path.
    unavailable = {
        "entropy": None,
        "width": None,
        "height": None,
        "mode": None,
        "is_image": True,
    }

    if not Image:
        return unavailable

    try:
        with Image.open(Path(file_path)) as img:
            # Entropy is calculated on a grayscale copy of the image.
            entropy = _calculate_entropy(img.convert("L"))

            return {
                "entropy": entropy,
                "width": img.width,
                "height": img.height,
                "mode": img.mode,
                "is_image": True,
            }
    except Exception:
        # Deliberate best-effort: this is a probe, so an unreadable or
        # unsupported image yields the "unavailable" result instead of
        # propagating the error.
        return unavailable
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _calculate_entropy(image: Any) -> float:
|
|
82
|
+
"""
|
|
83
|
+
Calculate entropy of an image.
|
|
84
|
+
|
|
85
|
+
Entropy measures the randomness/complexity of pixel values.
|
|
86
|
+
Low entropy (0-4): Simple images, likely scanned text
|
|
87
|
+
High entropy (4-8): Complex images, photos
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
image: PIL Image object (should be grayscale)
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
Entropy value between 0 and 8
|
|
94
|
+
"""
|
|
95
|
+
if not np:
|
|
96
|
+
# Fallback: simple histogram-based entropy
|
|
97
|
+
histogram = image.histogram()
|
|
98
|
+
histogram = [h for h in histogram if h > 0] # Remove zeros
|
|
99
|
+
total_pixels = sum(histogram)
|
|
100
|
+
|
|
101
|
+
if total_pixels == 0:
|
|
102
|
+
return 0.0
|
|
103
|
+
|
|
104
|
+
entropy = 0.0
|
|
105
|
+
for count in histogram:
|
|
106
|
+
probability = count / total_pixels
|
|
107
|
+
if probability > 0:
|
|
108
|
+
entropy -= probability * (probability.bit_length() - 1) # Approximate log2
|
|
109
|
+
return entropy
|
|
110
|
+
|
|
111
|
+
# NumPy-based calculation (more accurate)
|
|
112
|
+
try:
|
|
113
|
+
img_array = np.array(image)
|
|
114
|
+
histogram, _ = np.histogram(img_array.flatten(), bins=256, range=(0, 256))
|
|
115
|
+
histogram = histogram[histogram > 0] # Remove zeros
|
|
116
|
+
total_pixels = histogram.sum()
|
|
117
|
+
|
|
118
|
+
if total_pixels == 0:
|
|
119
|
+
return 0.0
|
|
120
|
+
|
|
121
|
+
probabilities = histogram / total_pixels
|
|
122
|
+
entropy = -np.sum(probabilities * np.log2(probabilities))
|
|
123
|
+
return float(entropy)
|
|
124
|
+
except Exception:
|
|
125
|
+
# Fallback to simple calculation
|
|
126
|
+
histogram = image.histogram()
|
|
127
|
+
histogram = [h for h in histogram if h > 0]
|
|
128
|
+
total_pixels = sum(histogram)
|
|
129
|
+
|
|
130
|
+
if total_pixels == 0:
|
|
131
|
+
return 0.0
|
|
132
|
+
|
|
133
|
+
entropy = 0.0
|
|
134
|
+
for count in histogram:
|
|
135
|
+
probability = count / total_pixels
|
|
136
|
+
if probability > 0:
|
|
137
|
+
entropy -= probability * (probability.bit_length() - 1)
|
|
138
|
+
return entropy
|
|
139
|
+
|
preocr/office_probe.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""Office document text extraction (DOCX, PPTX, XLSX)."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from docx import Document
|
|
8
|
+
except ImportError:
|
|
9
|
+
Document = None
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from pptx import Presentation
|
|
13
|
+
except ImportError:
|
|
14
|
+
Presentation = None
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
from openpyxl import load_workbook
|
|
18
|
+
except ImportError:
|
|
19
|
+
load_workbook = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def extract_office_text(file_path: str, mime_type: str) -> Dict[str, any]:
    """
    Extract text from Office documents (DOCX, PPTX, XLSX).

    The document kind is recognised either from a marker in the MIME type or
    from the file suffix, and the matching extractor is called. Word is
    checked first, then PowerPoint, then Excel.

    Args:
        file_path: Path to the office document
        mime_type: MIME type of the file

    Returns:
        Dictionary with keys:
        - text_length: Number of characters in extracted text
        - text: Extracted text (may be truncated for large files)
        - document_type: Type of document ("docx", "pptx", "xlsx"), or None
          when the file matches none of them
    """
    path = Path(file_path)
    suffix = path.suffix.lower()

    if "wordprocessingml" in mime_type or suffix == ".docx":
        return _extract_docx(path)
    if "presentationml" in mime_type or suffix == ".pptx":
        return _extract_pptx(path)
    if "spreadsheetml" in mime_type or suffix == ".xlsx":
        return _extract_xlsx(path)

    # Not a supported Office format: report no text.
    return {
        "text_length": 0,
        "text": "",
        "document_type": None,
    }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _extract_docx(path: Path) -> Dict[str, any]:
    """Extract text from a DOCX file.

    Collects every non-blank paragraph, then every non-blank table cell, and
    joins them with newlines. "text_length" is the length of the full text;
    "text" holds at most its first 1000 characters. An empty result is
    returned when the docx library is unavailable or reading fails.
    """
    empty_result = {
        "text_length": 0,
        "text": "",
        "document_type": "docx",
    }

    if not Document:
        return empty_result

    try:
        doc = Document(path)

        # Body paragraphs first ...
        chunks = [para.text for para in doc.paragraphs if para.text.strip()]

        # ... then the cells of every table, row by row.
        chunks.extend(
            cell.text
            for table in doc.tables
            for row in table.rows
            for cell in row.cells
            if cell.text.strip()
        )

        full_text = "\n".join(chunks)

        return {
            "text_length": len(full_text),
            "text": full_text[:1000],
            "document_type": "docx",
        }
    except Exception:
        return empty_result
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _extract_pptx(path: Path) -> Dict[str, any]:
    """Extract text from a PPTX file.

    Collects the text of every shape that has a non-blank ``text`` attribute,
    slide by slide, and joins the pieces with newlines. "text_length" is the
    length of the full text; "text" holds at most its first 1000 characters.
    An empty result is returned when the pptx library is unavailable or
    reading fails.
    """
    empty_result = {
        "text_length": 0,
        "text": "",
        "document_type": "pptx",
    }

    if not Presentation:
        return empty_result

    try:
        prs = Presentation(path)

        # Not every shape carries text, hence the hasattr guard.
        chunks = [
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]

        full_text = "\n".join(chunks)

        return {
            "text_length": len(full_text),
            "text": full_text[:1000],
            "document_type": "pptx",
        }
    except Exception:
        return empty_result
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _extract_xlsx(path: Path) -> Dict[str, any]:
    """Extract text from an XLSX file.

    Every row of every sheet becomes one line: its non-empty cell values,
    converted with str() and joined by single spaces. Blank lines are dropped
    and the rest are joined with newlines. "text_length" is the length of the
    full text; "text" holds at most its first 1000 characters. An empty
    result is returned when openpyxl is unavailable or reading fails.
    """
    empty_result = {
        "text_length": 0,
        "text": "",
        "document_type": "xlsx",
    }

    if not load_workbook:
        return empty_result

    try:
        workbook = load_workbook(path, data_only=True)
        lines = []

        for sheet_name in workbook.sheetnames:
            rows = workbook[sheet_name].iter_rows(values_only=True)
            for row in rows:
                line = " ".join(str(value) for value in row if value is not None)
                if line.strip():
                    lines.append(line)

        workbook.close()
        full_text = "\n".join(lines)

        return {
            "text_length": len(full_text),
            "text": full_text[:1000],
            "document_type": "xlsx",
        }
    except Exception:
        return empty_result
|
|
161
|
+
|