stache-ai-ocr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stache_ai_ocr-0.1.0/PKG-INFO +54 -0
- stache_ai_ocr-0.1.0/README.md +32 -0
- stache_ai_ocr-0.1.0/pyproject.toml +40 -0
- stache_ai_ocr-0.1.0/setup.cfg +4 -0
- stache_ai_ocr-0.1.0/src/stache_ai_ocr/__init__.py +3 -0
- stache_ai_ocr-0.1.0/src/stache_ai_ocr/loaders.py +84 -0
- stache_ai_ocr-0.1.0/src/stache_ai_ocr.egg-info/PKG-INFO +54 -0
- stache_ai_ocr-0.1.0/src/stache_ai_ocr.egg-info/SOURCES.txt +10 -0
- stache_ai_ocr-0.1.0/src/stache_ai_ocr.egg-info/dependency_links.txt +1 -0
- stache_ai_ocr-0.1.0/src/stache_ai_ocr.egg-info/entry_points.txt +2 -0
- stache_ai_ocr-0.1.0/src/stache_ai_ocr.egg-info/requires.txt +2 -0
- stache_ai_ocr-0.1.0/src/stache_ai_ocr.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: stache-ai-ocr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: OCR support for Stache AI document loaders
|
|
5
|
+
Author: Stache Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/stache-ai/stache-ai
|
|
8
|
+
Project-URL: Repository, https://github.com/stache-ai/stache-ai
|
|
9
|
+
Keywords: stache,ocr,pdf,document-processing
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: stache-ai>=0.1.0
|
|
21
|
+
Requires-Dist: pdfplumber>=0.10.0
|
|
22
|
+
|
|
23
|
+
# stache-ai-ocr
|
|
24
|
+
|
|
25
|
+
OCR support for Stache AI document loaders.
|
|
26
|
+
|
|
27
|
+
Provides a high-priority PDF loader that falls back to OCR for scanned documents.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install stache-ai-ocr
|
|
33
|
+
apt install ocrmypdf # System dependency required
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
Once installed, the OCR loader automatically registers and takes priority over the basic PDF loader for all PDF files.
|
|
39
|
+
|
|
40
|
+
The loader will:
|
|
41
|
+
1. First attempt normal text extraction with pdfplumber
|
|
42
|
+
2. If no text is found (scanned PDF), fall back to OCR using ocrmypdf
|
|
43
|
+
3. Gracefully handle missing ocrmypdf (logs warning and returns empty text)
|
|
44
|
+
|
|
45
|
+
## System Requirements
|
|
46
|
+
|
|
47
|
+
- **ocrmypdf** system binary must be installed
|
|
48
|
+
- Ubuntu/Debian: `apt install ocrmypdf`
|
|
49
|
+
- macOS: `brew install ocrmypdf`
|
|
50
|
+
- Includes Tesseract OCR engine
|
|
51
|
+
|
|
52
|
+
## Priority Override
|
|
53
|
+
|
|
54
|
+
This loader registers with priority 10, overriding the basic PDF loader (priority 0). This ensures OCR is used when available without affecting systems where it's not installed.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# stache-ai-ocr
|
|
2
|
+
|
|
3
|
+
OCR support for Stache AI document loaders.
|
|
4
|
+
|
|
5
|
+
Provides a high-priority PDF loader that falls back to OCR for scanned documents.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install stache-ai-ocr
|
|
11
|
+
apt install ocrmypdf # System dependency required
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Usage
|
|
15
|
+
|
|
16
|
+
Once installed, the OCR loader automatically registers and takes priority over the basic PDF loader for all PDF files.
|
|
17
|
+
|
|
18
|
+
The loader will:
|
|
19
|
+
1. First attempt normal text extraction with pdfplumber
|
|
20
|
+
2. If no text is found (scanned PDF), fall back to OCR using ocrmypdf
|
|
21
|
+
3. Gracefully handle missing ocrmypdf (logs warning and returns empty text)
|
|
22
|
+
|
|
23
|
+
## System Requirements
|
|
24
|
+
|
|
25
|
+
- **ocrmypdf** system binary must be installed
|
|
26
|
+
- Ubuntu/Debian: `apt install ocrmypdf`
|
|
27
|
+
- macOS: `brew install ocrmypdf`
|
|
28
|
+
- Includes Tesseract OCR engine
|
|
29
|
+
|
|
30
|
+
## Priority Override
|
|
31
|
+
|
|
32
|
+
This loader registers with priority 10, overriding the basic PDF loader (priority 0). This ensures the OCR-capable loader is used whenever this package is installed, while installations without it keep using the basic PDF loader.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "stache-ai-ocr"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
description = "OCR support for Stache AI document loaders"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Stache Contributors"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["stache", "ocr", "pdf", "document-processing"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Operating System :: OS Independent",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"stache-ai>=0.1.0",
|
|
28
|
+
"pdfplumber>=0.10.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.entry-points."stache.loader"]
|
|
32
|
+
pdf-ocr = "stache_ai_ocr.loaders:OcrPdfLoader"
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/stache-ai/stache-ai"
|
|
36
|
+
Repository = "https://github.com/stache-ai/stache-ai"
|
|
37
|
+
|
|
38
|
+
[tool.setuptools.packages.find]
|
|
39
|
+
where = ["src"]
|
|
40
|
+
include = ["stache_ai_ocr*"]
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""PDF loader with OCR fallback for scanned documents"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import subprocess
|
|
5
|
+
import tempfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from stache_ai.loaders.base import DocumentLoader
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class OcrPdfLoader(DocumentLoader):
    """PDF loader with OCR fallback for scanned documents.

    Text is first extracted directly with pdfplumber. When no page yields
    any text, the file is run through the ``ocrmypdf`` command-line tool
    and extraction is retried on the OCR output.

    Requires ocrmypdf system binary: apt install ocrmypdf
    """

    @property
    def extensions(self) -> list[str]:
        """Return the file extensions handled by this loader."""
        return ['pdf']

    @property
    def priority(self) -> int:
        """Return the loader priority."""
        return 10  # Override basic PdfLoader (priority 0)

    def load(self, file_path: str) -> str:
        """Return the text of the PDF at *file_path*.

        Page texts are joined with a blank line. If direct extraction
        finds no text on any page, OCR is attempted; if that also yields
        nothing (or OCR is unavailable), an empty string is returned.

        Errors raised while opening or parsing *file_path* during the
        direct extraction pass are not caught here.
        """
        text_parts = list(self._iter_page_text(file_path))

        # If no text found, try OCR
        if not text_parts:
            logger.info("No text in PDF, attempting OCR: %s", file_path)
            text_parts = self._ocr_extract(file_path)

        return "\n\n".join(text_parts)

    @staticmethod
    def _iter_page_text(pdf_path: str):
        """Yield the extracted text of each page that has any text."""
        # Lazy import
        import pdfplumber

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    yield text

    def _ocr_extract(self, file_path: str) -> list[str]:
        """Run OCR on PDF and extract text.

        ``ocrmypdf`` writes its result to a temporary PDF, which is read
        back page by page and always removed afterwards.

        Returns the per-page texts found in the OCR output. Every failure
        is logged as a warning instead of raised: a missing ``ocrmypdf``
        binary or a non-zero exit status yields an empty list, and an
        error while reading the OCR output yields whatever pages were
        extracted before it.
        """
        text_parts: list[str] = []
        tmp_path = None

        try:
            # Only the name is needed; ocrmypdf writes the file itself, so
            # the handle is closed before the subprocess starts.
            with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
                tmp_path = tmp.name

            try:
                result = subprocess.run(
                    ['ocrmypdf', '--skip-text', '--quiet', file_path, tmp_path],
                    capture_output=True,
                    text=True,
                )
            except FileNotFoundError:
                # Scoped to the subprocess call so that only a missing
                # executable is reported as "not installed"; any other
                # FileNotFoundError falls through to the generic handler.
                logger.warning(
                    "ocrmypdf not installed (install with: apt install ocrmypdf), "
                    "skipping OCR"
                )
                return text_parts

            if result.returncode != 0:
                logger.warning("OCR failed: %s", result.stderr)
                return text_parts

            # Append page by page so text gathered before a mid-document
            # failure is still returned.
            for text in self._iter_page_text(tmp_path):
                text_parts.append(text)

        except Exception as e:
            # Best-effort fallback: OCR problems must not fail the load.
            logger.warning("OCR error: %s", e)
        finally:
            if tmp_path:
                Path(tmp_path).unlink(missing_ok=True)

        return text_parts
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: stache-ai-ocr
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: OCR support for Stache AI document loaders
|
|
5
|
+
Author: Stache Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/stache-ai/stache-ai
|
|
8
|
+
Project-URL: Repository, https://github.com/stache-ai/stache-ai
|
|
9
|
+
Keywords: stache,ocr,pdf,document-processing
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: stache-ai>=0.1.0
|
|
21
|
+
Requires-Dist: pdfplumber>=0.10.0
|
|
22
|
+
|
|
23
|
+
# stache-ai-ocr
|
|
24
|
+
|
|
25
|
+
OCR support for Stache AI document loaders.
|
|
26
|
+
|
|
27
|
+
Provides a high-priority PDF loader that falls back to OCR for scanned documents.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install stache-ai-ocr
|
|
33
|
+
apt install ocrmypdf # System dependency required
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
Once installed, the OCR loader automatically registers and takes priority over the basic PDF loader for all PDF files.
|
|
39
|
+
|
|
40
|
+
The loader will:
|
|
41
|
+
1. First attempt normal text extraction with pdfplumber
|
|
42
|
+
2. If no text is found (scanned PDF), fall back to OCR using ocrmypdf
|
|
43
|
+
3. Gracefully handle missing ocrmypdf (logs warning and returns empty text)
|
|
44
|
+
|
|
45
|
+
## System Requirements
|
|
46
|
+
|
|
47
|
+
- **ocrmypdf** system binary must be installed
|
|
48
|
+
- Ubuntu/Debian: `apt install ocrmypdf`
|
|
49
|
+
- macOS: `brew install ocrmypdf`
|
|
50
|
+
- Includes Tesseract OCR engine
|
|
51
|
+
|
|
52
|
+
## Priority Override
|
|
53
|
+
|
|
54
|
+
This loader registers with priority 10, overriding the basic PDF loader (priority 0). This ensures OCR is used when available without affecting systems where it's not installed.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/stache_ai_ocr/__init__.py
|
|
4
|
+
src/stache_ai_ocr/loaders.py
|
|
5
|
+
src/stache_ai_ocr.egg-info/PKG-INFO
|
|
6
|
+
src/stache_ai_ocr.egg-info/SOURCES.txt
|
|
7
|
+
src/stache_ai_ocr.egg-info/dependency_links.txt
|
|
8
|
+
src/stache_ai_ocr.egg-info/entry_points.txt
|
|
9
|
+
src/stache_ai_ocr.egg-info/requires.txt
|
|
10
|
+
src/stache_ai_ocr.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
stache_ai_ocr
|