stache-ai-ocr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: stache-ai-ocr
3
+ Version: 0.1.0
4
+ Summary: OCR support for Stache AI document loaders
5
+ Author: Stache Contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/stache-ai/stache-ai
8
+ Project-URL: Repository, https://github.com/stache-ai/stache-ai
9
+ Keywords: stache,ocr,pdf,document-processing
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: stache-ai>=0.1.0
21
+ Requires-Dist: pdfplumber>=0.10.0
22
+
23
+ # stache-ai-ocr
24
+
25
+ OCR support for Stache AI document loaders.
26
+
27
+ Provides a high-priority PDF loader that falls back to OCR for scanned documents.
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install stache-ai-ocr
33
+ apt install ocrmypdf # System dependency required
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ Once installed, the OCR loader automatically registers and takes priority over the basic PDF loader for all PDF files.
39
+
40
+ The loader will:
41
+ 1. First attempt normal text extraction with pdfplumber
42
+ 2. If no text is found (scanned PDF), fall back to OCR using ocrmypdf
43
+ 3. Gracefully handle missing ocrmypdf (logs warning and returns empty text)
44
+
45
+ ## System Requirements
46
+
47
+ - **ocrmypdf** system binary must be installed
48
+ - Ubuntu/Debian: `apt install ocrmypdf`
49
+ - macOS: `brew install ocrmypdf`
50
+ - Includes Tesseract OCR engine
51
+
52
+ ## Priority Override
53
+
54
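+ This loader registers with priority 10, overriding the basic PDF loader (priority 0). Scanned PDFs are OCR'd when ocrmypdf is available; on systems where ocrmypdf is not installed, the loader logs a warning and returns only the text pdfplumber could extract (empty for scanned PDFs).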
@@ -0,0 +1,32 @@
1
+ # stache-ai-ocr
2
+
3
+ OCR support for Stache AI document loaders.
4
+
5
+ Provides a high-priority PDF loader that falls back to OCR for scanned documents.
6
+
7
+ ## Installation
8
+
9
+ ```bash
10
+ pip install stache-ai-ocr
11
+ apt install ocrmypdf # System dependency required
12
+ ```
13
+
14
+ ## Usage
15
+
16
+ Once installed, the OCR loader automatically registers and takes priority over the basic PDF loader for all PDF files.
17
+
18
+ The loader will:
19
+ 1. First attempt normal text extraction with pdfplumber
20
+ 2. If no text is found (scanned PDF), fall back to OCR using ocrmypdf
21
+ 3. Gracefully handle missing ocrmypdf (logs warning and returns empty text)
22
+
23
+ ## System Requirements
24
+
25
+ - **ocrmypdf** system binary must be installed
26
+ - Ubuntu/Debian: `apt install ocrmypdf`
27
+ - macOS: `brew install ocrmypdf`
28
+ - Includes Tesseract OCR engine
29
+
30
+ ## Priority Override
31
+
32
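+ This loader registers with priority 10, overriding the basic PDF loader (priority 0). Scanned PDFs are OCR'd when ocrmypdf is available; on systems where ocrmypdf is not installed, the loader logs a warning and returns only the text pdfplumber could extract (empty for scanned PDFs).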
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "stache-ai-ocr"
7
+ version = "0.1.0"
8
+ readme = "README.md"
9
+ description = "OCR support for Stache AI document loaders"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "Stache Contributors"}
14
+ ]
15
+ keywords = ["stache", "ocr", "pdf", "document-processing"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Operating System :: OS Independent",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ ]
26
+ dependencies = [
27
+ "stache-ai>=0.1.0",
28
+ "pdfplumber>=0.10.0",
29
+ ]
30
+
31
+ [project.entry-points."stache.loader"]
32
+ pdf-ocr = "stache_ai_ocr.loaders:OcrPdfLoader"
33
+
34
+ [project.urls]
35
+ Homepage = "https://github.com/stache-ai/stache-ai"
36
+ Repository = "https://github.com/stache-ai/stache-ai"
37
+
38
+ [tool.setuptools.packages.find]
39
+ where = ["src"]
40
+ include = ["stache_ai_ocr*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """OCR support for Stache AI document loaders"""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,84 @@
1
+ """PDF loader with OCR fallback for scanned documents"""
2
+
3
+ import logging
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+
8
+ from stache_ai.loaders.base import DocumentLoader
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class OcrPdfLoader(DocumentLoader):
14
+ """PDF loader with OCR fallback for scanned documents
15
+
16
+ Requires ocrmypdf system binary: apt install ocrmypdf
17
+ """
18
+
19
+ @property
20
+ def extensions(self) -> list[str]:
21
+ return ['pdf']
22
+
23
+ @property
24
+ def priority(self) -> int:
25
+ return 10 # Override basic PdfLoader (priority 0)
26
+
27
+ def load(self, file_path: str) -> str:
28
+ # Lazy import
29
+ import pdfplumber
30
+
31
+ # Try normal extraction first
32
+ text_parts = []
33
+ with pdfplumber.open(file_path) as pdf:
34
+ for page in pdf.pages:
35
+ text = page.extract_text()
36
+ if text:
37
+ text_parts.append(text)
38
+
39
+ # If no text found, try OCR
40
+ if not text_parts:
41
+ logger.info(f"No text in PDF, attempting OCR: {file_path}")
42
+ text_parts = self._ocr_extract(file_path)
43
+
44
+ return "\n\n".join(text_parts)
45
+
46
+ def _ocr_extract(self, file_path: str) -> list[str]:
47
+ """Run OCR on PDF and extract text"""
48
+ import pdfplumber
49
+
50
+ text_parts = []
51
+ tmp_path = None
52
+
53
+ try:
54
+ with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
55
+ tmp_path = tmp.name
56
+
57
+ result = subprocess.run(
58
+ ['ocrmypdf', '--skip-text', '--quiet', file_path, tmp_path],
59
+ capture_output=True,
60
+ text=True
61
+ )
62
+
63
+ if result.returncode != 0:
64
+ logger.warning(f"OCR failed: {result.stderr}")
65
+ return []
66
+
67
+ with pdfplumber.open(tmp_path) as pdf:
68
+ for page in pdf.pages:
69
+ text = page.extract_text()
70
+ if text:
71
+ text_parts.append(text)
72
+
73
+ except FileNotFoundError:
74
+ logger.warning(
75
+ "ocrmypdf not installed (install with: apt install ocrmypdf), "
76
+ "skipping OCR"
77
+ )
78
+ except Exception as e:
79
+ logger.warning(f"OCR error: {e}")
80
+ finally:
81
+ if tmp_path:
82
+ Path(tmp_path).unlink(missing_ok=True)
83
+
84
+ return text_parts
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: stache-ai-ocr
3
+ Version: 0.1.0
4
+ Summary: OCR support for Stache AI document loaders
5
+ Author: Stache Contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/stache-ai/stache-ai
8
+ Project-URL: Repository, https://github.com/stache-ai/stache-ai
9
+ Keywords: stache,ocr,pdf,document-processing
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: stache-ai>=0.1.0
21
+ Requires-Dist: pdfplumber>=0.10.0
22
+
23
+ # stache-ai-ocr
24
+
25
+ OCR support for Stache AI document loaders.
26
+
27
+ Provides a high-priority PDF loader that falls back to OCR for scanned documents.
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install stache-ai-ocr
33
+ apt install ocrmypdf # System dependency required
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ Once installed, the OCR loader automatically registers and takes priority over the basic PDF loader for all PDF files.
39
+
40
+ The loader will:
41
+ 1. First attempt normal text extraction with pdfplumber
42
+ 2. If no text is found (scanned PDF), fall back to OCR using ocrmypdf
43
+ 3. Gracefully handle missing ocrmypdf (logs warning and returns empty text)
44
+
45
+ ## System Requirements
46
+
47
+ - **ocrmypdf** system binary must be installed
48
+ - Ubuntu/Debian: `apt install ocrmypdf`
49
+ - macOS: `brew install ocrmypdf`
50
+ - Includes Tesseract OCR engine
51
+
52
+ ## Priority Override
53
+
54
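+ This loader registers with priority 10, overriding the basic PDF loader (priority 0). Scanned PDFs are OCR'd when ocrmypdf is available; on systems where ocrmypdf is not installed, the loader logs a warning and returns only the text pdfplumber could extract (empty for scanned PDFs).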
@@ -0,0 +1,10 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/stache_ai_ocr/__init__.py
4
+ src/stache_ai_ocr/loaders.py
5
+ src/stache_ai_ocr.egg-info/PKG-INFO
6
+ src/stache_ai_ocr.egg-info/SOURCES.txt
7
+ src/stache_ai_ocr.egg-info/dependency_links.txt
8
+ src/stache_ai_ocr.egg-info/entry_points.txt
9
+ src/stache_ai_ocr.egg-info/requires.txt
10
+ src/stache_ai_ocr.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [stache.loader]
2
+ pdf-ocr = stache_ai_ocr.loaders:OcrPdfLoader
@@ -0,0 +1,2 @@
1
+ stache-ai>=0.1.0
2
+ pdfplumber>=0.10.0
@@ -0,0 +1 @@
1
+ stache_ai_ocr