pure-visual-grounder 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pure_visual_grounder-1.0.0/LICENSE +21 -0
- pure_visual_grounder-1.0.0/MANIFEST.in +6 -0
- pure_visual_grounder-1.0.0/PKG-INFO +338 -0
- pure_visual_grounder-1.0.0/README.md +311 -0
- pure_visual_grounder-1.0.0/pure_visual_grounder.egg-info/PKG-INFO +338 -0
- pure_visual_grounder-1.0.0/pure_visual_grounder.egg-info/SOURCES.txt +11 -0
- pure_visual_grounder-1.0.0/pure_visual_grounder.egg-info/dependency_links.txt +1 -0
- pure_visual_grounder-1.0.0/pure_visual_grounder.egg-info/requires.txt +5 -0
- pure_visual_grounder-1.0.0/pure_visual_grounder.egg-info/top_level.txt +1 -0
- pure_visual_grounder-1.0.0/pyproject.toml +34 -0
- pure_visual_grounder-1.0.0/requirements.txt +5 -0
- pure_visual_grounder-1.0.0/setup.cfg +4 -0
- pure_visual_grounder-1.0.0/setup.py +34 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Strategion
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pure-visual-grounder
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A package for processing PDFs with vision-based language models
|
|
5
|
+
Author: Strategion
|
|
6
|
+
Author-email: Strategion <development@strategion.de>
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: langchain~=0.3.27
|
|
20
|
+
Requires-Dist: PyMuPDF
|
|
21
|
+
Requires-Dist: langchain-core~=0.3.72
|
|
22
|
+
Requires-Dist: pathlib~=1.0.1
|
|
23
|
+
Requires-Dist: langsmith>=0.1.17
|
|
24
|
+
Dynamic: author
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
Dynamic: requires-python
|
|
27
|
+
|
|
28
|
+
# Pure Visual Grounding
|
|
29
|
+
|
|
30
|
+
[](https://badge.fury.io/py/pure-visual-grounding)
|
|
31
|
+
[](https://www.python.org/downloads/release/python-380/)
|
|
32
|
+
[](https://opensource.org/licenses/MIT)
|
|
33
|
+
|
|
34
|
+
A Python package for processing PDFs with vision-based language models, specialized for technical document OCR and structured data extraction.
|
|
35
|
+
|
|
36
|
+
## Overview
|
|
37
|
+
|
|
38
|
+
Pure Visual Grounding leverages the power of vision-enabled large language models to extract structured information from PDF documents. This package is particularly useful for:
|
|
39
|
+
|
|
40
|
+
- **Technical Document Processing**: Extract structured data from complex technical documents
|
|
41
|
+
- **OCR with Context**: Go beyond simple OCR to understand document structure and meaning
|
|
42
|
+
- **RAG Pipeline Integration**: Prepare documents for Retrieval-Augmented Generation workflows
|
|
43
|
+
- **Automated Document Analysis**: Process large volumes of PDFs with consistent structured output
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- 🔍 **Vision-based PDF Processing**: Uses advanced vision models for accurate document analysis
|
|
48
|
+
- 📄 **Multi-page Support**: Processes entire PDF documents page by page
|
|
49
|
+
- 🏗️ **Structured Output**: Returns organized JSON data with metadata for each page
|
|
50
|
+
- 🎯 **Customizable Prompts**: Configure extraction prompts for specific document types
|
|
51
|
+
- 🔄 **Two-pass Processing**: Initial extraction followed by refinement for improved accuracy
|
|
52
|
+
- 📊 **High-DPI Rendering**: Configurable DPI settings for optimal image quality
|
|
53
|
+
- 🛠️ **LangChain Integration**: Built on LangChain for easy model swapping and configuration
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
Install from PyPI:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install pure-visual-grounder
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
### Basic Usage
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from pure_visual_grounding import process_pdf_with_vision
|
|
69
|
+
from langchain_openai import ChatOpenAI
|
|
70
|
+
|
|
71
|
+
# Initialize your vision model
|
|
72
|
+
llm = ChatOpenAI(
|
|
73
|
+
model="gpt-4-vision-preview",
|
|
74
|
+
api_key="your-openai-api-key"
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Read PDF file as bytes
|
|
78
|
+
with open("document.pdf", "rb") as f:
|
|
79
|
+
pdf_bytes = f.read()
|
|
80
|
+
|
|
81
|
+
# Process the PDF
|
|
82
|
+
results = process_pdf_with_vision(
|
|
83
|
+
pdf_name="document.pdf",
|
|
84
|
+
pdf=pdf_bytes,
|
|
85
|
+
llm=llm,
|
|
86
|
+
vision_prompt="First prompt to get the information out of image",
|
|
87
|
+
reinforced_prompt="Reinforced prompt to make sure all information is extracted"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Access structured results
|
|
91
|
+
for page_result in results:
|
|
92
|
+
print(f"Page {page_result['metadata']['page_number']}")
|
|
93
|
+
print(f"Content: {page_result.get('content', 'No content extracted')}")
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Custom Processing
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from pure_visual_grounding import process_pdf_with_vision
|
|
100
|
+
from langchain_openai import ChatOpenAI
|
|
101
|
+
|
|
102
|
+
# Custom prompts for specific document types
|
|
103
|
+
custom_prompt = """
|
|
104
|
+
Analyze this technical document page and extract:
|
|
105
|
+
1. Section headings and hierarchy
|
|
106
|
+
2. Technical specifications and parameters
|
|
107
|
+
3. Diagrams, charts, and visual elements
|
|
108
|
+
4. Tables with numerical data
|
|
109
|
+
5. Key formulas or equations
|
|
110
|
+
|
|
111
|
+
Format the response as structured JSON with clear categorization.
|
|
112
|
+
"""
|
|
113
|
+
|
|
114
|
+
reinforcement_prompt = """
|
|
115
|
+
Review the previous analysis and enhance it by:
|
|
116
|
+
1. Ensuring all technical details are captured accurately
|
|
117
|
+
2. Organizing information hierarchically
|
|
118
|
+
3. Adding any missed visual elements
|
|
119
|
+
4. Validating numerical data and units
|
|
120
|
+
"""
|
|
121
|
+
|
|
122
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
123
|
+
|
|
124
|
+
with open("technical_doc.pdf", "rb") as f:
|
|
125
|
+
pdf_bytes = f.read()
|
|
126
|
+
|
|
127
|
+
results = process_pdf_with_vision(
|
|
128
|
+
pdf_name="technical_doc.pdf",
|
|
129
|
+
pdf=pdf_bytes,
|
|
130
|
+
llm=llm,
|
|
131
|
+
vision_prompt=custom_prompt,
|
|
132
|
+
reinforced_prompt=reinforcement_prompt,
|
|
133
|
+
dpi=300 # High resolution for detailed documents
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Advanced Usage
|
|
138
|
+
|
|
139
|
+
### Batch Processing
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
import glob
|
|
143
|
+
from pathlib import Path
|
|
144
|
+
|
|
145
|
+
def process_pdf_batch(pdf_directory, llm):
|
|
146
|
+
"""Process multiple PDFs in a directory"""
|
|
147
|
+
pdf_files = glob.glob(str(Path(pdf_directory) / "*.pdf"))
|
|
148
|
+
results = {}
|
|
149
|
+
|
|
150
|
+
for pdf_path in pdf_files:
|
|
151
|
+
pdf_name = Path(pdf_path).name
|
|
152
|
+
|
|
153
|
+
try:
|
|
154
|
+
with open(pdf_path, "rb") as f:
|
|
155
|
+
pdf_bytes = f.read()
|
|
156
|
+
|
|
157
|
+
results[pdf_name] = process_pdf_with_vision(
|
|
158
|
+
pdf_name=pdf_name,
|
|
159
|
+
pdf=pdf_bytes,
|
|
160
|
+
llm=llm
|
|
161
|
+
)
|
|
162
|
+
print(f"✓ Processed: {pdf_name}")
|
|
163
|
+
|
|
164
|
+
except Exception as e:
|
|
165
|
+
print(f"✗ Failed: {pdf_name} - {e}")
|
|
166
|
+
results[pdf_name] = {"error": str(e)}
|
|
167
|
+
|
|
168
|
+
return results
|
|
169
|
+
|
|
170
|
+
# Usage
|
|
171
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
172
|
+
batch_results = process_pdf_batch("./documents/", llm)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Integration with Different Models
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
# OpenAI GPT-4 Vision
|
|
179
|
+
from langchain_openai import ChatOpenAI
|
|
180
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
181
|
+
|
|
182
|
+
# Anthropic Claude (when vision is available)
|
|
183
|
+
from langchain_anthropic import ChatAnthropic
|
|
184
|
+
llm = ChatAnthropic(model="claude-3-sonnet-20240229", api_key="your-key")
|
|
185
|
+
|
|
186
|
+
# Google Gemini Vision
|
|
187
|
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
188
|
+
llm = ChatGoogleGenerativeAI(model="gemini-pro-vision", api_key="your-key")
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Output Format
|
|
192
|
+
|
|
193
|
+
The package returns a list of dictionaries whose structure follows the output format requested in the prompt. For example:
|
|
194
|
+
|
|
195
|
+
```json
|
|
196
|
+
[
|
|
197
|
+
{
|
|
198
|
+
"content": "Extracted and structured content from the page",
|
|
199
|
+
"metadata": {
|
|
200
|
+
"pdf_name": "document.pdf",
|
|
201
|
+
"page_number": 1,
|
|
202
|
+
"error": "none",
|
|
203
|
+
"processing_time": "2.34s",
|
|
204
|
+
"model_used": "gpt-4-vision-preview"
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
]
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Configuration Options
|
|
211
|
+
|
|
212
|
+
### Function Parameters
|
|
213
|
+
|
|
214
|
+
- **pdf_name** (str): Name identifier for the PDF file
|
|
215
|
+
- **pdf** (bytes): PDF content as bytes
|
|
216
|
+
- **llm** (BaseChatModel): LangChain vision-capable language model
|
|
217
|
+
- **vision_prompt** (str): Initial extraction prompt
|
|
218
|
+
- **reinforced_prompt** (str): Secondary refinement prompt
|
|
219
|
+
- **dpi** (int): Image resolution for PDF rendering (default: 300)
|
|
220
|
+
|
|
221
|
+
### Recommended DPI Settings
|
|
222
|
+
|
|
223
|
+
- **150 DPI**: Fast processing, basic documents
|
|
224
|
+
- **300 DPI**: Standard quality, most documents (default)
|
|
225
|
+
- **600 DPI**: High quality, detailed technical documents
|
|
226
|
+
|
|
227
|
+
## Use Cases
|
|
228
|
+
|
|
229
|
+
### Technical Documentation
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
# Optimized for technical manuals, specifications, and research papers
|
|
233
|
+
tech_prompt = """
|
|
234
|
+
Extract from this technical document:
|
|
235
|
+
1. Technical specifications and parameters
|
|
236
|
+
2. Procedural steps and instructions
|
|
237
|
+
3. Diagrams, schematics, and their descriptions
|
|
238
|
+
4. Tables with measurements and data points
|
|
239
|
+
5. Safety warnings and important notes
|
|
240
|
+
Format as structured JSON with clear sections.
|
|
241
|
+
"""
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### Financial Documents
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
# Optimized for financial reports, statements, and analysis
|
|
248
|
+
finance_prompt = """
|
|
249
|
+
Analyze this financial document and extract:
|
|
250
|
+
1. Financial figures, ratios, and metrics
|
|
251
|
+
2. Tables with numerical data
|
|
252
|
+
3. Charts and graphs with their insights
|
|
253
|
+
4. Key financial statements sections
|
|
254
|
+
5. Important dates and periods
|
|
255
|
+
Return structured data suitable for financial analysis.
|
|
256
|
+
"""
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Research Papers
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
# Optimized for academic and research publications
|
|
263
|
+
research_prompt = """
|
|
264
|
+
Process this research document and identify:
|
|
265
|
+
1. Abstract and key findings
|
|
266
|
+
2. Methodology and experimental setup
|
|
267
|
+
3. Results, data tables, and statistics
|
|
268
|
+
4. Figures, graphs, and their captions
|
|
269
|
+
5. References and citations
|
|
270
|
+
Structure the output for academic analysis.
|
|
271
|
+
"""
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
## Error Handling
|
|
275
|
+
|
|
276
|
+
The package includes robust error handling:
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
try:
|
|
280
|
+
results = process_pdf_with_vision(pdf_name, pdf_bytes, llm)
|
|
281
|
+
|
|
282
|
+
for result in results:
|
|
283
|
+
if result["metadata"]["error"] != "none":
|
|
284
|
+
print(f"Page {result['metadata']['page_number']} had errors: {result['metadata']['error']}")
|
|
285
|
+
else:
|
|
286
|
+
# Process successful result
|
|
287
|
+
print(f"Successfully processed page {result['metadata']['page_number']}")
|
|
288
|
+
|
|
289
|
+
except Exception as e:
|
|
290
|
+
print(f"Fatal error processing PDF: {e}")
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
## Requirements
|
|
294
|
+
|
|
295
|
+
- Python 3.8+
|
|
296
|
+
- langchain ~= 0.3.27
|
|
297
|
+
- langchain-core ~= 0.3.72
|
|
298
|
+
- PyMuPDF (for PDF processing)
|
|
299
|
+
- pathlib ~= 1.0.1
|
|
300
|
+
- langsmith >= 0.1.17
|
|
301
|
+
|
|
302
|
+
## Performance Tips
|
|
303
|
+
|
|
304
|
+
1. **Optimize DPI**: Use 300 DPI for most documents; increase only for fine details
|
|
305
|
+
2. **Batch Processing**: Process multiple pages/documents in batches for efficiency
|
|
306
|
+
3. **Model Selection**: Choose appropriate models based on accuracy vs. speed requirements
|
|
307
|
+
4. **Prompt Engineering**: Craft specific prompts for your document types
|
|
308
|
+
5. **Error Handling**: Implement retry logic for transient API failures
|
|
309
|
+
|
|
310
|
+
## Contributing
|
|
311
|
+
|
|
312
|
+
We welcome contributions! Please see our contributing guidelines for more information.
|
|
313
|
+
|
|
314
|
+
## License
|
|
315
|
+
|
|
316
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
317
|
+
|
|
318
|
+
## Support
|
|
319
|
+
|
|
320
|
+
For issues, questions, or feature requests:
|
|
321
|
+
|
|
322
|
+
1. Check the [documentation](https://github.com/Strategion-GmbH/pure_visual_grounding)
|
|
323
|
+
2. Review existing issues on GitHub
|
|
324
|
+
3. Create a new issue with detailed information
|
|
325
|
+
|
|
326
|
+
## Changelog
|
|
327
|
+
|
|
328
|
+
### v1.0.8
|
|
329
|
+
|
|
330
|
+
- Improved error handling and metadata processing
|
|
331
|
+
- Enhanced prompt templates
|
|
332
|
+
- Better support for technical documents
|
|
333
|
+
|
|
334
|
+
---
|
|
335
|
+
|
|
336
|
+
**Author**: Strategion (development@strategion.de)
|
|
337
|
+
|
|
338
|
+
**Keywords**: PDF, OCR, Vision, LLM, Document Processing, Technical Documents, RAG, LangChain
|
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
# Pure Visual Grounding
|
|
2
|
+
|
|
3
|
+
[![PyPI version](https://badge.fury.io/py/pure-visual-grounding.svg)](https://badge.fury.io/py/pure-visual-grounding)
|
|
4
|
+
[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-380/)
|
|
5
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
A Python package for processing PDFs with vision-based language models, specialized for technical document OCR and structured data extraction.
|
|
8
|
+
|
|
9
|
+
## Overview
|
|
10
|
+
|
|
11
|
+
Pure Visual Grounding leverages the power of vision-enabled large language models to extract structured information from PDF documents. This package is particularly useful for:
|
|
12
|
+
|
|
13
|
+
- **Technical Document Processing**: Extract structured data from complex technical documents
|
|
14
|
+
- **OCR with Context**: Go beyond simple OCR to understand document structure and meaning
|
|
15
|
+
- **RAG Pipeline Integration**: Prepare documents for Retrieval-Augmented Generation workflows
|
|
16
|
+
- **Automated Document Analysis**: Process large volumes of PDFs with consistent structured output
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- 🔍 **Vision-based PDF Processing**: Uses advanced vision models for accurate document analysis
|
|
21
|
+
- 📄 **Multi-page Support**: Processes entire PDF documents page by page
|
|
22
|
+
- 🏗️ **Structured Output**: Returns organized JSON data with metadata for each page
|
|
23
|
+
- 🎯 **Customizable Prompts**: Configure extraction prompts for specific document types
|
|
24
|
+
- 🔄 **Two-pass Processing**: Initial extraction followed by refinement for improved accuracy
|
|
25
|
+
- 📊 **High-DPI Rendering**: Configurable DPI settings for optimal image quality
|
|
26
|
+
- 🛠️ **LangChain Integration**: Built on LangChain for easy model swapping and configuration
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
Install from PyPI:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install pure-visual-grounder
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Quick Start
|
|
37
|
+
|
|
38
|
+
### Basic Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from pure_visual_grounding import process_pdf_with_vision
|
|
42
|
+
from langchain_openai import ChatOpenAI
|
|
43
|
+
|
|
44
|
+
# Initialize your vision model
|
|
45
|
+
llm = ChatOpenAI(
|
|
46
|
+
model="gpt-4-vision-preview",
|
|
47
|
+
api_key="your-openai-api-key"
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# Read PDF file as bytes
|
|
51
|
+
with open("document.pdf", "rb") as f:
|
|
52
|
+
pdf_bytes = f.read()
|
|
53
|
+
|
|
54
|
+
# Process the PDF
|
|
55
|
+
results = process_pdf_with_vision(
|
|
56
|
+
pdf_name="document.pdf",
|
|
57
|
+
pdf=pdf_bytes,
|
|
58
|
+
llm=llm,
|
|
59
|
+
vision_prompt="First prompt to get the information out of image",
|
|
60
|
+
reinforced_prompt="Reinforced prompt to make sure all information is extracted"
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
# Access structured results
|
|
64
|
+
for page_result in results:
|
|
65
|
+
print(f"Page {page_result['metadata']['page_number']}")
|
|
66
|
+
print(f"Content: {page_result.get('content', 'No content extracted')}")
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Custom Processing
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from pure_visual_grounding import process_pdf_with_vision
|
|
73
|
+
from langchain_openai import ChatOpenAI
|
|
74
|
+
|
|
75
|
+
# Custom prompts for specific document types
|
|
76
|
+
custom_prompt = """
|
|
77
|
+
Analyze this technical document page and extract:
|
|
78
|
+
1. Section headings and hierarchy
|
|
79
|
+
2. Technical specifications and parameters
|
|
80
|
+
3. Diagrams, charts, and visual elements
|
|
81
|
+
4. Tables with numerical data
|
|
82
|
+
5. Key formulas or equations
|
|
83
|
+
|
|
84
|
+
Format the response as structured JSON with clear categorization.
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
reinforcement_prompt = """
|
|
88
|
+
Review the previous analysis and enhance it by:
|
|
89
|
+
1. Ensuring all technical details are captured accurately
|
|
90
|
+
2. Organizing information hierarchically
|
|
91
|
+
3. Adding any missed visual elements
|
|
92
|
+
4. Validating numerical data and units
|
|
93
|
+
"""
|
|
94
|
+
|
|
95
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
96
|
+
|
|
97
|
+
with open("technical_doc.pdf", "rb") as f:
|
|
98
|
+
pdf_bytes = f.read()
|
|
99
|
+
|
|
100
|
+
results = process_pdf_with_vision(
|
|
101
|
+
pdf_name="technical_doc.pdf",
|
|
102
|
+
pdf=pdf_bytes,
|
|
103
|
+
llm=llm,
|
|
104
|
+
vision_prompt=custom_prompt,
|
|
105
|
+
reinforced_prompt=reinforcement_prompt,
|
|
106
|
+
dpi=300 # High resolution for detailed documents
|
|
107
|
+
)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
## Advanced Usage
|
|
111
|
+
|
|
112
|
+
### Batch Processing
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import glob
|
|
116
|
+
from pathlib import Path
|
|
117
|
+
|
|
118
|
+
def process_pdf_batch(pdf_directory, llm):
|
|
119
|
+
"""Process multiple PDFs in a directory"""
|
|
120
|
+
pdf_files = glob.glob(str(Path(pdf_directory) / "*.pdf"))
|
|
121
|
+
results = {}
|
|
122
|
+
|
|
123
|
+
for pdf_path in pdf_files:
|
|
124
|
+
pdf_name = Path(pdf_path).name
|
|
125
|
+
|
|
126
|
+
try:
|
|
127
|
+
with open(pdf_path, "rb") as f:
|
|
128
|
+
pdf_bytes = f.read()
|
|
129
|
+
|
|
130
|
+
results[pdf_name] = process_pdf_with_vision(
|
|
131
|
+
pdf_name=pdf_name,
|
|
132
|
+
pdf=pdf_bytes,
|
|
133
|
+
llm=llm
|
|
134
|
+
)
|
|
135
|
+
print(f"✓ Processed: {pdf_name}")
|
|
136
|
+
|
|
137
|
+
except Exception as e:
|
|
138
|
+
print(f"✗ Failed: {pdf_name} - {e}")
|
|
139
|
+
results[pdf_name] = {"error": str(e)}
|
|
140
|
+
|
|
141
|
+
return results
|
|
142
|
+
|
|
143
|
+
# Usage
|
|
144
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
145
|
+
batch_results = process_pdf_batch("./documents/", llm)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Integration with Different Models
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
# OpenAI GPT-4 Vision
|
|
152
|
+
from langchain_openai import ChatOpenAI
|
|
153
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
154
|
+
|
|
155
|
+
# Anthropic Claude (when vision is available)
|
|
156
|
+
from langchain_anthropic import ChatAnthropic
|
|
157
|
+
llm = ChatAnthropic(model="claude-3-sonnet-20240229", api_key="your-key")
|
|
158
|
+
|
|
159
|
+
# Google Gemini Vision
|
|
160
|
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
161
|
+
llm = ChatGoogleGenerativeAI(model="gemini-pro-vision", api_key="your-key")
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## Output Format
|
|
165
|
+
|
|
166
|
+
The package returns a list of dictionaries whose structure follows the output format requested in the prompt. For example:
|
|
167
|
+
|
|
168
|
+
```json
|
|
169
|
+
[
|
|
170
|
+
{
|
|
171
|
+
"content": "Extracted and structured content from the page",
|
|
172
|
+
"metadata": {
|
|
173
|
+
"pdf_name": "document.pdf",
|
|
174
|
+
"page_number": 1,
|
|
175
|
+
"error": "none",
|
|
176
|
+
"processing_time": "2.34s",
|
|
177
|
+
"model_used": "gpt-4-vision-preview"
|
|
178
|
+
}
|
|
179
|
+
}
|
|
180
|
+
]
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## Configuration Options
|
|
184
|
+
|
|
185
|
+
### Function Parameters
|
|
186
|
+
|
|
187
|
+
- **pdf_name** (str): Name identifier for the PDF file
|
|
188
|
+
- **pdf** (bytes): PDF content as bytes
|
|
189
|
+
- **llm** (BaseChatModel): LangChain vision-capable language model
|
|
190
|
+
- **vision_prompt** (str): Initial extraction prompt
|
|
191
|
+
- **reinforced_prompt** (str): Secondary refinement prompt
|
|
192
|
+
- **dpi** (int): Image resolution for PDF rendering (default: 300)
|
|
193
|
+
|
|
194
|
+
### Recommended DPI Settings
|
|
195
|
+
|
|
196
|
+
- **150 DPI**: Fast processing, basic documents
|
|
197
|
+
- **300 DPI**: Standard quality, most documents (default)
|
|
198
|
+
- **600 DPI**: High quality, detailed technical documents
|
|
199
|
+
|
|
200
|
+
## Use Cases
|
|
201
|
+
|
|
202
|
+
### Technical Documentation
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
# Optimized for technical manuals, specifications, and research papers
|
|
206
|
+
tech_prompt = """
|
|
207
|
+
Extract from this technical document:
|
|
208
|
+
1. Technical specifications and parameters
|
|
209
|
+
2. Procedural steps and instructions
|
|
210
|
+
3. Diagrams, schematics, and their descriptions
|
|
211
|
+
4. Tables with measurements and data points
|
|
212
|
+
5. Safety warnings and important notes
|
|
213
|
+
Format as structured JSON with clear sections.
|
|
214
|
+
"""
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Financial Documents
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
# Optimized for financial reports, statements, and analysis
|
|
221
|
+
finance_prompt = """
|
|
222
|
+
Analyze this financial document and extract:
|
|
223
|
+
1. Financial figures, ratios, and metrics
|
|
224
|
+
2. Tables with numerical data
|
|
225
|
+
3. Charts and graphs with their insights
|
|
226
|
+
4. Key financial statements sections
|
|
227
|
+
5. Important dates and periods
|
|
228
|
+
Return structured data suitable for financial analysis.
|
|
229
|
+
"""
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Research Papers
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
# Optimized for academic and research publications
|
|
236
|
+
research_prompt = """
|
|
237
|
+
Process this research document and identify:
|
|
238
|
+
1. Abstract and key findings
|
|
239
|
+
2. Methodology and experimental setup
|
|
240
|
+
3. Results, data tables, and statistics
|
|
241
|
+
4. Figures, graphs, and their captions
|
|
242
|
+
5. References and citations
|
|
243
|
+
Structure the output for academic analysis.
|
|
244
|
+
"""
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
## Error Handling
|
|
248
|
+
|
|
249
|
+
The package includes robust error handling:
|
|
250
|
+
|
|
251
|
+
```python
|
|
252
|
+
try:
|
|
253
|
+
results = process_pdf_with_vision(pdf_name, pdf_bytes, llm)
|
|
254
|
+
|
|
255
|
+
for result in results:
|
|
256
|
+
if result["metadata"]["error"] != "none":
|
|
257
|
+
print(f"Page {result['metadata']['page_number']} had errors: {result['metadata']['error']}")
|
|
258
|
+
else:
|
|
259
|
+
# Process successful result
|
|
260
|
+
print(f"Successfully processed page {result['metadata']['page_number']}")
|
|
261
|
+
|
|
262
|
+
except Exception as e:
|
|
263
|
+
print(f"Fatal error processing PDF: {e}")
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## Requirements
|
|
267
|
+
|
|
268
|
+
- Python 3.8+
|
|
269
|
+
- langchain ~= 0.3.27
|
|
270
|
+
- langchain-core ~= 0.3.72
|
|
271
|
+
- PyMuPDF (for PDF processing)
|
|
272
|
+
- pathlib ~= 1.0.1
|
|
273
|
+
- langsmith >= 0.1.17
|
|
274
|
+
|
|
275
|
+
## Performance Tips
|
|
276
|
+
|
|
277
|
+
1. **Optimize DPI**: Use 300 DPI for most documents; increase only for fine details
|
|
278
|
+
2. **Batch Processing**: Process multiple pages/documents in batches for efficiency
|
|
279
|
+
3. **Model Selection**: Choose appropriate models based on accuracy vs. speed requirements
|
|
280
|
+
4. **Prompt Engineering**: Craft specific prompts for your document types
|
|
281
|
+
5. **Error Handling**: Implement retry logic for transient API failures
|
|
282
|
+
|
|
283
|
+
## Contributing
|
|
284
|
+
|
|
285
|
+
We welcome contributions! Please see our contributing guidelines for more information.
|
|
286
|
+
|
|
287
|
+
## License
|
|
288
|
+
|
|
289
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
290
|
+
|
|
291
|
+
## Support
|
|
292
|
+
|
|
293
|
+
For issues, questions, or feature requests:
|
|
294
|
+
|
|
295
|
+
1. Check the [documentation](https://github.com/Strategion-GmbH/pure_visual_grounding)
|
|
296
|
+
2. Review existing issues on GitHub
|
|
297
|
+
3. Create a new issue with detailed information
|
|
298
|
+
|
|
299
|
+
## Changelog
|
|
300
|
+
|
|
301
|
+
### v1.0.8
|
|
302
|
+
|
|
303
|
+
- Improved error handling and metadata processing
|
|
304
|
+
- Enhanced prompt templates
|
|
305
|
+
- Better support for technical documents
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
**Author**: Strategion (development@strategion.de)
|
|
310
|
+
|
|
311
|
+
**Keywords**: PDF, OCR, Vision, LLM, Document Processing, Technical Documents, RAG, LangChain
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pure-visual-grounder
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A package for processing PDFs with vision-based language models
|
|
5
|
+
Author: Strategion
|
|
6
|
+
Author-email: Strategion <development@strategion.de>
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: langchain~=0.3.27
|
|
20
|
+
Requires-Dist: PyMuPDF
|
|
21
|
+
Requires-Dist: langchain-core~=0.3.72
|
|
22
|
+
Requires-Dist: pathlib~=1.0.1
|
|
23
|
+
Requires-Dist: langsmith>=0.1.17
|
|
24
|
+
Dynamic: author
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
Dynamic: requires-python
|
|
27
|
+
|
|
28
|
+
# Pure Visual Grounding
|
|
29
|
+
|
|
30
|
+
[](https://badge.fury.io/py/pure-visual-grounding)
|
|
31
|
+
[](https://www.python.org/downloads/release/python-380/)
|
|
32
|
+
[](https://opensource.org/licenses/MIT)
|
|
33
|
+
|
|
34
|
+
A Python package for processing PDFs with vision-based language models, specialized for technical document OCR and structured data extraction.
|
|
35
|
+
|
|
36
|
+
## Overview
|
|
37
|
+
|
|
38
|
+
Pure Visual Grounding leverages the power of vision-enabled large language models to extract structured information from PDF documents. This package is particularly useful for:
|
|
39
|
+
|
|
40
|
+
- **Technical Document Processing**: Extract structured data from complex technical documents
|
|
41
|
+
- **OCR with Context**: Go beyond simple OCR to understand document structure and meaning
|
|
42
|
+
- **RAG Pipeline Integration**: Prepare documents for Retrieval-Augmented Generation workflows
|
|
43
|
+
- **Automated Document Analysis**: Process large volumes of PDFs with consistent structured output
|
|
44
|
+
|
|
45
|
+
## Features
|
|
46
|
+
|
|
47
|
+
- 🔍 **Vision-based PDF Processing**: Uses advanced vision models for accurate document analysis
|
|
48
|
+
- 📄 **Multi-page Support**: Processes entire PDF documents page by page
|
|
49
|
+
- 🏗️ **Structured Output**: Returns organized JSON data with metadata for each page
|
|
50
|
+
- 🎯 **Customizable Prompts**: Configure extraction prompts for specific document types
|
|
51
|
+
- 🔄 **Two-pass Processing**: Initial extraction followed by refinement for improved accuracy
|
|
52
|
+
- 📊 **High-DPI Rendering**: Configurable DPI settings for optimal image quality
|
|
53
|
+
- 🛠️ **LangChain Integration**: Built on LangChain for easy model swapping and configuration
|
|
54
|
+
|
|
55
|
+
## Installation
|
|
56
|
+
|
|
57
|
+
Install from PyPI:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install pure-visual-grounder
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Quick Start
|
|
64
|
+
|
|
65
|
+
### Basic Usage
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from pure_visual_grounding import process_pdf_with_vision
|
|
69
|
+
from langchain_openai import ChatOpenAI
|
|
70
|
+
|
|
71
|
+
# Initialize your vision model
|
|
72
|
+
llm = ChatOpenAI(
|
|
73
|
+
model="gpt-4-vision-preview",
|
|
74
|
+
api_key="your-openai-api-key"
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Read PDF file as bytes
|
|
78
|
+
with open("document.pdf", "rb") as f:
|
|
79
|
+
pdf_bytes = f.read()
|
|
80
|
+
|
|
81
|
+
# Process the PDF
|
|
82
|
+
results = process_pdf_with_vision(
|
|
83
|
+
pdf_name="document.pdf",
|
|
84
|
+
pdf=pdf_bytes,
|
|
85
|
+
llm=llm,
|
|
86
|
+
vision_prompt="First prompt to get the information out of image",
|
|
87
|
+
reinforced_prompt="Reinforced prompt to make sure all information is extracted"
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# Access structured results
|
|
91
|
+
for page_result in results:
|
|
92
|
+
print(f"Page {page_result['metadata']['page_number']}")
|
|
93
|
+
print(f"Content: {page_result.get('content', 'No content extracted')}")
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Custom Processing
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from pure_visual_grounding import process_pdf_with_vision
|
|
100
|
+
from langchain_openai import ChatOpenAI
|
|
101
|
+
|
|
102
|
+
# Custom prompts for specific document types
|
|
103
|
+
custom_prompt = """
|
|
104
|
+
Analyze this technical document page and extract:
|
|
105
|
+
1. Section headings and hierarchy
|
|
106
|
+
2. Technical specifications and parameters
|
|
107
|
+
3. Diagrams, charts, and visual elements
|
|
108
|
+
4. Tables with numerical data
|
|
109
|
+
5. Key formulas or equations
|
|
110
|
+
|
|
111
|
+
Format the response as structured JSON with clear categorization.
|
|
112
|
+
"""
|
|
113
|
+
|
|
114
|
+
reinforcement_prompt = """
|
|
115
|
+
Review the previous analysis and enhance it by:
|
|
116
|
+
1. Ensuring all technical details are captured accurately
|
|
117
|
+
2. Organizing information hierarchically
|
|
118
|
+
3. Adding any missed visual elements
|
|
119
|
+
4. Validating numerical data and units
|
|
120
|
+
"""
|
|
121
|
+
|
|
122
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
123
|
+
|
|
124
|
+
with open("technical_doc.pdf", "rb") as f:
|
|
125
|
+
pdf_bytes = f.read()
|
|
126
|
+
|
|
127
|
+
results = process_pdf_with_vision(
|
|
128
|
+
pdf_name="technical_doc.pdf",
|
|
129
|
+
pdf=pdf_bytes,
|
|
130
|
+
llm=llm,
|
|
131
|
+
vision_prompt=custom_prompt,
|
|
132
|
+
reinforced_prompt=reinforcement_prompt,
|
|
133
|
+
dpi=300 # High resolution for detailed documents
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Advanced Usage
|
|
138
|
+
|
|
139
|
+
### Batch Processing
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
import glob
|
|
143
|
+
from pathlib import Path
|
|
144
|
+
|
|
145
|
+
def process_pdf_batch(pdf_directory, llm):
|
|
146
|
+
"""Process multiple PDFs in a directory"""
|
|
147
|
+
pdf_files = glob.glob(str(Path(pdf_directory) / "*.pdf"))
|
|
148
|
+
results = {}
|
|
149
|
+
|
|
150
|
+
for pdf_path in pdf_files:
|
|
151
|
+
pdf_name = Path(pdf_path).name
|
|
152
|
+
|
|
153
|
+
try:
|
|
154
|
+
with open(pdf_path, "rb") as f:
|
|
155
|
+
pdf_bytes = f.read()
|
|
156
|
+
|
|
157
|
+
results[pdf_name] = process_pdf_with_vision(
|
|
158
|
+
pdf_name=pdf_name,
|
|
159
|
+
pdf=pdf_bytes,
|
|
160
|
+
llm=llm
|
|
161
|
+
)
|
|
162
|
+
print(f"✓ Processed: {pdf_name}")
|
|
163
|
+
|
|
164
|
+
except Exception as e:
|
|
165
|
+
print(f"✗ Failed: {pdf_name} - {e}")
|
|
166
|
+
results[pdf_name] = {"error": str(e)}
|
|
167
|
+
|
|
168
|
+
return results
|
|
169
|
+
|
|
170
|
+
# Usage
|
|
171
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
172
|
+
batch_results = process_pdf_batch("./documents/", llm)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Integration with Different Models
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
# OpenAI GPT-4 Vision
|
|
179
|
+
from langchain_openai import ChatOpenAI
|
|
180
|
+
llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
|
|
181
|
+
|
|
182
|
+
# Anthropic Claude 3 (vision-capable)
|
|
183
|
+
from langchain_anthropic import ChatAnthropic
|
|
184
|
+
llm = ChatAnthropic(model="claude-3-sonnet-20240229", api_key="your-key")
|
|
185
|
+
|
|
186
|
+
# Google Gemini Vision
|
|
187
|
+
from langchain_google_genai import ChatGoogleGenerativeAI
|
|
188
|
+
llm = ChatGoogleGenerativeAI(model="gemini-pro-vision", api_key="your-key")
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Output Format
|
|
192
|
+
|
|
193
|
+
The package returns a list of dictionaries, one per page, whose structure follows the output format requested in the prompt. For example:
|
|
194
|
+
|
|
195
|
+
```json
|
|
196
|
+
[
|
|
197
|
+
{
|
|
198
|
+
"content": "Extracted and structured content from the page",
|
|
199
|
+
"metadata": {
|
|
200
|
+
"pdf_name": "document.pdf",
|
|
201
|
+
"page_number": 1,
|
|
202
|
+
"error": "none",
|
|
203
|
+
"processing_time": "2.34s",
|
|
204
|
+
"model_used": "gpt-4-vision-preview"
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
]
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Configuration Options
|
|
211
|
+
|
|
212
|
+
### Function Parameters
|
|
213
|
+
|
|
214
|
+
- **pdf_name** (str): Name identifier for the PDF file
|
|
215
|
+
- **pdf** (bytes): PDF content as bytes
|
|
216
|
+
- **llm** (BaseChatModel): LangChain vision-capable language model
|
|
217
|
+
- **vision_prompt** (str): Initial extraction prompt
|
|
218
|
+
- **reinforced_prompt** (str): Secondary refinement prompt
|
|
219
|
+
- **dpi** (int): Image resolution for PDF rendering (default: 300)
|
|
220
|
+
|
|
221
|
+
### Recommended DPI Settings
|
|
222
|
+
|
|
223
|
+
- **150 DPI**: Fast processing, basic documents
|
|
224
|
+
- **300 DPI**: Standard quality, most documents (default)
|
|
225
|
+
- **600 DPI**: High quality, detailed technical documents
|
|
226
|
+
|
|
227
|
+
## Use Cases
|
|
228
|
+
|
|
229
|
+
### Technical Documentation
|
|
230
|
+
|
|
231
|
+
```python
|
|
232
|
+
# Optimized for technical manuals, specifications, and research papers
|
|
233
|
+
tech_prompt = """
|
|
234
|
+
Extract from this technical document:
|
|
235
|
+
1. Technical specifications and parameters
|
|
236
|
+
2. Procedural steps and instructions
|
|
237
|
+
3. Diagrams, schematics, and their descriptions
|
|
238
|
+
4. Tables with measurements and data points
|
|
239
|
+
5. Safety warnings and important notes
|
|
240
|
+
Format as structured JSON with clear sections.
|
|
241
|
+
"""
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
### Financial Documents
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
# Optimized for financial reports, statements, and analysis
|
|
248
|
+
finance_prompt = """
|
|
249
|
+
Analyze this financial document and extract:
|
|
250
|
+
1. Financial figures, ratios, and metrics
|
|
251
|
+
2. Tables with numerical data
|
|
252
|
+
3. Charts and graphs with their insights
|
|
253
|
+
4. Key financial statements sections
|
|
254
|
+
5. Important dates and periods
|
|
255
|
+
Return structured data suitable for financial analysis.
|
|
256
|
+
"""
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### Research Papers
|
|
260
|
+
|
|
261
|
+
```python
|
|
262
|
+
# Optimized for academic and research publications
|
|
263
|
+
research_prompt = """
|
|
264
|
+
Process this research document and identify:
|
|
265
|
+
1. Abstract and key findings
|
|
266
|
+
2. Methodology and experimental setup
|
|
267
|
+
3. Results, data tables, and statistics
|
|
268
|
+
4. Figures, graphs, and their captions
|
|
269
|
+
5. References and citations
|
|
270
|
+
Structure the output for academic analysis.
|
|
271
|
+
"""
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
## Error Handling
|
|
275
|
+
|
|
276
|
+
The package includes robust error handling:
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
try:
|
|
280
|
+
results = process_pdf_with_vision(pdf_name, pdf_bytes, llm)
|
|
281
|
+
|
|
282
|
+
for result in results:
|
|
283
|
+
if result["metadata"]["error"] != "none":
|
|
284
|
+
print(f"Page {result['metadata']['page_number']} had errors: {result['metadata']['error']}")
|
|
285
|
+
else:
|
|
286
|
+
# Process successful result
|
|
287
|
+
print(f"Successfully processed page {result['metadata']['page_number']}")
|
|
288
|
+
|
|
289
|
+
except Exception as e:
|
|
290
|
+
print(f"Fatal error processing PDF: {e}")
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
## Requirements
|
|
294
|
+
|
|
295
|
+
- Python 3.8+
|
|
296
|
+
- langchain ~= 0.3.27
|
|
297
|
+
- langchain-core ~= 0.3.72
|
|
298
|
+
- PyMuPDF (for PDF processing)
|
|
299
|
+
- pathlib ~= 1.0.1
|
|
300
|
+
- langsmith >= 0.1.17
|
|
301
|
+
|
|
302
|
+
## Performance Tips
|
|
303
|
+
|
|
304
|
+
1. **Optimize DPI**: Use 300 DPI for most documents; increase only for fine details
|
|
305
|
+
2. **Batch Processing**: Process multiple pages/documents in batches for efficiency
|
|
306
|
+
3. **Model Selection**: Choose appropriate models based on accuracy vs. speed requirements
|
|
307
|
+
4. **Prompt Engineering**: Craft specific prompts for your document types
|
|
308
|
+
5. **Error Handling**: Implement retry logic for transient API failures
|
|
309
|
+
|
|
310
|
+
## Contributing
|
|
311
|
+
|
|
312
|
+
We welcome contributions! Please see our contributing guidelines for more information.
|
|
313
|
+
|
|
314
|
+
## License
|
|
315
|
+
|
|
316
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
317
|
+
|
|
318
|
+
## Support
|
|
319
|
+
|
|
320
|
+
For issues, questions, or feature requests:
|
|
321
|
+
|
|
322
|
+
1. Check the [documentation](https://github.com/Strategion-GmbH/pure_visual_grounding)
|
|
323
|
+
2. Review existing issues on GitHub
|
|
324
|
+
3. Create a new issue with detailed information
|
|
325
|
+
|
|
326
|
+
## Changelog
|
|
327
|
+
|
|
328
|
+
### v1.0.8
|
|
329
|
+
|
|
330
|
+
- Improved error handling and metadata processing
|
|
331
|
+
- Enhanced prompt templates
|
|
332
|
+
- Better support for technical documents
|
|
333
|
+
|
|
334
|
+
---
|
|
335
|
+
|
|
336
|
+
**Author**: Strategion (development@strategion.de)
|
|
337
|
+
|
|
338
|
+
**Keywords**: PDF, OCR, Vision, LLM, Document Processing, Technical Documents, RAG, LangChain
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
requirements.txt
|
|
6
|
+
setup.py
|
|
7
|
+
pure_visual_grounder.egg-info/PKG-INFO
|
|
8
|
+
pure_visual_grounder.egg-info/SOURCES.txt
|
|
9
|
+
pure_visual_grounder.egg-info/dependency_links.txt
|
|
10
|
+
pure_visual_grounder.egg-info/requires.txt
|
|
11
|
+
pure_visual_grounder.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pure-visual-grounder"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "Strategion", email = "development@strategion.de"}
|
|
10
|
+
]
|
|
11
|
+
description = "A package for processing PDFs with vision-based language models"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = {text = "MIT"}
|
|
14
|
+
requires-python = ">=3.8"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.8",
|
|
21
|
+
"Programming Language :: Python :: 3.9",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
]
|
|
25
|
+
dynamic = ["dependencies"]
|
|
26
|
+
|
|
27
|
+
[tool.setuptools.dynamic]
|
|
28
|
+
dependencies = {file = ["requirements.txt"]}
|
|
29
|
+
|
|
30
|
+
[tool.setuptools.packages.find]
|
|
31
|
+
include = ["pure_visual_grounding*"]
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.package-data]
|
|
34
|
+
"*" = ["*.txt", "*.md"]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Setuptools build script for the ``pure-visual-grounder`` distribution.

The same metadata is declared in ``pyproject.toml``; the values below are kept
in sync with it for tools that still invoke ``setup.py`` directly.
"""
from setuptools import setup, find_packages

# The README doubles as the long description; it is Markdown, which is what
# ``long_description_content_type`` below declares.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# One requirement specifier per line. Blank lines and comment lines are not
# valid specifiers, so they are dropped before reaching ``install_requires``.
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [
        line.strip()
        for line in fh
        if line.strip() and not line.lstrip().startswith("#")
    ]

setup(
    name="pure-visual-grounder",
    # Must match ``version`` in pyproject.toml.
    version="1.0.0",
    author="Strategion",
    author_email="development@strategion.de",
    description="A package for processing PDFs with vision-based language models",
    # Use the README content read above instead of a hard-coded plain-text blurb.
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(include=['pure_visual_grounding', 'pure_visual_grounding.*']),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    include_package_data=True,
)
|