pure-visual-grounder 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Strategion
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ include requirements.txt
3
+ include LICENSE
4
+ recursive-include pure_visual_grounding *.py
5
+ recursive-exclude * __pycache__
6
+ recursive-exclude * *.py[co]
@@ -0,0 +1,338 @@
1
+ Metadata-Version: 2.4
2
+ Name: pure-visual-grounder
3
+ Version: 1.0.0
4
+ Summary: A package for processing PDFs with vision-based language models
5
+ Author: Strategion
6
+ Author-email: Strategion <development@strategion.de>
7
+ License: MIT
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: langchain~=0.3.27
20
+ Requires-Dist: PyMuPDF
21
+ Requires-Dist: langchain-core~=0.3.72
22
+ Requires-Dist: pathlib~=1.0.1
23
+ Requires-Dist: langsmith>=0.1.17
24
+ Dynamic: author
25
+ Dynamic: license-file
26
+ Dynamic: requires-python
27
+
28
+ # Pure Visual Grounding
29
+
30
+ [![PyPI version](https://badge.fury.io/py/pure-visual-grounder.svg)](https://badge.fury.io/py/pure-visual-grounder)
31
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-380/)
32
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
+
34
+ A Python package for processing PDFs with vision-based language models, specialized for technical document OCR and structured data extraction.
35
+
36
+ ## Overview
37
+
38
+ Pure Visual Grounding leverages the power of vision-enabled large language models to extract structured information from PDF documents. This package is particularly useful for:
39
+
40
+ - **Technical Document Processing**: Extract structured data from complex technical documents
41
+ - **OCR with Context**: Go beyond simple OCR to understand document structure and meaning
42
+ - **RAG Pipeline Integration**: Prepare documents for Retrieval-Augmented Generation workflows
43
+ - **Automated Document Analysis**: Process large volumes of PDFs with consistent structured output
44
+
45
+ ## Features
46
+
47
+ - 🔍 **Vision-based PDF Processing**: Uses advanced vision models for accurate document analysis
48
+ - 📄 **Multi-page Support**: Processes entire PDF documents page by page
49
+ - 🏗️ **Structured Output**: Returns organized JSON data with metadata for each page
50
+ - 🎯 **Customizable Prompts**: Configure extraction prompts for specific document types
51
+ - 🔄 **Two-pass Processing**: Initial extraction followed by refinement for improved accuracy
52
+ - 📊 **High-DPI Rendering**: Configurable DPI settings for optimal image quality
53
+ - 🛠️ **LangChain Integration**: Built on LangChain for easy model swapping and configuration
54
+
55
+ ## Installation
56
+
57
+ Install from PyPI:
58
+
59
+ ```bash
60
+ pip install pure-visual-grounder
61
+ ```
62
+
63
+ ## Quick Start
64
+
65
+ ### Basic Usage
66
+
67
+ ```python
68
+ from pure_visual_grounding import process_pdf_with_vision
69
+ from langchain_openai import ChatOpenAI
70
+
71
+ # Initialize your vision model
72
+ llm = ChatOpenAI(
73
+ model="gpt-4-vision-preview",
74
+ api_key="your-openai-api-key"
75
+ )
76
+
77
+ # Read PDF file as bytes
78
+ with open("document.pdf", "rb") as f:
79
+ pdf_bytes = f.read()
80
+
81
+ # Process the PDF
82
+ results = process_pdf_with_vision(
83
+ pdf_name="document.pdf",
84
+ pdf=pdf_bytes,
85
+ llm=llm,
86
+ vision_prompt="First prompt to get the information out of image",
87
+ reinforced_prompt="Reinforced prompt to make sure all information is extracted"
88
+ )
89
+
90
+ # Access structured results
91
+ for page_result in results:
92
+ print(f"Page {page_result['metadata']['page_number']}")
93
+ print(f"Content: {page_result.get('content', 'No content extracted')}")
94
+ ```
95
+
96
+ ### Custom Processing
97
+
98
+ ```python
99
+ from pure_visual_grounding import process_pdf_with_vision
100
+ from langchain_openai import ChatOpenAI
101
+
102
+ # Custom prompts for specific document types
103
+ custom_prompt = """
104
+ Analyze this technical document page and extract:
105
+ 1. Section headings and hierarchy
106
+ 2. Technical specifications and parameters
107
+ 3. Diagrams, charts, and visual elements
108
+ 4. Tables with numerical data
109
+ 5. Key formulas or equations
110
+
111
+ Format the response as structured JSON with clear categorization.
112
+ """
113
+
114
+ reinforcement_prompt = """
115
+ Review the previous analysis and enhance it by:
116
+ 1. Ensuring all technical details are captured accurately
117
+ 2. Organizing information hierarchically
118
+ 3. Adding any missed visual elements
119
+ 4. Validating numerical data and units
120
+ """
121
+
122
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
123
+
124
+ with open("technical_doc.pdf", "rb") as f:
125
+ pdf_bytes = f.read()
126
+
127
+ results = process_pdf_with_vision(
128
+ pdf_name="technical_doc.pdf",
129
+ pdf=pdf_bytes,
130
+ llm=llm,
131
+ vision_prompt=custom_prompt,
132
+ reinforced_prompt=reinforcement_prompt,
133
+ dpi=300 # High resolution for detailed documents
134
+ )
135
+ ```
136
+
137
+ ## Advanced Usage
138
+
139
+ ### Batch Processing
140
+
141
+ ```python
142
+ import glob
143
+ from pathlib import Path
144
+
145
+ def process_pdf_batch(pdf_directory, llm):
146
+ """Process multiple PDFs in a directory"""
147
+ pdf_files = glob.glob(str(Path(pdf_directory) / "*.pdf"))
148
+ results = {}
149
+
150
+ for pdf_path in pdf_files:
151
+ pdf_name = Path(pdf_path).name
152
+
153
+ try:
154
+ with open(pdf_path, "rb") as f:
155
+ pdf_bytes = f.read()
156
+
157
+ results[pdf_name] = process_pdf_with_vision(
158
+ pdf_name=pdf_name,
159
+ pdf=pdf_bytes,
160
+ llm=llm
161
+ )
162
+ print(f"✓ Processed: {pdf_name}")
163
+
164
+ except Exception as e:
165
+ print(f"✗ Failed: {pdf_name} - {e}")
166
+ results[pdf_name] = {"error": str(e)}
167
+
168
+ return results
169
+
170
+ # Usage
171
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
172
+ batch_results = process_pdf_batch("./documents/", llm)
173
+ ```
174
+
175
+ ### Integration with Different Models
176
+
177
+ ```python
178
+ # OpenAI GPT-4 Vision
179
+ from langchain_openai import ChatOpenAI
180
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
181
+
182
+ # Anthropic Claude (when vision is available)
183
+ from langchain_anthropic import ChatAnthropic
184
+ llm = ChatAnthropic(model="claude-3-sonnet-20240229", api_key="your-key")
185
+
186
+ # Google Gemini Vision
187
+ from langchain_google_genai import ChatGoogleGenerativeAI
188
+ llm = ChatGoogleGenerativeAI(model="gemini-pro-vision", api_key="your-key")
189
+ ```
190
+
191
+ ## Output Format
192
+
193
+ The package returns a list of dictionaries whose structure follows the output format requested in the prompt. For example:
194
+
195
+ ```json
196
+ [
197
+ {
198
+ "content": "Extracted and structured content from the page",
199
+ "metadata": {
200
+ "pdf_name": "document.pdf",
201
+ "page_number": 1,
202
+ "error": "none",
203
+ "processing_time": "2.34s",
204
+ "model_used": "gpt-4-vision-preview"
205
+ }
206
+ }
207
+ ]
208
+ ```
209
+
210
+ ## Configuration Options
211
+
212
+ ### Function Parameters
213
+
214
+ - **pdf_name** (str): Name identifier for the PDF file
215
+ - **pdf** (bytes): PDF content as bytes
216
+ - **llm** (BaseChatModel): LangChain vision-capable language model
217
+ - **vision_prompt** (str): Initial extraction prompt
218
+ - **reinforced_prompt** (str): Secondary refinement prompt
219
+ - **dpi** (int): Image resolution for PDF rendering (default: 300)
220
+
221
+ ### Recommended DPI Settings
222
+
223
+ - **150 DPI**: Fast processing, basic documents
224
+ - **300 DPI**: Standard quality, most documents (default)
225
+ - **600 DPI**: High quality, detailed technical documents
226
+
227
+ ## Use Cases
228
+
229
+ ### Technical Documentation
230
+
231
+ ```python
232
+ # Optimized for technical manuals, specifications, and research papers
233
+ tech_prompt = """
234
+ Extract from this technical document:
235
+ 1. Technical specifications and parameters
236
+ 2. Procedural steps and instructions
237
+ 3. Diagrams, schematics, and their descriptions
238
+ 4. Tables with measurements and data points
239
+ 5. Safety warnings and important notes
240
+ Format as structured JSON with clear sections.
241
+ """
242
+ ```
243
+
244
+ ### Financial Documents
245
+
246
+ ```python
247
+ # Optimized for financial reports, statements, and analysis
248
+ finance_prompt = """
249
+ Analyze this financial document and extract:
250
+ 1. Financial figures, ratios, and metrics
251
+ 2. Tables with numerical data
252
+ 3. Charts and graphs with their insights
253
+ 4. Key financial statements sections
254
+ 5. Important dates and periods
255
+ Return structured data suitable for financial analysis.
256
+ """
257
+ ```
258
+
259
+ ### Research Papers
260
+
261
+ ```python
262
+ # Optimized for academic and research publications
263
+ research_prompt = """
264
+ Process this research document and identify:
265
+ 1. Abstract and key findings
266
+ 2. Methodology and experimental setup
267
+ 3. Results, data tables, and statistics
268
+ 4. Figures, graphs, and their captions
269
+ 5. References and citations
270
+ Structure the output for academic analysis.
271
+ """
272
+ ```
273
+
274
+ ## Error Handling
275
+
276
+ The package includes robust error handling:
277
+
278
+ ```python
279
+ try:
280
+ results = process_pdf_with_vision(pdf_name, pdf_bytes, llm)
281
+
282
+ for result in results:
283
+ if result["metadata"]["error"] != "none":
284
+ print(f"Page {result['metadata']['page_number']} had errors: {result['metadata']['error']}")
285
+ else:
286
+ # Process successful result
287
+ print(f"Successfully processed page {result['metadata']['page_number']}")
288
+
289
+ except Exception as e:
290
+ print(f"Fatal error processing PDF: {e}")
291
+ ```
292
+
293
+ ## Requirements
294
+
295
+ - Python 3.8+
296
+ - langchain ~= 0.3.27
297
+ - langchain-core ~= 0.3.72
298
+ - PyMuPDF (for PDF processing)
299
+ - pathlib ~= 1.0.1
300
+ - langsmith >= 0.1.17
301
+
302
+ ## Performance Tips
303
+
304
+ 1. **Optimize DPI**: Use 300 DPI for most documents; increase only for fine details
305
+ 2. **Batch Processing**: Process multiple pages/documents in batches for efficiency
306
+ 3. **Model Selection**: Choose appropriate models based on accuracy vs. speed requirements
307
+ 4. **Prompt Engineering**: Craft specific prompts for your document types
308
+ 5. **Error Handling**: Implement retry logic for transient API failures
309
+
310
+ ## Contributing
311
+
312
+ We welcome contributions! Please see our contributing guidelines for more information.
313
+
314
+ ## License
315
+
316
+ This project is licensed under the MIT License - see the LICENSE file for details.
317
+
318
+ ## Support
319
+
320
+ For issues, questions, or feature requests:
321
+
322
+ 1. Check the [documentation](https://github.com/Strategion-GmbH/pure_visual_grounding)
323
+ 2. Review existing issues on GitHub
324
+ 3. Create a new issue with detailed information
325
+
326
+ ## Changelog
327
+
328
+ ### v1.0.8
329
+
330
+ - Improved error handling and metadata processing
331
+ - Enhanced prompt templates
332
+ - Better support for technical documents
333
+
334
+ ---
335
+
336
+ **Author**: Strategion (development@strategion.de)
337
+
338
+ **Keywords**: PDF, OCR, Vision, LLM, Document Processing, Technical Documents, RAG, LangChain
@@ -0,0 +1,311 @@
1
+ # Pure Visual Grounding
2
+
3
+ [![PyPI version](https://badge.fury.io/py/pure-visual-grounder.svg)](https://badge.fury.io/py/pure-visual-grounder)
4
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-380/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+
7
+ A Python package for processing PDFs with vision-based language models, specialized for technical document OCR and structured data extraction.
8
+
9
+ ## Overview
10
+
11
+ Pure Visual Grounding leverages the power of vision-enabled large language models to extract structured information from PDF documents. This package is particularly useful for:
12
+
13
+ - **Technical Document Processing**: Extract structured data from complex technical documents
14
+ - **OCR with Context**: Go beyond simple OCR to understand document structure and meaning
15
+ - **RAG Pipeline Integration**: Prepare documents for Retrieval-Augmented Generation workflows
16
+ - **Automated Document Analysis**: Process large volumes of PDFs with consistent structured output
17
+
18
+ ## Features
19
+
20
+ - 🔍 **Vision-based PDF Processing**: Uses advanced vision models for accurate document analysis
21
+ - 📄 **Multi-page Support**: Processes entire PDF documents page by page
22
+ - 🏗️ **Structured Output**: Returns organized JSON data with metadata for each page
23
+ - 🎯 **Customizable Prompts**: Configure extraction prompts for specific document types
24
+ - 🔄 **Two-pass Processing**: Initial extraction followed by refinement for improved accuracy
25
+ - 📊 **High-DPI Rendering**: Configurable DPI settings for optimal image quality
26
+ - 🛠️ **LangChain Integration**: Built on LangChain for easy model swapping and configuration
27
+
28
+ ## Installation
29
+
30
+ Install from PyPI:
31
+
32
+ ```bash
33
+ pip install pure-visual-grounder
34
+ ```
35
+
36
+ ## Quick Start
37
+
38
+ ### Basic Usage
39
+
40
+ ```python
41
+ from pure_visual_grounding import process_pdf_with_vision
42
+ from langchain_openai import ChatOpenAI
43
+
44
+ # Initialize your vision model
45
+ llm = ChatOpenAI(
46
+ model="gpt-4-vision-preview",
47
+ api_key="your-openai-api-key"
48
+ )
49
+
50
+ # Read PDF file as bytes
51
+ with open("document.pdf", "rb") as f:
52
+ pdf_bytes = f.read()
53
+
54
+ # Process the PDF
55
+ results = process_pdf_with_vision(
56
+ pdf_name="document.pdf",
57
+ pdf=pdf_bytes,
58
+ llm=llm,
59
+ vision_prompt="First prompt to get the information out of image",
60
+ reinforced_prompt="Reinforced prompt to make sure all information is extracted"
61
+ )
62
+
63
+ # Access structured results
64
+ for page_result in results:
65
+ print(f"Page {page_result['metadata']['page_number']}")
66
+ print(f"Content: {page_result.get('content', 'No content extracted')}")
67
+ ```
68
+
69
+ ### Custom Processing
70
+
71
+ ```python
72
+ from pure_visual_grounding import process_pdf_with_vision
73
+ from langchain_openai import ChatOpenAI
74
+
75
+ # Custom prompts for specific document types
76
+ custom_prompt = """
77
+ Analyze this technical document page and extract:
78
+ 1. Section headings and hierarchy
79
+ 2. Technical specifications and parameters
80
+ 3. Diagrams, charts, and visual elements
81
+ 4. Tables with numerical data
82
+ 5. Key formulas or equations
83
+
84
+ Format the response as structured JSON with clear categorization.
85
+ """
86
+
87
+ reinforcement_prompt = """
88
+ Review the previous analysis and enhance it by:
89
+ 1. Ensuring all technical details are captured accurately
90
+ 2. Organizing information hierarchically
91
+ 3. Adding any missed visual elements
92
+ 4. Validating numerical data and units
93
+ """
94
+
95
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
96
+
97
+ with open("technical_doc.pdf", "rb") as f:
98
+ pdf_bytes = f.read()
99
+
100
+ results = process_pdf_with_vision(
101
+ pdf_name="technical_doc.pdf",
102
+ pdf=pdf_bytes,
103
+ llm=llm,
104
+ vision_prompt=custom_prompt,
105
+ reinforced_prompt=reinforcement_prompt,
106
+ dpi=300 # High resolution for detailed documents
107
+ )
108
+ ```
109
+
110
+ ## Advanced Usage
111
+
112
+ ### Batch Processing
113
+
114
+ ```python
115
+ import glob
116
+ from pathlib import Path
117
+
118
+ def process_pdf_batch(pdf_directory, llm):
119
+ """Process multiple PDFs in a directory"""
120
+ pdf_files = glob.glob(str(Path(pdf_directory) / "*.pdf"))
121
+ results = {}
122
+
123
+ for pdf_path in pdf_files:
124
+ pdf_name = Path(pdf_path).name
125
+
126
+ try:
127
+ with open(pdf_path, "rb") as f:
128
+ pdf_bytes = f.read()
129
+
130
+ results[pdf_name] = process_pdf_with_vision(
131
+ pdf_name=pdf_name,
132
+ pdf=pdf_bytes,
133
+ llm=llm
134
+ )
135
+ print(f"✓ Processed: {pdf_name}")
136
+
137
+ except Exception as e:
138
+ print(f"✗ Failed: {pdf_name} - {e}")
139
+ results[pdf_name] = {"error": str(e)}
140
+
141
+ return results
142
+
143
+ # Usage
144
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
145
+ batch_results = process_pdf_batch("./documents/", llm)
146
+ ```
147
+
148
+ ### Integration with Different Models
149
+
150
+ ```python
151
+ # OpenAI GPT-4 Vision
152
+ from langchain_openai import ChatOpenAI
153
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
154
+
155
+ # Anthropic Claude (when vision is available)
156
+ from langchain_anthropic import ChatAnthropic
157
+ llm = ChatAnthropic(model="claude-3-sonnet-20240229", api_key="your-key")
158
+
159
+ # Google Gemini Vision
160
+ from langchain_google_genai import ChatGoogleGenerativeAI
161
+ llm = ChatGoogleGenerativeAI(model="gemini-pro-vision", api_key="your-key")
162
+ ```
163
+
164
+ ## Output Format
165
+
166
+ The package returns a list of dictionaries whose structure follows the output format requested in the prompt. For example:
167
+
168
+ ```json
169
+ [
170
+ {
171
+ "content": "Extracted and structured content from the page",
172
+ "metadata": {
173
+ "pdf_name": "document.pdf",
174
+ "page_number": 1,
175
+ "error": "none",
176
+ "processing_time": "2.34s",
177
+ "model_used": "gpt-4-vision-preview"
178
+ }
179
+ }
180
+ ]
181
+ ```
182
+
183
+ ## Configuration Options
184
+
185
+ ### Function Parameters
186
+
187
+ - **pdf_name** (str): Name identifier for the PDF file
188
+ - **pdf** (bytes): PDF content as bytes
189
+ - **llm** (BaseChatModel): LangChain vision-capable language model
190
+ - **vision_prompt** (str): Initial extraction prompt
191
+ - **reinforced_prompt** (str): Secondary refinement prompt
192
+ - **dpi** (int): Image resolution for PDF rendering (default: 300)
193
+
194
+ ### Recommended DPI Settings
195
+
196
+ - **150 DPI**: Fast processing, basic documents
197
+ - **300 DPI**: Standard quality, most documents (default)
198
+ - **600 DPI**: High quality, detailed technical documents
199
+
200
+ ## Use Cases
201
+
202
+ ### Technical Documentation
203
+
204
+ ```python
205
+ # Optimized for technical manuals, specifications, and research papers
206
+ tech_prompt = """
207
+ Extract from this technical document:
208
+ 1. Technical specifications and parameters
209
+ 2. Procedural steps and instructions
210
+ 3. Diagrams, schematics, and their descriptions
211
+ 4. Tables with measurements and data points
212
+ 5. Safety warnings and important notes
213
+ Format as structured JSON with clear sections.
214
+ """
215
+ ```
216
+
217
+ ### Financial Documents
218
+
219
+ ```python
220
+ # Optimized for financial reports, statements, and analysis
221
+ finance_prompt = """
222
+ Analyze this financial document and extract:
223
+ 1. Financial figures, ratios, and metrics
224
+ 2. Tables with numerical data
225
+ 3. Charts and graphs with their insights
226
+ 4. Key financial statements sections
227
+ 5. Important dates and periods
228
+ Return structured data suitable for financial analysis.
229
+ """
230
+ ```
231
+
232
+ ### Research Papers
233
+
234
+ ```python
235
+ # Optimized for academic and research publications
236
+ research_prompt = """
237
+ Process this research document and identify:
238
+ 1. Abstract and key findings
239
+ 2. Methodology and experimental setup
240
+ 3. Results, data tables, and statistics
241
+ 4. Figures, graphs, and their captions
242
+ 5. References and citations
243
+ Structure the output for academic analysis.
244
+ """
245
+ ```
246
+
247
+ ## Error Handling
248
+
249
+ The package includes robust error handling:
250
+
251
+ ```python
252
+ try:
253
+ results = process_pdf_with_vision(pdf_name, pdf_bytes, llm)
254
+
255
+ for result in results:
256
+ if result["metadata"]["error"] != "none":
257
+ print(f"Page {result['metadata']['page_number']} had errors: {result['metadata']['error']}")
258
+ else:
259
+ # Process successful result
260
+ print(f"Successfully processed page {result['metadata']['page_number']}")
261
+
262
+ except Exception as e:
263
+ print(f"Fatal error processing PDF: {e}")
264
+ ```
265
+
266
+ ## Requirements
267
+
268
+ - Python 3.8+
269
+ - langchain ~= 0.3.27
270
+ - langchain-core ~= 0.3.72
271
+ - PyMuPDF (for PDF processing)
272
+ - pathlib ~= 1.0.1
273
+ - langsmith >= 0.1.17
274
+
275
+ ## Performance Tips
276
+
277
+ 1. **Optimize DPI**: Use 300 DPI for most documents; increase only for fine details
278
+ 2. **Batch Processing**: Process multiple pages/documents in batches for efficiency
279
+ 3. **Model Selection**: Choose appropriate models based on accuracy vs. speed requirements
280
+ 4. **Prompt Engineering**: Craft specific prompts for your document types
281
+ 5. **Error Handling**: Implement retry logic for transient API failures
282
+
283
+ ## Contributing
284
+
285
+ We welcome contributions! Please see our contributing guidelines for more information.
286
+
287
+ ## License
288
+
289
+ This project is licensed under the MIT License - see the LICENSE file for details.
290
+
291
+ ## Support
292
+
293
+ For issues, questions, or feature requests:
294
+
295
+ 1. Check the [documentation](https://github.com/Strategion-GmbH/pure_visual_grounding)
296
+ 2. Review existing issues on GitHub
297
+ 3. Create a new issue with detailed information
298
+
299
+ ## Changelog
300
+
301
+ ### v1.0.8
302
+
303
+ - Improved error handling and metadata processing
304
+ - Enhanced prompt templates
305
+ - Better support for technical documents
306
+
307
+ ---
308
+
309
+ **Author**: Strategion (development@strategion.de)
310
+
311
+ **Keywords**: PDF, OCR, Vision, LLM, Document Processing, Technical Documents, RAG, LangChain
@@ -0,0 +1,338 @@
1
+ Metadata-Version: 2.4
2
+ Name: pure-visual-grounder
3
+ Version: 1.0.0
4
+ Summary: A package for processing PDFs with vision-based language models
5
+ Author: Strategion
6
+ Author-email: Strategion <development@strategion.de>
7
+ License: MIT
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.8
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Requires-Python: >=3.8
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: langchain~=0.3.27
20
+ Requires-Dist: PyMuPDF
21
+ Requires-Dist: langchain-core~=0.3.72
22
+ Requires-Dist: pathlib~=1.0.1
23
+ Requires-Dist: langsmith>=0.1.17
24
+ Dynamic: author
25
+ Dynamic: license-file
26
+ Dynamic: requires-python
27
+
28
+ # Pure Visual Grounding
29
+
30
+ [![PyPI version](https://badge.fury.io/py/pure-visual-grounder.svg)](https://badge.fury.io/py/pure-visual-grounder)
31
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/release/python-380/)
32
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
+
34
+ A Python package for processing PDFs with vision-based language models, specialized for technical document OCR and structured data extraction.
35
+
36
+ ## Overview
37
+
38
+ Pure Visual Grounding leverages the power of vision-enabled large language models to extract structured information from PDF documents. This package is particularly useful for:
39
+
40
+ - **Technical Document Processing**: Extract structured data from complex technical documents
41
+ - **OCR with Context**: Go beyond simple OCR to understand document structure and meaning
42
+ - **RAG Pipeline Integration**: Prepare documents for Retrieval-Augmented Generation workflows
43
+ - **Automated Document Analysis**: Process large volumes of PDFs with consistent structured output
44
+
45
+ ## Features
46
+
47
+ - 🔍 **Vision-based PDF Processing**: Uses advanced vision models for accurate document analysis
48
+ - 📄 **Multi-page Support**: Processes entire PDF documents page by page
49
+ - 🏗️ **Structured Output**: Returns organized JSON data with metadata for each page
50
+ - 🎯 **Customizable Prompts**: Configure extraction prompts for specific document types
51
+ - 🔄 **Two-pass Processing**: Initial extraction followed by refinement for improved accuracy
52
+ - 📊 **High-DPI Rendering**: Configurable DPI settings for optimal image quality
53
+ - 🛠️ **LangChain Integration**: Built on LangChain for easy model swapping and configuration
54
+
55
+ ## Installation
56
+
57
+ Install from PyPI:
58
+
59
+ ```bash
60
+ pip install pure-visual-grounder
61
+ ```
62
+
63
+ ## Quick Start
64
+
65
+ ### Basic Usage
66
+
67
+ ```python
68
+ from pure_visual_grounding import process_pdf_with_vision
69
+ from langchain_openai import ChatOpenAI
70
+
71
+ # Initialize your vision model
72
+ llm = ChatOpenAI(
73
+ model="gpt-4-vision-preview",
74
+ api_key="your-openai-api-key"
75
+ )
76
+
77
+ # Read PDF file as bytes
78
+ with open("document.pdf", "rb") as f:
79
+ pdf_bytes = f.read()
80
+
81
+ # Process the PDF
82
+ results = process_pdf_with_vision(
83
+ pdf_name="document.pdf",
84
+ pdf=pdf_bytes,
85
+ llm=llm,
86
+ vision_prompt="First prompt to get the information out of image",
87
+ reinforced_prompt="Reinforced prompt to make sure all information is extracted"
88
+ )
89
+
90
+ # Access structured results
91
+ for page_result in results:
92
+ print(f"Page {page_result['metadata']['page_number']}")
93
+ print(f"Content: {page_result.get('content', 'No content extracted')}")
94
+ ```
95
+
96
+ ### Custom Processing
97
+
98
+ ```python
99
+ from pure_visual_grounding import process_pdf_with_vision
100
+ from langchain_openai import ChatOpenAI
101
+
102
+ # Custom prompts for specific document types
103
+ custom_prompt = """
104
+ Analyze this technical document page and extract:
105
+ 1. Section headings and hierarchy
106
+ 2. Technical specifications and parameters
107
+ 3. Diagrams, charts, and visual elements
108
+ 4. Tables with numerical data
109
+ 5. Key formulas or equations
110
+
111
+ Format the response as structured JSON with clear categorization.
112
+ """
113
+
114
+ reinforcement_prompt = """
115
+ Review the previous analysis and enhance it by:
116
+ 1. Ensuring all technical details are captured accurately
117
+ 2. Organizing information hierarchically
118
+ 3. Adding any missed visual elements
119
+ 4. Validating numerical data and units
120
+ """
121
+
122
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
123
+
124
+ with open("technical_doc.pdf", "rb") as f:
125
+ pdf_bytes = f.read()
126
+
127
+ results = process_pdf_with_vision(
128
+ pdf_name="technical_doc.pdf",
129
+ pdf=pdf_bytes,
130
+ llm=llm,
131
+ vision_prompt=custom_prompt,
132
+ reinforced_prompt=reinforcement_prompt,
133
+ dpi=300 # High resolution for detailed documents
134
+ )
135
+ ```
136
+
137
+ ## Advanced Usage
138
+
139
+ ### Batch Processing
140
+
141
+ ```python
142
+ import glob
143
+ from pathlib import Path
144
+
145
+ def process_pdf_batch(pdf_directory, llm):
146
+ """Process multiple PDFs in a directory"""
147
+ pdf_files = glob.glob(str(Path(pdf_directory) / "*.pdf"))
148
+ results = {}
149
+
150
+ for pdf_path in pdf_files:
151
+ pdf_name = Path(pdf_path).name
152
+
153
+ try:
154
+ with open(pdf_path, "rb") as f:
155
+ pdf_bytes = f.read()
156
+
157
+ results[pdf_name] = process_pdf_with_vision(
158
+ pdf_name=pdf_name,
159
+ pdf=pdf_bytes,
160
+ llm=llm
161
+ )
162
+ print(f"✓ Processed: {pdf_name}")
163
+
164
+ except Exception as e:
165
+ print(f"✗ Failed: {pdf_name} - {e}")
166
+ results[pdf_name] = {"error": str(e)}
167
+
168
+ return results
169
+
170
+ # Usage
171
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
172
+ batch_results = process_pdf_batch("./documents/", llm)
173
+ ```
174
+
175
+ ### Integration with Different Models
176
+
177
+ ```python
178
+ # OpenAI GPT-4 Vision
179
+ from langchain_openai import ChatOpenAI
180
+ llm = ChatOpenAI(model="gpt-4-vision-preview", api_key="your-key")
181
+
182
+ # Anthropic Claude (when vision is available)
183
+ from langchain_anthropic import ChatAnthropic
184
+ llm = ChatAnthropic(model="claude-3-sonnet-20240229", api_key="your-key")
185
+
186
+ # Google Gemini Vision
187
+ from langchain_google_genai import ChatGoogleGenerativeAI
188
+ llm = ChatGoogleGenerativeAI(model="gemini-pro-vision", api_key="your-key")
189
+ ```
190
+
191
+ ## Output Format
192
+
193
+ The package returns a list of dictionaries whose structure follows the output format requested in the prompt. For example:
194
+
195
+ ```json
196
+ [
197
+ {
198
+ "content": "Extracted and structured content from the page",
199
+ "metadata": {
200
+ "pdf_name": "document.pdf",
201
+ "page_number": 1,
202
+ "error": "none",
203
+ "processing_time": "2.34s",
204
+ "model_used": "gpt-4-vision-preview"
205
+ }
206
+ }
207
+ ]
208
+ ```
209
+
210
+ ## Configuration Options
211
+
212
+ ### Function Parameters
213
+
214
+ - **pdf_name** (str): Name identifier for the PDF file
215
+ - **pdf** (bytes): PDF content as bytes
216
+ - **llm** (BaseChatModel): LangChain vision-capable language model
217
+ - **vision_prompt** (str): Initial extraction prompt
218
+ - **reinforced_prompt** (str): Secondary refinement prompt
219
+ - **dpi** (int): Image resolution for PDF rendering (default: 300)
220
+
221
+ ### Recommended DPI Settings
222
+
223
+ - **150 DPI**: Fast processing, basic documents
224
+ - **300 DPI**: Standard quality, most documents (default)
225
+ - **600 DPI**: High quality, detailed technical documents
226
+
227
+ ## Use Cases
228
+
229
+ ### Technical Documentation
230
+
231
+ ```python
232
+ # Optimized for technical manuals, specifications, and research papers
233
+ tech_prompt = """
234
+ Extract from this technical document:
235
+ 1. Technical specifications and parameters
236
+ 2. Procedural steps and instructions
237
+ 3. Diagrams, schematics, and their descriptions
238
+ 4. Tables with measurements and data points
239
+ 5. Safety warnings and important notes
240
+ Format as structured JSON with clear sections.
241
+ """
242
+ ```
243
+
244
+ ### Financial Documents
245
+
246
+ ```python
247
+ # Optimized for financial reports, statements, and analysis
248
+ finance_prompt = """
249
+ Analyze this financial document and extract:
250
+ 1. Financial figures, ratios, and metrics
251
+ 2. Tables with numerical data
252
+ 3. Charts and graphs with their insights
253
+ 4. Key financial statements sections
254
+ 5. Important dates and periods
255
+ Return structured data suitable for financial analysis.
256
+ """
257
+ ```
258
+
259
+ ### Research Papers
260
+
261
+ ```python
262
+ # Optimized for academic and research publications
263
+ research_prompt = """
264
+ Process this research document and identify:
265
+ 1. Abstract and key findings
266
+ 2. Methodology and experimental setup
267
+ 3. Results, data tables, and statistics
268
+ 4. Figures, graphs, and their captions
269
+ 5. References and citations
270
+ Structure the output for academic analysis.
271
+ """
272
+ ```
273
+
274
+ ## Error Handling
275
+
276
+ The package includes robust error handling:
277
+
278
+ ```python
279
+ try:
280
+ results = process_pdf_with_vision(pdf_name, pdf_bytes, llm)
281
+
282
+ for result in results:
283
+ if result["metadata"]["error"] != "none":
284
+ print(f"Page {result['metadata']['page_number']} had errors: {result['metadata']['error']}")
285
+ else:
286
+ # Process successful result
287
+ print(f"Successfully processed page {result['metadata']['page_number']}")
288
+
289
+ except Exception as e:
290
+ print(f"Fatal error processing PDF: {e}")
291
+ ```
292
+
293
+ ## Requirements
294
+
295
+ - Python 3.8+
296
+ - langchain ~= 0.3.27
297
+ - langchain-core ~= 0.3.72
298
+ - PyMuPDF (for PDF processing)
299
+ - pathlib ~= 1.0.1
300
+ - langsmith >= 0.1.17
301
+
302
+ ## Performance Tips
303
+
304
+ 1. **Optimize DPI**: Use 300 DPI for most documents; increase only for fine details
305
+ 2. **Batch Processing**: Process multiple pages/documents in batches for efficiency
306
+ 3. **Model Selection**: Choose appropriate models based on accuracy vs. speed requirements
307
+ 4. **Prompt Engineering**: Craft specific prompts for your document types
308
+ 5. **Error Handling**: Implement retry logic for transient API failures
309
+
310
+ ## Contributing
311
+
312
+ We welcome contributions! Please see our contributing guidelines for more information.
313
+
314
+ ## License
315
+
316
+ This project is licensed under the MIT License - see the LICENSE file for details.
317
+
318
+ ## Support
319
+
320
+ For issues, questions, or feature requests:
321
+
322
+ 1. Check the [documentation](https://github.com/Strategion-GmbH/pure_visual_grounding)
323
+ 2. Review existing issues on GitHub
324
+ 3. Create a new issue with detailed information
325
+
326
+ ## Changelog
327
+
328
+ ### v1.0.8
329
+
330
+ - Improved error handling and metadata processing
331
+ - Enhanced prompt templates
332
+ - Better support for technical documents
333
+
334
+ ---
335
+
336
+ **Author**: Strategion (development@strategion.de)
337
+
338
+ **Keywords**: PDF, OCR, Vision, LLM, Document Processing, Technical Documents, RAG, LangChain
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ requirements.txt
6
+ setup.py
7
+ pure_visual_grounder.egg-info/PKG-INFO
8
+ pure_visual_grounder.egg-info/SOURCES.txt
9
+ pure_visual_grounder.egg-info/dependency_links.txt
10
+ pure_visual_grounder.egg-info/requires.txt
11
+ pure_visual_grounder.egg-info/top_level.txt
@@ -0,0 +1,5 @@
1
+ langchain~=0.3.27
2
+ PyMuPDF
3
+ langchain-core~=0.3.72
4
+ pathlib~=1.0.1
5
+ langsmith>=0.1.17
@@ -0,0 +1,34 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pure-visual-grounder"
7
+ version = "1.0.0"
8
+ authors = [
9
+ {name = "Strategion", email = "development@strategion.de"}
10
+ ]
11
+ description = "A package for processing PDFs with vision-based language models"
12
+ readme = "README.md"
13
+ license = {text = "MIT"}
14
+ requires-python = ">=3.8"
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Developers",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.8",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ ]
25
+ dynamic = ["dependencies"]
26
+
27
+ [tool.setuptools.dynamic]
28
+ dependencies = {file = ["requirements.txt"]}
29
+
30
+ [tool.setuptools.packages.find]
31
+ include = ["pure_visual_grounding*"]
32
+
33
+ [tool.setuptools.package-data]
34
+ "*" = ["*.txt", "*.md"]
@@ -0,0 +1,5 @@
1
+ langchain~=0.3.27
2
+ PyMuPDF
3
+ langchain-core~=0.3.72
4
+ pathlib~=1.0.1
5
+ langsmith>=0.1.17
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,34 @@
1
"""Packaging script for the ``pure-visual-grounder`` distribution.

Reads the long description from README.md and the runtime dependencies from
requirements.txt, then hands the metadata to ``setuptools.setup``.

The version and development-status classifier mirror the static values
declared in the ``[project]`` table of pyproject.toml so the two files do not
disagree about what is being released.
"""
from pathlib import Path

from setuptools import find_packages, setup

# Resolve data files relative to this script so the build does not depend on
# the caller's current working directory.
HERE = Path(__file__).resolve().parent

# The README is Markdown, which is what ``long_description_content_type``
# below declares; it is also the ``readme`` named in pyproject.toml.
long_description = (HERE / "README.md").read_text(encoding="utf-8")

# One requirement specifier per line. Blank lines and ``#`` comments are
# dropped so they never reach ``install_requires`` as invalid entries.
requirements = [
    line.strip()
    for line in (HERE / "requirements.txt").read_text(encoding="utf-8").splitlines()
    if line.strip() and not line.strip().startswith("#")
]

setup(
    name="pure-visual-grounder",
    # Keep in sync with ``[project].version`` in pyproject.toml.
    version="1.0.0",
    author="Strategion",
    author_email="development@strategion.de",
    description="A package for processing PDFs with vision-based language models",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(
        include=["pure_visual_grounding", "pure_visual_grounding.*"],
    ),
    classifiers=[
        # Matches the development status declared in pyproject.toml.
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    include_package_data=True,
)