mb-rag 1.1.23__tar.gz → 1.1.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mb-rag might be problematic. Click here for more details.
- {mb_rag-1.1.23 → mb_rag-1.1.26}/PKG-INFO +2 -2
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/chatbot/basic.py +4 -3
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/rag/embeddings.py +1 -1
- mb_rag-1.1.26/mb_rag/utils/document_extract.py +354 -0
- mb_rag-1.1.26/mb_rag/utils/pdf_extract.py +428 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/version.py +1 -1
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag.egg-info/PKG-INFO +2 -2
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag.egg-info/SOURCES.txt +3 -1
- {mb_rag-1.1.23 → mb_rag-1.1.26}/README.md +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/__init__.py +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/chatbot/__init__.py +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/chatbot/chains.py +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/chatbot/prompts.py +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/rag/__init__.py +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/utils/__init__.py +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/utils/bounding_box.py +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag/utils/extra.py +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag.egg-info/dependency_links.txt +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag.egg-info/requires.txt +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/mb_rag.egg-info/top_level.txt +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/pyproject.toml +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/setup.cfg +0 -0
- {mb_rag-1.1.23 → mb_rag-1.1.26}/setup.py +0 -0
|
@@ -218,12 +218,12 @@ class ModelFactory:
|
|
|
218
218
|
if not check_package("transformers"):
|
|
219
219
|
raise ImportError("Transformers package not found. Please install it using: pip install transformers")
|
|
220
220
|
if not check_package("langchain_huggingface"):
|
|
221
|
-
raise ImportError("
|
|
221
|
+
raise ImportError("langchain_huggingface package not found. Please install it using: pip install langchain_huggingface")
|
|
222
222
|
if not check_package("torch"):
|
|
223
223
|
raise ImportError("Torch package not found. Please install it using: pip install torch")
|
|
224
224
|
|
|
225
225
|
from langchain_huggingface import HuggingFacePipeline
|
|
226
|
-
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForImageTextToText
|
|
226
|
+
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForImageTextToText,AutoProcessor
|
|
227
227
|
import torch
|
|
228
228
|
|
|
229
229
|
device = torch.device(device) if torch.cuda.is_available() else torch.device("cpu")
|
|
@@ -231,8 +231,8 @@ class ModelFactory:
|
|
|
231
231
|
temperature = kwargs.pop("temperature", 0.7)
|
|
232
232
|
max_length = kwargs.pop("max_length", 1024)
|
|
233
233
|
|
|
234
|
-
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
|
|
235
234
|
if model_function == "image-text-to-text":
|
|
235
|
+
tokenizer = AutoProcessor.from_pretrained(model_name,trust_remote_code=True)
|
|
236
236
|
model = AutoModelForImageTextToText.from_pretrained(
|
|
237
237
|
model_name,
|
|
238
238
|
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
|
@@ -241,6 +241,7 @@ class ModelFactory:
|
|
|
241
241
|
**kwargs
|
|
242
242
|
)
|
|
243
243
|
else:
|
|
244
|
+
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
|
|
244
245
|
model = AutoModelForCausalLM.from_pretrained(
|
|
245
246
|
model_name,
|
|
246
247
|
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
|
|
@@ -585,7 +585,7 @@ class embedding_generator:
|
|
|
585
585
|
if not ModelProvider.check_package("langchain_openai"):
|
|
586
586
|
raise ImportError("OpenAI package not found. Please install: pip install langchain-openai")
|
|
587
587
|
from langchain_openai import ChatOpenAI
|
|
588
|
-
llm = ChatOpenAI(model="gpt-
|
|
588
|
+
llm = ChatOpenAI(model="gpt-4o", temperature=0.8)
|
|
589
589
|
|
|
590
590
|
history_aware_retriever = create_history_aware_retriever(llm, retriever,
|
|
591
591
|
contextualize_q_prompt)
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Extraction Module
|
|
3
|
+
|
|
4
|
+
This module provides functionality for extracting text and metadata from various document types
|
|
5
|
+
including CSV, PowerPoint (PPT/PPTX), and other document formats. It complements the PDF extraction
|
|
6
|
+
functionality in pdf_extract.py.
|
|
7
|
+
|
|
8
|
+
Example Usage:
|
|
9
|
+
```python
|
|
10
|
+
# Initialize CSV extractor
|
|
11
|
+
csv_extractor = CSVExtractor()
|
|
12
|
+
|
|
13
|
+
# Extract data from a CSV file
|
|
14
|
+
docs = csv_extractor.extract_csv("data.csv")
|
|
15
|
+
|
|
16
|
+
# Initialize PowerPoint extractor
|
|
17
|
+
ppt_extractor = PowerPointExtractor()
|
|
18
|
+
|
|
19
|
+
# Extract content from a PowerPoint file
|
|
20
|
+
docs = ppt_extractor.extract_ppt("presentation.pptx")
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Features:
|
|
24
|
+
- CSV file extraction with metadata
|
|
25
|
+
- PowerPoint (PPT/PPTX) extraction
|
|
26
|
+
- Batch processing for multiple files
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import os
|
|
30
|
+
import pandas as pd
|
|
31
|
+
import importlib.util
|
|
32
|
+
from typing import List, Dict, Optional, Union, Any
|
|
33
|
+
from langchain_core.documents import Document
|
|
34
|
+
|
|
35
|
+
class CSVExtractor:
    """
    Class for extracting data from CSV files.

    Each CSV file is read with pandas, rendered to plain text and wrapped in
    ``Document`` objects (either one document for the whole file or one per
    group of rows) for use with RAG systems.

    Args:
        logger: Optional logger instance for logging operations. When it is
            omitted (or falsy) messages are written with ``print`` instead.

    Example:
        ```python
        extractor = CSVExtractor()
        docs = extractor.extract_csv("data.csv")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the CSV extractor."""
        self.logger = logger

    def _log(self, message: str, level: str = "info") -> None:
        """Send ``message`` to the logger at ``level``, or print it when no logger is set."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    def check_file(self, file_path: str) -> bool:
        """
        Check if file exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    @staticmethod
    def _column_statistics(df) -> dict:
        """Return min/max/mean/median (as floats) for every numeric column of ``df``."""
        return {
            column: {
                "min": float(df[column].min()),
                "max": float(df[column].max()),
                "mean": float(df[column].mean()),
                "median": float(df[column].median()),
            }
            for column in df.columns
            if pd.api.types.is_numeric_dtype(df[column])
        }

    def extract_csv(self, csv_path: str, include_stats: bool = True,
                    chunk_by_row: bool = False, rows_per_chunk: int = 10,
                    **kwargs) -> "List[Document]":
        """
        Extract data from a CSV file.

        Args:
            csv_path (str): Path to the CSV file
            include_stats (bool): Whether to include basic statistics
                (min, max, mean, median of numeric columns) in the metadata
            chunk_by_row (bool): Whether to create a separate document for
                each group of rows instead of one document for the whole file
            rows_per_chunk (int): Number of rows per chunk if chunk_by_row is
                True; must be at least 1 in that case
            **kwargs: Additional arguments for pandas.read_csv

        Returns:
            List[Document]: List of Document objects containing extracted content

        Raises:
            ValueError: If the file doesn't exist, or if chunk_by_row is True
                and rows_per_chunk is smaller than 1
            Exception: Any error raised while reading or converting the file
                is logged and re-raised unchanged
        """
        import copy

        if not self.check_file(csv_path):
            raise ValueError(f"File {csv_path} not found")

        # A zero step makes range() fail and a negative step silently yields
        # no chunks at all, so reject both before doing any work.
        if chunk_by_row and rows_per_chunk < 1:
            raise ValueError(
                f"rows_per_chunk must be a positive integer, got {rows_per_chunk}"
            )

        try:
            # Read CSV file
            df = pd.read_csv(csv_path, **kwargs)
            total_rows = len(df)

            # Metadata shared by every document produced from this file
            metadata = {
                "source": csv_path,
                "rows": total_rows,
                "columns": list(df.columns),
                "file_type": "csv"
            }

            # Add basic statistics if requested
            if include_stats:
                metadata["statistics"] = self._column_statistics(df)

            documents = []

            if chunk_by_row:
                # Create a separate document for each chunk of rows
                for start in range(0, total_rows, rows_per_chunk):
                    chunk = df.iloc[start:start + rows_per_chunk]

                    # Deep copy so chunks do not share the nested "columns"
                    # list and "statistics" dict; otherwise editing one
                    # document's metadata would change all of them.
                    chunk_metadata = copy.deepcopy(metadata)
                    chunk_metadata["chunk"] = {
                        "start_row": start,
                        "end_row": min(start + rows_per_chunk - 1, total_rows - 1),
                        "total_rows": len(chunk)
                    }

                    documents.append(Document(
                        page_content=chunk.to_string(index=False),
                        metadata=chunk_metadata
                    ))
            else:
                # Create a single document with all data
                documents.append(Document(
                    page_content=df.to_string(index=False),
                    metadata=metadata
                ))

            self._log(f"Extracted data from {csv_path}")
            return documents

        except Exception as e:
            self._log(f"Error extracting from {csv_path}: {str(e)}", level="error")
            raise

    def extract_multiple_csvs(self, csv_paths: List[str], **kwargs) -> "List[Document]":
        """
        Extract data from multiple CSV files.

        Files that fail to extract are reported through the logger (or
        ``print``) and skipped; they do not abort the batch.

        Args:
            csv_paths (List[str]): List of paths to CSV files
            **kwargs: Additional arguments for extract_csv

        Returns:
            List[Document]: Documents from every file that was extracted
                successfully, in input order
        """
        all_docs = []
        for csv_path in csv_paths:
            try:
                docs = self.extract_csv(csv_path, **kwargs)
                all_docs.extend(docs)
                self._log(f"Successfully extracted content from {csv_path}")
            except Exception as e:
                # Best effort: one bad file must not stop the others.
                self._log(f"Error extracting from {csv_path}: {str(e)}", level="error")

        return all_docs
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class PowerPointExtractor:
    """
    Class for extracting content from PowerPoint presentations.

    Every slide becomes one ``Document`` holding the slide's shape text and,
    optionally, its speaker notes, together with per-slide metadata.

    Args:
        logger: Optional logger instance for logging operations. When it is
            omitted (or falsy) messages are written with ``print`` instead.

    Example:
        ```python
        extractor = PowerPointExtractor()
        docs = extractor.extract_ppt("presentation.pptx")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PowerPoint extractor."""
        self.logger = logger

    def _log(self, message: str, level: str = "info") -> None:
        """Send ``message`` to the logger at ``level``, or print it when no logger is set."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Importable name of the package to check

        Returns:
            bool: True if an import spec for the package can be found,
                False otherwise
        """
        return importlib.util.find_spec(package_name) is not None

    def check_file(self, file_path: str) -> bool:
        """
        Check if file exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    @staticmethod
    def _is_hidden(slide) -> bool:
        """Return True when ``slide`` is marked as hidden in the presentation."""
        # Honour a ``show`` attribute if the slide object offers one.
        if hasattr(slide, "show"):
            return not slide.show
        # Otherwise read the ``show`` attribute of the slide's XML element;
        # a boolean-false value ("0"/"false") marks a hidden slide.
        element = getattr(slide, "element", None)
        if element is None:
            return False
        return element.get("show") in ("0", "false")

    @staticmethod
    def _notes_text(slide) -> str:
        """Return the slide's speaker notes, one paragraph per line, or ""."""
        # Check has_notes_slide first so that merely reading notes never
        # touches ``notes_slide`` on a slide that has none.
        if not getattr(slide, "has_notes_slide", False):
            return ""
        text_frame = slide.notes_slide.notes_text_frame
        if text_frame is None:
            return ""
        return "".join(
            paragraph.text + "\n"
            for paragraph in text_frame.paragraphs
            if paragraph.text
        )

    def extract_ppt(self, ppt_path: str, include_notes: bool = True,
                    include_hidden_slides: bool = False,
                    extract_images: bool = False) -> "List[Document]":
        """
        Extract content from a PowerPoint file.

        Args:
            ppt_path (str): Path to the PowerPoint file
            include_notes (bool): Whether to include speaker notes
            include_hidden_slides (bool): Whether to include hidden slides
            extract_images (bool): Accepted for API compatibility; images are
                not extracted by this method

        Returns:
            List[Document]: One Document per extracted slide. Metadata holds
                ``source``, ``slide_number`` (1-based position in the deck),
                ``total_slides``, ``file_type`` and, when present, ``title``.

        Raises:
            ValueError: If the file doesn't exist
            ImportError: If python-pptx is not installed
            Exception: Any error raised while reading the presentation is
                logged and re-raised unchanged
        """
        if not self.check_file(ppt_path):
            raise ValueError(f"File {ppt_path} not found")

        if not self.check_package("pptx"):
            raise ImportError("python-pptx package not found. Please install: pip install python-pptx")

        from pptx import Presentation

        try:
            # Load presentation
            presentation = Presentation(ppt_path)
            total_slides = len(presentation.slides)
            # Case-insensitive so that "DECK.PPTX" is still reported as pptx.
            file_type = "pptx" if str(ppt_path).lower().endswith(".pptx") else "ppt"

            documents = []

            # Process each slide
            for index, slide in enumerate(presentation.slides):
                slide_number = index + 1

                # Skip hidden slides if not requested
                if not include_hidden_slides and self._is_hidden(slide):
                    continue

                # Extract text from shapes
                texts = [
                    shape.text
                    for shape in slide.shapes
                    if hasattr(shape, "text") and shape.text
                ]

                # Extract notes if requested
                notes = self._notes_text(slide) if include_notes else ""

                # Create metadata
                metadata = {
                    "source": ppt_path,
                    "slide_number": slide_number,
                    "total_slides": total_slides,
                    "file_type": file_type
                }

                # Add slide title if available
                title_shape = slide.shapes.title
                if title_shape and title_shape.text:
                    metadata["title"] = title_shape.text

                # Combine text content
                content = f"Slide {slide_number}"
                if "title" in metadata:
                    content += f": {metadata['title']}"
                content += "\n\n"

                if texts:
                    content += "\n".join(texts) + "\n"

                if notes:
                    content += "\nNotes:\n" + notes

                # Create document
                documents.append(Document(
                    page_content=content,
                    metadata=metadata
                ))

            self._log(f"Extracted {len(documents)} slides from {ppt_path}")
            return documents

        except Exception as e:
            self._log(f"Error extracting from {ppt_path}: {str(e)}", level="error")
            raise

    def extract_multiple_ppts(self, ppt_paths: List[str], **kwargs) -> "List[Document]":
        """
        Extract content from multiple PowerPoint files.

        Files that fail to extract are reported through the logger (or
        ``print``) and skipped; they do not abort the batch.

        Args:
            ppt_paths (List[str]): List of paths to PowerPoint files
            **kwargs: Additional arguments for extract_ppt

        Returns:
            List[Document]: Documents from every file that was extracted
                successfully, in input order
        """
        all_docs = []
        for ppt_path in ppt_paths:
            try:
                docs = self.extract_ppt(ppt_path, **kwargs)
                all_docs.extend(docs)
                self._log(f"Successfully extracted content from {ppt_path}")
            except Exception as e:
                # Best effort: one bad file must not stop the others.
                self._log(f"Error extracting from {ppt_path}: {str(e)}", level="error")

        return all_docs
|
|
@@ -0,0 +1,428 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF Extraction Module
|
|
3
|
+
|
|
4
|
+
This module provides functionality for extracting text and metadata from PDF files.
|
|
5
|
+
It supports various extraction methods and includes features for handling different
|
|
6
|
+
PDF structures, including tables and images.
|
|
7
|
+
|
|
8
|
+
Example Usage:
|
|
9
|
+
```python
|
|
10
|
+
# Initialize PDF extractor
|
|
11
|
+
extractor = PDFExtractor()
|
|
12
|
+
|
|
13
|
+
# Extract text from a PDF file
|
|
14
|
+
docs = extractor.extract_pdf("document.pdf")
|
|
15
|
+
|
|
16
|
+
# Extract with specific options
|
|
17
|
+
docs = extractor.extract_pdf(
|
|
18
|
+
"document.pdf",
|
|
19
|
+
extraction_method="pdfplumber",
|
|
20
|
+
extract_images=True
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Extract from multiple PDFs
|
|
24
|
+
docs = extractor.extract_multiple_pdfs(
|
|
25
|
+
["doc1.pdf", "doc2.pdf"],
|
|
26
|
+
extraction_method="pymupdf"
|
|
27
|
+
)
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Features:
|
|
31
|
+
- Multiple extraction methods (pypdf via LangChain's PyPDFLoader, PDFPlumber, PyMuPDF)
|
|
32
|
+
- Text and metadata extraction
|
|
33
|
+
- Optional image extraction
|
|
34
|
+
- Table detection and extraction
|
|
35
|
+
- Batch processing for multiple PDFs
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
import os
|
|
39
|
+
import tempfile
|
|
40
|
+
from typing import List, Dict, Optional, Union, Any, Tuple
|
|
41
|
+
import importlib.util
|
|
42
|
+
from langchain_core.documents import Document
|
|
43
|
+
|
|
44
|
+
class PDFExtractor:
    """
    Class for extracting text and metadata from PDF files.

    Three back ends are available, selected with ``extraction_method``:
    ``"pypdf"`` (LangChain's ``PyPDFLoader``), ``"pdfplumber"`` (optionally
    with tables) and ``"pymupdf"`` (optionally with images).

    Args:
        logger: Optional logger instance for logging operations. When it is
            omitted (or falsy) messages are written with ``print`` instead.

    Example:
        ```python
        extractor = PDFExtractor()
        docs = extractor.extract_pdf("document.pdf")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PDF extractor."""
        self.logger = logger

    def _log(self, message: str, level: str = "info") -> None:
        """Send ``message`` to the logger at ``level``, or print it when no logger is set."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Importable name of the package to check

        Returns:
            bool: True if an import spec for the package can be found,
                False otherwise
        """
        return importlib.util.find_spec(package_name) is not None

    def check_file(self, file_path: str) -> bool:
        """
        Check if file exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    def extract_pdf(self, pdf_path: str, extraction_method: str = "pypdf",
                    extract_images: bool = False, extract_tables: bool = False,
                    **kwargs) -> "List[Document]":
        """
        Extract text and metadata from a PDF file.

        Args:
            pdf_path (str): Path to the PDF file
            extraction_method (str): Method to use for extraction
                ("pypdf", "pdfplumber", or "pymupdf")
            extract_images (bool): Whether to extract images; only used by
                the "pymupdf" method
            extract_tables (bool): Whether to extract tables; only used by
                the "pdfplumber" method
            **kwargs: Additional arguments forwarded to the selected method

        Returns:
            List[Document]: List of Document objects containing extracted content

        Raises:
            ValueError: If the file doesn't exist or extraction method is invalid
            ImportError: If required packages are not installed
        """
        if not self.check_file(pdf_path):
            raise ValueError(f"File {pdf_path} not found")

        if extraction_method == "pypdf":
            return self._extract_with_pypdf(pdf_path, **kwargs)
        if extraction_method == "pdfplumber":
            return self._extract_with_pdfplumber(pdf_path, extract_tables, **kwargs)
        if extraction_method == "pymupdf":
            return self._extract_with_pymupdf(pdf_path, extract_images, **kwargs)
        raise ValueError(f"Invalid extraction method: {extraction_method}")

    def extract_multiple_pdfs(self, pdf_paths: List[str], extraction_method: str = "pypdf",
                              extract_images: bool = False, extract_tables: bool = False,
                              **kwargs) -> "List[Document]":
        """
        Extract text and metadata from multiple PDF files.

        Files that fail to extract are reported through the logger (or
        ``print``) and skipped; they do not abort the batch.

        Args:
            pdf_paths (List[str]): List of paths to PDF files
            extraction_method (str): Method to use for extraction
            extract_images (bool): Whether to extract images
            extract_tables (bool): Whether to extract tables
            **kwargs: Additional arguments for the extraction method

        Returns:
            List[Document]: Documents from every file that was extracted
                successfully, in input order
        """
        all_docs = []
        for pdf_path in pdf_paths:
            try:
                docs = self.extract_pdf(
                    pdf_path,
                    extraction_method=extraction_method,
                    extract_images=extract_images,
                    extract_tables=extract_tables,
                    **kwargs
                )
                all_docs.extend(docs)
                self._log(f"Successfully extracted content from {pdf_path}")
            except Exception as e:
                # Best effort: one bad file must not stop the others.
                self._log(f"Error extracting from {pdf_path}: {str(e)}", level="error")

        return all_docs

    def _extract_with_pypdf(self, pdf_path: str, **kwargs) -> "List[Document]":
        """
        Extract text with LangChain's ``PyPDFLoader`` (backed by pypdf).

        Args:
            pdf_path (str): Path to the PDF file
            **kwargs: Additional arguments passed to ``PyPDFLoader``

        Returns:
            List[Document]: The documents returned by ``PyPDFLoader.load()``

        Raises:
            ImportError: If pypdf (or langchain_community) is not installed
        """
        if not self.check_package("pypdf"):
            # The message names the package that is actually checked for.
            raise ImportError("pypdf package not found. Please install: pip install pypdf")

        from langchain_community.document_loaders import PyPDFLoader

        loader = PyPDFLoader(pdf_path, **kwargs)
        documents = loader.load()

        self._log(f"Extracted {len(documents)} pages from {pdf_path} using PyPDF2")
        return documents

    @staticmethod
    def _tables_as_text(tables) -> List[str]:
        """Render each table as lines of " | "-joined cells; None cells become ""."""
        return [
            "\n".join(
                " | ".join(str(cell) if cell is not None else "" for cell in row)
                for row in table
            )
            for table in tables
        ]

    def _extract_with_pdfplumber(self, pdf_path: str, extract_tables: bool = False, **kwargs) -> "List[Document]":
        """
        Extract text using PDFPlumber.

        Args:
            pdf_path (str): Path to the PDF file
            extract_tables (bool): Whether to extract tables. Extracted tables
                are stored in ``metadata["tables"]`` and also appended to the
                page text under a ``TABLES:`` heading.
            **kwargs: Accepted for signature compatibility; currently unused

        Returns:
            List[Document]: One Document per page with ``source``, ``page``
                (1-based) and ``total_pages`` metadata

        Raises:
            ImportError: If PDFPlumber is not installed
        """
        if not self.check_package("pdfplumber"):
            raise ImportError("PDFPlumber package not found. Please install: pip install pdfplumber")

        import pdfplumber

        documents = []
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            for index, page in enumerate(pdf.pages):
                # Guard against a None result for pages without text so the
                # concatenation below and Document creation cannot fail.
                text = page.extract_text() or ""

                metadata = {
                    "source": pdf_path,
                    "page": index + 1,
                    "total_pages": total_pages
                }

                if extract_tables:
                    tables = page.extract_tables()
                    if tables:
                        table_text = self._tables_as_text(tables)
                        metadata["tables"] = table_text
                        # Append table text to the main text
                        text += "\n\nTABLES:\n" + "\n\n".join(table_text)

                documents.append(Document(page_content=text, metadata=metadata))

        self._log(f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
        return documents

    @staticmethod
    def _save_page_images(doc, page, page_number: int, image_dir: str) -> List[str]:
        """Write every image of ``page`` into ``image_dir`` and return the file paths."""
        saved_paths = []
        for image_number, image_info in enumerate(page.get_images(full=True), start=1):
            # The first item of each image entry is the xref used to fetch it.
            base_image = doc.extract_image(image_info[0])
            image_path = os.path.join(
                image_dir,
                f"page{page_number}_img{image_number}.{base_image['ext']}"
            )
            with open(image_path, "wb") as img_file:
                img_file.write(base_image["image"])
            saved_paths.append(image_path)
        return saved_paths

    def _extract_with_pymupdf(self, pdf_path: str, extract_images: bool = False, **kwargs) -> "List[Document]":
        """
        Extract text using PyMuPDF (fitz).

        Args:
            pdf_path (str): Path to the PDF file
            extract_images (bool): Whether to extract images. Images are
                written to a fresh temporary directory and their paths are
                stored in ``metadata["images"]``. The directory is kept after
                a successful extraction so those paths stay valid; the caller
                is responsible for deleting it when done.
            **kwargs: Accepted for signature compatibility; currently unused

        Returns:
            List[Document]: One Document per page with ``source``, ``page``
                (1-based), ``total_pages``, ``title``, ``author``, ``subject``
                and ``keywords`` metadata

        Raises:
            ImportError: If PyMuPDF is not installed
        """
        if not self.check_package("fitz"):
            raise ImportError("PyMuPDF package not found. Please install: pip install pymupdf")

        import fitz
        import shutil

        documents = []
        image_dir = tempfile.mkdtemp(prefix="pdf_images_") if extract_images else None
        images_saved = False
        succeeded = False

        try:
            with fitz.open(pdf_path) as doc:
                total_pages = len(doc)
                doc_info = doc.metadata or {}

                for index, page in enumerate(doc):
                    text = page.get_text()

                    # "or" turns missing and None values into "" alike.
                    metadata = {
                        "source": pdf_path,
                        "page": index + 1,
                        "total_pages": total_pages,
                        "title": doc_info.get("title") or "",
                        "author": doc_info.get("author") or "",
                        "subject": doc_info.get("subject") or "",
                        "keywords": doc_info.get("keywords") or ""
                    }

                    if image_dir:
                        image_paths = self._save_page_images(doc, page, index + 1, image_dir)
                        if image_paths:
                            metadata["images"] = image_paths
                            images_saved = True

                    documents.append(Document(page_content=text, metadata=metadata))

            succeeded = True
        finally:
            # Remove the directory only when nothing useful points into it:
            # on failure, or when no image was written. Deleting it after a
            # successful extraction would leave metadata["images"] dangling.
            if image_dir and not (succeeded and images_saved):
                shutil.rmtree(image_dir, ignore_errors=True)

        self._log(f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
        return documents
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
class PDFToCSV:
    """
    Class for converting PDF tables to CSV format.

    Tables are read from the PDF with tabula-py and each non-empty table is
    written to its own CSV file.

    Args:
        logger: Optional logger instance for logging operations. When it is
            omitted (or falsy) messages are written with ``print`` instead.

    Example:
        ```python
        converter = PDFToCSV()
        csv_paths = converter.convert_pdf_tables_to_csv("document.pdf", "output_dir")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PDF to CSV converter."""
        self.logger = logger

    def _log(self, message: str, level: str = "info") -> None:
        """Send ``message`` to the logger at ``level``, or print it when no logger is set."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Importable name of the package to check

        Returns:
            bool: True if an import spec for the package can be found,
                False otherwise
        """
        return importlib.util.find_spec(package_name) is not None

    def convert_pdf_tables_to_csv(self, pdf_path: str, output_dir: Optional[str] = None,
                                  pages: Optional[List[int]] = None) -> List[str]:
        """
        Extract tables from PDF and convert to CSV.

        Args:
            pdf_path (str): Path to the PDF file
            output_dir (str): Directory to save CSV files (default: the
                directory containing the PDF). Created if it does not exist.
            pages (List[int]): Specific pages to extract tables from
                (default: all)

        Returns:
            List[str]: Paths to the created CSV files, named
                ``<pdf name>_table_<n>.csv``. Empty when no tables are found
                or when table extraction fails (the failure is logged, not
                raised).

        Raises:
            ImportError: If tabula-py is not installed
            ValueError: If the PDF file doesn't exist
        """
        if not os.path.exists(pdf_path):
            raise ValueError(f"PDF file not found: {pdf_path}")

        if not self.check_package("tabula"):
            raise ImportError("Tabula-py package not found. Please install: pip install tabula-py")

        import tabula

        # Determine output directory. dirname() is "" for a bare filename such
        # as "document.pdf", and os.makedirs("") raises, so fall back to the
        # current directory in that case.
        if output_dir is None:
            output_dir = os.path.dirname(pdf_path) or os.curdir

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Extract tables
        try:
            dfs = tabula.read_pdf(pdf_path, pages=pages or 'all', multiple_tables=True)
        except Exception as e:
            # Deliberate best effort: report the failure and return no files.
            self._log(f"Error extracting tables: {str(e)}", level="error")
            return []

        if not dfs:
            self._log(f"No tables found in {pdf_path}", level="warning")
            return []

        # Save tables to CSV
        csv_paths = []
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        for table_number, df in enumerate(dfs, start=1):
            # Empty tables are skipped but still consume a table number.
            if df.empty:
                continue
            csv_path = os.path.join(output_dir, f"{pdf_name}_table_{table_number}.csv")
            df.to_csv(csv_path, index=False)
            csv_paths.append(csv_path)
            self._log(f"Saved table {table_number} to {csv_path}")

        return csv_paths
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|