mb-rag 1.1.47__py3-none-any.whl → 1.1.57.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mb-rag might be problematic. Click here for more details.
- mb_rag/basic.py +306 -0
- mb_rag/chatbot/chains.py +206 -206
- mb_rag/chatbot/conversation.py +185 -0
- mb_rag/chatbot/prompts.py +58 -58
- mb_rag/rag/embeddings.py +810 -810
- mb_rag/utils/all_data_extract.py +64 -64
- mb_rag/utils/bounding_box.py +231 -231
- mb_rag/utils/document_extract.py +354 -354
- mb_rag/utils/extra.py +73 -73
- mb_rag/utils/pdf_extract.py +428 -428
- mb_rag/version.py +1 -1
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.57.post1.dist-info}/METADATA +11 -11
- mb_rag-1.1.57.post1.dist-info/RECORD +19 -0
- mb_rag/chatbot/basic.py +0 -644
- mb_rag-1.1.47.dist-info/RECORD +0 -18
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.57.post1.dist-info}/WHEEL +0 -0
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.57.post1.dist-info}/top_level.txt +0 -0
mb_rag/utils/pdf_extract.py
CHANGED
|
@@ -1,428 +1,428 @@
|
|
|
1
|
-
"""
|
|
2
|
-
PDF Extraction Module
|
|
3
|
-
|
|
4
|
-
This module provides functionality for extracting text and metadata from PDF files.
|
|
5
|
-
It supports various extraction methods and includes features for handling different
|
|
6
|
-
PDF structures, including tables and images.
|
|
7
|
-
|
|
8
|
-
Example Usage:
|
|
9
|
-
```python
|
|
10
|
-
# Initialize PDF extractor
|
|
11
|
-
extractor = PDFExtractor()
|
|
12
|
-
|
|
13
|
-
# Extract text from a PDF file
|
|
14
|
-
docs = extractor.extract_pdf("document.pdf")
|
|
15
|
-
|
|
16
|
-
# Extract with specific options
|
|
17
|
-
docs = extractor.extract_pdf(
|
|
18
|
-
"document.pdf",
|
|
19
|
-
extraction_method="pdfplumber",
|
|
20
|
-
extract_images=True
|
|
21
|
-
)
|
|
22
|
-
|
|
23
|
-
# Extract from multiple PDFs
|
|
24
|
-
docs = extractor.extract_multiple_pdfs(
|
|
25
|
-
["doc1.pdf", "doc2.pdf"],
|
|
26
|
-
extraction_method="pymupdf"
|
|
27
|
-
)
|
|
28
|
-
```
|
|
29
|
-
|
|
30
|
-
Features:
|
|
31
|
-
- Multiple extraction methods (PyPDF2, PDFPlumber, PyMuPDF)
|
|
32
|
-
- Text and metadata extraction
|
|
33
|
-
- Optional image extraction
|
|
34
|
-
- Table detection and extraction
|
|
35
|
-
- Batch processing for multiple PDFs
|
|
36
|
-
"""
|
|
37
|
-
|
|
38
|
-
import os
|
|
39
|
-
import tempfile
|
|
40
|
-
from typing import List, Dict, Optional, Union, Any, Tuple
|
|
41
|
-
import importlib.util
|
|
42
|
-
from langchain_core.documents import Document
|
|
43
|
-
|
|
44
|
-
class PDFExtractor:
|
|
45
|
-
"""
|
|
46
|
-
Class for extracting text and metadata from PDF files.
|
|
47
|
-
|
|
48
|
-
This class provides methods for extracting content from PDF files using
|
|
49
|
-
different extraction methods and processing options.
|
|
50
|
-
|
|
51
|
-
Args:
|
|
52
|
-
logger: Optional logger instance for logging operations
|
|
53
|
-
|
|
54
|
-
Example:
|
|
55
|
-
```python
|
|
56
|
-
extractor = PDFExtractor()
|
|
57
|
-
docs = extractor.extract_pdf("document.pdf")
|
|
58
|
-
```
|
|
59
|
-
"""
|
|
60
|
-
|
|
61
|
-
def __init__(self, logger=None):
|
|
62
|
-
"""Initialize the PDF extractor."""
|
|
63
|
-
self.logger = logger
|
|
64
|
-
|
|
65
|
-
@staticmethod
|
|
66
|
-
def check_package(package_name: str) -> bool:
|
|
67
|
-
"""
|
|
68
|
-
Check if a Python package is installed.
|
|
69
|
-
|
|
70
|
-
Args:
|
|
71
|
-
package_name (str): Name of the package to check
|
|
72
|
-
|
|
73
|
-
Returns:
|
|
74
|
-
bool: True if package is installed, False otherwise
|
|
75
|
-
"""
|
|
76
|
-
return importlib.util.find_spec(package_name) is not None
|
|
77
|
-
|
|
78
|
-
def check_file(self, file_path: str) -> bool:
|
|
79
|
-
"""
|
|
80
|
-
Check if file exists.
|
|
81
|
-
|
|
82
|
-
Args:
|
|
83
|
-
file_path (str): Path to the file
|
|
84
|
-
|
|
85
|
-
Returns:
|
|
86
|
-
bool: True if file exists, False otherwise
|
|
87
|
-
"""
|
|
88
|
-
return os.path.exists(file_path)
|
|
89
|
-
|
|
90
|
-
def extract_pdf(self, pdf_path: str, extraction_method: str = "pypdf",
|
|
91
|
-
extract_images: bool = False, extract_tables: bool = False,
|
|
92
|
-
**kwargs) -> List[Document]:
|
|
93
|
-
"""
|
|
94
|
-
Extract text and metadata from a PDF file.
|
|
95
|
-
|
|
96
|
-
Args:
|
|
97
|
-
pdf_path (str): Path to the PDF file
|
|
98
|
-
extraction_method (str): Method to use for extraction
|
|
99
|
-
("pypdf", "pdfplumber", or "pymupdf")
|
|
100
|
-
extract_images (bool): Whether to extract images
|
|
101
|
-
extract_tables (bool): Whether to extract tables
|
|
102
|
-
**kwargs: Additional arguments for the extraction method
|
|
103
|
-
|
|
104
|
-
Returns:
|
|
105
|
-
List[Document]: List of Document objects containing extracted content
|
|
106
|
-
|
|
107
|
-
Raises:
|
|
108
|
-
ValueError: If the file doesn't exist or extraction method is invalid
|
|
109
|
-
ImportError: If required packages are not installed
|
|
110
|
-
"""
|
|
111
|
-
if not self.check_file(pdf_path):
|
|
112
|
-
raise ValueError(f"File {pdf_path} not found")
|
|
113
|
-
|
|
114
|
-
if extraction_method == "pypdf":
|
|
115
|
-
return self._extract_with_pypdf(pdf_path, **kwargs)
|
|
116
|
-
elif extraction_method == "pdfplumber":
|
|
117
|
-
return self._extract_with_pdfplumber(pdf_path, extract_tables, **kwargs)
|
|
118
|
-
elif extraction_method == "pymupdf":
|
|
119
|
-
return self._extract_with_pymupdf(pdf_path, extract_images, **kwargs)
|
|
120
|
-
else:
|
|
121
|
-
raise ValueError(f"Invalid extraction method: {extraction_method}")
|
|
122
|
-
|
|
123
|
-
def extract_multiple_pdfs(self, pdf_paths: List[str], extraction_method: str = "pypdf",
|
|
124
|
-
extract_images: bool = False, extract_tables: bool = False,
|
|
125
|
-
**kwargs) -> List[Document]:
|
|
126
|
-
"""
|
|
127
|
-
Extract text and metadata from multiple PDF files.
|
|
128
|
-
|
|
129
|
-
Args:
|
|
130
|
-
pdf_paths (List[str]): List of paths to PDF files
|
|
131
|
-
extraction_method (str): Method to use for extraction
|
|
132
|
-
extract_images (bool): Whether to extract images
|
|
133
|
-
extract_tables (bool): Whether to extract tables
|
|
134
|
-
**kwargs: Additional arguments for the extraction method
|
|
135
|
-
|
|
136
|
-
Returns:
|
|
137
|
-
List[Document]: List of Document objects containing extracted content
|
|
138
|
-
"""
|
|
139
|
-
all_docs = []
|
|
140
|
-
for pdf_path in pdf_paths:
|
|
141
|
-
try:
|
|
142
|
-
docs = self.extract_pdf(
|
|
143
|
-
pdf_path,
|
|
144
|
-
extraction_method=extraction_method,
|
|
145
|
-
extract_images=extract_images,
|
|
146
|
-
extract_tables=extract_tables,
|
|
147
|
-
**kwargs
|
|
148
|
-
)
|
|
149
|
-
all_docs.extend(docs)
|
|
150
|
-
if self.logger:
|
|
151
|
-
self.logger.info(f"Successfully extracted content from {pdf_path}")
|
|
152
|
-
else:
|
|
153
|
-
print(f"Successfully extracted content from {pdf_path}")
|
|
154
|
-
except Exception as e:
|
|
155
|
-
if self.logger:
|
|
156
|
-
self.logger.error(f"Error extracting from {pdf_path}: {str(e)}")
|
|
157
|
-
else:
|
|
158
|
-
print(f"Error extracting from {pdf_path}: {str(e)}")
|
|
159
|
-
|
|
160
|
-
return all_docs
|
|
161
|
-
|
|
162
|
-
def _extract_with_pypdf(self, pdf_path: str, **kwargs) -> List[Document]:
|
|
163
|
-
"""
|
|
164
|
-
Extract text using PyPDF2.
|
|
165
|
-
|
|
166
|
-
Args:
|
|
167
|
-
pdf_path (str): Path to the PDF file
|
|
168
|
-
**kwargs: Additional arguments for PyPDF2
|
|
169
|
-
|
|
170
|
-
Returns:
|
|
171
|
-
List[Document]: List of Document objects
|
|
172
|
-
|
|
173
|
-
Raises:
|
|
174
|
-
ImportError: If PyPDF2 is not installed
|
|
175
|
-
"""
|
|
176
|
-
if not self.check_package("pypdf"):
|
|
177
|
-
raise ImportError("PyPDF2 package not found. Please install: pip install pypdf")
|
|
178
|
-
|
|
179
|
-
from langchain_community.document_loaders import PyPDFLoader
|
|
180
|
-
|
|
181
|
-
loader = PyPDFLoader(pdf_path, **kwargs)
|
|
182
|
-
documents = loader.load()
|
|
183
|
-
|
|
184
|
-
if self.logger:
|
|
185
|
-
self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PyPDF2")
|
|
186
|
-
else:
|
|
187
|
-
print(f"Extracted {len(documents)} pages from {pdf_path} using PyPDF2")
|
|
188
|
-
|
|
189
|
-
return documents
|
|
190
|
-
|
|
191
|
-
def _extract_with_pdfplumber(self, pdf_path: str, extract_tables: bool = False, **kwargs) -> List[Document]:
|
|
192
|
-
"""
|
|
193
|
-
Extract text using PDFPlumber.
|
|
194
|
-
|
|
195
|
-
Args:
|
|
196
|
-
pdf_path (str): Path to the PDF file
|
|
197
|
-
extract_tables (bool): Whether to extract tables
|
|
198
|
-
**kwargs: Additional arguments for PDFPlumber
|
|
199
|
-
|
|
200
|
-
Returns:
|
|
201
|
-
List[Document]: List of Document objects
|
|
202
|
-
|
|
203
|
-
Raises:
|
|
204
|
-
ImportError: If PDFPlumber is not installed
|
|
205
|
-
"""
|
|
206
|
-
if not self.check_package("pdfplumber"):
|
|
207
|
-
raise ImportError("PDFPlumber package not found. Please install: pip install pdfplumber")
|
|
208
|
-
|
|
209
|
-
import pdfplumber
|
|
210
|
-
|
|
211
|
-
documents = []
|
|
212
|
-
with pdfplumber.open(pdf_path) as pdf:
|
|
213
|
-
for i, page in enumerate(pdf.pages):
|
|
214
|
-
text = page.extract_text()
|
|
215
|
-
|
|
216
|
-
metadata = {
|
|
217
|
-
"source": pdf_path,
|
|
218
|
-
"page": i + 1,
|
|
219
|
-
"total_pages": len(pdf.pages)
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
if extract_tables:
|
|
223
|
-
tables = page.extract_tables()
|
|
224
|
-
if tables:
|
|
225
|
-
table_text = []
|
|
226
|
-
for table in tables:
|
|
227
|
-
table_rows = []
|
|
228
|
-
for row in table:
|
|
229
|
-
# Filter out None values and convert to strings
|
|
230
|
-
row_text = [str(cell) if cell is not None else "" for cell in row]
|
|
231
|
-
table_rows.append(" | ".join(row_text))
|
|
232
|
-
table_text.append("\n".join(table_rows))
|
|
233
|
-
|
|
234
|
-
metadata["tables"] = table_text
|
|
235
|
-
# Append table text to the main text
|
|
236
|
-
text += "\n\nTABLES:\n" + "\n\n".join(table_text)
|
|
237
|
-
|
|
238
|
-
documents.append(Document(page_content=text, metadata=metadata))
|
|
239
|
-
|
|
240
|
-
if self.logger:
|
|
241
|
-
self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
|
|
242
|
-
else:
|
|
243
|
-
print(f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
|
|
244
|
-
|
|
245
|
-
return documents
|
|
246
|
-
|
|
247
|
-
def _extract_with_pymupdf(self, pdf_path: str, extract_images: bool = False, **kwargs) -> List[Document]:
|
|
248
|
-
"""
|
|
249
|
-
Extract text using PyMuPDF (fitz).
|
|
250
|
-
|
|
251
|
-
Args:
|
|
252
|
-
pdf_path (str): Path to the PDF file
|
|
253
|
-
extract_images (bool): Whether to extract images
|
|
254
|
-
**kwargs: Additional arguments for PyMuPDF
|
|
255
|
-
|
|
256
|
-
Returns:
|
|
257
|
-
List[Document]: List of Document objects
|
|
258
|
-
|
|
259
|
-
Raises:
|
|
260
|
-
ImportError: If PyMuPDF is not installed
|
|
261
|
-
"""
|
|
262
|
-
if not self.check_package("fitz"):
|
|
263
|
-
raise ImportError("PyMuPDF package not found. Please install: pip install pymupdf")
|
|
264
|
-
|
|
265
|
-
import fitz
|
|
266
|
-
|
|
267
|
-
documents = []
|
|
268
|
-
temp_dir = None
|
|
269
|
-
|
|
270
|
-
try:
|
|
271
|
-
if extract_images:
|
|
272
|
-
temp_dir = tempfile.mkdtemp()
|
|
273
|
-
|
|
274
|
-
with fitz.open(pdf_path) as doc:
|
|
275
|
-
for i, page in enumerate(doc):
|
|
276
|
-
text = page.get_text()
|
|
277
|
-
|
|
278
|
-
metadata = {
|
|
279
|
-
"source": pdf_path,
|
|
280
|
-
"page": i + 1,
|
|
281
|
-
"total_pages": len(doc),
|
|
282
|
-
"title": doc.metadata.get("title", ""),
|
|
283
|
-
"author": doc.metadata.get("author", ""),
|
|
284
|
-
"subject": doc.metadata.get("subject", ""),
|
|
285
|
-
"keywords": doc.metadata.get("keywords", "")
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
if extract_images and temp_dir:
|
|
289
|
-
image_list = page.get_images(full=True)
|
|
290
|
-
image_paths = []
|
|
291
|
-
|
|
292
|
-
for img_index, img in enumerate(image_list):
|
|
293
|
-
xref = img[0]
|
|
294
|
-
base_image = doc.extract_image(xref)
|
|
295
|
-
image_bytes = base_image["image"]
|
|
296
|
-
|
|
297
|
-
image_path = os.path.join(
|
|
298
|
-
temp_dir,
|
|
299
|
-
f"page{i+1}_img{img_index+1}.{base_image['ext']}"
|
|
300
|
-
)
|
|
301
|
-
|
|
302
|
-
with open(image_path, "wb") as img_file:
|
|
303
|
-
img_file.write(image_bytes)
|
|
304
|
-
|
|
305
|
-
image_paths.append(image_path)
|
|
306
|
-
|
|
307
|
-
if image_paths:
|
|
308
|
-
metadata["images"] = image_paths
|
|
309
|
-
|
|
310
|
-
documents.append(Document(page_content=text, metadata=metadata))
|
|
311
|
-
|
|
312
|
-
finally:
|
|
313
|
-
# Clean up temporary directory if it was created
|
|
314
|
-
if extract_images and temp_dir and os.path.exists(temp_dir):
|
|
315
|
-
import shutil
|
|
316
|
-
shutil.rmtree(temp_dir)
|
|
317
|
-
|
|
318
|
-
if self.logger:
|
|
319
|
-
self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
|
|
320
|
-
else:
|
|
321
|
-
print(f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
|
|
322
|
-
|
|
323
|
-
return documents
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
class PDFToCSV:
|
|
327
|
-
"""
|
|
328
|
-
Class for converting PDF tables to CSV format.
|
|
329
|
-
|
|
330
|
-
This class provides methods for extracting tables from PDF files
|
|
331
|
-
and converting them to CSV format.
|
|
332
|
-
|
|
333
|
-
Args:
|
|
334
|
-
logger: Optional logger instance for logging operations
|
|
335
|
-
|
|
336
|
-
Example:
|
|
337
|
-
```python
|
|
338
|
-
converter = PDFToCSV()
|
|
339
|
-
csv_paths = converter.convert_pdf_tables_to_csv("document.pdf", "output_dir")
|
|
340
|
-
```
|
|
341
|
-
"""
|
|
342
|
-
|
|
343
|
-
def __init__(self, logger=None):
|
|
344
|
-
"""Initialize the PDF to CSV converter."""
|
|
345
|
-
self.logger = logger
|
|
346
|
-
|
|
347
|
-
@staticmethod
|
|
348
|
-
def check_package(package_name: str) -> bool:
|
|
349
|
-
"""
|
|
350
|
-
Check if a Python package is installed.
|
|
351
|
-
|
|
352
|
-
Args:
|
|
353
|
-
package_name (str): Name of the package to check
|
|
354
|
-
|
|
355
|
-
Returns:
|
|
356
|
-
bool: True if package is installed, False otherwise
|
|
357
|
-
"""
|
|
358
|
-
return importlib.util.find_spec(package_name) is not None
|
|
359
|
-
|
|
360
|
-
def convert_pdf_tables_to_csv(self, pdf_path: str, output_dir: str = None,
|
|
361
|
-
pages: List[int] = None) -> List[str]:
|
|
362
|
-
"""
|
|
363
|
-
Extract tables from PDF and convert to CSV.
|
|
364
|
-
|
|
365
|
-
Args:
|
|
366
|
-
pdf_path (str): Path to the PDF file
|
|
367
|
-
output_dir (str): Directory to save CSV files (default: same as PDF)
|
|
368
|
-
pages (List[int]): Specific pages to extract tables from (default: all)
|
|
369
|
-
|
|
370
|
-
Returns:
|
|
371
|
-
List[str]: Paths to the created CSV files
|
|
372
|
-
|
|
373
|
-
Raises:
|
|
374
|
-
ImportError: If required packages are not installed
|
|
375
|
-
ValueError: If the PDF file doesn't exist
|
|
376
|
-
"""
|
|
377
|
-
if not os.path.exists(pdf_path):
|
|
378
|
-
raise ValueError(f"PDF file not found: {pdf_path}")
|
|
379
|
-
|
|
380
|
-
if not self.check_package("tabula"):
|
|
381
|
-
raise ImportError("Tabula-py package not found. Please install: pip install tabula-py")
|
|
382
|
-
|
|
383
|
-
import tabula
|
|
384
|
-
import pandas as pd
|
|
385
|
-
|
|
386
|
-
# Determine output directory
|
|
387
|
-
if output_dir is None:
|
|
388
|
-
output_dir = os.path.dirname(pdf_path)
|
|
389
|
-
|
|
390
|
-
# Create output directory if it doesn't exist
|
|
391
|
-
os.makedirs(output_dir, exist_ok=True)
|
|
392
|
-
|
|
393
|
-
# Extract tables
|
|
394
|
-
try:
|
|
395
|
-
if pages:
|
|
396
|
-
dfs = tabula.read_pdf(pdf_path, pages=pages, multiple_tables=True)
|
|
397
|
-
else:
|
|
398
|
-
dfs = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
|
|
399
|
-
except Exception as e:
|
|
400
|
-
if self.logger:
|
|
401
|
-
self.logger.error(f"Error extracting tables: {str(e)}")
|
|
402
|
-
else:
|
|
403
|
-
print(f"Error extracting tables: {str(e)}")
|
|
404
|
-
return []
|
|
405
|
-
|
|
406
|
-
if not dfs:
|
|
407
|
-
if self.logger:
|
|
408
|
-
self.logger.warning(f"No tables found in {pdf_path}")
|
|
409
|
-
else:
|
|
410
|
-
print(f"No tables found in {pdf_path}")
|
|
411
|
-
return []
|
|
412
|
-
|
|
413
|
-
# Save tables to CSV
|
|
414
|
-
csv_paths = []
|
|
415
|
-
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
|
416
|
-
|
|
417
|
-
for i, df in enumerate(dfs):
|
|
418
|
-
if not df.empty:
|
|
419
|
-
csv_path = os.path.join(output_dir, f"{pdf_name}_table_{i+1}.csv")
|
|
420
|
-
df.to_csv(csv_path, index=False)
|
|
421
|
-
csv_paths.append(csv_path)
|
|
422
|
-
|
|
423
|
-
if self.logger:
|
|
424
|
-
self.logger.info(f"Saved table {i+1} to {csv_path}")
|
|
425
|
-
else:
|
|
426
|
-
print(f"Saved table {i+1} to {csv_path}")
|
|
427
|
-
|
|
428
|
-
return csv_paths
|
|
1
|
+
"""
|
|
2
|
+
PDF Extraction Module
|
|
3
|
+
|
|
4
|
+
This module provides functionality for extracting text and metadata from PDF files.
|
|
5
|
+
It supports various extraction methods and includes features for handling different
|
|
6
|
+
PDF structures, including tables and images.
|
|
7
|
+
|
|
8
|
+
Example Usage:
|
|
9
|
+
```python
|
|
10
|
+
# Initialize PDF extractor
|
|
11
|
+
extractor = PDFExtractor()
|
|
12
|
+
|
|
13
|
+
# Extract text from a PDF file
|
|
14
|
+
docs = extractor.extract_pdf("document.pdf")
|
|
15
|
+
|
|
16
|
+
# Extract with specific options
|
|
17
|
+
docs = extractor.extract_pdf(
|
|
18
|
+
"document.pdf",
|
|
19
|
+
extraction_method="pdfplumber",
|
|
20
|
+
extract_images=True
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Extract from multiple PDFs
|
|
24
|
+
docs = extractor.extract_multiple_pdfs(
|
|
25
|
+
["doc1.pdf", "doc2.pdf"],
|
|
26
|
+
extraction_method="pymupdf"
|
|
27
|
+
)
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Features:
|
|
31
|
+
- Multiple extraction methods (PyPDF2, PDFPlumber, PyMuPDF)
|
|
32
|
+
- Text and metadata extraction
|
|
33
|
+
- Optional image extraction
|
|
34
|
+
- Table detection and extraction
|
|
35
|
+
- Batch processing for multiple PDFs
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
import os
|
|
39
|
+
import tempfile
|
|
40
|
+
from typing import List, Dict, Optional, Union, Any, Tuple
|
|
41
|
+
import importlib.util
|
|
42
|
+
from langchain_core.documents import Document
|
|
43
|
+
|
|
44
|
+
class PDFExtractor:
|
|
45
|
+
"""
|
|
46
|
+
Class for extracting text and metadata from PDF files.
|
|
47
|
+
|
|
48
|
+
This class provides methods for extracting content from PDF files using
|
|
49
|
+
different extraction methods and processing options.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
logger: Optional logger instance for logging operations
|
|
53
|
+
|
|
54
|
+
Example:
|
|
55
|
+
```python
|
|
56
|
+
extractor = PDFExtractor()
|
|
57
|
+
docs = extractor.extract_pdf("document.pdf")
|
|
58
|
+
```
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
def __init__(self, logger=None):
|
|
62
|
+
"""Initialize the PDF extractor."""
|
|
63
|
+
self.logger = logger
|
|
64
|
+
|
|
65
|
+
@staticmethod
|
|
66
|
+
def check_package(package_name: str) -> bool:
|
|
67
|
+
"""
|
|
68
|
+
Check if a Python package is installed.
|
|
69
|
+
|
|
70
|
+
Args:
|
|
71
|
+
package_name (str): Name of the package to check
|
|
72
|
+
|
|
73
|
+
Returns:
|
|
74
|
+
bool: True if package is installed, False otherwise
|
|
75
|
+
"""
|
|
76
|
+
return importlib.util.find_spec(package_name) is not None
|
|
77
|
+
|
|
78
|
+
def check_file(self, file_path: str) -> bool:
|
|
79
|
+
"""
|
|
80
|
+
Check if file exists.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
file_path (str): Path to the file
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
bool: True if file exists, False otherwise
|
|
87
|
+
"""
|
|
88
|
+
return os.path.exists(file_path)
|
|
89
|
+
|
|
90
|
+
def extract_pdf(self, pdf_path: str, extraction_method: str = "pypdf",
|
|
91
|
+
extract_images: bool = False, extract_tables: bool = False,
|
|
92
|
+
**kwargs) -> List[Document]:
|
|
93
|
+
"""
|
|
94
|
+
Extract text and metadata from a PDF file.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
pdf_path (str): Path to the PDF file
|
|
98
|
+
extraction_method (str): Method to use for extraction
|
|
99
|
+
("pypdf", "pdfplumber", or "pymupdf")
|
|
100
|
+
extract_images (bool): Whether to extract images
|
|
101
|
+
extract_tables (bool): Whether to extract tables
|
|
102
|
+
**kwargs: Additional arguments for the extraction method
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
List[Document]: List of Document objects containing extracted content
|
|
106
|
+
|
|
107
|
+
Raises:
|
|
108
|
+
ValueError: If the file doesn't exist or extraction method is invalid
|
|
109
|
+
ImportError: If required packages are not installed
|
|
110
|
+
"""
|
|
111
|
+
if not self.check_file(pdf_path):
|
|
112
|
+
raise ValueError(f"File {pdf_path} not found")
|
|
113
|
+
|
|
114
|
+
if extraction_method == "pypdf":
|
|
115
|
+
return self._extract_with_pypdf(pdf_path, **kwargs)
|
|
116
|
+
elif extraction_method == "pdfplumber":
|
|
117
|
+
return self._extract_with_pdfplumber(pdf_path, extract_tables, **kwargs)
|
|
118
|
+
elif extraction_method == "pymupdf":
|
|
119
|
+
return self._extract_with_pymupdf(pdf_path, extract_images, **kwargs)
|
|
120
|
+
else:
|
|
121
|
+
raise ValueError(f"Invalid extraction method: {extraction_method}")
|
|
122
|
+
|
|
123
|
+
def extract_multiple_pdfs(self, pdf_paths: List[str], extraction_method: str = "pypdf",
|
|
124
|
+
extract_images: bool = False, extract_tables: bool = False,
|
|
125
|
+
**kwargs) -> List[Document]:
|
|
126
|
+
"""
|
|
127
|
+
Extract text and metadata from multiple PDF files.
|
|
128
|
+
|
|
129
|
+
Args:
|
|
130
|
+
pdf_paths (List[str]): List of paths to PDF files
|
|
131
|
+
extraction_method (str): Method to use for extraction
|
|
132
|
+
extract_images (bool): Whether to extract images
|
|
133
|
+
extract_tables (bool): Whether to extract tables
|
|
134
|
+
**kwargs: Additional arguments for the extraction method
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
List[Document]: List of Document objects containing extracted content
|
|
138
|
+
"""
|
|
139
|
+
all_docs = []
|
|
140
|
+
for pdf_path in pdf_paths:
|
|
141
|
+
try:
|
|
142
|
+
docs = self.extract_pdf(
|
|
143
|
+
pdf_path,
|
|
144
|
+
extraction_method=extraction_method,
|
|
145
|
+
extract_images=extract_images,
|
|
146
|
+
extract_tables=extract_tables,
|
|
147
|
+
**kwargs
|
|
148
|
+
)
|
|
149
|
+
all_docs.extend(docs)
|
|
150
|
+
if self.logger:
|
|
151
|
+
self.logger.info(f"Successfully extracted content from {pdf_path}")
|
|
152
|
+
else:
|
|
153
|
+
print(f"Successfully extracted content from {pdf_path}")
|
|
154
|
+
except Exception as e:
|
|
155
|
+
if self.logger:
|
|
156
|
+
self.logger.error(f"Error extracting from {pdf_path}: {str(e)}")
|
|
157
|
+
else:
|
|
158
|
+
print(f"Error extracting from {pdf_path}: {str(e)}")
|
|
159
|
+
|
|
160
|
+
return all_docs
|
|
161
|
+
|
|
162
|
+
def _extract_with_pypdf(self, pdf_path: str, **kwargs) -> List[Document]:
|
|
163
|
+
"""
|
|
164
|
+
Extract text using PyPDF2.
|
|
165
|
+
|
|
166
|
+
Args:
|
|
167
|
+
pdf_path (str): Path to the PDF file
|
|
168
|
+
**kwargs: Additional arguments for PyPDF2
|
|
169
|
+
|
|
170
|
+
Returns:
|
|
171
|
+
List[Document]: List of Document objects
|
|
172
|
+
|
|
173
|
+
Raises:
|
|
174
|
+
ImportError: If PyPDF2 is not installed
|
|
175
|
+
"""
|
|
176
|
+
if not self.check_package("pypdf"):
|
|
177
|
+
raise ImportError("PyPDF2 package not found. Please install: pip install pypdf")
|
|
178
|
+
|
|
179
|
+
from langchain_community.document_loaders import PyPDFLoader
|
|
180
|
+
|
|
181
|
+
loader = PyPDFLoader(pdf_path, **kwargs)
|
|
182
|
+
documents = loader.load()
|
|
183
|
+
|
|
184
|
+
if self.logger:
|
|
185
|
+
self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PyPDF2")
|
|
186
|
+
else:
|
|
187
|
+
print(f"Extracted {len(documents)} pages from {pdf_path} using PyPDF2")
|
|
188
|
+
|
|
189
|
+
return documents
|
|
190
|
+
|
|
191
|
+
def _extract_with_pdfplumber(self, pdf_path: str, extract_tables: bool = False, **kwargs) -> List[Document]:
|
|
192
|
+
"""
|
|
193
|
+
Extract text using PDFPlumber.
|
|
194
|
+
|
|
195
|
+
Args:
|
|
196
|
+
pdf_path (str): Path to the PDF file
|
|
197
|
+
extract_tables (bool): Whether to extract tables
|
|
198
|
+
**kwargs: Additional arguments for PDFPlumber
|
|
199
|
+
|
|
200
|
+
Returns:
|
|
201
|
+
List[Document]: List of Document objects
|
|
202
|
+
|
|
203
|
+
Raises:
|
|
204
|
+
ImportError: If PDFPlumber is not installed
|
|
205
|
+
"""
|
|
206
|
+
if not self.check_package("pdfplumber"):
|
|
207
|
+
raise ImportError("PDFPlumber package not found. Please install: pip install pdfplumber")
|
|
208
|
+
|
|
209
|
+
import pdfplumber
|
|
210
|
+
|
|
211
|
+
documents = []
|
|
212
|
+
with pdfplumber.open(pdf_path) as pdf:
|
|
213
|
+
for i, page in enumerate(pdf.pages):
|
|
214
|
+
text = page.extract_text()
|
|
215
|
+
|
|
216
|
+
metadata = {
|
|
217
|
+
"source": pdf_path,
|
|
218
|
+
"page": i + 1,
|
|
219
|
+
"total_pages": len(pdf.pages)
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
if extract_tables:
|
|
223
|
+
tables = page.extract_tables()
|
|
224
|
+
if tables:
|
|
225
|
+
table_text = []
|
|
226
|
+
for table in tables:
|
|
227
|
+
table_rows = []
|
|
228
|
+
for row in table:
|
|
229
|
+
# Filter out None values and convert to strings
|
|
230
|
+
row_text = [str(cell) if cell is not None else "" for cell in row]
|
|
231
|
+
table_rows.append(" | ".join(row_text))
|
|
232
|
+
table_text.append("\n".join(table_rows))
|
|
233
|
+
|
|
234
|
+
metadata["tables"] = table_text
|
|
235
|
+
# Append table text to the main text
|
|
236
|
+
text += "\n\nTABLES:\n" + "\n\n".join(table_text)
|
|
237
|
+
|
|
238
|
+
documents.append(Document(page_content=text, metadata=metadata))
|
|
239
|
+
|
|
240
|
+
if self.logger:
|
|
241
|
+
self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
|
|
242
|
+
else:
|
|
243
|
+
print(f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
|
|
244
|
+
|
|
245
|
+
return documents
|
|
246
|
+
|
|
247
|
+
def _extract_with_pymupdf(self, pdf_path: str, extract_images: bool = False, **kwargs) -> List[Document]:
|
|
248
|
+
"""
|
|
249
|
+
Extract text using PyMuPDF (fitz).
|
|
250
|
+
|
|
251
|
+
Args:
|
|
252
|
+
pdf_path (str): Path to the PDF file
|
|
253
|
+
extract_images (bool): Whether to extract images
|
|
254
|
+
**kwargs: Additional arguments for PyMuPDF
|
|
255
|
+
|
|
256
|
+
Returns:
|
|
257
|
+
List[Document]: List of Document objects
|
|
258
|
+
|
|
259
|
+
Raises:
|
|
260
|
+
ImportError: If PyMuPDF is not installed
|
|
261
|
+
"""
|
|
262
|
+
if not self.check_package("fitz"):
|
|
263
|
+
raise ImportError("PyMuPDF package not found. Please install: pip install pymupdf")
|
|
264
|
+
|
|
265
|
+
import fitz
|
|
266
|
+
|
|
267
|
+
documents = []
|
|
268
|
+
temp_dir = None
|
|
269
|
+
|
|
270
|
+
try:
|
|
271
|
+
if extract_images:
|
|
272
|
+
temp_dir = tempfile.mkdtemp()
|
|
273
|
+
|
|
274
|
+
with fitz.open(pdf_path) as doc:
|
|
275
|
+
for i, page in enumerate(doc):
|
|
276
|
+
text = page.get_text()
|
|
277
|
+
|
|
278
|
+
metadata = {
|
|
279
|
+
"source": pdf_path,
|
|
280
|
+
"page": i + 1,
|
|
281
|
+
"total_pages": len(doc),
|
|
282
|
+
"title": doc.metadata.get("title", ""),
|
|
283
|
+
"author": doc.metadata.get("author", ""),
|
|
284
|
+
"subject": doc.metadata.get("subject", ""),
|
|
285
|
+
"keywords": doc.metadata.get("keywords", "")
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
if extract_images and temp_dir:
|
|
289
|
+
image_list = page.get_images(full=True)
|
|
290
|
+
image_paths = []
|
|
291
|
+
|
|
292
|
+
for img_index, img in enumerate(image_list):
|
|
293
|
+
xref = img[0]
|
|
294
|
+
base_image = doc.extract_image(xref)
|
|
295
|
+
image_bytes = base_image["image"]
|
|
296
|
+
|
|
297
|
+
image_path = os.path.join(
|
|
298
|
+
temp_dir,
|
|
299
|
+
f"page{i+1}_img{img_index+1}.{base_image['ext']}"
|
|
300
|
+
)
|
|
301
|
+
|
|
302
|
+
with open(image_path, "wb") as img_file:
|
|
303
|
+
img_file.write(image_bytes)
|
|
304
|
+
|
|
305
|
+
image_paths.append(image_path)
|
|
306
|
+
|
|
307
|
+
if image_paths:
|
|
308
|
+
metadata["images"] = image_paths
|
|
309
|
+
|
|
310
|
+
documents.append(Document(page_content=text, metadata=metadata))
|
|
311
|
+
|
|
312
|
+
finally:
|
|
313
|
+
# Clean up temporary directory if it was created
|
|
314
|
+
if extract_images and temp_dir and os.path.exists(temp_dir):
|
|
315
|
+
import shutil
|
|
316
|
+
shutil.rmtree(temp_dir)
|
|
317
|
+
|
|
318
|
+
if self.logger:
|
|
319
|
+
self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
|
|
320
|
+
else:
|
|
321
|
+
print(f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
|
|
322
|
+
|
|
323
|
+
return documents
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
class PDFToCSV:
|
|
327
|
+
"""
|
|
328
|
+
Class for converting PDF tables to CSV format.
|
|
329
|
+
|
|
330
|
+
This class provides methods for extracting tables from PDF files
|
|
331
|
+
and converting them to CSV format.
|
|
332
|
+
|
|
333
|
+
Args:
|
|
334
|
+
logger: Optional logger instance for logging operations
|
|
335
|
+
|
|
336
|
+
Example:
|
|
337
|
+
```python
|
|
338
|
+
converter = PDFToCSV()
|
|
339
|
+
csv_paths = converter.convert_pdf_tables_to_csv("document.pdf", "output_dir")
|
|
340
|
+
```
|
|
341
|
+
"""
|
|
342
|
+
|
|
343
|
+
def __init__(self, logger=None):
|
|
344
|
+
"""Initialize the PDF to CSV converter."""
|
|
345
|
+
self.logger = logger
|
|
346
|
+
|
|
347
|
+
@staticmethod
|
|
348
|
+
def check_package(package_name: str) -> bool:
|
|
349
|
+
"""
|
|
350
|
+
Check if a Python package is installed.
|
|
351
|
+
|
|
352
|
+
Args:
|
|
353
|
+
package_name (str): Name of the package to check
|
|
354
|
+
|
|
355
|
+
Returns:
|
|
356
|
+
bool: True if package is installed, False otherwise
|
|
357
|
+
"""
|
|
358
|
+
return importlib.util.find_spec(package_name) is not None
|
|
359
|
+
|
|
360
|
+
def convert_pdf_tables_to_csv(self, pdf_path: str, output_dir: str = None,
|
|
361
|
+
pages: List[int] = None) -> List[str]:
|
|
362
|
+
"""
|
|
363
|
+
Extract tables from PDF and convert to CSV.
|
|
364
|
+
|
|
365
|
+
Args:
|
|
366
|
+
pdf_path (str): Path to the PDF file
|
|
367
|
+
output_dir (str): Directory to save CSV files (default: same as PDF)
|
|
368
|
+
pages (List[int]): Specific pages to extract tables from (default: all)
|
|
369
|
+
|
|
370
|
+
Returns:
|
|
371
|
+
List[str]: Paths to the created CSV files
|
|
372
|
+
|
|
373
|
+
Raises:
|
|
374
|
+
ImportError: If required packages are not installed
|
|
375
|
+
ValueError: If the PDF file doesn't exist
|
|
376
|
+
"""
|
|
377
|
+
if not os.path.exists(pdf_path):
|
|
378
|
+
raise ValueError(f"PDF file not found: {pdf_path}")
|
|
379
|
+
|
|
380
|
+
if not self.check_package("tabula"):
|
|
381
|
+
raise ImportError("Tabula-py package not found. Please install: pip install tabula-py")
|
|
382
|
+
|
|
383
|
+
import tabula
|
|
384
|
+
import pandas as pd
|
|
385
|
+
|
|
386
|
+
# Determine output directory
|
|
387
|
+
if output_dir is None:
|
|
388
|
+
output_dir = os.path.dirname(pdf_path)
|
|
389
|
+
|
|
390
|
+
# Create output directory if it doesn't exist
|
|
391
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
392
|
+
|
|
393
|
+
# Extract tables
|
|
394
|
+
try:
|
|
395
|
+
if pages:
|
|
396
|
+
dfs = tabula.read_pdf(pdf_path, pages=pages, multiple_tables=True)
|
|
397
|
+
else:
|
|
398
|
+
dfs = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
|
|
399
|
+
except Exception as e:
|
|
400
|
+
if self.logger:
|
|
401
|
+
self.logger.error(f"Error extracting tables: {str(e)}")
|
|
402
|
+
else:
|
|
403
|
+
print(f"Error extracting tables: {str(e)}")
|
|
404
|
+
return []
|
|
405
|
+
|
|
406
|
+
if not dfs:
|
|
407
|
+
if self.logger:
|
|
408
|
+
self.logger.warning(f"No tables found in {pdf_path}")
|
|
409
|
+
else:
|
|
410
|
+
print(f"No tables found in {pdf_path}")
|
|
411
|
+
return []
|
|
412
|
+
|
|
413
|
+
# Save tables to CSV
|
|
414
|
+
csv_paths = []
|
|
415
|
+
pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
|
416
|
+
|
|
417
|
+
for i, df in enumerate(dfs):
|
|
418
|
+
if not df.empty:
|
|
419
|
+
csv_path = os.path.join(output_dir, f"{pdf_name}_table_{i+1}.csv")
|
|
420
|
+
df.to_csv(csv_path, index=False)
|
|
421
|
+
csv_paths.append(csv_path)
|
|
422
|
+
|
|
423
|
+
if self.logger:
|
|
424
|
+
self.logger.info(f"Saved table {i+1} to {csv_path}")
|
|
425
|
+
else:
|
|
426
|
+
print(f"Saved table {i+1} to {csv_path}")
|
|
427
|
+
|
|
428
|
+
return csv_paths
|