mb-rag 1.1.47__py3-none-any.whl → 1.1.56.post0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mb-rag might be problematic. Click here for more details.

@@ -1,428 +1,428 @@
1
- """
2
- PDF Extraction Module
3
-
4
- This module provides functionality for extracting text and metadata from PDF files.
5
- It supports various extraction methods and includes features for handling different
6
- PDF structures, including tables and images.
7
-
8
- Example Usage:
9
- ```python
10
- # Initialize PDF extractor
11
- extractor = PDFExtractor()
12
-
13
- # Extract text from a PDF file
14
- docs = extractor.extract_pdf("document.pdf")
15
-
16
- # Extract with specific options
17
- docs = extractor.extract_pdf(
18
- "document.pdf",
19
- extraction_method="pdfplumber",
20
- extract_images=True
21
- )
22
-
23
- # Extract from multiple PDFs
24
- docs = extractor.extract_multiple_pdfs(
25
- ["doc1.pdf", "doc2.pdf"],
26
- extraction_method="pymupdf"
27
- )
28
- ```
29
-
30
- Features:
31
- - Multiple extraction methods (PyPDF2, PDFPlumber, PyMuPDF)
32
- - Text and metadata extraction
33
- - Optional image extraction
34
- - Table detection and extraction
35
- - Batch processing for multiple PDFs
36
- """
37
-
38
- import os
39
- import tempfile
40
- from typing import List, Dict, Optional, Union, Any, Tuple
41
- import importlib.util
42
- from langchain_core.documents import Document
43
-
44
- class PDFExtractor:
45
- """
46
- Class for extracting text and metadata from PDF files.
47
-
48
- This class provides methods for extracting content from PDF files using
49
- different extraction methods and processing options.
50
-
51
- Args:
52
- logger: Optional logger instance for logging operations
53
-
54
- Example:
55
- ```python
56
- extractor = PDFExtractor()
57
- docs = extractor.extract_pdf("document.pdf")
58
- ```
59
- """
60
-
61
- def __init__(self, logger=None):
62
- """Initialize the PDF extractor."""
63
- self.logger = logger
64
-
65
- @staticmethod
66
- def check_package(package_name: str) -> bool:
67
- """
68
- Check if a Python package is installed.
69
-
70
- Args:
71
- package_name (str): Name of the package to check
72
-
73
- Returns:
74
- bool: True if package is installed, False otherwise
75
- """
76
- return importlib.util.find_spec(package_name) is not None
77
-
78
- def check_file(self, file_path: str) -> bool:
79
- """
80
- Check if file exists.
81
-
82
- Args:
83
- file_path (str): Path to the file
84
-
85
- Returns:
86
- bool: True if file exists, False otherwise
87
- """
88
- return os.path.exists(file_path)
89
-
90
- def extract_pdf(self, pdf_path: str, extraction_method: str = "pypdf",
91
- extract_images: bool = False, extract_tables: bool = False,
92
- **kwargs) -> List[Document]:
93
- """
94
- Extract text and metadata from a PDF file.
95
-
96
- Args:
97
- pdf_path (str): Path to the PDF file
98
- extraction_method (str): Method to use for extraction
99
- ("pypdf", "pdfplumber", or "pymupdf")
100
- extract_images (bool): Whether to extract images
101
- extract_tables (bool): Whether to extract tables
102
- **kwargs: Additional arguments for the extraction method
103
-
104
- Returns:
105
- List[Document]: List of Document objects containing extracted content
106
-
107
- Raises:
108
- ValueError: If the file doesn't exist or extraction method is invalid
109
- ImportError: If required packages are not installed
110
- """
111
- if not self.check_file(pdf_path):
112
- raise ValueError(f"File {pdf_path} not found")
113
-
114
- if extraction_method == "pypdf":
115
- return self._extract_with_pypdf(pdf_path, **kwargs)
116
- elif extraction_method == "pdfplumber":
117
- return self._extract_with_pdfplumber(pdf_path, extract_tables, **kwargs)
118
- elif extraction_method == "pymupdf":
119
- return self._extract_with_pymupdf(pdf_path, extract_images, **kwargs)
120
- else:
121
- raise ValueError(f"Invalid extraction method: {extraction_method}")
122
-
123
- def extract_multiple_pdfs(self, pdf_paths: List[str], extraction_method: str = "pypdf",
124
- extract_images: bool = False, extract_tables: bool = False,
125
- **kwargs) -> List[Document]:
126
- """
127
- Extract text and metadata from multiple PDF files.
128
-
129
- Args:
130
- pdf_paths (List[str]): List of paths to PDF files
131
- extraction_method (str): Method to use for extraction
132
- extract_images (bool): Whether to extract images
133
- extract_tables (bool): Whether to extract tables
134
- **kwargs: Additional arguments for the extraction method
135
-
136
- Returns:
137
- List[Document]: List of Document objects containing extracted content
138
- """
139
- all_docs = []
140
- for pdf_path in pdf_paths:
141
- try:
142
- docs = self.extract_pdf(
143
- pdf_path,
144
- extraction_method=extraction_method,
145
- extract_images=extract_images,
146
- extract_tables=extract_tables,
147
- **kwargs
148
- )
149
- all_docs.extend(docs)
150
- if self.logger:
151
- self.logger.info(f"Successfully extracted content from {pdf_path}")
152
- else:
153
- print(f"Successfully extracted content from {pdf_path}")
154
- except Exception as e:
155
- if self.logger:
156
- self.logger.error(f"Error extracting from {pdf_path}: {str(e)}")
157
- else:
158
- print(f"Error extracting from {pdf_path}: {str(e)}")
159
-
160
- return all_docs
161
-
162
- def _extract_with_pypdf(self, pdf_path: str, **kwargs) -> List[Document]:
163
- """
164
- Extract text using PyPDF2.
165
-
166
- Args:
167
- pdf_path (str): Path to the PDF file
168
- **kwargs: Additional arguments for PyPDF2
169
-
170
- Returns:
171
- List[Document]: List of Document objects
172
-
173
- Raises:
174
- ImportError: If PyPDF2 is not installed
175
- """
176
- if not self.check_package("pypdf"):
177
- raise ImportError("PyPDF2 package not found. Please install: pip install pypdf")
178
-
179
- from langchain_community.document_loaders import PyPDFLoader
180
-
181
- loader = PyPDFLoader(pdf_path, **kwargs)
182
- documents = loader.load()
183
-
184
- if self.logger:
185
- self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PyPDF2")
186
- else:
187
- print(f"Extracted {len(documents)} pages from {pdf_path} using PyPDF2")
188
-
189
- return documents
190
-
191
- def _extract_with_pdfplumber(self, pdf_path: str, extract_tables: bool = False, **kwargs) -> List[Document]:
192
- """
193
- Extract text using PDFPlumber.
194
-
195
- Args:
196
- pdf_path (str): Path to the PDF file
197
- extract_tables (bool): Whether to extract tables
198
- **kwargs: Additional arguments for PDFPlumber
199
-
200
- Returns:
201
- List[Document]: List of Document objects
202
-
203
- Raises:
204
- ImportError: If PDFPlumber is not installed
205
- """
206
- if not self.check_package("pdfplumber"):
207
- raise ImportError("PDFPlumber package not found. Please install: pip install pdfplumber")
208
-
209
- import pdfplumber
210
-
211
- documents = []
212
- with pdfplumber.open(pdf_path) as pdf:
213
- for i, page in enumerate(pdf.pages):
214
- text = page.extract_text()
215
-
216
- metadata = {
217
- "source": pdf_path,
218
- "page": i + 1,
219
- "total_pages": len(pdf.pages)
220
- }
221
-
222
- if extract_tables:
223
- tables = page.extract_tables()
224
- if tables:
225
- table_text = []
226
- for table in tables:
227
- table_rows = []
228
- for row in table:
229
- # Filter out None values and convert to strings
230
- row_text = [str(cell) if cell is not None else "" for cell in row]
231
- table_rows.append(" | ".join(row_text))
232
- table_text.append("\n".join(table_rows))
233
-
234
- metadata["tables"] = table_text
235
- # Append table text to the main text
236
- text += "\n\nTABLES:\n" + "\n\n".join(table_text)
237
-
238
- documents.append(Document(page_content=text, metadata=metadata))
239
-
240
- if self.logger:
241
- self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
242
- else:
243
- print(f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
244
-
245
- return documents
246
-
247
- def _extract_with_pymupdf(self, pdf_path: str, extract_images: bool = False, **kwargs) -> List[Document]:
248
- """
249
- Extract text using PyMuPDF (fitz).
250
-
251
- Args:
252
- pdf_path (str): Path to the PDF file
253
- extract_images (bool): Whether to extract images
254
- **kwargs: Additional arguments for PyMuPDF
255
-
256
- Returns:
257
- List[Document]: List of Document objects
258
-
259
- Raises:
260
- ImportError: If PyMuPDF is not installed
261
- """
262
- if not self.check_package("fitz"):
263
- raise ImportError("PyMuPDF package not found. Please install: pip install pymupdf")
264
-
265
- import fitz
266
-
267
- documents = []
268
- temp_dir = None
269
-
270
- try:
271
- if extract_images:
272
- temp_dir = tempfile.mkdtemp()
273
-
274
- with fitz.open(pdf_path) as doc:
275
- for i, page in enumerate(doc):
276
- text = page.get_text()
277
-
278
- metadata = {
279
- "source": pdf_path,
280
- "page": i + 1,
281
- "total_pages": len(doc),
282
- "title": doc.metadata.get("title", ""),
283
- "author": doc.metadata.get("author", ""),
284
- "subject": doc.metadata.get("subject", ""),
285
- "keywords": doc.metadata.get("keywords", "")
286
- }
287
-
288
- if extract_images and temp_dir:
289
- image_list = page.get_images(full=True)
290
- image_paths = []
291
-
292
- for img_index, img in enumerate(image_list):
293
- xref = img[0]
294
- base_image = doc.extract_image(xref)
295
- image_bytes = base_image["image"]
296
-
297
- image_path = os.path.join(
298
- temp_dir,
299
- f"page{i+1}_img{img_index+1}.{base_image['ext']}"
300
- )
301
-
302
- with open(image_path, "wb") as img_file:
303
- img_file.write(image_bytes)
304
-
305
- image_paths.append(image_path)
306
-
307
- if image_paths:
308
- metadata["images"] = image_paths
309
-
310
- documents.append(Document(page_content=text, metadata=metadata))
311
-
312
- finally:
313
- # Clean up temporary directory if it was created
314
- if extract_images and temp_dir and os.path.exists(temp_dir):
315
- import shutil
316
- shutil.rmtree(temp_dir)
317
-
318
- if self.logger:
319
- self.logger.info(f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
320
- else:
321
- print(f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
322
-
323
- return documents
324
-
325
-
326
- class PDFToCSV:
327
- """
328
- Class for converting PDF tables to CSV format.
329
-
330
- This class provides methods for extracting tables from PDF files
331
- and converting them to CSV format.
332
-
333
- Args:
334
- logger: Optional logger instance for logging operations
335
-
336
- Example:
337
- ```python
338
- converter = PDFToCSV()
339
- csv_paths = converter.convert_pdf_tables_to_csv("document.pdf", "output_dir")
340
- ```
341
- """
342
-
343
- def __init__(self, logger=None):
344
- """Initialize the PDF to CSV converter."""
345
- self.logger = logger
346
-
347
- @staticmethod
348
- def check_package(package_name: str) -> bool:
349
- """
350
- Check if a Python package is installed.
351
-
352
- Args:
353
- package_name (str): Name of the package to check
354
-
355
- Returns:
356
- bool: True if package is installed, False otherwise
357
- """
358
- return importlib.util.find_spec(package_name) is not None
359
-
360
- def convert_pdf_tables_to_csv(self, pdf_path: str, output_dir: str = None,
361
- pages: List[int] = None) -> List[str]:
362
- """
363
- Extract tables from PDF and convert to CSV.
364
-
365
- Args:
366
- pdf_path (str): Path to the PDF file
367
- output_dir (str): Directory to save CSV files (default: same as PDF)
368
- pages (List[int]): Specific pages to extract tables from (default: all)
369
-
370
- Returns:
371
- List[str]: Paths to the created CSV files
372
-
373
- Raises:
374
- ImportError: If required packages are not installed
375
- ValueError: If the PDF file doesn't exist
376
- """
377
- if not os.path.exists(pdf_path):
378
- raise ValueError(f"PDF file not found: {pdf_path}")
379
-
380
- if not self.check_package("tabula"):
381
- raise ImportError("Tabula-py package not found. Please install: pip install tabula-py")
382
-
383
- import tabula
384
- import pandas as pd
385
-
386
- # Determine output directory
387
- if output_dir is None:
388
- output_dir = os.path.dirname(pdf_path)
389
-
390
- # Create output directory if it doesn't exist
391
- os.makedirs(output_dir, exist_ok=True)
392
-
393
- # Extract tables
394
- try:
395
- if pages:
396
- dfs = tabula.read_pdf(pdf_path, pages=pages, multiple_tables=True)
397
- else:
398
- dfs = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
399
- except Exception as e:
400
- if self.logger:
401
- self.logger.error(f"Error extracting tables: {str(e)}")
402
- else:
403
- print(f"Error extracting tables: {str(e)}")
404
- return []
405
-
406
- if not dfs:
407
- if self.logger:
408
- self.logger.warning(f"No tables found in {pdf_path}")
409
- else:
410
- print(f"No tables found in {pdf_path}")
411
- return []
412
-
413
- # Save tables to CSV
414
- csv_paths = []
415
- pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
416
-
417
- for i, df in enumerate(dfs):
418
- if not df.empty:
419
- csv_path = os.path.join(output_dir, f"{pdf_name}_table_{i+1}.csv")
420
- df.to_csv(csv_path, index=False)
421
- csv_paths.append(csv_path)
422
-
423
- if self.logger:
424
- self.logger.info(f"Saved table {i+1} to {csv_path}")
425
- else:
426
- print(f"Saved table {i+1} to {csv_path}")
427
-
428
- return csv_paths
1
+ """
2
+ PDF Extraction Module
3
+
4
+ This module provides functionality for extracting text and metadata from PDF files.
5
+ It supports various extraction methods and includes features for handling different
6
+ PDF structures, including tables and images.
7
+
8
+ Example Usage:
9
+ ```python
10
+ # Initialize PDF extractor
11
+ extractor = PDFExtractor()
12
+
13
+ # Extract text from a PDF file
14
+ docs = extractor.extract_pdf("document.pdf")
15
+
16
+ # Extract with specific options
17
+ docs = extractor.extract_pdf(
18
+ "document.pdf",
19
+ extraction_method="pdfplumber",
20
+ extract_tables=True
21
+ )
22
+
23
+ # Extract from multiple PDFs
24
+ docs = extractor.extract_multiple_pdfs(
25
+ ["doc1.pdf", "doc2.pdf"],
26
+ extraction_method="pymupdf"
27
+ )
28
+ ```
29
+
30
+ Features:
31
+ - Multiple extraction methods (pypdf, PDFPlumber, PyMuPDF)
32
+ - Text and metadata extraction
33
+ - Optional image extraction
34
+ - Table detection and extraction
35
+ - Batch processing for multiple PDFs
36
+ """
37
+
38
+ import os
39
+ import tempfile
40
+ from typing import List, Dict, Optional, Union, Any, Tuple
41
+ import importlib.util
42
+ from langchain_core.documents import Document
43
+
44
class PDFExtractor:
    """
    Extract text and metadata from PDF files as ``Document`` objects.

    Three back-ends can be selected by name in :meth:`extract_pdf`:
    ``"pypdf"`` (through LangChain's ``PyPDFLoader``), ``"pdfplumber"``
    (optionally appends extracted tables) and ``"pymupdf"`` (optionally
    saves embedded images). Each back-end is imported lazily inside its own
    method, so only the one actually used has to be installed.

    Args:
        logger: Optional logger instance for logging operations. When it is
            falsy, messages are written with ``print`` instead.

    Example:
        ```python
        extractor = PDFExtractor()
        docs = extractor.extract_pdf("document.pdf")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PDF extractor."""
        self.logger = logger

    def _log(self, level: str, message: str) -> None:
        """Send ``message`` to ``self.logger.<level>``, or ``print`` it when no logger is set."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Importable (top-level) name of the package

        Returns:
            bool: True if an import spec can be found, False otherwise
        """
        return importlib.util.find_spec(package_name) is not None

    def check_file(self, file_path: str) -> bool:
        """
        Check if a path exists.

        Args:
            file_path (str): Path to check

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    def extract_pdf(self, pdf_path: str, extraction_method: str = "pypdf",
                    extract_images: bool = False, extract_tables: bool = False,
                    **kwargs) -> List[Document]:
        """
        Extract text and metadata from a PDF file.

        Args:
            pdf_path (str): Path to the PDF file
            extraction_method (str): Method to use for extraction
                ("pypdf", "pdfplumber", or "pymupdf")
            extract_images (bool): Whether to extract images
                (only used by the "pymupdf" method)
            extract_tables (bool): Whether to extract tables
                (only used by the "pdfplumber" method)
            **kwargs: Additional arguments passed on to the extraction method

        Returns:
            List[Document]: List of Document objects containing extracted content

        Raises:
            ValueError: If the file doesn't exist or extraction method is invalid
            ImportError: If required packages are not installed
        """
        if not self.check_file(pdf_path):
            raise ValueError(f"File {pdf_path} not found")

        if extraction_method == "pypdf":
            return self._extract_with_pypdf(pdf_path, **kwargs)
        if extraction_method == "pdfplumber":
            return self._extract_with_pdfplumber(pdf_path, extract_tables, **kwargs)
        if extraction_method == "pymupdf":
            return self._extract_with_pymupdf(pdf_path, extract_images, **kwargs)
        raise ValueError(f"Invalid extraction method: {extraction_method}")

    def extract_multiple_pdfs(self, pdf_paths: List[str], extraction_method: str = "pypdf",
                              extract_images: bool = False, extract_tables: bool = False,
                              **kwargs) -> List[Document]:
        """
        Extract text and metadata from multiple PDF files.

        Processing is best-effort: a failure on one file is logged and the
        remaining files are still processed.

        Args:
            pdf_paths (List[str]): List of paths to PDF files
            extraction_method (str): Method to use for extraction
            extract_images (bool): Whether to extract images
            extract_tables (bool): Whether to extract tables
            **kwargs: Additional arguments for the extraction method

        Returns:
            List[Document]: Documents from every file that was extracted
            successfully, in input order
        """
        all_docs = []
        for pdf_path in pdf_paths:
            try:
                docs = self.extract_pdf(
                    pdf_path,
                    extraction_method=extraction_method,
                    extract_images=extract_images,
                    extract_tables=extract_tables,
                    **kwargs
                )
                all_docs.extend(docs)
                self._log("info", f"Successfully extracted content from {pdf_path}")
            except Exception as e:
                # Deliberately broad: one unreadable PDF must not abort the batch.
                self._log("error", f"Error extracting from {pdf_path}: {str(e)}")

        return all_docs

    def _extract_with_pypdf(self, pdf_path: str, **kwargs) -> List[Document]:
        """
        Extract text with LangChain's ``PyPDFLoader`` (backed by ``pypdf``).

        Args:
            pdf_path (str): Path to the PDF file
            **kwargs: Additional arguments forwarded to ``PyPDFLoader``

        Returns:
            List[Document]: List of Document objects as returned by the loader

        Raises:
            ImportError: If the ``pypdf`` package is not installed
        """
        if not self.check_package("pypdf"):
            raise ImportError("PyPDF2 package not found. Please install: pip install pypdf")

        from langchain_community.document_loaders import PyPDFLoader

        documents = PyPDFLoader(pdf_path, **kwargs).load()

        self._log("info", f"Extracted {len(documents)} pages from {pdf_path} using PyPDF2")
        return documents

    def _extract_with_pdfplumber(self, pdf_path: str, extract_tables: bool = False, **kwargs) -> List[Document]:
        """
        Extract text using PDFPlumber.

        One Document is produced per page. When ``extract_tables`` is set and a
        page contains tables, each table is rendered as ``" | "``-separated
        rows, stored under ``metadata["tables"]`` and appended to the page text
        after a ``TABLES:`` marker.

        Args:
            pdf_path (str): Path to the PDF file
            extract_tables (bool): Whether to extract tables
            **kwargs: Accepted for signature parity with the other methods;
                currently not used by this method

        Returns:
            List[Document]: List of Document objects, one per page

        Raises:
            ImportError: If PDFPlumber is not installed
        """
        if not self.check_package("pdfplumber"):
            raise ImportError("PDFPlumber package not found. Please install: pip install pdfplumber")

        import pdfplumber

        documents = []
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            for page_number, page in enumerate(pdf.pages, start=1):
                # extract_text() may yield None for a page without extractable
                # text (older pdfplumber releases do); normalise to "" so the
                # concatenation below and Document(page_content=...) stay valid.
                text = page.extract_text() or ""

                metadata: Dict[str, Any] = {
                    "source": pdf_path,
                    "page": page_number,
                    "total_pages": total_pages
                }

                if extract_tables:
                    tables = page.extract_tables()
                    if tables:
                        # Empty cells come back as None; render them as "".
                        table_text = [
                            "\n".join(
                                " | ".join("" if cell is None else str(cell) for cell in row)
                                for row in table
                            )
                            for table in tables
                        ]
                        metadata["tables"] = table_text
                        # Append table text to the main text
                        text += "\n\nTABLES:\n" + "\n\n".join(table_text)

                documents.append(Document(page_content=text, metadata=metadata))

        self._log("info", f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
        return documents

    def _extract_with_pymupdf(self, pdf_path: str, extract_images: bool = False, **kwargs) -> List[Document]:
        """
        Extract text using PyMuPDF (fitz).

        One Document is produced per page, carrying the document-level title,
        author, subject and keywords in its metadata. When ``extract_images``
        is set, embedded images are written to a fresh temporary directory and
        their paths are stored under ``metadata["images"]``.

        The temporary directory is kept when at least one image was written,
        so the returned paths remain usable; the caller is responsible for
        deleting it when done. It is removed here only if extraction fails or
        no image was saved.

        Args:
            pdf_path (str): Path to the PDF file
            extract_images (bool): Whether to extract images
            **kwargs: Accepted for signature parity with the other methods;
                currently not used by this method

        Returns:
            List[Document]: List of Document objects, one per page

        Raises:
            ImportError: If PyMuPDF is not installed
        """
        if not self.check_package("fitz"):
            raise ImportError("PyMuPDF package not found. Please install: pip install pymupdf")

        import shutil

        import fitz

        documents = []
        temp_dir = None
        images_saved = False
        keep_temp_dir = False

        try:
            if extract_images:
                temp_dir = tempfile.mkdtemp()

            with fitz.open(pdf_path) as doc:
                total_pages = len(doc)
                # Tolerate a document that exposes no metadata mapping, and
                # read the document-level fields once instead of per page.
                doc_meta = doc.metadata or {}
                shared_meta = {
                    "title": doc_meta.get("title", ""),
                    "author": doc_meta.get("author", ""),
                    "subject": doc_meta.get("subject", ""),
                    "keywords": doc_meta.get("keywords", "")
                }

                for page_number, page in enumerate(doc, start=1):
                    text = page.get_text()

                    metadata: Dict[str, Any] = {
                        "source": pdf_path,
                        "page": page_number,
                        "total_pages": total_pages,
                        **shared_meta
                    }

                    if extract_images and temp_dir:
                        image_paths = []
                        for img_number, img in enumerate(page.get_images(full=True), start=1):
                            # The first item of each image entry is its xref.
                            base_image = doc.extract_image(img[0])
                            image_path = os.path.join(
                                temp_dir,
                                f"page{page_number}_img{img_number}.{base_image['ext']}"
                            )
                            with open(image_path, "wb") as img_file:
                                img_file.write(base_image["image"])
                            image_paths.append(image_path)

                        if image_paths:
                            metadata["images"] = image_paths
                            images_saved = True

                    documents.append(Document(page_content=text, metadata=metadata))

            # Reached only when every page was processed without error.
            keep_temp_dir = images_saved

        finally:
            # Deleting the directory unconditionally would leave every path in
            # metadata["images"] dangling, so only discard it when nothing in
            # the returned documents refers to it.
            if temp_dir and not keep_temp_dir and os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)

        self._log("info", f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
        return documents
324
+
325
+
326
class PDFToCSV:
    """
    Convert tables found in a PDF file to CSV files.

    Tables are read with ``tabula`` (imported lazily, so the package is only
    required when a conversion is actually requested) and each non-empty
    table is written to its own CSV file.

    Args:
        logger: Optional logger instance for logging operations. When it is
            falsy, messages are written with ``print`` instead.

    Example:
        ```python
        converter = PDFToCSV()
        csv_paths = converter.convert_pdf_tables_to_csv("document.pdf", "output_dir")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PDF to CSV converter."""
        self.logger = logger

    def _log(self, level: str, message: str) -> None:
        """Send ``message`` to ``self.logger.<level>``, or ``print`` it when no logger is set."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Importable (top-level) name of the package

        Returns:
            bool: True if an import spec can be found, False otherwise
        """
        return importlib.util.find_spec(package_name) is not None

    def convert_pdf_tables_to_csv(self, pdf_path: str, output_dir: Optional[str] = None,
                                  pages: Optional[List[int]] = None) -> List[str]:
        """
        Extract tables from PDF and convert to CSV.

        Each non-empty table is saved as ``<pdf name>_table_<n>.csv`` where
        ``n`` is the 1-based position of the table in the extraction result
        (empty tables are skipped but still counted).

        Args:
            pdf_path (str): Path to the PDF file
            output_dir (str): Directory to save CSV files (default: the
                directory of the PDF; the current directory when ``pdf_path``
                is a bare file name)
            pages (List[int]): Specific pages to extract tables from (default: all)

        Returns:
            List[str]: Paths to the created CSV files; empty when extraction
            fails or no tables are found

        Raises:
            ImportError: If required packages are not installed
            ValueError: If the PDF file doesn't exist
        """
        if not os.path.exists(pdf_path):
            raise ValueError(f"PDF file not found: {pdf_path}")

        if not self.check_package("tabula"):
            raise ImportError("Tabula-py package not found. Please install: pip install tabula-py")

        import tabula

        # Determine output directory. dirname() is "" for a bare file name
        # such as "document.pdf", and os.makedirs("") raises
        # FileNotFoundError, so fall back to the current directory.
        if not output_dir:
            output_dir = os.path.dirname(pdf_path) or os.curdir

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Extract tables; any extraction failure is reported and yields no files.
        try:
            dfs = tabula.read_pdf(pdf_path, pages=pages or 'all', multiple_tables=True)
        except Exception as e:
            self._log("error", f"Error extracting tables: {str(e)}")
            return []

        if not dfs:
            self._log("warning", f"No tables found in {pdf_path}")
            return []

        # Save tables to CSV
        csv_paths = []
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        for table_number, df in enumerate(dfs, start=1):
            if df.empty:
                continue
            csv_path = os.path.join(output_dir, f"{pdf_name}_table_{table_number}.csv")
            df.to_csv(csv_path, index=False)
            csv_paths.append(csv_path)
            self._log("info", f"Saved table {table_number} to {csv_path}")

        return csv_paths