mb-rag 1.1.24__tar.gz → 1.1.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mb-rag might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: mb_rag
3
- Version: 1.1.24
3
+ Version: 1.1.29
4
4
  Summary: RAG function file
5
5
  Author: ['Malav Bateriwala']
6
6
  Requires-Python: >=3.8
@@ -146,10 +146,11 @@ class ModelFactory:
146
146
  if not check_package("langchain_ollama"):
147
147
  raise ImportError("Langchain Community package not found. Please install it using: pip install langchain_ollama")
148
148
 
149
- from langchain_ollama import ChatOllama
149
+ from langchain_ollama import OllamaLLM
150
+
150
151
  print(f"Current Ollama serve model is {os.system('ollama ps')}")
151
152
  kwargs["model"] = model_name
152
- return ChatOllama(**kwargs)
153
+ return OllamaLLM(**kwargs)
153
154
 
154
155
  @classmethod
155
156
  def create_groq(cls, model_name: str = "llama-3.3-70b-versatile", **kwargs) -> Any:
@@ -315,21 +316,26 @@ class ModelFactory:
315
316
  str: Output from the model
316
317
  """
317
318
  base64_images = [self._image_to_base64(image) for image in images]
318
- image_prompt_create = [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_images[i]}"}} for i in range(len(images))]
319
- prompt_new = [{"type": "text", "text": prompt},
320
- *image_prompt_create,]
321
- if pydantic_model is not None:
322
- try:
323
- self.model = self.model.with_structured_output(pydantic_model)
324
- except Exception as e:
325
- print(f"Error with pydantic_model: {e}")
326
- print("Continuing without structured output")
327
- message= HumanMessage(content=prompt_new,)
328
- response = self.model.invoke([message])
329
- try:
319
+ if self.model_name=='ollama':
320
+ ollama_model = self.model.bind(images=[base64_images])
321
+ response = ollama_model.invoke([HumanMessage(content=prompt)])
330
322
  return response.content
331
- except Exception:
332
- return response
323
+ else:
324
+ image_prompt_create = [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_images[i]}"}} for i in range(len(images))]
325
+ prompt_new = [{"type": "text", "text": prompt},
326
+ *image_prompt_create,]
327
+ if pydantic_model is not None:
328
+ try:
329
+ self.model = self.model.with_structured_output(pydantic_model)
330
+ except Exception as e:
331
+ print(f"Error with pydantic_model: {e}")
332
+ print("Continuing without structured output")
333
+ message= HumanMessage(content=prompt_new,)
334
+ response = self.model.invoke([message])
335
+ try:
336
+ return response.content
337
+ except Exception:
338
+ return response
333
339
 
334
340
  class ConversationModel:
335
341
  """
@@ -585,7 +585,7 @@ class embedding_generator:
585
585
  if not ModelProvider.check_package("langchain_openai"):
586
586
  raise ImportError("OpenAI package not found. Please install: pip install langchain-openai")
587
587
  from langchain_openai import ChatOpenAI
588
- llm = ChatOpenAI(model="gpt-4")
588
+ llm = ChatOpenAI(model="gpt-4o", temperature=0.8)
589
589
 
590
590
  history_aware_retriever = create_history_aware_retriever(llm, retriever,
591
591
  contextualize_q_prompt)
@@ -0,0 +1,354 @@
1
+ """
2
+ Document Extraction Module
3
+
4
+ This module provides functionality for extracting text and metadata from various document types
5
+ including CSV, PowerPoint (PPT/PPTX), and other document formats. It complements the PDF extraction
6
+ functionality in pdf_extract.py.
7
+
8
+ Example Usage:
9
+ ```python
10
+ # Initialize CSV extractor
11
+ csv_extractor = CSVExtractor()
12
+
13
+ # Extract data from a CSV file
14
+ docs = csv_extractor.extract_csv("data.csv")
15
+
16
+ # Initialize PowerPoint extractor
17
+ ppt_extractor = PowerPointExtractor()
18
+
19
+ # Extract content from a PowerPoint file
20
+ docs = ppt_extractor.extract_ppt("presentation.pptx")
21
+ ```
22
+
23
+ Features:
24
+ - CSV file extraction with metadata
25
+ - PowerPoint (PPT/PPTX) extraction
26
+ - Batch processing for multiple files
27
+ """
28
+
29
+ import os
30
+ import pandas as pd
31
+ import importlib.util
32
+ from typing import List, Dict, Optional, Union, Any
33
+ from langchain_core.documents import Document
34
+
35
class CSVExtractor:
    """
    Extract tabular data from CSV files into ``Document`` objects.

    A CSV file can be returned as one document holding the whole table, or
    split into fixed-size row chunks. Every document carries metadata that
    describes the source file (path, row count, column names and, optionally,
    basic statistics for numeric columns).

    Args:
        logger: Optional logger instance. When omitted, messages are printed.

    Example:
        ```python
        extractor = CSVExtractor()
        docs = extractor.extract_csv("data.csv")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the CSV extractor."""
        self.logger = logger

    def _log(self, level: str, message: str) -> None:
        """Send ``message`` to the logger method named ``level``, or print it."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    def check_file(self, file_path: str) -> bool:
        """
        Check if a regular file exists at the given path.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path points to an existing file, False otherwise
                (including when it points to a directory)
        """
        # isfile() rather than exists(): a directory cannot be parsed as CSV.
        return os.path.isfile(file_path)

    def extract_csv(self, csv_path: str, include_stats: bool = True,
                    chunk_by_row: bool = False, rows_per_chunk: int = 10,
                    **kwargs) -> "List[Document]":
        """
        Extract data from a CSV file.

        Args:
            csv_path (str): Path to the CSV file
            include_stats (bool): Whether to add min/max/mean/median of every
                numeric column to the metadata under the "statistics" key
            chunk_by_row (bool): Whether to create one document per group of
                rows instead of a single document for the whole file
            rows_per_chunk (int): Number of rows per chunk if chunk_by_row is
                True; must be at least 1
            **kwargs: Additional arguments for pandas.read_csv

        Returns:
            List[Document]: List of Document objects containing extracted content

        Raises:
            ValueError: If the file doesn't exist, or if chunk_by_row is True
                and rows_per_chunk is smaller than 1
        """
        if not self.check_file(csv_path):
            raise ValueError(f"File {csv_path} not found")

        # Reject bad chunk sizes up front: a negative step would silently
        # produce no documents and zero would fail inside range().
        if chunk_by_row and rows_per_chunk < 1:
            raise ValueError(
                f"rows_per_chunk must be a positive integer, got {rows_per_chunk}"
            )

        try:
            df = pd.read_csv(csv_path, **kwargs)
            total_rows = len(df)

            metadata = {
                "source": csv_path,
                "rows": total_rows,
                "columns": list(df.columns),
                "file_type": "csv"
            }

            if include_stats:
                metadata["statistics"] = {
                    column: {
                        "min": float(df[column].min()),
                        "max": float(df[column].max()),
                        "mean": float(df[column].mean()),
                        "median": float(df[column].median())
                    }
                    for column in df.columns
                    if pd.api.types.is_numeric_dtype(df[column])
                }

            documents = []

            if chunk_by_row:
                for start in range(0, total_rows, rows_per_chunk):
                    chunk = df.iloc[start:start + rows_per_chunk]
                    # File-level metadata is shared; only the "chunk" entry
                    # differs between documents.
                    chunk_metadata = {
                        **metadata,
                        "chunk": {
                            "start_row": start,
                            "end_row": min(start + rows_per_chunk - 1, total_rows - 1),
                            "total_rows": len(chunk)
                        }
                    }
                    documents.append(Document(
                        page_content=chunk.to_string(index=False),
                        metadata=chunk_metadata
                    ))
            else:
                documents.append(Document(
                    page_content=df.to_string(index=False),
                    metadata=metadata
                ))

            self._log("info", f"Extracted data from {csv_path}")
            return documents

        except Exception as e:
            # Report the failure, then let the caller decide how to handle it.
            self._log("error", f"Error extracting from {csv_path}: {str(e)}")
            raise

    def extract_multiple_csvs(self, csv_paths: List[str], **kwargs) -> "List[Document]":
        """
        Extract data from multiple CSV files.

        Files that fail to extract are reported and skipped, so one bad file
        does not abort the whole batch.

        Args:
            csv_paths (List[str]): List of paths to CSV files
            **kwargs: Additional arguments for extract_csv

        Returns:
            List[Document]: Documents from every file that was extracted
                successfully, in input order
        """
        all_docs = []
        for csv_path in csv_paths:
            try:
                all_docs.extend(self.extract_csv(csv_path, **kwargs))
                self._log("info", f"Successfully extracted content from {csv_path}")
            except Exception as e:
                self._log("error", f"Error extracting from {csv_path}: {str(e)}")

        return all_docs
184
+
185
+
186
class PowerPointExtractor:
    """
    Extract content from PowerPoint presentations into ``Document`` objects.

    One document is produced per slide, containing the slide's shape text and
    (optionally) its speaker notes, together with metadata about the slide.
    Loading is done with python-pptx.

    Args:
        logger: Optional logger instance. When omitted, messages are printed.

    Example:
        ```python
        extractor = PowerPointExtractor()
        docs = extractor.extract_ppt("presentation.pptx")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PowerPoint extractor."""
        self.logger = logger

    def _log(self, level: str, message: str) -> None:
        """Send ``message`` to the logger method named ``level``, or print it."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Name of the package to check

        Returns:
            bool: True if package is installed, False otherwise
        """
        return importlib.util.find_spec(package_name) is not None

    def check_file(self, file_path: str) -> bool:
        """
        Check if a regular file exists at the given path.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path points to an existing file, False otherwise
                (including when it points to a directory)
        """
        return os.path.isfile(file_path)

    @staticmethod
    def _is_hidden(slide) -> bool:
        """Return True if the slide is marked as hidden."""
        show = getattr(slide, "show", None)
        if show is not None:
            return not show
        # NOTE(review): python-pptx does not appear to expose a `show`
        # property on Slide; fall back to the `show` attribute of the
        # underlying <p:sld> element, which is "0"/"false" for hidden slides.
        element = getattr(slide, "_element", None)
        if element is None:
            return False
        return element.get("show") in ("0", "false")

    @staticmethod
    def _notes_text(slide) -> str:
        """Return the slide's speaker notes, one paragraph per line ('' if none)."""
        # Check has_notes_slide first: reading `notes_slide` on a slide
        # without notes makes python-pptx create an empty notes slide.
        if not getattr(slide, "has_notes_slide", False):
            return ""
        text_frame = slide.notes_slide.notes_text_frame
        if text_frame is None:  # notes slide without a notes placeholder
            return ""
        return "".join(
            f"{paragraph.text}\n"
            for paragraph in text_frame.paragraphs
            if paragraph.text
        )

    def extract_ppt(self, ppt_path: str, include_notes: bool = True,
                    include_hidden_slides: bool = False,
                    extract_images: bool = False) -> "List[Document]":
        """
        Extract content from a PowerPoint file.

        Args:
            ppt_path (str): Path to the PowerPoint file
            include_notes (bool): Whether to include speaker notes
            include_hidden_slides (bool): Whether to include hidden slides
            extract_images (bool): Accepted for interface compatibility;
                currently unused (no images are extracted)

        Returns:
            List[Document]: One Document per extracted slide. Metadata holds
                "source", "slide_number" (1-based position in the deck),
                "total_slides", "file_type" and, when present, "title".

        Raises:
            ValueError: If the file doesn't exist
            ImportError: If python-pptx is not installed
        """
        if not self.check_file(ppt_path):
            raise ValueError(f"File {ppt_path} not found")

        if not self.check_package("pptx"):
            raise ImportError("python-pptx package not found. Please install: pip install python-pptx")

        from pptx import Presentation

        try:
            presentation = Presentation(ppt_path)
            total_slides = len(presentation.slides)
            # Case-insensitive so "DECK.PPTX" is not labelled "ppt".
            file_type = "pptx" if ppt_path.lower().endswith(".pptx") else "ppt"

            documents = []

            for slide_number, slide in enumerate(presentation.slides, start=1):
                if not include_hidden_slides and self._is_hidden(slide):
                    continue

                # Text of every shape that exposes non-empty text.
                texts = [
                    shape.text
                    for shape in slide.shapes
                    if getattr(shape, "text", None)
                ]

                notes = self._notes_text(slide) if include_notes else ""

                metadata = {
                    "source": ppt_path,
                    "slide_number": slide_number,
                    "total_slides": total_slides,
                    "file_type": file_type
                }

                title_shape = slide.shapes.title
                if title_shape and title_shape.text:
                    metadata["title"] = title_shape.text

                # Header line, then shape text, then notes.
                content = f"Slide {slide_number}"
                if "title" in metadata:
                    content += f": {metadata['title']}"
                content += "\n\n"

                if texts:
                    content += "\n".join(texts) + "\n"

                if notes:
                    content += "\nNotes:\n" + notes

                documents.append(Document(
                    page_content=content,
                    metadata=metadata
                ))

            self._log("info", f"Extracted {len(documents)} slides from {ppt_path}")
            return documents

        except Exception as e:
            # Report the failure, then let the caller decide how to handle it.
            self._log("error", f"Error extracting from {ppt_path}: {str(e)}")
            raise

    def extract_multiple_ppts(self, ppt_paths: List[str], **kwargs) -> "List[Document]":
        """
        Extract content from multiple PowerPoint files.

        Files that fail to extract are reported and skipped, so one bad file
        does not abort the whole batch.

        Args:
            ppt_paths (List[str]): List of paths to PowerPoint files
            **kwargs: Additional arguments for extract_ppt

        Returns:
            List[Document]: Documents from every file that was extracted
                successfully, in input order
        """
        all_docs = []
        for ppt_path in ppt_paths:
            try:
                all_docs.extend(self.extract_ppt(ppt_path, **kwargs))
                self._log("info", f"Successfully extracted content from {ppt_path}")
            except Exception as e:
                self._log("error", f"Error extracting from {ppt_path}: {str(e)}")

        return all_docs
@@ -0,0 +1,428 @@
1
+ """
2
+ PDF Extraction Module
3
+
4
+ This module provides functionality for extracting text and metadata from PDF files.
5
+ It supports various extraction methods and includes features for handling different
6
+ PDF structures, including tables and images.
7
+
8
+ Example Usage:
9
+ ```python
10
+ # Initialize PDF extractor
11
+ extractor = PDFExtractor()
12
+
13
+ # Extract text from a PDF file
14
+ docs = extractor.extract_pdf("document.pdf")
15
+
16
+ # Extract with specific options
17
+ docs = extractor.extract_pdf(
18
+ "document.pdf",
19
+ extraction_method="pdfplumber",
20
+ extract_tables=True
21
+ )
22
+
23
+ # Extract from multiple PDFs
24
+ docs = extractor.extract_multiple_pdfs(
25
+ ["doc1.pdf", "doc2.pdf"],
26
+ extraction_method="pymupdf"
27
+ )
28
+ ```
29
+
30
+ Features:
31
+ - Multiple extraction methods (pypdf, PDFPlumber, PyMuPDF)
32
+ - Text and metadata extraction
33
+ - Optional image extraction
34
+ - Table detection and extraction
35
+ - Batch processing for multiple PDFs
36
+ """
37
+
38
+ import os
39
+ import tempfile
40
+ from typing import List, Dict, Optional, Union, Any, Tuple
41
+ import importlib.util
42
+ from langchain_core.documents import Document
43
+
44
class PDFExtractor:
    """
    Extract text and metadata from PDF files into ``Document`` objects.

    Three back ends are supported, selected by name: "pypdf" (through
    LangChain's PyPDFLoader), "pdfplumber" (optionally with tables) and
    "pymupdf" (optionally with embedded images). One document is produced
    per page.

    Args:
        logger: Optional logger instance. When omitted, messages are printed.

    Example:
        ```python
        extractor = PDFExtractor()
        docs = extractor.extract_pdf("document.pdf")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PDF extractor."""
        self.logger = logger

    def _log(self, level: str, message: str) -> None:
        """Send ``message`` to the logger method named ``level``, or print it."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Name of the package to check

        Returns:
            bool: True if package is installed, False otherwise
        """
        return importlib.util.find_spec(package_name) is not None

    def check_file(self, file_path: str) -> bool:
        """
        Check if a regular file exists at the given path.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path points to an existing file, False otherwise
                (including when it points to a directory)
        """
        return os.path.isfile(file_path)

    def extract_pdf(self, pdf_path: str, extraction_method: str = "pypdf",
                    extract_images: bool = False, extract_tables: bool = False,
                    **kwargs) -> "List[Document]":
        """
        Extract text and metadata from a PDF file.

        Args:
            pdf_path (str): Path to the PDF file
            extraction_method (str): Method to use for extraction
                ("pypdf", "pdfplumber", or "pymupdf")
            extract_images (bool): Whether to extract images; only honoured
                by the "pymupdf" method
            extract_tables (bool): Whether to extract tables; only honoured
                by the "pdfplumber" method
            **kwargs: Additional arguments forwarded to the selected
                extraction method

        Returns:
            List[Document]: List of Document objects containing extracted content

        Raises:
            ValueError: If the file doesn't exist or extraction method is invalid
            ImportError: If required packages are not installed
        """
        if not self.check_file(pdf_path):
            raise ValueError(f"File {pdf_path} not found")

        if extraction_method == "pypdf":
            return self._extract_with_pypdf(pdf_path, **kwargs)
        if extraction_method == "pdfplumber":
            return self._extract_with_pdfplumber(pdf_path, extract_tables, **kwargs)
        if extraction_method == "pymupdf":
            return self._extract_with_pymupdf(pdf_path, extract_images, **kwargs)
        raise ValueError(f"Invalid extraction method: {extraction_method}")

    def extract_multiple_pdfs(self, pdf_paths: List[str], extraction_method: str = "pypdf",
                              extract_images: bool = False, extract_tables: bool = False,
                              **kwargs) -> "List[Document]":
        """
        Extract text and metadata from multiple PDF files.

        Files that fail to extract are reported and skipped, so one bad file
        does not abort the whole batch.

        Args:
            pdf_paths (List[str]): List of paths to PDF files
            extraction_method (str): Method to use for extraction
            extract_images (bool): Whether to extract images
            extract_tables (bool): Whether to extract tables
            **kwargs: Additional arguments for the extraction method

        Returns:
            List[Document]: Documents from every file that was extracted
                successfully, in input order
        """
        all_docs = []
        for pdf_path in pdf_paths:
            try:
                docs = self.extract_pdf(
                    pdf_path,
                    extraction_method=extraction_method,
                    extract_images=extract_images,
                    extract_tables=extract_tables,
                    **kwargs
                )
                all_docs.extend(docs)
                self._log("info", f"Successfully extracted content from {pdf_path}")
            except Exception as e:
                self._log("error", f"Error extracting from {pdf_path}: {str(e)}")

        return all_docs

    def _extract_with_pypdf(self, pdf_path: str, **kwargs) -> "List[Document]":
        """
        Extract text using pypdf through LangChain's PyPDFLoader.

        Args:
            pdf_path (str): Path to the PDF file
            **kwargs: Additional arguments for PyPDFLoader

        Returns:
            List[Document]: List of Document objects as produced by the loader

        Raises:
            ImportError: If pypdf or langchain_community is not installed
        """
        if not self.check_package("pypdf"):
            raise ImportError("pypdf package not found. Please install: pip install pypdf")
        if not self.check_package("langchain_community"):
            raise ImportError(
                "langchain_community package not found. "
                "Please install: pip install langchain-community"
            )

        from langchain_community.document_loaders import PyPDFLoader

        documents = PyPDFLoader(pdf_path, **kwargs).load()

        self._log("info", f"Extracted {len(documents)} pages from {pdf_path} using pypdf")
        return documents

    def _extract_with_pdfplumber(self, pdf_path: str, extract_tables: bool = False,
                                 **kwargs) -> "List[Document]":
        """
        Extract text using PDFPlumber.

        Args:
            pdf_path (str): Path to the PDF file
            extract_tables (bool): Whether to extract tables. When True, each
                table is rendered as " | "-separated rows, stored under the
                "tables" metadata key and appended to the page text.
            **kwargs: Accepted for interface compatibility; currently unused

        Returns:
            List[Document]: One Document per page, with "source", "page"
                (1-based) and "total_pages" metadata

        Raises:
            ImportError: If PDFPlumber is not installed
        """
        if not self.check_package("pdfplumber"):
            raise ImportError("PDFPlumber package not found. Please install: pip install pdfplumber")

        import pdfplumber

        documents = []
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)
            for page_number, page in enumerate(pdf.pages, start=1):
                # extract_text() may yield None for a page without text;
                # normalise so concatenation and Document creation are safe.
                text = page.extract_text() or ""

                metadata = {
                    "source": pdf_path,
                    "page": page_number,
                    "total_pages": total_pages
                }

                if extract_tables:
                    tables = page.extract_tables()
                    if tables:
                        table_text = [
                            "\n".join(
                                " | ".join("" if cell is None else str(cell) for cell in row)
                                for row in table
                            )
                            for table in tables
                        ]
                        metadata["tables"] = table_text
                        text += "\n\nTABLES:\n" + "\n\n".join(table_text)

                documents.append(Document(page_content=text, metadata=metadata))

        self._log("info", f"Extracted {len(documents)} pages from {pdf_path} using PDFPlumber")
        return documents

    @staticmethod
    def _save_page_images(doc, page, page_number: int, image_dir: str, prefix: str) -> List[str]:
        """Write every image referenced by the page into image_dir; return the file paths."""
        saved_paths = []
        for image_index, image in enumerate(page.get_images(full=True), start=1):
            xref = image[0]
            extracted = doc.extract_image(xref)
            image_path = os.path.join(
                image_dir,
                f"{prefix}_page{page_number}_img{image_index}.{extracted['ext']}"
            )
            with open(image_path, "wb") as img_file:
                img_file.write(extracted["image"])
            saved_paths.append(image_path)
        return saved_paths

    def _extract_with_pymupdf(self, pdf_path: str, extract_images: bool = False,
                              image_dir: Optional[str] = None, **kwargs) -> "List[Document]":
        """
        Extract text using PyMuPDF (fitz).

        Args:
            pdf_path (str): Path to the PDF file
            extract_images (bool): Whether to save the images of each page to
                disk and list their paths under the "images" metadata key
            image_dir (str): Directory for extracted images. When omitted, a
                new temporary directory is created. The directory is left in
                place on success so that the returned paths stay valid; the
                caller is responsible for removing it.
            **kwargs: Accepted for interface compatibility; currently unused

        Returns:
            List[Document]: One Document per page, with "source", "page"
                (1-based), "total_pages", "title", "author", "subject" and
                "keywords" metadata

        Raises:
            ImportError: If PyMuPDF is not installed
        """
        if not self.check_package("fitz"):
            raise ImportError("PyMuPDF package not found. Please install: pip install pymupdf")

        import shutil

        import fitz

        documents = []
        created_dir = None  # set only when this call made the directory itself

        if extract_images:
            if image_dir is None:
                image_dir = created_dir = tempfile.mkdtemp(prefix="pdf_images_")
            else:
                os.makedirs(image_dir, exist_ok=True)

        # Prefix file names with the PDF name so several PDFs can share a directory.
        prefix = os.path.splitext(os.path.basename(pdf_path))[0]

        try:
            with fitz.open(pdf_path) as doc:
                total_pages = len(doc)
                info = doc.metadata or {}
                doc_info = {
                    key: info.get(key) or ""
                    for key in ("title", "author", "subject", "keywords")
                }

                for page_number, page in enumerate(doc, start=1):
                    metadata = {
                        "source": pdf_path,
                        "page": page_number,
                        "total_pages": total_pages,
                        **doc_info
                    }

                    if extract_images:
                        image_paths = self._save_page_images(
                            doc, page, page_number, image_dir, prefix
                        )
                        if image_paths:
                            metadata["images"] = image_paths

                    documents.append(Document(page_content=page.get_text(), metadata=metadata))
        except Exception:
            # Nothing is returned on failure, so a directory created here
            # would be orphaned; a caller-supplied one is left untouched.
            if created_dir is not None:
                shutil.rmtree(created_dir, ignore_errors=True)
            raise

        self._log("info", f"Extracted {len(documents)} pages from {pdf_path} using PyMuPDF")
        return documents
324
+
325
+
326
class PDFToCSV:
    """
    Convert tables found in PDF files to CSV files.

    Tables are detected with tabula-py and each non-empty table is written
    to its own CSV file.

    Args:
        logger: Optional logger instance. When omitted, messages are printed.

    Example:
        ```python
        converter = PDFToCSV()
        csv_paths = converter.convert_pdf_tables_to_csv("document.pdf", "output_dir")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PDF to CSV converter."""
        self.logger = logger

    def _log(self, level: str, message: str) -> None:
        """Send ``message`` to the logger method named ``level``, or print it."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Name of the package to check

        Returns:
            bool: True if package is installed, False otherwise
        """
        return importlib.util.find_spec(package_name) is not None

    def convert_pdf_tables_to_csv(self, pdf_path: str, output_dir: Optional[str] = None,
                                  pages: Optional[List[int]] = None) -> List[str]:
        """
        Extract tables from PDF and convert to CSV.

        Files are named "<pdf name>_table_<n>.csv", where n is the 1-based
        position of the table among all tables found; empty tables are
        skipped, so the numbering may have gaps.

        Args:
            pdf_path (str): Path to the PDF file
            output_dir (str): Directory to save CSV files (default: same as PDF)
            pages (List[int]): Specific pages to extract tables from (default: all)

        Returns:
            List[str]: Paths to the created CSV files; empty if table
                extraction fails or no tables are found

        Raises:
            ImportError: If required packages are not installed
            ValueError: If the PDF file doesn't exist
        """
        if not os.path.exists(pdf_path):
            raise ValueError(f"PDF file not found: {pdf_path}")

        if not self.check_package("tabula"):
            raise ImportError("Tabula-py package not found. Please install: pip install tabula-py")

        import tabula

        if output_dir is None:
            # dirname() is "" for a bare file name, and os.makedirs("") raises;
            # fall back to the current directory, which is where the PDF lives.
            output_dir = os.path.dirname(pdf_path) or "."

        os.makedirs(output_dir, exist_ok=True)

        try:
            dfs = tabula.read_pdf(pdf_path, pages=pages if pages else 'all', multiple_tables=True)
        except Exception as e:
            # Best effort: report the failure and return no files.
            self._log("error", f"Error extracting tables: {str(e)}")
            return []

        if not dfs:
            self._log("warning", f"No tables found in {pdf_path}")
            return []

        csv_paths = []
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

        for table_number, df in enumerate(dfs, start=1):
            if df.empty:
                continue
            csv_path = os.path.join(output_dir, f"{pdf_name}_table_{table_number}.csv")
            df.to_csv(csv_path, index=False)
            csv_paths.append(csv_path)
            self._log("info", f"Saved table {table_number} to {csv_path}")

        return csv_paths
@@ -1,5 +1,5 @@
1
1
  MAJOR_VERSION = 1
2
2
  MINOR_VERSION = 1
3
- PATCH_VERSION = 24
3
+ PATCH_VERSION = 29
4
4
  version = '{}.{}.{}'.format(MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION)
5
5
  __all__ = ['MAJOR_VERSION', 'MINOR_VERSION', 'PATCH_VERSION', 'version']
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: mb_rag
3
- Version: 1.1.24
3
+ Version: 1.1.29
4
4
  Summary: RAG function file
5
5
  Author: ['Malav Bateriwala']
6
6
  Requires-Python: >=3.8
@@ -16,4 +16,6 @@ mb_rag/rag/__init__.py
16
16
  mb_rag/rag/embeddings.py
17
17
  mb_rag/utils/__init__.py
18
18
  mb_rag/utils/bounding_box.py
19
- mb_rag/utils/extra.py
19
+ mb_rag/utils/document_extract.py
20
+ mb_rag/utils/extra.py
21
+ mb_rag/utils/pdf_extract.py
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes