mb-rag 1.1.57.post1__py3-none-any.whl → 1.1.59__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mb-rag might be problematic. Click here for more details.

@@ -1,354 +1,354 @@
1
- """
2
- Document Extraction Module
3
-
4
- This module provides functionality for extracting text and metadata from various document types
5
- including CSV, PowerPoint (PPT/PPTX), and other document formats. It complements the PDF extraction
6
- functionality in pdf_extract.py.
7
-
8
- Example Usage:
9
- ```python
10
- # Initialize CSV extractor
11
- csv_extractor = CSVExtractor()
12
-
13
- # Extract data from a CSV file
14
- docs = csv_extractor.extract_csv("data.csv")
15
-
16
- # Initialize PowerPoint extractor
17
- ppt_extractor = PowerPointExtractor()
18
-
19
- # Extract content from a PowerPoint file
20
- docs = ppt_extractor.extract_ppt("presentation.pptx")
21
- ```
22
-
23
- Features:
24
- - CSV file extraction with metadata
25
- - PowerPoint (PPT/PPTX) extraction
26
- - Batch processing for multiple files
27
- """
28
-
29
- import os
30
- import pandas as pd
31
- import importlib.util
32
- from typing import List, Dict, Optional, Union, Any
33
- from langchain_core.documents import Document
34
-
35
class CSVExtractor:
    """
    Class for extracting data from CSV files.

    Each file is read with ``pandas.read_csv`` and rendered as plain text,
    either as a single Document for the whole file or as one Document per
    group of rows. Every Document carries file-level metadata (source path,
    row count, column names and, optionally, basic numeric statistics).

    Args:
        logger: Optional logger instance for logging operations. When it is
            omitted (or falsy), messages are written with ``print`` instead.

    Example:
        ```python
        extractor = CSVExtractor()
        docs = extractor.extract_csv("data.csv")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the CSV extractor."""
        self.logger = logger

    def _log(self, message: str, error: bool = False) -> None:
        """Send ``message`` to the configured logger, or print it if there is none."""
        if self.logger:
            if error:
                self.logger.error(message)
            else:
                self.logger.info(message)
        else:
            print(message)

    def check_file(self, file_path: str) -> bool:
        """
        Check if file exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    @staticmethod
    def _column_statistics(df: pd.DataFrame) -> Dict[Any, Dict[str, float]]:
        """
        Compute min/max/mean/median for every numeric column of ``df``.

        Args:
            df: Data frame to summarize

        Returns:
            dict: Mapping of column name to its statistics, with every value
                converted to a plain ``float``. Non-numeric columns are
                omitted.
        """
        stats = {}
        for column in df.columns:
            series = df[column]
            if pd.api.types.is_numeric_dtype(series):
                stats[column] = {
                    "min": float(series.min()),
                    "max": float(series.max()),
                    "mean": float(series.mean()),
                    "median": float(series.median()),
                }
        return stats

    def extract_csv(self, csv_path: str, include_stats: bool = True,
                    chunk_by_row: bool = False, rows_per_chunk: int = 10,
                    **kwargs) -> List["Document"]:
        """
        Extract data from a CSV file.

        Args:
            csv_path (str): Path to the CSV file
            include_stats (bool): Whether to include basic statistics in the metadata
            chunk_by_row (bool): Whether to create a separate document for each group of rows
            rows_per_chunk (int): Number of rows per chunk if chunk_by_row is True
            **kwargs: Additional arguments for pandas.read_csv

        Returns:
            List[Document]: List of Document objects containing extracted content.
                With ``chunk_by_row`` a file without data rows yields an
                empty list; otherwise exactly one Document is returned.

        Raises:
            ValueError: If the file doesn't exist, or if ``chunk_by_row`` is
                True and ``rows_per_chunk`` is smaller than 1
            Exception: Any error raised while reading or converting the file
                is logged and re-raised unchanged
        """
        if not self.check_file(csv_path):
            raise ValueError(f"File {csv_path} not found")

        # A step of 0 makes range() fail with an unrelated message and a
        # negative step silently yields no chunks, so reject both up front.
        if chunk_by_row and rows_per_chunk < 1:
            raise ValueError(
                f"rows_per_chunk must be a positive integer, got {rows_per_chunk}"
            )

        try:
            df = pd.read_csv(csv_path, **kwargs)
            total_rows = len(df)

            metadata = {
                "source": csv_path,
                "rows": total_rows,
                "columns": list(df.columns),
                "file_type": "csv",
            }
            if include_stats:
                metadata["statistics"] = self._column_statistics(df)

            documents = []
            if chunk_by_row:
                for start in range(0, total_rows, rows_per_chunk):
                    chunk = df.iloc[start:start + rows_per_chunk]

                    # Shallow copy: the "columns" list and "statistics" dict
                    # are shared between all chunk documents.
                    chunk_metadata = metadata.copy()
                    chunk_metadata["chunk"] = {
                        "start_row": start,
                        # Inclusive index of the last row in this chunk.
                        "end_row": min(start + rows_per_chunk - 1, total_rows - 1),
                        # Number of rows in this chunk (not in the file).
                        "total_rows": len(chunk),
                    }
                    documents.append(Document(
                        page_content=chunk.to_string(index=False),
                        metadata=chunk_metadata,
                    ))
            else:
                documents.append(Document(
                    page_content=df.to_string(index=False),
                    metadata=metadata,
                ))

            self._log(f"Extracted data from {csv_path}")
            return documents

        except Exception as e:
            self._log(f"Error extracting from {csv_path}: {e}", error=True)
            raise

    def extract_multiple_csvs(self, csv_paths: List[str], **kwargs) -> List["Document"]:
        """
        Extract data from multiple CSV files.

        A file that fails to extract is reported and skipped; it does not
        abort the batch.

        Args:
            csv_paths (List[str]): List of paths to CSV files
            **kwargs: Additional arguments for extract_csv

        Returns:
            List[Document]: Documents of all files that could be extracted,
                in the order of ``csv_paths``
        """
        all_docs = []
        for csv_path in csv_paths:
            try:
                all_docs.extend(self.extract_csv(csv_path, **kwargs))
                self._log(f"Successfully extracted content from {csv_path}")
            except Exception as e:
                self._log(f"Error extracting from {csv_path}: {e}", error=True)

        return all_docs
184
-
185
-
186
class PowerPointExtractor:
    """
    Class for extracting content from PowerPoint (PPT/PPTX) files.

    This class provides methods for extracting text, notes, and metadata
    from PowerPoint presentations. One Document is produced per slide.

    Args:
        logger: Optional logger instance for logging operations. When it is
            omitted (or falsy), messages are written with ``print`` instead.

    Example:
        ```python
        extractor = PowerPointExtractor()
        docs = extractor.extract_ppt("presentation.pptx")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PowerPoint extractor."""
        self.logger = logger

    def _log(self, message: str, error: bool = False) -> None:
        """Send ``message`` to the configured logger, or print it if there is none."""
        if self.logger:
            if error:
                self.logger.error(message)
            else:
                self.logger.info(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Name of the package to check

        Returns:
            bool: True if package is installed, False otherwise
        """
        try:
            return importlib.util.find_spec(package_name) is not None
        except (ImportError, ValueError):
            # find_spec raises instead of returning None when the parent of
            # a dotted name cannot be imported or the name is empty.
            return False

    def check_file(self, file_path: str) -> bool:
        """
        Check if file exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    @staticmethod
    def _file_type(ppt_path: str) -> str:
        """Return "pptx" for a path ending in .pptx (any letter case), else "ppt"."""
        return "pptx" if ppt_path.lower().endswith(".pptx") else "ppt"

    @staticmethod
    def _is_hidden(slide) -> bool:
        """
        Tell whether ``slide`` is marked as hidden.

        A ``show`` attribute on the slide object is honoured when present.
        Otherwise the ``show`` attribute of the slide's XML element is read,
        where "0" or "false" marks a hidden slide.
        """
        show = getattr(slide, "show", None)
        if show is not None:
            return not show

        element = getattr(slide, "element", None)
        # Compare with None explicitly: XML elements without children are
        # falsy, so a plain truth test would misreport them as missing.
        if element is None:
            return False
        raw = element.get("show")
        return raw is not None and raw.strip().lower() in ("0", "false")

    @staticmethod
    def _notes_text(slide) -> str:
        """
        Return the speaker notes of ``slide``, one non-empty paragraph per line.

        ``has_notes_slide`` is checked first so that ``notes_slide`` is only
        read for slides that already have notes. An empty string is returned
        when there is no notes slide or no notes text frame.
        """
        if not getattr(slide, "has_notes_slide", False):
            return ""
        frame = slide.notes_slide.notes_text_frame
        if frame is None:
            return ""
        return "".join(
            paragraph.text + "\n" for paragraph in frame.paragraphs if paragraph.text
        )

    def extract_ppt(self, ppt_path: str, include_notes: bool = True,
                    include_hidden_slides: bool = False,
                    extract_images: bool = False) -> List["Document"]:
        """
        Extract content from a PowerPoint file.

        Args:
            ppt_path (str): Path to the PowerPoint file
            include_notes (bool): Whether to include speaker notes
            include_hidden_slides (bool): Whether to include hidden slides
            extract_images (bool): Accepted for interface compatibility;
                this implementation does not extract images

        Returns:
            List[Document]: One Document per extracted slide. ``slide_number``
                in the metadata is the slide's 1-based position in the deck,
                so numbers are not renumbered when hidden slides are skipped.

        Raises:
            ValueError: If the file doesn't exist
            ImportError: If python-pptx is not installed
            Exception: Any error raised while reading the presentation is
                logged and re-raised unchanged
        """
        if not self.check_file(ppt_path):
            raise ValueError(f"File {ppt_path} not found")

        if not self.check_package("pptx"):
            raise ImportError("python-pptx package not found. Please install: pip install python-pptx")

        from pptx import Presentation

        try:
            presentation = Presentation(ppt_path)

            # Identical for every slide, so compute them once.
            total_slides = len(presentation.slides)
            file_type = self._file_type(ppt_path)

            documents = []
            for number, slide in enumerate(presentation.slides, start=1):
                if not include_hidden_slides and self._is_hidden(slide):
                    continue

                # Shapes without a text attribute, or with empty text,
                # contribute nothing.
                texts = []
                for shape in slide.shapes:
                    text = getattr(shape, "text", None)
                    if text:
                        texts.append(text)

                notes = self._notes_text(slide) if include_notes else ""

                metadata = {
                    "source": ppt_path,
                    "slide_number": number,
                    "total_slides": total_slides,
                    "file_type": file_type,
                }

                title_shape = slide.shapes.title
                title = title_shape.text if title_shape is not None else ""
                if title:
                    metadata["title"] = title

                # Header line, then shape texts, then the optional notes.
                content = f"Slide {number}"
                if title:
                    content += f": {title}"
                content += "\n\n"
                if texts:
                    content += "\n".join(texts) + "\n"
                if notes:
                    content += "\nNotes:\n" + notes

                documents.append(Document(
                    page_content=content,
                    metadata=metadata,
                ))

            self._log(f"Extracted {len(documents)} slides from {ppt_path}")
            return documents

        except Exception as e:
            self._log(f"Error extracting from {ppt_path}: {e}", error=True)
            raise

    def extract_multiple_ppts(self, ppt_paths: List[str], **kwargs) -> List["Document"]:
        """
        Extract content from multiple PowerPoint files.

        A file that fails to extract is reported and skipped; it does not
        abort the batch.

        Args:
            ppt_paths (List[str]): List of paths to PowerPoint files
            **kwargs: Additional arguments for extract_ppt

        Returns:
            List[Document]: Documents of all files that could be extracted,
                in the order of ``ppt_paths``
        """
        all_docs = []
        for ppt_path in ppt_paths:
            try:
                all_docs.extend(self.extract_ppt(ppt_path, **kwargs))
                self._log(f"Successfully extracted content from {ppt_path}")
            except Exception as e:
                self._log(f"Error extracting from {ppt_path}: {e}", error=True)

        return all_docs
1
+ """
2
+ Document Extraction Module
3
+
4
+ This module provides functionality for extracting text and metadata from various document types
5
+ including CSV, PowerPoint (PPT/PPTX), and other document formats. It complements the PDF extraction
6
+ functionality in pdf_extract.py.
7
+
8
+ Example Usage:
9
+ ```python
10
+ # Initialize CSV extractor
11
+ csv_extractor = CSVExtractor()
12
+
13
+ # Extract data from a CSV file
14
+ docs = csv_extractor.extract_csv("data.csv")
15
+
16
+ # Initialize PowerPoint extractor
17
+ ppt_extractor = PowerPointExtractor()
18
+
19
+ # Extract content from a PowerPoint file
20
+ docs = ppt_extractor.extract_ppt("presentation.pptx")
21
+ ```
22
+
23
+ Features:
24
+ - CSV file extraction with metadata
25
+ - PowerPoint (PPT/PPTX) extraction
26
+ - Batch processing for multiple files
27
+ """
28
+
29
+ import os
30
+ import pandas as pd
31
+ import importlib.util
32
+ from typing import List, Dict, Optional, Union, Any
33
+ from langchain_core.documents import Document
34
+
35
class CSVExtractor:
    """
    Class for extracting data from CSV files.

    Each file is read with ``pandas.read_csv`` and rendered as plain text,
    either as a single Document for the whole file or as one Document per
    group of rows. Every Document carries file-level metadata (source path,
    row count, column names and, optionally, basic numeric statistics).

    Args:
        logger: Optional logger instance for logging operations. When it is
            omitted (or falsy), messages are written with ``print`` instead.

    Example:
        ```python
        extractor = CSVExtractor()
        docs = extractor.extract_csv("data.csv")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the CSV extractor."""
        self.logger = logger

    def _log(self, message: str, error: bool = False) -> None:
        """Send ``message`` to the configured logger, or print it if there is none."""
        if self.logger:
            if error:
                self.logger.error(message)
            else:
                self.logger.info(message)
        else:
            print(message)

    def check_file(self, file_path: str) -> bool:
        """
        Check if file exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    @staticmethod
    def _column_statistics(df: pd.DataFrame) -> Dict[Any, Dict[str, float]]:
        """
        Compute min/max/mean/median for every numeric column of ``df``.

        Args:
            df: Data frame to summarize

        Returns:
            dict: Mapping of column name to its statistics, with every value
                converted to a plain ``float``. Non-numeric columns are
                omitted.
        """
        stats = {}
        for column in df.columns:
            series = df[column]
            if pd.api.types.is_numeric_dtype(series):
                stats[column] = {
                    "min": float(series.min()),
                    "max": float(series.max()),
                    "mean": float(series.mean()),
                    "median": float(series.median()),
                }
        return stats

    def extract_csv(self, csv_path: str, include_stats: bool = True,
                    chunk_by_row: bool = False, rows_per_chunk: int = 10,
                    **kwargs) -> List["Document"]:
        """
        Extract data from a CSV file.

        Args:
            csv_path (str): Path to the CSV file
            include_stats (bool): Whether to include basic statistics in the metadata
            chunk_by_row (bool): Whether to create a separate document for each group of rows
            rows_per_chunk (int): Number of rows per chunk if chunk_by_row is True
            **kwargs: Additional arguments for pandas.read_csv

        Returns:
            List[Document]: List of Document objects containing extracted content.
                With ``chunk_by_row`` a file without data rows yields an
                empty list; otherwise exactly one Document is returned.

        Raises:
            ValueError: If the file doesn't exist, or if ``chunk_by_row`` is
                True and ``rows_per_chunk`` is smaller than 1
            Exception: Any error raised while reading or converting the file
                is logged and re-raised unchanged
        """
        if not self.check_file(csv_path):
            raise ValueError(f"File {csv_path} not found")

        # A step of 0 makes range() fail with an unrelated message and a
        # negative step silently yields no chunks, so reject both up front.
        if chunk_by_row and rows_per_chunk < 1:
            raise ValueError(
                f"rows_per_chunk must be a positive integer, got {rows_per_chunk}"
            )

        try:
            df = pd.read_csv(csv_path, **kwargs)
            total_rows = len(df)

            metadata = {
                "source": csv_path,
                "rows": total_rows,
                "columns": list(df.columns),
                "file_type": "csv",
            }
            if include_stats:
                metadata["statistics"] = self._column_statistics(df)

            documents = []
            if chunk_by_row:
                for start in range(0, total_rows, rows_per_chunk):
                    chunk = df.iloc[start:start + rows_per_chunk]

                    # Shallow copy: the "columns" list and "statistics" dict
                    # are shared between all chunk documents.
                    chunk_metadata = metadata.copy()
                    chunk_metadata["chunk"] = {
                        "start_row": start,
                        # Inclusive index of the last row in this chunk.
                        "end_row": min(start + rows_per_chunk - 1, total_rows - 1),
                        # Number of rows in this chunk (not in the file).
                        "total_rows": len(chunk),
                    }
                    documents.append(Document(
                        page_content=chunk.to_string(index=False),
                        metadata=chunk_metadata,
                    ))
            else:
                documents.append(Document(
                    page_content=df.to_string(index=False),
                    metadata=metadata,
                ))

            self._log(f"Extracted data from {csv_path}")
            return documents

        except Exception as e:
            self._log(f"Error extracting from {csv_path}: {e}", error=True)
            raise

    def extract_multiple_csvs(self, csv_paths: List[str], **kwargs) -> List["Document"]:
        """
        Extract data from multiple CSV files.

        A file that fails to extract is reported and skipped; it does not
        abort the batch.

        Args:
            csv_paths (List[str]): List of paths to CSV files
            **kwargs: Additional arguments for extract_csv

        Returns:
            List[Document]: Documents of all files that could be extracted,
                in the order of ``csv_paths``
        """
        all_docs = []
        for csv_path in csv_paths:
            try:
                all_docs.extend(self.extract_csv(csv_path, **kwargs))
                self._log(f"Successfully extracted content from {csv_path}")
            except Exception as e:
                self._log(f"Error extracting from {csv_path}: {e}", error=True)

        return all_docs
184
+
185
+
186
class PowerPointExtractor:
    """
    Class for extracting content from PowerPoint (PPT/PPTX) files.

    This class provides methods for extracting text, notes, and metadata
    from PowerPoint presentations. One Document is produced per slide.

    Args:
        logger: Optional logger instance for logging operations. When it is
            omitted (or falsy), messages are written with ``print`` instead.

    Example:
        ```python
        extractor = PowerPointExtractor()
        docs = extractor.extract_ppt("presentation.pptx")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PowerPoint extractor."""
        self.logger = logger

    def _log(self, message: str, error: bool = False) -> None:
        """Send ``message`` to the configured logger, or print it if there is none."""
        if self.logger:
            if error:
                self.logger.error(message)
            else:
                self.logger.info(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Name of the package to check

        Returns:
            bool: True if package is installed, False otherwise
        """
        try:
            return importlib.util.find_spec(package_name) is not None
        except (ImportError, ValueError):
            # find_spec raises instead of returning None when the parent of
            # a dotted name cannot be imported or the name is empty.
            return False

    def check_file(self, file_path: str) -> bool:
        """
        Check if file exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    @staticmethod
    def _file_type(ppt_path: str) -> str:
        """Return "pptx" for a path ending in .pptx (any letter case), else "ppt"."""
        return "pptx" if ppt_path.lower().endswith(".pptx") else "ppt"

    @staticmethod
    def _is_hidden(slide) -> bool:
        """
        Tell whether ``slide`` is marked as hidden.

        A ``show`` attribute on the slide object is honoured when present.
        Otherwise the ``show`` attribute of the slide's XML element is read,
        where "0" or "false" marks a hidden slide.
        """
        show = getattr(slide, "show", None)
        if show is not None:
            return not show

        element = getattr(slide, "element", None)
        # Compare with None explicitly: XML elements without children are
        # falsy, so a plain truth test would misreport them as missing.
        if element is None:
            return False
        raw = element.get("show")
        return raw is not None and raw.strip().lower() in ("0", "false")

    @staticmethod
    def _notes_text(slide) -> str:
        """
        Return the speaker notes of ``slide``, one non-empty paragraph per line.

        ``has_notes_slide`` is checked first so that ``notes_slide`` is only
        read for slides that already have notes. An empty string is returned
        when there is no notes slide or no notes text frame.
        """
        if not getattr(slide, "has_notes_slide", False):
            return ""
        frame = slide.notes_slide.notes_text_frame
        if frame is None:
            return ""
        return "".join(
            paragraph.text + "\n" for paragraph in frame.paragraphs if paragraph.text
        )

    def extract_ppt(self, ppt_path: str, include_notes: bool = True,
                    include_hidden_slides: bool = False,
                    extract_images: bool = False) -> List["Document"]:
        """
        Extract content from a PowerPoint file.

        Args:
            ppt_path (str): Path to the PowerPoint file
            include_notes (bool): Whether to include speaker notes
            include_hidden_slides (bool): Whether to include hidden slides
            extract_images (bool): Accepted for interface compatibility;
                this implementation does not extract images

        Returns:
            List[Document]: One Document per extracted slide. ``slide_number``
                in the metadata is the slide's 1-based position in the deck,
                so numbers are not renumbered when hidden slides are skipped.

        Raises:
            ValueError: If the file doesn't exist
            ImportError: If python-pptx is not installed
            Exception: Any error raised while reading the presentation is
                logged and re-raised unchanged
        """
        if not self.check_file(ppt_path):
            raise ValueError(f"File {ppt_path} not found")

        if not self.check_package("pptx"):
            raise ImportError("python-pptx package not found. Please install: pip install python-pptx")

        from pptx import Presentation

        try:
            presentation = Presentation(ppt_path)

            # Identical for every slide, so compute them once.
            total_slides = len(presentation.slides)
            file_type = self._file_type(ppt_path)

            documents = []
            for number, slide in enumerate(presentation.slides, start=1):
                if not include_hidden_slides and self._is_hidden(slide):
                    continue

                # Shapes without a text attribute, or with empty text,
                # contribute nothing.
                texts = []
                for shape in slide.shapes:
                    text = getattr(shape, "text", None)
                    if text:
                        texts.append(text)

                notes = self._notes_text(slide) if include_notes else ""

                metadata = {
                    "source": ppt_path,
                    "slide_number": number,
                    "total_slides": total_slides,
                    "file_type": file_type,
                }

                title_shape = slide.shapes.title
                title = title_shape.text if title_shape is not None else ""
                if title:
                    metadata["title"] = title

                # Header line, then shape texts, then the optional notes.
                content = f"Slide {number}"
                if title:
                    content += f": {title}"
                content += "\n\n"
                if texts:
                    content += "\n".join(texts) + "\n"
                if notes:
                    content += "\nNotes:\n" + notes

                documents.append(Document(
                    page_content=content,
                    metadata=metadata,
                ))

            self._log(f"Extracted {len(documents)} slides from {ppt_path}")
            return documents

        except Exception as e:
            self._log(f"Error extracting from {ppt_path}: {e}", error=True)
            raise

    def extract_multiple_ppts(self, ppt_paths: List[str], **kwargs) -> List["Document"]:
        """
        Extract content from multiple PowerPoint files.

        A file that fails to extract is reported and skipped; it does not
        abort the batch.

        Args:
            ppt_paths (List[str]): List of paths to PowerPoint files
            **kwargs: Additional arguments for extract_ppt

        Returns:
            List[Document]: Documents of all files that could be extracted,
                in the order of ``ppt_paths``
        """
        all_docs = []
        for ppt_path in ppt_paths:
            try:
                all_docs.extend(self.extract_ppt(ppt_path, **kwargs))
                self._log(f"Successfully extracted content from {ppt_path}")
            except Exception as e:
                self._log(f"Error extracting from {ppt_path}: {e}", error=True)

        return all_docs