mb-rag 1.1.47__py3-none-any.whl → 1.1.56.post0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mb-rag might be problematic. Click here for more details.
- mb_rag/basic.py +306 -0
- mb_rag/chatbot/chains.py +206 -206
- mb_rag/chatbot/conversation.py +185 -0
- mb_rag/chatbot/prompts.py +58 -58
- mb_rag/rag/embeddings.py +810 -810
- mb_rag/utils/all_data_extract.py +64 -64
- mb_rag/utils/bounding_box.py +231 -231
- mb_rag/utils/document_extract.py +354 -354
- mb_rag/utils/extra.py +73 -73
- mb_rag/utils/pdf_extract.py +428 -428
- mb_rag/version.py +1 -1
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.56.post0.dist-info}/METADATA +11 -11
- mb_rag-1.1.56.post0.dist-info/RECORD +19 -0
- mb_rag/chatbot/basic.py +0 -644
- mb_rag-1.1.47.dist-info/RECORD +0 -18
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.56.post0.dist-info}/WHEEL +0 -0
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.56.post0.dist-info}/top_level.txt +0 -0
mb_rag/utils/document_extract.py
CHANGED
|
@@ -1,354 +1,354 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Document Extraction Module
|
|
3
|
-
|
|
4
|
-
This module provides functionality for extracting text and metadata from various document types
|
|
5
|
-
including CSV, PowerPoint (PPT/PPTX), and other document formats. It complements the PDF extraction
|
|
6
|
-
functionality in pdf_extract.py.
|
|
7
|
-
|
|
8
|
-
Example Usage:
|
|
9
|
-
```python
|
|
10
|
-
# Initialize CSV extractor
|
|
11
|
-
csv_extractor = CSVExtractor()
|
|
12
|
-
|
|
13
|
-
# Extract data from a CSV file
|
|
14
|
-
docs = csv_extractor.extract_csv("data.csv")
|
|
15
|
-
|
|
16
|
-
# Initialize PowerPoint extractor
|
|
17
|
-
ppt_extractor = PowerPointExtractor()
|
|
18
|
-
|
|
19
|
-
# Extract content from a PowerPoint file
|
|
20
|
-
docs = ppt_extractor.extract_ppt("presentation.pptx")
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
Features:
|
|
24
|
-
- CSV file extraction with metadata
|
|
25
|
-
- PowerPoint (PPT/PPTX) extraction
|
|
26
|
-
- Batch processing for multiple files
|
|
27
|
-
"""
|
|
28
|
-
|
|
29
|
-
import os
|
|
30
|
-
import pandas as pd
|
|
31
|
-
import importlib.util
|
|
32
|
-
from typing import List, Dict, Optional, Union, Any
|
|
33
|
-
from langchain_core.documents import Document
|
|
34
|
-
|
|
35
|
-
class CSVExtractor:
|
|
36
|
-
"""
|
|
37
|
-
Class for extracting data from CSV files.
|
|
38
|
-
|
|
39
|
-
This class provides methods for extracting content from CSV files
|
|
40
|
-
and converting it to Document objects for use with RAG systems.
|
|
41
|
-
|
|
42
|
-
Args:
|
|
43
|
-
logger: Optional logger instance for logging operations
|
|
44
|
-
|
|
45
|
-
Example:
|
|
46
|
-
```python
|
|
47
|
-
extractor = CSVExtractor()
|
|
48
|
-
docs = extractor.extract_csv("data.csv")
|
|
49
|
-
```
|
|
50
|
-
"""
|
|
51
|
-
|
|
52
|
-
def __init__(self, logger=None):
|
|
53
|
-
"""Initialize the CSV extractor."""
|
|
54
|
-
self.logger = logger
|
|
55
|
-
|
|
56
|
-
def check_file(self, file_path: str) -> bool:
|
|
57
|
-
"""
|
|
58
|
-
Check if file exists.
|
|
59
|
-
|
|
60
|
-
Args:
|
|
61
|
-
file_path (str): Path to the file
|
|
62
|
-
|
|
63
|
-
Returns:
|
|
64
|
-
bool: True if file exists, False otherwise
|
|
65
|
-
"""
|
|
66
|
-
return os.path.exists(file_path)
|
|
67
|
-
|
|
68
|
-
def extract_csv(self, csv_path: str, include_stats: bool = True,
|
|
69
|
-
chunk_by_row: bool = False, rows_per_chunk: int = 10,
|
|
70
|
-
**kwargs) -> List[Document]:
|
|
71
|
-
"""
|
|
72
|
-
Extract data from a CSV file.
|
|
73
|
-
|
|
74
|
-
Args:
|
|
75
|
-
csv_path (str): Path to the CSV file
|
|
76
|
-
include_stats (bool): Whether to include basic statistics in the metadata
|
|
77
|
-
chunk_by_row (bool): Whether to create a separate document for each row or group of rows
|
|
78
|
-
rows_per_chunk (int): Number of rows per chunk if chunk_by_row is True
|
|
79
|
-
**kwargs: Additional arguments for pandas.read_csv
|
|
80
|
-
|
|
81
|
-
Returns:
|
|
82
|
-
List[Document]: List of Document objects containing extracted content
|
|
83
|
-
|
|
84
|
-
Raises:
|
|
85
|
-
ValueError: If the file doesn't exist
|
|
86
|
-
ImportError: If pandas is not installed
|
|
87
|
-
"""
|
|
88
|
-
if not self.check_file(csv_path):
|
|
89
|
-
raise ValueError(f"File {csv_path} not found")
|
|
90
|
-
|
|
91
|
-
try:
|
|
92
|
-
# Read CSV file
|
|
93
|
-
df = pd.read_csv(csv_path, **kwargs)
|
|
94
|
-
|
|
95
|
-
# Create metadata
|
|
96
|
-
metadata = {
|
|
97
|
-
"source": csv_path,
|
|
98
|
-
"rows": len(df),
|
|
99
|
-
"columns": list(df.columns),
|
|
100
|
-
"file_type": "csv"
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
# Add basic statistics if requested
|
|
104
|
-
if include_stats:
|
|
105
|
-
stats = {}
|
|
106
|
-
for column in df.columns:
|
|
107
|
-
if pd.api.types.is_numeric_dtype(df[column]):
|
|
108
|
-
stats[column] = {
|
|
109
|
-
"min": float(df[column].min()),
|
|
110
|
-
"max": float(df[column].max()),
|
|
111
|
-
"mean": float(df[column].mean()),
|
|
112
|
-
"median": float(df[column].median())
|
|
113
|
-
}
|
|
114
|
-
metadata["statistics"] = stats
|
|
115
|
-
|
|
116
|
-
documents = []
|
|
117
|
-
|
|
118
|
-
if chunk_by_row:
|
|
119
|
-
# Create a separate document for each chunk of rows
|
|
120
|
-
for i in range(0, len(df), rows_per_chunk):
|
|
121
|
-
chunk = df.iloc[i:i+rows_per_chunk]
|
|
122
|
-
chunk_text = chunk.to_string(index=False)
|
|
123
|
-
|
|
124
|
-
chunk_metadata = metadata.copy()
|
|
125
|
-
chunk_metadata["chunk"] = {
|
|
126
|
-
"start_row": i,
|
|
127
|
-
"end_row": min(i + rows_per_chunk - 1, len(df) - 1),
|
|
128
|
-
"total_rows": len(chunk)
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
documents.append(Document(
|
|
132
|
-
page_content=chunk_text,
|
|
133
|
-
metadata=chunk_metadata
|
|
134
|
-
))
|
|
135
|
-
else:
|
|
136
|
-
# Create a single document with all data
|
|
137
|
-
text = df.to_string(index=False)
|
|
138
|
-
documents.append(Document(
|
|
139
|
-
page_content=text,
|
|
140
|
-
metadata=metadata
|
|
141
|
-
))
|
|
142
|
-
|
|
143
|
-
if self.logger:
|
|
144
|
-
self.logger.info(f"Extracted data from {csv_path}")
|
|
145
|
-
else:
|
|
146
|
-
print(f"Extracted data from {csv_path}")
|
|
147
|
-
|
|
148
|
-
return documents
|
|
149
|
-
|
|
150
|
-
except Exception as e:
|
|
151
|
-
if self.logger:
|
|
152
|
-
self.logger.error(f"Error extracting from {csv_path}: {str(e)}")
|
|
153
|
-
else:
|
|
154
|
-
print(f"Error extracting from {csv_path}: {str(e)}")
|
|
155
|
-
raise
|
|
156
|
-
|
|
157
|
-
def extract_multiple_csvs(self, csv_paths: List[str], **kwargs) -> List[Document]:
|
|
158
|
-
"""
|
|
159
|
-
Extract data from multiple CSV files.
|
|
160
|
-
|
|
161
|
-
Args:
|
|
162
|
-
csv_paths (List[str]): List of paths to CSV files
|
|
163
|
-
**kwargs: Additional arguments for extract_csv
|
|
164
|
-
|
|
165
|
-
Returns:
|
|
166
|
-
List[Document]: List of Document objects containing extracted content
|
|
167
|
-
"""
|
|
168
|
-
all_docs = []
|
|
169
|
-
for csv_path in csv_paths:
|
|
170
|
-
try:
|
|
171
|
-
docs = self.extract_csv(csv_path, **kwargs)
|
|
172
|
-
all_docs.extend(docs)
|
|
173
|
-
if self.logger:
|
|
174
|
-
self.logger.info(f"Successfully extracted content from {csv_path}")
|
|
175
|
-
else:
|
|
176
|
-
print(f"Successfully extracted content from {csv_path}")
|
|
177
|
-
except Exception as e:
|
|
178
|
-
if self.logger:
|
|
179
|
-
self.logger.error(f"Error extracting from {csv_path}: {str(e)}")
|
|
180
|
-
else:
|
|
181
|
-
print(f"Error extracting from {csv_path}: {str(e)}")
|
|
182
|
-
|
|
183
|
-
return all_docs
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
class PowerPointExtractor:
|
|
187
|
-
"""
|
|
188
|
-
Class for extracting content from PowerPoint (PPT/PPTX) files.
|
|
189
|
-
|
|
190
|
-
This class provides methods for extracting text, notes, and metadata
|
|
191
|
-
from PowerPoint presentations.
|
|
192
|
-
|
|
193
|
-
Args:
|
|
194
|
-
logger: Optional logger instance for logging operations
|
|
195
|
-
|
|
196
|
-
Example:
|
|
197
|
-
```python
|
|
198
|
-
extractor = PowerPointExtractor()
|
|
199
|
-
docs = extractor.extract_ppt("presentation.pptx")
|
|
200
|
-
```
|
|
201
|
-
"""
|
|
202
|
-
|
|
203
|
-
def __init__(self, logger=None):
|
|
204
|
-
"""Initialize the PowerPoint extractor."""
|
|
205
|
-
self.logger = logger
|
|
206
|
-
|
|
207
|
-
@staticmethod
|
|
208
|
-
def check_package(package_name: str) -> bool:
|
|
209
|
-
"""
|
|
210
|
-
Check if a Python package is installed.
|
|
211
|
-
|
|
212
|
-
Args:
|
|
213
|
-
package_name (str): Name of the package to check
|
|
214
|
-
|
|
215
|
-
Returns:
|
|
216
|
-
bool: True if package is installed, False otherwise
|
|
217
|
-
"""
|
|
218
|
-
return importlib.util.find_spec(package_name) is not None
|
|
219
|
-
|
|
220
|
-
def check_file(self, file_path: str) -> bool:
|
|
221
|
-
"""
|
|
222
|
-
Check if file exists.
|
|
223
|
-
|
|
224
|
-
Args:
|
|
225
|
-
file_path (str): Path to the file
|
|
226
|
-
|
|
227
|
-
Returns:
|
|
228
|
-
bool: True if file exists, False otherwise
|
|
229
|
-
"""
|
|
230
|
-
return os.path.exists(file_path)
|
|
231
|
-
|
|
232
|
-
def extract_ppt(self, ppt_path: str, include_notes: bool = True,
|
|
233
|
-
include_hidden_slides: bool = False,
|
|
234
|
-
extract_images: bool = False) -> List[Document]:
|
|
235
|
-
"""
|
|
236
|
-
Extract content from a PowerPoint file.
|
|
237
|
-
|
|
238
|
-
Args:
|
|
239
|
-
ppt_path (str): Path to the PowerPoint file
|
|
240
|
-
include_notes (bool): Whether to include speaker notes
|
|
241
|
-
include_hidden_slides (bool): Whether to include hidden slides
|
|
242
|
-
extract_images (bool): Whether to extract images
|
|
243
|
-
|
|
244
|
-
Returns:
|
|
245
|
-
List[Document]: List of Document objects containing extracted content
|
|
246
|
-
|
|
247
|
-
Raises:
|
|
248
|
-
ValueError: If the file doesn't exist
|
|
249
|
-
ImportError: If python-pptx is not installed
|
|
250
|
-
"""
|
|
251
|
-
if not self.check_file(ppt_path):
|
|
252
|
-
raise ValueError(f"File {ppt_path} not found")
|
|
253
|
-
|
|
254
|
-
if not self.check_package("pptx"):
|
|
255
|
-
raise ImportError("python-pptx package not found. Please install: pip install python-pptx")
|
|
256
|
-
|
|
257
|
-
from pptx import Presentation
|
|
258
|
-
|
|
259
|
-
try:
|
|
260
|
-
# Load presentation
|
|
261
|
-
presentation = Presentation(ppt_path)
|
|
262
|
-
|
|
263
|
-
documents = []
|
|
264
|
-
|
|
265
|
-
# Process each slide
|
|
266
|
-
for i, slide in enumerate(presentation.slides):
|
|
267
|
-
# Skip hidden slides if not requested
|
|
268
|
-
if hasattr(slide, 'show') and not slide.show and not include_hidden_slides:
|
|
269
|
-
continue
|
|
270
|
-
|
|
271
|
-
# Extract text from shapes
|
|
272
|
-
texts = []
|
|
273
|
-
for shape in slide.shapes:
|
|
274
|
-
if hasattr(shape, "text") and shape.text:
|
|
275
|
-
texts.append(shape.text)
|
|
276
|
-
|
|
277
|
-
# Extract notes if requested
|
|
278
|
-
notes = ""
|
|
279
|
-
if include_notes and hasattr(slide, "notes_slide") and slide.notes_slide:
|
|
280
|
-
for note_shape in slide.notes_slide.notes_text_frame.paragraphs:
|
|
281
|
-
if note_shape.text:
|
|
282
|
-
notes += note_shape.text + "\n"
|
|
283
|
-
|
|
284
|
-
# Create metadata
|
|
285
|
-
metadata = {
|
|
286
|
-
"source": ppt_path,
|
|
287
|
-
"slide_number": i + 1,
|
|
288
|
-
"total_slides": len(presentation.slides),
|
|
289
|
-
"file_type": "pptx" if ppt_path.endswith(".pptx") else "ppt"
|
|
290
|
-
}
|
|
291
|
-
|
|
292
|
-
# Add slide title if available
|
|
293
|
-
if slide.shapes.title and slide.shapes.title.text:
|
|
294
|
-
metadata["title"] = slide.shapes.title.text
|
|
295
|
-
|
|
296
|
-
# Combine text content
|
|
297
|
-
content = f"Slide {i+1}"
|
|
298
|
-
if "title" in metadata:
|
|
299
|
-
content += f": {metadata['title']}"
|
|
300
|
-
content += "\n\n"
|
|
301
|
-
|
|
302
|
-
if texts:
|
|
303
|
-
content += "\n".join(texts) + "\n"
|
|
304
|
-
|
|
305
|
-
if notes:
|
|
306
|
-
content += "\nNotes:\n" + notes
|
|
307
|
-
|
|
308
|
-
# Create document
|
|
309
|
-
documents.append(Document(
|
|
310
|
-
page_content=content,
|
|
311
|
-
metadata=metadata
|
|
312
|
-
))
|
|
313
|
-
|
|
314
|
-
if self.logger:
|
|
315
|
-
self.logger.info(f"Extracted {len(documents)} slides from {ppt_path}")
|
|
316
|
-
else:
|
|
317
|
-
print(f"Extracted {len(documents)} slides from {ppt_path}")
|
|
318
|
-
|
|
319
|
-
return documents
|
|
320
|
-
|
|
321
|
-
except Exception as e:
|
|
322
|
-
if self.logger:
|
|
323
|
-
self.logger.error(f"Error extracting from {ppt_path}: {str(e)}")
|
|
324
|
-
else:
|
|
325
|
-
print(f"Error extracting from {ppt_path}: {str(e)}")
|
|
326
|
-
raise
|
|
327
|
-
|
|
328
|
-
def extract_multiple_ppts(self, ppt_paths: List[str], **kwargs) -> List[Document]:
|
|
329
|
-
"""
|
|
330
|
-
Extract content from multiple PowerPoint files.
|
|
331
|
-
|
|
332
|
-
Args:
|
|
333
|
-
ppt_paths (List[str]): List of paths to PowerPoint files
|
|
334
|
-
**kwargs: Additional arguments for extract_ppt
|
|
335
|
-
|
|
336
|
-
Returns:
|
|
337
|
-
List[Document]: List of Document objects containing extracted content
|
|
338
|
-
"""
|
|
339
|
-
all_docs = []
|
|
340
|
-
for ppt_path in ppt_paths:
|
|
341
|
-
try:
|
|
342
|
-
docs = self.extract_ppt(ppt_path, **kwargs)
|
|
343
|
-
all_docs.extend(docs)
|
|
344
|
-
if self.logger:
|
|
345
|
-
self.logger.info(f"Successfully extracted content from {ppt_path}")
|
|
346
|
-
else:
|
|
347
|
-
print(f"Successfully extracted content from {ppt_path}")
|
|
348
|
-
except Exception as e:
|
|
349
|
-
if self.logger:
|
|
350
|
-
self.logger.error(f"Error extracting from {ppt_path}: {str(e)}")
|
|
351
|
-
else:
|
|
352
|
-
print(f"Error extracting from {ppt_path}: {str(e)}")
|
|
353
|
-
|
|
354
|
-
return all_docs
|
|
1
|
+
"""
|
|
2
|
+
Document Extraction Module
|
|
3
|
+
|
|
4
|
+
This module provides functionality for extracting text and metadata from various document types
|
|
5
|
+
including CSV, PowerPoint (PPT/PPTX), and other document formats. It complements the PDF extraction
|
|
6
|
+
functionality in pdf_extract.py.
|
|
7
|
+
|
|
8
|
+
Example Usage:
|
|
9
|
+
```python
|
|
10
|
+
# Initialize CSV extractor
|
|
11
|
+
csv_extractor = CSVExtractor()
|
|
12
|
+
|
|
13
|
+
# Extract data from a CSV file
|
|
14
|
+
docs = csv_extractor.extract_csv("data.csv")
|
|
15
|
+
|
|
16
|
+
# Initialize PowerPoint extractor
|
|
17
|
+
ppt_extractor = PowerPointExtractor()
|
|
18
|
+
|
|
19
|
+
# Extract content from a PowerPoint file
|
|
20
|
+
docs = ppt_extractor.extract_ppt("presentation.pptx")
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Features:
|
|
24
|
+
- CSV file extraction with metadata
|
|
25
|
+
- PowerPoint (PPT/PPTX) extraction
|
|
26
|
+
- Batch processing for multiple files
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
import os
|
|
30
|
+
import pandas as pd
|
|
31
|
+
import importlib.util
|
|
32
|
+
from typing import List, Dict, Optional, Union, Any
|
|
33
|
+
from langchain_core.documents import Document
|
|
34
|
+
|
|
35
|
+
class CSVExtractor:
    """
    Convert CSV files into LangChain ``Document`` objects.

    Each file is loaded with ``pandas.read_csv`` and rendered as text with
    ``DataFrame.to_string(index=False)``, either as one document for the
    whole file or as one document per fixed-size group of rows. Status and
    error messages go to ``logger`` when one is supplied and are printed to
    stdout otherwise.

    Args:
        logger: Optional logger instance for logging operations. Only its
            ``info`` and ``error`` methods are called.

    Example:
        ```python
        extractor = CSVExtractor()
        docs = extractor.extract_csv("data.csv")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the CSV extractor."""
        self.logger = logger

    def _log(self, level: str, message: str) -> None:
        """Send ``message`` to ``self.logger.<level>``, or print it when no logger is set."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    def check_file(self, file_path: str) -> bool:
        """
        Check if a path exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    @staticmethod
    def _column_statistics(df) -> Dict[str, Dict[str, float]]:
        """Return min/max/mean/median (as floats) for every numeric column of ``df``."""
        stats = {}
        for column in df.columns:
            if pd.api.types.is_numeric_dtype(df[column]):
                series = df[column]
                stats[column] = {
                    "min": float(series.min()),
                    "max": float(series.max()),
                    "mean": float(series.mean()),
                    "median": float(series.median()),
                }
        return stats

    def extract_csv(self, csv_path: str, include_stats: bool = True,
                    chunk_by_row: bool = False, rows_per_chunk: int = 10,
                    **kwargs) -> List[Document]:
        """
        Extract data from a CSV file.

        Args:
            csv_path (str): Path to the CSV file
            include_stats (bool): Whether to add per-column min/max/mean/median
                of numeric columns to the metadata under ``"statistics"``
            chunk_by_row (bool): Whether to create one document per group of
                rows instead of a single document for the whole file
            rows_per_chunk (int): Number of rows per chunk if chunk_by_row is
                True; must be at least 1 in that case
            **kwargs: Additional arguments for pandas.read_csv

        Returns:
            List[Document]: List of Document objects containing extracted
            content. With ``chunk_by_row=True`` a file without data rows
            yields an empty list.

        Raises:
            ValueError: If the file doesn't exist, or if ``chunk_by_row`` is
                True and ``rows_per_chunk`` is smaller than 1
            Exception: Any error raised while reading or converting the file
                is logged and then re-raised unchanged
        """
        if not self.check_file(csv_path):
            raise ValueError(f"File {csv_path} not found")

        # A step of 0 makes range() fail with an unrelated message and a
        # negative step silently produces no chunks at all, so reject both
        # up front with an explicit error.
        if chunk_by_row and rows_per_chunk < 1:
            raise ValueError(
                f"rows_per_chunk must be a positive integer, got {rows_per_chunk!r}"
            )

        try:
            # Read CSV file
            df = pd.read_csv(csv_path, **kwargs)
            total_rows = len(df)
            columns = list(df.columns)

            # Metadata shared by every document produced from this file
            metadata = {
                "source": csv_path,
                "rows": total_rows,
                "columns": columns,
                "file_type": "csv"
            }

            # Add basic statistics if requested
            if include_stats:
                metadata["statistics"] = self._column_statistics(df)

            documents = []

            if chunk_by_row:
                # Create a separate document for each chunk of rows
                for start in range(0, total_rows, rows_per_chunk):
                    chunk = df.iloc[start:start + rows_per_chunk]

                    # Give each chunk its own copies of the nested containers
                    # so that editing one document's metadata cannot leak
                    # into its siblings.
                    chunk_metadata = dict(metadata)
                    chunk_metadata["columns"] = list(columns)
                    if "statistics" in metadata:
                        chunk_metadata["statistics"] = {
                            name: dict(values)
                            for name, values in metadata["statistics"].items()
                        }
                    chunk_metadata["chunk"] = {
                        "start_row": start,
                        "end_row": min(start + rows_per_chunk - 1, total_rows - 1),
                        "total_rows": len(chunk)
                    }

                    documents.append(Document(
                        page_content=chunk.to_string(index=False),
                        metadata=chunk_metadata
                    ))
            else:
                # Create a single document with all data
                documents.append(Document(
                    page_content=df.to_string(index=False),
                    metadata=metadata
                ))

            self._log("info", f"Extracted data from {csv_path}")
            return documents

        except Exception as e:
            # Log at this level, then let the caller decide how to recover.
            self._log("error", f"Error extracting from {csv_path}: {str(e)}")
            raise

    def extract_multiple_csvs(self, csv_paths: List[str], **kwargs) -> List[Document]:
        """
        Extract data from multiple CSV files.

        Files are processed in order. A file that fails is logged and
        skipped, so one bad file does not abort the batch.

        Args:
            csv_paths (List[str]): List of paths to CSV files
            **kwargs: Additional arguments for extract_csv

        Returns:
            List[Document]: Documents from every file that was extracted
            successfully, in input order
        """
        all_docs = []
        for csv_path in csv_paths:
            try:
                all_docs.extend(self.extract_csv(csv_path, **kwargs))
                self._log("info", f"Successfully extracted content from {csv_path}")
            except Exception as e:
                # Best-effort batch: report the failure and move on.
                self._log("error", f"Error extracting from {csv_path}: {str(e)}")

        return all_docs
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class PowerPointExtractor:
    """
    Convert PowerPoint presentations into LangChain ``Document`` objects.

    One document is produced per slide, containing the slide heading, the
    text of its shapes and, optionally, its speaker notes. Files are opened
    with ``pptx.Presentation`` from the python-pptx package, which is
    imported lazily inside ``extract_ppt``.

    NOTE(review): python-pptx is documented as a library for ``.pptx``
    files; confirm whether legacy binary ``.ppt`` input is expected to work.

    Args:
        logger: Optional logger instance for logging operations. Only its
            ``info`` and ``error`` methods are called.

    Example:
        ```python
        extractor = PowerPointExtractor()
        docs = extractor.extract_ppt("presentation.pptx")
        ```
    """

    def __init__(self, logger=None):
        """Initialize the PowerPoint extractor."""
        self.logger = logger

    def _log(self, level: str, message: str) -> None:
        """Send ``message`` to ``self.logger.<level>``, or print it when no logger is set."""
        if self.logger:
            getattr(self.logger, level)(message)
        else:
            print(message)

    @staticmethod
    def check_package(package_name: str) -> bool:
        """
        Check if a Python package is installed.

        Args:
            package_name (str): Name of the package to check

        Returns:
            bool: True if package is installed, False otherwise
        """
        try:
            return importlib.util.find_spec(package_name) is not None
        except (ImportError, ValueError):
            # find_spec imports the parent of a dotted name and raises when
            # that parent is missing; for this check that means "not installed".
            return False

    def check_file(self, file_path: str) -> bool:
        """
        Check if a path exists.

        Args:
            file_path (str): Path to the file

        Returns:
            bool: True if the path exists, False otherwise
        """
        return os.path.exists(file_path)

    @staticmethod
    def _is_hidden(slide) -> bool:
        """Return True when ``slide`` is marked as hidden."""
        if hasattr(slide, "show"):
            return not slide.show
        # NOTE(review): fallback for slide objects without a ``show``
        # attribute. It reads the ``show`` attribute of the underlying
        # <p:sld> XML element ("0"/"false" meaning hidden); confirm against
        # the installed python-pptx version.
        element = getattr(slide, "element", None)
        if element is None:
            return False
        return element.get("show") in ("0", "false")

    @staticmethod
    def _notes_text(slide) -> str:
        """Return the slide's speaker notes, one non-empty paragraph per line."""
        # Check has_notes_slide first: the python-pptx documentation states
        # that reading notes_slide creates a notes slide when none exists.
        if not slide.has_notes_slide:
            return ""
        text_frame = slide.notes_slide.notes_text_frame
        if text_frame is None:
            # No notes placeholder on this notes slide, so nothing to read.
            return ""
        return "".join(
            f"{paragraph.text}\n"
            for paragraph in text_frame.paragraphs
            if paragraph.text
        )

    def extract_ppt(self, ppt_path: str, include_notes: bool = True,
                    include_hidden_slides: bool = False,
                    extract_images: bool = False) -> List[Document]:
        """
        Extract content from a PowerPoint file.

        Args:
            ppt_path (str): Path to the PowerPoint file
            include_notes (bool): Whether to include speaker notes
            include_hidden_slides (bool): Whether to include hidden slides
            extract_images (bool): Accepted for interface compatibility;
                this implementation does not use it

        Returns:
            List[Document]: One Document per extracted slide. ``slide_number``
            in the metadata is the 1-based position in the presentation, so
            numbering has gaps when hidden slides are skipped.

        Raises:
            ValueError: If the file doesn't exist
            ImportError: If python-pptx is not installed
            Exception: Any error raised while opening or reading the
                presentation is logged and then re-raised unchanged
        """
        if not self.check_file(ppt_path):
            raise ValueError(f"File {ppt_path} not found")

        if not self.check_package("pptx"):
            raise ImportError("python-pptx package not found. Please install: pip install python-pptx")

        from pptx import Presentation

        try:
            # Load presentation
            presentation = Presentation(ppt_path)
            slides = presentation.slides
            total_slides = len(slides)
            # Compare the suffix case-insensitively so "DECK.PPTX" is still
            # reported as pptx.
            file_type = "pptx" if str(ppt_path).lower().endswith(".pptx") else "ppt"

            documents = []

            # Process each slide
            for slide_number, slide in enumerate(slides, start=1):
                # Skip hidden slides if not requested
                if not include_hidden_slides and self._is_hidden(slide):
                    continue

                # Extract text from shapes that expose non-empty text
                texts = [
                    shape.text
                    for shape in slide.shapes
                    if hasattr(shape, "text") and shape.text
                ]

                # Extract notes if requested
                notes = self._notes_text(slide) if include_notes else ""

                # Create metadata
                metadata = {
                    "source": ppt_path,
                    "slide_number": slide_number,
                    "total_slides": total_slides,
                    "file_type": file_type
                }

                # Add slide title if available
                title_shape = slide.shapes.title
                if title_shape is not None and title_shape.text:
                    metadata["title"] = title_shape.text

                # Combine text content: heading, shape text, then notes
                content = f"Slide {slide_number}"
                if "title" in metadata:
                    content += f": {metadata['title']}"
                content += "\n\n"

                if texts:
                    content += "\n".join(texts) + "\n"

                if notes:
                    content += "\nNotes:\n" + notes

                # Create document
                documents.append(Document(
                    page_content=content,
                    metadata=metadata
                ))

            self._log("info", f"Extracted {len(documents)} slides from {ppt_path}")
            return documents

        except Exception as e:
            # Log at this level, then let the caller decide how to recover.
            self._log("error", f"Error extracting from {ppt_path}: {str(e)}")
            raise

    def extract_multiple_ppts(self, ppt_paths: List[str], **kwargs) -> List[Document]:
        """
        Extract content from multiple PowerPoint files.

        Files are processed in order. A file that fails is logged and
        skipped, so one bad file does not abort the batch.

        Args:
            ppt_paths (List[str]): List of paths to PowerPoint files
            **kwargs: Additional arguments for extract_ppt

        Returns:
            List[Document]: Documents from every file that was extracted
            successfully, in input order
        """
        all_docs = []
        for ppt_path in ppt_paths:
            try:
                all_docs.extend(self.extract_ppt(ppt_path, **kwargs))
                self._log("info", f"Successfully extracted content from {ppt_path}")
            except Exception as e:
                # Best-effort batch: report the failure and move on.
                self._log("error", f"Error extracting from {ppt_path}: {str(e)}")

        return all_docs
|