mb-rag 1.1.47__py3-none-any.whl → 1.1.56.post0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mb-rag might be problematic. Click here for more details.
- mb_rag/basic.py +306 -0
- mb_rag/chatbot/chains.py +206 -206
- mb_rag/chatbot/conversation.py +185 -0
- mb_rag/chatbot/prompts.py +58 -58
- mb_rag/rag/embeddings.py +810 -810
- mb_rag/utils/all_data_extract.py +64 -64
- mb_rag/utils/bounding_box.py +231 -231
- mb_rag/utils/document_extract.py +354 -354
- mb_rag/utils/extra.py +73 -73
- mb_rag/utils/pdf_extract.py +428 -428
- mb_rag/version.py +1 -1
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.56.post0.dist-info}/METADATA +11 -11
- mb_rag-1.1.56.post0.dist-info/RECORD +19 -0
- mb_rag/chatbot/basic.py +0 -644
- mb_rag-1.1.47.dist-info/RECORD +0 -18
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.56.post0.dist-info}/WHEEL +0 -0
- {mb_rag-1.1.47.dist-info → mb_rag-1.1.56.post0.dist-info}/top_level.txt +0 -0
mb_rag/utils/extra.py
CHANGED
|
@@ -1,74 +1,74 @@
|
|
|
1
|
-
## Extra helper functions for RAG
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
from dotenv import load_dotenv
|
|
5
|
-
import importlib.util
|
|
6
|
-
|
|
7
|
-
__all__ = ["load_env_file", "check_package", "pdf_to_text", "convert_pdfs_in_folder"]
|
|
8
|
-
|
|
9
|
-
def load_env_file(file_path='.env'):
    """
    Load environment variables from a .env file.

    Args:
        file_path (str): Path to the .env file. Defaults to '.env'.

    Returns:
        os._Environ: The live ``os.environ`` mapping (not a copy) as it
            stands after ``load_dotenv(file_path)`` has run. It contains
            every variable of the current process, not only the entries
            read from the file.
    """
    load_dotenv(file_path)

    # Hand back the process environment itself so callers can read the
    # freshly loaded values straight from the return value.
    return os.environ
|
|
25
|
-
|
|
26
|
-
def check_package(package_name):
    """
    Check if a package is installed

    Args:
        package_name (str): Importable module name, e.g. "PyPDF2" or a
            dotted name such as "pkg.sub".
    Returns:
        bool: True if an import spec is found for the name, False otherwise
            (including when the lookup itself fails).
    """
    try:
        return importlib.util.find_spec(package_name) is not None
    except (ImportError, ValueError):
        # find_spec imports the parent of a dotted name and raises
        # ModuleNotFoundError when that parent is missing; it raises
        # ValueError for an already-loaded module without a usable __spec__.
        # Either way the name cannot be reported as available.
        return False
|
|
35
|
-
|
|
36
|
-
def pdf_to_text(pdf_path):
    """
    Extract text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file to read.
    Returns:
        str: The text of each page followed by a newline, concatenated in
            page order. If opening or parsing fails, the error is printed
            and whatever text was collected so far (possibly "") is returned.
    Raises:
        ImportError: If the PyPDF2 package cannot be imported.
    """
    # Import before the try block below: its handlers reference PyPDF2, so
    # the name must be bound before any exception can reach them.
    try:
        import PyPDF2
    except ImportError as err:
        raise ImportError("PyPDF2 package not found. Please install it using: pip install pypdf2") from err

    pages = []
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # "or ''" guards against a page that yields no text object.
                pages.append((page.extract_text() or "") + "\n")
    except PyPDF2.errors.PdfReadError as e:
        print(f"Error reading {pdf_path}: {e}")
    except Exception as e:
        # Best effort: report the problem and fall through to return what
        # was extracted before the failure.
        print(f"An unexpected error occurred with {pdf_path}: {e}")
    return "".join(pages)
|
|
52
|
-
|
|
53
|
-
def convert_pdfs_in_folder(folder_path):
|
|
54
|
-
"""
|
|
55
|
-
Convert all PDF files in the given folder to text files.
|
|
56
|
-
Args:
|
|
57
|
-
folder_path (str): Path to the folder containing the PDF files.
|
|
58
|
-
Returns:
|
|
59
|
-
None
|
|
60
|
-
Example : convert_pdfs_in_folder('/folder_path') # folder_path is the path to the folder containing the PDF files.
|
|
61
|
-
The converted PDF files and text files will be created in the same folder
|
|
62
|
-
"""
|
|
63
|
-
for filename in os.listdir(folder_path):
|
|
64
|
-
if filename.endswith('.pdf'):
|
|
65
|
-
pdf_path = os.path.join(folder_path, filename)
|
|
66
|
-
text = pdf_to_text(pdf_path)
|
|
67
|
-
if text: # Only write to file if text is not empty
|
|
68
|
-
text_filename = os.path.splitext(filename)[0] + '.txt'
|
|
69
|
-
text_path = os.path.join(folder_path, text_filename)
|
|
70
|
-
with open(text_path, 'w', encoding='utf-8') as text_file:
|
|
71
|
-
text_file.write(text)
|
|
72
|
-
print(f"Converted: {filename} to {text_filename}")
|
|
73
|
-
else:
|
|
1
|
+
## Extra helper functions for RAG
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dotenv import load_dotenv
|
|
5
|
+
import importlib.util
|
|
6
|
+
|
|
7
|
+
__all__ = ["load_env_file", "check_package", "pdf_to_text", "convert_pdfs_in_folder"]
|
|
8
|
+
|
|
9
|
+
def load_env_file(file_path='.env'):
    """
    Load environment variables from a .env file.

    Args:
        file_path (str): Path to the .env file. Defaults to '.env'.

    Returns:
        os._Environ: The live ``os.environ`` mapping (not a copy) as it
            stands after ``load_dotenv(file_path)`` has run. It contains
            every variable of the current process, not only the entries
            read from the file.
    """
    load_dotenv(file_path)

    # Hand back the process environment itself so callers can read the
    # freshly loaded values straight from the return value.
    return os.environ
|
|
25
|
+
|
|
26
|
+
def check_package(package_name):
    """
    Check if a package is installed

    Args:
        package_name (str): Importable module name, e.g. "PyPDF2" or a
            dotted name such as "pkg.sub".
    Returns:
        bool: True if an import spec is found for the name, False otherwise
            (including when the lookup itself fails).
    """
    try:
        return importlib.util.find_spec(package_name) is not None
    except (ImportError, ValueError):
        # find_spec imports the parent of a dotted name and raises
        # ModuleNotFoundError when that parent is missing; it raises
        # ValueError for an already-loaded module without a usable __spec__.
        # Either way the name cannot be reported as available.
        return False
|
|
35
|
+
|
|
36
|
+
def pdf_to_text(pdf_path):
    """
    Extract text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file to read.
    Returns:
        str: The text of each page followed by a newline, concatenated in
            page order. If opening or parsing fails, the error is printed
            and whatever text was collected so far (possibly "") is returned.
    Raises:
        ImportError: If the PyPDF2 package cannot be imported.
    """
    # Import before the try block below: its handlers reference PyPDF2, so
    # the name must be bound before any exception can reach them.
    try:
        import PyPDF2
    except ImportError as err:
        raise ImportError("PyPDF2 package not found. Please install it using: pip install pypdf2") from err

    pages = []
    try:
        with open(pdf_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                # "or ''" guards against a page that yields no text object.
                pages.append((page.extract_text() or "") + "\n")
    except PyPDF2.errors.PdfReadError as e:
        print(f"Error reading {pdf_path}: {e}")
    except Exception as e:
        # Best effort: report the problem and fall through to return what
        # was extracted before the failure.
        print(f"An unexpected error occurred with {pdf_path}: {e}")
    return "".join(pages)
|
|
52
|
+
|
|
53
|
+
def convert_pdfs_in_folder(folder_path):
    """
    Convert all PDF files in the given folder to text files.

    Args:
        folder_path (str): Path to the folder containing the PDF files.
    Returns:
        None
    Example : convert_pdfs_in_folder('/folder_path') # folder_path is the path to the folder containing the PDF files.
    Each text file is written into the same folder as its PDF, using the
    PDF's base name with a '.txt' extension. PDFs that yield no text are
    reported and no text file is written for them.
    """
    for filename in os.listdir(folder_path):
        # Match the extension case-insensitively so files such as
        # "REPORT.PDF" are converted rather than silently skipped.
        if not filename.lower().endswith('.pdf'):
            continue

        pdf_path = os.path.join(folder_path, filename)
        text = pdf_to_text(pdf_path)
        if not text:
            # Nothing extracted (or extraction failed): do not create a file.
            print(f"No text extracted from {filename}")
            continue

        text_filename = os.path.splitext(filename)[0] + '.txt'
        text_path = os.path.join(folder_path, text_filename)
        with open(text_path, 'w', encoding='utf-8') as text_file:
            text_file.write(text)
        print(f"Converted: {filename} to {text_filename}")
|