mb-rag 1.1.47__py3-none-any.whl → 1.1.56.post0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mb-rag might be problematic. Click here for more details.

mb_rag/utils/extra.py CHANGED
@@ -1,74 +1,74 @@
1
- ## extra functions for RAG's
2
-
3
- import os
4
- from dotenv import load_dotenv
5
- import importlib.util
6
-
7
- __all__ = ["load_env_file", "check_package", "pdf_to_text", "convert_pdfs_in_folder"]
8
-
9
- def load_env_file(file_path='.env'):
10
- """
11
- Load environment variables from a .env file.
12
-
13
- Args:
14
- file_path (str): Path to the .env file. Defaults to '.env'.
15
-
16
- Returns:
17
- None
18
- """
19
- load_dotenv(file_path)
20
-
21
- # Get the loaded environment variables
22
- env_vars = os.environ
23
-
24
- return env_vars
25
-
26
- def check_package(package_name):
27
- """
28
- Check if a package is installed
29
- Args:
30
- package_name (str): Name of the package
31
- Returns:
32
- bool: True if package is installed, False otherwise
33
- """
34
- return importlib.util.find_spec(package_name) is not None
35
-
36
- def pdf_to_text(pdf_path):
37
- """Extract text from a PDF file."""
38
- text = ""
39
- try:
40
- if not check_package("PyPDF2"):
41
- raise ImportError("PyPDF2 package not found. Please install it using: pip install pypdf2")
42
- import PyPDF2
43
- with open(pdf_path, "rb") as file:
44
- reader = PyPDF2.PdfReader(file)
45
- for page in reader.pages:
46
- text += page.extract_text() + "\n"
47
- except PyPDF2.errors.PdfReadError as e:
48
- print(f"Error reading {pdf_path}: {e}")
49
- except Exception as e:
50
- print(f"An unexpected error occurred with {pdf_path}: {e}")
51
- return text
52
-
53
- def convert_pdfs_in_folder(folder_path):
54
- """
55
- Convert all PDF files in the given folder to text files.
56
- Args:
57
- folder_path (str): Path to the folder containing the PDF files.
58
- Returns:
59
- None
60
- Example : convert_pdfs_in_folder('/folder_path') # folder_path is the path to the folder containing the PDF files.
61
- The converted PDF files and text files will be created in the same folder
62
- """
63
- for filename in os.listdir(folder_path):
64
- if filename.endswith('.pdf'):
65
- pdf_path = os.path.join(folder_path, filename)
66
- text = pdf_to_text(pdf_path)
67
- if text: # Only write to file if text is not empty
68
- text_filename = os.path.splitext(filename)[0] + '.txt'
69
- text_path = os.path.join(folder_path, text_filename)
70
- with open(text_path, 'w', encoding='utf-8') as text_file:
71
- text_file.write(text)
72
- print(f"Converted: {filename} to {text_filename}")
73
- else:
1
+ ## extra functions for RAG's
2
+
3
+ import os
4
+ from dotenv import load_dotenv
5
+ import importlib.util
6
+
7
+ __all__ = ["load_env_file", "check_package", "pdf_to_text", "convert_pdfs_in_folder"]
8
+
9
+ def load_env_file(file_path='.env'):
10
+ """
11
+ Load environment variables from a .env file.
12
+
13
+ Args:
14
+ file_path (str): Path to the .env file. Defaults to '.env'.
15
+
16
+ Returns:
17
+ None
18
+ """
19
+ load_dotenv(file_path)
20
+
21
+ # Get the loaded environment variables
22
+ env_vars = os.environ
23
+
24
+ return env_vars
25
+
26
+ def check_package(package_name):
27
+ """
28
+ Check if a package is installed
29
+ Args:
30
+ package_name (str): Name of the package
31
+ Returns:
32
+ bool: True if package is installed, False otherwise
33
+ """
34
+ return importlib.util.find_spec(package_name) is not None
35
+
36
+ def pdf_to_text(pdf_path):
37
+ """Extract text from a PDF file."""
38
+ text = ""
39
+ try:
40
+ if not check_package("PyPDF2"):
41
+ raise ImportError("PyPDF2 package not found. Please install it using: pip install pypdf2")
42
+ import PyPDF2
43
+ with open(pdf_path, "rb") as file:
44
+ reader = PyPDF2.PdfReader(file)
45
+ for page in reader.pages:
46
+ text += page.extract_text() + "\n"
47
+ except PyPDF2.errors.PdfReadError as e:
48
+ print(f"Error reading {pdf_path}: {e}")
49
+ except Exception as e:
50
+ print(f"An unexpected error occurred with {pdf_path}: {e}")
51
+ return text
52
+
53
+ def convert_pdfs_in_folder(folder_path):
54
+ """
55
+ Convert all PDF files in the given folder to text files.
56
+ Args:
57
+ folder_path (str): Path to the folder containing the PDF files.
58
+ Returns:
59
+ None
60
+ Example : convert_pdfs_in_folder('/folder_path') # folder_path is the path to the folder containing the PDF files.
61
+ The converted PDF files and text files will be created in the same folder
62
+ """
63
+ for filename in os.listdir(folder_path):
64
+ if filename.endswith('.pdf'):
65
+ pdf_path = os.path.join(folder_path, filename)
66
+ text = pdf_to_text(pdf_path)
67
+ if text: # Only write to file if text is not empty
68
+ text_filename = os.path.splitext(filename)[0] + '.txt'
69
+ text_path = os.path.join(folder_path, text_filename)
70
+ with open(text_path, 'w', encoding='utf-8') as text_file:
71
+ text_file.write(text)
72
+ print(f"Converted: {filename} to {text_filename}")
73
+ else:
74
74
  print(f"No text extracted from {filename}")