vectoriz-0.0.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
+++ vectoriz-0.0.3/PKG-INFO
@@ -0,0 +1,60 @@
+ Metadata-Version: 2.4
+ Name: vectoriz
+ Version: 0.0.3
+ Summary: Python library for creating vectorized data from text or files.
+ Home-page: https://github.com/PedroHenriqueDevBR/vectoriz
+ Author: PedroHenriqueDevBR
+ Author-email: pedro.henrique.particular@gmail.com
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+ Requires-Dist: faiss-cpu==1.10.0
+ Requires-Dist: numpy==2.2.4
+ Requires-Dist: sentence-transformers==4.0.2
+ Requires-Dist: python-docx==1.1.2
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # RAG-vector-creator
+
+ ## Overview
+ This project implements a RAG (Retrieval-Augmented Generation) toolkit for creating and managing vector embeddings from documents using the FAISS and NumPy libraries. It transforms text data into high-dimensional vector representations that enable semantic search, similarity matching, and context-aware document retrieval for question-answering applications.
+
+ ## Features
+
+ - Document ingestion and preprocessing
+ - Vector embedding generation using state-of-the-art models
+ - Efficient storage and retrieval of embeddings
+ - Integration with LLM-based generation systems (see the usage sketch below)
+
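For orientation, here is a minimal end-to-end sketch assembled from the package's own modules (`FilesFeature`, `TokenTransformer`); the README documents no entry point, so the flow below is an assumption drawn from the source rather than an official example:

```python
# Hedged sketch: assumes a local ./docs folder with .txt/.docx files and
# network access for the first all-MiniLM-L6-v2 model download.
from vectoriz.files import FilesFeature
from vectoriz.token_transformer import TokenTransformer

# Walk ./docs and embed every supported file into a FileArgument.
files = FilesFeature().load_all_files_from_directory("./docs")

# Build a FAISS IndexFlatL2 over the texts and retrieve context for a query.
transformer = TokenTransformer()
data = transformer.create_index(files.text_list)
print(transformer.search("What does this project do?", data, context_amount=2))
```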
+ ## Installation
+
+ ```bash
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ ## Build lib
+
+ To build the library, run:
+
+ ```bash
+ python setup.py sdist bdist_wheel
+ ```
+
+ To test the install locally, run:
+ ```bash
+ pip install .
+ ```
+
+ ## License
+
+ MIT
+++ vectoriz-0.0.3/README.md
@@ -0,0 +1,35 @@
+ # RAG-vector-creator
+
+ ## Overview
+ This project implements a RAG (Retrieval-Augmented Generation) toolkit for creating and managing vector embeddings from documents using the FAISS and NumPy libraries. It transforms text data into high-dimensional vector representations that enable semantic search, similarity matching, and context-aware document retrieval for question-answering applications.
+
+ ## Features
+
+ - Document ingestion and preprocessing
+ - Vector embedding generation using state-of-the-art models
+ - Efficient storage and retrieval of embeddings
+ - Integration with LLM-based generation systems
+
+ ## Installation
+
+ ```bash
+ pip install -r requirements.txt
+ python app.py
+ ```
+
+ ## Build lib
+
+ To build the library, run:
+
+ ```bash
+ python setup.py sdist bdist_wheel
+ ```
+
+ To test the install locally, run:
+ ```bash
+ pip install .
+ ```
+
+ ## License
+
+ MIT
+++ vectoriz-0.0.3/pyproject.toml
@@ -0,0 +1,3 @@
+ [build-system]
+ requires = ["setuptools", "wheel"]
+ build-backend = "setuptools.build_meta"
+++ vectoriz-0.0.3/setup.cfg
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
+++ vectoriz-0.0.3/setup.py
@@ -0,0 +1,24 @@
+ from setuptools import setup, find_packages
+
+ setup(
+     name="vectoriz",
+     version="0.0.3",
+     author="PedroHenriqueDevBR",
+     author_email="pedro.henrique.particular@gmail.com",
+     description="Python library for creating vectorized data from text or files.",
+     long_description=open("README.md", encoding="utf-8").read(),
+     long_description_content_type="text/markdown",
+     url="https://github.com/PedroHenriqueDevBR/vectoriz",
+     packages=find_packages(),
+     classifiers=[
+         "Programming Language :: Python :: 3.12",
+         "Operating System :: OS Independent",
+     ],
+     python_requires=">=3.12",
+     install_requires=[
+         "faiss-cpu==1.10.0",
+         "numpy==2.2.4",
+         "sentence-transformers==4.0.2",
+         "python-docx==1.1.2",
+     ],
+ )
+++ vectoriz-0.0.3/tests/__init__.py
File without changes
+++ vectoriz-0.0.3/tests/test_files.py
@@ -0,0 +1,40 @@
+ import pytest
+ import numpy as np
+ from unittest.mock import patch
+ from vectoriz.files import FileArgument
+
+
+ def test_add_data_appends_to_lists():
+     file_arg = FileArgument([], [], [])
+     filename = "test.txt"
+     text = "Test content"
+
+     with patch.object(FileArgument, '_create_embedding', return_value=np.array([0.1, 0.2, 0.3])):
+         file_arg.add_data(filename, text)
+
+     assert file_arg.chunk_names == [filename]
+     assert file_arg.text_list == [text]
+     assert len(file_arg.embeddings) == 1
+     np.testing.assert_array_equal(file_arg.embeddings[0], np.array([0.1, 0.2, 0.3]))
+
+ def test_add_data_multiple_entries():
+     file_arg = FileArgument(["existing.txt"], ["existing content"], [np.array([0.5, 0.5, 0.5])])
+     filename = "new.txt"
+     text = "New content"
+
+     with patch.object(FileArgument, '_create_embedding', return_value=np.array([0.7, 0.8, 0.9])):
+         file_arg.add_data(filename, text)
+         assert file_arg.chunk_names == ["existing.txt", "new.txt"]
+         assert file_arg.text_list == ["existing content", "New content"]
+         assert len(file_arg.embeddings) == 2
+         np.testing.assert_array_equal(file_arg.embeddings[1], np.array([0.7, 0.8, 0.9]))
+
+ def test_add_data_calls_create_embedding():
+     file_arg = FileArgument([], [], [])
+     filename = "test.txt"
+     text = "Test content"
+
+     with patch.object(FileArgument, '_create_embedding') as mock_create_embedding:
+         mock_create_embedding.return_value = np.array([0.1, 0.2, 0.3])
+         file_arg.add_data(filename, text)
+         mock_create_embedding.assert_called_once_with(text)
+++ vectoriz-0.0.3/vectoriz/__init__.py
@@ -0,0 +1,3 @@
+ # from files import FileArgument, FilesFeature
+ # from token_transformer import TokenData, TokenTransformer
+ # from vector_db import VectorDBClient, VectorDB
+++ vectoriz-0.0.3/vectoriz/files.py
@@ -0,0 +1,213 @@
+ import os
+ import docx
+ import numpy as np
+ from typing import Optional
+ from vectoriz.token_transformer import TokenTransformer
+
+ class FileArgument:
+     def __init__(
+         self,
+         chunk_names: list[str],
+         text_list: list[str],
+         embeddings: list[np.ndarray],
+         ndarray_data: Optional[np.ndarray] = None,
+     ) -> None:
+         """
+         Initializes the FileArgument instance with file data and embeddings.
+
+         This constructor stores the chunk names, their text content, and the associated
+         embeddings. A TokenTransformer instance is created on demand when text is embedded.
+
+         Parameters
+         ----------
+         chunk_names : list[str]
+             List of chunk names corresponding to processed files
+         text_list : list[str]
+             List of text content extracted from the files
+         embeddings : list[np.ndarray]
+             List of embeddings (vector representations) of the text content
+         ndarray_data : Optional[np.ndarray], default=None
+             NumPy array representation of the embeddings for efficient vector operations
+         """
+         self.chunk_names: list[str] = chunk_names
+         self.text_list: list[str] = text_list
+         self.embeddings: list[np.ndarray] = embeddings
+         self.ndarray_data: Optional[np.ndarray] = ndarray_data
+
+     def add_data(self, filename: str, text: str) -> None:
+         """
+         Adds text data to the instance along with its filename and creates the corresponding embedding.
+         This method appends the provided filename and text to their respective lists in the object,
+         and also creates and stores the embedding vector for the text.
+         Args:
+             filename (str): The name of the file or identifier for the text data
+             text (str): The text content to be added and embedded
+         Returns:
+             None: This method updates the internal state of the object.
+         """
+
+         self.chunk_names.append(filename)
+         self.text_list.append(text)
+         self.embeddings.append(self._create_embedding(text))
+
+     def _create_embedding(self, text: str) -> np.ndarray:
+         """
+         Creates an embedding vector for the given text using the transformer model.
+         This method transforms the input text into a numerical vector representation
+         that captures semantic meaning, which can be used for similarity comparisons
+         or as input to machine learning models.
+         Args:
+             text (str): The text to be embedded.
+         Returns:
+             np.ndarray: A numpy array containing the embedding vector for the input text.
+         """
+         instance = TokenTransformer()
+         return instance.text_to_embeddings([text])[0]
+
+
+ class FilesFeature:
+
+     def _extract_txt_content(self, directory: str, file: str) -> Optional[str]:
+         """
+         Extract the content of a text file.
+
+         This method opens a text file in read mode with UTF-8 encoding and returns
+         its content as a string.
+
+         Parameters:
+         ----------
+         directory : str
+             The directory path where the file is located.
+         file : str
+             The name of the text file to read.
+
+         Returns:
+         -------
+         Optional[str]
+             The content of the file.
+
+         Raises:
+         ------
+         FileNotFoundError
+             If the specified file does not exist.
+         UnicodeDecodeError
+             If the file cannot be decoded using UTF-8 encoding.
+         """
+         with open(os.path.join(directory, file), "r", encoding="utf-8") as fl:
+             text = fl.read()
+         return text
+
+     def _extract_docx_content(self, directory: str, file: str) -> Optional[str]:
+         """
+         Extracts text content from a Microsoft Word document.
+         This method opens a Word document, reads all paragraphs, and joins non-empty
+         paragraphs into a single text string, which is returned to the caller.
+         Args:
+             directory (str): The directory path where the Word file is located
+             file (str): The filename of the Word document to process
+         Returns:
+             Optional[str]: The extracted text content.
+         Note:
+             Empty paragraphs (those that contain only whitespace) are skipped.
+             The python-docx library is required for this method to work.
+         """
+         file_path = os.path.join(directory, file)
+         doc = docx.Document(file_path)
+         full_text = []
+
+         for paragraph in doc.paragraphs:
+             content = paragraph.text.strip()
+             if len(content) == 0:
+                 continue
+             full_text.append(paragraph.text)
+         return "\n".join(full_text)
+
+     def load_txt_files_from_directory(self, directory: str) -> FileArgument:
+         """
+         Load all text files from the specified directory and extract their content.
+         This method scans the specified directory for files with the '.txt' extension
+         and processes each of them using the _extract_txt_content method.
+         Parameters:
+         ----------
+         directory : str
+             Path to the directory containing text files to be loaded.
+         Returns:
+         -------
+         FileArgument
+             The collected filenames, text contents, and embeddings.
+         """
+         argument: FileArgument = FileArgument([], [], [])
+         for file in os.listdir(directory):
+             if not file.endswith(".txt"):
+                 continue
+
+             text = self._extract_txt_content(directory, file)
+             if text is None:
+                 continue
+
+             argument.add_data(file, text)
+         return argument
+
+     def load_docx_files_from_directory(self, directory: str) -> FileArgument:
+         """
+         Load all Word (.docx) files from the specified directory and extract their content.
+
+         This method iterates through all files in the given directory, identifies those
+         with a .docx extension, and processes them using the _extract_docx_content method.
+
+         Args:
+             directory (str): Path to the directory containing Word files to be processed
+
+         Returns:
+             FileArgument: The collected filenames, text contents, and embeddings.
+
+         Examples:
+             >>> feature = FilesFeature()
+             >>> argument = feature.load_docx_files_from_directory("/path/to/documents")
+         """
+         argument: FileArgument = FileArgument([], [], [])
+         for file in os.listdir(directory):
+             if not file.endswith(".docx"):
+                 continue
+
+             text = self._extract_docx_content(directory, file)
+             if text is None:
+                 continue
+
+             argument.add_data(file, text)
+         return argument
+
+     def load_all_files_from_directory(self, directory: str) -> FileArgument:
+         """
+         Load all supported files (.txt and .docx) from the specified directory and its subdirectories.
+
+         This method walks through the directory tree, extracting the content of text
+         and Word files and adding it to the returned FileArgument.
+
+         Args:
+             directory (str): Path to the directory containing files to be loaded
+
+         Returns:
+             FileArgument: The collected filenames, text contents, and embeddings.
+         """
+         argument: FileArgument = FileArgument([], [], [])
+         for root, _, files in os.walk(directory):
+             for file in files:
+                 if file.endswith(".txt"):
+                     text = self._extract_txt_content(root, file)
+                     if text is not None:
+                         argument.add_data(file, text)
+                 elif file.endswith(".docx"):
+                     try:
+                         text = self._extract_docx_content(root, file)
+                         if text is not None:
+                             argument.add_data(file, text)
+                     except Exception as e:
+                         print(f"Error processing {file}: {str(e)}")
+         return argument
+++ vectoriz-0.0.3/vectoriz/token_transformer.py
@@ -0,0 +1,164 @@
+ import faiss
+ import numpy as np
+ from typing import Self
+ from sentence_transformers import SentenceTransformer
+
+
+ class TokenData:
+     """
+     A class that holds text data along with their vector representations and indexing.
+     This class is designed to store and manage tokenized texts, their corresponding
+     embeddings, and a FAISS index for efficient similarity search.
+     Attributes:
+         texts (list[str]): List of text strings that have been tokenized.
+         index (faiss.IndexFlatL2): A FAISS index using the L2 (Euclidean) distance metric
+             for similarity search.
+         embeddings (np.ndarray, optional): Matrix of vector embeddings corresponding
+             to the texts. Default is None.
+     """
+
+     def __init__(
+         self,
+         texts: list[str],
+         index: faiss.IndexFlatL2,
+         embeddings: np.ndarray | None = None,
+     ):
+         self.texts = texts
+         self.index = index
+         self.embeddings = embeddings
+
+     @staticmethod
+     def from_vector_db(vector_data) -> Self:
+         """
+         Creates a TokenData instance from a VectorDBClient.
+
+         This static method extracts the necessary components from a VectorDBClient instance
+         and uses them to instantiate a new TokenData object.
+
+         Parameters
+         ----------
+         vector_data : VectorDBClient
+             The VectorDBClient instance containing the FAISS index, embeddings, and text data.
+
+         Returns
+         -------
+         TokenData
+             A new TokenData instance initialized with texts, FAISS index, and embeddings from the
+             VectorDBClient.
+         """
+         index = vector_data.faiss_index
+         embeddings = vector_data.file_argument.embeddings
+         texts = vector_data.file_argument.text_list
+         return TokenData(texts, index, embeddings)
+
+     @staticmethod
+     def from_file_argument(file_argument, index: faiss.IndexFlatL2) -> Self:
+         """
+         Creates a TokenData instance from a FileArgument and an existing FAISS index.
+
+         Args:
+             file_argument (FileArgument): An instance of FileArgument containing
+                 the numpy embeddings and text data.
+             index (faiss.IndexFlatL2): The FAISS index built over those embeddings.
+
+         Returns:
+             TokenData: A new TokenData instance with the texts, index, and embeddings.
+         """
+         embeddings = file_argument.embeddings
+         texts = file_argument.text_list
+         return TokenData(texts, index, embeddings)
+
+
+ class TokenTransformer:
+
+     def __init__(self):
+         self.model = SentenceTransformer("all-MiniLM-L6-v2")
+
+     def search(
+         self,
+         query: str,
+         data: TokenData,
+         context_amount: int = 1,
+     ) -> str:
+         """
+         Retrieves the texts nearest to the query and joins them into a context string.
+
+         Args:
+             query (str): The search query.
+             data (TokenData): The texts and FAISS index to search against.
+             context_amount (int): Number of nearest texts to include.
+
+         Returns:
+             str: The concatenated context texts, stripped of trailing whitespace.
+         """
+         query_embedding = self._query_to_embeddings(query)
+         _, I = data.index.search(query_embedding, k=context_amount)
+         context = ""
+
+         for i in I[0]:
+             context += data.texts[i] + "\n"
+
+         return context.strip()
+
+     def create_index(self, texts: list[str]) -> TokenData:
+         """
+         Creates a FAISS index from a list of text strings.
+
+         This method converts the input texts to embeddings and then creates a
+         FAISS IndexFlatL2 (L2 distance/Euclidean space) index from these embeddings.
+
+         Args:
+             texts (list[str]): A list of text strings to be indexed.
+
+         Returns:
+             TokenData: The input texts together with their embeddings and the FAISS index.
+         """
+         embeddings = self.text_to_embeddings(texts)
+         index = self.embeddings_to_index(embeddings)
+         return TokenData(texts, index, embeddings)
+
+     def embeddings_to_index(self, embeddings_np: np.ndarray) -> faiss.IndexFlatL2:
+         """
+         Creates a FAISS index using the provided numpy array of embeddings.
+
+         This method initializes a FAISS IndexFlatL2 (L2 distance/Euclidean) index with
+         the dimensionality of the input embeddings and adds the embeddings to the index.
+
+         Args:
+             embeddings_np (np.ndarray): A numpy array of embedding vectors to be indexed.
+                 The shape should be (n, dimension) where n is the number of vectors
+                 and dimension is the size of each vector.
+
+         Returns:
+             faiss.IndexFlatL2: The created FAISS index containing the embeddings.
+         """
+         dimension = embeddings_np.shape[1]
+         index: faiss.IndexFlatL2 = faiss.IndexFlatL2(dimension)
+         index.add(embeddings_np)
+         return index
+
+     def text_to_embeddings(self, sentences: list[str]) -> np.ndarray:
+         """
+         Transforms a list of sentences into embeddings using the model.
+
+         Args:
+             sentences (list[str]): A list of sentences to be transformed into embeddings.
+
+         Returns:
+             np.ndarray: A numpy array containing the embeddings for each sentence.
+         """
+         return self.model.encode(sentences)
+
+     def get_np_vectors(self, embeddings: list[np.ndarray]) -> np.ndarray:
+         """
+         Converts input embeddings to a numpy array of float32 type.
+
+         Args:
+             embeddings (list[np.ndarray]): The embeddings to convert.
+
+         Returns:
+             np.ndarray: A numpy array containing the embeddings as float32 values.
+         """
+         return np.array(embeddings).astype("float32")
+
+     def _query_to_embeddings(self, query: str) -> np.ndarray:
+         """
+         Converts a text query into embeddings using the model.
+
+         Args:
+             query (str): The text query to be transformed into embeddings.
+
+         Returns:
+             np.ndarray: The embedding representation of the query reshaped to
+                 have dimensions (1, embedding_size).
+         """
+         return self.model.encode([query]).reshape(1, -1)
+++ vectoriz-0.0.3/vectoriz/vector_db.py
@@ -0,0 +1,203 @@
+ import os
+ import faiss
+ import numpy as np
+ from typing import Optional
+
+ from vectoriz.files import FileArgument
+ from vectoriz.token_transformer import TokenTransformer
+
+
+ class VectorDBClient:
+
+     def __init__(
+         self,
+         faiss_index: Optional[faiss.IndexFlatL2] = None,
+         file_argument: Optional[FileArgument] = None,
+     ):
+         """
+         Initialize the VectorDBClient with a FAISS index and file argument data.
+
+         Args:
+             faiss_index (Optional[faiss.IndexFlatL2]): The FAISS index containing the vector data.
+             file_argument (Optional[FileArgument]): The file data and embeddings associated with the index.
+         """
+         self.faiss_index = faiss_index
+         self.file_argument = file_argument
+
+     def save_data(self, faiss_db_path: str, np_db_path: str) -> None:
+         """
+         Save the FAISS index and numpy embeddings to disk.
+
+         Args:
+             faiss_db_path (str): Path to save the FAISS index.
+             np_db_path (str): Path to save the numpy embeddings.
+         """
+         if self.faiss_index is None or self.file_argument is None:
+             raise ValueError("FAISS index or file argument is not initialized.")
+
+         vectorDB = VectorDB()
+         vectorDB.save_faiss_index(self.faiss_index, faiss_db_path)
+         vectorDB.save_numpy_embeddings(self.file_argument, np_db_path)
+
+     def load_data(self, faiss_db_path: str, np_db_path: str) -> None:
+         """
+         Load the FAISS index and numpy embeddings from disk.
+
+         Args:
+             faiss_db_path (str): Path to load the FAISS index from.
+             np_db_path (str): Path to load the numpy embeddings from.
+         """
+         vectorDB = VectorDB()
+         self.faiss_index = vectorDB.load_faiss_index(faiss_db_path)
+         self.file_argument = vectorDB.load_numpy_embeddings(np_db_path)
+
+
+ class VectorDB:
+
+     def __init__(self):
+         """
+         Constructor for the class.
+
+         Initializes the following attributes:
+         - transformer: A TokenTransformer instance for text transformation.
+         """
+         self.transformer = TokenTransformer()
+
+     def load_saved_data(
+         self, faiss_db_path: str, np_db_path: str
+     ) -> Optional[VectorDBClient]:
+         """
+         Load previously saved FAISS index and numpy embeddings data.
+
+         This function attempts to load a FAISS index and numpy embeddings from the
+         specified paths and combines them into a VectorDBClient if both are
+         successfully loaded.
+
+         Parameters:
+         ----------
+         faiss_db_path : str
+             Path to the saved FAISS index file
+         np_db_path : str
+             Path to the saved numpy embeddings file
+
+         Returns:
+         -------
+         Optional[VectorDBClient]
+             A VectorDBClient object containing the loaded index and embeddings if successful,
+             or None if either file could not be loaded.
+         """
+         index = self.load_faiss_index(faiss_db_path)
+         file_argument = self.load_numpy_embeddings(np_db_path)
+
+         if index is None or file_argument is None:
+             return None
+         return VectorDBClient(index, file_argument)
+
+     def save_faiss_index(
+         self,
+         index: faiss.IndexFlatL2,
+         faiss_db_path: str,
+     ) -> None:
+         """
+         Save a FAISS index to disk.
+
+         This method takes a FAISS index and saves it to the specified location,
+         ensuring the filename has the correct extension.
+
+         Args:
+             index (faiss.IndexFlatL2): The FAISS index to save
+             faiss_db_path (str): Path (directory and filename) where the index will be saved
+
+         Returns:
+             None
+
+         Note:
+             If the filename doesn't end with '.index', the extension will be added automatically.
+         """
+         faiss_db_path = (
+             faiss_db_path
+             if faiss_db_path.endswith(".index")
+             else faiss_db_path + ".index"
+         )
+         faiss.write_index(index, faiss_db_path)
+
+     def load_faiss_index(self, faiss_db_path: str) -> Optional[faiss.IndexFlatL2]:
+         """
+         Load a FAISS index from a specified file path.
+
+         Args:
+             faiss_db_path (str): Path to the FAISS index file.
+
+         Returns:
+             Optional[faiss.IndexFlatL2]: The loaded FAISS index if the file exists,
+                 None otherwise.
+         """
+         if not os.path.exists(faiss_db_path):
+             return None
+         return faiss.read_index(faiss_db_path)
+
+     def save_numpy_embeddings(
+         self,
+         argument: FileArgument,
+         np_db_path: str,
+     ) -> None:
+         """
+         Save embeddings, chunk names, and texts to a compressed numpy file (.npz).
+
+         Args:
+             argument (FileArgument): An object containing embeddings, ndarray_data, chunk_names, and text_list.
+             np_db_path (str): Directory path and filename where the file will be saved.
+
+         Returns:
+             None
+
+         Notes:
+             The saved .npz file will contain three arrays:
+             - 'embeddings': The vector embeddings, taken from argument.ndarray_data when
+               present, otherwise generated from argument.embeddings
+             - 'chunk_names': The chunk names
+             - 'texts': The text content
+         """
+         np_db_path = np_db_path if np_db_path.endswith(".npz") else np_db_path + ".npz"
+
+         embeddings_np: np.ndarray = None
+         if argument.ndarray_data is not None:
+             embeddings_np = argument.ndarray_data
+         else:
+             embeddings_np = self.transformer.get_np_vectors(argument.embeddings)
+
+         np.savez(
+             np_db_path,
+             embeddings=embeddings_np,
+             chunk_names=argument.chunk_names,
+             texts=argument.text_list,
+         )
+
+     def load_numpy_embeddings(self, np_db_path: str) -> Optional[FileArgument]:
+         """
+         Load embeddings from a NumPy archive file.
+
+         This method reads embeddings, chunk names, and text data from a .npz file
+         created by a previous vectorization process.
+
+         Args:
+             np_db_path (str): Path to the NumPy archive file containing embeddings and metadata.
+
+         Returns:
+             Optional[FileArgument]: A FileArgument object containing the loaded data,
+                 or None if the specified file does not exist.
+         """
+         if not os.path.exists(np_db_path):
+             return None
+
+         data = np.load(np_db_path)
+         embeddings_np = data["embeddings"]
+         chunk_names = data["chunk_names"]
+         texts = data["texts"]
+
+         return FileArgument(
+             chunk_names=chunk_names,
+             text_list=texts,
+             embeddings=[],
+             ndarray_data=embeddings_np,
+         )
+++ vectoriz-0.0.3/vectoriz.egg-info/PKG-INFO
@@ -0,0 +1,60 @@
(Byte-for-byte identical to the top-level PKG-INFO shown above.)
+++ vectoriz-0.0.3/vectoriz.egg-info/SOURCES.txt
@@ -0,0 +1,14 @@
+ README.md
+ pyproject.toml
+ setup.py
+ tests/__init__.py
+ tests/test_files.py
+ vectoriz/__init__.py
+ vectoriz/files.py
+ vectoriz/token_transformer.py
+ vectoriz/vector_db.py
+ vectoriz.egg-info/PKG-INFO
+ vectoriz.egg-info/SOURCES.txt
+ vectoriz.egg-info/dependency_links.txt
+ vectoriz.egg-info/requires.txt
+ vectoriz.egg-info/top_level.txt
+++ vectoriz-0.0.3/vectoriz.egg-info/requires.txt
@@ -0,0 +1,4 @@
+ faiss-cpu==1.10.0
+ numpy==2.2.4
+ sentence-transformers==4.0.2
+ python-docx==1.1.2
+++ vectoriz-0.0.3/vectoriz.egg-info/top_level.txt
@@ -0,0 +1,2 @@
+ tests
+ vectoriz