vectoriz-0.0.3.tar.gz
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- vectoriz-0.0.3/PKG-INFO +60 -0
- vectoriz-0.0.3/README.md +35 -0
- vectoriz-0.0.3/pyproject.toml +3 -0
- vectoriz-0.0.3/setup.cfg +4 -0
- vectoriz-0.0.3/setup.py +24 -0
- vectoriz-0.0.3/tests/__init__.py +0 -0
- vectoriz-0.0.3/tests/test_files.py +40 -0
- vectoriz-0.0.3/vectoriz/__init__.py +3 -0
- vectoriz-0.0.3/vectoriz/files.py +213 -0
- vectoriz-0.0.3/vectoriz/token_transformer.py +164 -0
- vectoriz-0.0.3/vectoriz/vector_db.py +203 -0
- vectoriz-0.0.3/vectoriz.egg-info/PKG-INFO +60 -0
- vectoriz-0.0.3/vectoriz.egg-info/SOURCES.txt +14 -0
- vectoriz-0.0.3/vectoriz.egg-info/dependency_links.txt +1 -0
- vectoriz-0.0.3/vectoriz.egg-info/requires.txt +4 -0
- vectoriz-0.0.3/vectoriz.egg-info/top_level.txt +2 -0
vectoriz-0.0.3/PKG-INFO
ADDED
@@ -0,0 +1,60 @@
Metadata-Version: 2.4
Name: vectoriz
Version: 0.0.3
Summary: Python library for creating vectorized data from text or files.
Home-page: https://github.com/PedroHenriqueDevBR/vectoriz
Author: PedroHenriqueDevBR
Author-email: pedro.henrique.particular@gmail.com
Classifier: Programming Language :: Python :: 3.12
Classifier: Operating System :: OS Independent
Requires-Python: >=3.12
Description-Content-Type: text/markdown
Requires-Dist: faiss-cpu==1.10.0
Requires-Dist: numpy==2.2.4
Requires-Dist: sentence-transformers==4.0.2
Requires-Dist: python-docx==1.1.2
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# RAG-vector-creator

## Overview
This project implements a RAG (Retrieval-Augmented Generation) system for creating and managing vector embeddings from documents using the FAISS and NumPy libraries. It transforms text data into high-dimensional vector representations that enable semantic search, similarity matching, and context-aware document retrieval for question-answering applications.

## Features

- Document ingestion and preprocessing
- Vector embedding generation using state-of-the-art models
- Efficient storage and retrieval of embeddings
- Integration with LLM-based generation systems

## Installation

```bash
pip install -r requirements.txt
python app.py
```

## Build lib

To build the lib, run:

```bash
python setup.py sdist bdist_wheel
```

To test the install, run:

```bash
pip install .
```

## License

MIT
vectoriz-0.0.3/README.md
ADDED
@@ -0,0 +1,35 @@
# RAG-vector-creator

## Overview
This project implements a RAG (Retrieval-Augmented Generation) system for creating and managing vector embeddings from documents using the FAISS and NumPy libraries. It transforms text data into high-dimensional vector representations that enable semantic search, similarity matching, and context-aware document retrieval for question-answering applications.

## Features

- Document ingestion and preprocessing
- Vector embedding generation using state-of-the-art models
- Efficient storage and retrieval of embeddings
- Integration with LLM-based generation systems

## Installation

```bash
pip install -r requirements.txt
python app.py
```

## Build lib

To build the lib, run:

```bash
python setup.py sdist bdist_wheel
```

To test the install, run:

```bash
pip install .
```

## License

MIT
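The README stops at packaging steps, so a minimal usage sketch may help. The following is not from the package's documentation; it simply exercises the TokenTransformer API defined in vectoriz/token_transformer.py later in this diff, with placeholder sentences:

```python
# Hypothetical quick-start sketch (not part of the package): index a few
# strings with TokenTransformer and retrieve the closest one for a query.
from vectoriz.token_transformer import TokenTransformer

transformer = TokenTransformer()
data = transformer.create_index(
    [
        "FAISS performs efficient similarity search.",
        "python-docx reads Word documents.",
        "Sentence transformers produce text embeddings.",
    ]
)
# search() returns the nearest text(s) joined into one context string.
print(transformer.search("How do I search vectors quickly?", data, context_amount=1))
```

create_index returns a TokenData bundle, so the same object can be searched repeatedly without re-embedding the corpus.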
vectoriz-0.0.3/setup.cfg
ADDED
vectoriz-0.0.3/setup.py
ADDED
@@ -0,0 +1,24 @@
```python
from setuptools import setup, find_packages

setup(
    name="vectoriz",
    version="0.0.3",
    author="PedroHenriqueDevBR",
    author_email="pedro.henrique.particular@gmail.com",
    description="Python library for creating vectorized data from text or files.",
    long_description=open("README.md", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    url="https://github.com/PedroHenriqueDevBR/vectoriz",
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3.12",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.12",
    install_requires=[
        "faiss-cpu==1.10.0",
        "numpy==2.2.4",
        "sentence-transformers==4.0.2",
        "python-docx==1.1.2",
    ],
)
```
vectoriz-0.0.3/tests/__init__.py
ADDED
File without changes

vectoriz-0.0.3/tests/test_files.py
ADDED
@@ -0,0 +1,40 @@
```python
import pytest
import numpy as np
from unittest.mock import patch
from vectoriz.files import FileArgument


def test_add_data_appends_to_lists():
    file_arg = FileArgument([], [], [])
    filename = "test.txt"
    text = "Test content"

    with patch.object(FileArgument, '_create_embedding', return_value=np.array([0.1, 0.2, 0.3])):
        file_arg.add_data(filename, text)

    assert file_arg.chunk_names == [filename]
    assert file_arg.text_list == [text]
    assert len(file_arg.embeddings) == 1
    np.testing.assert_array_equal(file_arg.embeddings[0], np.array([0.1, 0.2, 0.3]))


def test_add_data_multiple_entries():
    file_arg = FileArgument(["existing.txt"], ["existing content"], [np.array([0.5, 0.5, 0.5])])
    filename = "new.txt"
    text = "New content"

    with patch.object(FileArgument, '_create_embedding', return_value=np.array([0.7, 0.8, 0.9])):
        file_arg.add_data(filename, text)
        assert file_arg.chunk_names == ["existing.txt", "new.txt"]
        assert file_arg.text_list == ["existing content", "New content"]
        assert len(file_arg.embeddings) == 2
        np.testing.assert_array_equal(file_arg.embeddings[1], np.array([0.7, 0.8, 0.9]))


def test_add_data_calls_create_embedding():
    file_arg = FileArgument([], [], [])
    filename = "test.txt"
    text = "Test content"

    with patch.object(FileArgument, '_create_embedding') as mock_create_embedding:
        mock_create_embedding.return_value = np.array([0.1, 0.2, 0.3])
        file_arg.add_data(filename, text)
        mock_create_embedding.assert_called_once_with(text)
```
vectoriz-0.0.3/vectoriz/files.py
ADDED
@@ -0,0 +1,213 @@
```python
import os
import docx
import numpy as np
from typing import Optional
from vectoriz.token_transformer import TokenTransformer


class FileArgument:
    def __init__(
        self,
        chunk_names: list[str],
        text_list: list[str],
        embeddings: list[np.ndarray],
        ndarray_data: Optional[np.ndarray] = None,
    ) -> None:
        """
        Initializes the FileArgument instance with file data and embeddings.

        This constructor sets up an instance with chunk_names, their text content,
        and the associated embeddings.

        Parameters
        ----------
        chunk_names : list[str]
            List of chunk names corresponding to processed files
        text_list : list[str]
            List of text content extracted from the files
        embeddings : list[np.ndarray]
            List of embeddings (vector representations) of the text content
        ndarray_data : Optional[np.ndarray], default=None
            NumPy array representation of the embeddings for efficient vector operations

        Returns
        -------
        None
        """
        self.chunk_names: list[str] = chunk_names
        self.text_list: list[str] = text_list
        self.embeddings: list[np.ndarray] = embeddings
        self.ndarray_data: np.ndarray = ndarray_data

    def add_data(self, filename: str, text: str) -> None:
        """
        Adds text data along with its filename and creates the corresponding embedding.

        This method appends the provided filename and text to their respective lists,
        and creates and stores the embedding vector for the text.

        Args:
            filename (str): The name of the file or identifier for the text data
            text (str): The text content to be added and embedded

        Returns:
            None: This method doesn't return anything; it updates the internal state of the object
        """
        self.chunk_names.append(filename)
        self.text_list.append(text)
        self.embeddings.append(self._create_embedding(text))

    def _create_embedding(self, text: str) -> np.ndarray:
        """
        Creates an embedding vector for the given text using the transformer model.

        This method transforms the input text into a numerical vector representation
        that captures semantic meaning, which can be used for similarity comparisons
        or as input to machine learning models.

        Args:
            text (str): The text to be embedded.

        Returns:
            np.ndarray: A numpy array containing the embedding vector for the input text.
        """
        instance = TokenTransformer()
        return instance.text_to_embeddings([text])[0]


class FilesFeature:

    def _extract_txt_content(self, directory: str, file: str) -> Optional[str]:
        """
        Extract the content of a text file.

        This method opens a text file in read mode with UTF-8 encoding and reads
        its content.

        Parameters
        ----------
        directory : str
            The directory path where the file is located.
        file : str
            The name of the text file to read.

        Returns
        -------
        Optional[str]
            The file content as a string.

        Raises
        ------
        FileNotFoundError
            If the specified file does not exist.
        UnicodeDecodeError
            If the file cannot be decoded using UTF-8 encoding.
        """
        with open(os.path.join(directory, file), "r", encoding="utf-8") as fl:
            text = fl.read()
        return text

    def _extract_docx_content(self, directory: str, file: str) -> Optional[str]:
        """
        Extracts text content from a Microsoft Word document.

        This method opens a Word document, reads all paragraphs, and joins non-empty
        paragraphs into a single text string.

        Args:
            directory (str): The directory path where the Word file is located
            file (str): The filename of the Word document to process

        Returns:
            Optional[str]: The extracted text content, or an empty string if the
            document contains no non-empty paragraphs.

        Note:
            Empty paragraphs (those that contain only whitespace) are skipped.
            The python-docx library is required for this method to work.
        """
        file_path = os.path.join(directory, file)
        doc = docx.Document(file_path)
        full_text = []

        for paragraph in doc.paragraphs:
            content = paragraph.text.strip()
            if len(content) == 0:
                continue
            full_text.append(paragraph.text)
        return "\n".join(full_text)

    def load_txt_files_from_directory(self, directory: str) -> FileArgument:
        """
        Load all text files from the specified directory and extract their content.

        This method scans the specified directory for files with the '.txt' extension
        and processes each of them using the _extract_txt_content method.

        Parameters
        ----------
        directory : str
            Path to the directory containing text files to be loaded.

        Returns
        -------
        FileArgument
            A FileArgument populated with the name, text, and embedding of each
            text file found in the directory.
        """
        argument: FileArgument = FileArgument([], [], [])
        for file in os.listdir(directory):
            if not file.endswith(".txt"):
                continue

            text = self._extract_txt_content(directory, file)
            if text is None:
                continue

            argument.add_data(file, text)
        return argument

    def load_docx_files_from_directory(self, directory: str) -> FileArgument:
        """
        Load all Word (.docx) files from the specified directory and extract their content.

        This method iterates through all files in the given directory, identifies those
        with a .docx extension, and processes them using the _extract_docx_content method.

        Args:
            directory (str): Path to the directory containing Word files to be processed

        Returns:
            FileArgument: A FileArgument populated with the extracted documents.

        Examples:
            >>> processor = FilesFeature()
            >>> processor.load_docx_files_from_directory("/path/to/documents")
        """
        argument: FileArgument = FileArgument([], [], [])
        for file in os.listdir(directory):
            if not file.endswith(".docx"):
                continue

            text = self._extract_docx_content(directory, file)
            if text is None:
                continue

            argument.add_data(file, text)
        return argument

    def load_all_files_from_directory(self, directory: str) -> FileArgument:
        """
        Load all supported files (.txt and .docx) from the specified directory and its subdirectories.

        This method walks through the directory tree and collects the content of
        every text and Word file it finds.

        Args:
            directory (str): Path to the directory containing files to be loaded

        Returns:
            FileArgument: A FileArgument populated with the content of every supported file.
        """
        argument: FileArgument = FileArgument([], [], [])
        for root, _, files in os.walk(directory):
            for file in files:
                if file.endswith(".txt"):
                    text = self._extract_txt_content(root, file)
                    if text is not None:
                        argument.add_data(file, text)
                elif file.endswith(".docx"):
                    try:
                        text = self._extract_docx_content(root, file)
                        if text is not None:
                            argument.add_data(file, text)
                    except Exception as e:
                        print(f"Error processing {file}: {str(e)}")
        return argument
```
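As a hedged illustration of the loaders above (not part of the package), the sketch below reads every supported file under a placeholder directory and reports what was extracted. Note that each add_data call instantiates a TokenTransformer, so loading many files re-creates the model repeatedly:

```python
# Hypothetical sketch: load every .txt and .docx file under a directory
# and inspect the resulting FileArgument. "./documents" is a placeholder.
from vectoriz.files import FilesFeature

loader = FilesFeature()
argument = loader.load_all_files_from_directory("./documents")

for name, text in zip(argument.chunk_names, argument.text_list):
    print(name, len(text), "chars")
print(len(argument.embeddings), "embeddings created")
```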
vectoriz-0.0.3/vectoriz/token_transformer.py
ADDED
@@ -0,0 +1,164 @@
```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


class TokenData:
    """
    A class that holds text data along with their vector representations and indexing.

    This class is designed to store and manage tokenized texts, their corresponding
    embeddings, and a FAISS index for efficient similarity search.

    Attributes:
        texts (list[str]): List of text strings that have been tokenized.
        index (faiss.IndexFlatL2): A FAISS index using the L2 (Euclidean) distance
            metric for similarity search.
        embeddings (np.ndarray, optional): Matrix of vector embeddings corresponding
            to the texts. Default is None.
    """

    def __init__(
        self,
        texts: list[str],
        index: faiss.IndexFlatL2,
        embeddings: np.ndarray = None,
    ):
        self.texts = texts
        self.index = index
        self.embeddings = embeddings

    @staticmethod
    def from_vector_db(vector_data) -> "TokenData":
        """
        Creates a TokenData instance from a VectorDBClient.

        This static method extracts the necessary components from a VectorDBClient
        instance and uses them to instantiate a new TokenData object.

        Parameters
        ----------
        vector_data : VectorDBClient
            The VectorDBClient instance containing the FAISS index, embeddings, and text data.

        Returns
        -------
        TokenData
            A new TokenData instance initialized with the texts, FAISS index, and
            embeddings from the VectorDBClient.
        """
        index = vector_data.faiss_index
        embeddings = vector_data.file_argument.embeddings
        texts = vector_data.file_argument.text_list
        return TokenData(texts, index, embeddings)

    @staticmethod
    def from_file_argument(file_argument, index: faiss.IndexFlatL2) -> "TokenData":
        """
        Creates a TokenData instance from a FileArgument and an existing FAISS index.

        Args:
            file_argument (FileArgument): An instance of FileArgument containing
                the texts and embeddings.
            index (faiss.IndexFlatL2): The FAISS index built from those embeddings.
        """
        embeddings = file_argument.embeddings
        texts = file_argument.text_list
        return TokenData(texts, index, embeddings)


class TokenTransformer:

    def __init__(self):
        self.model = SentenceTransformer("all-MiniLM-L6-v2")

    def search(
        self,
        query: str,
        data: TokenData,
        context_amount: int = 1,
    ) -> str:
        """
        Retrieves the texts closest to the query from the indexed data.

        The query is embedded and searched against the FAISS index, and the
        context_amount nearest texts are joined into a single context string.
        """
        query_embedding = self._query_to_embeddings(query)
        _, indices = data.index.search(query_embedding, k=context_amount)
        context = ""

        for i in indices[0]:
            context += data.texts[i] + "\n"

        return context.strip()

    def create_index(self, texts: list[str]) -> TokenData:
        """
        Creates a FAISS index from a list of text strings.

        This method converts the input texts to embeddings and then creates a
        FAISS IndexFlatL2 (L2 distance/Euclidean space) index from these embeddings.

        Args:
            texts (list[str]): A list of text strings to be indexed.

        Returns:
            TokenData: The texts, FAISS index, and embeddings bundled together.
        """
        embeddings = self.text_to_embeddings(texts)
        index = self.embeddings_to_index(embeddings)
        return TokenData(texts, index, embeddings)

    def embeddings_to_index(self, embeddings_np: np.ndarray) -> faiss.IndexFlatL2:
        """
        Creates a FAISS index using the provided numpy array of embeddings.

        This method initializes a FAISS IndexFlatL2 (L2 distance/Euclidean) index with
        the dimensionality of the input embeddings and adds the embeddings to the index.

        Args:
            embeddings_np (np.ndarray): A numpy array of embedding vectors to be indexed.
                The shape should be (n, dimension) where n is the number of vectors
                and dimension is the size of each vector.

        Returns:
            faiss.IndexFlatL2: The created FAISS index containing the embeddings.
        """
        dimension = embeddings_np.shape[1]
        index: faiss.IndexFlatL2 = faiss.IndexFlatL2(dimension)
        index.add(embeddings_np)
        return index

    def text_to_embeddings(self, sentences: list[str]) -> np.ndarray:
        """
        Transforms a list of sentences into embeddings using the model.

        Args:
            sentences (list[str]): A list of sentences to be transformed into embeddings.

        Returns:
            np.ndarray: A numpy array containing the embeddings for each sentence.
        """
        return self.model.encode(sentences)

    def get_np_vectors(self, embeddings: list[np.ndarray]) -> np.ndarray:
        """
        Converts input embeddings to a numpy array of float32 type.

        Args:
            embeddings (list[np.ndarray]): The embeddings to convert.

        Returns:
            np.ndarray: A numpy array containing the embeddings as float32 values.
        """
        return np.array(embeddings).astype("float32")

    def _query_to_embeddings(self, query: str) -> np.ndarray:
        """
        Converts a text query into embeddings using the model.

        Args:
            query (str): The text query to be transformed into embeddings.

        Returns:
            np.ndarray: The embedding representation of the query reshaped to
            dimensions (1, embedding_size).
        """
        return self.model.encode([query]).reshape(1, -1)
```
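A sketch (not from the package docs) of how this module pairs with vectoriz/files.py: embeddings collected by FileArgument are converted to a float32 matrix with get_np_vectors, indexed, and wrapped in a TokenData for searching. The directory path and query are placeholders:

```python
# Hypothetical sketch: build a FAISS index from embeddings created by
# FileArgument.add_data, then search it.
from vectoriz.files import FilesFeature
from vectoriz.token_transformer import TokenData, TokenTransformer

transformer = TokenTransformer()
argument = FilesFeature().load_txt_files_from_directory("./notes")  # placeholder path

embeddings_np = transformer.get_np_vectors(argument.embeddings)
index = transformer.embeddings_to_index(embeddings_np)
data = TokenData.from_file_argument(argument, index)

# Retrieve the two nearest chunks as a single context string.
print(transformer.search("meeting summary", data, context_amount=2))
```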
vectoriz-0.0.3/vectoriz/vector_db.py
ADDED
@@ -0,0 +1,203 @@
```python
import os
import faiss
import numpy as np
from typing import Optional

from vectoriz.files import FileArgument
from vectoriz.token_transformer import TokenTransformer


class VectorDBClient:

    def __init__(
        self,
        faiss_index: Optional[faiss.IndexFlatL2] = None,
        file_argument: Optional[FileArgument] = None,
    ):
        """
        Initialize the VectorDBClient with a FAISS index and file data.

        Args:
            faiss_index (Optional[faiss.IndexFlatL2]): The FAISS index containing the vector data.
            file_argument (Optional[FileArgument]): The file data associated with the index.
        """
        self.faiss_index = faiss_index
        self.file_argument = file_argument

    def save_data(self, faiss_db_path: str, np_db_path: str) -> None:
        """
        Save the FAISS index and numpy embeddings to disk.

        Args:
            faiss_db_path (str): Path to save the FAISS index.
            np_db_path (str): Path to save the numpy embeddings.
        """
        if self.faiss_index is None or self.file_argument is None:
            raise ValueError("FAISS index or file argument is not initialized.")

        vectorDB = VectorDB()
        vectorDB.save_faiss_index(self.faiss_index, faiss_db_path)
        vectorDB.save_numpy_embeddings(self.file_argument, np_db_path)

    def load_data(self, faiss_db_path: str, np_db_path: str) -> None:
        """
        Load the FAISS index and numpy embeddings from disk.

        Args:
            faiss_db_path (str): Path to load the FAISS index from.
            np_db_path (str): Path to load the numpy embeddings from.
        """
        vectorDB = VectorDB()
        self.faiss_index = vectorDB.load_faiss_index(faiss_db_path)
        self.file_argument = vectorDB.load_numpy_embeddings(np_db_path)


class VectorDB:

    def __init__(self):
        """
        Constructor for the class.

        Initializes the following attributes:
        - transformer: A TokenTransformer instance for text transformation.
        """
        self.transformer = TokenTransformer()

    def load_saved_data(
        self, faiss_db_path: str, np_db_path: str
    ) -> Optional[VectorDBClient]:
        """
        Load previously saved FAISS index and numpy embeddings data.

        This method attempts to load a FAISS index and numpy embeddings from the
        specified paths and combines them into a VectorDBClient if both are
        successfully loaded.

        Parameters
        ----------
        faiss_db_path : str
            Path to the saved FAISS index file
        np_db_path : str
            Path to the saved numpy embeddings file

        Returns
        -------
        Optional[VectorDBClient]
            A VectorDBClient containing the loaded index and embeddings if successful,
            or None if either file could not be loaded.
        """
        index = self.load_faiss_index(faiss_db_path)
        file_argument = self.load_numpy_embeddings(np_db_path)

        if index is None or file_argument is None:
            return None
        return VectorDBClient(index, file_argument)

    def save_faiss_index(
        self,
        index: faiss.IndexFlatL2,
        faiss_db_path: str,
    ) -> None:
        """
        Save a FAISS index to disk.

        This method takes a FAISS index and writes it to the specified location,
        ensuring the filename has the correct extension.

        Args:
            index (faiss.IndexFlatL2): The FAISS index to save
            faiss_db_path (str): Destination path (directory plus filename) for the index

        Returns:
            None

        Note:
            If the path doesn't end with '.index', the extension is added automatically.
        """
        faiss_db_path = (
            faiss_db_path
            if faiss_db_path.endswith(".index")
            else faiss_db_path + ".index"
        )
        faiss.write_index(index, faiss_db_path)

    def load_faiss_index(self, faiss_db_path: str) -> Optional[faiss.IndexFlatL2]:
        """
        Load a FAISS index from a specified file path.

        Args:
            faiss_db_path (str): Path to the FAISS index file.

        Returns:
            Optional[faiss.IndexFlatL2]: The loaded FAISS index if the file exists,
            None otherwise.
        """
        if not os.path.exists(faiss_db_path):
            return None
        return faiss.read_index(faiss_db_path)

    def save_numpy_embeddings(
        self,
        argument: FileArgument,
        np_db_path: str,
    ) -> None:
        """
        Save embeddings, chunk names, and texts to a compressed numpy file (.npz).

        Args:
            argument (FileArgument): An object containing embeddings, ndarray_data, chunk_names, and text_list.
            np_db_path (str): Directory path and filename where the file will be saved.

        Returns:
            None

        Notes:
            If the path doesn't end with '.npz', the extension is added automatically.
            The saved .npz file contains three arrays:
            - 'embeddings': The vector embeddings
            - 'chunk_names': The chunk names
            - 'texts': The text content
        """
        np_db_path = np_db_path if np_db_path.endswith(".npz") else np_db_path + ".npz"

        embeddings_np: np.ndarray = None
        if argument.ndarray_data is not None:
            embeddings_np = argument.ndarray_data
        else:
            embeddings_np = self.transformer.get_np_vectors(argument.embeddings)

        np.savez(
            np_db_path,
            embeddings=embeddings_np,
            chunk_names=argument.chunk_names,
            texts=argument.text_list,
        )

    def load_numpy_embeddings(self, np_db_path: str) -> Optional[FileArgument]:
        """
        Load embeddings from a NumPy archive file.

        This method reads embeddings, chunk names, and text data from a .npz file
        created by a previous vectorization process.

        Args:
            np_db_path (str): Path to the NumPy archive file containing embeddings and metadata.

        Returns:
            Optional[FileArgument]: A FileArgument object containing the loaded data,
            or None if the specified file does not exist.
        """
        if not os.path.exists(np_db_path):
            return None

        data = np.load(np_db_path)
        embeddings_np = data["embeddings"]
        chunk_names = data["chunk_names"]
        texts = data["texts"]

        return FileArgument(
            chunk_names=chunk_names,
            text_list=texts,
            embeddings=[],
            ndarray_data=embeddings_np,
        )
```
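To round out the API, a hypothetical save/load roundtrip built from the classes above (not part of the package; the directory and database file names are placeholders):

```python
# Hypothetical sketch: persist a FAISS index plus its metadata, then
# reload everything through VectorDB.
from vectoriz.files import FilesFeature
from vectoriz.token_transformer import TokenTransformer
from vectoriz.vector_db import VectorDB, VectorDBClient

transformer = TokenTransformer()
argument = FilesFeature().load_txt_files_from_directory("./notes")  # placeholder path
index = transformer.embeddings_to_index(transformer.get_np_vectors(argument.embeddings))

client = VectorDBClient(index, argument)
client.save_data("my_db.index", "my_db.npz")  # placeholder file names

restored = VectorDB().load_saved_data("my_db.index", "my_db.npz")
if restored is not None:
    print(restored.faiss_index.ntotal, "vectors restored")
```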
vectoriz-0.0.3/vectoriz.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,60 @@
(Content identical to vectoriz-0.0.3/PKG-INFO above.)
vectoriz-0.0.3/vectoriz.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,14 @@
```
README.md
pyproject.toml
setup.py
tests/__init__.py
tests/test_files.py
vectoriz/__init__.py
vectoriz/files.py
vectoriz/token_transformer.py
vectoriz/vector_db.py
vectoriz.egg-info/PKG-INFO
vectoriz.egg-info/SOURCES.txt
vectoriz.egg-info/dependency_links.txt
vectoriz.egg-info/requires.txt
vectoriz.egg-info/top_level.txt
```
vectoriz-0.0.3/vectoriz.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
(The file contains a single blank line.)