vectoriz-0.0.5.tar.gz → vectoriz-0.1.1b0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/PKG-INFO +1 -1
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/setup.py +1 -1
- vectoriz-0.1.1b0/tests/test_files.py +165 -0
- vectoriz-0.1.1b0/tests/test_token_transformer.py +193 -0
- vectoriz-0.1.1b0/tests/test_vector_db.py +474 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz/files.py +5 -4
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz/token_transformer.py +19 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz/vector_db.py +3 -3
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz.egg-info/PKG-INFO +1 -1
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz.egg-info/SOURCES.txt +2 -0
- vectoriz-0.0.5/tests/test_files.py +0 -40
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/README.md +0 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/pyproject.toml +0 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/setup.cfg +0 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/tests/__init__.py +0 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz/__init__.py +0 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz.egg-info/dependency_links.txt +0 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz.egg-info/requires.txt +0 -0
- {vectoriz-0.0.5 → vectoriz-0.1.1b0}/vectoriz.egg-info/top_level.txt +0 -0
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name="vectoriz",
-    version="0.0.5",
+    version="0.1.1-beta",
    author="PedroHenriqueDevBR",
    author_email="pedro.henrique.particular@gmail.com",
    description="Python library for creating vectorized data from text or files.",
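
The new version string also explains the archive names at the top of this diff: setuptools normalizes "0.1.1-beta" to the canonical PEP 440 form "0.1.1b0". A minimal check of that normalization, assuming the `packaging` library is available:

from packaging.version import Version

# PEP 440 spells pre-releases as "b"; "-beta" is an accepted alias that
# normalizes to "b0", which is why the sdist is named vectoriz-0.1.1b0.
print(Version("0.1.1-beta"))  # -> 0.1.1b0
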
@@ -0,0 +1,165 @@
+import docx
+import pytest
+import numpy as np
+from unittest.mock import patch, MagicMock
+from vectoriz.files import FileArgument
+from vectoriz.files import FilesFeature
+
+
+class TestFileArgument:
+    def test_add_data_appends_to_lists(self):
+        file_arg = FileArgument()
+        filename = "test.txt"
+        text = "Test content"
+
+        with patch.object(
+            FileArgument, "_create_embedding", return_value=np.array([0.1, 0.2, 0.3])
+        ):
+            file_arg.add_data(filename, text)
+
+        assert file_arg.chunk_names == [filename]
+        assert file_arg.text_list == [text]
+        assert len(file_arg.embeddings) == 1
+        np.testing.assert_array_equal(
+            file_arg.embeddings[0], np.array([0.1, 0.2, 0.3])
+        )
+
+    def test_add_data_multiple_entries(self):
+        file_arg = FileArgument(
+            ["existing.txt"], ["existing content"], [np.array([0.5, 0.5, 0.5])]
+        )
+        filename = "new.txt"
+        text = "New content"
+
+        with patch.object(
+            FileArgument, "_create_embedding", return_value=np.array([0.7, 0.8, 0.9])
+        ):
+            file_arg.add_data(filename, text)
+        assert file_arg.chunk_names == ["existing.txt", "new.txt"]
+        assert file_arg.text_list == ["existing content", "New content"]
+        assert len(file_arg.embeddings) == 2
+        np.testing.assert_array_equal(
+            file_arg.embeddings[1], np.array([0.7, 0.8, 0.9])
+        )
+
+    def test_add_data_calls_create_embedding(self):
+        file_arg = FileArgument()
+        filename = "test.txt"
+        text = "Test content"
+
+        with patch.object(FileArgument, "_create_embedding") as mock_create_embedding:
+            mock_create_embedding.return_value = np.array([0.1, 0.2, 0.3])
+            file_arg.add_data(filename, text)
+            mock_create_embedding.assert_called_once_with(text)
+
+    def test_create_embedding_returns_numpy_array(self):
+        file_arg = FileArgument()
+        text = "Test content"
+
+        with patch("vectoriz.files.TokenTransformer") as mock_transformer:
+            mock_instance = mock_transformer.return_value
+            mock_instance.text_to_embeddings.return_value = [np.array([0.1, 0.2, 0.3])]
+
+            result = file_arg._create_embedding(text)
+
+            assert isinstance(result, np.ndarray)
+            np.testing.assert_array_equal(result, np.array([0.1, 0.2, 0.3]))
+            mock_instance.text_to_embeddings.assert_called_once_with([text])
+
+    def test_create_embedding_handles_empty_text(self):
+        file_arg = FileArgument()
+        text = ""
+
+        with patch("vectoriz.files.TokenTransformer") as mock_transformer:
+            mock_instance = mock_transformer.return_value
+            mock_instance.text_to_embeddings.return_value = [np.array([0.0, 0.0, 0.0])]
+
+            result = file_arg._create_embedding(text)
+
+            assert isinstance(result, np.ndarray)
+            mock_instance.text_to_embeddings.assert_called_once_with([""])
+
+    def test_create_embedding_instantiates_token_transformer(self):
+        file_arg = FileArgument()
+        text = "Test content"
+
+        with patch("vectoriz.files.TokenTransformer") as mock_transformer:
+            mock_instance = mock_transformer.return_value
+            mock_instance.text_to_embeddings.return_value = [np.array([0.1, 0.2, 0.3])]
+
+            file_arg._create_embedding(text)
+
+            mock_transformer.assert_called_once()
+
+
+class TestFilesFeature:
+    def test_extract_txt_content_reads_file_correctly(self, tmp_path):
+        test_content = "This is test content"
+        test_file = tmp_path / "test.txt"
+        test_file.write_text(test_content)
+        files_feature = FilesFeature()
+        result = files_feature._extract_txt_content(str(tmp_path), "test.txt")
+        assert result == test_content
+
+    def test_extract_txt_content_with_unicode_chars(self, tmp_path):
+        test_content = "Unicode content: àáâãäåæç"
+        test_file = tmp_path / "unicode.txt"
+        test_file.write_text(test_content, encoding="utf-8")
+        files_feature = FilesFeature()
+        result = files_feature._extract_txt_content(str(tmp_path), "unicode.txt")
+        assert result == test_content
+
+    def test_extract_txt_content_raises_file_not_found(self):
+        files_feature = FilesFeature()
+        with pytest.raises(FileNotFoundError):
+            files_feature._extract_txt_content(
+                "/non_existent_dir", "non_existent_file.txt"
+            )
+
+    def test_extract_docx_content_reads_file_correctly(self, tmp_path, monkeypatch):
+        mock_doc = MagicMock()
+        mock_paragraph1 = MagicMock()
+        mock_paragraph1.text = "Paragraph 1"
+        mock_paragraph2 = MagicMock()
+        mock_paragraph2.text = "Paragraph 2"
+        mock_doc.paragraphs = [mock_paragraph1, mock_paragraph2]
+
+        monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
+        files_feature = FilesFeature()
+        result = files_feature._extract_docx_content(str(tmp_path), "test.docx")
+
+        assert result == "Paragraph 1\nParagraph 2"
+
+    def test_extract_docx_content_skips_empty_paragraphs(self, tmp_path, monkeypatch):
+        mock_doc = MagicMock()
+        mock_paragraph1 = MagicMock()
+        mock_paragraph1.text = "Paragraph 1"
+        mock_paragraph2 = MagicMock()
+        mock_paragraph2.text = " "
+        mock_paragraph3 = MagicMock()
+        mock_paragraph3.text = "Paragraph 3"
+        mock_doc.paragraphs = [mock_paragraph1, mock_paragraph2, mock_paragraph3]
+
+        monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
+        files_feature = FilesFeature()
+        result = files_feature._extract_docx_content(str(tmp_path), "test.docx")
+
+        assert result == "Paragraph 1\nParagraph 3"
+
+    def test_extract_docx_content_exception_handling(self, tmp_path, monkeypatch):
+        def mock_document(_):
+            raise Exception("Failed to open document")
+
+        monkeypatch.setattr(docx, "Document", mock_document)
+
+        files_feature = FilesFeature()
+        with pytest.raises(Exception):
+            files_feature._extract_docx_content(str(tmp_path), "invalid.docx")
+
+    def test_extract_docx_content_with_no_paragraphs(self, tmp_path, monkeypatch):
+        mock_doc = MagicMock()
+        mock_doc.paragraphs = []
+        monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
+        files_feature = FilesFeature()
+        result = files_feature._extract_docx_content(str(tmp_path), "empty.docx")
+        assert result == ""
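
A note on the docx mocking pattern above: monkeypatch.setattr(docx, "Document", ...) works because Python modules are singletons, so the `docx` module object imported by the tests is the very object vectoriz/files.py imports (its hunk below shows `import docx` as context), and replacing an attribute on it is visible to both. A standalone sketch of the same mechanism using the stdlib json module:

import sys
import json

# Every `import json` yields the same cached module object...
assert sys.modules["json"] is json

# ...so patching an attribute on it is seen by every importer.
original = json.dumps
json.dumps = lambda obj, **kwargs: "patched"
assert json.dumps({}) == "patched"
json.dumps = original  # restore by hand; pytest's monkeypatch undoes this automatically
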
@@ -0,0 +1,193 @@
+import pytest
+import faiss
+import numpy as np
+from unittest.mock import Mock
+from vectoriz.token_transformer import TokenData, TokenTransformer
+
+
+class TestTokenData:
+
+    def test_from_vector_db(self):
+        mock_vector_data = Mock()
+
+        mock_index = faiss.IndexFlatL2(5)
+        mock_embeddings = np.random.random((3, 5)).astype("float32")
+        mock_texts = ["text1", "text2", "text3"]
+
+        mock_file_argument = Mock()
+        mock_file_argument.embeddings = mock_embeddings
+        mock_file_argument.text_list = mock_texts
+        mock_vector_data.faiss_index = mock_index
+        mock_vector_data.file_argument = mock_file_argument
+        token_data = TokenData.from_vector_db(mock_vector_data)
+
+        assert token_data.texts == mock_texts
+        assert token_data.index == mock_index
+        assert np.array_equal(token_data.embeddings, mock_embeddings)
+        assert isinstance(token_data, TokenData)
+
+    def test_from_file_argument(self):
+        mock_file_argument = Mock()
+        mock_file_argument.embeddings = np.random.random((3, 5)).astype("float32")
+        mock_file_argument.text_list = ["text1", "text2", "text3"]
+        mock_index = faiss.IndexFlatL2(5)
+        token_data = TokenData.from_file_argument(mock_file_argument, mock_index)
+
+        assert token_data.texts == mock_file_argument.text_list
+        assert token_data.index == mock_index
+        assert np.array_equal(token_data.embeddings, mock_file_argument.embeddings)
+        assert isinstance(token_data, TokenData)
+
+
+class TestTokenTransformer:
+
+    def test_text_to_embeddings(self):
+        transformer = TokenTransformer()
+        sentences = ["This is a test sentence.", "Another test sentence."]
+        embeddings = transformer.text_to_embeddings(sentences)
+
+        assert isinstance(embeddings, np.ndarray)
+        assert embeddings.shape[0] == len(sentences)
+        assert embeddings.shape[1] > 0
+
+    def test_text_to_embeddings_empty_list(self):
+        transformer = TokenTransformer()
+        sentences = []
+        embeddings = transformer.text_to_embeddings(sentences)
+
+        assert isinstance(embeddings, np.ndarray)
+
+    def test_get_np_vectors(self):
+        transformer = TokenTransformer()
+        embeddings = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
+        result = transformer.get_np_vectors(embeddings)
+
+        assert isinstance(result, np.ndarray)
+        assert result.dtype == np.float32
+        assert result.shape == (2, 3)
+        assert np.array_equal(
+            result, np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
+        )
+
+    def test_get_np_vectors_empty_list(self):
+        transformer = TokenTransformer()
+        embeddings = []
+        result = transformer.get_np_vectors(embeddings)
+
+        assert isinstance(result, np.ndarray)
+        assert result.dtype == np.float32
+        assert result.shape == (0,)
+
+    def test_get_np_vectors_single_element(self):
+        transformer = TokenTransformer()
+        embeddings = [[1.5, 2.5]]
+        result = transformer.get_np_vectors(embeddings)
+
+        assert isinstance(result, np.ndarray)
+        assert result.dtype == np.float32
+        assert result.shape == (1, 2)
+        assert np.array_equal(result, np.array([[1.5, 2.5]], dtype=np.float32))
+
+    def test_query_to_embeddings(self):
+        transformer = TokenTransformer()
+        query = "This is a test query"
+        result = transformer._query_to_embeddings(query)
+
+        assert isinstance(result, np.ndarray)
+        assert result.shape[0] == 1
+        assert result.shape[1] == 384
+
+    def test_query_to_embeddings_empty_string(self):
+        transformer = TokenTransformer()
+        query = ""
+        result = transformer._query_to_embeddings(query)
+
+        assert isinstance(result, np.ndarray)
+        assert result.shape[0] == 1
+        assert result.shape[1] == 384
+
+    def test_query_to_embeddings_returns_correct_shape(self):
+        transformer = TokenTransformer()
+        query1 = "First query"
+        query2 = "Second query with more words"
+        result1 = transformer._query_to_embeddings(query1)
+        result2 = transformer._query_to_embeddings(query2)
+
+        assert result1.shape == result2.shape
+        assert len(result1.shape) == 2
+        assert result1.shape[0] == 1
+
+    def test_search(self):
+        transformer = TokenTransformer()
+        texts = ["First document", "Second document", "Third document"]
+
+        embeddings = transformer.text_to_embeddings(texts)
+        index = transformer.embeddings_to_index(embeddings)
+
+        result = transformer.search("first", index, texts, context_amount=1)
+        assert isinstance(result, str)
+        assert "First document" in result
+
+        result = transformer.search("document", index, texts, context_amount=2)
+        assert isinstance(result, str)
+        assert len(result.strip().split("\n")) == 2
+
+    def test_search_with_empty_texts(self):
+        transformer = TokenTransformer()
+        texts = []
+
+        if texts:
+            embeddings = transformer.text_to_embeddings(texts)
+            index = transformer.embeddings_to_index(embeddings)
+        else:
+            index = faiss.IndexFlatL2(384)
+
+        result = transformer.search("query", index, texts)
+        assert result == ""
+
+    def test_search_with_different_context_amounts(self):
+        transformer = TokenTransformer()
+        texts = ["Doc 1", "Doc 2", "Doc 3", "Doc 4", "Doc 5"]
+
+        embeddings = transformer.text_to_embeddings(texts)
+        index = transformer.embeddings_to_index(embeddings)
+
+        result1 = transformer.search("Doc", index, texts, context_amount=1)
+        result3 = transformer.search("Doc", index, texts, context_amount=3)
+        result5 = transformer.search("Doc", index, texts, context_amount=5)
+
+        assert len(result1.strip().split("\n")) == 1
+        assert len(result3.strip().split("\n")) == 3
+        assert len(result5.strip().split("\n")) == 5
+
+    def test_create_index(self):
+        transformer = TokenTransformer()
+        texts = ["First document", "Second document", "Third document"]
+
+        token_data = transformer.create_index(texts)
+
+        assert isinstance(token_data, TokenData)
+        assert token_data.texts == texts
+        assert isinstance(token_data.index, faiss.IndexFlatL2)
+        assert isinstance(token_data.embeddings, np.ndarray)
+        assert token_data.embeddings.shape[0] == len(texts)
+        assert token_data.embeddings.shape[1] == 384
+
+    def test_create_index_empty_list(self):
+        transformer = TokenTransformer()
+        texts = []
+
+        with pytest.raises(ValueError, match="The input texts list is empty."):
+            transformer.create_index(texts)
+
+    def test_create_index_single_element(self):
+        transformer = TokenTransformer()
+        texts = ["Single document text"]
+
+        token_data = transformer.create_index(texts)
+
+        assert isinstance(token_data, TokenData)
+        assert token_data.texts == texts
+        assert isinstance(token_data.index, faiss.IndexFlatL2)
+        assert isinstance(token_data.embeddings, np.ndarray)
+        assert token_data.embeddings.shape == (1, 384)
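
These tests hard-code an embedding width of 384. That is the output dimension of sentence-transformers' all-MiniLM-L6-v2, a common default model; that vectoriz uses exactly this model is an assumption here, not something this diff shows. A quick way to confirm the dimension, assuming sentence-transformers is installed:

from sentence_transformers import SentenceTransformer

# all-MiniLM-L6-v2 produces 384-dimensional embeddings (assumed model).
model = SentenceTransformer("all-MiniLM-L6-v2")
print(model.encode(["hello world"]).shape)  # (1, 384)
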
@@ -0,0 +1,474 @@
+import os
+import pytest
+import numpy as np
+import faiss
+from unittest.mock import patch, MagicMock
+from vectoriz.vector_db import VectorDB, VectorDBClient
+from vectoriz.files import FileArgument
+
+
+class TestVectorDB:
+
+    def test_load_saved_data_successful(self, tmp_path):
+        vector_db = VectorDB()
+        faiss_path = str(tmp_path / "test.index")
+        np_path = str(tmp_path / "test.npz")
+
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        mock_file_argument = MagicMock(spec=FileArgument)
+
+        with patch.object(
+            vector_db, "load_faiss_index", return_value=mock_index
+        ) as mock_load_index:
+            with patch.object(
+                vector_db, "load_numpy_embeddings", return_value=mock_file_argument
+            ) as mock_load_embeddings:
+
+                result = vector_db.load_saved_data(faiss_path, np_path)
+
+                assert isinstance(result, VectorDBClient)
+                assert result.faiss_index == mock_index
+                assert result.file_argument == mock_file_argument
+                mock_load_index.assert_called_once_with(faiss_path)
+                mock_load_embeddings.assert_called_once_with(np_path)
+
+    def test_load_saved_data_missing_faiss_index(self, tmp_path):
+        vector_db = VectorDB()
+        faiss_path = str(tmp_path / "nonexistent.index")
+        np_path = str(tmp_path / "test.npz")
+
+        mock_file_argument = MagicMock(spec=FileArgument)
+
+        with patch.object(
+            vector_db, "load_faiss_index", return_value=None
+        ) as mock_load_index:
+            with patch.object(
+                vector_db, "load_numpy_embeddings", return_value=mock_file_argument
+            ) as mock_load_embeddings:
+
+                result = vector_db.load_saved_data(faiss_path, np_path)
+
+                assert result is None
+                mock_load_index.assert_called_once_with(faiss_path)
+                mock_load_embeddings.assert_called_once_with(np_path)
+
+    def test_load_saved_data_missing_numpy_embeddings(self, tmp_path):
+        vector_db = VectorDB()
+        faiss_path = str(tmp_path / "test.index")
+        np_path = str(tmp_path / "nonexistent.npz")
+
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+
+        with patch.object(
+            vector_db, "load_faiss_index", return_value=mock_index
+        ) as mock_load_index:
+            with patch.object(
+                vector_db, "load_numpy_embeddings", return_value=None
+            ) as mock_load_embeddings:
+
+                result = vector_db.load_saved_data(faiss_path, np_path)
+
+                assert result is None
+                mock_load_index.assert_called_once_with(faiss_path)
+                mock_load_embeddings.assert_called_once_with(np_path)
+
+    def test_load_saved_data_both_missing(self, tmp_path):
+        vector_db = VectorDB()
+        faiss_path = str(tmp_path / "nonexistent.index")
+        np_path = str(tmp_path / "nonexistent.npz")
+
+        with patch.object(
+            vector_db, "load_faiss_index", return_value=None
+        ) as mock_load_index:
+            with patch.object(
+                vector_db, "load_numpy_embeddings", return_value=None
+            ) as mock_load_embeddings:
+
+                result = vector_db.load_saved_data(faiss_path, np_path)
+
+                assert result is None
+                mock_load_index.assert_called_once_with(faiss_path)
+                mock_load_embeddings.assert_called_once_with(np_path)
+
+    def test_save_faiss_index(self, tmp_path):
+        vector_db = VectorDB()
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        faiss_path = str(tmp_path / "test")
+
+        with patch("faiss.write_index") as mock_write_index:
+            vector_db.save_faiss_index(mock_index, faiss_path)
+
+            mock_write_index.assert_called_once_with(mock_index, faiss_path + ".index")
+
+    def test_save_faiss_index_with_extension(self, tmp_path):
+        vector_db = VectorDB()
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        faiss_path = str(tmp_path / "test.index")
+
+        with patch("faiss.write_index") as mock_write_index:
+            vector_db.save_faiss_index(mock_index, faiss_path)
+
+            mock_write_index.assert_called_once_with(mock_index, faiss_path)
+
+    def test_save_faiss_index_integration(self, tmp_path):
+        vector_db = VectorDB()
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+
+        sample_vectors = np.random.random((10, dimension)).astype("float32")
+        index.add(sample_vectors)
+        faiss_path = str(tmp_path / "test.index")
+
+        vector_db.save_faiss_index(index, faiss_path)
+
+        assert os.path.exists(faiss_path)
+
+        loaded_index = faiss.read_index(faiss_path)
+        assert loaded_index.ntotal == index.ntotal
+
+    def test_load_faiss_index_successful(self, tmp_path):
+        vector_db = VectorDB()
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+
+        sample_vectors = np.random.random((5, dimension)).astype("float32")
+        index.add(sample_vectors)
+        faiss_path = str(tmp_path / "test.index")
+
+        faiss.write_index(index, faiss_path)
+
+        result = vector_db.load_faiss_index(faiss_path)
+
+        assert result is not None
+        assert result.ntotal == index.ntotal
+        assert result.d == index.d
+
+    def test_load_faiss_index_missing_file(self, tmp_path):
+        vector_db = VectorDB()
+        nonexistent_path = str(tmp_path / "nonexistent.index")
+
+        result = vector_db.load_faiss_index(nonexistent_path)
+
+        assert result is None
+
+    def test_load_faiss_index_with_mock(self):
+        vector_db = VectorDB()
+        test_path = "/mock/path/test.index"
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+
+        with patch("os.path.exists", return_value=True) as mock_exists:
+            with patch("faiss.read_index", return_value=mock_index) as mock_read:
+                result = vector_db.load_faiss_index(test_path)
+
+                mock_exists.assert_called_once_with(test_path)
+                mock_read.assert_called_once_with(test_path)
+                assert result == mock_index
+
+    def test_save_numpy_embeddings_with_ndarray(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test")
+
+        embeddings_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=[],
+            ndarray_data=embeddings_np,
+        )
+
+        with patch("numpy.savez") as mock_savez:
+            vector_db.save_numpy_embeddings(file_arg, np_path)
+
+            mock_savez.assert_called_once_with(
+                np_path + ".npz",
+                embeddings=embeddings_np,
+                chunk_names=chunk_names,
+                texts=texts,
+            )
+
+    def test_save_numpy_embeddings_with_embeddings_list(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test")
+
+        embeddings_list = [[[0.1, 0.2]], [[0.3, 0.4]]]
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=embeddings_list,
+            ndarray_data=None,
+        )
+
+        transformed_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+
+        with patch(
+            "vectoriz.token_transformer.TokenTransformer.get_np_vectors",
+            return_value=transformed_np,
+        ) as mock_transform:
+            with patch("numpy.savez") as mock_savez:
+                vector_db.save_numpy_embeddings(file_arg, np_path)
+
+                mock_transform.assert_called_once_with(embeddings_list)
+                mock_savez.assert_called_once_with(
+                    np_path + ".npz",
+                    embeddings=transformed_np,
+                    chunk_names=chunk_names,
+                    texts=texts,
+                )
+
+    def test_save_numpy_embeddings_with_extension(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test.npz")
+
+        embeddings_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=[],
+            ndarray_data=embeddings_np,
+        )
+
+        with patch("numpy.savez") as mock_savez:
+            vector_db.save_numpy_embeddings(file_arg, np_path)
+
+            mock_savez.assert_called_once_with(
+                np_path, embeddings=embeddings_np, chunk_names=chunk_names, texts=texts
+            )
+
+    def test_save_numpy_embeddings_integration(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test.npz")
+
+        embeddings_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=[],
+            ndarray_data=embeddings_np,
+        )
+
+        vector_db.save_numpy_embeddings(file_arg, np_path)
+
+        assert os.path.exists(np_path)
+        loaded_data = np.load(np_path)
+        assert "embeddings" in loaded_data
+        assert "chunk_names" in loaded_data
+        assert "texts" in loaded_data
+        np.testing.assert_array_equal(loaded_data["embeddings"], embeddings_np)
+        np.testing.assert_array_equal(loaded_data["chunk_names"], chunk_names)
+        np.testing.assert_array_equal(loaded_data["texts"], texts)
+
+    def test_load_numpy_embeddings_successful(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test.npz")
+
+        embeddings_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        np.savez(
+            np_path, embeddings=embeddings_np, chunk_names=chunk_names, texts=texts
+        )
+
+        result = vector_db.load_numpy_embeddings(np_path)
+
+        assert result is not None
+        np.testing.assert_array_equal(result.ndarray_data, embeddings_np)
+        np.testing.assert_array_equal(result.chunk_names, chunk_names)
+        np.testing.assert_array_equal(result.text_list, texts)
+        assert result.embeddings == []
+
+    def test_load_numpy_embeddings_missing_file(self, tmp_path):
+        vector_db = VectorDB()
+        nonexistent_path = str(tmp_path / "nonexistent.npz")
+
+        result = vector_db.load_numpy_embeddings(nonexistent_path)
+
+        assert result is None
+
+    def test_load_numpy_embeddings_with_mock(self):
+        vector_db = VectorDB()
+        test_path = "/mock/path/test.npz"
+
+        mock_data = {
+            "embeddings": np.array([[0.1, 0.2], [0.3, 0.4]]),
+            "chunk_names": np.array(["name1", "name2"]),
+            "texts": np.array(["text1", "text2"]),
+        }
+
+        with patch("os.path.exists", return_value=True) as mock_exists:
+            with patch("numpy.load", return_value=mock_data) as mock_load:
+                result = vector_db.load_numpy_embeddings(test_path)
+
+                mock_exists.assert_called_once_with(test_path)
+                mock_load.assert_called_once_with(test_path)
+                assert isinstance(result, FileArgument)
+                np.testing.assert_array_equal(
+                    result.ndarray_data, mock_data["embeddings"]
+                )
+                np.testing.assert_array_equal(
+                    result.chunk_names, mock_data["chunk_names"]
+                )
+                np.testing.assert_array_equal(result.text_list, mock_data["texts"])
+
+
+class TestVectorDBClient:
+    def test_save_data_successful(self, tmp_path):
+        # Setup
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        mock_file_argument = MagicMock(spec=FileArgument)
+        client = VectorDBClient(mock_index, mock_file_argument)
+
+        faiss_path = str(tmp_path / "test")
+        np_path = str(tmp_path / "test")
+
+        # Test with mocks to verify function calls
+        with patch.object(VectorDB, 'save_faiss_index') as mock_save_faiss:
+            with patch.object(VectorDB, 'save_numpy_embeddings') as mock_save_numpy:
+                client.save_data(faiss_path, np_path)
+
+                mock_save_faiss.assert_called_once_with(mock_index, faiss_path)
+                mock_save_numpy.assert_called_once_with(mock_file_argument, np_path)
+
+    def test_save_data_not_initialized(self):
+        # Test with None values
+        client1 = VectorDBClient(None, MagicMock(spec=FileArgument))
+        client2 = VectorDBClient(MagicMock(spec=faiss.IndexFlatL2), None)
+        client3 = VectorDBClient(None, None)
+
+        with pytest.raises(ValueError, match="FAISS index or file argument is not initialized."):
+            client1.save_data("test.index", "test.npz")
+
+        with pytest.raises(ValueError, match="FAISS index or file argument is not initialized."):
+            client2.save_data("test.index", "test.npz")
+
+        with pytest.raises(ValueError, match="FAISS index or file argument is not initialized."):
+            client3.save_data("test.index", "test.npz")
+
+    def test_save_data_integration(self, tmp_path):
+        # Setup real objects for integration test
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+
+        # Add some vectors to the index
+        sample_vectors = np.random.random((5, dimension)).astype("float32")
+        index.add(sample_vectors)
+
+        # Create file argument
+        chunk_names = np.array(["chunk1", "chunk2", "chunk3", "chunk4", "chunk5"])
+        texts = np.array(["text1", "text2", "text3", "text4", "text5"])
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=[],
+            ndarray_data=sample_vectors
+        )
+
+        # Create client
+        client = VectorDBClient(index, file_arg)
+
+        # Define paths
+        faiss_path = str(tmp_path / "test.index")
+        np_path = str(tmp_path / "test.npz")
+
+        # Save the data
+        client.save_data(faiss_path, np_path)
+
+        # Verify files were created
+        assert os.path.exists(faiss_path)
+        assert os.path.exists(np_path)
+
+        # Load and verify the data
+        loaded_index = faiss.read_index(faiss_path)
+        loaded_data = np.load(np_path)
+
+        assert loaded_index.ntotal == index.ntotal
+        assert loaded_index.d == index.d
+        assert "embeddings" in loaded_data
+        assert "chunk_names" in loaded_data
+        assert "texts" in loaded_data
+        np.testing.assert_array_equal(loaded_data["embeddings"], sample_vectors)
+        np.testing.assert_array_equal(loaded_data["chunk_names"], chunk_names)
+        np.testing.assert_array_equal(loaded_data["texts"], texts)
+
+    def test_load_data_successful(self, tmp_path):
+        # Setup - create data files first
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+        sample_vectors = np.random.random((5, dimension)).astype("float32")
+        index.add(sample_vectors)
+
+        chunk_names = np.array(["chunk1", "chunk2", "chunk3", "chunk4", "chunk5"])
+        texts = np.array(["text1", "text2", "text3", "text4", "text5"])
+
+        # Save test data
+        faiss_path = str(tmp_path / "test.index")
+        np_path = str(tmp_path / "test.npz")
+        faiss.write_index(index, faiss_path)
+        np.savez(np_path, embeddings=sample_vectors, chunk_names=chunk_names, texts=texts)
+
+        # Create client and load data
+        client = VectorDBClient()
+        client.load_data(faiss_path, np_path)
+
+        # Verify data was loaded correctly
+        assert client.faiss_index is not None
+        assert client.file_argument is not None
+        assert client.faiss_index.ntotal == 5
+        assert client.faiss_index.d == dimension
+        np.testing.assert_array_equal(client.file_argument.ndarray_data, sample_vectors)
+        np.testing.assert_array_equal(client.file_argument.chunk_names, chunk_names)
+        np.testing.assert_array_equal(client.file_argument.text_list, texts)
+
+    def test_load_data_with_mocks(self):
+        client = VectorDBClient()
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        mock_file_arg = MagicMock(spec=FileArgument)
+
+        with patch.object(VectorDB, 'load_faiss_index', return_value=mock_index) as mock_load_faiss:
+            with patch.object(VectorDB, 'load_numpy_embeddings', return_value=mock_file_arg) as mock_load_numpy:
+                client.load_data("test.index", "test.npz")
+
+                mock_load_faiss.assert_called_once_with("test.index")
+                mock_load_numpy.assert_called_once_with("test.npz")
+                assert client.faiss_index == mock_index
+                assert client.file_argument == mock_file_arg
+
+    def test_load_data_missing_files(self, tmp_path):
+        client = VectorDBClient()
+        nonexistent_faiss = str(tmp_path / "nonexistent.index")
+        nonexistent_np = str(tmp_path / "nonexistent.npz")
+
+        # Test loading non-existent files
+        client.load_data(nonexistent_faiss, nonexistent_np)
+
+        # When files don't exist, the values should be None
+        assert client.faiss_index is None
+        assert client.file_argument is None
+
+    def test_load_data_partial_missing(self, tmp_path):
+        # Setup - create only one file
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+        faiss_path = str(tmp_path / "test.index")
+        nonexistent_np = str(tmp_path / "nonexistent.npz")
+        faiss.write_index(index, faiss_path)
+
+        # Test loading with one missing file
+        client = VectorDBClient()
+        client.load_data(faiss_path, nonexistent_np)
+
+        # Verify partial loading
+        assert client.faiss_index is not None
+        assert client.file_argument is None
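
The three new suites are plain pytest modules; besides the usual command line, they can be driven programmatically through pytest's documented entry point:

import pytest

# Equivalent to running `pytest tests/ -q` from the project root.
raise SystemExit(pytest.main(["tests/", "-q"]))
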
@@ -2,14 +2,15 @@ import os
 import docx
 import numpy as np
 from typing import Optional
-
+
+from vectoriz.token_transformer import TokenTransformer
 
 class FileArgument:
     def __init__(
         self,
-        chunk_names: list[str],
-        text_list: list[str],
-        embeddings: list[float],
+        chunk_names: list[str] = [],
+        text_list: list[str] = [],
+        embeddings: list[float] = [],
         ndarray_data: Optional[np.ndarray] = None,
     ) -> None:
         """
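
One caution on the new signature: `= []` defaults are evaluated once at definition time and then shared by every call that omits the argument. The tests above construct a fresh FileArgument() per test and will not notice, but the pitfall is easy to hit. A minimal sketch of the problem (illustrative code, not vectoriz itself):

class Bucket:
    def __init__(self, items: list[str] = []) -> None:
        self.items = items  # every default-constructed Bucket shares this one list

a = Bucket()
a.items.append("x")
b = Bucket()
print(b.items)  # ['x'] -- state leaked from `a`

# The usual fix is a None sentinel:
#     def __init__(self, items: Optional[list[str]] = None) -> None:
#         self.items = items if items is not None else []
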
@@ -77,6 +77,22 @@ class TokenTransformer:
         texts: list[str],
         context_amount: int = 1,
     ) -> str:
+        """
+        Searches for the most similar texts to the given query using the provided FAISS index.
+        This method converts the query into an embedding, searches for the k nearest neighbors
+        in the index, and returns the corresponding texts as context.
+        Args:
+            query (str): The search query text
+            index (faiss.IndexFlatL2): A FAISS index containing embeddings for the texts
+            texts (list[str]): A list of texts corresponding to the embeddings in the index
+            context_amount (int, optional): The number of texts to retrieve. Defaults to 1.
+        Returns:
+            str: The concatenated text of the most similar documents, separated by newlines.
+                Returns an empty string if texts or query is empty or None.
+        """
+        if texts is None or len(texts) == 0 or query is None or len(query) == 0:
+            return ""
+
         query_embedding = self._query_to_embeddings(query)
         _, I = index.search(query_embedding, k=context_amount)
         context = ""
@@ -99,6 +115,9 @@ class TokenTransformer:
         Returns:
             faiss.IndexFlatL2: A FAISS index containing the embeddings of the input texts.
         """
+        if len(texts) == 0:
+            raise ValueError("The input texts list is empty.")
+
         embeddings = self.text_to_embeddings(texts)
         index = self.embeddings_to_index(embeddings)
         return TokenData(texts, index, embeddings)
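
A short sketch of how the two new guards behave, grounded in the tests above (the embedding model is downloaded on first use):

from vectoriz.token_transformer import TokenTransformer

transformer = TokenTransformer()
token_data = transformer.create_index(["First document", "Second document"])

# Non-empty query: returns the nearest text(s) joined by newlines.
print(transformer.search("first", token_data.index, token_data.texts))

# Empty query or empty corpus: the new guard returns "" instead of failing.
print(repr(transformer.search("", token_data.index, token_data.texts)))  # ''

# Empty input list: create_index now fails fast.
# transformer.create_index([])  # ValueError: The input texts list is empty.
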
@@ -3,8 +3,8 @@ import faiss
 import numpy as np
 from typing import Optional
 
-from files import FileArgument
-from token_transformer import TokenTransformer
+from vectoriz.files import FileArgument
+from vectoriz.token_transformer import TokenTransformer
 
 
 class VectorDBClient:
@@ -182,7 +182,7 @@ class VectorDB:
         if not os.path.exists(np_db_path):
             return None
 
-        data = np.load(
+        data = np.load(np_db_path)
         embeddings_np = data["embeddings"]
         chunk_names = data["chunk_names"]
         texts = data["texts"]
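
The import fix matters for installed copies: the absolute `from vectoriz...` form resolves anywhere, whereas the old top-level `from files import ...` form presumably only worked when running from inside the source tree, and the second hunk now passes np_db_path to np.load. A round-trip sketch grounded in the integration tests above:

import faiss
import numpy as np
from vectoriz.files import FileArgument
from vectoriz.vector_db import VectorDBClient

# Build a small index plus its numpy sidecar data.
index = faiss.IndexFlatL2(384)
vectors = np.random.random((3, 384)).astype("float32")
index.add(vectors)
file_arg = FileArgument(
    chunk_names=np.array(["a", "b", "c"]),
    text_list=np.array(["t1", "t2", "t3"]),
    embeddings=[],
    ndarray_data=vectors,
)

VectorDBClient(index, file_arg).save_data("db.index", "db.npz")

restored = VectorDBClient()
restored.load_data("db.index", "db.npz")  # load_numpy_embeddings is what 0.1.1b0 fixes
print(restored.faiss_index.ntotal)  # 3
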
@@ -1,40 +0,0 @@
-import pytest
-import numpy as np
-from unittest.mock import patch
-from vectoriz.files import FileArgument
-
-
-def test_add_data_appends_to_lists():
-    file_arg = FileArgument([], [], [])
-    filename = "test.txt"
-    text = "Test content"
-
-    with patch.object(FileArgument, '_create_embedding', return_value=np.array([0.1, 0.2, 0.3])):
-        file_arg.add_data(filename, text)
-
-    assert file_arg.chunk_names == [filename]
-    assert file_arg.text_list == [text]
-    assert len(file_arg.embeddings) == 1
-    np.testing.assert_array_equal(file_arg.embeddings[0], np.array([0.1, 0.2, 0.3]))
-
-def test_add_data_multiple_entries():
-    file_arg = FileArgument(["existing.txt"], ["existing content"], [np.array([0.5, 0.5, 0.5])])
-    filename = "new.txt"
-    text = "New content"
-
-    with patch.object(FileArgument, '_create_embedding', return_value=np.array([0.7, 0.8, 0.9])):
-        file_arg.add_data(filename, text)
-    assert file_arg.chunk_names == ["existing.txt", "new.txt"]
-    assert file_arg.text_list == ["existing content", "New content"]
-    assert len(file_arg.embeddings) == 2
-    np.testing.assert_array_equal(file_arg.embeddings[1], np.array([0.7, 0.8, 0.9]))
-
-def test_add_data_calls_create_embedding():
-    file_arg = FileArgument([], [], [])
-    filename = "test.txt"
-    text = "Test content"
-
-    with patch.object(FileArgument, '_create_embedding') as mock_create_embedding:
-        mock_create_embedding.return_value = np.array([0.1, 0.2, 0.3])
-        file_arg.add_data(filename, text)
-        mock_create_embedding.assert_called_once_with(text)