vectoriz 0.0.5__py3-none-any.whl → 0.1.1b0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tests/test_files.py CHANGED
@@ -1,40 +1,165 @@
+import docx
 import pytest
 import numpy as np
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
 from vectoriz.files import FileArgument
+from vectoriz.files import FilesFeature


-def test_add_data_appends_to_lists():
-    file_arg = FileArgument([], [], [])
-    filename = "test.txt"
-    text = "Test content"
-
-    with patch.object(FileArgument, '_create_embedding', return_value=np.array([0.1, 0.2, 0.3])):
-        file_arg.add_data(filename, text)
-
-    assert file_arg.chunk_names == [filename]
-    assert file_arg.text_list == [text]
-    assert len(file_arg.embeddings) == 1
-    np.testing.assert_array_equal(file_arg.embeddings[0], np.array([0.1, 0.2, 0.3]))
-
-def test_add_data_multiple_entries():
-    file_arg = FileArgument(["existing.txt"], ["existing content"], [np.array([0.5, 0.5, 0.5])])
-    filename = "new.txt"
-    text = "New content"
-
-    with patch.object(FileArgument, '_create_embedding', return_value=np.array([0.7, 0.8, 0.9])):
-        file_arg.add_data(filename, text)
-        assert file_arg.chunk_names == ["existing.txt", "new.txt"]
-        assert file_arg.text_list == ["existing content", "New content"]
-        assert len(file_arg.embeddings) == 2
-        np.testing.assert_array_equal(file_arg.embeddings[1], np.array([0.7, 0.8, 0.9]))
-
-def test_add_data_calls_create_embedding():
-    file_arg = FileArgument([], [], [])
-    filename = "test.txt"
-    text = "Test content"
-
-    with patch.object(FileArgument, '_create_embedding') as mock_create_embedding:
-        mock_create_embedding.return_value = np.array([0.1, 0.2, 0.3])
-        file_arg.add_data(filename, text)
-        mock_create_embedding.assert_called_once_with(text)
+class TestFileArgument:
+    def test_add_data_appends_to_lists(self):
+        file_arg = FileArgument()
+        filename = "test.txt"
+        text = "Test content"
+
+        with patch.object(
+            FileArgument, "_create_embedding", return_value=np.array([0.1, 0.2, 0.3])
+        ):
+            file_arg.add_data(filename, text)
+
+        assert file_arg.chunk_names == [filename]
+        assert file_arg.text_list == [text]
+        assert len(file_arg.embeddings) == 1
+        np.testing.assert_array_equal(
+            file_arg.embeddings[0], np.array([0.1, 0.2, 0.3])
+        )
+
+    def test_add_data_multiple_entries(self):
+        file_arg = FileArgument(
+            ["existing.txt"], ["existing content"], [np.array([0.5, 0.5, 0.5])]
+        )
+        filename = "new.txt"
+        text = "New content"
+
+        with patch.object(
+            FileArgument, "_create_embedding", return_value=np.array([0.7, 0.8, 0.9])
+        ):
+            file_arg.add_data(filename, text)
+            assert file_arg.chunk_names == ["existing.txt", "new.txt"]
+            assert file_arg.text_list == ["existing content", "New content"]
+            assert len(file_arg.embeddings) == 2
+            np.testing.assert_array_equal(
+                file_arg.embeddings[1], np.array([0.7, 0.8, 0.9])
+            )
+
+    def test_add_data_calls_create_embedding(self):
+        file_arg = FileArgument()
+        filename = "test.txt"
+        text = "Test content"
+
+        with patch.object(FileArgument, "_create_embedding") as mock_create_embedding:
+            mock_create_embedding.return_value = np.array([0.1, 0.2, 0.3])
+            file_arg.add_data(filename, text)
+            mock_create_embedding.assert_called_once_with(text)
+
+    def test_create_embedding_returns_numpy_array(self):
+        file_arg = FileArgument()
+        text = "Test content"
+
+        with patch("vectoriz.files.TokenTransformer") as mock_transformer:
+            mock_instance = mock_transformer.return_value
+            mock_instance.text_to_embeddings.return_value = [np.array([0.1, 0.2, 0.3])]
+
+            result = file_arg._create_embedding(text)
+
+            assert isinstance(result, np.ndarray)
+            np.testing.assert_array_equal(result, np.array([0.1, 0.2, 0.3]))
+            mock_instance.text_to_embeddings.assert_called_once_with([text])
+
+    def test_create_embedding_handles_empty_text(self):
+        file_arg = FileArgument()
+        text = ""
+
+        with patch("vectoriz.files.TokenTransformer") as mock_transformer:
+            mock_instance = mock_transformer.return_value
+            mock_instance.text_to_embeddings.return_value = [np.array([0.0, 0.0, 0.0])]
+
+            result = file_arg._create_embedding(text)
+
+            assert isinstance(result, np.ndarray)
+            mock_instance.text_to_embeddings.assert_called_once_with([""])
+
+    def test_create_embedding_instantiates_token_transformer(self):
+        file_arg = FileArgument()
+        text = "Test content"
+
+        with patch("vectoriz.files.TokenTransformer") as mock_transformer:
+            mock_instance = mock_transformer.return_value
+            mock_instance.text_to_embeddings.return_value = [np.array([0.1, 0.2, 0.3])]
+
+            file_arg._create_embedding(text)
+
+            mock_transformer.assert_called_once()
+
+
+class TestFilesFeature:
+    def test_extract_txt_content_reads_file_correctly(self, tmp_path):
+        test_content = "This is test content"
+        test_file = tmp_path / "test.txt"
+        test_file.write_text(test_content)
+        files_feature = FilesFeature()
+        result = files_feature._extract_txt_content(str(tmp_path), "test.txt")
+        assert result == test_content
+
+    def test_extract_txt_content_with_unicode_chars(self, tmp_path):
+        test_content = "Unicode content: àáâãäåæç"
+        test_file = tmp_path / "unicode.txt"
+        test_file.write_text(test_content, encoding="utf-8")
+        files_feature = FilesFeature()
+        result = files_feature._extract_txt_content(str(tmp_path), "unicode.txt")
+        assert result == test_content
+
+    def test_extract_txt_content_raises_file_not_found(self):
+        files_feature = FilesFeature()
+        with pytest.raises(FileNotFoundError):
+            files_feature._extract_txt_content(
+                "/non_existent_dir", "non_existent_file.txt"
+            )
+
+    def test_extract_docx_content_reads_file_correctly(self, tmp_path, monkeypatch):
+        mock_doc = MagicMock()
+        mock_paragraph1 = MagicMock()
+        mock_paragraph1.text = "Paragraph 1"
+        mock_paragraph2 = MagicMock()
+        mock_paragraph2.text = "Paragraph 2"
+        mock_doc.paragraphs = [mock_paragraph1, mock_paragraph2]
+
+        monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
+        files_feature = FilesFeature()
+        result = files_feature._extract_docx_content(str(tmp_path), "test.docx")
+
+        assert result == "Paragraph 1\nParagraph 2"
+
+    def test_extract_docx_content_skips_empty_paragraphs(self, tmp_path, monkeypatch):
+        mock_doc = MagicMock()
+        mock_paragraph1 = MagicMock()
+        mock_paragraph1.text = "Paragraph 1"
+        mock_paragraph2 = MagicMock()
+        mock_paragraph2.text = " "
+        mock_paragraph3 = MagicMock()
+        mock_paragraph3.text = "Paragraph 3"
+        mock_doc.paragraphs = [mock_paragraph1, mock_paragraph2, mock_paragraph3]
+
+        monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
+        files_feature = FilesFeature()
+        result = files_feature._extract_docx_content(str(tmp_path), "test.docx")
+
+        assert result == "Paragraph 1\nParagraph 3"
+
+    def test_extract_docx_content_exception_handling(self, tmp_path, monkeypatch):
+        def mock_document(_):
+            raise Exception("Failed to open document")
+
+        monkeypatch.setattr(docx, "Document", mock_document)
+
+        files_feature = FilesFeature()
+        with pytest.raises(Exception):
+            files_feature._extract_docx_content(str(tmp_path), "invalid.docx")
+
+    def test_extract_docx_content_with_no_paragraphs(self, tmp_path, monkeypatch):
+        mock_doc = MagicMock()
+        mock_doc.paragraphs = []
+        monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
+        files_feature = FilesFeature()
+        result = files_feature._extract_docx_content(str(tmp_path), "empty.docx")
+        assert result == ""
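The rewritten suite above pins down the FileArgument/FilesFeature surface. As a quick orientation, here is a minimal usage sketch assembled only from the calls these tests exercise; the paths and strings are illustrative, and instantiating the real TokenTransformer requires its embedding model to be available:

    import numpy as np
    from vectoriz.files import FileArgument, FilesFeature

    # FileArgument() with no arguments relies on the defaults added in 0.1.1b0
    file_arg = FileArgument()
    file_arg.add_data("notes.txt", "Some text")  # computes and appends an embedding
    assert file_arg.chunk_names == ["notes.txt"]
    assert isinstance(file_arg.embeddings[0], np.ndarray)

    # FilesFeature reads raw text from disk (private helper; illustrative path)
    content = FilesFeature()._extract_txt_content("/some/dir", "notes.txt")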
tests/test_token_transformer.py ADDED
@@ -0,0 +1,193 @@
+import pytest
+import faiss
+import numpy as np
+from unittest.mock import Mock
+from vectoriz.token_transformer import TokenData, TokenTransformer
+
+
+class TestTokenData:
+
+    def test_from_vector_db(self):
+        mock_vector_data = Mock()
+
+        mock_index = faiss.IndexFlatL2(5)
+        mock_embeddings = np.random.random((3, 5)).astype("float32")
+        mock_texts = ["text1", "text2", "text3"]
+
+        mock_file_argument = Mock()
+        mock_file_argument.embeddings = mock_embeddings
+        mock_file_argument.text_list = mock_texts
+        mock_vector_data.faiss_index = mock_index
+        mock_vector_data.file_argument = mock_file_argument
+        token_data = TokenData.from_vector_db(mock_vector_data)
+
+        assert token_data.texts == mock_texts
+        assert token_data.index == mock_index
+        assert np.array_equal(token_data.embeddings, mock_embeddings)
+        assert isinstance(token_data, TokenData)
+
+    def test_from_file_argument(self):
+        mock_file_argument = Mock()
+        mock_file_argument.embeddings = np.random.random((3, 5)).astype("float32")
+        mock_file_argument.text_list = ["text1", "text2", "text3"]
+        mock_index = faiss.IndexFlatL2(5)
+        token_data = TokenData.from_file_argument(mock_file_argument, mock_index)
+
+        assert token_data.texts == mock_file_argument.text_list
+        assert token_data.index == mock_index
+        assert np.array_equal(token_data.embeddings, mock_file_argument.embeddings)
+        assert isinstance(token_data, TokenData)
+
+
+class TestTokenTransformer:
+
+    def test_text_to_embeddings(self):
+        transformer = TokenTransformer()
+        sentences = ["This is a test sentence.", "Another test sentence."]
+        embeddings = transformer.text_to_embeddings(sentences)
+
+        assert isinstance(embeddings, np.ndarray)
+        assert embeddings.shape[0] == len(sentences)
+        assert embeddings.shape[1] > 0
+
+    def test_text_to_embeddings_empty_list(self):
+        transformer = TokenTransformer()
+        sentences = []
+        embeddings = transformer.text_to_embeddings(sentences)
+
+        assert isinstance(embeddings, np.ndarray)
+
+    def test_get_np_vectors(self):
+        transformer = TokenTransformer()
+        embeddings = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
+        result = transformer.get_np_vectors(embeddings)
+
+        assert isinstance(result, np.ndarray)
+        assert result.dtype == np.float32
+        assert result.shape == (2, 3)
+        assert np.array_equal(
+            result, np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
+        )
+
+    def test_get_np_vectors_empty_list(self):
+        transformer = TokenTransformer()
+        embeddings = []
+        result = transformer.get_np_vectors(embeddings)
+
+        assert isinstance(result, np.ndarray)
+        assert result.dtype == np.float32
+        assert result.shape == (0,)
+
+    def test_get_np_vectors_single_element(self):
+        transformer = TokenTransformer()
+        embeddings = [[1.5, 2.5]]
+        result = transformer.get_np_vectors(embeddings)
+
+        assert isinstance(result, np.ndarray)
+        assert result.dtype == np.float32
+        assert result.shape == (1, 2)
+        assert np.array_equal(result, np.array([[1.5, 2.5]], dtype=np.float32))
+
+    def test_query_to_embeddings(self):
+        transformer = TokenTransformer()
+        query = "This is a test query"
+        result = transformer._query_to_embeddings(query)
+
+        assert isinstance(result, np.ndarray)
+        assert result.shape[0] == 1
+        assert result.shape[1] == 384
+
+    def test_query_to_embeddings_empty_string(self):
+        transformer = TokenTransformer()
+        query = ""
+        result = transformer._query_to_embeddings(query)
+
+        assert isinstance(result, np.ndarray)
+        assert result.shape[0] == 1
+        assert result.shape[1] == 384
+
+    def test_query_to_embeddings_returns_correct_shape(self):
+        transformer = TokenTransformer()
+        query1 = "First query"
+        query2 = "Second query with more words"
+        result1 = transformer._query_to_embeddings(query1)
+        result2 = transformer._query_to_embeddings(query2)
+
+        assert result1.shape == result2.shape
+        assert len(result1.shape) == 2
+        assert result1.shape[0] == 1
+
+    def test_search(self):
+        transformer = TokenTransformer()
+        texts = ["First document", "Second document", "Third document"]
+
+        embeddings = transformer.text_to_embeddings(texts)
+        index = transformer.embeddings_to_index(embeddings)
+
+        result = transformer.search("first", index, texts, context_amount=1)
+        assert isinstance(result, str)
+        assert "First document" in result
+
+        result = transformer.search("document", index, texts, context_amount=2)
+        assert isinstance(result, str)
+        assert len(result.strip().split("\n")) == 2
+
+    def test_search_with_empty_texts(self):
+        transformer = TokenTransformer()
+        texts = []
+
+        if texts:
+            embeddings = transformer.text_to_embeddings(texts)
+            index = transformer.embeddings_to_index(embeddings)
+        else:
+            index = faiss.IndexFlatL2(384)
+
+        result = transformer.search("query", index, texts)
+        assert result == ""
+
+    def test_search_with_different_context_amounts(self):
+        transformer = TokenTransformer()
+        texts = ["Doc 1", "Doc 2", "Doc 3", "Doc 4", "Doc 5"]
+
+        embeddings = transformer.text_to_embeddings(texts)
+        index = transformer.embeddings_to_index(embeddings)
+
+        result1 = transformer.search("Doc", index, texts, context_amount=1)
+        result3 = transformer.search("Doc", index, texts, context_amount=3)
+        result5 = transformer.search("Doc", index, texts, context_amount=5)
+
+        assert len(result1.strip().split("\n")) == 1
+        assert len(result3.strip().split("\n")) == 3
+        assert len(result5.strip().split("\n")) == 5
+
+    def test_create_index(self):
+        transformer = TokenTransformer()
+        texts = ["First document", "Second document", "Third document"]
+
+        token_data = transformer.create_index(texts)
+
+        assert isinstance(token_data, TokenData)
+        assert token_data.texts == texts
+        assert isinstance(token_data.index, faiss.IndexFlatL2)
+        assert isinstance(token_data.embeddings, np.ndarray)
+        assert token_data.embeddings.shape[0] == len(texts)
+        assert token_data.embeddings.shape[1] == 384
+
+    def test_create_index_empty_list(self):
+        transformer = TokenTransformer()
+        texts = []
+
+        with pytest.raises(ValueError, match="The input texts list is empty."):
+            transformer.create_index(texts)
+
+    def test_create_index_single_element(self):
+        transformer = TokenTransformer()
+        texts = ["Single document text"]
+
+        token_data = transformer.create_index(texts)
+
+        assert isinstance(token_data, TokenData)
+        assert token_data.texts == texts
+        assert isinstance(token_data.index, faiss.IndexFlatL2)
+        assert isinstance(token_data.embeddings, np.ndarray)
+        assert token_data.embeddings.shape == (1, 384)
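For context, the index-and-search round trip this new suite covers reduces to a few calls. A sketch assembled from the API as exercised above (texts and query are illustrative; TokenTransformer needs its embedding model available):

    from vectoriz.token_transformer import TokenTransformer

    transformer = TokenTransformer()
    texts = ["First document", "Second document", "Third document"]

    # create_index embeds the texts and wraps them in a TokenData
    # (it raises ValueError on an empty list as of 0.1.1b0)
    token_data = transformer.create_index(texts)

    # search returns the nearest texts joined by newlines, or "" on empty input
    context = transformer.search("first", token_data.index, texts, context_amount=2)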
tests/test_vector_db.py ADDED
@@ -0,0 +1,474 @@
+import os
+import pytest
+import numpy as np
+import faiss
+from unittest.mock import patch, MagicMock
+from vectoriz.vector_db import VectorDB, VectorDBClient
+from vectoriz.files import FileArgument
+
+
+class TestVectorDB:
+
+    def test_load_saved_data_successful(self, tmp_path):
+        vector_db = VectorDB()
+        faiss_path = str(tmp_path / "test.index")
+        np_path = str(tmp_path / "test.npz")
+
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        mock_file_argument = MagicMock(spec=FileArgument)
+
+        with patch.object(
+            vector_db, "load_faiss_index", return_value=mock_index
+        ) as mock_load_index:
+            with patch.object(
+                vector_db, "load_numpy_embeddings", return_value=mock_file_argument
+            ) as mock_load_embeddings:
+
+                result = vector_db.load_saved_data(faiss_path, np_path)
+
+                assert isinstance(result, VectorDBClient)
+                assert result.faiss_index == mock_index
+                assert result.file_argument == mock_file_argument
+                mock_load_index.assert_called_once_with(faiss_path)
+                mock_load_embeddings.assert_called_once_with(np_path)
+
+    def test_load_saved_data_missing_faiss_index(self, tmp_path):
+        vector_db = VectorDB()
+        faiss_path = str(tmp_path / "nonexistent.index")
+        np_path = str(tmp_path / "test.npz")
+
+        mock_file_argument = MagicMock(spec=FileArgument)
+
+        with patch.object(
+            vector_db, "load_faiss_index", return_value=None
+        ) as mock_load_index:
+            with patch.object(
+                vector_db, "load_numpy_embeddings", return_value=mock_file_argument
+            ) as mock_load_embeddings:
+
+                result = vector_db.load_saved_data(faiss_path, np_path)
+
+                assert result is None
+                mock_load_index.assert_called_once_with(faiss_path)
+                mock_load_embeddings.assert_called_once_with(np_path)
+
+    def test_load_saved_data_missing_numpy_embeddings(self, tmp_path):
+        vector_db = VectorDB()
+        faiss_path = str(tmp_path / "test.index")
+        np_path = str(tmp_path / "nonexistent.npz")
+
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+
+        with patch.object(
+            vector_db, "load_faiss_index", return_value=mock_index
+        ) as mock_load_index:
+            with patch.object(
+                vector_db, "load_numpy_embeddings", return_value=None
+            ) as mock_load_embeddings:
+
+                result = vector_db.load_saved_data(faiss_path, np_path)
+
+                assert result is None
+                mock_load_index.assert_called_once_with(faiss_path)
+                mock_load_embeddings.assert_called_once_with(np_path)
+
+    def test_load_saved_data_both_missing(self, tmp_path):
+        vector_db = VectorDB()
+        faiss_path = str(tmp_path / "nonexistent.index")
+        np_path = str(tmp_path / "nonexistent.npz")
+
+        with patch.object(
+            vector_db, "load_faiss_index", return_value=None
+        ) as mock_load_index:
+            with patch.object(
+                vector_db, "load_numpy_embeddings", return_value=None
+            ) as mock_load_embeddings:
+
+                result = vector_db.load_saved_data(faiss_path, np_path)
+
+                assert result is None
+                mock_load_index.assert_called_once_with(faiss_path)
+                mock_load_embeddings.assert_called_once_with(np_path)
+
+    def test_save_faiss_index(self, tmp_path):
+        vector_db = VectorDB()
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        faiss_path = str(tmp_path / "test")
+
+        with patch("faiss.write_index") as mock_write_index:
+            vector_db.save_faiss_index(mock_index, faiss_path)
+
+            mock_write_index.assert_called_once_with(mock_index, faiss_path + ".index")
+
+    def test_save_faiss_index_with_extension(self, tmp_path):
+        vector_db = VectorDB()
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        faiss_path = str(tmp_path / "test.index")
+
+        with patch("faiss.write_index") as mock_write_index:
+            vector_db.save_faiss_index(mock_index, faiss_path)
+
+            mock_write_index.assert_called_once_with(mock_index, faiss_path)
+
+    def test_save_faiss_index_integration(self, tmp_path):
+        vector_db = VectorDB()
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+
+        sample_vectors = np.random.random((10, dimension)).astype("float32")
+        index.add(sample_vectors)
+        faiss_path = str(tmp_path / "test.index")
+
+        vector_db.save_faiss_index(index, faiss_path)
+
+        assert os.path.exists(faiss_path)
+
+        loaded_index = faiss.read_index(faiss_path)
+        assert loaded_index.ntotal == index.ntotal
+
+    def test_load_faiss_index_successful(self, tmp_path):
+        vector_db = VectorDB()
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+
+        sample_vectors = np.random.random((5, dimension)).astype("float32")
+        index.add(sample_vectors)
+        faiss_path = str(tmp_path / "test.index")
+
+        faiss.write_index(index, faiss_path)
+
+        result = vector_db.load_faiss_index(faiss_path)
+
+        assert result is not None
+        assert result.ntotal == index.ntotal
+        assert result.d == index.d
+
+    def test_load_faiss_index_missing_file(self, tmp_path):
+        vector_db = VectorDB()
+        nonexistent_path = str(tmp_path / "nonexistent.index")
+
+        result = vector_db.load_faiss_index(nonexistent_path)
+
+        assert result is None
+
+    def test_load_faiss_index_with_mock(self):
+        vector_db = VectorDB()
+        test_path = "/mock/path/test.index"
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+
+        with patch("os.path.exists", return_value=True) as mock_exists:
+            with patch("faiss.read_index", return_value=mock_index) as mock_read:
+                result = vector_db.load_faiss_index(test_path)
+
+                mock_exists.assert_called_once_with(test_path)
+                mock_read.assert_called_once_with(test_path)
+                assert result == mock_index
+
+    def test_save_numpy_embeddings_with_ndarray(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test")
+
+        embeddings_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=[],
+            ndarray_data=embeddings_np,
+        )
+
+        with patch("numpy.savez") as mock_savez:
+            vector_db.save_numpy_embeddings(file_arg, np_path)
+
+            mock_savez.assert_called_once_with(
+                np_path + ".npz",
+                embeddings=embeddings_np,
+                chunk_names=chunk_names,
+                texts=texts,
+            )
+
+    def test_save_numpy_embeddings_with_embeddings_list(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test")
+
+        embeddings_list = [[[0.1, 0.2]], [[0.3, 0.4]]]
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=embeddings_list,
+            ndarray_data=None,
+        )
+
+        transformed_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+
+        with patch(
+            "vectoriz.token_transformer.TokenTransformer.get_np_vectors",
+            return_value=transformed_np,
+        ) as mock_transform:
+            with patch("numpy.savez") as mock_savez:
+                vector_db.save_numpy_embeddings(file_arg, np_path)
+
+                mock_transform.assert_called_once_with(embeddings_list)
+                mock_savez.assert_called_once_with(
+                    np_path + ".npz",
+                    embeddings=transformed_np,
+                    chunk_names=chunk_names,
+                    texts=texts,
+                )
+
+    def test_save_numpy_embeddings_with_extension(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test.npz")
+
+        embeddings_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=[],
+            ndarray_data=embeddings_np,
+        )
+
+        with patch("numpy.savez") as mock_savez:
+            vector_db.save_numpy_embeddings(file_arg, np_path)
+
+            mock_savez.assert_called_once_with(
+                np_path, embeddings=embeddings_np, chunk_names=chunk_names, texts=texts
+            )
+
+    def test_save_numpy_embeddings_integration(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test.npz")
+
+        embeddings_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=[],
+            ndarray_data=embeddings_np,
+        )
+
+        vector_db.save_numpy_embeddings(file_arg, np_path)
+
+        assert os.path.exists(np_path)
+        loaded_data = np.load(np_path)
+        assert "embeddings" in loaded_data
+        assert "chunk_names" in loaded_data
+        assert "texts" in loaded_data
+        np.testing.assert_array_equal(loaded_data["embeddings"], embeddings_np)
+        np.testing.assert_array_equal(loaded_data["chunk_names"], chunk_names)
+        np.testing.assert_array_equal(loaded_data["texts"], texts)
+
+    def test_load_numpy_embeddings_successful(self, tmp_path):
+        vector_db = VectorDB()
+        np_path = str(tmp_path / "test.npz")
+
+        embeddings_np = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
+        chunk_names = np.array(["chunk1", "chunk2"])
+        texts = np.array(["text1", "text2"])
+
+        np.savez(
+            np_path, embeddings=embeddings_np, chunk_names=chunk_names, texts=texts
+        )
+
+        result = vector_db.load_numpy_embeddings(np_path)
+
+        assert result is not None
+        np.testing.assert_array_equal(result.ndarray_data, embeddings_np)
+        np.testing.assert_array_equal(result.chunk_names, chunk_names)
+        np.testing.assert_array_equal(result.text_list, texts)
+        assert result.embeddings == []
+
+    def test_load_numpy_embeddings_missing_file(self, tmp_path):
+        vector_db = VectorDB()
+        nonexistent_path = str(tmp_path / "nonexistent.npz")
+
+        result = vector_db.load_numpy_embeddings(nonexistent_path)
+
+        assert result is None
+
+    def test_load_numpy_embeddings_with_mock(self):
+        vector_db = VectorDB()
+        test_path = "/mock/path/test.npz"
+
+        mock_data = {
+            "embeddings": np.array([[0.1, 0.2], [0.3, 0.4]]),
+            "chunk_names": np.array(["name1", "name2"]),
+            "texts": np.array(["text1", "text2"]),
+        }
+
+        with patch("os.path.exists", return_value=True) as mock_exists:
+            with patch("numpy.load", return_value=mock_data) as mock_load:
+                result = vector_db.load_numpy_embeddings(test_path)
+
+                mock_exists.assert_called_once_with(test_path)
+                mock_load.assert_called_once_with(test_path)
+                assert isinstance(result, FileArgument)
+                np.testing.assert_array_equal(
+                    result.ndarray_data, mock_data["embeddings"]
+                )
+                np.testing.assert_array_equal(
+                    result.chunk_names, mock_data["chunk_names"]
+                )
+                np.testing.assert_array_equal(result.text_list, mock_data["texts"])
+
+
+class TestVectorDBClient:
+    def test_save_data_successful(self, tmp_path):
+        # Setup
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        mock_file_argument = MagicMock(spec=FileArgument)
+        client = VectorDBClient(mock_index, mock_file_argument)
+
+        faiss_path = str(tmp_path / "test")
+        np_path = str(tmp_path / "test")
+
+        # Test with mocks to verify function calls
+        with patch.object(VectorDB, 'save_faiss_index') as mock_save_faiss:
+            with patch.object(VectorDB, 'save_numpy_embeddings') as mock_save_numpy:
+                client.save_data(faiss_path, np_path)
+
+                mock_save_faiss.assert_called_once_with(mock_index, faiss_path)
+                mock_save_numpy.assert_called_once_with(mock_file_argument, np_path)
+
+    def test_save_data_not_initialized(self):
+        # Test with None values
+        client1 = VectorDBClient(None, MagicMock(spec=FileArgument))
+        client2 = VectorDBClient(MagicMock(spec=faiss.IndexFlatL2), None)
+        client3 = VectorDBClient(None, None)
+
+        with pytest.raises(ValueError, match="FAISS index or file argument is not initialized."):
+            client1.save_data("test.index", "test.npz")
+
+        with pytest.raises(ValueError, match="FAISS index or file argument is not initialized."):
+            client2.save_data("test.index", "test.npz")
+
+        with pytest.raises(ValueError, match="FAISS index or file argument is not initialized."):
+            client3.save_data("test.index", "test.npz")
+
+    def test_save_data_integration(self, tmp_path):
+        # Setup real objects for integration test
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+
+        # Add some vectors to the index
+        sample_vectors = np.random.random((5, dimension)).astype("float32")
+        index.add(sample_vectors)
+
+        # Create file argument
+        chunk_names = np.array(["chunk1", "chunk2", "chunk3", "chunk4", "chunk5"])
+        texts = np.array(["text1", "text2", "text3", "text4", "text5"])
+        file_arg = FileArgument(
+            chunk_names=chunk_names,
+            text_list=texts,
+            embeddings=[],
+            ndarray_data=sample_vectors
+        )
+
+        # Create client
+        client = VectorDBClient(index, file_arg)
+
+        # Define paths
+        faiss_path = str(tmp_path / "test.index")
+        np_path = str(tmp_path / "test.npz")
+
+        # Save the data
+        client.save_data(faiss_path, np_path)
+
+        # Verify files were created
+        assert os.path.exists(faiss_path)
+        assert os.path.exists(np_path)
+
+        # Load and verify the data
+        loaded_index = faiss.read_index(faiss_path)
+        loaded_data = np.load(np_path)
+
+        assert loaded_index.ntotal == index.ntotal
+        assert loaded_index.d == index.d
+        assert "embeddings" in loaded_data
+        assert "chunk_names" in loaded_data
+        assert "texts" in loaded_data
+        np.testing.assert_array_equal(loaded_data["embeddings"], sample_vectors)
+        np.testing.assert_array_equal(loaded_data["chunk_names"], chunk_names)
+        np.testing.assert_array_equal(loaded_data["texts"], texts)
+
+    def test_load_data_successful(self, tmp_path):
+        # Setup - create data files first
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+        sample_vectors = np.random.random((5, dimension)).astype("float32")
+        index.add(sample_vectors)
+
+        chunk_names = np.array(["chunk1", "chunk2", "chunk3", "chunk4", "chunk5"])
+        texts = np.array(["text1", "text2", "text3", "text4", "text5"])
+
+        # Save test data
+        faiss_path = str(tmp_path / "test.index")
+        np_path = str(tmp_path / "test.npz")
+        faiss.write_index(index, faiss_path)
+        np.savez(np_path, embeddings=sample_vectors, chunk_names=chunk_names, texts=texts)
+
+        # Create client and load data
+        client = VectorDBClient()
+        client.load_data(faiss_path, np_path)
+
+        # Verify data was loaded correctly
+        assert client.faiss_index is not None
+        assert client.file_argument is not None
+        assert client.faiss_index.ntotal == 5
+        assert client.faiss_index.d == dimension
+        np.testing.assert_array_equal(client.file_argument.ndarray_data, sample_vectors)
+        np.testing.assert_array_equal(client.file_argument.chunk_names, chunk_names)
+        np.testing.assert_array_equal(client.file_argument.text_list, texts)
+
+    def test_load_data_with_mocks(self):
+        client = VectorDBClient()
+        mock_index = MagicMock(spec=faiss.IndexFlatL2)
+        mock_file_arg = MagicMock(spec=FileArgument)
+
+        with patch.object(VectorDB, 'load_faiss_index', return_value=mock_index) as mock_load_faiss:
+            with patch.object(VectorDB, 'load_numpy_embeddings', return_value=mock_file_arg) as mock_load_numpy:
+                client.load_data("test.index", "test.npz")
+
+                mock_load_faiss.assert_called_once_with("test.index")
+                mock_load_numpy.assert_called_once_with("test.npz")
+                assert client.faiss_index == mock_index
+                assert client.file_argument == mock_file_arg
+
+    def test_load_data_missing_files(self, tmp_path):
+        client = VectorDBClient()
+        nonexistent_faiss = str(tmp_path / "nonexistent.index")
+        nonexistent_np = str(tmp_path / "nonexistent.npz")
+
+        # Test loading non-existent files
+        client.load_data(nonexistent_faiss, nonexistent_np)
+
+        # When files don't exist, the values should be None
+        assert client.faiss_index is None
+        assert client.file_argument is None
+
+    def test_load_data_partial_missing(self, tmp_path):
+        # Setup - create only one file
+        dimension = 128
+        index = faiss.IndexFlatL2(dimension)
+        faiss_path = str(tmp_path / "test.index")
+        nonexistent_np = str(tmp_path / "nonexistent.npz")
+        faiss.write_index(index, faiss_path)
+
+        # Test loading with one missing file
+        client = VectorDBClient()
+        client.load_data(faiss_path, nonexistent_np)
+
+        # Verify partial loading
+        assert client.faiss_index is not None
+        assert client.file_argument is None
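The integration tests above amount to the following save/load round trip. A sketch with illustrative paths and data, using only the constructors and methods exercised in this suite:

    import faiss
    import numpy as np
    from vectoriz.files import FileArgument
    from vectoriz.vector_db import VectorDBClient

    dimension = 128
    index = faiss.IndexFlatL2(dimension)
    vectors = np.random.random((5, dimension)).astype("float32")
    index.add(vectors)

    file_arg = FileArgument(
        chunk_names=np.array(["c1", "c2", "c3", "c4", "c5"]),
        text_list=np.array(["t1", "t2", "t3", "t4", "t5"]),
        embeddings=[],
        ndarray_data=vectors,
    )

    client = VectorDBClient(index, file_arg)
    client.save_data("/tmp/db.index", "/tmp/db.npz")  # ValueError if either attribute is None

    restored = VectorDBClient()
    restored.load_data("/tmp/db.index", "/tmp/db.npz")  # missing files simply load as None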
vectoriz/files.py CHANGED
@@ -2,14 +2,15 @@ import os
 import docx
 import numpy as np
 from typing import Optional
-from token_transformer import TokenTransformer
+
+from vectoriz.token_transformer import TokenTransformer
 
 class FileArgument:
     def __init__(
         self,
-        chunk_names: list[str],
-        text_list: list[str],
-        embeddings: list[float],
+        chunk_names: list[str] = [],
+        text_list: list[str] = [],
+        embeddings: list[float] = [],
         ndarray_data: Optional[np.ndarray] = None,
     ) -> None:
         """
vectoriz/token_transformer.py CHANGED
@@ -77,6 +77,22 @@ class TokenTransformer:
         texts: list[str],
         context_amount: int = 1,
     ) -> str:
+        """
+        Searches for the most similar texts to the given query using the provided FAISS index.
+        This method converts the query into an embedding, searches for the k nearest neighbors
+        in the index, and returns the corresponding texts as context.
+        Args:
+            query (str): The search query text
+            index (faiss.IndexFlatL2): A FAISS index containing embeddings for the texts
+            texts (list[str]): A list of texts corresponding to the embeddings in the index
+            context_amount (int, optional): The number of texts to retrieve. Defaults to 1.
+        Returns:
+            str: The concatenated text of the most similar documents, separated by newlines.
+            Returns an empty string if texts or query is empty or None.
+        """
+        if texts is None or len(texts) == 0 or query is None or len(query) == 0:
+            return ""
+
         query_embedding = self._query_to_embeddings(query)
         _, I = index.search(query_embedding, k=context_amount)
         context = ""
@@ -99,6 +115,9 @@ class TokenTransformer:
         Returns:
             faiss.IndexFlatL2: A FAISS index containing the embeddings of the input texts.
         """
+        if len(texts) == 0:
+            raise ValueError("The input texts list is empty.")
+
         embeddings = self.text_to_embeddings(texts)
         index = self.embeddings_to_index(embeddings)
         return TokenData(texts, index, embeddings)
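Net effect of the two new guards: search now degrades to an empty string on empty input, while create_index fails fast. A sketch of the new behavior, derived directly from the code above (the 384 dimension matches what the new tests assert; TokenTransformer needs its embedding model available):

    import faiss
    from vectoriz.token_transformer import TokenTransformer

    transformer = TokenTransformer()

    # Empty texts: search short-circuits before ever querying the index
    assert transformer.search("query", faiss.IndexFlatL2(384), []) == ""

    # Empty corpus: create_index now raises instead of building an unusable index
    try:
        transformer.create_index([])
    except ValueError as err:
        print(err)  # "The input texts list is empty."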
vectoriz/vector_db.py CHANGED
@@ -3,8 +3,8 @@ import faiss
 import numpy as np
 from typing import Optional
 
-from files import FileArgument
-from token_transformer import TokenTransformer
+from vectoriz.files import FileArgument
+from vectoriz.token_transformer import TokenTransformer
 
 
 class VectorDBClient:
@@ -182,7 +182,7 @@ class VectorDB:
         if not os.path.exists(np_db_path):
             return None
 
-        data = np.load(self.np_db_path)
+        data = np.load(np_db_path)
         embeddings_np = data["embeddings"]
         chunk_names = data["chunk_names"]
         texts = data["texts"]
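The second hunk fixes a genuine bug: load_numpy_embeddings previously read self.np_db_path rather than its np_db_path argument, so the load path could not work as called; it now loads the file it was given. A sketch of the repaired path (file name illustrative):

    from vectoriz.vector_db import VectorDB

    vector_db = VectorDB()
    file_arg = vector_db.load_numpy_embeddings("/tmp/db.npz")  # None if the file is absent
    if file_arg is not None:
        embeddings = file_arg.ndarray_data  # the array saved under the "embeddings" key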
{vectoriz-0.0.5.dist-info → vectoriz-0.1.1b0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vectoriz
-Version: 0.0.5
+Version: 0.1.1b0
 Summary: Python library for creating vectorized data from text or files.
 Home-page: https://github.com/PedroHenriqueDevBR/vectoriz
 Author: PedroHenriqueDevBR
vectoriz-0.1.1b0.dist-info/RECORD ADDED
@@ -0,0 +1,12 @@
+tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/test_files.py,sha256=EFXN9GChf9widEb3OvUcTXtOeU9X3naohMWyTZVPTJs,6559
+tests/test_token_transformer.py,sha256=LoLA9t_7owaghB5jS2hJrM1LYk3VSxa3Xo-qrWM2QZY,7152
+tests/test_vector_db.py,sha256=4vFxM6nhFFtI4ERuEY61dnQGsc7B90JBcn2_mvT8bWA,18369
+vectoriz/__init__.py,sha256=fnnle0EjVejiZQ8t243kvFiqcTTFh9dzmZbNwayjh4U,156
+vectoriz/files.py,sha256=jTwNBs1A_nqo0WWzLFNDGaBnyAPvLep283q3GuOH8bk,9056
+vectoriz/token_transformer.py,sha256=B7fPt-A-RzJjIoYns7wL_yyxQIj0UBRsnJIGCY_Ae2Q,6828
+vectoriz/vector_db.py,sha256=EqjKOTK1P4zP7wCmMo_Y2GsPzVP02UOzvurX-nTVuqI,6830
+vectoriz-0.1.1b0.dist-info/METADATA,sha256=wFmCkyq0v8xLZD9CUBkjqgiJyQUqs7eaEHynii_BUFE,3696
+vectoriz-0.1.1b0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+vectoriz-0.1.1b0.dist-info/top_level.txt,sha256=Tcfk3kazBwJ_yySjjhlIhLoTWLQGSb5xV006X18O6Nk,15
+vectoriz-0.1.1b0.dist-info/RECORD,,
vectoriz-0.0.5.dist-info/RECORD REMOVED
@@ -1,10 +0,0 @@
-tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_files.py,sha256=jNIcwdDrPGKv78zGJReb6s0kPZLr3mTvkwwl3cP6MlM,1622
-vectoriz/__init__.py,sha256=fnnle0EjVejiZQ8t243kvFiqcTTFh9dzmZbNwayjh4U,156
-vectoriz/files.py,sha256=qpuD9CUZ4UFJQM1rokP4u23dnLz-3Du3fASDqmBpssk,9031
-vectoriz/token_transformer.py,sha256=1KIGL6EAiuqFhFzh3grkJcdEjjwTC3kH6RNqRkYMkQU,5811
-vectoriz/vector_db.py,sha256=cfcwN_QDbnbuBFqcJ_HyJy8jcRWpm8_pfsaug9JAiqo,6817
-vectoriz-0.0.5.dist-info/METADATA,sha256=FxQu0qlmjgRMPKq4OcmrdNKEYd2SnBa0LHwWExjxPQU,3694
-vectoriz-0.0.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-vectoriz-0.0.5.dist-info/top_level.txt,sha256=Tcfk3kazBwJ_yySjjhlIhLoTWLQGSb5xV006X18O6Nk,15
-vectoriz-0.0.5.dist-info/RECORD,,