vectoriz-1.0.0-py3-none-any.whl → vectoriz-1.0.2rc0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- tests/test_files.py +122 -12
- tests/test_token_transformer.py +15 -0
- vectoriz/files.py +134 -56
- vectoriz/token_transformer.py +6 -2
- {vectoriz-1.0.0.dist-info → vectoriz-1.0.2rc0.dist-info}/METADATA +11 -11
- vectoriz-1.0.2rc0.dist-info/RECORD +12 -0
- vectoriz-1.0.0.dist-info/RECORD +0 -12
- {vectoriz-1.0.0.dist-info → vectoriz-1.0.2rc0.dist-info}/WHEEL +0 -0
- {vectoriz-1.0.0.dist-info → vectoriz-1.0.2rc0.dist-info}/top_level.txt +0 -0
tests/test_files.py
CHANGED

```diff
@@ -98,22 +98,22 @@ class TestFilesFeature:
         test_file = tmp_path / "test.txt"
         test_file.write_text(test_content)
         files_feature = FilesFeature()
-        result = files_feature._extract_txt_content(
-        assert result == test_content
+        result = files_feature._extract_txt_content(test_file)
+        assert result == {"file": "test.txt", "content": test_content}
 
     def test_extract_txt_content_with_unicode_chars(self, tmp_path):
         test_content = "Unicode content: àáâãäåæç"
         test_file = tmp_path / "unicode.txt"
         test_file.write_text(test_content, encoding="utf-8")
         files_feature = FilesFeature()
-        result = files_feature._extract_txt_content(
-        assert result == test_content
+        result = files_feature._extract_txt_content(test_file)
+        assert result == {"file": "unicode.txt", "content": test_content}
 
     def test_extract_txt_content_raises_file_not_found(self):
         files_feature = FilesFeature()
         with pytest.raises(FileNotFoundError):
             files_feature._extract_txt_content(
-                "/non_existent_dir
+                "/non_existent_dir/non_existent_file.txt"
             )
 
     def test_extract_docx_content_reads_file_correctly(self, tmp_path, monkeypatch):
@@ -126,9 +126,10 @@ class TestFilesFeature:
 
         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
-
+        path = tmp_path / "test.docx"
+        result = files_feature._extract_docx_content(path)
 
-        assert result == "Paragraph 1\nParagraph 2"
+        assert result == {"file": "test.docx", "content": "Paragraph 1\nParagraph 2"}
 
     def test_extract_docx_content_skips_empty_paragraphs(self, tmp_path, monkeypatch):
         mock_doc = MagicMock()
@@ -142,9 +143,10 @@ class TestFilesFeature:
 
         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
-
+        path = tmp_path / "test.docx"
+        result = files_feature._extract_docx_content(path)
 
-        assert result == "Paragraph 1\nParagraph 3"
+        assert result == {"file": "test.docx", "content": "Paragraph 1\nParagraph 3"}
 
     def test_extract_docx_content_exception_handling(self, tmp_path, monkeypatch):
         def mock_document(_):
@@ -154,12 +156,120 @@ class TestFilesFeature:
 
         files_feature = FilesFeature()
         with pytest.raises(Exception):
-
+            path = tmp_path / "/invalid.docx"
+            files_feature._extract_docx_content(path)
 
     def test_extract_docx_content_with_no_paragraphs(self, tmp_path, monkeypatch):
         mock_doc = MagicMock()
         mock_doc.paragraphs = []
         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
-
-
+        path = tmp_path / "empty.docx"
+        result = files_feature._extract_docx_content(path)
+        assert result == {"file": "empty.docx", "content": ""}
+
+    def test_extract_markdown_content_reads_file_correctly(self, tmp_path):
+        test_content = "# Markdown Title\nThis is some markdown content."
+        test_file = tmp_path / "test.md"
+        test_file.write_text(test_content)
+        files_feature = FilesFeature()
+        path = tmp_path / "test.md"
+        result = files_feature._extract_markdown_content(path)
+        assert result == {"file": "test.md", "content": test_content}
+
+    def test_extract_markdown_content_with_unicode_chars(self, tmp_path):
+        test_content = "# Unicode Title\nContent with unicode: àáâãäåæç"
+        test_file = tmp_path / "unicode.md"
+        test_file.write_text(test_content, encoding="utf-8")
+        files_feature = FilesFeature()
+        path = tmp_path / "unicode.md"
+        result = files_feature._extract_markdown_content(path)
+        assert result == {"file": "unicode.md", "content": test_content}
+
+    def test_extract_markdown_content_raises_file_not_found(self):
+        files_feature = FilesFeature()
+        with pytest.raises(FileNotFoundError):
+            path = str("/non_existent_dir/non_existent_file.md")
+            files_feature._extract_markdown_content(path)
+
+    def test_extract_markdown_content_handles_empty_file(self, tmp_path):
+        test_file = tmp_path / "empty.md"
+        test_file.write_text("")
+        files_feature = FilesFeature()
+        result = files_feature._extract_markdown_content(test_file)
+        assert result == {'file': 'empty.md', 'content': ''}
+
+    def test_extract_markdown_content_raises_unicode_decode_error(self, tmp_path):
+        test_file = tmp_path / "invalid_encoding.md"
+        test_file.write_bytes(b"\x80\x81\x82")
+        files_feature = FilesFeature()
+        with pytest.raises(UnicodeDecodeError):
+            path = str(tmp_path / "invalid_encoding.md")
+            files_feature._extract_markdown_content(path)
+
+    def test_load_markdown_files_from_directory_loads_files_correctly(self, tmp_path):
+        test_content_1 = "# Title 1\nContent 1"
+        test_content_2 = "# Title 2\nContent 2"
+        test_file_1 = tmp_path / "file1.md"
+        test_file_2 = tmp_path / "file2.md"
+        test_file_1.write_text(test_content_1)
+        test_file_2.write_text(test_content_2)
+
+        files_feature = FilesFeature()
+        result = files_feature.load_markdown_files_from_directory(tmp_path)
+
+        assert len(result.chunk_names) == 2
+        assert len(result.text_list) == 2
+        assert test_file_1.name in result.chunk_names
+        assert test_file_2.name in result.chunk_names
+        assert test_content_1 in result.text_list
+        assert test_content_2 in result.text_list
+
+    def test_load_markdown_files_from_directory_skips_non_markdown_files(self, tmp_path):
+        test_content_md = "# Markdown Content"
+        test_content_txt = "Text Content"
+        test_file_md = tmp_path / "file.md"
+        test_file_txt = tmp_path / "file.txt"
+        test_file_md.write_text(test_content_md)
+        test_file_txt.write_text(test_content_txt)
+
+        files_feature = FilesFeature()
+        result = files_feature.load_markdown_files_from_directory(tmp_path)
+
+        assert len(result.chunk_names) == 1
+        assert len(result.text_list) == 1
+        assert test_file_md.name in result.chunk_names
+        assert test_content_md in result.text_list
+        assert test_file_txt.name not in result.chunk_names
+
+    def test_load_markdown_files_from_directory_handles_empty_directory(self, tmp_path):
+        files_feature = FilesFeature()
+        result = files_feature.load_markdown_files_from_directory(tmp_path)
+
+        assert len(result.chunk_names) == 0
+        assert len(result.text_list) == 0
+
+    def test_load_markdown_files_from_directory_handles_empty_markdown_file(self, tmp_path):
+        test_file = tmp_path / "empty.md"
+        test_file.write_text("")
+
+        files_feature = FilesFeature()
+        result = files_feature.load_markdown_files_from_directory(str(tmp_path))
+
+        assert len(result.chunk_names) == 1
+        assert len(result.text_list) == 1
+        assert test_file.name in result.chunk_names
+        assert result.text_list[0] == ""
+
+    def test_load_markdown_files_from_directory_with_verbose_output(self, tmp_path, capsys):
+        test_content = "# Markdown Content"
+        test_file = tmp_path / "file.md"
+        test_file.write_text(test_content)
+
+        files_feature = FilesFeature()
+        files_feature.load_markdown_files_from_directory(str(tmp_path), verbose=True)
+
+        captured = capsys.readouterr()
+        assert "Loaded Markdown file: file.md" in captured.out
+
+
```
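Taken together, these tests pin down the new extractor contract: each `_extract_*_content` helper now takes a single path argument and returns a `{"file": ..., "content": ...}` dictionary instead of a bare string. A minimal sketch of that contract, assuming an existing text file (the path is illustrative):

```python
from vectoriz.files import FilesFeature

feature = FilesFeature()

# Illustrative path; any existing UTF-8 text file behaves the same way.
result = feature._extract_txt_content("/tmp/notes.txt")

assert set(result) == {"file", "content"}
print(result["file"])  # basename only, e.g. "notes.txt"
```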
tests/test_token_transformer.py
CHANGED

```diff
@@ -159,6 +159,21 @@ class TestTokenTransformer:
         assert len(result1.strip().split("\n")) == 1
         assert len(result3.strip().split("\n")) == 3
         assert len(result5.strip().split("\n")) == 5
+
+    def test_search_with_list_arg(self):
+        transformer = TokenTransformer()
+        texts = ["Doc 1", "Doc 2", "Doc 3", "Doc 4", "Doc 5"]
+
+        embeddings = transformer.text_to_embeddings(texts)
+        index = transformer.embeddings_to_index(embeddings)
+
+        result1 = transformer.search("Doc", index, texts, context_amount=1, as_list=True)
+        result3 = transformer.search("Doc", index, texts, context_amount=3, as_list=True)
+        result5 = transformer.search("Doc", index, texts, context_amount=5, as_list=True)
+
+        assert len(result1) == 1
+        assert len(result3) == 3
+        assert len(result5) == 5
 
     def test_create_index(self):
         transformer = TokenTransformer()
```
vectoriz/files.py
CHANGED

```diff
@@ -2,9 +2,11 @@ import os
 import docx
 import numpy as np
 from typing import Optional
+from concurrent.futures import ThreadPoolExecutor
 
 from vectoriz.token_transformer import TokenTransformer
 
+
 class FileArgument:
     def __init__(
         self,
@@ -50,7 +52,6 @@ class FileArgument:
         Returns:
             None: This method doesn't return anything, it updates the internal state of the object
         """
-
         self.chunk_names.append(filename)
         self.text_list.append(text)
         self.embeddings.append(self._create_embedding(text))
@@ -72,7 +73,7 @@ class FileArgument:
 
 class FilesFeature:
 
-    def _extract_txt_content(self,
+    def _extract_txt_content(self, path: str) -> dict[str, str]:
         """
         Extract content from a text file and add it to the response data.
 
@@ -81,15 +82,42 @@ class FilesFeature:
 
         Parameters:
         ----------
-
-            The directory path where the file is located.
-        file : str
+        path : str
             The name of the text file to read.
 
         Returns:
         -------
-
-
+        Optional[str]
+            The content of the text file or None if the file is empty.
+
+        Raises:
+        ------
+        FileNotFoundError
+            If the specified file does not exist.
+        UnicodeDecodeError
+            If the file cannot be decoded using UTF-8 encoding.
+        """
+        file = os.path.basename(path)
+        with open(path, "r", encoding="utf-8") as fl:
+            content = fl.read()
+        return {"file": file, "content": content}
+
+    def _extract_markdown_content(self, path: str) -> dict[str, str]:
+        """
+        Extract content from a Markdown file and add it to the response data.
+
+        This method opens a Markdown file in read mode with UTF-8 encoding, reads its content,
+        and adds the file name and its content to the response data.
+
+        Parameters:
+        ----------
+        path : str
+            The name of the Markdown file to read.
+
+        Returns:
+        -------
+        Optional[str]
+            The content of the Markdown file or None if the file is empty.
 
         Raises:
         ------
@@ -98,37 +126,41 @@ class FilesFeature:
         UnicodeDecodeError
             If the file cannot be decoded using UTF-8 encoding.
         """
-
-
-
+        file = os.path.basename(path)
+        with open(path, "r", encoding="utf-8") as fl:
+            content = fl.read()
+        return {"file": file, "content": content}
 
-    def _extract_docx_content(self,
+    def _extract_docx_content(self, path: str) -> dict[str, str]:
         """
         Extracts text content from a Microsoft Word document.
         This method opens a Word document, reads all paragraphs, and joins non-empty
         paragraphs into a single text string. The extracted content is then stored
         using the add_response_data method.
         Args:
-
-            file (str): The filename of the Word document to process
+            path (str): The path where the Word file is located
         Returns:
-
+            dict[str, str]: A dictionary containing the file name and the extracted text content.
         Note:
             Empty paragraphs (those that contain only whitespace) are skipped.
             The python-docx library is required for this method to work.
         """
-
-        doc = docx.Document(
+        file = os.path.basename(path)
+        doc = docx.Document(path)
         full_text = []
 
         for paragraph in doc.paragraphs:
             content = paragraph.text.strip()
             if len(content) == 0:
                 continue
-            full_text.append(
-
+            full_text.append(content)
+
+        content = "\n".join(full_text)
+        return {"file": file, "content": content}
 
-    def load_txt_files_from_directory(
+    def load_txt_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all text files from the specified directory and extract their content.
         This method scans the specified directory for files with the '.txt' extension
@@ -143,30 +175,34 @@ class FilesFeature:
         This method does not return any value. It updates the internal state
         by processing text files found in the directory.
         """
-        argument: FileArgument = FileArgument(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        argument: FileArgument = FileArgument()
+
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".txt")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_txt_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded txt file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
         return argument
 
-    def load_docx_files_from_directory(
+    def load_docx_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all Word (.docx) files from the specified directory and extract their content.
 
         This method iterates through all files in the given directory, identifies those
-        with a .docx extension, and processes them using the
+        with a .docx extension, and processes them using the extract_docx_content method.
 
         Args:
             directory (str): Path to the directory containing Word files to be processed
@@ -178,25 +214,66 @@ class FilesFeature:
         >>> processor = DocumentProcessor()
         >>> processor.load_word_files("/path/to/documents")
         """
-        argument: FileArgument = FileArgument(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        argument: FileArgument = FileArgument()
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".docx")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_docx_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded Word file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
         return argument
 
-    def
+    def load_markdown_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
+        """
+        Load all Markdown (.md) files from the specified directory and extract their content.
+
+        This method iterates through all files in the given directory, identifies those
+        with a .md extension, and processes them using the extract_markdown_content method.
+
+        Args:
+            directory (str): Path to the directory containing Markdown files to be processed
+
+        Returns:
+            None
+
+        Examples:
+            >>> processor = DocumentProcessor()
+            >>> processor.load_markdown_files("/path/to/documents")
+        """
+        argument = FileArgument()
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".md")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_markdown_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded Markdown file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
+        return argument
+
+    def load_all_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all supported files (.txt and .docx) from the specified directory and its subdirectories.
 
@@ -209,7 +286,7 @@ class FilesFeature:
         Returns:
             None
         """
-        argument: FileArgument = FileArgument(
+        argument: FileArgument = FileArgument()
         for root, _, files in os.walk(directory):
             for file in files:
                 readed = False
@@ -231,4 +308,5 @@ class FilesFeature:
                     print(f"Loaded file: {file}")
                 elif verbose and not readed:
                     print(f"Error file: {file}")
-
+
+        return argument
```
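Each reworked loader follows the same pattern: collect the paths matching one extension, fan the extraction out over a `ThreadPoolExecutor`, then fold the `{"file", "content"}` results into a `FileArgument`. A minimal usage sketch, assuming a local `./docs` directory of Markdown files (the path is illustrative):

```python
from vectoriz.files import FilesFeature

feature = FilesFeature()
argument = feature.load_markdown_files_from_directory("./docs", verbose=True)

# FileArgument accumulates one entry per loaded file.
print(argument.chunk_names)     # file names, e.g. ["intro.md", "usage.md"]
print(len(argument.text_list))  # raw text per file, same order as chunk_names
```

One quirk worth noting: when `verbose` is False, the logging lambda's else branch still runs `print('')`, so each loaded file emits a blank line rather than staying silent.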
vectoriz/token_transformer.py
CHANGED

```diff
@@ -1,6 +1,6 @@
 import faiss
 import numpy as np
-from typing import Self
+from typing import Self, Union
 from sentence_transformers import SentenceTransformer
 
 
@@ -76,7 +76,8 @@ class TokenTransformer:
         index: faiss.IndexFlatL2,
         texts: list[str],
         context_amount: int = 1,
-
+        as_list: bool = False,
+    ) -> Union[str, list[str]]:
         """
         Searches for the most similar texts to the given query using the provided FAISS index.
         This method converts the query into an embedding, searches for the k nearest neighbors
@@ -97,6 +98,9 @@ class TokenTransformer:
         _, I = index.search(query_embedding, k=context_amount)
         context = ""
 
+        if as_list:
+            return [texts[i].strip() for i in I[0]]
+
         for i in I[0]:
             context += texts[i] + "\n"
 
```
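A minimal sketch of the new `as_list` flag, chaining the same calls the tests exercise (`text_to_embeddings` → `embeddings_to_index` → `search`); the sample texts are illustrative:

```python
from vectoriz.token_transformer import TokenTransformer

transformer = TokenTransformer()
texts = ["FAISS indexes vectors", "Transformers embed text", "Python ties it together"]

embeddings = transformer.text_to_embeddings(texts)
index = transformer.embeddings_to_index(embeddings)

# as_list=True returns the k nearest texts as a list of stripped strings
# instead of the default "\n"-joined context string.
hits = transformer.search("embedding", index, texts, context_amount=2, as_list=True)
assert len(hits) == 2
```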
{vectoriz-1.0.0.dist-info → vectoriz-1.0.2rc0.dist-info}/METADATA
CHANGED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vectoriz
-Version: 1.0.0
+Version: 1.0.2rc0
 Summary: Python library for creating vectorized data from text or files.
 Home-page: https://github.com/PedroHenriqueDevBR/vectoriz
 Author: PedroHenriqueDevBR
@@ -25,19 +25,19 @@ Dynamic: summary
 
 # Vectoriz
 
-
-
-[](https://github.com/PedroHenriqueDevBR/vectoriz/blob/main/LICENSE)
-
-[](https://www.python.org/downloads/)
-
-[](https://github.com/PedroHenriqueDevBR/vectoriz/issues)
+Vectoriz is available on PyPI and can be installed via pip:
 
-
+<div align="center">
+    <a href="https://pypi.org/project/vectoriz/"><img src="https://badge.fury.io/py/vectoriz.svg" alt="PyPI version"></a>
+    <a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.12%2B-blue" alt="Python Version"></a>
+    <a href="https://github.com/PedroHenriqueDevBR/vectoriz/issues"><img src="https://img.shields.io/github/issues/PedroHenriqueDevBR/vectoriz" alt="GitHub issues"></a>
+    <a href="https://github.com/PedroHenriqueDevBR/vectoriz/stargazers"><img src="https://img.shields.io/github/stars/PedroHenriqueDevBR/vectoriz" alt="GitHub stars"></a>
+    <a href="https://github.com/PedroHenriqueDevBR/vectoriz/network"><img src="https://img.shields.io/github/forks/PedroHenriqueDevBR/vectoriz" alt="GitHub forks"></a>
+</div>
 
-
+## Project description
 
-
+For install another versions you can go to: <a href="https://pypi.org/project/vectoriz/#history">PyPI versions</a>
 
 ```bash
 pip install vectoriz
````
vectoriz-1.0.2rc0.dist-info/RECORD
ADDED

```diff
@@ -0,0 +1,12 @@
+tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/test_files.py,sha256=GQXNbGPUZeEzl5cE70D8OnyGbQr7kTnEBBvqK4ikKmo,11392
+tests/test_token_transformer.py,sha256=xfB6_aP9pYSDHtUJzt9dioP_XBTZPvDnwAMWylyfuKQ,7796
+tests/test_vector_db.py,sha256=4vFxM6nhFFtI4ERuEY61dnQGsc7B90JBcn2_mvT8bWA,18369
+vectoriz/__init__.py,sha256=fnnle0EjVejiZQ8t243kvFiqcTTFh9dzmZbNwayjh4U,156
+vectoriz/files.py,sha256=IPNVztf3aNNPHvMj2lb7Yuf7akKTu3n7hsVYT97CzUY,11438
+vectoriz/token_transformer.py,sha256=zx8TpCxYhrQYzvZy9JaerhniFY7IxZcQIiHedOzAZyQ,6957
+vectoriz/vector_db.py,sha256=EqjKOTK1P4zP7wCmMo_Y2GsPzVP02UOzvurX-nTVuqI,6830
+vectoriz-1.0.2rc0.dist-info/METADATA,sha256=CNvsojBRCYovh9DMnquIKh7IUVxdiy46XGmwRCibIpQ,3852
+vectoriz-1.0.2rc0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+vectoriz-1.0.2rc0.dist-info/top_level.txt,sha256=Tcfk3kazBwJ_yySjjhlIhLoTWLQGSb5xV006X18O6Nk,15
+vectoriz-1.0.2rc0.dist-info/RECORD,,
```
vectoriz-1.0.0.dist-info/RECORD
DELETED

```diff
@@ -1,12 +0,0 @@
-tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_files.py,sha256=EFXN9GChf9widEb3OvUcTXtOeU9X3naohMWyTZVPTJs,6559
-tests/test_token_transformer.py,sha256=LoLA9t_7owaghB5jS2hJrM1LYk3VSxa3Xo-qrWM2QZY,7152
-tests/test_vector_db.py,sha256=4vFxM6nhFFtI4ERuEY61dnQGsc7B90JBcn2_mvT8bWA,18369
-vectoriz/__init__.py,sha256=fnnle0EjVejiZQ8t243kvFiqcTTFh9dzmZbNwayjh4U,156
-vectoriz/files.py,sha256=jTwNBs1A_nqo0WWzLFNDGaBnyAPvLep283q3GuOH8bk,9056
-vectoriz/token_transformer.py,sha256=B7fPt-A-RzJjIoYns7wL_yyxQIj0UBRsnJIGCY_Ae2Q,6828
-vectoriz/vector_db.py,sha256=EqjKOTK1P4zP7wCmMo_Y2GsPzVP02UOzvurX-nTVuqI,6830
-vectoriz-1.0.0.dist-info/METADATA,sha256=Y-GqhHSugF2oESKiKGQ-iNYzp4-9ymYdrG3aV2LATi4,3694
-vectoriz-1.0.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-vectoriz-1.0.0.dist-info/top_level.txt,sha256=Tcfk3kazBwJ_yySjjhlIhLoTWLQGSb5xV006X18O6Nk,15
-vectoriz-1.0.0.dist-info/RECORD,,
```
{vectoriz-1.0.0.dist-info → vectoriz-1.0.2rc0.dist-info}/WHEEL
File without changes

{vectoriz-1.0.0.dist-info → vectoriz-1.0.2rc0.dist-info}/top_level.txt
File without changes