vectoriz 1.0.0__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vectoriz-1.0.0 → vectoriz-1.0.1}/PKG-INFO +11 -11
- {vectoriz-1.0.0 → vectoriz-1.0.1}/README.md +10 -10
- {vectoriz-1.0.0 → vectoriz-1.0.1}/setup.py +1 -1
- {vectoriz-1.0.0 → vectoriz-1.0.1}/tests/test_files.py +104 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/tests/test_token_transformer.py +15 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz/files.py +65 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz/token_transformer.py +6 -2
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz.egg-info/PKG-INFO +11 -11
- {vectoriz-1.0.0 → vectoriz-1.0.1}/pyproject.toml +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/setup.cfg +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/tests/__init__.py +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/tests/test_vector_db.py +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz/__init__.py +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz/vector_db.py +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz.egg-info/SOURCES.txt +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz.egg-info/dependency_links.txt +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz.egg-info/requires.txt +0 -0
- {vectoriz-1.0.0 → vectoriz-1.0.1}/vectoriz.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: vectoriz
|
3
|
-
Version: 1.0.0
|
3
|
+
Version: 1.0.1
|
4
4
|
Summary: Python library for creating vectorized data from text or files.
|
5
5
|
Home-page: https://github.com/PedroHenriqueDevBR/vectoriz
|
6
6
|
Author: PedroHenriqueDevBR
|
@@ -25,19 +25,19 @@ Dynamic: summary
|
|
25
25
|
|
26
26
|
# Vectoriz
|
27
27
|
|
28
|
-
|
29
|
-
|
30
|
-
[](https://github.com/PedroHenriqueDevBR/vectoriz/blob/main/LICENSE)
|
31
|
-
|
32
|
-
[](https://www.python.org/downloads/)
|
33
|
-
|
34
|
-
[](https://github.com/PedroHenriqueDevBR/vectoriz/issues)
|
28
|
+
Vectoriz is available on PyPI and can be installed via pip:
|
35
29
|
|
36
|
-
|
30
|
+
<div align="center">
|
31
|
+
<a href="https://pypi.org/project/vectoriz/"><img src="https://badge.fury.io/py/vectoriz.svg" alt="PyPI version"></a>
|
32
|
+
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.12%2B-blue" alt="Python Version"></a>
|
33
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/issues"><img src="https://img.shields.io/github/issues/PedroHenriqueDevBR/vectoriz" alt="GitHub issues"></a>
|
34
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/stargazers"><img src="https://img.shields.io/github/stars/PedroHenriqueDevBR/vectoriz" alt="GitHub stars"></a>
|
35
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/network"><img src="https://img.shields.io/github/forks/PedroHenriqueDevBR/vectoriz" alt="GitHub forks"></a>
|
36
|
+
</div>
|
37
37
|
|
38
|
-
|
38
|
+
## Project description
|
39
39
|
|
40
|
-
|
40
|
+
To install another version, go to: <a href="https://pypi.org/project/vectoriz/#history">PyPI versions</a>
|
41
41
|
|
42
42
|
```bash
|
43
43
|
pip install vectoriz
|
@@ -1,18 +1,18 @@
|
|
1
1
|
# Vectoriz
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
[](https://github.com/PedroHenriqueDevBR/vectoriz/blob/main/LICENSE)
|
6
|
-
|
7
|
-
[](https://www.python.org/downloads/)
|
8
|
-
|
9
|
-
[](https://github.com/PedroHenriqueDevBR/vectoriz/issues)
|
3
|
+
Vectoriz is available on PyPI and can be installed via pip:
|
10
4
|
|
11
|
-
|
5
|
+
<div align="center">
|
6
|
+
<a href="https://pypi.org/project/vectoriz/"><img src="https://badge.fury.io/py/vectoriz.svg" alt="PyPI version"></a>
|
7
|
+
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.12%2B-blue" alt="Python Version"></a>
|
8
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/issues"><img src="https://img.shields.io/github/issues/PedroHenriqueDevBR/vectoriz" alt="GitHub issues"></a>
|
9
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/stargazers"><img src="https://img.shields.io/github/stars/PedroHenriqueDevBR/vectoriz" alt="GitHub stars"></a>
|
10
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/network"><img src="https://img.shields.io/github/forks/PedroHenriqueDevBR/vectoriz" alt="GitHub forks"></a>
|
11
|
+
</div>
|
12
12
|
|
13
|
-
|
13
|
+
## Project description
|
14
14
|
|
15
|
-
|
15
|
+
To install another version, go to: <a href="https://pypi.org/project/vectoriz/#history">PyPI versions</a>
|
16
16
|
|
17
17
|
```bash
|
18
18
|
pip install vectoriz
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|
2
2
|
|
3
3
|
setup(
|
4
4
|
name="vectoriz",
|
5
|
-
version="1.0.0",
|
5
|
+
version="1.0.1",
|
6
6
|
author="PedroHenriqueDevBR",
|
7
7
|
author_email="pedro.henrique.particular@gmail.com",
|
8
8
|
description="Python library for creating vectorized data from text or files.",
|
@@ -163,3 +163,107 @@ class TestFilesFeature:
|
|
163
163
|
files_feature = FilesFeature()
|
164
164
|
result = files_feature._extract_docx_content(str(tmp_path), "empty.docx")
|
165
165
|
assert result == ""
|
166
|
+
|
167
|
+
def test_extract_markdown_content_reads_file_correctly(self, tmp_path):
|
168
|
+
test_content = "# Markdown Title\nThis is some markdown content."
|
169
|
+
test_file = tmp_path / "test.md"
|
170
|
+
test_file.write_text(test_content)
|
171
|
+
files_feature = FilesFeature()
|
172
|
+
result = files_feature._extract_markdown_content(str(tmp_path), "test.md")
|
173
|
+
assert result == test_content
|
174
|
+
|
175
|
+
def test_extract_markdown_content_with_unicode_chars(self, tmp_path):
|
176
|
+
test_content = "# Unicode Title\nContent with unicode: àáâãäåæç"
|
177
|
+
test_file = tmp_path / "unicode.md"
|
178
|
+
test_file.write_text(test_content, encoding="utf-8")
|
179
|
+
files_feature = FilesFeature()
|
180
|
+
result = files_feature._extract_markdown_content(str(tmp_path), "unicode.md")
|
181
|
+
assert result == test_content
|
182
|
+
|
183
|
+
def test_extract_markdown_content_raises_file_not_found(self):
|
184
|
+
files_feature = FilesFeature()
|
185
|
+
with pytest.raises(FileNotFoundError):
|
186
|
+
files_feature._extract_markdown_content(
|
187
|
+
"/non_existent_dir", "non_existent_file.md"
|
188
|
+
)
|
189
|
+
|
190
|
+
def test_extract_markdown_content_handles_empty_file(self, tmp_path):
|
191
|
+
test_file = tmp_path / "empty.md"
|
192
|
+
test_file.write_text("")
|
193
|
+
files_feature = FilesFeature()
|
194
|
+
result = files_feature._extract_markdown_content(str(tmp_path), "empty.md")
|
195
|
+
assert result == ""
|
196
|
+
|
197
|
+
def test_extract_markdown_content_raises_unicode_decode_error(self, tmp_path):
|
198
|
+
test_file = tmp_path / "invalid_encoding.md"
|
199
|
+
test_file.write_bytes(b"\x80\x81\x82") # Invalid UTF-8 bytes
|
200
|
+
files_feature = FilesFeature()
|
201
|
+
with pytest.raises(UnicodeDecodeError):
|
202
|
+
files_feature._extract_markdown_content(str(tmp_path), "invalid_encoding.md")
|
203
|
+
|
204
|
+
def test_load_markdown_files_from_directory_loads_files_correctly(self, tmp_path):
|
205
|
+
test_content_1 = "# Title 1\nContent 1"
|
206
|
+
test_content_2 = "# Title 2\nContent 2"
|
207
|
+
test_file_1 = tmp_path / "file1.md"
|
208
|
+
test_file_2 = tmp_path / "file2.md"
|
209
|
+
test_file_1.write_text(test_content_1)
|
210
|
+
test_file_2.write_text(test_content_2)
|
211
|
+
|
212
|
+
files_feature = FilesFeature()
|
213
|
+
result = files_feature.load_markdown_files_from_directory(str(tmp_path))
|
214
|
+
|
215
|
+
assert len(result.chunk_names) == 2
|
216
|
+
assert len(result.text_list) == 2
|
217
|
+
assert test_file_1.name in result.chunk_names
|
218
|
+
assert test_file_2.name in result.chunk_names
|
219
|
+
assert test_content_1 in result.text_list
|
220
|
+
assert test_content_2 in result.text_list
|
221
|
+
|
222
|
+
def test_load_markdown_files_from_directory_skips_non_markdown_files(self, tmp_path):
|
223
|
+
test_content_md = "# Markdown Content"
|
224
|
+
test_content_txt = "Text Content"
|
225
|
+
test_file_md = tmp_path / "file.md"
|
226
|
+
test_file_txt = tmp_path / "file.txt"
|
227
|
+
test_file_md.write_text(test_content_md)
|
228
|
+
test_file_txt.write_text(test_content_txt)
|
229
|
+
|
230
|
+
files_feature = FilesFeature()
|
231
|
+
result = files_feature.load_markdown_files_from_directory(str(tmp_path))
|
232
|
+
|
233
|
+
assert len(result.chunk_names) == 1
|
234
|
+
assert len(result.text_list) == 1
|
235
|
+
assert test_file_md.name in result.chunk_names
|
236
|
+
assert test_content_md in result.text_list
|
237
|
+
assert test_file_txt.name not in result.chunk_names
|
238
|
+
|
239
|
+
def test_load_markdown_files_from_directory_handles_empty_directory(self, tmp_path):
|
240
|
+
files_feature = FilesFeature()
|
241
|
+
result = files_feature.load_markdown_files_from_directory(str(tmp_path))
|
242
|
+
|
243
|
+
assert len(result.chunk_names) == 0
|
244
|
+
assert len(result.text_list) == 0
|
245
|
+
|
246
|
+
def test_load_markdown_files_from_directory_handles_empty_markdown_file(self, tmp_path):
|
247
|
+
test_file = tmp_path / "empty.md"
|
248
|
+
test_file.write_text("")
|
249
|
+
|
250
|
+
files_feature = FilesFeature()
|
251
|
+
result = files_feature.load_markdown_files_from_directory(str(tmp_path))
|
252
|
+
|
253
|
+
assert len(result.chunk_names) == 1
|
254
|
+
assert len(result.text_list) == 1
|
255
|
+
assert test_file.name in result.chunk_names
|
256
|
+
assert result.text_list[0] == ""
|
257
|
+
|
258
|
+
def test_load_markdown_files_from_directory_with_verbose_output(self, tmp_path, capsys):
|
259
|
+
test_content = "# Markdown Content"
|
260
|
+
test_file = tmp_path / "file.md"
|
261
|
+
test_file.write_text(test_content)
|
262
|
+
|
263
|
+
files_feature = FilesFeature()
|
264
|
+
files_feature.load_markdown_files_from_directory(str(tmp_path), verbose=True)
|
265
|
+
|
266
|
+
captured = capsys.readouterr()
|
267
|
+
assert "Loaded Markdown file: file.md" in captured.out
|
268
|
+
|
269
|
+
|
@@ -159,6 +159,21 @@ class TestTokenTransformer:
|
|
159
159
|
assert len(result1.strip().split("\n")) == 1
|
160
160
|
assert len(result3.strip().split("\n")) == 3
|
161
161
|
assert len(result5.strip().split("\n")) == 5
|
162
|
+
|
163
|
+
def test_search_with_list_arg(self):
|
164
|
+
transformer = TokenTransformer()
|
165
|
+
texts = ["Doc 1", "Doc 2", "Doc 3", "Doc 4", "Doc 5"]
|
166
|
+
|
167
|
+
embeddings = transformer.text_to_embeddings(texts)
|
168
|
+
index = transformer.embeddings_to_index(embeddings)
|
169
|
+
|
170
|
+
result1 = transformer.search("Doc", index, texts, context_amount=1, as_list=True)
|
171
|
+
result3 = transformer.search("Doc", index, texts, context_amount=3, as_list=True)
|
172
|
+
result5 = transformer.search("Doc", index, texts, context_amount=5, as_list=True)
|
173
|
+
|
174
|
+
assert len(result1) == 1
|
175
|
+
assert len(result3) == 3
|
176
|
+
assert len(result5) == 5
|
162
177
|
|
163
178
|
def test_create_index(self):
|
164
179
|
transformer = TokenTransformer()
|
@@ -101,6 +101,36 @@ class FilesFeature:
|
|
101
101
|
with open(os.path.join(directory, file), "r", encoding="utf-8") as fl:
|
102
102
|
text = fl.read()
|
103
103
|
return text
|
104
|
+
|
105
|
+
def _extract_markdown_content(self, directory: str, file: str) -> Optional[str]:
|
106
|
+
"""
|
107
|
+
Extract and return the content of a Markdown file.
|
108
|
+
|
109
|
+
This method opens a Markdown file in read mode with UTF-8 encoding, reads its content,
|
110
|
+
and returns it as a string.
|
111
|
+
|
112
|
+
Parameters:
|
113
|
+
----------
|
114
|
+
directory : str
|
115
|
+
The directory path where the file is located.
|
116
|
+
file : str
|
117
|
+
The name of the Markdown file to read.
|
118
|
+
|
119
|
+
Returns:
|
120
|
+
-------
|
121
|
+
Optional[str]
|
122
|
+
The content of the Markdown file (an empty string if the file is empty).
|
123
|
+
|
124
|
+
Raises:
|
125
|
+
------
|
126
|
+
FileNotFoundError
|
127
|
+
If the specified file does not exist.
|
128
|
+
UnicodeDecodeError
|
129
|
+
If the file cannot be decoded using UTF-8 encoding.
|
130
|
+
"""
|
131
|
+
with open(os.path.join(directory, file), "r", encoding="utf-8") as fl:
|
132
|
+
text = fl.read()
|
133
|
+
return text
|
104
134
|
|
105
135
|
def _extract_docx_content(self, directory: str, file: str) -> Optional[str]:
|
106
136
|
"""
|
@@ -195,6 +225,41 @@ class FilesFeature:
|
|
195
225
|
if verbose:
|
196
226
|
print(f"Loaded Word file: {file}")
|
197
227
|
return argument
|
228
|
+
|
229
|
+
def load_markdown_files_from_directory(self, directory: str, verbose: bool = False) -> FileArgument:
|
230
|
+
"""
|
231
|
+
Load all Markdown (.md) files from the specified directory and extract their content.
|
232
|
+
|
233
|
+
This method iterates through all files in the given directory, identifies those
|
234
|
+
with a .md extension, and processes them using the _extract_markdown_content method.
|
235
|
+
|
236
|
+
Args:
|
237
|
+
directory (str): Path to the directory containing Markdown files to be processed
|
238
|
+
|
239
|
+
Returns:
|
240
|
+
FileArgument: The names and contents of the Markdown files that were loaded.
|
241
|
+
|
242
|
+
Examples:
|
243
|
+
>>> processor = FilesFeature()
|
244
|
+
>>> processor.load_markdown_files_from_directory("/path/to/documents")
|
245
|
+
"""
|
246
|
+
argument: FileArgument = FileArgument([], [], [])
|
247
|
+
for file in os.listdir(directory):
|
248
|
+
if not file.endswith(".md"):
|
249
|
+
if verbose:
|
250
|
+
print(f"Error file: {file}")
|
251
|
+
continue
|
252
|
+
|
253
|
+
text = self._extract_markdown_content(directory, file)
|
254
|
+
if text is None:
|
255
|
+
if verbose:
|
256
|
+
print(f"Error file: {file}")
|
257
|
+
continue
|
258
|
+
|
259
|
+
argument.add_data(file, text)
|
260
|
+
if verbose:
|
261
|
+
print(f"Loaded Markdown file: {file}")
|
262
|
+
return argument
|
198
263
|
|
199
264
|
def load_all_files_from_directory(self, directory: str, verbose: bool = False) -> FileArgument:
|
200
265
|
"""
|
@@ -1,6 +1,6 @@
|
|
1
1
|
import faiss
|
2
2
|
import numpy as np
|
3
|
-
from typing import Self
|
3
|
+
from typing import Self, Union
|
4
4
|
from sentence_transformers import SentenceTransformer
|
5
5
|
|
6
6
|
|
@@ -76,7 +76,8 @@ class TokenTransformer:
|
|
76
76
|
index: faiss.IndexFlatL2,
|
77
77
|
texts: list[str],
|
78
78
|
context_amount: int = 1,
|
79
|
-
|
79
|
+
as_list: bool = False,
|
80
|
+
) -> Union[str, list[str]]:
|
80
81
|
"""
|
81
82
|
Searches for the most similar texts to the given query using the provided FAISS index.
|
82
83
|
This method converts the query into an embedding, searches for the k nearest neighbors
|
@@ -97,6 +98,9 @@ class TokenTransformer:
|
|
97
98
|
_, I = index.search(query_embedding, k=context_amount)
|
98
99
|
context = ""
|
99
100
|
|
101
|
+
if as_list:
|
102
|
+
return [texts[i].strip() for i in I[0]]
|
103
|
+
|
100
104
|
for i in I[0]:
|
101
105
|
context += texts[i] + "\n"
|
102
106
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: vectoriz
|
3
|
-
Version: 1.0.0
|
3
|
+
Version: 1.0.1
|
4
4
|
Summary: Python library for creating vectorized data from text or files.
|
5
5
|
Home-page: https://github.com/PedroHenriqueDevBR/vectoriz
|
6
6
|
Author: PedroHenriqueDevBR
|
@@ -25,19 +25,19 @@ Dynamic: summary
|
|
25
25
|
|
26
26
|
# Vectoriz
|
27
27
|
|
28
|
-
|
29
|
-
|
30
|
-
[](https://github.com/PedroHenriqueDevBR/vectoriz/blob/main/LICENSE)
|
31
|
-
|
32
|
-
[](https://www.python.org/downloads/)
|
33
|
-
|
34
|
-
[](https://github.com/PedroHenriqueDevBR/vectoriz/issues)
|
28
|
+
Vectoriz is available on PyPI and can be installed via pip:
|
35
29
|
|
36
|
-
|
30
|
+
<div align="center">
|
31
|
+
<a href="https://pypi.org/project/vectoriz/"><img src="https://badge.fury.io/py/vectoriz.svg" alt="PyPI version"></a>
|
32
|
+
<a href="https://www.python.org/downloads/"><img src="https://img.shields.io/badge/python-3.12%2B-blue" alt="Python Version"></a>
|
33
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/issues"><img src="https://img.shields.io/github/issues/PedroHenriqueDevBR/vectoriz" alt="GitHub issues"></a>
|
34
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/stargazers"><img src="https://img.shields.io/github/stars/PedroHenriqueDevBR/vectoriz" alt="GitHub stars"></a>
|
35
|
+
<a href="https://github.com/PedroHenriqueDevBR/vectoriz/network"><img src="https://img.shields.io/github/forks/PedroHenriqueDevBR/vectoriz" alt="GitHub forks"></a>
|
36
|
+
</div>
|
37
37
|
|
38
|
-
|
38
|
+
## Project description
|
39
39
|
|
40
|
-
|
40
|
+
To install another version, go to: <a href="https://pypi.org/project/vectoriz/#history">PyPI versions</a>
|
41
41
|
|
42
42
|
```bash
|
43
43
|
pip install vectoriz
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|