vectoriz 1.0.1__py3-none-any.whl → 1.0.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_files.py +33 -27
- vectoriz/files.py +95 -82
- {vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/METADATA +1 -1
- {vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/RECORD +6 -6
- {vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/WHEEL +0 -0
- {vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/top_level.txt +0 -0
tests/test_files.py
CHANGED
@@ -98,22 +98,22 @@ class TestFilesFeature:
         test_file = tmp_path / "test.txt"
         test_file.write_text(test_content)
         files_feature = FilesFeature()
-        result = files_feature._extract_txt_content(
-        assert result == test_content
+        result = files_feature._extract_txt_content(test_file)
+        assert result == {"file": "test.txt", "content": test_content}

     def test_extract_txt_content_with_unicode_chars(self, tmp_path):
         test_content = "Unicode content: àáâãäåæç"
         test_file = tmp_path / "unicode.txt"
         test_file.write_text(test_content, encoding="utf-8")
         files_feature = FilesFeature()
-        result = files_feature._extract_txt_content(
-        assert result == test_content
+        result = files_feature._extract_txt_content(test_file)
+        assert result == {"file": "unicode.txt", "content": test_content}

     def test_extract_txt_content_raises_file_not_found(self):
         files_feature = FilesFeature()
         with pytest.raises(FileNotFoundError):
             files_feature._extract_txt_content(
-                "/non_existent_dir
+                "/non_existent_dir/non_existent_file.txt"
             )

     def test_extract_docx_content_reads_file_correctly(self, tmp_path, monkeypatch):
@@ -126,9 +126,10 @@ class TestFilesFeature:

         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
+        path = tmp_path / "test.docx"
+        result = files_feature._extract_docx_content(path)

-        assert result == "Paragraph 1\nParagraph 2"
+        assert result == {"file": "test.docx", "content": "Paragraph 1\nParagraph 2"}

     def test_extract_docx_content_skips_empty_paragraphs(self, tmp_path, monkeypatch):
         mock_doc = MagicMock()
@@ -142,9 +143,10 @@ class TestFilesFeature:

         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
+        path = tmp_path / "test.docx"
+        result = files_feature._extract_docx_content(path)

-        assert result == "Paragraph 1\nParagraph 3"
+        assert result == {"file": "test.docx", "content": "Paragraph 1\nParagraph 3"}

     def test_extract_docx_content_exception_handling(self, tmp_path, monkeypatch):
         def mock_document(_):
@@ -154,52 +156,56 @@ class TestFilesFeature:

         files_feature = FilesFeature()
         with pytest.raises(Exception):
+            path = tmp_path / "/invalid.docx"
+            files_feature._extract_docx_content(path)

     def test_extract_docx_content_with_no_paragraphs(self, tmp_path, monkeypatch):
         mock_doc = MagicMock()
         mock_doc.paragraphs = []
         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
+        path = tmp_path / "empty.docx"
+        result = files_feature._extract_docx_content(path)
+        assert result == {"file": "empty.docx", "content": ""}
+
     def test_extract_markdown_content_reads_file_correctly(self, tmp_path):
         test_content = "# Markdown Title\nThis is some markdown content."
         test_file = tmp_path / "test.md"
         test_file.write_text(test_content)
         files_feature = FilesFeature()
+        path = tmp_path / "test.md"
+        result = files_feature._extract_markdown_content(path)
+        assert result == {"file": "test.md", "content": test_content}

     def test_extract_markdown_content_with_unicode_chars(self, tmp_path):
         test_content = "# Unicode Title\nContent with unicode: àáâãäåæç"
         test_file = tmp_path / "unicode.md"
         test_file.write_text(test_content, encoding="utf-8")
         files_feature = FilesFeature()
+        path = tmp_path / "unicode.md"
+        result = files_feature._extract_markdown_content(path)
+        assert result == {"file": "unicode.md", "content": test_content}

     def test_extract_markdown_content_raises_file_not_found(self):
         files_feature = FilesFeature()
         with pytest.raises(FileNotFoundError):
-            )
+            path = str("/non_existent_dir/non_existent_file.md")
+            files_feature._extract_markdown_content(path)

     def test_extract_markdown_content_handles_empty_file(self, tmp_path):
         test_file = tmp_path / "empty.md"
         test_file.write_text("")
         files_feature = FilesFeature()
-        result = files_feature._extract_markdown_content(
-        assert result ==
+        result = files_feature._extract_markdown_content(test_file)
+        assert result == {'file': 'empty.md', 'content': ''}

     def test_extract_markdown_content_raises_unicode_decode_error(self, tmp_path):
         test_file = tmp_path / "invalid_encoding.md"
-        test_file.write_bytes(b"\x80\x81\x82")
+        test_file.write_bytes(b"\x80\x81\x82")
         files_feature = FilesFeature()
         with pytest.raises(UnicodeDecodeError):
+            path = str(tmp_path / "invalid_encoding.md")
+            files_feature._extract_markdown_content(path)

     def test_load_markdown_files_from_directory_loads_files_correctly(self, tmp_path):
         test_content_1 = "# Title 1\nContent 1"
@@ -210,7 +216,7 @@ class TestFilesFeature:
         test_file_2.write_text(test_content_2)

         files_feature = FilesFeature()
-        result = files_feature.load_markdown_files_from_directory(
+        result = files_feature.load_markdown_files_from_directory(tmp_path)

         assert len(result.chunk_names) == 2
         assert len(result.text_list) == 2
@@ -228,7 +234,7 @@ class TestFilesFeature:
         test_file_txt.write_text(test_content_txt)

         files_feature = FilesFeature()
-        result = files_feature.load_markdown_files_from_directory(
+        result = files_feature.load_markdown_files_from_directory(tmp_path)

         assert len(result.chunk_names) == 1
         assert len(result.text_list) == 1
@@ -238,7 +244,7 @@ class TestFilesFeature:

     def test_load_markdown_files_from_directory_handles_empty_directory(self, tmp_path):
         files_feature = FilesFeature()
-        result = files_feature.load_markdown_files_from_directory(
+        result = files_feature.load_markdown_files_from_directory(tmp_path)

         assert len(result.chunk_names) == 0
         assert len(result.text_list) == 0
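The assertions above capture the behavioural change in 1.0.2rc0: the private extraction helpers now take a single path argument and return a dict with the file's base name and its text, rather than the bare string returned by 1.0.1. A minimal sketch of the new return shape, based only on the signatures visible in this diff (the scratch file and directory below are illustrative, not part of the package):

```python
from pathlib import Path
from vectoriz.files import FilesFeature

def check_new_return_shape(tmp_dir: Path) -> None:
    # Illustrative scratch file, mirroring the pytest tmp_path fixtures above.
    note = tmp_dir / "note.txt"
    note.write_text("hello world", encoding="utf-8")

    # In 1.0.2rc0 the helper takes a single path and returns a dict,
    # not the raw string it returned in 1.0.1.
    result = FilesFeature()._extract_txt_content(str(note))
    assert result == {"file": "note.txt", "content": "hello world"}
```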
vectoriz/files.py
CHANGED
@@ -2,9 +2,11 @@ import os
 import docx
 import numpy as np
 from typing import Optional
+from concurrent.futures import ThreadPoolExecutor

 from vectoriz.token_transformer import TokenTransformer

+
 class FileArgument:
     def __init__(
         self,
@@ -50,7 +52,6 @@ class FileArgument:
         Returns:
             None: This method doesn't return anything, it updates the internal state of the object
         """
-
         self.chunk_names.append(filename)
         self.text_list.append(text)
         self.embeddings.append(self._create_embedding(text))
@@ -72,7 +73,7 @@ class FileArgument:

 class FilesFeature:

-    def _extract_txt_content(self,
+    def _extract_txt_content(self, path: str) -> dict[str, str]:
         """
         Extract content from a text file and add it to the response data.

@@ -81,15 +82,13 @@ class FilesFeature:

         Parameters:
         ----------
-            The directory path where the file is located.
-        file : str
+        path : str
             The name of the text file to read.

         Returns:
         -------
+        Optional[str]
+            The content of the text file or None if the file is empty.

         Raises:
         ------
@@ -98,11 +97,12 @@ class FilesFeature:
         UnicodeDecodeError
             If the file cannot be decoded using UTF-8 encoding.
         """
+        file = os.path.basename(path)
+        with open(path, "r", encoding="utf-8") as fl:
+            content = fl.read()
+        return {"file": file, "content": content}
+
+    def _extract_markdown_content(self, path: str) -> dict[str, str]:
         """
         Extract content from a Markdown file and add it to the response data.

@@ -111,9 +111,7 @@ class FilesFeature:

         Parameters:
         ----------
-            The directory path where the file is located.
-        file : str
+        path : str
             The name of the Markdown file to read.

         Returns:
@@ -128,37 +126,41 @@ class FilesFeature:
         UnicodeDecodeError
             If the file cannot be decoded using UTF-8 encoding.
         """
+        file = os.path.basename(path)
+        with open(path, "r", encoding="utf-8") as fl:
+            content = fl.read()
+        return {"file": file, "content": content}

-    def _extract_docx_content(self,
+    def _extract_docx_content(self, path: str) -> dict[str, str]:
         """
         Extracts text content from a Microsoft Word document.
         This method opens a Word document, reads all paragraphs, and joins non-empty
         paragraphs into a single text string. The extracted content is then stored
         using the add_response_data method.
         Args:
-            file (str): The filename of the Word document to process
+            path (str): The path where the Word file is located
         Returns:
+            dict[str, str]: A dictionary containing the file name and the extracted text content.
         Note:
             Empty paragraphs (those that contain only whitespace) are skipped.
             The python-docx library is required for this method to work.
         """
-        doc = docx.Document(
+        file = os.path.basename(path)
+        doc = docx.Document(path)
         full_text = []

         for paragraph in doc.paragraphs:
             content = paragraph.text.strip()
             if len(content) == 0:
                 continue
-            full_text.append(
+            full_text.append(content)
+
+        content = "\n".join(full_text)
+        return {"file": file, "content": content}

-    def load_txt_files_from_directory(
+    def load_txt_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all text files from the specified directory and extract their content.
         This method scans the specified directory for files with the '.txt' extension
@@ -173,30 +175,34 @@ class FilesFeature:
         This method does not return any value. It updates the internal state
         by processing text files found in the directory.
         """
-        argument: FileArgument = FileArgument(
+        argument: FileArgument = FileArgument()
+
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".txt")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_txt_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded txt file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
         return argument

-    def load_docx_files_from_directory(
+    def load_docx_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all Word (.docx) files from the specified directory and extract their content.

         This method iterates through all files in the given directory, identifies those
-        with a .docx extension, and processes them using the
+        with a .docx extension, and processes them using the extract_docx_content method.

         Args:
             directory (str): Path to the directory containing Word files to be processed
@@ -208,25 +214,28 @@ class FilesFeature:
         >>> processor = DocumentProcessor()
         >>> processor.load_word_files("/path/to/documents")
         """
-        argument: FileArgument = FileArgument(
+        argument: FileArgument = FileArgument()
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".docx")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_docx_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded Word file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
         return argument

-    def load_markdown_files_from_directory(
+
+    def load_markdown_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all Markdown (.md) files from the specified directory and extract their content.

@@ -243,25 +252,28 @@ class FilesFeature:
         >>> processor = DocumentProcessor()
         >>> processor.load_markdown_files("/path/to/documents")
         """
-        argument
+        argument = FileArgument()
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".md")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_markdown_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded Markdown file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
         return argument

-    def load_all_files_from_directory(
+    def load_all_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all supported files (.txt and .docx) from the specified directory and its subdirectories.

@@ -274,7 +286,7 @@ class FilesFeature:
         Returns:
             None
         """
-        argument: FileArgument = FileArgument(
+        argument: FileArgument = FileArgument()
         for root, _, files in os.walk(directory):
             for file in files:
                 readed = False
@@ -296,4 +308,5 @@ class FilesFeature:
                     print(f"Loaded file: {file}")
                 elif verbose and not readed:
                     print(f"Error file: {file}")
+
+        return argument
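Taken together, the reworked loaders follow one pattern: collect the paths matching an extension, fan the `_extract_*_content` calls out over a `ThreadPoolExecutor`, then push each `{"file", "content"}` result into `FileArgument.add_data` and finally return the populated `FileArgument`. A short usage sketch based on the signatures introduced in this diff (the directory is a placeholder borrowed from the docstring examples):

```python
from vectoriz.files import FilesFeature

feature = FilesFeature()

# verbose=True makes the loader print one line per file it ingests.
argument = feature.load_markdown_files_from_directory("/path/to/documents", verbose=True)

# FileArgument accumulates parallel lists of names, raw text and embeddings.
print(len(argument.chunk_names), "markdown files loaded")
```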
{vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_files.py,sha256=
+tests/test_files.py,sha256=GQXNbGPUZeEzl5cE70D8OnyGbQr7kTnEBBvqK4ikKmo,11392
 tests/test_token_transformer.py,sha256=xfB6_aP9pYSDHtUJzt9dioP_XBTZPvDnwAMWylyfuKQ,7796
 tests/test_vector_db.py,sha256=4vFxM6nhFFtI4ERuEY61dnQGsc7B90JBcn2_mvT8bWA,18369
 vectoriz/__init__.py,sha256=fnnle0EjVejiZQ8t243kvFiqcTTFh9dzmZbNwayjh4U,156
-vectoriz/files.py,sha256=
+vectoriz/files.py,sha256=IPNVztf3aNNPHvMj2lb7Yuf7akKTu3n7hsVYT97CzUY,11438
 vectoriz/token_transformer.py,sha256=zx8TpCxYhrQYzvZy9JaerhniFY7IxZcQIiHedOzAZyQ,6957
 vectoriz/vector_db.py,sha256=EqjKOTK1P4zP7wCmMo_Y2GsPzVP02UOzvurX-nTVuqI,6830
-vectoriz-1.0.
-vectoriz-1.0.
-vectoriz-1.0.
-vectoriz-1.0.
+vectoriz-1.0.2rc0.dist-info/METADATA,sha256=CNvsojBRCYovh9DMnquIKh7IUVxdiy46XGmwRCibIpQ,3852
+vectoriz-1.0.2rc0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+vectoriz-1.0.2rc0.dist-info/top_level.txt,sha256=Tcfk3kazBwJ_yySjjhlIhLoTWLQGSb5xV006X18O6Nk,15
+vectoriz-1.0.2rc0.dist-info/RECORD,,
{vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/WHEEL
File without changes
{vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/top_level.txt
File without changes