vectoriz 1.0.1__py3-none-any.whl → 1.0.2rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tests/test_files.py CHANGED
@@ -98,22 +98,22 @@ class TestFilesFeature:
98
98
  test_file = tmp_path / "test.txt"
99
99
  test_file.write_text(test_content)
100
100
  files_feature = FilesFeature()
101
- result = files_feature._extract_txt_content(str(tmp_path), "test.txt")
102
- assert result == test_content
101
+ result = files_feature._extract_txt_content(test_file)
102
+ assert result == {"file": "test.txt", "content": test_content}
103
103
 
104
104
  def test_extract_txt_content_with_unicode_chars(self, tmp_path):
105
105
  test_content = "Unicode content: àáâãäåæç"
106
106
  test_file = tmp_path / "unicode.txt"
107
107
  test_file.write_text(test_content, encoding="utf-8")
108
108
  files_feature = FilesFeature()
109
- result = files_feature._extract_txt_content(str(tmp_path), "unicode.txt")
110
- assert result == test_content
109
+ result = files_feature._extract_txt_content(test_file)
110
+ assert result == {"file": "unicode.txt", "content": test_content}
111
111
 
112
112
  def test_extract_txt_content_raises_file_not_found(self):
113
113
  files_feature = FilesFeature()
114
114
  with pytest.raises(FileNotFoundError):
115
115
  files_feature._extract_txt_content(
116
- "/non_existent_dir", "non_existent_file.txt"
116
+ "/non_existent_dir/non_existent_file.txt"
117
117
  )
118
118
 
119
119
  def test_extract_docx_content_reads_file_correctly(self, tmp_path, monkeypatch):
@@ -126,9 +126,10 @@ class TestFilesFeature:
126
126
 
127
127
  monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
128
128
  files_feature = FilesFeature()
129
- result = files_feature._extract_docx_content(str(tmp_path), "test.docx")
129
+ path = tmp_path / "test.docx"
130
+ result = files_feature._extract_docx_content(path)
130
131
 
131
- assert result == "Paragraph 1\nParagraph 2"
132
+ assert result == {"file": "test.docx", "content": "Paragraph 1\nParagraph 2"}
132
133
 
133
134
  def test_extract_docx_content_skips_empty_paragraphs(self, tmp_path, monkeypatch):
134
135
  mock_doc = MagicMock()
@@ -142,9 +143,10 @@ class TestFilesFeature:
142
143
 
143
144
  monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
144
145
  files_feature = FilesFeature()
145
- result = files_feature._extract_docx_content(str(tmp_path), "test.docx")
146
+ path = tmp_path / "test.docx"
147
+ result = files_feature._extract_docx_content(path)
146
148
 
147
- assert result == "Paragraph 1\nParagraph 3"
149
+ assert result == {"file": "test.docx", "content": "Paragraph 1\nParagraph 3"}
148
150
 
149
151
  def test_extract_docx_content_exception_handling(self, tmp_path, monkeypatch):
150
152
  def mock_document(_):
@@ -154,52 +156,56 @@ class TestFilesFeature:
154
156
 
155
157
  files_feature = FilesFeature()
156
158
  with pytest.raises(Exception):
157
- files_feature._extract_docx_content(str(tmp_path), "invalid.docx")
159
+ path = tmp_path / "/invalid.docx"
160
+ files_feature._extract_docx_content(path)
158
161
 
159
162
  def test_extract_docx_content_with_no_paragraphs(self, tmp_path, monkeypatch):
160
163
  mock_doc = MagicMock()
161
164
  mock_doc.paragraphs = []
162
165
  monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
163
166
  files_feature = FilesFeature()
164
- result = files_feature._extract_docx_content(str(tmp_path), "empty.docx")
165
- assert result == ""
166
-
167
+ path = tmp_path / "empty.docx"
168
+ result = files_feature._extract_docx_content(path)
169
+ assert result == {"file": "empty.docx", "content": ""}
170
+
167
171
  def test_extract_markdown_content_reads_file_correctly(self, tmp_path):
168
172
  test_content = "# Markdown Title\nThis is some markdown content."
169
173
  test_file = tmp_path / "test.md"
170
174
  test_file.write_text(test_content)
171
175
  files_feature = FilesFeature()
172
- result = files_feature._extract_markdown_content(str(tmp_path), "test.md")
173
- assert result == test_content
176
+ path = tmp_path / "test.md"
177
+ result = files_feature._extract_markdown_content(path)
178
+ assert result == {"file": "test.md", "content": test_content}
174
179
 
175
180
  def test_extract_markdown_content_with_unicode_chars(self, tmp_path):
176
181
  test_content = "# Unicode Title\nContent with unicode: àáâãäåæç"
177
182
  test_file = tmp_path / "unicode.md"
178
183
  test_file.write_text(test_content, encoding="utf-8")
179
184
  files_feature = FilesFeature()
180
- result = files_feature._extract_markdown_content(str(tmp_path), "unicode.md")
181
- assert result == test_content
185
+ path = tmp_path / "unicode.md"
186
+ result = files_feature._extract_markdown_content(path)
187
+ assert result == {"file": "unicode.md", "content": test_content}
182
188
 
183
189
  def test_extract_markdown_content_raises_file_not_found(self):
184
190
  files_feature = FilesFeature()
185
191
  with pytest.raises(FileNotFoundError):
186
- files_feature._extract_markdown_content(
187
- "/non_existent_dir", "non_existent_file.md"
188
- )
192
+ path = str("/non_existent_dir/non_existent_file.md")
193
+ files_feature._extract_markdown_content(path)
189
194
 
190
195
  def test_extract_markdown_content_handles_empty_file(self, tmp_path):
191
196
  test_file = tmp_path / "empty.md"
192
197
  test_file.write_text("")
193
198
  files_feature = FilesFeature()
194
- result = files_feature._extract_markdown_content(str(tmp_path), "empty.md")
195
- assert result == ""
199
+ result = files_feature._extract_markdown_content(test_file)
200
+ assert result == {'file': 'empty.md', 'content': ''}
196
201
 
197
202
  def test_extract_markdown_content_raises_unicode_decode_error(self, tmp_path):
198
203
  test_file = tmp_path / "invalid_encoding.md"
199
- test_file.write_bytes(b"\x80\x81\x82") # Invalid UTF-8 bytes
204
+ test_file.write_bytes(b"\x80\x81\x82")
200
205
  files_feature = FilesFeature()
201
206
  with pytest.raises(UnicodeDecodeError):
202
- files_feature._extract_markdown_content(str(tmp_path), "invalid_encoding.md")
207
+ path = str(tmp_path / "invalid_encoding.md")
208
+ files_feature._extract_markdown_content(path)
203
209
 
204
210
  def test_load_markdown_files_from_directory_loads_files_correctly(self, tmp_path):
205
211
  test_content_1 = "# Title 1\nContent 1"
@@ -210,7 +216,7 @@ class TestFilesFeature:
210
216
  test_file_2.write_text(test_content_2)
211
217
 
212
218
  files_feature = FilesFeature()
213
- result = files_feature.load_markdown_files_from_directory(str(tmp_path))
219
+ result = files_feature.load_markdown_files_from_directory(tmp_path)
214
220
 
215
221
  assert len(result.chunk_names) == 2
216
222
  assert len(result.text_list) == 2
@@ -228,7 +234,7 @@ class TestFilesFeature:
228
234
  test_file_txt.write_text(test_content_txt)
229
235
 
230
236
  files_feature = FilesFeature()
231
- result = files_feature.load_markdown_files_from_directory(str(tmp_path))
237
+ result = files_feature.load_markdown_files_from_directory(tmp_path)
232
238
 
233
239
  assert len(result.chunk_names) == 1
234
240
  assert len(result.text_list) == 1
@@ -238,7 +244,7 @@ class TestFilesFeature:
238
244
 
239
245
  def test_load_markdown_files_from_directory_handles_empty_directory(self, tmp_path):
240
246
  files_feature = FilesFeature()
241
- result = files_feature.load_markdown_files_from_directory(str(tmp_path))
247
+ result = files_feature.load_markdown_files_from_directory(tmp_path)
242
248
 
243
249
  assert len(result.chunk_names) == 0
244
250
  assert len(result.text_list) == 0
vectoriz/files.py CHANGED
@@ -2,9 +2,11 @@ import os
2
2
  import docx
3
3
  import numpy as np
4
4
  from typing import Optional
5
+ from concurrent.futures import ThreadPoolExecutor
5
6
 
6
7
  from vectoriz.token_transformer import TokenTransformer
7
8
 
9
+
8
10
  class FileArgument:
9
11
  def __init__(
10
12
  self,
@@ -50,7 +52,6 @@ class FileArgument:
50
52
  Returns:
51
53
  None: This method doesn't return anything, it updates the internal state of the object
52
54
  """
53
-
54
55
  self.chunk_names.append(filename)
55
56
  self.text_list.append(text)
56
57
  self.embeddings.append(self._create_embedding(text))
@@ -72,7 +73,7 @@ class FileArgument:
72
73
 
73
74
  class FilesFeature:
74
75
 
75
- def _extract_txt_content(self, directory: str, file: str) -> Optional[str]:
76
+ def _extract_txt_content(self, path: str) -> dict[str, str]:
76
77
  """
77
78
  Extract content from a text file and add it to the response data.
78
79
 
@@ -81,15 +82,13 @@ class FilesFeature:
81
82
 
82
83
  Parameters:
83
84
  ----------
84
- directory : str
85
- The directory path where the file is located.
86
- file : str
85
+ path : str
87
86
  The name of the text file to read.
88
87
 
89
88
  Returns:
90
89
  -------
91
- None
92
- This method doesn't return any value but updates the internal response data.
90
+ Optional[str]
91
+ The content of the text file or None if the file is empty.
93
92
 
94
93
  Raises:
95
94
  ------
@@ -98,11 +97,12 @@ class FilesFeature:
98
97
  UnicodeDecodeError
99
98
  If the file cannot be decoded using UTF-8 encoding.
100
99
  """
101
- with open(os.path.join(directory, file), "r", encoding="utf-8") as fl:
102
- text = fl.read()
103
- return text
104
-
105
- def _extract_markdown_content(self, directory: str, file: str) -> Optional[str]:
100
+ file = os.path.basename(path)
101
+ with open(path, "r", encoding="utf-8") as fl:
102
+ content = fl.read()
103
+ return {"file": file, "content": content}
104
+
105
+ def _extract_markdown_content(self, path: str) -> dict[str, str]:
106
106
  """
107
107
  Extract content from a Markdown file and add it to the response data.
108
108
 
@@ -111,9 +111,7 @@ class FilesFeature:
111
111
 
112
112
  Parameters:
113
113
  ----------
114
- directory : str
115
- The directory path where the file is located.
116
- file : str
114
+ path : str
117
115
  The name of the Markdown file to read.
118
116
 
119
117
  Returns:
@@ -128,37 +126,41 @@ class FilesFeature:
128
126
  UnicodeDecodeError
129
127
  If the file cannot be decoded using UTF-8 encoding.
130
128
  """
131
- with open(os.path.join(directory, file), "r", encoding="utf-8") as fl:
132
- text = fl.read()
133
- return text
129
+ file = os.path.basename(path)
130
+ with open(path, "r", encoding="utf-8") as fl:
131
+ content = fl.read()
132
+ return {"file": file, "content": content}
134
133
 
135
- def _extract_docx_content(self, directory: str, file: str) -> Optional[str]:
134
+ def _extract_docx_content(self, path: str) -> dict[str, str]:
136
135
  """
137
136
  Extracts text content from a Microsoft Word document.
138
137
  This method opens a Word document, reads all paragraphs, and joins non-empty
139
138
  paragraphs into a single text string. The extracted content is then stored
140
139
  using the add_response_data method.
141
140
  Args:
142
- directory (str): The directory path where the Word file is located
143
- file (str): The filename of the Word document to process
141
+ path (str): The path where the Word file is located
144
142
  Returns:
145
- Optional[str]: The extracted text content or None if no content is found.
143
+ dict[str, str]: A dictionary containing the file name and the extracted text content.
146
144
  Note:
147
145
  Empty paragraphs (those that contain only whitespace) are skipped.
148
146
  The python-docx library is required for this method to work.
149
147
  """
150
- file_path = os.path.join(directory, file)
151
- doc = docx.Document(file_path)
148
+ file = os.path.basename(path)
149
+ doc = docx.Document(path)
152
150
  full_text = []
153
151
 
154
152
  for paragraph in doc.paragraphs:
155
153
  content = paragraph.text.strip()
156
154
  if len(content) == 0:
157
155
  continue
158
- full_text.append(paragraph.text)
159
- return "\n".join(full_text)
156
+ full_text.append(content)
157
+
158
+ content = "\n".join(full_text)
159
+ return {"file": file, "content": content}
160
160
 
161
- def load_txt_files_from_directory(self, directory: str, verbose: bool = False) -> FileArgument:
161
+ def load_txt_files_from_directory(
162
+ self, directory: str, verbose: bool = False
163
+ ) -> FileArgument:
162
164
  """
163
165
  Load all text files from the specified directory and extract their content.
164
166
  This method scans the specified directory for files with the '.txt' extension
@@ -173,30 +175,34 @@ class FilesFeature:
173
175
  This method does not return any value. It updates the internal state
174
176
  by processing text files found in the directory.
175
177
  """
176
- argument: FileArgument = FileArgument([], [], [])
177
- for file in os.listdir(directory):
178
- if not file.endswith(".txt"):
179
- if verbose:
180
- print(f"Error file: {file}")
181
- continue
182
-
183
- text = self._extract_txt_content(directory, file)
184
- if text is None:
185
- if verbose:
186
- print(f"Error file: {file}")
187
- continue
188
-
189
- argument.add_data(file, text)
190
- if verbose:
191
- print(f"Loaded txt file: {file}")
178
+ argument: FileArgument = FileArgument()
179
+
180
+ paths = [
181
+ os.path.join(directory, file)
182
+ for file in os.listdir(directory)
183
+ if file.endswith(".txt")
184
+ ]
185
+
186
+ with ThreadPoolExecutor() as executor:
187
+ results = list(executor.map(self._extract_txt_content, paths))
188
+
189
+ add_data_func = lambda result: (
190
+ argument.add_data(result.get("file"), result.get("content")),
191
+ print(f"Loaded txt file: {result.get('file')}") if verbose else print('')
192
+ )
193
+ with ThreadPoolExecutor() as executor:
194
+ executor.map(add_data_func, results)
195
+
192
196
  return argument
193
197
 
194
- def load_docx_files_from_directory(self, directory: str, verbose: bool = False) -> FileArgument:
198
+ def load_docx_files_from_directory(
199
+ self, directory: str, verbose: bool = False
200
+ ) -> FileArgument:
195
201
  """
196
202
  Load all Word (.docx) files from the specified directory and extract their content.
197
203
 
198
204
  This method iterates through all files in the given directory, identifies those
199
- with a .docx extension, and processes them using the extract_word_content method.
205
+ with a .docx extension, and processes them using the extract_docx_content method.
200
206
 
201
207
  Args:
202
208
  directory (str): Path to the directory containing Word files to be processed
@@ -208,25 +214,28 @@ class FilesFeature:
208
214
  >>> processor = DocumentProcessor()
209
215
  >>> processor.load_word_files("/path/to/documents")
210
216
  """
211
- argument: FileArgument = FileArgument([], [], [])
212
- for file in os.listdir(directory):
213
- if not file.endswith(".docx"):
214
- if verbose:
215
- print(f"Error file: {file}")
216
- continue
217
-
218
- text = self._extract_docx_content(directory, file)
219
- if text is None:
220
- if verbose:
221
- print(f"Error file: {file}")
222
- continue
223
-
224
- argument.add_data(file, text)
225
- if verbose:
226
- print(f"Loaded Word file: {file}")
217
+ argument: FileArgument = FileArgument()
218
+ paths = [
219
+ os.path.join(directory, file)
220
+ for file in os.listdir(directory)
221
+ if file.endswith(".docx")
222
+ ]
223
+
224
+ with ThreadPoolExecutor() as executor:
225
+ results = list(executor.map(self._extract_docx_content, paths))
226
+
227
+ add_data_func = lambda result: (
228
+ argument.add_data(result.get("file"), result.get("content")),
229
+ print(f"Loaded Word file: {result.get('file')}") if verbose else print('')
230
+ )
231
+ with ThreadPoolExecutor() as executor:
232
+ executor.map(add_data_func, results)
233
+
227
234
  return argument
228
-
229
- def load_markdown_files_from_directory(self, directory: str, verbose: bool = False) -> FileArgument:
235
+
236
+ def load_markdown_files_from_directory(
237
+ self, directory: str, verbose: bool = False
238
+ ) -> FileArgument:
230
239
  """
231
240
  Load all Markdown (.md) files from the specified directory and extract their content.
232
241
 
@@ -243,25 +252,28 @@ class FilesFeature:
243
252
  >>> processor = DocumentProcessor()
244
253
  >>> processor.load_markdown_files("/path/to/documents")
245
254
  """
246
- argument: FileArgument = FileArgument([], [], [])
247
- for file in os.listdir(directory):
248
- if not file.endswith(".md"):
249
- if verbose:
250
- print(f"Error file: {file}")
251
- continue
252
-
253
- text = self._extract_markdown_content(directory, file)
254
- if text is None:
255
- if verbose:
256
- print(f"Error file: {file}")
257
- continue
258
-
259
- argument.add_data(file, text)
260
- if verbose:
261
- print(f"Loaded Markdown file: {file}")
255
+ argument = FileArgument()
256
+ paths = [
257
+ os.path.join(directory, file)
258
+ for file in os.listdir(directory)
259
+ if file.endswith(".md")
260
+ ]
261
+
262
+ with ThreadPoolExecutor() as executor:
263
+ results = list(executor.map(self._extract_markdown_content, paths))
264
+
265
+ add_data_func = lambda result: (
266
+ argument.add_data(result.get("file"), result.get("content")),
267
+ print(f"Loaded Markdown file: {result.get('file')}") if verbose else print('')
268
+ )
269
+ with ThreadPoolExecutor() as executor:
270
+ executor.map(add_data_func, results)
271
+
262
272
  return argument
263
273
 
264
- def load_all_files_from_directory(self, directory: str, verbose: bool = False) -> FileArgument:
274
+ def load_all_files_from_directory(
275
+ self, directory: str, verbose: bool = False
276
+ ) -> FileArgument:
265
277
  """
266
278
  Load all supported files (.txt and .docx) from the specified directory and its subdirectories.
267
279
 
@@ -274,7 +286,7 @@ class FilesFeature:
274
286
  Returns:
275
287
  None
276
288
  """
277
- argument: FileArgument = FileArgument([], [], [])
289
+ argument: FileArgument = FileArgument()
278
290
  for root, _, files in os.walk(directory):
279
291
  for file in files:
280
292
  readed = False
@@ -296,4 +308,5 @@ class FilesFeature:
296
308
  print(f"Loaded file: {file}")
297
309
  elif verbose and not readed:
298
310
  print(f"Error file: {file}")
299
- return argument
311
+
312
+ return argument
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vectoriz
3
- Version: 1.0.1
3
+ Version: 1.0.2rc0
4
4
  Summary: Python library for creating vectorized data from text or files.
5
5
  Home-page: https://github.com/PedroHenriqueDevBR/vectoriz
6
6
  Author: PedroHenriqueDevBR
@@ -1,12 +1,12 @@
1
1
  tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- tests/test_files.py,sha256=nYh7IXtsg97wOf0ACUf5izKDKqcIMmHVOHl7od1SU4Q,11094
2
+ tests/test_files.py,sha256=GQXNbGPUZeEzl5cE70D8OnyGbQr7kTnEBBvqK4ikKmo,11392
3
3
  tests/test_token_transformer.py,sha256=xfB6_aP9pYSDHtUJzt9dioP_XBTZPvDnwAMWylyfuKQ,7796
4
4
  tests/test_vector_db.py,sha256=4vFxM6nhFFtI4ERuEY61dnQGsc7B90JBcn2_mvT8bWA,18369
5
5
  vectoriz/__init__.py,sha256=fnnle0EjVejiZQ8t243kvFiqcTTFh9dzmZbNwayjh4U,156
6
- vectoriz/files.py,sha256=4U-n3fag6ci2ZdWoBG5zSqmtsK9XLQ103KLkSvs7f_I,11371
6
+ vectoriz/files.py,sha256=IPNVztf3aNNPHvMj2lb7Yuf7akKTu3n7hsVYT97CzUY,11438
7
7
  vectoriz/token_transformer.py,sha256=zx8TpCxYhrQYzvZy9JaerhniFY7IxZcQIiHedOzAZyQ,6957
8
8
  vectoriz/vector_db.py,sha256=EqjKOTK1P4zP7wCmMo_Y2GsPzVP02UOzvurX-nTVuqI,6830
9
- vectoriz-1.0.1.dist-info/METADATA,sha256=kqHf3Mvv54Lh9CpARrFc12AVdylvYE2n_yWsDJx4gig,3849
10
- vectoriz-1.0.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
11
- vectoriz-1.0.1.dist-info/top_level.txt,sha256=Tcfk3kazBwJ_yySjjhlIhLoTWLQGSb5xV006X18O6Nk,15
12
- vectoriz-1.0.1.dist-info/RECORD,,
9
+ vectoriz-1.0.2rc0.dist-info/METADATA,sha256=CNvsojBRCYovh9DMnquIKh7IUVxdiy46XGmwRCibIpQ,3852
10
+ vectoriz-1.0.2rc0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
11
+ vectoriz-1.0.2rc0.dist-info/top_level.txt,sha256=Tcfk3kazBwJ_yySjjhlIhLoTWLQGSb5xV006X18O6Nk,15
12
+ vectoriz-1.0.2rc0.dist-info/RECORD,,