vectoriz 1.0.1__py3-none-any.whl → 1.0.2rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_files.py +33 -27
- vectoriz/files.py +95 -82
- {vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/METADATA +1 -1
- {vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/RECORD +6 -6
- {vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/WHEEL +0 -0
- {vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/top_level.txt +0 -0
tests/test_files.py
CHANGED
@@ -98,22 +98,22 @@ class TestFilesFeature:
         test_file = tmp_path / "test.txt"
         test_file.write_text(test_content)
         files_feature = FilesFeature()
-        result = files_feature._extract_txt_content(
-        assert result == test_content
+        result = files_feature._extract_txt_content(test_file)
+        assert result == {"file": "test.txt", "content": test_content}

     def test_extract_txt_content_with_unicode_chars(self, tmp_path):
         test_content = "Unicode content: àáâãäåæç"
         test_file = tmp_path / "unicode.txt"
         test_file.write_text(test_content, encoding="utf-8")
         files_feature = FilesFeature()
-        result = files_feature._extract_txt_content(
-        assert result == test_content
+        result = files_feature._extract_txt_content(test_file)
+        assert result == {"file": "unicode.txt", "content": test_content}

     def test_extract_txt_content_raises_file_not_found(self):
         files_feature = FilesFeature()
         with pytest.raises(FileNotFoundError):
             files_feature._extract_txt_content(
-                "/non_existent_dir
+                "/non_existent_dir/non_existent_file.txt"
             )

     def test_extract_docx_content_reads_file_correctly(self, tmp_path, monkeypatch):
@@ -126,9 +126,10 @@ class TestFilesFeature:

         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
+        path = tmp_path / "test.docx"
+        result = files_feature._extract_docx_content(path)

-        assert result == "Paragraph 1\nParagraph 2"
+        assert result == {"file": "test.docx", "content": "Paragraph 1\nParagraph 2"}

     def test_extract_docx_content_skips_empty_paragraphs(self, tmp_path, monkeypatch):
         mock_doc = MagicMock()
@@ -142,9 +143,10 @@ class TestFilesFeature:

         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
+        path = tmp_path / "test.docx"
+        result = files_feature._extract_docx_content(path)

-        assert result == "Paragraph 1\nParagraph 3"
+        assert result == {"file": "test.docx", "content": "Paragraph 1\nParagraph 3"}

     def test_extract_docx_content_exception_handling(self, tmp_path, monkeypatch):
         def mock_document(_):
@@ -154,52 +156,56 @@ class TestFilesFeature:

         files_feature = FilesFeature()
         with pytest.raises(Exception):
+            path = tmp_path / "/invalid.docx"
+            files_feature._extract_docx_content(path)

     def test_extract_docx_content_with_no_paragraphs(self, tmp_path, monkeypatch):
         mock_doc = MagicMock()
         mock_doc.paragraphs = []
         monkeypatch.setattr(docx, "Document", lambda _: mock_doc)
         files_feature = FilesFeature()
+        path = tmp_path / "empty.docx"
+        result = files_feature._extract_docx_content(path)
+        assert result == {"file": "empty.docx", "content": ""}
+
     def test_extract_markdown_content_reads_file_correctly(self, tmp_path):
         test_content = "# Markdown Title\nThis is some markdown content."
         test_file = tmp_path / "test.md"
         test_file.write_text(test_content)
         files_feature = FilesFeature()
+        path = tmp_path / "test.md"
+        result = files_feature._extract_markdown_content(path)
+        assert result == {"file": "test.md", "content": test_content}

     def test_extract_markdown_content_with_unicode_chars(self, tmp_path):
         test_content = "# Unicode Title\nContent with unicode: àáâãäåæç"
         test_file = tmp_path / "unicode.md"
         test_file.write_text(test_content, encoding="utf-8")
         files_feature = FilesFeature()
+        path = tmp_path / "unicode.md"
+        result = files_feature._extract_markdown_content(path)
+        assert result == {"file": "unicode.md", "content": test_content}

     def test_extract_markdown_content_raises_file_not_found(self):
         files_feature = FilesFeature()
         with pytest.raises(FileNotFoundError):
-            )
+            path = str("/non_existent_dir/non_existent_file.md")
+            files_feature._extract_markdown_content(path)

     def test_extract_markdown_content_handles_empty_file(self, tmp_path):
         test_file = tmp_path / "empty.md"
         test_file.write_text("")
         files_feature = FilesFeature()
-        result = files_feature._extract_markdown_content(
-        assert result ==
+        result = files_feature._extract_markdown_content(test_file)
+        assert result == {'file': 'empty.md', 'content': ''}

     def test_extract_markdown_content_raises_unicode_decode_error(self, tmp_path):
         test_file = tmp_path / "invalid_encoding.md"
-        test_file.write_bytes(b"\x80\x81\x82")
+        test_file.write_bytes(b"\x80\x81\x82")
         files_feature = FilesFeature()
         with pytest.raises(UnicodeDecodeError):
+            path = str(tmp_path / "invalid_encoding.md")
+            files_feature._extract_markdown_content(path)

     def test_load_markdown_files_from_directory_loads_files_correctly(self, tmp_path):
         test_content_1 = "# Title 1\nContent 1"
@@ -210,7 +216,7 @@ class TestFilesFeature:
         test_file_2.write_text(test_content_2)

         files_feature = FilesFeature()
-        result = files_feature.load_markdown_files_from_directory(
+        result = files_feature.load_markdown_files_from_directory(tmp_path)

         assert len(result.chunk_names) == 2
         assert len(result.text_list) == 2
@@ -228,7 +234,7 @@ class TestFilesFeature:
         test_file_txt.write_text(test_content_txt)

         files_feature = FilesFeature()
-        result = files_feature.load_markdown_files_from_directory(
+        result = files_feature.load_markdown_files_from_directory(tmp_path)

         assert len(result.chunk_names) == 1
         assert len(result.text_list) == 1
@@ -238,7 +244,7 @@ class TestFilesFeature:

     def test_load_markdown_files_from_directory_handles_empty_directory(self, tmp_path):
         files_feature = FilesFeature()
-        result = files_feature.load_markdown_files_from_directory(
+        result = files_feature.load_markdown_files_from_directory(tmp_path)

         assert len(result.chunk_names) == 0
         assert len(result.text_list) == 0
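The assertions above capture the behavioural change in 1.0.2rc0: the private extraction helpers now take a single path argument and return a dict with the file's base name and its text, rather than the bare string returned by 1.0.1. A minimal sketch of the new return shape, based only on the signatures visible in this diff (the scratch file and directory below are illustrative, not part of the package):

```python
from pathlib import Path
from vectoriz.files import FilesFeature

def check_new_return_shape(tmp_dir: Path) -> None:
    # Illustrative scratch file, mirroring the pytest tmp_path fixtures above.
    note = tmp_dir / "note.txt"
    note.write_text("hello world", encoding="utf-8")

    # In 1.0.2rc0 the helper takes a single path and returns a dict,
    # not the raw string it returned in 1.0.1.
    result = FilesFeature()._extract_txt_content(str(note))
    assert result == {"file": "note.txt", "content": "hello world"}
```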
vectoriz/files.py
CHANGED
@@ -2,9 +2,11 @@ import os
 import docx
 import numpy as np
 from typing import Optional
+from concurrent.futures import ThreadPoolExecutor

 from vectoriz.token_transformer import TokenTransformer

+
 class FileArgument:
     def __init__(
         self,
@@ -50,7 +52,6 @@ class FileArgument:
         Returns:
             None: This method doesn't return anything, it updates the internal state of the object
         """
-
         self.chunk_names.append(filename)
         self.text_list.append(text)
         self.embeddings.append(self._create_embedding(text))
@@ -72,7 +73,7 @@ class FileArgument:

 class FilesFeature:

-    def _extract_txt_content(self,
+    def _extract_txt_content(self, path: str) -> dict[str, str]:
         """
         Extract content from a text file and add it to the response data.

@@ -81,15 +82,13 @@ class FilesFeature:

         Parameters:
         ----------
-            The directory path where the file is located.
-        file : str
+        path : str
             The name of the text file to read.

         Returns:
         -------
+        Optional[str]
+            The content of the text file or None if the file is empty.

         Raises:
         ------
@@ -98,11 +97,12 @@ class FilesFeature:
         UnicodeDecodeError
             If the file cannot be decoded using UTF-8 encoding.
         """
+        file = os.path.basename(path)
+        with open(path, "r", encoding="utf-8") as fl:
+            content = fl.read()
+        return {"file": file, "content": content}
+
+    def _extract_markdown_content(self, path: str) -> dict[str, str]:
         """
         Extract content from a Markdown file and add it to the response data.

@@ -111,9 +111,7 @@ class FilesFeature:

         Parameters:
         ----------
-            The directory path where the file is located.
-        file : str
+        path : str
             The name of the Markdown file to read.

         Returns:
@@ -128,37 +126,41 @@ class FilesFeature:
         UnicodeDecodeError
             If the file cannot be decoded using UTF-8 encoding.
         """
+        file = os.path.basename(path)
+        with open(path, "r", encoding="utf-8") as fl:
+            content = fl.read()
+        return {"file": file, "content": content}

-    def _extract_docx_content(self,
+    def _extract_docx_content(self, path: str) -> dict[str, str]:
         """
         Extracts text content from a Microsoft Word document.
         This method opens a Word document, reads all paragraphs, and joins non-empty
         paragraphs into a single text string. The extracted content is then stored
         using the add_response_data method.
         Args:
-            file (str): The filename of the Word document to process
+            path (str): The path where the Word file is located
         Returns:
+            dict[str, str]: A dictionary containing the file name and the extracted text content.
         Note:
             Empty paragraphs (those that contain only whitespace) are skipped.
             The python-docx library is required for this method to work.
         """
-        doc = docx.Document(
+        file = os.path.basename(path)
+        doc = docx.Document(path)
         full_text = []

         for paragraph in doc.paragraphs:
             content = paragraph.text.strip()
             if len(content) == 0:
                 continue
-            full_text.append(
+            full_text.append(content)
+
+        content = "\n".join(full_text)
+        return {"file": file, "content": content}

-    def load_txt_files_from_directory(
+    def load_txt_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all text files from the specified directory and extract their content.
         This method scans the specified directory for files with the '.txt' extension
@@ -173,30 +175,34 @@ class FilesFeature:
         This method does not return any value. It updates the internal state
         by processing text files found in the directory.
         """
-        argument: FileArgument = FileArgument(
+        argument: FileArgument = FileArgument()
+
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".txt")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_txt_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded txt file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
         return argument

-    def load_docx_files_from_directory(
+    def load_docx_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all Word (.docx) files from the specified directory and extract their content.

         This method iterates through all files in the given directory, identifies those
-        with a .docx extension, and processes them using the
+        with a .docx extension, and processes them using the extract_docx_content method.

         Args:
             directory (str): Path to the directory containing Word files to be processed
@@ -208,25 +214,28 @@ class FilesFeature:
         >>> processor = DocumentProcessor()
         >>> processor.load_word_files("/path/to/documents")
         """
-        argument: FileArgument = FileArgument(
+        argument: FileArgument = FileArgument()
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".docx")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_docx_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded Word file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
         return argument

-    def load_markdown_files_from_directory(
+
+    def load_markdown_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all Markdown (.md) files from the specified directory and extract their content.

@@ -243,25 +252,28 @@ class FilesFeature:
         >>> processor = DocumentProcessor()
         >>> processor.load_markdown_files("/path/to/documents")
         """
-        argument
+        argument = FileArgument()
+        paths = [
+            os.path.join(directory, file)
+            for file in os.listdir(directory)
+            if file.endswith(".md")
+        ]
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(self._extract_markdown_content, paths))
+
+        add_data_func = lambda result: (
+            argument.add_data(result.get("file"), result.get("content")),
+            print(f"Loaded Markdown file: {result.get('file')}") if verbose else print('')
+        )
+        with ThreadPoolExecutor() as executor:
+            executor.map(add_data_func, results)
+
         return argument

-    def load_all_files_from_directory(
+    def load_all_files_from_directory(
+        self, directory: str, verbose: bool = False
+    ) -> FileArgument:
         """
         Load all supported files (.txt and .docx) from the specified directory and its subdirectories.

@@ -274,7 +286,7 @@ class FilesFeature:
         Returns:
             None
         """
-        argument: FileArgument = FileArgument(
+        argument: FileArgument = FileArgument()
         for root, _, files in os.walk(directory):
             for file in files:
                 readed = False
@@ -296,4 +308,5 @@ class FilesFeature:
                     print(f"Loaded file: {file}")
                 elif verbose and not readed:
                     print(f"Error file: {file}")
+
+        return argument
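Taken together, the reworked loaders follow one pattern: collect the paths matching an extension, fan the `_extract_*_content` calls out over a `ThreadPoolExecutor`, then push each `{"file", "content"}` result into `FileArgument.add_data` and finally return the populated `FileArgument`. A short usage sketch based on the signatures introduced in this diff (the directory is a placeholder borrowed from the docstring examples):

```python
from vectoriz.files import FilesFeature

feature = FilesFeature()

# verbose=True makes the loader print one line per file it ingests.
argument = feature.load_markdown_files_from_directory("/path/to/documents", verbose=True)

# FileArgument accumulates parallel lists of names, raw text and embeddings.
print(len(argument.chunk_names), "markdown files loaded")
```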
{vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
 tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/test_files.py,sha256=
+tests/test_files.py,sha256=GQXNbGPUZeEzl5cE70D8OnyGbQr7kTnEBBvqK4ikKmo,11392
 tests/test_token_transformer.py,sha256=xfB6_aP9pYSDHtUJzt9dioP_XBTZPvDnwAMWylyfuKQ,7796
 tests/test_vector_db.py,sha256=4vFxM6nhFFtI4ERuEY61dnQGsc7B90JBcn2_mvT8bWA,18369
 vectoriz/__init__.py,sha256=fnnle0EjVejiZQ8t243kvFiqcTTFh9dzmZbNwayjh4U,156
-vectoriz/files.py,sha256=
+vectoriz/files.py,sha256=IPNVztf3aNNPHvMj2lb7Yuf7akKTu3n7hsVYT97CzUY,11438
 vectoriz/token_transformer.py,sha256=zx8TpCxYhrQYzvZy9JaerhniFY7IxZcQIiHedOzAZyQ,6957
 vectoriz/vector_db.py,sha256=EqjKOTK1P4zP7wCmMo_Y2GsPzVP02UOzvurX-nTVuqI,6830
-vectoriz-1.0.
-vectoriz-1.0.
-vectoriz-1.0.
-vectoriz-1.0.
+vectoriz-1.0.2rc0.dist-info/METADATA,sha256=CNvsojBRCYovh9DMnquIKh7IUVxdiy46XGmwRCibIpQ,3852
+vectoriz-1.0.2rc0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+vectoriz-1.0.2rc0.dist-info/top_level.txt,sha256=Tcfk3kazBwJ_yySjjhlIhLoTWLQGSb5xV006X18O6Nk,15
+vectoriz-1.0.2rc0.dist-info/RECORD,,
{vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/WHEEL
File without changes
{vectoriz-1.0.1.dist-info → vectoriz-1.0.2rc0.dist-info}/top_level.txt
File without changes