PyPI - prevectorchunks-core - Versions diffs - 0.1.33__tar.gz → 0.1.34__tar.gz - Mend

prevectorchunks-core 0.1.33 (tar.gz) → 0.1.34 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{prevectorchunks_core-0.1.33/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.34}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.33
+Version: 0.1.34
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/DocuToImageConverter.py RENAMED Viewed

@@ -28,18 +28,54 @@ class DocuToImageConverter:
         """Write bytes to a temporary file and return path."""
         tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
         with os.fdopen(tmp_fd, "wb") as f:
-            f.write(input_bytes)
+            f.write(input_bytes.read())
         return tmp_path
-    def _convert_doc_to_pdf(self, input_path: str) -> str:
-        """Convert DOC/DOCX file to PDF using Word COM, LibreOffice, Pandoc, or fallback."""
+    def _convert_doc_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
+        """
+        Convert DOC/DOCX to PDF. Supports:
+        - file_path (string)
+        - input_bytes (bytes, InMemoryUploadedFile, or file-like)
+        """
+        # ✅ If bytes are provided, write them to a temporary .docx
+        if input_bytes is not None:
+            # Get filename or fallback
+            original_name = getattr(input_bytes, "name", "uploaded.docx")
+            ext = os.path.splitext(original_name)[1] or ".docx"
+            # Create a temporary file path
+            temp_input_path = tempfile.mktemp(suffix=ext)
+            # Read bytes safely
+            if hasattr(input_bytes, "read"):  # Django UploadedFile
+                input_bytes.seek(0)
+                content = input_bytes.read()
+            else:  # already bytes
+                content = input_bytes
+            # Write bytes to temp file
+            with open(temp_input_path, "wb") as f:
+                f.write(content)
+            input_path = temp_input_path
+        # ✅ If file_path is provided, use it directly
+        elif file_path:
+            input_path = file_path
+        else:
+            raise ValueError("Must supply either file_path or input_bytes")
+        # ✅ Must exist at this point
         if not os.path.exists(input_path):
             raise FileNotFoundError(input_path)
+        # ✅ Prepare output PDF path
         output_dir = tempfile.mkdtemp()
         output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
-        # 1️⃣ Microsoft Word COM automation (Windows only)
+        # 1️⃣ Try Microsoft Word COM automation (Windows)
         try:
             import win32com.client
             word = win32com.client.Dispatch("Word.Application")
@@ -52,7 +88,7 @@ class DocuToImageConverter:
         except Exception:
             pass
-        # 2️⃣ LibreOffice fallback
+        # 2️⃣ Try LibreOffice
         try:
             subprocess.run(
                 ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
@@ -65,23 +101,28 @@ class DocuToImageConverter:
         # 3️⃣ Pandoc fallback
         try:
             pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
-            pypandoc.convert_file(input_path, "pdf", outputfile=output_pdf,
-                                  extra_args=["--standalone", f"--pdf-engine={pdf_engine}"])
+            pypandoc.convert_file(
+                input_path, "pdf",
+                outputfile=output_pdf,
+                extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
+            )
             return output_pdf
         except Exception:
             pass
-        # 4️⃣ Last resort: ReportLab plain text
+        # 4️⃣ Final fallback: Render plain text using ReportLab
         doc = Document(input_path)
         c = canvas.Canvas(output_pdf, pagesize=A4)
         width, height = A4
         y = height - 50
         for p in doc.paragraphs:
             c.drawString(50, y, p.text[:1000])
             y -= 15
             if y < 50:
                 c.showPage()
                 y = height - 50
         c.save()
         return output_pdf
@@ -96,7 +137,7 @@ class DocuToImageConverter:
         pdf_document.close()
         return images
-    def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG"):
+    def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
         """
         Convert a file path or binary content to PIL images.
         Supports PDF, DOC, DOCX, and image files.
@@ -107,14 +148,15 @@ class DocuToImageConverter:
         # Determine extension
         if file_path:
             ext = os.path.splitext(file_path)[1].lower()
+            print('work')
         elif input_bytes:
             # Attempt to infer from first few bytes (simple)
-            if input_bytes[:4] == b"%PDF":
-                ext = ".pdf"
-            elif input_bytes[:2] == b"PK":
-                ext = ".docx"
-            else:
-                ext = ".img"  # Treat as generic image
+            # if input_bytes[:4] == b"%PDF":
+            #     ext = ".pdf"
+            # elif input_bytes[:2] == b"PK":
+            #     ext = ".docx"
+            # else:
+            #     ext = ".img"  # Treat as generic image
             # Write to temp file if doc/pdf
             if ext in [".pdf", ".doc", ".docx"]:

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/markdown_and_chunk_documents.py RENAMED Viewed

@@ -1,6 +1,8 @@
 import os
 import json
 import tempfile
+import uuid
+from io import BytesIO
 from pathlib import Path
 from docx import Document
@@ -17,21 +19,29 @@ from ..utils.file_loader import SplitType
 load_dotenv(override=True)
+def get_file_extension(file_path,file_name):
+    ext=''
+    if file_name:
+        ext = file_name[1]
+    else:
+        # Extract extension
+        ext = os.path.splitext(file_path)[1].lower()
+    return ext
 # -----------------------------
 # Abstract Strategy Interface
 # -----------------------------
 class BaseDocumentStrategy:
     """Defines a standard interface for all document processing strategies."""
-    def process(self, file_path: str, input_bytes: bytes = None):
+    def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
         raise NotImplementedError("process() must be implemented by subclasses")
 # -----------------------------
 # PDF Strategy
 # -----------------------------
 class PDFStrategy(BaseDocumentStrategy):
-    def process(self, file_path: str, input_bytes: bytes = None):
+    def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
         print(f"📄 Using PDFStrategy for {file_path}")
         converter = DocuToImageConverter()
         # Example: detect multi-column layout or extract embedded text first
@@ -44,7 +54,7 @@ class PDFStrategy(BaseDocumentStrategy):
         # if text_ratio > 0.0001:
         #     print("📚 PDF appears text-based – using hybrid extract + image backup")
-        images = converter.convert_to_images(file_path)
+        images = converter.convert_to_images(file_path,input_bytes,ext=ext)
         return images
@@ -52,16 +62,18 @@ class PDFStrategy(BaseDocumentStrategy):
 # Word Strategy
 # -----------------------------
 class WordStrategy(BaseDocumentStrategy):
-    def process(self, file_path: str, input_bytes: bytes = None):
-        file_path = Path(file_path)
-        print(f"📝 Using WordStrategy for {file_path}")
+    def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
+        file_name=''
+        if file_path:
+            file_name = Path(file_path)
+            print(f"📝 Using WordStrategy for {file_path}")
+        else:
+            file_name_no_ext = os.path.splitext(input_bytes.name)[0]
         with tempfile.TemporaryDirectory() as tmpdir:
-            pdf_path = Path(tmpdir) / f"{file_path.stem}.pdf"
+            pdf_path = Path(tmpdir) / f"{file_name}.pdf"
             converter = DocuToImageConverter()
-            pdf_path = converter._convert_doc_to_pdf(file_path)
+            pdf_path = converter._convert_doc_to_pdf(file_path=file_path, input_bytes=input_bytes)
             images = converter.convert_to_images(pdf_path)
@@ -72,9 +84,31 @@ class WordStrategy(BaseDocumentStrategy):
 # Image Strategy
 # -----------------------------
 class ImageStrategy(BaseDocumentStrategy):
-    def process(self, file_path: str, input_bytes: bytes = None):
+    def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
         print(f"🖼️ Using ImageStrategy for {file_path}")
-        image = Image.open(file_path).convert("RGB")
+        if file_path:
+            # Path-based loading
+            image = Image.open(file_path).convert("RGB")
+        else:
+            # Byte-based loading
+            if input_bytes is None:
+                raise ValueError("Either file_path or input_bytes must be provided")
+            # If it's a Django UploadedFile → read() needed
+            if hasattr(input_bytes, "read"):
+                input_bytes.seek(0)
+                image_bytes = input_bytes.read()
+            # If it's already bytes
+            elif isinstance(input_bytes, (bytes, bytearray)):
+                image_bytes = input_bytes
+            else:
+                raise TypeError("input_bytes must be bytes or file-like object")
+            image = Image.open(BytesIO(image_bytes)).convert("RGB")
         return [image]
@@ -96,8 +130,14 @@ class StrategyFactory:
     }
     @classmethod
-    def get_strategy(cls, file_path: str) -> BaseDocumentStrategy:
-        ext = os.path.splitext(file_path)[1].lower()
+    def get_strategy(cls, file_path: str,file_name:str=None) -> BaseDocumentStrategy:
+        if file_name:
+            ext=file_name[1]
+        else:
+            # Extract extension
+            ext = os.path.splitext(file_path)[1].lower()
         return cls.strategies.get(ext, None)
@@ -109,14 +149,15 @@ class MarkdownAndChunkDocuments:
         self.api_key = os.getenv("OPENAI_API_KEY")
         self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
-    def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None):
+    def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None):
         # Pick strategy
-        strategy = StrategyFactory.get_strategy(file_path)
+        strategy = StrategyFactory.get_strategy(file_path,file_name)
         if not strategy:
             raise ValueError(f"Unsupported file type: {file_path}")
         # Convert to images using correct strategy
-        images = strategy.process(file_path, input_bytes)
+        ext=get_file_extension(file_path,file_name)
+        images = strategy.process(file_path, input_bytes,ext)
         # Extract Markdown from images
         markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
@@ -146,10 +187,16 @@ class MarkdownAndChunkDocuments:
             if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
                 md_item["chunked_text"] = md_item["markdown_text"]
                 mapped_chunks.append(md_item)
+        adduuid(mapped_chunks)
         print("✅ Processing complete.")
         return mapped_chunks
+def adduuid(mapped_chunks):
+    # Assuming mapped_chunks is a list of dictionaries
+    for chunk in mapped_chunks:
+        chunk['id'] = str(uuid.uuid4())
 # -----------------------------
 # CLI Entry

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34/prevectorchunks_core.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.33
+Version: 0.1.34
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,6 +2,40 @@ LICENCE
 LICENSE
 README.md
 pyproject.toml
+./prevectorchunks_core/__init__.py
+./prevectorchunks_core/test_loader.py
+./prevectorchunks_core/config/__init__.py
+./prevectorchunks_core/config/splitter_config.py
+./prevectorchunks_core/migrations/__init__.py
+./prevectorchunks_core/os-llm/__init__.py
+./prevectorchunks_core/os-llm/llava.py
+./prevectorchunks_core/rlchunker/__init__.py
+./prevectorchunks_core/rlchunker/env.py
+./prevectorchunks_core/rlchunker/inference.py
+./prevectorchunks_core/rlchunker/model.py
+./prevectorchunks_core/rlchunker/reward.py
+./prevectorchunks_core/rlchunker/savepretrained.py
+./prevectorchunks_core/rlchunker/testpretrained.py
+./prevectorchunks_core/rlchunker/utils.py
+./prevectorchunks_core/rlchunker/pretrained/__init__.py
+./prevectorchunks_core/rlchunker/pretrained/model_info.txt
+./prevectorchunks_core/rlchunker/pretrained/policy_model.pt
+./prevectorchunks_core/services/DocuToImageConverter.py
+./prevectorchunks_core/services/DocuToMarkdownExtractor.py
+./prevectorchunks_core/services/__init__.py
+./prevectorchunks_core/services/audio_processor.py
+./prevectorchunks_core/services/chunk_documents_crud_vdb.py
+./prevectorchunks_core/services/chunk_to_all_content_mapper.py
+./prevectorchunks_core/services/image_processor.py
+./prevectorchunks_core/services/markdown_and_chunk_documents.py
+./prevectorchunks_core/services/propositional_index.py
+./prevectorchunks_core/services/video_analyser.py
+./prevectorchunks_core/tests/__init__.py
+./prevectorchunks_core/tests/test_local.py
+./prevectorchunks_core/utils/__init__.py
+./prevectorchunks_core/utils/extract_content.py
+./prevectorchunks_core/utils/file_loader.py
+./prevectorchunks_core/utils/llm_wrapper.py
 prevectorchunks_core/__init__.py
 prevectorchunks_core/test_loader.py
 prevectorchunks_core.egg-info/PKG-INFO

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "prevectorchunks-core"
-version = "0.1.33"
+version = "0.1.34"
 description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -54,13 +54,17 @@ dependencies = [
     "tensorflow~=2.12.0",   # <-- Add this
 ]
 [tool.setuptools.packages.find]
+where = ["."]
 include = ["prevectorchunks_core*"]
 [tool.setuptools.package-data]
 "prevectorchunks_core.rlchunker.pretrained" = ["*.pt", "*.txt"]
 [tool.setuptools]
+package-dir = {"" = "."}
 include-package-data = true
 [project.urls]