PyPI - prevectorchunks-core - Versions diffs - 0.1.31__tar.gz → 0.1.32__tar.gz - Mend

prevectorchunks-core 0.1.31.tar.gz → 0.1.32.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

{prevectorchunks_core-0.1.31/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.32}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.31
+Version: 0.1.32
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -49,6 +49,8 @@ Requires-Dist: weasyprint~=62.0
 Requires-Dist: lxml~=4.9.3
 Requires-Dist: cssselect2~=0.7.0
 Requires-Dist: cairocffi~=1.4.0
+Requires-Dist: tensorflow<3.0.0,>=2.15.0
+Requires-Dist: codecarbon>=2.3.0
 Dynamic: license-file
 # 📚 PreVectorChunks

prevectorchunks_core-0.1.32/prevectorchunks_core/services/DocuToImageConverter.py ADDED Viewed

@@ -0,0 +1,143 @@
+import os
+import tempfile
+import shutil
+import subprocess
+from pathlib import Path
+from PIL import Image
+import io
+import fitz
+from docx2pdf import convert as docx2pdf_convert
+from docx import Document
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import A4
+import pypandoc
+# Ensure pandoc is available
+try:
+    pypandoc.get_pandoc_path()
+except OSError:
+    pypandoc.download_pandoc()
+class DocuToImageConverter:
+    """Converts a document (PDF, DOCX, DOC, image bytes) into a list of PIL images."""
+    def __init__(self):
+        pass
+    def _write_temp_file(self, input_bytes: bytes, suffix: str):
+        """Write bytes to a temporary file and return path."""
+        tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
+        with os.fdopen(tmp_fd, "wb") as f:
+            f.write(input_bytes)
+        return tmp_path
+    def _convert_doc_to_pdf(self, input_path: str) -> str:
+        """Convert DOC/DOCX file to PDF using Word COM, LibreOffice, Pandoc, or fallback."""
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(input_path)
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
+        # 1️⃣ Microsoft Word COM automation (Windows only)
+        try:
+            import win32com.client
+            word = win32com.client.Dispatch("Word.Application")
+            word.Visible = False
+            doc = word.Documents.Open(str(Path(input_path).resolve()))
+            doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)
+            doc.Close()
+            word.Quit()
+            return output_pdf
+        except Exception:
+            pass
+        # 2️⃣ LibreOffice fallback
+        try:
+            subprocess.run(
+                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
+                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            return output_pdf
+        except Exception:
+            pass
+        # 3️⃣ Pandoc fallback
+        try:
+            pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
+            pypandoc.convert_file(input_path, "pdf", outputfile=output_pdf,
+                                  extra_args=["--standalone", f"--pdf-engine={pdf_engine}"])
+            return output_pdf
+        except Exception:
+            pass
+        # 4️⃣ Last resort: ReportLab plain text
+        doc = Document(input_path)
+        c = canvas.Canvas(output_pdf, pagesize=A4)
+        width, height = A4
+        y = height - 50
+        for p in doc.paragraphs:
+            c.drawString(50, y, p.text[:1000])
+            y -= 15
+            if y < 50:
+                c.showPage()
+                y = height - 50
+        c.save()
+        return output_pdf
+    def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
+        images = []
+        pdf_document = fitz.open(pdf_path)
+        for page_num in range(len(pdf_document)):
+            page = pdf_document[page_num]
+            pixmap = page.get_pixmap(dpi=dpi)
+            image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
+            images.append(image)
+        pdf_document.close()
+        return images
+    def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG"):
+        """
+        Convert a file path or binary content to PIL images.
+        Supports PDF, DOC, DOCX, and image files.
+        """
+        if not file_path and not input_bytes:
+            raise ValueError("Provide either file_path or input_bytes.")
+        # Determine extension
+        if file_path:
+            ext = os.path.splitext(file_path)[1].lower()
+        elif input_bytes:
+            # Attempt to infer from first few bytes (simple)
+            if input_bytes[:4] == b"%PDF":
+                ext = ".pdf"
+            elif input_bytes[:2] == b"PK":
+                ext = ".docx"
+            else:
+                ext = ".img"  # Treat as generic image
+            # Write to temp file if doc/pdf
+            if ext in [".pdf", ".doc", ".docx"]:
+                file_path = self._write_temp_file(input_bytes, suffix=ext)
+        # Word → PDF
+        if ext in [".doc", ".docx"]:
+            pdf_path = self._convert_doc_to_pdf(file_path)
+            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+        # PDF → images
+        elif ext == ".pdf":
+            images = self._convert_pdf_to_images(file_path, dpi=dpi)
+        # Image
+        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".img"]:
+            image = Image.open(io.BytesIO(input_bytes) if input_bytes else file_path).convert("RGB")
+            buffer = io.BytesIO()
+            image.save(buffer, format=output_format)
+            buffer.seek(0)
+            images = [Image.open(buffer)]
+        else:
+            raise ValueError("Unsupported file type.")
+        return images

{prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/markdown_and_chunk_documents.py RENAMED Viewed

@@ -23,7 +23,7 @@ load_dotenv(override=True)
 class BaseDocumentStrategy:
     """Defines a standard interface for all document processing strategies."""
-    def process(self, file_path: str):
+    def process(self, file_path: str, input_bytes: bytes = None):
         raise NotImplementedError("process() must be implemented by subclasses")
@@ -31,7 +31,7 @@ class BaseDocumentStrategy:
 # PDF Strategy
 # -----------------------------
 class PDFStrategy(BaseDocumentStrategy):
-    def process(self, file_path: str):
+    def process(self, file_path: str, input_bytes: bytes = None):
         print(f"📄 Using PDFStrategy for {file_path}")
         converter = DocuToImageConverter()
         # Example: detect multi-column layout or extract embedded text first
@@ -52,7 +52,7 @@ class PDFStrategy(BaseDocumentStrategy):
 # Word Strategy
 # -----------------------------
 class WordStrategy(BaseDocumentStrategy):
-    def process(self, file_path: str):
+    def process(self, file_path: str, input_bytes: bytes = None):
         file_path = Path(file_path)
         print(f"📝 Using WordStrategy for {file_path}")
@@ -72,7 +72,7 @@ class WordStrategy(BaseDocumentStrategy):
 # Image Strategy
 # -----------------------------
 class ImageStrategy(BaseDocumentStrategy):
-    def process(self, file_path: str):
+    def process(self, file_path: str, input_bytes: bytes = None):
         print(f"🖼️ Using ImageStrategy for {file_path}")
         image = Image.open(file_path).convert("RGB")
         return [image]
@@ -109,14 +109,14 @@ class MarkdownAndChunkDocuments:
         self.api_key = os.getenv("OPENAI_API_KEY")
         self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
-    def markdown_and_chunk_documents(self, file_path: str,include_image:bool):
+    def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None):
         # Pick strategy
         strategy = StrategyFactory.get_strategy(file_path)
         if not strategy:
             raise ValueError(f"Unsupported file type: {file_path}")
         # Convert to images using correct strategy
-        images = strategy.process(file_path)
+        images = strategy.process(file_path, input_bytes)
         # Extract Markdown from images
         markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)

{prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32/prevectorchunks_core.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.31
+Version: 0.1.32
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -49,6 +49,8 @@ Requires-Dist: weasyprint~=62.0
 Requires-Dist: lxml~=4.9.3
 Requires-Dist: cssselect2~=0.7.0
 Requires-Dist: cairocffi~=1.4.0
+Requires-Dist: tensorflow<3.0.0,>=2.15.0
+Requires-Dist: codecarbon>=2.3.0
 Dynamic: license-file
 # 📚 PreVectorChunks

{prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/requires.txt RENAMED Viewed

@@ -35,3 +35,5 @@ weasyprint~=62.0
 lxml~=4.9.3
 cssselect2~=0.7.0
 cairocffi~=1.4.0
+tensorflow<3.0.0,>=2.15.0
+codecarbon>=2.3.0

{prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "prevectorchunks-core"
-version = "0.1.31"
+version = "0.1.32"
 description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -50,7 +50,9 @@ dependencies = [
     "weasyprint~=62.0",
     "lxml~=4.9.3",
     "cssselect2~=0.7.0",
-    "cairocffi~=1.4.0"
+    "cairocffi~=1.4.0",
+    "tensorflow>=2.15.0,<3.0.0",   # <-- Add this
+    "codecarbon>=2.3.0"            # <-- Add this
 ]
 [tool.setuptools.packages.find]

prevectorchunks_core-0.1.31/prevectorchunks_core/services/DocuToImageConverter.py DELETED Viewed

@@ -1,148 +0,0 @@
-import os
-import shutil
-import subprocess
-import sys
-import tempfile
-from pathlib import Path
-import pypandoc
-from PIL import Image
-import io
-from docx2pdf import convert as docx_to_pdf
-import fitz
-from docx2pdf import convert as docx2pdf_convert
-try:
-    pypandoc.get_pandoc_path()
-except OSError:
-    print("Pandoc not found — downloading it temporarily...")
-    pypandoc.download_pandoc()
-class DocuToImageConverter:
-    """Converts a document (PDF, DOCX, DOC) into a list of PIL images."""
-    def __init__(self):
-        pass
-    def _convert_doc_to_pdf(self, input_path: str) -> str:
-        import os, tempfile, shutil, subprocess
-        from pathlib import Path
-        if not os.path.exists(input_path):
-            raise FileNotFoundError(input_path)
-        output_dir = tempfile.mkdtemp()
-        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
-        # 1️⃣ Try Microsoft Word COM automation (Windows only)
-        try:
-            import win32com.client
-            word = win32com.client.Dispatch("Word.Application")
-            word.Visible = False
-            doc = word.Documents.Open(str(Path(input_path).resolve()))
-            doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)  # 17 = wdFormatPDF
-            doc.Close()
-            word.Quit()
-            print("✅ Word COM conversion successful:", output_pdf)
-            return output_pdf
-        except Exception as e:
-            print("⚠️ Word COM conversion failed:", e)
-        # 2️⃣ Fallback: LibreOffice (cross-platform, preserves layout)
-        try:
-            # Requires LibreOffice installed and in PATH
-            subprocess.run(
-                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
-                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
-            )
-            print("✅ LibreOffice conversion successful:", output_pdf)
-            return output_pdf
-        except Exception as e:
-            print("⚠️ LibreOffice conversion failed:", e)
-        # 3️⃣ Fallback: Pandoc (simpler, loses layout)
-        try:
-            import pypandoc
-            def which(cmd):
-                return shutil.which(cmd) is not None
-            pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
-            pypandoc.convert_file(
-                input_path, "pdf", outputfile=output_pdf,
-                extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
-            )
-            print("✅ Pandoc conversion successful:", output_pdf)
-            return output_pdf
-        except Exception as e:
-            print("⚠️ Pandoc conversion failed:", e)
-        # 4️⃣ Last resort: ReportLab basic text (no formatting)
-        from reportlab.pdfgen import canvas
-        from reportlab.lib.pagesizes import A4
-        from docx import Document
-        doc = Document(input_path)
-        c = canvas.Canvas(output_pdf, pagesize=A4)
-        width, height = A4
-        y = height - 50
-        for p in doc.paragraphs:
-            c.drawString(50, y, p.text[:1000])
-            y -= 15
-            if y < 50:
-                c.showPage()
-                y = height - 50
-        c.save()
-        print("⚠️ Fallback to plain ReportLab text output:", output_pdf)
-        return output_pdf
-    def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
-        """
-        Converts each page of a PDF into images using PyMuPDF directly.
-        """
-        images = []
-        try:
-            pdf_document = fitz.open(pdf_path)  # Use `PyMuPDF` instead of fitz alias
-            for page_num in range(len(pdf_document)):
-                page = pdf_document[page_num]
-                # Render page to a pixmap with the specified DPI
-                pixmap = page.get_pixmap(dpi=dpi)
-                # Convert pixmap to an Image object using PIL
-                image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
-                images.append(image)
-            pdf_document.close()
-        except Exception as e:
-            raise RuntimeError(f"Failed to convert PDF to images: {e}")
-        return images
-    def convert_to_images(self, file_path: str, dpi: int = 200, output_format: str = "PNG"):
-        """
-        Converts each page of a document into a list of PIL images.
-        Supports .pdf, .doc, .docx, and image files (.jpg, .png, etc.)
-        Ensures all outputs are in a consistent image format.
-        """
-        ext = os.path.splitext(file_path)[1].lower()
-        # Convert Word → PDF first
-        if ext in [".doc", ".docx"]:
-            pdf_path = self._convert_doc_to_pdf(file_path)
-            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
-        # Convert PDF → list of images
-        elif ext == ".pdf":
-            images = self._convert_pdf_to_images(file_path, dpi=dpi)
-        # Handle already an image file
-        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
-            image = Image.open(file_path).convert("RGB")
-            # Convert to consistent format (e.g., PNG or JPEG in memory)
-            buffer = io.BytesIO()
-            image.save(buffer, format=output_format)
-            buffer.seek(0)
-            converted_image = Image.open(buffer)
-            images = [converted_image]
-        else:
-            raise ValueError("Unsupported file type. Use .pdf, .doc, .docx, or image files")
-        return images