PyPI - prevectorchunks-core - Versions diffs - 0.1.33__tar.gz → 0.1.35__tar.gz - Mend

prevectorchunks-core 0.1.33__tar.gz → 0.1.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{prevectorchunks_core-0.1.33/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.35}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.33
+Version: 0.1.35
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
 Requires-Dist: cssselect2~=0.7.0
 Requires-Dist: cairocffi~=1.4.0
 Requires-Dist: tensorflow~=2.12.0
+Requires-Dist: pandas~=1.5.3
+Requires-Dist: openpyxl~=3.1.2
+Requires-Dist: python-pptx~=0.6.21
 Dynamic: license-file
 # 📚 PreVectorChunks

prevectorchunks_core-0.1.35/prevectorchunks_core/services/DocuToImageConverter.py ADDED Viewed

@@ -0,0 +1,318 @@
+import os
+import tempfile
+import shutil
+import subprocess
+from pathlib import Path
+from PIL import Image
+import io
+import fitz
+from docx2pdf import convert as docx2pdf_convert
+from docx import Document
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import A4
+import pypandoc
+# Ensure pandoc is available
+try:
+    pypandoc.get_pandoc_path()
+except OSError:
+    pypandoc.download_pandoc()
+class DocuToImageConverter:
+    """Converts a document (PDF, DOCX, DOC, image bytes) into a list of PIL images."""
+    def __init__(self):
+        pass
+    def _write_temp_file(self, input_bytes: bytes, suffix: str):
+        """Write bytes to a temporary file and return path."""
+        tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
+        with os.fdopen(tmp_fd, "wb") as f:
+            f.write(input_bytes.read())
+        return tmp_path
+    def _convert_doc_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
+        """
+        Convert DOC/DOCX to PDF. Supports:
+        - file_path (string)
+        - input_bytes (bytes, InMemoryUploadedFile, or file-like)
+        """
+        # ✅ If bytes are provided, write them to a temporary .docx
+        if input_bytes is not None:
+            # Get filename or fallback
+            original_name = getattr(input_bytes, "name", "uploaded.docx")
+            ext = os.path.splitext(original_name)[1] or ".docx"
+            # Create a temporary file path
+            temp_input_path = tempfile.mktemp(suffix=ext)
+            # Read bytes safely
+            if hasattr(input_bytes, "read"):  # Django UploadedFile
+                input_bytes.seek(0)
+                content = input_bytes.read()
+            else:  # already bytes
+                content = input_bytes
+            # Write bytes to temp file
+            with open(temp_input_path, "wb") as f:
+                f.write(content)
+            input_path = temp_input_path
+        # ✅ If file_path is provided, use it directly
+        elif file_path:
+            input_path = file_path
+        else:
+            raise ValueError("Must supply either file_path or input_bytes")
+        # ✅ Must exist at this point
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(input_path)
+        # ✅ Prepare output PDF path
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
+        # 1️⃣ Try Microsoft Word COM automation (Windows)
+        try:
+            import win32com.client
+            word = win32com.client.Dispatch("Word.Application")
+            word.Visible = False
+            doc = word.Documents.Open(str(Path(input_path).resolve()))
+            doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)
+            doc.Close()
+            word.Quit()
+            return output_pdf
+        except Exception:
+            pass
+        # 2️⃣ Try LibreOffice
+        try:
+            subprocess.run(
+                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
+                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            return output_pdf
+        except Exception:
+            pass
+        # 3️⃣ Pandoc fallback
+        try:
+            pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
+            pypandoc.convert_file(
+                input_path, "pdf",
+                outputfile=output_pdf,
+                extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
+            )
+            return output_pdf
+        except Exception:
+            pass
+        # 4️⃣ Final fallback: Render plain text using ReportLab
+        doc = Document(input_path)
+        c = canvas.Canvas(output_pdf, pagesize=A4)
+        width, height = A4
+        y = height - 50
+        for p in doc.paragraphs:
+            c.drawString(50, y, p.text[:1000])
+            y -= 15
+            if y < 50:
+                c.showPage()
+                y = height - 50
+        c.save()
+        return output_pdf
+    def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
+        images = []
+        pdf_document = fitz.open(pdf_path)
+        for page_num in range(len(pdf_document)):
+            page = pdf_document[page_num]
+            pixmap = page.get_pixmap(dpi=dpi)
+            image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
+            images.append(image)
+        pdf_document.close()
+        return images
+    def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
+        """
+        Convert a file path or binary content to PIL images.
+        Supports PDF, DOC, DOCX, and image files.
+        """
+        if not file_path and not input_bytes:
+            raise ValueError("Provide either file_path or input_bytes.")
+        # Determine extension
+        if file_path:
+            ext = os.path.splitext(file_path)[1].lower()
+            print('work')
+        elif input_bytes:
+            # Attempt to infer from first few bytes (simple)
+            # if input_bytes[:4] == b"%PDF":
+            #     ext = ".pdf"
+            # elif input_bytes[:2] == b"PK":
+            #     ext = ".docx"
+            # else:
+            #     ext = ".img"  # Treat as generic image
+            # Write to temp file if doc/pdf
+            if ext in [".pdf", ".doc", ".docx"]:
+                file_path = self._write_temp_file(input_bytes, suffix=ext)
+        # Word → PDF
+        if ext in [".doc", ".docx"]:
+            pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
+            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+        # PowerPoint → PDF
+        elif ext in [".ppt", ".pptx"]:
+            pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)
+            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+        # Excel → PDF
+        elif ext in [".xls", ".xlsx"]:
+            pdf_path = self._convert_excel_to_pdf(file_path, input_bytes)
+            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+        # PDF → images
+        elif ext == ".pdf":
+            images = self._convert_pdf_to_images(file_path, dpi=dpi)
+        # Image
+        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".img"]:
+            image = Image.open(io.BytesIO(input_bytes) if input_bytes else file_path).convert("RGB")
+            buffer = io.BytesIO()
+            image.save(buffer, format=output_format)
+            buffer.seek(0)
+            images = [Image.open(buffer)]
+        else:
+            raise ValueError("Unsupported file type.")
+        return images
+    def _convert_ppt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
+        """
+        Convert PPT/PPTX to PDF using:
+        1. PowerPoint COM (Windows)
+        2. LibreOffice
+        """
+        # write bytes if needed
+        if input_bytes is not None:
+            original_name = getattr(input_bytes, "name", "uploaded.pptx")
+            ext = os.path.splitext(original_name)[1] or ".pptx"
+            temp_input_path = tempfile.mktemp(suffix=ext)
+            if hasattr(input_bytes, "read"):
+                input_bytes.seek(0)
+                content = input_bytes.read()
+            else:
+                content = input_bytes
+            with open(temp_input_path, "wb") as f:
+                f.write(content)
+            input_path = temp_input_path
+        elif file_path:
+            input_path = file_path
+        else:
+            raise ValueError("Must supply either file_path or input_bytes")
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
+        # 1️⃣ Try PowerPoint COM on Windows
+        try:
+            import win32com.client
+            powerpoint = win32com.client.Dispatch("PowerPoint.Application")
+            powerpoint.Visible = 1
+            deck = powerpoint.Presentations.Open(str(Path(input_path).resolve()))
+            deck.SaveAs(str(Path(output_pdf).resolve()), 32)  # 32 = PDF
+            deck.Close()
+            powerpoint.Quit()
+            return output_pdf
+        except Exception:
+            pass
+        # 2️⃣ Try LibreOffice
+        try:
+            subprocess.run(
+                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
+                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            return output_pdf
+        except Exception:
+            pass
+        raise ValueError("Unable to convert PPT/PPTX to PDF")
+    def _convert_excel_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
+        """
+        Convert XLS/XLSX to PDF using:
+        1. Excel COM (Windows)
+        2. LibreOffice
+        """
+        # write bytes if needed
+        if input_bytes is not None:
+            original_name = getattr(input_bytes, "name", "uploaded.xlsx")
+            ext = os.path.splitext(original_name)[1] or ".xlsx"
+            temp_input_path = tempfile.mktemp(suffix=ext)
+            if hasattr(input_bytes, "read"):
+                input_bytes.seek(0)
+                content = input_bytes.read()
+            else:
+                content = input_bytes
+            with open(temp_input_path, "wb") as f:
+                f.write(content)
+            input_path = temp_input_path
+        elif file_path:
+            input_path = file_path
+        else:
+            raise ValueError("Must supply either file_path or input_bytes")
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
+        # 1️⃣ Try Excel COM (Windows)
+        try:
+            import win32com.client
+            excel = win32com.client.Dispatch("Excel.Application")
+            excel.Visible = False
+            wb = excel.Workbooks.Open(str(Path(input_path).resolve()))
+            wb.ExportAsFixedFormat(0, str(Path(output_pdf).resolve()))  # 0 = PDF
+            wb.Close()
+            excel.Quit()
+            return output_pdf
+        except Exception:
+            pass
+        # 2️⃣ Try LibreOffice
+        try:
+            subprocess.run(
+                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
+                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            return output_pdf
+        except Exception:
+            pass
+        raise ValueError("Unable to convert XLS/XLSX to PDF")

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/DocuToMarkdownExtractor.py RENAMED Viewed

@@ -3,11 +3,13 @@ import os
 import tempfile
 import base64
+from langchain.chat_models import init_chat_model
 from openai import OpenAI
 from PIL import Image
 from dotenv import load_dotenv
+from openai.types import ChatModel
 from .image_processor import ImageProcessor
@@ -18,9 +20,19 @@ load_dotenv(override=True)
 class DocuToMarkdownExtractor:
     """Sends image pages to an LLM and extracts Markdown text + tables."""
-    def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
-        self.client = OpenAI(api_key=api_key)
-        self.model = model
+    def __init__(self, api_key: str, model: str = "gpt-4o-mini",client:ChatModel=None):
+        if client is None:
+            client = init_chat_model(
+                model=model,
+                model_provider="openai",  # you can later swap to "anthropic", "google", etc.
+                api_key=api_key
+            )
+        self.client = client
+        self.model = client.model_name
+        # Initialize ImageProcessor once and pass the chat model
+        self.processor = ImageProcessor(client=self.client)
     def _image_to_base64(self, image: Image.Image) -> str:
         """Converts PIL image to base64-encoded PNG string."""
@@ -29,7 +41,7 @@ class DocuToMarkdownExtractor:
             with open(tmp.name, "rb") as f:
                 return base64.b64encode(f.read()).decode("utf-8")
-    def extract_markdown(self, images,include_image:True):
+    def extract_markdown(self, images,include_image:bool=True):
         """Extracts Markdown-formatted text from each image page."""
         all_outputs = []
         text_content=""
@@ -59,7 +71,8 @@ class DocuToMarkdownExtractor:
                 try:
                     response = json.loads(response)  # Convert JSON string to dictionary
                 except json.JSONDecodeError:
-                    raise ValueError("The response from 'processor.analyze' is not valid JSON.")
+                    print('skipping quietly')
+                    #raise ValueError("The response from 'processor.analyze' is not valid JSON.")
             text_content=text_content+"\n"+response["markdown_text"]
             if(include_image):
                 response["image_data"]=b64_image

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/chunk_documents_crud_vdb.py RENAMED Viewed

@@ -392,8 +392,8 @@ def qfetch_records_grouped_by_document_name(index, batch_size=100,limit=100):
 #function that chunks any document
-def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
-    return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config)
+def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None,client=None):
+    return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config,client=client)
 #function that chunks any document as well as inserts into vdb
 def chunk_and_upsert_to_vdb(index_n,instructions,file_name,file_path="content_playground/content.json",splitter_config=None):

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/chunk_to_all_content_mapper.py RENAMED Viewed

@@ -2,39 +2,35 @@ import numpy as np
 class ChunkMapper:
-    def __init__(self, client, markdown_output, embedding_model="text-embedding-3-small"):
+    def __init__(self, embedding_client, markdown_output, embedding_model="text-embedding-3-small"):
         """
         client: OpenAI client object
         markdown_output: list of JSON objects containing at least 'markdown_text'
         embedding_model: model for embeddings
         """
-        self.client = client
+        self.embedding_client = embedding_client
         self.markdown_output = markdown_output
         self.embedding_model = embedding_model
         # Precompute embeddings for markdown_output
         self.markdown_embeddings = self._compute_markdown_embeddings()
-    # -----------------------------
-    # Compute embeddings for all markdown items
-    # -----------------------------
+        # -----------------------------
+        # Compute embeddings for markdown JSON items
+        # -----------------------------
     def _compute_markdown_embeddings(self):
-        embeddings = []
-        for obj in self.markdown_output:
-            markdown_text = obj.get("markdown_text", "")
-            emb = self._get_embedding(markdown_text)
-            embeddings.append(emb)
-        return embeddings
+        texts = [obj.get("markdown_text", "") for obj in self.markdown_output]
+        return self.embedding_client.embed_documents(texts)
+        # -----------------------------
+        # Get embedding for a single text
+        # -----------------------------
-    # -----------------------------
-    # Embedding helper
-    # -----------------------------
     def _get_embedding(self, text):
-        response = self.client.embeddings.create(
-            input=text,
-            model=self.embedding_model
-        )
-        return response.data[0].embedding
+        # LangChain uses a list input
+        emb = self.embedding_client.embed_query(text)
+        return emb
     # -----------------------------
     # Cosine similarity

{prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/image_processor.py RENAMED Viewed

@@ -10,6 +10,8 @@ import requests
 from dotenv import load_dotenv
 from typing import Optional
+from langchain.chat_models import init_chat_model
+from langchain_core.messages import HumanMessage
 from openai import OpenAI
 from langchain_core.pydantic_v1 import BaseModel
@@ -31,15 +33,22 @@ class ImageProcessor:
     Wrapper for a GPT-4o multimodal image reasoning pipeline.
     """
-    def __init__(self, model_name: str = "gpt-4o-mini"):
+    def __init__(self, api_key:str=None, model_name: str = "gpt-4o-mini",client=None):
         load_dotenv(override=True)
         self.api_key = os.getenv("OPENAI_API_KEY")
         if not self.api_key:
             raise ValueError("❌ OPENAI_API_KEY not found in .env or environment!")
+        if client is None:
+            client = init_chat_model(
+                model=model_name,
+                model_provider="openai",  # you can later swap to "anthropic", "google", etc.
+                api_key=api_key
+            )
+        self.llm = client
         # Initialize multimodal client
-        self.llm = OpenAI(api_key=self.api_key)
-        self.model_name = model_name
+        self.model_name = client.model_name
     # -------------------------------------------------
     # 3️⃣ Image encoding helper
@@ -70,17 +79,11 @@ class ImageProcessor:
                         },
                     ]
         content1.extend(finstructioncontent)
-        response = self.llm.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {
-                    "role": "user",
-                    "content": content1
-                }
-            ],
-        )
-        result_text = response.choices[0].message.content
+        # Call the LangChain model
+        response_msg = self.llm.predict_messages([HumanMessage(content=content1)])
+        # Extract the text
+        result_text = response_msg.content
         print("✅ Analysis complete.")
         print(result_text)
         return result_text

prevectorchunks-core 0.1.33__tar.gz → 0.1.35__tar.gz

prevectorchunks-core 0.1.33__tar.gz → 0.1.35__tar.gz