PyPI - prevectorchunks-core - Versions diffs - 0.1.34__tar.gz → 0.1.36__tar.gz - Mend

prevectorchunks-core 0.1.34 (tar.gz) → 0.1.36 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)

{prevectorchunks_core-0.1.34/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.36}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.34
+Version: 0.1.36
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
 Requires-Dist: cssselect2~=0.7.0
 Requires-Dist: cairocffi~=1.4.0
 Requires-Dist: tensorflow~=2.12.0
+Requires-Dist: pandas~=2.2.2
+Requires-Dist: openpyxl~=3.1.2
+Requires-Dist: python-pptx~=0.6.21
 Dynamic: license-file
 # 📚 PreVectorChunks

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/DocuToImageConverter.py RENAMED Viewed

@@ -164,9 +164,20 @@ class DocuToImageConverter:
         # Word → PDF
         if ext in [".doc", ".docx"]:
-            pdf_path = self._convert_doc_to_pdf(file_path)
+            pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
             images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+        # PowerPoint → PDF
+        elif ext in [".ppt", ".pptx"]:
+            pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)
+            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+        # Excel → PDF
+        elif ext in [".xls", ".xlsx"]:
+            pdf_path = self._convert_excel_to_pdf(file_path, input_bytes)
+            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
         # PDF → images
         elif ext == ".pdf":
             images = self._convert_pdf_to_images(file_path, dpi=dpi)
@@ -183,3 +194,125 @@ class DocuToImageConverter:
             raise ValueError("Unsupported file type.")
         return images
+    def _convert_ppt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
+        """
+        Convert PPT/PPTX to PDF using:
+        1. PowerPoint COM (Windows)
+        2. LibreOffice
+        """
+        # write bytes if needed
+        if input_bytes is not None:
+            original_name = getattr(input_bytes, "name", "uploaded.pptx")
+            ext = os.path.splitext(original_name)[1] or ".pptx"
+            temp_input_path = tempfile.mktemp(suffix=ext)
+            if hasattr(input_bytes, "read"):
+                input_bytes.seek(0)
+                content = input_bytes.read()
+            else:
+                content = input_bytes
+            with open(temp_input_path, "wb") as f:
+                f.write(content)
+            input_path = temp_input_path
+        elif file_path:
+            input_path = file_path
+        else:
+            raise ValueError("Must supply either file_path or input_bytes")
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
+        # 1️⃣ Try PowerPoint COM on Windows
+        try:
+            import win32com.client
+            powerpoint = win32com.client.Dispatch("PowerPoint.Application")
+            powerpoint.Visible = 1
+            deck = powerpoint.Presentations.Open(str(Path(input_path).resolve()))
+            deck.SaveAs(str(Path(output_pdf).resolve()), 32)  # 32 = PDF
+            deck.Close()
+            powerpoint.Quit()
+            return output_pdf
+        except Exception:
+            pass
+        # 2️⃣ Try LibreOffice
+        try:
+            subprocess.run(
+                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
+                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            return output_pdf
+        except Exception:
+            pass
+        raise ValueError("Unable to convert PPT/PPTX to PDF")
+    def _convert_excel_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
+        """
+        Convert XLS/XLSX to PDF using:
+        1. Excel COM (Windows)
+        2. LibreOffice
+        """
+        # write bytes if needed
+        if input_bytes is not None:
+            original_name = getattr(input_bytes, "name", "uploaded.xlsx")
+            ext = os.path.splitext(original_name)[1] or ".xlsx"
+            temp_input_path = tempfile.mktemp(suffix=ext)
+            if hasattr(input_bytes, "read"):
+                input_bytes.seek(0)
+                content = input_bytes.read()
+            else:
+                content = input_bytes
+            with open(temp_input_path, "wb") as f:
+                f.write(content)
+            input_path = temp_input_path
+        elif file_path:
+            input_path = file_path
+        else:
+            raise ValueError("Must supply either file_path or input_bytes")
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
+        # 1️⃣ Try Excel COM (Windows)
+        try:
+            import win32com.client
+            excel = win32com.client.Dispatch("Excel.Application")
+            excel.Visible = False
+            wb = excel.Workbooks.Open(str(Path(input_path).resolve()))
+            wb.ExportAsFixedFormat(0, str(Path(output_pdf).resolve()))  # 0 = PDF
+            wb.Close()
+            excel.Quit()
+            return output_pdf
+        except Exception:
+            pass
+        # 2️⃣ Try LibreOffice
+        try:
+            subprocess.run(
+                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
+                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            return output_pdf
+        except Exception:
+            pass
+        raise ValueError("Unable to convert XLS/XLSX to PDF")

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/DocuToMarkdownExtractor.py RENAMED Viewed

@@ -3,11 +3,13 @@ import os
 import tempfile
 import base64
+from langchain.chat_models import init_chat_model
 from openai import OpenAI
 from PIL import Image
 from dotenv import load_dotenv
+from openai.types import ChatModel
 from .image_processor import ImageProcessor
@@ -18,9 +20,19 @@ load_dotenv(override=True)
 class DocuToMarkdownExtractor:
     """Sends image pages to an LLM and extracts Markdown text + tables."""
-    def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
-        self.client = OpenAI(api_key=api_key)
-        self.model = model
+    def __init__(self, api_key: str, model: str = "gpt-4o-mini",client:ChatModel=None):
+        if client is None:
+            client = init_chat_model(
+                model=model,
+                model_provider="openai",  # you can later swap to "anthropic", "google", etc.
+                api_key=api_key
+            )
+        self.client = client
+        self.model = client.model_name
+        # Initialize ImageProcessor once and pass the chat model
+        self.processor = ImageProcessor(client=self.client)
     def _image_to_base64(self, image: Image.Image) -> str:
         """Converts PIL image to base64-encoded PNG string."""
@@ -29,7 +41,7 @@ class DocuToMarkdownExtractor:
             with open(tmp.name, "rb") as f:
                 return base64.b64encode(f.read()).decode("utf-8")
-    def extract_markdown(self, images,include_image:True):
+    def extract_markdown(self, images,include_image:bool=True):
         """Extracts Markdown-formatted text from each image page."""
         all_outputs = []
         text_content=""
@@ -59,7 +71,8 @@ class DocuToMarkdownExtractor:
                 try:
                     response = json.loads(response)  # Convert JSON string to dictionary
                 except json.JSONDecodeError:
-                    raise ValueError("The response from 'processor.analyze' is not valid JSON.")
+                    print('skipping quietly')
+                    #raise ValueError("The response from 'processor.analyze' is not valid JSON.")
             text_content=text_content+"\n"+response["markdown_text"]
             if(include_image):
                 response["image_data"]=b64_image

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/chunk_documents_crud_vdb.py RENAMED Viewed

@@ -392,8 +392,8 @@ def qfetch_records_grouped_by_document_name(index, batch_size=100,limit=100):
 #function that chunks any document
-def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
-    return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config)
+def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None,client=None):
+    return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config,client=client)
 #function that chunks any document as well as inserts into vdb
 def chunk_and_upsert_to_vdb(index_n,instructions,file_name,file_path="content_playground/content.json",splitter_config=None):

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/chunk_to_all_content_mapper.py RENAMED Viewed

@@ -2,39 +2,35 @@ import numpy as np
 class ChunkMapper:
-    def __init__(self, client, markdown_output, embedding_model="text-embedding-3-small"):
+    def __init__(self, embedding_client, markdown_output, embedding_model="text-embedding-3-small"):
         """
         client: OpenAI client object
         markdown_output: list of JSON objects containing at least 'markdown_text'
         embedding_model: model for embeddings
         """
-        self.client = client
+        self.embedding_client = embedding_client
         self.markdown_output = markdown_output
         self.embedding_model = embedding_model
         # Precompute embeddings for markdown_output
         self.markdown_embeddings = self._compute_markdown_embeddings()
-    # -----------------------------
-    # Compute embeddings for all markdown items
-    # -----------------------------
+        # -----------------------------
+        # Compute embeddings for markdown JSON items
+        # -----------------------------
     def _compute_markdown_embeddings(self):
-        embeddings = []
-        for obj in self.markdown_output:
-            markdown_text = obj.get("markdown_text", "")
-            emb = self._get_embedding(markdown_text)
-            embeddings.append(emb)
-        return embeddings
+        texts = [obj.get("markdown_text", "") for obj in self.markdown_output]
+        return self.embedding_client.embed_documents(texts)
+        # -----------------------------
+        # Get embedding for a single text
+        # -----------------------------
-    # -----------------------------
-    # Embedding helper
-    # -----------------------------
     def _get_embedding(self, text):
-        response = self.client.embeddings.create(
-            input=text,
-            model=self.embedding_model
-        )
-        return response.data[0].embedding
+        # LangChain uses a list input
+        emb = self.embedding_client.embed_query(text)
+        return emb
     # -----------------------------
     # Cosine similarity

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/image_processor.py RENAMED Viewed

@@ -10,6 +10,8 @@ import requests
 from dotenv import load_dotenv
 from typing import Optional
+from langchain.chat_models import init_chat_model
+from langchain_core.messages import HumanMessage
 from openai import OpenAI
 from langchain_core.pydantic_v1 import BaseModel
@@ -31,15 +33,22 @@ class ImageProcessor:
     Wrapper for a GPT-4o multimodal image reasoning pipeline.
     """
-    def __init__(self, model_name: str = "gpt-4o-mini"):
+    def __init__(self, api_key:str=None, model_name: str = "gpt-4o-mini",client=None):
         load_dotenv(override=True)
         self.api_key = os.getenv("OPENAI_API_KEY")
         if not self.api_key:
             raise ValueError("❌ OPENAI_API_KEY not found in .env or environment!")
+        if client is None:
+            client = init_chat_model(
+                model=model_name,
+                model_provider="openai",  # you can later swap to "anthropic", "google", etc.
+                api_key=api_key
+            )
+        self.llm = client
         # Initialize multimodal client
-        self.llm = OpenAI(api_key=self.api_key)
-        self.model_name = model_name
+        self.model_name = client.model_name
     # -------------------------------------------------
     # 3️⃣ Image encoding helper
@@ -70,17 +79,11 @@ class ImageProcessor:
                         },
                     ]
         content1.extend(finstructioncontent)
-        response = self.llm.chat.completions.create(
-            model=self.model_name,
-            messages=[
-                {
-                    "role": "user",
-                    "content": content1
-                }
-            ],
-        )
-        result_text = response.choices[0].message.content
+        # Call the LangChain model
+        response_msg = self.llm.predict_messages([HumanMessage(content=content1)])
+        # Extract the text
+        result_text = response_msg.content
         print("✅ Analysis complete.")
         print(result_text)
         return result_text

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/markdown_and_chunk_documents.py RENAMED Viewed

@@ -7,6 +7,7 @@ from pathlib import Path
 from docx import Document
 from dotenv import load_dotenv
+from langchain_openai import OpenAIEmbeddings
 from openai import OpenAI
 from PIL import Image
@@ -111,6 +112,34 @@ class ImageStrategy(BaseDocumentStrategy):
         return [image]
+class PowerPointStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str, input_bytes: bytes = None, ext: str = None):
+        print(f"📊 Using PowerPointStrategy for {file_path or input_bytes}")
+        converter = DocuToImageConverter()
+        # Convert PPT/PPTX → PDF
+        pdf_path = converter._convert_ppt_to_pdf(file_path=file_path, input_bytes=input_bytes)
+        # Then convert PDF → images
+        images = converter.convert_to_images(pdf_path)
+        return images
+class ExcelStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str, input_bytes: bytes = None, ext: str = None):
+        print(f"📈 Using ExcelStrategy for {file_path or input_bytes}")
+        converter = DocuToImageConverter()
+        # Convert XLS/XLSX → PDF
+        pdf_path = converter._convert_excel_to_pdf(file_path=file_path, input_bytes=input_bytes)
+        # Convert PDF → images
+        images = converter.convert_to_images(pdf_path)
+        return images
 # -----------------------------
 # Strategy Factory
@@ -127,6 +156,17 @@ class StrategyFactory:
         ".png": ImageStrategy(),
         ".bmp": ImageStrategy(),
         ".tiff": ImageStrategy(),
+        # NEW — PowerPoint
+        ".ppt": PowerPointStrategy(),
+        ".pptx": PowerPointStrategy(),
+        # NEW — Excel
+        ".xls": ExcelStrategy(),
+        ".xlsx": ExcelStrategy(),
+        # NEW — Google Docs/Sheets
     }
     @classmethod
@@ -145,11 +185,12 @@ class StrategyFactory:
 # Main Orchestrator
 # -----------------------------
 class MarkdownAndChunkDocuments:
-    def __init__(self):
+    def __init__(self,client):
         self.api_key = os.getenv("OPENAI_API_KEY")
-        self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
+        self.extractor = DocuToMarkdownExtractor(api_key=self.api_key,client=client)
+        self.client=client
-    def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None):
+    def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None,embedding_client=None):
         # Pick strategy
         strategy = StrategyFactory.get_strategy(file_path,file_name)
         if not strategy:
@@ -164,8 +205,14 @@ class MarkdownAndChunkDocuments:
         binary_text_content = text_content.encode("utf-8")
         # Chunking and mapping
-        chunk_client = OpenAI(api_key=self.api_key)
-        cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
+        #chunk_client = OpenAI(api_key=self.api_key)
+        if embedding_client is None:
+            embedding_client = OpenAIEmbeddings(
+                model="text-embedding-3-small",
+                api_key=self.api_key
+            )
+        cm = ChunkMapper(embedding_client, markdown_output, embedding_model="text-embedding-3-small")
         splitter_config = SplitterConfig(
             chunk_size=300,
             chunk_overlap=0,
@@ -191,6 +238,77 @@ class MarkdownAndChunkDocuments:
         print("✅ Processing complete.")
         return mapped_chunks
+    def markdown_and_chunk_documents_stream(
+            self,
+            file_path: str,
+            input_bytes: bytes = None,
+            include_image: bool = None,
+            file_name: str = None,
+    ):
+        """Generator version of markdown_and_chunk_documents that yields progress JSON events"""
+        def report(pct, msg=""):
+            yield {"progress": int(pct), "status": msg}
+        # 1️⃣ Pick strategy
+        yield from report(5, "Selecting strategy...")
+        strategy = StrategyFactory.get_strategy(file_path, file_name)
+        if not strategy:
+            raise ValueError(f"Unsupported file type: {file_path}")
+        # 2️⃣ Convert to images
+        ext = get_file_extension(file_path, file_name)
+        yield from report(15, "Processing file into images...")
+        images = strategy.process(file_path, input_bytes, ext)
+        # 3️⃣ Extract Markdown
+        yield from report(35, "Extracting markdown...")
+        markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
+        binary_text_content = text_content.encode("utf-8")
+        # 4️⃣ Chunking
+        yield from report(55, "Chunking text...")
+        chunk_client = OpenAI(api_key=self.api_key)
+        cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
+        splitter_config = SplitterConfig(
+            chunk_size=300,
+            chunk_overlap=0,
+            separators=["\n"],
+            split_type=SplitType.R_PRETRAINED_PROPOSITION.value,
+            min_rl_chunk_size=5,
+            max_rl_chunk_size=50,
+            enableLLMTouchUp=False,
+        )
+        chunked_text = chunk_documents(
+            "", file_name="install_ins.txt", file_path=binary_text_content, splitter_config=splitter_config
+        )
+        flat_chunks = ["".join(inner) for inner in chunked_text]
+        # 5️⃣ Map chunks (embedding)
+        yield from report(60, f"Mapping {len(flat_chunks)} chunks...")
+        total = len(flat_chunks)
+        mapped_chunks = []
+        for i, chunk in enumerate(flat_chunks, start=1):
+            mapped = cm.map_chunks([chunk])
+            mapped_chunks.extend(mapped)
+            progress = 60 + (i / total) * 30
+            yield from report(progress, f"Mapping chunk {i}/{total}")
+        # 6️⃣ Merge unmapped markdown sections
+        yield from report(95, "Merging markdown...")
+        for md_item in markdown_output:
+            if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
+                md_item["chunked_text"] = md_item["markdown_text"]
+                mapped_chunks.append(md_item)
+        adduuid(mapped_chunks)
+        yield from report(100, "✅ Processing complete.")
+        # Final result
+        yield {"progress": 100, "status": "done", "result": mapped_chunks}
 def adduuid(mapped_chunks):
     # Assuming mapped_chunks is a list of dictionaries

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/test_loader.py RENAMED Viewed

@@ -1,12 +1,15 @@
 import json
 import pytest
+from dotenv import load_dotenv
+from langchain.chat_models import init_chat_model
+from langchain_openai import OpenAIEmbeddings
 from core.prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
 from core.prevectorchunks_core.services import chunk_documents_crud_vdb
 from core.prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
 from core.prevectorchunks_core.utils.file_loader import SplitType
+import os
+load_dotenv(override=True)
 # Create a temporary JSON file to test with
 @pytest.fixture
 def temp_json_file(tmp_path):
@@ -19,12 +22,16 @@ def temp_json_file(tmp_path):
 def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
     splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
-                                     split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
-                                     max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STANDARD)
+                                     split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
+                                     max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED)
+    client = init_chat_model(
+        model="gpt-4o-mini",
+        model_provider="openai",  # you can later swap to "anthropic", "google", etc.
+        api_key=os.getenv("OPENAI_API_KEY")
+    )
+    chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="C:\\test-sandbox\\be\\PreVectorDeps\\PreVectorChunks\\core\\prevectorchunks_core\\services\\content.pptx",
-    chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",
-                                                      splitter_config=splitter_config)
+                                                      splitter_config=splitter_config,client=client)
     print(chunks)
     for i, c in enumerate(chunks):
@@ -32,9 +39,19 @@ def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
     print(chunks)
 def test_markdown(temp_json_file):
-    markdown_and_chunk_documents = MarkdownAndChunkDocuments()
+    client = init_chat_model(
+        model="gpt-4o-mini",
+        model_provider="openai",  # you can later swap to "anthropic", "google", etc.
+        api_key=os.getenv("OPENAI_API_KEY")
+    )
+    markdown_and_chunk_documents = MarkdownAndChunkDocuments(client)
+    embedding_client = OpenAIEmbeddings(
+        model="text-embedding-3-small",
+        api_key=os.getenv("OPENAI_API_KEY")
+    )
     mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
-        "content.docx",include_image=True)
+        "C:\\test-sandbox\\be\\PreVectorDeps\\PreVectorChunks\\core\\prevectorchunks_core\\services\\content.pptx",include_image=True,embedding_client=embedding_client)
     print(mapped_chunks)
     for i, c in enumerate(mapped_chunks):
         print(f"Chunk {i + 1}: {c}")

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/file_loader.py RENAMED Viewed

@@ -9,6 +9,7 @@ from PIL import Image
 import pytesseract
 import uuid
+from langchain.chat_models import init_chat_model
 from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
 from openai import OpenAI
 from openai import OpenAI
@@ -26,7 +27,8 @@ client =  OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 from django.core.files.uploadedfile import UploadedFile
 from enum import Enum
+import pandas as pd
+from pptx import Presentation
 class SplitType(Enum):
     RECURSIVE = "RecursiveCharacterTextSplitter"
     CHARACTER = "CharacterTextSplitter"
@@ -151,6 +153,35 @@ def load_file_by_type(ext, filepath):
             data = json.load(f)
             # Convert JSON to text (pretty print or flatten)
             text = json.dumps(data, ensure_ascii=False, indent=2)
+    # -------------------------
+    # PPTX (PowerPoint)
+    # -------------------------
+    elif ext in [".pptx", ".ppt"]:
+        pres = Presentation(filepath)
+        slides_text = []
+        for slide in pres.slides:
+            slide_text = []
+            for shape in slide.shapes:
+                if hasattr(shape, "text"):
+                    slide_text.append(shape.text)
+            slides_text.append("\n".join(slide_text))
+        text = "\n\n---- Slide Break ----\n\n".join(slides_text)
+    # -------------------------
+    # Excel (XLS / XLSX)
+    # -------------------------
+    elif ext in [".xlsx", ".xls"]:
+        # Using pandas for convenience
+        try:
+            df_dict = pd.read_excel(filepath, sheet_name=None)
+            all_sheets = []
+            for sheet, df in df_dict.items():
+                sheet_text = f"=== Sheet: {sheet} ===\n"
+                sheet_text += df.to_string(index=False)
+                all_sheets.append(sheet_text)
+            text = "\n\n".join(all_sheets)
+        except Exception as e:
+            raise ValueError(f"Failed to read Excel file: {e}")
     else:
         raise ValueError(f"Unsupported file type: {ext}")
     return text
@@ -220,38 +251,63 @@ def split_text_by_config(text, splitter_config:SplitterConfig=None, binary_data=
             return [" ".join(words[i:i + splitter_config.chunk_size]) for i in
                     range(0, len(words), splitter_config.chunk_size)]
+import json
+from langchain.schema import HumanMessage
+import uuid
-def process_with_llm(chunk,instructions):
+def process_with_llm(chunk, instructions=None, xclient=None):
     """
     Send a chunk to LLM and return structured JSON array.
     Expected format: [{"id": ..., "title": ..., "text": ...}, ...]
     """
-    context = f"""
-    Take the following text and split it into sections based on the most important category headings (ignore lower level headings).
-    For each section, return a JSON object with - no extra words other than the json and remove ```json:
-    - "id" (a UUID you generate),
-    - "title" (the most important heading),
-    - "text" (the remaining text under that heading).
+    instructions = instructions or "Extract sections"
+    # Combine chunk + instructions into one prompt
+    prompt_text = f"""
+    You are a helpful assistant that structures text into JSON sections.
+    Take the following text and split it into sections based on the most important category headings.
+     return a JSON array of objects with the following keys:
+      - "id" (a UUID you generate)
+      - "title" (the most important heading)
+      - "text" (the remaining text under that heading)
+    Return ONLY valid JSON, without extra text or backtick or markdown formatting.
     Text:
     {chunk}
+    Instructions: {instructions}
     """
-    instructions=instructions or "Exract sections"
-    system_prompt="You are a helpful assistant that structures text into JSON sections."
-    # Create an instance of your LLM wrapper
-    llm = LLMClientWrapper(client, model="gpt-4o-mini", temperature=0, system_prompt=system_prompt)
-    response=llm.chat(context,instructions)
+    # Use provided client or create new wrapper
+    if xclient is None:
+        xclient = init_chat_model(
+            model="gpt-4o-mini",
+            model_provider="openai",  # you can later swap to "anthropic", "google", etc.
+            api_key=os.getenv("OPENAI_API_KEY")
+        )
+    # Call the LLM
+    response_msg = xclient.predict_messages([HumanMessage(content=prompt_text)])
+    response_text = response_msg.content
     # Parse JSON safely
     try:
-        structured_data = eval(response)
-    except Exception:
-        structured_data = []
+        structured_data = json.loads(response_text)
+        if isinstance(structured_data, str):
+            # Sometimes LLM returns a JSON string inside quotes
+            structured_data = json.loads(structured_data)
+    except json.JSONDecodeError as e:
+        print("LLM returned invalid JSON:", response_text)
+        raise e
+    for item in structured_data:
+        if isinstance(item, dict) and "id" not in item:
+            item["id"] = str(uuid.uuid4())
     return structured_data
-def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
+def process_large_text(text, instructions,splitter_config:SplitterConfig=None,client=None):
     """Main function: split -> send to LLM -> collect results."""
     chunks = split_text_by_config(text, splitter_config=splitter_config)
     all_results = []
@@ -261,7 +317,7 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
             return chunks
         elif splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED:
             for chunk in chunks:
-                structured = process_with_llm(chunk,instructions)
+                structured = process_with_llm(chunk,instructions,client)
                 # Ensure UUIDs exist
                 for obj in structured:
                     if "id" not in obj:
@@ -274,9 +330,9 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
-def prepare_chunked_text(file_path,file_name,instructions,chunk_size=200,splitter_config:SplitterConfig=None):
+def prepare_chunked_text(file_path,file_name,instructions,chunk_size=200,splitter_config:SplitterConfig=None,client=None):
     content =extract_content_agnostic(file_path,file_name)
-    results=process_large_text(content,instructions, splitter_config=splitter_config)
+    results=process_large_text(content,instructions, splitter_config=splitter_config,client=client)
     print (results)
     return results

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36/prevectorchunks_core.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.34
+Version: 0.1.36
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
 Requires-Dist: cssselect2~=0.7.0
 Requires-Dist: cairocffi~=1.4.0
 Requires-Dist: tensorflow~=2.12.0
+Requires-Dist: pandas~=2.2.2
+Requires-Dist: openpyxl~=3.1.2
+Requires-Dist: python-pptx~=0.6.21
 Dynamic: license-file
 # 📚 PreVectorChunks

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/requires.txt RENAMED Viewed

@@ -36,3 +36,6 @@ lxml~=4.9.3
 cssselect2~=0.7.0
 cairocffi~=1.4.0
 tensorflow~=2.12.0
+pandas~=2.2.2
+openpyxl~=3.1.2
+python-pptx~=0.6.21

{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "prevectorchunks-core"
-version = "0.1.34"
+version = "0.1.36"
 description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -52,6 +52,10 @@ dependencies = [
     "cssselect2~=0.7.0",
     "cairocffi~=1.4.0",
     "tensorflow~=2.12.0",   # <-- Add this
+        # 👉 Add these
+    "pandas~=2.2.2",
+    "openpyxl~=3.1.2",
+    "python-pptx~=0.6.21",
 ]