PyPI - prevectorchunks-core - Versions diffs - 0.1.39__tar.gz → 0.1.41__tar.gz - Mend

prevectorchunks-core 0.1.39tar.gz → 0.1.41tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

{prevectorchunks_core-0.1.39/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.41}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.39
+Version: 0.1.41
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -12,47 +12,35 @@ Requires-Python: <3.12,>=3.7
 Description-Content-Type: text/markdown
 License-File: LICENCE
 License-File: LICENSE
-Requires-Dist: packaging~=24.1
-Requires-Dist: openai<3.0.0,>=2.6.0
-Requires-Dist: python-dotenv~=1.0.1
-Requires-Dist: PyJWT~=2.7.0
+Requires-Dist: Django==5.1
+Requires-Dist: django-cors-headers~=4.4.0
 Requires-Dist: fastapi~=0.112.2
-Requires-Dist: datasets~=4.1.0
+Requires-Dist: PyJWT~=2.7.0
+Requires-Dist: langchain-text-splitters~=0.3.11
+Requires-Dist: openai~=2.6.0
 Requires-Dist: pinecone~=7.3.0
+Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: pytesseract~=0.3.13
 Requires-Dist: python-docx~=1.2.0
 Requires-Dist: PyPDF2~=3.0.1
 Requires-Dist: pillow~=11.3.0
-Requires-Dist: torch~=2.2.2
-Requires-Dist: torchvision~=0.17.2
-Requires-Dist: torchaudio~=2.2.2
+Requires-Dist: datasets~=4.1.1
+Requires-Dist: torch~=2.6.0
+Requires-Dist: torchvision~=0.21.0
+Requires-Dist: torchaudio~=2.6.0
 Requires-Dist: sentence-transformers~=5.1.1
-Requires-Dist: py-gutenberg~=1.0.3
-Requires-Dist: langchain-text-splitters~=0.3.11
-Requires-Dist: langchain~=0.3
-Requires-Dist: langchain_openai~=0.3.35
-Requires-Dist: accelerate>=0.22.0
 Requires-Dist: pathlib~=1.0.1
 Requires-Dist: transformers~=4.57.0
 Requires-Dist: imageio-ffmpeg~=0.6.0
-Requires-Dist: opencv-python~=4.8.0.76
+Requires-Dist: opencv-python~=4.12.0.88
 Requires-Dist: requests~=2.32.5
-Requires-Dist: langchain-core~=0.3.78
+Requires-Dist: langchain~=1.3.9
+Requires-Dist: langchain-openai~=1.0.0
 Requires-Dist: pdf2image~=1.17.0
 Requires-Dist: docx2pdf~=0.1.8
-Requires-Dist: numpy~=1.23.5
+Requires-Dist: numpy~=2.2.6
 Requires-Dist: scikit-learn~=1.7.2
-Requires-Dist: PyMuPDF~=1.22.5
-Requires-Dist: pypandoc~=1.13
-Requires-Dist: reportlab~=4.1.0
-Requires-Dist: weasyprint~=62.0
-Requires-Dist: lxml~=4.9.3
-Requires-Dist: cssselect2~=0.7.0
-Requires-Dist: cairocffi~=1.4.0
-Requires-Dist: tensorflow~=2.12.0
-Requires-Dist: pandas~=2.2.2
-Requires-Dist: openpyxl~=3.1.2
-Requires-Dist: python-pptx~=0.6.21
+Requires-Dist: fitz~=0.0.1.dev2
 Dynamic: license-file
 # 📚 PreVectorChunks

prevectorchunks_core-0.1.41/prevectorchunks_core/os-llm/dsqwen.py ADDED Viewed

@@ -0,0 +1,24 @@
+from transformers import pipeline
+import torch
+# ----- Step 1: Load the model using a text-generation pipeline -----
+# DeepSeek-R1-Distill-Qwen-1.5B is a text-only model.
+# Pick the device at runtime: 0 selects the first CUDA GPU, -1 selects the CPU.
+# A hard-coded 0 makes pipeline() raise on hosts that have no CUDA device.
+device = 0 if torch.cuda.is_available() else -1
+pipe = pipeline(
+    "text-generation",
+    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+    device=device,
+)
+# ----- Step 2: Define your prompt -----
+prompt = "Hello, are you Jon?"
+# ----- Step 3: Run inference -----
+out = pipe(
+    prompt,
+    max_new_tokens=50,  # upper bound on the number of generated tokens
+    do_sample=True,     # sample instead of decoding greedily, so runs vary
+    temperature=0.7     # sampling temperature
+)
+# ----- Step 4: Print output -----
+# `out` is a list of dicts, each dict has 'generated_text'
+print("Model response:", out[0]['generated_text'])

prevectorchunks_core-0.1.41/prevectorchunks_core/os-llm/llava.py ADDED Viewed

@@ -0,0 +1,29 @@
+from transformers import pipeline
+# #run locally
+# pipe = pipeline("image-text-to-text",
+#                 model="llava-hf/llava-1.5-13b-hf",
+#                 device_map="auto",
+#          load_in_4bit=True)
+# NOTE(review): 4-bit loading normally relies on a GPU-backed quantization
+# backend, yet device=-1 selects the CPU - confirm this combination works.
+pipe = pipeline(
+    "image-text-to-text",
+    model="llava-hf/llava-1.5-13b-hf",
+    device=-1,  # CPU
+    load_in_4bit=True,   # load model in 4-bit precision
+    use_auth_token=True
+)
+# The prompt is assembled from adjacent string literals, which Python joins with
+# no separator, so every piece ends with its own punctuation and a space.
+messages = [
+    {
+      "role": "user",
+      "content": [
+          {"type": "text", "text": "You are a content moderator - can you check if the content contains any personal information such as name, phone number, email etc. "
+                                   "If the content contains personal information, return json failed. "
+                                   "Here is the content : We need an electrician please contact John Doe on 0434343434"},
+        ],
+    },
+]
+# Text-only request: the message carries no image entry.
+out = pipe(text=messages, max_new_tokens=20)
+print(out)

{prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/DocuToImageConverter.py RENAMED Viewed

@@ -137,6 +137,89 @@ class DocuToImageConverter:
         pdf_document.close()
         return images
+    def _convert_txt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
+        """
+        Render UTF-8 plain text onto A4 pages with ReportLab and return the PDF path.
+        Args:
+            file_path: Path of a UTF-8 text file; only read when input_bytes is None.
+                Its stem also names the output PDF ("temp" when it is missing).
+            input_bytes: UTF-8 text as bytes, or a binary file-like object (rewound
+                with seek(0) before reading). Takes precedence over file_path.
+        Returns:
+            Path of the PDF written into a fresh temporary directory. The directory
+            is not removed here.
+        Raises:
+            ValueError: If neither file_path nor input_bytes is supplied.
+        """
+        # Resolve the text to render
+        if input_bytes is not None:
+            raw = input_bytes
+            if hasattr(raw, "read"):
+                raw.seek(0)
+                raw = raw.read()
+            text = raw.decode("utf-8")
+        elif file_path:
+            text = Path(file_path).read_text(encoding="utf-8")
+        else:
+            raise ValueError("Must supply either file_path or input_bytes")
+        # Decide where the PDF goes
+        stem = Path(file_path).stem if file_path else "temp"
+        output_pdf = os.path.join(tempfile.mkdtemp(), stem + ".pdf")
+        # Lay the text out one source line per row, top to bottom
+        _, page_height = A4
+        top_y = page_height - 50
+        pdf = canvas.Canvas(output_pdf, pagesize=A4)
+        cursor_y = top_y
+        for row in text.splitlines():
+            # Rows are cut at 1000 characters and are not wrapped
+            pdf.drawString(50, cursor_y, row[:1000])
+            cursor_y -= 15
+            if cursor_y < 50:
+                # Bottom margin reached: start a new page at the top margin
+                pdf.showPage()
+                cursor_y = top_y
+        pdf.save()
+        return output_pdf
+    def _convert_csv_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
+        """
+        Convert a CSV file or CSV bytes to a single-table PDF using ReportLab.
+        Args:
+            file_path: Path of a UTF-8 CSV file; only read when input_bytes is None.
+                Its stem also names the output PDF ("temp" when it is missing).
+            input_bytes: UTF-8 CSV content as bytes, or a binary file-like object
+                (rewound with seek(0) before reading). Takes precedence over file_path.
+        Returns:
+            Path of the PDF written into a fresh temporary directory. The directory
+            is not removed here.
+        Raises:
+            ValueError: If neither input is supplied, or if the CSV has no rows.
+        """
+        # Read CSV data
+        if input_bytes is not None:
+            payload = input_bytes
+            if hasattr(payload, "read"):
+                payload.seek(0)
+                payload = payload.read()
+            rows = list(csv.reader(io.StringIO(payload.decode("utf-8"))))
+        elif file_path:
+            # newline="" hands line endings to the csv module untouched, as its
+            # documentation asks; without it, line breaks inside quoted fields are
+            # rewritten and the file branch parses differently from the bytes branch.
+            with open(file_path, "r", encoding="utf-8", newline="") as f:
+                rows = list(csv.reader(f))
+        else:
+            raise ValueError("Must supply either file_path or input_bytes")
+        if not rows:
+            raise ValueError("CSV is empty")
+        # Prepare output PDF
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, (Path(file_path).stem if file_path else "temp") + ".pdf")
+        # Create a table PDF; the first CSV row is passed as the repeated header row
+        doc = SimpleDocTemplate(output_pdf, pagesize=A4)
+        table = Table(rows, repeatRows=1)
+        # Style table: shaded bold header row, left-aligned 10pt cells, thin grid
+        style = TableStyle([
+            ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
+            ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
+            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
+            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
+            ('FONTSIZE', (0, 0), (-1, -1), 10),
+            ('BOTTOMPADDING', (0, 0), (-1, 0), 6),
+            ('GRID', (0, 0), (-1, -1), 0.5, colors.grey)
+        ])
+        table.setStyle(style)
+        doc.build([table])
+        return output_pdf
     def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
         """
         Convert a file path or binary content to PIL images.
@@ -167,6 +250,15 @@ class DocuToImageConverter:
             pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
             images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+        # TXT → PDF
+        elif ext == ".txt":
+            pdf_path = self._convert_txt_to_pdf(file_path, input_bytes)
+            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+        # CSV → PDF
+        elif ext == ".csv":
+            pdf_path = self._convert_csv_to_pdf(file_path, input_bytes)
+            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
         # PowerPoint → PDF
         elif ext in [".ppt", ".pptx"]:
             pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)

{prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/DocuToMarkdownExtractor.py RENAMED Viewed

@@ -41,6 +41,84 @@ class DocuToMarkdownExtractor:
             with open(tmp.name, "rb") as f:
                 return base64.b64encode(f.read()).decode("utf-8")
+    def extract_markdown_pages(self, pages, include_image: bool = True):
+        """
+        Extract Markdown from each page dict and attach the page's image payloads.
+        Args:
+            pages: Sequence of dicts. "page_image" is read for the rendered page and
+                "embedded_images" is read (through conv_to_base64) for embedded images.
+            include_image: When True, each result also carries "page_image_data"
+                (base64 of the page image) and "embedded_images".
+        Returns:
+            Tuple (all_outputs, text_content): the per-page response objects, and the
+            "markdown_text" values joined with a leading newline before each page.
+            Pages whose reply cannot be used are left out of both.
+        """
+        all_outputs = []
+        text_parts = []
+        for i, page in enumerate(pages, start=1):
+            print(f"🧠 Processing page {i}/{len(pages)}...")
+            # 1. Get the primary page image for OCR/Parsing
+            # The rendered page picture is stored under the "page_image" key
+            rendered_image = page.get("page_image")
+            b64_page_image = self._image_to_base64(rendered_image)
+            processor = ImageProcessor(model_name="gpt-4o-mini")
+            # Adjacent literals are joined with no separator, so each sentence
+            # carries its own trailing space.
+            fins = [
+                {"type": "text", "text": (
+                    "You are a document parser. Extract all text, images and tables "
+                    "from this image and format the output in clean Markdown. "
+                    "Preserve table structure, headings, and lists. If no markdown, put a space. "
+                    "Describe any visual elements or images found on this page. "
+                    "Return only VALID JSON with keys: markdown_text, short_title, page_number, summary."
+                )},
+            ]
+            response = processor.analyze(encoded_image=b64_page_image, finstructioncontent=fins)
+            if isinstance(response, str):
+                try:
+                    response = json.loads(response)
+                except json.JSONDecodeError:
+                    print('skipping quietly')
+                    continue
+            if not hasattr(response, "get"):
+                # A missing reply, or valid JSON that is not an object (list, number,
+                # null), has no keys to read; skip it like an unparsable reply.
+                print('skipping quietly')
+                continue
+            # 2. Integrate text content (a null markdown_text counts as empty)
+            text_parts.append("\n" + (response.get("markdown_text") or ""))
+            # 3. Attach the page image and this page's embedded images
+            if include_image:
+                response["page_image_data"] = b64_page_image
+                response["embedded_images"] = self.conv_to_base64(page,response)
+            # Position in `pages` overrides any page number the model reported
+            response["image_index"] = i
+            response["page_number"] = i
+            all_outputs.append(response)
+        return all_outputs, "".join(text_parts)
+    def conv_to_base64(self, page,response):
+        """
+        Serialize a page's embedded images to a JSON array string.
+        Args:
+            page: Dict whose optional "embedded_images" list holds image dicts.
+            response: Not used by this method.
+        Returns:
+            JSON text of a list of objects with "image_index", "image_data" and
+            "content_type" ("image/png" when the source dict has none).
+        """
+        def _as_text(payload):
+            # Raw bytes become a base64 string; anything else passes through as is
+            if isinstance(payload, bytes):
+                return base64.b64encode(payload).decode("utf-8")
+            return payload
+        serialisable = [
+            {
+                "image_index": entry.get("image_index"),
+                # "image_bytes" wins; "image_data" is the fallback when it is missing or empty
+                "image_data": _as_text(entry.get("image_bytes") or entry.get("image_data")),
+                "content_type": entry.get("content_type", "image/png"),
+            }
+            for entry in page.get("embedded_images", [])
+        ]
+        return json.dumps(serialisable)
     def extract_markdown(self, images,include_image:bool=True):
         """Extracts Markdown-formatted text from each image page."""
         all_outputs = []
@@ -51,7 +129,7 @@ class DocuToMarkdownExtractor:
             b64_image = self._image_to_base64(image)
             processor = ImageProcessor(model_name="gpt-4o-mini")
-            fins = [{"type": "text", "text": "You are a document parser. Extract all text and tables "
+            fins = [{"type": "text", "text": "You are a document parser. Extract all text, images and tables "
                                              "from this image and format the output in clean Markdown. "
                                              "Preserve table structure, headings, and lists. If there is no markdown, put a space. "
                                              "Put your result in a JSON object with the following keys:"

prevectorchunks_core-0.1.41/prevectorchunks_core/services/EmbeddedImageExtractor.py ADDED Viewed

@@ -0,0 +1,47 @@
+import os
+import tempfile
+import shutil
+import subprocess
+from base64 import b64encode
+from pathlib import Path
+from PIL import Image
+import io
+import fitz
+from docx2pdf import convert as docx2pdf_convert
+from docx import Document
+class EmbeddedImageExtractor:
+    """Collects the images stored inside a DOCX file as base64-encoded payloads."""
+    def __init__(self):
+        pass
+    # ----------------------------
+    # DOCX helper
+    # ----------------------------
+    def extract_all_images_from_docx(self, file_path, page_number):
+        """
+        Return every image found in the DOCX at file_path.
+        page_number is accepted but not used: the result covers the whole document.
+        Each item is the dict built by _format_output.
+        """
+        document_part = Document(file_path).part
+        # First pass: parts related to the main document part, keyed by relationship id
+        found = [
+            self._format_output(target.blob, rel_id)
+            for rel_id, target in document_part.related_parts.items()
+            if "image" in target.content_type
+        ]
+        if found:
+            return found
+        # Fallback: nothing related directly, so scan every part of the package;
+        # these parts have no relationship id here, hence the placeholder
+        return [
+            self._format_output(part.blob, "unknown_rid")
+            for part in document_part.package.parts
+            if "image" in part.content_type
+        ]
+    def _format_output(self, blob, rId):
+        """Wrap raw image bytes as {"image_data": <base64 str>, "rel_id": rId}."""
+        encoded = b64encode(blob).decode("utf-8")
+        return {"image_data": encoded, "rel_id": rId}

{prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/markdown_and_chunk_documents.py RENAMED Viewed

@@ -2,6 +2,7 @@ import os
 import json
 import tempfile
 import uuid
+from base64 import b64encode
 from io import BytesIO
 from pathlib import Path
@@ -12,6 +13,7 @@ from openai import OpenAI
 from PIL import Image
 from .DocuToImageConverter import DocuToImageConverter
+from .EmbeddedImageExtractor import EmbeddedImageExtractor
 from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
 from ..config.splitter_config import SplitterConfig
 from .chunk_documents_crud_vdb import chunk_documents
@@ -151,6 +153,8 @@ class StrategyFactory:
         ".pdf": PDFStrategy(),
         ".doc": WordStrategy(),
         ".docx": WordStrategy(),
+        ".txt": WordStrategy(),
+        ".csv": WordStrategy(),
         ".jpg": ImageStrategy(),
         ".jpeg": ImageStrategy(),
         ".png": ImageStrategy(),
@@ -190,6 +194,47 @@ class MarkdownAndChunkDocuments:
         self.extractor = DocuToMarkdownExtractor(api_key=self.api_key,client=client)
         self.client=client
+    def extract_embedded_images_for_pages(self,page_images, file_path, ext):
+        """
+        Pair every rendered page image with the embedded images found for it.
+        Args:
+            page_images (List[PIL.Image]): Rendered page images, in page order
+            file_path (str): Original file path
+            ext (str): File extension including the dot (e.g., .docx, .pdf, .png)
+        Returns:
+            List[dict]: One dict per page with the keys:
+                - page_number: 1-based position of the page
+                - page_image: the page image exactly as passed in
+                - embedded_images: list of image dicts; empty when the file type is
+                  not handled or the extractor offers no method for it
+        """
+        image_exts = (".png", ".jpg", ".jpeg", ".tiff", ".bmp")
+        # Build the extractor once, and only for the formats that use it
+        eie = EmbeddedImageExtractor() if ext in (".docx", ".pdf") else None
+        # The extractor class added alongside this code defines no PDF method; look it
+        # up defensively so PDFs get no embedded images instead of an AttributeError
+        extract_pdf_page = getattr(eie, "extract_embedded_images_from_pdf_page", None)
+        # The DOCX extractor added alongside this code returns the whole document's
+        # images and ignores page_number, so parse the file once and reuse the result
+        docx_images = None
+        pages = []
+        for i, page_image in enumerate(page_images, start=1):
+            embedded_images = []
+            if ext == ".docx":
+                if docx_images is None:
+                    docx_images = eie.extract_all_images_from_docx(file_path=file_path, page_number=i)
+                # Copy per page so pages never share mutable image dicts
+                embedded_images = [dict(img) for img in docx_images]
+            elif ext == ".pdf":
+                if extract_pdf_page is not None:
+                    embedded_images = extract_pdf_page(file_path=file_path, page_number=i)
+            elif ext in image_exts:
+                # Single image files → treat the image itself as embedded
+                buf = BytesIO()
+                page_image.save(buf, format="PNG")
+                b64_image = b64encode(buf.getvalue()).decode("utf-8")
+                embedded_images = [{"image_data": b64_image, "image_index": 1, "page_number": i}]
+            # You can add more document types here (e.g., PPTX, HTML)
+            pages.append({
+                "page_number": i,
+                "page_image": page_image,
+                "embedded_images": embedded_images
+            })
+        return pages
     def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None,embedding_client=None):
         # Pick strategy
         strategy = StrategyFactory.get_strategy(file_path,file_name)
@@ -200,6 +245,10 @@ class MarkdownAndChunkDocuments:
         ext=get_file_extension(file_path,file_name)
         images = strategy.process(file_path, input_bytes,ext)
+        # NEW: for each page, extract embedded images
+        eie=EmbeddedImageExtractor()
+        pages = self.extract_embedded_images_for_pages(images, file_path, ext)
         # Extract Markdown from images
         markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
         binary_text_content = text_content.encode("utf-8")
@@ -262,6 +311,10 @@ class MarkdownAndChunkDocuments:
         yield from report(15, "Processing file into images...")
         images = strategy.process(file_path, input_bytes, ext)
+        # NEW: for each page, extract embedded images
+        eie = EmbeddedImageExtractor()
+        pages = self.extract_embedded_images_for_pages(images, file_path, ext)
         # 3️⃣ Extract Markdown
         yield from report(35, "Extracting markdown...")
         markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)

{prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41/prevectorchunks_core.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.39
+Version: 0.1.41
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -12,47 +12,35 @@ Requires-Python: <3.12,>=3.7
 Description-Content-Type: text/markdown
 License-File: LICENCE
 License-File: LICENSE
-Requires-Dist: packaging~=24.1
-Requires-Dist: openai<3.0.0,>=2.6.0
-Requires-Dist: python-dotenv~=1.0.1
-Requires-Dist: PyJWT~=2.7.0
+Requires-Dist: Django==5.1
+Requires-Dist: django-cors-headers~=4.4.0
 Requires-Dist: fastapi~=0.112.2
-Requires-Dist: datasets~=4.1.0
+Requires-Dist: PyJWT~=2.7.0
+Requires-Dist: langchain-text-splitters~=0.3.11
+Requires-Dist: openai~=2.6.0
 Requires-Dist: pinecone~=7.3.0
+Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: pytesseract~=0.3.13
 Requires-Dist: python-docx~=1.2.0
 Requires-Dist: PyPDF2~=3.0.1
 Requires-Dist: pillow~=11.3.0
-Requires-Dist: torch~=2.2.2
-Requires-Dist: torchvision~=0.17.2
-Requires-Dist: torchaudio~=2.2.2
+Requires-Dist: datasets~=4.1.1
+Requires-Dist: torch~=2.6.0
+Requires-Dist: torchvision~=0.21.0
+Requires-Dist: torchaudio~=2.6.0
 Requires-Dist: sentence-transformers~=5.1.1
-Requires-Dist: py-gutenberg~=1.0.3
-Requires-Dist: langchain-text-splitters~=0.3.11
-Requires-Dist: langchain~=0.3
-Requires-Dist: langchain_openai~=0.3.35
-Requires-Dist: accelerate>=0.22.0
 Requires-Dist: pathlib~=1.0.1
 Requires-Dist: transformers~=4.57.0
 Requires-Dist: imageio-ffmpeg~=0.6.0
-Requires-Dist: opencv-python~=4.8.0.76
+Requires-Dist: opencv-python~=4.12.0.88
 Requires-Dist: requests~=2.32.5
-Requires-Dist: langchain-core~=0.3.78
+Requires-Dist: langchain~=1.3.9
+Requires-Dist: langchain-openai~=1.0.0
 Requires-Dist: pdf2image~=1.17.0
 Requires-Dist: docx2pdf~=0.1.8
-Requires-Dist: numpy~=1.23.5
+Requires-Dist: numpy~=2.2.6
 Requires-Dist: scikit-learn~=1.7.2
-Requires-Dist: PyMuPDF~=1.22.5
-Requires-Dist: pypandoc~=1.13
-Requires-Dist: reportlab~=4.1.0
-Requires-Dist: weasyprint~=62.0
-Requires-Dist: lxml~=4.9.3
-Requires-Dist: cssselect2~=0.7.0
-Requires-Dist: cairocffi~=1.4.0
-Requires-Dist: tensorflow~=2.12.0
-Requires-Dist: pandas~=2.2.2
-Requires-Dist: openpyxl~=3.1.2
-Requires-Dist: python-pptx~=0.6.21
+Requires-Dist: fitz~=0.0.1.dev2
 Dynamic: license-file
 # 📚 PreVectorChunks

{prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core.egg-info/SOURCES.txt RENAMED Viewed

@@ -8,6 +8,7 @@ pyproject.toml
 ./prevectorchunks_core/config/splitter_config.py
 ./prevectorchunks_core/migrations/__init__.py
 ./prevectorchunks_core/os-llm/__init__.py
+./prevectorchunks_core/os-llm/dsqwen.py
 ./prevectorchunks_core/os-llm/llava.py
 ./prevectorchunks_core/rlchunker/__init__.py
 ./prevectorchunks_core/rlchunker/env.py
@@ -22,6 +23,7 @@ pyproject.toml
 ./prevectorchunks_core/rlchunker/pretrained/policy_model.pt
 ./prevectorchunks_core/services/DocuToImageConverter.py
 ./prevectorchunks_core/services/DocuToMarkdownExtractor.py
+./prevectorchunks_core/services/EmbeddedImageExtractor.py
 ./prevectorchunks_core/services/__init__.py
 ./prevectorchunks_core/services/audio_processor.py
 ./prevectorchunks_core/services/chunk_documents_crud_vdb.py
@@ -47,6 +49,7 @@ prevectorchunks_core/config/__init__.py
 prevectorchunks_core/config/splitter_config.py
 prevectorchunks_core/migrations/__init__.py
 prevectorchunks_core/os-llm/__init__.py
+prevectorchunks_core/os-llm/dsqwen.py
 prevectorchunks_core/os-llm/llava.py
 prevectorchunks_core/rlchunker/__init__.py
 prevectorchunks_core/rlchunker/env.py
@@ -61,6 +64,7 @@ prevectorchunks_core/rlchunker/pretrained/model_info.txt
 prevectorchunks_core/rlchunker/pretrained/policy_model.pt
 prevectorchunks_core/services/DocuToImageConverter.py
 prevectorchunks_core/services/DocuToMarkdownExtractor.py
+prevectorchunks_core/services/EmbeddedImageExtractor.py
 prevectorchunks_core/services/__init__.py
 prevectorchunks_core/services/audio_processor.py
 prevectorchunks_core/services/chunk_documents_crud_vdb.py

prevectorchunks_core-0.1.41/prevectorchunks_core.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,29 @@
+Django==5.1
+django-cors-headers~=4.4.0
+fastapi~=0.112.2
+PyJWT~=2.7.0
+langchain-text-splitters~=0.3.11
+openai~=2.6.0
+pinecone~=7.3.0
+python-dotenv~=1.0.1
+pytesseract~=0.3.13
+python-docx~=1.2.0
+PyPDF2~=3.0.1
+pillow~=11.3.0
+datasets~=4.1.1
+torch~=2.6.0
+torchvision~=0.21.0
+torchaudio~=2.6.0
+sentence-transformers~=5.1.1
+pathlib~=1.0.1
+transformers~=4.57.0
+imageio-ffmpeg~=0.6.0
+opencv-python~=4.12.0.88
+requests~=2.32.5
+langchain~=1.3.9
+langchain-openai~=1.0.0
+pdf2image~=1.17.0
+docx2pdf~=0.1.8
+numpy~=2.2.6
+scikit-learn~=1.7.2
+fitz~=0.0.1.dev2

{prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "prevectorchunks-core"
-version = "0.1.39"
+version = "0.1.41"
 description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -14,48 +14,35 @@ authors = [
 ]
 dependencies = [
-    "packaging~=24.1",
-    "openai>=2.6.0,<3.0.0",
-    "python-dotenv~=1.0.1",
-    "PyJWT~=2.7.0",
+    "Django==5.1",
+    "django-cors-headers~=4.4.0",
     "fastapi~=0.112.2",
-    "datasets~=4.1.0",
+    "PyJWT~=2.7.0",
+    "langchain-text-splitters~=0.3.11",
+    "openai~=2.6.0",
     "pinecone~=7.3.0",
+    "python-dotenv~=1.0.1",
     "pytesseract~=0.3.13",
     "python-docx~=1.2.0",
     "PyPDF2~=3.0.1",
     "pillow~=11.3.0",
-    "torch~=2.2.2",
-    "torchvision~=0.17.2",
-    "torchaudio~=2.2.2",
+    "datasets~=4.1.1",
+    "torch~=2.6.0",
+    "torchvision~=0.21.0",
+    "torchaudio~=2.6.0",
     "sentence-transformers~=5.1.1",
-    "py-gutenberg~=1.0.3",
-    "langchain-text-splitters~=0.3.11",
-    "langchain~=0.3",
-    "langchain_openai~=0.3.35",
-    "accelerate>=0.22.0",
     "pathlib~=1.0.1",
     "transformers~=4.57.0",
     "imageio-ffmpeg~=0.6.0",
-    "opencv-python~= 4.8.0.76",
+    "opencv-python~=4.12.0.88",
     "requests~=2.32.5",
-    "langchain-core~=0.3.78",
+    "langchain~=1.3.9",
+    "langchain-openai~=1.0.0",
     "pdf2image~=1.17.0",
     "docx2pdf~=0.1.8",
-    "numpy~=1.23.5",
+    "numpy~=2.2.6",
     "scikit-learn~=1.7.2",
-    "PyMuPDF~=1.22.5",
-    "pypandoc~=1.13",
-    "reportlab~=4.1.0",
-    "weasyprint~=62.0",
-    "lxml~=4.9.3",
-    "cssselect2~=0.7.0",
-    "cairocffi~=1.4.0",
-    "tensorflow~=2.12.0",   # <-- Add this
-        # 👉 Add these
-    "pandas~=2.2.2",
-    "openpyxl~=3.1.2",
-    "python-pptx~=0.6.21",
+    "fitz~=0.0.1.dev2",
 ]

prevectorchunks_core-0.1.39/prevectorchunks_core/os-llm/llava.py DELETED Viewed

@@ -1,15 +0,0 @@
-from transformers import pipeline
-pipe = pipeline("image-text-to-text", model="llava-hf/llava-1.5-13b-hf")
-messages = [
-    {
-      "role": "user",
-      "content": [
-          {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"},
-          {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
-        ],
-    },
-]
-out = pipe(text=messages, max_new_tokens=20)
-print(out)

prevectorchunks_core-0.1.39/prevectorchunks_core.egg-info/requires.txt DELETED Viewed

@@ -1,41 +0,0 @@
-packaging~=24.1
-openai<3.0.0,>=2.6.0
-python-dotenv~=1.0.1
-PyJWT~=2.7.0
-fastapi~=0.112.2
-datasets~=4.1.0
-pinecone~=7.3.0
-pytesseract~=0.3.13
-python-docx~=1.2.0
-PyPDF2~=3.0.1
-pillow~=11.3.0
-torch~=2.2.2
-torchvision~=0.17.2
-torchaudio~=2.2.2
-sentence-transformers~=5.1.1
-py-gutenberg~=1.0.3
-langchain-text-splitters~=0.3.11
-langchain~=0.3
-langchain_openai~=0.3.35
-accelerate>=0.22.0
-pathlib~=1.0.1
-transformers~=4.57.0
-imageio-ffmpeg~=0.6.0
-opencv-python~=4.8.0.76
-requests~=2.32.5
-langchain-core~=0.3.78
-pdf2image~=1.17.0
-docx2pdf~=0.1.8
-numpy~=1.23.5
-scikit-learn~=1.7.2
-PyMuPDF~=1.22.5
-pypandoc~=1.13
-reportlab~=4.1.0
-weasyprint~=62.0
-lxml~=4.9.3
-cssselect2~=0.7.0
-cairocffi~=1.4.0
-tensorflow~=2.12.0
-pandas~=2.2.2
-openpyxl~=3.1.2
-python-pptx~=0.6.21