PyPI - prevectorchunks-core - Versions diffs - 0.1.27__tar.gz → 0.1.29__tar.gz - Mend

prevectorchunks-core 0.1.27.tar.gz → 0.1.29.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{prevectorchunks_core-0.1.27/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.29}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.27
+Version: 0.1.29
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -40,7 +40,7 @@ Requires-Dist: requests~=2.32.5
 Requires-Dist: langchain-core~=0.3.78
 Requires-Dist: pdf2image~=1.17.0
 Requires-Dist: docx2pdf~=0.1.8
-Requires-Dist: numpy~=2.2.6
+Requires-Dist: numpy~=2.0.0
 Requires-Dist: scikit-learn~=1.7.2
 Requires-Dist: PyMuPDF~=1.22.5
 Requires-Dist: pypandoc~=1.13

{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/DocuToImageConverter.py RENAMED Viewed

@@ -3,6 +3,7 @@ import shutil
 import subprocess
 import sys
 import tempfile
+from pathlib import Path
 import pypandoc
 from PIL import Image
@@ -23,19 +24,44 @@ class DocuToImageConverter:
         pass
     def _convert_doc_to_pdf(self, input_path: str) -> str:
-        import shutil, tempfile, os, pypandoc
-        from docx import Document
+        import os, tempfile, shutil, subprocess
+        from pathlib import Path
         if not os.path.exists(input_path):
             raise FileNotFoundError(input_path)
         output_dir = tempfile.mkdtemp()
-        output_pdf = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
+        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
+        # 1️⃣ Try Microsoft Word COM automation (Windows only)
+        try:
+            import win32com.client
+            word = win32com.client.Dispatch("Word.Application")
+            word.Visible = False
+            doc = word.Documents.Open(str(Path(input_path).resolve()))
+            doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)  # 17 = wdFormatPDF
+            doc.Close()
+            word.Quit()
+            print("✅ Word COM conversion successful:", output_pdf)
+            return output_pdf
+        except Exception as e:
+            print("⚠️ Word COM conversion failed:", e)
-        # 1️⃣ Try Pandoc + wkhtmltopdf or pdflatex
+        # 2️⃣ Fallback: LibreOffice (cross-platform, preserves layout)
         try:
-            pypandoc.get_pandoc_path()
+            # Requires LibreOffice installed and in PATH
+            subprocess.run(
+                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
+                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
+            print("✅ LibreOffice conversion successful:", output_pdf)
+            return output_pdf
+        except Exception as e:
+            print("⚠️ LibreOffice conversion failed:", e)
+        # 3️⃣ Fallback: Pandoc (simpler, loses layout)
+        try:
+            import pypandoc
             def which(cmd):
                 return shutil.which(cmd) is not None
@@ -44,23 +70,16 @@ class DocuToImageConverter:
                 input_path, "pdf", outputfile=output_pdf,
                 extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
             )
+            print("✅ Pandoc conversion successful:", output_pdf)
             return output_pdf
         except Exception as e:
-            print("⚠️ Pandoc PDF conversion failed:", e)
+            print("⚠️ Pandoc conversion failed:", e)
-        # 2️⃣ Fallback to pure Python (WeasyPrint)
-        try:
-            from weasyprint import HTML
-            doc = Document(input_path)
-            html = "<html><body>" + "".join(f"<p>{p.text}</p>" for p in doc.paragraphs) + "</body></html>"
-            HTML(string=html).write_pdf(output_pdf)
-            return output_pdf
-        except Exception as e:
-            print("⚠️ Fallback to WeasyPrint failed:", e)
-        # 3️⃣ Last resort (plain text with ReportLab)
+        # 4️⃣ Last resort: ReportLab basic text (no formatting)
         from reportlab.pdfgen import canvas
         from reportlab.lib.pagesizes import A4
+        from docx import Document
         doc = Document(input_path)
         c = canvas.Canvas(output_pdf, pagesize=A4)
         width, height = A4
@@ -68,7 +87,11 @@ class DocuToImageConverter:
         for p in doc.paragraphs:
             c.drawString(50, y, p.text[:1000])
             y -= 15
+            if y < 50:
+                c.showPage()
+                y = height - 50
         c.save()
+        print("⚠️ Fallback to plain ReportLab text output:", output_pdf)
         return output_pdf
     def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):

{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/DocuToMarkdownExtractor.py RENAMED Viewed

@@ -42,12 +42,12 @@ class DocuToMarkdownExtractor:
             fins = [{"type": "text", "text": "You are a document parser. Extract all text and tables "
                                              "from this image and format the output in clean Markdown. "
                                              "Preserve table structure, headings, and lists. If there is no markdown, put a space. "
-                                             "Put your result in a JSON object with the following keys:\n"
-                                             "- markdown_text: the markdown text\n"
-                                             "- short_title: the short title of the document\n"
-                                             "- page_number: the page number of the document (image index + 1)\n"
-                                             "- summary: a summary of the document\n,"
-                                             " - image_data: the image data in base64 format\n,"
+                                             "Put your result in a JSON object with the following keys:"
+                                             "- markdown_text: the markdown text"
+                                             "- short_title: the short title of the document"
+                                             "- page_number: the page number of the document (i+1)"
+                                             "- summary: a summary of the document,"
+                                             " - image_data: the image data in base64 format,"
                                              "Return only raw JSON, without markdown formatting or triple backticks."
                                              "- image_index: the index of the image in the document"},
                     {"type": "text", "text": "You are an image inspector. Tell us what is in the image "
@@ -63,6 +63,9 @@ class DocuToMarkdownExtractor:
             text_content=text_content+"\n"+response["markdown_text"]
             if(include_image):
                 response["image_data"]=b64_image
+            response["image_index"]=i
+            response["page_number"] = i
             all_outputs.append(response)
         json_array = json.dumps(all_outputs, indent=2)

{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/markdown_and_chunk_documents.py RENAMED Viewed

@@ -1,5 +1,7 @@
 import os
 import json
+import tempfile
+from pathlib import Path
 from docx import Document
 from dotenv import load_dotenv
@@ -51,25 +53,17 @@ class PDFStrategy(BaseDocumentStrategy):
 # -----------------------------
 class WordStrategy(BaseDocumentStrategy):
     def process(self, file_path: str):
+        file_path = Path(file_path)
         print(f"📝 Using WordStrategy for {file_path}")
-        # Extract text semantically first
-        try:
-            doc = Document(file_path)
-            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
-            text_content = "\n".join(paragraphs)
-            print(f"🧩 Extracted {len(paragraphs)} paragraphs via python-docx")
-        except Exception as e:
-            print("⚠️ Could not parse docx structurally, falling back to image mode:", e)
-            text_content = ""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            pdf_path = Path(tmpdir) / f"{file_path.stem}.pdf"
-        converter = DocuToImageConverter()
-        pdf_path = converter._convert_doc_to_pdf(file_path)
-        images = converter.convert_to_images(pdf_path)
+            converter = DocuToImageConverter()
+            pdf_path = converter._convert_doc_to_pdf(file_path)
+            images = converter.convert_to_images(pdf_path)
-        # Optional: attach text fallback
-        if text_content:
-            images[0].extracted_text = text_content  # for later use by extractor
         return images
@@ -115,7 +109,7 @@ class MarkdownAndChunkDocuments:
         self.api_key = os.getenv("OPENAI_API_KEY")
         self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
-    def markdown_and_chunk_documents(self, file_path: str):
+    def markdown_and_chunk_documents(self, file_path: str,include_image:bool):
         # Pick strategy
         strategy = StrategyFactory.get_strategy(file_path)
         if not strategy:
@@ -125,7 +119,7 @@ class MarkdownAndChunkDocuments:
         images = strategy.process(file_path)
         # Extract Markdown from images
-        markdown_output, text_content = self.extractor.extract_markdown(images, include_image=False)
+        markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
         binary_text_content = text_content.encode("utf-8")
         # Chunking and mapping

{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/test_loader.py RENAMED Viewed

@@ -34,7 +34,7 @@ def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
 def test_markdown(temp_json_file):
     markdown_and_chunk_documents = MarkdownAndChunkDocuments()
     mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
-        "content.docx")
+        "content.docx",include_image=True)
     print(mapped_chunks)
     for i, c in enumerate(mapped_chunks):
         print(f"Chunk {i + 1}: {c}")

{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29/prevectorchunks_core.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.27
+Version: 0.1.29
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
 License: MIT License
@@ -40,7 +40,7 @@ Requires-Dist: requests~=2.32.5
 Requires-Dist: langchain-core~=0.3.78
 Requires-Dist: pdf2image~=1.17.0
 Requires-Dist: docx2pdf~=0.1.8
-Requires-Dist: numpy~=2.2.6
+Requires-Dist: numpy~=2.0.0
 Requires-Dist: scikit-learn~=1.7.2
 Requires-Dist: PyMuPDF~=1.22.5
 Requires-Dist: pypandoc~=1.13

{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/requires.txt RENAMED Viewed

@@ -26,7 +26,7 @@ requests~=2.32.5
 langchain-core~=0.3.78
 pdf2image~=1.17.0
 docx2pdf~=0.1.8
-numpy~=2.2.6
+numpy~=2.0.0
 scikit-learn~=1.7.2
 PyMuPDF~=1.22.5
 pypandoc~=1.13

{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "prevectorchunks-core"
-version = "0.1.27"
+version = "0.1.29"
 description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -42,7 +42,7 @@ dependencies = [
     "langchain-core~=0.3.78",
     "pdf2image~=1.17.0",
     "docx2pdf~=0.1.8",
-    "numpy~=2.2.6",
+    "numpy~=2.0.0",
     "scikit-learn~=1.7.2",
     "PyMuPDF~=1.22.5",
     "pypandoc~=1.13",