prevectorchunks-core 0.1.26__tar.gz → 0.1.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of prevectorchunks-core might be problematic.
- prevectorchunks_core-0.1.27/LICENSE +2 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/PKG-INFO +14 -6
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/README.md +2 -1
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/splitter_config.py +5 -6
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToImageConverter.py +58 -7
- prevectorchunks_core-0.1.27/prevectorchunks_core/services/markdown_and_chunk_documents.py +167 -0
- prevectorchunks_core-0.1.27/prevectorchunks_core/test_loader.py +44 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/file_loader.py +14 -10
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/PKG-INFO +14 -6
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/SOURCES.txt +1 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/requires.txt +6 -4
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/pyproject.toml +12 -10
- prevectorchunks_core-0.1.26/prevectorchunks_core/services/markdown_and_chunk_documents.py +0 -71
- prevectorchunks_core-0.1.26/prevectorchunks_core/test_loader.py +0 -26
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/LICENCE +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/llava.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/image_processor.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/setup.cfg +0 -0
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/PKG-INFO RENAMED

@@ -1,14 +1,18 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.26
+Version: 0.1.27
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+License: MIT License
+Copyright (c) 2025 Your Name
+
 Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
 Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
+Requires-Python: <3.12,>=3.7
 Description-Content-Type: text/markdown
 License-File: LICENCE
+License-File: LICENSE
 Requires-Dist: packaging~=24.1
-Requires-Dist: requests~=2.32.3
 Requires-Dist: openai<3.0.0,>=2.6.0
 Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: PyJWT~=2.7.0
@@ -27,10 +31,7 @@ Requires-Dist: py-gutenberg~=1.0.3
 Requires-Dist: langchain-text-splitters~=0.3.11
 Requires-Dist: langchain~=0.3
 Requires-Dist: langchain_openai~=0.3.35
-Requires-Dist: transformers>=4.30.0
 Requires-Dist: accelerate>=0.22.0
-Requires-Dist: imageio-ffmpeg>=0.4.8
-Requires-Dist: opencv-python>=4.10.0
 Requires-Dist: pathlib~=1.0.1
 Requires-Dist: transformers~=4.57.0
 Requires-Dist: imageio-ffmpeg~=0.6.0
@@ -42,6 +43,12 @@ Requires-Dist: docx2pdf~=0.1.8
 Requires-Dist: numpy~=2.2.6
 Requires-Dist: scikit-learn~=1.7.2
 Requires-Dist: PyMuPDF~=1.22.5
+Requires-Dist: pypandoc~=1.13
+Requires-Dist: reportlab~=4.1.0
+Requires-Dist: weasyprint~=62.0
+Requires-Dist: lxml~=4.9.3
+Requires-Dist: cssselect2~=0.7.0
+Requires-Dist: cairocffi~=1.4.0
 Dynamic: license-file

 # 📚 PreVectorChunks
@@ -122,7 +129,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
                                       split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
                                       max_rl_chunk_size=50,enableLLMTouchUp=False)
 - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
-
+
+- **Returns**
 - A list of chunked strings including a unique id, a meaningful title and chunked text

 **Use Cases**
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/README.md RENAMED

@@ -76,7 +76,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
                                       split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
                                       max_rl_chunk_size=50,enableLLMTouchUp=False)
 - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
-
+
+- **Returns**
 - A list of chunked strings including a unique id, a meaningful title and chunked text

 **Use Cases**
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/splitter_config.py RENAMED

@@ -3,12 +3,10 @@ from dataclasses import dataclass, field
 from enum import Enum


-class
-
-
-
-    R_PRETRAINED_PROPOSITION = "RLBasedTextSplitterWithProposition"
-    R_PRETRAINED = "RLBasedTextSplitter"
+class LLM_Structured_Output_Type(Enum):
+    STANDARD = "STANDARD"
+    STRUCTURED_WITH_VECTOR_DB_ID_GENERATED = "STRUCTURED_WITH_VECTOR_DB_ID_GENERATED"
+

 @dataclass()
 class SplitterConfig:
@@ -17,6 +15,7 @@ class SplitterConfig:
     separators: list[str] = field(default_factory=lambda: ["\n"])
     split_type: str = "recursive_splitter"
     enableLLMTouchUp: bool = True
+    llm_structured_output_type: LLM_Structured_Output_Type = LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED
     min_rl_chunk_size: int = 5
     max_rl_chunk_size: int = 50

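A minimal usage sketch of the new llm_structured_output_type field, assuming the package is importable as prevectorchunks_core (values taken from the diff above and the test file below):

from prevectorchunks_core.config.splitter_config import (
    SplitterConfig, LLM_Structured_Output_Type,
)

# STANDARD bypasses the LLM touch-up and returns plain chunks;
# STRUCTURED_WITH_VECTOR_DB_ID_GENERATED (the default) returns
# LLM-structured chunks, each guaranteed to carry a UUID "id".
config = SplitterConfig(
    chunk_size=300,
    chunk_overlap=0,
    separators=["\n"],
    enableLLMTouchUp=True,
    llm_structured_output_type=LLM_Structured_Output_Type.STANDARD,
)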
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToImageConverter.py RENAMED

@@ -1,11 +1,20 @@
 import os
+import shutil
+import subprocess
+import sys
 import tempfile
+
+import pypandoc
 from PIL import Image
 import io
 from docx2pdf import convert as docx_to_pdf
 import fitz
-
-
+from docx2pdf import convert as docx2pdf_convert
+try:
+    pypandoc.get_pandoc_path()
+except OSError:
+    print("Pandoc not found — downloading it temporarily...")
+    pypandoc.download_pandoc()

 class DocuToImageConverter:
     """Converts a document (PDF, DOCX, DOC) into a list of PIL images."""
@@ -13,11 +22,53 @@ class DocuToImageConverter:
     def __init__(self):
         pass

-    def _convert_doc_to_pdf(self,
-
-
-
-
+    def _convert_doc_to_pdf(self, input_path: str) -> str:
+        import shutil, tempfile, os, pypandoc
+        from docx import Document
+
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(input_path)
+
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
+
+        # 1️⃣ Try Pandoc + wkhtmltopdf or pdflatex
+        try:
+            pypandoc.get_pandoc_path()
+
+            def which(cmd):
+                return shutil.which(cmd) is not None
+
+            pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
+            pypandoc.convert_file(
+                input_path, "pdf", outputfile=output_pdf,
+                extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
+            )
+            return output_pdf
+        except Exception as e:
+            print("⚠️ Pandoc PDF conversion failed:", e)
+
+        # 2️⃣ Fallback to pure Python (WeasyPrint)
+        try:
+            from weasyprint import HTML
+            doc = Document(input_path)
+            html = "<html><body>" + "".join(f"<p>{p.text}</p>" for p in doc.paragraphs) + "</body></html>"
+            HTML(string=html).write_pdf(output_pdf)
+            return output_pdf
+        except Exception as e:
+            print("⚠️ Fallback to WeasyPrint failed:", e)

+        # 3️⃣ Last resort (plain text with ReportLab)
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.pagesizes import A4
+        doc = Document(input_path)
+        c = canvas.Canvas(output_pdf, pagesize=A4)
+        width, height = A4
+        y = height - 50
+        for p in doc.paragraphs:
+            c.drawString(50, y, p.text[:1000])
+            y -= 15
+        c.save()
         return output_pdf

     def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
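The conversion now degrades gracefully: Pandoc (with pdflatex or wkhtmltopdf) first, then WeasyPrint over HTML built from the docx paragraphs, then a plain-text ReportLab render. A short sketch of the call path, mirroring how WordStrategy below uses it ("report.docx" is a hypothetical input file):

from prevectorchunks_core.services.DocuToImageConverter import DocuToImageConverter

converter = DocuToImageConverter()
# .doc/.docx goes through the Pandoc -> WeasyPrint -> ReportLab fallback chain
pdf_path = converter._convert_doc_to_pdf("report.docx")
# then rasterize the intermediate PDF into PIL images, one per page
images = converter.convert_to_images(pdf_path)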
prevectorchunks_core-0.1.27/prevectorchunks_core/services/markdown_and_chunk_documents.py ADDED

@@ -0,0 +1,167 @@
+import os
+import json
+
+from docx import Document
+from dotenv import load_dotenv
+from openai import OpenAI
+from PIL import Image
+
+from .DocuToImageConverter import DocuToImageConverter
+from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
+from ..config.splitter_config import SplitterConfig
+from .chunk_documents_crud_vdb import chunk_documents
+from .chunk_to_all_content_mapper import ChunkMapper
+from ..utils.file_loader import SplitType
+
+load_dotenv(override=True)
+
+# -----------------------------
+# Abstract Strategy Interface
+# -----------------------------
+class BaseDocumentStrategy:
+    """Defines a standard interface for all document processing strategies."""
+
+    def process(self, file_path: str):
+        raise NotImplementedError("process() must be implemented by subclasses")
+
+
+# -----------------------------
+# PDF Strategy
+# -----------------------------
+class PDFStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"📄 Using PDFStrategy for {file_path}")
+        converter = DocuToImageConverter()
+        # Example: detect multi-column layout or extract embedded text first
+        # import fitz
+        # text_ratio = 0
+        # with fitz.open(file_path) as doc:
+        #     for page in doc:
+        #         text = page.get_text("text")
+        #         text_ratio += len(text) / (page.rect.width * page.rect.height)
+        # if text_ratio > 0.0001:
+        #     print("📚 PDF appears text-based – using hybrid extract + image backup")
+
+        images = converter.convert_to_images(file_path)
+        return images
+
+
+# -----------------------------
+# Word Strategy
+# -----------------------------
+class WordStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"📝 Using WordStrategy for {file_path}")
+
+        # Extract text semantically first
+        try:
+            doc = Document(file_path)
+            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+            text_content = "\n".join(paragraphs)
+            print(f"🧩 Extracted {len(paragraphs)} paragraphs via python-docx")
+        except Exception as e:
+            print("⚠️ Could not parse docx structurally, falling back to image mode:", e)
+            text_content = ""
+
+        converter = DocuToImageConverter()
+        pdf_path = converter._convert_doc_to_pdf(file_path)
+        images = converter.convert_to_images(pdf_path)
+
+        # Optional: attach text fallback
+        if text_content:
+            images[0].extracted_text = text_content  # for later use by extractor
+
+        return images
+
+
+# -----------------------------
+# Image Strategy
+# -----------------------------
+class ImageStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"🖼️ Using ImageStrategy for {file_path}")
+        image = Image.open(file_path).convert("RGB")
+        return [image]
+
+
+# -----------------------------
+# Strategy Factory
+# -----------------------------
+class StrategyFactory:
+    """Selects a document strategy based on file extension."""
+
+    strategies = {
+        ".pdf": PDFStrategy(),
+        ".doc": WordStrategy(),
+        ".docx": WordStrategy(),
+        ".jpg": ImageStrategy(),
+        ".jpeg": ImageStrategy(),
+        ".png": ImageStrategy(),
+        ".bmp": ImageStrategy(),
+        ".tiff": ImageStrategy(),
+    }
+
+    @classmethod
+    def get_strategy(cls, file_path: str) -> BaseDocumentStrategy:
+        ext = os.path.splitext(file_path)[1].lower()
+        return cls.strategies.get(ext, None)
+
+
+# -----------------------------
+# Main Orchestrator
+# -----------------------------
+class MarkdownAndChunkDocuments:
+    def __init__(self):
+        self.api_key = os.getenv("OPENAI_API_KEY")
+        self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
+
+    def markdown_and_chunk_documents(self, file_path: str):
+        # Pick strategy
+        strategy = StrategyFactory.get_strategy(file_path)
+        if not strategy:
+            raise ValueError(f"Unsupported file type: {file_path}")
+
+        # Convert to images using correct strategy
+        images = strategy.process(file_path)
+
+        # Extract Markdown from images
+        markdown_output, text_content = self.extractor.extract_markdown(images, include_image=False)
+        binary_text_content = text_content.encode("utf-8")
+
+        # Chunking and mapping
+        chunk_client = OpenAI(api_key=self.api_key)
+        cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
+        splitter_config = SplitterConfig(
+            chunk_size=300,
+            chunk_overlap=0,
+            separators=["\n"],
+            split_type=SplitType.R_PRETRAINED_PROPOSITION.value,
+            min_rl_chunk_size=5,
+            max_rl_chunk_size=50,
+            enableLLMTouchUp=False,
+        )
+
+        chunked_text = chunk_documents("", file_name="install_ins.txt", file_path=binary_text_content,
+                                       splitter_config=splitter_config)
+
+        flat_chunks = [''.join(inner) for inner in chunked_text]
+        mapped_chunks = cm.map_chunks(flat_chunks)
+
+        # Merge unmapped markdown sections
+        for md_item in markdown_output:
+            if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
+                md_item["chunked_text"] = md_item["markdown_text"]
+                mapped_chunks.append(md_item)
+
+        print("✅ Processing complete.")
+        return mapped_chunks
+
+
+# -----------------------------
+# CLI Entry
+# -----------------------------
+if __name__ == "__main__":
+    file_path = "421307-nz-au-top-loading-washer-guide-shorter.pdf"
+    pipeline = MarkdownAndChunkDocuments()
+    output = pipeline.markdown_and_chunk_documents(file_path)
+    print(json.dumps(output, indent=2))
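Taken together, this rewrite replaces the single convert-everything path of 0.1.26 (deleted further down) with per-extension strategies. A usage sketch mirroring the module's __main__ block (file names hypothetical):

from prevectorchunks_core.services.markdown_and_chunk_documents import (
    MarkdownAndChunkDocuments, StrategyFactory,
)

# the factory dispatches on extension: PDFStrategy, WordStrategy or ImageStrategy
for path in ("manual.pdf", "manual.docx", "scan.png"):
    strategy = StrategyFactory.get_strategy(path)
    print(path, "->", type(strategy).__name__)

pipeline = MarkdownAndChunkDocuments()  # reads OPENAI_API_KEY from the environment
mapped_chunks = pipeline.markdown_and_chunk_documents("manual.pdf")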
prevectorchunks_core-0.1.27/prevectorchunks_core/test_loader.py ADDED

@@ -0,0 +1,44 @@
+import json
+import pytest
+
+from core.prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
+from core.prevectorchunks_core.services import chunk_documents_crud_vdb
+from core.prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
+from core.prevectorchunks_core.utils.file_loader import SplitType
+
+
+# Create a temporary JSON file to test with
+@pytest.fixture
+def temp_json_file(tmp_path):
+    file_path = tmp_path / "test.json"
+    content = [{"id": 1, "text": "hello world"}]
+    with open(file_path, "w") as f:
+        json.dump(content, f)
+    return file_path
+
+
+def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
+    splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
+                                     split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
+                                     max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STANDARD)
+
+    chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",
+
+                                                      splitter_config=splitter_config)
+
+    print(chunks)
+    for i, c in enumerate(chunks):
+        print(f"Chunk {i + 1}: {c}")
+    print(chunks)
+
+def test_markdown(temp_json_file):
+    markdown_and_chunk_documents = MarkdownAndChunkDocuments()
+    mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
+        "content.docx")
+    print(mapped_chunks)
+    for i, c in enumerate(mapped_chunks):
+        print(f"Chunk {i + 1}: {c}")
+
+    for i, c in enumerate(mapped_chunks):
+        print(f"Chunk {i + 1}: {c}")
+    print(mapped_chunks)
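These tests print their chunks rather than asserting on them, so output capture has to be disabled to see anything; a minimal runner sketch (the test-file path is taken from this diff, and content.txt / content.docx are assumed to exist in the working directory):

import pytest

# -s disables pytest's output capture so the printed chunks are visible
pytest.main(["-s", "prevectorchunks_core/test_loader.py"])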
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/file_loader.py RENAMED

@@ -16,7 +16,7 @@ from .llm_wrapper import LLMClientWrapper  # Relative import
 from dotenv import load_dotenv
 import tempfile

-from ..config.splitter_config import SplitterConfig
+from ..config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
 from ..rlchunker.inference import RLChunker
 from ..services.propositional_index import PropositionalIndexer

@@ -256,15 +256,19 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
     chunks = split_text_by_config(text, splitter_config=splitter_config)
     all_results = []
     if splitter_config.enableLLMTouchUp:
-
-        structured
-
-
-
-
-
-
-
+        if splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STANDARD:
+            warnings.warn("bypassing LLM touch up for standard structured output")
+            return chunks
+        elif splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED:
+            for chunk in chunks:
+                structured = process_with_llm(chunk,instructions)
+                # Ensure UUIDs exist
+                for obj in structured:
+                    if "id" not in obj:
+                        obj["id"] = str(uuid.uuid4())
+                all_results.extend(structured)
+
+            return all_results
     else:
         return chunks

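The effect of the new branch on callers, summarized as a sketch (chunk shapes assumed from the code above, the dict contents are hypothetical):

# process_large_text(text, instructions, splitter_config) now has three outcomes:
#   enableLLMTouchUp=False                     -> raw chunks, unchanged
#   ...STANDARD                                -> raw chunks plus a warning
#   ...STRUCTURED_WITH_VECTOR_DB_ID_GENERATED  -> dicts, each with a guaranteed "id"
import uuid

structured = [{"title": "Intro", "chunked_text": "..."}]  # hypothetical LLM output
for obj in structured:
    obj.setdefault("id", str(uuid.uuid4()))  # the same backfill the new code performs
assert all("id" in obj for obj in structured)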
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/PKG-INFO RENAMED

@@ -1,14 +1,18 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.26
+Version: 0.1.27
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+License: MIT License
+Copyright (c) 2025 Your Name
+
 Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
 Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
+Requires-Python: <3.12,>=3.7
 Description-Content-Type: text/markdown
 License-File: LICENCE
+License-File: LICENSE
 Requires-Dist: packaging~=24.1
-Requires-Dist: requests~=2.32.3
 Requires-Dist: openai<3.0.0,>=2.6.0
 Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: PyJWT~=2.7.0
@@ -27,10 +31,7 @@ Requires-Dist: py-gutenberg~=1.0.3
 Requires-Dist: langchain-text-splitters~=0.3.11
 Requires-Dist: langchain~=0.3
 Requires-Dist: langchain_openai~=0.3.35
-Requires-Dist: transformers>=4.30.0
 Requires-Dist: accelerate>=0.22.0
-Requires-Dist: imageio-ffmpeg>=0.4.8
-Requires-Dist: opencv-python>=4.10.0
 Requires-Dist: pathlib~=1.0.1
 Requires-Dist: transformers~=4.57.0
 Requires-Dist: imageio-ffmpeg~=0.6.0
@@ -42,6 +43,12 @@ Requires-Dist: docx2pdf~=0.1.8
 Requires-Dist: numpy~=2.2.6
 Requires-Dist: scikit-learn~=1.7.2
 Requires-Dist: PyMuPDF~=1.22.5
+Requires-Dist: pypandoc~=1.13
+Requires-Dist: reportlab~=4.1.0
+Requires-Dist: weasyprint~=62.0
+Requires-Dist: lxml~=4.9.3
+Requires-Dist: cssselect2~=0.7.0
+Requires-Dist: cairocffi~=1.4.0
 Dynamic: license-file

 # 📚 PreVectorChunks
@@ -122,7 +129,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
                                       split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
                                       max_rl_chunk_size=50,enableLLMTouchUp=False)
 - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
-
+
+- **Returns**
 - A list of chunked strings including a unique id, a meaningful title and chunked text

 **Use Cases**
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/requires.txt RENAMED

@@ -1,5 +1,4 @@
 packaging~=24.1
-requests~=2.32.3
 openai<3.0.0,>=2.6.0
 python-dotenv~=1.0.1
 PyJWT~=2.7.0
@@ -18,10 +17,7 @@ py-gutenberg~=1.0.3
 langchain-text-splitters~=0.3.11
 langchain~=0.3
 langchain_openai~=0.3.35
-transformers>=4.30.0
 accelerate>=0.22.0
-imageio-ffmpeg>=0.4.8
-opencv-python>=4.10.0
 pathlib~=1.0.1
 transformers~=4.57.0
 imageio-ffmpeg~=0.6.0
@@ -33,3 +29,9 @@ docx2pdf~=0.1.8
 numpy~=2.2.6
 scikit-learn~=1.7.2
 PyMuPDF~=1.22.5
+pypandoc~=1.13
+reportlab~=4.1.0
+weasyprint~=62.0
+lxml~=4.9.3
+cssselect2~=0.7.0
+cairocffi~=1.4.0
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/pyproject.toml RENAMED

@@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "prevectorchunks-core"
-version = "0.1.26"
+version = "0.1.27"
 description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
 readme = "README.md"
 license = { file = "LICENSE" }
+requires-python = ">=3.7,<3.12"
 authors = [
     { name = "Zul Al-Kabir", email = "zul.developer.2023@gmail.com" }
 ]

 dependencies = [
     "packaging~=24.1",
-    "requests~=2.32.3",
     "openai>=2.6.0,<3.0.0",
     "python-dotenv~=1.0.1",
     "PyJWT~=2.7.0",
@@ -30,15 +30,10 @@ dependencies = [
     "torchaudio~=2.6.0",
     "sentence-transformers~=5.1.1",
     "py-gutenberg~=1.0.3",
-    "langchain-text-splitters~=0.3.11",
+    "langchain-text-splitters~=0.3.11",
     "langchain~=0.3",
     "langchain_openai~=0.3.35",
-    # … your existing dependencies …
-    "transformers>=4.30.0",
     "accelerate>=0.22.0",
-    "imageio-ffmpeg>=0.4.8",
-    "opencv-python>=4.10.0", # for frame extraction
-    # or whichever version of LLaVA you use
     "pathlib~=1.0.1",
     "transformers~=4.57.0",
     "imageio-ffmpeg~=0.6.0",
@@ -49,17 +44,24 @@ dependencies = [
     "docx2pdf~=0.1.8",
     "numpy~=2.2.6",
     "scikit-learn~=1.7.2",
-    "PyMuPDF~=1.22.5"
+    "PyMuPDF~=1.22.5",
+    "pypandoc~=1.13",
+    "reportlab~=4.1.0",
+    "weasyprint~=62.0",
+    "lxml~=4.9.3",
+    "cssselect2~=0.7.0",
+    "cairocffi~=1.4.0"
 ]

 [tool.setuptools.packages.find]
 include = ["prevectorchunks_core*"]
+
 [tool.setuptools.package-data]
 "prevectorchunks_core.rlchunker.pretrained" = ["*.pt", "*.txt"]

 [tool.setuptools]
 include-package-data = true
+
 [project.urls]
 Homepage = "https://github.com/zuldeveloper2023/PreVectorChunks"
 Source = "https://github.com/zuldeveloper2023/PreVectorChunks"
-
prevectorchunks_core-0.1.26/prevectorchunks_core/services/markdown_and_chunk_documents.py DELETED

@@ -1,71 +0,0 @@
-import json
-import os
-import tempfile
-import base64
-
-from openai import OpenAI
-from PIL import Image
-from .DocuToImageConverter import DocuToImageConverter
-
-from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
-from ..config.splitter_config import SplitterConfig
-
-from dotenv import load_dotenv
-
-from .chunk_documents_crud_vdb import chunk_documents
-from .chunk_to_all_content_mapper import ChunkMapper
-from ..utils.file_loader import SplitType
-
-load_dotenv(override=True)
-
-
-class MarkdownAndChunkDocuments:
-
-
-    def markdown_and_chunk_documents(self,file_path:str):
-        # Create instances of the converter and extractor
-        converter = DocuToImageConverter()
-        extractor = DocuToMarkdownExtractor(api_key=os.getenv("OPENAI_API_KEY"))
-
-
-        images = converter.convert_to_images(file_path)
-
-        # convert
-        # Step 2: Extract Markdown from images
-        markdown_output, text_content = extractor.extract_markdown(images, include_image=False)
-        # convert text content to binary
-        binary_text_content = text_content.encode('utf-8')  # bytes representation
-
-        chunk_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-        cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
-        splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
-                                         split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
-                                         max_rl_chunk_size=50, enableLLMTouchUp=False)
-
-        chunked_text = chunk_documents("", file_name="install_ins.txt", file_path=binary_text_content,
-                                       splitter_config=splitter_config)
-
-        flat_chunks = result = [''.join(inner) for inner in chunked_text]
-        mapped_chunks = cm.map_chunks(flat_chunks)
-        for md_item in markdown_output:
-            # Check if this markdown_output item is already present in mapped_chunks
-            match_found = False
-            for mapped in mapped_chunks:
-                if mapped.get("markdown_text") == md_item.get("markdown_text"):
-                    match_found = True
-                    break
-
-            # If not found, append the missing markdown_output item
-            if not match_found:
-                md_item["chunked_text"] = md_item["markdown_text"]
-                mapped_chunks.append(md_item)
-        #print(mapped_chunks)
-
-        #print("✅ Markdown extraction complete! See output.md")
-        return mapped_chunks
-
-
-if __name__ == "__main__":
-    markdown_and_chunk_documents = MarkdownAndChunkDocuments()
-    mapped_chunks=markdown_and_chunk_documents.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
-    print(mapped_chunks)
prevectorchunks_core-0.1.26/prevectorchunks_core/test_loader.py DELETED

@@ -1,26 +0,0 @@
-import json
-import pytest
-
-from core.prevectorchunks_core.config.splitter_config import SplitterConfig
-from core.prevectorchunks_core.services import chunk_documents_crud_vdb
-from core.prevectorchunks_core.utils.file_loader import SplitType
-
-
-# Create a temporary JSON file to test with
-@pytest.fixture
-def temp_json_file(tmp_path):
-    file_path = tmp_path / "test.json"
-    content = [{"id": 1, "text": "hello world"}]
-    with open(file_path, "w") as f:
-        json.dump(content, f)
-    return file_path
-
-
-def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
-    splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
-                                     split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
-                                     max_rl_chunk_size=50,enableLLMTouchUp=True)
-
-    chunks=chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",splitter_config=splitter_config)
-
-    print(chunks)
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/LICENCE RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/migrations/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/llava.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/env.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/inference.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/model.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/model_info.txt RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/reward.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/savepretrained.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/testpretrained.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/utils.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToMarkdownExtractor.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/audio_processor.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_documents_crud_vdb.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_to_all_content_mapper.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/image_processor.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/propositional_index.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/video_analyser.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/test_local.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/extract_content.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/llm_wrapper.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/dependency_links.txt RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/top_level.txt RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/setup.cfg RENAMED
File without changes