prevectorchunks-core 0.1.26__tar.gz → 0.1.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of prevectorchunks-core might be problematic.
- prevectorchunks_core-0.1.27/LICENSE +2 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/PKG-INFO +14 -6
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/README.md +2 -1
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/splitter_config.py +5 -6
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToImageConverter.py +58 -7
- prevectorchunks_core-0.1.27/prevectorchunks_core/services/markdown_and_chunk_documents.py +167 -0
- prevectorchunks_core-0.1.27/prevectorchunks_core/test_loader.py +44 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/file_loader.py +14 -10
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/PKG-INFO +14 -6
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/SOURCES.txt +1 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/requires.txt +6 -4
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/pyproject.toml +12 -10
- prevectorchunks_core-0.1.26/prevectorchunks_core/services/markdown_and_chunk_documents.py +0 -71
- prevectorchunks_core-0.1.26/prevectorchunks_core/test_loader.py +0 -26
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/LICENCE +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/llava.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/image_processor.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/setup.cfg +0 -0
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/PKG-INFO RENAMED

@@ -1,14 +1,18 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.26
+Version: 0.1.27
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+License: MIT License
+Copyright (c) 2025 Your Name
+
 Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
 Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
+Requires-Python: <3.12,>=3.7
 Description-Content-Type: text/markdown
 License-File: LICENCE
+License-File: LICENSE
 Requires-Dist: packaging~=24.1
-Requires-Dist: requests~=2.32.3
 Requires-Dist: openai<3.0.0,>=2.6.0
 Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: PyJWT~=2.7.0
@@ -27,10 +31,7 @@ Requires-Dist: py-gutenberg~=1.0.3
 Requires-Dist: langchain-text-splitters~=0.3.11
 Requires-Dist: langchain~=0.3
 Requires-Dist: langchain_openai~=0.3.35
-Requires-Dist: transformers>=4.30.0
 Requires-Dist: accelerate>=0.22.0
-Requires-Dist: imageio-ffmpeg>=0.4.8
-Requires-Dist: opencv-python>=4.10.0
 Requires-Dist: pathlib~=1.0.1
 Requires-Dist: transformers~=4.57.0
 Requires-Dist: imageio-ffmpeg~=0.6.0
@@ -42,6 +43,12 @@ Requires-Dist: docx2pdf~=0.1.8
 Requires-Dist: numpy~=2.2.6
 Requires-Dist: scikit-learn~=1.7.2
 Requires-Dist: PyMuPDF~=1.22.5
+Requires-Dist: pypandoc~=1.13
+Requires-Dist: reportlab~=4.1.0
+Requires-Dist: weasyprint~=62.0
+Requires-Dist: lxml~=4.9.3
+Requires-Dist: cssselect2~=0.7.0
+Requires-Dist: cairocffi~=1.4.0
 Dynamic: license-file

 # 📚 PreVectorChunks
@@ -122,7 +129,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
                                       split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
                                       max_rl_chunk_size=50,enableLLMTouchUp=False)
 - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
-
+
+- **Returns**
 - A list of chunked strings including a unique id, a meaningful title and chunked text

 **Use Cases**
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/README.md RENAMED

@@ -76,7 +76,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
                                       split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
                                       max_rl_chunk_size=50,enableLLMTouchUp=False)
 - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
-
+
+- **Returns**
 - A list of chunked strings including a unique id, a meaningful title and chunked text

 **Use Cases**
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/splitter_config.py RENAMED

@@ -3,12 +3,10 @@ from dataclasses import dataclass, field
 from enum import Enum


-class
-
-
-
-    R_PRETRAINED_PROPOSITION = "RLBasedTextSplitterWithProposition"
-    R_PRETRAINED = "RLBasedTextSplitter"
+class LLM_Structured_Output_Type(Enum):
+    STANDARD = "STANDARD"
+    STRUCTURED_WITH_VECTOR_DB_ID_GENERATED = "STRUCTURED_WITH_VECTOR_DB_ID_GENERATED"
+

 @dataclass()
 class SplitterConfig:
@@ -17,6 +15,7 @@ class SplitterConfig:
     separators: list[str] = field(default_factory=lambda: ["\n"])
     split_type: str = "recursive_splitter"
     enableLLMTouchUp: bool = True
+    llm_structured_output_type: LLM_Structured_Output_Type = LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED
     min_rl_chunk_size: int = 5
     max_rl_chunk_size: int = 50

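A minimal usage sketch of the new llm_structured_output_type field, assuming the package is importable as prevectorchunks_core (values taken from the diff above and the test file below):

from prevectorchunks_core.config.splitter_config import (
    SplitterConfig, LLM_Structured_Output_Type,
)

# STANDARD bypasses the LLM touch-up and returns plain chunks;
# STRUCTURED_WITH_VECTOR_DB_ID_GENERATED (the default) returns
# LLM-structured chunks, each guaranteed to carry a UUID "id".
config = SplitterConfig(
    chunk_size=300,
    chunk_overlap=0,
    separators=["\n"],
    enableLLMTouchUp=True,
    llm_structured_output_type=LLM_Structured_Output_Type.STANDARD,
)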
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToImageConverter.py RENAMED

@@ -1,11 +1,20 @@
 import os
+import shutil
+import subprocess
+import sys
 import tempfile
+
+import pypandoc
 from PIL import Image
 import io
 from docx2pdf import convert as docx_to_pdf
 import fitz
-
-
+from docx2pdf import convert as docx2pdf_convert
+try:
+    pypandoc.get_pandoc_path()
+except OSError:
+    print("Pandoc not found — downloading it temporarily...")
+    pypandoc.download_pandoc()

 class DocuToImageConverter:
     """Converts a document (PDF, DOCX, DOC) into a list of PIL images."""
@@ -13,11 +22,53 @@ class DocuToImageConverter:
     def __init__(self):
         pass

-    def _convert_doc_to_pdf(self,
-
-
-
-
+    def _convert_doc_to_pdf(self, input_path: str) -> str:
+        import shutil, tempfile, os, pypandoc
+        from docx import Document
+
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(input_path)
+
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
+
+        # 1️⃣ Try Pandoc + wkhtmltopdf or pdflatex
+        try:
+            pypandoc.get_pandoc_path()
+
+            def which(cmd):
+                return shutil.which(cmd) is not None
+
+            pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
+            pypandoc.convert_file(
+                input_path, "pdf", outputfile=output_pdf,
+                extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
+            )
+            return output_pdf
+        except Exception as e:
+            print("⚠️ Pandoc PDF conversion failed:", e)
+
+        # 2️⃣ Fallback to pure Python (WeasyPrint)
+        try:
+            from weasyprint import HTML
+            doc = Document(input_path)
+            html = "<html><body>" + "".join(f"<p>{p.text}</p>" for p in doc.paragraphs) + "</body></html>"
+            HTML(string=html).write_pdf(output_pdf)
+            return output_pdf
+        except Exception as e:
+            print("⚠️ Fallback to WeasyPrint failed:", e)

+        # 3️⃣ Last resort (plain text with ReportLab)
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.pagesizes import A4
+        doc = Document(input_path)
+        c = canvas.Canvas(output_pdf, pagesize=A4)
+        width, height = A4
+        y = height - 50
+        for p in doc.paragraphs:
+            c.drawString(50, y, p.text[:1000])
+            y -= 15
+        c.save()
         return output_pdf

     def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
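The conversion now degrades gracefully: Pandoc (with pdflatex or wkhtmltopdf) first, then WeasyPrint over HTML built from the docx paragraphs, then a plain-text ReportLab render. A short sketch of the call path, mirroring how WordStrategy below uses it ("report.docx" is a hypothetical input file):

from prevectorchunks_core.services.DocuToImageConverter import DocuToImageConverter

converter = DocuToImageConverter()
# .doc/.docx goes through the Pandoc -> WeasyPrint -> ReportLab fallback chain
pdf_path = converter._convert_doc_to_pdf("report.docx")
# then rasterize the intermediate PDF into PIL images, one per page
images = converter.convert_to_images(pdf_path)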
prevectorchunks_core-0.1.27/prevectorchunks_core/services/markdown_and_chunk_documents.py ADDED

@@ -0,0 +1,167 @@
+import os
+import json
+
+from docx import Document
+from dotenv import load_dotenv
+from openai import OpenAI
+from PIL import Image
+
+from .DocuToImageConverter import DocuToImageConverter
+from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
+from ..config.splitter_config import SplitterConfig
+from .chunk_documents_crud_vdb import chunk_documents
+from .chunk_to_all_content_mapper import ChunkMapper
+from ..utils.file_loader import SplitType
+
+load_dotenv(override=True)
+
+# -----------------------------
+# Abstract Strategy Interface
+# -----------------------------
+class BaseDocumentStrategy:
+    """Defines a standard interface for all document processing strategies."""
+
+    def process(self, file_path: str):
+        raise NotImplementedError("process() must be implemented by subclasses")
+
+
+# -----------------------------
+# PDF Strategy
+# -----------------------------
+class PDFStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"📄 Using PDFStrategy for {file_path}")
+        converter = DocuToImageConverter()
+        # Example: detect multi-column layout or extract embedded text first
+        # import fitz
+        # text_ratio = 0
+        # with fitz.open(file_path) as doc:
+        #     for page in doc:
+        #         text = page.get_text("text")
+        #         text_ratio += len(text) / (page.rect.width * page.rect.height)
+        # if text_ratio > 0.0001:
+        #     print("📚 PDF appears text-based – using hybrid extract + image backup")
+
+        images = converter.convert_to_images(file_path)
+        return images
+
+
+# -----------------------------
+# Word Strategy
+# -----------------------------
+class WordStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"📝 Using WordStrategy for {file_path}")
+
+        # Extract text semantically first
+        try:
+            doc = Document(file_path)
+            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+            text_content = "\n".join(paragraphs)
+            print(f"🧩 Extracted {len(paragraphs)} paragraphs via python-docx")
+        except Exception as e:
+            print("⚠️ Could not parse docx structurally, falling back to image mode:", e)
+            text_content = ""
+
+        converter = DocuToImageConverter()
+        pdf_path = converter._convert_doc_to_pdf(file_path)
+        images = converter.convert_to_images(pdf_path)
+
+        # Optional: attach text fallback
+        if text_content:
+            images[0].extracted_text = text_content  # for later use by extractor
+
+        return images
+
+
+# -----------------------------
+# Image Strategy
+# -----------------------------
+class ImageStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"🖼️ Using ImageStrategy for {file_path}")
+        image = Image.open(file_path).convert("RGB")
+        return [image]
+
+
+# -----------------------------
+# Strategy Factory
+# -----------------------------
+class StrategyFactory:
+    """Selects a document strategy based on file extension."""
+
+    strategies = {
+        ".pdf": PDFStrategy(),
+        ".doc": WordStrategy(),
+        ".docx": WordStrategy(),
+        ".jpg": ImageStrategy(),
+        ".jpeg": ImageStrategy(),
+        ".png": ImageStrategy(),
+        ".bmp": ImageStrategy(),
+        ".tiff": ImageStrategy(),
+    }
+
+    @classmethod
+    def get_strategy(cls, file_path: str) -> BaseDocumentStrategy:
+        ext = os.path.splitext(file_path)[1].lower()
+        return cls.strategies.get(ext, None)
+
+
+# -----------------------------
+# Main Orchestrator
+# -----------------------------
+class MarkdownAndChunkDocuments:
+    def __init__(self):
+        self.api_key = os.getenv("OPENAI_API_KEY")
+        self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
+
+    def markdown_and_chunk_documents(self, file_path: str):
+        # Pick strategy
+        strategy = StrategyFactory.get_strategy(file_path)
+        if not strategy:
+            raise ValueError(f"Unsupported file type: {file_path}")
+
+        # Convert to images using correct strategy
+        images = strategy.process(file_path)
+
+        # Extract Markdown from images
+        markdown_output, text_content = self.extractor.extract_markdown(images, include_image=False)
+        binary_text_content = text_content.encode("utf-8")
+
+        # Chunking and mapping
+        chunk_client = OpenAI(api_key=self.api_key)
+        cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
+        splitter_config = SplitterConfig(
+            chunk_size=300,
+            chunk_overlap=0,
+            separators=["\n"],
+            split_type=SplitType.R_PRETRAINED_PROPOSITION.value,
+            min_rl_chunk_size=5,
+            max_rl_chunk_size=50,
+            enableLLMTouchUp=False,
+        )
+
+        chunked_text = chunk_documents("", file_name="install_ins.txt", file_path=binary_text_content,
+                                       splitter_config=splitter_config)
+
+        flat_chunks = [''.join(inner) for inner in chunked_text]
+        mapped_chunks = cm.map_chunks(flat_chunks)
+
+        # Merge unmapped markdown sections
+        for md_item in markdown_output:
+            if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
+                md_item["chunked_text"] = md_item["markdown_text"]
+                mapped_chunks.append(md_item)
+
+        print("✅ Processing complete.")
+        return mapped_chunks
+
+
+# -----------------------------
+# CLI Entry
+# -----------------------------
+if __name__ == "__main__":
+    file_path = "421307-nz-au-top-loading-washer-guide-shorter.pdf"
+    pipeline = MarkdownAndChunkDocuments()
+    output = pipeline.markdown_and_chunk_documents(file_path)
+    print(json.dumps(output, indent=2))
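Taken together, this rewrite replaces the single convert-everything path of 0.1.26 (deleted further down) with per-extension strategies. A usage sketch mirroring the module's __main__ block (file names hypothetical):

from prevectorchunks_core.services.markdown_and_chunk_documents import (
    MarkdownAndChunkDocuments, StrategyFactory,
)

# the factory dispatches on extension: PDFStrategy, WordStrategy or ImageStrategy
for path in ("manual.pdf", "manual.docx", "scan.png"):
    strategy = StrategyFactory.get_strategy(path)
    print(path, "->", type(strategy).__name__)

pipeline = MarkdownAndChunkDocuments()  # reads OPENAI_API_KEY from the environment
mapped_chunks = pipeline.markdown_and_chunk_documents("manual.pdf")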
prevectorchunks_core-0.1.27/prevectorchunks_core/test_loader.py ADDED

@@ -0,0 +1,44 @@
+import json
+import pytest
+
+from core.prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
+from core.prevectorchunks_core.services import chunk_documents_crud_vdb
+from core.prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
+from core.prevectorchunks_core.utils.file_loader import SplitType
+
+
+# Create a temporary JSON file to test with
+@pytest.fixture
+def temp_json_file(tmp_path):
+    file_path = tmp_path / "test.json"
+    content = [{"id": 1, "text": "hello world"}]
+    with open(file_path, "w") as f:
+        json.dump(content, f)
+    return file_path
+
+
+def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
+    splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
+                                     split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
+                                     max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STANDARD)
+
+    chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",
+
+                                                      splitter_config=splitter_config)
+
+    print(chunks)
+    for i, c in enumerate(chunks):
+        print(f"Chunk {i + 1}: {c}")
+    print(chunks)
+
+def test_markdown(temp_json_file):
+    markdown_and_chunk_documents = MarkdownAndChunkDocuments()
+    mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
+        "content.docx")
+    print(mapped_chunks)
+    for i, c in enumerate(mapped_chunks):
+        print(f"Chunk {i + 1}: {c}")
+
+    for i, c in enumerate(mapped_chunks):
+        print(f"Chunk {i + 1}: {c}")
+    print(mapped_chunks)
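These tests print their chunks rather than asserting on them, so output capture has to be disabled to see anything; a minimal runner sketch (the test-file path is taken from this diff, and content.txt / content.docx are assumed to exist in the working directory):

import pytest

# -s disables pytest's output capture so the printed chunks are visible
pytest.main(["-s", "prevectorchunks_core/test_loader.py"])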
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/file_loader.py RENAMED

@@ -16,7 +16,7 @@ from .llm_wrapper import LLMClientWrapper  # Relative import
 from dotenv import load_dotenv
 import tempfile

-from ..config.splitter_config import SplitterConfig
+from ..config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
 from ..rlchunker.inference import RLChunker
 from ..services.propositional_index import PropositionalIndexer

@@ -256,15 +256,19 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
     chunks = split_text_by_config(text, splitter_config=splitter_config)
     all_results = []
     if splitter_config.enableLLMTouchUp:
-
-        structured
-
-
-
-
-
-
-
+        if splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STANDARD:
+            warnings.warn("bypassing LLM touch up for standard structured output")
+            return chunks
+        elif splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED:
+            for chunk in chunks:
+                structured = process_with_llm(chunk,instructions)
+                # Ensure UUIDs exist
+                for obj in structured:
+                    if "id" not in obj:
+                        obj["id"] = str(uuid.uuid4())
+                all_results.extend(structured)
+
+            return all_results
     else:
         return chunks

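The effect of the new branch on callers, summarized as a sketch (chunk shapes assumed from the code above, the dict contents are hypothetical):

# process_large_text(text, instructions, splitter_config) now has three outcomes:
#   enableLLMTouchUp=False                     -> raw chunks, unchanged
#   ...STANDARD                                -> raw chunks plus a warning
#   ...STRUCTURED_WITH_VECTOR_DB_ID_GENERATED  -> dicts, each with a guaranteed "id"
import uuid

structured = [{"title": "Intro", "chunked_text": "..."}]  # hypothetical LLM output
for obj in structured:
    obj.setdefault("id", str(uuid.uuid4()))  # the same backfill the new code performs
assert all("id" in obj for obj in structured)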
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/PKG-INFO RENAMED

@@ -1,14 +1,18 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.26
+Version: 0.1.27
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+License: MIT License
+Copyright (c) 2025 Your Name
+
 Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
 Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
+Requires-Python: <3.12,>=3.7
 Description-Content-Type: text/markdown
 License-File: LICENCE
+License-File: LICENSE
 Requires-Dist: packaging~=24.1
-Requires-Dist: requests~=2.32.3
 Requires-Dist: openai<3.0.0,>=2.6.0
 Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: PyJWT~=2.7.0
@@ -27,10 +31,7 @@ Requires-Dist: py-gutenberg~=1.0.3
 Requires-Dist: langchain-text-splitters~=0.3.11
 Requires-Dist: langchain~=0.3
 Requires-Dist: langchain_openai~=0.3.35
-Requires-Dist: transformers>=4.30.0
 Requires-Dist: accelerate>=0.22.0
-Requires-Dist: imageio-ffmpeg>=0.4.8
-Requires-Dist: opencv-python>=4.10.0
 Requires-Dist: pathlib~=1.0.1
 Requires-Dist: transformers~=4.57.0
 Requires-Dist: imageio-ffmpeg~=0.6.0
@@ -42,6 +43,12 @@ Requires-Dist: docx2pdf~=0.1.8
 Requires-Dist: numpy~=2.2.6
 Requires-Dist: scikit-learn~=1.7.2
 Requires-Dist: PyMuPDF~=1.22.5
+Requires-Dist: pypandoc~=1.13
+Requires-Dist: reportlab~=4.1.0
+Requires-Dist: weasyprint~=62.0
+Requires-Dist: lxml~=4.9.3
+Requires-Dist: cssselect2~=0.7.0
+Requires-Dist: cairocffi~=1.4.0
 Dynamic: license-file

 # 📚 PreVectorChunks
@@ -122,7 +129,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
                                       split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
                                       max_rl_chunk_size=50,enableLLMTouchUp=False)
 - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
-
+
+- **Returns**
 - A list of chunked strings including a unique id, a meaningful title and chunked text

 **Use Cases**
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/requires.txt RENAMED

@@ -1,5 +1,4 @@
 packaging~=24.1
-requests~=2.32.3
 openai<3.0.0,>=2.6.0
 python-dotenv~=1.0.1
 PyJWT~=2.7.0
@@ -18,10 +17,7 @@ py-gutenberg~=1.0.3
 langchain-text-splitters~=0.3.11
 langchain~=0.3
 langchain_openai~=0.3.35
-transformers>=4.30.0
 accelerate>=0.22.0
-imageio-ffmpeg>=0.4.8
-opencv-python>=4.10.0
 pathlib~=1.0.1
 transformers~=4.57.0
 imageio-ffmpeg~=0.6.0
@@ -33,3 +29,9 @@ docx2pdf~=0.1.8
 numpy~=2.2.6
 scikit-learn~=1.7.2
 PyMuPDF~=1.22.5
+pypandoc~=1.13
+reportlab~=4.1.0
+weasyprint~=62.0
+lxml~=4.9.3
+cssselect2~=0.7.0
+cairocffi~=1.4.0
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/pyproject.toml RENAMED

@@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "prevectorchunks-core"
-version = "0.1.26"
+version = "0.1.27"
 description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
 readme = "README.md"
 license = { file = "LICENSE" }
+requires-python = ">=3.7,<3.12"
 authors = [
     { name = "Zul Al-Kabir", email = "zul.developer.2023@gmail.com" }
 ]

 dependencies = [
     "packaging~=24.1",
-    "requests~=2.32.3",
     "openai>=2.6.0,<3.0.0",
     "python-dotenv~=1.0.1",
     "PyJWT~=2.7.0",
@@ -30,15 +30,10 @@ dependencies = [
     "torchaudio~=2.6.0",
     "sentence-transformers~=5.1.1",
     "py-gutenberg~=1.0.3",
-    "langchain-text-splitters~=0.3.11",
+    "langchain-text-splitters~=0.3.11",
     "langchain~=0.3",
     "langchain_openai~=0.3.35",
-    # … your existing dependencies …
-    "transformers>=4.30.0",
     "accelerate>=0.22.0",
-    "imageio-ffmpeg>=0.4.8",
-    "opencv-python>=4.10.0", # for frame extraction
-    # or whichever version of LLaVA you use
     "pathlib~=1.0.1",
     "transformers~=4.57.0",
     "imageio-ffmpeg~=0.6.0",
@@ -49,17 +44,24 @@ dependencies = [
     "docx2pdf~=0.1.8",
     "numpy~=2.2.6",
     "scikit-learn~=1.7.2",
-    "PyMuPDF~=1.22.5"
+    "PyMuPDF~=1.22.5",
+    "pypandoc~=1.13",
+    "reportlab~=4.1.0",
+    "weasyprint~=62.0",
+    "lxml~=4.9.3",
+    "cssselect2~=0.7.0",
+    "cairocffi~=1.4.0"
 ]

 [tool.setuptools.packages.find]
 include = ["prevectorchunks_core*"]
+
 [tool.setuptools.package-data]
 "prevectorchunks_core.rlchunker.pretrained" = ["*.pt", "*.txt"]

 [tool.setuptools]
 include-package-data = true
+
 [project.urls]
 Homepage = "https://github.com/zuldeveloper2023/PreVectorChunks"
 Source = "https://github.com/zuldeveloper2023/PreVectorChunks"
-
prevectorchunks_core-0.1.26/prevectorchunks_core/services/markdown_and_chunk_documents.py DELETED

@@ -1,71 +0,0 @@
-import json
-import os
-import tempfile
-import base64
-
-from openai import OpenAI
-from PIL import Image
-from .DocuToImageConverter import DocuToImageConverter
-
-from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
-from ..config.splitter_config import SplitterConfig
-
-from dotenv import load_dotenv
-
-from .chunk_documents_crud_vdb import chunk_documents
-from .chunk_to_all_content_mapper import ChunkMapper
-from ..utils.file_loader import SplitType
-
-load_dotenv(override=True)
-
-
-class MarkdownAndChunkDocuments:
-
-
-    def markdown_and_chunk_documents(self,file_path:str):
-        # Create instances of the converter and extractor
-        converter = DocuToImageConverter()
-        extractor = DocuToMarkdownExtractor(api_key=os.getenv("OPENAI_API_KEY"))
-
-
-        images = converter.convert_to_images(file_path)
-
-        # convert
-        # Step 2: Extract Markdown from images
-        markdown_output, text_content = extractor.extract_markdown(images, include_image=False)
-        # convert text content to binary
-        binary_text_content = text_content.encode('utf-8')  # bytes representation
-
-        chunk_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-        cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
-        splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
-                                         split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
-                                         max_rl_chunk_size=50, enableLLMTouchUp=False)
-
-        chunked_text = chunk_documents("", file_name="install_ins.txt", file_path=binary_text_content,
-                                       splitter_config=splitter_config)
-
-        flat_chunks = result = [''.join(inner) for inner in chunked_text]
-        mapped_chunks = cm.map_chunks(flat_chunks)
-        for md_item in markdown_output:
-            # Check if this markdown_output item is already present in mapped_chunks
-            match_found = False
-            for mapped in mapped_chunks:
-                if mapped.get("markdown_text") == md_item.get("markdown_text"):
-                    match_found = True
-                    break
-
-            # If not found, append the missing markdown_output item
-            if not match_found:
-                md_item["chunked_text"] = md_item["markdown_text"]
-                mapped_chunks.append(md_item)
-        #print(mapped_chunks)
-
-        #print("✅ Markdown extraction complete! See output.md")
-        return mapped_chunks
-
-
-if __name__ == "__main__":
-    markdown_and_chunk_documents = MarkdownAndChunkDocuments()
-    mapped_chunks=markdown_and_chunk_documents.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
-    print(mapped_chunks)
prevectorchunks_core-0.1.26/prevectorchunks_core/test_loader.py DELETED

@@ -1,26 +0,0 @@
-import json
-import pytest
-
-from core.prevectorchunks_core.config.splitter_config import SplitterConfig
-from core.prevectorchunks_core.services import chunk_documents_crud_vdb
-from core.prevectorchunks_core.utils.file_loader import SplitType
-
-
-# Create a temporary JSON file to test with
-@pytest.fixture
-def temp_json_file(tmp_path):
-    file_path = tmp_path / "test.json"
-    content = [{"id": 1, "text": "hello world"}]
-    with open(file_path, "w") as f:
-        json.dump(content, f)
-    return file_path
-
-
-def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
-    splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
-                                     split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
-                                     max_rl_chunk_size=50,enableLLMTouchUp=True)
-
-    chunks=chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",splitter_config=splitter_config)
-
-    print(chunks)
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/LICENCE RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/migrations/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/llava.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/env.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/inference.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/model.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/model_info.txt RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/reward.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/savepretrained.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/testpretrained.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/utils.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToMarkdownExtractor.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/audio_processor.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_documents_crud_vdb.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_to_all_content_mapper.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/image_processor.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/propositional_index.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/video_analyser.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/test_local.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/__init__.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/extract_content.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/llm_wrapper.py RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/dependency_links.txt RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/top_level.txt RENAMED
File without changes
{prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/setup.cfg RENAMED
File without changes