prevectorchunks-core 0.1.27__tar.gz → 0.1.29__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {prevectorchunks_core-0.1.27/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.29}/PKG-INFO +2 -2
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/DocuToImageConverter.py +40 -17
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +9 -6
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/markdown_and_chunk_documents.py +11 -17
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/test_loader.py +1 -1
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29/prevectorchunks_core.egg-info}/PKG-INFO +2 -2
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/requires.txt +1 -1
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/pyproject.toml +2 -2
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/LICENCE +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/LICENSE +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/README.md +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/config/splitter_config.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/os-llm/llava.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/image_processor.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/file_loader.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/SOURCES.txt +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/setup.cfg +0 -0
{prevectorchunks_core-0.1.27/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.29}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.27
|
|
3
|
+
Version: 0.1.29
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -40,7 +40,7 @@ Requires-Dist: requests~=2.32.5
|
|
|
40
40
|
Requires-Dist: langchain-core~=0.3.78
|
|
41
41
|
Requires-Dist: pdf2image~=1.17.0
|
|
42
42
|
Requires-Dist: docx2pdf~=0.1.8
|
|
43
|
-
Requires-Dist: numpy~=2.
|
|
43
|
+
Requires-Dist: numpy~=2.0.0
|
|
44
44
|
Requires-Dist: scikit-learn~=1.7.2
|
|
45
45
|
Requires-Dist: PyMuPDF~=1.22.5
|
|
46
46
|
Requires-Dist: pypandoc~=1.13
|
|
@@ -3,6 +3,7 @@ import shutil
|
|
|
3
3
|
import subprocess
|
|
4
4
|
import sys
|
|
5
5
|
import tempfile
|
|
6
|
+
from pathlib import Path
|
|
6
7
|
|
|
7
8
|
import pypandoc
|
|
8
9
|
from PIL import Image
|
|
@@ -23,19 +24,44 @@ class DocuToImageConverter:
|
|
|
23
24
|
pass
|
|
24
25
|
|
|
25
26
|
def _convert_doc_to_pdf(self, input_path: str) -> str:
|
|
26
|
-
import
|
|
27
|
-
from
|
|
27
|
+
import os, tempfile, shutil, subprocess
|
|
28
|
+
from pathlib import Path
|
|
28
29
|
|
|
29
30
|
if not os.path.exists(input_path):
|
|
30
31
|
raise FileNotFoundError(input_path)
|
|
31
32
|
|
|
32
33
|
output_dir = tempfile.mkdtemp()
|
|
33
|
-
output_pdf = os.path.join(output_dir,
|
|
34
|
+
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
35
|
+
|
|
36
|
+
# 1️⃣ Try Microsoft Word COM automation (Windows only)
|
|
37
|
+
try:
|
|
38
|
+
import win32com.client
|
|
39
|
+
word = win32com.client.Dispatch("Word.Application")
|
|
40
|
+
word.Visible = False
|
|
41
|
+
doc = word.Documents.Open(str(Path(input_path).resolve()))
|
|
42
|
+
doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17) # 17 = wdFormatPDF
|
|
43
|
+
doc.Close()
|
|
44
|
+
word.Quit()
|
|
45
|
+
print("✅ Word COM conversion successful:", output_pdf)
|
|
46
|
+
return output_pdf
|
|
47
|
+
except Exception as e:
|
|
48
|
+
print("⚠️ Word COM conversion failed:", e)
|
|
34
49
|
|
|
35
|
-
#
|
|
50
|
+
# 2️⃣ Fallback: LibreOffice (cross-platform, preserves layout)
|
|
36
51
|
try:
|
|
37
|
-
|
|
52
|
+
# Requires LibreOffice installed and in PATH
|
|
53
|
+
subprocess.run(
|
|
54
|
+
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
55
|
+
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
56
|
+
)
|
|
57
|
+
print("✅ LibreOffice conversion successful:", output_pdf)
|
|
58
|
+
return output_pdf
|
|
59
|
+
except Exception as e:
|
|
60
|
+
print("⚠️ LibreOffice conversion failed:", e)
|
|
38
61
|
|
|
62
|
+
# 3️⃣ Fallback: Pandoc (simpler, loses layout)
|
|
63
|
+
try:
|
|
64
|
+
import pypandoc
|
|
39
65
|
def which(cmd):
|
|
40
66
|
return shutil.which(cmd) is not None
|
|
41
67
|
|
|
@@ -44,23 +70,16 @@ class DocuToImageConverter:
|
|
|
44
70
|
input_path, "pdf", outputfile=output_pdf,
|
|
45
71
|
extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
|
|
46
72
|
)
|
|
73
|
+
print("✅ Pandoc conversion successful:", output_pdf)
|
|
47
74
|
return output_pdf
|
|
48
75
|
except Exception as e:
|
|
49
|
-
print("⚠️ Pandoc
|
|
76
|
+
print("⚠️ Pandoc conversion failed:", e)
|
|
50
77
|
|
|
51
|
-
#
|
|
52
|
-
try:
|
|
53
|
-
from weasyprint import HTML
|
|
54
|
-
doc = Document(input_path)
|
|
55
|
-
html = "<html><body>" + "".join(f"<p>{p.text}</p>" for p in doc.paragraphs) + "</body></html>"
|
|
56
|
-
HTML(string=html).write_pdf(output_pdf)
|
|
57
|
-
return output_pdf
|
|
58
|
-
except Exception as e:
|
|
59
|
-
print("⚠️ Fallback to WeasyPrint failed:", e)
|
|
60
|
-
|
|
61
|
-
# 3️⃣ Last resort (plain text with ReportLab)
|
|
78
|
+
# 4️⃣ Last resort: ReportLab basic text (no formatting)
|
|
62
79
|
from reportlab.pdfgen import canvas
|
|
63
80
|
from reportlab.lib.pagesizes import A4
|
|
81
|
+
from docx import Document
|
|
82
|
+
|
|
64
83
|
doc = Document(input_path)
|
|
65
84
|
c = canvas.Canvas(output_pdf, pagesize=A4)
|
|
66
85
|
width, height = A4
|
|
@@ -68,7 +87,11 @@ class DocuToImageConverter:
|
|
|
68
87
|
for p in doc.paragraphs:
|
|
69
88
|
c.drawString(50, y, p.text[:1000])
|
|
70
89
|
y -= 15
|
|
90
|
+
if y < 50:
|
|
91
|
+
c.showPage()
|
|
92
|
+
y = height - 50
|
|
71
93
|
c.save()
|
|
94
|
+
print("⚠️ Fallback to plain ReportLab text output:", output_pdf)
|
|
72
95
|
return output_pdf
|
|
73
96
|
|
|
74
97
|
def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
|
|
@@ -42,12 +42,12 @@ class DocuToMarkdownExtractor:
|
|
|
42
42
|
fins = [{"type": "text", "text": "You are a document parser. Extract all text and tables "
|
|
43
43
|
"from this image and format the output in clean Markdown. "
|
|
44
44
|
"Preserve table structure, headings, and lists. If there is no markdown, put a space. "
|
|
45
|
-
"Put your result in a JSON object with the following keys
|
|
46
|
-
"- markdown_text: the markdown text
|
|
47
|
-
"- short_title: the short title of the document
|
|
48
|
-
"- page_number: the page number of the document (
|
|
49
|
-
"- summary: a summary of the document
|
|
50
|
-
" - image_data: the image data in base64 format
|
|
45
|
+
"Put your result in a JSON object with the following keys:"
|
|
46
|
+
"- markdown_text: the markdown text"
|
|
47
|
+
"- short_title: the short title of the document"
|
|
48
|
+
"- page_number: the page number of the document (i+1)"
|
|
49
|
+
"- summary: a summary of the document,"
|
|
50
|
+
" - image_data: the image data in base64 format,"
|
|
51
51
|
"Return only raw JSON, without markdown formatting or triple backticks."
|
|
52
52
|
"- image_index: the index of the image in the document"},
|
|
53
53
|
{"type": "text", "text": "You are an image inspector. Tell us what is in the image "
|
|
@@ -63,6 +63,9 @@ class DocuToMarkdownExtractor:
|
|
|
63
63
|
text_content=text_content+"\n"+response["markdown_text"]
|
|
64
64
|
if(include_image):
|
|
65
65
|
response["image_data"]=b64_image
|
|
66
|
+
response["image_index"]=i
|
|
67
|
+
response["page_number"] = i
|
|
68
|
+
|
|
66
69
|
all_outputs.append(response)
|
|
67
70
|
|
|
68
71
|
json_array = json.dumps(all_outputs, indent=2)
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import json
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
3
5
|
|
|
4
6
|
from docx import Document
|
|
5
7
|
from dotenv import load_dotenv
|
|
@@ -51,25 +53,17 @@ class PDFStrategy(BaseDocumentStrategy):
|
|
|
51
53
|
# -----------------------------
|
|
52
54
|
class WordStrategy(BaseDocumentStrategy):
|
|
53
55
|
def process(self, file_path: str):
|
|
56
|
+
file_path = Path(file_path)
|
|
57
|
+
|
|
54
58
|
print(f"📝 Using WordStrategy for {file_path}")
|
|
55
59
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
doc = Document(file_path)
|
|
59
|
-
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
|
60
|
-
text_content = "\n".join(paragraphs)
|
|
61
|
-
print(f"🧩 Extracted {len(paragraphs)} paragraphs via python-docx")
|
|
62
|
-
except Exception as e:
|
|
63
|
-
print("⚠️ Could not parse docx structurally, falling back to image mode:", e)
|
|
64
|
-
text_content = ""
|
|
60
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
61
|
+
pdf_path = Path(tmpdir) / f"{file_path.stem}.pdf"
|
|
65
62
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
63
|
+
converter = DocuToImageConverter()
|
|
64
|
+
pdf_path = converter._convert_doc_to_pdf(file_path)
|
|
65
|
+
images = converter.convert_to_images(pdf_path)
|
|
69
66
|
|
|
70
|
-
# Optional: attach text fallback
|
|
71
|
-
if text_content:
|
|
72
|
-
images[0].extracted_text = text_content # for later use by extractor
|
|
73
67
|
|
|
74
68
|
return images
|
|
75
69
|
|
|
@@ -115,7 +109,7 @@ class MarkdownAndChunkDocuments:
|
|
|
115
109
|
self.api_key = os.getenv("OPENAI_API_KEY")
|
|
116
110
|
self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
|
|
117
111
|
|
|
118
|
-
def markdown_and_chunk_documents(self, file_path: str):
|
|
112
|
+
def markdown_and_chunk_documents(self, file_path: str,include_image:bool):
|
|
119
113
|
# Pick strategy
|
|
120
114
|
strategy = StrategyFactory.get_strategy(file_path)
|
|
121
115
|
if not strategy:
|
|
@@ -125,7 +119,7 @@ class MarkdownAndChunkDocuments:
|
|
|
125
119
|
images = strategy.process(file_path)
|
|
126
120
|
|
|
127
121
|
# Extract Markdown from images
|
|
128
|
-
markdown_output, text_content = self.extractor.extract_markdown(images, include_image=
|
|
122
|
+
markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
|
|
129
123
|
binary_text_content = text_content.encode("utf-8")
|
|
130
124
|
|
|
131
125
|
# Chunking and mapping
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/test_loader.py
RENAMED
|
@@ -34,7 +34,7 @@ def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
|
|
|
34
34
|
def test_markdown(temp_json_file):
|
|
35
35
|
markdown_and_chunk_documents = MarkdownAndChunkDocuments()
|
|
36
36
|
mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
|
|
37
|
-
"content.docx")
|
|
37
|
+
"content.docx",include_image=True)
|
|
38
38
|
print(mapped_chunks)
|
|
39
39
|
for i, c in enumerate(mapped_chunks):
|
|
40
40
|
print(f"Chunk {i + 1}: {c}")
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29/prevectorchunks_core.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.27
|
|
3
|
+
Version: 0.1.29
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -40,7 +40,7 @@ Requires-Dist: requests~=2.32.5
|
|
|
40
40
|
Requires-Dist: langchain-core~=0.3.78
|
|
41
41
|
Requires-Dist: pdf2image~=1.17.0
|
|
42
42
|
Requires-Dist: docx2pdf~=0.1.8
|
|
43
|
-
Requires-Dist: numpy~=2.
|
|
43
|
+
Requires-Dist: numpy~=2.0.0
|
|
44
44
|
Requires-Dist: scikit-learn~=1.7.2
|
|
45
45
|
Requires-Dist: PyMuPDF~=1.22.5
|
|
46
46
|
Requires-Dist: pypandoc~=1.13
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "prevectorchunks-core"
|
|
7
|
-
version = "0.1.27"
|
|
7
|
+
version = "0.1.29"
|
|
8
8
|
description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -42,7 +42,7 @@ dependencies = [
|
|
|
42
42
|
"langchain-core~=0.3.78",
|
|
43
43
|
"pdf2image~=1.17.0",
|
|
44
44
|
"docx2pdf~=0.1.8",
|
|
45
|
-
"numpy~=2.
|
|
45
|
+
"numpy~=2.0.0",
|
|
46
46
|
"scikit-learn~=1.7.2",
|
|
47
47
|
"PyMuPDF~=1.22.5",
|
|
48
48
|
"pypandoc~=1.13",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/config/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/os-llm/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/os-llm/llava.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/env.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/model.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/reward.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/tests/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/tests/test_local.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|