prevectorchunks-core 0.1.33__tar.gz → 0.1.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {prevectorchunks_core-0.1.33/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.35}/PKG-INFO +4 -1
- prevectorchunks_core-0.1.35/prevectorchunks_core/services/DocuToImageConverter.py +318 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +18 -5
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +2 -2
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +15 -19
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/image_processor.py +17 -14
- prevectorchunks_core-0.1.35/prevectorchunks_core/services/markdown_and_chunk_documents.py +326 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/test_loader.py +26 -9
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/utils/file_loader.py +77 -21
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35/prevectorchunks_core.egg-info}/PKG-INFO +4 -1
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core.egg-info/SOURCES.txt +34 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core.egg-info/requires.txt +3 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/pyproject.toml +9 -1
- prevectorchunks_core-0.1.33/prevectorchunks_core/services/DocuToImageConverter.py +0 -143
- prevectorchunks_core-0.1.33/prevectorchunks_core/services/markdown_and_chunk_documents.py +0 -161
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/LICENCE +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/LICENSE +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/README.md +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/config/splitter_config.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/os-llm/llava.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/setup.cfg +0 -0
{prevectorchunks_core-0.1.33/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.35}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.33
|
|
3
|
+
Version: 0.1.35
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
|
|
|
50
50
|
Requires-Dist: cssselect2~=0.7.0
|
|
51
51
|
Requires-Dist: cairocffi~=1.4.0
|
|
52
52
|
Requires-Dist: tensorflow~=2.12.0
|
|
53
|
+
Requires-Dist: pandas~=1.5.3
|
|
54
|
+
Requires-Dist: openpyxl~=3.1.2
|
|
55
|
+
Requires-Dist: python-pptx~=0.6.21
|
|
53
56
|
Dynamic: license-file
|
|
54
57
|
|
|
55
58
|
# 📚 PreVectorChunks
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from PIL import Image
|
|
7
|
+
import io
|
|
8
|
+
import fitz
|
|
9
|
+
from docx2pdf import convert as docx2pdf_convert
|
|
10
|
+
from docx import Document
|
|
11
|
+
from reportlab.pdfgen import canvas
|
|
12
|
+
from reportlab.lib.pagesizes import A4
|
|
13
|
+
import pypandoc
|
|
14
|
+
|
|
15
|
+
# Ensure pandoc is available
|
|
16
|
+
try:
|
|
17
|
+
pypandoc.get_pandoc_path()
|
|
18
|
+
except OSError:
|
|
19
|
+
pypandoc.download_pandoc()
|
|
20
|
+
|
|
21
|
+
class DocuToImageConverter:
|
|
22
|
+
"""Converts a document (PDF, DOCX, DOC, image bytes) into a list of PIL images."""
|
|
23
|
+
|
|
24
|
+
def __init__(self):
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
def _write_temp_file(self, input_bytes: bytes, suffix: str):
|
|
28
|
+
"""Write bytes to a temporary file and return path."""
|
|
29
|
+
tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
|
|
30
|
+
with os.fdopen(tmp_fd, "wb") as f:
|
|
31
|
+
f.write(input_bytes.read())
|
|
32
|
+
return tmp_path
|
|
33
|
+
|
|
34
|
+
def _convert_doc_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
|
|
35
|
+
"""
|
|
36
|
+
Convert DOC/DOCX to PDF. Supports:
|
|
37
|
+
- file_path (string)
|
|
38
|
+
- input_bytes (bytes, InMemoryUploadedFile, or file-like)
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
# ✅ If bytes are provided, write them to a temporary .docx
|
|
42
|
+
if input_bytes is not None:
|
|
43
|
+
# Get filename or fallback
|
|
44
|
+
original_name = getattr(input_bytes, "name", "uploaded.docx")
|
|
45
|
+
ext = os.path.splitext(original_name)[1] or ".docx"
|
|
46
|
+
|
|
47
|
+
# Create a temporary file path
|
|
48
|
+
temp_input_path = tempfile.mktemp(suffix=ext)
|
|
49
|
+
|
|
50
|
+
# Read bytes safely
|
|
51
|
+
if hasattr(input_bytes, "read"): # Django UploadedFile
|
|
52
|
+
input_bytes.seek(0)
|
|
53
|
+
content = input_bytes.read()
|
|
54
|
+
else: # already bytes
|
|
55
|
+
content = input_bytes
|
|
56
|
+
|
|
57
|
+
# Write bytes to temp file
|
|
58
|
+
with open(temp_input_path, "wb") as f:
|
|
59
|
+
f.write(content)
|
|
60
|
+
|
|
61
|
+
input_path = temp_input_path
|
|
62
|
+
|
|
63
|
+
# ✅ If file_path is provided, use it directly
|
|
64
|
+
elif file_path:
|
|
65
|
+
input_path = file_path
|
|
66
|
+
|
|
67
|
+
else:
|
|
68
|
+
raise ValueError("Must supply either file_path or input_bytes")
|
|
69
|
+
|
|
70
|
+
# ✅ Must exist at this point
|
|
71
|
+
if not os.path.exists(input_path):
|
|
72
|
+
raise FileNotFoundError(input_path)
|
|
73
|
+
|
|
74
|
+
# ✅ Prepare output PDF path
|
|
75
|
+
output_dir = tempfile.mkdtemp()
|
|
76
|
+
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
77
|
+
|
|
78
|
+
# 1️⃣ Try Microsoft Word COM automation (Windows)
|
|
79
|
+
try:
|
|
80
|
+
import win32com.client
|
|
81
|
+
word = win32com.client.Dispatch("Word.Application")
|
|
82
|
+
word.Visible = False
|
|
83
|
+
doc = word.Documents.Open(str(Path(input_path).resolve()))
|
|
84
|
+
doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)
|
|
85
|
+
doc.Close()
|
|
86
|
+
word.Quit()
|
|
87
|
+
return output_pdf
|
|
88
|
+
except Exception:
|
|
89
|
+
pass
|
|
90
|
+
|
|
91
|
+
# 2️⃣ Try LibreOffice
|
|
92
|
+
try:
|
|
93
|
+
subprocess.run(
|
|
94
|
+
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
95
|
+
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
96
|
+
)
|
|
97
|
+
return output_pdf
|
|
98
|
+
except Exception:
|
|
99
|
+
pass
|
|
100
|
+
|
|
101
|
+
# 3️⃣ Pandoc fallback
|
|
102
|
+
try:
|
|
103
|
+
pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
|
|
104
|
+
pypandoc.convert_file(
|
|
105
|
+
input_path, "pdf",
|
|
106
|
+
outputfile=output_pdf,
|
|
107
|
+
extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
|
|
108
|
+
)
|
|
109
|
+
return output_pdf
|
|
110
|
+
except Exception:
|
|
111
|
+
pass
|
|
112
|
+
|
|
113
|
+
# 4️⃣ Final fallback: Render plain text using ReportLab
|
|
114
|
+
doc = Document(input_path)
|
|
115
|
+
c = canvas.Canvas(output_pdf, pagesize=A4)
|
|
116
|
+
width, height = A4
|
|
117
|
+
y = height - 50
|
|
118
|
+
|
|
119
|
+
for p in doc.paragraphs:
|
|
120
|
+
c.drawString(50, y, p.text[:1000])
|
|
121
|
+
y -= 15
|
|
122
|
+
if y < 50:
|
|
123
|
+
c.showPage()
|
|
124
|
+
y = height - 50
|
|
125
|
+
|
|
126
|
+
c.save()
|
|
127
|
+
return output_pdf
|
|
128
|
+
|
|
129
|
+
def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
|
|
130
|
+
images = []
|
|
131
|
+
pdf_document = fitz.open(pdf_path)
|
|
132
|
+
for page_num in range(len(pdf_document)):
|
|
133
|
+
page = pdf_document[page_num]
|
|
134
|
+
pixmap = page.get_pixmap(dpi=dpi)
|
|
135
|
+
image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
|
|
136
|
+
images.append(image)
|
|
137
|
+
pdf_document.close()
|
|
138
|
+
return images
|
|
139
|
+
|
|
140
|
+
def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
|
|
141
|
+
"""
|
|
142
|
+
Convert a file path or binary content to PIL images.
|
|
143
|
+
Supports PDF, DOC, DOCX, and image files.
|
|
144
|
+
"""
|
|
145
|
+
if not file_path and not input_bytes:
|
|
146
|
+
raise ValueError("Provide either file_path or input_bytes.")
|
|
147
|
+
|
|
148
|
+
# Determine extension
|
|
149
|
+
if file_path:
|
|
150
|
+
ext = os.path.splitext(file_path)[1].lower()
|
|
151
|
+
print('work')
|
|
152
|
+
elif input_bytes:
|
|
153
|
+
# Attempt to infer from first few bytes (simple)
|
|
154
|
+
# if input_bytes[:4] == b"%PDF":
|
|
155
|
+
# ext = ".pdf"
|
|
156
|
+
# elif input_bytes[:2] == b"PK":
|
|
157
|
+
# ext = ".docx"
|
|
158
|
+
# else:
|
|
159
|
+
# ext = ".img" # Treat as generic image
|
|
160
|
+
|
|
161
|
+
# Write to temp file if doc/pdf
|
|
162
|
+
if ext in [".pdf", ".doc", ".docx"]:
|
|
163
|
+
file_path = self._write_temp_file(input_bytes, suffix=ext)
|
|
164
|
+
|
|
165
|
+
# Word → PDF
|
|
166
|
+
if ext in [".doc", ".docx"]:
|
|
167
|
+
pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
|
|
168
|
+
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
169
|
+
|
|
170
|
+
# PowerPoint → PDF
|
|
171
|
+
elif ext in [".ppt", ".pptx"]:
|
|
172
|
+
pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)
|
|
173
|
+
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
174
|
+
|
|
175
|
+
# Excel → PDF
|
|
176
|
+
elif ext in [".xls", ".xlsx"]:
|
|
177
|
+
pdf_path = self._convert_excel_to_pdf(file_path, input_bytes)
|
|
178
|
+
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# PDF → images
|
|
182
|
+
elif ext == ".pdf":
|
|
183
|
+
images = self._convert_pdf_to_images(file_path, dpi=dpi)
|
|
184
|
+
|
|
185
|
+
# Image
|
|
186
|
+
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".img"]:
|
|
187
|
+
image = Image.open(io.BytesIO(input_bytes) if input_bytes else file_path).convert("RGB")
|
|
188
|
+
buffer = io.BytesIO()
|
|
189
|
+
image.save(buffer, format=output_format)
|
|
190
|
+
buffer.seek(0)
|
|
191
|
+
images = [Image.open(buffer)]
|
|
192
|
+
|
|
193
|
+
else:
|
|
194
|
+
raise ValueError("Unsupported file type.")
|
|
195
|
+
|
|
196
|
+
return images
|
|
197
|
+
|
|
198
|
+
def _convert_ppt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
|
|
199
|
+
"""
|
|
200
|
+
Convert PPT/PPTX to PDF using:
|
|
201
|
+
1. PowerPoint COM (Windows)
|
|
202
|
+
2. LibreOffice
|
|
203
|
+
"""
|
|
204
|
+
|
|
205
|
+
# write bytes if needed
|
|
206
|
+
if input_bytes is not None:
|
|
207
|
+
original_name = getattr(input_bytes, "name", "uploaded.pptx")
|
|
208
|
+
ext = os.path.splitext(original_name)[1] or ".pptx"
|
|
209
|
+
temp_input_path = tempfile.mktemp(suffix=ext)
|
|
210
|
+
|
|
211
|
+
if hasattr(input_bytes, "read"):
|
|
212
|
+
input_bytes.seek(0)
|
|
213
|
+
content = input_bytes.read()
|
|
214
|
+
else:
|
|
215
|
+
content = input_bytes
|
|
216
|
+
|
|
217
|
+
with open(temp_input_path, "wb") as f:
|
|
218
|
+
f.write(content)
|
|
219
|
+
|
|
220
|
+
input_path = temp_input_path
|
|
221
|
+
|
|
222
|
+
elif file_path:
|
|
223
|
+
input_path = file_path
|
|
224
|
+
|
|
225
|
+
else:
|
|
226
|
+
raise ValueError("Must supply either file_path or input_bytes")
|
|
227
|
+
|
|
228
|
+
output_dir = tempfile.mkdtemp()
|
|
229
|
+
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
230
|
+
|
|
231
|
+
# 1️⃣ Try PowerPoint COM on Windows
|
|
232
|
+
try:
|
|
233
|
+
import win32com.client
|
|
234
|
+
powerpoint = win32com.client.Dispatch("PowerPoint.Application")
|
|
235
|
+
powerpoint.Visible = 1
|
|
236
|
+
|
|
237
|
+
deck = powerpoint.Presentations.Open(str(Path(input_path).resolve()))
|
|
238
|
+
deck.SaveAs(str(Path(output_pdf).resolve()), 32) # 32 = PDF
|
|
239
|
+
deck.Close()
|
|
240
|
+
powerpoint.Quit()
|
|
241
|
+
|
|
242
|
+
return output_pdf
|
|
243
|
+
except Exception:
|
|
244
|
+
pass
|
|
245
|
+
|
|
246
|
+
# 2️⃣ Try LibreOffice
|
|
247
|
+
try:
|
|
248
|
+
subprocess.run(
|
|
249
|
+
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
250
|
+
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
251
|
+
)
|
|
252
|
+
return output_pdf
|
|
253
|
+
except Exception:
|
|
254
|
+
pass
|
|
255
|
+
|
|
256
|
+
raise ValueError("Unable to convert PPT/PPTX to PDF")
|
|
257
|
+
|
|
258
|
+
def _convert_excel_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
|
|
259
|
+
"""
|
|
260
|
+
Convert XLS/XLSX to PDF using:
|
|
261
|
+
1. Excel COM (Windows)
|
|
262
|
+
2. LibreOffice
|
|
263
|
+
"""
|
|
264
|
+
|
|
265
|
+
# write bytes if needed
|
|
266
|
+
if input_bytes is not None:
|
|
267
|
+
original_name = getattr(input_bytes, "name", "uploaded.xlsx")
|
|
268
|
+
ext = os.path.splitext(original_name)[1] or ".xlsx"
|
|
269
|
+
temp_input_path = tempfile.mktemp(suffix=ext)
|
|
270
|
+
|
|
271
|
+
if hasattr(input_bytes, "read"):
|
|
272
|
+
input_bytes.seek(0)
|
|
273
|
+
content = input_bytes.read()
|
|
274
|
+
else:
|
|
275
|
+
content = input_bytes
|
|
276
|
+
|
|
277
|
+
with open(temp_input_path, "wb") as f:
|
|
278
|
+
f.write(content)
|
|
279
|
+
|
|
280
|
+
input_path = temp_input_path
|
|
281
|
+
|
|
282
|
+
elif file_path:
|
|
283
|
+
input_path = file_path
|
|
284
|
+
|
|
285
|
+
else:
|
|
286
|
+
raise ValueError("Must supply either file_path or input_bytes")
|
|
287
|
+
|
|
288
|
+
output_dir = tempfile.mkdtemp()
|
|
289
|
+
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
290
|
+
|
|
291
|
+
# 1️⃣ Try Excel COM (Windows)
|
|
292
|
+
try:
|
|
293
|
+
import win32com.client
|
|
294
|
+
excel = win32com.client.Dispatch("Excel.Application")
|
|
295
|
+
excel.Visible = False
|
|
296
|
+
|
|
297
|
+
wb = excel.Workbooks.Open(str(Path(input_path).resolve()))
|
|
298
|
+
wb.ExportAsFixedFormat(0, str(Path(output_pdf).resolve())) # 0 = PDF
|
|
299
|
+
wb.Close()
|
|
300
|
+
excel.Quit()
|
|
301
|
+
|
|
302
|
+
return output_pdf
|
|
303
|
+
except Exception:
|
|
304
|
+
pass
|
|
305
|
+
|
|
306
|
+
# 2️⃣ Try LibreOffice
|
|
307
|
+
try:
|
|
308
|
+
subprocess.run(
|
|
309
|
+
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
310
|
+
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
311
|
+
)
|
|
312
|
+
return output_pdf
|
|
313
|
+
except Exception:
|
|
314
|
+
pass
|
|
315
|
+
|
|
316
|
+
raise ValueError("Unable to convert XLS/XLSX to PDF")
|
|
317
|
+
|
|
318
|
+
|
|
@@ -3,11 +3,13 @@ import os
|
|
|
3
3
|
import tempfile
|
|
4
4
|
import base64
|
|
5
5
|
|
|
6
|
+
from langchain.chat_models import init_chat_model
|
|
6
7
|
from openai import OpenAI
|
|
7
8
|
from PIL import Image
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
from dotenv import load_dotenv
|
|
12
|
+
from openai.types import ChatModel
|
|
11
13
|
|
|
12
14
|
from .image_processor import ImageProcessor
|
|
13
15
|
|
|
@@ -18,9 +20,19 @@ load_dotenv(override=True)
|
|
|
18
20
|
class DocuToMarkdownExtractor:
|
|
19
21
|
"""Sends image pages to an LLM and extracts Markdown text + tables."""
|
|
20
22
|
|
|
21
|
-
def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
def __init__(self, api_key: str, model: str = "gpt-4o-mini",client:ChatModel=None):
|
|
24
|
+
if client is None:
|
|
25
|
+
client = init_chat_model(
|
|
26
|
+
model=model,
|
|
27
|
+
model_provider="openai", # you can later swap to "anthropic", "google", etc.
|
|
28
|
+
api_key=api_key
|
|
29
|
+
)
|
|
30
|
+
self.client = client
|
|
31
|
+
self.model = client.model_name
|
|
32
|
+
# Initialize ImageProcessor once and pass the chat model
|
|
33
|
+
self.processor = ImageProcessor(client=self.client)
|
|
34
|
+
|
|
35
|
+
|
|
24
36
|
|
|
25
37
|
def _image_to_base64(self, image: Image.Image) -> str:
|
|
26
38
|
"""Converts PIL image to base64-encoded PNG string."""
|
|
@@ -29,7 +41,7 @@ class DocuToMarkdownExtractor:
|
|
|
29
41
|
with open(tmp.name, "rb") as f:
|
|
30
42
|
return base64.b64encode(f.read()).decode("utf-8")
|
|
31
43
|
|
|
32
|
-
def extract_markdown(self, images,include_image:True):
|
|
44
|
+
def extract_markdown(self, images,include_image:bool=True):
|
|
33
45
|
"""Extracts Markdown-formatted text from each image page."""
|
|
34
46
|
all_outputs = []
|
|
35
47
|
text_content=""
|
|
@@ -59,7 +71,8 @@ class DocuToMarkdownExtractor:
|
|
|
59
71
|
try:
|
|
60
72
|
response = json.loads(response) # Convert JSON string to dictionary
|
|
61
73
|
except json.JSONDecodeError:
|
|
62
|
-
|
|
74
|
+
print('skipping quietly')
|
|
75
|
+
#raise ValueError("The response from 'processor.analyze' is not valid JSON.")
|
|
63
76
|
text_content=text_content+"\n"+response["markdown_text"]
|
|
64
77
|
if(include_image):
|
|
65
78
|
response["image_data"]=b64_image
|
|
@@ -392,8 +392,8 @@ def qfetch_records_grouped_by_document_name(index, batch_size=100,limit=100):
|
|
|
392
392
|
|
|
393
393
|
|
|
394
394
|
#function that chunks any document
|
|
395
|
-
def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
|
|
396
|
-
return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config)
|
|
395
|
+
def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None,client=None):
|
|
396
|
+
return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config,client=client)
|
|
397
397
|
|
|
398
398
|
#function that chunks any document as well as inserts into vdb
|
|
399
399
|
def chunk_and_upsert_to_vdb(index_n,instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
|
|
@@ -2,39 +2,35 @@ import numpy as np
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
class ChunkMapper:
|
|
5
|
-
def __init__(self,
|
|
5
|
+
def __init__(self, embedding_client, markdown_output, embedding_model="text-embedding-3-small"):
|
|
6
6
|
"""
|
|
7
7
|
client: OpenAI client object
|
|
8
8
|
markdown_output: list of JSON objects containing at least 'markdown_text'
|
|
9
9
|
embedding_model: model for embeddings
|
|
10
10
|
"""
|
|
11
|
-
self.
|
|
11
|
+
self.embedding_client = embedding_client
|
|
12
12
|
self.markdown_output = markdown_output
|
|
13
13
|
self.embedding_model = embedding_model
|
|
14
14
|
|
|
15
15
|
# Precompute embeddings for markdown_output
|
|
16
16
|
self.markdown_embeddings = self._compute_markdown_embeddings()
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
18
|
+
# -----------------------------
|
|
19
|
+
# Compute embeddings for markdown JSON items
|
|
20
|
+
# -----------------------------
|
|
21
|
+
|
|
21
22
|
def _compute_markdown_embeddings(self):
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
23
|
+
texts = [obj.get("markdown_text", "") for obj in self.markdown_output]
|
|
24
|
+
return self.embedding_client.embed_documents(texts)
|
|
25
|
+
|
|
26
|
+
# -----------------------------
|
|
27
|
+
# Get embedding for a single text
|
|
28
|
+
# -----------------------------
|
|
28
29
|
|
|
29
|
-
# -----------------------------
|
|
30
|
-
# Embedding helper
|
|
31
|
-
# -----------------------------
|
|
32
30
|
def _get_embedding(self, text):
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
)
|
|
37
|
-
return response.data[0].embedding
|
|
31
|
+
# LangChain uses a list input
|
|
32
|
+
emb = self.embedding_client.embed_query(text)
|
|
33
|
+
return emb
|
|
38
34
|
|
|
39
35
|
# -----------------------------
|
|
40
36
|
# Cosine similarity
|
|
@@ -10,6 +10,8 @@ import requests
|
|
|
10
10
|
from dotenv import load_dotenv
|
|
11
11
|
from typing import Optional
|
|
12
12
|
|
|
13
|
+
from langchain.chat_models import init_chat_model
|
|
14
|
+
from langchain_core.messages import HumanMessage
|
|
13
15
|
from openai import OpenAI
|
|
14
16
|
from langchain_core.pydantic_v1 import BaseModel
|
|
15
17
|
|
|
@@ -31,15 +33,22 @@ class ImageProcessor:
|
|
|
31
33
|
Wrapper for a GPT-4o multimodal image reasoning pipeline.
|
|
32
34
|
"""
|
|
33
35
|
|
|
34
|
-
def __init__(self, model_name: str = "gpt-4o-mini"):
|
|
36
|
+
def __init__(self, api_key:str=None, model_name: str = "gpt-4o-mini",client=None):
|
|
35
37
|
load_dotenv(override=True)
|
|
36
38
|
self.api_key = os.getenv("OPENAI_API_KEY")
|
|
37
39
|
if not self.api_key:
|
|
38
40
|
raise ValueError("❌ OPENAI_API_KEY not found in .env or environment!")
|
|
39
41
|
|
|
42
|
+
if client is None:
|
|
43
|
+
client = init_chat_model(
|
|
44
|
+
model=model_name,
|
|
45
|
+
model_provider="openai", # you can later swap to "anthropic", "google", etc.
|
|
46
|
+
api_key=api_key
|
|
47
|
+
)
|
|
48
|
+
self.llm = client
|
|
40
49
|
# Initialize multimodal client
|
|
41
|
-
|
|
42
|
-
self.model_name = model_name
|
|
50
|
+
|
|
51
|
+
self.model_name = client.model_name
|
|
43
52
|
|
|
44
53
|
# -------------------------------------------------
|
|
45
54
|
# 3️⃣ Image encoding helper
|
|
@@ -70,17 +79,11 @@ class ImageProcessor:
|
|
|
70
79
|
},
|
|
71
80
|
]
|
|
72
81
|
content1.extend(finstructioncontent)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
"content": content1
|
|
79
|
-
}
|
|
80
|
-
],
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
result_text = response.choices[0].message.content
|
|
82
|
+
# Call the LangChain model
|
|
83
|
+
response_msg = self.llm.predict_messages([HumanMessage(content=content1)])
|
|
84
|
+
|
|
85
|
+
# Extract the text
|
|
86
|
+
result_text = response_msg.content
|
|
84
87
|
print("✅ Analysis complete.")
|
|
85
88
|
print(result_text)
|
|
86
89
|
return result_text
|