prevectorchunks-core 0.1.33__tar.gz → 0.1.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. {prevectorchunks_core-0.1.33/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.35}/PKG-INFO +4 -1
  2. prevectorchunks_core-0.1.35/prevectorchunks_core/services/DocuToImageConverter.py +318 -0
  3. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +18 -5
  4. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +2 -2
  5. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +15 -19
  6. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/image_processor.py +17 -14
  7. prevectorchunks_core-0.1.35/prevectorchunks_core/services/markdown_and_chunk_documents.py +326 -0
  8. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/test_loader.py +26 -9
  9. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/utils/file_loader.py +77 -21
  10. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35/prevectorchunks_core.egg-info}/PKG-INFO +4 -1
  11. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core.egg-info/SOURCES.txt +34 -0
  12. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core.egg-info/requires.txt +3 -0
  13. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/pyproject.toml +9 -1
  14. prevectorchunks_core-0.1.33/prevectorchunks_core/services/DocuToImageConverter.py +0 -143
  15. prevectorchunks_core-0.1.33/prevectorchunks_core/services/markdown_and_chunk_documents.py +0 -161
  16. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/LICENCE +0 -0
  17. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/LICENSE +0 -0
  18. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/README.md +0 -0
  19. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/__init__.py +0 -0
  20. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/config/__init__.py +0 -0
  21. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/config/splitter_config.py +0 -0
  22. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/migrations/__init__.py +0 -0
  23. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/os-llm/__init__.py +0 -0
  24. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/os-llm/llava.py +0 -0
  25. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/__init__.py +0 -0
  26. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/env.py +0 -0
  27. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/inference.py +0 -0
  28. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/model.py +0 -0
  29. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
  30. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
  31. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
  32. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/reward.py +0 -0
  33. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
  34. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
  35. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/rlchunker/utils.py +0 -0
  36. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/__init__.py +0 -0
  37. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/audio_processor.py +0 -0
  38. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/propositional_index.py +0 -0
  39. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/services/video_analyser.py +0 -0
  40. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/tests/__init__.py +0 -0
  41. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/tests/test_local.py +0 -0
  42. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/utils/__init__.py +0 -0
  43. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/utils/extract_content.py +0 -0
  44. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
  45. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
  46. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/prevectorchunks_core.egg-info/top_level.txt +0 -0
  47. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.35}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.33
3
+ Version: 0.1.35
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
50
50
  Requires-Dist: cssselect2~=0.7.0
51
51
  Requires-Dist: cairocffi~=1.4.0
52
52
  Requires-Dist: tensorflow~=2.12.0
53
+ Requires-Dist: pandas~=1.5.3
54
+ Requires-Dist: openpyxl~=3.1.2
55
+ Requires-Dist: python-pptx~=0.6.21
53
56
  Dynamic: license-file
54
57
 
55
58
  # 📚 PreVectorChunks
@@ -0,0 +1,318 @@
1
+ import os
2
+ import tempfile
3
+ import shutil
4
+ import subprocess
5
+ from pathlib import Path
6
+ from PIL import Image
7
+ import io
8
+ import fitz
9
+ from docx2pdf import convert as docx2pdf_convert
10
+ from docx import Document
11
+ from reportlab.pdfgen import canvas
12
+ from reportlab.lib.pagesizes import A4
13
+ import pypandoc
14
+
15
+ # Ensure pandoc is available
16
+ try:
17
+ pypandoc.get_pandoc_path()
18
+ except OSError:
19
+ pypandoc.download_pandoc()
20
+
21
+ class DocuToImageConverter:
22
+ """Converts a document (PDF, DOCX, DOC, image bytes) into a list of PIL images."""
23
+
24
+ def __init__(self):
25
+ pass
26
+
27
+ def _write_temp_file(self, input_bytes: bytes, suffix: str):
28
+ """Write bytes to a temporary file and return path."""
29
+ tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
30
+ with os.fdopen(tmp_fd, "wb") as f:
31
+ f.write(input_bytes.read())
32
+ return tmp_path
33
+
34
+ def _convert_doc_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
35
+ """
36
+ Convert DOC/DOCX to PDF. Supports:
37
+ - file_path (string)
38
+ - input_bytes (bytes, InMemoryUploadedFile, or file-like)
39
+ """
40
+
41
+ # ✅ If bytes are provided, write them to a temporary .docx
42
+ if input_bytes is not None:
43
+ # Get filename or fallback
44
+ original_name = getattr(input_bytes, "name", "uploaded.docx")
45
+ ext = os.path.splitext(original_name)[1] or ".docx"
46
+
47
+ # Create a temporary file path
48
+ temp_input_path = tempfile.mktemp(suffix=ext)
49
+
50
+ # Read bytes safely
51
+ if hasattr(input_bytes, "read"): # Django UploadedFile
52
+ input_bytes.seek(0)
53
+ content = input_bytes.read()
54
+ else: # already bytes
55
+ content = input_bytes
56
+
57
+ # Write bytes to temp file
58
+ with open(temp_input_path, "wb") as f:
59
+ f.write(content)
60
+
61
+ input_path = temp_input_path
62
+
63
+ # ✅ If file_path is provided, use it directly
64
+ elif file_path:
65
+ input_path = file_path
66
+
67
+ else:
68
+ raise ValueError("Must supply either file_path or input_bytes")
69
+
70
+ # ✅ Must exist at this point
71
+ if not os.path.exists(input_path):
72
+ raise FileNotFoundError(input_path)
73
+
74
+ # ✅ Prepare output PDF path
75
+ output_dir = tempfile.mkdtemp()
76
+ output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
77
+
78
+ # 1️⃣ Try Microsoft Word COM automation (Windows)
79
+ try:
80
+ import win32com.client
81
+ word = win32com.client.Dispatch("Word.Application")
82
+ word.Visible = False
83
+ doc = word.Documents.Open(str(Path(input_path).resolve()))
84
+ doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)
85
+ doc.Close()
86
+ word.Quit()
87
+ return output_pdf
88
+ except Exception:
89
+ pass
90
+
91
+ # 2️⃣ Try LibreOffice
92
+ try:
93
+ subprocess.run(
94
+ ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
95
+ check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
96
+ )
97
+ return output_pdf
98
+ except Exception:
99
+ pass
100
+
101
+ # 3️⃣ Pandoc fallback
102
+ try:
103
+ pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
104
+ pypandoc.convert_file(
105
+ input_path, "pdf",
106
+ outputfile=output_pdf,
107
+ extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
108
+ )
109
+ return output_pdf
110
+ except Exception:
111
+ pass
112
+
113
+ # 4️⃣ Final fallback: Render plain text using ReportLab
114
+ doc = Document(input_path)
115
+ c = canvas.Canvas(output_pdf, pagesize=A4)
116
+ width, height = A4
117
+ y = height - 50
118
+
119
+ for p in doc.paragraphs:
120
+ c.drawString(50, y, p.text[:1000])
121
+ y -= 15
122
+ if y < 50:
123
+ c.showPage()
124
+ y = height - 50
125
+
126
+ c.save()
127
+ return output_pdf
128
+
129
+ def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
130
+ images = []
131
+ pdf_document = fitz.open(pdf_path)
132
+ for page_num in range(len(pdf_document)):
133
+ page = pdf_document[page_num]
134
+ pixmap = page.get_pixmap(dpi=dpi)
135
+ image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
136
+ images.append(image)
137
+ pdf_document.close()
138
+ return images
139
+
140
+ def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
141
+ """
142
+ Convert a file path or binary content to PIL images.
143
+ Supports PDF, DOC, DOCX, and image files.
144
+ """
145
+ if not file_path and not input_bytes:
146
+ raise ValueError("Provide either file_path or input_bytes.")
147
+
148
+ # Determine extension
149
+ if file_path:
150
+ ext = os.path.splitext(file_path)[1].lower()
151
+ print('work')
152
+ elif input_bytes:
153
+ # Attempt to infer from first few bytes (simple)
154
+ # if input_bytes[:4] == b"%PDF":
155
+ # ext = ".pdf"
156
+ # elif input_bytes[:2] == b"PK":
157
+ # ext = ".docx"
158
+ # else:
159
+ # ext = ".img" # Treat as generic image
160
+
161
+ # Write to temp file if doc/pdf
162
+ if ext in [".pdf", ".doc", ".docx"]:
163
+ file_path = self._write_temp_file(input_bytes, suffix=ext)
164
+
165
+ # Word → PDF
166
+ if ext in [".doc", ".docx"]:
167
+ pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
168
+ images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
169
+
170
+ # PowerPoint → PDF
171
+ elif ext in [".ppt", ".pptx"]:
172
+ pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)
173
+ images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
174
+
175
+ # Excel → PDF
176
+ elif ext in [".xls", ".xlsx"]:
177
+ pdf_path = self._convert_excel_to_pdf(file_path, input_bytes)
178
+ images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
179
+
180
+
181
+ # PDF → images
182
+ elif ext == ".pdf":
183
+ images = self._convert_pdf_to_images(file_path, dpi=dpi)
184
+
185
+ # Image
186
+ elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".img"]:
187
+ image = Image.open(io.BytesIO(input_bytes) if input_bytes else file_path).convert("RGB")
188
+ buffer = io.BytesIO()
189
+ image.save(buffer, format=output_format)
190
+ buffer.seek(0)
191
+ images = [Image.open(buffer)]
192
+
193
+ else:
194
+ raise ValueError("Unsupported file type.")
195
+
196
+ return images
197
+
198
+ def _convert_ppt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
199
+ """
200
+ Convert PPT/PPTX to PDF using:
201
+ 1. PowerPoint COM (Windows)
202
+ 2. LibreOffice
203
+ """
204
+
205
+ # write bytes if needed
206
+ if input_bytes is not None:
207
+ original_name = getattr(input_bytes, "name", "uploaded.pptx")
208
+ ext = os.path.splitext(original_name)[1] or ".pptx"
209
+ temp_input_path = tempfile.mktemp(suffix=ext)
210
+
211
+ if hasattr(input_bytes, "read"):
212
+ input_bytes.seek(0)
213
+ content = input_bytes.read()
214
+ else:
215
+ content = input_bytes
216
+
217
+ with open(temp_input_path, "wb") as f:
218
+ f.write(content)
219
+
220
+ input_path = temp_input_path
221
+
222
+ elif file_path:
223
+ input_path = file_path
224
+
225
+ else:
226
+ raise ValueError("Must supply either file_path or input_bytes")
227
+
228
+ output_dir = tempfile.mkdtemp()
229
+ output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
230
+
231
+ # 1️⃣ Try PowerPoint COM on Windows
232
+ try:
233
+ import win32com.client
234
+ powerpoint = win32com.client.Dispatch("PowerPoint.Application")
235
+ powerpoint.Visible = 1
236
+
237
+ deck = powerpoint.Presentations.Open(str(Path(input_path).resolve()))
238
+ deck.SaveAs(str(Path(output_pdf).resolve()), 32) # 32 = PDF
239
+ deck.Close()
240
+ powerpoint.Quit()
241
+
242
+ return output_pdf
243
+ except Exception:
244
+ pass
245
+
246
+ # 2️⃣ Try LibreOffice
247
+ try:
248
+ subprocess.run(
249
+ ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
250
+ check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
251
+ )
252
+ return output_pdf
253
+ except Exception:
254
+ pass
255
+
256
+ raise ValueError("Unable to convert PPT/PPTX to PDF")
257
+
258
+ def _convert_excel_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
259
+ """
260
+ Convert XLS/XLSX to PDF using:
261
+ 1. Excel COM (Windows)
262
+ 2. LibreOffice
263
+ """
264
+
265
+ # write bytes if needed
266
+ if input_bytes is not None:
267
+ original_name = getattr(input_bytes, "name", "uploaded.xlsx")
268
+ ext = os.path.splitext(original_name)[1] or ".xlsx"
269
+ temp_input_path = tempfile.mktemp(suffix=ext)
270
+
271
+ if hasattr(input_bytes, "read"):
272
+ input_bytes.seek(0)
273
+ content = input_bytes.read()
274
+ else:
275
+ content = input_bytes
276
+
277
+ with open(temp_input_path, "wb") as f:
278
+ f.write(content)
279
+
280
+ input_path = temp_input_path
281
+
282
+ elif file_path:
283
+ input_path = file_path
284
+
285
+ else:
286
+ raise ValueError("Must supply either file_path or input_bytes")
287
+
288
+ output_dir = tempfile.mkdtemp()
289
+ output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
290
+
291
+ # 1️⃣ Try Excel COM (Windows)
292
+ try:
293
+ import win32com.client
294
+ excel = win32com.client.Dispatch("Excel.Application")
295
+ excel.Visible = False
296
+
297
+ wb = excel.Workbooks.Open(str(Path(input_path).resolve()))
298
+ wb.ExportAsFixedFormat(0, str(Path(output_pdf).resolve())) # 0 = PDF
299
+ wb.Close()
300
+ excel.Quit()
301
+
302
+ return output_pdf
303
+ except Exception:
304
+ pass
305
+
306
+ # 2️⃣ Try LibreOffice
307
+ try:
308
+ subprocess.run(
309
+ ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
310
+ check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
311
+ )
312
+ return output_pdf
313
+ except Exception:
314
+ pass
315
+
316
+ raise ValueError("Unable to convert XLS/XLSX to PDF")
317
+
318
+
@@ -3,11 +3,13 @@ import os
3
3
  import tempfile
4
4
  import base64
5
5
 
6
+ from langchain.chat_models import init_chat_model
6
7
  from openai import OpenAI
7
8
  from PIL import Image
8
9
 
9
10
 
10
11
  from dotenv import load_dotenv
12
+ from openai.types import ChatModel
11
13
 
12
14
  from .image_processor import ImageProcessor
13
15
 
@@ -18,9 +20,19 @@ load_dotenv(override=True)
18
20
  class DocuToMarkdownExtractor:
19
21
  """Sends image pages to an LLM and extracts Markdown text + tables."""
20
22
 
21
- def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
22
- self.client = OpenAI(api_key=api_key)
23
- self.model = model
23
+ def __init__(self, api_key: str, model: str = "gpt-4o-mini",client:ChatModel=None):
24
+ if client is None:
25
+ client = init_chat_model(
26
+ model=model,
27
+ model_provider="openai", # you can later swap to "anthropic", "google", etc.
28
+ api_key=api_key
29
+ )
30
+ self.client = client
31
+ self.model = client.model_name
32
+ # Initialize ImageProcessor once and pass the chat model
33
+ self.processor = ImageProcessor(client=self.client)
34
+
35
+
24
36
 
25
37
  def _image_to_base64(self, image: Image.Image) -> str:
26
38
  """Converts PIL image to base64-encoded PNG string."""
@@ -29,7 +41,7 @@ class DocuToMarkdownExtractor:
29
41
  with open(tmp.name, "rb") as f:
30
42
  return base64.b64encode(f.read()).decode("utf-8")
31
43
 
32
- def extract_markdown(self, images,include_image:True):
44
+ def extract_markdown(self, images,include_image:bool=True):
33
45
  """Extracts Markdown-formatted text from each image page."""
34
46
  all_outputs = []
35
47
  text_content=""
@@ -59,7 +71,8 @@ class DocuToMarkdownExtractor:
59
71
  try:
60
72
  response = json.loads(response) # Convert JSON string to dictionary
61
73
  except json.JSONDecodeError:
62
- raise ValueError("The response from 'processor.analyze' is not valid JSON.")
74
+ print('skipping quietly')
75
+ #raise ValueError("The response from 'processor.analyze' is not valid JSON.")
63
76
  text_content=text_content+"\n"+response["markdown_text"]
64
77
  if(include_image):
65
78
  response["image_data"]=b64_image
@@ -392,8 +392,8 @@ def qfetch_records_grouped_by_document_name(index, batch_size=100,limit=100):
392
392
 
393
393
 
394
394
  #function that chunks any document
395
- def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
396
- return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config)
395
+ def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None,client=None):
396
+ return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config,client=client)
397
397
 
398
398
  #function that chunks any document as well as inserts into vdb
399
399
  def chunk_and_upsert_to_vdb(index_n,instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
@@ -2,39 +2,35 @@ import numpy as np
2
2
 
3
3
 
4
4
  class ChunkMapper:
5
- def __init__(self, client, markdown_output, embedding_model="text-embedding-3-small"):
5
+ def __init__(self, embedding_client, markdown_output, embedding_model="text-embedding-3-small"):
6
6
  """
7
7
  client: OpenAI client object
8
8
  markdown_output: list of JSON objects containing at least 'markdown_text'
9
9
  embedding_model: model for embeddings
10
10
  """
11
- self.client = client
11
+ self.embedding_client = embedding_client
12
12
  self.markdown_output = markdown_output
13
13
  self.embedding_model = embedding_model
14
14
 
15
15
  # Precompute embeddings for markdown_output
16
16
  self.markdown_embeddings = self._compute_markdown_embeddings()
17
17
 
18
- # -----------------------------
19
- # Compute embeddings for all markdown items
20
- # -----------------------------
18
+ # -----------------------------
19
+ # Compute embeddings for markdown JSON items
20
+ # -----------------------------
21
+
21
22
  def _compute_markdown_embeddings(self):
22
- embeddings = []
23
- for obj in self.markdown_output:
24
- markdown_text = obj.get("markdown_text", "")
25
- emb = self._get_embedding(markdown_text)
26
- embeddings.append(emb)
27
- return embeddings
23
+ texts = [obj.get("markdown_text", "") for obj in self.markdown_output]
24
+ return self.embedding_client.embed_documents(texts)
25
+
26
+ # -----------------------------
27
+ # Get embedding for a single text
28
+ # -----------------------------
28
29
 
29
- # -----------------------------
30
- # Embedding helper
31
- # -----------------------------
32
30
  def _get_embedding(self, text):
33
- response = self.client.embeddings.create(
34
- input=text,
35
- model=self.embedding_model
36
- )
37
- return response.data[0].embedding
31
+ # LangChain uses a list input
32
+ emb = self.embedding_client.embed_query(text)
33
+ return emb
38
34
 
39
35
  # -----------------------------
40
36
  # Cosine similarity
@@ -10,6 +10,8 @@ import requests
10
10
  from dotenv import load_dotenv
11
11
  from typing import Optional
12
12
 
13
+ from langchain.chat_models import init_chat_model
14
+ from langchain_core.messages import HumanMessage
13
15
  from openai import OpenAI
14
16
  from langchain_core.pydantic_v1 import BaseModel
15
17
 
@@ -31,15 +33,22 @@ class ImageProcessor:
31
33
  Wrapper for a GPT-4o multimodal image reasoning pipeline.
32
34
  """
33
35
 
34
- def __init__(self, model_name: str = "gpt-4o-mini"):
36
+ def __init__(self, api_key:str=None, model_name: str = "gpt-4o-mini",client=None):
35
37
  load_dotenv(override=True)
36
38
  self.api_key = os.getenv("OPENAI_API_KEY")
37
39
  if not self.api_key:
38
40
  raise ValueError("❌ OPENAI_API_KEY not found in .env or environment!")
39
41
 
42
+ if client is None:
43
+ client = init_chat_model(
44
+ model=model_name,
45
+ model_provider="openai", # you can later swap to "anthropic", "google", etc.
46
+ api_key=api_key
47
+ )
48
+ self.llm = client
40
49
  # Initialize multimodal client
41
- self.llm = OpenAI(api_key=self.api_key)
42
- self.model_name = model_name
50
+
51
+ self.model_name = client.model_name
43
52
 
44
53
  # -------------------------------------------------
45
54
  # 3️⃣ Image encoding helper
@@ -70,17 +79,11 @@ class ImageProcessor:
70
79
  },
71
80
  ]
72
81
  content1.extend(finstructioncontent)
73
- response = self.llm.chat.completions.create(
74
- model=self.model_name,
75
- messages=[
76
- {
77
- "role": "user",
78
- "content": content1
79
- }
80
- ],
81
- )
82
-
83
- result_text = response.choices[0].message.content
82
+ # Call the LangChain model
83
+ response_msg = self.llm.predict_messages([HumanMessage(content=content1)])
84
+
85
+ # Extract the text
86
+ result_text = response_msg.content
84
87
  print("✅ Analysis complete.")
85
88
  print(result_text)
86
89
  return result_text