prevectorchunks-core 0.1.34__tar.gz → 0.1.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {prevectorchunks_core-0.1.34/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.36}/PKG-INFO +4 -1
  2. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/DocuToImageConverter.py +134 -1
  3. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +18 -5
  4. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +2 -2
  5. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +15 -19
  6. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/image_processor.py +17 -14
  7. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/markdown_and_chunk_documents.py +123 -5
  8. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/test_loader.py +26 -9
  9. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/file_loader.py +77 -21
  10. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36/prevectorchunks_core.egg-info}/PKG-INFO +4 -1
  11. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/requires.txt +3 -0
  12. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/pyproject.toml +5 -1
  13. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/LICENCE +0 -0
  14. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/LICENSE +0 -0
  15. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/README.md +0 -0
  16. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/__init__.py +0 -0
  17. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/config/__init__.py +0 -0
  18. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/config/splitter_config.py +0 -0
  19. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/migrations/__init__.py +0 -0
  20. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/os-llm/__init__.py +0 -0
  21. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/os-llm/llava.py +0 -0
  22. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/__init__.py +0 -0
  23. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/env.py +0 -0
  24. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/inference.py +0 -0
  25. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/model.py +0 -0
  26. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
  27. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
  28. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
  29. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/reward.py +0 -0
  30. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
  31. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
  32. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/utils.py +0 -0
  33. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/__init__.py +0 -0
  34. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/audio_processor.py +0 -0
  35. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/propositional_index.py +0 -0
  36. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/video_analyser.py +0 -0
  37. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/tests/__init__.py +0 -0
  38. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/tests/test_local.py +0 -0
  39. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/__init__.py +0 -0
  40. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/extract_content.py +0 -0
  41. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
  42. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/SOURCES.txt +0 -0
  43. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
  44. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/top_level.txt +0 -0
  45. {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.34
3
+ Version: 0.1.36
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
50
50
  Requires-Dist: cssselect2~=0.7.0
51
51
  Requires-Dist: cairocffi~=1.4.0
52
52
  Requires-Dist: tensorflow~=2.12.0
53
+ Requires-Dist: pandas~=2.2.2
54
+ Requires-Dist: openpyxl~=3.1.2
55
+ Requires-Dist: python-pptx~=0.6.21
53
56
  Dynamic: license-file
54
57
 
55
58
  # 📚 PreVectorChunks
@@ -164,9 +164,20 @@ class DocuToImageConverter:
164
164
 
165
165
  # Word → PDF
166
166
  if ext in [".doc", ".docx"]:
167
- pdf_path = self._convert_doc_to_pdf(file_path)
167
+ pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
168
168
  images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
169
169
 
170
+ # PowerPoint → PDF
171
+ elif ext in [".ppt", ".pptx"]:
172
+ pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)
173
+ images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
174
+
175
+ # Excel → PDF
176
+ elif ext in [".xls", ".xlsx"]:
177
+ pdf_path = self._convert_excel_to_pdf(file_path, input_bytes)
178
+ images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
179
+
180
+
170
181
  # PDF → images
171
182
  elif ext == ".pdf":
172
183
  images = self._convert_pdf_to_images(file_path, dpi=dpi)
@@ -183,3 +194,125 @@ class DocuToImageConverter:
183
194
  raise ValueError("Unsupported file type.")
184
195
 
185
196
  return images
197
+
198
+ def _convert_ppt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
199
+ """
200
+ Convert PPT/PPTX to PDF using:
201
+ 1. PowerPoint COM (Windows)
202
+ 2. LibreOffice
203
+ """
204
+
205
+ # write bytes if needed
206
+ if input_bytes is not None:
207
+ original_name = getattr(input_bytes, "name", "uploaded.pptx")
208
+ ext = os.path.splitext(original_name)[1] or ".pptx"
209
+ temp_input_path = tempfile.mktemp(suffix=ext)
210
+
211
+ if hasattr(input_bytes, "read"):
212
+ input_bytes.seek(0)
213
+ content = input_bytes.read()
214
+ else:
215
+ content = input_bytes
216
+
217
+ with open(temp_input_path, "wb") as f:
218
+ f.write(content)
219
+
220
+ input_path = temp_input_path
221
+
222
+ elif file_path:
223
+ input_path = file_path
224
+
225
+ else:
226
+ raise ValueError("Must supply either file_path or input_bytes")
227
+
228
+ output_dir = tempfile.mkdtemp()
229
+ output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
230
+
231
+ # 1️⃣ Try PowerPoint COM on Windows
232
+ try:
233
+ import win32com.client
234
+ powerpoint = win32com.client.Dispatch("PowerPoint.Application")
235
+ powerpoint.Visible = 1
236
+
237
+ deck = powerpoint.Presentations.Open(str(Path(input_path).resolve()))
238
+ deck.SaveAs(str(Path(output_pdf).resolve()), 32) # 32 = PDF
239
+ deck.Close()
240
+ powerpoint.Quit()
241
+
242
+ return output_pdf
243
+ except Exception:
244
+ pass
245
+
246
+ # 2️⃣ Try LibreOffice
247
+ try:
248
+ subprocess.run(
249
+ ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
250
+ check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
251
+ )
252
+ return output_pdf
253
+ except Exception:
254
+ pass
255
+
256
+ raise ValueError("Unable to convert PPT/PPTX to PDF")
257
+
258
+ def _convert_excel_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
259
+ """
260
+ Convert XLS/XLSX to PDF using:
261
+ 1. Excel COM (Windows)
262
+ 2. LibreOffice
263
+ """
264
+
265
+ # write bytes if needed
266
+ if input_bytes is not None:
267
+ original_name = getattr(input_bytes, "name", "uploaded.xlsx")
268
+ ext = os.path.splitext(original_name)[1] or ".xlsx"
269
+ temp_input_path = tempfile.mktemp(suffix=ext)
270
+
271
+ if hasattr(input_bytes, "read"):
272
+ input_bytes.seek(0)
273
+ content = input_bytes.read()
274
+ else:
275
+ content = input_bytes
276
+
277
+ with open(temp_input_path, "wb") as f:
278
+ f.write(content)
279
+
280
+ input_path = temp_input_path
281
+
282
+ elif file_path:
283
+ input_path = file_path
284
+
285
+ else:
286
+ raise ValueError("Must supply either file_path or input_bytes")
287
+
288
+ output_dir = tempfile.mkdtemp()
289
+ output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
290
+
291
+ # 1️⃣ Try Excel COM (Windows)
292
+ try:
293
+ import win32com.client
294
+ excel = win32com.client.Dispatch("Excel.Application")
295
+ excel.Visible = False
296
+
297
+ wb = excel.Workbooks.Open(str(Path(input_path).resolve()))
298
+ wb.ExportAsFixedFormat(0, str(Path(output_pdf).resolve())) # 0 = PDF
299
+ wb.Close()
300
+ excel.Quit()
301
+
302
+ return output_pdf
303
+ except Exception:
304
+ pass
305
+
306
+ # 2️⃣ Try LibreOffice
307
+ try:
308
+ subprocess.run(
309
+ ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
310
+ check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
311
+ )
312
+ return output_pdf
313
+ except Exception:
314
+ pass
315
+
316
+ raise ValueError("Unable to convert XLS/XLSX to PDF")
317
+
318
+
@@ -3,11 +3,13 @@ import os
3
3
  import tempfile
4
4
  import base64
5
5
 
6
+ from langchain.chat_models import init_chat_model
6
7
  from openai import OpenAI
7
8
  from PIL import Image
8
9
 
9
10
 
10
11
  from dotenv import load_dotenv
12
+ from openai.types import ChatModel
11
13
 
12
14
  from .image_processor import ImageProcessor
13
15
 
@@ -18,9 +20,19 @@ load_dotenv(override=True)
18
20
  class DocuToMarkdownExtractor:
19
21
  """Sends image pages to an LLM and extracts Markdown text + tables."""
20
22
 
21
- def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
22
- self.client = OpenAI(api_key=api_key)
23
- self.model = model
23
+ def __init__(self, api_key: str, model: str = "gpt-4o-mini",client:ChatModel=None):
24
+ if client is None:
25
+ client = init_chat_model(
26
+ model=model,
27
+ model_provider="openai", # you can later swap to "anthropic", "google", etc.
28
+ api_key=api_key
29
+ )
30
+ self.client = client
31
+ self.model = client.model_name
32
+ # Initialize ImageProcessor once and pass the chat model
33
+ self.processor = ImageProcessor(client=self.client)
34
+
35
+
24
36
 
25
37
  def _image_to_base64(self, image: Image.Image) -> str:
26
38
  """Converts PIL image to base64-encoded PNG string."""
@@ -29,7 +41,7 @@ class DocuToMarkdownExtractor:
29
41
  with open(tmp.name, "rb") as f:
30
42
  return base64.b64encode(f.read()).decode("utf-8")
31
43
 
32
- def extract_markdown(self, images,include_image:True):
44
+ def extract_markdown(self, images,include_image:bool=True):
33
45
  """Extracts Markdown-formatted text from each image page."""
34
46
  all_outputs = []
35
47
  text_content=""
@@ -59,7 +71,8 @@ class DocuToMarkdownExtractor:
59
71
  try:
60
72
  response = json.loads(response) # Convert JSON string to dictionary
61
73
  except json.JSONDecodeError:
62
- raise ValueError("The response from 'processor.analyze' is not valid JSON.")
74
+ print('skipping quietly')
75
+ #raise ValueError("The response from 'processor.analyze' is not valid JSON.")
63
76
  text_content=text_content+"\n"+response["markdown_text"]
64
77
  if(include_image):
65
78
  response["image_data"]=b64_image
@@ -392,8 +392,8 @@ def qfetch_records_grouped_by_document_name(index, batch_size=100,limit=100):
392
392
 
393
393
 
394
394
  #function that chunks any document
395
- def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
396
- return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config)
395
+ def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None,client=None):
396
+ return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config,client=client)
397
397
 
398
398
  #function that chunks any document as well as inserts into vdb
399
399
  def chunk_and_upsert_to_vdb(index_n,instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
@@ -2,39 +2,35 @@ import numpy as np
2
2
 
3
3
 
4
4
  class ChunkMapper:
5
- def __init__(self, client, markdown_output, embedding_model="text-embedding-3-small"):
5
+ def __init__(self, embedding_client, markdown_output, embedding_model="text-embedding-3-small"):
6
6
  """
7
7
  client: OpenAI client object
8
8
  markdown_output: list of JSON objects containing at least 'markdown_text'
9
9
  embedding_model: model for embeddings
10
10
  """
11
- self.client = client
11
+ self.embedding_client = embedding_client
12
12
  self.markdown_output = markdown_output
13
13
  self.embedding_model = embedding_model
14
14
 
15
15
  # Precompute embeddings for markdown_output
16
16
  self.markdown_embeddings = self._compute_markdown_embeddings()
17
17
 
18
- # -----------------------------
19
- # Compute embeddings for all markdown items
20
- # -----------------------------
18
+ # -----------------------------
19
+ # Compute embeddings for markdown JSON items
20
+ # -----------------------------
21
+
21
22
  def _compute_markdown_embeddings(self):
22
- embeddings = []
23
- for obj in self.markdown_output:
24
- markdown_text = obj.get("markdown_text", "")
25
- emb = self._get_embedding(markdown_text)
26
- embeddings.append(emb)
27
- return embeddings
23
+ texts = [obj.get("markdown_text", "") for obj in self.markdown_output]
24
+ return self.embedding_client.embed_documents(texts)
25
+
26
+ # -----------------------------
27
+ # Get embedding for a single text
28
+ # -----------------------------
28
29
 
29
- # -----------------------------
30
- # Embedding helper
31
- # -----------------------------
32
30
  def _get_embedding(self, text):
33
- response = self.client.embeddings.create(
34
- input=text,
35
- model=self.embedding_model
36
- )
37
- return response.data[0].embedding
31
+ # LangChain uses a list input
32
+ emb = self.embedding_client.embed_query(text)
33
+ return emb
38
34
 
39
35
  # -----------------------------
40
36
  # Cosine similarity
@@ -10,6 +10,8 @@ import requests
10
10
  from dotenv import load_dotenv
11
11
  from typing import Optional
12
12
 
13
+ from langchain.chat_models import init_chat_model
14
+ from langchain_core.messages import HumanMessage
13
15
  from openai import OpenAI
14
16
  from langchain_core.pydantic_v1 import BaseModel
15
17
 
@@ -31,15 +33,22 @@ class ImageProcessor:
31
33
  Wrapper for a GPT-4o multimodal image reasoning pipeline.
32
34
  """
33
35
 
34
- def __init__(self, model_name: str = "gpt-4o-mini"):
36
+ def __init__(self, api_key:str=None, model_name: str = "gpt-4o-mini",client=None):
35
37
  load_dotenv(override=True)
36
38
  self.api_key = os.getenv("OPENAI_API_KEY")
37
39
  if not self.api_key:
38
40
  raise ValueError("❌ OPENAI_API_KEY not found in .env or environment!")
39
41
 
42
+ if client is None:
43
+ client = init_chat_model(
44
+ model=model_name,
45
+ model_provider="openai", # you can later swap to "anthropic", "google", etc.
46
+ api_key=api_key
47
+ )
48
+ self.llm = client
40
49
  # Initialize multimodal client
41
- self.llm = OpenAI(api_key=self.api_key)
42
- self.model_name = model_name
50
+
51
+ self.model_name = client.model_name
43
52
 
44
53
  # -------------------------------------------------
45
54
  # 3️⃣ Image encoding helper
@@ -70,17 +79,11 @@ class ImageProcessor:
70
79
  },
71
80
  ]
72
81
  content1.extend(finstructioncontent)
73
- response = self.llm.chat.completions.create(
74
- model=self.model_name,
75
- messages=[
76
- {
77
- "role": "user",
78
- "content": content1
79
- }
80
- ],
81
- )
82
-
83
- result_text = response.choices[0].message.content
82
+ # Call the LangChain model
83
+ response_msg = self.llm.predict_messages([HumanMessage(content=content1)])
84
+
85
+ # Extract the text
86
+ result_text = response_msg.content
84
87
  print("✅ Analysis complete.")
85
88
  print(result_text)
86
89
  return result_text
@@ -7,6 +7,7 @@ from pathlib import Path
7
7
 
8
8
  from docx import Document
9
9
  from dotenv import load_dotenv
10
+ from langchain_openai import OpenAIEmbeddings
10
11
  from openai import OpenAI
11
12
  from PIL import Image
12
13
 
@@ -111,6 +112,34 @@ class ImageStrategy(BaseDocumentStrategy):
111
112
 
112
113
  return [image]
113
114
 
115
+ class PowerPointStrategy(BaseDocumentStrategy):
116
+ def process(self, file_path: str, input_bytes: bytes = None, ext: str = None):
117
+ print(f"📊 Using PowerPointStrategy for {file_path or input_bytes}")
118
+
119
+ converter = DocuToImageConverter()
120
+
121
+ # Convert PPT/PPTX → PDF
122
+ pdf_path = converter._convert_ppt_to_pdf(file_path=file_path, input_bytes=input_bytes)
123
+
124
+ # Then convert PDF → images
125
+ images = converter.convert_to_images(pdf_path)
126
+
127
+ return images
128
+
129
+ class ExcelStrategy(BaseDocumentStrategy):
130
+ def process(self, file_path: str, input_bytes: bytes = None, ext: str = None):
131
+ print(f"📈 Using ExcelStrategy for {file_path or input_bytes}")
132
+
133
+ converter = DocuToImageConverter()
134
+
135
+ # Convert XLS/XLSX → PDF
136
+ pdf_path = converter._convert_excel_to_pdf(file_path=file_path, input_bytes=input_bytes)
137
+
138
+ # Convert PDF → images
139
+ images = converter.convert_to_images(pdf_path)
140
+
141
+ return images
142
+
114
143
 
115
144
  # -----------------------------
116
145
  # Strategy Factory
@@ -127,6 +156,17 @@ class StrategyFactory:
127
156
  ".png": ImageStrategy(),
128
157
  ".bmp": ImageStrategy(),
129
158
  ".tiff": ImageStrategy(),
159
+
160
+ # NEW — PowerPoint
161
+ ".ppt": PowerPointStrategy(),
162
+ ".pptx": PowerPointStrategy(),
163
+
164
+ # NEW — Excel
165
+ ".xls": ExcelStrategy(),
166
+ ".xlsx": ExcelStrategy(),
167
+
168
+ # NEW — Google Docs/Sheets
169
+
130
170
  }
131
171
 
132
172
  @classmethod
@@ -145,11 +185,12 @@ class StrategyFactory:
145
185
  # Main Orchestrator
146
186
  # -----------------------------
147
187
  class MarkdownAndChunkDocuments:
148
- def __init__(self):
188
+ def __init__(self,client):
149
189
  self.api_key = os.getenv("OPENAI_API_KEY")
150
- self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
190
+ self.extractor = DocuToMarkdownExtractor(api_key=self.api_key,client=client)
191
+ self.client=client
151
192
 
152
- def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None):
193
+ def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None,embedding_client=None):
153
194
  # Pick strategy
154
195
  strategy = StrategyFactory.get_strategy(file_path,file_name)
155
196
  if not strategy:
@@ -164,8 +205,14 @@ class MarkdownAndChunkDocuments:
164
205
  binary_text_content = text_content.encode("utf-8")
165
206
 
166
207
  # Chunking and mapping
167
- chunk_client = OpenAI(api_key=self.api_key)
168
- cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
208
+ #chunk_client = OpenAI(api_key=self.api_key)
209
+ if embedding_client is None:
210
+ embedding_client = OpenAIEmbeddings(
211
+ model="text-embedding-3-small",
212
+ api_key=self.api_key
213
+ )
214
+
215
+ cm = ChunkMapper(embedding_client, markdown_output, embedding_model="text-embedding-3-small")
169
216
  splitter_config = SplitterConfig(
170
217
  chunk_size=300,
171
218
  chunk_overlap=0,
@@ -191,6 +238,77 @@ class MarkdownAndChunkDocuments:
191
238
  print("✅ Processing complete.")
192
239
  return mapped_chunks
193
240
 
241
+ def markdown_and_chunk_documents_stream(
242
+ self,
243
+ file_path: str,
244
+ input_bytes: bytes = None,
245
+ include_image: bool = None,
246
+ file_name: str = None,
247
+ ):
248
+ """Generator version of markdown_and_chunk_documents that yields progress JSON events"""
249
+
250
+ def report(pct, msg=""):
251
+ yield {"progress": int(pct), "status": msg}
252
+
253
+ # 1️⃣ Pick strategy
254
+ yield from report(5, "Selecting strategy...")
255
+ strategy = StrategyFactory.get_strategy(file_path, file_name)
256
+ if not strategy:
257
+ raise ValueError(f"Unsupported file type: {file_path}")
258
+
259
+ # 2️⃣ Convert to images
260
+ ext = get_file_extension(file_path, file_name)
261
+ yield from report(15, "Processing file into images...")
262
+ images = strategy.process(file_path, input_bytes, ext)
263
+
264
+ # 3️⃣ Extract Markdown
265
+ yield from report(35, "Extracting markdown...")
266
+ markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
267
+ binary_text_content = text_content.encode("utf-8")
268
+
269
+ # 4️⃣ Chunking
270
+ yield from report(55, "Chunking text...")
271
+ chunk_client = OpenAI(api_key=self.api_key)
272
+ cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
273
+
274
+ splitter_config = SplitterConfig(
275
+ chunk_size=300,
276
+ chunk_overlap=0,
277
+ separators=["\n"],
278
+ split_type=SplitType.R_PRETRAINED_PROPOSITION.value,
279
+ min_rl_chunk_size=5,
280
+ max_rl_chunk_size=50,
281
+ enableLLMTouchUp=False,
282
+ )
283
+
284
+ chunked_text = chunk_documents(
285
+ "", file_name="install_ins.txt", file_path=binary_text_content, splitter_config=splitter_config
286
+ )
287
+ flat_chunks = ["".join(inner) for inner in chunked_text]
288
+
289
+ # 5️⃣ Map chunks (embedding)
290
+ yield from report(60, f"Mapping {len(flat_chunks)} chunks...")
291
+ total = len(flat_chunks)
292
+ mapped_chunks = []
293
+ for i, chunk in enumerate(flat_chunks, start=1):
294
+ mapped = cm.map_chunks([chunk])
295
+ mapped_chunks.extend(mapped)
296
+ progress = 60 + (i / total) * 30
297
+ yield from report(progress, f"Mapping chunk {i}/{total}")
298
+
299
+ # 6️⃣ Merge unmapped markdown sections
300
+ yield from report(95, "Merging markdown...")
301
+ for md_item in markdown_output:
302
+ if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
303
+ md_item["chunked_text"] = md_item["markdown_text"]
304
+ mapped_chunks.append(md_item)
305
+
306
+ adduuid(mapped_chunks)
307
+ yield from report(100, "✅ Processing complete.")
308
+
309
+ # Final result
310
+ yield {"progress": 100, "status": "done", "result": mapped_chunks}
311
+
194
312
  def adduuid(mapped_chunks):
195
313
  # Assuming mapped_chunks is a list of dictionaries
196
314
 
@@ -1,12 +1,15 @@
1
1
  import json
2
2
  import pytest
3
+ from dotenv import load_dotenv
4
+ from langchain.chat_models import init_chat_model
5
+ from langchain_openai import OpenAIEmbeddings
3
6
 
4
7
  from core.prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
5
8
  from core.prevectorchunks_core.services import chunk_documents_crud_vdb
6
9
  from core.prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
7
10
  from core.prevectorchunks_core.utils.file_loader import SplitType
8
-
9
-
11
+ import os
12
+ load_dotenv(override=True)
10
13
  # Create a temporary JSON file to test with
11
14
  @pytest.fixture
12
15
  def temp_json_file(tmp_path):
@@ -19,12 +22,16 @@ def temp_json_file(tmp_path):
19
22
 
20
23
  def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
21
24
  splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
22
- split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
23
- max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STANDARD)
25
+ split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
26
+ max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED)
27
+ client = init_chat_model(
28
+ model="gpt-4o-mini",
29
+ model_provider="openai", # you can later swap to "anthropic", "google", etc.
30
+ api_key=os.getenv("OPENAI_API_KEY")
31
+ )
32
+ chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="C:\\test-sandbox\\be\\PreVectorDeps\\PreVectorChunks\\core\\prevectorchunks_core\\services\\content.pptx",
24
33
 
25
- chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",
26
-
27
- splitter_config=splitter_config)
34
+ splitter_config=splitter_config,client=client)
28
35
 
29
36
  print(chunks)
30
37
  for i, c in enumerate(chunks):
@@ -32,9 +39,19 @@ def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
32
39
  print(chunks)
33
40
 
34
41
  def test_markdown(temp_json_file):
35
- markdown_and_chunk_documents = MarkdownAndChunkDocuments()
42
+
43
+ client = init_chat_model(
44
+ model="gpt-4o-mini",
45
+ model_provider="openai", # you can later swap to "anthropic", "google", etc.
46
+ api_key=os.getenv("OPENAI_API_KEY")
47
+ )
48
+ markdown_and_chunk_documents = MarkdownAndChunkDocuments(client)
49
+ embedding_client = OpenAIEmbeddings(
50
+ model="text-embedding-3-small",
51
+ api_key=os.getenv("OPENAI_API_KEY")
52
+ )
36
53
  mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
37
- "content.docx",include_image=True)
54
+ "C:\\test-sandbox\\be\\PreVectorDeps\\PreVectorChunks\\core\\prevectorchunks_core\\services\\content.pptx",include_image=True,embedding_client=embedding_client)
38
55
  print(mapped_chunks)
39
56
  for i, c in enumerate(mapped_chunks):
40
57
  print(f"Chunk {i + 1}: {c}")
@@ -9,6 +9,7 @@ from PIL import Image
9
9
  import pytesseract
10
10
  import uuid
11
11
 
12
+ from langchain.chat_models import init_chat_model
12
13
  from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
13
14
  from openai import OpenAI
14
15
  from openai import OpenAI
@@ -26,7 +27,8 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
26
27
  from django.core.files.uploadedfile import UploadedFile
27
28
 
28
29
  from enum import Enum
29
-
30
+ import pandas as pd
31
+ from pptx import Presentation
30
32
  class SplitType(Enum):
31
33
  RECURSIVE = "RecursiveCharacterTextSplitter"
32
34
  CHARACTER = "CharacterTextSplitter"
@@ -151,6 +153,35 @@ def load_file_by_type(ext, filepath):
151
153
  data = json.load(f)
152
154
  # Convert JSON to text (pretty print or flatten)
153
155
  text = json.dumps(data, ensure_ascii=False, indent=2)
156
+ # -------------------------
157
+ # PPTX (PowerPoint)
158
+ # -------------------------
159
+ elif ext in [".pptx", ".ppt"]:
160
+ pres = Presentation(filepath)
161
+ slides_text = []
162
+ for slide in pres.slides:
163
+ slide_text = []
164
+ for shape in slide.shapes:
165
+ if hasattr(shape, "text"):
166
+ slide_text.append(shape.text)
167
+ slides_text.append("\n".join(slide_text))
168
+ text = "\n\n---- Slide Break ----\n\n".join(slides_text)
169
+
170
+ # -------------------------
171
+ # Excel (XLS / XLSX)
172
+ # -------------------------
173
+ elif ext in [".xlsx", ".xls"]:
174
+ # Using pandas for convenience
175
+ try:
176
+ df_dict = pd.read_excel(filepath, sheet_name=None)
177
+ all_sheets = []
178
+ for sheet, df in df_dict.items():
179
+ sheet_text = f"=== Sheet: {sheet} ===\n"
180
+ sheet_text += df.to_string(index=False)
181
+ all_sheets.append(sheet_text)
182
+ text = "\n\n".join(all_sheets)
183
+ except Exception as e:
184
+ raise ValueError(f"Failed to read Excel file: {e}")
154
185
  else:
155
186
  raise ValueError(f"Unsupported file type: {ext}")
156
187
  return text
@@ -220,38 +251,63 @@ def split_text_by_config(text, splitter_config:SplitterConfig=None, binary_data=
220
251
  return [" ".join(words[i:i + splitter_config.chunk_size]) for i in
221
252
  range(0, len(words), splitter_config.chunk_size)]
222
253
 
254
+ import json
255
+ from langchain.schema import HumanMessage
256
+ import uuid
223
257
 
224
- def process_with_llm(chunk,instructions):
258
+ def process_with_llm(chunk, instructions=None, xclient=None):
225
259
  """
226
260
  Send a chunk to LLM and return structured JSON array.
227
261
  Expected format: [{"id": ..., "title": ..., "text": ...}, ...]
228
262
  """
229
- context = f"""
230
- Take the following text and split it into sections based on the most important category headings (ignore lower level headings).
231
- For each section, return a JSON object with - no extra words other than the json and remove ```json:
232
- - "id" (a UUID you generate),
233
- - "title" (the most important heading),
234
- - "text" (the remaining text under that heading).
263
+ instructions = instructions or "Extract sections"
264
+
265
+ # Combine chunk + instructions into one prompt
266
+ prompt_text = f"""
267
+ You are a helpful assistant that structures text into JSON sections.
268
+ Take the following text and split it into sections based on the most important category headings.
269
+ return a JSON array of objects with the following keys:
270
+ - "id" (a UUID you generate)
271
+ - "title" (the most important heading)
272
+ - "text" (the remaining text under that heading)
273
+ Return ONLY valid JSON, without extra text or backtick or markdown formatting.
235
274
 
236
275
  Text:
237
276
  {chunk}
277
+
278
+ Instructions: {instructions}
238
279
  """
239
- instructions=instructions or "Exract sections"
240
- system_prompt="You are a helpful assistant that structures text into JSON sections."
241
- # Create an instance of your LLM wrapper
242
- llm = LLMClientWrapper(client, model="gpt-4o-mini", temperature=0, system_prompt=system_prompt)
243
- response=llm.chat(context,instructions)
280
+
281
+ # Use provided client or create new wrapper
282
+ if xclient is None:
283
+ xclient = init_chat_model(
284
+ model="gpt-4o-mini",
285
+ model_provider="openai", # you can later swap to "anthropic", "google", etc.
286
+ api_key=os.getenv("OPENAI_API_KEY")
287
+ )
288
+
289
+
290
+ # Call the LLM
291
+ response_msg = xclient.predict_messages([HumanMessage(content=prompt_text)])
292
+ response_text = response_msg.content
244
293
 
245
294
  # Parse JSON safely
246
295
  try:
247
- structured_data = eval(response)
248
- except Exception:
249
- structured_data = []
296
+ structured_data = json.loads(response_text)
297
+ if isinstance(structured_data, str):
298
+ # Sometimes LLM returns a JSON string inside quotes
299
+ structured_data = json.loads(structured_data)
300
+ except json.JSONDecodeError as e:
301
+ print("LLM returned invalid JSON:", response_text)
302
+ raise e
303
+
304
+ for item in structured_data:
305
+ if isinstance(item, dict) and "id" not in item:
306
+ item["id"] = str(uuid.uuid4())
250
307
 
251
308
  return structured_data
252
309
 
253
-
254
- def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
310
+ def process_large_text(text, instructions,splitter_config:SplitterConfig=None,client=None):
255
311
  """Main function: split -> send to LLM -> collect results."""
256
312
  chunks = split_text_by_config(text, splitter_config=splitter_config)
257
313
  all_results = []
@@ -261,7 +317,7 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
261
317
  return chunks
262
318
  elif splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED:
263
319
  for chunk in chunks:
264
- structured = process_with_llm(chunk,instructions)
320
+ structured = process_with_llm(chunk,instructions,client)
265
321
  # Ensure UUIDs exist
266
322
  for obj in structured:
267
323
  if "id" not in obj:
@@ -274,9 +330,9 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
274
330
 
275
331
 
276
332
 
277
- def prepare_chunked_text(file_path,file_name,instructions,chunk_size=200,splitter_config:SplitterConfig=None):
333
+ def prepare_chunked_text(file_path,file_name,instructions,chunk_size=200,splitter_config:SplitterConfig=None,client=None):
278
334
  content =extract_content_agnostic(file_path,file_name)
279
- results=process_large_text(content,instructions, splitter_config=splitter_config)
335
+ results=process_large_text(content,instructions, splitter_config=splitter_config,client=client)
280
336
  print (results)
281
337
  return results
282
338
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.34
3
+ Version: 0.1.36
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
50
50
  Requires-Dist: cssselect2~=0.7.0
51
51
  Requires-Dist: cairocffi~=1.4.0
52
52
  Requires-Dist: tensorflow~=2.12.0
53
+ Requires-Dist: pandas~=2.2.2
54
+ Requires-Dist: openpyxl~=3.1.2
55
+ Requires-Dist: python-pptx~=0.6.21
53
56
  Dynamic: license-file
54
57
 
55
58
  # 📚 PreVectorChunks
@@ -36,3 +36,6 @@ lxml~=4.9.3
36
36
  cssselect2~=0.7.0
37
37
  cairocffi~=1.4.0
38
38
  tensorflow~=2.12.0
39
+ pandas~=2.2.2
40
+ openpyxl~=3.1.2
41
+ python-pptx~=0.6.21
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "prevectorchunks-core"
7
- version = "0.1.34"
7
+ version = "0.1.36"
8
8
  description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -52,6 +52,10 @@ dependencies = [
52
52
  "cssselect2~=0.7.0",
53
53
  "cairocffi~=1.4.0",
54
54
  "tensorflow~=2.12.0", # <-- Add this
55
+ # 👉 Add these
56
+ "pandas~=2.2.2",
57
+ "openpyxl~=3.1.2",
58
+ "python-pptx~=0.6.21",
55
59
  ]
56
60
 
57
61