prevectorchunks-core 0.1.34__tar.gz → 0.1.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {prevectorchunks_core-0.1.34/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.36}/PKG-INFO +4 -1
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/DocuToImageConverter.py +134 -1
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +18 -5
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +2 -2
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +15 -19
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/image_processor.py +17 -14
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/markdown_and_chunk_documents.py +123 -5
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/test_loader.py +26 -9
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/file_loader.py +77 -21
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36/prevectorchunks_core.egg-info}/PKG-INFO +4 -1
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/requires.txt +3 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/pyproject.toml +5 -1
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/LICENCE +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/LICENSE +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/README.md +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/config/splitter_config.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/os-llm/llava.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/SOURCES.txt +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/setup.cfg +0 -0
{prevectorchunks_core-0.1.34/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.36}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.34
|
|
3
|
+
Version: 0.1.36
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
|
|
|
50
50
|
Requires-Dist: cssselect2~=0.7.0
|
|
51
51
|
Requires-Dist: cairocffi~=1.4.0
|
|
52
52
|
Requires-Dist: tensorflow~=2.12.0
|
|
53
|
+
Requires-Dist: pandas~=2.2.2
|
|
54
|
+
Requires-Dist: openpyxl~=3.1.2
|
|
55
|
+
Requires-Dist: python-pptx~=0.6.21
|
|
53
56
|
Dynamic: license-file
|
|
54
57
|
|
|
55
58
|
# 📚 PreVectorChunks
|
|
@@ -164,9 +164,20 @@ class DocuToImageConverter:
|
|
|
164
164
|
|
|
165
165
|
# Word → PDF
|
|
166
166
|
if ext in [".doc", ".docx"]:
|
|
167
|
-
pdf_path = self._convert_doc_to_pdf(file_path)
|
|
167
|
+
pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
|
|
168
168
|
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
169
169
|
|
|
170
|
+
# PowerPoint → PDF
|
|
171
|
+
elif ext in [".ppt", ".pptx"]:
|
|
172
|
+
pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)
|
|
173
|
+
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
174
|
+
|
|
175
|
+
# Excel → PDF
|
|
176
|
+
elif ext in [".xls", ".xlsx"]:
|
|
177
|
+
pdf_path = self._convert_excel_to_pdf(file_path, input_bytes)
|
|
178
|
+
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
179
|
+
|
|
180
|
+
|
|
170
181
|
# PDF → images
|
|
171
182
|
elif ext == ".pdf":
|
|
172
183
|
images = self._convert_pdf_to_images(file_path, dpi=dpi)
|
|
@@ -183,3 +194,125 @@ class DocuToImageConverter:
|
|
|
183
194
|
raise ValueError("Unsupported file type.")
|
|
184
195
|
|
|
185
196
|
return images
|
|
197
|
+
|
|
198
|
+
def _convert_ppt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
|
|
199
|
+
"""
|
|
200
|
+
Convert PPT/PPTX to PDF using:
|
|
201
|
+
1. PowerPoint COM (Windows)
|
|
202
|
+
2. LibreOffice
|
|
203
|
+
"""
|
|
204
|
+
|
|
205
|
+
# write bytes if needed
|
|
206
|
+
if input_bytes is not None:
|
|
207
|
+
original_name = getattr(input_bytes, "name", "uploaded.pptx")
|
|
208
|
+
ext = os.path.splitext(original_name)[1] or ".pptx"
|
|
209
|
+
temp_input_path = tempfile.mktemp(suffix=ext)
|
|
210
|
+
|
|
211
|
+
if hasattr(input_bytes, "read"):
|
|
212
|
+
input_bytes.seek(0)
|
|
213
|
+
content = input_bytes.read()
|
|
214
|
+
else:
|
|
215
|
+
content = input_bytes
|
|
216
|
+
|
|
217
|
+
with open(temp_input_path, "wb") as f:
|
|
218
|
+
f.write(content)
|
|
219
|
+
|
|
220
|
+
input_path = temp_input_path
|
|
221
|
+
|
|
222
|
+
elif file_path:
|
|
223
|
+
input_path = file_path
|
|
224
|
+
|
|
225
|
+
else:
|
|
226
|
+
raise ValueError("Must supply either file_path or input_bytes")
|
|
227
|
+
|
|
228
|
+
output_dir = tempfile.mkdtemp()
|
|
229
|
+
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
230
|
+
|
|
231
|
+
# 1️⃣ Try PowerPoint COM on Windows
|
|
232
|
+
try:
|
|
233
|
+
import win32com.client
|
|
234
|
+
powerpoint = win32com.client.Dispatch("PowerPoint.Application")
|
|
235
|
+
powerpoint.Visible = 1
|
|
236
|
+
|
|
237
|
+
deck = powerpoint.Presentations.Open(str(Path(input_path).resolve()))
|
|
238
|
+
deck.SaveAs(str(Path(output_pdf).resolve()), 32) # 32 = PDF
|
|
239
|
+
deck.Close()
|
|
240
|
+
powerpoint.Quit()
|
|
241
|
+
|
|
242
|
+
return output_pdf
|
|
243
|
+
except Exception:
|
|
244
|
+
pass
|
|
245
|
+
|
|
246
|
+
# 2️⃣ Try LibreOffice
|
|
247
|
+
try:
|
|
248
|
+
subprocess.run(
|
|
249
|
+
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
250
|
+
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
251
|
+
)
|
|
252
|
+
return output_pdf
|
|
253
|
+
except Exception:
|
|
254
|
+
pass
|
|
255
|
+
|
|
256
|
+
raise ValueError("Unable to convert PPT/PPTX to PDF")
|
|
257
|
+
|
|
258
|
+
def _convert_excel_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
|
|
259
|
+
"""
|
|
260
|
+
Convert XLS/XLSX to PDF using:
|
|
261
|
+
1. Excel COM (Windows)
|
|
262
|
+
2. LibreOffice
|
|
263
|
+
"""
|
|
264
|
+
|
|
265
|
+
# write bytes if needed
|
|
266
|
+
if input_bytes is not None:
|
|
267
|
+
original_name = getattr(input_bytes, "name", "uploaded.xlsx")
|
|
268
|
+
ext = os.path.splitext(original_name)[1] or ".xlsx"
|
|
269
|
+
temp_input_path = tempfile.mktemp(suffix=ext)
|
|
270
|
+
|
|
271
|
+
if hasattr(input_bytes, "read"):
|
|
272
|
+
input_bytes.seek(0)
|
|
273
|
+
content = input_bytes.read()
|
|
274
|
+
else:
|
|
275
|
+
content = input_bytes
|
|
276
|
+
|
|
277
|
+
with open(temp_input_path, "wb") as f:
|
|
278
|
+
f.write(content)
|
|
279
|
+
|
|
280
|
+
input_path = temp_input_path
|
|
281
|
+
|
|
282
|
+
elif file_path:
|
|
283
|
+
input_path = file_path
|
|
284
|
+
|
|
285
|
+
else:
|
|
286
|
+
raise ValueError("Must supply either file_path or input_bytes")
|
|
287
|
+
|
|
288
|
+
output_dir = tempfile.mkdtemp()
|
|
289
|
+
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
290
|
+
|
|
291
|
+
# 1️⃣ Try Excel COM (Windows)
|
|
292
|
+
try:
|
|
293
|
+
import win32com.client
|
|
294
|
+
excel = win32com.client.Dispatch("Excel.Application")
|
|
295
|
+
excel.Visible = False
|
|
296
|
+
|
|
297
|
+
wb = excel.Workbooks.Open(str(Path(input_path).resolve()))
|
|
298
|
+
wb.ExportAsFixedFormat(0, str(Path(output_pdf).resolve())) # 0 = PDF
|
|
299
|
+
wb.Close()
|
|
300
|
+
excel.Quit()
|
|
301
|
+
|
|
302
|
+
return output_pdf
|
|
303
|
+
except Exception:
|
|
304
|
+
pass
|
|
305
|
+
|
|
306
|
+
# 2️⃣ Try LibreOffice
|
|
307
|
+
try:
|
|
308
|
+
subprocess.run(
|
|
309
|
+
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
310
|
+
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
311
|
+
)
|
|
312
|
+
return output_pdf
|
|
313
|
+
except Exception:
|
|
314
|
+
pass
|
|
315
|
+
|
|
316
|
+
raise ValueError("Unable to convert XLS/XLSX to PDF")
|
|
317
|
+
|
|
318
|
+
|
|
@@ -3,11 +3,13 @@ import os
|
|
|
3
3
|
import tempfile
|
|
4
4
|
import base64
|
|
5
5
|
|
|
6
|
+
from langchain.chat_models import init_chat_model
|
|
6
7
|
from openai import OpenAI
|
|
7
8
|
from PIL import Image
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
from dotenv import load_dotenv
|
|
12
|
+
from openai.types import ChatModel
|
|
11
13
|
|
|
12
14
|
from .image_processor import ImageProcessor
|
|
13
15
|
|
|
@@ -18,9 +20,19 @@ load_dotenv(override=True)
|
|
|
18
20
|
class DocuToMarkdownExtractor:
|
|
19
21
|
"""Sends image pages to an LLM and extracts Markdown text + tables."""
|
|
20
22
|
|
|
21
|
-
def __init__(self, api_key: str, model: str = "gpt-4o-mini"):
|
|
22
|
-
|
|
23
|
-
|
|
23
|
+
def __init__(self, api_key: str, model: str = "gpt-4o-mini",client:ChatModel=None):
|
|
24
|
+
if client is None:
|
|
25
|
+
client = init_chat_model(
|
|
26
|
+
model=model,
|
|
27
|
+
model_provider="openai", # you can later swap to "anthropic", "google", etc.
|
|
28
|
+
api_key=api_key
|
|
29
|
+
)
|
|
30
|
+
self.client = client
|
|
31
|
+
self.model = client.model_name
|
|
32
|
+
# Initialize ImageProcessor once and pass the chat model
|
|
33
|
+
self.processor = ImageProcessor(client=self.client)
|
|
34
|
+
|
|
35
|
+
|
|
24
36
|
|
|
25
37
|
def _image_to_base64(self, image: Image.Image) -> str:
|
|
26
38
|
"""Converts PIL image to base64-encoded PNG string."""
|
|
@@ -29,7 +41,7 @@ class DocuToMarkdownExtractor:
|
|
|
29
41
|
with open(tmp.name, "rb") as f:
|
|
30
42
|
return base64.b64encode(f.read()).decode("utf-8")
|
|
31
43
|
|
|
32
|
-
def extract_markdown(self, images,include_image:True):
|
|
44
|
+
def extract_markdown(self, images,include_image:bool=True):
|
|
33
45
|
"""Extracts Markdown-formatted text from each image page."""
|
|
34
46
|
all_outputs = []
|
|
35
47
|
text_content=""
|
|
@@ -59,7 +71,8 @@ class DocuToMarkdownExtractor:
|
|
|
59
71
|
try:
|
|
60
72
|
response = json.loads(response) # Convert JSON string to dictionary
|
|
61
73
|
except json.JSONDecodeError:
|
|
62
|
-
|
|
74
|
+
print('skipping quietly')
|
|
75
|
+
#raise ValueError("The response from 'processor.analyze' is not valid JSON.")
|
|
63
76
|
text_content=text_content+"\n"+response["markdown_text"]
|
|
64
77
|
if(include_image):
|
|
65
78
|
response["image_data"]=b64_image
|
|
@@ -392,8 +392,8 @@ def qfetch_records_grouped_by_document_name(index, batch_size=100,limit=100):
|
|
|
392
392
|
|
|
393
393
|
|
|
394
394
|
#function that chunks any document
|
|
395
|
-
def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
|
|
396
|
-
return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config)
|
|
395
|
+
def chunk_documents(instructions,file_name,file_path="content_playground/content.json",splitter_config=None,client=None):
|
|
396
|
+
return prepare_chunked_text(file_path, file_name,instructions,splitter_config=splitter_config,client=client)
|
|
397
397
|
|
|
398
398
|
#function that chunks any document as well as inserts into vdb
|
|
399
399
|
def chunk_and_upsert_to_vdb(index_n,instructions,file_name,file_path="content_playground/content.json",splitter_config=None):
|
|
@@ -2,39 +2,35 @@ import numpy as np
|
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
class ChunkMapper:
|
|
5
|
-
def __init__(self,
|
|
5
|
+
def __init__(self, embedding_client, markdown_output, embedding_model="text-embedding-3-small"):
|
|
6
6
|
"""
|
|
7
7
|
client: OpenAI client object
|
|
8
8
|
markdown_output: list of JSON objects containing at least 'markdown_text'
|
|
9
9
|
embedding_model: model for embeddings
|
|
10
10
|
"""
|
|
11
|
-
self.
|
|
11
|
+
self.embedding_client = embedding_client
|
|
12
12
|
self.markdown_output = markdown_output
|
|
13
13
|
self.embedding_model = embedding_model
|
|
14
14
|
|
|
15
15
|
# Precompute embeddings for markdown_output
|
|
16
16
|
self.markdown_embeddings = self._compute_markdown_embeddings()
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
18
|
+
# -----------------------------
|
|
19
|
+
# Compute embeddings for markdown JSON items
|
|
20
|
+
# -----------------------------
|
|
21
|
+
|
|
21
22
|
def _compute_markdown_embeddings(self):
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
23
|
+
texts = [obj.get("markdown_text", "") for obj in self.markdown_output]
|
|
24
|
+
return self.embedding_client.embed_documents(texts)
|
|
25
|
+
|
|
26
|
+
# -----------------------------
|
|
27
|
+
# Get embedding for a single text
|
|
28
|
+
# -----------------------------
|
|
28
29
|
|
|
29
|
-
# -----------------------------
|
|
30
|
-
# Embedding helper
|
|
31
|
-
# -----------------------------
|
|
32
30
|
def _get_embedding(self, text):
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
)
|
|
37
|
-
return response.data[0].embedding
|
|
31
|
+
# LangChain uses a list input
|
|
32
|
+
emb = self.embedding_client.embed_query(text)
|
|
33
|
+
return emb
|
|
38
34
|
|
|
39
35
|
# -----------------------------
|
|
40
36
|
# Cosine similarity
|
|
@@ -10,6 +10,8 @@ import requests
|
|
|
10
10
|
from dotenv import load_dotenv
|
|
11
11
|
from typing import Optional
|
|
12
12
|
|
|
13
|
+
from langchain.chat_models import init_chat_model
|
|
14
|
+
from langchain_core.messages import HumanMessage
|
|
13
15
|
from openai import OpenAI
|
|
14
16
|
from langchain_core.pydantic_v1 import BaseModel
|
|
15
17
|
|
|
@@ -31,15 +33,22 @@ class ImageProcessor:
|
|
|
31
33
|
Wrapper for a GPT-4o multimodal image reasoning pipeline.
|
|
32
34
|
"""
|
|
33
35
|
|
|
34
|
-
def __init__(self, model_name: str = "gpt-4o-mini"):
|
|
36
|
+
def __init__(self, api_key:str=None, model_name: str = "gpt-4o-mini",client=None):
|
|
35
37
|
load_dotenv(override=True)
|
|
36
38
|
self.api_key = os.getenv("OPENAI_API_KEY")
|
|
37
39
|
if not self.api_key:
|
|
38
40
|
raise ValueError("❌ OPENAI_API_KEY not found in .env or environment!")
|
|
39
41
|
|
|
42
|
+
if client is None:
|
|
43
|
+
client = init_chat_model(
|
|
44
|
+
model=model_name,
|
|
45
|
+
model_provider="openai", # you can later swap to "anthropic", "google", etc.
|
|
46
|
+
api_key=api_key
|
|
47
|
+
)
|
|
48
|
+
self.llm = client
|
|
40
49
|
# Initialize multimodal client
|
|
41
|
-
|
|
42
|
-
self.model_name = model_name
|
|
50
|
+
|
|
51
|
+
self.model_name = client.model_name
|
|
43
52
|
|
|
44
53
|
# -------------------------------------------------
|
|
45
54
|
# 3️⃣ Image encoding helper
|
|
@@ -70,17 +79,11 @@ class ImageProcessor:
|
|
|
70
79
|
},
|
|
71
80
|
]
|
|
72
81
|
content1.extend(finstructioncontent)
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
"content": content1
|
|
79
|
-
}
|
|
80
|
-
],
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
result_text = response.choices[0].message.content
|
|
82
|
+
# Call the LangChain model
|
|
83
|
+
response_msg = self.llm.predict_messages([HumanMessage(content=content1)])
|
|
84
|
+
|
|
85
|
+
# Extract the text
|
|
86
|
+
result_text = response_msg.content
|
|
84
87
|
print("✅ Analysis complete.")
|
|
85
88
|
print(result_text)
|
|
86
89
|
return result_text
|
|
@@ -7,6 +7,7 @@ from pathlib import Path
|
|
|
7
7
|
|
|
8
8
|
from docx import Document
|
|
9
9
|
from dotenv import load_dotenv
|
|
10
|
+
from langchain_openai import OpenAIEmbeddings
|
|
10
11
|
from openai import OpenAI
|
|
11
12
|
from PIL import Image
|
|
12
13
|
|
|
@@ -111,6 +112,34 @@ class ImageStrategy(BaseDocumentStrategy):
|
|
|
111
112
|
|
|
112
113
|
return [image]
|
|
113
114
|
|
|
115
|
+
class PowerPointStrategy(BaseDocumentStrategy):
|
|
116
|
+
def process(self, file_path: str, input_bytes: bytes = None, ext: str = None):
|
|
117
|
+
print(f"📊 Using PowerPointStrategy for {file_path or input_bytes}")
|
|
118
|
+
|
|
119
|
+
converter = DocuToImageConverter()
|
|
120
|
+
|
|
121
|
+
# Convert PPT/PPTX → PDF
|
|
122
|
+
pdf_path = converter._convert_ppt_to_pdf(file_path=file_path, input_bytes=input_bytes)
|
|
123
|
+
|
|
124
|
+
# Then convert PDF → images
|
|
125
|
+
images = converter.convert_to_images(pdf_path)
|
|
126
|
+
|
|
127
|
+
return images
|
|
128
|
+
|
|
129
|
+
class ExcelStrategy(BaseDocumentStrategy):
|
|
130
|
+
def process(self, file_path: str, input_bytes: bytes = None, ext: str = None):
|
|
131
|
+
print(f"📈 Using ExcelStrategy for {file_path or input_bytes}")
|
|
132
|
+
|
|
133
|
+
converter = DocuToImageConverter()
|
|
134
|
+
|
|
135
|
+
# Convert XLS/XLSX → PDF
|
|
136
|
+
pdf_path = converter._convert_excel_to_pdf(file_path=file_path, input_bytes=input_bytes)
|
|
137
|
+
|
|
138
|
+
# Convert PDF → images
|
|
139
|
+
images = converter.convert_to_images(pdf_path)
|
|
140
|
+
|
|
141
|
+
return images
|
|
142
|
+
|
|
114
143
|
|
|
115
144
|
# -----------------------------
|
|
116
145
|
# Strategy Factory
|
|
@@ -127,6 +156,17 @@ class StrategyFactory:
|
|
|
127
156
|
".png": ImageStrategy(),
|
|
128
157
|
".bmp": ImageStrategy(),
|
|
129
158
|
".tiff": ImageStrategy(),
|
|
159
|
+
|
|
160
|
+
# NEW — PowerPoint
|
|
161
|
+
".ppt": PowerPointStrategy(),
|
|
162
|
+
".pptx": PowerPointStrategy(),
|
|
163
|
+
|
|
164
|
+
# NEW — Excel
|
|
165
|
+
".xls": ExcelStrategy(),
|
|
166
|
+
".xlsx": ExcelStrategy(),
|
|
167
|
+
|
|
168
|
+
# NEW — Google Docs/Sheets
|
|
169
|
+
|
|
130
170
|
}
|
|
131
171
|
|
|
132
172
|
@classmethod
|
|
@@ -145,11 +185,12 @@ class StrategyFactory:
|
|
|
145
185
|
# Main Orchestrator
|
|
146
186
|
# -----------------------------
|
|
147
187
|
class MarkdownAndChunkDocuments:
|
|
148
|
-
def __init__(self):
|
|
188
|
+
def __init__(self,client):
|
|
149
189
|
self.api_key = os.getenv("OPENAI_API_KEY")
|
|
150
|
-
self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
|
|
190
|
+
self.extractor = DocuToMarkdownExtractor(api_key=self.api_key,client=client)
|
|
191
|
+
self.client=client
|
|
151
192
|
|
|
152
|
-
def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None):
|
|
193
|
+
def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None,embedding_client=None):
|
|
153
194
|
# Pick strategy
|
|
154
195
|
strategy = StrategyFactory.get_strategy(file_path,file_name)
|
|
155
196
|
if not strategy:
|
|
@@ -164,8 +205,14 @@ class MarkdownAndChunkDocuments:
|
|
|
164
205
|
binary_text_content = text_content.encode("utf-8")
|
|
165
206
|
|
|
166
207
|
# Chunking and mapping
|
|
167
|
-
chunk_client = OpenAI(api_key=self.api_key)
|
|
168
|
-
|
|
208
|
+
#chunk_client = OpenAI(api_key=self.api_key)
|
|
209
|
+
if embedding_client is None:
|
|
210
|
+
embedding_client = OpenAIEmbeddings(
|
|
211
|
+
model="text-embedding-3-small",
|
|
212
|
+
api_key=self.api_key
|
|
213
|
+
)
|
|
214
|
+
|
|
215
|
+
cm = ChunkMapper(embedding_client, markdown_output, embedding_model="text-embedding-3-small")
|
|
169
216
|
splitter_config = SplitterConfig(
|
|
170
217
|
chunk_size=300,
|
|
171
218
|
chunk_overlap=0,
|
|
@@ -191,6 +238,77 @@ class MarkdownAndChunkDocuments:
|
|
|
191
238
|
print("✅ Processing complete.")
|
|
192
239
|
return mapped_chunks
|
|
193
240
|
|
|
241
|
+
def markdown_and_chunk_documents_stream(
|
|
242
|
+
self,
|
|
243
|
+
file_path: str,
|
|
244
|
+
input_bytes: bytes = None,
|
|
245
|
+
include_image: bool = None,
|
|
246
|
+
file_name: str = None,
|
|
247
|
+
):
|
|
248
|
+
"""Generator version of markdown_and_chunk_documents that yields progress JSON events"""
|
|
249
|
+
|
|
250
|
+
def report(pct, msg=""):
|
|
251
|
+
yield {"progress": int(pct), "status": msg}
|
|
252
|
+
|
|
253
|
+
# 1️⃣ Pick strategy
|
|
254
|
+
yield from report(5, "Selecting strategy...")
|
|
255
|
+
strategy = StrategyFactory.get_strategy(file_path, file_name)
|
|
256
|
+
if not strategy:
|
|
257
|
+
raise ValueError(f"Unsupported file type: {file_path}")
|
|
258
|
+
|
|
259
|
+
# 2️⃣ Convert to images
|
|
260
|
+
ext = get_file_extension(file_path, file_name)
|
|
261
|
+
yield from report(15, "Processing file into images...")
|
|
262
|
+
images = strategy.process(file_path, input_bytes, ext)
|
|
263
|
+
|
|
264
|
+
# 3️⃣ Extract Markdown
|
|
265
|
+
yield from report(35, "Extracting markdown...")
|
|
266
|
+
markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
|
|
267
|
+
binary_text_content = text_content.encode("utf-8")
|
|
268
|
+
|
|
269
|
+
# 4️⃣ Chunking
|
|
270
|
+
yield from report(55, "Chunking text...")
|
|
271
|
+
chunk_client = OpenAI(api_key=self.api_key)
|
|
272
|
+
cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
|
|
273
|
+
|
|
274
|
+
splitter_config = SplitterConfig(
|
|
275
|
+
chunk_size=300,
|
|
276
|
+
chunk_overlap=0,
|
|
277
|
+
separators=["\n"],
|
|
278
|
+
split_type=SplitType.R_PRETRAINED_PROPOSITION.value,
|
|
279
|
+
min_rl_chunk_size=5,
|
|
280
|
+
max_rl_chunk_size=50,
|
|
281
|
+
enableLLMTouchUp=False,
|
|
282
|
+
)
|
|
283
|
+
|
|
284
|
+
chunked_text = chunk_documents(
|
|
285
|
+
"", file_name="install_ins.txt", file_path=binary_text_content, splitter_config=splitter_config
|
|
286
|
+
)
|
|
287
|
+
flat_chunks = ["".join(inner) for inner in chunked_text]
|
|
288
|
+
|
|
289
|
+
# 5️⃣ Map chunks (embedding)
|
|
290
|
+
yield from report(60, f"Mapping {len(flat_chunks)} chunks...")
|
|
291
|
+
total = len(flat_chunks)
|
|
292
|
+
mapped_chunks = []
|
|
293
|
+
for i, chunk in enumerate(flat_chunks, start=1):
|
|
294
|
+
mapped = cm.map_chunks([chunk])
|
|
295
|
+
mapped_chunks.extend(mapped)
|
|
296
|
+
progress = 60 + (i / total) * 30
|
|
297
|
+
yield from report(progress, f"Mapping chunk {i}/{total}")
|
|
298
|
+
|
|
299
|
+
# 6️⃣ Merge unmapped markdown sections
|
|
300
|
+
yield from report(95, "Merging markdown...")
|
|
301
|
+
for md_item in markdown_output:
|
|
302
|
+
if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
|
|
303
|
+
md_item["chunked_text"] = md_item["markdown_text"]
|
|
304
|
+
mapped_chunks.append(md_item)
|
|
305
|
+
|
|
306
|
+
adduuid(mapped_chunks)
|
|
307
|
+
yield from report(100, "✅ Processing complete.")
|
|
308
|
+
|
|
309
|
+
# Final result
|
|
310
|
+
yield {"progress": 100, "status": "done", "result": mapped_chunks}
|
|
311
|
+
|
|
194
312
|
def adduuid(mapped_chunks):
|
|
195
313
|
# Assuming mapped_chunks is a list of dictionaries
|
|
196
314
|
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/test_loader.py
RENAMED
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import pytest
|
|
3
|
+
from dotenv import load_dotenv
|
|
4
|
+
from langchain.chat_models import init_chat_model
|
|
5
|
+
from langchain_openai import OpenAIEmbeddings
|
|
3
6
|
|
|
4
7
|
from core.prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
|
|
5
8
|
from core.prevectorchunks_core.services import chunk_documents_crud_vdb
|
|
6
9
|
from core.prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
|
|
7
10
|
from core.prevectorchunks_core.utils.file_loader import SplitType
|
|
8
|
-
|
|
9
|
-
|
|
11
|
+
import os
|
|
12
|
+
load_dotenv(override=True)
|
|
10
13
|
# Create a temporary JSON file to test with
|
|
11
14
|
@pytest.fixture
|
|
12
15
|
def temp_json_file(tmp_path):
|
|
@@ -19,12 +22,16 @@ def temp_json_file(tmp_path):
|
|
|
19
22
|
|
|
20
23
|
def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
|
|
21
24
|
splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
|
|
22
|
-
split_type=SplitType.
|
|
23
|
-
max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.
|
|
25
|
+
split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
|
|
26
|
+
max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED)
|
|
27
|
+
client = init_chat_model(
|
|
28
|
+
model="gpt-4o-mini",
|
|
29
|
+
model_provider="openai", # you can later swap to "anthropic", "google", etc.
|
|
30
|
+
api_key=os.getenv("OPENAI_API_KEY")
|
|
31
|
+
)
|
|
32
|
+
chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="C:\\test-sandbox\\be\\PreVectorDeps\\PreVectorChunks\\core\\prevectorchunks_core\\services\\content.pptx",
|
|
24
33
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
splitter_config=splitter_config)
|
|
34
|
+
splitter_config=splitter_config,client=client)
|
|
28
35
|
|
|
29
36
|
print(chunks)
|
|
30
37
|
for i, c in enumerate(chunks):
|
|
@@ -32,9 +39,19 @@ def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
|
|
|
32
39
|
print(chunks)
|
|
33
40
|
|
|
34
41
|
def test_markdown(temp_json_file):
|
|
35
|
-
|
|
42
|
+
|
|
43
|
+
client = init_chat_model(
|
|
44
|
+
model="gpt-4o-mini",
|
|
45
|
+
model_provider="openai", # you can later swap to "anthropic", "google", etc.
|
|
46
|
+
api_key=os.getenv("OPENAI_API_KEY")
|
|
47
|
+
)
|
|
48
|
+
markdown_and_chunk_documents = MarkdownAndChunkDocuments(client)
|
|
49
|
+
embedding_client = OpenAIEmbeddings(
|
|
50
|
+
model="text-embedding-3-small",
|
|
51
|
+
api_key=os.getenv("OPENAI_API_KEY")
|
|
52
|
+
)
|
|
36
53
|
mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
|
|
37
|
-
"content.
|
|
54
|
+
"C:\\test-sandbox\\be\\PreVectorDeps\\PreVectorChunks\\core\\prevectorchunks_core\\services\\content.pptx",include_image=True,embedding_client=embedding_client)
|
|
38
55
|
print(mapped_chunks)
|
|
39
56
|
for i, c in enumerate(mapped_chunks):
|
|
40
57
|
print(f"Chunk {i + 1}: {c}")
|
|
@@ -9,6 +9,7 @@ from PIL import Image
|
|
|
9
9
|
import pytesseract
|
|
10
10
|
import uuid
|
|
11
11
|
|
|
12
|
+
from langchain.chat_models import init_chat_model
|
|
12
13
|
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
|
|
13
14
|
from openai import OpenAI
|
|
14
15
|
from openai import OpenAI
|
|
@@ -26,7 +27,8 @@ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
|
26
27
|
from django.core.files.uploadedfile import UploadedFile
|
|
27
28
|
|
|
28
29
|
from enum import Enum
|
|
29
|
-
|
|
30
|
+
import pandas as pd
|
|
31
|
+
from pptx import Presentation
|
|
30
32
|
class SplitType(Enum):
|
|
31
33
|
RECURSIVE = "RecursiveCharacterTextSplitter"
|
|
32
34
|
CHARACTER = "CharacterTextSplitter"
|
|
@@ -151,6 +153,35 @@ def load_file_by_type(ext, filepath):
|
|
|
151
153
|
data = json.load(f)
|
|
152
154
|
# Convert JSON to text (pretty print or flatten)
|
|
153
155
|
text = json.dumps(data, ensure_ascii=False, indent=2)
|
|
156
|
+
# -------------------------
|
|
157
|
+
# PPTX (PowerPoint)
|
|
158
|
+
# -------------------------
|
|
159
|
+
elif ext in [".pptx", ".ppt"]:
|
|
160
|
+
pres = Presentation(filepath)
|
|
161
|
+
slides_text = []
|
|
162
|
+
for slide in pres.slides:
|
|
163
|
+
slide_text = []
|
|
164
|
+
for shape in slide.shapes:
|
|
165
|
+
if hasattr(shape, "text"):
|
|
166
|
+
slide_text.append(shape.text)
|
|
167
|
+
slides_text.append("\n".join(slide_text))
|
|
168
|
+
text = "\n\n---- Slide Break ----\n\n".join(slides_text)
|
|
169
|
+
|
|
170
|
+
# -------------------------
|
|
171
|
+
# Excel (XLS / XLSX)
|
|
172
|
+
# -------------------------
|
|
173
|
+
elif ext in [".xlsx", ".xls"]:
|
|
174
|
+
# Using pandas for convenience
|
|
175
|
+
try:
|
|
176
|
+
df_dict = pd.read_excel(filepath, sheet_name=None)
|
|
177
|
+
all_sheets = []
|
|
178
|
+
for sheet, df in df_dict.items():
|
|
179
|
+
sheet_text = f"=== Sheet: {sheet} ===\n"
|
|
180
|
+
sheet_text += df.to_string(index=False)
|
|
181
|
+
all_sheets.append(sheet_text)
|
|
182
|
+
text = "\n\n".join(all_sheets)
|
|
183
|
+
except Exception as e:
|
|
184
|
+
raise ValueError(f"Failed to read Excel file: {e}")
|
|
154
185
|
else:
|
|
155
186
|
raise ValueError(f"Unsupported file type: {ext}")
|
|
156
187
|
return text
|
|
@@ -220,38 +251,63 @@ def split_text_by_config(text, splitter_config:SplitterConfig=None, binary_data=
|
|
|
220
251
|
return [" ".join(words[i:i + splitter_config.chunk_size]) for i in
|
|
221
252
|
range(0, len(words), splitter_config.chunk_size)]
|
|
222
253
|
|
|
254
|
+
import json
|
|
255
|
+
from langchain.schema import HumanMessage
|
|
256
|
+
import uuid
|
|
223
257
|
|
|
224
|
-
def process_with_llm(chunk,instructions):
|
|
258
|
+
def process_with_llm(chunk, instructions=None, xclient=None):
|
|
225
259
|
"""
|
|
226
260
|
Send a chunk to LLM and return structured JSON array.
|
|
227
261
|
Expected format: [{"id": ..., "title": ..., "text": ...}, ...]
|
|
228
262
|
"""
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
263
|
+
instructions = instructions or "Extract sections"
|
|
264
|
+
|
|
265
|
+
# Combine chunk + instructions into one prompt
|
|
266
|
+
prompt_text = f"""
|
|
267
|
+
You are a helpful assistant that structures text into JSON sections.
|
|
268
|
+
Take the following text and split it into sections based on the most important category headings.
|
|
269
|
+
return a JSON array of objects with the following keys:
|
|
270
|
+
- "id" (a UUID you generate)
|
|
271
|
+
- "title" (the most important heading)
|
|
272
|
+
- "text" (the remaining text under that heading)
|
|
273
|
+
Return ONLY valid JSON, without extra text or backtick or markdown formatting.
|
|
235
274
|
|
|
236
275
|
Text:
|
|
237
276
|
{chunk}
|
|
277
|
+
|
|
278
|
+
Instructions: {instructions}
|
|
238
279
|
"""
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
280
|
+
|
|
281
|
+
# Use provided client or create new wrapper
|
|
282
|
+
if xclient is None:
|
|
283
|
+
xclient = init_chat_model(
|
|
284
|
+
model="gpt-4o-mini",
|
|
285
|
+
model_provider="openai", # you can later swap to "anthropic", "google", etc.
|
|
286
|
+
api_key=os.getenv("OPENAI_API_KEY")
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
# Call the LLM
|
|
291
|
+
response_msg = xclient.predict_messages([HumanMessage(content=prompt_text)])
|
|
292
|
+
response_text = response_msg.content
|
|
244
293
|
|
|
245
294
|
# Parse JSON safely
|
|
246
295
|
try:
|
|
247
|
-
structured_data =
|
|
248
|
-
|
|
249
|
-
|
|
296
|
+
structured_data = json.loads(response_text)
|
|
297
|
+
if isinstance(structured_data, str):
|
|
298
|
+
# Sometimes LLM returns a JSON string inside quotes
|
|
299
|
+
structured_data = json.loads(structured_data)
|
|
300
|
+
except json.JSONDecodeError as e:
|
|
301
|
+
print("LLM returned invalid JSON:", response_text)
|
|
302
|
+
raise e
|
|
303
|
+
|
|
304
|
+
for item in structured_data:
|
|
305
|
+
if isinstance(item, dict) and "id" not in item:
|
|
306
|
+
item["id"] = str(uuid.uuid4())
|
|
250
307
|
|
|
251
308
|
return structured_data
|
|
252
309
|
|
|
253
|
-
|
|
254
|
-
def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
|
|
310
|
+
def process_large_text(text, instructions,splitter_config:SplitterConfig=None,client=None):
|
|
255
311
|
"""Main function: split -> send to LLM -> collect results."""
|
|
256
312
|
chunks = split_text_by_config(text, splitter_config=splitter_config)
|
|
257
313
|
all_results = []
|
|
@@ -261,7 +317,7 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
|
|
|
261
317
|
return chunks
|
|
262
318
|
elif splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED:
|
|
263
319
|
for chunk in chunks:
|
|
264
|
-
structured = process_with_llm(chunk,instructions)
|
|
320
|
+
structured = process_with_llm(chunk,instructions,client)
|
|
265
321
|
# Ensure UUIDs exist
|
|
266
322
|
for obj in structured:
|
|
267
323
|
if "id" not in obj:
|
|
@@ -274,9 +330,9 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
|
|
|
274
330
|
|
|
275
331
|
|
|
276
332
|
|
|
277
|
-
def prepare_chunked_text(file_path,file_name,instructions,chunk_size=200,splitter_config:SplitterConfig=None):
|
|
333
|
+
def prepare_chunked_text(file_path,file_name,instructions,chunk_size=200,splitter_config:SplitterConfig=None,client=None):
|
|
278
334
|
content =extract_content_agnostic(file_path,file_name)
|
|
279
|
-
results=process_large_text(content,instructions, splitter_config=splitter_config)
|
|
335
|
+
results=process_large_text(content,instructions, splitter_config=splitter_config,client=client)
|
|
280
336
|
print (results)
|
|
281
337
|
return results
|
|
282
338
|
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36/prevectorchunks_core.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.34
|
|
3
|
+
Version: 0.1.36
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -50,6 +50,9 @@ Requires-Dist: lxml~=4.9.3
|
|
|
50
50
|
Requires-Dist: cssselect2~=0.7.0
|
|
51
51
|
Requires-Dist: cairocffi~=1.4.0
|
|
52
52
|
Requires-Dist: tensorflow~=2.12.0
|
|
53
|
+
Requires-Dist: pandas~=2.2.2
|
|
54
|
+
Requires-Dist: openpyxl~=3.1.2
|
|
55
|
+
Requires-Dist: python-pptx~=0.6.21
|
|
53
56
|
Dynamic: license-file
|
|
54
57
|
|
|
55
58
|
# 📚 PreVectorChunks
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "prevectorchunks-core"
|
|
7
|
-
version = "0.1.34"
|
|
7
|
+
version = "0.1.36"
|
|
8
8
|
description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -52,6 +52,10 @@ dependencies = [
|
|
|
52
52
|
"cssselect2~=0.7.0",
|
|
53
53
|
"cairocffi~=1.4.0",
|
|
54
54
|
"tensorflow~=2.12.0", # <-- Add this
|
|
55
|
+
# 👉 Add these
|
|
56
|
+
"pandas~=2.2.2",
|
|
57
|
+
"openpyxl~=3.1.2",
|
|
58
|
+
"python-pptx~=0.6.21",
|
|
55
59
|
]
|
|
56
60
|
|
|
57
61
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/config/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/os-llm/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/os-llm/llava.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/env.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/model.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/reward.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/rlchunker/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/tests/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/tests/test_local.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.34 → prevectorchunks_core-0.1.36}/prevectorchunks_core/utils/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|