prevectorchunks-core 0.1.38__tar.gz → 0.1.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {prevectorchunks_core-0.1.38/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.40}/PKG-INFO +15 -26
- prevectorchunks_core-0.1.40/prevectorchunks_core/os-llm/dsqwen.py +24 -0
- prevectorchunks_core-0.1.40/prevectorchunks_core/os-llm/llava.py +29 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/DocuToImageConverter.py +92 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +79 -1
- prevectorchunks_core-0.1.40/prevectorchunks_core/services/EmbeddedImageExtractor.py +47 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/markdown_and_chunk_documents.py +53 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40/prevectorchunks_core.egg-info}/PKG-INFO +15 -26
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core.egg-info/SOURCES.txt +4 -0
- prevectorchunks_core-0.1.40/prevectorchunks_core.egg-info/requires.txt +30 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/pyproject.toml +15 -27
- prevectorchunks_core-0.1.38/prevectorchunks_core/os-llm/llava.py +0 -15
- prevectorchunks_core-0.1.38/prevectorchunks_core.egg-info/requires.txt +0 -41
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/LICENCE +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/LICENSE +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/README.md +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/config/splitter_config.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/image_processor.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/test_loader.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/utils/file_loader.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/setup.cfg +0 -0
{prevectorchunks_core-0.1.38/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.40}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.38
|
|
3
|
+
Version: 0.1.40
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -12,47 +12,36 @@ Requires-Python: <3.12,>=3.7
|
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENCE
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist:
|
|
16
|
-
Requires-Dist:
|
|
17
|
-
Requires-Dist: python-dotenv~=1.0.1
|
|
18
|
-
Requires-Dist: PyJWT~=2.7.0
|
|
15
|
+
Requires-Dist: Django==5.1
|
|
16
|
+
Requires-Dist: django-cors-headers~=4.4.0
|
|
19
17
|
Requires-Dist: fastapi~=0.112.2
|
|
20
|
-
Requires-Dist:
|
|
18
|
+
Requires-Dist: PyJWT~=2.7.0
|
|
19
|
+
Requires-Dist: langchain-text-splitters~=0.3.11
|
|
20
|
+
Requires-Dist: openai~=2.6.0
|
|
21
21
|
Requires-Dist: pinecone~=7.3.0
|
|
22
|
+
Requires-Dist: python-dotenv~=1.0.1
|
|
22
23
|
Requires-Dist: pytesseract~=0.3.13
|
|
23
24
|
Requires-Dist: python-docx~=1.2.0
|
|
24
25
|
Requires-Dist: PyPDF2~=3.0.1
|
|
25
26
|
Requires-Dist: pillow~=11.3.0
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: datasets~=4.1.1
|
|
28
|
+
Requires-Dist: torch~=2.6.0
|
|
29
|
+
Requires-Dist: torchvision~=0.21.0
|
|
28
30
|
Requires-Dist: torchaudio~=2.6.0
|
|
29
31
|
Requires-Dist: sentence-transformers~=5.1.1
|
|
30
|
-
Requires-Dist: py-gutenberg~=1.0.3
|
|
31
|
-
Requires-Dist: langchain-text-splitters~=0.3.11
|
|
32
|
-
Requires-Dist: langchain~=0.3
|
|
33
|
-
Requires-Dist: langchain_openai~=0.3.35
|
|
34
|
-
Requires-Dist: accelerate>=0.22.0
|
|
35
32
|
Requires-Dist: pathlib~=1.0.1
|
|
36
33
|
Requires-Dist: transformers~=4.57.0
|
|
37
34
|
Requires-Dist: imageio-ffmpeg~=0.6.0
|
|
38
|
-
Requires-Dist: opencv-python~=4.
|
|
35
|
+
Requires-Dist: opencv-python~=4.12.0.88
|
|
39
36
|
Requires-Dist: requests~=2.32.5
|
|
37
|
+
Requires-Dist: langchain~=0.3.27
|
|
40
38
|
Requires-Dist: langchain-core~=0.3.78
|
|
39
|
+
Requires-Dist: langchain-openai~=0.3.35
|
|
41
40
|
Requires-Dist: pdf2image~=1.17.0
|
|
42
41
|
Requires-Dist: docx2pdf~=0.1.8
|
|
43
|
-
Requires-Dist: numpy~=
|
|
42
|
+
Requires-Dist: numpy~=2.2.6
|
|
44
43
|
Requires-Dist: scikit-learn~=1.7.2
|
|
45
|
-
Requires-Dist:
|
|
46
|
-
Requires-Dist: pypandoc~=1.13
|
|
47
|
-
Requires-Dist: reportlab~=4.1.0
|
|
48
|
-
Requires-Dist: weasyprint~=62.0
|
|
49
|
-
Requires-Dist: lxml~=4.9.3
|
|
50
|
-
Requires-Dist: cssselect2~=0.7.0
|
|
51
|
-
Requires-Dist: cairocffi~=1.4.0
|
|
52
|
-
Requires-Dist: tensorflow~=2.12.0
|
|
53
|
-
Requires-Dist: pandas~=2.2.2
|
|
54
|
-
Requires-Dist: openpyxl~=3.1.2
|
|
55
|
-
Requires-Dist: python-pptx~=0.6.21
|
|
44
|
+
Requires-Dist: fitz~=0.0.1.dev2
|
|
56
45
|
Dynamic: license-file
|
|
57
46
|
|
|
58
47
|
# 📚 PreVectorChunks
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from transformers import pipeline
|
|
2
|
+
|
|
3
|
+
# ----- Step 1: Load the model using a text-generation pipeline -----
|
|
4
|
+
# DeepSeek-R1-Distill-Qwen-1.5B is a text-only model
|
|
5
|
+
pipe = pipeline(
|
|
6
|
+
"text-generation",
|
|
7
|
+
model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
|
|
8
|
+
device=0 # set to -1 for CPU, or 0 for GPU if available
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
# ----- Step 2: Define your prompt -----
|
|
12
|
+
prompt = "Hello, are you Jon?"
|
|
13
|
+
|
|
14
|
+
# ----- Step 3: Run inference -----
|
|
15
|
+
out = pipe(
|
|
16
|
+
prompt,
|
|
17
|
+
max_new_tokens=50, # controls length of generated output
|
|
18
|
+
do_sample=True, # optional: random sampling for variation
|
|
19
|
+
temperature=0.7 # optional: controls creativity
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# ----- Step 4: Print output -----
|
|
23
|
+
# `out` is a list of dicts, each dict has 'generated_text'
|
|
24
|
+
print("Model response:", out[0]['generated_text'])
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from transformers import pipeline
|
|
2
|
+
|
|
3
|
+
# #run locally
|
|
4
|
+
# pipe = pipeline("image-text-to-text",
|
|
5
|
+
# model="llava-hf/llava-1.5-13b-hf",
|
|
6
|
+
# device_map="auto",
|
|
7
|
+
# load_in_4bit=True)
|
|
8
|
+
|
|
9
|
+
pipe = pipeline(
|
|
10
|
+
"image-text-to-text",
|
|
11
|
+
model="llava-hf/llava-1.5-13b-hf",
|
|
12
|
+
device=-1, # CPU
|
|
13
|
+
load_in_4bit=True, # load model in 4-bit precision
|
|
14
|
+
use_auth_token=True
|
|
15
|
+
)
|
|
16
|
+
messages = [
|
|
17
|
+
{
|
|
18
|
+
"role": "user",
|
|
19
|
+
"content": [
|
|
20
|
+
|
|
21
|
+
{"type": "text", "text": "You are a content moderator - can you check if the content contains any personal information such as name, phone number, email etc"
|
|
22
|
+
"if the content contains personal information, return json failed"
|
|
23
|
+
"Here is the content : We need an electrician please contact John Doe on 0434343434"},
|
|
24
|
+
],
|
|
25
|
+
},
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
out = pipe(text=messages, max_new_tokens=20)
|
|
29
|
+
print(out)
|
|
@@ -137,6 +137,89 @@ class DocuToImageConverter:
|
|
|
137
137
|
pdf_document.close()
|
|
138
138
|
return images
|
|
139
139
|
|
|
140
|
+
def _convert_txt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
|
|
141
|
+
"""
|
|
142
|
+
Convert a .txt file or text bytes to PDF using ReportLab.
|
|
143
|
+
"""
|
|
144
|
+
# Read text
|
|
145
|
+
if input_bytes is not None:
|
|
146
|
+
if hasattr(input_bytes, "read"):
|
|
147
|
+
input_bytes.seek(0)
|
|
148
|
+
text = input_bytes.read().decode("utf-8")
|
|
149
|
+
else:
|
|
150
|
+
text = input_bytes.decode("utf-8")
|
|
151
|
+
elif file_path:
|
|
152
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
153
|
+
text = f.read()
|
|
154
|
+
else:
|
|
155
|
+
raise ValueError("Must supply either file_path or input_bytes")
|
|
156
|
+
|
|
157
|
+
# Prepare output PDF
|
|
158
|
+
output_dir = tempfile.mkdtemp()
|
|
159
|
+
output_pdf = os.path.join(output_dir, (Path(file_path).stem if file_path else "temp") + ".pdf")
|
|
160
|
+
|
|
161
|
+
# Write text to PDF
|
|
162
|
+
c = canvas.Canvas(output_pdf, pagesize=A4)
|
|
163
|
+
width, height = A4
|
|
164
|
+
y = height - 50
|
|
165
|
+
|
|
166
|
+
for line in text.splitlines():
|
|
167
|
+
c.drawString(50, y, line[:1000]) # truncate very long lines
|
|
168
|
+
y -= 15
|
|
169
|
+
if y < 50:
|
|
170
|
+
c.showPage()
|
|
171
|
+
y = height - 50
|
|
172
|
+
|
|
173
|
+
c.save()
|
|
174
|
+
return output_pdf
|
|
175
|
+
|
|
176
|
+
def _convert_csv_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
|
|
177
|
+
"""
|
|
178
|
+
Convert a CSV file or CSV bytes to PDF using ReportLab tables.
|
|
179
|
+
"""
|
|
180
|
+
# Read CSV data
|
|
181
|
+
rows = []
|
|
182
|
+
if input_bytes is not None:
|
|
183
|
+
if hasattr(input_bytes, "read"):
|
|
184
|
+
input_bytes.seek(0)
|
|
185
|
+
reader = csv.reader(io.StringIO(input_bytes.read().decode("utf-8")))
|
|
186
|
+
else:
|
|
187
|
+
reader = csv.reader(io.StringIO(input_bytes.decode("utf-8")))
|
|
188
|
+
rows = list(reader)
|
|
189
|
+
elif file_path:
|
|
190
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
|
191
|
+
reader = csv.reader(f)
|
|
192
|
+
rows = list(reader)
|
|
193
|
+
else:
|
|
194
|
+
raise ValueError("Must supply either file_path or input_bytes")
|
|
195
|
+
|
|
196
|
+
if not rows:
|
|
197
|
+
raise ValueError("CSV is empty")
|
|
198
|
+
|
|
199
|
+
# Prepare output PDF
|
|
200
|
+
output_dir = tempfile.mkdtemp()
|
|
201
|
+
output_pdf = os.path.join(output_dir, (Path(file_path).stem if file_path else "temp") + ".pdf")
|
|
202
|
+
|
|
203
|
+
# Create a table PDF
|
|
204
|
+
doc = SimpleDocTemplate(output_pdf, pagesize=A4)
|
|
205
|
+
table = Table(rows, repeatRows=1)
|
|
206
|
+
|
|
207
|
+
# Style table
|
|
208
|
+
style = TableStyle([
|
|
209
|
+
('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
|
|
210
|
+
('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
|
|
211
|
+
('ALIGN', (0, 0), (-1, -1), 'LEFT'),
|
|
212
|
+
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
|
|
213
|
+
('FONTSIZE', (0, 0), (-1, -1), 10),
|
|
214
|
+
('BOTTOMPADDING', (0, 0), (-1, 0), 6),
|
|
215
|
+
('GRID', (0, 0), (-1, -1), 0.5, colors.grey)
|
|
216
|
+
])
|
|
217
|
+
table.setStyle(style)
|
|
218
|
+
|
|
219
|
+
elements = [table]
|
|
220
|
+
doc.build(elements)
|
|
221
|
+
|
|
222
|
+
return output_pdf
|
|
140
223
|
def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
|
|
141
224
|
"""
|
|
142
225
|
Convert a file path or binary content to PIL images.
|
|
@@ -167,6 +250,15 @@ class DocuToImageConverter:
|
|
|
167
250
|
pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
|
|
168
251
|
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
169
252
|
|
|
253
|
+
# TXT → PDF
|
|
254
|
+
elif ext == ".txt":
|
|
255
|
+
pdf_path = self._convert_txt_to_pdf(file_path, input_bytes)
|
|
256
|
+
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
257
|
+
# CSV → PDF
|
|
258
|
+
elif ext == ".csv":
|
|
259
|
+
pdf_path = self._convert_csv_to_pdf(file_path, input_bytes)
|
|
260
|
+
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
261
|
+
|
|
170
262
|
# PowerPoint → PDF
|
|
171
263
|
elif ext in [".ppt", ".pptx"]:
|
|
172
264
|
pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)
|
|
@@ -41,6 +41,84 @@ class DocuToMarkdownExtractor:
|
|
|
41
41
|
with open(tmp.name, "rb") as f:
|
|
42
42
|
return base64.b64encode(f.read()).decode("utf-8")
|
|
43
43
|
|
|
44
|
+
def extract_markdown_pages(self, pages, include_image: bool = True):
|
|
45
|
+
"""Extracts Markdown from page images and integrates embedded image data."""
|
|
46
|
+
all_outputs = []
|
|
47
|
+
text_content = ""
|
|
48
|
+
|
|
49
|
+
for i, page in enumerate(pages, start=1):
|
|
50
|
+
print(f"🧠 Processing page {i}/{len(pages)}...")
|
|
51
|
+
|
|
52
|
+
# 1. Get the primary page image for OCR/Parsing
|
|
53
|
+
# If 'page' is a dict, we use the 'rendered_image' key
|
|
54
|
+
rendered_image = page.get("page_image")
|
|
55
|
+
b64_page_image = self._image_to_base64(rendered_image)
|
|
56
|
+
|
|
57
|
+
processor = ImageProcessor(model_name="gpt-4o-mini")
|
|
58
|
+
|
|
59
|
+
fins = [
|
|
60
|
+
{"type": "text", "text": (
|
|
61
|
+
"You are a document parser. Extract all text, images and tables "
|
|
62
|
+
"from this image and format the output in clean Markdown. "
|
|
63
|
+
"Preserve table structure, headings, and lists. If no markdown, put a space. "
|
|
64
|
+
"Describe any visual elements or images found on this page."
|
|
65
|
+
"Return only VALID JSON with keys: markdown_text, short_title, page_number, summary."
|
|
66
|
+
)},
|
|
67
|
+
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
response = processor.analyze(encoded_image=b64_page_image, finstructioncontent=fins)
|
|
71
|
+
|
|
72
|
+
if isinstance(response, str):
|
|
73
|
+
try:
|
|
74
|
+
response = json.loads(response)
|
|
75
|
+
except json.JSONDecodeError:
|
|
76
|
+
print('skipping quietly')
|
|
77
|
+
continue
|
|
78
|
+
|
|
79
|
+
# 2. Integrate text content
|
|
80
|
+
text_content += "\n" + response.get("markdown_text", "")
|
|
81
|
+
|
|
82
|
+
# 3. Handle Embedded Images for this page
|
|
83
|
+
# We attach the high-res embedded images found in the DOCX to the page response
|
|
84
|
+
if include_image:
|
|
85
|
+
response["page_image_data"] = b64_page_image
|
|
86
|
+
# Map the specific embedded images extracted from the DOCX for this page
|
|
87
|
+
|
|
88
|
+
response["embedded_images"] = self.conv_to_base64(page,response)
|
|
89
|
+
|
|
90
|
+
response["image_index"] = i
|
|
91
|
+
response["page_number"] = i
|
|
92
|
+
|
|
93
|
+
all_outputs.append(response)
|
|
94
|
+
|
|
95
|
+
return all_outputs, text_content
|
|
96
|
+
|
|
97
|
+
def conv_to_base64(self, page,response):
|
|
98
|
+
|
|
99
|
+
# Get the raw list of embedded images (which likely contains binary blobs)
|
|
100
|
+
raw_embedded_list = page.get("embedded_images", [])
|
|
101
|
+
encoded_images = []
|
|
102
|
+
|
|
103
|
+
for img in raw_embedded_list:
|
|
104
|
+
# Check if we have image_bytes or a blob
|
|
105
|
+
# The source data from your extractor typically provides 'image_bytes' or 'blob'
|
|
106
|
+
image_bytes = img.get("image_bytes") or img.get("image_data")
|
|
107
|
+
|
|
108
|
+
if isinstance(image_bytes, bytes):
|
|
109
|
+
# Encode to base64 and decode to utf-8 string for JSON compatibility
|
|
110
|
+
b64_string = base64.b64encode(image_bytes).decode("utf-8")
|
|
111
|
+
else:
|
|
112
|
+
# If it's already a string or empty, keep as is
|
|
113
|
+
b64_string = image_bytes
|
|
114
|
+
|
|
115
|
+
encoded_images.append({
|
|
116
|
+
"image_index": img.get("image_index"),
|
|
117
|
+
"image_data": b64_string, # Now a Base64 string
|
|
118
|
+
"content_type": img.get("content_type", "image/png")
|
|
119
|
+
})
|
|
120
|
+
|
|
121
|
+
return json.dumps(encoded_images)
|
|
44
122
|
def extract_markdown(self, images,include_image:bool=True):
|
|
45
123
|
"""Extracts Markdown-formatted text from each image page."""
|
|
46
124
|
all_outputs = []
|
|
@@ -51,7 +129,7 @@ class DocuToMarkdownExtractor:
|
|
|
51
129
|
b64_image = self._image_to_base64(image)
|
|
52
130
|
processor = ImageProcessor(model_name="gpt-4o-mini")
|
|
53
131
|
|
|
54
|
-
fins = [{"type": "text", "text": "You are a document parser. Extract all text and tables "
|
|
132
|
+
fins = [{"type": "text", "text": "You are a document parser. Extract all text, images and tables "
|
|
55
133
|
"from this image and format the output in clean Markdown. "
|
|
56
134
|
"Preserve table structure, headings, and lists. If there is no markdown, put a space. "
|
|
57
135
|
"Put your result in a JSON object with the following keys:"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
from base64 import b64encode
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from PIL import Image
|
|
8
|
+
import io
|
|
9
|
+
import fitz
|
|
10
|
+
from docx2pdf import convert as docx2pdf_convert
|
|
11
|
+
from docx import Document
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class EmbeddedImageExtractor:
|
|
15
|
+
"""Converts a document (PDF, DOCX, DOC, image bytes) into a list of PIL images."""
|
|
16
|
+
|
|
17
|
+
def __init__(self):
|
|
18
|
+
pass
|
|
19
|
+
|
|
20
|
+
# ----------------------------
|
|
21
|
+
# DOCX helper
|
|
22
|
+
# ----------------------------
|
|
23
|
+
def extract_all_images_from_docx(self, file_path, page_number):
|
|
24
|
+
doc = Document(file_path)
|
|
25
|
+
images = []
|
|
26
|
+
|
|
27
|
+
# Method A: Standard Relationship check
|
|
28
|
+
for rId, rel in doc.part.related_parts.items():
|
|
29
|
+
if "image" in rel.content_type:
|
|
30
|
+
image_bytes = rel.blob
|
|
31
|
+
images.append(self._format_output(image_bytes, rId))
|
|
32
|
+
|
|
33
|
+
# Method B: Package Part check (The "Deep Dive")
|
|
34
|
+
# If Method A found nothing, we look at every part in the zip package
|
|
35
|
+
if not images:
|
|
36
|
+
for part in doc.part.package.parts:
|
|
37
|
+
if "image" in part.content_type:
|
|
38
|
+
images.append(self._format_output(part.blob, "unknown_rid"))
|
|
39
|
+
|
|
40
|
+
return images
|
|
41
|
+
|
|
42
|
+
def _format_output(self, blob, rId):
|
|
43
|
+
return {
|
|
44
|
+
"image_data": b64encode(blob).decode("utf-8"),
|
|
45
|
+
"rel_id": rId
|
|
46
|
+
}
|
|
47
|
+
|
|
@@ -2,6 +2,7 @@ import os
|
|
|
2
2
|
import json
|
|
3
3
|
import tempfile
|
|
4
4
|
import uuid
|
|
5
|
+
from base64 import b64encode
|
|
5
6
|
from io import BytesIO
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
|
|
@@ -12,6 +13,7 @@ from openai import OpenAI
|
|
|
12
13
|
from PIL import Image
|
|
13
14
|
|
|
14
15
|
from .DocuToImageConverter import DocuToImageConverter
|
|
16
|
+
from .EmbeddedImageExtractor import EmbeddedImageExtractor
|
|
15
17
|
from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
|
|
16
18
|
from ..config.splitter_config import SplitterConfig
|
|
17
19
|
from .chunk_documents_crud_vdb import chunk_documents
|
|
@@ -151,6 +153,8 @@ class StrategyFactory:
|
|
|
151
153
|
".pdf": PDFStrategy(),
|
|
152
154
|
".doc": WordStrategy(),
|
|
153
155
|
".docx": WordStrategy(),
|
|
156
|
+
".txt": WordStrategy(),
|
|
157
|
+
".csv": WordStrategy(),
|
|
154
158
|
".jpg": ImageStrategy(),
|
|
155
159
|
".jpeg": ImageStrategy(),
|
|
156
160
|
".png": ImageStrategy(),
|
|
@@ -190,6 +194,47 @@ class MarkdownAndChunkDocuments:
|
|
|
190
194
|
self.extractor = DocuToMarkdownExtractor(api_key=self.api_key,client=client)
|
|
191
195
|
self.client=client
|
|
192
196
|
|
|
197
|
+
def extract_embedded_images_for_pages(self,page_images, file_path, ext):
|
|
198
|
+
"""
|
|
199
|
+
For each page image, extract embedded images depending on document type.
|
|
200
|
+
|
|
201
|
+
Args:
|
|
202
|
+
page_images (List[PIL.Image]): List of page images
|
|
203
|
+
file_path (str): Original file path
|
|
204
|
+
ext (str): File extension (e.g., .docx, .pdf, .png)
|
|
205
|
+
|
|
206
|
+
Returns:
|
|
207
|
+
List[dict]: Each dict contains:
|
|
208
|
+
- page_image: the PIL image of the page
|
|
209
|
+
- embedded_images: list of dicts with image_data (base64), image_index, page_number
|
|
210
|
+
"""
|
|
211
|
+
pages = []
|
|
212
|
+
|
|
213
|
+
for i, page_image in enumerate(page_images, start=1):
|
|
214
|
+
embedded_images = []
|
|
215
|
+
eie = EmbeddedImageExtractor()
|
|
216
|
+
|
|
217
|
+
# pass all arguments as keyword arguments
|
|
218
|
+
if ext == ".docx":
|
|
219
|
+
embedded_images = eie.extract_all_images_from_docx(file_path=file_path, page_number=i)
|
|
220
|
+
elif ext == ".pdf":
|
|
221
|
+
embedded_images = eie.extract_embedded_images_from_pdf_page(file_path=file_path, page_number=i)
|
|
222
|
+
|
|
223
|
+
elif ext in [".png", ".jpg", ".jpeg", ".tiff", ".bmp"]:
|
|
224
|
+
# Single image files → treat the image itself as embedded
|
|
225
|
+
buf = BytesIO()
|
|
226
|
+
page_image.save(buf, format="PNG")
|
|
227
|
+
b64_image = b64encode(buf.getvalue()).decode("utf-8")
|
|
228
|
+
embedded_images = [{"image_data": b64_image, "image_index": 1, "page_number": i}]
|
|
229
|
+
# You can add more document types here (e.g., PPTX, HTML)
|
|
230
|
+
|
|
231
|
+
pages.append({
|
|
232
|
+
"page_number": i,
|
|
233
|
+
"page_image": page_image,
|
|
234
|
+
"embedded_images": embedded_images
|
|
235
|
+
})
|
|
236
|
+
|
|
237
|
+
return pages
|
|
193
238
|
def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None,embedding_client=None):
|
|
194
239
|
# Pick strategy
|
|
195
240
|
strategy = StrategyFactory.get_strategy(file_path,file_name)
|
|
@@ -200,6 +245,10 @@ class MarkdownAndChunkDocuments:
|
|
|
200
245
|
ext=get_file_extension(file_path,file_name)
|
|
201
246
|
images = strategy.process(file_path, input_bytes,ext)
|
|
202
247
|
|
|
248
|
+
# NEW: for each page, extract embedded images
|
|
249
|
+
eie=EmbeddedImageExtractor()
|
|
250
|
+
pages = self.extract_embedded_images_for_pages(images, file_path, ext)
|
|
251
|
+
|
|
203
252
|
# Extract Markdown from images
|
|
204
253
|
markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
|
|
205
254
|
binary_text_content = text_content.encode("utf-8")
|
|
@@ -262,6 +311,10 @@ class MarkdownAndChunkDocuments:
|
|
|
262
311
|
yield from report(15, "Processing file into images...")
|
|
263
312
|
images = strategy.process(file_path, input_bytes, ext)
|
|
264
313
|
|
|
314
|
+
# NEW: for each page, extract embedded images
|
|
315
|
+
eie = EmbeddedImageExtractor()
|
|
316
|
+
pages = self.extract_embedded_images_for_pages(images, file_path, ext)
|
|
317
|
+
|
|
265
318
|
# 3️⃣ Extract Markdown
|
|
266
319
|
yield from report(35, "Extracting markdown...")
|
|
267
320
|
markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40/prevectorchunks_core.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.38
|
|
3
|
+
Version: 0.1.40
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -12,47 +12,36 @@ Requires-Python: <3.12,>=3.7
|
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENCE
|
|
14
14
|
License-File: LICENSE
|
|
15
|
-
Requires-Dist:
|
|
16
|
-
Requires-Dist:
|
|
17
|
-
Requires-Dist: python-dotenv~=1.0.1
|
|
18
|
-
Requires-Dist: PyJWT~=2.7.0
|
|
15
|
+
Requires-Dist: Django==5.1
|
|
16
|
+
Requires-Dist: django-cors-headers~=4.4.0
|
|
19
17
|
Requires-Dist: fastapi~=0.112.2
|
|
20
|
-
Requires-Dist:
|
|
18
|
+
Requires-Dist: PyJWT~=2.7.0
|
|
19
|
+
Requires-Dist: langchain-text-splitters~=0.3.11
|
|
20
|
+
Requires-Dist: openai~=2.6.0
|
|
21
21
|
Requires-Dist: pinecone~=7.3.0
|
|
22
|
+
Requires-Dist: python-dotenv~=1.0.1
|
|
22
23
|
Requires-Dist: pytesseract~=0.3.13
|
|
23
24
|
Requires-Dist: python-docx~=1.2.0
|
|
24
25
|
Requires-Dist: PyPDF2~=3.0.1
|
|
25
26
|
Requires-Dist: pillow~=11.3.0
|
|
26
|
-
Requires-Dist:
|
|
27
|
-
Requires-Dist:
|
|
27
|
+
Requires-Dist: datasets~=4.1.1
|
|
28
|
+
Requires-Dist: torch~=2.6.0
|
|
29
|
+
Requires-Dist: torchvision~=0.21.0
|
|
28
30
|
Requires-Dist: torchaudio~=2.6.0
|
|
29
31
|
Requires-Dist: sentence-transformers~=5.1.1
|
|
30
|
-
Requires-Dist: py-gutenberg~=1.0.3
|
|
31
|
-
Requires-Dist: langchain-text-splitters~=0.3.11
|
|
32
|
-
Requires-Dist: langchain~=0.3
|
|
33
|
-
Requires-Dist: langchain_openai~=0.3.35
|
|
34
|
-
Requires-Dist: accelerate>=0.22.0
|
|
35
32
|
Requires-Dist: pathlib~=1.0.1
|
|
36
33
|
Requires-Dist: transformers~=4.57.0
|
|
37
34
|
Requires-Dist: imageio-ffmpeg~=0.6.0
|
|
38
|
-
Requires-Dist: opencv-python~=4.
|
|
35
|
+
Requires-Dist: opencv-python~=4.12.0.88
|
|
39
36
|
Requires-Dist: requests~=2.32.5
|
|
37
|
+
Requires-Dist: langchain~=0.3.27
|
|
40
38
|
Requires-Dist: langchain-core~=0.3.78
|
|
39
|
+
Requires-Dist: langchain-openai~=0.3.35
|
|
41
40
|
Requires-Dist: pdf2image~=1.17.0
|
|
42
41
|
Requires-Dist: docx2pdf~=0.1.8
|
|
43
|
-
Requires-Dist: numpy~=
|
|
42
|
+
Requires-Dist: numpy~=2.2.6
|
|
44
43
|
Requires-Dist: scikit-learn~=1.7.2
|
|
45
|
-
Requires-Dist:
|
|
46
|
-
Requires-Dist: pypandoc~=1.13
|
|
47
|
-
Requires-Dist: reportlab~=4.1.0
|
|
48
|
-
Requires-Dist: weasyprint~=62.0
|
|
49
|
-
Requires-Dist: lxml~=4.9.3
|
|
50
|
-
Requires-Dist: cssselect2~=0.7.0
|
|
51
|
-
Requires-Dist: cairocffi~=1.4.0
|
|
52
|
-
Requires-Dist: tensorflow~=2.12.0
|
|
53
|
-
Requires-Dist: pandas~=2.2.2
|
|
54
|
-
Requires-Dist: openpyxl~=3.1.2
|
|
55
|
-
Requires-Dist: python-pptx~=0.6.21
|
|
44
|
+
Requires-Dist: fitz~=0.0.1.dev2
|
|
56
45
|
Dynamic: license-file
|
|
57
46
|
|
|
58
47
|
# 📚 PreVectorChunks
|
|
@@ -8,6 +8,7 @@ pyproject.toml
|
|
|
8
8
|
./prevectorchunks_core/config/splitter_config.py
|
|
9
9
|
./prevectorchunks_core/migrations/__init__.py
|
|
10
10
|
./prevectorchunks_core/os-llm/__init__.py
|
|
11
|
+
./prevectorchunks_core/os-llm/dsqwen.py
|
|
11
12
|
./prevectorchunks_core/os-llm/llava.py
|
|
12
13
|
./prevectorchunks_core/rlchunker/__init__.py
|
|
13
14
|
./prevectorchunks_core/rlchunker/env.py
|
|
@@ -22,6 +23,7 @@ pyproject.toml
|
|
|
22
23
|
./prevectorchunks_core/rlchunker/pretrained/policy_model.pt
|
|
23
24
|
./prevectorchunks_core/services/DocuToImageConverter.py
|
|
24
25
|
./prevectorchunks_core/services/DocuToMarkdownExtractor.py
|
|
26
|
+
./prevectorchunks_core/services/EmbeddedImageExtractor.py
|
|
25
27
|
./prevectorchunks_core/services/__init__.py
|
|
26
28
|
./prevectorchunks_core/services/audio_processor.py
|
|
27
29
|
./prevectorchunks_core/services/chunk_documents_crud_vdb.py
|
|
@@ -47,6 +49,7 @@ prevectorchunks_core/config/__init__.py
|
|
|
47
49
|
prevectorchunks_core/config/splitter_config.py
|
|
48
50
|
prevectorchunks_core/migrations/__init__.py
|
|
49
51
|
prevectorchunks_core/os-llm/__init__.py
|
|
52
|
+
prevectorchunks_core/os-llm/dsqwen.py
|
|
50
53
|
prevectorchunks_core/os-llm/llava.py
|
|
51
54
|
prevectorchunks_core/rlchunker/__init__.py
|
|
52
55
|
prevectorchunks_core/rlchunker/env.py
|
|
@@ -61,6 +64,7 @@ prevectorchunks_core/rlchunker/pretrained/model_info.txt
|
|
|
61
64
|
prevectorchunks_core/rlchunker/pretrained/policy_model.pt
|
|
62
65
|
prevectorchunks_core/services/DocuToImageConverter.py
|
|
63
66
|
prevectorchunks_core/services/DocuToMarkdownExtractor.py
|
|
67
|
+
prevectorchunks_core/services/EmbeddedImageExtractor.py
|
|
64
68
|
prevectorchunks_core/services/__init__.py
|
|
65
69
|
prevectorchunks_core/services/audio_processor.py
|
|
66
70
|
prevectorchunks_core/services/chunk_documents_crud_vdb.py
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Django==5.1
|
|
2
|
+
django-cors-headers~=4.4.0
|
|
3
|
+
fastapi~=0.112.2
|
|
4
|
+
PyJWT~=2.7.0
|
|
5
|
+
langchain-text-splitters~=0.3.11
|
|
6
|
+
openai~=2.6.0
|
|
7
|
+
pinecone~=7.3.0
|
|
8
|
+
python-dotenv~=1.0.1
|
|
9
|
+
pytesseract~=0.3.13
|
|
10
|
+
python-docx~=1.2.0
|
|
11
|
+
PyPDF2~=3.0.1
|
|
12
|
+
pillow~=11.3.0
|
|
13
|
+
datasets~=4.1.1
|
|
14
|
+
torch~=2.6.0
|
|
15
|
+
torchvision~=0.21.0
|
|
16
|
+
torchaudio~=2.6.0
|
|
17
|
+
sentence-transformers~=5.1.1
|
|
18
|
+
pathlib~=1.0.1
|
|
19
|
+
transformers~=4.57.0
|
|
20
|
+
imageio-ffmpeg~=0.6.0
|
|
21
|
+
opencv-python~=4.12.0.88
|
|
22
|
+
requests~=2.32.5
|
|
23
|
+
langchain~=0.3.27
|
|
24
|
+
langchain-core~=0.3.78
|
|
25
|
+
langchain-openai~=0.3.35
|
|
26
|
+
pdf2image~=1.17.0
|
|
27
|
+
docx2pdf~=0.1.8
|
|
28
|
+
numpy~=2.2.6
|
|
29
|
+
scikit-learn~=1.7.2
|
|
30
|
+
fitz~=0.0.1.dev2
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "prevectorchunks-core"
|
|
7
|
-
version = "0.1.38"
|
|
7
|
+
version = "0.1.40"
|
|
8
8
|
description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -14,48 +14,36 @@ authors = [
|
|
|
14
14
|
]
|
|
15
15
|
|
|
16
16
|
dependencies = [
|
|
17
|
-
"packaging~=24.1",
|
|
18
|
-
"openai<3.0.0,>=2.6.0",
|
|
19
|
-
"python-dotenv~=1.0.1",
|
|
20
|
-
"PyJWT~=2.7.0",
|
|
17
|
+
"Django==5.1",
|
|
18
|
+
"django-cors-headers~=4.4.0",
|
|
21
19
|
"fastapi~=0.112.2",
|
|
22
|
-
"datasets~=4.1.0",
|
|
20
|
+
"PyJWT~=2.7.0",
|
|
21
|
+
"langchain-text-splitters~=0.3.11",
|
|
22
|
+
"openai~=2.6.0",
|
|
23
23
|
"pinecone~=7.3.0",
|
|
24
|
+
"python-dotenv~=1.0.1",
|
|
24
25
|
"pytesseract~=0.3.13",
|
|
25
26
|
"python-docx~=1.2.0",
|
|
26
27
|
"PyPDF2~=3.0.1",
|
|
27
28
|
"pillow~=11.3.0",
|
|
28
|
-
"torch~=2.2.2",
|
|
29
|
-
"torchvision~=0.17.2",
|
|
29
|
+
"datasets~=4.1.1",
|
|
30
|
+
"torch~=2.6.0",
|
|
31
|
+
"torchvision~=0.21.0",
|
|
30
32
|
"torchaudio~=2.6.0",
|
|
31
33
|
"sentence-transformers~=5.1.1",
|
|
32
|
-
"py-gutenberg~=1.0.3",
|
|
33
|
-
"langchain-text-splitters~=0.3.11",
|
|
34
|
-
"langchain~=0.3",
|
|
35
|
-
"langchain_openai~=0.3.35",
|
|
36
|
-
"accelerate>=0.22.0",
|
|
37
34
|
"pathlib~=1.0.1",
|
|
38
35
|
"transformers~=4.57.0",
|
|
39
36
|
"imageio-ffmpeg~=0.6.0",
|
|
40
|
-
"opencv-python~=4.8.0.76",
|
|
37
|
+
"opencv-python~=4.12.0.88",
|
|
41
38
|
"requests~=2.32.5",
|
|
39
|
+
"langchain~=0.3.27",
|
|
42
40
|
"langchain-core~=0.3.78",
|
|
41
|
+
"langchain-openai~=0.3.35",
|
|
43
42
|
"pdf2image~=1.17.0",
|
|
44
43
|
"docx2pdf~=0.1.8",
|
|
45
|
-
"numpy~=1.23.5",
|
|
44
|
+
"numpy~=2.2.6",
|
|
46
45
|
"scikit-learn~=1.7.2",
|
|
47
|
-
"PyMuPDF~=1.22.5",
|
|
48
|
-
"pypandoc~=1.13",
|
|
49
|
-
"reportlab~=4.1.0",
|
|
50
|
-
"weasyprint~=62.0",
|
|
51
|
-
"lxml~=4.9.3",
|
|
52
|
-
"cssselect2~=0.7.0",
|
|
53
|
-
"cairocffi~=1.4.0",
|
|
54
|
-
"tensorflow~=2.12.0", # <-- Add this
|
|
55
|
-
# 👉 Add these
|
|
56
|
-
"pandas~=2.2.2",
|
|
57
|
-
"openpyxl~=3.1.2",
|
|
58
|
-
"python-pptx~=0.6.21",
|
|
46
|
+
"fitz~=0.0.1.dev2",
|
|
59
47
|
]
|
|
60
48
|
|
|
61
49
|
|
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
from transformers import pipeline
|
|
2
|
-
|
|
3
|
-
pipe = pipeline("image-text-to-text", model="llava-hf/llava-1.5-13b-hf")
|
|
4
|
-
messages = [
|
|
5
|
-
{
|
|
6
|
-
"role": "user",
|
|
7
|
-
"content": [
|
|
8
|
-
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"},
|
|
9
|
-
{"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
|
|
10
|
-
],
|
|
11
|
-
},
|
|
12
|
-
]
|
|
13
|
-
|
|
14
|
-
out = pipe(text=messages, max_new_tokens=20)
|
|
15
|
-
print(out)
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
packaging~=24.1
|
|
2
|
-
openai<3.0.0,>=2.6.0
|
|
3
|
-
python-dotenv~=1.0.1
|
|
4
|
-
PyJWT~=2.7.0
|
|
5
|
-
fastapi~=0.112.2
|
|
6
|
-
datasets~=4.1.0
|
|
7
|
-
pinecone~=7.3.0
|
|
8
|
-
pytesseract~=0.3.13
|
|
9
|
-
python-docx~=1.2.0
|
|
10
|
-
PyPDF2~=3.0.1
|
|
11
|
-
pillow~=11.3.0
|
|
12
|
-
torch~=2.2.2
|
|
13
|
-
torchvision~=0.17.2
|
|
14
|
-
torchaudio~=2.6.0
|
|
15
|
-
sentence-transformers~=5.1.1
|
|
16
|
-
py-gutenberg~=1.0.3
|
|
17
|
-
langchain-text-splitters~=0.3.11
|
|
18
|
-
langchain~=0.3
|
|
19
|
-
langchain_openai~=0.3.35
|
|
20
|
-
accelerate>=0.22.0
|
|
21
|
-
pathlib~=1.0.1
|
|
22
|
-
transformers~=4.57.0
|
|
23
|
-
imageio-ffmpeg~=0.6.0
|
|
24
|
-
opencv-python~=4.8.0.76
|
|
25
|
-
requests~=2.32.5
|
|
26
|
-
langchain-core~=0.3.78
|
|
27
|
-
pdf2image~=1.17.0
|
|
28
|
-
docx2pdf~=0.1.8
|
|
29
|
-
numpy~=1.23.5
|
|
30
|
-
scikit-learn~=1.7.2
|
|
31
|
-
PyMuPDF~=1.22.5
|
|
32
|
-
pypandoc~=1.13
|
|
33
|
-
reportlab~=4.1.0
|
|
34
|
-
weasyprint~=62.0
|
|
35
|
-
lxml~=4.9.3
|
|
36
|
-
cssselect2~=0.7.0
|
|
37
|
-
cairocffi~=1.4.0
|
|
38
|
-
tensorflow~=2.12.0
|
|
39
|
-
pandas~=2.2.2
|
|
40
|
-
openpyxl~=3.1.2
|
|
41
|
-
python-pptx~=0.6.21
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/config/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/os-llm/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/env.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/model.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/reward.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/rlchunker/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/test_loader.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/tests/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/tests/test_local.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.38 → prevectorchunks_core-0.1.40}/prevectorchunks_core/utils/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|