prevectorchunks-core 0.1.39__tar.gz → 0.1.41__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. {prevectorchunks_core-0.1.39/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.41}/PKG-INFO +16 -28
  2. prevectorchunks_core-0.1.41/prevectorchunks_core/os-llm/dsqwen.py +24 -0
  3. prevectorchunks_core-0.1.41/prevectorchunks_core/os-llm/llava.py +29 -0
  4. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/DocuToImageConverter.py +92 -0
  5. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +79 -1
  6. prevectorchunks_core-0.1.41/prevectorchunks_core/services/EmbeddedImageExtractor.py +47 -0
  7. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/markdown_and_chunk_documents.py +53 -0
  8. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41/prevectorchunks_core.egg-info}/PKG-INFO +16 -28
  9. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core.egg-info/SOURCES.txt +4 -0
  10. prevectorchunks_core-0.1.41/prevectorchunks_core.egg-info/requires.txt +29 -0
  11. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/pyproject.toml +16 -29
  12. prevectorchunks_core-0.1.39/prevectorchunks_core/os-llm/llava.py +0 -15
  13. prevectorchunks_core-0.1.39/prevectorchunks_core.egg-info/requires.txt +0 -41
  14. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/LICENCE +0 -0
  15. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/LICENSE +0 -0
  16. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/README.md +0 -0
  17. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/__init__.py +0 -0
  18. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/config/__init__.py +0 -0
  19. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/config/splitter_config.py +0 -0
  20. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/migrations/__init__.py +0 -0
  21. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/os-llm/__init__.py +0 -0
  22. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/__init__.py +0 -0
  23. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/env.py +0 -0
  24. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/inference.py +0 -0
  25. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/model.py +0 -0
  26. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
  27. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
  28. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
  29. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/reward.py +0 -0
  30. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
  31. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
  32. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/rlchunker/utils.py +0 -0
  33. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/__init__.py +0 -0
  34. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/audio_processor.py +0 -0
  35. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
  36. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
  37. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/image_processor.py +0 -0
  38. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/propositional_index.py +0 -0
  39. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/services/video_analyser.py +0 -0
  40. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/test_loader.py +0 -0
  41. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/tests/__init__.py +0 -0
  42. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/tests/test_local.py +0 -0
  43. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/utils/__init__.py +0 -0
  44. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/utils/extract_content.py +0 -0
  45. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/utils/file_loader.py +0 -0
  46. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
  47. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
  48. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/prevectorchunks_core.egg-info/top_level.txt +0 -0
  49. {prevectorchunks_core-0.1.39 → prevectorchunks_core-0.1.41}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.39
3
+ Version: 0.1.41
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -12,47 +12,35 @@ Requires-Python: <3.12,>=3.7
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENCE
14
14
  License-File: LICENSE
15
- Requires-Dist: packaging~=24.1
16
- Requires-Dist: openai<3.0.0,>=2.6.0
17
- Requires-Dist: python-dotenv~=1.0.1
18
- Requires-Dist: PyJWT~=2.7.0
15
+ Requires-Dist: Django==5.1
16
+ Requires-Dist: django-cors-headers~=4.4.0
19
17
  Requires-Dist: fastapi~=0.112.2
20
- Requires-Dist: datasets~=4.1.0
18
+ Requires-Dist: PyJWT~=2.7.0
19
+ Requires-Dist: langchain-text-splitters~=0.3.11
20
+ Requires-Dist: openai~=2.6.0
21
21
  Requires-Dist: pinecone~=7.3.0
22
+ Requires-Dist: python-dotenv~=1.0.1
22
23
  Requires-Dist: pytesseract~=0.3.13
23
24
  Requires-Dist: python-docx~=1.2.0
24
25
  Requires-Dist: PyPDF2~=3.0.1
25
26
  Requires-Dist: pillow~=11.3.0
26
- Requires-Dist: torch~=2.2.2
27
- Requires-Dist: torchvision~=0.17.2
28
- Requires-Dist: torchaudio~=2.2.2
27
+ Requires-Dist: datasets~=4.1.1
28
+ Requires-Dist: torch~=2.6.0
29
+ Requires-Dist: torchvision~=0.21.0
30
+ Requires-Dist: torchaudio~=2.6.0
29
31
  Requires-Dist: sentence-transformers~=5.1.1
30
- Requires-Dist: py-gutenberg~=1.0.3
31
- Requires-Dist: langchain-text-splitters~=0.3.11
32
- Requires-Dist: langchain~=0.3
33
- Requires-Dist: langchain_openai~=0.3.35
34
- Requires-Dist: accelerate>=0.22.0
35
32
  Requires-Dist: pathlib~=1.0.1
36
33
  Requires-Dist: transformers~=4.57.0
37
34
  Requires-Dist: imageio-ffmpeg~=0.6.0
38
- Requires-Dist: opencv-python~=4.8.0.76
35
+ Requires-Dist: opencv-python~=4.12.0.88
39
36
  Requires-Dist: requests~=2.32.5
40
- Requires-Dist: langchain-core~=0.3.78
37
+ Requires-Dist: langchain~=1.3.9
38
+ Requires-Dist: langchain-openai~=1.0.0
41
39
  Requires-Dist: pdf2image~=1.17.0
42
40
  Requires-Dist: docx2pdf~=0.1.8
43
- Requires-Dist: numpy~=1.23.5
41
+ Requires-Dist: numpy~=2.2.6
44
42
  Requires-Dist: scikit-learn~=1.7.2
45
- Requires-Dist: PyMuPDF~=1.22.5
46
- Requires-Dist: pypandoc~=1.13
47
- Requires-Dist: reportlab~=4.1.0
48
- Requires-Dist: weasyprint~=62.0
49
- Requires-Dist: lxml~=4.9.3
50
- Requires-Dist: cssselect2~=0.7.0
51
- Requires-Dist: cairocffi~=1.4.0
52
- Requires-Dist: tensorflow~=2.12.0
53
- Requires-Dist: pandas~=2.2.2
54
- Requires-Dist: openpyxl~=3.1.2
55
- Requires-Dist: python-pptx~=0.6.21
43
+ Requires-Dist: fitz~=0.0.1.dev2
56
44
  Dynamic: license-file
57
45
 
58
46
  # 📚 PreVectorChunks
@@ -0,0 +1,24 @@
1
import torch
from transformers import pipeline

# Smoke-test script: load DeepSeek-R1-Distill-Qwen-1.5B and generate a short
# completion for a fixed prompt.

# ----- Step 1: Load the model using a text-generation pipeline -----
# DeepSeek-R1-Distill-Qwen-1.5B is a text-only model.
# Use the first CUDA device when one is visible and fall back to CPU (-1)
# otherwise; a hard-coded device=0 raises on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1

pipe = pipeline(
    "text-generation",
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    device=device,
)

# ----- Step 2: Define your prompt -----
prompt = "Hello, are you Jon?"

# ----- Step 3: Run inference -----
out = pipe(
    prompt,
    max_new_tokens=50,  # controls length of generated output
    do_sample=True,     # random sampling for variation
    temperature=0.7,    # controls creativity
)

# ----- Step 4: Print output -----
# `out` is a list of dicts, each dict has 'generated_text'
print("Model response:", out[0]['generated_text'])
@@ -0,0 +1,29 @@
1
from transformers import pipeline

# Smoke-test script: ask LLaVA 1.5 13B to act as a PII content moderator on a
# fixed text snippet and print the raw pipeline output.

# To run on a local GPU instead, replace `device=-1` with `device_map="auto"`.
# NOTE(review): `load_in_4bit` relies on bitsandbytes quantization, which has
# traditionally required a CUDA device — confirm that combining it with
# device=-1 (CPU) actually loads on the target machine.
pipe = pipeline(
    "image-text-to-text",
    model="llava-hf/llava-1.5-13b-hf",
    device=-1,          # CPU
    load_in_4bit=True,  # load model in 4-bit precision
    token=True,         # use the locally stored Hugging Face token (replaces deprecated use_auth_token)
)

# The prompt is assembled from adjacent string literals; each fragment ends
# with explicit punctuation/space so the sentences do not run together.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": (
                    "You are a content moderator - can you check if the content contains any "
                    "personal information such as name, phone number, email etc. "
                    "If the content contains personal information, return json failed. "
                    "Here is the content : We need an electrician please contact John Doe on 0434343434"
                ),
            },
        ],
    },
]

out = pipe(text=messages, max_new_tokens=20)
print(out)
@@ -137,6 +137,89 @@ class DocuToImageConverter:
137
137
  pdf_document.close()
138
138
  return images
139
139
 
140
+ def _convert_txt_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
141
+ """
142
+ Convert a .txt file or text bytes to PDF using ReportLab.
143
+ """
144
+ # Read text
145
+ if input_bytes is not None:
146
+ if hasattr(input_bytes, "read"):
147
+ input_bytes.seek(0)
148
+ text = input_bytes.read().decode("utf-8")
149
+ else:
150
+ text = input_bytes.decode("utf-8")
151
+ elif file_path:
152
+ with open(file_path, "r", encoding="utf-8") as f:
153
+ text = f.read()
154
+ else:
155
+ raise ValueError("Must supply either file_path or input_bytes")
156
+
157
+ # Prepare output PDF
158
+ output_dir = tempfile.mkdtemp()
159
+ output_pdf = os.path.join(output_dir, (Path(file_path).stem if file_path else "temp") + ".pdf")
160
+
161
+ # Write text to PDF
162
+ c = canvas.Canvas(output_pdf, pagesize=A4)
163
+ width, height = A4
164
+ y = height - 50
165
+
166
+ for line in text.splitlines():
167
+ c.drawString(50, y, line[:1000]) # truncate very long lines
168
+ y -= 15
169
+ if y < 50:
170
+ c.showPage()
171
+ y = height - 50
172
+
173
+ c.save()
174
+ return output_pdf
175
+
176
+ def _convert_csv_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
177
+ """
178
+ Convert a CSV file or CSV bytes to PDF using ReportLab tables.
179
+ """
180
+ # Read CSV data
181
+ rows = []
182
+ if input_bytes is not None:
183
+ if hasattr(input_bytes, "read"):
184
+ input_bytes.seek(0)
185
+ reader = csv.reader(io.StringIO(input_bytes.read().decode("utf-8")))
186
+ else:
187
+ reader = csv.reader(io.StringIO(input_bytes.decode("utf-8")))
188
+ rows = list(reader)
189
+ elif file_path:
190
+ with open(file_path, "r", encoding="utf-8") as f:
191
+ reader = csv.reader(f)
192
+ rows = list(reader)
193
+ else:
194
+ raise ValueError("Must supply either file_path or input_bytes")
195
+
196
+ if not rows:
197
+ raise ValueError("CSV is empty")
198
+
199
+ # Prepare output PDF
200
+ output_dir = tempfile.mkdtemp()
201
+ output_pdf = os.path.join(output_dir, (Path(file_path).stem if file_path else "temp") + ".pdf")
202
+
203
+ # Create a table PDF
204
+ doc = SimpleDocTemplate(output_pdf, pagesize=A4)
205
+ table = Table(rows, repeatRows=1)
206
+
207
+ # Style table
208
+ style = TableStyle([
209
+ ('BACKGROUND', (0, 0), (-1, 0), colors.lightgrey),
210
+ ('TEXTCOLOR', (0, 0), (-1, 0), colors.black),
211
+ ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
212
+ ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
213
+ ('FONTSIZE', (0, 0), (-1, -1), 10),
214
+ ('BOTTOMPADDING', (0, 0), (-1, 0), 6),
215
+ ('GRID', (0, 0), (-1, -1), 0.5, colors.grey)
216
+ ])
217
+ table.setStyle(style)
218
+
219
+ elements = [table]
220
+ doc.build(elements)
221
+
222
+ return output_pdf
140
223
  def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
141
224
  """
142
225
  Convert a file path or binary content to PIL images.
@@ -167,6 +250,15 @@ class DocuToImageConverter:
167
250
  pdf_path = self._convert_doc_to_pdf(file_path, input_bytes)
168
251
  images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
169
252
 
253
+ # TXT → PDF
254
+ elif ext == ".txt":
255
+ pdf_path = self._convert_txt_to_pdf(file_path, input_bytes)
256
+ images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
257
+ # CSV → PDF
258
+ elif ext == ".csv":
259
+ pdf_path = self._convert_csv_to_pdf(file_path, input_bytes)
260
+ images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
261
+
170
262
  # PowerPoint → PDF
171
263
  elif ext in [".ppt", ".pptx"]:
172
264
  pdf_path = self._convert_ppt_to_pdf(file_path, input_bytes)
@@ -41,6 +41,84 @@ class DocuToMarkdownExtractor:
41
41
  with open(tmp.name, "rb") as f:
42
42
  return base64.b64encode(f.read()).decode("utf-8")
43
43
 
44
def extract_markdown_pages(self, pages, include_image: bool = True):
    """Extract Markdown from page images and attach embedded image data.

    Args:
        pages: Sized sequence of dicts. Each dict is read for the
            ``"page_image"`` key (the rendered page) and, when
            ``include_image`` is true, for ``"embedded_images"`` via
            ``conv_to_base64``.
        include_image: When true, each result also carries the base64 page
            image (``page_image_data``) and the JSON-encoded embedded images
            (``embedded_images``).

    Returns:
        Tuple ``(all_outputs, text_content)``: the list of per-page response
        dicts, and every page's ``markdown_text`` concatenated, each preceded
        by a newline. Pages whose response cannot be parsed are skipped.
    """
    all_outputs = []
    text_parts = []
    total_pages = len(pages)

    for i, page in enumerate(pages, start=1):
        print(f"🧠 Processing page {i}/{total_pages}...")

        # 1. Get the primary page image for OCR/parsing.
        # Each page is a dict; the rendered page lives under 'page_image'.
        rendered_image = page.get("page_image")
        b64_page_image = self._image_to_base64(rendered_image)

        # Built per page on purpose: whether ImageProcessor.analyze keeps
        # state or mutates the instruction list is not visible from here.
        processor = ImageProcessor(model_name="gpt-4o-mini")

        fins = [
            {"type": "text", "text": (
                "You are a document parser. Extract all text, images and tables "
                "from this image and format the output in clean Markdown. "
                "Preserve table structure, headings, and lists. If no markdown, put a space. "
                "Describe any visual elements or images found on this page. "
                "Return only VALID JSON with keys: markdown_text, short_title, page_number, summary."
            )},
        ]

        response = processor.analyze(encoded_image=b64_page_image, finstructioncontent=fins)

        if isinstance(response, str):
            try:
                response = json.loads(response)
            except json.JSONDecodeError:
                print('skipping quietly')
                continue

        # Valid JSON can still be a list/number/null; only a dict can carry
        # the keys used below.
        if not isinstance(response, dict):
            print('skipping quietly')
            continue

        # 2. Integrate text content (the model may return null for the key).
        markdown_text = response.get("markdown_text") or ""
        if not isinstance(markdown_text, str):
            markdown_text = str(markdown_text)
        text_parts.append("\n" + markdown_text)

        # 3. Attach the page image and the embedded images found for this page.
        if include_image:
            response["page_image_data"] = b64_page_image
            response["embedded_images"] = self.conv_to_base64(page, response)

        # The loop index is authoritative; it overrides any page_number the
        # model put in its JSON.
        response["image_index"] = i
        response["page_number"] = i

        all_outputs.append(response)

    return all_outputs, "".join(text_parts)
96
+
97
def conv_to_base64(self, page, response):
    """Serialize a page's embedded images to a JSON string of base64 payloads.

    Args:
        page: Dict read for the ``"embedded_images"`` key, a list of dicts.
            Each image dict is read for ``"image_bytes"`` (falling back to
            ``"image_data"``), ``"image_index"`` and ``"content_type"``.
        response: Unused; kept so existing callers keep working.

    Returns:
        A JSON array (as ``str``) of objects with keys ``image_index``,
        ``image_data`` and ``content_type`` (defaulting to ``"image/png"``).
        A missing or ``None`` image list yields ``"[]"``.
    """
    # `or []` also covers an explicit None stored under the key.
    raw_embedded_list = page.get("embedded_images") or []
    encoded_images = []

    for img in raw_embedded_list:
        payload = img.get("image_bytes") or img.get("image_data")

        # Any bytes-like payload must become text, otherwise json.dumps
        # below raises TypeError. Strings (already encoded) and None pass
        # through unchanged.
        if isinstance(payload, (bytes, bytearray, memoryview)):
            payload = base64.b64encode(payload).decode("utf-8")

        encoded_images.append({
            "image_index": img.get("image_index"),
            "image_data": payload,
            "content_type": img.get("content_type", "image/png"),
        })

    return json.dumps(encoded_images)
44
122
  def extract_markdown(self, images,include_image:bool=True):
45
123
  """Extracts Markdown-formatted text from each image page."""
46
124
  all_outputs = []
@@ -51,7 +129,7 @@ class DocuToMarkdownExtractor:
51
129
  b64_image = self._image_to_base64(image)
52
130
  processor = ImageProcessor(model_name="gpt-4o-mini")
53
131
 
54
- fins = [{"type": "text", "text": "You are a document parser. Extract all text and tables "
132
+ fins = [{"type": "text", "text": "You are a document parser. Extract all text, images and tables "
55
133
  "from this image and format the output in clean Markdown. "
56
134
  "Preserve table structure, headings, and lists. If there is no markdown, put a space. "
57
135
  "Put your result in a JSON object with the following keys:"
@@ -0,0 +1,47 @@
1
+ import os
2
+ import tempfile
3
+ import shutil
4
+ import subprocess
5
+ from base64 import b64encode
6
+ from pathlib import Path
7
+ from PIL import Image
8
+ import io
9
+ import fitz
10
+ from docx2pdf import convert as docx2pdf_convert
11
+ from docx import Document
12
+
13
+
14
class EmbeddedImageExtractor:
    """Extracts the images embedded in a document as base64-encoded dicts.

    DOCX images are read from the package's image parts; PDF images are read
    per page through PyMuPDF (``fitz``).
    """

    def __init__(self):
        pass

    # ----------------------------
    # DOCX helper
    # ----------------------------
    def extract_all_images_from_docx(self, file_path, page_number):
        """Return every image stored in the DOCX at ``file_path``.

        Args:
            file_path: Path (or file-like object) accepted by ``Document``.
            page_number: Accepted for call-site symmetry but not used: the
                result is the full image list of the document, not of a page.

        Returns:
            List of dicts as produced by ``_format_output``.
        """
        doc = Document(file_path)

        # Method A: images referenced through the main document part's
        # relationships (keeps the relationship id).
        images = [
            self._format_output(part.blob, rId, part.content_type)
            for rId, part in doc.part.related_parts.items()
            if "image" in part.content_type
        ]

        # Method B: if Method A found nothing, scan every part in the package.
        if not images:
            images = [
                self._format_output(part.blob, "unknown_rid", part.content_type)
                for part in doc.part.package.parts
                if "image" in part.content_type
            ]

        return images

    # ----------------------------
    # PDF helper
    # ----------------------------
    def extract_embedded_images_from_pdf_page(self, file_path, page_number):
        """Return the images referenced by one page of a PDF.

        Args:
            file_path: Path of the PDF file.
            page_number: 1-based page number.

        Returns:
            List of dicts as produced by ``_format_output``, each extended
            with ``image_index`` (1-based position on the page) and
            ``page_number``. Empty when the page number is out of range.
        """
        images = []
        with fitz.open(file_path) as pdf:
            if not 1 <= page_number <= len(pdf):
                return images

            page = pdf[page_number - 1]
            for index, image_info in enumerate(page.get_images(full=True), start=1):
                xref = image_info[0]  # first tuple item is the image's xref
                extracted = pdf.extract_image(xref)
                if not extracted:
                    continue
                # "ext" is PyMuPDF's suggested file extension; it is used as
                # an approximate MIME subtype here.
                entry = self._format_output(
                    extracted["image"],
                    f"xref_{xref}",
                    f"image/{extracted.get('ext', 'png')}",
                )
                entry["image_index"] = index
                entry["page_number"] = page_number
                images.append(entry)

        return images

    def _format_output(self, blob, rId, content_type=None):
        """Build the result dict for one image.

        Args:
            blob: Raw image bytes.
            rId: Identifier of the image within its source document.
            content_type: Optional MIME type; added under ``content_type``
                only when given, so the two-argument form keeps its shape.

        Returns:
            Dict with ``image_data`` (base64 string) and ``rel_id``.
        """
        result = {
            "image_data": b64encode(blob).decode("utf-8"),
            "rel_id": rId,
        }
        if content_type:
            result["content_type"] = content_type
        return result
47
+
@@ -2,6 +2,7 @@ import os
2
2
  import json
3
3
  import tempfile
4
4
  import uuid
5
+ from base64 import b64encode
5
6
  from io import BytesIO
6
7
  from pathlib import Path
7
8
 
@@ -12,6 +13,7 @@ from openai import OpenAI
12
13
  from PIL import Image
13
14
 
14
15
  from .DocuToImageConverter import DocuToImageConverter
16
+ from .EmbeddedImageExtractor import EmbeddedImageExtractor
15
17
  from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
16
18
  from ..config.splitter_config import SplitterConfig
17
19
  from .chunk_documents_crud_vdb import chunk_documents
@@ -151,6 +153,8 @@ class StrategyFactory:
151
153
  ".pdf": PDFStrategy(),
152
154
  ".doc": WordStrategy(),
153
155
  ".docx": WordStrategy(),
156
+ ".txt": WordStrategy(),
157
+ ".csv": WordStrategy(),
154
158
  ".jpg": ImageStrategy(),
155
159
  ".jpeg": ImageStrategy(),
156
160
  ".png": ImageStrategy(),
@@ -190,6 +194,47 @@ class MarkdownAndChunkDocuments:
190
194
  self.extractor = DocuToMarkdownExtractor(api_key=self.api_key,client=client)
191
195
  self.client=client
192
196
 
197
def extract_embedded_images_for_pages(self,page_images, file_path, ext):
    """
    For each page image, extract embedded images depending on document type.

    Args:
        page_images (List[PIL.Image]): List of page images
        file_path (str): Original file path; may be None when the document
            was supplied as bytes, in which case DOCX/PDF extraction is
            skipped.
        ext (str): File extension (e.g., .docx, .pdf, .png); matched
            case-insensitively.

    Returns:
        List[dict]: Each dict contains:
            - page_number: 1-based page index
            - page_image: the PIL image of the page
            - embedded_images: list of dicts. For image files: image_data
              (base64 PNG), image_index, page_number. For DOCX/PDF: whatever
              EmbeddedImageExtractor returns. Empty for other types.
    """
    ext = (ext or "").lower()
    image_exts = {".png", ".jpg", ".jpeg", ".tiff", ".bmp"}
    extractor = None  # created lazily, once, only for DOCX/PDF input
    pages = []

    for i, page_image in enumerate(page_images, start=1):
        embedded_images = []

        if ext in (".docx", ".pdf"):
            # The extractor reads from disk, so it needs a real path.
            if file_path:
                if extractor is None:
                    extractor = EmbeddedImageExtractor()
                if ext == ".docx":
                    embedded_images = extractor.extract_all_images_from_docx(
                        file_path=file_path, page_number=i
                    )
                else:
                    # Tolerate an extractor without PDF support instead of
                    # failing the whole document on AttributeError.
                    pdf_extract = getattr(
                        extractor, "extract_embedded_images_from_pdf_page", None
                    )
                    if pdf_extract is not None:
                        embedded_images = pdf_extract(file_path=file_path, page_number=i)

        elif ext in image_exts:
            # Single image files → treat the image itself as embedded
            buf = BytesIO()
            page_image.save(buf, format="PNG")
            b64_image = b64encode(buf.getvalue()).decode("utf-8")
            embedded_images = [{"image_data": b64_image, "image_index": 1, "page_number": i}]
        # You can add more document types here (e.g., PPTX, HTML)

        pages.append({
            "page_number": i,
            "page_image": page_image,
            "embedded_images": embedded_images
        })

    return pages
193
238
  def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None,embedding_client=None):
194
239
  # Pick strategy
195
240
  strategy = StrategyFactory.get_strategy(file_path,file_name)
@@ -200,6 +245,10 @@ class MarkdownAndChunkDocuments:
200
245
  ext=get_file_extension(file_path,file_name)
201
246
  images = strategy.process(file_path, input_bytes,ext)
202
247
 
248
+ # NEW: for each page, extract embedded images
249
+ eie=EmbeddedImageExtractor()
250
+ pages = self.extract_embedded_images_for_pages(images, file_path, ext)
251
+
203
252
  # Extract Markdown from images
204
253
  markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
205
254
  binary_text_content = text_content.encode("utf-8")
@@ -262,6 +311,10 @@ class MarkdownAndChunkDocuments:
262
311
  yield from report(15, "Processing file into images...")
263
312
  images = strategy.process(file_path, input_bytes, ext)
264
313
 
314
+ # NEW: for each page, extract embedded images
315
+ eie = EmbeddedImageExtractor()
316
+ pages = self.extract_embedded_images_for_pages(images, file_path, ext)
317
+
265
318
  # 3️⃣ Extract Markdown
266
319
  yield from report(35, "Extracting markdown...")
267
320
  markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.39
3
+ Version: 0.1.41
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -12,47 +12,35 @@ Requires-Python: <3.12,>=3.7
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENCE
14
14
  License-File: LICENSE
15
- Requires-Dist: packaging~=24.1
16
- Requires-Dist: openai<3.0.0,>=2.6.0
17
- Requires-Dist: python-dotenv~=1.0.1
18
- Requires-Dist: PyJWT~=2.7.0
15
+ Requires-Dist: Django==5.1
16
+ Requires-Dist: django-cors-headers~=4.4.0
19
17
  Requires-Dist: fastapi~=0.112.2
20
- Requires-Dist: datasets~=4.1.0
18
+ Requires-Dist: PyJWT~=2.7.0
19
+ Requires-Dist: langchain-text-splitters~=0.3.11
20
+ Requires-Dist: openai~=2.6.0
21
21
  Requires-Dist: pinecone~=7.3.0
22
+ Requires-Dist: python-dotenv~=1.0.1
22
23
  Requires-Dist: pytesseract~=0.3.13
23
24
  Requires-Dist: python-docx~=1.2.0
24
25
  Requires-Dist: PyPDF2~=3.0.1
25
26
  Requires-Dist: pillow~=11.3.0
26
- Requires-Dist: torch~=2.2.2
27
- Requires-Dist: torchvision~=0.17.2
28
- Requires-Dist: torchaudio~=2.2.2
27
+ Requires-Dist: datasets~=4.1.1
28
+ Requires-Dist: torch~=2.6.0
29
+ Requires-Dist: torchvision~=0.21.0
30
+ Requires-Dist: torchaudio~=2.6.0
29
31
  Requires-Dist: sentence-transformers~=5.1.1
30
- Requires-Dist: py-gutenberg~=1.0.3
31
- Requires-Dist: langchain-text-splitters~=0.3.11
32
- Requires-Dist: langchain~=0.3
33
- Requires-Dist: langchain_openai~=0.3.35
34
- Requires-Dist: accelerate>=0.22.0
35
32
  Requires-Dist: pathlib~=1.0.1
36
33
  Requires-Dist: transformers~=4.57.0
37
34
  Requires-Dist: imageio-ffmpeg~=0.6.0
38
- Requires-Dist: opencv-python~=4.8.0.76
35
+ Requires-Dist: opencv-python~=4.12.0.88
39
36
  Requires-Dist: requests~=2.32.5
40
- Requires-Dist: langchain-core~=0.3.78
37
+ Requires-Dist: langchain~=1.3.9
38
+ Requires-Dist: langchain-openai~=1.0.0
41
39
  Requires-Dist: pdf2image~=1.17.0
42
40
  Requires-Dist: docx2pdf~=0.1.8
43
- Requires-Dist: numpy~=1.23.5
41
+ Requires-Dist: numpy~=2.2.6
44
42
  Requires-Dist: scikit-learn~=1.7.2
45
- Requires-Dist: PyMuPDF~=1.22.5
46
- Requires-Dist: pypandoc~=1.13
47
- Requires-Dist: reportlab~=4.1.0
48
- Requires-Dist: weasyprint~=62.0
49
- Requires-Dist: lxml~=4.9.3
50
- Requires-Dist: cssselect2~=0.7.0
51
- Requires-Dist: cairocffi~=1.4.0
52
- Requires-Dist: tensorflow~=2.12.0
53
- Requires-Dist: pandas~=2.2.2
54
- Requires-Dist: openpyxl~=3.1.2
55
- Requires-Dist: python-pptx~=0.6.21
43
+ Requires-Dist: fitz~=0.0.1.dev2
56
44
  Dynamic: license-file
57
45
 
58
46
  # 📚 PreVectorChunks
@@ -8,6 +8,7 @@ pyproject.toml
8
8
  ./prevectorchunks_core/config/splitter_config.py
9
9
  ./prevectorchunks_core/migrations/__init__.py
10
10
  ./prevectorchunks_core/os-llm/__init__.py
11
+ ./prevectorchunks_core/os-llm/dsqwen.py
11
12
  ./prevectorchunks_core/os-llm/llava.py
12
13
  ./prevectorchunks_core/rlchunker/__init__.py
13
14
  ./prevectorchunks_core/rlchunker/env.py
@@ -22,6 +23,7 @@ pyproject.toml
22
23
  ./prevectorchunks_core/rlchunker/pretrained/policy_model.pt
23
24
  ./prevectorchunks_core/services/DocuToImageConverter.py
24
25
  ./prevectorchunks_core/services/DocuToMarkdownExtractor.py
26
+ ./prevectorchunks_core/services/EmbeddedImageExtractor.py
25
27
  ./prevectorchunks_core/services/__init__.py
26
28
  ./prevectorchunks_core/services/audio_processor.py
27
29
  ./prevectorchunks_core/services/chunk_documents_crud_vdb.py
@@ -47,6 +49,7 @@ prevectorchunks_core/config/__init__.py
47
49
  prevectorchunks_core/config/splitter_config.py
48
50
  prevectorchunks_core/migrations/__init__.py
49
51
  prevectorchunks_core/os-llm/__init__.py
52
+ prevectorchunks_core/os-llm/dsqwen.py
50
53
  prevectorchunks_core/os-llm/llava.py
51
54
  prevectorchunks_core/rlchunker/__init__.py
52
55
  prevectorchunks_core/rlchunker/env.py
@@ -61,6 +64,7 @@ prevectorchunks_core/rlchunker/pretrained/model_info.txt
61
64
  prevectorchunks_core/rlchunker/pretrained/policy_model.pt
62
65
  prevectorchunks_core/services/DocuToImageConverter.py
63
66
  prevectorchunks_core/services/DocuToMarkdownExtractor.py
67
+ prevectorchunks_core/services/EmbeddedImageExtractor.py
64
68
  prevectorchunks_core/services/__init__.py
65
69
  prevectorchunks_core/services/audio_processor.py
66
70
  prevectorchunks_core/services/chunk_documents_crud_vdb.py
@@ -0,0 +1,29 @@
1
+ Django==5.1
2
+ django-cors-headers~=4.4.0
3
+ fastapi~=0.112.2
4
+ PyJWT~=2.7.0
5
+ langchain-text-splitters~=0.3.11
6
+ openai~=2.6.0
7
+ pinecone~=7.3.0
8
+ python-dotenv~=1.0.1
9
+ pytesseract~=0.3.13
10
+ python-docx~=1.2.0
11
+ PyPDF2~=3.0.1
12
+ pillow~=11.3.0
13
+ datasets~=4.1.1
14
+ torch~=2.6.0
15
+ torchvision~=0.21.0
16
+ torchaudio~=2.6.0
17
+ sentence-transformers~=5.1.1
18
+ pathlib~=1.0.1
19
+ transformers~=4.57.0
20
+ imageio-ffmpeg~=0.6.0
21
+ opencv-python~=4.12.0.88
22
+ requests~=2.32.5
23
+ langchain~=1.3.9
24
+ langchain-openai~=1.0.0
25
+ pdf2image~=1.17.0
26
+ docx2pdf~=0.1.8
27
+ numpy~=2.2.6
28
+ scikit-learn~=1.7.2
29
+ fitz~=0.0.1.dev2
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "prevectorchunks-core"
7
- version = "0.1.39"
7
+ version = "0.1.41"
8
8
  description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -14,48 +14,35 @@ authors = [
14
14
  ]
15
15
 
16
16
  dependencies = [
17
- "packaging~=24.1",
18
- "openai>=2.6.0,<3.0.0",
19
- "python-dotenv~=1.0.1",
20
- "PyJWT~=2.7.0",
17
+ "Django==5.1",
18
+ "django-cors-headers~=4.4.0",
21
19
  "fastapi~=0.112.2",
22
- "datasets~=4.1.0",
20
+ "PyJWT~=2.7.0",
21
+ "langchain-text-splitters~=0.3.11",
22
+ "openai~=2.6.0",
23
23
  "pinecone~=7.3.0",
24
+ "python-dotenv~=1.0.1",
24
25
  "pytesseract~=0.3.13",
25
26
  "python-docx~=1.2.0",
26
27
  "PyPDF2~=3.0.1",
27
28
  "pillow~=11.3.0",
28
- "torch~=2.2.2",
29
- "torchvision~=0.17.2",
30
- "torchaudio~=2.2.2",
29
+ "datasets~=4.1.1",
30
+ "torch~=2.6.0",
31
+ "torchvision~=0.21.0",
32
+ "torchaudio~=2.6.0",
31
33
  "sentence-transformers~=5.1.1",
32
- "py-gutenberg~=1.0.3",
33
- "langchain-text-splitters~=0.3.11",
34
- "langchain~=0.3",
35
- "langchain_openai~=0.3.35",
36
- "accelerate>=0.22.0",
37
34
  "pathlib~=1.0.1",
38
35
  "transformers~=4.57.0",
39
36
  "imageio-ffmpeg~=0.6.0",
40
- "opencv-python~= 4.8.0.76",
37
+ "opencv-python~=4.12.0.88",
41
38
  "requests~=2.32.5",
42
- "langchain-core~=0.3.78",
39
+ "langchain~=1.3.9",
40
+ "langchain-openai~=1.0.0",
43
41
  "pdf2image~=1.17.0",
44
42
  "docx2pdf~=0.1.8",
45
- "numpy~=1.23.5",
43
+ "numpy~=2.2.6",
46
44
  "scikit-learn~=1.7.2",
47
- "PyMuPDF~=1.22.5",
48
- "pypandoc~=1.13",
49
- "reportlab~=4.1.0",
50
- "weasyprint~=62.0",
51
- "lxml~=4.9.3",
52
- "cssselect2~=0.7.0",
53
- "cairocffi~=1.4.0",
54
- "tensorflow~=2.12.0", # <-- Add this
55
- # 👉 Add these
56
- "pandas~=2.2.2",
57
- "openpyxl~=3.1.2",
58
- "python-pptx~=0.6.21",
45
+ "fitz~=0.0.1.dev2",
59
46
  ]
60
47
 
61
48
 
@@ -1,15 +0,0 @@
1
- from transformers import pipeline
2
-
3
- pipe = pipeline("image-text-to-text", model="llava-hf/llava-1.5-13b-hf")
4
- messages = [
5
- {
6
- "role": "user",
7
- "content": [
8
- {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"},
9
- {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
10
- ],
11
- },
12
- ]
13
-
14
- out = pipe(text=messages, max_new_tokens=20)
15
- print(out)
@@ -1,41 +0,0 @@
1
- packaging~=24.1
2
- openai<3.0.0,>=2.6.0
3
- python-dotenv~=1.0.1
4
- PyJWT~=2.7.0
5
- fastapi~=0.112.2
6
- datasets~=4.1.0
7
- pinecone~=7.3.0
8
- pytesseract~=0.3.13
9
- python-docx~=1.2.0
10
- PyPDF2~=3.0.1
11
- pillow~=11.3.0
12
- torch~=2.2.2
13
- torchvision~=0.17.2
14
- torchaudio~=2.2.2
15
- sentence-transformers~=5.1.1
16
- py-gutenberg~=1.0.3
17
- langchain-text-splitters~=0.3.11
18
- langchain~=0.3
19
- langchain_openai~=0.3.35
20
- accelerate>=0.22.0
21
- pathlib~=1.0.1
22
- transformers~=4.57.0
23
- imageio-ffmpeg~=0.6.0
24
- opencv-python~=4.8.0.76
25
- requests~=2.32.5
26
- langchain-core~=0.3.78
27
- pdf2image~=1.17.0
28
- docx2pdf~=0.1.8
29
- numpy~=1.23.5
30
- scikit-learn~=1.7.2
31
- PyMuPDF~=1.22.5
32
- pypandoc~=1.13
33
- reportlab~=4.1.0
34
- weasyprint~=62.0
35
- lxml~=4.9.3
36
- cssselect2~=0.7.0
37
- cairocffi~=1.4.0
38
- tensorflow~=2.12.0
39
- pandas~=2.2.2
40
- openpyxl~=3.1.2
41
- python-pptx~=0.6.21