prevectorchunks-core 0.1.10__tar.gz → 0.1.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. prevectorchunks_core-0.1.28/LICENSE +2 -0
  2. prevectorchunks_core-0.1.28/PKG-INFO +272 -0
  3. prevectorchunks_core-0.1.10/PKG-INFO → prevectorchunks_core-0.1.28/README.md +73 -38
  4. prevectorchunks_core-0.1.28/prevectorchunks_core/config/splitter_config.py +22 -0
  5. prevectorchunks_core-0.1.28/prevectorchunks_core/os-llm/llava.py +15 -0
  6. prevectorchunks_core-0.1.28/prevectorchunks_core/rlchunker/pretrained/model_info.txt +2 -0
  7. prevectorchunks_core-0.1.28/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
  8. prevectorchunks_core-0.1.28/prevectorchunks_core/services/DocuToImageConverter.py +148 -0
  9. prevectorchunks_core-0.1.28/prevectorchunks_core/services/DocuToMarkdownExtractor.py +74 -0
  10. prevectorchunks_core-0.1.28/prevectorchunks_core/services/audio_processor.py +136 -0
  11. prevectorchunks_core-0.1.28/prevectorchunks_core/services/chunk_to_all_content_mapper.py +78 -0
  12. prevectorchunks_core-0.1.28/prevectorchunks_core/services/image_processor.py +104 -0
  13. prevectorchunks_core-0.1.28/prevectorchunks_core/services/markdown_and_chunk_documents.py +161 -0
  14. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/services/propositional_index.py +17 -6
  15. prevectorchunks_core-0.1.28/prevectorchunks_core/services/video_analyser.py +176 -0
  16. prevectorchunks_core-0.1.28/prevectorchunks_core/test_loader.py +44 -0
  17. prevectorchunks_core-0.1.28/prevectorchunks_core/tests/__init__.py +0 -0
  18. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/tests/test_local.py +3 -1
  19. prevectorchunks_core-0.1.28/prevectorchunks_core/utils/__init__.py +4 -0
  20. prevectorchunks_core-0.1.28/prevectorchunks_core/utils/extract_content.py +92 -0
  21. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/utils/file_loader.py +62 -16
  22. prevectorchunks_core-0.1.28/prevectorchunks_core.egg-info/PKG-INFO +272 -0
  23. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core.egg-info/SOURCES.txt +13 -0
  24. prevectorchunks_core-0.1.28/prevectorchunks_core.egg-info/requires.txt +37 -0
  25. prevectorchunks_core-0.1.28/pyproject.toml +67 -0
  26. prevectorchunks_core-0.1.10/README.md +0 -155
  27. prevectorchunks_core-0.1.10/prevectorchunks_core/config/splitter_config.py +0 -11
  28. prevectorchunks_core-0.1.10/prevectorchunks_core/rlchunker/testpretrained.py +0 -6
  29. prevectorchunks_core-0.1.10/prevectorchunks_core/test_loader.py +0 -26
  30. prevectorchunks_core-0.1.10/prevectorchunks_core.egg-info/PKG-INFO +0 -184
  31. prevectorchunks_core-0.1.10/prevectorchunks_core.egg-info/requires.txt +0 -19
  32. prevectorchunks_core-0.1.10/pyproject.toml +0 -41
  33. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/LICENCE +0 -0
  34. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/__init__.py +0 -0
  35. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/config/__init__.py +0 -0
  36. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/migrations/__init__.py +0 -0
  37. {prevectorchunks_core-0.1.10/prevectorchunks_core/rlchunker/pretrained → prevectorchunks_core-0.1.28/prevectorchunks_core/os-llm}/__init__.py +0 -0
  38. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/__init__.py +0 -0
  39. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/env.py +0 -0
  40. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/inference.py +0 -0
  41. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/model.py +0 -0
  42. {prevectorchunks_core-0.1.10/prevectorchunks_core/services → prevectorchunks_core-0.1.28/prevectorchunks_core/rlchunker/pretrained}/__init__.py +0 -0
  43. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/reward.py +0 -0
  44. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
  45. /prevectorchunks_core-0.1.10/prevectorchunks_core/tests/__init__.py → /prevectorchunks_core-0.1.28/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
  46. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/utils.py +0 -0
  47. {prevectorchunks_core-0.1.10/prevectorchunks_core/utils → prevectorchunks_core-0.1.28/prevectorchunks_core/services}/__init__.py +0 -0
  48. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
  49. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
  50. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
  51. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core.egg-info/top_level.txt +0 -0
  52. {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/setup.cfg +0 -0
@@ -0,0 +1,2 @@
+ MIT License
+ Copyright (c) 2025 Your Name
@@ -0,0 +1,272 @@
+ Metadata-Version: 2.4
+ Name: prevectorchunks-core
+ Version: 0.1.28
+ Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
+ Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+ License: MIT License
+ Copyright (c) 2025 Your Name
+
+ Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
+ Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
+ Requires-Python: <3.12,>=3.7
+ Description-Content-Type: text/markdown
+ License-File: LICENCE
+ License-File: LICENSE
+ Requires-Dist: packaging~=24.1
+ Requires-Dist: openai<3.0.0,>=2.6.0
+ Requires-Dist: python-dotenv~=1.0.1
+ Requires-Dist: PyJWT~=2.7.0
+ Requires-Dist: fastapi~=0.112.2
+ Requires-Dist: datasets~=4.1.0
+ Requires-Dist: pinecone~=7.3.0
+ Requires-Dist: pytesseract~=0.3.13
+ Requires-Dist: python-docx~=1.2.0
+ Requires-Dist: PyPDF2~=3.0.1
+ Requires-Dist: pillow~=11.3.0
+ Requires-Dist: torch~=2.6.0
+ Requires-Dist: torchvision~=0.21.0
+ Requires-Dist: torchaudio~=2.6.0
+ Requires-Dist: sentence-transformers~=5.1.1
+ Requires-Dist: py-gutenberg~=1.0.3
+ Requires-Dist: langchain-text-splitters~=0.3.11
+ Requires-Dist: langchain~=0.3
+ Requires-Dist: langchain_openai~=0.3.35
+ Requires-Dist: accelerate>=0.22.0
+ Requires-Dist: pathlib~=1.0.1
+ Requires-Dist: transformers~=4.57.0
+ Requires-Dist: imageio-ffmpeg~=0.6.0
+ Requires-Dist: opencv-python~=4.12.0.88
+ Requires-Dist: requests~=2.32.5
+ Requires-Dist: langchain-core~=0.3.78
+ Requires-Dist: pdf2image~=1.17.0
+ Requires-Dist: docx2pdf~=0.1.8
+ Requires-Dist: numpy~=2.2.6
+ Requires-Dist: scikit-learn~=1.7.2
+ Requires-Dist: PyMuPDF~=1.22.5
+ Requires-Dist: pypandoc~=1.13
+ Requires-Dist: reportlab~=4.1.0
+ Requires-Dist: weasyprint~=62.0
+ Requires-Dist: lxml~=4.9.3
+ Requires-Dist: cssselect2~=0.7.0
+ Requires-Dist: cairocffi~=1.4.0
+ Dynamic: license-file
+
+ # 📚 PreVectorChunks
+
+ > A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
+
+ ---
+
+ ## ✨ Who Needs This Module?
+ Any developer working with:
+ - **RAG pipelines**
+ - **Vector Databases** (like Pinecone, Weaviate, etc.)
+ - **AI applications** requiring **similar content retrieval**
+
+ ---
+
+ ## 🎯 What Does This Module Do?
+ This module helps you:
+ - **Chunk documents** into smaller fragments using:
+   - a pretrained Reinforcement Learning based model, or
+   - a pretrained Reinforcement Learning based model with proposition indexing, or
+   - standard word based chunking, or
+   - recursive character based chunking, or
+   - simple character based chunking
+ - **Insert (upsert) fragments** into a vector database
+ - **Fetch & update** existing chunks from a vector database
+
+ ---
+
+ ## 📦 Installation
+ ```bash
+ pip install prevectorchunks-core
+ ```
+
+ How to import in a file:
+ ```python
+ from PreVectorChunks.services import chunk_documents_crud_vdb
+ ```
+
+ **Use a `.env` file for API keys. IMPORTANT: provide at least your `OPENAI_API_KEY` in a `.env` file, or as otherwise required.**
+ ```
+ PINECONE_API_KEY=YOUR_API_KEY
+ OPENAI_API_KEY=YOUR_API_KEY
+ ```
+
+ ---
+
+ ## 📄 Functions
+
+ ### 1. `chunk_documents`
+ ```python
+ chunk_documents(instructions, file_path="content_playground/content.json", splitter_config=SplitterConfig())
+ ```
+ Splits the content of a document into smaller, manageable chunks. Five types of document chunking are supported; for each, an LLM touch-up that structures the chunked text can be enabled or disabled (enabled by default):
+ - chunking using a pretrained Reinforcement Learning based model
+ - chunking using a pretrained Reinforcement Learning based model with proposition indexing
+ - recursive character based chunking
+ - standard word based chunking
+ - simple character based chunking
+
+ **Parameters**
+ - `instructions` (*dict or str*): Additional rules or guidance for how the document should be split.
+   - Example: `"split my content by biggest headings"`
+ - `file_path` (*str*): A binary file, a path to the input file, or the content of the file itself. Default: `"content_playground/content.json"`.
+ - `splitter_config` (*SplitterConfig*, optional): Object that defines chunking behavior, e.g., `chunk_size`, `chunk_overlap`, `separators`, `split_type`. If none is provided, a standard split takes place.
+   - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.RECURSIVE.value)`
+     - (`chunk_size` is measured in characters, e.g. 100 characters, when RECURSIVE is used)
+   - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.CHARACTER.value)`
+     - (`chunk_size` is measured in characters, e.g. 100 characters, when CHARACTER is used)
+   - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.STANDARD.value)`
+     - (`chunk_size` is measured in words, e.g. 100 words, when STANDARD is used)
+   - e.g. `splitter_config = SplitterConfig(separators=["\n"], split_type=SplitType.R_PRETRAINED.value, min_rl_chunk_size=5, max_rl_chunk_size=50, enableLLMTouchUp=False)`
+     - (`min_rl_chunk_size` and `max_rl_chunk_size` are measured in sentences when R_PRETRAINED is used)
+   - e.g. `splitter_config = SplitterConfig(separators=["\n"], split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5, max_rl_chunk_size=50, enableLLMTouchUp=False)`
+     - (`min_rl_chunk_size` and `max_rl_chunk_size` are measured in sentences when R_PRETRAINED_PROPOSITION is used)
+
+ **Returns**
+ - A list of chunks, each including a unique id, a meaningful title, and the chunked text
+
+ **Use Cases**
+ - Preparing text for LLM ingestion
+ - Splitting text by structure (headings, paragraphs)
+ - Vector database indexing
+
+ ---
+
+ ### 2. `chunk_and_upsert_to_vdb`
+ ```python
+ chunk_and_upsert_to_vdb(index_n, instructions, file_path="content_playground/content.json", splitter_config=SplitterConfig())
+ ```
+ Splits a document into chunks (via `chunk_documents`) and **inserts them into a Vector Database**.
+
+ **Parameters**
+ - `index_n` (*str*): The name of the VDB index where chunks should be stored.
+ - `instructions` (*dict or str*): Rules for splitting content (same as `chunk_documents`).
+ - `file_path` (*str*): Path to the document file, or the content of the file itself. Default: `"content_playground/content.json"`.
+ - `splitter_config` (*SplitterConfig*): Object that defines chunking behavior.
+
+ **Returns**
+ - Confirmation of a successful insert into the VDB.
+
+ **Use Cases**
+ - Automated document preprocessing and storage for vector search
+ - Preparing embeddings for semantic search
+
+ ---
+
+ ### 3. `fetch_vdb_chunks_grouped_by_document_name`
+ ```python
+ fetch_vdb_chunks_grouped_by_document_name(index_n)
+ ```
+ Fetches existing chunks stored in the Vector Database, grouped by **document name**.
+
+ **Parameters**
+ - `index_n` (*str*): The name of the VDB index.
+
+ **Returns**
+ - A dictionary or list of chunks grouped by document name.
+
+ **Use Cases**
+ - Retrieving all chunks of a specific document
+ - Verifying what content has been ingested into the VDB
+
+ ---
+
+ ### 4. `update_vdb_chunks_grouped_by_document_name`
+ ```python
+ update_vdb_chunks_grouped_by_document_name(index_n, dataset)
+ ```
+ Updates existing chunks in the Vector Database by document name.
+
+ **Parameters**
+ - `index_n` (*str*): The name of the VDB index.
+ - `dataset` (*dict or list*): The new data (chunks) to update existing entries.
+
+ **Returns**
+ - Confirmation of update status.
+
+ **Use Cases**
+ - Keeping VDB chunks up to date when documents change
+ - Re-ingesting revised or corrected content
+
+ ---
+ ### 5. `markdown_and_chunk_documents`
+ ```python
+ from prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
+
+ markdown_processor = MarkdownAndChunkDocuments()
+ mapped_chunks = markdown_processor.markdown_and_chunk_documents("example.pdf")
+ ```
+
+ **Description**
+ This new function automatically:
+ 1. Converts a document (PDF, DOCX, etc.) into images using `DocuToImageConverter`.
+ 2. Extracts **Markdown and text** content from those images using `DocuToMarkdownExtractor` (powered by GPT).
+ 3. Converts the extracted markdown text into **RL-based chunks** using `ChunkMapper` and `chunk_documents`.
+ 4. Merges unmatched markdown segments into the final structured output.
+
+ **Parameters**
+ - `file_path` (*str*): Path to the document (PDF, DOCX, or image) you want to process.
+
+ **Returns**
+ - `mapped_chunks` (*list[dict]*): A list of markdown-based chunks with both markdown and chunked text content.
+
+ **Example**
+ ```python
+ if __name__ == "__main__":
+     markdown_processor = MarkdownAndChunkDocuments()
+     mapped_chunks = markdown_processor.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
+     print(mapped_chunks)
+ ```
+
+ **Use Cases**
+ - End-to-end document-to-markdown-to-chunks pipeline
+ - Automating preprocessing for RAG/LLM ingestion
+ - Extracting structured markdown for semantic search or content indexing
+
+ ---
+
+ ## 🚀 Example Workflow
+ ```python
+ from prevectorchunks_core.config import SplitterConfig
+
+ splitter_config = SplitterConfig(chunk_size=150, chunk_overlap=0, separators=["\n"], split_type=SplitType.R_PRETRAINED_PROPOSITION.value)
+
+ # Step 1: Chunk a document
+ chunks = chunk_documents(
+     instructions="split my content by biggest headings",
+     file_path="content_playground/content.json",
+     splitter_config=splitter_config
+ )
+
+ splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
+                                  split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
+                                  max_rl_chunk_size=50, enableLLMTouchUp=False)
+
+ chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt", splitter_config=splitter_config)
+
+ # Step 2: Insert chunks into VDB
+ chunk_and_upsert_to_vdb("my_index", instructions="split by headings", splitter_config=splitter_config)
+
+ # Step 3: Fetch stored chunks
+ docs = fetch_vdb_chunks_grouped_by_document_name("my_index")
+
+ # Step 4: Update chunks if needed
+ update_vdb_chunks_grouped_by_document_name("my_index", dataset=docs)
+ ```
+
+ ---
+
+ ## 🛠 Use Cases
+ - Preprocessing documents for LLM ingestion
+ - Semantic search and Q&A systems
+ - Vector database indexing and retrieval
+ - Maintaining versioned document chunks
+
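
The README above documents each function in isolation; below is a compact end-to-end sketch of the whole loop. It is hedged: the README only shows the import `from PreVectorChunks.services import chunk_documents_crud_vdb`, so treating the four functions as attributes of that module is an assumption, and `content.txt` / `my_index` are hypothetical names.

```python
from dotenv import load_dotenv
from prevectorchunks_core.config.splitter_config import SplitterConfig
from PreVectorChunks.services import chunk_documents_crud_vdb

load_dotenv()  # README requires at least OPENAI_API_KEY (plus PINECONE_API_KEY for VDB calls) in .env

# Chunk a local file with the default recursive splitter (300 chars, no overlap).
cfg = SplitterConfig(chunk_size=300, chunk_overlap=0)
chunks = chunk_documents_crud_vdb.chunk_documents(
    "split my content by biggest headings",
    file_path="content.txt",  # hypothetical input file
    splitter_config=cfg,
)

# Upsert, read back grouped by document name, then push updates.
chunk_documents_crud_vdb.chunk_and_upsert_to_vdb("my_index", "split by headings", file_path="content.txt", splitter_config=cfg)
docs = chunk_documents_crud_vdb.fetch_vdb_chunks_grouped_by_document_name("my_index")
chunk_documents_crud_vdb.update_vdb_chunks_grouped_by_document_name("my_index", dataset=docs)
```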
@@ -1,32 +1,3 @@
- Metadata-Version: 2.4
- Name: prevectorchunks-core
- Version: 0.1.10
- Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
- Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
- Project-URL: Homepage, https://github.com/yourusername/mydep
- Description-Content-Type: text/markdown
- License-File: LICENCE
- Requires-Dist: packaging~=24.1
- Requires-Dist: requests~=2.32.3
- Requires-Dist: openai~=1.37.1
- Requires-Dist: python-dotenv~=1.0.1
- Requires-Dist: PyJWT~=2.7.0
- Requires-Dist: fastapi~=0.112.2
- Requires-Dist: datasets~=4.1.0
- Requires-Dist: pinecone~=7.3.0
- Requires-Dist: pytesseract~=0.3.13
- Requires-Dist: python-docx~=1.2.0
- Requires-Dist: PyPDF2~=3.0.1
- Requires-Dist: pillow~=11.3.0
- Requires-Dist: torch~=2.6.0
- Requires-Dist: torchvision~=0.21.0
- Requires-Dist: torchaudio~=2.6.0
- Requires-Dist: sentence-transformers~=5.1.1
- Requires-Dist: py-gutenberg~=1.0.3
- Requires-Dist: langchain-text-splitters~=0.3.11
- Requires-Dist: langchain~=0.3
- Dynamic: license-file
-
  # 📚 PreVectorChunks

  > A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
@@ -43,7 +14,12 @@ Any developer working with:

  ## 🎯 What Does This Module Do?
  This module helps you:
- - **Chunk documents** into smaller fragments
+ - **Chunk documents** into smaller fragments using:
+   - a pretrained Reinforcement Learning based model, or
+   - a pretrained Reinforcement Learning based model with proposition indexing, or
+   - standard word based chunking, or
+   - recursive character based chunking, or
+   - simple character based chunking
  - **Insert (upsert) fragments** into a vector database
  - **Fetch & update** existing chunks from a vector database

@@ -59,7 +35,7 @@ How to import in a file:
  from PreVectorChunks.services import chunk_documents_crud_vdb
  ```

- **Use .env for API keys:**
+ **Use a `.env` file for API keys. IMPORTANT: provide at least your `OPENAI_API_KEY` in a `.env` file, or as otherwise required.**
  ```
  PINECONE_API_KEY=YOUR_API_KEY
  OPENAI_API_KEY=YOUR_API_KEY
@@ -73,17 +49,35 @@ OPENAI_API_KEY=YOUR_API_KEY
  ```python
  chunk_documents(instructions, file_path="content_playground/content.json", splitter_config=SplitterConfig())
  ```
- Splits the content of a document into smaller, manageable chunks.
+ Splits the content of a document into smaller, manageable chunks. Five types of document chunking are supported; for each, an LLM touch-up that structures the chunked text can be enabled or disabled (enabled by default):
+ - chunking using a pretrained Reinforcement Learning based model
+ - chunking using a pretrained Reinforcement Learning based model with proposition indexing
+ - recursive character based chunking
+ - standard word based chunking
+ - simple character based chunking
+

  **Parameters**
  - `instructions` (*dict or str*): Additional rules or guidance for how the document should be split.
    - Example: `"split my content by biggest headings"`
- - `file_path` (*str*): Path to the input JSON/text file containing the content or content of the file. Default: `"content_playground/content.json"`.
+ - `file_path` (*str*): A binary file, a path to the input file, or the content of the file itself. Default: `"content_playground/content.json"`.
  - `splitter_config` (*SplitterConfig*, optional): Object that defines chunking behavior, e.g., `chunk_size`, `chunk_overlap`, `separators`, `split_type`. If none is provided, a standard split takes place.
- - i.e. splitter_config = SplitterConfig(chunk_size= 300, chunk_overlap= 0,separators=["\n"],split_type="RecursiveCharacterTextSplitter")
- - i.e. splitter_config = SplitterConfig(chunk_size= 300, chunk_overlap= 0,separators=["\n"],split_type="CharacterTextSplitter")
- - i.e. splitter_config = SplitterConfig(chunk_size= 300, chunk_overlap= 0,separators=["\n"],split_type="standard")
- **Returns**
+   - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.RECURSIVE.value)`
+     - (`chunk_size` is measured in characters, e.g. 100 characters, when RECURSIVE is used)
+   - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.CHARACTER.value)`
+     - (`chunk_size` is measured in characters, e.g. 100 characters, when CHARACTER is used)
+   - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.STANDARD.value)`
+     - (`chunk_size` is measured in words, e.g. 100 words, when STANDARD is used)
+   - e.g. `splitter_config = SplitterConfig(separators=["\n"], split_type=SplitType.R_PRETRAINED.value, min_rl_chunk_size=5, max_rl_chunk_size=50, enableLLMTouchUp=False)`
+     - (`min_rl_chunk_size` and `max_rl_chunk_size` are measured in sentences when R_PRETRAINED is used)
+   - e.g. `splitter_config = SplitterConfig(separators=["\n"], split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5, max_rl_chunk_size=50, enableLLMTouchUp=False)`
+     - (`min_rl_chunk_size` and `max_rl_chunk_size` are measured in sentences when R_PRETRAINED_PROPOSITION is used)
+
+ **Returns**
  - A list of chunks, each including a unique id, a meaningful title, and the chunked text

  **Use Cases**
@@ -149,13 +143,48 @@ Updates existing chunks in the Vector Database by document name.
  - Keeping VDB chunks up to date when documents change
  - Re-ingesting revised or corrected content

+ ---
+ ### 5. `markdown_and_chunk_documents`
+ ```python
+ from prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
+
+ markdown_processor = MarkdownAndChunkDocuments()
+ mapped_chunks = markdown_processor.markdown_and_chunk_documents("example.pdf")
+ ```
+
+ **Description**
+ This new function automatically:
+ 1. Converts a document (PDF, DOCX, etc.) into images using `DocuToImageConverter`.
+ 2. Extracts **Markdown and text** content from those images using `DocuToMarkdownExtractor` (powered by GPT).
+ 3. Converts the extracted markdown text into **RL-based chunks** using `ChunkMapper` and `chunk_documents`.
+ 4. Merges unmatched markdown segments into the final structured output.
+
+ **Parameters**
+ - `file_path` (*str*): Path to the document (PDF, DOCX, or image) you want to process.
+
+ **Returns**
+ - `mapped_chunks` (*list[dict]*): A list of markdown-based chunks with both markdown and chunked text content.
+
+ **Example**
+ ```python
+ if __name__ == "__main__":
+     markdown_processor = MarkdownAndChunkDocuments()
+     mapped_chunks = markdown_processor.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
+     print(mapped_chunks)
+ ```
+
+ **Use Cases**
+ - End-to-end document-to-markdown-to-chunks pipeline
+ - Automating preprocessing for RAG/LLM ingestion
+ - Extracting structured markdown for semantic search or content indexing
+
  ---

  ## 🚀 Example Workflow
  ```python
  from prevectorchunks_core.config import SplitterConfig

- splitter_config = SplitterConfig(chunk_size=150, chunk_overlap=0, separator=["\n"], split_type="RecursiveCharacterTextSplitter")
+ splitter_config = SplitterConfig(chunk_size=150, chunk_overlap=0, separators=["\n"], split_type=SplitType.R_PRETRAINED_PROPOSITION.value)

  # Step 1: Chunk a document
  chunks = chunk_documents(
@@ -164,6 +193,12 @@ chunks = chunk_documents(
      splitter_config=splitter_config
  )

+ splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
+                                  split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
+                                  max_rl_chunk_size=50, enableLLMTouchUp=False)
+
+ chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt", splitter_config=splitter_config)
+
  # Step 2: Insert chunks into VDB
  chunk_and_upsert_to_vdb("my_index", instructions="split by headings", splitter_config=splitter_config)

@@ -0,0 +1,22 @@
+ # prevectorchunks_core/config.py
+ from dataclasses import dataclass, field
+ from enum import Enum
+
+
+ class LLM_Structured_Output_Type(Enum):
+     STANDARD = "STANDARD"
+     STRUCTURED_WITH_VECTOR_DB_ID_GENERATED = "STRUCTURED_WITH_VECTOR_DB_ID_GENERATED"
+
+
+ @dataclass()
+ class SplitterConfig:
+     chunk_size: int = 300
+     chunk_overlap: int = 0
+     separators: list[str] = field(default_factory=lambda: ["\n"])
+     split_type: str = "recursive_splitter"
+     enableLLMTouchUp: bool = True
+     llm_structured_output_type: LLM_Structured_Output_Type = LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED
+     min_rl_chunk_size: int = 5
+     max_rl_chunk_size: int = 50
+
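
Since every field of `SplitterConfig` has a default, configs can be built incrementally. A minimal sketch, assuming the module path `prevectorchunks_core/config/splitter_config.py` from the file list above; the string passed to `split_type` mirrors the README's `SplitType.R_PRETRAINED.value` and is an assumption, since the `SplitType` enum is not defined in this file:

```python
from prevectorchunks_core.config.splitter_config import (
    LLM_Structured_Output_Type,
    SplitterConfig,
)

# All defaults: recursive splitting, 300-character chunks, LLM touch-up enabled.
default_cfg = SplitterConfig()

# RL-based chunking sized in sentences, with the LLM touch-up disabled.
rl_cfg = SplitterConfig(
    split_type="R_PRETRAINED",  # assumption: the value behind SplitType.R_PRETRAINED
    min_rl_chunk_size=5,
    max_rl_chunk_size=50,
    enableLLMTouchUp=False,
)

# Plain structured output, without vector-DB id generation.
plain_cfg = SplitterConfig(llm_structured_output_type=LLM_Structured_Output_Type.STANDARD)

print(default_cfg.split_type, rl_cfg.max_rl_chunk_size, plain_cfg.llm_structured_output_type.value)
```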
@@ -0,0 +1,15 @@
+ from transformers import pipeline
+
+ pipe = pipeline("image-text-to-text", model="llava-hf/llava-1.5-13b-hf")
+ messages = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"},
+             {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
+         ],
+     },
+ ]
+
+ out = pipe(text=messages, max_new_tokens=20)
+ print(out)
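
The script pins the 13B LLaVA checkpoint, which needs a large GPU. A sketch of the same call with memory-friendlier settings; using the smaller `llava-hf/llava-1.5-7b-hf` checkpoint is an assumption, while `torch_dtype` and `device_map` are standard `pipeline` kwargs (the latter relies on `accelerate`, already in the requirements):

```python
import torch
from transformers import pipeline

# Same image-text-to-text task as os-llm/llava.py, half precision, auto device placement.
pipe = pipeline(
    "image-text-to-text",
    model="llava-hf/llava-1.5-7b-hf",  # assumption: the 7B variant is acceptable here
    torch_dtype=torch.float16,
    device_map="auto",
)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"},
            {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
        ],
    },
]
print(pipe(text=messages, max_new_tokens=20))
```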
@@ -0,0 +1,2 @@
+ RL-based text chunking pretrained model
+ Embedding model: all-MiniLM-L6-v2
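
`all-MiniLM-L6-v2` is a standard `sentence-transformers` checkpoint (already a dependency above), so the embeddings the pretrained RL chunker relies on can be reproduced directly. A minimal sketch; how the package itself loads the model is not shown in this diff:

```python
from sentence_transformers import SentenceTransformer

# Embed a few sentences the way an RL chunking policy would see candidate boundaries.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(["First sentence.", "A second, related sentence."])
print(embeddings.shape)  # (2, 384): all-MiniLM-L6-v2 yields 384-dimensional vectors
```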
@@ -0,0 +1,148 @@
+ import os
+ import shutil
+ import subprocess
+ import sys
+ import tempfile
+ from pathlib import Path
+
+ import pypandoc
+ from PIL import Image
+ import io
+ from docx2pdf import convert as docx_to_pdf
+ import fitz
+ from docx2pdf import convert as docx2pdf_convert
+ try:
+     pypandoc.get_pandoc_path()
+ except OSError:
+     print("Pandoc not found — downloading it temporarily...")
+     pypandoc.download_pandoc()
+
+ class DocuToImageConverter:
+     """Converts a document (PDF, DOCX, DOC) into a list of PIL images."""
+
+     def __init__(self):
+         pass
+
+     def _convert_doc_to_pdf(self, input_path: str) -> str:
+         import os, tempfile, shutil, subprocess
+         from pathlib import Path
+
+         if not os.path.exists(input_path):
+             raise FileNotFoundError(input_path)
+
+         output_dir = tempfile.mkdtemp()
+         output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
+
+         # 1️⃣ Try Microsoft Word COM automation (Windows only)
+         try:
+             import win32com.client
+             word = win32com.client.Dispatch("Word.Application")
+             word.Visible = False
+             doc = word.Documents.Open(str(Path(input_path).resolve()))
+             doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)  # 17 = wdFormatPDF
+             doc.Close()
+             word.Quit()
+             print("✅ Word COM conversion successful:", output_pdf)
+             return output_pdf
+         except Exception as e:
+             print("⚠️ Word COM conversion failed:", e)
+
+         # 2️⃣ Fallback: LibreOffice (cross-platform, preserves layout)
+         try:
+             # Requires LibreOffice installed and in PATH
+             subprocess.run(
+                 ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
+                 check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+             )
+             print("✅ LibreOffice conversion successful:", output_pdf)
+             return output_pdf
+         except Exception as e:
+             print("⚠️ LibreOffice conversion failed:", e)
+
+         # 3️⃣ Fallback: Pandoc (simpler, loses layout)
+         try:
+             import pypandoc
+             def which(cmd):
+                 return shutil.which(cmd) is not None
+
+             pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
+             pypandoc.convert_file(
+                 input_path, "pdf", outputfile=output_pdf,
+                 extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
+             )
+             print("✅ Pandoc conversion successful:", output_pdf)
+             return output_pdf
+         except Exception as e:
+             print("⚠️ Pandoc conversion failed:", e)
+
+         # 4️⃣ Last resort: ReportLab basic text (no formatting)
+         from reportlab.pdfgen import canvas
+         from reportlab.lib.pagesizes import A4
+         from docx import Document
+
+         doc = Document(input_path)
+         c = canvas.Canvas(output_pdf, pagesize=A4)
+         width, height = A4
+         y = height - 50
+         for p in doc.paragraphs:
+             c.drawString(50, y, p.text[:1000])
+             y -= 15
+             if y < 50:
+                 c.showPage()
+                 y = height - 50
+         c.save()
+         print("⚠️ Fallback to plain ReportLab text output:", output_pdf)
+         return output_pdf
+
+     def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
+         """
+         Converts each page of a PDF into images using PyMuPDF directly.
+         """
+         images = []
+
+         try:
+             pdf_document = fitz.open(pdf_path)  # fitz is PyMuPDF's import name
+             for page_num in range(len(pdf_document)):
+                 page = pdf_document[page_num]
+                 # Render page to a pixmap with the specified DPI
+                 pixmap = page.get_pixmap(dpi=dpi)
+                 # Convert pixmap to an Image object using PIL
+                 image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
+                 images.append(image)
+             pdf_document.close()
+         except Exception as e:
+             raise RuntimeError(f"Failed to convert PDF to images: {e}")
+
+         return images
+
+     def convert_to_images(self, file_path: str, dpi: int = 200, output_format: str = "PNG"):
+         """
+         Converts each page of a document into a list of PIL images.
+         Supports .pdf, .doc, .docx, and image files (.jpg, .png, etc.)
+         Ensures all outputs are in a consistent image format.
+         """
+         ext = os.path.splitext(file_path)[1].lower()
+
+         # Convert Word → PDF first
+         if ext in [".doc", ".docx"]:
+             pdf_path = self._convert_doc_to_pdf(file_path)
+             images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
+
+         # Convert PDF → list of images
+         elif ext == ".pdf":
+             images = self._convert_pdf_to_images(file_path, dpi=dpi)
+
+         # Handle already an image file
+         elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
+             image = Image.open(file_path).convert("RGB")
+             # Convert to a consistent in-memory format (e.g., PNG or JPEG)
+             buffer = io.BytesIO()
+             image.save(buffer, format=output_format)
+             buffer.seek(0)
+             converted_image = Image.open(buffer)
+             images = [converted_image]
+
+         else:
+             raise ValueError("Unsupported file type. Use .pdf, .doc, .docx, or image files")
+
+         return images
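
A short usage sketch for the converter above. The class and method names come straight from this file; the input path and DPI are hypothetical:

```python
from prevectorchunks_core.services.DocuToImageConverter import DocuToImageConverter

converter = DocuToImageConverter()
# .docx goes through the Word COM → LibreOffice → Pandoc → ReportLab fallback chain,
# then each resulting PDF page is rasterised by PyMuPDF at the requested DPI.
pages = converter.convert_to_images("example.docx", dpi=200)  # hypothetical input file
for i, page in enumerate(pages):
    page.save(f"page_{i}.png")  # each entry is a PIL.Image
```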