prevectorchunks-core 0.1.10__tar.gz → 0.1.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prevectorchunks_core-0.1.28/LICENSE +2 -0
- prevectorchunks_core-0.1.28/PKG-INFO +272 -0
- prevectorchunks_core-0.1.10/PKG-INFO → prevectorchunks_core-0.1.28/README.md +73 -38
- prevectorchunks_core-0.1.28/prevectorchunks_core/config/splitter_config.py +22 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/os-llm/llava.py +15 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/rlchunker/pretrained/model_info.txt +2 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/services/DocuToImageConverter.py +148 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/services/DocuToMarkdownExtractor.py +74 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/services/audio_processor.py +136 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/services/chunk_to_all_content_mapper.py +78 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/services/image_processor.py +104 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/services/markdown_and_chunk_documents.py +161 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/services/propositional_index.py +17 -6
- prevectorchunks_core-0.1.28/prevectorchunks_core/services/video_analyser.py +176 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/test_loader.py +44 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/tests/test_local.py +3 -1
- prevectorchunks_core-0.1.28/prevectorchunks_core/utils/__init__.py +4 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core/utils/extract_content.py +92 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/utils/file_loader.py +62 -16
- prevectorchunks_core-0.1.28/prevectorchunks_core.egg-info/PKG-INFO +272 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core.egg-info/SOURCES.txt +13 -0
- prevectorchunks_core-0.1.28/prevectorchunks_core.egg-info/requires.txt +37 -0
- prevectorchunks_core-0.1.28/pyproject.toml +67 -0
- prevectorchunks_core-0.1.10/README.md +0 -155
- prevectorchunks_core-0.1.10/prevectorchunks_core/config/splitter_config.py +0 -11
- prevectorchunks_core-0.1.10/prevectorchunks_core/rlchunker/testpretrained.py +0 -6
- prevectorchunks_core-0.1.10/prevectorchunks_core/test_loader.py +0 -26
- prevectorchunks_core-0.1.10/prevectorchunks_core.egg-info/PKG-INFO +0 -184
- prevectorchunks_core-0.1.10/prevectorchunks_core.egg-info/requires.txt +0 -19
- prevectorchunks_core-0.1.10/pyproject.toml +0 -41
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/LICENCE +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.10/prevectorchunks_core/rlchunker/pretrained → prevectorchunks_core-0.1.28/prevectorchunks_core/os-llm}/__init__.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.10/prevectorchunks_core/services → prevectorchunks_core-0.1.28/prevectorchunks_core/rlchunker/pretrained}/__init__.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- /prevectorchunks_core-0.1.10/prevectorchunks_core/tests/__init__.py → /prevectorchunks_core-0.1.28/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.10/prevectorchunks_core/utils → prevectorchunks_core-0.1.28/prevectorchunks_core/services}/__init__.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.10 → prevectorchunks_core-0.1.28}/setup.cfg +0 -0
@@ -0,0 +1,272 @@
Metadata-Version: 2.4
Name: prevectorchunks-core
Version: 0.1.28
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
License: MIT License
Copyright (c) 2025 Your Name

Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
Requires-Python: <3.12,>=3.7
Description-Content-Type: text/markdown
License-File: LICENCE
License-File: LICENSE
Requires-Dist: packaging~=24.1
Requires-Dist: openai<3.0.0,>=2.6.0
Requires-Dist: python-dotenv~=1.0.1
Requires-Dist: PyJWT~=2.7.0
Requires-Dist: fastapi~=0.112.2
Requires-Dist: datasets~=4.1.0
Requires-Dist: pinecone~=7.3.0
Requires-Dist: pytesseract~=0.3.13
Requires-Dist: python-docx~=1.2.0
Requires-Dist: PyPDF2~=3.0.1
Requires-Dist: pillow~=11.3.0
Requires-Dist: torch~=2.6.0
Requires-Dist: torchvision~=0.21.0
Requires-Dist: torchaudio~=2.6.0
Requires-Dist: sentence-transformers~=5.1.1
Requires-Dist: py-gutenberg~=1.0.3
Requires-Dist: langchain-text-splitters~=0.3.11
Requires-Dist: langchain~=0.3
Requires-Dist: langchain_openai~=0.3.35
Requires-Dist: accelerate>=0.22.0
Requires-Dist: pathlib~=1.0.1
Requires-Dist: transformers~=4.57.0
Requires-Dist: imageio-ffmpeg~=0.6.0
Requires-Dist: opencv-python~=4.12.0.88
Requires-Dist: requests~=2.32.5
Requires-Dist: langchain-core~=0.3.78
Requires-Dist: pdf2image~=1.17.0
Requires-Dist: docx2pdf~=0.1.8
Requires-Dist: numpy~=2.2.6
Requires-Dist: scikit-learn~=1.7.2
Requires-Dist: PyMuPDF~=1.22.5
Requires-Dist: pypandoc~=1.13
Requires-Dist: reportlab~=4.1.0
Requires-Dist: weasyprint~=62.0
Requires-Dist: lxml~=4.9.3
Requires-Dist: cssselect2~=0.7.0
Requires-Dist: cairocffi~=1.4.0
Dynamic: license-file

# 📚 PreVectorChunks

> A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.

---

## ✨ Who Needs This Module?
Any developer working with:
- **RAG pipelines**
- **Vector Databases** (like Pinecone, Weaviate, etc.)
- **AI applications** requiring **similar content retrieval**

---

## 🎯 What Does This Module Do?
This module helps you:
- **Chunk documents** into smaller fragments using:
  - a pretrained Reinforcement Learning based model, or
  - a pretrained Reinforcement Learning based model with proposition indexing, or
  - standard word chunking, or
  - recursive character based chunking, or
  - character based chunking
- **Insert (upsert) fragments** into a vector database
- **Fetch & update** existing chunks from a vector database

---

## 📦 Installation
```bash
pip install prevectorchunks-core
```

How to import in a file:
```python
from PreVectorChunks.services import chunk_documents_crud_vdb
```

**Use a `.env` file for API keys. IMPORTANT: provide at least your `OPENAI_API_KEY` in the `.env` file, plus any other keys your setup requires.**
```
PINECONE_API_KEY=YOUR_API_KEY
OPENAI_API_KEY=YOUR_API_KEY
```
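
For example, a minimal sketch of loading those keys before calling the library (uses `python-dotenv`, which is already a declared dependency):

```python
import os
from dotenv import load_dotenv

# Read the .env file in the working directory into the process environment.
load_dotenv()

assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY must be set for LLM-backed chunking"
```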

---

## 📄 Functions

### 1. `chunk_documents`
```python
chunk_documents(instructions, file_path="content_playground/content.json", splitter_config=SplitterConfig())
```
Splits the content of a document into smaller, manageable chunks. Five types of document chunking are supported; each can enable or disable an LLM touch-up that structures the chunked text (enabled by default):
- Chunking using a Reinforcement Learning based pretrained model
- Chunking using a Reinforcement Learning based pretrained model with proposition indexing
- Recursive character based chunking
- Standard word based chunking
- Simple character based chunking

**Parameters**
- `instructions` (*dict or str*): Additional rules or guidance for how the document should be split.
  - Example: `"split my content by biggest headings"`
- `file_path` (*str*): Binary file or file path to the input file containing the content, or the content of the file itself. Default: `"content_playground/content.json"`.
- `splitter_config` (optional) (*SplitterConfig*): Object that defines chunking behavior, e.g., `chunk_size`, `chunk_overlap`, `separators`, `split_type`. If none is provided, the standard split takes place.
  - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.RECURSIVE.value)`
    - (`chunk_size` refers to size in characters (i.e. 100 characters) when RECURSIVE is used)
  - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.CHARACTER.value)`
    - (`chunk_size` refers to size in characters (i.e. 100 characters) when CHARACTER is used)
  - e.g. `splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"], split_type=SplitType.STANDARD.value)`
    - (`chunk_size` refers to size in words (i.e. 100 words) when STANDARD is used)
  - e.g. `splitter_config = SplitterConfig(separators=["\n"], split_type=SplitType.R_PRETRAINED.value, min_rl_chunk_size=5, max_rl_chunk_size=50, enableLLMTouchUp=False)`
    - (`min_rl_chunk_size` and `max_rl_chunk_size` refer to size in sentences (i.e. 100 sentences) when R_PRETRAINED is used)
  - e.g. `splitter_config = SplitterConfig(separators=["\n"], split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5, max_rl_chunk_size=50, enableLLMTouchUp=False)`
    - (`min_rl_chunk_size` and `max_rl_chunk_size` refer to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)

**Returns**
- A list of chunked strings including a unique id, a meaningful title and chunked text

**Use Cases**
- Preparing text for LLM ingestion
- Splitting text by structure (headings, paragraphs)
- Vector database indexing
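
For instance, a minimal end-to-end sketch (it assumes the import path shown under Installation; the `SplitType` import location is not shown in this diff, so adjust it to wherever your installation exposes it):

```python
from PreVectorChunks.services import chunk_documents_crud_vdb
from prevectorchunks_core.config import SplitterConfig, SplitType  # SplitType location is an assumption

# Recursive character-based chunking: ~300-character chunks, no overlap.
splitter_config = SplitterConfig(
    chunk_size=300,
    chunk_overlap=0,
    separators=["\n"],
    split_type=SplitType.RECURSIVE.value,
)

chunks = chunk_documents_crud_vdb.chunk_documents(
    "split my content by biggest headings",        # instructions
    file_path="content_playground/content.json",
    splitter_config=splitter_config,
)
print(chunks)  # each chunk carries an id, a title and the chunked text
```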

---

### 2. `chunk_and_upsert_to_vdb`
```python
chunk_and_upsert_to_vdb(index_n, instructions, file_path="content_playground/content.json", splitter_config=SplitterConfig())
```
Splits a document into chunks (via `chunk_documents`) and **inserts them into a Vector Database**.

**Parameters**
- `index_n` (*str*): The name of the VDB index where chunks should be stored.
- `instructions` (*dict or str*): Rules for splitting content (same as `chunk_documents`).
- `file_path` (*str*): Path to the document file or content of the file. Default: `"content_playground/content.json"`.
- `splitter_config` (*SplitterConfig*): Object that defines chunking behavior.

**Returns**
- Confirmation of successful insert into the VDB.

**Use Cases**
- Automated document preprocessing and storage for vector search
- Preparing embeddings for semantic search
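
A minimal upsert sketch, assuming the function is exposed on the same `chunk_documents_crud_vdb` module shown above and that `OPENAI_API_KEY`/`PINECONE_API_KEY` are available in the environment:

```python
from PreVectorChunks.services import chunk_documents_crud_vdb
from prevectorchunks_core.config import SplitterConfig

# Chunk the file and upsert the resulting chunks into the "my_index" index.
chunk_documents_crud_vdb.chunk_and_upsert_to_vdb(
    "my_index",
    instructions="split by headings",
    file_path="content_playground/content.json",
    splitter_config=SplitterConfig(),  # defaults: recursive splitter, 300-character chunks
)
```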

---

### 3. `fetch_vdb_chunks_grouped_by_document_name`
```python
fetch_vdb_chunks_grouped_by_document_name(index_n)
```
Fetches existing chunks stored in the Vector Database, grouped by **document name**.

**Parameters**
- `index_n` (*str*): The name of the VDB index.

**Returns**
- A dictionary or list of chunks grouped by document name.

**Use Cases**
- Retrieving all chunks of a specific document
- Verifying what content has been ingested into the VDB
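
For example (same module-path assumption as above; the return shape is a dict or list, so adapt the iteration accordingly):

```python
from PreVectorChunks.services import chunk_documents_crud_vdb

# Inspect what has already been ingested, keyed by source document name.
docs = chunk_documents_crud_vdb.fetch_vdb_chunks_grouped_by_document_name("my_index")
if isinstance(docs, dict):
    for document_name, chunks in docs.items():
        print(document_name, len(chunks))
else:
    print(docs)
```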

---

### 4. `update_vdb_chunks_grouped_by_document_name`
```python
update_vdb_chunks_grouped_by_document_name(index_n, dataset)
```
Updates existing chunks in the Vector Database by document name.

**Parameters**
- `index_n` (*str*): The name of the VDB index.
- `dataset` (*dict or list*): The new data (chunks) to update existing entries.

**Returns**
- Confirmation of update status.

**Use Cases**
- Keeping VDB chunks up to date when documents change
- Re-ingesting revised or corrected content
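
A minimal fetch-modify-update sketch (same module-path assumption as above):

```python
from PreVectorChunks.services import chunk_documents_crud_vdb

# Fetch the current chunks, revise them, then push the revised dataset back.
dataset = chunk_documents_crud_vdb.fetch_vdb_chunks_grouped_by_document_name("my_index")
# ... edit the chunk text inside `dataset` as needed ...
result = chunk_documents_crud_vdb.update_vdb_chunks_grouped_by_document_name("my_index", dataset)
print(result)  # confirmation of update status
```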

---

### 5. `markdown_and_chunk_documents`
```python
from prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments

markdown_processor = MarkdownAndChunkDocuments()
mapped_chunks = markdown_processor.markdown_and_chunk_documents("example.pdf")
```

**Description**
This new function automatically:
1. Converts a document (PDF, DOCX, etc.) into images using `DocuToImageConverter`.
2. Extracts **Markdown and text** content from those images using `DocuToMarkdownExtractor` (powered by GPT).
3. Converts the extracted markdown text into **RL-based chunks** using `ChunkMapper` and `chunk_documents`.
4. Merges unmatched markdown segments into the final structured output.

**Parameters**
- `file_path` (*str*): Path to the document (PDF, DOCX, or image) you want to process.

**Returns**
- `mapped_chunks` (*list[dict]*): A list of markdown-based chunks with both markdown and chunked text content.

**Example**
```python
if __name__ == "__main__":
    markdown_processor = MarkdownAndChunkDocuments()
    mapped_chunks = markdown_processor.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
    print(mapped_chunks)
```

**Use Cases**
- End-to-end document-to-markdown-to-chunks pipeline
- Automating preprocessing for RAG/LLM ingestion
- Extracting structured markdown for semantic search or content indexing

---

## 🚀 Example Workflow
```python
from prevectorchunks_core.config import SplitterConfig

splitter_config = SplitterConfig(chunk_size=150, chunk_overlap=0, separators=["\n"], split_type=SplitType.R_PRETRAINED_PROPOSITION.value)

# Step 1: Chunk a document
chunks = chunk_documents(
    instructions="split my content by biggest headings",
    file_path="content_playground/content.json",
    splitter_config=splitter_config
)

splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
                                 split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
                                 max_rl_chunk_size=50, enableLLMTouchUp=False)

chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt", splitter_config=splitter_config)

# Step 2: Insert chunks into VDB
chunk_and_upsert_to_vdb("my_index", instructions="split by headings", splitter_config=splitter_config)

# Step 3: Fetch stored chunks
docs = fetch_vdb_chunks_grouped_by_document_name("my_index")

# Step 4: Update chunks if needed
update_vdb_chunks_grouped_by_document_name("my_index", dataset=docs)
```

---

## 🛠 Use Cases
- Preprocessing documents for LLM ingestion
- Semantic search and Q&A systems
- Vector database indexing and retrieval
- Maintaining versioned document chunks

@@ -1,32 +1,3 @@
-Metadata-Version: 2.4
-Name: prevectorchunks-core
-Version: 0.1.10
-Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
-Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
-Project-URL: Homepage, https://github.com/yourusername/mydep
-Description-Content-Type: text/markdown
-License-File: LICENCE
-Requires-Dist: packaging~=24.1
-Requires-Dist: requests~=2.32.3
-Requires-Dist: openai~=1.37.1
-Requires-Dist: python-dotenv~=1.0.1
-Requires-Dist: PyJWT~=2.7.0
-Requires-Dist: fastapi~=0.112.2
-Requires-Dist: datasets~=4.1.0
-Requires-Dist: pinecone~=7.3.0
-Requires-Dist: pytesseract~=0.3.13
-Requires-Dist: python-docx~=1.2.0
-Requires-Dist: PyPDF2~=3.0.1
-Requires-Dist: pillow~=11.3.0
-Requires-Dist: torch~=2.6.0
-Requires-Dist: torchvision~=0.21.0
-Requires-Dist: torchaudio~=2.6.0
-Requires-Dist: sentence-transformers~=5.1.1
-Requires-Dist: py-gutenberg~=1.0.3
-Requires-Dist: langchain-text-splitters~=0.3.11
-Requires-Dist: langchain~=0.3
-Dynamic: license-file
-
 # 📚 PreVectorChunks
 
 > A lightweight utility for **document chunking** and **vector database upserts** — designed for developers building **RAG (Retrieval-Augmented Generation)** solutions.
@@ -43,7 +14,12 @@ Any developer working with:
 
 ## 🎯 What Does This Module Do?
 This module helps you:
-- **Chunk documents** into smaller fragments
+- **Chunk documents** into smaller fragments using:
+  - a pretrained Reinforcement Learning based model or
+  - a pretrained Reinforcement Learning based model with proposition indexing or
+  - standard word chunking
+  - recursive character based chunking
+  - character based chunking
 - **Insert (upsert) fragments** into a vector database
 - **Fetch & update** existing chunks from a vector database
 
@@ -59,7 +35,7 @@ How to import in a file:
 from PreVectorChunks.services import chunk_documents_crud_vdb
 ```
 
-**Use .env for API keys
+**Use .env for API keys: IMPORTANT: PLEASE ENSURE TO PROVIDE YOUR OPENAI_API_KEY as MINIMUM in an .env file or as required**
 ```
 PINECONE_API_KEY=YOUR_API_KEY
 OPENAI_API_KEY=YOUR_API_KEY
@@ -73,17 +49,35 @@ OPENAI_API_KEY=YOUR_API_KEY
 ```python
 chunk_documents(instructions, file_path="content_playground/content.json", splitter_config=SplitterConfig())
 ```
-Splits the content of a document into smaller, manageable chunks.
+Splits the content of a document into smaller, manageable chunks. - Five types of document chunking
+- Chunking using Reinforcement Learning based pretrained model +(enable/disable LLM to structure the chunked text - default is enabled)
+- Chunking using Reinforcement Learning based pretrained model and proposition indexing +(enable/disable LLM to structure the chunked text - default is enabled)
+- Recursive Character based chunking +(enable/disable LLM to structure the chunked text - default is enabled)
+- Standard word based chunking +(enable/disable LLM to structure the chunked text - default is enabled)
+- Simple character based chunking +(enable/disable LLM to structure the chunked text - default is enabled)
+
 
 **Parameters**
 - `instructions` (*dict or str*): Additional rules or guidance for how the document should be split.
   - Example: `"split my content by biggest headings"`
-- `file_path` (*str*):
+- `file_path` (*str*): Binary file or file path to the input file containing the content or content of the file. Default: `"content_playground/content.json"`.
 - `splitter_config (optional) ` (*SplitterConfig*): (if none provided standard split takes place) Object that defines chunking behavior, e.g., `chunk_size`, `chunk_overlap`, `separator`, `split_type`.
-  - i.e. splitter_config = SplitterConfig(chunk_size= 300, chunk_overlap= 0,separators=["\n"],split_type=
-  - i.e.
-  - i.e. splitter_config = SplitterConfig(chunk_size= 300, chunk_overlap= 0,separators=["\n"],split_type=
-
+  - i.e. splitter_config = SplitterConfig(chunk_size= 300, chunk_overlap= 0,separators=["\n"],split_type=SplitType.RECURSIVE.value)
+    - (chunk_size refers to size in characters (i.e. 100 characters) when RECURSIVE is used)
+  - i.e. splitter_config = SplitterConfig(chunk_size= 300, chunk_overlap= 0,separators=["\n"],split_type=SplitType.CHARACTER.value)
+    - (chunk_size refers to size in characters (i.e. 100 characters) when CHARACTER is used)
+  - i.e. splitter_config = SplitterConfig(chunk_size= 300, chunk_overlap= 0,separators=["\n"],split_type=SplitType.STANDARD.value)
+    - (chunk_size refers to size in words (i.e. 100 words) when STANDARD is used)
+  - i.e. splitter_config = SplitterConfig(separators=["\n"],
+         split_type=SplitType.R_PRETRAINED.value, min_rl_chunk_size=5,
+         max_rl_chunk_size=50,enableLLMTouchUp=False)
+    - (min_rl_chunk_size and max_rl_chunk_size refer to size in sentences (i.e. 100 sentences) when R_PRETRAINED is used)
+  - i.e. splitter_config = SplitterConfig(separators=["\n"],
+         split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
+         max_rl_chunk_size=50,enableLLMTouchUp=False)
+    - (min_rl_chunk_size and max_rl_chunk_size refer to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
+
+- **Returns**
 - A list of chunked strings including a unique id, a meaningful title and chunked text
 
 **Use Cases**
@@ -149,13 +143,48 @@ Updates existing chunks in the Vector Database by document name.
 - Keeping VDB chunks up to date when documents change
 - Re-ingesting revised or corrected content
 
+---
+### 5. ``markdown_and_chunk_documents``
+```python
+from prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
+
+markdown_processor = MarkdownAndChunkDocuments()
+mapped_chunks = markdown_processor.markdown_and_chunk_documents("example.pdf")
+```
+
+**Description**
+This new function automatically:
+1. Converts a document (PDF, DOCX, etc.) into images using `DocuToImageConverter`.
+2. Extracts **Markdown and text** content from those images using `DocuToMarkdownExtractor` (powered by GPT).
+3. Converts the extracted markdown text into **RL-based chunks** using `ChunkMapper` and `chunk_documents`.
+4. Merges unmatched markdown segments into the final structured output.
+
+**Parameters**
+- `file_path` (*str*): Path to the document (PDF, DOCX, or image) you want to process.
+
+**Returns**
+- `mapped_chunks` (*list[dict]*): A list of markdown-based chunks with both markdown and chunked text content.
+
+**Example**
+```python
+if __name__ == "__main__":
+    markdown_processor = MarkdownAndChunkDocuments()
+    mapped_chunks = markdown_processor.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
+    print(mapped_chunks)
+```
+
+**Use Cases**
+- End-to-end document-to-markdown-to-chunks pipeline
+- Automating preprocessing for RAG/LLM ingestion
+- Extracting structured markdown for semantic search or content indexing
+
 ---
 
 ## 🚀 Example Workflow
 ```python
 from prevectorchunks_core.config import SplitterConfig
 
-splitter_config = SplitterConfig(chunk_size=150, chunk_overlap=0, separator=["\n"], split_type=
+splitter_config = SplitterConfig(chunk_size=150, chunk_overlap=0, separator=["\n"], split_type=SplitType.R_PRETRAINED_PROPOSITION.value)
 
 # Step 1: Chunk a document
 chunks = chunk_documents(
@@ -164,6 +193,12 @@ chunks = chunk_documents(
     splitter_config=splitter_config
 )
 
+splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
+                                 split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
+                                 max_rl_chunk_size=50,enableLLMTouchUp=False)
+
+chunks=chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",splitter_config=splitter_config)
+
 # Step 2: Insert chunks into VDB
 chunk_and_upsert_to_vdb("my_index", instructions="split by headings", splitter_config=splitter_config)
 
@@ -0,0 +1,22 @@
# prevectorchunks_core/config.py
from dataclasses import dataclass, field
from enum import Enum


class LLM_Structured_Output_Type(Enum):
    STANDARD = "STANDARD"
    STRUCTURED_WITH_VECTOR_DB_ID_GENERATED = "STRUCTURED_WITH_VECTOR_DB_ID_GENERATED"


@dataclass()
class SplitterConfig:
    chunk_size: int = 300
    chunk_overlap: int = 0
    separators: list[str] = field(default_factory=lambda: ["\n"])
    split_type: str = "recursive_splitter"
    enableLLMTouchUp: bool = True
    llm_structured_output_type: LLM_Structured_Output_Type = LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED
    min_rl_chunk_size: int = 5
    max_rl_chunk_size: int = 50

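To make these defaults concrete, a small sketch of building configs from this dataclass (module path taken from the file listing above; the `SplitType` enum is referenced by the README but not shown in this diff, so the RL split-type string below is illustrative only):

```python
from prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type

# Defaults: recursive splitter, 300-character chunks, LLM touch-up enabled.
default_config = SplitterConfig()

# RL-based splitting: chunk bounds are expressed in sentences, LLM touch-up disabled.
rl_config = SplitterConfig(
    separators=["\n"],
    split_type="rl_pretrained_splitter",  # illustrative only; prefer SplitType.R_PRETRAINED.value
    min_rl_chunk_size=5,
    max_rl_chunk_size=50,
    enableLLMTouchUp=False,
    llm_structured_output_type=LLM_Structured_Output_Type.STANDARD,
)
print(default_config)
print(rl_config)
```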
@@ -0,0 +1,15 @@
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="llava-hf/llava-1.5-13b-hf")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"},
            {"type": "text", "text": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud"},
        ],
    },
]

out = pipe(text=messages, max_new_tokens=20)
print(out)

Binary file

@@ -0,0 +1,148 @@
import os
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

import pypandoc
from PIL import Image
import io
from docx2pdf import convert as docx_to_pdf
import fitz
from docx2pdf import convert as docx2pdf_convert
try:
    pypandoc.get_pandoc_path()
except OSError:
    print("Pandoc not found — downloading it temporarily...")
    pypandoc.download_pandoc()

class DocuToImageConverter:
    """Converts a document (PDF, DOCX, DOC) into a list of PIL images."""

    def __init__(self):
        pass

    def _convert_doc_to_pdf(self, input_path: str) -> str:
        import os, tempfile, shutil, subprocess
        from pathlib import Path

        if not os.path.exists(input_path):
            raise FileNotFoundError(input_path)

        output_dir = tempfile.mkdtemp()
        output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")

        # 1️⃣ Try Microsoft Word COM automation (Windows only)
        try:
            import win32com.client
            word = win32com.client.Dispatch("Word.Application")
            word.Visible = False
            doc = word.Documents.Open(str(Path(input_path).resolve()))
            doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)  # 17 = wdFormatPDF
            doc.Close()
            word.Quit()
            print("✅ Word COM conversion successful:", output_pdf)
            return output_pdf
        except Exception as e:
            print("⚠️ Word COM conversion failed:", e)

        # 2️⃣ Fallback: LibreOffice (cross-platform, preserves layout)
        try:
            # Requires LibreOffice installed and in PATH
            subprocess.run(
                ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            print("✅ LibreOffice conversion successful:", output_pdf)
            return output_pdf
        except Exception as e:
            print("⚠️ LibreOffice conversion failed:", e)

        # 3️⃣ Fallback: Pandoc (simpler, loses layout)
        try:
            import pypandoc
            def which(cmd):
                return shutil.which(cmd) is not None

            pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
            pypandoc.convert_file(
                input_path, "pdf", outputfile=output_pdf,
                extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
            )
            print("✅ Pandoc conversion successful:", output_pdf)
            return output_pdf
        except Exception as e:
            print("⚠️ Pandoc conversion failed:", e)

        # 4️⃣ Last resort: ReportLab basic text (no formatting)
        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import A4
        from docx import Document

        doc = Document(input_path)
        c = canvas.Canvas(output_pdf, pagesize=A4)
        width, height = A4
        y = height - 50
        for p in doc.paragraphs:
            c.drawString(50, y, p.text[:1000])
            y -= 15
            if y < 50:
                c.showPage()
                y = height - 50
        c.save()
        print("⚠️ Fallback to plain ReportLab text output:", output_pdf)
        return output_pdf

    def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
        """
        Converts each page of a PDF into images using PyMuPDF directly.
        """
        images = []

        try:
            pdf_document = fitz.open(pdf_path)  # fitz is PyMuPDF's import name
            for page_num in range(len(pdf_document)):
                page = pdf_document[page_num]
                # Render page to a pixmap with the specified DPI
                pixmap = page.get_pixmap(dpi=dpi)
                # Convert pixmap to an Image object using PIL
                image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
                images.append(image)
            pdf_document.close()
        except Exception as e:
            raise RuntimeError(f"Failed to convert PDF to images: {e}")

        return images

    def convert_to_images(self, file_path: str, dpi: int = 200, output_format: str = "PNG"):
        """
        Converts each page of a document into a list of PIL images.
        Supports .pdf, .doc, .docx, and image files (.jpg, .png, etc.)
        Ensures all outputs are in a consistent image format.
        """
        ext = os.path.splitext(file_path)[1].lower()

        # Convert Word → PDF first
        if ext in [".doc", ".docx"]:
            pdf_path = self._convert_doc_to_pdf(file_path)
            images = self._convert_pdf_to_images(pdf_path, dpi=dpi)

        # Convert PDF → list of images
        elif ext == ".pdf":
            images = self._convert_pdf_to_images(file_path, dpi=dpi)

        # Handle already an image file
        elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
            image = Image.open(file_path).convert("RGB")
            # Convert to consistent format (e.g., PNG or JPEG in memory)
            buffer = io.BytesIO()
            image.save(buffer, format=output_format)
            buffer.seek(0)
            converted_image = Image.open(buffer)
            images = [converted_image]

        else:
            raise ValueError("Unsupported file type. Use .pdf, .doc, .docx, or image files")

        return images
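
A minimal usage sketch for the converter above (the file name is a placeholder; Word, LibreOffice or Pandoc are only needed when the input is .doc/.docx):

```python
from prevectorchunks_core.services.DocuToImageConverter import DocuToImageConverter

converter = DocuToImageConverter()

# Render each page of a PDF at 200 DPI and write the pages out as PNG files.
pages = converter.convert_to_images("example.pdf", dpi=200)
for i, page in enumerate(pages):
    page.save(f"page_{i + 1}.png")
print(f"Converted {len(pages)} page(s)")
```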