prevectorchunks-core 0.1.24__tar.gz → 0.1.25__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of prevectorchunks-core might be problematic. Click here for more details.
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/PKG-INFO +36 -1
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/README.md +35 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core.egg-info/PKG-INFO +36 -1
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/pyproject.toml +1 -1
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/LICENCE +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/config/splitter_config.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/os-llm/llava.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/DocuToImageConverter.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/image_processor.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/markdown_and_chunk_documents.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/test_loader.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/utils/file_loader.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core.egg-info/SOURCES.txt +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core.egg-info/requires.txt +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.24
|
|
3
|
+
Version: 0.1.25
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/yourusername/mydep
|
|
@@ -187,6 +187,41 @@ Updates existing chunks in the Vector Database by document name.
|
|
|
187
187
|
- Keeping VDB chunks up to date when documents change
|
|
188
188
|
- Re-ingesting revised or corrected content
|
|
189
189
|
|
|
190
|
+
---
|
|
191
|
+
### 5. ``markdown_and_chunk_documents``
|
|
192
|
+
```python
|
|
193
|
+
from prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
|
|
194
|
+
|
|
195
|
+
markdown_processor = MarkdownAndChunkDocuments()
|
|
196
|
+
mapped_chunks = markdown_processor.markdown_and_chunk_documents("example.pdf")
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
**Description**
|
|
200
|
+
This new function automatically:
|
|
201
|
+
1. Converts a document (PDF, DOCX, etc.) into images using `DocuToImageConverter`.
|
|
202
|
+
2. Extracts **Markdown and text** content from those images using `DocuToMarkdownExtractor` (powered by GPT).
|
|
203
|
+
3. Converts the extracted markdown text into **RL-based chunks** using `ChunkMapper` and `chunk_documents`.
|
|
204
|
+
4. Merges unmatched markdown segments into the final structured output.
|
|
205
|
+
|
|
206
|
+
**Parameters**
|
|
207
|
+
- `file_path` (*str*): Path to the document (PDF, DOCX, or image) you want to process.
|
|
208
|
+
|
|
209
|
+
**Returns**
|
|
210
|
+
- `mapped_chunks` (*list[dict]*): A list of markdown-based chunks with both markdown and chunked text content.
|
|
211
|
+
|
|
212
|
+
**Example**
|
|
213
|
+
```python
|
|
214
|
+
if __name__ == "__main__":
|
|
215
|
+
markdown_processor = MarkdownAndChunkDocuments()
|
|
216
|
+
mapped_chunks = markdown_processor.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
|
|
217
|
+
print(mapped_chunks)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
**Use Cases**
|
|
221
|
+
- End-to-end document-to-markdown-to-chunks pipeline
|
|
222
|
+
- Automating preprocessing for RAG/LLM ingestion
|
|
223
|
+
- Extracting structured markdown for semantic search or content indexing
|
|
224
|
+
|
|
190
225
|
---
|
|
191
226
|
|
|
192
227
|
## 🚀 Example Workflow
|
|
@@ -142,6 +142,41 @@ Updates existing chunks in the Vector Database by document name.
|
|
|
142
142
|
- Keeping VDB chunks up to date when documents change
|
|
143
143
|
- Re-ingesting revised or corrected content
|
|
144
144
|
|
|
145
|
+
---
|
|
146
|
+
### 5. ``markdown_and_chunk_documents``
|
|
147
|
+
```python
|
|
148
|
+
from prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
|
|
149
|
+
|
|
150
|
+
markdown_processor = MarkdownAndChunkDocuments()
|
|
151
|
+
mapped_chunks = markdown_processor.markdown_and_chunk_documents("example.pdf")
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Description**
|
|
155
|
+
This new function automatically:
|
|
156
|
+
1. Converts a document (PDF, DOCX, etc.) into images using `DocuToImageConverter`.
|
|
157
|
+
2. Extracts **Markdown and text** content from those images using `DocuToMarkdownExtractor` (powered by GPT).
|
|
158
|
+
3. Converts the extracted markdown text into **RL-based chunks** using `ChunkMapper` and `chunk_documents`.
|
|
159
|
+
4. Merges unmatched markdown segments into the final structured output.
|
|
160
|
+
|
|
161
|
+
**Parameters**
|
|
162
|
+
- `file_path` (*str*): Path to the document (PDF, DOCX, or image) you want to process.
|
|
163
|
+
|
|
164
|
+
**Returns**
|
|
165
|
+
- `mapped_chunks` (*list[dict]*): A list of markdown-based chunks with both markdown and chunked text content.
|
|
166
|
+
|
|
167
|
+
**Example**
|
|
168
|
+
```python
|
|
169
|
+
if __name__ == "__main__":
|
|
170
|
+
markdown_processor = MarkdownAndChunkDocuments()
|
|
171
|
+
mapped_chunks = markdown_processor.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
|
|
172
|
+
print(mapped_chunks)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Use Cases**
|
|
176
|
+
- End-to-end document-to-markdown-to-chunks pipeline
|
|
177
|
+
- Automating preprocessing for RAG/LLM ingestion
|
|
178
|
+
- Extracting structured markdown for semantic search or content indexing
|
|
179
|
+
|
|
145
180
|
---
|
|
146
181
|
|
|
147
182
|
## 🚀 Example Workflow
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.24
|
|
3
|
+
Version: 0.1.25
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/yourusername/mydep
|
|
@@ -187,6 +187,41 @@ Updates existing chunks in the Vector Database by document name.
|
|
|
187
187
|
- Keeping VDB chunks up to date when documents change
|
|
188
188
|
- Re-ingesting revised or corrected content
|
|
189
189
|
|
|
190
|
+
---
|
|
191
|
+
### 5. ``markdown_and_chunk_documents``
|
|
192
|
+
```python
|
|
193
|
+
from prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
|
|
194
|
+
|
|
195
|
+
markdown_processor = MarkdownAndChunkDocuments()
|
|
196
|
+
mapped_chunks = markdown_processor.markdown_and_chunk_documents("example.pdf")
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
**Description**
|
|
200
|
+
This new function automatically:
|
|
201
|
+
1. Converts a document (PDF, DOCX, etc.) into images using `DocuToImageConverter`.
|
|
202
|
+
2. Extracts **Markdown and text** content from those images using `DocuToMarkdownExtractor` (powered by GPT).
|
|
203
|
+
3. Converts the extracted markdown text into **RL-based chunks** using `ChunkMapper` and `chunk_documents`.
|
|
204
|
+
4. Merges unmatched markdown segments into the final structured output.
|
|
205
|
+
|
|
206
|
+
**Parameters**
|
|
207
|
+
- `file_path` (*str*): Path to the document (PDF, DOCX, or image) you want to process.
|
|
208
|
+
|
|
209
|
+
**Returns**
|
|
210
|
+
- `mapped_chunks` (*list[dict]*): A list of markdown-based chunks with both markdown and chunked text content.
|
|
211
|
+
|
|
212
|
+
**Example**
|
|
213
|
+
```python
|
|
214
|
+
if __name__ == "__main__":
|
|
215
|
+
markdown_processor = MarkdownAndChunkDocuments()
|
|
216
|
+
mapped_chunks = markdown_processor.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
|
|
217
|
+
print(mapped_chunks)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
**Use Cases**
|
|
221
|
+
- End-to-end document-to-markdown-to-chunks pipeline
|
|
222
|
+
- Automating preprocessing for RAG/LLM ingestion
|
|
223
|
+
- Extracting structured markdown for semantic search or content indexing
|
|
224
|
+
|
|
190
225
|
---
|
|
191
226
|
|
|
192
227
|
## 🚀 Example Workflow
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "prevectorchunks-core"
|
|
7
|
-
version = "0.1.24"
|
|
7
|
+
version = "0.1.25"
|
|
8
8
|
description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/config/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/os-llm/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/os-llm/llava.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/env.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/model.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/reward.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/rlchunker/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/test_loader.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/tests/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/tests/test_local.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.24 → prevectorchunks_core-0.1.25}/prevectorchunks_core/utils/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|