prevectorchunks-core 0.1.23__py3-none-any.whl → 0.1.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of prevectorchunks-core might be problematic. Click here for more details.

@@ -1,5 +1,14 @@
1
1
  # prevectorchunks_core/config.py
2
2
  from dataclasses import dataclass, field
3
+ from enum import Enum
4
+
5
+
6
+ class LLM_Structured__Output_Type(Enum):
7
+ RECURSIVE = "RecursiveCharacterTextSplitter"
8
+ CHARACTER = "CharacterTextSplitter"
9
+ STANDARD = "standard"
10
+ R_PRETRAINED_PROPOSITION = "RLBasedTextSplitterWithProposition"
11
+ R_PRETRAINED = "RLBasedTextSplitter"
3
12
 
4
13
  @dataclass()
5
14
  class SplitterConfig:
@@ -12,9 +12,9 @@ from ..config.splitter_config import SplitterConfig
12
12
 
13
13
  from dotenv import load_dotenv
14
14
 
15
- from chunk_documents_crud_vdb import chunk_documents
16
- from chunk_to_all_content_mapper import ChunkMapper
17
- from core.prevectorchunks_core.utils.file_loader import SplitType
15
+ from .chunk_documents_crud_vdb import chunk_documents
16
+ from .chunk_to_all_content_mapper import ChunkMapper
17
+ from ..utils.file_loader import SplitType
18
18
 
19
19
  load_dotenv(override=True)
20
20
 
@@ -1,9 +1,10 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.23
3
+ Version: 0.1.26
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
- Project-URL: Homepage, https://github.com/yourusername/mydep
6
+ Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
7
+ Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
7
8
  Description-Content-Type: text/markdown
8
9
  License-File: LICENCE
9
10
  Requires-Dist: packaging~=24.1
@@ -187,6 +188,41 @@ Updates existing chunks in the Vector Database by document name.
187
188
  - Keeping VDB chunks up to date when documents change
188
189
  - Re-ingesting revised or corrected content
189
190
 
191
+ ---
192
+ ### 5. ``markdown_and_chunk_documents``
193
+ ```python
194
+ from prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
195
+
196
+ markdown_processor = MarkdownAndChunkDocuments()
197
+ mapped_chunks = markdown_processor.markdown_and_chunk_documents("example.pdf")
198
+ ```
199
+
200
+ **Description**
201
+ This new function automatically:
202
+ 1. Converts a document (PDF, DOCX, etc.) into images using `DocuToImageConverter`.
203
+ 2. Extracts **Markdown and text** content from those images using `DocuToMarkdownExtractor` (powered by GPT).
204
+ 3. Converts the extracted markdown text into **RL-based chunks** using `ChunkMapper` and `chunk_documents`.
205
+ 4. Merges unmatched markdown segments into the final structured output.
206
+
207
+ **Parameters**
208
+ - `file_path` (*str*): Path to the document (PDF, DOCX, or image) you want to process.
209
+
210
+ **Returns**
211
+ - `mapped_chunks` (*list[dict]*): A list of markdown-based chunks with both markdown and chunked text content.
212
+
213
+ **Example**
214
+ ```python
215
+ if __name__ == "__main__":
216
+     markdown_processor = MarkdownAndChunkDocuments()
216
+     mapped_chunks = markdown_processor.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
217
+     print(mapped_chunks)
219
+ ```
220
+
221
+ **Use Cases**
222
+ - End-to-end document-to-markdown-to-chunks pipeline
223
+ - Automating preprocessing for RAG/LLM ingestion
224
+ - Extracting structured markdown for semantic search or content indexing
225
+
190
226
  ---
191
227
 
192
228
  ## 🚀 Example Workflow
@@ -1,7 +1,7 @@
1
1
  prevectorchunks_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  prevectorchunks_core/test_loader.py,sha256=4u4XPFhWruZxHU2pNsDDSNr6FG2yPHXA96CGZXIoqLA,1019
3
3
  prevectorchunks_core/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- prevectorchunks_core/config/splitter_config.py,sha256=_dUELUfQ9YQotmKY9JsOochsXVenEhDPZiFPdogSpR0,385
4
+ prevectorchunks_core/config/splitter_config.py,sha256=FvG2nMg_UB8Qzlc-GdMGzw3hRXfSCbM0L8XI2tquLS4,683
5
5
  prevectorchunks_core/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  prevectorchunks_core/os-llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  prevectorchunks_core/os-llm/llava.py,sha256=GXdVoT6FJ3AKl4c5wY5CWweIs7w82jOevhPfiLhQPZY,526
@@ -23,7 +23,7 @@ prevectorchunks_core/services/audio_processor.py,sha256=XKNYhXHIt_77a3PT2wwKvnCS
23
23
  prevectorchunks_core/services/chunk_documents_crud_vdb.py,sha256=Md4vy7vJDnSYpvZiF0HbHCOA0StSVm62ALHAPYU2A7I,16279
24
24
  prevectorchunks_core/services/chunk_to_all_content_mapper.py,sha256=xEz2idxJTsJwyCJWMPZCk3CFcalKhbSuucFH9TPouU0,2778
25
25
  prevectorchunks_core/services/image_processor.py,sha256=2CRwTbI-czbakm9aG-kMdx908bc5H1rQETQiVCKbWd8,3518
26
- prevectorchunks_core/services/markdown_and_chunk_documents.py,sha256=BYwu4FcliFU-adnPoqUuqjAkRvV7mVtOvAPKS1sM6Zk,2884
26
+ prevectorchunks_core/services/markdown_and_chunk_documents.py,sha256=vfGvvirn3rtwIVJtVK5_dJSrV3JeO0p0d0rc5BOnGx8,2862
27
27
  prevectorchunks_core/services/propositional_index.py,sha256=cVH3obhLtlcfJYA6VN4KfC3len4fe5nNcboorlouOb0,4151
28
28
  prevectorchunks_core/services/video_analyser.py,sha256=1wI38xZ8vdE8T4EBAnxWzt7Hc8vTYrdQhbA4Y5VZLeY,6651
29
29
  prevectorchunks_core/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -32,8 +32,8 @@ prevectorchunks_core/utils/__init__.py,sha256=aez3v2dwGHXvmALXVBPR-mQgvxMqxv9NsE
32
32
  prevectorchunks_core/utils/extract_content.py,sha256=fMDT-BsjYutHLnOFebLhMFpb1UFAB8ldGldxh11FsXw,2920
33
33
  prevectorchunks_core/utils/file_loader.py,sha256=JkCKiz3M2TMw5qHoTJXhbn33PfTv5gvQ3nfrbaQOmHs,10689
34
34
  prevectorchunks_core/utils/llm_wrapper.py,sha256=7GfyM5p5PeIehi4Dj5jgC7-xi2SjZuyyPuLkWtucQzQ,1139
35
- prevectorchunks_core-0.1.23.dist-info/licenses/LICENCE,sha256=Ljp4XVKnncsQ59h0eMW6J5V-ylsVeqDRC8smR7UPIDs,512
36
- prevectorchunks_core-0.1.23.dist-info/METADATA,sha256=vKOvF3fsJY6Zn-jfY0DNUP6CljBQadByMcD8xzta2WU,9272
37
- prevectorchunks_core-0.1.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
38
- prevectorchunks_core-0.1.23.dist-info/top_level.txt,sha256=OWJgfrUDNTh49PpKvRXHY8lVeWqzFbTr9OkDoAvpvPk,21
39
- prevectorchunks_core-0.1.23.dist-info/RECORD,,
35
+ prevectorchunks_core-0.1.26.dist-info/licenses/LICENCE,sha256=Ljp4XVKnncsQ59h0eMW6J5V-ylsVeqDRC8smR7UPIDs,512
36
+ prevectorchunks_core-0.1.26.dist-info/METADATA,sha256=KxZUWsezZRpvFhD29CyDLV2_VdLDQKgMOs37skA_wfk,10775
37
+ prevectorchunks_core-0.1.26.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
38
+ prevectorchunks_core-0.1.26.dist-info/top_level.txt,sha256=OWJgfrUDNTh49PpKvRXHY8lVeWqzFbTr9OkDoAvpvPk,21
39
+ prevectorchunks_core-0.1.26.dist-info/RECORD,,