prevectorchunks-core 0.1.26__tar.gz → 0.1.27__tar.gz

This diff compares publicly released versions of the package as they appear in a supported public registry, and is provided for informational purposes only.

Potentially problematic release.


This version of prevectorchunks-core might be problematic.

Files changed (47)
  1. prevectorchunks_core-0.1.27/LICENSE +2 -0
  2. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/PKG-INFO +14 -6
  3. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/README.md +2 -1
  4. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/splitter_config.py +5 -6
  5. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToImageConverter.py +58 -7
  6. prevectorchunks_core-0.1.27/prevectorchunks_core/services/markdown_and_chunk_documents.py +167 -0
  7. prevectorchunks_core-0.1.27/prevectorchunks_core/test_loader.py +44 -0
  8. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/file_loader.py +14 -10
  9. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/PKG-INFO +14 -6
  10. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/SOURCES.txt +1 -0
  11. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/requires.txt +6 -4
  12. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/pyproject.toml +12 -10
  13. prevectorchunks_core-0.1.26/prevectorchunks_core/services/markdown_and_chunk_documents.py +0 -71
  14. prevectorchunks_core-0.1.26/prevectorchunks_core/test_loader.py +0 -26
  15. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/LICENCE +0 -0
  16. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/__init__.py +0 -0
  17. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/config/__init__.py +0 -0
  18. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/migrations/__init__.py +0 -0
  19. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/__init__.py +0 -0
  20. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/os-llm/llava.py +0 -0
  21. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/__init__.py +0 -0
  22. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/env.py +0 -0
  23. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/inference.py +0 -0
  24. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/model.py +0 -0
  25. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
  26. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
  27. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
  28. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/reward.py +0 -0
  29. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
  30. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
  31. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/rlchunker/utils.py +0 -0
  32. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +0 -0
  33. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/__init__.py +0 -0
  34. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/audio_processor.py +0 -0
  35. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
  36. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
  37. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/image_processor.py +0 -0
  38. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/propositional_index.py +0 -0
  39. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/services/video_analyser.py +0 -0
  40. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/__init__.py +0 -0
  41. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/tests/test_local.py +0 -0
  42. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/__init__.py +0 -0
  43. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/extract_content.py +0 -0
  44. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
  45. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
  46. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/prevectorchunks_core.egg-info/top_level.txt +0 -0
  47. {prevectorchunks_core-0.1.26 → prevectorchunks_core-0.1.27}/setup.cfg +0 -0
@@ -0,0 +1,2 @@
+ MIT License
+ Copyright (c) 2025 Your Name
@@ -1,14 +1,18 @@
  Metadata-Version: 2.4
  Name: prevectorchunks-core
- Version: 0.1.26
+ Version: 0.1.27
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+ License: MIT License
+ Copyright (c) 2025 Your Name
+
  Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
  Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
+ Requires-Python: <3.12,>=3.7
  Description-Content-Type: text/markdown
  License-File: LICENCE
+ License-File: LICENSE
  Requires-Dist: packaging~=24.1
- Requires-Dist: requests~=2.32.3
  Requires-Dist: openai<3.0.0,>=2.6.0
  Requires-Dist: python-dotenv~=1.0.1
  Requires-Dist: PyJWT~=2.7.0
@@ -27,10 +31,7 @@ Requires-Dist: py-gutenberg~=1.0.3
  Requires-Dist: langchain-text-splitters~=0.3.11
  Requires-Dist: langchain~=0.3
  Requires-Dist: langchain_openai~=0.3.35
- Requires-Dist: transformers>=4.30.0
  Requires-Dist: accelerate>=0.22.0
- Requires-Dist: imageio-ffmpeg>=0.4.8
- Requires-Dist: opencv-python>=4.10.0
  Requires-Dist: pathlib~=1.0.1
  Requires-Dist: transformers~=4.57.0
  Requires-Dist: imageio-ffmpeg~=0.6.0
@@ -42,6 +43,12 @@ Requires-Dist: docx2pdf~=0.1.8
  Requires-Dist: numpy~=2.2.6
  Requires-Dist: scikit-learn~=1.7.2
  Requires-Dist: PyMuPDF~=1.22.5
+ Requires-Dist: pypandoc~=1.13
+ Requires-Dist: reportlab~=4.1.0
+ Requires-Dist: weasyprint~=62.0
+ Requires-Dist: lxml~=4.9.3
+ Requires-Dist: cssselect2~=0.7.0
+ Requires-Dist: cairocffi~=1.4.0
  Dynamic: license-file

  # 📚 PreVectorChunks
@@ -122,7 +129,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
  split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
  max_rl_chunk_size=50,enableLLMTouchUp=False)
  - - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
- **Returns**
+
+ - **Returns**
  - A list of chunked strings including a unique id, a meaningful title and chunked text

  **Use Cases**
@@ -76,7 +76,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
  split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
  max_rl_chunk_size=50,enableLLMTouchUp=False)
  - - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
- **Returns**
+
+ - **Returns**
  - A list of chunked strings including a unique id, a meaningful title and chunked text

  **Use Cases**
@@ -3,12 +3,10 @@ from dataclasses import dataclass, field
  from enum import Enum


- class LLM_Structured__Output_Type(Enum):
-     RECURSIVE = "RecursiveCharacterTextSplitter"
-     CHARACTER = "CharacterTextSplitter"
-     STANDARD = "standard"
-     R_PRETRAINED_PROPOSITION = "RLBasedTextSplitterWithProposition"
-     R_PRETRAINED = "RLBasedTextSplitter"
+ class LLM_Structured_Output_Type(Enum):
+     STANDARD = "STANDARD"
+     STRUCTURED_WITH_VECTOR_DB_ID_GENERATED = "STRUCTURED_WITH_VECTOR_DB_ID_GENERATED"
+

  @dataclass()
  class SplitterConfig:
@@ -17,6 +15,7 @@ class SplitterConfig:
      separators: list[str] = field(default_factory=lambda: ["\n"])
      split_type: str = "recursive_splitter"
      enableLLMTouchUp: bool = True
+     llm_structured_output_type: LLM_Structured_Output_Type = LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED
      min_rl_chunk_size: int = 5
      max_rl_chunk_size: int = 50

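Note: the renamed LLM_Structured_Output_Type enum now only distinguishes STANDARD from STRUCTURED_WITH_VECTOR_DB_ID_GENERATED, and SplitterConfig gains a matching llm_structured_output_type field that defaults to the structured mode. A minimal sketch of opting into plain chunks (names come from this diff; the import path assumes the installed package layout rather than the test suite's core.* prefix):

from prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type

# Keep the splitter output as plain strings instead of LLM-structured objects
config = SplitterConfig(
    chunk_size=300,
    chunk_overlap=0,
    enableLLMTouchUp=True,
    llm_structured_output_type=LLM_Structured_Output_Type.STANDARD,
)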
@@ -1,11 +1,20 @@
  import os
+ import shutil
+ import subprocess
+ import sys
  import tempfile
+
+ import pypandoc
  from PIL import Image
  import io
  from docx2pdf import convert as docx_to_pdf
  import fitz
-
-
+ from docx2pdf import convert as docx2pdf_convert
+ try:
+     pypandoc.get_pandoc_path()
+ except OSError:
+     print("Pandoc not found — downloading it temporarily...")
+     pypandoc.download_pandoc()

  class DocuToImageConverter:
      """Converts a document (PDF, DOCX, DOC) into a list of PIL images."""
@@ -13,11 +22,53 @@ class DocuToImageConverter:
      def __init__(self):
          pass

-     def _convert_doc_to_pdf(self, doc_path: str) -> str:
-         """Converts a .docx or .doc file to PDF using docx2pdf."""
-         temp_dir = tempfile.mkdtemp()
-         output_pdf = os.path.join(temp_dir, "converted.pdf")
-         docx_to_pdf(doc_path, output_pdf)
+     def _convert_doc_to_pdf(self, input_path: str) -> str:
+         import shutil, tempfile, os, pypandoc
+         from docx import Document
+
+         if not os.path.exists(input_path):
+             raise FileNotFoundError(input_path)
+
+         output_dir = tempfile.mkdtemp()
+         output_pdf = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
+
+         # 1️⃣ Try Pandoc + wkhtmltopdf or pdflatex
+         try:
+             pypandoc.get_pandoc_path()
+
+             def which(cmd):
+                 return shutil.which(cmd) is not None
+
+             pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
+             pypandoc.convert_file(
+                 input_path, "pdf", outputfile=output_pdf,
+                 extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
+             )
+             return output_pdf
+         except Exception as e:
+             print("⚠️ Pandoc PDF conversion failed:", e)
+
+         # 2️⃣ Fallback to pure Python (WeasyPrint)
+         try:
+             from weasyprint import HTML
+             doc = Document(input_path)
+             html = "<html><body>" + "".join(f"<p>{p.text}</p>" for p in doc.paragraphs) + "</body></html>"
+             HTML(string=html).write_pdf(output_pdf)
+             return output_pdf
+         except Exception as e:
+             print("⚠️ Fallback to WeasyPrint failed:", e)
+
+         # 3️⃣ Last resort (plain text with ReportLab)
+         from reportlab.pdfgen import canvas
+         from reportlab.lib.pagesizes import A4
+         doc = Document(input_path)
+         c = canvas.Canvas(output_pdf, pagesize=A4)
+         width, height = A4
+         y = height - 50
+         for p in doc.paragraphs:
+             c.drawString(50, y, p.text[:1000])
+             y -= 15
+         c.save()
          return output_pdf

      def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
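The rewritten _convert_doc_to_pdf tries three engines in order: Pandoc (preferring pdflatex, else wkhtmltopdf), then WeasyPrint rendering the python-docx paragraphs as HTML, then a plain-text ReportLab canvas as a last resort. A rough usage sketch, assuming a local .docx file (the file name is hypothetical; convert_to_images is the converter's public entry point used elsewhere in this release):

from prevectorchunks_core.services.DocuToImageConverter import DocuToImageConverter

converter = DocuToImageConverter()
pdf_path = converter._convert_doc_to_pdf("manual.docx")  # hypothetical input file
pages = converter.convert_to_images(pdf_path)            # PIL images rendered from the PDF
print(f"{len(pages)} page image(s) produced")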
@@ -0,0 +1,167 @@
+ import os
+ import json
+
+ from docx import Document
+ from dotenv import load_dotenv
+ from openai import OpenAI
+ from PIL import Image
+
+ from .DocuToImageConverter import DocuToImageConverter
+ from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
+ from ..config.splitter_config import SplitterConfig
+ from .chunk_documents_crud_vdb import chunk_documents
+ from .chunk_to_all_content_mapper import ChunkMapper
+ from ..utils.file_loader import SplitType
+
+ load_dotenv(override=True)
+
+ # -----------------------------
+ # Abstract Strategy Interface
+ # -----------------------------
+ class BaseDocumentStrategy:
+     """Defines a standard interface for all document processing strategies."""
+
+     def process(self, file_path: str):
+         raise NotImplementedError("process() must be implemented by subclasses")
+
+
+ # -----------------------------
+ # PDF Strategy
+ # -----------------------------
+ class PDFStrategy(BaseDocumentStrategy):
+     def process(self, file_path: str):
+         print(f"📄 Using PDFStrategy for {file_path}")
+         converter = DocuToImageConverter()
+         # Example: detect multi-column layout or extract embedded text first
+         # import fitz
+         # text_ratio = 0
+         # with fitz.open(file_path) as doc:
+         #     for page in doc:
+         #         text = page.get_text("text")
+         #         text_ratio += len(text) / (page.rect.width * page.rect.height)
+         # if text_ratio > 0.0001:
+         #     print("📚 PDF appears text-based – using hybrid extract + image backup")
+
+         images = converter.convert_to_images(file_path)
+         return images
+
+
+ # -----------------------------
+ # Word Strategy
+ # -----------------------------
+ class WordStrategy(BaseDocumentStrategy):
+     def process(self, file_path: str):
+         print(f"📝 Using WordStrategy for {file_path}")
+
+         # Extract text semantically first
+         try:
+             doc = Document(file_path)
+             paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+             text_content = "\n".join(paragraphs)
+             print(f"🧩 Extracted {len(paragraphs)} paragraphs via python-docx")
+         except Exception as e:
+             print("⚠️ Could not parse docx structurally, falling back to image mode:", e)
+             text_content = ""
+
+         converter = DocuToImageConverter()
+         pdf_path = converter._convert_doc_to_pdf(file_path)
+         images = converter.convert_to_images(pdf_path)
+
+         # Optional: attach text fallback
+         if text_content:
+             images[0].extracted_text = text_content  # for later use by extractor
+
+         return images
+
+
+ # -----------------------------
+ # Image Strategy
+ # -----------------------------
+ class ImageStrategy(BaseDocumentStrategy):
+     def process(self, file_path: str):
+         print(f"🖼️ Using ImageStrategy for {file_path}")
+         image = Image.open(file_path).convert("RGB")
+         return [image]
+
+
+ # -----------------------------
+ # Strategy Factory
+ # -----------------------------
+ class StrategyFactory:
+     """Selects a document strategy based on file extension."""
+
+     strategies = {
+         ".pdf": PDFStrategy(),
+         ".doc": WordStrategy(),
+         ".docx": WordStrategy(),
+         ".jpg": ImageStrategy(),
+         ".jpeg": ImageStrategy(),
+         ".png": ImageStrategy(),
+         ".bmp": ImageStrategy(),
+         ".tiff": ImageStrategy(),
+     }
+
+     @classmethod
+     def get_strategy(cls, file_path: str) -> BaseDocumentStrategy:
+         ext = os.path.splitext(file_path)[1].lower()
+         return cls.strategies.get(ext, None)
+
+
+ # -----------------------------
+ # Main Orchestrator
+ # -----------------------------
+ class MarkdownAndChunkDocuments:
+     def __init__(self):
+         self.api_key = os.getenv("OPENAI_API_KEY")
+         self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
+
+     def markdown_and_chunk_documents(self, file_path: str):
+         # Pick strategy
+         strategy = StrategyFactory.get_strategy(file_path)
+         if not strategy:
+             raise ValueError(f"Unsupported file type: {file_path}")
+
+         # Convert to images using correct strategy
+         images = strategy.process(file_path)
+
+         # Extract Markdown from images
+         markdown_output, text_content = self.extractor.extract_markdown(images, include_image=False)
+         binary_text_content = text_content.encode("utf-8")
+
+         # Chunking and mapping
+         chunk_client = OpenAI(api_key=self.api_key)
+         cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
+         splitter_config = SplitterConfig(
+             chunk_size=300,
+             chunk_overlap=0,
+             separators=["\n"],
+             split_type=SplitType.R_PRETRAINED_PROPOSITION.value,
+             min_rl_chunk_size=5,
+             max_rl_chunk_size=50,
+             enableLLMTouchUp=False,
+         )
+
+         chunked_text = chunk_documents("", file_name="install_ins.txt", file_path=binary_text_content,
+                                        splitter_config=splitter_config)
+
+         flat_chunks = [''.join(inner) for inner in chunked_text]
+         mapped_chunks = cm.map_chunks(flat_chunks)
+
+         # Merge unmapped markdown sections
+         for md_item in markdown_output:
+             if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
+                 md_item["chunked_text"] = md_item["markdown_text"]
+                 mapped_chunks.append(md_item)
+
+         print("✅ Processing complete.")
+         return mapped_chunks
+
+
+ # -----------------------------
+ # CLI Entry
+ # -----------------------------
+ if __name__ == "__main__":
+     file_path = "421307-nz-au-top-loading-washer-guide-shorter.pdf"
+     pipeline = MarkdownAndChunkDocuments()
+     output = pipeline.markdown_and_chunk_documents(file_path)
+     print(json.dumps(output, indent=2))
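StrategyFactory.strategies is a plain class-level dict keyed by file extension, so new document types can be routed without touching the orchestrator, provided process() keeps the same contract of returning a list of PIL images. A hypothetical sketch of that extension point (TiffStackStrategy is not part of this release):

from PIL import Image

class TiffStackStrategy(BaseDocumentStrategy):
    """Hypothetical strategy: expand a multi-frame TIFF into one image per frame."""
    def process(self, file_path: str):
        img = Image.open(file_path)
        frames = []
        for i in range(getattr(img, "n_frames", 1)):  # n_frames exists for multi-frame images
            img.seek(i)
            frames.append(img.convert("RGB").copy())
        return frames  # same contract as the built-in strategies

StrategyFactory.strategies[".tiff"] = TiffStackStrategy()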
@@ -0,0 +1,44 @@
+ import json
+ import pytest
+
+ from core.prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
+ from core.prevectorchunks_core.services import chunk_documents_crud_vdb
+ from core.prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
+ from core.prevectorchunks_core.utils.file_loader import SplitType
+
+
+ # Create a temporary JSON file to test with
+ @pytest.fixture
+ def temp_json_file(tmp_path):
+     file_path = tmp_path / "test.json"
+     content = [{"id": 1, "text": "hello world"}]
+     with open(file_path, "w") as f:
+         json.dump(content, f)
+     return file_path
+
+
+ def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
+     splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
+                                      split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
+                                      max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STANDARD)
+
+     chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",
+
+                                                       splitter_config=splitter_config)
+
+     print(chunks)
+     for i, c in enumerate(chunks):
+         print(f"Chunk {i + 1}: {c}")
+     print(chunks)
+
+ def test_markdown(temp_json_file):
+     markdown_and_chunk_documents = MarkdownAndChunkDocuments()
+     mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
+         "content.docx")
+     print(mapped_chunks)
+     for i, c in enumerate(mapped_chunks):
+         print(f"Chunk {i + 1}: {c}")
+
+     for i, c in enumerate(mapped_chunks):
+         print(f"Chunk {i + 1}: {c}")
+     print(mapped_chunks)
@@ -16,7 +16,7 @@ from .llm_wrapper import LLMClientWrapper # Relative import
  from dotenv import load_dotenv
  import tempfile

- from ..config.splitter_config import SplitterConfig
+ from ..config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
  from ..rlchunker.inference import RLChunker
  from ..services.propositional_index import PropositionalIndexer

@@ -256,15 +256,19 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
      chunks = split_text_by_config(text, splitter_config=splitter_config)
      all_results = []
      if splitter_config.enableLLMTouchUp:
-         for chunk in chunks:
-             structured = process_with_llm(chunk,instructions)
-             # Ensure UUIDs exist
-             for obj in structured:
-                 if "id" not in obj:
-                     obj["id"] = str(uuid.uuid4())
-             all_results.extend(structured)
-
-         return all_results
+         if splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STANDARD:
+             warnings.warn("bypassing LLM touch up for standard structured output")
+             return chunks
+         elif splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED:
+             for chunk in chunks:
+                 structured = process_with_llm(chunk,instructions)
+                 # Ensure UUIDs exist
+                 for obj in structured:
+                     if "id" not in obj:
+                         obj["id"] = str(uuid.uuid4())
+                 all_results.extend(structured)
+
+             return all_results
      else:
          return chunks

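With this change enableLLMTouchUp=True no longer forces an LLM pass: the STANDARD output type short-circuits with a warning and returns the raw chunks, while STRUCTURED_WITH_VECTOR_DB_ID_GENERATED keeps the previous behaviour of LLM post-processing plus UUID back-fill. A rough sketch of the two call shapes (function and field names come from this diff; the return shapes are inferred from the code above):

from prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
from prevectorchunks_core.utils.file_loader import process_large_text

text = "Some long document text to be chunked."
instructions = "Give each chunk a meaningful title."

plain = process_large_text(text, instructions,
                           splitter_config=SplitterConfig(
                               enableLLMTouchUp=True,
                               llm_structured_output_type=LLM_Structured_Output_Type.STANDARD))
# -> the splitter output unchanged (a warning is emitted)

structured = process_large_text(text, instructions,
                                splitter_config=SplitterConfig(
                                    enableLLMTouchUp=True,
                                    llm_structured_output_type=LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED))
# -> LLM-structured objects, each guaranteed an "id" (UUID)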
@@ -1,14 +1,18 @@
  Metadata-Version: 2.4
  Name: prevectorchunks-core
- Version: 0.1.26
+ Version: 0.1.27
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
+ License: MIT License
+ Copyright (c) 2025 Your Name
+
  Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
  Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
+ Requires-Python: <3.12,>=3.7
  Description-Content-Type: text/markdown
  License-File: LICENCE
+ License-File: LICENSE
  Requires-Dist: packaging~=24.1
- Requires-Dist: requests~=2.32.3
  Requires-Dist: openai<3.0.0,>=2.6.0
  Requires-Dist: python-dotenv~=1.0.1
  Requires-Dist: PyJWT~=2.7.0
@@ -27,10 +31,7 @@ Requires-Dist: py-gutenberg~=1.0.3
  Requires-Dist: langchain-text-splitters~=0.3.11
  Requires-Dist: langchain~=0.3
  Requires-Dist: langchain_openai~=0.3.35
- Requires-Dist: transformers>=4.30.0
  Requires-Dist: accelerate>=0.22.0
- Requires-Dist: imageio-ffmpeg>=0.4.8
- Requires-Dist: opencv-python>=4.10.0
  Requires-Dist: pathlib~=1.0.1
  Requires-Dist: transformers~=4.57.0
  Requires-Dist: imageio-ffmpeg~=0.6.0
@@ -42,6 +43,12 @@ Requires-Dist: docx2pdf~=0.1.8
  Requires-Dist: numpy~=2.2.6
  Requires-Dist: scikit-learn~=1.7.2
  Requires-Dist: PyMuPDF~=1.22.5
+ Requires-Dist: pypandoc~=1.13
+ Requires-Dist: reportlab~=4.1.0
+ Requires-Dist: weasyprint~=62.0
+ Requires-Dist: lxml~=4.9.3
+ Requires-Dist: cssselect2~=0.7.0
+ Requires-Dist: cairocffi~=1.4.0
  Dynamic: license-file

  # 📚 PreVectorChunks
@@ -122,7 +129,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
  split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
  max_rl_chunk_size=50,enableLLMTouchUp=False)
  - - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
- **Returns**
+
+ - **Returns**
  - A list of chunked strings including a unique id, a meaningful title and chunked text

  **Use Cases**
@@ -1,4 +1,5 @@
  LICENCE
+ LICENSE
  README.md
  pyproject.toml
  prevectorchunks_core/__init__.py
@@ -1,5 +1,4 @@
  packaging~=24.1
- requests~=2.32.3
  openai<3.0.0,>=2.6.0
  python-dotenv~=1.0.1
  PyJWT~=2.7.0
@@ -18,10 +17,7 @@ py-gutenberg~=1.0.3
  langchain-text-splitters~=0.3.11
  langchain~=0.3
  langchain_openai~=0.3.35
- transformers>=4.30.0
  accelerate>=0.22.0
- imageio-ffmpeg>=0.4.8
- opencv-python>=4.10.0
  pathlib~=1.0.1
  transformers~=4.57.0
  imageio-ffmpeg~=0.6.0
@@ -33,3 +29,9 @@ docx2pdf~=0.1.8
  numpy~=2.2.6
  scikit-learn~=1.7.2
  PyMuPDF~=1.22.5
+ pypandoc~=1.13
+ reportlab~=4.1.0
+ weasyprint~=62.0
+ lxml~=4.9.3
+ cssselect2~=0.7.0
+ cairocffi~=1.4.0
@@ -4,17 +4,17 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "prevectorchunks-core"
- version = "0.1.26"
+ version = "0.1.27"
  description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
  readme = "README.md"
  license = { file = "LICENSE" }
+ requires-python = ">=3.7,<3.12"
  authors = [
      { name = "Zul Al-Kabir", email = "zul.developer.2023@gmail.com" }
  ]

  dependencies = [
      "packaging~=24.1",
-     "requests~=2.32.3",
      "openai>=2.6.0,<3.0.0",
      "python-dotenv~=1.0.1",
      "PyJWT~=2.7.0",
@@ -30,15 +30,10 @@ dependencies = [
      "torchaudio~=2.6.0",
      "sentence-transformers~=5.1.1",
      "py-gutenberg~=1.0.3",
-     "langchain-text-splitters~=0.3.11", # <-- add this
+     "langchain-text-splitters~=0.3.11",
      "langchain~=0.3",
      "langchain_openai~=0.3.35",
-     # … your existing dependencies …
-     "transformers>=4.30.0",
      "accelerate>=0.22.0",
-     "imageio-ffmpeg>=0.4.8",
-     "opencv-python>=4.10.0", # for frame extraction
-     # or whichever version of LLaVA you use
      "pathlib~=1.0.1",
      "transformers~=4.57.0",
      "imageio-ffmpeg~=0.6.0",
@@ -49,17 +44,24 @@ dependencies = [
      "docx2pdf~=0.1.8",
      "numpy~=2.2.6",
      "scikit-learn~=1.7.2",
-     "PyMuPDF~=1.22.5"
+     "PyMuPDF~=1.22.5",
+     "pypandoc~=1.13",
+     "reportlab~=4.1.0",
+     "weasyprint~=62.0",
+     "lxml~=4.9.3",
+     "cssselect2~=0.7.0",
+     "cairocffi~=1.4.0"
  ]

  [tool.setuptools.packages.find]
  include = ["prevectorchunks_core*"]
+
  [tool.setuptools.package-data]
  "prevectorchunks_core.rlchunker.pretrained" = ["*.pt", "*.txt"]

  [tool.setuptools]
  include-package-data = true
+
  [project.urls]
  Homepage = "https://github.com/zuldeveloper2023/PreVectorChunks"
  Source = "https://github.com/zuldeveloper2023/PreVectorChunks"
-
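Packaging summary for this hunk set: alongside the version bump to 0.1.27, the project now declares requires-python >=3.7,<3.12, drops the requests, opencv-python and older transformers/imageio-ffmpeg pins, and pulls in the document-to-PDF toolchain (pypandoc, reportlab, weasyprint, lxml, cssselect2, cairocffi) that backs the new conversion fallbacks in DocuToImageConverter.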
@@ -1,71 +0,0 @@
- import json
- import os
- import tempfile
- import base64
-
- from openai import OpenAI
- from PIL import Image
- from .DocuToImageConverter import DocuToImageConverter
-
- from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
- from ..config.splitter_config import SplitterConfig
-
- from dotenv import load_dotenv
-
- from .chunk_documents_crud_vdb import chunk_documents
- from .chunk_to_all_content_mapper import ChunkMapper
- from ..utils.file_loader import SplitType
-
- load_dotenv(override=True)
-
-
- class MarkdownAndChunkDocuments:
-
-
-     def markdown_and_chunk_documents(self,file_path:str):
-         # Create instances of the converter and extractor
-         converter = DocuToImageConverter()
-         extractor = DocuToMarkdownExtractor(api_key=os.getenv("OPENAI_API_KEY"))
-
-
-         images = converter.convert_to_images(file_path)
-
-         # convert
-         # Step 2: Extract Markdown from images
-         markdown_output, text_content = extractor.extract_markdown(images, include_image=False)
-         # convert text content to binary
-         binary_text_content = text_content.encode('utf-8')  # bytes representation
-
-         chunk_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-         cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
-         splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
-                                          split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
-                                          max_rl_chunk_size=50, enableLLMTouchUp=False)
-
-         chunked_text = chunk_documents("", file_name="install_ins.txt", file_path=binary_text_content,
-                                        splitter_config=splitter_config)
-
-         flat_chunks = result = [''.join(inner) for inner in chunked_text]
-         mapped_chunks = cm.map_chunks(flat_chunks)
-         for md_item in markdown_output:
-             # Check if this markdown_output item is already present in mapped_chunks
-             match_found = False
-             for mapped in mapped_chunks:
-                 if mapped.get("markdown_text") == md_item.get("markdown_text"):
-                     match_found = True
-                     break
-
-             # If not found, append the missing markdown_output item
-             if not match_found:
-                 md_item["chunked_text"] = md_item["markdown_text"]
-                 mapped_chunks.append(md_item)
-         #print(mapped_chunks)
-
-         #print("✅ Markdown extraction complete! See output.md")
-         return mapped_chunks
-
-
- if __name__ == "__main__":
-     markdown_and_chunk_documents = MarkdownAndChunkDocuments()
-     mapped_chunks=markdown_and_chunk_documents.markdown_and_chunk_documents("421307-nz-au-top-loading-washer-guide-shorter.pdf")
-     print(mapped_chunks)
@@ -1,26 +0,0 @@
- import json
- import pytest
-
- from core.prevectorchunks_core.config.splitter_config import SplitterConfig
- from core.prevectorchunks_core.services import chunk_documents_crud_vdb
- from core.prevectorchunks_core.utils.file_loader import SplitType
-
-
- # Create a temporary JSON file to test with
- @pytest.fixture
- def temp_json_file(tmp_path):
-     file_path = tmp_path / "test.json"
-     content = [{"id": 1, "text": "hello world"}]
-     with open(file_path, "w") as f:
-         json.dump(content, f)
-     return file_path
-
-
- def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
-     splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
-                                      split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
-                                      max_rl_chunk_size=50,enableLLMTouchUp=True)
-
-     chunks=chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",splitter_config=splitter_config)
-
-     print(chunks)