prevectorchunks-core 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of prevectorchunks-core might be problematic.
- prevectorchunks_core/config/splitter_config.py +8 -0
- prevectorchunks_core/services/DocuToImageConverter.py +58 -7
- prevectorchunks_core/services/markdown_and_chunk_documents.py +132 -36
- prevectorchunks_core/test_loader.py +21 -3
- prevectorchunks_core/utils/file_loader.py +14 -10
- {prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/METADATA +16 -7
- {prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/RECORD +11 -10
- prevectorchunks_core-0.1.27.dist-info/licenses/LICENSE +2 -0
- {prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/WHEEL +0 -0
- {prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/licenses/LICENCE +0 -0
- {prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/top_level.txt +0 -0
prevectorchunks_core/config/splitter_config.py

@@ -1,5 +1,12 @@
 # prevectorchunks_core/config.py
 from dataclasses import dataclass, field
+from enum import Enum
+
+
+class LLM_Structured_Output_Type(Enum):
+    STANDARD = "STANDARD"
+    STRUCTURED_WITH_VECTOR_DB_ID_GENERATED = "STRUCTURED_WITH_VECTOR_DB_ID_GENERATED"
+

 @dataclass()
 class SplitterConfig:

@@ -8,6 +15,7 @@ class SplitterConfig:
     separators: list[str] = field(default_factory=lambda: ["\n"])
     split_type: str = "recursive_splitter"
     enableLLMTouchUp: bool = True
+    llm_structured_output_type: LLM_Structured_Output_Type = LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED
     min_rl_chunk_size: int = 5
     max_rl_chunk_size: int = 50

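For orientation, a minimal caller-side sketch (not code from the package) of how the new field is set; it assumes the installed import path prevectorchunks_core.config.splitter_config and the defaults visible in the hunk above:

    # Hypothetical caller; relies only on the two names this hunk introduces.
    from prevectorchunks_core.config.splitter_config import (
        SplitterConfig,
        LLM_Structured_Output_Type,
    )

    # Default behaviour: structured output with vector-DB IDs generated per chunk.
    default_config = SplitterConfig()
    assert (default_config.llm_structured_output_type
            is LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED)

    # Opting out: STANDARD makes the LLM touch-up step return raw chunks
    # (see the file_loader.py hunk later in this diff).
    plain_config = SplitterConfig(
        llm_structured_output_type=LLM_Structured_Output_Type.STANDARD,
    )
    print(plain_config.llm_structured_output_type.value)  # "STANDARD"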
prevectorchunks_core/services/DocuToImageConverter.py

@@ -1,11 +1,20 @@
 import os
+import shutil
+import subprocess
+import sys
 import tempfile
+
+import pypandoc
 from PIL import Image
 import io
 from docx2pdf import convert as docx_to_pdf
 import fitz
-
-
+from docx2pdf import convert as docx2pdf_convert
+try:
+    pypandoc.get_pandoc_path()
+except OSError:
+    print("Pandoc not found — downloading it temporarily...")
+    pypandoc.download_pandoc()

 class DocuToImageConverter:
     """Converts a document (PDF, DOCX, DOC) into a list of PIL images."""

@@ -13,11 +22,53 @@ class DocuToImageConverter:
     def __init__(self):
         pass

-    def _convert_doc_to_pdf(self,
-
-
-
-
+    def _convert_doc_to_pdf(self, input_path: str) -> str:
+        import shutil, tempfile, os, pypandoc
+        from docx import Document
+
+        if not os.path.exists(input_path):
+            raise FileNotFoundError(input_path)
+
+        output_dir = tempfile.mkdtemp()
+        output_pdf = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
+
+        # 1️⃣ Try Pandoc + wkhtmltopdf or pdflatex
+        try:
+            pypandoc.get_pandoc_path()
+
+            def which(cmd):
+                return shutil.which(cmd) is not None
+
+            pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
+            pypandoc.convert_file(
+                input_path, "pdf", outputfile=output_pdf,
+                extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
+            )
+            return output_pdf
+        except Exception as e:
+            print("⚠️ Pandoc PDF conversion failed:", e)
+
+        # 2️⃣ Fallback to pure Python (WeasyPrint)
+        try:
+            from weasyprint import HTML
+            doc = Document(input_path)
+            html = "<html><body>" + "".join(f"<p>{p.text}</p>" for p in doc.paragraphs) + "</body></html>"
+            HTML(string=html).write_pdf(output_pdf)
+            return output_pdf
+        except Exception as e:
+            print("⚠️ Fallback to WeasyPrint failed:", e)
+
+        # 3️⃣ Last resort (plain text with ReportLab)
+        from reportlab.pdfgen import canvas
+        from reportlab.lib.pagesizes import A4
+        doc = Document(input_path)
+        c = canvas.Canvas(output_pdf, pagesize=A4)
+        width, height = A4
+        y = height - 50
+        for p in doc.paragraphs:
+            c.drawString(50, y, p.text[:1000])
+            y -= 15
+        c.save()
         return output_pdf

     def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):

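The new _convert_doc_to_pdf is a three-tier ladder: Pandoc with whichever engine is on PATH, then WeasyPrint, then a bare ReportLab text dump that never paginates, so the last tier is a legibility fallback rather than a faithful rendering. A generic, self-contained sketch of that try-in-order idiom, with placeholder callables standing in for the real engines:

    # Sketch of the fallback-ladder idiom; the converter callables are
    # placeholders, not the package's API.
    def convert_with_fallbacks(input_path, output_pdf, converters):
        """converters: ordered (name, fn) pairs; each fn(input_path, output_pdf)."""
        for name, convert in converters[:-1]:
            try:
                convert(input_path, output_pdf)
                return output_pdf
            except Exception as exc:
                print(f"{name} conversion failed, trying next engine: {exc}")
        # The final entry is the unconditional last resort (ReportLab in the diff).
        converters[-1][1](input_path, output_pdf)
        return output_pdf


    def fail_engine(src, dst):  # stands in for a missing Pandoc/LaTeX toolchain
        raise OSError("engine not installed")


    def plain_engine(src, dst):  # stands in for WeasyPrint or ReportLab
        print(f"wrote {dst} from {src}")


    convert_with_fallbacks("guide.docx", "guide.pdf",
                           [("pandoc", fail_engine), ("weasyprint", plain_engine)])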
prevectorchunks_core/services/markdown_and_chunk_documents.py

@@ -1,71 +1,167 @@
-import json
 import os
-import
-import base64
+import json

+from docx import Document
+from dotenv import load_dotenv
 from openai import OpenAI
 from PIL import Image
-from .DocuToImageConverter import DocuToImageConverter

+from .DocuToImageConverter import DocuToImageConverter
 from .DocuToMarkdownExtractor import DocuToMarkdownExtractor
 from ..config.splitter_config import SplitterConfig
-
-from dotenv import load_dotenv
-
 from .chunk_documents_crud_vdb import chunk_documents
 from .chunk_to_all_content_mapper import ChunkMapper
 from ..utils.file_loader import SplitType

 load_dotenv(override=True)

+# -----------------------------
+# Abstract Strategy Interface
+# -----------------------------
+class BaseDocumentStrategy:
+    """Defines a standard interface for all document processing strategies."""

-
+    def process(self, file_path: str):
+        raise NotImplementedError("process() must be implemented by subclasses")


-
-
+# -----------------------------
+# PDF Strategy
+# -----------------------------
+class PDFStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"📄 Using PDFStrategy for {file_path}")
         converter = DocuToImageConverter()
-
-
+        # Example: detect multi-column layout or extract embedded text first
+        # import fitz
+        # text_ratio = 0
+        # with fitz.open(file_path) as doc:
+        #     for page in doc:
+        #         text = page.get_text("text")
+        #         text_ratio += len(text) / (page.rect.width * page.rect.height)
+        # if text_ratio > 0.0001:
+        #     print("📚 PDF appears text-based – using hybrid extract + image backup")

         images = converter.convert_to_images(file_path)
+        return images
+
+
+# -----------------------------
+# Word Strategy
+# -----------------------------
+class WordStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"📝 Using WordStrategy for {file_path}")
+
+        # Extract text semantically first
+        try:
+            doc = Document(file_path)
+            paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
+            text_content = "\n".join(paragraphs)
+            print(f"🧩 Extracted {len(paragraphs)} paragraphs via python-docx")
+        except Exception as e:
+            print("⚠️ Could not parse docx structurally, falling back to image mode:", e)
+            text_content = ""
+
+        converter = DocuToImageConverter()
+        pdf_path = converter._convert_doc_to_pdf(file_path)
+        images = converter.convert_to_images(pdf_path)
+
+        # Optional: attach text fallback
+        if text_content:
+            images[0].extracted_text = text_content  # for later use by extractor
+
+        return images
+
+
+# -----------------------------
+# Image Strategy
+# -----------------------------
+class ImageStrategy(BaseDocumentStrategy):
+    def process(self, file_path: str):
+        print(f"🖼️ Using ImageStrategy for {file_path}")
+        image = Image.open(file_path).convert("RGB")
+        return [image]
+
+
+# -----------------------------
+# Strategy Factory
+# -----------------------------
+class StrategyFactory:
+    """Selects a document strategy based on file extension."""
+
+    strategies = {
+        ".pdf": PDFStrategy(),
+        ".doc": WordStrategy(),
+        ".docx": WordStrategy(),
+        ".jpg": ImageStrategy(),
+        ".jpeg": ImageStrategy(),
+        ".png": ImageStrategy(),
+        ".bmp": ImageStrategy(),
+        ".tiff": ImageStrategy(),
+    }
+
+    @classmethod
+    def get_strategy(cls, file_path: str) -> BaseDocumentStrategy:
+        ext = os.path.splitext(file_path)[1].lower()
+        return cls.strategies.get(ext, None)
+
+
+# -----------------------------
+# Main Orchestrator
+# -----------------------------
+class MarkdownAndChunkDocuments:
+    def __init__(self):
+        self.api_key = os.getenv("OPENAI_API_KEY")
+        self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
+
+    def markdown_and_chunk_documents(self, file_path: str):
+        # Pick strategy
+        strategy = StrategyFactory.get_strategy(file_path)
+        if not strategy:
+            raise ValueError(f"Unsupported file type: {file_path}")

-        #
-
-        markdown_output, text_content = extractor.extract_markdown(images, include_image=False)
-        # convert text content to binary
-        binary_text_content = text_content.encode('utf-8')  # bytes representation
+        # Convert to images using correct strategy
+        images = strategy.process(file_path)

-
+        # Extract Markdown from images
+        markdown_output, text_content = self.extractor.extract_markdown(images, include_image=False)
+        binary_text_content = text_content.encode("utf-8")
+
+        # Chunking and mapping
+        chunk_client = OpenAI(api_key=self.api_key)
         cm = ChunkMapper(chunk_client, markdown_output, embedding_model="text-embedding-3-small")
-        splitter_config = SplitterConfig(
-
-
+        splitter_config = SplitterConfig(
+            chunk_size=300,
+            chunk_overlap=0,
+            separators=["\n"],
+            split_type=SplitType.R_PRETRAINED_PROPOSITION.value,
+            min_rl_chunk_size=5,
+            max_rl_chunk_size=50,
+            enableLLMTouchUp=False,
+        )

         chunked_text = chunk_documents("", file_name="install_ins.txt", file_path=binary_text_content,
                                        splitter_config=splitter_config)

-        flat_chunks =
+        flat_chunks = [''.join(inner) for inner in chunked_text]
         mapped_chunks = cm.map_chunks(flat_chunks)
+
+        # Merge unmapped markdown sections
         for md_item in markdown_output:
-
-            match_found = False
-            for mapped in mapped_chunks:
-                if mapped.get("markdown_text") == md_item.get("markdown_text"):
-                    match_found = True
-                    break
-
-            # If not found, append the missing markdown_output item
-            if not match_found:
+            if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
                 md_item["chunked_text"] = md_item["markdown_text"]
                 mapped_chunks.append(md_item)
-        #print(mapped_chunks)

-
+        print("✅ Processing complete.")
         return mapped_chunks


+# -----------------------------
+# CLI Entry
+# -----------------------------
 if __name__ == "__main__":
-
-
-
+    file_path = "421307-nz-au-top-loading-washer-guide-shorter.pdf"
+    pipeline = MarkdownAndChunkDocuments()
+    output = pipeline.markdown_and_chunk_documents(file_path)
+    print(json.dumps(output, indent=2))

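The rewrite turns a linear script into a small strategy pattern: one process() interface, per-extension implementations, and a dict-backed factory. A dependency-free sketch of just the dispatch mechanics (DummyStrategy is a hypothetical stand-in, not one of the package's classes):

    # Reduced sketch of extension-based dispatch; DummyStrategy is hypothetical.
    import os


    class DummyStrategy:
        def __init__(self, label):
            self.label = label

        def process(self, file_path):
            # The real strategies return lists of PIL images; a string suffices here.
            return f"{self.label} would handle {file_path}"


    STRATEGIES = {
        ".pdf": DummyStrategy("PDFStrategy"),
        ".doc": DummyStrategy("WordStrategy"),
        ".docx": DummyStrategy("WordStrategy"),
        ".png": DummyStrategy("ImageStrategy"),
    }


    def get_strategy(file_path):
        ext = os.path.splitext(file_path)[1].lower()
        return STRATEGIES.get(ext)


    strategy = get_strategy("manual.docx")
    if not strategy:
        raise ValueError("Unsupported file type: manual.docx")
    print(strategy.process("manual.docx"))  # -> WordStrategy would handle manual.docx

One design note: the factory registers shared instances rather than classes, which is safe here because every process() implementation is stateless.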
prevectorchunks_core/test_loader.py

@@ -1,8 +1,9 @@
 import json
 import pytest

-from core.prevectorchunks_core.config.splitter_config import SplitterConfig
+from core.prevectorchunks_core.config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
 from core.prevectorchunks_core.services import chunk_documents_crud_vdb
+from core.prevectorchunks_core.services.markdown_and_chunk_documents import MarkdownAndChunkDocuments
 from core.prevectorchunks_core.utils.file_loader import SplitType


@@ -19,8 +20,25 @@ def temp_json_file(tmp_path):
 def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
     splitter_config = SplitterConfig(chunk_size=300, chunk_overlap=0, separators=["\n"],
                                      split_type=SplitType.RECURSIVE.value, min_rl_chunk_size=5,
-                                     max_rl_chunk_size=50,enableLLMTouchUp=True)
+                                     max_rl_chunk_size=50, enableLLMTouchUp=True,llm_structured_output_type=LLM_Structured_Output_Type.STANDARD)

-    chunks=chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",
+    chunks = chunk_documents_crud_vdb.chunk_documents("extract", file_name=None, file_path="content.txt",

+                                                      splitter_config=splitter_config)
+
+    print(chunks)
+    for i, c in enumerate(chunks):
+        print(f"Chunk {i + 1}: {c}")
     print(chunks)
+
+def test_markdown(temp_json_file):
+    markdown_and_chunk_documents = MarkdownAndChunkDocuments()
+    mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
+        "content.docx")
+    print(mapped_chunks)
+    for i, c in enumerate(mapped_chunks):
+        print(f"Chunk {i + 1}: {c}")
+
+    for i, c in enumerate(mapped_chunks):
+        print(f"Chunk {i + 1}: {c}")
+    print(mapped_chunks)

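Stripped of the duplicated print loops, the essential usage the new test_markdown exercises is just this; a sketch assuming an OPENAI_API_KEY in the environment and a content.docx fixture on disk, using the same repo-layout import path the test uses:

    # End-to-end: docx -> images -> markdown -> chunks mapped back to sections.
    from core.prevectorchunks_core.services.markdown_and_chunk_documents import (
        MarkdownAndChunkDocuments,
    )

    pipeline = MarkdownAndChunkDocuments()
    mapped_chunks = pipeline.markdown_and_chunk_documents("content.docx")
    for i, chunk in enumerate(mapped_chunks, start=1):
        print(f"Chunk {i}: {chunk}")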
prevectorchunks_core/utils/file_loader.py

@@ -16,7 +16,7 @@ from .llm_wrapper import LLMClientWrapper  # Relative import
 from dotenv import load_dotenv
 import tempfile

-from ..config.splitter_config import SplitterConfig
+from ..config.splitter_config import SplitterConfig, LLM_Structured_Output_Type
 from ..rlchunker.inference import RLChunker
 from ..services.propositional_index import PropositionalIndexer


@@ -256,15 +256,19 @@ def process_large_text(text, instructions,splitter_config:SplitterConfig=None):
     chunks = split_text_by_config(text, splitter_config=splitter_config)
     all_results = []
     if splitter_config.enableLLMTouchUp:
-
-        structured
-
-
-
-
-
-
+        if splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STANDARD:
+            warnings.warn("bypassing LLM touch up for standard structured output")
+            return chunks
+        elif splitter_config.llm_structured_output_type == LLM_Structured_Output_Type.STRUCTURED_WITH_VECTOR_DB_ID_GENERATED:
+            for chunk in chunks:
+                structured = process_with_llm(chunk,instructions)
+                # Ensure UUIDs exist
+                for obj in structured:
+                    if "id" not in obj:
+                        obj["id"] = str(uuid.uuid4())
+                all_results.extend(structured)
+
+            return all_results
     else:
         return chunks

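The behavioural consequence of this hunk: enableLLMTouchUp=True no longer guarantees LLM post-processing. With STANDARD the function warns and returns the raw chunks; only STRUCTURED_WITH_VECTOR_DB_ID_GENERATED calls the LLM and back-fills IDs. A minimal sketch of that ID back-fill step in isolation (the dict shape is illustrative):

    # Give every structured chunk an "id" so it can be upserted into a vector DB.
    import uuid


    def ensure_ids(structured_chunks):
        for obj in structured_chunks:
            if "id" not in obj:
                obj["id"] = str(uuid.uuid4())
        return structured_chunks


    chunks = [{"title": "Install", "chunked_text": "..."},
              {"id": "existing-id", "title": "Safety", "chunked_text": "..."}]
    ensure_ids(chunks)
    print([c["id"] for c in chunks])  # existing id kept, missing one generated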
{prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/METADATA

@@ -1,13 +1,18 @@
 Metadata-Version: 2.4
 Name: prevectorchunks-core
-Version: 0.1.25
+Version: 0.1.27
 Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
 Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
-
+License: MIT License
+        Copyright (c) 2025 Your Name
+
+Project-URL: Homepage, https://github.com/zuldeveloper2023/PreVectorChunks
+Project-URL: Source, https://github.com/zuldeveloper2023/PreVectorChunks
+Requires-Python: <3.12,>=3.7
 Description-Content-Type: text/markdown
 License-File: LICENCE
+License-File: LICENSE
 Requires-Dist: packaging~=24.1
-Requires-Dist: requests~=2.32.3
 Requires-Dist: openai<3.0.0,>=2.6.0
 Requires-Dist: python-dotenv~=1.0.1
 Requires-Dist: PyJWT~=2.7.0

@@ -26,10 +31,7 @@ Requires-Dist: py-gutenberg~=1.0.3
 Requires-Dist: langchain-text-splitters~=0.3.11
 Requires-Dist: langchain~=0.3
 Requires-Dist: langchain_openai~=0.3.35
-Requires-Dist: transformers>=4.30.0
 Requires-Dist: accelerate>=0.22.0
-Requires-Dist: imageio-ffmpeg>=0.4.8
-Requires-Dist: opencv-python>=4.10.0
 Requires-Dist: pathlib~=1.0.1
 Requires-Dist: transformers~=4.57.0
 Requires-Dist: imageio-ffmpeg~=0.6.0

@@ -41,6 +43,12 @@ Requires-Dist: docx2pdf~=0.1.8
 Requires-Dist: numpy~=2.2.6
 Requires-Dist: scikit-learn~=1.7.2
 Requires-Dist: PyMuPDF~=1.22.5
+Requires-Dist: pypandoc~=1.13
+Requires-Dist: reportlab~=4.1.0
+Requires-Dist: weasyprint~=62.0
+Requires-Dist: lxml~=4.9.3
+Requires-Dist: cssselect2~=0.7.0
+Requires-Dist: cairocffi~=1.4.0
 Dynamic: license-file

 # 📚 PreVectorChunks

@@ -121,7 +129,8 @@ Splits the content of a document into smaller, manageable chunks. - Five types o
   split_type=SplitType.R_PRETRAINED_PROPOSITION.value, min_rl_chunk_size=5,
   max_rl_chunk_size=50,enableLLMTouchUp=False)
 - - (min_rl_chunk_size and max_rl_chunk_size refers to size in sentences (i.e. 100 sentences) when R_PRETRAINED_PROPOSITION is used)
-
+
+- **Returns**
 - A list of chunked strings including a unique id, a meaningful title and chunked text

 **Use Cases**

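A quick way to confirm an environment actually picked up this release and its new PDF-conversion dependencies; a sketch using only the standard library (importlib.metadata, Python 3.8+) and the distribution name declared above:

    # Verify the installed wheel matches the metadata shown in this diff.
    from importlib.metadata import requires, version

    assert version("prevectorchunks-core") == "0.1.27"
    deps = requires("prevectorchunks-core") or []
    print([d for d in deps
           if d.startswith(("pypandoc", "reportlab", "weasyprint"))])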
{prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/RECORD

@@ -1,7 +1,7 @@
 prevectorchunks_core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-prevectorchunks_core/test_loader.py,sha256=
+prevectorchunks_core/test_loader.py,sha256=bAniYz7PunILn0GKFufEWUKXWivWqf_VjdG6zm3pXfI,1790
 prevectorchunks_core/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-prevectorchunks_core/config/splitter_config.py,sha256=
+prevectorchunks_core/config/splitter_config.py,sha256=BzVmNlDjAIyGNKXcj4yK7wVpZyvM1yX2f0MpdHiOKsM,697
 prevectorchunks_core/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 prevectorchunks_core/os-llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 prevectorchunks_core/os-llm/llava.py,sha256=GXdVoT6FJ3AKl4c5wY5CWweIs7w82jOevhPfiLhQPZY,526

@@ -16,24 +16,25 @@ prevectorchunks_core/rlchunker/utils.py,sha256=E8agBIu_zNwz5PwshBDYfb733nYac9qvs
 prevectorchunks_core/rlchunker/pretrained/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 prevectorchunks_core/rlchunker/pretrained/model_info.txt,sha256=jsEwZaHLCQy15E2lrU5DfqsaxU6w4vHsVlYt511gQ00,74
 prevectorchunks_core/rlchunker/pretrained/policy_model.pt,sha256=4oO3JmEM-sTO4pbsDcs-t5BycSIIHub13HwbP0ETjtc,200824
-prevectorchunks_core/services/DocuToImageConverter.py,sha256=
+prevectorchunks_core/services/DocuToImageConverter.py,sha256=4RHQF-ufZQ4lNDW5ELQL-FDxDE76AcKnTrsT-eVqQMQ,4578
 prevectorchunks_core/services/DocuToMarkdownExtractor.py,sha256=ZzJy0bDL-9Ycog3ejCE8525KF0Z0SOONJEgiwpaNdk8,3352
 prevectorchunks_core/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 prevectorchunks_core/services/audio_processor.py,sha256=XKNYhXHIt_77a3PT2wwKvnCSmXSI-BIbzZmoXVI_7Ww,5017
 prevectorchunks_core/services/chunk_documents_crud_vdb.py,sha256=Md4vy7vJDnSYpvZiF0HbHCOA0StSVm62ALHAPYU2A7I,16279
 prevectorchunks_core/services/chunk_to_all_content_mapper.py,sha256=xEz2idxJTsJwyCJWMPZCk3CFcalKhbSuucFH9TPouU0,2778
 prevectorchunks_core/services/image_processor.py,sha256=2CRwTbI-czbakm9aG-kMdx908bc5H1rQETQiVCKbWd8,3518
-prevectorchunks_core/services/markdown_and_chunk_documents.py,sha256=
+prevectorchunks_core/services/markdown_and_chunk_documents.py,sha256=i1C4zMH45GXGSIbgn9GKc0QHQjxhX_oQTipaCDwvrss,5982
 prevectorchunks_core/services/propositional_index.py,sha256=cVH3obhLtlcfJYA6VN4KfC3len4fe5nNcboorlouOb0,4151
 prevectorchunks_core/services/video_analyser.py,sha256=1wI38xZ8vdE8T4EBAnxWzt7Hc8vTYrdQhbA4Y5VZLeY,6651
 prevectorchunks_core/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 prevectorchunks_core/tests/test_local.py,sha256=i43OenjEeDgdpMkL6RdJUDixtNv1xS22kYR7wXLU49U,2088
 prevectorchunks_core/utils/__init__.py,sha256=aez3v2dwGHXvmALXVBPR-mQgvxMqxv9NsENsNcr6Cg4,106
 prevectorchunks_core/utils/extract_content.py,sha256=fMDT-BsjYutHLnOFebLhMFpb1UFAB8ldGldxh11FsXw,2920
-prevectorchunks_core/utils/file_loader.py,sha256=
+prevectorchunks_core/utils/file_loader.py,sha256=hNBk6ZWtZxcrf6uZxdI_OkG0Ff9L0JWTLuR9Bff0zYo,11082
 prevectorchunks_core/utils/llm_wrapper.py,sha256=7GfyM5p5PeIehi4Dj5jgC7-xi2SjZuyyPuLkWtucQzQ,1139
-prevectorchunks_core-0.1.
-prevectorchunks_core-0.1.
-prevectorchunks_core-0.1.
-prevectorchunks_core-0.1.
-prevectorchunks_core-0.1.
+prevectorchunks_core-0.1.27.dist-info/licenses/LICENCE,sha256=Ljp4XVKnncsQ59h0eMW6J5V-ylsVeqDRC8smR7UPIDs,512
+prevectorchunks_core-0.1.27.dist-info/licenses/LICENSE,sha256=dYaNLS6Xsc9xANAXqDI8Jn0cz8jch1VTh-m7pVcNGWo,43
+prevectorchunks_core-0.1.27.dist-info/METADATA,sha256=LgAXC8AB6zib4C-LTLIQk7vTl9o_rJohoJiA4Kjbklw,10948
+prevectorchunks_core-0.1.27.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+prevectorchunks_core-0.1.27.dist-info/top_level.txt,sha256=OWJgfrUDNTh49PpKvRXHY8lVeWqzFbTr9OkDoAvpvPk,21
+prevectorchunks_core-0.1.27.dist-info/RECORD,,

{prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/WHEEL
RENAMED
File without changes

{prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/licenses/LICENCE
RENAMED
File without changes

{prevectorchunks_core-0.1.25.dist-info → prevectorchunks_core-0.1.27.dist-info}/top_level.txt
RENAMED
File without changes