prevectorchunks-core 0.1.32__tar.gz → 0.1.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {prevectorchunks_core-0.1.32/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.34}/PKG-INFO +2 -3
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/DocuToImageConverter.py +57 -15
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/markdown_and_chunk_documents.py +66 -19
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34/prevectorchunks_core.egg-info}/PKG-INFO +2 -3
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/SOURCES.txt +34 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/requires.txt +1 -2
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/pyproject.toml +6 -3
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/LICENCE +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/LICENSE +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/README.md +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/config/splitter_config.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/os-llm/llava.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/image_processor.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/test_loader.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/file_loader.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/setup.cfg +0 -0
{prevectorchunks_core-0.1.32/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.34}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.32
|
|
3
|
+
Version: 0.1.34
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -49,8 +49,7 @@ Requires-Dist: weasyprint~=62.0
|
|
|
49
49
|
Requires-Dist: lxml~=4.9.3
|
|
50
50
|
Requires-Dist: cssselect2~=0.7.0
|
|
51
51
|
Requires-Dist: cairocffi~=1.4.0
|
|
52
|
-
Requires-Dist: tensorflow
|
|
53
|
-
Requires-Dist: codecarbon>=2.3.0
|
|
52
|
+
Requires-Dist: tensorflow~=2.12.0
|
|
54
53
|
Dynamic: license-file
|
|
55
54
|
|
|
56
55
|
# 📚 PreVectorChunks
|
|
@@ -28,18 +28,54 @@ class DocuToImageConverter:
|
|
|
28
28
|
"""Write bytes to a temporary file and return path."""
|
|
29
29
|
tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
|
|
30
30
|
with os.fdopen(tmp_fd, "wb") as f:
|
|
31
|
-
f.write(input_bytes)
|
|
31
|
+
f.write(input_bytes.read())
|
|
32
32
|
return tmp_path
|
|
33
33
|
|
|
34
|
-
def _convert_doc_to_pdf(self,
|
|
35
|
-
"""
|
|
34
|
+
def _convert_doc_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
|
|
35
|
+
"""
|
|
36
|
+
Convert DOC/DOCX to PDF. Supports:
|
|
37
|
+
- file_path (string)
|
|
38
|
+
- input_bytes (bytes, InMemoryUploadedFile, or file-like)
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
# ✅ If bytes are provided, write them to a temporary .docx
|
|
42
|
+
if input_bytes is not None:
|
|
43
|
+
# Get filename or fallback
|
|
44
|
+
original_name = getattr(input_bytes, "name", "uploaded.docx")
|
|
45
|
+
ext = os.path.splitext(original_name)[1] or ".docx"
|
|
46
|
+
|
|
47
|
+
# Create a temporary file path
|
|
48
|
+
temp_input_path = tempfile.mktemp(suffix=ext)
|
|
49
|
+
|
|
50
|
+
# Read bytes safely
|
|
51
|
+
if hasattr(input_bytes, "read"): # Django UploadedFile
|
|
52
|
+
input_bytes.seek(0)
|
|
53
|
+
content = input_bytes.read()
|
|
54
|
+
else: # already bytes
|
|
55
|
+
content = input_bytes
|
|
56
|
+
|
|
57
|
+
# Write bytes to temp file
|
|
58
|
+
with open(temp_input_path, "wb") as f:
|
|
59
|
+
f.write(content)
|
|
60
|
+
|
|
61
|
+
input_path = temp_input_path
|
|
62
|
+
|
|
63
|
+
# ✅ If file_path is provided, use it directly
|
|
64
|
+
elif file_path:
|
|
65
|
+
input_path = file_path
|
|
66
|
+
|
|
67
|
+
else:
|
|
68
|
+
raise ValueError("Must supply either file_path or input_bytes")
|
|
69
|
+
|
|
70
|
+
# ✅ Must exist at this point
|
|
36
71
|
if not os.path.exists(input_path):
|
|
37
72
|
raise FileNotFoundError(input_path)
|
|
38
73
|
|
|
74
|
+
# ✅ Prepare output PDF path
|
|
39
75
|
output_dir = tempfile.mkdtemp()
|
|
40
76
|
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
41
77
|
|
|
42
|
-
# 1️⃣ Microsoft Word COM automation (Windows
|
|
78
|
+
# 1️⃣ Try Microsoft Word COM automation (Windows)
|
|
43
79
|
try:
|
|
44
80
|
import win32com.client
|
|
45
81
|
word = win32com.client.Dispatch("Word.Application")
|
|
@@ -52,7 +88,7 @@ class DocuToImageConverter:
|
|
|
52
88
|
except Exception:
|
|
53
89
|
pass
|
|
54
90
|
|
|
55
|
-
# 2️⃣ LibreOffice
|
|
91
|
+
# 2️⃣ Try LibreOffice
|
|
56
92
|
try:
|
|
57
93
|
subprocess.run(
|
|
58
94
|
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
@@ -65,23 +101,28 @@ class DocuToImageConverter:
|
|
|
65
101
|
# 3️⃣ Pandoc fallback
|
|
66
102
|
try:
|
|
67
103
|
pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
|
|
68
|
-
pypandoc.convert_file(
|
|
69
|
-
|
|
104
|
+
pypandoc.convert_file(
|
|
105
|
+
input_path, "pdf",
|
|
106
|
+
outputfile=output_pdf,
|
|
107
|
+
extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
|
|
108
|
+
)
|
|
70
109
|
return output_pdf
|
|
71
110
|
except Exception:
|
|
72
111
|
pass
|
|
73
112
|
|
|
74
|
-
# 4️⃣
|
|
113
|
+
# 4️⃣ Final fallback: Render plain text using ReportLab
|
|
75
114
|
doc = Document(input_path)
|
|
76
115
|
c = canvas.Canvas(output_pdf, pagesize=A4)
|
|
77
116
|
width, height = A4
|
|
78
117
|
y = height - 50
|
|
118
|
+
|
|
79
119
|
for p in doc.paragraphs:
|
|
80
120
|
c.drawString(50, y, p.text[:1000])
|
|
81
121
|
y -= 15
|
|
82
122
|
if y < 50:
|
|
83
123
|
c.showPage()
|
|
84
124
|
y = height - 50
|
|
125
|
+
|
|
85
126
|
c.save()
|
|
86
127
|
return output_pdf
|
|
87
128
|
|
|
@@ -96,7 +137,7 @@ class DocuToImageConverter:
|
|
|
96
137
|
pdf_document.close()
|
|
97
138
|
return images
|
|
98
139
|
|
|
99
|
-
def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG"):
|
|
140
|
+
def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
|
|
100
141
|
"""
|
|
101
142
|
Convert a file path or binary content to PIL images.
|
|
102
143
|
Supports PDF, DOC, DOCX, and image files.
|
|
@@ -107,14 +148,15 @@ class DocuToImageConverter:
|
|
|
107
148
|
# Determine extension
|
|
108
149
|
if file_path:
|
|
109
150
|
ext = os.path.splitext(file_path)[1].lower()
|
|
151
|
+
print('work')
|
|
110
152
|
elif input_bytes:
|
|
111
153
|
# Attempt to infer from first few bytes (simple)
|
|
112
|
-
if input_bytes[:4] == b"%PDF":
|
|
113
|
-
|
|
114
|
-
elif input_bytes[:2] == b"PK":
|
|
115
|
-
|
|
116
|
-
else:
|
|
117
|
-
|
|
154
|
+
# if input_bytes[:4] == b"%PDF":
|
|
155
|
+
# ext = ".pdf"
|
|
156
|
+
# elif input_bytes[:2] == b"PK":
|
|
157
|
+
# ext = ".docx"
|
|
158
|
+
# else:
|
|
159
|
+
# ext = ".img" # Treat as generic image
|
|
118
160
|
|
|
119
161
|
# Write to temp file if doc/pdf
|
|
120
162
|
if ext in [".pdf", ".doc", ".docx"]:
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import json
|
|
3
3
|
import tempfile
|
|
4
|
+
import uuid
|
|
5
|
+
from io import BytesIO
|
|
4
6
|
from pathlib import Path
|
|
5
7
|
|
|
6
8
|
from docx import Document
|
|
@@ -17,21 +19,29 @@ from ..utils.file_loader import SplitType
|
|
|
17
19
|
|
|
18
20
|
load_dotenv(override=True)
|
|
19
21
|
|
|
22
|
+
def get_file_extension(file_path,file_name):
|
|
23
|
+
ext=''
|
|
24
|
+
if file_name:
|
|
25
|
+
ext = file_name[1]
|
|
26
|
+
else:
|
|
27
|
+
# Extract extension
|
|
28
|
+
ext = os.path.splitext(file_path)[1].lower()
|
|
29
|
+
return ext
|
|
30
|
+
|
|
20
31
|
# -----------------------------
|
|
21
32
|
# Abstract Strategy Interface
|
|
22
33
|
# -----------------------------
|
|
23
34
|
class BaseDocumentStrategy:
|
|
24
35
|
"""Defines a standard interface for all document processing strategies."""
|
|
25
36
|
|
|
26
|
-
def process(self, file_path: str, input_bytes: bytes = None):
|
|
37
|
+
def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
|
|
27
38
|
raise NotImplementedError("process() must be implemented by subclasses")
|
|
28
39
|
|
|
29
|
-
|
|
30
40
|
# -----------------------------
|
|
31
41
|
# PDF Strategy
|
|
32
42
|
# -----------------------------
|
|
33
43
|
class PDFStrategy(BaseDocumentStrategy):
|
|
34
|
-
def process(self, file_path: str, input_bytes: bytes = None):
|
|
44
|
+
def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
|
|
35
45
|
print(f"📄 Using PDFStrategy for {file_path}")
|
|
36
46
|
converter = DocuToImageConverter()
|
|
37
47
|
# Example: detect multi-column layout or extract embedded text first
|
|
@@ -44,7 +54,7 @@ class PDFStrategy(BaseDocumentStrategy):
|
|
|
44
54
|
# if text_ratio > 0.0001:
|
|
45
55
|
# print("📚 PDF appears text-based – using hybrid extract + image backup")
|
|
46
56
|
|
|
47
|
-
images = converter.convert_to_images(file_path)
|
|
57
|
+
images = converter.convert_to_images(file_path,input_bytes,ext=ext)
|
|
48
58
|
return images
|
|
49
59
|
|
|
50
60
|
|
|
@@ -52,16 +62,18 @@ class PDFStrategy(BaseDocumentStrategy):
|
|
|
52
62
|
# Word Strategy
|
|
53
63
|
# -----------------------------
|
|
54
64
|
class WordStrategy(BaseDocumentStrategy):
|
|
55
|
-
def process(self, file_path: str, input_bytes: bytes = None):
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
65
|
+
def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
|
|
66
|
+
file_name=''
|
|
67
|
+
if file_path:
|
|
68
|
+
file_name = Path(file_path)
|
|
69
|
+
print(f"📝 Using WordStrategy for {file_path}")
|
|
70
|
+
else:
|
|
71
|
+
file_name_no_ext = os.path.splitext(input_bytes.name)[0]
|
|
60
72
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
61
|
-
pdf_path = Path(tmpdir) / f"{
|
|
73
|
+
pdf_path = Path(tmpdir) / f"{file_name}.pdf"
|
|
62
74
|
|
|
63
75
|
converter = DocuToImageConverter()
|
|
64
|
-
pdf_path = converter._convert_doc_to_pdf(file_path)
|
|
76
|
+
pdf_path = converter._convert_doc_to_pdf(file_path=file_path, input_bytes=input_bytes)
|
|
65
77
|
images = converter.convert_to_images(pdf_path)
|
|
66
78
|
|
|
67
79
|
|
|
@@ -72,9 +84,31 @@ class WordStrategy(BaseDocumentStrategy):
|
|
|
72
84
|
# Image Strategy
|
|
73
85
|
# -----------------------------
|
|
74
86
|
class ImageStrategy(BaseDocumentStrategy):
|
|
75
|
-
def process(self, file_path: str, input_bytes: bytes = None):
|
|
87
|
+
def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
|
|
76
88
|
print(f"🖼️ Using ImageStrategy for {file_path}")
|
|
77
|
-
|
|
89
|
+
if file_path:
|
|
90
|
+
# Path-based loading
|
|
91
|
+
image = Image.open(file_path).convert("RGB")
|
|
92
|
+
|
|
93
|
+
else:
|
|
94
|
+
# Byte-based loading
|
|
95
|
+
if input_bytes is None:
|
|
96
|
+
raise ValueError("Either file_path or input_bytes must be provided")
|
|
97
|
+
|
|
98
|
+
# If it's a Django UploadedFile → read() needed
|
|
99
|
+
if hasattr(input_bytes, "read"):
|
|
100
|
+
input_bytes.seek(0)
|
|
101
|
+
image_bytes = input_bytes.read()
|
|
102
|
+
|
|
103
|
+
# If it's already bytes
|
|
104
|
+
elif isinstance(input_bytes, (bytes, bytearray)):
|
|
105
|
+
image_bytes = input_bytes
|
|
106
|
+
|
|
107
|
+
else:
|
|
108
|
+
raise TypeError("input_bytes must be bytes or file-like object")
|
|
109
|
+
|
|
110
|
+
image = Image.open(BytesIO(image_bytes)).convert("RGB")
|
|
111
|
+
|
|
78
112
|
return [image]
|
|
79
113
|
|
|
80
114
|
|
|
@@ -96,8 +130,14 @@ class StrategyFactory:
|
|
|
96
130
|
}
|
|
97
131
|
|
|
98
132
|
@classmethod
|
|
99
|
-
def get_strategy(cls, file_path: str) -> BaseDocumentStrategy:
|
|
100
|
-
|
|
133
|
+
def get_strategy(cls, file_path: str,file_name:str=None) -> BaseDocumentStrategy:
|
|
134
|
+
if file_name:
|
|
135
|
+
ext=file_name[1]
|
|
136
|
+
else:
|
|
137
|
+
# Extract extension
|
|
138
|
+
|
|
139
|
+
ext = os.path.splitext(file_path)[1].lower()
|
|
140
|
+
|
|
101
141
|
return cls.strategies.get(ext, None)
|
|
102
142
|
|
|
103
143
|
|
|
@@ -109,14 +149,15 @@ class MarkdownAndChunkDocuments:
|
|
|
109
149
|
self.api_key = os.getenv("OPENAI_API_KEY")
|
|
110
150
|
self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
|
|
111
151
|
|
|
112
|
-
def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None):
|
|
152
|
+
def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None):
|
|
113
153
|
# Pick strategy
|
|
114
|
-
strategy = StrategyFactory.get_strategy(file_path)
|
|
154
|
+
strategy = StrategyFactory.get_strategy(file_path,file_name)
|
|
115
155
|
if not strategy:
|
|
116
156
|
raise ValueError(f"Unsupported file type: {file_path}")
|
|
117
157
|
|
|
118
158
|
# Convert to images using correct strategy
|
|
119
|
-
|
|
159
|
+
ext=get_file_extension(file_path,file_name)
|
|
160
|
+
images = strategy.process(file_path, input_bytes,ext)
|
|
120
161
|
|
|
121
162
|
# Extract Markdown from images
|
|
122
163
|
markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
|
|
@@ -146,10 +187,16 @@ class MarkdownAndChunkDocuments:
|
|
|
146
187
|
if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
|
|
147
188
|
md_item["chunked_text"] = md_item["markdown_text"]
|
|
148
189
|
mapped_chunks.append(md_item)
|
|
149
|
-
|
|
190
|
+
adduuid(mapped_chunks)
|
|
150
191
|
print("✅ Processing complete.")
|
|
151
192
|
return mapped_chunks
|
|
152
193
|
|
|
194
|
+
def adduuid(mapped_chunks):
|
|
195
|
+
# Assuming mapped_chunks is a list of dictionaries
|
|
196
|
+
|
|
197
|
+
for chunk in mapped_chunks:
|
|
198
|
+
chunk['id'] = str(uuid.uuid4())
|
|
199
|
+
|
|
153
200
|
|
|
154
201
|
# -----------------------------
|
|
155
202
|
# CLI Entry
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34/prevectorchunks_core.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.32
|
|
3
|
+
Version: 0.1.34
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -49,8 +49,7 @@ Requires-Dist: weasyprint~=62.0
|
|
|
49
49
|
Requires-Dist: lxml~=4.9.3
|
|
50
50
|
Requires-Dist: cssselect2~=0.7.0
|
|
51
51
|
Requires-Dist: cairocffi~=1.4.0
|
|
52
|
-
Requires-Dist: tensorflow
|
|
53
|
-
Requires-Dist: codecarbon>=2.3.0
|
|
52
|
+
Requires-Dist: tensorflow~=2.12.0
|
|
54
53
|
Dynamic: license-file
|
|
55
54
|
|
|
56
55
|
# 📚 PreVectorChunks
|
|
@@ -2,6 +2,40 @@ LICENCE
|
|
|
2
2
|
LICENSE
|
|
3
3
|
README.md
|
|
4
4
|
pyproject.toml
|
|
5
|
+
./prevectorchunks_core/__init__.py
|
|
6
|
+
./prevectorchunks_core/test_loader.py
|
|
7
|
+
./prevectorchunks_core/config/__init__.py
|
|
8
|
+
./prevectorchunks_core/config/splitter_config.py
|
|
9
|
+
./prevectorchunks_core/migrations/__init__.py
|
|
10
|
+
./prevectorchunks_core/os-llm/__init__.py
|
|
11
|
+
./prevectorchunks_core/os-llm/llava.py
|
|
12
|
+
./prevectorchunks_core/rlchunker/__init__.py
|
|
13
|
+
./prevectorchunks_core/rlchunker/env.py
|
|
14
|
+
./prevectorchunks_core/rlchunker/inference.py
|
|
15
|
+
./prevectorchunks_core/rlchunker/model.py
|
|
16
|
+
./prevectorchunks_core/rlchunker/reward.py
|
|
17
|
+
./prevectorchunks_core/rlchunker/savepretrained.py
|
|
18
|
+
./prevectorchunks_core/rlchunker/testpretrained.py
|
|
19
|
+
./prevectorchunks_core/rlchunker/utils.py
|
|
20
|
+
./prevectorchunks_core/rlchunker/pretrained/__init__.py
|
|
21
|
+
./prevectorchunks_core/rlchunker/pretrained/model_info.txt
|
|
22
|
+
./prevectorchunks_core/rlchunker/pretrained/policy_model.pt
|
|
23
|
+
./prevectorchunks_core/services/DocuToImageConverter.py
|
|
24
|
+
./prevectorchunks_core/services/DocuToMarkdownExtractor.py
|
|
25
|
+
./prevectorchunks_core/services/__init__.py
|
|
26
|
+
./prevectorchunks_core/services/audio_processor.py
|
|
27
|
+
./prevectorchunks_core/services/chunk_documents_crud_vdb.py
|
|
28
|
+
./prevectorchunks_core/services/chunk_to_all_content_mapper.py
|
|
29
|
+
./prevectorchunks_core/services/image_processor.py
|
|
30
|
+
./prevectorchunks_core/services/markdown_and_chunk_documents.py
|
|
31
|
+
./prevectorchunks_core/services/propositional_index.py
|
|
32
|
+
./prevectorchunks_core/services/video_analyser.py
|
|
33
|
+
./prevectorchunks_core/tests/__init__.py
|
|
34
|
+
./prevectorchunks_core/tests/test_local.py
|
|
35
|
+
./prevectorchunks_core/utils/__init__.py
|
|
36
|
+
./prevectorchunks_core/utils/extract_content.py
|
|
37
|
+
./prevectorchunks_core/utils/file_loader.py
|
|
38
|
+
./prevectorchunks_core/utils/llm_wrapper.py
|
|
5
39
|
prevectorchunks_core/__init__.py
|
|
6
40
|
prevectorchunks_core/test_loader.py
|
|
7
41
|
prevectorchunks_core.egg-info/PKG-INFO
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "prevectorchunks-core"
|
|
7
|
-
version = "0.1.32"
|
|
7
|
+
version = "0.1.34"
|
|
8
8
|
description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -51,17 +51,20 @@ dependencies = [
|
|
|
51
51
|
"lxml~=4.9.3",
|
|
52
52
|
"cssselect2~=0.7.0",
|
|
53
53
|
"cairocffi~=1.4.0",
|
|
54
|
-
"tensorflow",
|
|
55
|
-
"codecarbon>=2.3.0" # <-- Add this
|
|
54
|
+
"tensorflow~=2.12.0", # <-- Add this
|
|
56
55
|
]
|
|
57
56
|
|
|
57
|
+
|
|
58
|
+
|
|
58
59
|
[tool.setuptools.packages.find]
|
|
60
|
+
where = ["."]
|
|
59
61
|
include = ["prevectorchunks_core*"]
|
|
60
62
|
|
|
61
63
|
[tool.setuptools.package-data]
|
|
62
64
|
"prevectorchunks_core.rlchunker.pretrained" = ["*.pt", "*.txt"]
|
|
63
65
|
|
|
64
66
|
[tool.setuptools]
|
|
67
|
+
package-dir = {"" = "."}
|
|
65
68
|
include-package-data = true
|
|
66
69
|
|
|
67
70
|
[project.urls]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/config/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/os-llm/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/os-llm/llava.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/env.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/model.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/reward.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/test_loader.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/tests/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/tests/test_local.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.32 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|