prevectorchunks-core 0.1.30__tar.gz → 0.1.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {prevectorchunks_core-0.1.30/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.32}/PKG-INFO +3 -1
- prevectorchunks_core-0.1.32/prevectorchunks_core/services/DocuToImageConverter.py +143 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/markdown_and_chunk_documents.py +6 -6
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32/prevectorchunks_core.egg-info}/PKG-INFO +3 -1
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/requires.txt +2 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/pyproject.toml +4 -2
- prevectorchunks_core-0.1.30/prevectorchunks_core/services/DocuToImageConverter.py +0 -148
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/LICENCE +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/LICENSE +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/README.md +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/config/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/config/splitter_config.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/migrations/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/os-llm/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/os-llm/llava.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/env.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/inference.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/model.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/reward.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/utils.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/audio_processor.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/image_processor.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/propositional_index.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/video_analyser.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/test_loader.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/tests/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/tests/test_local.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/__init__.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/extract_content.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/file_loader.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/SOURCES.txt +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/top_level.txt +0 -0
- {prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/setup.cfg +0 -0
{prevectorchunks_core-0.1.30/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.32}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.30
|
|
3
|
+
Version: 0.1.32
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -49,6 +49,8 @@ Requires-Dist: weasyprint~=62.0
|
|
|
49
49
|
Requires-Dist: lxml~=4.9.3
|
|
50
50
|
Requires-Dist: cssselect2~=0.7.0
|
|
51
51
|
Requires-Dist: cairocffi~=1.4.0
|
|
52
|
+
Requires-Dist: tensorflow<3.0.0,>=2.15.0
|
|
53
|
+
Requires-Dist: codecarbon>=2.3.0
|
|
52
54
|
Dynamic: license-file
|
|
53
55
|
|
|
54
56
|
# 📚 PreVectorChunks
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
import shutil
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from PIL import Image
|
|
7
|
+
import io
|
|
8
|
+
import fitz
|
|
9
|
+
from docx2pdf import convert as docx2pdf_convert
|
|
10
|
+
from docx import Document
|
|
11
|
+
from reportlab.pdfgen import canvas
|
|
12
|
+
from reportlab.lib.pagesizes import A4
|
|
13
|
+
import pypandoc
|
|
14
|
+
|
|
15
|
+
# Ensure pandoc is available
|
|
16
|
+
try:
|
|
17
|
+
pypandoc.get_pandoc_path()
|
|
18
|
+
except OSError:
|
|
19
|
+
pypandoc.download_pandoc()
|
|
20
|
+
|
|
21
|
+
class DocuToImageConverter:
|
|
22
|
+
"""Converts a document (PDF, DOCX, DOC, image bytes) into a list of PIL images."""
|
|
23
|
+
|
|
24
|
+
def __init__(self):
|
|
25
|
+
pass
|
|
26
|
+
|
|
27
|
+
def _write_temp_file(self, input_bytes: bytes, suffix: str):
|
|
28
|
+
"""Write bytes to a temporary file and return path."""
|
|
29
|
+
tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
|
|
30
|
+
with os.fdopen(tmp_fd, "wb") as f:
|
|
31
|
+
f.write(input_bytes)
|
|
32
|
+
return tmp_path
|
|
33
|
+
|
|
34
|
+
def _convert_doc_to_pdf(self, input_path: str) -> str:
|
|
35
|
+
"""Convert DOC/DOCX file to PDF using Word COM, LibreOffice, Pandoc, or fallback."""
|
|
36
|
+
if not os.path.exists(input_path):
|
|
37
|
+
raise FileNotFoundError(input_path)
|
|
38
|
+
|
|
39
|
+
output_dir = tempfile.mkdtemp()
|
|
40
|
+
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
41
|
+
|
|
42
|
+
# 1️⃣ Microsoft Word COM automation (Windows only)
|
|
43
|
+
try:
|
|
44
|
+
import win32com.client
|
|
45
|
+
word = win32com.client.Dispatch("Word.Application")
|
|
46
|
+
word.Visible = False
|
|
47
|
+
doc = word.Documents.Open(str(Path(input_path).resolve()))
|
|
48
|
+
doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)
|
|
49
|
+
doc.Close()
|
|
50
|
+
word.Quit()
|
|
51
|
+
return output_pdf
|
|
52
|
+
except Exception:
|
|
53
|
+
pass
|
|
54
|
+
|
|
55
|
+
# 2️⃣ LibreOffice fallback
|
|
56
|
+
try:
|
|
57
|
+
subprocess.run(
|
|
58
|
+
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
59
|
+
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
60
|
+
)
|
|
61
|
+
return output_pdf
|
|
62
|
+
except Exception:
|
|
63
|
+
pass
|
|
64
|
+
|
|
65
|
+
# 3️⃣ Pandoc fallback
|
|
66
|
+
try:
|
|
67
|
+
pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
|
|
68
|
+
pypandoc.convert_file(input_path, "pdf", outputfile=output_pdf,
|
|
69
|
+
extra_args=["--standalone", f"--pdf-engine={pdf_engine}"])
|
|
70
|
+
return output_pdf
|
|
71
|
+
except Exception:
|
|
72
|
+
pass
|
|
73
|
+
|
|
74
|
+
# 4️⃣ Last resort: ReportLab plain text
|
|
75
|
+
doc = Document(input_path)
|
|
76
|
+
c = canvas.Canvas(output_pdf, pagesize=A4)
|
|
77
|
+
width, height = A4
|
|
78
|
+
y = height - 50
|
|
79
|
+
for p in doc.paragraphs:
|
|
80
|
+
c.drawString(50, y, p.text[:1000])
|
|
81
|
+
y -= 15
|
|
82
|
+
if y < 50:
|
|
83
|
+
c.showPage()
|
|
84
|
+
y = height - 50
|
|
85
|
+
c.save()
|
|
86
|
+
return output_pdf
|
|
87
|
+
|
|
88
|
+
def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
|
|
89
|
+
images = []
|
|
90
|
+
pdf_document = fitz.open(pdf_path)
|
|
91
|
+
for page_num in range(len(pdf_document)):
|
|
92
|
+
page = pdf_document[page_num]
|
|
93
|
+
pixmap = page.get_pixmap(dpi=dpi)
|
|
94
|
+
image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
|
|
95
|
+
images.append(image)
|
|
96
|
+
pdf_document.close()
|
|
97
|
+
return images
|
|
98
|
+
|
|
99
|
+
def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG"):
|
|
100
|
+
"""
|
|
101
|
+
Convert a file path or binary content to PIL images.
|
|
102
|
+
Supports PDF, DOC, DOCX, and image files.
|
|
103
|
+
"""
|
|
104
|
+
if not file_path and not input_bytes:
|
|
105
|
+
raise ValueError("Provide either file_path or input_bytes.")
|
|
106
|
+
|
|
107
|
+
# Determine extension
|
|
108
|
+
if file_path:
|
|
109
|
+
ext = os.path.splitext(file_path)[1].lower()
|
|
110
|
+
elif input_bytes:
|
|
111
|
+
# Attempt to infer from first few bytes (simple)
|
|
112
|
+
if input_bytes[:4] == b"%PDF":
|
|
113
|
+
ext = ".pdf"
|
|
114
|
+
elif input_bytes[:2] == b"PK":
|
|
115
|
+
ext = ".docx"
|
|
116
|
+
else:
|
|
117
|
+
ext = ".img" # Treat as generic image
|
|
118
|
+
|
|
119
|
+
# Write to temp file if doc/pdf
|
|
120
|
+
if ext in [".pdf", ".doc", ".docx"]:
|
|
121
|
+
file_path = self._write_temp_file(input_bytes, suffix=ext)
|
|
122
|
+
|
|
123
|
+
# Word → PDF
|
|
124
|
+
if ext in [".doc", ".docx"]:
|
|
125
|
+
pdf_path = self._convert_doc_to_pdf(file_path)
|
|
126
|
+
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
127
|
+
|
|
128
|
+
# PDF → images
|
|
129
|
+
elif ext == ".pdf":
|
|
130
|
+
images = self._convert_pdf_to_images(file_path, dpi=dpi)
|
|
131
|
+
|
|
132
|
+
# Image
|
|
133
|
+
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".img"]:
|
|
134
|
+
image = Image.open(io.BytesIO(input_bytes) if input_bytes else file_path).convert("RGB")
|
|
135
|
+
buffer = io.BytesIO()
|
|
136
|
+
image.save(buffer, format=output_format)
|
|
137
|
+
buffer.seek(0)
|
|
138
|
+
images = [Image.open(buffer)]
|
|
139
|
+
|
|
140
|
+
else:
|
|
141
|
+
raise ValueError("Unsupported file type.")
|
|
142
|
+
|
|
143
|
+
return images
|
|
@@ -23,7 +23,7 @@ load_dotenv(override=True)
|
|
|
23
23
|
class BaseDocumentStrategy:
|
|
24
24
|
"""Defines a standard interface for all document processing strategies."""
|
|
25
25
|
|
|
26
|
-
def process(self, file_path: str):
|
|
26
|
+
def process(self, file_path: str, input_bytes: bytes = None):
|
|
27
27
|
raise NotImplementedError("process() must be implemented by subclasses")
|
|
28
28
|
|
|
29
29
|
|
|
@@ -31,7 +31,7 @@ class BaseDocumentStrategy:
|
|
|
31
31
|
# PDF Strategy
|
|
32
32
|
# -----------------------------
|
|
33
33
|
class PDFStrategy(BaseDocumentStrategy):
|
|
34
|
-
def process(self, file_path: str):
|
|
34
|
+
def process(self, file_path: str, input_bytes: bytes = None):
|
|
35
35
|
print(f"📄 Using PDFStrategy for {file_path}")
|
|
36
36
|
converter = DocuToImageConverter()
|
|
37
37
|
# Example: detect multi-column layout or extract embedded text first
|
|
@@ -52,7 +52,7 @@ class PDFStrategy(BaseDocumentStrategy):
|
|
|
52
52
|
# Word Strategy
|
|
53
53
|
# -----------------------------
|
|
54
54
|
class WordStrategy(BaseDocumentStrategy):
|
|
55
|
-
def process(self, file_path: str):
|
|
55
|
+
def process(self, file_path: str, input_bytes: bytes = None):
|
|
56
56
|
file_path = Path(file_path)
|
|
57
57
|
|
|
58
58
|
print(f"📝 Using WordStrategy for {file_path}")
|
|
@@ -72,7 +72,7 @@ class WordStrategy(BaseDocumentStrategy):
|
|
|
72
72
|
# Image Strategy
|
|
73
73
|
# -----------------------------
|
|
74
74
|
class ImageStrategy(BaseDocumentStrategy):
|
|
75
|
-
def process(self, file_path: str):
|
|
75
|
+
def process(self, file_path: str, input_bytes: bytes = None):
|
|
76
76
|
print(f"🖼️ Using ImageStrategy for {file_path}")
|
|
77
77
|
image = Image.open(file_path).convert("RGB")
|
|
78
78
|
return [image]
|
|
@@ -109,14 +109,14 @@ class MarkdownAndChunkDocuments:
|
|
|
109
109
|
self.api_key = os.getenv("OPENAI_API_KEY")
|
|
110
110
|
self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
|
|
111
111
|
|
|
112
|
-
def markdown_and_chunk_documents(self, file_path: str,include_image:bool):
|
|
112
|
+
def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None):
|
|
113
113
|
# Pick strategy
|
|
114
114
|
strategy = StrategyFactory.get_strategy(file_path)
|
|
115
115
|
if not strategy:
|
|
116
116
|
raise ValueError(f"Unsupported file type: {file_path}")
|
|
117
117
|
|
|
118
118
|
# Convert to images using correct strategy
|
|
119
|
-
images = strategy.process(file_path)
|
|
119
|
+
images = strategy.process(file_path, input_bytes)
|
|
120
120
|
|
|
121
121
|
# Extract Markdown from images
|
|
122
122
|
markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32/prevectorchunks_core.egg-info}/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prevectorchunks-core
|
|
3
|
-
Version: 0.1.30
|
|
3
|
+
Version: 0.1.32
|
|
4
4
|
Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
|
|
5
5
|
Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -49,6 +49,8 @@ Requires-Dist: weasyprint~=62.0
|
|
|
49
49
|
Requires-Dist: lxml~=4.9.3
|
|
50
50
|
Requires-Dist: cssselect2~=0.7.0
|
|
51
51
|
Requires-Dist: cairocffi~=1.4.0
|
|
52
|
+
Requires-Dist: tensorflow<3.0.0,>=2.15.0
|
|
53
|
+
Requires-Dist: codecarbon>=2.3.0
|
|
52
54
|
Dynamic: license-file
|
|
53
55
|
|
|
54
56
|
# 📚 PreVectorChunks
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "prevectorchunks-core"
|
|
7
|
-
version = "0.1.30"
|
|
7
|
+
version = "0.1.32"
|
|
8
8
|
description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -50,7 +50,9 @@ dependencies = [
|
|
|
50
50
|
"weasyprint~=62.0",
|
|
51
51
|
"lxml~=4.9.3",
|
|
52
52
|
"cssselect2~=0.7.0",
|
|
53
|
-
"cairocffi~=1.4.0"
|
|
53
|
+
"cairocffi~=1.4.0",
|
|
54
|
+
"tensorflow>=2.15.0,<3.0.0", # <-- Add this
|
|
55
|
+
"codecarbon>=2.3.0" # <-- Add this
|
|
54
56
|
]
|
|
55
57
|
|
|
56
58
|
[tool.setuptools.packages.find]
|
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import shutil
|
|
3
|
-
import subprocess
|
|
4
|
-
import sys
|
|
5
|
-
import tempfile
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
import pypandoc
|
|
9
|
-
from PIL import Image
|
|
10
|
-
import io
|
|
11
|
-
from docx2pdf import convert as docx_to_pdf
|
|
12
|
-
import fitz
|
|
13
|
-
from docx2pdf import convert as docx2pdf_convert
|
|
14
|
-
try:
|
|
15
|
-
pypandoc.get_pandoc_path()
|
|
16
|
-
except OSError:
|
|
17
|
-
print("Pandoc not found — downloading it temporarily...")
|
|
18
|
-
pypandoc.download_pandoc()
|
|
19
|
-
|
|
20
|
-
class DocuToImageConverter:
|
|
21
|
-
"""Converts a document (PDF, DOCX, DOC) into a list of PIL images."""
|
|
22
|
-
|
|
23
|
-
def __init__(self):
|
|
24
|
-
pass
|
|
25
|
-
|
|
26
|
-
def _convert_doc_to_pdf(self, input_path: str) -> str:
|
|
27
|
-
import os, tempfile, shutil, subprocess
|
|
28
|
-
from pathlib import Path
|
|
29
|
-
|
|
30
|
-
if not os.path.exists(input_path):
|
|
31
|
-
raise FileNotFoundError(input_path)
|
|
32
|
-
|
|
33
|
-
output_dir = tempfile.mkdtemp()
|
|
34
|
-
output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
|
|
35
|
-
|
|
36
|
-
# 1️⃣ Try Microsoft Word COM automation (Windows only)
|
|
37
|
-
try:
|
|
38
|
-
import win32com.client
|
|
39
|
-
word = win32com.client.Dispatch("Word.Application")
|
|
40
|
-
word.Visible = False
|
|
41
|
-
doc = word.Documents.Open(str(Path(input_path).resolve()))
|
|
42
|
-
doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17) # 17 = wdFormatPDF
|
|
43
|
-
doc.Close()
|
|
44
|
-
word.Quit()
|
|
45
|
-
print("✅ Word COM conversion successful:", output_pdf)
|
|
46
|
-
return output_pdf
|
|
47
|
-
except Exception as e:
|
|
48
|
-
print("⚠️ Word COM conversion failed:", e)
|
|
49
|
-
|
|
50
|
-
# 2️⃣ Fallback: LibreOffice (cross-platform, preserves layout)
|
|
51
|
-
try:
|
|
52
|
-
# Requires LibreOffice installed and in PATH
|
|
53
|
-
subprocess.run(
|
|
54
|
-
["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
|
|
55
|
-
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
|
56
|
-
)
|
|
57
|
-
print("✅ LibreOffice conversion successful:", output_pdf)
|
|
58
|
-
return output_pdf
|
|
59
|
-
except Exception as e:
|
|
60
|
-
print("⚠️ LibreOffice conversion failed:", e)
|
|
61
|
-
|
|
62
|
-
# 3️⃣ Fallback: Pandoc (simpler, loses layout)
|
|
63
|
-
try:
|
|
64
|
-
import pypandoc
|
|
65
|
-
def which(cmd):
|
|
66
|
-
return shutil.which(cmd) is not None
|
|
67
|
-
|
|
68
|
-
pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
|
|
69
|
-
pypandoc.convert_file(
|
|
70
|
-
input_path, "pdf", outputfile=output_pdf,
|
|
71
|
-
extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
|
|
72
|
-
)
|
|
73
|
-
print("✅ Pandoc conversion successful:", output_pdf)
|
|
74
|
-
return output_pdf
|
|
75
|
-
except Exception as e:
|
|
76
|
-
print("⚠️ Pandoc conversion failed:", e)
|
|
77
|
-
|
|
78
|
-
# 4️⃣ Last resort: ReportLab basic text (no formatting)
|
|
79
|
-
from reportlab.pdfgen import canvas
|
|
80
|
-
from reportlab.lib.pagesizes import A4
|
|
81
|
-
from docx import Document
|
|
82
|
-
|
|
83
|
-
doc = Document(input_path)
|
|
84
|
-
c = canvas.Canvas(output_pdf, pagesize=A4)
|
|
85
|
-
width, height = A4
|
|
86
|
-
y = height - 50
|
|
87
|
-
for p in doc.paragraphs:
|
|
88
|
-
c.drawString(50, y, p.text[:1000])
|
|
89
|
-
y -= 15
|
|
90
|
-
if y < 50:
|
|
91
|
-
c.showPage()
|
|
92
|
-
y = height - 50
|
|
93
|
-
c.save()
|
|
94
|
-
print("⚠️ Fallback to plain ReportLab text output:", output_pdf)
|
|
95
|
-
return output_pdf
|
|
96
|
-
|
|
97
|
-
def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
|
|
98
|
-
"""
|
|
99
|
-
Converts each page of a PDF into images using PyMuPDF directly.
|
|
100
|
-
"""
|
|
101
|
-
images = []
|
|
102
|
-
|
|
103
|
-
try:
|
|
104
|
-
pdf_document = fitz.open(pdf_path) # Use `PyMuPDF` instead of fitz alias
|
|
105
|
-
for page_num in range(len(pdf_document)):
|
|
106
|
-
page = pdf_document[page_num]
|
|
107
|
-
# Render page to a pixmap with the specified DPI
|
|
108
|
-
pixmap = page.get_pixmap(dpi=dpi)
|
|
109
|
-
# Convert pixmap to an Image object using PIL
|
|
110
|
-
image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
|
|
111
|
-
images.append(image)
|
|
112
|
-
pdf_document.close()
|
|
113
|
-
except Exception as e:
|
|
114
|
-
raise RuntimeError(f"Failed to convert PDF to images: {e}")
|
|
115
|
-
|
|
116
|
-
return images
|
|
117
|
-
|
|
118
|
-
def convert_to_images(self, file_path: str, dpi: int = 200, output_format: str = "PNG"):
|
|
119
|
-
"""
|
|
120
|
-
Converts each page of a document into a list of PIL images.
|
|
121
|
-
Supports .pdf, .doc, .docx, and image files (.jpg, .png, etc.)
|
|
122
|
-
Ensures all outputs are in a consistent image format.
|
|
123
|
-
"""
|
|
124
|
-
ext = os.path.splitext(file_path)[1].lower()
|
|
125
|
-
|
|
126
|
-
# Convert Word → PDF first
|
|
127
|
-
if ext in [".doc", ".docx"]:
|
|
128
|
-
pdf_path = self._convert_doc_to_pdf(file_path)
|
|
129
|
-
images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
|
|
130
|
-
|
|
131
|
-
# Convert PDF → list of images
|
|
132
|
-
elif ext == ".pdf":
|
|
133
|
-
images = self._convert_pdf_to_images(file_path, dpi=dpi)
|
|
134
|
-
|
|
135
|
-
# Handle already an image file
|
|
136
|
-
elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
|
|
137
|
-
image = Image.open(file_path).convert("RGB")
|
|
138
|
-
# Convert to consistent format (e.g., PNG or JPEG in memory)
|
|
139
|
-
buffer = io.BytesIO()
|
|
140
|
-
image.save(buffer, format=output_format)
|
|
141
|
-
buffer.seek(0)
|
|
142
|
-
converted_image = Image.open(buffer)
|
|
143
|
-
images = [converted_image]
|
|
144
|
-
|
|
145
|
-
else:
|
|
146
|
-
raise ValueError("Unsupported file type. Use .pdf, .doc, .docx, or image files")
|
|
147
|
-
|
|
148
|
-
return images
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/config/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/os-llm/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/os-llm/llava.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/env.py
RENAMED
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/model.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/reward.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/test_loader.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/tests/__init__.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/tests/test_local.py
RENAMED
|
File without changes
|
{prevectorchunks_core-0.1.30 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|