prevectorchunks-core 0.1.31__tar.gz → 0.1.32__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {prevectorchunks_core-0.1.31/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.32}/PKG-INFO +3 -1
  2. prevectorchunks_core-0.1.32/prevectorchunks_core/services/DocuToImageConverter.py +143 -0
  3. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/markdown_and_chunk_documents.py +6 -6
  4. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32/prevectorchunks_core.egg-info}/PKG-INFO +3 -1
  5. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/requires.txt +2 -0
  6. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/pyproject.toml +4 -2
  7. prevectorchunks_core-0.1.31/prevectorchunks_core/services/DocuToImageConverter.py +0 -148
  8. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/LICENCE +0 -0
  9. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/LICENSE +0 -0
  10. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/README.md +0 -0
  11. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/__init__.py +0 -0
  12. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/config/__init__.py +0 -0
  13. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/config/splitter_config.py +0 -0
  14. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/migrations/__init__.py +0 -0
  15. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/os-llm/__init__.py +0 -0
  16. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/os-llm/llava.py +0 -0
  17. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/__init__.py +0 -0
  18. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/env.py +0 -0
  19. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/inference.py +0 -0
  20. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/model.py +0 -0
  21. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
  22. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
  23. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
  24. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/reward.py +0 -0
  25. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
  26. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
  27. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/rlchunker/utils.py +0 -0
  28. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +0 -0
  29. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/__init__.py +0 -0
  30. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/audio_processor.py +0 -0
  31. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
  32. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
  33. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/image_processor.py +0 -0
  34. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/propositional_index.py +0 -0
  35. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/services/video_analyser.py +0 -0
  36. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/test_loader.py +0 -0
  37. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/tests/__init__.py +0 -0
  38. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/tests/test_local.py +0 -0
  39. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/__init__.py +0 -0
  40. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/extract_content.py +0 -0
  41. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/file_loader.py +0 -0
  42. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
  43. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/SOURCES.txt +0 -0
  44. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
  45. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/prevectorchunks_core.egg-info/top_level.txt +0 -0
  46. {prevectorchunks_core-0.1.31 → prevectorchunks_core-0.1.32}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.31
3
+ Version: 0.1.32
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -49,6 +49,8 @@ Requires-Dist: weasyprint~=62.0
49
49
  Requires-Dist: lxml~=4.9.3
50
50
  Requires-Dist: cssselect2~=0.7.0
51
51
  Requires-Dist: cairocffi~=1.4.0
52
+ Requires-Dist: tensorflow<3.0.0,>=2.15.0
53
+ Requires-Dist: codecarbon>=2.3.0
52
54
  Dynamic: license-file
53
55
 
54
56
  # 📚 PreVectorChunks
@@ -0,0 +1,143 @@
1
+ import os
2
+ import tempfile
3
+ import shutil
4
+ import subprocess
5
+ from pathlib import Path
6
+ from PIL import Image
7
+ import io
8
+ import fitz
9
+ from docx2pdf import convert as docx2pdf_convert
10
+ from docx import Document
11
+ from reportlab.pdfgen import canvas
12
+ from reportlab.lib.pagesizes import A4
13
+ import pypandoc
14
+
15
+ # Ensure pandoc is available
16
+ try:
17
+ pypandoc.get_pandoc_path()
18
+ except OSError:
19
+ pypandoc.download_pandoc()
20
+
21
+ class DocuToImageConverter:
22
+ """Converts a document (PDF, DOCX, DOC, image bytes) into a list of PIL images."""
23
+
24
+ def __init__(self):
25
+ pass
26
+
27
+ def _write_temp_file(self, input_bytes: bytes, suffix: str):
28
+ """Write bytes to a temporary file and return path."""
29
+ tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
30
+ with os.fdopen(tmp_fd, "wb") as f:
31
+ f.write(input_bytes)
32
+ return tmp_path
33
+
34
+ def _convert_doc_to_pdf(self, input_path: str) -> str:
35
+ """Convert DOC/DOCX file to PDF using Word COM, LibreOffice, Pandoc, or fallback."""
36
+ if not os.path.exists(input_path):
37
+ raise FileNotFoundError(input_path)
38
+
39
+ output_dir = tempfile.mkdtemp()
40
+ output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
41
+
42
+ # 1️⃣ Microsoft Word COM automation (Windows only)
43
+ try:
44
+ import win32com.client
45
+ word = win32com.client.Dispatch("Word.Application")
46
+ word.Visible = False
47
+ doc = word.Documents.Open(str(Path(input_path).resolve()))
48
+ doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17)
49
+ doc.Close()
50
+ word.Quit()
51
+ return output_pdf
52
+ except Exception:
53
+ pass
54
+
55
+ # 2️⃣ LibreOffice fallback
56
+ try:
57
+ subprocess.run(
58
+ ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
59
+ check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
60
+ )
61
+ return output_pdf
62
+ except Exception:
63
+ pass
64
+
65
+ # 3️⃣ Pandoc fallback
66
+ try:
67
+ pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
68
+ pypandoc.convert_file(input_path, "pdf", outputfile=output_pdf,
69
+ extra_args=["--standalone", f"--pdf-engine={pdf_engine}"])
70
+ return output_pdf
71
+ except Exception:
72
+ pass
73
+
74
+ # 4️⃣ Last resort: ReportLab plain text
75
+ doc = Document(input_path)
76
+ c = canvas.Canvas(output_pdf, pagesize=A4)
77
+ width, height = A4
78
+ y = height - 50
79
+ for p in doc.paragraphs:
80
+ c.drawString(50, y, p.text[:1000])
81
+ y -= 15
82
+ if y < 50:
83
+ c.showPage()
84
+ y = height - 50
85
+ c.save()
86
+ return output_pdf
87
+
88
+ def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
89
+ images = []
90
+ pdf_document = fitz.open(pdf_path)
91
+ for page_num in range(len(pdf_document)):
92
+ page = pdf_document[page_num]
93
+ pixmap = page.get_pixmap(dpi=dpi)
94
+ image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
95
+ images.append(image)
96
+ pdf_document.close()
97
+ return images
98
+
99
+ def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG"):
100
+ """
101
+ Convert a file path or binary content to PIL images.
102
+ Supports PDF, DOC, DOCX, and image files.
103
+ """
104
+ if not file_path and not input_bytes:
105
+ raise ValueError("Provide either file_path or input_bytes.")
106
+
107
+ # Determine extension
108
+ if file_path:
109
+ ext = os.path.splitext(file_path)[1].lower()
110
+ elif input_bytes:
111
+ # Attempt to infer from first few bytes (simple)
112
+ if input_bytes[:4] == b"%PDF":
113
+ ext = ".pdf"
114
+ elif input_bytes[:2] == b"PK":
115
+ ext = ".docx"
116
+ else:
117
+ ext = ".img" # Treat as generic image
118
+
119
+ # Write to temp file if doc/pdf
120
+ if ext in [".pdf", ".doc", ".docx"]:
121
+ file_path = self._write_temp_file(input_bytes, suffix=ext)
122
+
123
+ # Word → PDF
124
+ if ext in [".doc", ".docx"]:
125
+ pdf_path = self._convert_doc_to_pdf(file_path)
126
+ images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
127
+
128
+ # PDF → images
129
+ elif ext == ".pdf":
130
+ images = self._convert_pdf_to_images(file_path, dpi=dpi)
131
+
132
+ # Image
133
+ elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".img"]:
134
+ image = Image.open(io.BytesIO(input_bytes) if input_bytes else file_path).convert("RGB")
135
+ buffer = io.BytesIO()
136
+ image.save(buffer, format=output_format)
137
+ buffer.seek(0)
138
+ images = [Image.open(buffer)]
139
+
140
+ else:
141
+ raise ValueError("Unsupported file type.")
142
+
143
+ return images
@@ -23,7 +23,7 @@ load_dotenv(override=True)
23
23
  class BaseDocumentStrategy:
24
24
  """Defines a standard interface for all document processing strategies."""
25
25
 
26
- def process(self, file_path: str):
26
+ def process(self, file_path: str, input_bytes: bytes = None):
27
27
  raise NotImplementedError("process() must be implemented by subclasses")
28
28
 
29
29
 
@@ -31,7 +31,7 @@ class BaseDocumentStrategy:
31
31
  # PDF Strategy
32
32
  # -----------------------------
33
33
  class PDFStrategy(BaseDocumentStrategy):
34
- def process(self, file_path: str):
34
+ def process(self, file_path: str, input_bytes: bytes = None):
35
35
  print(f"📄 Using PDFStrategy for {file_path}")
36
36
  converter = DocuToImageConverter()
37
37
  # Example: detect multi-column layout or extract embedded text first
@@ -52,7 +52,7 @@ class PDFStrategy(BaseDocumentStrategy):
52
52
  # Word Strategy
53
53
  # -----------------------------
54
54
  class WordStrategy(BaseDocumentStrategy):
55
- def process(self, file_path: str):
55
+ def process(self, file_path: str, input_bytes: bytes = None):
56
56
  file_path = Path(file_path)
57
57
 
58
58
  print(f"📝 Using WordStrategy for {file_path}")
@@ -72,7 +72,7 @@ class WordStrategy(BaseDocumentStrategy):
72
72
  # Image Strategy
73
73
  # -----------------------------
74
74
  class ImageStrategy(BaseDocumentStrategy):
75
- def process(self, file_path: str):
75
+ def process(self, file_path: str, input_bytes: bytes = None):
76
76
  print(f"🖼️ Using ImageStrategy for {file_path}")
77
77
  image = Image.open(file_path).convert("RGB")
78
78
  return [image]
@@ -109,14 +109,14 @@ class MarkdownAndChunkDocuments:
109
109
  self.api_key = os.getenv("OPENAI_API_KEY")
110
110
  self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
111
111
 
112
- def markdown_and_chunk_documents(self, file_path: str,include_image:bool):
112
+ def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None):
113
113
  # Pick strategy
114
114
  strategy = StrategyFactory.get_strategy(file_path)
115
115
  if not strategy:
116
116
  raise ValueError(f"Unsupported file type: {file_path}")
117
117
 
118
118
  # Convert to images using correct strategy
119
- images = strategy.process(file_path)
119
+ images = strategy.process(file_path, input_bytes)
120
120
 
121
121
  # Extract Markdown from images
122
122
  markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.31
3
+ Version: 0.1.32
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -49,6 +49,8 @@ Requires-Dist: weasyprint~=62.0
49
49
  Requires-Dist: lxml~=4.9.3
50
50
  Requires-Dist: cssselect2~=0.7.0
51
51
  Requires-Dist: cairocffi~=1.4.0
52
+ Requires-Dist: tensorflow<3.0.0,>=2.15.0
53
+ Requires-Dist: codecarbon>=2.3.0
52
54
  Dynamic: license-file
53
55
 
54
56
  # 📚 PreVectorChunks
@@ -35,3 +35,5 @@ weasyprint~=62.0
35
35
  lxml~=4.9.3
36
36
  cssselect2~=0.7.0
37
37
  cairocffi~=1.4.0
38
+ tensorflow<3.0.0,>=2.15.0
39
+ codecarbon>=2.3.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "prevectorchunks-core"
7
- version = "0.1.31"
7
+ version = "0.1.32"
8
8
  description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -50,7 +50,9 @@ dependencies = [
50
50
  "weasyprint~=62.0",
51
51
  "lxml~=4.9.3",
52
52
  "cssselect2~=0.7.0",
53
- "cairocffi~=1.4.0"
53
+ "cairocffi~=1.4.0",
54
+ "tensorflow>=2.15.0,<3.0.0", # <-- Add this
55
+ "codecarbon>=2.3.0" # <-- Add this
54
56
  ]
55
57
 
56
58
  [tool.setuptools.packages.find]
@@ -1,148 +0,0 @@
1
- import os
2
- import shutil
3
- import subprocess
4
- import sys
5
- import tempfile
6
- from pathlib import Path
7
-
8
- import pypandoc
9
- from PIL import Image
10
- import io
11
- from docx2pdf import convert as docx_to_pdf
12
- import fitz
13
- from docx2pdf import convert as docx2pdf_convert
14
- try:
15
- pypandoc.get_pandoc_path()
16
- except OSError:
17
- print("Pandoc not found — downloading it temporarily...")
18
- pypandoc.download_pandoc()
19
-
20
- class DocuToImageConverter:
21
- """Converts a document (PDF, DOCX, DOC) into a list of PIL images."""
22
-
23
- def __init__(self):
24
- pass
25
-
26
- def _convert_doc_to_pdf(self, input_path: str) -> str:
27
- import os, tempfile, shutil, subprocess
28
- from pathlib import Path
29
-
30
- if not os.path.exists(input_path):
31
- raise FileNotFoundError(input_path)
32
-
33
- output_dir = tempfile.mkdtemp()
34
- output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
35
-
36
- # 1️⃣ Try Microsoft Word COM automation (Windows only)
37
- try:
38
- import win32com.client
39
- word = win32com.client.Dispatch("Word.Application")
40
- word.Visible = False
41
- doc = word.Documents.Open(str(Path(input_path).resolve()))
42
- doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17) # 17 = wdFormatPDF
43
- doc.Close()
44
- word.Quit()
45
- print("✅ Word COM conversion successful:", output_pdf)
46
- return output_pdf
47
- except Exception as e:
48
- print("⚠️ Word COM conversion failed:", e)
49
-
50
- # 2️⃣ Fallback: LibreOffice (cross-platform, preserves layout)
51
- try:
52
- # Requires LibreOffice installed and in PATH
53
- subprocess.run(
54
- ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
55
- check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
56
- )
57
- print("✅ LibreOffice conversion successful:", output_pdf)
58
- return output_pdf
59
- except Exception as e:
60
- print("⚠️ LibreOffice conversion failed:", e)
61
-
62
- # 3️⃣ Fallback: Pandoc (simpler, loses layout)
63
- try:
64
- import pypandoc
65
- def which(cmd):
66
- return shutil.which(cmd) is not None
67
-
68
- pdf_engine = "pdflatex" if which("pdflatex") else "wkhtmltopdf"
69
- pypandoc.convert_file(
70
- input_path, "pdf", outputfile=output_pdf,
71
- extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
72
- )
73
- print("✅ Pandoc conversion successful:", output_pdf)
74
- return output_pdf
75
- except Exception as e:
76
- print("⚠️ Pandoc conversion failed:", e)
77
-
78
- # 4️⃣ Last resort: ReportLab basic text (no formatting)
79
- from reportlab.pdfgen import canvas
80
- from reportlab.lib.pagesizes import A4
81
- from docx import Document
82
-
83
- doc = Document(input_path)
84
- c = canvas.Canvas(output_pdf, pagesize=A4)
85
- width, height = A4
86
- y = height - 50
87
- for p in doc.paragraphs:
88
- c.drawString(50, y, p.text[:1000])
89
- y -= 15
90
- if y < 50:
91
- c.showPage()
92
- y = height - 50
93
- c.save()
94
- print("⚠️ Fallback to plain ReportLab text output:", output_pdf)
95
- return output_pdf
96
-
97
- def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
98
- """
99
- Converts each page of a PDF into images using PyMuPDF directly.
100
- """
101
- images = []
102
-
103
- try:
104
- pdf_document = fitz.open(pdf_path) # Use `PyMuPDF` instead of fitz alias
105
- for page_num in range(len(pdf_document)):
106
- page = pdf_document[page_num]
107
- # Render page to a pixmap with the specified DPI
108
- pixmap = page.get_pixmap(dpi=dpi)
109
- # Convert pixmap to an Image object using PIL
110
- image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
111
- images.append(image)
112
- pdf_document.close()
113
- except Exception as e:
114
- raise RuntimeError(f"Failed to convert PDF to images: {e}")
115
-
116
- return images
117
-
118
- def convert_to_images(self, file_path: str, dpi: int = 200, output_format: str = "PNG"):
119
- """
120
- Converts each page of a document into a list of PIL images.
121
- Supports .pdf, .doc, .docx, and image files (.jpg, .png, etc.)
122
- Ensures all outputs are in a consistent image format.
123
- """
124
- ext = os.path.splitext(file_path)[1].lower()
125
-
126
- # Convert Word → PDF first
127
- if ext in [".doc", ".docx"]:
128
- pdf_path = self._convert_doc_to_pdf(file_path)
129
- images = self._convert_pdf_to_images(pdf_path, dpi=dpi)
130
-
131
- # Convert PDF → list of images
132
- elif ext == ".pdf":
133
- images = self._convert_pdf_to_images(file_path, dpi=dpi)
134
-
135
- # Handle already an image file
136
- elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
137
- image = Image.open(file_path).convert("RGB")
138
- # Convert to consistent format (e.g., PNG or JPEG in memory)
139
- buffer = io.BytesIO()
140
- image.save(buffer, format=output_format)
141
- buffer.seek(0)
142
- converted_image = Image.open(buffer)
143
- images = [converted_image]
144
-
145
- else:
146
- raise ValueError("Unsupported file type. Use .pdf, .doc, .docx, or image files")
147
-
148
- return images