prevectorchunks-core 0.1.27__tar.gz → 0.1.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {prevectorchunks_core-0.1.27/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.29}/PKG-INFO +2 -2
  2. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/DocuToImageConverter.py +40 -17
  3. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +9 -6
  4. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/markdown_and_chunk_documents.py +11 -17
  5. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/test_loader.py +1 -1
  6. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29/prevectorchunks_core.egg-info}/PKG-INFO +2 -2
  7. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/requires.txt +1 -1
  8. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/pyproject.toml +2 -2
  9. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/LICENCE +0 -0
  10. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/LICENSE +0 -0
  11. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/README.md +0 -0
  12. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/__init__.py +0 -0
  13. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/config/__init__.py +0 -0
  14. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/config/splitter_config.py +0 -0
  15. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/migrations/__init__.py +0 -0
  16. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/os-llm/__init__.py +0 -0
  17. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/os-llm/llava.py +0 -0
  18. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/__init__.py +0 -0
  19. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/env.py +0 -0
  20. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/inference.py +0 -0
  21. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/model.py +0 -0
  22. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
  23. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
  24. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
  25. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/reward.py +0 -0
  26. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
  27. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
  28. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/rlchunker/utils.py +0 -0
  29. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/__init__.py +0 -0
  30. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/audio_processor.py +0 -0
  31. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
  32. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
  33. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/image_processor.py +0 -0
  34. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/propositional_index.py +0 -0
  35. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/services/video_analyser.py +0 -0
  36. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/tests/__init__.py +0 -0
  37. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/tests/test_local.py +0 -0
  38. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/__init__.py +0 -0
  39. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/extract_content.py +0 -0
  40. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/file_loader.py +0 -0
  41. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
  42. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/SOURCES.txt +0 -0
  43. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
  44. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/prevectorchunks_core.egg-info/top_level.txt +0 -0
  45. {prevectorchunks_core-0.1.27 → prevectorchunks_core-0.1.29}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.27
3
+ Version: 0.1.29
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -40,7 +40,7 @@ Requires-Dist: requests~=2.32.5
40
40
  Requires-Dist: langchain-core~=0.3.78
41
41
  Requires-Dist: pdf2image~=1.17.0
42
42
  Requires-Dist: docx2pdf~=0.1.8
43
- Requires-Dist: numpy~=2.2.6
43
+ Requires-Dist: numpy~=2.0.0
44
44
  Requires-Dist: scikit-learn~=1.7.2
45
45
  Requires-Dist: PyMuPDF~=1.22.5
46
46
  Requires-Dist: pypandoc~=1.13
@@ -3,6 +3,7 @@ import shutil
3
3
  import subprocess
4
4
  import sys
5
5
  import tempfile
6
+ from pathlib import Path
6
7
 
7
8
  import pypandoc
8
9
  from PIL import Image
@@ -23,19 +24,44 @@ class DocuToImageConverter:
23
24
  pass
24
25
 
25
26
  def _convert_doc_to_pdf(self, input_path: str) -> str:
26
- import shutil, tempfile, os, pypandoc
27
- from docx import Document
27
+ import os, tempfile, shutil, subprocess
28
+ from pathlib import Path
28
29
 
29
30
  if not os.path.exists(input_path):
30
31
  raise FileNotFoundError(input_path)
31
32
 
32
33
  output_dir = tempfile.mkdtemp()
33
- output_pdf = os.path.join(output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf")
34
+ output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
35
+
36
+ # 1️⃣ Try Microsoft Word COM automation (Windows only)
37
+ try:
38
+ import win32com.client
39
+ word = win32com.client.Dispatch("Word.Application")
40
+ word.Visible = False
41
+ doc = word.Documents.Open(str(Path(input_path).resolve()))
42
+ doc.SaveAs(str(Path(output_pdf).resolve()), FileFormat=17) # 17 = wdFormatPDF
43
+ doc.Close()
44
+ word.Quit()
45
+ print("✅ Word COM conversion successful:", output_pdf)
46
+ return output_pdf
47
+ except Exception as e:
48
+ print("⚠️ Word COM conversion failed:", e)
34
49
 
35
- # 1️⃣ Try Pandoc + wkhtmltopdf or pdflatex
50
+ # 2️⃣ Fallback: LibreOffice (cross-platform, preserves layout)
36
51
  try:
37
- pypandoc.get_pandoc_path()
52
+ # Requires LibreOffice installed and in PATH
53
+ subprocess.run(
54
+ ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
55
+ check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
56
+ )
57
+ print("✅ LibreOffice conversion successful:", output_pdf)
58
+ return output_pdf
59
+ except Exception as e:
60
+ print("⚠️ LibreOffice conversion failed:", e)
38
61
 
62
+ # 3️⃣ Fallback: Pandoc (simpler, loses layout)
63
+ try:
64
+ import pypandoc
39
65
  def which(cmd):
40
66
  return shutil.which(cmd) is not None
41
67
 
@@ -44,23 +70,16 @@ class DocuToImageConverter:
44
70
  input_path, "pdf", outputfile=output_pdf,
45
71
  extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
46
72
  )
73
+ print("✅ Pandoc conversion successful:", output_pdf)
47
74
  return output_pdf
48
75
  except Exception as e:
49
- print("⚠️ Pandoc PDF conversion failed:", e)
76
+ print("⚠️ Pandoc conversion failed:", e)
50
77
 
51
- # 2️⃣ Fallback to pure Python (WeasyPrint)
52
- try:
53
- from weasyprint import HTML
54
- doc = Document(input_path)
55
- html = "<html><body>" + "".join(f"<p>{p.text}</p>" for p in doc.paragraphs) + "</body></html>"
56
- HTML(string=html).write_pdf(output_pdf)
57
- return output_pdf
58
- except Exception as e:
59
- print("⚠️ Fallback to WeasyPrint failed:", e)
60
-
61
- # 3️⃣ Last resort (plain text with ReportLab)
78
+ # 4️⃣ Last resort: ReportLab basic text (no formatting)
62
79
  from reportlab.pdfgen import canvas
63
80
  from reportlab.lib.pagesizes import A4
81
+ from docx import Document
82
+
64
83
  doc = Document(input_path)
65
84
  c = canvas.Canvas(output_pdf, pagesize=A4)
66
85
  width, height = A4
@@ -68,7 +87,11 @@ class DocuToImageConverter:
68
87
  for p in doc.paragraphs:
69
88
  c.drawString(50, y, p.text[:1000])
70
89
  y -= 15
90
+ if y < 50:
91
+ c.showPage()
92
+ y = height - 50
71
93
  c.save()
94
+ print("⚠️ Fallback to plain ReportLab text output:", output_pdf)
72
95
  return output_pdf
73
96
 
74
97
  def _convert_pdf_to_images(self, pdf_path: str, dpi: int = 200):
@@ -42,12 +42,12 @@ class DocuToMarkdownExtractor:
42
42
  fins = [{"type": "text", "text": "You are a document parser. Extract all text and tables "
43
43
  "from this image and format the output in clean Markdown. "
44
44
  "Preserve table structure, headings, and lists. If there is no markdown, put a space. "
45
- "Put your result in a JSON object with the following keys:\n"
46
- "- markdown_text: the markdown text\n"
47
- "- short_title: the short title of the document\n"
48
- "- page_number: the page number of the document (image index + 1)\n"
49
- "- summary: a summary of the document\n,"
50
- " - image_data: the image data in base64 format\n,"
45
+ "Put your result in a JSON object with the following keys:"
46
+ "- markdown_text: the markdown text"
47
+ "- short_title: the short title of the document"
48
+ "- page_number: the page number of the document (i+1)"
49
+ "- summary: a summary of the document,"
50
+ " - image_data: the image data in base64 format,"
51
51
  "Return only raw JSON, without markdown formatting or triple backticks."
52
52
  "- image_index: the index of the image in the document"},
53
53
  {"type": "text", "text": "You are an image inspector. Tell us what is in the image "
@@ -63,6 +63,9 @@ class DocuToMarkdownExtractor:
63
63
  text_content=text_content+"\n"+response["markdown_text"]
64
64
  if(include_image):
65
65
  response["image_data"]=b64_image
66
+ response["image_index"]=i
67
+ response["page_number"] = i
68
+
66
69
  all_outputs.append(response)
67
70
 
68
71
  json_array = json.dumps(all_outputs, indent=2)
@@ -1,5 +1,7 @@
1
1
  import os
2
2
  import json
3
+ import tempfile
4
+ from pathlib import Path
3
5
 
4
6
  from docx import Document
5
7
  from dotenv import load_dotenv
@@ -51,25 +53,17 @@ class PDFStrategy(BaseDocumentStrategy):
51
53
  # -----------------------------
52
54
  class WordStrategy(BaseDocumentStrategy):
53
55
  def process(self, file_path: str):
56
+ file_path = Path(file_path)
57
+
54
58
  print(f"📝 Using WordStrategy for {file_path}")
55
59
 
56
- # Extract text semantically first
57
- try:
58
- doc = Document(file_path)
59
- paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
60
- text_content = "\n".join(paragraphs)
61
- print(f"🧩 Extracted {len(paragraphs)} paragraphs via python-docx")
62
- except Exception as e:
63
- print("⚠️ Could not parse docx structurally, falling back to image mode:", e)
64
- text_content = ""
60
+ with tempfile.TemporaryDirectory() as tmpdir:
61
+ pdf_path = Path(tmpdir) / f"{file_path.stem}.pdf"
65
62
 
66
- converter = DocuToImageConverter()
67
- pdf_path = converter._convert_doc_to_pdf(file_path)
68
- images = converter.convert_to_images(pdf_path)
63
+ converter = DocuToImageConverter()
64
+ pdf_path = converter._convert_doc_to_pdf(file_path)
65
+ images = converter.convert_to_images(pdf_path)
69
66
 
70
- # Optional: attach text fallback
71
- if text_content:
72
- images[0].extracted_text = text_content # for later use by extractor
73
67
 
74
68
  return images
75
69
 
@@ -115,7 +109,7 @@ class MarkdownAndChunkDocuments:
115
109
  self.api_key = os.getenv("OPENAI_API_KEY")
116
110
  self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
117
111
 
118
- def markdown_and_chunk_documents(self, file_path: str):
112
+ def markdown_and_chunk_documents(self, file_path: str,include_image:bool):
119
113
  # Pick strategy
120
114
  strategy = StrategyFactory.get_strategy(file_path)
121
115
  if not strategy:
@@ -125,7 +119,7 @@ class MarkdownAndChunkDocuments:
125
119
  images = strategy.process(file_path)
126
120
 
127
121
  # Extract Markdown from images
128
- markdown_output, text_content = self.extractor.extract_markdown(images, include_image=False)
122
+ markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
129
123
  binary_text_content = text_content.encode("utf-8")
130
124
 
131
125
  # Chunking and mapping
@@ -34,7 +34,7 @@ def test_load_file_and_upsert_chunks_to_vdb(temp_json_file):
34
34
  def test_markdown(temp_json_file):
35
35
  markdown_and_chunk_documents = MarkdownAndChunkDocuments()
36
36
  mapped_chunks = markdown_and_chunk_documents.markdown_and_chunk_documents(
37
- "content.docx")
37
+ "content.docx",include_image=True)
38
38
  print(mapped_chunks)
39
39
  for i, c in enumerate(mapped_chunks):
40
40
  print(f"Chunk {i + 1}: {c}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.27
3
+ Version: 0.1.29
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -40,7 +40,7 @@ Requires-Dist: requests~=2.32.5
40
40
  Requires-Dist: langchain-core~=0.3.78
41
41
  Requires-Dist: pdf2image~=1.17.0
42
42
  Requires-Dist: docx2pdf~=0.1.8
43
- Requires-Dist: numpy~=2.2.6
43
+ Requires-Dist: numpy~=2.0.0
44
44
  Requires-Dist: scikit-learn~=1.7.2
45
45
  Requires-Dist: PyMuPDF~=1.22.5
46
46
  Requires-Dist: pypandoc~=1.13
@@ -26,7 +26,7 @@ requests~=2.32.5
26
26
  langchain-core~=0.3.78
27
27
  pdf2image~=1.17.0
28
28
  docx2pdf~=0.1.8
29
- numpy~=2.2.6
29
+ numpy~=2.0.0
30
30
  scikit-learn~=1.7.2
31
31
  PyMuPDF~=1.22.5
32
32
  pypandoc~=1.13
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "prevectorchunks-core"
7
- version = "0.1.27"
7
+ version = "0.1.29"
8
8
  description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -42,7 +42,7 @@ dependencies = [
42
42
  "langchain-core~=0.3.78",
43
43
  "pdf2image~=1.17.0",
44
44
  "docx2pdf~=0.1.8",
45
- "numpy~=2.2.6",
45
+ "numpy~=2.0.0",
46
46
  "scikit-learn~=1.7.2",
47
47
  "PyMuPDF~=1.22.5",
48
48
  "pypandoc~=1.13",