prevectorchunks-core 0.1.33__tar.gz → 0.1.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {prevectorchunks_core-0.1.33/prevectorchunks_core.egg-info → prevectorchunks_core-0.1.34}/PKG-INFO +1 -1
  2. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/DocuToImageConverter.py +57 -15
  3. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/markdown_and_chunk_documents.py +66 -19
  4. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34/prevectorchunks_core.egg-info}/PKG-INFO +1 -1
  5. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/SOURCES.txt +34 -0
  6. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/pyproject.toml +5 -1
  7. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/LICENCE +0 -0
  8. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/LICENSE +0 -0
  9. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/README.md +0 -0
  10. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/__init__.py +0 -0
  11. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/config/__init__.py +0 -0
  12. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/config/splitter_config.py +0 -0
  13. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/migrations/__init__.py +0 -0
  14. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/os-llm/__init__.py +0 -0
  15. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/os-llm/llava.py +0 -0
  16. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/__init__.py +0 -0
  17. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/env.py +0 -0
  18. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/inference.py +0 -0
  19. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/model.py +0 -0
  20. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/pretrained/__init__.py +0 -0
  21. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/pretrained/model_info.txt +0 -0
  22. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/pretrained/policy_model.pt +0 -0
  23. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/reward.py +0 -0
  24. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/savepretrained.py +0 -0
  25. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/testpretrained.py +0 -0
  26. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/rlchunker/utils.py +0 -0
  27. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/DocuToMarkdownExtractor.py +0 -0
  28. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/__init__.py +0 -0
  29. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/audio_processor.py +0 -0
  30. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/chunk_documents_crud_vdb.py +0 -0
  31. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/chunk_to_all_content_mapper.py +0 -0
  32. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/image_processor.py +0 -0
  33. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/propositional_index.py +0 -0
  34. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/services/video_analyser.py +0 -0
  35. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/test_loader.py +0 -0
  36. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/tests/__init__.py +0 -0
  37. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/tests/test_local.py +0 -0
  38. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/__init__.py +0 -0
  39. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/extract_content.py +0 -0
  40. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/file_loader.py +0 -0
  41. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core/utils/llm_wrapper.py +0 -0
  42. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/dependency_links.txt +0 -0
  43. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/requires.txt +0 -0
  44. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/prevectorchunks_core.egg-info/top_level.txt +0 -0
  45. {prevectorchunks_core-0.1.33 → prevectorchunks_core-0.1.34}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.33
3
+ Version: 0.1.34
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -28,18 +28,54 @@ class DocuToImageConverter:
28
28
  """Write bytes to a temporary file and return path."""
29
29
  tmp_fd, tmp_path = tempfile.mkstemp(suffix=suffix)
30
30
  with os.fdopen(tmp_fd, "wb") as f:
31
- f.write(input_bytes)
31
+ f.write(input_bytes.read())
32
32
  return tmp_path
33
33
 
34
- def _convert_doc_to_pdf(self, input_path: str) -> str:
35
- """Convert DOC/DOCX file to PDF using Word COM, LibreOffice, Pandoc, or fallback."""
34
+ def _convert_doc_to_pdf(self, file_path: str = None, input_bytes=None) -> str:
35
+ """
36
+ Convert DOC/DOCX to PDF. Supports:
37
+ - file_path (string)
38
+ - input_bytes (bytes, InMemoryUploadedFile, or file-like)
39
+ """
40
+
41
+ # ✅ If bytes are provided, write them to a temporary .docx
42
+ if input_bytes is not None:
43
+ # Get filename or fallback
44
+ original_name = getattr(input_bytes, "name", "uploaded.docx")
45
+ ext = os.path.splitext(original_name)[1] or ".docx"
46
+
47
+ # Create a temporary file path
48
+ temp_input_path = tempfile.mktemp(suffix=ext)
49
+
50
+ # Read bytes safely
51
+ if hasattr(input_bytes, "read"): # Django UploadedFile
52
+ input_bytes.seek(0)
53
+ content = input_bytes.read()
54
+ else: # already bytes
55
+ content = input_bytes
56
+
57
+ # Write bytes to temp file
58
+ with open(temp_input_path, "wb") as f:
59
+ f.write(content)
60
+
61
+ input_path = temp_input_path
62
+
63
+ # ✅ If file_path is provided, use it directly
64
+ elif file_path:
65
+ input_path = file_path
66
+
67
+ else:
68
+ raise ValueError("Must supply either file_path or input_bytes")
69
+
70
+ # ✅ Must exist at this point
36
71
  if not os.path.exists(input_path):
37
72
  raise FileNotFoundError(input_path)
38
73
 
74
+ # ✅ Prepare output PDF path
39
75
  output_dir = tempfile.mkdtemp()
40
76
  output_pdf = os.path.join(output_dir, Path(input_path).stem + ".pdf")
41
77
 
42
- # 1️⃣ Microsoft Word COM automation (Windows only)
78
+ # 1️⃣ Try Microsoft Word COM automation (Windows)
43
79
  try:
44
80
  import win32com.client
45
81
  word = win32com.client.Dispatch("Word.Application")
@@ -52,7 +88,7 @@ class DocuToImageConverter:
52
88
  except Exception:
53
89
  pass
54
90
 
55
- # 2️⃣ LibreOffice fallback
91
+ # 2️⃣ Try LibreOffice
56
92
  try:
57
93
  subprocess.run(
58
94
  ["soffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, input_path],
@@ -65,23 +101,28 @@ class DocuToImageConverter:
65
101
  # 3️⃣ Pandoc fallback
66
102
  try:
67
103
  pdf_engine = "pdflatex" if shutil.which("pdflatex") else "wkhtmltopdf"
68
- pypandoc.convert_file(input_path, "pdf", outputfile=output_pdf,
69
- extra_args=["--standalone", f"--pdf-engine={pdf_engine}"])
104
+ pypandoc.convert_file(
105
+ input_path, "pdf",
106
+ outputfile=output_pdf,
107
+ extra_args=["--standalone", f"--pdf-engine={pdf_engine}"]
108
+ )
70
109
  return output_pdf
71
110
  except Exception:
72
111
  pass
73
112
 
74
- # 4️⃣ Last resort: ReportLab plain text
113
+ # 4️⃣ Final fallback: Render plain text using ReportLab
75
114
  doc = Document(input_path)
76
115
  c = canvas.Canvas(output_pdf, pagesize=A4)
77
116
  width, height = A4
78
117
  y = height - 50
118
+
79
119
  for p in doc.paragraphs:
80
120
  c.drawString(50, y, p.text[:1000])
81
121
  y -= 15
82
122
  if y < 50:
83
123
  c.showPage()
84
124
  y = height - 50
125
+
85
126
  c.save()
86
127
  return output_pdf
87
128
 
@@ -96,7 +137,7 @@ class DocuToImageConverter:
96
137
  pdf_document.close()
97
138
  return images
98
139
 
99
- def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG"):
140
+ def convert_to_images(self, file_path: str = None, input_bytes: bytes = None, dpi: int = 200, output_format: str = "PNG",ext:str=None):
100
141
  """
101
142
  Convert a file path or binary content to PIL images.
102
143
  Supports PDF, DOC, DOCX, and image files.
@@ -107,14 +148,15 @@ class DocuToImageConverter:
107
148
  # Determine extension
108
149
  if file_path:
109
150
  ext = os.path.splitext(file_path)[1].lower()
151
+ print('work')
110
152
  elif input_bytes:
111
153
  # Attempt to infer from first few bytes (simple)
112
- if input_bytes[:4] == b"%PDF":
113
- ext = ".pdf"
114
- elif input_bytes[:2] == b"PK":
115
- ext = ".docx"
116
- else:
117
- ext = ".img" # Treat as generic image
154
+ # if input_bytes[:4] == b"%PDF":
155
+ # ext = ".pdf"
156
+ # elif input_bytes[:2] == b"PK":
157
+ # ext = ".docx"
158
+ # else:
159
+ # ext = ".img" # Treat as generic image
118
160
 
119
161
  # Write to temp file if doc/pdf
120
162
  if ext in [".pdf", ".doc", ".docx"]:
@@ -1,6 +1,8 @@
1
1
  import os
2
2
  import json
3
3
  import tempfile
4
+ import uuid
5
+ from io import BytesIO
4
6
  from pathlib import Path
5
7
 
6
8
  from docx import Document
@@ -17,21 +19,29 @@ from ..utils.file_loader import SplitType
17
19
 
18
20
  load_dotenv(override=True)
19
21
 
22
+ def get_file_extension(file_path,file_name):
23
+ ext=''
24
+ if file_name:
25
+ ext = file_name[1]
26
+ else:
27
+ # Extract extension
28
+ ext = os.path.splitext(file_path)[1].lower()
29
+ return ext
30
+
20
31
  # -----------------------------
21
32
  # Abstract Strategy Interface
22
33
  # -----------------------------
23
34
  class BaseDocumentStrategy:
24
35
  """Defines a standard interface for all document processing strategies."""
25
36
 
26
- def process(self, file_path: str, input_bytes: bytes = None):
37
+ def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
27
38
  raise NotImplementedError("process() must be implemented by subclasses")
28
39
 
29
-
30
40
  # -----------------------------
31
41
  # PDF Strategy
32
42
  # -----------------------------
33
43
  class PDFStrategy(BaseDocumentStrategy):
34
- def process(self, file_path: str, input_bytes: bytes = None):
44
+ def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
35
45
  print(f"📄 Using PDFStrategy for {file_path}")
36
46
  converter = DocuToImageConverter()
37
47
  # Example: detect multi-column layout or extract embedded text first
@@ -44,7 +54,7 @@ class PDFStrategy(BaseDocumentStrategy):
44
54
  # if text_ratio > 0.0001:
45
55
  # print("📚 PDF appears text-based – using hybrid extract + image backup")
46
56
 
47
- images = converter.convert_to_images(file_path)
57
+ images = converter.convert_to_images(file_path,input_bytes,ext=ext)
48
58
  return images
49
59
 
50
60
 
@@ -52,16 +62,18 @@ class PDFStrategy(BaseDocumentStrategy):
52
62
  # Word Strategy
53
63
  # -----------------------------
54
64
  class WordStrategy(BaseDocumentStrategy):
55
- def process(self, file_path: str, input_bytes: bytes = None):
56
- file_path = Path(file_path)
57
-
58
- print(f"📝 Using WordStrategy for {file_path}")
59
-
65
+ def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
66
+ file_name=''
67
+ if file_path:
68
+ file_name = Path(file_path)
69
+ print(f"📝 Using WordStrategy for {file_path}")
70
+ else:
71
+ file_name_no_ext = os.path.splitext(input_bytes.name)[0]
60
72
  with tempfile.TemporaryDirectory() as tmpdir:
61
- pdf_path = Path(tmpdir) / f"{file_path.stem}.pdf"
73
+ pdf_path = Path(tmpdir) / f"{file_name}.pdf"
62
74
 
63
75
  converter = DocuToImageConverter()
64
- pdf_path = converter._convert_doc_to_pdf(file_path)
76
+ pdf_path = converter._convert_doc_to_pdf(file_path=file_path, input_bytes=input_bytes)
65
77
  images = converter.convert_to_images(pdf_path)
66
78
 
67
79
 
@@ -72,9 +84,31 @@ class WordStrategy(BaseDocumentStrategy):
72
84
  # Image Strategy
73
85
  # -----------------------------
74
86
  class ImageStrategy(BaseDocumentStrategy):
75
- def process(self, file_path: str, input_bytes: bytes = None):
87
+ def process(self, file_path: str, input_bytes: bytes = None,ext:str=None):
76
88
  print(f"🖼️ Using ImageStrategy for {file_path}")
77
- image = Image.open(file_path).convert("RGB")
89
+ if file_path:
90
+ # Path-based loading
91
+ image = Image.open(file_path).convert("RGB")
92
+
93
+ else:
94
+ # Byte-based loading
95
+ if input_bytes is None:
96
+ raise ValueError("Either file_path or input_bytes must be provided")
97
+
98
+ # If it's a Django UploadedFile → read() needed
99
+ if hasattr(input_bytes, "read"):
100
+ input_bytes.seek(0)
101
+ image_bytes = input_bytes.read()
102
+
103
+ # If it's already bytes
104
+ elif isinstance(input_bytes, (bytes, bytearray)):
105
+ image_bytes = input_bytes
106
+
107
+ else:
108
+ raise TypeError("input_bytes must be bytes or file-like object")
109
+
110
+ image = Image.open(BytesIO(image_bytes)).convert("RGB")
111
+
78
112
  return [image]
79
113
 
80
114
 
@@ -96,8 +130,14 @@ class StrategyFactory:
96
130
  }
97
131
 
98
132
  @classmethod
99
- def get_strategy(cls, file_path: str) -> BaseDocumentStrategy:
100
- ext = os.path.splitext(file_path)[1].lower()
133
+ def get_strategy(cls, file_path: str,file_name:str=None) -> BaseDocumentStrategy:
134
+ if file_name:
135
+ ext=file_name[1]
136
+ else:
137
+ # Extract extension
138
+
139
+ ext = os.path.splitext(file_path)[1].lower()
140
+
101
141
  return cls.strategies.get(ext, None)
102
142
 
103
143
 
@@ -109,14 +149,15 @@ class MarkdownAndChunkDocuments:
109
149
  self.api_key = os.getenv("OPENAI_API_KEY")
110
150
  self.extractor = DocuToMarkdownExtractor(api_key=self.api_key)
111
151
 
112
- def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None):
152
+ def markdown_and_chunk_documents(self, file_path: str, input_bytes: bytes = None, include_image:bool=None,file_name:str=None):
113
153
  # Pick strategy
114
- strategy = StrategyFactory.get_strategy(file_path)
154
+ strategy = StrategyFactory.get_strategy(file_path,file_name)
115
155
  if not strategy:
116
156
  raise ValueError(f"Unsupported file type: {file_path}")
117
157
 
118
158
  # Convert to images using correct strategy
119
- images = strategy.process(file_path, input_bytes)
159
+ ext=get_file_extension(file_path,file_name)
160
+ images = strategy.process(file_path, input_bytes,ext)
120
161
 
121
162
  # Extract Markdown from images
122
163
  markdown_output, text_content = self.extractor.extract_markdown(images, include_image=include_image)
@@ -146,10 +187,16 @@ class MarkdownAndChunkDocuments:
146
187
  if not any(md_item.get("markdown_text") == m.get("markdown_text") for m in mapped_chunks):
147
188
  md_item["chunked_text"] = md_item["markdown_text"]
148
189
  mapped_chunks.append(md_item)
149
-
190
+ adduuid(mapped_chunks)
150
191
  print("✅ Processing complete.")
151
192
  return mapped_chunks
152
193
 
194
+ def adduuid(mapped_chunks):
195
+ # Assuming mapped_chunks is a list of dictionaries
196
+
197
+ for chunk in mapped_chunks:
198
+ chunk['id'] = str(uuid.uuid4())
199
+
153
200
 
154
201
  # -----------------------------
155
202
  # CLI Entry
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: prevectorchunks-core
3
- Version: 0.1.33
3
+ Version: 0.1.34
4
4
  Summary: A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database
5
5
  Author-email: Zul Al-Kabir <zul.developer.2023@gmail.com>
6
6
  License: MIT License
@@ -2,6 +2,40 @@ LICENCE
2
2
  LICENSE
3
3
  README.md
4
4
  pyproject.toml
5
+ ./prevectorchunks_core/__init__.py
6
+ ./prevectorchunks_core/test_loader.py
7
+ ./prevectorchunks_core/config/__init__.py
8
+ ./prevectorchunks_core/config/splitter_config.py
9
+ ./prevectorchunks_core/migrations/__init__.py
10
+ ./prevectorchunks_core/os-llm/__init__.py
11
+ ./prevectorchunks_core/os-llm/llava.py
12
+ ./prevectorchunks_core/rlchunker/__init__.py
13
+ ./prevectorchunks_core/rlchunker/env.py
14
+ ./prevectorchunks_core/rlchunker/inference.py
15
+ ./prevectorchunks_core/rlchunker/model.py
16
+ ./prevectorchunks_core/rlchunker/reward.py
17
+ ./prevectorchunks_core/rlchunker/savepretrained.py
18
+ ./prevectorchunks_core/rlchunker/testpretrained.py
19
+ ./prevectorchunks_core/rlchunker/utils.py
20
+ ./prevectorchunks_core/rlchunker/pretrained/__init__.py
21
+ ./prevectorchunks_core/rlchunker/pretrained/model_info.txt
22
+ ./prevectorchunks_core/rlchunker/pretrained/policy_model.pt
23
+ ./prevectorchunks_core/services/DocuToImageConverter.py
24
+ ./prevectorchunks_core/services/DocuToMarkdownExtractor.py
25
+ ./prevectorchunks_core/services/__init__.py
26
+ ./prevectorchunks_core/services/audio_processor.py
27
+ ./prevectorchunks_core/services/chunk_documents_crud_vdb.py
28
+ ./prevectorchunks_core/services/chunk_to_all_content_mapper.py
29
+ ./prevectorchunks_core/services/image_processor.py
30
+ ./prevectorchunks_core/services/markdown_and_chunk_documents.py
31
+ ./prevectorchunks_core/services/propositional_index.py
32
+ ./prevectorchunks_core/services/video_analyser.py
33
+ ./prevectorchunks_core/tests/__init__.py
34
+ ./prevectorchunks_core/tests/test_local.py
35
+ ./prevectorchunks_core/utils/__init__.py
36
+ ./prevectorchunks_core/utils/extract_content.py
37
+ ./prevectorchunks_core/utils/file_loader.py
38
+ ./prevectorchunks_core/utils/llm_wrapper.py
5
39
  prevectorchunks_core/__init__.py
6
40
  prevectorchunks_core/test_loader.py
7
41
  prevectorchunks_core.egg-info/PKG-INFO
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "prevectorchunks-core"
7
- version = "0.1.33"
7
+ version = "0.1.34"
8
8
  description = "A Python module that allows conversion of a document into chunks to be inserted into Pinecone vector database"
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -54,13 +54,17 @@ dependencies = [
54
54
  "tensorflow~=2.12.0", # <-- Add this
55
55
  ]
56
56
 
57
+
58
+
57
59
  [tool.setuptools.packages.find]
60
+ where = ["."]
58
61
  include = ["prevectorchunks_core*"]
59
62
 
60
63
  [tool.setuptools.package-data]
61
64
  "prevectorchunks_core.rlchunker.pretrained" = ["*.pt", "*.txt"]
62
65
 
63
66
  [tool.setuptools]
67
+ package-dir = {"" = "."}
64
68
  include-package-data = true
65
69
 
66
70
  [project.urls]