vecforge 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vecforge/exceptions.py ADDED
@@ -0,0 +1,164 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ VecForge custom exceptions.
12
+
13
+ All exceptions inherit from VecForgeError for easy catching.
14
+ Error messages always tell the user what to do next.
15
+
16
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+
22
class VecForgeError(Exception):
    """Root of the VecForge exception hierarchy.

    Every error raised by VecForge derives from this class, which lets
    callers install a single blanket handler via ``except VecForgeError``.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.
    """
30
+
31
+
32
class VaultEmptyError(VecForgeError):
    """Raised when searching an empty vault.

    Attributes:
        vault_name: Name of the vault that contained no documents.

    Example:
        >>> raise VaultEmptyError("my_vault")
        VaultEmptyError: Vault 'my_vault' contains no documents.
        Add documents with: db.add("your text") or db.ingest("path/")
    """

    def __init__(self, vault_name: str) -> None:
        """Build the user-facing message for the given vault.

        Args:
            vault_name: Name of the vault that was searched.
        """
        # why: Keep the raw argument so handlers can inspect it and so
        # __reduce__ can rebuild the exception from it.
        self.vault_name = vault_name
        super().__init__(
            f"Vault '{vault_name}' contains no documents.\n"
            'Add documents with: db.add("your text") or db.ingest("path/")\n'
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )

    def __reduce__(self) -> tuple[type[VaultEmptyError], tuple[str]]:
        """Support pickle/copy by re-running ``__init__`` with the original argument.

        The default ``Exception`` reduction would pass the already formatted
        message back in as ``vault_name`` and produce a garbled message.
        """
        return (type(self), (self.vault_name,))
47
+
48
+
49
class NamespaceNotFoundError(VecForgeError):
    """Raised when referencing a namespace that does not exist.

    Attributes:
        namespace: The namespace that was requested.
        available: Namespaces that were reported as existing (may be empty).

    Example:
        >>> raise NamespaceNotFoundError("ward_7", ["default", "ward_5"])
        NamespaceNotFoundError: Namespace 'ward_7' does not exist ...
    """

    def __init__(self, namespace: str, available: list[str] | None = None) -> None:
        """Build the user-facing message.

        Args:
            namespace: The namespace that could not be found.
            available: Known namespaces to list in the message; ``None`` or
                an empty list is rendered as ``none``.
        """
        self.namespace = namespace
        # why: Copy so later mutation of the caller's list cannot alter the
        # data carried by this error.
        self.available: list[str] = list(available) if available else []
        available_str = ", ".join(self.available) if self.available else "none"
        super().__init__(
            f"Namespace '{namespace}' does not exist in this vault.\n"
            f"Available namespaces: [{available_str}]\n"
            f"Create it with: db.create_namespace('{namespace}')\n"
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )

    def __reduce__(self) -> tuple[type[NamespaceNotFoundError], tuple[str, list[str]]]:
        """Support pickle/copy by re-running ``__init__`` with the original arguments.

        The default ``Exception`` reduction would pass the formatted message
        back in as ``namespace`` and lose the list of available namespaces.
        """
        return (type(self), (self.namespace, self.available))
65
+
66
+
67
class VecForgePermissionError(VecForgeError):
    """Raised when the current API key lacks required permission.

    Attributes:
        operation: The operation that was denied.
        current_role: The role attached to the key that attempted it.

    Example:
        >>> raise VecForgePermissionError("write", "read-only")
        VecForgePermissionError: Permission denied: 'write' requires ...
    """

    def __init__(self, operation: str, current_role: str) -> None:
        """Build the user-facing message.

        Args:
            operation: Name of the operation that was attempted.
            current_role: Role of the caller's key.
        """
        self.operation = operation
        self.current_role = current_role
        super().__init__(
            f"Permission denied: '{operation}' requires a higher role.\n"
            f"Current role: '{current_role}'.\n"
            "Request an upgraded key from your vault administrator.\n"
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )

    def __reduce__(self) -> tuple[type[VecForgePermissionError], tuple[str, str]]:
        """Support pickle/copy by re-running ``__init__`` with both arguments.

        The default ``Exception`` reduction stores only the formatted message,
        so reconstruction would fail with a missing ``current_role`` argument.
        """
        return (type(self), (self.operation, self.current_role))
82
+
83
+
84
class InvalidAlphaError(VecForgeError):
    """Raised when alpha is outside [0.0, 1.0].

    Attributes:
        alpha: The rejected alpha value.

    Example:
        >>> raise InvalidAlphaError(1.5)
        InvalidAlphaError: alpha must be between 0.0 and 1.0, got 1.5.
    """

    def __init__(self, alpha: float) -> None:
        """Build the user-facing message.

        Args:
            alpha: The out-of-range value that was supplied.
        """
        self.alpha = alpha
        super().__init__(
            f"alpha must be between 0.0 and 1.0, got {alpha}.\n"
            "Use alpha=0.0 for keyword-only, alpha=1.0 for semantic-only, "
            "alpha=0.5 for balanced hybrid search.\n"
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )

    def __reduce__(self) -> tuple[type[InvalidAlphaError], tuple[float]]:
        """Support pickle/copy by re-running ``__init__`` with the original value.

        The default ``Exception`` reduction would feed the formatted message
        back in as ``alpha``.
        """
        return (type(self), (self.alpha,))
99
+
100
+
101
class EncryptionKeyError(VecForgeError):
    """Raised when encryption key is invalid or missing.

    Attributes:
        reason: The reason code (or free-form text) the error was built from.

    Example:
        >>> raise EncryptionKeyError("wrong_key")
        EncryptionKeyError: Failed to decrypt vault with provided key.
    """

    # why: Built once at class creation instead of on every raise.
    _MESSAGES = {
        "wrong_key": (
            "Failed to decrypt vault with provided key.\n"
            "Ensure VECFORGE_KEY environment variable is set correctly.\n"
            "If you've lost the key, the vault data cannot be recovered."
        ),
        "missing": (
            "This vault is encrypted but no encryption_key was provided.\n"
            "Pass encryption_key=os.environ['VECFORGE_KEY'] when opening.\n"
            "Example: VecForge('vault', encryption_key=os.environ['VECFORGE_KEY'])"
        ),
        "sqlcipher_unavailable": (
            "SQLCipher is not installed on this system.\n"
            "Encryption requires the sqlcipher3 package.\n"
            "Install with: pip install sqlcipher3\n"
            "Falling back to unencrypted SQLite storage."
        ),
    }

    def __init__(self, reason: str = "wrong_key") -> None:
        """Build the user-facing message for a reason code.

        Args:
            reason: One of ``"wrong_key"``, ``"missing"`` or
                ``"sqlcipher_unavailable"``. Any other value is reported
                verbatim as ``Encryption error: <reason>``.
        """
        self.reason = reason
        msg = self._MESSAGES.get(reason, f"Encryption error: {reason}")
        super().__init__(
            f"{msg}\n"
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )

    def __reduce__(self) -> tuple[type[EncryptionKeyError], tuple[str]]:
        """Support pickle/copy by re-running ``__init__`` with the reason code.

        The default ``Exception`` reduction would pass the formatted message
        back in as ``reason``, which then falls through to the generic
        ``Encryption error: ...`` branch.
        """
        return (type(self), (self.reason,))
133
+
134
+
135
class DeletionProtectedError(VecForgeError):
    """Raised when attempting to delete from a deletion-protected vault.

    Attributes:
        doc_id: Identifier of the document whose deletion was refused.

    Example:
        >>> raise DeletionProtectedError("doc_123")
        DeletionProtectedError: Cannot delete doc 'doc_123' ...
    """

    def __init__(self, doc_id: str) -> None:
        """Build the user-facing message.

        Args:
            doc_id: Identifier of the document that could not be deleted.
        """
        self.doc_id = doc_id
        super().__init__(
            f"Cannot delete doc '{doc_id}': vault has deletion_protection=True.\n"
            "Disable with: VecForge('vault', deletion_protection=False)\n"
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )

    def __reduce__(self) -> tuple[type[DeletionProtectedError], tuple[str]]:
        """Support pickle/copy by re-running ``__init__`` with the document id.

        The default ``Exception`` reduction would pass the formatted message
        back in as ``doc_id``.
        """
        return (type(self), (self.doc_id,))
149
+
150
+
151
class IngestError(VecForgeError):
    """Raised when document ingestion fails.

    Attributes:
        path: Path of the document that could not be ingested.
        reason: Short explanation of the failure.

    Example:
        >>> raise IngestError("report.xyz", "Unsupported file format")
        IngestError: Failed to ingest 'report.xyz': Unsupported file format.
    """

    def __init__(self, path: str, reason: str) -> None:
        """Build the user-facing message.

        Args:
            path: Path of the offending file.
            reason: Why ingestion failed; a trailing period is appended.
        """
        self.path = path
        self.reason = reason
        super().__init__(
            f"Failed to ingest '{path}': {reason}.\n"
            "Supported formats: .txt, .md, .pdf, .docx, .html\n"
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )

    def __reduce__(self) -> tuple[type[IngestError], tuple[str, str]]:
        """Support pickle/copy by re-running ``__init__`` with both arguments.

        The default ``Exception`` reduction stores only the formatted message,
        so reconstruction would fail with a missing ``reason`` argument.
        """
        return (type(self), (self.path, self.reason))
@@ -0,0 +1,3 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Licensed under BSL 1.1 — see LICENSE for details.
@@ -0,0 +1,181 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ Ingestion dispatcher for VecForge.
12
+
13
+ Auto-detects file format by extension and routes to the appropriate
14
+ parser. Recursively walks directories for batch ingestion.
15
+
16
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import logging
22
+ from dataclasses import dataclass, field
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ from vecforge.exceptions import IngestError
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ # Supported file extensions mapped to handler
31
+ _SUPPORTED_EXTENSIONS: dict[str, str] = {
32
+ ".txt": "text",
33
+ ".md": "text",
34
+ ".pdf": "pdf",
35
+ ".docx": "docx",
36
+ ".html": "html",
37
+ ".htm": "html",
38
+ }
39
+
40
+
41
@dataclass
class IngestChunk:
    """One piece of text pulled out of a document.

    Attributes:
        text: The extracted text of this chunk.
        metadata: Free-form details about where the chunk came from
            (source, page, etc.). Each instance gets its own empty dict
            when none is supplied.
        modality: Kind of content the chunk holds (text, image, audio,
            etc.). Defaults to ``"text"``.
    """

    text: str
    metadata: dict[str, Any] = field(default_factory=dict)
    modality: str = "text"
54
+
55
+
56
class IngestDispatcher:
    """Auto-detecting document ingestion dispatcher.

    Walks directories, detects file formats by extension, and routes each
    file to the matching ``DocumentParser`` method. Returns text chunks
    ready for embedding.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        chunk_size: Maximum characters per chunk. Must be positive.
            Defaults to 1000.
        chunk_overlap: Overlap between chunks in characters. Must not be
            negative. Defaults to 200.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is negative.

    Example:
        >>> dispatcher = IngestDispatcher()
        >>> chunks = dispatcher.ingest("my_documents/")
        >>> len(chunks)
        42
    """

    # why: Maps the handler names used in _SUPPORTED_EXTENSIONS to the
    # DocumentParser method that implements them.
    _PARSER_METHODS = {
        "text": "parse_text_file",
        "pdf": "parse_pdf",
        "docx": "parse_docx",
        "html": "parse_html_file",
    }

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        # safety: A non-positive chunk size can never advance through the
        # text, so reject it here instead of hanging during ingestion.
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size}.\n"
                "Use e.g. IngestDispatcher(chunk_size=1000)."
            )
        # safety: A negative overlap would skip characters between chunks.
        if chunk_overlap < 0:
            raise ValueError(
                f"chunk_overlap must be zero or positive, got {chunk_overlap}.\n"
                "Use e.g. IngestDispatcher(chunk_overlap=200)."
            )
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    def ingest(self, path: str) -> list[IngestChunk]:
        """Ingest a file or directory of files.

        A file path is parsed directly. A directory is walked recursively
        in sorted order; files whose extension is not supported are
        ignored, and files that raise ``IngestError`` are logged and
        skipped.

        Args:
            path: File path or directory path to ingest.

        Returns:
            List of IngestChunk objects ready for embedding.

        Raises:
            IngestError: If ``path`` is a single file with an unsupported
                format.
            FileNotFoundError: If path does not exist.

        Example:
            >>> chunks = dispatcher.ingest("reports/")
            >>> for chunk in chunks:
            ...     print(f"{chunk.metadata['source']}: {chunk.text[:50]}...")
        """
        target = Path(path)

        if not target.exists():
            raise FileNotFoundError(
                f"Path not found: {path}\nVecForge by Suneel Bose K · ArcGX TechLabs"
            )

        if target.is_file():
            return self._ingest_file(target)

        # why: Recursively walk directory; sorted for a stable chunk order
        all_chunks: list[IngestChunk] = []
        for file_path in sorted(target.rglob("*")):
            ext = file_path.suffix.lower()
            if not (file_path.is_file() and ext in _SUPPORTED_EXTENSIONS):
                continue
            try:
                all_chunks.extend(self._ingest_file(file_path))
            except IngestError as e:
                # why: One bad file must not abort a batch ingestion
                logger.warning("Skipping %s: %s", file_path, e)

        logger.info("Ingested %d chunks from %s", len(all_chunks), path)
        return all_chunks

    def _ingest_file(self, file_path: Path) -> list[IngestChunk]:
        """Ingest a single file.

        Args:
            file_path: Path to the file.

        Returns:
            List of IngestChunk from this file.

        Raises:
            IngestError: If the extension is unsupported or no parser
                method is registered for its handler.
        """
        ext = file_path.suffix.lower()
        handler = _SUPPORTED_EXTENSIONS.get(ext)

        if handler is None:
            raise IngestError(
                str(file_path),
                f"Unsupported file extension '{ext}'",
            )

        method_name = self._PARSER_METHODS.get(handler)
        if method_name is None:
            raise IngestError(str(file_path), f"No handler for '{handler}'")

        # why: Import document parser lazily to avoid heavy deps at import time
        from vecforge.ingest.document import DocumentParser

        parser = DocumentParser(
            chunk_size=self._chunk_size,
            chunk_overlap=self._chunk_overlap,
        )
        return getattr(parser, method_name)(file_path)

    @staticmethod
    def supported_extensions() -> list[str]:
        """Return list of supported file extensions.

        Returns:
            Sorted list of supported extensions.
        """
        return sorted(_SUPPORTED_EXTENSIONS)
@@ -0,0 +1,237 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
4
+ #
5
+ # Licensed under the Business Source License 1.1 (BSL 1.1)
6
+ # Free for personal, research, open-source, and non-commercial use.
7
+ # Commercial use requires a separate license from ArcGX TechLabs.
8
+ # See LICENSE file in the project root or contact: suneelbose@arcgx.in
9
+
10
+ """
11
+ Document parser for VecForge.
12
+
13
+ Handles text extraction and chunking for common document formats:
14
+ PDF (via PyMuPDF), DOCX (via python-docx), HTML (via BeautifulSoup),
15
+ and plain text/markdown.
16
+
17
+ Built by Suneel Bose K · ArcGX TechLabs Private Limited.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import logging
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ from vecforge.ingest.dispatcher import IngestChunk
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class DocumentParser:
    """Multi-format document text extractor with chunking.

    Extracts raw text from supported formats and splits into
    overlapping chunks suitable for embedding.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        chunk_size: Maximum characters per chunk. Must be positive.
            Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks. Must not be
            negative. Defaults to 200.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is negative.

    Example:
        >>> parser = DocumentParser(chunk_size=500, chunk_overlap=100)
        >>> chunks = parser.parse_text_file(Path("report.txt"))
        >>> len(chunks)
        15
    """

    # why: Tried in order, strongest boundary first — paragraph break,
    # line break, then sentence endings.
    _SEPARATORS = ("\n\n", "\n", ". ", "! ", "? ")

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        # safety: A non-positive chunk size can never advance through the
        # text and would make chunking loop forever.
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {chunk_size}.\n"
                "Use e.g. DocumentParser(chunk_size=1000)."
            )
        # safety: A negative overlap would skip characters between chunks.
        if chunk_overlap < 0:
            raise ValueError(
                f"chunk_overlap must be zero or positive, got {chunk_overlap}.\n"
                "Use e.g. DocumentParser(chunk_overlap=200)."
            )
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    def _chunk_spans(self, text: str) -> list[tuple[int, int]]:
        """Compute the ``(start, end)`` character spans of each chunk.

        Each span covers at most ``chunk_size`` characters. A span that
        does not reach the end of the text is shortened to end just after
        the last separator found inside it, when there is one. The next
        span starts ``chunk_overlap`` characters before the previous end,
        unless that would not move forward, in which case it starts at
        the previous end.

        Args:
            text: Raw text to split.

        Returns:
            Spans in ascending order; ``end`` never exceeds ``len(text)``
            and the final span ends exactly at ``len(text)``. Empty for
            empty text.
        """
        total = len(text)
        spans: list[tuple[int, int]] = []
        start = 0

        while start < total:
            end = min(start + self._chunk_size, total)

            # why: Try to break at sentence/paragraph boundary
            if end < total:
                for sep in self._SEPARATORS:
                    cut = text.rfind(sep, start, end)
                    if cut > start:
                        end = cut + len(sep)
                        break

            spans.append((start, end))

            # why: Once the end of the text is covered, stepping back by the
            # overlap would only re-emit the tail of the chunk just produced.
            if end >= total:
                break

            # why: Move forward by chunk_size - overlap for continuity
            next_start = end - self._chunk_overlap
            # safety: Always advance past the current start to prevent an
            # infinite loop when the overlap is as large as the chunk.
            start = next_start if next_start > start else end

        return spans

    def _chunk_text(
        self,
        text: str,
        source: str,
        extra_metadata: dict[str, Any] | None = None,
    ) -> list[IngestChunk]:
        """Split text into overlapping chunks.

        Args:
            text: Raw text to chunk.
            source: Source file path for metadata.
            extra_metadata: Additional metadata to add to each chunk; its
                keys override the generated ones on collision.

        Returns:
            List of IngestChunk with stripped text and metadata holding
            ``source``, ``chunk_index``, ``char_start`` and ``char_end``.
            Spans that contain only whitespace are dropped. Empty list for
            blank text.
        """
        if not text.strip():
            return []

        chunks: list[IngestChunk] = []
        for start, end in self._chunk_spans(text):
            chunk_text = text[start:end].strip()
            if not chunk_text:
                continue

            meta: dict[str, Any] = {
                "source": source,
                # why: Index counts emitted chunks only, so it stays dense
                "chunk_index": len(chunks),
                "char_start": start,
                "char_end": end,
            }
            if extra_metadata:
                meta.update(extra_metadata)

            chunks.append(IngestChunk(text=chunk_text, metadata=meta))

        logger.debug("Chunked %s into %d chunks", source, len(chunks))
        return chunks

    def parse_text_file(self, path: Path) -> list[IngestChunk]:
        """Parse a plain text or markdown file.

        The file is decoded as UTF-8; undecodable bytes are replaced
        rather than raising.

        Args:
            path: Path to .txt or .md file.

        Returns:
            List of IngestChunk from the file.
        """
        text = path.read_text(encoding="utf-8", errors="replace")
        return self._chunk_text(text, source=str(path))

    def parse_pdf(self, path: Path) -> list[IngestChunk]:
        """Parse a PDF file using PyMuPDF (fitz).

        Each page is chunked on its own, so ``chunk_index`` and the
        character offsets restart on every page; the 1-based page number
        is stored under the ``page`` metadata key.

        Args:
            path: Path to .pdf file.

        Returns:
            List of IngestChunk with page metadata.

        Raises:
            ImportError: If PyMuPDF is not installed.
        """
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF (fitz) is required for PDF ingestion.\n"
                "Install with: pip install pymupdf\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        all_chunks: list[IngestChunk] = []

        with fitz.open(str(path)) as doc:
            for page_num, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    chunks = self._chunk_text(
                        text,
                        source=str(path),
                        extra_metadata={"page": page_num + 1},
                    )
                    all_chunks.extend(chunks)

        logger.info("Parsed PDF %s: %d chunks", path.name, len(all_chunks))
        return all_chunks

    def parse_docx(self, path: Path) -> list[IngestChunk]:
        """Parse a DOCX file using python-docx.

        Non-blank paragraphs are joined with blank lines before chunking.

        Args:
            path: Path to .docx file.

        Returns:
            List of IngestChunk from the document.

        Raises:
            ImportError: If python-docx is not installed.
        """
        try:
            import docx
        except ImportError as e:
            raise ImportError(
                "python-docx is required for DOCX ingestion.\n"
                "Install with: pip install python-docx\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        doc = docx.Document(str(path))
        full_text = "\n\n".join(
            para.text for para in doc.paragraphs if para.text.strip()
        )

        chunks = self._chunk_text(full_text, source=str(path))
        logger.info("Parsed DOCX %s: %d chunks", path.name, len(chunks))
        return chunks

    def parse_html_file(self, path: Path) -> list[IngestChunk]:
        """Parse an HTML file using BeautifulSoup.

        The file is decoded as UTF-8 with undecodable bytes replaced.

        Args:
            path: Path to .html or .htm file.

        Returns:
            List of IngestChunk from the HTML content.

        Raises:
            ImportError: If beautifulsoup4 is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 is required for HTML ingestion.\n"
                "Install with: pip install beautifulsoup4\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        html = path.read_text(encoding="utf-8", errors="replace")
        soup = BeautifulSoup(html, "html.parser")

        # why: Drop script, style, nav, footer and header elements before
        # extracting text
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        text = soup.get_text(separator="\n", strip=True)
        chunks = self._chunk_text(text, source=str(path))
        logger.info("Parsed HTML %s: %d chunks", path.name, len(chunks))
        return chunks
@@ -0,0 +1,3 @@
1
+ # VecForge — Universal Local-First Vector Database
2
+ # Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
3
+ # Licensed under BSL 1.1 — see LICENSE for details.