vecforge 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vecforge/__init__.py +59 -0
- vecforge/cli/__init__.py +3 -0
- vecforge/cli/main.py +197 -0
- vecforge/core/__init__.py +3 -0
- vecforge/core/bm25.py +187 -0
- vecforge/core/embedder.py +152 -0
- vecforge/core/indexer.py +196 -0
- vecforge/core/reranker.py +120 -0
- vecforge/core/storage.py +493 -0
- vecforge/core/vault.py +760 -0
- vecforge/exceptions.py +164 -0
- vecforge/ingest/__init__.py +3 -0
- vecforge/ingest/dispatcher.py +181 -0
- vecforge/ingest/document.py +237 -0
- vecforge/search/__init__.py +3 -0
- vecforge/search/cascade.py +186 -0
- vecforge/search/filters.py +146 -0
- vecforge/search/hybrid.py +146 -0
- vecforge/security/__init__.py +3 -0
- vecforge/security/audit.py +169 -0
- vecforge/security/encryption.py +84 -0
- vecforge/security/namespaces.py +127 -0
- vecforge/security/rbac.py +172 -0
- vecforge/security/snapshots.py +135 -0
- vecforge/server/__init__.py +3 -0
- vecforge/server/app.py +54 -0
- vecforge/server/routes.py +215 -0
- vecforge-0.2.0.dist-info/METADATA +302 -0
- vecforge-0.2.0.dist-info/RECORD +34 -0
- vecforge-0.2.0.dist-info/WHEEL +5 -0
- vecforge-0.2.0.dist-info/entry_points.txt +2 -0
- vecforge-0.2.0.dist-info/licenses/LICENSE +45 -0
- vecforge-0.2.0.dist-info/licenses/NOTICE +14 -0
- vecforge-0.2.0.dist-info/top_level.txt +1 -0
vecforge/exceptions.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
VecForge custom exceptions.
|
|
12
|
+
|
|
13
|
+
All exceptions inherit from VecForgeError for easy catching.
|
|
14
|
+
Error messages always tell the user what to do next.
|
|
15
|
+
|
|
16
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class VecForgeError(Exception):
    """Root of the VecForge exception hierarchy.

    Every error raised by VecForge derives from this class, so a single
    ``except VecForgeError`` clause is enough to handle any library
    failure in one place.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.
    """
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class VaultEmptyError(VecForgeError):
    """Raised when a search is attempted against a vault with no documents.

    Example:
        >>> raise VaultEmptyError("my_vault")
        VaultEmptyError: Vault 'my_vault' contains no documents.
        Add documents with: db.add("your text") or db.ingest("path/")
    """

    def __init__(self, vault_name: str) -> None:
        # why: assemble the actionable hint line-by-line; joining with "\n"
        # yields exactly the multi-line message callers display.
        lines = (
            f"Vault '{vault_name}' contains no documents.",
            'Add documents with: db.add("your text") or db.ingest("path/")',
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in",
        )
        super().__init__("\n".join(lines))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class NamespaceNotFoundError(VecForgeError):
    """Raised when referencing a namespace that does not exist.

    Example:
        >>> raise NamespaceNotFoundError("ward_7", ["default", "ward_5"])
        NamespaceNotFoundError: Namespace 'ward_7' does not exist ...
    """

    def __init__(self, namespace: str, available: list[str] | None = None) -> None:
        # why: show "none" rather than an empty bracket pair when the vault
        # has no namespaces (or the caller passed None).
        shown = ", ".join(available) if available else "none"
        lines = (
            f"Namespace '{namespace}' does not exist in this vault.",
            f"Available namespaces: [{shown}]",
            f"Create it with: db.create_namespace('{namespace}')",
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in",
        )
        super().__init__("\n".join(lines))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class VecForgePermissionError(VecForgeError):
    """Raised when the current API key lacks the required permission.

    Example:
        >>> raise VecForgePermissionError("write", "read-only")
        VecForgePermissionError: Permission denied: 'write' requires ...
    """

    def __init__(self, operation: str, current_role: str) -> None:
        # why: message names both the denied operation and the caller's role
        # so the remediation step (key upgrade) is unambiguous.
        lines = (
            f"Permission denied: '{operation}' requires a higher role.",
            f"Current role: '{current_role}'.",
            "Request an upgraded key from your vault administrator.",
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in",
        )
        super().__init__("\n".join(lines))
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class InvalidAlphaError(VecForgeError):
    """Raised when alpha is outside the closed interval [0.0, 1.0].

    Example:
        >>> raise InvalidAlphaError(1.5)
        InvalidAlphaError: alpha must be between 0.0 and 1.0, got 1.5.
    """

    def __init__(self, alpha: float) -> None:
        # why: include the offending value plus the three canonical settings
        # (0.0 keyword-only, 1.0 semantic-only, 0.5 hybrid) as guidance.
        lines = (
            f"alpha must be between 0.0 and 1.0, got {alpha}.",
            "Use alpha=0.0 for keyword-only, alpha=1.0 for semantic-only, "
            "alpha=0.5 for balanced hybrid search.",
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in",
        )
        super().__init__("\n".join(lines))
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class EncryptionKeyError(VecForgeError):
    """Raised when an encryption key is invalid or missing.

    Args:
        reason: One of ``"wrong_key"`` (default), ``"missing"``, or
            ``"sqlcipher_unavailable"``; any other value produces a generic
            ``Encryption error: <reason>`` message.

    Example:
        >>> raise EncryptionKeyError("wrong_key")
        EncryptionKeyError: Failed to decrypt vault with provided key.
    """

    # why: the reason -> message table is immutable, so build it once at
    # class-creation time instead of on every raise (the original rebuilt
    # the dict inside __init__ for each instantiation).
    _MESSAGES: dict[str, str] = {
        "wrong_key": (
            "Failed to decrypt vault with provided key.\n"
            "Ensure VECFORGE_KEY environment variable is set correctly.\n"
            "If you've lost the key, the vault data cannot be recovered."
        ),
        "missing": (
            "This vault is encrypted but no encryption_key was provided.\n"
            "Pass encryption_key=os.environ['VECFORGE_KEY'] when opening.\n"
            "Example: VecForge('vault', encryption_key=os.environ['VECFORGE_KEY'])"
        ),
        "sqlcipher_unavailable": (
            "SQLCipher is not installed on this system.\n"
            "Encryption requires the sqlcipher3 package.\n"
            "Install with: pip install sqlcipher3\n"
            "Falling back to unencrypted SQLite storage."
        ),
    }

    def __init__(self, reason: str = "wrong_key") -> None:
        # why: unknown reason codes still produce a usable message rather
        # than a KeyError.
        msg = self._MESSAGES.get(reason, f"Encryption error: {reason}")
        super().__init__(
            f"{msg}\n"
            f"VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class DeletionProtectedError(VecForgeError):
    """Raised when deleting from a vault that has deletion protection on.

    Example:
        >>> raise DeletionProtectedError("doc_123")
        DeletionProtectedError: Cannot delete doc 'doc_123' ...
    """

    def __init__(self, doc_id: str) -> None:
        # why: the message states both the blocked doc and how to disable
        # the protection flag.
        lines = (
            f"Cannot delete doc '{doc_id}': vault has deletion_protection=True.",
            "Disable with: VecForge('vault', deletion_protection=False)",
            "VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in",
        )
        super().__init__("\n".join(lines))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
class IngestError(VecForgeError):
    """Raised when document ingestion fails.

    Args:
        path: The file path that failed to ingest.
        reason: Human-readable failure reason.

    Example:
        >>> raise IngestError("report.xyz", "Unsupported file format")
        IngestError: Failed to ingest 'report.xyz': Unsupported file format.
    """

    def __init__(self, path: str, reason: str) -> None:
        super().__init__(
            f"Failed to ingest '{path}': {reason}.\n"
            # fix: list ".htm" too — the ingestion dispatcher's extension
            # table accepts .htm files, so the hint was inconsistent.
            f"Supported formats: .txt, .md, .pdf, .docx, .html, .htm\n"
            f"VecForge by Suneel Bose K · ArcGX TechLabs — docs: vecforge.arcgx.in"
        )
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Ingestion dispatcher for VecForge.
|
|
12
|
+
|
|
13
|
+
Auto-detects file format by extension and routes to the appropriate
|
|
14
|
+
parser. Recursively walks directories for batch ingestion.
|
|
15
|
+
|
|
16
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
from vecforge.exceptions import IngestError
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
# Maps each supported file extension (lowercased, leading dot included) to
# the handler key that IngestDispatcher._ingest_file uses for dispatch.
# Note: ".txt"/".md" share the plain-text handler and ".html"/".htm" share
# the HTML handler.
_SUPPORTED_EXTENSIONS: dict[str, str] = {
    ".txt": "text",
    ".md": "text",
    ".pdf": "pdf",
    ".docx": "docx",
    ".html": "html",
    ".htm": "html",
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class IngestChunk:
    """One embedding-ready piece of text extracted from a document.

    Attributes:
        text: The extracted text content of this chunk.
        metadata: Per-chunk metadata such as source path or page number;
            each instance gets its own fresh dict.
        modality: Content modality label; defaults to ``"text"``.
    """

    # The chunk's raw text payload.
    text: str
    # why: default_factory ensures instances never share one mutable dict.
    metadata: dict[str, Any] = field(default_factory=dict)
    # Modality tag (e.g. "text"); reserved for future non-text content.
    modality: str = "text"
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class IngestDispatcher:
    """Auto-detecting document ingestion dispatcher.

    Walks directories, detects file formats by extension, and routes each
    file to the matching parser. Returns text chunks ready for embedding.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        chunk_size: Maximum characters per chunk. Defaults to 1000.
        chunk_overlap: Overlap between chunks in characters. Defaults to 200.

    Performance:
        Time: O(F * S) where F = files, S = avg file size

    Example:
        >>> dispatcher = IngestDispatcher()
        >>> chunks = dispatcher.ingest("my_documents/")
        >>> len(chunks)
        42
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    def ingest(self, path: str) -> list[IngestChunk]:
        """Ingest a file or directory of files.

        Args:
            path: File path or directory path to ingest.

        Returns:
            List of IngestChunk objects ready for embedding.

        Raises:
            IngestError: If file format is not supported.
            FileNotFoundError: If path does not exist.

        Performance:
            Time: O(F * S)

        Example:
            >>> chunks = dispatcher.ingest("reports/")
            >>> for chunk in chunks:
            ...     print(f"{chunk.metadata['source']}: {chunk.text[:50]}...")
        """
        root = Path(path)

        if not root.exists():
            raise FileNotFoundError(
                f"Path not found: {path}\nVecForge by Suneel Bose K · ArcGX TechLabs"
            )

        # Single file: parse it directly (unsupported extension raises).
        if root.is_file():
            return self._ingest_file(root)

        # why: recurse through the directory in a deterministic (sorted)
        # order, parsing only files with a supported extension.
        collected: list[IngestChunk] = []
        for candidate in sorted(root.rglob("*")):
            if not candidate.is_file():
                continue
            if candidate.suffix.lower() not in _SUPPORTED_EXTENSIONS:
                continue
            try:
                collected.extend(self._ingest_file(candidate))
            except IngestError as exc:
                # why: one bad file should not abort the whole batch
                logger.warning("Skipping %s: %s", candidate, exc)

        logger.info("Ingested %d chunks from %s", len(collected), path)
        return collected

    def _ingest_file(self, file_path: Path) -> list[IngestChunk]:
        """Ingest a single file.

        Args:
            file_path: Path to the file.

        Returns:
            List of IngestChunk from this file.

        Raises:
            IngestError: If the extension has no registered handler.

        Performance:
            Time: O(S) where S = file size
        """
        ext = file_path.suffix.lower()
        handler = _SUPPORTED_EXTENSIONS.get(ext)

        if handler is None:
            raise IngestError(
                str(file_path),
                f"Unsupported file extension '{ext}'",
            )

        # why: import the document parser lazily so heavy parsing
        # dependencies are not loaded at module import time
        from vecforge.ingest.document import DocumentParser

        parser = DocumentParser(
            chunk_size=self._chunk_size,
            chunk_overlap=self._chunk_overlap,
        )

        # why: table-driven dispatch instead of an if/elif ladder
        dispatch = {
            "text": parser.parse_text_file,
            "pdf": parser.parse_pdf,
            "docx": parser.parse_docx,
            "html": parser.parse_html_file,
        }
        parse = dispatch.get(handler)
        if parse is None:
            raise IngestError(str(file_path), f"No handler for '{handler}'")
        return parse(file_path)

    @staticmethod
    def supported_extensions() -> list[str]:
        """Return list of supported file extensions.

        Returns:
            Sorted list of supported extensions.

        Performance:
            Time: O(1)
        """
        return sorted(_SUPPORTED_EXTENSIONS)
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
# VecForge — Universal Local-First Vector Database
|
|
2
|
+
# Copyright (c) 2026 Suneel Bose K · ArcGX TechLabs Private Limited
|
|
3
|
+
# Built by Suneel Bose K (Founder & CEO, ArcGX TechLabs)
|
|
4
|
+
#
|
|
5
|
+
# Licensed under the Business Source License 1.1 (BSL 1.1)
|
|
6
|
+
# Free for personal, research, open-source, and non-commercial use.
|
|
7
|
+
# Commercial use requires a separate license from ArcGX TechLabs.
|
|
8
|
+
# See LICENSE file in the project root or contact: suneelbose@arcgx.in
|
|
9
|
+
|
|
10
|
+
"""
|
|
11
|
+
Document parser for VecForge.
|
|
12
|
+
|
|
13
|
+
Handles text extraction and chunking for common document formats:
|
|
14
|
+
PDF (via PyMuPDF), DOCX (via python-docx), HTML (via BeautifulSoup),
|
|
15
|
+
and plain text/markdown.
|
|
16
|
+
|
|
17
|
+
Built by Suneel Bose K · ArcGX TechLabs Private Limited.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
from typing import Any
|
|
25
|
+
|
|
26
|
+
from vecforge.ingest.dispatcher import IngestChunk
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DocumentParser:
    """Multi-format document text extractor with chunking.

    Extracts raw text from supported formats (plain text/markdown, PDF via
    PyMuPDF, DOCX via python-docx, HTML via BeautifulSoup) and splits it
    into overlapping chunks suitable for embedding.

    Built by Suneel Bose K · ArcGX TechLabs Private Limited.

    Args:
        chunk_size: Maximum characters per chunk. Defaults to 1000.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 200.

    Performance:
        Time: O(S) where S = total text size
        Chunking: O(S / chunk_size) chunks produced

    Example:
        >>> parser = DocumentParser(chunk_size=500, chunk_overlap=100)
        >>> chunks = parser.parse_text_file(Path("report.txt"))
        >>> len(chunks)
        15
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ) -> None:
        self._chunk_size = chunk_size
        self._chunk_overlap = chunk_overlap

    def _chunk_text(
        self,
        text: str,
        source: str,
        extra_metadata: dict[str, Any] | None = None,
    ) -> list[IngestChunk]:
        """Split text into overlapping chunks.

        Args:
            text: Raw text to chunk.
            source: Source file path for metadata.
            extra_metadata: Additional metadata to add to each chunk.

        Returns:
            List of IngestChunk with text and metadata.

        Performance:
            Time: O(S / chunk_size)
        """
        if not text.strip():
            return []

        chunks: list[IngestChunk] = []
        start = 0
        chunk_index = 0

        while start < len(text):
            end = start + self._chunk_size

            # why: prefer to cut at a paragraph break, then a line break,
            # then a sentence end, so chunks don't split mid-sentence
            if end < len(text):
                for sep in ["\n\n", "\n", ". ", "! ", "? "]:
                    last_sep = text.rfind(sep, start, end)
                    if last_sep > start:
                        end = last_sep + len(sep)
                        break

            chunk_text = text[start:end].strip()

            if chunk_text:
                meta = {
                    "source": source,
                    "chunk_index": chunk_index,
                    "char_start": start,
                    "char_end": end,
                }
                if extra_metadata:
                    meta.update(extra_metadata)

                chunks.append(IngestChunk(text=chunk_text, metadata=meta))
                chunk_index += 1

            # why: step forward by chunk_size - overlap so consecutive
            # chunks share context
            next_start = end - self._chunk_overlap
            # fix: guarantee forward progress by comparing against this
            # iteration's own start. The previous guard read the last
            # *appended* chunk's metadata["char_start"], which was stale
            # when a whitespace-only chunk was skipped and wrong when
            # extra_metadata overrode the "char_start" key.
            if next_start <= start:
                next_start = end
            start = next_start

        logger.debug("Chunked %s into %d chunks", source, len(chunks))
        return chunks

    def parse_text_file(self, path: Path) -> list[IngestChunk]:
        """Parse a plain text or markdown file.

        Args:
            path: Path to .txt or .md file.

        Returns:
            List of IngestChunk from the file.

        Performance:
            Time: O(S) where S = file size
        """
        # why: errors="replace" keeps ingestion alive on bad byte sequences
        text = path.read_text(encoding="utf-8", errors="replace")
        return self._chunk_text(text, source=str(path))

    def parse_pdf(self, path: Path) -> list[IngestChunk]:
        """Parse a PDF file using PyMuPDF (fitz).

        Args:
            path: Path to .pdf file.

        Returns:
            List of IngestChunk with page metadata (1-based "page" key).

        Raises:
            ImportError: If PyMuPDF is not installed.

        Performance:
            Time: O(P * S) where P = pages, S = avg page text size
        """
        try:
            import fitz  # PyMuPDF
        except ImportError as e:
            raise ImportError(
                "PyMuPDF (fitz) is required for PDF ingestion.\n"
                "Install with: pip install pymupdf\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        all_chunks: list[IngestChunk] = []

        with fitz.open(str(path)) as doc:
            for page_num, page in enumerate(doc):
                text = page.get_text()
                if text.strip():
                    # why: tag each chunk with its 1-based page number
                    chunks = self._chunk_text(
                        text,
                        source=str(path),
                        extra_metadata={"page": page_num + 1},
                    )
                    all_chunks.extend(chunks)

        logger.info("Parsed PDF %s: %d chunks", path.name, len(all_chunks))
        return all_chunks

    def parse_docx(self, path: Path) -> list[IngestChunk]:
        """Parse a DOCX file using python-docx.

        Args:
            path: Path to .docx file.

        Returns:
            List of IngestChunk from the document.

        Raises:
            ImportError: If python-docx is not installed.

        Performance:
            Time: O(P) where P = number of paragraphs
        """
        try:
            import docx
        except ImportError as e:
            raise ImportError(
                "python-docx is required for DOCX ingestion.\n"
                "Install with: pip install python-docx\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        doc = docx.Document(str(path))
        # why: blank-line-join preserves paragraph boundaries for the
        # chunker's separator search
        full_text = "\n\n".join(
            para.text for para in doc.paragraphs if para.text.strip()
        )

        chunks = self._chunk_text(full_text, source=str(path))
        logger.info("Parsed DOCX %s: %d chunks", path.name, len(chunks))
        return chunks

    def parse_html_file(self, path: Path) -> list[IngestChunk]:
        """Parse an HTML file using BeautifulSoup.

        Args:
            path: Path to .html or .htm file.

        Returns:
            List of IngestChunk from the HTML content.

        Raises:
            ImportError: If beautifulsoup4 is not installed.

        Performance:
            Time: O(S) where S = file size
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 is required for HTML ingestion.\n"
                "Install with: pip install beautifulsoup4\n"
                "VecForge by Suneel Bose K · ArcGX TechLabs"
            ) from e

        html = path.read_text(encoding="utf-8", errors="replace")
        soup = BeautifulSoup(html, "html.parser")

        # why: strip script/style and boilerplate chrome elements before
        # extracting visible text
        for tag in soup(["script", "style", "nav", "footer", "header"]):
            tag.decompose()

        text = soup.get_text(separator="\n", strip=True)
        chunks = self._chunk_text(text, source=str(path))
        logger.info("Parsed HTML %s: %d chunks", path.name, len(chunks))
        return chunks
|