swarmauri_vectorstore_fs 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,83 @@
1
+ Metadata-Version: 2.4
2
+ Name: swarmauri_vectorstore_fs
3
+ Version: 0.1.0
4
+ Summary: Swarmauri filesystem-aware BM25F vector store
5
+ License-Expression: Apache-2.0
6
+ Keywords: swarmauri,vectorstore,filesystem,bm25f,chunks,retrieval
7
+ Author: Jacob Stewart
8
+ Author-email: jacob@swarmauri.com
9
+ Requires-Python: >=3.10,<3.13
10
+ Classifier: License :: OSI Approved :: Apache Software License
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Natural Language :: English
15
+ Classifier: Development Status :: 3 - Alpha
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
18
+ Requires-Dist: rich (>=13.9.4)
19
+ Requires-Dist: swarmauri_base
20
+ Requires-Dist: swarmauri_core
21
+ Requires-Dist: swarmauri_standard
22
+ Description-Content-Type: text/markdown
23
+
24
+ ![Swarmauri Logo](https://github.com/swarmauri/swarmauri-sdk/blob/3d4d1cfa949399d7019ae9d8f296afba773dfb7f/assets/swarmauri_brand_frag_light.png)
25
+
26
+ <p align="center">
27
+ <a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
28
+ <img src="https://img.shields.io/pypi/pyversions/swarmauri_vectorstore_fs" alt="PyPI - Python Version"/></a>
29
+ <a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
30
+ <img src="https://img.shields.io/pypi/l/swarmauri_vectorstore_fs" alt="PyPI - License"/></a>
31
+ <a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
32
+ <img src="https://img.shields.io/pypi/v/swarmauri_vectorstore_fs?label=swarmauri_vectorstore_fs&color=green" alt="PyPI - swarmauri_vectorstore_fs"/></a>
33
+ </p>
34
+
35
+ ---
36
+
37
+ # Swarmauri Vectorstore FS
38
+
39
+ A Swarmauri community vector store that indexes filesystem trees for BM25F retrieval over file paths, file names, extensions, chunk identity, and file content.
40
+
41
+ ## Features
42
+
43
+ - Filesystem-aware retrieval with weighted BM25F fields
44
+ - Chunk, file, and chunk-plus-file indexing modes
45
+ - Stable chunk identity metadata for global, path-level, and file-level chunk numbering
46
+ - CLI for ad hoc lexical search over source trees and document corpora
47
+ - No embedding vocabulary dependency for query handling
48
+
49
+ ## Installation
50
+
51
+ ```bash
52
+ pip install swarmauri_vectorstore_fs
53
+ ```
54
+
55
+ ## Usage
56
+
57
+ ```python
58
+ from swarmauri_vectorstore_fs import FsVectorStore
59
+
60
+ store = FsVectorStore(root_path=".", mode="chunk")
61
+ store.build_index()
62
+ results = store.retrieve("vector store registration", top_k=3)
63
+
64
+ for document in results:
65
+ print(document.id, document.metadata["relative_path"])
66
+ ```
67
+
68
+ ## CLI
69
+
70
+ ```bash
71
+ fsvs --root . query --query "vector store registration" --top-k 5
72
+ ```
73
+
74
+ To inspect a specific retrieved document:
75
+
76
+ ```bash
77
+ fsvs --root . show --document-id <document-id>
78
+ ```
79
+
80
+ ## Want to help?
81
+
82
+ If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md).
83
+
@@ -0,0 +1,59 @@
1
+ ![Swarmauri Logo](https://github.com/swarmauri/swarmauri-sdk/blob/3d4d1cfa949399d7019ae9d8f296afba773dfb7f/assets/swarmauri_brand_frag_light.png)
2
+
3
+ <p align="center">
4
+ <a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
5
+ <img src="https://img.shields.io/pypi/pyversions/swarmauri_vectorstore_fs" alt="PyPI - Python Version"/></a>
6
+ <a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
7
+ <img src="https://img.shields.io/pypi/l/swarmauri_vectorstore_fs" alt="PyPI - License"/></a>
8
+ <a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
9
+ <img src="https://img.shields.io/pypi/v/swarmauri_vectorstore_fs?label=swarmauri_vectorstore_fs&color=green" alt="PyPI - swarmauri_vectorstore_fs"/></a>
10
+ </p>
11
+
12
+ ---
13
+
14
+ # Swarmauri Vectorstore FS
15
+
16
+ A Swarmauri community vector store that indexes filesystem trees for BM25F retrieval over file paths, file names, extensions, chunk identity, and file content.
17
+
18
+ ## Features
19
+
20
+ - Filesystem-aware retrieval with weighted BM25F fields
21
+ - Chunk, file, and chunk-plus-file indexing modes
22
+ - Stable chunk identity metadata for global, path-level, and file-level chunk numbering
23
+ - CLI for ad hoc lexical search over source trees and document corpora
24
+ - No embedding vocabulary dependency for query handling
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ pip install swarmauri_vectorstore_fs
30
+ ```
31
+
32
+ ## Usage
33
+
34
+ ```python
35
+ from swarmauri_vectorstore_fs import FsVectorStore
36
+
37
+ store = FsVectorStore(root_path=".", mode="chunk")
38
+ store.build_index()
39
+ results = store.retrieve("vector store registration", top_k=3)
40
+
41
+ for document in results:
42
+ print(document.id, document.metadata["relative_path"])
43
+ ```
44
+
45
+ ## CLI
46
+
47
+ ```bash
48
+ fsvs --root . query --query "vector store registration" --top-k 5
49
+ ```
50
+
51
+ To inspect a specific retrieved document:
52
+
53
+ ```bash
54
+ fsvs --root . show --document-id <document-id>
55
+ ```
56
+
57
+ ## Want to help?
58
+
59
+ If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md).
@@ -0,0 +1,82 @@
1
+ [project]
2
+ name = "swarmauri_vectorstore_fs"
3
+ version = "0.1.0"
4
+ description = "Swarmauri filesystem-aware BM25F vector store"
5
+ license = "Apache-2.0"
6
+ readme = "README.md"
7
+ repository = "http://github.com/swarmauri/swarmauri-sdk"
8
+ requires-python = ">=3.10,<3.13"
9
+ classifiers = [
10
+ "License :: OSI Approved :: Apache Software License",
11
+ "Programming Language :: Python :: 3.10",
12
+ "Programming Language :: Python :: 3.11",
13
+ "Programming Language :: Python :: 3.12",
14
+ "Natural Language :: English",
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "Topic :: Software Development :: Libraries :: Application Frameworks",
18
+ ]
19
+ authors = [{ name = "Jacob Stewart", email = "jacob@swarmauri.com" }]
20
+ dependencies = [
21
+ "rich>=13.9.4",
22
+ "swarmauri_core",
23
+ "swarmauri_base",
24
+ "swarmauri_standard",
25
+ ]
26
+ keywords = [
27
+ "swarmauri",
28
+ "vectorstore",
29
+ "filesystem",
30
+ "bm25f",
31
+ "chunks",
32
+ "retrieval",
33
+ ]
34
+
35
+ [tool.uv.sources]
36
+ swarmauri_core = { workspace = true }
37
+ swarmauri_base = { workspace = true }
38
+ swarmauri_standard = { workspace = true }
39
+
40
+ [tool.pytest.ini_options]
41
+ norecursedirs = ["combined", "scripts"]
42
+ markers = [
43
+ "test: standard test",
44
+ "unit: Unit tests",
45
+ "i9n: Integration tests",
46
+ "r8n: Regression tests",
47
+ "timeout: mark test to timeout after X seconds",
48
+ "xpass: Expected passes",
49
+ "xfail: Expected failures",
50
+ "acceptance: Acceptance tests",
51
+ "perf: Performance tests that measure execution time and resource usage",
52
+ ]
53
+ timeout = 300
54
+ log_cli = true
55
+ log_cli_level = "INFO"
56
+ log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
57
+ log_cli_date_format = "%Y-%m-%d %H:%M:%S"
58
+ asyncio_default_fixture_loop_scope = "function"
59
+
60
+ [project.entry-points.'swarmauri.vector_stores']
61
+ FsVectorStore = "swarmauri_vectorstore_fs.FsVectorStore:FsVectorStore"
62
+
63
+ [project.scripts]
64
+ fsvs = "swarmauri_vectorstore_fs.cli:main"
65
+
66
+ [build-system]
67
+ requires = ["poetry-core>=1.0.0"]
68
+ build-backend = "poetry.core.masonry.api"
69
+
70
+ [dependency-groups]
71
+ dev = [
72
+ "pytest>=8.0",
73
+ "pytest-asyncio>=0.24.0",
74
+ "pytest-xdist>=3.6.1",
75
+ "pytest-json-report>=1.5.0",
76
+ "python-dotenv",
77
+ "requests>=2.32.3",
78
+ "flake8>=7.0",
79
+ "pytest-timeout>=2.3.1",
80
+ "ruff>=0.9.9",
81
+ "pytest-benchmark>=4.0.0",
82
+ ]
@@ -0,0 +1,142 @@
1
+ import math
2
+ import re
3
+ from collections import Counter, defaultdict
4
+ from typing import Dict, Iterable, List, Tuple
5
+
6
+
7
+ DEFAULT_FIELD_WEIGHTS: Dict[str, float] = {
8
+ "file_name": 5.0,
9
+ "relative_path": 4.0,
10
+ "directory_path": 2.5,
11
+ "file_extension": 1.5,
12
+ "chunk_identity": 1.0,
13
+ "content": 1.0,
14
+ }
15
+
16
+
17
+ class BM25FScorer:
18
+ def __init__(
19
+ self,
20
+ field_weights: Dict[str, float] | None = None,
21
+ k1: float = 1.2,
22
+ b: float = 0.75,
23
+ ) -> None:
24
+ self.field_weights = dict(field_weights or DEFAULT_FIELD_WEIGHTS)
25
+ self.k1 = k1
26
+ self.b = b
27
+ self.document_fields: List[Dict[str, str]] = []
28
+ self._field_term_frequencies: List[Dict[str, Counter[str]]] = []
29
+ self._field_lengths: List[Dict[str, int]] = []
30
+ self._average_field_lengths: Dict[str, float] = {}
31
+ self._document_frequencies: Counter[str] = Counter()
32
+
33
+ @staticmethod
34
+ def tokenize(text: str) -> List[str]:
35
+ tokens: List[str] = []
36
+ for raw_token in re.findall(r"[A-Za-z0-9_]+", text.replace("\\", "/")):
37
+ token = raw_token.lower()
38
+ if token:
39
+ tokens.append(token)
40
+ parts = re.findall(
41
+ r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)|[0-9]+", raw_token.replace("_", " ")
42
+ )
43
+ for part in parts:
44
+ lowered = part.lower()
45
+ if lowered and lowered != token:
46
+ tokens.append(lowered)
47
+ return tokens
48
+
49
+ def fit(self, document_fields: Iterable[Dict[str, str]]) -> None:
50
+ self.document_fields = [dict(fields) for fields in document_fields]
51
+ self._field_term_frequencies = []
52
+ self._field_lengths = []
53
+ self._average_field_lengths = {}
54
+ self._document_frequencies = Counter()
55
+
56
+ field_length_totals: Dict[str, int] = defaultdict(int)
57
+ for fields in self.document_fields:
58
+ doc_frequencies: Dict[str, Counter[str]] = {}
59
+ doc_lengths: Dict[str, int] = {}
60
+ doc_terms = set()
61
+ for field_name in self.field_weights:
62
+ tokens = self.tokenize(fields.get(field_name, ""))
63
+ frequencies = Counter(tokens)
64
+ doc_frequencies[field_name] = frequencies
65
+ doc_lengths[field_name] = len(tokens)
66
+ field_length_totals[field_name] += len(tokens)
67
+ doc_terms.update(frequencies)
68
+ self._field_term_frequencies.append(doc_frequencies)
69
+ self._field_lengths.append(doc_lengths)
70
+ for term in doc_terms:
71
+ self._document_frequencies[term] += 1
72
+
73
+ doc_count = max(1, len(self.document_fields))
74
+ self._average_field_lengths = {
75
+ field_name: field_length_totals[field_name] / doc_count
76
+ for field_name in self.field_weights
77
+ }
78
+
79
+ def search(self, query: str, top_k: int = 5) -> List[Tuple[int, float]]:
80
+ query_terms = self.tokenize(query)
81
+ if not query_terms or not self.document_fields:
82
+ return []
83
+
84
+ scores = [
85
+ (index, self._score_document(index, query_terms))
86
+ for index in range(len(self.document_fields))
87
+ ]
88
+ matches = [(index, score) for index, score in scores if score > 0.0]
89
+ matches.sort(key=lambda item: item[1], reverse=True)
90
+ return matches[:top_k]
91
+
92
+ def _score_document(self, document_index: int, query_terms: List[str]) -> float:
93
+ score = 0.0
94
+ for term in query_terms:
95
+ document_frequency = self._document_frequencies.get(term, 0)
96
+ if document_frequency == 0:
97
+ continue
98
+ weighted_frequency = self._weighted_term_frequency(document_index, term)
99
+ if weighted_frequency <= 0:
100
+ continue
101
+ score += self._idf(document_frequency) * (
102
+ (weighted_frequency * (self.k1 + 1.0)) / (self.k1 + weighted_frequency)
103
+ )
104
+ return score
105
+
106
+ def _weighted_term_frequency(self, document_index: int, term: str) -> float:
107
+ weighted_frequency = 0.0
108
+ for field_name, weight in self.field_weights.items():
109
+ field_frequency = self._field_term_frequencies[document_index][
110
+ field_name
111
+ ].get(term, 0)
112
+ if field_frequency == 0:
113
+ continue
114
+ field_length = self._field_lengths[document_index].get(field_name, 0)
115
+ average_length = self._average_field_lengths.get(field_name, 1.0) or 1.0
116
+ normalized_length = 1.0 - self.b + self.b * (field_length / average_length)
117
+ weighted_frequency += weight * (field_frequency / normalized_length)
118
+ return weighted_frequency
119
+
120
+ def _idf(self, document_frequency: int) -> float:
121
+ doc_count = len(self.document_fields)
122
+ return math.log(
123
+ 1.0 + ((doc_count - document_frequency + 0.5) / (document_frequency + 0.5))
124
+ )
125
+
126
+ def to_dict(self) -> Dict:
127
+ return {
128
+ "field_weights": self.field_weights,
129
+ "k1": self.k1,
130
+ "b": self.b,
131
+ "document_fields": self.document_fields,
132
+ }
133
+
134
+ @classmethod
135
+ def from_dict(cls, payload: Dict) -> "BM25FScorer":
136
+ scorer = cls(
137
+ field_weights=payload.get("field_weights"),
138
+ k1=payload.get("k1", 1.2),
139
+ b=payload.get("b", 0.75),
140
+ )
141
+ scorer.fit(payload.get("document_fields", []))
142
+ return scorer
@@ -0,0 +1,414 @@
1
+ import fnmatch
2
+ import json
3
+ from pathlib import Path
4
+ from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
5
+
6
+ from pydantic import Field, PrivateAttr
7
+ from swarmauri_base.ComponentBase import ComponentBase
8
+ from swarmauri_base.vector_stores.VectorStoreBase import VectorStoreBase
9
+ from swarmauri_base.vector_stores.VectorStoreRetrieveMixin import (
10
+ VectorStoreRetrieveMixin,
11
+ )
12
+ from swarmauri_standard.documents.Document import Document
13
+
14
+ from .BM25FScorer import BM25FScorer
15
+
16
+
17
# Indexing granularity: per chunk, per whole file, or both.
ModeLiteral = Literal["chunk", "file", "chunk_file"]
# Glob patterns that are always excluded, in addition to FsVectorStore.exclude.
DEFAULT_EXCLUDE_PATTERNS = (
    ".git/**",
    "**/.git/**",
    "__pycache__/**",
    "**/__pycache__/**",
    ".venv/**",
    "**/.venv/**",
    "node_modules/**",
    "**/node_modules/**",
)


@ComponentBase.register_type(VectorStoreBase, "FsVectorStore")
class FsVectorStore(VectorStoreRetrieveMixin, VectorStoreBase):
    """Lexical (BM25F) store over the text files below a root directory.

    Files are read as UTF-8, optionally split into overlapping character
    chunks, and wrapped in ``Document`` objects whose metadata carries the
    path and chunk numbering that the scorer indexes as separate fields.
    """

    type: Literal["FsVectorStore"] = "FsVectorStore"
    # Directory to index; required by build_index().
    root_path: Optional[str] = None
    mode: ModeLiteral = "chunk"
    # Chunk length and overlap, both measured in characters.
    chunk_size: int = 1200
    chunk_overlap: int = 120
    # fnmatch patterns tested against the POSIX relative path and file name.
    include: Tuple[str, ...] = Field(default=())
    exclude: Tuple[str, ...] = Field(default=())
    # Files larger than this many bytes are skipped as "oversized".
    max_file_size: int = 1_000_000
    # Build the index inside __init__ when a root_path is given.
    auto_index: bool = False
    # Skip counters and document count recorded by the last build_index().
    index_metadata: Dict[str, int] = Field(default_factory=dict)

    _scorer = PrivateAttr()
    _document_map = PrivateAttr()

    def __init__(self, **kwargs):
        """Create an empty store and index immediately if ``auto_index`` is set."""
        super().__init__(**kwargs)
        self._scorer = BM25FScorer()
        self._document_map: Dict[str, Document] = {}
        self.documents = []
        if self.auto_index and self.root_path:
            self.build_index()

    def add_document(self, document: Document) -> None:
        """Insert or replace *document* by id and rebuild the scorer."""
        self._document_map[document.id] = document
        self.documents = list(self._document_map.values())
        self._reindex_documents()

    def add_documents(self, documents: List[Document]) -> None:
        """Insert or replace several documents, rebuilding the scorer once."""
        for document in documents:
            self._document_map[document.id] = document
        self.documents = list(self._document_map.values())
        self._reindex_documents()

    def get_document(self, id: str) -> Union[Document, None]:
        """Return the document stored under *id*, or ``None``."""
        return self._document_map.get(id)

    def get_all_documents(self) -> List[Document]:
        """Return a shallow copy of the document list."""
        return list(self.documents)

    def update_document(self, id: str, updated_document: Document) -> None:
        """Replace the document stored under *id*.

        Raises:
            KeyError: If no document with that id exists.
        """
        if id not in self._document_map:
            raise KeyError(f"Document '{id}' not found")
        self._document_map[id] = updated_document
        self.documents = list(self._document_map.values())
        self._reindex_documents()

    def delete_document(self, id: str) -> None:
        """Remove the document stored under *id*; unknown ids are ignored."""
        self._document_map.pop(id, None)
        self.documents = list(self._document_map.values())
        self._reindex_documents()

    def clear_documents(self) -> None:
        """Drop every document and reset the scorer."""
        self.documents = []
        self._document_map = {}
        self._scorer = BM25FScorer()

    def build_index(self) -> None:
        """Scan ``root_path`` and replace the store contents with fresh documents.

        Raises:
            ValueError, FileNotFoundError, NotADirectoryError: See
                ``_validate_root``.
        """
        root = self._validate_root()
        documents: List[Document] = []
        skip_counts = {
            "binary": 0,
            "oversized": 0,
            "unreadable": 0,
            "excluded": 0,
        }
        global_index = 0

        # file_index numbers every candidate file, including ones that are
        # later skipped by _read_text, so indices may have gaps.
        for file_index, path in enumerate(
            self._iter_indexable_files(root, skip_counts)
        ):
            relative_path = path.relative_to(root).as_posix()
            text = self._read_text(path, skip_counts)
            if text is None:
                continue

            chunks = self._chunk_text(text)
            if self.mode in {"chunk", "chunk_file"}:
                for chunk_file_index, (chunk, start_line, end_line) in enumerate(
                    chunks
                ):
                    document = self._build_document(
                        relative_path=relative_path,
                        file_index=file_index,
                        chunk_global_index=global_index,
                        chunk_path_index=chunk_file_index,
                        chunk_file_index=chunk_file_index,
                        start_line=start_line,
                        end_line=end_line,
                        document_kind="chunk",
                        content=chunk,
                    )
                    documents.append(document)
                    global_index += 1

            if self.mode in {"file", "chunk_file"}:
                # The whole-file document takes the path index right after
                # the last chunk and uses -1 as its chunk_file_index.
                document = self._build_document(
                    relative_path=relative_path,
                    file_index=file_index,
                    chunk_global_index=global_index,
                    chunk_path_index=len(chunks),
                    chunk_file_index=-1,
                    start_line=1,
                    end_line=self._line_count(text),
                    document_kind="file",
                    content=text,
                )
                documents.append(document)
                global_index += 1

        self.index_metadata = {
            **skip_counts,
            "indexed_documents": len(documents),
        }
        self._document_map = {document.id: document for document in documents}
        self.documents = documents
        self._reindex_documents()

    def refresh_index(self) -> None:
        """Rebuild the index from disk (alias of ``build_index``)."""
        self.build_index()

    def retrieve(self, query: str, top_k: int = 5) -> List[Document]:
        """Return the documents of the best *top_k* matches, without scores."""
        return [document for document, _score in self.search(query, top_k=top_k)]

    def query(self, text: str, top_k: int = 5) -> List[Document]:
        """Alias of ``retrieve``."""
        return self.retrieve(text, top_k=top_k)

    def search(self, query: str, top_k: int = 5) -> List[Tuple[Document, float]]:
        """Return ``(document, score)`` pairs, best first.

        An empty store with a ``root_path`` is indexed on first use.
        """
        if not self.documents and self.root_path:
            self.build_index()
        results = self._scorer.search(query, top_k=top_k)
        return [(self.documents[index], score) for index, score in results]

    def save_store(self, directory_path: str) -> None:
        """Write settings, scorer state and documents to ``fs_vectorstore.json``.

        The target directory is created if needed.
        """
        output_dir = Path(directory_path)
        output_dir.mkdir(parents=True, exist_ok=True)
        payload = {
            "root_path": self.root_path,
            "mode": self.mode,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "include": list(self.include),
            "exclude": list(self.exclude),
            "max_file_size": self.max_file_size,
            "index_metadata": self.index_metadata,
            "bm25f": self._scorer.to_dict(),
            "documents": [
                document.model_dump(mode="json") for document in self.documents
            ],
        }
        (output_dir / "fs_vectorstore.json").write_text(
            json.dumps(payload, indent=2), encoding="utf-8"
        )

    def load_store(self, directory_path: str) -> None:
        """Restore settings and documents from ``fs_vectorstore.json``.

        Raises:
            KeyError: If the payload has no ``"documents"`` entry.
        """
        payload = json.loads(
            (Path(directory_path) / "fs_vectorstore.json").read_text(encoding="utf-8")
        )
        self.root_path = payload.get("root_path")
        self.mode = payload.get("mode", "chunk")
        self.chunk_size = payload.get("chunk_size", 1200)
        self.chunk_overlap = payload.get("chunk_overlap", 120)
        self.include = tuple(payload.get("include", []))
        self.exclude = tuple(payload.get("exclude", []))
        self.max_file_size = payload.get("max_file_size", 1_000_000)
        self.index_metadata = payload.get("index_metadata", {})
        self.documents = [
            Document.model_validate(document) for document in payload["documents"]
        ]
        self._document_map = {document.id: document for document in self.documents}
        self._scorer = BM25FScorer.from_dict(payload.get("bm25f", {}))
        # Fall back to indexing the loaded documents when the payload
        # carried no scorer fields.
        if not self._scorer.document_fields:
            self._reindex_documents()

    def _reindex_documents(self) -> None:
        """Fit a fresh default-weighted scorer on the current documents."""
        self._scorer = BM25FScorer()
        self._scorer.fit(
            [self._fields_for_document(document) for document in self.documents]
        )

    def _validate_root(self) -> Path:
        """Check ``root_path`` and the chunking settings; return the root ``Path``.

        Raises:
            ValueError: Missing ``root_path`` or inconsistent chunk settings.
            FileNotFoundError: ``root_path`` does not exist.
            NotADirectoryError: ``root_path`` is not a directory.
        """
        if not self.root_path:
            raise ValueError("root_path is required to build the filesystem index")
        root = Path(self.root_path)
        if not root.exists():
            raise FileNotFoundError(f"Root path '{self.root_path}' does not exist")
        if not root.is_dir():
            raise NotADirectoryError(f"Root path '{self.root_path}' is not a directory")
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be greater than zero")
        if self.chunk_overlap < 0:
            raise ValueError("chunk_overlap must be zero or greater")
        if self.chunk_overlap >= self.chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")
        return root

    def _iter_indexable_files(
        self, root: Path, skip_counts: Dict[str, int]
    ) -> Iterable[Path]:
        """Yield regular files under *root*, in sorted order, that pass the filters.

        Files rejected by ``include`` or by the exclude patterns increment
        ``skip_counts["excluded"]``.
        """
        for path in sorted(root.rglob("*")):
            if not path.is_file():
                continue
            relative_path = path.relative_to(root).as_posix()
            if self.include and not self._matches_any(relative_path, self.include):
                skip_counts["excluded"] += 1
                continue
            if self._matches_any(
                relative_path, (*DEFAULT_EXCLUDE_PATTERNS, *self.exclude)
            ):
                skip_counts["excluded"] += 1
                continue
            yield path

    def _read_text(self, path: Path, skip_counts: Dict[str, int]) -> Optional[str]:
        """Return the UTF-8 text of *path*, or ``None`` when it is skipped.

        The matching counter in *skip_counts* ("oversized", "unreadable" or
        "binary") is incremented for every skipped file.
        """
        try:
            size = path.stat().st_size
            if size > self.max_file_size:
                skip_counts["oversized"] += 1
                return None
            data = path.read_bytes()
        except OSError:
            skip_counts["unreadable"] += 1
            return None

        # A NUL byte is taken as the marker of a binary file.
        if b"\x00" in data:
            skip_counts["binary"] += 1
            return None

        try:
            return data.decode("utf-8")
        except UnicodeDecodeError:
            skip_counts["binary"] += 1
            return None

    def _chunk_text(self, text: str) -> List[Tuple[str, int, int]]:
        """Split *text* into overlapping chunks with 1-based line ranges.

        Returns:
            ``(chunk, start_line, end_line)`` tuples. ``end_line`` is the
            line holding the chunk's last character, so a chunk that ends
            with a newline does not spill onto the following line. Empty
            text yields a single empty chunk on line 1.
        """
        if not text:
            return [("", 1, 1)]

        chunks: List[Tuple[str, int, int]] = []
        text_length = len(text)
        start = 0
        # Running newline count so each step scans only new text rather than
        # the whole prefix again.
        newlines_before_start = 0
        while start < text_length:
            end = min(text_length, start + self.chunk_size)
            start_line = newlines_before_start + 1
            # Count newlines before the last character (index end - 1): a
            # trailing newline terminates its line, it does not start one.
            end_line = start_line + text.count("\n", start, max(start, end - 1))
            chunks.append((text[start:end], start_line, end_line))
            if end == text_length:
                break
            # Always advance by at least one character.
            next_start = max(end - self.chunk_overlap, start + 1)
            newlines_before_start += text.count("\n", start, next_start)
            start = next_start
        return chunks

    def _build_document(
        self,
        *,
        relative_path: str,
        file_index: int,
        chunk_global_index: int,
        chunk_path_index: int,
        chunk_file_index: int,
        start_line: int,
        end_line: int,
        document_kind: Literal["chunk", "file"],
        content: str,
    ) -> Document:
        """Build one ``Document`` with path and chunk metadata.

        The id has the form ``fs:<file_index>:<document_kind>:<chunk_path_index>``.
        """
        path = Path(relative_path)
        file_name = path.name
        file_extension = path.suffix
        # Files at the root have parent ".", stored as an empty directory.
        directory_path = path.parent.as_posix() if path.parent.as_posix() != "." else ""
        bm25_fields = self._build_bm25_fields(
            relative_path=relative_path,
            directory_path=directory_path,
            file_name=file_name,
            file_extension=file_extension,
            file_index=file_index,
            chunk_global_index=chunk_global_index,
            chunk_path_index=chunk_path_index,
            chunk_file_index=chunk_file_index,
            content=content,
        )
        metadata = {
            "relative_path": relative_path,
            "file_index": file_index,
            "file_name": file_name,
            "file_extension": file_extension,
            "chunk_global_index": chunk_global_index,
            "chunk_path_index": chunk_path_index,
            "chunk_file_index": chunk_file_index,
            "start_line": start_line,
            "end_line": end_line,
            "document_kind": document_kind,
            "bm25_fields": bm25_fields,
        }
        return Document(
            id=f"fs:{file_index}:{document_kind}:{chunk_path_index}",
            content=self._composite_content(metadata, content),
            metadata=metadata,
        )

    def _build_bm25_fields(
        self,
        *,
        relative_path: str,
        directory_path: str,
        file_name: str,
        file_extension: str,
        file_index: int,
        chunk_global_index: int,
        chunk_path_index: int,
        chunk_file_index: int,
        content: str,
    ) -> Dict[str, str]:
        """Return the field-name to text mapping that the scorer indexes."""
        # Each identifier is written both as "<name> <n>" and "<name>_<n>"
        # so that either spelling in a query produces matching tokens.
        chunk_identity = " ".join(
            [
                f"file_index {file_index}",
                f"file_{file_index}",
                f"chunk_global_index {chunk_global_index}",
                f"chunk_global_{chunk_global_index}",
                f"chunk_path_index {chunk_path_index}",
                f"chunk_path_{chunk_path_index}",
                f"chunk_file_index {chunk_file_index}",
                f"chunk_file_{chunk_file_index}",
            ]
        )
        return {
            "file_name": file_name,
            "relative_path": relative_path,
            "directory_path": directory_path,
            "file_extension": file_extension,
            "chunk_identity": chunk_identity,
            "content": content,
        }

    def _fields_for_document(self, document: Document) -> Dict[str, str]:
        """Return scorer fields for *document*.

        Uses ``metadata["bm25_fields"]`` when it is a dict; otherwise the
        fields are derived from the remaining metadata and the content.
        """
        fields = document.metadata.get("bm25_fields")
        if isinstance(fields, dict):
            return {field: str(value) for field, value in fields.items()}
        return {
            "file_name": str(document.metadata.get("file_name", "")),
            "relative_path": str(document.metadata.get("relative_path", "")),
            "directory_path": str(
                Path(str(document.metadata.get("relative_path", ""))).parent
            ),
            "file_extension": str(document.metadata.get("file_extension", "")),
            "chunk_identity": " ".join(
                str(document.metadata.get(key, ""))
                for key in (
                    "file_index",
                    "chunk_global_index",
                    "chunk_path_index",
                    "chunk_file_index",
                )
            ),
            "content": document.content,
        }

    def _composite_content(self, metadata: Dict, content: str) -> str:
        """Prefix *content* with a ``key: value`` header built from *metadata*."""
        return "\n".join(
            [
                f"file_index: {metadata['file_index']}",
                f"file_path: {metadata['relative_path']}",
                f"file_name: {metadata['file_name']}",
                f"file_extension: {metadata['file_extension']}",
                f"chunk_global_index: {metadata['chunk_global_index']}",
                f"chunk_path_index: {metadata['chunk_path_index']}",
                f"chunk_file_index: {metadata['chunk_file_index']}",
                f"document_kind: {metadata['document_kind']}",
                "content:",
                content,
            ]
        )

    def _matches_any(self, relative_path: str, patterns: Tuple[str, ...]) -> bool:
        """Return True if the path or its file name matches any fnmatch pattern."""
        path = relative_path.replace("\\", "/")
        name = Path(path).name
        return any(
            fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(name, pattern)
            for pattern in patterns
        )

    def _line_count(self, text: str) -> int:
        """Return the number of lines in *text*; empty text counts as one line."""
        if not text:
            return 1
        # A trailing newline terminates the last line instead of opening one.
        return text.count("\n") + (0 if text.endswith("\n") else 1)
@@ -0,0 +1,14 @@
1
+ from .BM25FScorer import BM25FScorer
2
+ from .FsVectorStore import FsVectorStore
3
+
4
+ __all__ = ["BM25FScorer", "FsVectorStore"]
5
+
6
# importlib.metadata is always present on the interpreters this package
# supports (requires-python >=3.10), so no importlib_metadata backport
# fallback is needed.
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("swarmauri_vectorstore_fs")
except PackageNotFoundError:
    # The distribution is not installed (for example, running from a
    # source checkout).
    __version__ = "0.0.0"
@@ -0,0 +1,171 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ from typing import Iterable, Optional
5
+
6
+ from rich.console import Console
7
+ from rich.logging import RichHandler
8
+ from rich.table import Table
9
+
10
+ from .FsVectorStore import FsVectorStore
11
+
12
+
13
LOGGER = logging.getLogger("swarmauri.fs_vectorstore")
# Console used for command results (tables, JSON, document bodies).
CONSOLE = Console()


def configure_logging(verbose: bool = False) -> None:
    """Install a Rich logging handler on the root logger.

    Log records are written to stderr so that stdout carries only command
    results; otherwise ``query --json`` output is interleaved with log lines
    and cannot be piped into a JSON consumer.

    Args:
        verbose: Log at DEBUG level instead of INFO.
    """
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(message)s",
        datefmt="[%X]",
        handlers=[
            RichHandler(
                console=Console(stderr=True),
                rich_tracebacks=True,
                markup=True,
            )
        ],
        # Replace any handlers installed by an earlier configuration.
        force=True,
    )
26
+
27
+
28
def build_parser() -> argparse.ArgumentParser:
    """Create the ``fsvs`` argument parser.

    Global indexing options come first, followed by a required sub-command:
    ``query`` (search the index) or ``show`` (print one document by id).
    """
    cli = argparse.ArgumentParser(
        description="BM25F retrieval over filesystem files and chunks."
    )

    # (flag, keyword arguments) pairs, registered in display order.
    global_options = (
        ("--root", {"default": ".", "help": "Root directory to index"}),
        (
            "--mode",
            {
                "choices": ["chunk", "file", "chunk_file"],
                "default": "chunk",
                "help": "Indexing mode",
            },
        ),
        (
            "--chunk-size",
            {"type": int, "default": 1200, "help": "Chunk size in characters"},
        ),
        (
            "--chunk-overlap",
            {"type": int, "default": 120, "help": "Chunk overlap in characters"},
        ),
        (
            "--include",
            {"action": "append", "dest": "include", "help": "Glob to include"},
        ),
        (
            "--exclude",
            {"action": "append", "dest": "exclude", "help": "Glob to exclude"},
        ),
        (
            "--max-file-size",
            {
                "type": int,
                "default": 1_000_000,
                "help": "Maximum file size in bytes to index",
            },
        ),
        ("--verbose", {"action": "store_true", "help": "Enable debug logging"}),
    )
    for flag, options in global_options:
        cli.add_argument(flag, **options)

    commands = cli.add_subparsers(dest="command", required=True)

    query_cmd = commands.add_parser("query", help="Query indexed filesystem documents")
    query_cmd.add_argument("--query", required=True, help="Query text")
    query_cmd.add_argument(
        "--top-k", type=int, default=5, help="Number of hits to return"
    )
    query_cmd.add_argument("--json", action="store_true", help="Emit JSON output")

    show_cmd = commands.add_parser("show", help="Print a document by id")
    show_cmd.add_argument("--document-id", required=True, help="Document id to display")

    return cli
76
+
77
+
78
def build_store(args: argparse.Namespace) -> FsVectorStore:
    """Create an ``FsVectorStore`` from parsed CLI arguments and index it.

    Args:
        args: Namespace produced by the parser from ``build_parser``.

    Returns:
        The store, with its index already built.
    """
    settings = {
        "root_path": args.root,
        "mode": args.mode,
        "chunk_size": args.chunk_size,
        "chunk_overlap": args.chunk_overlap,
        # argparse leaves "append" options as None when never supplied.
        "include": tuple(args.include or ()),
        "exclude": tuple(args.exclude or ()),
        "max_file_size": args.max_file_size,
    }
    index = FsVectorStore(**settings)

    LOGGER.info(
        "Building filesystem BM25F index [bold cyan](root=%s, mode=%s)[/bold cyan]",
        args.root,
        args.mode,
    )
    index.build_index()

    document_total = len(index.documents)
    LOGGER.info("Indexed [bold green]%s[/bold green] documents", document_total)
    return index
96
+
97
+
98
def render_query_results(
    store: FsVectorStore, query: str, top_k: int, as_json: bool
) -> int:
    """Run *query* against *store* and print the hits.

    Args:
        store: Indexed store to search.
        query: Free-text query.
        top_k: Maximum number of hits to print.
        as_json: Print a JSON array instead of a table.

    Returns:
        Process exit code; always ``0``, including when nothing matched.
    """
    # Local import: the module header does not import rich.markup.
    from rich.markup import escape

    # The log handler and table cells interpret Rich markup. User input and
    # file paths may contain square brackets (e.g. "app/[id]/page.tsx"),
    # which would otherwise be parsed as style tags and vanish from output.
    safe_query = escape(query)

    LOGGER.info(
        "Running BM25F retrieval for query [bold yellow]%s[/bold yellow]", safe_query
    )
    results = store.search(query, top_k=top_k)
    if not results:
        LOGGER.warning(
            "No lexical matches found for query [bold yellow]%s[/bold yellow]",
            safe_query,
        )

    if as_json:
        payload = [
            {
                "id": document.id,
                "score": score,
                "metadata": document.metadata,
                "content": document.content,
            }
            for document, score in results
        ]
        CONSOLE.print_json(json.dumps(payload))
        return 0

    table = Table(title=f"Fs Vector Store Results ({len(results)} hits)")
    table.add_column("Document ID", style="cyan", overflow="fold")
    table.add_column("Score", style="green")
    table.add_column("Kind", style="magenta")
    table.add_column("Path", style="white", overflow="fold")
    table.add_column("Lines", style="blue")

    for document, score in results:
        table.add_row(
            escape(document.id),
            f"{score:.4f}",
            escape(str(document.metadata.get("document_kind", ""))),
            escape(str(document.metadata.get("relative_path", ""))),
            f"{document.metadata.get('start_line')}:{document.metadata.get('end_line')}",
        )
    CONSOLE.print(table)
    return 0
140
+
141
+
142
def render_document(store: FsVectorStore, document_id: str) -> int:
    """Print the metadata and content of one document.

    Args:
        store: Indexed store to read from.
        document_id: Id of the document to display.

    Returns:
        ``0`` on success, ``1`` when the id is unknown.
    """
    # Local import: the module header does not import rich.markup.
    from rich.markup import escape

    # The id comes from the command line; escape it before it passes
    # through markup-enabled output.
    safe_id = escape(document_id)

    document = store.get_document(document_id)
    if not document:
        LOGGER.error("Document [bold red]%s[/bold red] was not found", safe_id)
        return 1

    LOGGER.info("Printing document [bold cyan]%s[/bold cyan]", safe_id)
    CONSOLE.rule(f"[bold blue]{safe_id}")
    CONSOLE.print_json(json.dumps(document.metadata))
    # File content is arbitrary text. With markup enabled, "list[int]" loses
    # its "[int]" and an unmatched "[/x]" raises a markup error.
    CONSOLE.print(document.content, markup=False)
    return 0
153
+
154
+
155
def main(argv: Optional[Iterable[str]] = None) -> int:
    """Entry point of the ``fsvs`` command.

    Args:
        argv: Argument strings; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Exit code of the selected sub-command.
    """
    parser = build_parser()
    arguments = None if argv is None else list(argv)
    args = parser.parse_args(arguments)

    configure_logging(verbose=args.verbose)
    store = build_store(args)

    command = args.command
    if command == "query":
        return render_query_results(store, args.query, args.top_k, args.json)
    if command == "show":
        return render_document(store, args.document_id)

    # parser.error() exits the process; the return only satisfies the
    # declared return type.
    parser.error(f"Unknown command: {command}")
    return 2


if __name__ == "__main__":
    raise SystemExit(main())