swarmauri_vectorstore_fs 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarmauri_vectorstore_fs-0.1.0/PKG-INFO +83 -0
- swarmauri_vectorstore_fs-0.1.0/README.md +59 -0
- swarmauri_vectorstore_fs-0.1.0/pyproject.toml +82 -0
- swarmauri_vectorstore_fs-0.1.0/swarmauri_vectorstore_fs/BM25FScorer.py +142 -0
- swarmauri_vectorstore_fs-0.1.0/swarmauri_vectorstore_fs/FsVectorStore.py +414 -0
- swarmauri_vectorstore_fs-0.1.0/swarmauri_vectorstore_fs/__init__.py +14 -0
- swarmauri_vectorstore_fs-0.1.0/swarmauri_vectorstore_fs/cli.py +171 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: swarmauri_vectorstore_fs
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Swarmauri filesystem-aware BM25F vector store
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Keywords: swarmauri,vectorstore,filesystem,bm25f,chunks,retrieval
|
|
7
|
+
Author: Jacob Stewart
|
|
8
|
+
Author-email: jacob@swarmauri.com
|
|
9
|
+
Requires-Python: >=3.10,<3.13
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Natural Language :: English
|
|
15
|
+
Classifier: Development Status :: 3 - Alpha
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
18
|
+
Requires-Dist: rich (>=13.9.4)
|
|
19
|
+
Requires-Dist: swarmauri_base
|
|
20
|
+
Requires-Dist: swarmauri_core
|
|
21
|
+
Requires-Dist: swarmauri_standard
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+

|
|
25
|
+
|
|
26
|
+
<p align="center">
|
|
27
|
+
<a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
|
|
28
|
+
<img src="https://img.shields.io/pypi/pyversions/swarmauri_vectorstore_fs" alt="PyPI - Python Version"/></a>
|
|
29
|
+
<a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
|
|
30
|
+
<img src="https://img.shields.io/pypi/l/swarmauri_vectorstore_fs" alt="PyPI - License"/></a>
|
|
31
|
+
<a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
|
|
32
|
+
<img src="https://img.shields.io/pypi/v/swarmauri_vectorstore_fs?label=swarmauri_vectorstore_fs&color=green" alt="PyPI - swarmauri_vectorstore_fs"/></a>
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
# Swarmauri Vectorstore FS
|
|
38
|
+
|
|
39
|
+
A Swarmauri community vector store that indexes filesystem trees for BM25F retrieval over file paths, file names, extensions, chunk identity, and file content.
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- Filesystem-aware retrieval with weighted BM25F fields
|
|
44
|
+
- Chunk, file, and chunk-plus-file indexing modes
|
|
45
|
+
- Stable chunk identity metadata for global, path-level, and file-level chunk numbering
|
|
46
|
+
- CLI for ad hoc lexical search over source trees and document corpora
|
|
47
|
+
- No embedding vocabulary dependency for query handling
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install swarmauri_vectorstore_fs
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from swarmauri_vectorstore_fs import FsVectorStore
|
|
59
|
+
|
|
60
|
+
store = FsVectorStore(root_path=".", mode="chunk")
|
|
61
|
+
store.build_index()
|
|
62
|
+
results = store.retrieve("vector store registration", top_k=3)
|
|
63
|
+
|
|
64
|
+
for document in results:
|
|
65
|
+
print(document.id, document.metadata["relative_path"])
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## CLI
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
fsvs --root . query --query "vector store registration" --top-k 5
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
To inspect a specific retrieved document:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
fsvs --root . show --document-id <document-id>
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Want to help?
|
|
81
|
+
|
|
82
|
+
If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md).
|
|
83
|
+
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+

|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
|
|
5
|
+
<img src="https://img.shields.io/pypi/pyversions/swarmauri_vectorstore_fs" alt="PyPI - Python Version"/></a>
|
|
6
|
+
<a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
|
|
7
|
+
<img src="https://img.shields.io/pypi/l/swarmauri_vectorstore_fs" alt="PyPI - License"/></a>
|
|
8
|
+
<a href="https://pypi.org/project/swarmauri_vectorstore_fs/">
|
|
9
|
+
<img src="https://img.shields.io/pypi/v/swarmauri_vectorstore_fs?label=swarmauri_vectorstore_fs&color=green" alt="PyPI - swarmauri_vectorstore_fs"/></a>
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
# Swarmauri Vectorstore FS
|
|
15
|
+
|
|
16
|
+
A Swarmauri community vector store that indexes filesystem trees for BM25F retrieval over file paths, file names, extensions, chunk identity, and file content.
|
|
17
|
+
|
|
18
|
+
## Features
|
|
19
|
+
|
|
20
|
+
- Filesystem-aware retrieval with weighted BM25F fields
|
|
21
|
+
- Chunk, file, and chunk-plus-file indexing modes
|
|
22
|
+
- Stable chunk identity metadata for global, path-level, and file-level chunk numbering
|
|
23
|
+
- CLI for ad hoc lexical search over source trees and document corpora
|
|
24
|
+
- No embedding vocabulary dependency for query handling
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install swarmauri_vectorstore_fs
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Usage
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from swarmauri_vectorstore_fs import FsVectorStore
|
|
36
|
+
|
|
37
|
+
store = FsVectorStore(root_path=".", mode="chunk")
|
|
38
|
+
store.build_index()
|
|
39
|
+
results = store.retrieve("vector store registration", top_k=3)
|
|
40
|
+
|
|
41
|
+
for document in results:
|
|
42
|
+
print(document.id, document.metadata["relative_path"])
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## CLI
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
fsvs --root . query --query "vector store registration" --top-k 5
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
To inspect a specific retrieved document:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
fsvs --root . show --document-id <document-id>
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Want to help?
|
|
58
|
+
|
|
59
|
+
If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md).
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "swarmauri_vectorstore_fs"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Swarmauri filesystem-aware BM25F vector store"
|
|
5
|
+
license = "Apache-2.0"
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
repository = "http://github.com/swarmauri/swarmauri-sdk"
|
|
8
|
+
requires-python = ">=3.10,<3.13"
|
|
9
|
+
classifiers = [
|
|
10
|
+
"License :: OSI Approved :: Apache Software License",
|
|
11
|
+
"Programming Language :: Python :: 3.10",
|
|
12
|
+
"Programming Language :: Python :: 3.11",
|
|
13
|
+
"Programming Language :: Python :: 3.12",
|
|
14
|
+
"Natural Language :: English",
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
|
18
|
+
]
|
|
19
|
+
authors = [{ name = "Jacob Stewart", email = "jacob@swarmauri.com" }]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"rich>=13.9.4",
|
|
22
|
+
"swarmauri_core",
|
|
23
|
+
"swarmauri_base",
|
|
24
|
+
"swarmauri_standard",
|
|
25
|
+
]
|
|
26
|
+
keywords = [
|
|
27
|
+
"swarmauri",
|
|
28
|
+
"vectorstore",
|
|
29
|
+
"filesystem",
|
|
30
|
+
"bm25f",
|
|
31
|
+
"chunks",
|
|
32
|
+
"retrieval",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[tool.uv.sources]
|
|
36
|
+
swarmauri_core = { workspace = true }
|
|
37
|
+
swarmauri_base = { workspace = true }
|
|
38
|
+
swarmauri_standard = { workspace = true }
|
|
39
|
+
|
|
40
|
+
[tool.pytest.ini_options]
|
|
41
|
+
norecursedirs = ["combined", "scripts"]
|
|
42
|
+
markers = [
|
|
43
|
+
"test: standard test",
|
|
44
|
+
"unit: Unit tests",
|
|
45
|
+
"i9n: Integration tests",
|
|
46
|
+
"r8n: Regression tests",
|
|
47
|
+
"timeout: mark test to timeout after X seconds",
|
|
48
|
+
"xpass: Expected passes",
|
|
49
|
+
"xfail: Expected failures",
|
|
50
|
+
"acceptance: Acceptance tests",
|
|
51
|
+
"perf: Performance tests that measure execution time and resource usage",
|
|
52
|
+
]
|
|
53
|
+
timeout = 300
|
|
54
|
+
log_cli = true
|
|
55
|
+
log_cli_level = "INFO"
|
|
56
|
+
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
|
|
57
|
+
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
|
|
58
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
59
|
+
|
|
60
|
+
[project.entry-points.'swarmauri.vector_stores']
|
|
61
|
+
FsVectorStore = "swarmauri_vectorstore_fs.FsVectorStore:FsVectorStore"
|
|
62
|
+
|
|
63
|
+
[project.scripts]
|
|
64
|
+
fsvs = "swarmauri_vectorstore_fs.cli:main"
|
|
65
|
+
|
|
66
|
+
[build-system]
|
|
67
|
+
requires = ["poetry-core>=1.0.0"]
|
|
68
|
+
build-backend = "poetry.core.masonry.api"
|
|
69
|
+
|
|
70
|
+
[dependency-groups]
|
|
71
|
+
dev = [
|
|
72
|
+
"pytest>=8.0",
|
|
73
|
+
"pytest-asyncio>=0.24.0",
|
|
74
|
+
"pytest-xdist>=3.6.1",
|
|
75
|
+
"pytest-json-report>=1.5.0",
|
|
76
|
+
"python-dotenv",
|
|
77
|
+
"requests>=2.32.3",
|
|
78
|
+
"flake8>=7.0",
|
|
79
|
+
"pytest-timeout>=2.3.1",
|
|
80
|
+
"ruff>=0.9.9",
|
|
81
|
+
"pytest-benchmark>=4.0.0",
|
|
82
|
+
]
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import re
|
|
3
|
+
from collections import Counter, defaultdict
|
|
4
|
+
from typing import Dict, Iterable, List, Tuple
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Per-field boosts that BM25FScorer falls back to when the caller supplies no
# weights. Each weight multiplies the length-normalized frequency of a term in
# that field, so a match in a path-derived field counts for more than the same
# match in the body text.
DEFAULT_FIELD_WEIGHTS: Dict[str, float] = {
    "file_name": 5.0,
    "relative_path": 4.0,
    "directory_path": 2.5,
    "file_extension": 1.5,
    "chunk_identity": 1.0,
    "content": 1.0,
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BM25FScorer:
|
|
18
|
+
def __init__(
|
|
19
|
+
self,
|
|
20
|
+
field_weights: Dict[str, float] | None = None,
|
|
21
|
+
k1: float = 1.2,
|
|
22
|
+
b: float = 0.75,
|
|
23
|
+
) -> None:
|
|
24
|
+
self.field_weights = dict(field_weights or DEFAULT_FIELD_WEIGHTS)
|
|
25
|
+
self.k1 = k1
|
|
26
|
+
self.b = b
|
|
27
|
+
self.document_fields: List[Dict[str, str]] = []
|
|
28
|
+
self._field_term_frequencies: List[Dict[str, Counter[str]]] = []
|
|
29
|
+
self._field_lengths: List[Dict[str, int]] = []
|
|
30
|
+
self._average_field_lengths: Dict[str, float] = {}
|
|
31
|
+
self._document_frequencies: Counter[str] = Counter()
|
|
32
|
+
|
|
33
|
+
@staticmethod
|
|
34
|
+
def tokenize(text: str) -> List[str]:
|
|
35
|
+
tokens: List[str] = []
|
|
36
|
+
for raw_token in re.findall(r"[A-Za-z0-9_]+", text.replace("\\", "/")):
|
|
37
|
+
token = raw_token.lower()
|
|
38
|
+
if token:
|
|
39
|
+
tokens.append(token)
|
|
40
|
+
parts = re.findall(
|
|
41
|
+
r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)|[0-9]+", raw_token.replace("_", " ")
|
|
42
|
+
)
|
|
43
|
+
for part in parts:
|
|
44
|
+
lowered = part.lower()
|
|
45
|
+
if lowered and lowered != token:
|
|
46
|
+
tokens.append(lowered)
|
|
47
|
+
return tokens
|
|
48
|
+
|
|
49
|
+
def fit(self, document_fields: Iterable[Dict[str, str]]) -> None:
|
|
50
|
+
self.document_fields = [dict(fields) for fields in document_fields]
|
|
51
|
+
self._field_term_frequencies = []
|
|
52
|
+
self._field_lengths = []
|
|
53
|
+
self._average_field_lengths = {}
|
|
54
|
+
self._document_frequencies = Counter()
|
|
55
|
+
|
|
56
|
+
field_length_totals: Dict[str, int] = defaultdict(int)
|
|
57
|
+
for fields in self.document_fields:
|
|
58
|
+
doc_frequencies: Dict[str, Counter[str]] = {}
|
|
59
|
+
doc_lengths: Dict[str, int] = {}
|
|
60
|
+
doc_terms = set()
|
|
61
|
+
for field_name in self.field_weights:
|
|
62
|
+
tokens = self.tokenize(fields.get(field_name, ""))
|
|
63
|
+
frequencies = Counter(tokens)
|
|
64
|
+
doc_frequencies[field_name] = frequencies
|
|
65
|
+
doc_lengths[field_name] = len(tokens)
|
|
66
|
+
field_length_totals[field_name] += len(tokens)
|
|
67
|
+
doc_terms.update(frequencies)
|
|
68
|
+
self._field_term_frequencies.append(doc_frequencies)
|
|
69
|
+
self._field_lengths.append(doc_lengths)
|
|
70
|
+
for term in doc_terms:
|
|
71
|
+
self._document_frequencies[term] += 1
|
|
72
|
+
|
|
73
|
+
doc_count = max(1, len(self.document_fields))
|
|
74
|
+
self._average_field_lengths = {
|
|
75
|
+
field_name: field_length_totals[field_name] / doc_count
|
|
76
|
+
for field_name in self.field_weights
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
def search(self, query: str, top_k: int = 5) -> List[Tuple[int, float]]:
|
|
80
|
+
query_terms = self.tokenize(query)
|
|
81
|
+
if not query_terms or not self.document_fields:
|
|
82
|
+
return []
|
|
83
|
+
|
|
84
|
+
scores = [
|
|
85
|
+
(index, self._score_document(index, query_terms))
|
|
86
|
+
for index in range(len(self.document_fields))
|
|
87
|
+
]
|
|
88
|
+
matches = [(index, score) for index, score in scores if score > 0.0]
|
|
89
|
+
matches.sort(key=lambda item: item[1], reverse=True)
|
|
90
|
+
return matches[:top_k]
|
|
91
|
+
|
|
92
|
+
def _score_document(self, document_index: int, query_terms: List[str]) -> float:
|
|
93
|
+
score = 0.0
|
|
94
|
+
for term in query_terms:
|
|
95
|
+
document_frequency = self._document_frequencies.get(term, 0)
|
|
96
|
+
if document_frequency == 0:
|
|
97
|
+
continue
|
|
98
|
+
weighted_frequency = self._weighted_term_frequency(document_index, term)
|
|
99
|
+
if weighted_frequency <= 0:
|
|
100
|
+
continue
|
|
101
|
+
score += self._idf(document_frequency) * (
|
|
102
|
+
(weighted_frequency * (self.k1 + 1.0)) / (self.k1 + weighted_frequency)
|
|
103
|
+
)
|
|
104
|
+
return score
|
|
105
|
+
|
|
106
|
+
def _weighted_term_frequency(self, document_index: int, term: str) -> float:
|
|
107
|
+
weighted_frequency = 0.0
|
|
108
|
+
for field_name, weight in self.field_weights.items():
|
|
109
|
+
field_frequency = self._field_term_frequencies[document_index][
|
|
110
|
+
field_name
|
|
111
|
+
].get(term, 0)
|
|
112
|
+
if field_frequency == 0:
|
|
113
|
+
continue
|
|
114
|
+
field_length = self._field_lengths[document_index].get(field_name, 0)
|
|
115
|
+
average_length = self._average_field_lengths.get(field_name, 1.0) or 1.0
|
|
116
|
+
normalized_length = 1.0 - self.b + self.b * (field_length / average_length)
|
|
117
|
+
weighted_frequency += weight * (field_frequency / normalized_length)
|
|
118
|
+
return weighted_frequency
|
|
119
|
+
|
|
120
|
+
def _idf(self, document_frequency: int) -> float:
|
|
121
|
+
doc_count = len(self.document_fields)
|
|
122
|
+
return math.log(
|
|
123
|
+
1.0 + ((doc_count - document_frequency + 0.5) / (document_frequency + 0.5))
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
def to_dict(self) -> Dict:
|
|
127
|
+
return {
|
|
128
|
+
"field_weights": self.field_weights,
|
|
129
|
+
"k1": self.k1,
|
|
130
|
+
"b": self.b,
|
|
131
|
+
"document_fields": self.document_fields,
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
@classmethod
|
|
135
|
+
def from_dict(cls, payload: Dict) -> "BM25FScorer":
|
|
136
|
+
scorer = cls(
|
|
137
|
+
field_weights=payload.get("field_weights"),
|
|
138
|
+
k1=payload.get("k1", 1.2),
|
|
139
|
+
b=payload.get("b", 0.75),
|
|
140
|
+
)
|
|
141
|
+
scorer.fit(payload.get("document_fields", []))
|
|
142
|
+
return scorer
|
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, Iterable, List, Literal, Optional, Tuple, Union
|
|
5
|
+
|
|
6
|
+
from pydantic import Field, PrivateAttr
|
|
7
|
+
from swarmauri_base.ComponentBase import ComponentBase
|
|
8
|
+
from swarmauri_base.vector_stores.VectorStoreBase import VectorStoreBase
|
|
9
|
+
from swarmauri_base.vector_stores.VectorStoreRetrieveMixin import (
|
|
10
|
+
VectorStoreRetrieveMixin,
|
|
11
|
+
)
|
|
12
|
+
from swarmauri_standard.documents.Document import Document
|
|
13
|
+
|
|
14
|
+
from .BM25FScorer import BM25FScorer
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Indexing granularity accepted by FsVectorStore.mode: one document per chunk,
# one per file, or both.
ModeLiteral = Literal["chunk", "file", "chunk_file"]
# Glob patterns that are always excluded from indexing, in addition to any
# user-supplied ``exclude`` patterns. Each directory is listed twice: once for
# a match at the root of the tree and once for a match nested below it.
DEFAULT_EXCLUDE_PATTERNS = (
    ".git/**",
    "**/.git/**",
    "__pycache__/**",
    "**/__pycache__/**",
    ".venv/**",
    "**/.venv/**",
    "node_modules/**",
    "**/node_modules/**",
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@ComponentBase.register_type(VectorStoreBase, "FsVectorStore")
class FsVectorStore(VectorStoreRetrieveMixin, VectorStoreBase):
    """Index text files under a directory tree and retrieve them with BM25F.

    Files below ``root_path`` are decoded as UTF-8, split into overlapping
    character chunks, and wrapped in ``Document`` objects whose metadata
    carries the path, position and per-field text used for ranking. Scoring
    is delegated to ``BM25FScorer``.
    """

    type: Literal["FsVectorStore"] = "FsVectorStore"
    # Directory to index; required by build_index().
    root_path: Optional[str] = None
    # Which documents build_index() emits: chunks, whole files, or both.
    mode: ModeLiteral = "chunk"
    # Chunk length and overlap between consecutive chunks, in characters.
    chunk_size: int = 1200
    chunk_overlap: int = 120
    # Glob patterns matched against the root-relative path or the file name.
    # A non-empty ``include`` restricts indexing to matching files.
    include: Tuple[str, ...] = Field(default=())
    exclude: Tuple[str, ...] = Field(default=())
    # Files larger than this many bytes are skipped as "oversized".
    max_file_size: int = 1_000_000
    # Build the index inside __init__ when root_path is set.
    auto_index: bool = False
    # Skip counters plus "indexed_documents" from the last build_index() run.
    index_metadata: Dict[str, int] = Field(default_factory=dict)

    _scorer = PrivateAttr()
    _document_map = PrivateAttr()

    def __init__(self, **kwargs):
        """Create an empty store and optionally index ``root_path`` at once.

        The store always starts without documents. When ``auto_index`` is
        true and ``root_path`` is set, ``build_index`` runs immediately.
        """
        super().__init__(**kwargs)
        self._scorer = BM25FScorer()
        self._document_map: Dict[str, Document] = {}
        self.documents = []
        if self.auto_index and self.root_path:
            self.build_index()

    def add_document(self, document: Document) -> None:
        """Insert or replace *document* by id and rebuild the scorer."""
        self._document_map[document.id] = document
        self.documents = list(self._document_map.values())
        self._reindex_documents()

    def add_documents(self, documents: List[Document]) -> None:
        """Insert or replace several documents, rebuilding the scorer once."""
        for document in documents:
            self._document_map[document.id] = document
        self.documents = list(self._document_map.values())
        self._reindex_documents()

    def get_document(self, id: str) -> Union[Document, None]:
        """Return the document stored under *id*, or ``None`` if absent."""
        return self._document_map.get(id)

    def get_all_documents(self) -> List[Document]:
        """Return a new list holding every stored document."""
        return list(self.documents)

    def update_document(self, id: str, updated_document: Document) -> None:
        """Replace the document stored under *id* and rebuild the scorer.

        Raises:
            KeyError: If no document is stored under *id*.
        """
        if id not in self._document_map:
            raise KeyError(f"Document '{id}' not found")
        self._document_map[id] = updated_document
        self.documents = list(self._document_map.values())
        self._reindex_documents()

    def delete_document(self, id: str) -> None:
        """Remove the document stored under *id*; unknown ids are ignored."""
        self._document_map.pop(id, None)
        self.documents = list(self._document_map.values())
        self._reindex_documents()

    def clear_documents(self) -> None:
        """Drop every document and reset the scorer."""
        self.documents = []
        self._document_map = {}
        self._scorer = BM25FScorer()

    def build_index(self) -> None:
        """Scan ``root_path`` and replace the store contents with the result.

        Emits one document per chunk and/or one per file depending on
        ``mode``, and records skip counters and the number of indexed
        documents in ``index_metadata``.

        Raises:
            ValueError: If ``root_path`` is unset or the chunk settings are
                invalid.
            FileNotFoundError: If ``root_path`` does not exist.
            NotADirectoryError: If ``root_path`` is not a directory.
        """
        root = self._validate_root()
        documents: List[Document] = []
        skip_counts = {
            "binary": 0,
            "oversized": 0,
            "unreadable": 0,
            "excluded": 0,
        }
        # Running position across every emitted document, chunk or file.
        global_index = 0

        for file_index, path in enumerate(
            self._iter_indexable_files(root, skip_counts)
        ):
            relative_path = path.relative_to(root).as_posix()
            text = self._read_text(path, skip_counts)
            if text is None:
                continue

            # Chunks are computed in every mode because the file-level
            # document uses len(chunks) as its chunk_path_index.
            chunks = self._chunk_text(text)
            if self.mode in {"chunk", "chunk_file"}:
                for chunk_file_index, (chunk, start_line, end_line) in enumerate(
                    chunks
                ):
                    document = self._build_document(
                        relative_path=relative_path,
                        file_index=file_index,
                        chunk_global_index=global_index,
                        chunk_path_index=chunk_file_index,
                        chunk_file_index=chunk_file_index,
                        start_line=start_line,
                        end_line=end_line,
                        document_kind="chunk",
                        content=chunk,
                    )
                    documents.append(document)
                    global_index += 1

            if self.mode in {"file", "chunk_file"}:
                document = self._build_document(
                    relative_path=relative_path,
                    file_index=file_index,
                    chunk_global_index=global_index,
                    chunk_path_index=len(chunks),
                    # -1 marks a whole-file document rather than a chunk.
                    chunk_file_index=-1,
                    start_line=1,
                    end_line=self._line_count(text),
                    document_kind="file",
                    content=text,
                )
                documents.append(document)
                global_index += 1

        self.index_metadata = {
            **skip_counts,
            "indexed_documents": len(documents),
        }
        self._document_map = {document.id: document for document in documents}
        self.documents = documents
        self._reindex_documents()

    def refresh_index(self) -> None:
        """Rebuild the index from disk; equivalent to ``build_index``."""
        self.build_index()

    def retrieve(self, query: str, top_k: int = 5) -> List[Document]:
        """Return up to *top_k* documents for *query*, best match first."""
        return [document for document, _score in self.search(query, top_k=top_k)]

    def query(self, text: str, top_k: int = 5) -> List[Document]:
        """Alias of ``retrieve``."""
        return self.retrieve(text, top_k=top_k)

    def search(self, query: str, top_k: int = 5) -> List[Tuple[Document, float]]:
        """Return ``(document, score)`` pairs for *query*, best match first.

        If the store is empty and ``root_path`` is set, the index is built
        first.
        """
        if not self.documents and self.root_path:
            self.build_index()
        results = self._scorer.search(query, top_k=top_k)
        # Scorer indices are positions in self.documents.
        return [(self.documents[index], score) for index, score in results]

    def save_store(self, directory_path: str) -> None:
        """Write settings, scorer state and documents to ``fs_vectorstore.json``.

        The directory is created if it does not exist.
        """
        output_dir = Path(directory_path)
        output_dir.mkdir(parents=True, exist_ok=True)
        payload = {
            "root_path": self.root_path,
            "mode": self.mode,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "include": list(self.include),
            "exclude": list(self.exclude),
            "max_file_size": self.max_file_size,
            "index_metadata": self.index_metadata,
            "bm25f": self._scorer.to_dict(),
            "documents": [
                document.model_dump(mode="json") for document in self.documents
            ],
        }
        (output_dir / "fs_vectorstore.json").write_text(
            json.dumps(payload, indent=2), encoding="utf-8"
        )

    def load_store(self, directory_path: str) -> None:
        """Restore settings, documents and scorer from ``fs_vectorstore.json``.

        Raises:
            KeyError: If the payload has no ``documents`` entry.
        """
        payload = json.loads(
            (Path(directory_path) / "fs_vectorstore.json").read_text(encoding="utf-8")
        )
        self.root_path = payload.get("root_path")
        self.mode = payload.get("mode", "chunk")
        self.chunk_size = payload.get("chunk_size", 1200)
        self.chunk_overlap = payload.get("chunk_overlap", 120)
        self.include = tuple(payload.get("include", []))
        self.exclude = tuple(payload.get("exclude", []))
        self.max_file_size = payload.get("max_file_size", 1_000_000)
        self.index_metadata = payload.get("index_metadata", {})
        self.documents = [
            Document.model_validate(document) for document in payload["documents"]
        ]
        self._document_map = {document.id: document for document in self.documents}
        self._scorer = BM25FScorer.from_dict(payload.get("bm25f", {}))
        # search() maps scorer positions directly onto self.documents, so a
        # scorer payload that is missing or out of step with the documents is
        # rebuilt from the documents instead of being trusted.
        persisted_fields = self._scorer.document_fields
        if not persisted_fields or len(persisted_fields) != len(self.documents):
            self._reindex_documents()

    def _reindex_documents(self) -> None:
        """Replace the scorer with one fitted on the current documents."""
        self._scorer = BM25FScorer()
        self._scorer.fit(
            [self._fields_for_document(document) for document in self.documents]
        )

    def _validate_root(self) -> Path:
        """Check ``root_path`` and the chunk settings; return the root path.

        Raises:
            ValueError: If ``root_path`` is unset, ``chunk_size`` is not
                positive, or ``chunk_overlap`` is negative or not smaller
                than ``chunk_size``.
            FileNotFoundError: If ``root_path`` does not exist.
            NotADirectoryError: If ``root_path`` is not a directory.
        """
        if not self.root_path:
            raise ValueError("root_path is required to build the filesystem index")
        root = Path(self.root_path)
        if not root.exists():
            raise FileNotFoundError(f"Root path '{self.root_path}' does not exist")
        if not root.is_dir():
            raise NotADirectoryError(f"Root path '{self.root_path}' is not a directory")
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be greater than zero")
        if self.chunk_overlap < 0:
            raise ValueError("chunk_overlap must be zero or greater")
        if self.chunk_overlap >= self.chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")
        return root

    def _iter_indexable_files(
        self, root: Path, skip_counts: Dict[str, int]
    ) -> Iterable[Path]:
        """Yield files under *root* in sorted order after include/exclude filtering.

        Each filtered-out file increments ``skip_counts["excluded"]``.
        """
        # Sorting keeps file_index values reproducible between runs.
        for path in sorted(root.rglob("*")):
            if not path.is_file():
                continue
            relative_path = path.relative_to(root).as_posix()
            if self.include and not self._matches_any(relative_path, self.include):
                skip_counts["excluded"] += 1
                continue
            if self._matches_any(
                relative_path, (*DEFAULT_EXCLUDE_PATTERNS, *self.exclude)
            ):
                skip_counts["excluded"] += 1
                continue
            yield path

    def _read_text(self, path: Path, skip_counts: Dict[str, int]) -> Optional[str]:
        """Return the UTF-8 text of *path*, or ``None`` if the file is skipped.

        Skips are counted in *skip_counts* as "oversized" (larger than
        ``max_file_size``), "unreadable" (``OSError``), or "binary" (contains
        a NUL byte or is not valid UTF-8).
        """
        try:
            size = path.stat().st_size
            if size > self.max_file_size:
                skip_counts["oversized"] += 1
                return None
            data = path.read_bytes()
        except OSError:
            skip_counts["unreadable"] += 1
            return None

        if b"\x00" in data:
            skip_counts["binary"] += 1
            return None

        try:
            return data.decode("utf-8")
        except UnicodeDecodeError:
            skip_counts["binary"] += 1
            return None

    def _chunk_text(self, text: str) -> List[Tuple[str, int, int]]:
        """Split *text* into overlapping character windows.

        Returns:
            ``(chunk, start_line, end_line)`` tuples with 1-based, inclusive
            line numbers. Empty text yields a single empty chunk on line 1.
        """
        if not text:
            return [("", 1, 1)]

        chunks: List[Tuple[str, int, int]] = []
        start = 0
        text_length = len(text)
        while start < text_length:
            end = min(text_length, start + self.chunk_size)
            chunk = text[start:end]
            start_line = text.count("\n", 0, start) + 1
            # ``end`` is exclusive, so the last character of the chunk sits at
            # ``end - 1``. Counting only the newlines before it keeps a chunk
            # that ends with a newline on its own line instead of the next
            # one, which matches _line_count().
            end_line = text.count("\n", 0, end - 1) + 1
            chunks.append((chunk, start_line, end_line))
            if end == text_length:
                break
            # max(..., start + 1) guarantees forward progress.
            start = max(end - self.chunk_overlap, start + 1)
        return chunks

    def _build_document(
        self,
        *,
        relative_path: str,
        file_index: int,
        chunk_global_index: int,
        chunk_path_index: int,
        chunk_file_index: int,
        start_line: int,
        end_line: int,
        document_kind: Literal["chunk", "file"],
        content: str,
    ) -> Document:
        """Create the ``Document`` for one chunk or one whole file.

        The id has the form ``fs:<file_index>:<document_kind>:<chunk_path_index>``.
        The metadata holds the path parts, indices, line range and the
        per-field text consumed by the scorer.
        """
        path = Path(relative_path)
        file_name = path.name
        file_extension = path.suffix
        # Files at the root have parent ".", which is stored as "".
        directory_path = path.parent.as_posix() if path.parent.as_posix() != "." else ""
        bm25_fields = self._build_bm25_fields(
            relative_path=relative_path,
            directory_path=directory_path,
            file_name=file_name,
            file_extension=file_extension,
            file_index=file_index,
            chunk_global_index=chunk_global_index,
            chunk_path_index=chunk_path_index,
            chunk_file_index=chunk_file_index,
            content=content,
        )
        metadata = {
            "relative_path": relative_path,
            "file_index": file_index,
            "file_name": file_name,
            "file_extension": file_extension,
            "chunk_global_index": chunk_global_index,
            "chunk_path_index": chunk_path_index,
            "chunk_file_index": chunk_file_index,
            "start_line": start_line,
            "end_line": end_line,
            "document_kind": document_kind,
            "bm25_fields": bm25_fields,
        }
        return Document(
            id=f"fs:{file_index}:{document_kind}:{chunk_path_index}",
            content=self._composite_content(metadata, content),
            metadata=metadata,
        )

    def _build_bm25_fields(
        self,
        *,
        relative_path: str,
        directory_path: str,
        file_name: str,
        file_extension: str,
        file_index: int,
        chunk_global_index: int,
        chunk_path_index: int,
        chunk_file_index: int,
        content: str,
    ) -> Dict[str, str]:
        """Return the field-name to text mapping that the scorer indexes.

        ``chunk_identity`` spells every index both as "<label> <n>" and as
        "<prefix>_<n>".
        """
        chunk_identity = " ".join(
            [
                f"file_index {file_index}",
                f"file_{file_index}",
                f"chunk_global_index {chunk_global_index}",
                f"chunk_global_{chunk_global_index}",
                f"chunk_path_index {chunk_path_index}",
                f"chunk_path_{chunk_path_index}",
                f"chunk_file_index {chunk_file_index}",
                f"chunk_file_{chunk_file_index}",
            ]
        )
        return {
            "file_name": file_name,
            "relative_path": relative_path,
            "directory_path": directory_path,
            "file_extension": file_extension,
            "chunk_identity": chunk_identity,
            "content": content,
        }

    def _fields_for_document(self, document: Document) -> Dict[str, str]:
        """Return the scorer fields for *document*.

        Uses ``metadata["bm25_fields"]`` when it is a dict; otherwise derives
        the fields from the remaining metadata and the document content.
        """
        fields = document.metadata.get("bm25_fields")
        if isinstance(fields, dict):
            return {field: str(value) for field, value in fields.items()}
        # Fallback for documents without precomputed scorer fields.
        return {
            "file_name": str(document.metadata.get("file_name", "")),
            "relative_path": str(document.metadata.get("relative_path", "")),
            "directory_path": str(
                Path(str(document.metadata.get("relative_path", ""))).parent
            ),
            "file_extension": str(document.metadata.get("file_extension", "")),
            "chunk_identity": " ".join(
                str(document.metadata.get(key, ""))
                for key in (
                    "file_index",
                    "chunk_global_index",
                    "chunk_path_index",
                    "chunk_file_index",
                )
            ),
            "content": document.content,
        }

    def _composite_content(self, metadata: Dict, content: str) -> str:
        """Prefix *content* with a "key: value" header built from *metadata*."""
        return "\n".join(
            [
                f"file_index: {metadata['file_index']}",
                f"file_path: {metadata['relative_path']}",
                f"file_name: {metadata['file_name']}",
                f"file_extension: {metadata['file_extension']}",
                f"chunk_global_index: {metadata['chunk_global_index']}",
                f"chunk_path_index: {metadata['chunk_path_index']}",
                f"chunk_file_index: {metadata['chunk_file_index']}",
                f"document_kind: {metadata['document_kind']}",
                "content:",
                content,
            ]
        )

    def _matches_any(self, relative_path: str, patterns: Tuple[str, ...]) -> bool:
        """Return True if the path or its file name matches any glob pattern."""
        path = relative_path.replace("\\", "/")
        name = Path(path).name
        return any(
            fnmatch.fnmatch(path, pattern) or fnmatch.fnmatch(name, pattern)
            for pattern in patterns
        )

    def _line_count(self, text: str) -> int:
        """Return the number of lines in *text*; empty text counts as one line."""
        if not text:
            return 1
        # A trailing newline terminates the last line rather than opening a
        # new one.
        return text.count("\n") + (0 if text.endswith("\n") else 1)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .BM25FScorer import BM25FScorer
|
|
2
|
+
from .FsVectorStore import FsVectorStore
|
|
3
|
+
|
|
4
|
+
# Names re-exported as this package's public API.
__all__ = ["BM25FScorer", "FsVectorStore"]
|
|
5
|
+
|
|
6
|
+
# importlib.metadata is in the standard library on every interpreter this
# package supports (Requires-Python >= 3.10), so the importlib_metadata
# backport fallback could never run and is not a declared dependency.
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("swarmauri_vectorstore_fs")
except PackageNotFoundError:
    # No installed distribution metadata was found for this package;
    # fall back to a placeholder version string.
    __version__ = "0.0.0"
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import json
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Iterable, Optional
|
|
5
|
+
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.logging import RichHandler
|
|
8
|
+
from rich.table import Table
|
|
9
|
+
|
|
10
|
+
from .FsVectorStore import FsVectorStore
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Logger used by the CLI functions below; configure_logging() installs the
# handler on the root logger.
LOGGER = logging.getLogger("swarmauri.fs_vectorstore")
|
|
14
|
+
# Shared Rich console used to print result tables, JSON payloads and document bodies.
CONSOLE = Console()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def configure_logging(verbose: bool = False) -> None:
    """Install a Rich logging handler on the root logger.

    Args:
        verbose: Emit DEBUG records when true; otherwise INFO and above.

    Log records are written to stderr so that stdout carries only command
    output (for example the ``--json`` payload) and stays machine-readable.
    ``force=True`` replaces any handlers already attached to the root logger.
    """
    level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(message)s",
        datefmt="[%X]",
        handlers=[
            RichHandler(
                # Without an explicit console RichHandler uses Rich's global
                # console, which writes to stdout and would interleave log
                # lines with the JSON/table output.
                console=Console(stderr=True),
                rich_tracebacks=True,
                markup=True,
            )
        ],
        force=True,
    )
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the filesystem BM25F tool.

    Global indexing options are defined on the top-level parser; the
    required ``command`` sub-parser offers ``query`` and ``show``.
    """
    cli = argparse.ArgumentParser(
        description="BM25F retrieval over filesystem files and chunks."
    )

    # Global options, registered in the order they should appear in --help.
    global_options = (
        ("--root", {"default": ".", "help": "Root directory to index"}),
        (
            "--mode",
            {
                "choices": ["chunk", "file", "chunk_file"],
                "default": "chunk",
                "help": "Indexing mode",
            },
        ),
        (
            "--chunk-size",
            {"type": int, "default": 1200, "help": "Chunk size in characters"},
        ),
        (
            "--chunk-overlap",
            {"type": int, "default": 120, "help": "Chunk overlap in characters"},
        ),
        (
            "--include",
            {"action": "append", "dest": "include", "help": "Glob to include"},
        ),
        (
            "--exclude",
            {"action": "append", "dest": "exclude", "help": "Glob to exclude"},
        ),
        (
            "--max-file-size",
            {
                "type": int,
                "default": 1_000_000,
                "help": "Maximum file size in bytes to index",
            },
        ),
        ("--verbose", {"action": "store_true", "help": "Enable debug logging"}),
    )
    for flag, options in global_options:
        cli.add_argument(flag, **options)

    commands = cli.add_subparsers(dest="command", required=True)

    # "query": rank indexed documents against a query string.
    query_cmd = commands.add_parser(
        "query", help="Query indexed filesystem documents"
    )
    query_cmd.add_argument("--query", required=True, help="Query text")
    query_cmd.add_argument(
        "--top-k", type=int, default=5, help="Number of hits to return"
    )
    query_cmd.add_argument("--json", action="store_true", help="Emit JSON output")

    # "show": print a single document by its id.
    show_cmd = commands.add_parser("show", help="Print a document by id")
    show_cmd.add_argument(
        "--document-id", required=True, help="Document id to display"
    )

    return cli
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def build_store(args: argparse.Namespace) -> FsVectorStore:
    """Construct an ``FsVectorStore`` from parsed CLI options and index it.

    Args:
        args: Namespace produced by the CLI parser; the ``root``, ``mode``,
            ``chunk_size``, ``chunk_overlap``, ``include``, ``exclude`` and
            ``max_file_size`` attributes are read.

    Returns:
        The store after ``build_index()`` has been called on it.
    """
    # Local import keeps this fix self-contained: log messages are rendered
    # with Rich markup enabled, so user-supplied text must be escaped first.
    from rich.markup import escape

    store = FsVectorStore(
        root_path=args.root,
        mode=args.mode,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        # argparse leaves "append" options as None when the flag is absent.
        include=tuple(args.include or ()),
        exclude=tuple(args.exclude or ()),
        max_file_size=args.max_file_size,
    )
    LOGGER.info(
        "Building filesystem BM25F index [bold cyan](root=%s, mode=%s)[/bold cyan]",
        # A root such as "app/[id]" or "x[/y]" would otherwise be parsed as a
        # markup tag and be dropped from the message or raise MarkupError.
        escape(str(args.root)),
        args.mode,
    )
    store.build_index()
    LOGGER.info("Indexed [bold green]%s[/bold green] documents", len(store.documents))
    return store
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def render_query_results(
    store: FsVectorStore, query: str, top_k: int, as_json: bool
) -> int:
    """Run ``query`` against ``store`` and print the hits.

    Args:
        store: Store to search.
        query: Query text passed to ``store.search``.
        top_k: Maximum number of hits requested from the store.
        as_json: Print a JSON array instead of a table when true.

    Returns:
        Always ``0``; an empty result set only produces a warning.
    """
    # Local import keeps this fix self-contained: the logger and table cells
    # are rendered with Rich markup enabled, so arbitrary text (queries, ids,
    # paths such as "app/[id]/page.tsx") must be escaped or it is parsed as
    # markup tags and either vanishes from the output or raises MarkupError.
    from rich.markup import escape

    safe_query = escape(query)
    LOGGER.info(
        "Running BM25F retrieval for query [bold yellow]%s[/bold yellow]", safe_query
    )
    results = store.search(query, top_k=top_k)
    if not results:
        LOGGER.warning(
            "No lexical matches found for query [bold yellow]%s[/bold yellow]",
            safe_query,
        )

    if as_json:
        # JSON output carries the raw values; no markup escaping applies here.
        payload = [
            {
                "id": document.id,
                "score": score,
                "metadata": document.metadata,
                "content": document.content,
            }
            for document, score in results
        ]
        CONSOLE.print_json(json.dumps(payload))
        return 0

    table = Table(title=f"Fs Vector Store Results ({len(results)} hits)")
    table.add_column("Document ID", style="cyan", overflow="fold")
    table.add_column("Score", style="green")
    table.add_column("Kind", style="magenta")
    table.add_column("Path", style="white", overflow="fold")
    table.add_column("Lines", style="blue")

    for document, score in results:
        metadata = document.metadata
        table.add_row(
            escape(str(document.id)),
            f"{score:.4f}",
            escape(str(metadata.get("document_kind", ""))),
            escape(str(metadata.get("relative_path", ""))),
            f"{metadata.get('start_line')}:{metadata.get('end_line')}",
        )
    CONSOLE.print(table)
    return 0
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def render_document(store: FsVectorStore, document_id: str) -> int:
    """Print the metadata and content of one document.

    Args:
        store: Store to look the document up in.
        document_id: Identifier passed to ``store.get_document``.

    Returns:
        ``0`` when the document was printed, ``1`` when the lookup returned
        nothing.
    """
    # Local import keeps this fix self-contained: ids are interpolated into
    # Rich markup strings and must be escaped so bracketed text is shown
    # literally instead of being parsed as a tag.
    from rich.markup import escape

    safe_id = escape(str(document_id))
    document = store.get_document(document_id)
    if not document:
        LOGGER.error("Document [bold red]%s[/bold red] was not found", safe_id)
        return 1

    LOGGER.info("Printing document [bold cyan]%s[/bold cyan]", safe_id)
    CONSOLE.rule(f"[bold blue]{safe_id}")
    CONSOLE.print_json(json.dumps(document.metadata))
    # The content is arbitrary file text: print it verbatim rather than
    # letting Rich interpret "[...]" as markup or ":name:" as an emoji code.
    CONSOLE.print(document.content, markup=False, emoji=False)
    return 0
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def main(argv: Optional[Iterable[str]] = None) -> int:
    """Entry point for the CLI.

    Parses ``argv`` (or the process arguments when ``argv`` is ``None``),
    configures logging, builds the index and runs the selected sub-command.

    Returns:
        The exit status produced by the sub-command handler.
    """
    parser = build_parser()
    cli_args = None if argv is None else list(argv)
    args = parser.parse_args(cli_args)
    configure_logging(verbose=args.verbose)
    store = build_store(args)

    # Handlers are wrapped in lambdas so only the selected one reads its
    # sub-command-specific attributes from ``args``.
    handlers = {
        "query": lambda: render_query_results(
            store, args.query, args.top_k, args.json
        ),
        "show": lambda: render_document(store, args.document_id),
    }
    handler = handlers.get(args.command)
    if handler is None:
        parser.error(f"Unknown command: {args.command}")
        return 2
    return handler()
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|