mcp-kb 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
- mcp_kb/cli/__init__.py +1 -0
- mcp_kb/cli/args.py +175 -0
- mcp_kb/cli/main.py +181 -0
- mcp_kb/cli/reindex.py +113 -0
- mcp_kb/cli/runtime_config.py +421 -0
- mcp_kb/data/KNOWLEDBASE_DOC.md +151 -0
- mcp_kb/data/__init__.py +1 -0
- mcp_kb/ingest/__init__.py +1 -0
- mcp_kb/ingest/chroma.py +1287 -0
- mcp_kb/knowledge/__init__.py +1 -0
- mcp_kb/knowledge/bootstrap.py +44 -0
- mcp_kb/knowledge/events.py +105 -0
- mcp_kb/knowledge/search.py +177 -0
- mcp_kb/knowledge/store.py +294 -0
- mcp_kb/security/__init__.py +1 -0
- mcp_kb/security/path_validation.py +108 -0
- mcp_kb/server/__init__.py +1 -0
- mcp_kb/server/app.py +201 -0
- mcp_kb/ui/__init__.py +17 -0
- mcp_kb/ui/api.py +377 -0
- mcp_kb/ui/assets/assets/index.css +1 -0
- mcp_kb/ui/assets/index.html +62 -0
- mcp_kb/ui/server.py +332 -0
- mcp_kb/utils/__init__.py +1 -0
- mcp_kb/utils/filesystem.py +128 -0
- mcp_kb-0.3.3.dist-info/METADATA +338 -0
- mcp_kb-0.3.3.dist-info/RECORD +32 -0
- mcp_kb-0.3.1.dist-info/METADATA +0 -181
- mcp_kb-0.3.1.dist-info/RECORD +0 -7
- {mcp_kb-0.3.1.dist-info → mcp_kb-0.3.3.dist-info}/WHEEL +0 -0
- {mcp_kb-0.3.1.dist-info → mcp_kb-0.3.3.dist-info}/entry_points.txt +0 -0
- {mcp_kb-0.3.1.dist-info → mcp_kb-0.3.3.dist-info}/top_level.txt +0 -0
mcp_kb/ingest/chroma.py
ADDED
@@ -0,0 +1,1287 @@
"""Integration layer that mirrors knowledge base updates into ChromaDB."""

from __future__ import annotations

import importlib
import json
import logging
import pickle
import threading
from pathlib import Path
from bisect import bisect_right
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    List,
    Mapping,
    Optional,
    Set,
    Sequence,
    Tuple,
    Type,
    TypedDict,
    Literal,
)
from datetime import datetime, timezone
from threading import Timer

from chromadb.api.types import Documents, EmbeddingFunction, Embeddings

from langchain_text_splitters import TokenTextSplitter
from tqdm import tqdm
from pydantic import BaseModel, model_validator

from mcp_kb.config import DATA_FOLDER_NAME
from mcp_kb.knowledge.events import (
    FileDeleteEvent,
    FileUpsertEvent,
    KnowledgeBaseListener,
    KnowledgeBaseReindexListener,
)
from mcp_kb.knowledge.store import FileSegment


if TYPE_CHECKING:  # pragma: no cover - type checking only imports
    from chromadb.api import ClientAPI, GetResult
    from chromadb.api.models.Collection import Collection
    from mcp_kb.knowledge.store import KnowledgeBase
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    SentenceTransformer = None


def _import_sentence_transformer() -> Optional[Type[SentenceTransformer]]:
    """Return the ``SentenceTransformer`` class when the optional extra is installed."""
    try:
        from sentence_transformers import SentenceTransformer

        return SentenceTransformer
    except ImportError:
        return None


logger = logging.getLogger(__name__)


SUPPORTED_CLIENTS: Tuple[str, ...] = ("off", "ephemeral", "persistent", "http", "cloud")
"""Recognised client types exposed to operators enabling Chroma ingestion."""


class SentenceTransformerEmbedder(EmbeddingFunction):
    """Chroma embedding function backed by a lazily loaded sentence-transformers model."""

    def __init__(self, model_name: str):
        self._model_name = model_name
        if _import_sentence_transformer() is None:
            raise ValueError("SentenceTransformer is not installed")

    def __call__(self, input: Documents) -> Embeddings:
        # Load the model once per process and reuse it across calls.
        if self._model_name not in _SENTENCE_TRANSFORMER:
            _SENTENCE_TRANSFORMER[self._model_name] = SentenceTransformer(self._model_name)
        # Prefer the model's dedicated query prompt when one is defined.
        prompt_name = None
        if "query" in _SENTENCE_TRANSFORMER[self._model_name].prompts:
            prompt_name = "query"

        return _SENTENCE_TRANSFORMER[self._model_name].encode(
            input,
            prompt_name=prompt_name,
            # precision="int8"
            batch_size=4,
        )

    def name(self) -> str:
        return f"SentenceTransformerEmbedder-{self._model_name}"

    @classmethod
    def build_from_config(cls, config: Dict[str, Any]):
        return cls(config["model_name"])

    def get_config(self) -> Dict[str, Any]:
        # Store the bare model name so ``build_from_config`` can round-trip it.
        return {"model_name": self._model_name}
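
# Example: a minimal sketch of wiring this embedder into a standalone Chroma
# collection, assuming the optional ``sentence-transformers`` extra is
# installed and the "all-MiniLM-L6-v2" model is available locally or via the
# Hub. Inside this package the equivalent wiring happens in
# ``ChromaIngestor._ensure_collection`` below.
#
#     import chromadb
#
#     embedder = SentenceTransformerEmbedder("all-MiniLM-L6-v2")
#     client = chromadb.EphemeralClient()
#     collection = client.get_or_create_collection(
#         name="kb-demo",
#         embedding_function=embedder,
#     )
#     collection.add(ids=["doc-1"], documents=["hello knowledge base"])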


class ChromaFileSegment(FileSegment):
    """Represents a snippet of file content returned to MCP clients."""

    document_id: str
    chunk_number: int
    distance: Optional[float] = None
    chunk_id: Optional[str] = None
    umap2d: Optional[List[float]] = None
    umap3d: Optional[List[float]] = None

    @model_validator(mode="before")
    @classmethod
    def check_umap(cls, values: dict) -> dict:
        """Normalise UMAP metadata into fixed-length coordinate lists."""
        nan = float("nan")
        if "umap2d" in values:
            values["umap2d"] = json.loads(values["umap2d"])
        else:
            values["umap2d"] = [nan, nan]
        if "umap3d" in values:
            values["umap3d"] = json.loads(values["umap3d"])
        else:
            values["umap3d"] = [nan, nan, nan]

        # Clamp or pad the decoded lists so they always hold exactly two and
        # three coordinates respectively.
        if len(values["umap2d"]) > 2:
            values["umap2d"] = values["umap2d"][:2]
        if len(values["umap3d"]) > 3:
            values["umap3d"] = values["umap3d"][:3]
        while len(values["umap2d"]) < 2:
            values["umap2d"].append(nan)
        while len(values["umap3d"]) < 3:
            values["umap3d"].append(nan)

        # Scalar components, when present, take precedence over the encoded lists.
        if "umap2d_x" in values:
            values["umap2d"][0] = float(values["umap2d_x"])
        if "umap2d_y" in values:
            values["umap2d"][1] = float(values["umap2d_y"])
        if "umap3d_x" in values:
            values["umap3d"][0] = float(values["umap3d_x"])
        if "umap3d_y" in values:
            values["umap3d"][1] = float(values["umap3d_y"])
        if "umap3d_z" in values:
            values["umap3d"][2] = float(values["umap3d_z"])

        return values


_SENTENCE_TRANSFORMER: Dict[str, SentenceTransformer] = {}
"""Module-level cache of loaded sentence-transformer models, keyed by model name."""
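
# Example: a sketch of how ``check_umap`` rehydrates coordinates from stored
# Chroma metadata. Field values are hypothetical; the scalar extras are
# tolerated the same way the ``query`` path below relies on when it constructs
# segments from raw metadata.
#
#     segment = ChromaFileSegment(
#         document_id="kb::notes.md",
#         chunk_number=0,
#         path="notes.md",
#         start_line=0,
#         end_line=3,
#         content="# Notes",
#         umap2d="[0.12,0.34]",   # JSON string from metadata ...
#         umap2d_x=0.99,          # ... scalar override wins for x
#     )
#     segment.umap2d   # [0.99, 0.34]
#     segment.umap3d   # [nan, nan, nan], padded to length 3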

class ChromaConfiguration(BaseModel):
    """Runtime configuration controlling how Chroma ingestion behaves.

    Each attribute corresponds to either a CLI flag or an environment variable
    so that deployments can toggle Chroma synchronisation without changing the
    application code. The configuration intentionally stores already-normalised
    values (e.g., resolved paths and lowercase enums) so downstream components
    can rely on consistent semantics regardless of where the data originated.
    The resolved knowledge base root is kept in ``kb_root`` for features that
    need deterministic access to the filesystem layout.
    """

    client_type: str
    collection_name: str
    embedding: str
    data_directory: Optional[Path]
    kb_root: Path
    host: Optional[str]
    port: Optional[int]
    ssl: bool
    tenant: Optional[str]
    database: Optional[str]
    api_key: Optional[str]
    custom_auth_credentials: Optional[str]
    id_prefix: str
    sentence_transformer: Optional[str] = None
    chunk_size: int = 200
    chunk_overlap: int = 20

    @model_validator(mode="after")
    def check_sentence_transformer(self) -> "ChromaConfiguration":
        if self.sentence_transformer and _import_sentence_transformer() is not None:
            if self.sentence_transformer not in _SENTENCE_TRANSFORMER:
                from sentence_transformers.util import is_sentence_transformer_model

                if not is_sentence_transformer_model(self.sentence_transformer):
                    raise ValueError(
                        f"Invalid sentence transformer model: {self.sentence_transformer}"
                    )
        return self

    @property
    def enabled(self) -> bool:
        """Return ``True`` when ingestion should be activated."""

        return self.client_type != "off"

    @classmethod
    def from_options(
        cls,
        *,
        root: Path,
        client_type: str,
        collection_name: str,
        embedding: str,
        data_directory: Optional[str],
        host: Optional[str],
        port: Optional[int],
        ssl: bool,
        tenant: Optional[str],
        database: Optional[str],
        api_key: Optional[str],
        custom_auth_credentials: Optional[str],
        id_prefix: Optional[str],
        sentence_transformer: Optional[str] = None,
        chunk_size: int = 200,
        chunk_overlap: int = 20,
    ) -> "ChromaConfiguration":
        """Normalise CLI and environment inputs into a configuration object.

        Parameters
        ----------
        root:
            Absolute knowledge base root used to derive default directories. The
            resolved path is stored on the resulting configuration as
            ``kb_root`` for downstream components that need filesystem access.
        client_type:
            One of :data:`SUPPORTED_CLIENTS`. ``"off"`` disables ingestion.
        collection_name:
            Target Chroma collection that will store knowledge base documents.
        embedding:
            Name of the embedding function to instantiate. Values are matched
            case-insensitively to the functions exported by Chroma.
        data_directory:
            Optional directory for the persistent client. When omitted and the
            client type is ``"persistent"`` the function creates a ``chroma``
            sub-directory next to the knowledge base.
        host / port / ssl / tenant / database / api_key / custom_auth_credentials:
            Transport-specific settings passed directly to the Chroma client
            constructors.
        id_prefix:
            Optional prefix prepended to every document ID stored in Chroma.
            Defaults to ``"kb::"`` for readability.
        sentence_transformer:
            Optional name of a sentence transformer model to load when the
            ``sentence-transformers`` extra is installed. ``None`` keeps the
            default embedding factory untouched.
        """

        normalized_type = (client_type or "off").lower()
        if normalized_type not in SUPPORTED_CLIENTS:
            raise ValueError(f"Unsupported Chroma client type: {client_type}")

        resolved_directory: Optional[Path]
        if data_directory:
            resolved_directory = Path(data_directory).expanduser().resolve()
        elif normalized_type == "persistent":
            resolved_directory = (root / DATA_FOLDER_NAME / "chroma").resolve()
        else:
            resolved_directory = None

        if resolved_directory is not None:
            resolved_directory.mkdir(parents=True, exist_ok=True)

        prefix = id_prefix or "kb::"

        normalized_embedding = (embedding or "default").lower()

        config = cls(
            kb_root=root,
            client_type=normalized_type,
            collection_name=collection_name,
            embedding=normalized_embedding,
            data_directory=resolved_directory,
            host=host,
            port=port,
            ssl=ssl,
            tenant=tenant,
            database=database,
            api_key=api_key,
            custom_auth_credentials=custom_auth_credentials,
            id_prefix=prefix,
            sentence_transformer=sentence_transformer,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        config._validate()
        return config

    def _validate(self) -> None:
        """Validate the configuration and raise descriptive errors when invalid."""

        if not self.enabled:
            return

        if self.client_type == "persistent" and self.data_directory is None:
            raise ValueError("Persistent Chroma client requires a data directory")

        if self.client_type == "http" and not self.host:
            raise ValueError(
                "HTTP Chroma client requires --chroma-host or MCP_KB_CHROMA_HOST"
            )

        if self.client_type == "cloud":
            missing = [
                name
                for name, value in (
                    ("tenant", self.tenant),
                    ("database", self.database),
                    ("api_key", self.api_key),
                )
                if not value
            ]
            if missing:
                pretty = ", ".join(missing)
                raise ValueError(f"Cloud Chroma client requires values for: {pretty}")

        if not self.collection_name:
            raise ValueError("Collection name must be provided")

        if not self.embedding:
            raise ValueError("Embedding function name must be provided")
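
# Example: a minimal sketch of building a configuration for a local persistent
# deployment. Paths are hypothetical; every transport argument must be passed
# explicitly because ``from_options`` is keyword-only.
#
#     config = ChromaConfiguration.from_options(
#         root=Path("/srv/knowledge-base"),
#         client_type="persistent",
#         collection_name="kb",
#         embedding="default",
#         data_directory=None,   # defaults to <root>/<DATA_FOLDER_NAME>/chroma
#         host=None,
#         port=None,
#         ssl=False,
#         tenant=None,
#         database=None,
#         api_key=None,
#         custom_auth_credentials=None,
#         id_prefix=None,        # defaults to "kb::"
#     )
#     assert config.enabled and config.data_directory is not None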

class _ChromaDependencies(BaseModel):
    """Lazy import bundle containing the pieces needed to talk to ChromaDB."""

    chroma_module: Any
    settings_cls: Type[Any]
    embedding_factories: Mapping[str, Type[Any]]


def _load_dependencies() -> _ChromaDependencies:
    """Import ChromaDB lazily so the base server works without the dependency."""

    try:
        chroma_module = importlib.import_module("chromadb")
    except ModuleNotFoundError as exc:  # pragma: no cover - dependent on environment
        raise RuntimeError(
            "Chroma integration requested but the 'chromadb' package is not installed. "
            "Install chromadb via 'uv add chromadb' or disable ingestion."
        ) from exc

    config_module = importlib.import_module("chromadb.config")
    embedding_module = importlib.import_module("chromadb.utils.embedding_functions")

    factories: Dict[str, Type[Any]] = {}
    fallback_map = {
        "default": "DefaultEmbeddingFunction",
        "cohere": "CohereEmbeddingFunction",
        "openai": "OpenAIEmbeddingFunction",
        "jina": "JinaEmbeddingFunction",
        "voyageai": "VoyageAIEmbeddingFunction",
        "roboflow": "RoboflowEmbeddingFunction",
    }
    for alias, attr in fallback_map.items():
        if hasattr(embedding_module, attr):
            factories[alias] = getattr(embedding_module, attr)
    if not factories:
        raise RuntimeError(
            "No embedding functions were found in chromadb.utils.embedding_functions"
        )

    factories["sentence_transformer"] = SentenceTransformerEmbedder

    return _ChromaDependencies(
        chroma_module=chroma_module,
        settings_cls=getattr(config_module, "Settings"),
        embedding_factories=factories,
    )


def line_starts(s: str) -> list[int]:
    """Return a list of 0-based character offsets where each line starts."""
    starts = []
    pos = 0
    for line in s.splitlines(keepends=True):  # handles \n, \r\n, \r
        starts.append(pos)
        pos += len(line)
    starts.append(pos)  # sentinel so bisect can resolve the final line
    return starts


def char_to_line(char_idx: int, starts: list[int]) -> int:
    """Map a 0-based char index to a 0-based line number."""
    # bisect_right returns the index of the first start greater than char_idx,
    # so subtracting one yields the 0-based line containing the character.
    return bisect_right(starts, char_idx) - 1


def find_start_char(subtext: str, full_text: str) -> int:
    """Find the start character of ``subtext`` in ``full_text`` (-1 when absent)."""
    return full_text.find(subtext)
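
# Example: a quick sketch of how the helpers above cooperate to map a chunk's
# character offset back to line numbers (the sample text is hypothetical):
#
#     text = "alpha\nbeta\ngamma"
#     starts = line_starts(text)                 # [0, 6, 11, 16]
#     char_to_line(0, starts)                    # 0, first line
#     char_to_line(text.find("gamma"), starts)   # 2, third line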

class ChromaIngestor(KnowledgeBaseListener, KnowledgeBaseReindexListener):
    """Listener that mirrors knowledge base writes into a Chroma collection.

    The listener adheres to the :class:`KnowledgeBaseListener` protocol so it
    can be registered alongside other observers without coupling. Events are
    written synchronously to guarantee that indexing stays consistent with the
    underlying filesystem operations.
    """

    def __init__(self, configuration: ChromaConfiguration) -> None:
        """Create an ingestor bound to ``configuration``.

        Parameters
        ----------
        configuration:
            Sanitised :class:`ChromaConfiguration` describing how to connect to
            Chroma and which collection to mirror.
        """

        self.configuration = configuration
        self._deps = _load_dependencies()
        self._client = self._create_client()
        self._collection = self._ensure_collection()
        self.textsplitter = TokenTextSplitter(
            chunk_size=self.configuration.chunk_size,
            chunk_overlap=self.configuration.chunk_overlap,
            add_start_index=True,
            strip_whitespace=False,
        )
        # Optional UMAP integration is initialised lazily because the dependency
        # may be absent in environments such as Python 3.13 where wheels are not
        # yet available. The attributes are cached on the instance to avoid
        # repetitively loading models from disk.
        try:
            import umap  # type: ignore

            self._umap_mod = umap
        except Exception:
            self._umap_mod = None
        self._umap_dir = (
            self.configuration.kb_root / DATA_FOLDER_NAME / "umap"
        ).resolve()
        try:
            self._umap_dir.mkdir(parents=True, exist_ok=True)
        except Exception:
            # Directory creation failures should not break ingestion; training
            # simply becomes a no-op.
            pass
        self._umap_2d = None
        self._umap_3d = None
        self._umap_timer: Optional[Timer] = None
        self._umap_fit_lock = threading.Lock()
        self._reindex_lock = threading.Lock()
        if self._umap_mod:
            self._load_umap_models()

    def get_document_chunks(
        self, document_id: str, include: List[str] = ["metadatas", "documents"]
    ) -> GetResult:
        """Get all chunks belonging to ``document_id`` from the Chroma index."""
        return self._collection.get(where={"document_id": document_id}, include=include)

    def handle_upsert(self, event: FileUpsertEvent) -> None:
        """Upsert ``event`` into the configured Chroma collection.

        Every invocation removes any existing Chroma entry before inserting the
        fresh payload so that the embedding engine recomputes vectors using the
        latest markdown. The stored metadata keeps both absolute and relative
        paths, enabling downstream semantic search tools to surface references
        that point straight back into the knowledge base.
        """

        document_id = f"{self.configuration.id_prefix}{event.path}"
        relative = Path(event.path)
        self._reindex_document(document_id, event.content, relative)
        self._schedule_umap_refit()

    def delete_document(self, document_id: str) -> None:
        """Delete all chunks of ``document_id`` from the Chroma index."""
        self._collection.delete(
            ids=self.get_document_chunks(document_id, include=[])["ids"]
        )

    def handle_delete(self, event: FileDeleteEvent) -> None:
        """Remove documents associated with ``event`` from the Chroma index.

        Soft deletions translate to a straight removal because the PRD treats
        files carrying the delete sentinel as hidden from client tooling.
        """

        document_id = f"{self.configuration.id_prefix}{event.path}"
        try:
            self.delete_document(document_id)
        except Exception:  # pragma: no cover - depends on Chroma exceptions
            # Chroma raises a custom error when the ID is missing. Deletion should
            # be idempotent so we swallow those errors silently.
            pass
        self._schedule_umap_refit()

    @property
    def collection(self) -> "Collection":
        """Return the underlying Chroma collection for diagnostics and tests."""

        return self._collection
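
    # Example: chunk bookkeeping in a nutshell. All chunks of one source file
    # share a ``document_id`` in their metadata, so fetching or deleting a file
    # is a metadata-filtered operation (IDs shown are hypothetical):
    #
    #     chunks = ingestor.get_document_chunks("kb::notes.md")
    #     chunks["ids"]        # ["kb::notes.md-0", "kb::notes.md-1", ...]
    #     ingestor.delete_document("kb::notes.md")   # removes every chunk above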

    # UMAP helpers -------------------------------------------------------------

    def _umap_paths(self) -> Tuple[Path, Path, Path]:
        """Return the filesystem locations backing persisted UMAP state.

        The tuple contains the pickled 2D model, the pickled 3D model, and a
        companion JSON metadata file stored alongside them inside the knowledge
        base's ``.data/umap`` directory.
        """

        base = self.configuration.collection_name
        two_path = self._umap_dir / f"{base}-umap-2d.pkl"
        three_path = self._umap_dir / f"{base}-umap-3d.pkl"
        meta_path = self._umap_dir / f"{base}-umap-meta.json"
        return two_path, three_path, meta_path

    def _load_umap_models(self) -> bool:
        """Load persisted UMAP transformers into memory when present.

        Returns ``True`` when both the 2D and 3D models were unpickled
        successfully, otherwise leaves the cached attributes set to ``None`` so
        callers can fall back to on-demand refits.
        """

        if not self._umap_mod:
            return False
        two_path, three_path, _ = self._umap_paths()
        if not two_path.exists() or not three_path.exists():
            return False
        try:
            with two_path.open("rb") as fh:
                self._umap_2d = pickle.load(fh)
            with three_path.open("rb") as fh:
                self._umap_3d = pickle.load(fh)
            return True
        except Exception:
            self._umap_2d = None
            self._umap_3d = None
            return False

    def _save_umap_models(
        self,
        umap2d: Any,
        umap3d: Any,
        *,
        sample_count: int,
        dimensions: int,
        neighbors: int,
    ) -> None:
        """Persist trained UMAP models alongside a JSON metadata descriptor.

        The helper writes both pickles plus a human-readable JSON file so
        operators can audit when the layout was last refreshed and which
        hyperparameters were used during training.
        """

        two_path, three_path, meta_path = self._umap_paths()
        payload = {
            "trained_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "n_samples": sample_count,
            "n_dims": dimensions,
            "metric": "cosine",
            "neighbors": neighbors,
            "min_dist": 0.1,
        }
        try:
            with two_path.open("wb") as fh:
                pickle.dump(umap2d, fh)
            with three_path.open("wb") as fh:
                pickle.dump(umap3d, fh)
            meta_path.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
        except Exception as exc:  # pragma: no cover - persistence is best effort
            logger.exception("Failed to persist UMAP models", exc_info=exc)

    def _transform_umap(
        self,
        embeddings: List[List[float]],
    ) -> Tuple[Optional[List[List[float]]], Optional[List[List[float]]]]:
        """Project ``embeddings`` using cached models when available.

        The function returns coordinate lists for each embedding when both model
        instances are loaded; otherwise it yields ``(None, None)`` to signal that
        the caller should skip annotating metadata.
        """

        if embeddings is None or len(embeddings) == 0 or len(embeddings[0]) == 0:
            return None, None
        if self._umap_2d is None or self._umap_3d is None:
            self._load_umap_models()
        if self._umap_2d is None or self._umap_3d is None:
            return None, None
        try:
            raw2d = self._umap_2d.transform(embeddings)
            raw3d = self._umap_3d.transform(embeddings)
            coords2d = self._coerce_projection(raw2d)
            coords3d = self._coerce_projection(raw3d)
            return coords2d, coords3d
        except Exception:
            return None, None

    def _fetch_all_embeddings(
        self,
        batch_size: int = 256,
    ) -> Tuple[
        List[List[float]],
        List[str],
        List[str],
        List[Dict[str, Any]],
    ]:
        """Return embeddings, IDs, documents, and metadata from the collection.

        Chroma's pagination is consumed in batches to avoid overwhelming memory
        usage on large corpora while still returning native Python structures
        that are convenient for subsequent processing.
        """

        embeddings: List[List[float]] = []
        ids: List[str] = []
        documents: List[str] = []
        metadatas: List[Dict[str, Any]] = []
        offset = 0
        while True:
            payload = self._collection.get(  # type: ignore[no-untyped-call]
                include=["embeddings", "documents", "metadatas"],
                limit=batch_size,
                offset=offset,
            )
            got_ids = payload.get("ids")
            got_embs = payload.get("embeddings")
            got_docs = payload.get("documents")
            got_metas = payload.get("metadatas")
            if got_ids is None:
                got_ids = []
            if got_embs is None:
                got_embs = []
            if got_docs is None:
                got_docs = []
            if got_metas is None:
                got_metas = []
            if not got_ids:
                break
            for doc_id, emb, doc, meta in zip(
                got_ids,
                got_embs,
                got_docs,
                got_metas,
            ):
                try:
                    vector = (
                        [float(x) for x in emb.tolist()]
                        if hasattr(emb, "tolist")
                        else [float(x) for x in emb]
                    )
                except Exception:
                    continue
                embeddings.append(vector)
                ids.append(doc_id)
                documents.append(doc)
                metadatas.append(dict(meta or {}))
            offset += len(got_ids)
        return embeddings, ids, documents, metadatas

    def _coerce_projection(self, value: Any) -> List[List[float]]:
        """Return ``value`` materialised as a ``list[list[float]]`` structure.

        Numpy arrays, nested sequences, and other iterable containers are
        normalised by iterating over rows and coercing each element to a float.
        Invalid rows are skipped to keep the caller's downstream processing
        trivial.
        """

        if value is None:
            return []
        try:
            if hasattr(value, "tolist"):
                value = value.tolist()
        except Exception:
            pass
        result: List[List[float]] = []
        for row in value:
            try:
                if hasattr(row, "tolist"):
                    row = row.tolist()
                result.append([float(x) for x in row])
            except Exception:
                continue
        return result

    def _prepare_umap_metadata(
        self,
        coords2d: Sequence[float],
        coords3d: Sequence[float],
    ) -> Dict[str, Any]:
        """Build a metadata payload that satisfies Chroma's type constraints.

        The coordinates are stored as JSON-encoded strings (``"[x, y]"`` and
        ``"[x, y, z]"``) alongside individual scalar components for
        convenience. This keeps metadata values within Chroma's supported types
        while allowing downstream consumers to reconstruct dense vectors.
        """

        values2 = [float(v) for v in coords2d]
        values3 = [float(v) for v in coords3d]
        payload: Dict[str, Any] = {
            "umap2d": json.dumps(values2, separators=(",", ":")),
            "umap3d": json.dumps(values3, separators=(",", ":")),
        }
        if values2:
            payload["umap2d_x"] = values2[0]
        if len(values2) > 1:
            payload["umap2d_y"] = values2[1]
        if values3:
            payload["umap3d_x"] = values3[0]
        if len(values3) > 1:
            payload["umap3d_y"] = values3[1]
        if len(values3) > 2:
            payload["umap3d_z"] = values3[2]

        return payload
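
    # Example: the metadata round-trip in miniature. Coordinates leave as JSON
    # strings plus scalar components, and ``ChromaFileSegment.check_umap``
    # rebuilds the dense lists on the way back (values are hypothetical):
    #
    #     meta = ingestor._prepare_umap_metadata([0.1, 0.2], [0.1, 0.2, 0.3])
    #     meta["umap2d"]    # '[0.1,0.2]'
    #     meta["umap3d_z"]  # 0.3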

    def _refit_umap_and_update_all(self) -> None:
        """Train fresh UMAP models on the entire dataset and update metadata.

        The refit process acquires a short-lived lock so only one fit runs at a
        time, retrains both the 2D and 3D manifolds, persists the models, and
        finally propagates the coordinates back into the Chroma collection.
        """

        if not self._umap_mod:
            return
        if not self._umap_fit_lock.acquire(blocking=False):
            return
        try:
            embeddings, ids, documents, metadatas = self._fetch_all_embeddings()
            if len(embeddings) < 5:
                return
            neighbor_count = max(2, min(15, len(embeddings) - 1))
            umap_class = self._umap_mod.UMAP
            umap2d = umap_class(
                n_components=2,
                metric="cosine",
                n_neighbors=neighbor_count,
                min_dist=0.1,
                random_state=42,
            )
            umap3d = umap_class(
                n_components=3,
                metric="cosine",
                n_neighbors=neighbor_count,
                min_dist=0.1,
                random_state=42,
            )
            umap2d.fit(embeddings)
            umap3d.fit(embeddings)
            coords2d = self._coerce_projection(getattr(umap2d, "embedding_", []))
            coords3d = self._coerce_projection(getattr(umap3d, "embedding_", []))
            self._umap_2d = umap2d
            self._umap_3d = umap3d
            self._save_umap_models(
                umap2d,
                umap3d,
                sample_count=len(embeddings),
                dimensions=len(embeddings[0]),
                neighbors=neighbor_count,
            )
            batch = 128
            for start in range(0, len(ids), batch):
                end = start + batch
                batch_ids = ids[start:end]
                batch_coords2d = coords2d[start:end]
                batch_coords3d = coords3d[start:end]
                batch_documents = documents[start:end]
                batch_metas: List[Dict[str, Any]] = []
                for idx, meta in enumerate(metadatas[start:end]):
                    updated = dict(meta or {})
                    c2 = batch_coords2d[idx]
                    c3 = batch_coords3d[idx]
                    updated.update(self._prepare_umap_metadata(c2, c3))
                    batch_metas.append(updated)
                try:
                    self._collection.update(  # type: ignore[no-untyped-call]
                        ids=batch_ids,
                        metadatas=batch_metas,
                    )
                except Exception:
                    try:
                        self._collection.delete(ids=batch_ids)  # type: ignore[no-untyped-call]
                        self._collection.add(  # type: ignore[no-untyped-call]
                            ids=batch_ids,
                            documents=batch_documents,
                            metadatas=batch_metas,
                        )
                    except Exception as exc:  # pragma: no cover - chroma variant specific
                        logger.exception("Failed to update UMAP metadata", exc_info=exc)
        finally:
            self._umap_fit_lock.release()

    def _schedule_umap_refit(self, delay: float = 3.0) -> None:
        """Debounce refits to avoid repeated fits during rapid edit bursts.

        Each call cancels any pending timer and schedules a new daemon thread,
        ensuring background fits eventually run without blocking foreground
        ingestion work.
        """

        if not self._umap_mod:
            return
        if self._umap_timer is not None:
            try:
                self._umap_timer.cancel()
            except Exception:
                pass
        self._umap_timer = Timer(delay, self._refit_umap_and_update_all)
        self._umap_timer.daemon = True
        self._umap_timer.start()

    def start_reindex_async(self, kb: "KnowledgeBase") -> bool:
        """Spawn a background thread that reindexes ``kb`` without blocking.

        The method acquires an internal lock to ensure only one reindex task
        runs at a time. It returns ``True`` when a new task was scheduled and
        ``False`` when another invocation is still processing documents.
        """

        if not self._reindex_lock.acquire(blocking=False):
            return False

        def _run() -> None:
            try:
                self.reindex(kb)
            finally:
                try:
                    self._reindex_lock.release()
                except RuntimeError:
                    # The lock should always be held, but guard against edge cases.
                    pass

        thread = threading.Thread(target=_run, name="kb-reindex", daemon=True)
        thread.start()
        return True

    def trigger_umap_refit_async(self) -> bool:
        """Schedule an immediate background UMAP refit when the dependency is available."""

        if not self._umap_mod:
            return False
        if self._umap_timer is not None:
            try:
                self._umap_timer.cancel()
            except Exception:
                pass
            self._umap_timer = None

        thread = threading.Thread(
            target=self._refit_umap_and_update_all,
            name="kb-umap-refit",
            daemon=True,
        )
        thread.start()
        return True

    @staticmethod
    def _convert_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Convert metadata keys to match the ``ChromaFileSegment`` model (backwards compatibility)."""
        if "relative_path" in metadata:
            metadata["path"] = metadata.pop("relative_path")
        if "startline" in metadata:
            metadata["start_line"] = metadata.pop("startline")
        if "endline" in metadata:
            metadata["end_line"] = metadata.pop("endline")
        return metadata
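
    # Example: the debounce behaviour with hypothetical timings. Three rapid
    # upserts collapse into a single refit roughly ``delay`` seconds after the
    # last call:
    #
    #     ingestor._schedule_umap_refit()   # t=0.0  timer armed
    #     ingestor._schedule_umap_refit()   # t=0.5  previous timer cancelled
    #     ingestor._schedule_umap_refit()   # t=1.0  cancelled again, re-armed
    #     # t=4.0: _refit_umap_and_update_all() runs once in a daemon thread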

    def query(
        self, query: str, *, n_results: int = 5
    ) -> Tuple[List[ChromaFileSegment], Dict[str, Any]]:
        """Return structured query results from the configured collection.

        Parameters
        ----------
        query:
            Natural language string used to compute the semantic embedding.
        n_results:
            Maximum number of results to return. Defaults to five to mirror the
            behaviour surfaced through the MCP search tool.

        Returns
        -------
        tuple[list[ChromaFileSegment], dict[str, Any]]
            The matching segments, each carrying the chunk ``content``, its
            metadata fields, and a floating-point ``distance`` score when
            provided by Chroma, plus a query-metadata dictionary holding the
            raw query embedding and its UMAP projections.
        """

        query_meta: Dict[str, Any] = {}
        embs = self._collection._embedding_function([query])

        payload = self._collection.query(
            query_embeddings=embs,
            n_results=n_results,
            include=["metadatas", "documents", "distances"],
        )

        query_embeddings = [float(x) for x in embs[0]]
        # Project the query embedding into the cached 2D and 3D UMAP layouts.
        query_embeddings2d, query_embeddings3d = self._transform_umap([query_embeddings])
        if query_embeddings2d is not None:
            query_meta["query_embeddings_umap2d"] = query_embeddings2d[0]
        if query_embeddings3d is not None:
            query_meta["query_embeddings_umap3d"] = query_embeddings3d[0]
        query_meta["query_embeddings"] = query_embeddings

        docids = payload.get("ids", [[]])[0]
        documents = payload.get("documents", [[]])[0]
        metadatas = payload.get("metadatas", [[]])[0]
        distances = payload.get("distances", [[]])[0]

        if not documents or not documents[0]:
            return [], query_meta

        results: List[ChromaFileSegment] = []

        for docid, metadata, document, distance in zip(docids, metadatas, documents, distances):
            metadata = self._convert_metadata(metadata)

            results.append(
                ChromaFileSegment(
                    **metadata,
                    content=document,
                    distance=distance,
                    chunk_id=docid,
                )
            )

        return results, query_meta
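
    # Example: a minimal sketch of calling ``query`` directly (result contents
    # are hypothetical):
    #
    #     segments, meta = ingestor.query("how do I configure chroma?", n_results=3)
    #     for seg in segments:
    #         print(seg.path, seg.start_line, seg.distance)
    #     meta["query_embeddings"]             # raw query vector
    #     meta.get("query_embeddings_umap2d")  # present once UMAP models exist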

    # Optional search extension -------------------------------------------------

    def search(
        self,
        kb: "KnowledgeBase",
        query: str,
        *,
        context_lines: int = 2,
        limit: Optional[int] = None,
    ) -> Tuple[List[ChromaFileSegment], Dict[str, Any]]:
        """Translate semantic query results into :class:`ChromaFileSegment` objects.

        Results whose source file no longer exists trigger an orphan cleanup
        followed by a retry of the search.
        """

        max_results = limit or 5
        records, query_meta = self.query(query, n_results=max_results)
        matches: List[ChromaFileSegment] = []

        to_delete = set()

        for record in records:
            candidate = self._resolve_candidate_path(
                kb,
                record.path,
            )

            if candidate is None:
                to_delete.add(record.chunk_id)
                continue

            matches.append(record)

            if limit is not None and len(matches) >= limit:
                break

        if to_delete:
            # Stale chunks were detected: purge orphans and run the search again.
            self._find_orphaned_documents(kb, remove=True)

            return self.search(kb, query, context_lines=context_lines, limit=limit)

        return matches, query_meta

    # Internal helpers ----------------------------------------------------------

    def _find_orphaned_documents(self, kb: "KnowledgeBase", remove: bool = True) -> Set[str]:
        """Find chunks in the Chroma collection whose source file is gone from the knowledge base."""

        try:
            count = self._collection.count()
            to_delete = set()
            with tqdm(total=count, desc="Finding orphaned documents") as pbar:
                for i in range(0, count, 10):
                    batch = self._collection.get(include=["metadatas"], limit=10, offset=i)
                    for chunk_id, metadata in zip(batch.get("ids", []), batch.get("metadatas", [])):
                        path = metadata.get("path")
                        if path and not kb.rules.root.joinpath(path).exists():
                            to_delete.add(chunk_id)
                    pbar.update(10)
            if remove:
                for chunk_id in to_delete:
                    self.collection.delete(ids=[chunk_id])
            return to_delete
        except Exception as e:
            logger.exception(e)
            return set()
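
    # Example: how an MCP search tool layer might invoke ``search`` (the ``kb``
    # instance comes from mcp_kb.knowledge.store; values shown are
    # hypothetical):
    #
    #     matches, meta = ingestor.search(kb, "retention policy", limit=3)
    #     for m in matches:
    #         print(f"{m.path}:{m.start_line}-{m.end_line}  d={m.distance:.3f}")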

    def _reindex_document(
        self,
        document_id: str,
        content: str,
        path: Path,
    ) -> None:
        """Replace the stored document so embeddings are recomputed.

        Reindexing involves removing any stale record before inserting the new
        payload. Some Chroma backends keep historical data around when ``add``
        is invoked with an existing ID, so the deletion step ensures the stored
        embedding always reflects the latest markdown contents. ``metadata`` is
        copied to break accidental references held by callers.
        """

        try:
            # Remove every chunk whose metadata carries this document_id.
            self.delete_document(document_id)
        except Exception:  # pragma: no cover - depends on Chroma exception types
            # Missing IDs are not an error; most clients raise when attempting to
            # delete a non-existent record. We swallow those errors to keep the
            # reindexing path idempotent.
            pass

        # Empty documents should not be added to Chroma. After the delete above
        # there is nothing else to do for empty payloads.
        if not content.strip():
            return

        # Split content into chunks suitable for embedding. When the splitter
        # returns no chunks (e.g., content is whitespace), skip the add call to
        # avoid Chroma errors about empty lists.
        split_docs = self.textsplitter.create_documents([content])
        if not split_docs:
            return

        starts = line_starts(content)

        chunks: List[ChromaFileSegment] = []
        chunk_texts: List[str] = []
        for i, d in enumerate(split_docs):
            start_char = d.metadata["start_index"]  # 0-based char offset in original content
            if start_char < 0:
                start_char = find_start_char(d.page_content, content)
            start_line = char_to_line(start_char, starts)

            end_char_excl = start_char + len(d.page_content)  # exclusive end
            end_line = char_to_line(max(0, end_char_excl - 1), starts)

            file_segment = ChromaFileSegment(
                document_id=document_id,
                path=str(path),
                start_line=start_line,
                end_line=end_line,
                content=d.page_content,
                chunk_number=i,
            )
            chunks.append(file_segment)
            chunk_texts.append(d.page_content)

        ids: List[str] = []
        contents: List[str] = []
        metadatas: List[Dict[str, Any]] = []
        for idx, d in enumerate(chunks):
            # Use Pydantic's exclude_none to drop optional fields (e.g. distance)
            # because Chroma's metadata schema rejects None values.
            dump = d.model_dump(exclude_none=True)
            dump.pop("umap2d", None)
            dump.pop("umap3d", None)
            dump.pop("umap2d_x", None)
            dump.pop("umap2d_y", None)
            dump.pop("umap3d_x", None)
            dump.pop("umap3d_y", None)
            dump.pop("umap3d_z", None)
            chunk_id = f"{d.document_id}-{d.chunk_number}"
            ids.append(chunk_id)
            contents.append(dump.pop("content"))

            metadatas.append(dump)

        self._collection.add(
            documents=contents,
            metadatas=metadatas,
            ids=ids,
        )

        # Annotate the freshly added chunks with UMAP coordinates when models exist.
        payload = self._collection.get(ids=ids, include=["embeddings"])
        chunk_embeddings = payload.get("embeddings", [])
        coords2d, coords3d = self._transform_umap(chunk_embeddings)
        if coords2d is not None and coords3d is not None:
            for idx, (c2, c3) in enumerate(zip(coords2d, coords3d)):
                metadatas[idx].update(self._prepare_umap_metadata(c2, c3))
            self._collection.update(ids=ids, metadatas=metadatas)
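
    # Example: the chunk-ID scheme produced above for a document stored at
    # ``notes/todo.md`` with the default ``kb::`` prefix (chunk count is
    # hypothetical):
    #
    #     kb::notes/todo.md-0
    #     kb::notes/todo.md-1
    #     kb::notes/todo.md-2
    #
    # ``delete_document`` later matches these chunks via the ``document_id``
    # metadata field rather than by reconstructing the ID strings.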

    # Optional full reindex -----------------------------------------------------

    def reindex(self, kb: "KnowledgeBase") -> int:
        """Rebuild the Chroma index from the current knowledge base state.

        The method iterates over all active markdown files visible to the
        provided knowledge base instance, computing a deterministic document ID
        for each path using the configured ``id_prefix``. Each file is read from
        disk and upserted into the underlying Chroma collection by delegating to
        :meth:`_reindex_document`, ensuring embeddings are recomputed.

        Parameters
        ----------
        kb:
            The :class:`~mcp_kb.knowledge.store.KnowledgeBase` providing access
            to the validated filesystem and utility methods.

        Returns
        -------
        int
            The number of documents processed during the reindex run.
        """

        count = 0
        root = kb.rules.root

        # Clear previous KB documents from the collection. Some Chroma backends
        # do not support regex filters; use substring containment on our stable
        # metadata field instead.
        try:
            self._collection.delete(  # type: ignore[no-untyped-call]
                where={"document_id": {"$contains": f"{self.configuration.id_prefix}"}}
            )
        except Exception:
            # As a fallback, attempt a two-step delete by IDs when supported.
            try:
                payload = self._collection.get(  # type: ignore[no-untyped-call]
                    where={"document_id": {"$contains": f"{self.configuration.id_prefix}"}},
                    include=[],
                )
                ids = payload.get("ids", []) or []
                if ids:
                    self._collection.delete(ids=ids)  # type: ignore[no-untyped-call]
            except Exception:
                # If clearing fails, proceed with reindexing; upserts are idempotent.
                pass

        with tqdm(
            kb.iter_active_files(include_docs=False),
            desc="Reindexing Chroma",
            total=kb.total_active_files(include_docs=False),
        ) as pbar:
            for path in pbar:
                pbar.set_description(f"Reindexing Chroma {path.name}")
                try:
                    content = path.read_text(encoding="utf-8")
                except FileNotFoundError:  # pragma: no cover - race with external edits
                    continue

                relative = path.relative_to(root)
                document_id = f"{self.configuration.id_prefix}{relative}"

                self._reindex_document(document_id, content, relative)
                count += 1

        try:
            self._refit_umap_and_update_all()
        except Exception as exc:  # pragma: no cover - refit is best effort
            logger.exception("Failed to refit UMAP models after reindex", exc_info=exc)
        return count

    def _resolve_candidate_path(
        self,
        kb: "KnowledgeBase",
        relative: Optional[str],
    ) -> Optional[Path]:
        """Translate metadata hints into a validated path inside ``kb``."""

        if not relative:
            return None

        candidate = (kb.rules.root / relative).resolve()

        try:
            candidate.relative_to(kb.rules.root)
        except ValueError:
            return None

        if not candidate.exists():
            return None

        return candidate
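
    # Example: a full rebuild can be driven either synchronously or in the
    # background (``kb`` construction elided; this is a sketch):
    #
    #     processed = ingestor.reindex(kb)            # blocking, returns a count
    #     started = ingestor.start_reindex_async(kb)  # False if one is running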

    def _create_client(self) -> "ClientAPI":
        """Instantiate the proper Chroma client based on configuration.

        The method supports all transport modes referenced in the user
        requirements. It constructs the minimal set of keyword arguments for the
        chosen backend and lets Chroma's client validate the final configuration.
        """

        chroma = self._deps.chroma_module
        config = self.configuration

        if not config.enabled:
            raise RuntimeError(
                "ChromaIngestor cannot be constructed when ingestion is disabled"
            )

        settings = chroma.Settings(anonymized_telemetry=False)

        if config.client_type == "ephemeral":
            return chroma.EphemeralClient(settings=settings)

        if config.client_type == "persistent":
            return chroma.PersistentClient(path=str(config.data_directory), settings=settings)

        if config.client_type in {"http", "cloud"}:
            kwargs: Dict[str, Any] = {
                "ssl": config.ssl if config.client_type == "http" else True,
            }
            if config.client_type == "http":
                kwargs["host"] = config.host
                if config.port is not None:
                    kwargs["port"] = config.port
                if config.custom_auth_credentials:
                    kwargs["settings"] = self._deps.settings_cls(
                        chroma_client_auth_provider="chromadb.auth.basic_authn.BasicAuthClientProvider",
                        chroma_client_auth_credentials=config.custom_auth_credentials,
                    )
            else:  # cloud
                kwargs["host"] = config.host or "api.trychroma.com"
                kwargs["tenant"] = config.tenant
                kwargs["database"] = config.database
                kwargs.setdefault("headers", {})
                kwargs["headers"]["x-chroma-token"] = config.api_key

            return chroma.HttpClient(**kwargs)

        raise ValueError(f"Unsupported client type: {config.client_type}")

    def _ensure_collection(self) -> "Collection":
        """Create or return the configured Chroma collection."""

        factory = self._deps.embedding_factories.get(self.configuration.embedding)
        if factory is None:
            available = ", ".join(sorted(self._deps.embedding_factories))
            raise ValueError(
                f"Unknown embedding function '{self.configuration.embedding}'. "
                f"Available options: {available}"
            )
        if issubclass(factory, SentenceTransformerEmbedder):
            embedding_function = factory(self.configuration.sentence_transformer)
        else:
            embedding_function = factory()
        metadata = {"source": "mcp-knowledge-base"}
        client = self._client
        try:
            return client.get_or_create_collection(
                name=self.configuration.collection_name,
                metadata=metadata,
                embedding_function=embedding_function,
            )
        except TypeError:
            # Older Chroma versions expect CreateCollectionConfiguration. Retry
            # without the metadata argument for compatibility.
            return client.get_or_create_collection(
                name=self.configuration.collection_name,
                embedding_function=embedding_function,
            )