mcp-kb 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcp_kb/ingest/chroma.py DELETED
@@ -1,610 +0,0 @@
1
- """Integration layer that mirrors knowledge base updates into ChromaDB."""
2
-
3
- from __future__ import annotations
4
-
5
- import importlib
6
- from dataclasses import dataclass
7
- from pathlib import Path
8
- from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, Type, TYPE_CHECKING
9
- from langchain_text_splitters import TokenTextSplitter
10
- from tqdm import tqdm
11
-
12
- from mcp_kb.config import DATA_FOLDER_NAME
13
- from mcp_kb.knowledge.events import (
14
- FileDeleteEvent,
15
- FileUpsertEvent,
16
- KnowledgeBaseListener,
17
- KnowledgeBaseReindexListener,
18
- )
19
- from mcp_kb.knowledge.search import SearchMatch
20
-
21
- if TYPE_CHECKING: # pragma: no cover - type checking only imports
22
- from chromadb.api import ClientAPI, GetResult
23
- from chromadb.api.models.Collection import Collection
24
- from mcp_kb.knowledge.store import KnowledgeBase
25
-
26
# Closed set of values accepted for the Chroma client type; ``"off"`` is the
# value that disables ingestion.
SUPPORTED_CLIENTS: Tuple[str, ...] = (
    "off",
    "ephemeral",
    "persistent",
    "http",
    "cloud",
)
"""Recognised client types exposed to operators enabling Chroma ingestion."""
28
-
29
-
30
@dataclass(frozen=True)
class ChromaConfiguration:
    """Immutable settings describing whether and how Chroma ingestion runs.

    Instances are normally produced by :meth:`from_options`, which lowercases
    the client type and embedding name, resolves the data directory and fills
    in the default ID prefix. Consumers can therefore rely on already
    normalised values no matter whether they came from CLI flags or
    environment variables.
    """

    # One of SUPPORTED_CLIENTS; "off" means ingestion is disabled.
    client_type: str
    # Name of the Chroma collection that receives the documents.
    collection_name: str
    # Lower-cased name of the embedding function to instantiate.
    embedding: str
    # Resolved storage directory (required by the persistent client).
    data_directory: Optional[Path]
    # Transport settings handed to the Chroma client constructors.
    host: Optional[str]
    port: Optional[int]
    ssl: bool
    tenant: Optional[str]
    database: Optional[str]
    api_key: Optional[str]
    custom_auth_credentials: Optional[str]
    # Prefix prepended to every document ID stored in Chroma.
    id_prefix: str

    @property
    def enabled(self) -> bool:
        """Tell whether ingestion is active, i.e. the client type is not ``"off"``."""

        is_off = self.client_type == "off"
        return not is_off

    @classmethod
    def from_options(
        cls,
        *,
        root: Path,
        client_type: str,
        collection_name: str,
        embedding: str,
        data_directory: Optional[str],
        host: Optional[str],
        port: Optional[int],
        ssl: bool,
        tenant: Optional[str],
        database: Optional[str],
        api_key: Optional[str],
        custom_auth_credentials: Optional[str],
        id_prefix: Optional[str],
    ) -> "ChromaConfiguration":
        """Build a validated configuration from raw CLI/environment values.

        Parameters
        ----------
        root:
            Knowledge base root; used to derive the default data directory.
        client_type:
            One of :data:`SUPPORTED_CLIENTS` (matched case-insensitively). A
            falsy value is treated as ``"off"``.
        collection_name:
            Chroma collection that will hold the knowledge base documents.
        embedding:
            Embedding function name; lowercased, falsy values become
            ``"default"``.
        data_directory:
            Optional storage directory. When omitted and the client type is
            ``"persistent"``, ``<root>/<DATA_FOLDER_NAME>/chroma`` is used.
            Any directory that ends up selected is created on disk.
        host / port / ssl / tenant / database / api_key / custom_auth_credentials:
            Transport settings stored unchanged.
        id_prefix:
            Prefix for document IDs; falsy values become ``"kb::"``.

        Raises
        ------
        ValueError
            If the client type is unknown or :meth:`_validate` rejects the
            resulting configuration.
        """

        kind = (client_type or "off").lower()
        if kind not in SUPPORTED_CLIENTS:
            raise ValueError(f"Unsupported Chroma client type: {client_type}")

        # An explicit directory always wins; only the persistent client gets
        # an implicit default location.
        directory: Optional[Path] = None
        if data_directory:
            directory = Path(data_directory).expanduser().resolve()
        elif kind == "persistent":
            directory = (root / DATA_FOLDER_NAME / "chroma").resolve()

        if directory is not None:
            directory.mkdir(parents=True, exist_ok=True)

        instance = cls(
            client_type=kind,
            collection_name=collection_name,
            embedding=(embedding or "default").lower(),
            data_directory=directory,
            host=host,
            port=port,
            ssl=ssl,
            tenant=tenant,
            database=database,
            api_key=api_key,
            custom_auth_credentials=custom_auth_credentials,
            id_prefix=id_prefix or "kb::",
        )
        instance._validate()
        return instance

    def _validate(self) -> None:
        """Raise :class:`ValueError` when an enabled configuration is incomplete.

        A disabled (``"off"``) configuration is never checked.
        """

        if not self.enabled:
            return

        kind = self.client_type

        if kind == "persistent" and self.data_directory is None:
            raise ValueError("Persistent Chroma client requires a data directory")

        if kind == "http" and not self.host:
            raise ValueError(
                "HTTP Chroma client requires --chroma-host or MCP_KB_CHROMA_HOST"
            )

        if kind == "cloud":
            # Insertion order of the dict fixes the order in the error message.
            required = {
                "tenant": self.tenant,
                "database": self.database,
                "api_key": self.api_key,
            }
            missing = [name for name, value in required.items() if not value]
            if missing:
                pretty = ", ".join(missing)
                raise ValueError(f"Cloud Chroma client requires values for: {pretty}")

        if not self.collection_name:
            raise ValueError("Collection name must be provided")

        if not self.embedding:
            raise ValueError("Embedding function name must be provided")
172
-
173
-
174
- @dataclass(frozen=True)
175
- class _ChromaDependencies:
176
- """Lazy import bundle containing the pieces needed to talk to ChromaDB."""
177
-
178
- chroma_module: Any
179
- settings_cls: Type[Any]
180
- embedding_factories: Mapping[str, Type[Any]]
181
-
182
-
183
- def _load_dependencies() -> _ChromaDependencies:
184
- """Import ChromaDB lazily so the base server works without the dependency."""
185
-
186
- try:
187
- chroma_module = importlib.import_module("chromadb")
188
- except ModuleNotFoundError as exc: # pragma: no cover - dependent on environment
189
- raise RuntimeError(
190
- "Chroma integration requested but the 'chromadb' package is not installed. "
191
- "Install chromadb via 'uv add chromadb' or disable ingestion."
192
- ) from exc
193
-
194
- config_module = importlib.import_module("chromadb.config")
195
- embedding_module = importlib.import_module("chromadb.utils.embedding_functions")
196
-
197
- factories: Dict[str, Type[Any]] = {}
198
- fallback_map = {
199
- "default": "DefaultEmbeddingFunction",
200
- "cohere": "CohereEmbeddingFunction",
201
- "openai": "OpenAIEmbeddingFunction",
202
- "jina": "JinaEmbeddingFunction",
203
- "voyageai": "VoyageAIEmbeddingFunction",
204
- "roboflow": "RoboflowEmbeddingFunction",
205
- }
206
- for alias, attr in fallback_map.items():
207
- if hasattr(embedding_module, attr):
208
- factories[alias] = getattr(embedding_module, attr)
209
- if not factories:
210
- raise RuntimeError(
211
- "No embedding functions were found in chromadb.utils.embedding_functions"
212
- )
213
-
214
- return _ChromaDependencies(
215
- chroma_module=chroma_module,
216
- settings_cls=getattr(config_module, "Settings"),
217
- embedding_factories=factories,
218
- )
219
-
220
-
221
class ChromaIngestor(KnowledgeBaseListener, KnowledgeBaseReindexListener):
    """Listener that mirrors knowledge base writes into a Chroma collection.

    The class implements the listener interfaces it inherits from, so it can
    be registered next to other observers. Each event is handled with direct,
    synchronous calls on the collection. Documents are split into token-based
    chunks; every chunk is stored under the ID ``<document_id>-<chunk_number>``
    and carries the owning ``document_id`` in its metadata.
    """

    def __init__(self, configuration: ChromaConfiguration) -> None:
        """Create an ingestor bound to ``configuration``.

        Parameters
        ----------
        configuration:
            :class:`ChromaConfiguration` describing how to connect to Chroma
            and which collection to mirror into.

        Raises
        ------
        RuntimeError
            If ``chromadb`` is unavailable or ingestion is disabled.
        ValueError
            If the configured embedding function is unknown.
        """

        self.configuration = configuration
        self._deps = _load_dependencies()
        self._client = self._create_client()
        self._collection = self._ensure_collection()
        # ``add_start_index`` makes the splitter record each chunk's character
        # offset, which ``_reindex_document`` converts into line numbers.
        self.textsplitter = TokenTextSplitter(
            chunk_size=200, chunk_overlap=20, add_start_index=True
        )

    def get_document_chunks(
        self, document_id: str, include: Optional[List[str]] = None
    ) -> "GetResult":
        """Return every stored chunk whose metadata ``document_id`` matches.

        Parameters
        ----------
        document_id:
            Prefixed document ID as written by :meth:`_reindex_document`.
        include:
            Result fields requested from Chroma. Defaults to
            ``["metadatas", "documents"]``; ``delete_document`` passes an
            empty list because it only reads the ``"ids"`` entry.
        """

        # ``None`` sentinel instead of a mutable list default, which would be
        # shared between calls.
        if include is None:
            include = ["metadatas", "documents"]
        return self._collection.get(where={"document_id": document_id}, include=include)

    def handle_upsert(self, event: FileUpsertEvent) -> None:
        """Replace the chunks stored for ``event`` with its current content.

        The document ID is the configured ``id_prefix`` followed by the
        event's relative path. The stored metadata carries that relative path
        (and, added during reindexing, the document ID and chunk positions).
        """

        document_id = f"{self.configuration.id_prefix}{event.relative_path}"
        metadata = {
            "relative_path": event.relative_path,
        }
        self._reindex_document(document_id, event.content, metadata)

    def delete_document(self, document_id: str) -> None:
        """Delete all chunks that belong to ``document_id``.

        Does nothing when no chunk matches, so the collection is never asked
        to delete an empty ID list.
        """

        chunk_ids = self.get_document_chunks(document_id, include=[])["ids"]
        if not chunk_ids:
            return
        self._collection.delete(ids=chunk_ids)

    def handle_delete(self, event: FileDeleteEvent) -> None:
        """Remove the chunks associated with ``event`` from the Chroma index.

        Deletion is best-effort: failures are logged and not propagated.
        """

        document_id = f"{self.configuration.id_prefix}{event.relative_path}"
        self._delete_quietly(document_id)

    @property
    def collection(self) -> "Collection":
        """Return the underlying Chroma collection for diagnostics and tests."""

        return self._collection

    def query(self, query: str, *, n_results: int = 5) -> List[Dict[str, Any]]:
        """Return structured query results from the configured collection.

        Parameters
        ----------
        query:
            Text handed to Chroma as the single query text.
        n_results:
            Maximum number of results requested. Defaults to five.

        Returns
        -------
        list[dict[str, Any]]
            One dictionary per returned document with the keys ``document``,
            ``metadata`` (``{}`` when absent) and ``distance`` (``None`` when
            absent).
        """

        payload = self._collection.query(
            query_texts=[query],
            n_results=n_results,
            include=["metadatas", "documents", "distances"],
        )

        # Results are nested one list per query text; exactly one text is
        # sent, so only index 0 is read. Missing or ``None`` entries collapse
        # to empty lists instead of raising.
        documents = (payload.get("documents") or [[]])[0] or []
        metadatas = (payload.get("metadatas") or [[]])[0] or []
        distances = (payload.get("distances") or [[]])[0] or []

        results: List[Dict[str, Any]] = []
        for index, document in enumerate(documents):
            metadata = metadatas[index] if index < len(metadatas) else None
            distance = distances[index] if index < len(distances) else None
            results.append(
                {
                    "metadata": metadata or {},
                    "document": document,
                    "distance": distance,
                }
            )

        return results

    # Optional search extension -------------------------------------------------

    def search(
        self,
        kb: "KnowledgeBase",
        query: str,
        *,
        context_lines: int = 2,
        limit: Optional[int] = None,
    ) -> List[SearchMatch]:
        """Translate semantic query results into :class:`SearchMatch` objects.

        At most one match is produced per file. When the literal ``query``
        text occurs in the file, the first occurrence is returned; otherwise
        the top of the file (up to ``context_lines * 2 + 1`` lines) serves as
        a preview. Empty files produce no match.

        Parameters
        ----------
        kb:
            Knowledge base whose ``rules.root`` bounds the candidate paths.
        query:
            Text used both for the Chroma query and for the literal line scan.
        context_lines:
            Number of lines kept on each side of a literal match.
        limit:
            Maximum number of matches; a falsy value requests five results
            from Chroma.
        """

        max_results = limit or 5
        records = self.query(query, n_results=max_results)
        matches: List[SearchMatch] = []
        seen_paths: Set[Path] = set()

        for record in records:
            metadata = record.get("metadata") or {}
            candidate = self._resolve_candidate_path(
                kb,
                metadata.get("relative_path"),
            )
            # Several chunks may point at the same file; report it only once.
            if candidate is None or candidate in seen_paths:
                continue

            seen_paths.add(candidate)
            try:
                text = candidate.read_text(encoding="utf-8")
            except FileNotFoundError:
                continue

            lines = text.splitlines()
            file_matches = self._extract_matches_from_lines(
                candidate, lines, query, context_lines
            )
            if file_matches:
                matches.append(file_matches[0])
            elif lines:
                preview_limit = min(len(lines), context_lines * 2 + 1)
                matches.append(
                    SearchMatch(
                        path=candidate,
                        line_number=1,
                        context=lines[:preview_limit],
                    )
                )

            if limit is not None and len(matches) >= limit:
                break

        return matches

    # Internal helpers ----------------------------------------------------------

    def _delete_quietly(self, document_id: str) -> None:
        """Delete ``document_id`` best-effort, logging failures instead of raising."""

        try:
            self.delete_document(document_id)
        except Exception:  # pragma: no cover - depends on Chroma exception types
            # Deletion stays best-effort, but the failure is recorded rather
            # than dropped silently.
            import logging  # local import: the module header is outside this block

            logging.getLogger(__name__).warning(
                "Failed to delete Chroma chunks for %s", document_id, exc_info=True
            )

    @staticmethod
    def _line_number_at(content: str, offset: int) -> int:
        """Return the 1-based line number of the character at ``offset``."""

        # The sentinel character makes ``splitlines`` count the line that
        # begins at ``offset`` even when the prefix is empty or ends with a
        # line break.
        return len((content[:offset] + "x").splitlines())

    def _reindex_document(
        self,
        document_id: str,
        content: str,
        metadata: Mapping[str, Any],
    ) -> None:
        """Replace the stored chunks of ``document_id`` with fresh ones.

        Existing chunks are deleted first (best-effort). ``content`` is then
        split and each chunk is stored with a copy of ``metadata`` extended by
        ``document_id``, ``chunk_number``, ``startline`` and ``endline``
        (1-based, inclusive), in addition to the splitter's ``start_index``.
        Content that yields no chunks leaves the document absent from the
        collection.
        """

        self._delete_quietly(document_id)

        # Copy so the caller's mapping is never mutated.
        payload_metadata = dict(metadata)
        payload_metadata["document_id"] = document_id

        split_docs = self.textsplitter.create_documents([content])
        if not split_docs:
            # Nothing to store; skip the ``add`` call, which would otherwise
            # receive empty lists.
            return

        for chunk_number, chunk in enumerate(split_docs):
            chunk.metadata.update(payload_metadata)
            chunk.metadata["chunk_number"] = chunk_number
            # NOTE(review): the splitter locates chunks with ``str.find`` and
            # may report -1 when the decoded chunk text is not found verbatim;
            # clamp so the slice below does not wrap around. Confirm against
            # the installed langchain-text-splitters version.
            start_index = max(chunk.metadata["start_index"], 0)
            start_line = self._line_number_at(content, start_index)
            chunk.metadata["startline"] = start_line
            chunk.metadata["endline"] = (
                start_line + len(chunk.page_content.splitlines()) - 1
            )

        self._collection.add(
            documents=[chunk.page_content for chunk in split_docs],
            metadatas=[chunk.metadata for chunk in split_docs],
            ids=[
                f"{chunk.metadata['document_id']}-{chunk.metadata['chunk_number']}"
                for chunk in split_docs
            ],
        )

    # Optional full reindex -----------------------------------------------------

    def reindex(self, kb: "KnowledgeBase") -> int:
        """Rebuild the Chroma index from the current knowledge base state.

        Iterates over ``kb.iter_active_files(include_docs=False)``, reads each
        file as UTF-8 and re-indexes it through :meth:`_reindex_document`.
        The document ID is the configured ``id_prefix`` followed by the path
        relative to ``kb.rules.root``. Files that vanish before they can be
        read are skipped.

        Parameters
        ----------
        kb:
            The :class:`~mcp_kb.knowledge.store.KnowledgeBase` to index.

        Returns
        -------
        int
            The number of documents processed during the reindex run.
        """

        count = 0
        root = kb.rules.root
        with tqdm(
            kb.iter_active_files(include_docs=False),
            desc="Reindexing Chroma",
            total=kb.total_active_files(include_docs=False),
        ) as pbar:
            for path in pbar:
                pbar.set_description(f"Reindexing Chroma {path.name}")
                try:
                    content = path.read_text(encoding="utf-8")
                except FileNotFoundError:  # pragma: no cover - race with external edits
                    continue

                relative = str(path.relative_to(root))
                document_id = f"{self.configuration.id_prefix}{relative}"
                metadata = {
                    "relative_path": relative,
                }
                self._reindex_document(document_id, content, metadata)
                count += 1

        return count

    def _extract_matches_from_lines(
        self,
        path: Path,
        lines: List[str],
        query: str,
        context_lines: int,
    ) -> List[SearchMatch]:
        """Return a match for every line of ``lines`` that contains ``query``.

        Line numbers are 1-based. Each match's context holds up to
        ``context_lines`` lines before and after the matching line.
        """

        matches: List[SearchMatch] = []
        for index, line in enumerate(lines, start=1):
            if query in line:
                # ``index`` is 1-based, so the matching line sits at
                # ``lines[index - 1]``.
                start = max(0, index - context_lines - 1)
                end = min(len(lines), index + context_lines)
                matches.append(
                    SearchMatch(
                        path=path,
                        line_number=index,
                        context=lines[start:end],
                    )
                )
        return matches

    def _resolve_candidate_path(
        self,
        kb: "KnowledgeBase",
        relative: Optional[str],
    ) -> Optional[Path]:
        """Translate a stored relative path into an existing path inside ``kb``.

        Returns ``None`` when ``relative`` is empty, when the resolved path
        falls outside the knowledge base root, or when it does not exist.
        """

        if not relative:
            return None

        # Resolve the root too: comparing a resolved candidate with an
        # unresolved (e.g. symlinked) root would reject every path.
        root = kb.rules.root.resolve()
        candidate = (root / relative).resolve()
        try:
            candidate.relative_to(root)
        except ValueError:
            return None

        return candidate if candidate.exists() else None

    def _create_client(self) -> "ClientAPI":
        """Instantiate the Chroma client selected by the configuration.

        Raises
        ------
        RuntimeError
            If the configuration has ingestion disabled.
        ValueError
            If the client type is not one of the handled transports.
        """

        chroma = self._deps.chroma_module
        config = self.configuration

        if not config.enabled:
            raise RuntimeError(
                "ChromaIngestor cannot be constructed when ingestion is disabled"
            )

        if config.client_type == "ephemeral":
            return chroma.EphemeralClient()

        if config.client_type == "persistent":
            return chroma.PersistentClient(path=str(config.data_directory))

        if config.client_type in {"http", "cloud"}:
            # Both transports go through ``HttpClient``; cloud always uses SSL.
            kwargs: Dict[str, Any] = {
                "ssl": config.ssl if config.client_type == "http" else True,
            }
            if config.client_type == "http":
                kwargs["host"] = config.host
                if config.port is not None:
                    kwargs["port"] = config.port
                if config.custom_auth_credentials:
                    kwargs["settings"] = self._deps.settings_cls(
                        chroma_client_auth_provider="chromadb.auth.basic_authn.BasicAuthClientProvider",
                        chroma_client_auth_credentials=config.custom_auth_credentials,
                    )
            else:  # cloud
                kwargs["host"] = config.host or "api.trychroma.com"
                kwargs["tenant"] = config.tenant
                kwargs["database"] = config.database
                kwargs.setdefault("headers", {})
                kwargs["headers"]["x-chroma-token"] = config.api_key

            return chroma.HttpClient(**kwargs)

        raise ValueError(f"Unsupported client type: {config.client_type}")

    def _ensure_collection(self) -> "Collection":
        """Create or return the configured Chroma collection.

        Raises
        ------
        ValueError
            If the configured embedding name has no registered factory.
        """

        factory = self._deps.embedding_factories.get(self.configuration.embedding)
        if factory is None:
            available = ", ".join(sorted(self._deps.embedding_factories))
            raise ValueError(
                f"Unknown embedding function '{self.configuration.embedding}'. "
                f"Available options: {available}"
            )
        embedding_function = factory()

        metadata = {"source": "mcp-knowledge-base"}
        client = self._client
        try:
            return client.get_or_create_collection(
                name=self.configuration.collection_name,
                metadata=metadata,
                embedding_function=embedding_function,
            )
        except TypeError:
            # Compatibility retry: the same ``get_or_create_collection`` call
            # is repeated without ``metadata``. The original note attributes
            # the TypeError to older Chroma versions that expect a
            # CreateCollectionConfiguration.
            return client.get_or_create_collection(
                name=self.configuration.collection_name,
                embedding_function=embedding_function,
            )
@@ -1 +0,0 @@
1
- """Knowledge layer that encapsulates content storage and search helpers."""
@@ -1,44 +0,0 @@
1
- """Bootstrap helpers executed during server startup."""
2
-
3
- from __future__ import annotations
4
-
5
- import importlib.resources as resources
6
- from pathlib import Path
7
-
8
- from mcp_kb.config import DATA_FOLDER_NAME, DOC_FILENAME
9
-
10
-
11
def install_default_documentation(root: Path) -> Path:
    """Make sure the default documentation file is present under ``root``.

    When the target file is missing, its directory is created and the
    packaged ``KNOWLEDBASE_DOC.md`` resource from ``mcp_kb.data`` is copied
    into place. A file that already exists is left untouched, so operator
    edits survive later startups.

    Parameters
    ----------
    root:
        Knowledge base root directory.

    Returns
    -------
    Path
        Location of the documentation file inside the knowledge base tree.
    """

    docs_dir = root / DATA_FOLDER_NAME
    doc_path = docs_dir / DOC_FILENAME

    if not doc_path.exists():
        docs_dir.mkdir(parents=True, exist_ok=True)
        packaged_doc = resources.files("mcp_kb.data").joinpath("KNOWLEDBASE_DOC.md")
        with packaged_doc.open("r", encoding="utf-8") as source:
            doc_path.write_text(source.read(), encoding="utf-8")

    return doc_path