mcp-kb 0.1.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/PKG-INFO +3 -3
  2. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/README.md +1 -1
  3. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/cli/main.py +11 -4
  4. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/cli/reindex.py +7 -4
  5. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/config.py +8 -6
  6. mcp_kb-0.2.1/mcp_kb/data/KNOWLEDBASE_DOC.md +151 -0
  7. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/ingest/chroma.py +57 -30
  8. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/knowledge/bootstrap.py +8 -3
  9. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/knowledge/events.py +1 -0
  10. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/knowledge/search.py +6 -3
  11. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/knowledge/store.py +26 -10
  12. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/security/path_validation.py +8 -3
  13. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/server/app.py +17 -8
  14. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/utils/filesystem.py +45 -0
  15. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb.egg-info/PKG-INFO +3 -3
  16. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/pyproject.toml +3 -2
  17. mcp_kb-0.1.0/mcp_kb/data/KNOWLEDBASE_DOC.md +0 -36
  18. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/__init__.py +0 -0
  19. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/cli/__init__.py +0 -0
  20. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/cli/args.py +1 -1
  21. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/data/__init__.py +0 -0
  22. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/ingest/__init__.py +0 -0
  23. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/knowledge/__init__.py +0 -0
  24. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/security/__init__.py +0 -0
  25. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/server/__init__.py +0 -0
  26. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb/utils/__init__.py +0 -0
  27. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb.egg-info/SOURCES.txt +0 -0
  28. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb.egg-info/dependency_links.txt +0 -0
  29. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb.egg-info/entry_points.txt +0 -0
  30. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb.egg-info/requires.txt +1 -1
  31. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/mcp_kb.egg-info/top_level.txt +0 -0
  32. {mcp_kb-0.1.0 → mcp_kb-0.2.1}/setup.cfg +0 -0
@@ -1,14 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcp-kb
3
- Version: 0.1.0
3
+ Version: 0.2.1
4
4
  Summary: MCP server exposing a local markdown knowledge base
5
5
  Author: LLM Maintainer
6
6
  Requires-Python: >=3.11
7
7
  Description-Content-Type: text/markdown
8
- Requires-Dist: chromadb>=1.1.0
9
8
  Requires-Dist: httpx>=0.28.1
10
9
  Requires-Dist: mcp[cli]>=1.15.0
11
10
  Provides-Extra: vector
11
+ Requires-Dist: chromadb>=1.1.0; extra == "vector"
12
12
  Requires-Dist: tiktoken>=0.11.0; extra == "vector"
13
13
  Requires-Dist: langchain-text-splitters>=0.3.11; extra == "vector"
14
14
 
@@ -36,7 +36,7 @@ uv run mcp-kb-server --transport http --host 0.0.0.0 --port 9000
36
36
  ```
37
37
 
38
38
  On first launch the server copies a bundled `KNOWLEDBASE_DOC.md` into the
39
- `.docs/` directory if it is missing so that every deployment starts with a
39
+ `.data/` directory if it is missing so that every deployment starts with a
40
40
  baseline usage guide.
41
41
 
42
42
  ## Optional ChromaDB Mirroring
@@ -22,7 +22,7 @@ uv run mcp-kb-server --transport http --host 0.0.0.0 --port 9000
22
22
  ```
23
23
 
24
24
  On first launch the server copies a bundled `KNOWLEDBASE_DOC.md` into the
25
- `.docs/` directory if it is missing so that every deployment starts with a
25
+ `.data/` directory if it is missing so that every deployment starts with a
26
26
  baseline usage guide.
27
27
 
28
28
  ## Optional ChromaDB Mirroring
@@ -1,4 +1,5 @@
1
1
  """Command line interface for running the MCP knowledge base server."""
2
+
2
3
  from __future__ import annotations
3
4
 
4
5
  import argparse
@@ -8,7 +9,7 @@ import os
8
9
  from pathlib import Path
9
10
  from typing import Iterable, List, Optional
10
11
 
11
- from mcp_kb.config import DOCS_FOLDER_NAME, resolve_knowledge_base_root
12
+ from mcp_kb.config import DATA_FOLDER_NAME, resolve_knowledge_base_root
12
13
  from mcp_kb.cli.args import add_chroma_arguments, build_chroma_listener, parse_bool
13
14
  from mcp_kb.ingest.chroma import ChromaIngestor
14
15
  from mcp_kb.knowledge.bootstrap import install_default_documentation
@@ -79,7 +80,7 @@ def run_server(arguments: Iterable[str] | None = None) -> None:
79
80
  parser = _build_argument_parser()
80
81
  options = parser.parse_args(arguments)
81
82
  root_path = resolve_knowledge_base_root(options.root)
82
- rules = PathRules(root=root_path, protected_folders=(DOCS_FOLDER_NAME,))
83
+ rules = PathRules(root=root_path, protected_folders=(DATA_FOLDER_NAME,))
83
84
  install_default_documentation(root_path)
84
85
  listeners: List[ChromaIngestor] = []
85
86
  try:
@@ -100,9 +101,15 @@ def run_server(arguments: Iterable[str] | None = None) -> None:
100
101
  listeners=listeners,
101
102
  )
102
103
  transports = options.transports or ["stdio"]
103
- logger.info(f"Running server on {options.host}:{options.port} with transports {transports}")
104
+ logger.info(
105
+ f"Running server on {options.host}:{options.port} with transports {transports}"
106
+ )
104
107
  logger.info(f"Data root is {root_path}")
105
- print("--------------------------------",root_path,"--------------------------------")
108
+ print(
109
+ "--------------------------------",
110
+ root_path,
111
+ "--------------------------------",
112
+ )
106
113
  asyncio.run(_run_transports(server, transports))
107
114
 
108
115
 
@@ -5,6 +5,7 @@ ingestors and calls their ``reindex`` method when available, allowing operators
5
5
  to trigger a full rebuild of external indexes (e.g., Chroma) from the current
6
6
  filesystem state.
7
7
  """
8
+
8
9
  from __future__ import annotations
9
10
 
10
11
  import argparse
@@ -12,7 +13,7 @@ import logging
12
13
  from typing import Iterable, List
13
14
 
14
15
  from mcp_kb.cli.args import add_chroma_arguments, build_chroma_listener
15
- from mcp_kb.config import DOCS_FOLDER_NAME, resolve_knowledge_base_root
16
+ from mcp_kb.config import DATA_FOLDER_NAME, resolve_knowledge_base_root
16
17
  from mcp_kb.knowledge.events import KnowledgeBaseReindexListener
17
18
  from mcp_kb.knowledge.store import KnowledgeBase
18
19
  from mcp_kb.security.path_validation import PathRules
@@ -24,7 +25,9 @@ logger = logging.getLogger(__name__)
24
25
  def _build_argument_parser() -> argparse.ArgumentParser:
25
26
  """Return the argument parser for the reindex command."""
26
27
 
27
- parser = argparse.ArgumentParser(description="Reindex the knowledge base into configured backends")
28
+ parser = argparse.ArgumentParser(
29
+ description="Reindex the knowledge base into configured backends"
30
+ )
28
31
  parser.add_argument(
29
32
  "--root",
30
33
  dest="root",
@@ -58,7 +61,7 @@ def run_reindex(arguments: Iterable[str] | None = None) -> int:
58
61
  parser = _build_argument_parser()
59
62
  options = parser.parse_args(arguments)
60
63
  root_path = resolve_knowledge_base_root(options.root)
61
- rules = PathRules(root=root_path, protected_folders=(DOCS_FOLDER_NAME,))
64
+ rules = PathRules(root=root_path, protected_folders=(DATA_FOLDER_NAME,))
62
65
  kb = KnowledgeBase(rules)
63
66
 
64
67
  listeners: List[KnowledgeBaseReindexListener] = []
@@ -71,6 +74,7 @@ def run_reindex(arguments: Iterable[str] | None = None) -> int:
71
74
 
72
75
  total = 0
73
76
  for listener in listeners:
77
+ logger.info("Reindexing via %s", listener.__class__.__name__)
74
78
  count = listener.reindex(kb)
75
79
  logger.info("Reindexed %d documents via %s", count, listener.__class__.__name__)
76
80
  total += count
@@ -87,4 +91,3 @@ def main() -> None:
87
91
 
88
92
  if __name__ == "__main__":
89
93
  main()
90
-
@@ -7,6 +7,7 @@ logic more reusable across different deployment environments because callers can
7
7
  swap configurations programmatically or via environment variables without
8
8
  modifying the core modules.
9
9
  """
10
+
10
11
  from __future__ import annotations
11
12
 
12
13
  from pathlib import Path
@@ -16,7 +17,7 @@ import os
16
17
  DEFAULT_KNOWLEDGE_BASE_DIR = ".knowledgebase"
17
18
  """str: Default relative directory for persisting knowledge base documents."""
18
19
 
19
- DOCS_FOLDER_NAME = ".docs"
20
+ DATA_FOLDER_NAME = ".data"
20
21
  """str: Name of the protected data folder inside the knowledge base tree; it holds the bundled documentation file and the default persistent Chroma directory."""
21
22
 
22
23
  DOC_FILENAME = "KNOWLEDBASE_DOC.md"
@@ -58,12 +59,13 @@ def resolve_knowledge_base_root(provided_path: str | None = None) -> Path:
58
59
  root directory.
59
60
  """
60
61
 
61
-
62
- candidate = provided_path or os.getenv(ENV_ROOT_KEY) or Path(
63
- os.getenv('WORKSPACE_FOLDER_PATHS') or Path.cwd()
64
- )/DEFAULT_KNOWLEDGE_BASE_DIR
62
+ candidate = (
63
+ provided_path
64
+ or os.getenv(ENV_ROOT_KEY)
65
+ or Path(os.getenv("WORKSPACE_FOLDER_PATHS") or Path.cwd())
66
+ / DEFAULT_KNOWLEDGE_BASE_DIR
67
+ )
65
68
  root_path = Path(candidate).expanduser().resolve()
66
69
  root_path.mkdir(parents=True, exist_ok=True)
67
70
 
68
-
69
71
  return root_path
@@ -0,0 +1,151 @@
1
+ # LLM Operating Manual — MCP Knowledge Base (`mcp-kb`)
2
+
3
+ You are connected to a **local, text-only knowledge base**. Your job is to **search, read, create, update, and soft-delete** UTF‑8 text files under a single root directory while respecting safety rules below. Use the provided MCP tools exactly as specified.
4
+
5
+ ---
6
+
7
+ ## Ground Rules (enforced by the server)
8
+
9
+ - **Paths are relative only.** Absolute paths are rejected. No `..` traversal.
10
+ - **Protected folder:** `.data/` is read‑only. Do not write there.
11
+ - **Soft delete sentinel:** Files marked with `_DELETE_` in the name are considered deleted. Do not read/write them.
12
+ - **Text files only.** Files that do not look like text (e.g. binaries) are ignored by scans. Treat this KB as UTF‑8 text storage.
13
+ - **Concurrency:** Writes are serialized per file; still prefer read‑verify‑write sequences.
14
+
15
+ Constants (baked into the server):
16
+ - Protected folder: `.data`
17
+ - Documentation file name: `KNOWLEDBASE_DOC.md`
18
+ - Delete sentinel: `_DELETE_`
19
+
20
+ ---
21
+
22
+ ## Tools You Can Call
23
+
24
+ All tool names and parameter contracts are stable. Stick to these shapes.
25
+
26
+ ### `create_file(path: str, content: str) -> str`
27
+ - Create or **overwrite** a text file at `path` with `content`.
28
+ - `path` must be **relative** and **outside** `.data/`.
29
+
30
+ ### `read_file(path: str, start_line?: int, end_line?: int) -> { path, start_line, end_line, content }`
31
+ - Read full file or a 1‑based inclusive slice.
32
+ - If both bounds are omitted, the full file is returned. If only one bound is omitted, the server fills in the missing one.
33
+
34
+ ### `append_file(path: str, content: str) -> str`
35
+ - Append text. If file is missing, it will be **created**.
36
+
37
+ ### `regex_replace(path: str, pattern: str, replacement: str) -> { replacements: int }`
38
+ - Multiline regex (`re.MULTILINE`). Returns count. Always `read_file` afterwards to verify.
39
+
40
+ ### `delete(path: str) -> str`
41
+ - **Soft delete**: renames `name.ext` to `name_DELETE_.ext`. Use when content is obsolete.
42
+
43
+ ### `search(query: str, limit: int = 5) -> [{ path, line, context: string[] }]`
44
+ - Returns up to `limit` matches with short context.
45
+ - If Chroma mirroring is active, results are **semantic** first; otherwise plain scan.
46
+ - `limit` must be **> 0**.
47
+
48
+ ### `overview() -> str`
49
+ - A deterministic `tree`-like view of active files under root (skips deleted and binaries).
50
+
51
+ ### `documentation() -> str`
52
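+ - Returns the documentation file stored at `.data/KNOWLEDBASE_DOC.md`. In a default installation that file is a copy of this manual, so treat this manual as the authoritative reference.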
53
+
54
+ ---
55
+
56
+ ## How to Work Effectively
57
+
58
+ ### 1) Discover
59
+ - Call `overview()` to understand the tree.
60
+ - If you need conventions or human guidelines, read `documentation()` (optional).
61
+
62
+ ### 2) Locate Content
63
+ - Prefer `search("keywords", limit=5)` to find candidate files/snippets.
64
+ - Examine each `{path, line, context}`. The `context` is a short window around the hit.
65
+ - If results look thin, **increase `limit`** (e.g., 10–20) before broadening the query.
66
+
67
+ ### 3) Read Precisely
68
+ - Use `read_file(path)` for the full file when structure matters.
69
+ - If the file is large but you know the region, use `read_file(path, start_line, end_line)` to minimize tokens.
70
+
71
+ ### 4) Create New Knowledge
72
+ - Pick a **descriptive relative path** (folders based on topic, kebab‑case names).
73
+ - Example: `architecture/decision-records/adr-2025-10-06-edge-cache.md`
74
+ - Call `create_file(path, content)`.
75
+ - Keep the **title as the first Markdown heading** so search has context.
76
+ - Link related files with **relative Markdown links**.
77
+
78
+ ### 5) Update Safely
79
+ - For small edits:
80
+ 1) `read_file(...)` to confirm current state.
81
+ 2) `regex_replace(path, pattern, replacement)` for targeted changes.
82
+ 3) `read_file(...)` again to verify.
83
+ - For additive changes: `append_file(path, "\n...")`.
84
+
85
+ ### 6) Deletion Policy
86
+ - Use `delete(path)` to **soft-delete**. Do not operate on files that already include `_DELETE_` in their name.
87
+
88
+ ---
89
+
90
+ ## Search Semantics (important)
91
+
92
+ - When Chroma ingestion is **enabled**, `search()` uses semantic ranking first and returns the **best slice per file** (the ingestor extracts one representative match per document chunk/file). If no obvious line match is found, you may get a **top-of-file preview** — then call `read_file()` to confirm.
93
+ - When Chroma is **not** enabled, `search()` scans files literally and returns all matches up to `limit`.
94
+ - Always **validate** by fetching the file segment with `read_file()` before making edits.
95
+
96
+ ---
97
+
98
+ ## Parameter Contracts and Gotchas
99
+
100
+ - `path` must be **relative** (e.g., `notes/today.md`). Absolute paths are rejected.
101
+ - Do **not** write into `.data/` (protected). Reads are allowed there.
102
+ - Line numbers in `read_file` are **1‑based** and the interval is **inclusive**.
103
+ - `regex_replace` uses Python’s `re.MULTILINE`. Validate your pattern; avoid overly broad substitutions.
104
+ - `append_file` will create a file if missing (useful for logs/progress notes).
105
+
106
+ ---
107
+
108
+ ## Typical Recipes
109
+
110
+ **Find → Read → Edit**
111
+ 1. `search("beta feature toggle", limit=10)`
112
+ 2. Pick a result: `read_file("features/toggles.md", 40, 80)`
113
+ 3. Adjust: `regex_replace("features/toggles.md", "^Status:.*$", "Status: Enabled")`
114
+ 4. Verify: `read_file("features/toggles.md")` (check the `Status:` header)
115
+
116
+ **Add a new doc**
117
+ 1. `create_file("ops/runbooks/cache-invalidation.md", "# Cache Invalidation\n\n…")`
118
+ 2. Optionally link it from an index: `append_file("ops/README.md", "\n- [Cache Invalidation](runbooks/cache-invalidation.md)")`
119
+
120
+ **Soft delete an obsolete note**
121
+ 1. `delete("notes/old-incident.md")`
122
+
123
+ ---
124
+
125
+ ## Error Recovery
126
+
127
+ - **"Absolute paths are not permitted"** → Use a **relative** path.
128
+ - **"Writes are not allowed inside the protected folder '.data'"** → Choose a different folder (e.g., `docs/`).
129
+ - **"File 'X' does not exist"** on delete → Confirm with `overview()` or `search()`. Only existing non‑deleted files can be soft‑deleted.
130
+ - **No search hits** → Widen keywords, increase `limit`, or pivot to `overview()` to eyeball likely locations.
131
+
132
+ ---
133
+
134
+ ## Things You Should Not Do
135
+
136
+ - Do not fabricate file contents or paths. Always confirm with `overview()`, `search()`, and `read_file()`.
137
+ - Do not operate on files that include `_DELETE_` in their name.
138
+ - Do not attempt to talk directly to Chroma; you only use `search()`. Indexing is handled automatically after writes.
139
+ - Do not write binary or non‑UTF‑8 content.
140
+
141
+ ---
142
+
143
+ ## Performance Hints
144
+
145
+ - Prefer `search()` + targeted `read_file()` slices over reading entire large files.
146
+ - Keep `limit` modest (5–10) unless you must broaden the search.
147
+ - Batch edits in one file using a single `regex_replace` when safe (then verify).
148
+
149
+ ---
150
+
151
+ You now have the minimal contract to operate this KB safely and efficiently.
@@ -1,4 +1,5 @@
1
1
  """Integration layer that mirrors knowledge base updates into ChromaDB."""
2
+
2
3
  from __future__ import annotations
3
4
 
4
5
  import importlib
@@ -6,7 +7,9 @@ from dataclasses import dataclass
6
7
  from pathlib import Path
7
8
  from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, Type, TYPE_CHECKING
8
9
  from langchain_text_splitters import TokenTextSplitter
10
+ from tqdm import tqdm
9
11
 
12
+ from mcp_kb.config import DATA_FOLDER_NAME
10
13
  from mcp_kb.knowledge.events import (
11
14
  FileDeleteEvent,
12
15
  FileUpsertEvent,
@@ -23,6 +26,7 @@ if TYPE_CHECKING: # pragma: no cover - type checking only imports
23
26
  SUPPORTED_CLIENTS: Tuple[str, ...] = ("off", "ephemeral", "persistent", "http", "cloud")
24
27
  """Recognised client types exposed to operators enabling Chroma ingestion."""
25
28
 
29
+
26
30
  @dataclass(frozen=True)
27
31
  class ChromaConfiguration:
28
32
  """Runtime configuration controlling how Chroma ingestion behaves.
@@ -104,7 +108,7 @@ class ChromaConfiguration:
104
108
  if data_directory:
105
109
  resolved_directory = Path(data_directory).expanduser().resolve()
106
110
  elif normalized_type == "persistent":
107
- resolved_directory = (root / "chroma").resolve()
111
+ resolved_directory = (root / DATA_FOLDER_NAME / "chroma").resolve()
108
112
  else:
109
113
  resolved_directory = None
110
114
 
@@ -142,7 +146,9 @@ class ChromaConfiguration:
142
146
  raise ValueError("Persistent Chroma client requires a data directory")
143
147
 
144
148
  if self.client_type == "http" and not self.host:
145
- raise ValueError("HTTP Chroma client requires --chroma-host or MCP_KB_CHROMA_HOST")
149
+ raise ValueError(
150
+ "HTTP Chroma client requires --chroma-host or MCP_KB_CHROMA_HOST"
151
+ )
146
152
 
147
153
  if self.client_type == "cloud":
148
154
  missing = [
@@ -201,7 +207,9 @@ def _load_dependencies() -> _ChromaDependencies:
201
207
  if hasattr(embedding_module, attr):
202
208
  factories[alias] = getattr(embedding_module, attr)
203
209
  if not factories:
204
- raise RuntimeError("No embedding functions were found in chromadb.utils.embedding_functions")
210
+ raise RuntimeError(
211
+ "No embedding functions were found in chromadb.utils.embedding_functions"
212
+ )
205
213
 
206
214
  return _ChromaDependencies(
207
215
  chroma_module=chroma_module,
@@ -234,14 +242,14 @@ class ChromaIngestor(KnowledgeBaseListener, KnowledgeBaseReindexListener):
234
242
  self._client = self._create_client()
235
243
  self._collection = self._ensure_collection()
236
244
  self.textsplitter = TokenTextSplitter(
237
- chunk_size=200,
238
- chunk_overlap=20,
239
- add_start_index=True
245
+ chunk_size=200, chunk_overlap=20, add_start_index=True
240
246
  )
241
247
 
242
- def get_document_chunks(self, document_id: str, include: List[str] = ["metadatas", "documents"]) -> GetResult:
248
+ def get_document_chunks(
249
+ self, document_id: str, include: List[str] = ["metadatas", "documents"]
250
+ ) -> GetResult:
243
251
  """Get a document from the Chroma index."""
244
- return self._collection.get(where={"document_id": document_id},include=include)
252
+ return self._collection.get(where={"document_id": document_id}, include=include)
245
253
 
246
254
  def handle_upsert(self, event: FileUpsertEvent) -> None:
247
255
  """Upsert ``event`` into the configured Chroma collection.
@@ -261,7 +269,9 @@ class ChromaIngestor(KnowledgeBaseListener, KnowledgeBaseReindexListener):
261
269
 
262
270
  def delete_document(self, document_id: str) -> None:
263
271
  """Delete a document from the Chroma index."""
264
- self._collection.delete(ids=self.get_document_chunks(document_id,include=[])["ids"])
272
+ self._collection.delete(
273
+ ids=self.get_document_chunks(document_id, include=[])["ids"]
274
+ )
265
275
 
266
276
  def handle_delete(self, event: FileDeleteEvent) -> None:
267
277
  """Remove documents associated with ``event`` from the Chroma index.
@@ -363,7 +373,9 @@ class ChromaIngestor(KnowledgeBaseListener, KnowledgeBaseReindexListener):
363
373
  continue
364
374
 
365
375
  lines = text.splitlines()
366
- file_matches = self._extract_matches_from_lines(candidate, lines, query, context_lines)
376
+ file_matches = self._extract_matches_from_lines(
377
+ candidate, lines, query, context_lines
378
+ )
367
379
  if file_matches:
368
380
  matches.append(file_matches[0])
369
381
  elif lines:
@@ -408,22 +420,29 @@ class ChromaIngestor(KnowledgeBaseListener, KnowledgeBaseReindexListener):
408
420
  pass
409
421
 
410
422
  payload_metadata = dict(metadata)
411
- payload_metadata['document_id'] = document_id
423
+ payload_metadata["document_id"] = document_id
412
424
 
413
425
  # Split the content into token-based chunks before adding them to the collection.
414
426
 
415
427
  split_docs = self.textsplitter.create_documents([content])
416
-
428
+
417
429
  for i, d in enumerate(split_docs):
418
430
  d.metadata.update(payload_metadata)
419
- d.metadata['chunk_number'] = i
420
- d.metadata['startline'] = len(content[:d.metadata['start_index']].splitlines())
421
- d.metadata['endline'] = d.metadata['startline'] + len(d.page_content.splitlines())-1
431
+ d.metadata["chunk_number"] = i
432
+ d.metadata["startline"] = len(
433
+ content[: d.metadata["start_index"]].splitlines()
434
+ )
435
+ d.metadata["endline"] = (
436
+ d.metadata["startline"] + len(d.page_content.splitlines()) - 1
437
+ )
422
438
 
423
439
  self._collection.add(
424
440
  documents=[d.page_content for d in split_docs],
425
441
  metadatas=[d.metadata for d in split_docs],
426
- ids=[f"{d.metadata['document_id']}-{d.metadata['chunk_number']}" for d in split_docs],
442
+ ids=[
443
+ f"{d.metadata['document_id']}-{d.metadata['chunk_number']}"
444
+ for d in split_docs
445
+ ],
427
446
  )
428
447
 
429
448
  # Optional full reindex -----------------------------------------------------
@@ -451,19 +470,25 @@ class ChromaIngestor(KnowledgeBaseListener, KnowledgeBaseReindexListener):
451
470
 
452
471
  count = 0
453
472
  root = kb.rules.root
454
- for path in kb.iter_active_files(include_docs=False):
455
- try:
456
- content = path.read_text(encoding="utf-8")
457
- except FileNotFoundError: # pragma: no cover - race with external edits
458
- continue
459
-
460
- relative = str(path.relative_to(root))
461
- document_id = f"{self.configuration.id_prefix}{relative}"
462
- metadata = {
463
- "relative_path": relative,
464
- }
465
- self._reindex_document(document_id, content, metadata)
466
- count += 1
473
+ with tqdm(
474
+ kb.iter_active_files(include_docs=False),
475
+ desc="Reindexing Chroma",
476
+ total=kb.total_active_files(include_docs=False),
477
+ ) as pbar:
478
+ for path in pbar:
479
+ pbar.set_description(f"Reindexing Chroma {path.name}")
480
+ try:
481
+ content = path.read_text(encoding="utf-8")
482
+ except FileNotFoundError: # pragma: no cover - race with external edits
483
+ continue
484
+
485
+ relative = str(path.relative_to(root))
486
+ document_id = f"{self.configuration.id_prefix}{relative}"
487
+ metadata = {
488
+ "relative_path": relative,
489
+ }
490
+ self._reindex_document(document_id, content, metadata)
491
+ count += 1
467
492
 
468
493
  return count
469
494
 
@@ -522,7 +547,9 @@ class ChromaIngestor(KnowledgeBaseListener, KnowledgeBaseReindexListener):
522
547
  config = self.configuration
523
548
 
524
549
  if not config.enabled:
525
- raise RuntimeError("ChromaIngestor cannot be constructed when ingestion is disabled")
550
+ raise RuntimeError(
551
+ "ChromaIngestor cannot be constructed when ingestion is disabled"
552
+ )
526
553
 
527
554
  if config.client_type == "ephemeral":
528
555
  return chroma.EphemeralClient()
@@ -1,10 +1,11 @@
1
1
  """Bootstrap helpers executed during server startup."""
2
+
2
3
  from __future__ import annotations
3
4
 
4
5
  import importlib.resources as resources
5
6
  from pathlib import Path
6
7
 
7
- from mcp_kb.config import DOCS_FOLDER_NAME, DOC_FILENAME
8
+ from mcp_kb.config import DATA_FOLDER_NAME, DOC_FILENAME
8
9
 
9
10
 
10
11
  def install_default_documentation(root: Path) -> Path:
@@ -26,14 +27,18 @@ def install_default_documentation(root: Path) -> Path:
26
27
  Path to the documentation file inside the knowledge base tree.
27
28
  """
28
29
 
29
- docs_dir = root / DOCS_FOLDER_NAME
30
+ docs_dir = root / DATA_FOLDER_NAME
30
31
  doc_path = docs_dir / DOC_FILENAME
31
32
  if doc_path.exists():
32
33
  return doc_path
33
34
 
34
35
  docs_dir.mkdir(parents=True, exist_ok=True)
35
36
 
36
- with resources.files("mcp_kb.data").joinpath("KNOWLEDBASE_DOC.md").open("r", encoding="utf-8") as source:
37
+ with (
38
+ resources.files("mcp_kb.data")
39
+ .joinpath("KNOWLEDBASE_DOC.md")
40
+ .open("r", encoding="utf-8") as source
41
+ ):
37
42
  doc_path.write_text(source.read(), encoding="utf-8")
38
43
 
39
44
  return doc_path
@@ -7,6 +7,7 @@ coupling the core filesystem logic to specific backends. Each event captures bot
7
7
  absolute and knowledge-base-relative paths so that listeners can decide which
8
8
  identifier best fits their storage requirements.
9
9
  """
10
+
10
11
  from __future__ import annotations
11
12
 
12
13
  from dataclasses import dataclass
@@ -5,13 +5,14 @@ can evolve independently. Search often benefits from dedicated caching or
5
5
  indexing strategies; keeping it in its own module means the server can swap the
6
6
  implementation later without changing the core file lifecycle API.
7
7
  """
8
+
8
9
  from __future__ import annotations
9
10
 
10
11
  from dataclasses import dataclass
11
12
  from pathlib import Path
12
13
  from typing import Dict, Iterable, List, Optional
13
14
 
14
- from mcp_kb.config import DOCS_FOLDER_NAME, DOC_FILENAME
15
+ from mcp_kb.config import DATA_FOLDER_NAME, DOC_FILENAME
15
16
  from mcp_kb.knowledge.events import KnowledgeBaseSearchListener
16
17
  from mcp_kb.knowledge.store import KnowledgeBase
17
18
 
@@ -141,13 +142,15 @@ def read_documentation(kb: KnowledgeBase) -> str:
141
142
  folder.
142
143
  """
143
144
 
144
- doc_path = kb.rules.root / DOCS_FOLDER_NAME / DOC_FILENAME
145
+ doc_path = kb.rules.root / DATA_FOLDER_NAME / DOC_FILENAME
145
146
  if not doc_path.exists():
146
147
  return ""
147
148
  return doc_path.read_text(encoding="utf-8")
148
149
 
149
150
 
150
- def _extract_matches_for_path(path: Path, query: str, context_lines: int) -> List[SearchMatch]:
151
+ def _extract_matches_for_path(
152
+ path: Path, query: str, context_lines: int
153
+ ) -> List[SearchMatch]:
151
154
  """Read ``path`` and return every match that contains ``query``."""
152
155
 
153
156
  lines = path.read_text(encoding="utf-8").splitlines()
@@ -2,11 +2,12 @@
2
2
 
3
3
  This module exposes the ``KnowledgeBase`` class, which orchestrates validated
4
4
  filesystem operations for the MCP server. The class encapsulates logic for
5
- creating, reading, appending, and modifying markdown files while respecting the
5
+ creating, reading, appending, and modifying text files while respecting the
6
6
  security constraints defined in the PRD. Each method returns plain Python data
7
7
  structures so that higher-level layers (e.g., JSON-RPC handlers) can focus on
8
8
  protocol serialization rather than filesystem minutiae.
9
9
  """
10
+
10
11
  from __future__ import annotations
11
12
 
12
13
  import re
@@ -14,8 +15,12 @@ from dataclasses import dataclass
14
15
  from pathlib import Path
15
16
  from typing import Iterable, Optional
16
17
 
17
- from mcp_kb.config import DELETE_SENTINEL, DOCS_FOLDER_NAME
18
- from mcp_kb.knowledge.events import FileDeleteEvent, FileUpsertEvent, KnowledgeBaseListener
18
+ from mcp_kb.config import DELETE_SENTINEL, DATA_FOLDER_NAME
19
+ from mcp_kb.knowledge.events import (
20
+ FileDeleteEvent,
21
+ FileUpsertEvent,
22
+ KnowledgeBaseListener,
23
+ )
19
24
  from mcp_kb.security.path_validation import (
20
25
  PathRules,
21
26
  ensure_write_allowed,
@@ -79,7 +84,7 @@ class KnowledgeBase:
79
84
  self.listeners = tuple(listeners or ())
80
85
 
81
86
  def create_file(self, relative_path: str, content: str) -> Path:
82
- """Create or overwrite a markdown file at ``relative_path``.
87
+ """Create or overwrite a text file at ``relative_path``.
83
88
 
84
89
  The method validates the path, ensures that the parent directory exists,
85
90
  and writes the provided content as UTF-8 text. Existing files are
@@ -186,8 +191,12 @@ class KnowledgeBase:
186
191
  self._notify_delete(target, original_relative)
187
192
  return target
188
193
 
194
+ def total_active_files(self, include_docs: bool = False) -> int:
195
+ """Return the total number of non-deleted UTF-8 text files under the root directory."""
196
+ return sum(1 for _ in self.iter_active_files(include_docs=include_docs))
197
+
189
198
  def iter_active_files(self, include_docs: bool = False) -> Iterable[Path]:
190
- """Yield non-deleted markdown files under the root directory.
199
+ """Yield non-deleted UTF-8 text files under the root directory.
191
200
 
192
201
  Parameters
193
202
  ----------
@@ -197,13 +206,18 @@ class KnowledgeBase:
197
206
  the search and overview requirements from the PRD.
198
207
  """
199
208
 
200
- for path in self.rules.root.rglob("*.md"):
209
+ from mcp_kb.utils.filesystem import is_text_file
210
+
211
+ for path in self.rules.root.rglob("*"):
212
+ if not path.is_file():
213
+ continue
201
214
  if DELETE_SENTINEL in path.name:
202
215
  continue
203
216
  parts = path.relative_to(self.rules.root).parts
204
- if parts and parts[0] == DOCS_FOLDER_NAME and not include_docs:
217
+ if parts and parts[0] == DATA_FOLDER_NAME and not include_docs:
205
218
  continue
206
- yield path
219
+ if is_text_file(path):
220
+ yield path
207
221
 
208
222
  def _relative_path(self, absolute: Path) -> str:
209
223
  """Return ``absolute`` rewritten relative to the knowledge base root."""
@@ -218,7 +232,7 @@ class KnowledgeBase:
218
232
  absolute:
219
233
  Fully resolved path that was modified on disk.
220
234
  content:
221
- Markdown payload that should be provided to subscribers.
235
+ Text payload that should be provided to subscribers.
222
236
  """
223
237
 
224
238
  if not self.listeners:
@@ -240,7 +254,9 @@ class KnowledgeBase:
240
254
  event = FileDeleteEvent(absolute_path=absolute, relative_path=relative)
241
255
  self._dispatch("handle_delete", event)
242
256
 
243
- def _dispatch(self, method_name: str, event: FileUpsertEvent | FileDeleteEvent) -> None:
257
+ def _dispatch(
258
+ self, method_name: str, event: FileUpsertEvent | FileDeleteEvent
259
+ ) -> None:
244
260
  """Call ``method_name`` on every listener and wrap failures for clarity."""
245
261
 
246
262
  for listener in self.listeners:
@@ -7,13 +7,14 @@ that target the reserved documentation folder. The helper functions are written
7
7
  so they can be reused both by the server runtime and by unit tests to keep the
8
8
  security rules consistent.
9
9
  """
10
+
10
11
  from __future__ import annotations
11
12
 
12
13
  from dataclasses import dataclass
13
14
  from pathlib import Path
14
15
  from typing import Iterable
15
16
 
16
- from mcp_kb.config import DOCS_FOLDER_NAME, DELETE_SENTINEL
17
+ from mcp_kb.config import DATA_FOLDER_NAME, DELETE_SENTINEL
17
18
 
18
19
 
19
20
  class PathValidationError(ValueError):
@@ -69,13 +70,17 @@ def normalize_path(candidate: str, rules: PathRules) -> Path:
69
70
 
70
71
  path_obj = Path(candidate)
71
72
  if path_obj.is_absolute():
72
- raise PathValidationError("Absolute paths are not permitted inside the knowledge base")
73
+ raise PathValidationError(
74
+ "Absolute paths are not permitted inside the knowledge base"
75
+ )
73
76
 
74
77
  normalized = (rules.root / path_obj).resolve()
75
78
  try:
76
79
  normalized.relative_to(rules.root)
77
80
  except ValueError as exc:
78
- raise PathValidationError("Path resolves outside the knowledge base root") from exc
81
+ raise PathValidationError(
82
+ "Path resolves outside the knowledge base root"
83
+ ) from exc
79
84
 
80
85
  if DELETE_SENTINEL in normalized.name:
81
86
  raise PathValidationError("Operations on soft-deleted files are not permitted")
@@ -5,6 +5,7 @@ operations defined elsewhere in the package. Using FastMCP drastically reduces
5
5
  protocol boilerplate because the framework introspects type hints and
6
6
  docstrings to generate MCP-compatible tool schemas automatically.
7
7
  """
8
+
8
9
  from __future__ import annotations
9
10
 
10
11
  from dataclasses import dataclass
@@ -89,16 +90,16 @@ def create_fastmcp_app(
89
90
  mcp = FastMCP(
90
91
  "mcp-knowledge-base",
91
92
  instructions=(
92
- "You are connected to a local markdown knowledge base. Use the provided "
93
- "tools to create, inspect, and organize content while respecting the "
94
- "soft deletion semantics and the protected documentation folder."
93
+ "You are connected to a local text-based knowledge base. Use the provided "
94
+ "tools to create, inspect, and organize content and search the knowledgebase for information.\n"
95
+ "Call the documentation tool first to get the latest documentation."
95
96
  ),
96
97
  **fastmcp_kwargs,
97
98
  )
98
99
 
99
100
  @mcp.tool(name="create_file", title="Create File")
100
101
  def create_file(path: str, content: str) -> str:
101
- """Create or overwrite a markdown file at ``path`` with ``content``."""
102
+ """Create or overwrite a text file at ``path`` with ``content``."""
102
103
 
103
104
  try:
104
105
  created = kb.create_file(path, content)
@@ -107,11 +108,15 @@ def create_fastmcp_app(
107
108
  return f"Created {created}"
108
109
 
109
110
  @mcp.tool(name="read_file", title="Read File", structured_output=True)
110
- def read_file(path: str, start_line: int | None = None, end_line: int | None = None) -> ReadFileResult:
111
- """Read a markdown file returning metadata about the extracted segment."""
111
+ def read_file(
112
+ path: str, start_line: int | None = None, end_line: int | None = None
113
+ ) -> ReadFileResult:
114
+ """Read a text file returning metadata about the extracted segment."""
112
115
 
113
116
  try:
114
- segment: FileSegment = kb.read_file(path, start_line=start_line, end_line=end_line)
117
+ segment: FileSegment = kb.read_file(
118
+ path, start_line=start_line, end_line=end_line
119
+ )
115
120
  except PathValidationError as exc:
116
121
  raise ValueError(str(exc)) from exc
117
122
  except FileNotFoundError as exc:
@@ -176,7 +181,11 @@ def create_fastmcp_app(
176
181
  )
177
182
  return [
178
183
  SearchMatchResult(
179
- path=str(match.path),
184
+ path=str(
185
+ match.path.relative_to(kb.rules.root)
186
+ if match.path.is_absolute()
187
+ else match.path
188
+ ),
180
189
  line=match.line_number,
181
190
  context=match.context,
182
191
  )
@@ -6,6 +6,7 @@ such as validating incoming requests and shaping responses. Each helper function
6
6
  is intentionally small so that callers can compose them for different workflows
7
7
  without duplicating the low-level boilerplate.
8
8
  """
9
+
9
10
  from __future__ import annotations
10
11
 
11
12
  from contextlib import contextmanager
@@ -81,3 +82,47 @@ def rename(path: Path, target: Path) -> None:
81
82
  """Rename ``path`` to ``target`` using ``Path.rename`` semantics."""
82
83
 
83
84
  path.rename(target)
85
+
86
+
87
def is_text_file(path: Path, max_bytes: int = 2048) -> bool:
    """Heuristically determine whether ``path`` contains UTF-8 text.

    The check is designed to be fast and conservative for use when iterating
    a directory tree. It samples at most ``max_bytes`` from the head of the
    file in binary mode and applies two filters:

    - Reject samples that contain NUL bytes, which are extremely uncommon in
      textual formats and a strong indicator of binary content.
    - Attempt to decode the sampled bytes as UTF-8. If decoding fails, the
      file is treated as binary.

    Because the sample is cut at an arbitrary byte offset, it may end in the
    middle of a multi-byte UTF-8 character. Such a dangling-but-valid prefix
    is tolerated when (and only when) the file continues past the sample;
    otherwise a perfectly valid non-ASCII text file could be misclassified
    as binary purely because of where the cut happened to fall.

    Parameters
    ----------
    path:
        Path to the file on disk.
    max_bytes:
        Upper bound on the number of bytes to sample from the head of the
        file. A small sample keeps directory scans fast while remaining
        accurate for typical text formats such as ``.md``, ``.txt``, ``.xml``,
        and source files.

    Returns
    -------
    bool
        ``True`` if the file appears to be UTF-8 text; ``False`` otherwise,
        including when the path cannot be opened or read (missing file,
        insufficient permissions, a directory, or another OS-level error).
    """

    # Imported locally so this helper does not rely on module-level imports.
    import codecs

    try:
        with path.open("rb") as handle:
            sample = handle.read(max_bytes)
            # One extra byte reveals whether the sample stops mid-file, i.e.
            # whether a trailing partial character may legitimately continue.
            truncated = bool(handle.read(1))
    except OSError:
        # Unreadable paths are never reported as text; a directory scan
        # should skip them instead of aborting.
        return False

    if b"\x00" in sample:
        return False

    # With ``final=False`` the incremental decoder buffers an incomplete
    # trailing sequence instead of raising, while still rejecting bytes that
    # can never be valid UTF-8. With ``final=True`` (the whole file was
    # sampled) a dangling sequence is a genuine error.
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        decoder.decode(sample, final=not truncated)
    except UnicodeDecodeError:
        return False
    return True
@@ -1,14 +1,14 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcp-kb
3
- Version: 0.1.0
3
+ Version: 0.2.1
4
4
  Summary: MCP server exposing a local markdown knowledge base
5
5
  Author: LLM Maintainer
6
6
  Requires-Python: >=3.11
7
7
  Description-Content-Type: text/markdown
8
- Requires-Dist: chromadb>=1.1.0
9
8
  Requires-Dist: httpx>=0.28.1
10
9
  Requires-Dist: mcp[cli]>=1.15.0
11
10
  Provides-Extra: vector
11
+ Requires-Dist: chromadb>=1.1.0; extra == "vector"
12
12
  Requires-Dist: tiktoken>=0.11.0; extra == "vector"
13
13
  Requires-Dist: langchain-text-splitters>=0.3.11; extra == "vector"
14
14
 
@@ -36,7 +36,7 @@ uv run mcp-kb-server --transport http --host 0.0.0.0 --port 9000
36
36
  ```
37
37
 
38
38
  On first launch the server copies a bundled `KNOWLEDBASE_DOC.md` into the
39
- `.docs/` directory if it is missing so that every deployment starts with a
39
+ `.data/` directory if it is missing so that every deployment starts with a
40
40
  baseline usage guide.
41
41
 
42
42
  ## Optional ChromaDB Mirroring
@@ -1,12 +1,11 @@
1
1
  [project]
2
2
  name = "mcp-kb"
3
- version = "0.1.0"
3
+ version = "0.2.1"
4
4
  description = "MCP server exposing a local markdown knowledge base"
5
5
  readme = "README.md"
6
6
  authors = [{ name = "LLM Maintainer" }]
7
7
  requires-python = ">=3.11"
8
8
  dependencies = [
9
- "chromadb>=1.1.0",
10
9
  "httpx>=0.28.1",
11
10
  "mcp[cli]>=1.15.0",
12
11
  ]
@@ -17,8 +16,10 @@ mcp-kb-reindex = "mcp_kb.cli.reindex:main"
17
16
 
18
17
  [project.optional-dependencies]
19
18
  vector = [
19
+ "chromadb>=1.1.0",
20
20
  "tiktoken>=0.11.0",
21
21
  "langchain-text-splitters>=0.3.11",
22
+
22
23
  ]
23
24
 
24
25
  [build-system]
@@ -1,36 +0,0 @@
1
- # Knowledge Base Usage Guide
2
-
3
- Welcome to the MCP-managed knowledge base. This document is automatically
4
- installed the first time the server starts to ensure every deployment ships with
5
- baseline documentation. Customize it to describe project-specific conventions or
6
- operational practices.
7
-
8
- ## Structure
9
-
10
- - All knowledge content lives beneath the `.knowledgebase/` root.
11
- - Documentation resides under `.docs/` and is read-only from the MCP tools.
12
- - Soft-deleted files are suffixed with `_DELETE_` and ignored by search/overview.
13
-
14
- ## Recommended Practices
15
-
16
- 1. Organize content into topic-based folders (e.g., `architecture/`, `ops/`).
17
- 2. Keep document titles within the first heading so search results show context.
18
- 3. Use relative markdown links to connect related documents inside the knowledge
19
- base.
20
- 4. Periodically review `_DELETE_` files and clean up as necessary via direct
21
- filesystem operations.
22
-
23
- ## Default Tools
24
-
25
- | Tool | Purpose |
26
- | --------------- | ----------------------------------------- |
27
- | `create_file` | Create or overwrite markdown documents |
28
- | `read_file` | Read entire files or specific line ranges |
29
- | `append_file` | Append additional content to a file |
30
- | `regex_replace` | Run regex-based replacements |
31
- | `search` | Search text across active documents |
32
- | `overview` | Display a tree overview of the knowledge |
33
- | `documentation` | Read this documentation file |
34
- | `delete` | Soft-delete files safely |
35
-
36
- Update this document to reflect your team's workflows after deployment.
File without changes
File without changes
@@ -4,6 +4,7 @@ This module centralizes the definition of common command-line options and
4
4
  helpers so that multiple entry points (e.g., server and reindex commands) can
5
5
  remain small and focused while sharing consistent behavior.
6
6
  """
7
+
7
8
  from __future__ import annotations
8
9
 
9
10
  import os
@@ -150,4 +151,3 @@ def build_chroma_listener(options: Namespace, root: Path) -> Optional[ChromaInge
150
151
  if not configuration.enabled:
151
152
  return None
152
153
  return ChromaIngestor(configuration)
153
-
File without changes
File without changes
File without changes
File without changes
@@ -1,7 +1,7 @@
1
- chromadb>=1.1.0
2
1
  httpx>=0.28.1
3
2
  mcp[cli]>=1.15.0
4
3
 
5
4
  [vector]
5
+ chromadb>=1.1.0
6
6
  tiktoken>=0.11.0
7
7
  langchain-text-splitters>=0.3.11
File without changes