knowledge-master 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge_master/__init__.py +0 -0
- knowledge_master/__main__.py +4 -0
- knowledge_master/chunking.py +106 -0
- knowledge_master/cli.py +344 -0
- knowledge_master/embeddings.py +21 -0
- knowledge_master/intelligence.py +254 -0
- knowledge_master/parsers/__init__.py +0 -0
- knowledge_master/parsers/git_repo.py +115 -0
- knowledge_master/parsers/markdown.py +58 -0
- knowledge_master/server.py +194 -0
- knowledge_master/store.py +164 -0
- knowledge_master/watcher.py +104 -0
- knowledge_master/web.py +568 -0
- knowledge_master-0.1.0.dist-info/METADATA +275 -0
- knowledge_master-0.1.0.dist-info/RECORD +19 -0
- knowledge_master-0.1.0.dist-info/WHEEL +5 -0
- knowledge_master-0.1.0.dist-info/entry_points.txt +3 -0
- knowledge_master-0.1.0.dist-info/licenses/LICENSE +21 -0
- knowledge_master-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""Intelligence extraction — detect tech stack, patterns, conventions, service topology."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def extract_tech_stack(repo_path: str, graph):
    """Detect technologies from dependency files and record them in the graph.

    Inspects well-known manifests in the repository root (``requirements.txt``,
    ``pyproject.toml``, ``package.json``, ``Cargo.toml``, ``go.mod``), Docker /
    Compose files in the root, and ``*.yaml`` / ``*.tf`` files anywhere below
    it.  Every detected technology is merged as a ``Tech`` node and linked via
    ``USES_TECH`` to the ``Repo`` node named after the last component of
    *repo_path*.

    Args:
        repo_path: Path to the repository root.
        graph: Graph handle exposing ``query(cypher, params=...)``.

    Returns:
        A sorted list of ``(name, category)`` tuples.
    """
    root = Path(repo_path)
    repo_name = root.name
    techs = set()

    # Python: requirements.txt
    req = _read(repo_path, "requirements.txt")
    if req:
        techs.add(("Python", "language"))
        for line in req.splitlines():
            line = line.strip()
            # Blank lines, comments and pip options ("-r base.txt", "-e .")
            # are not package names.
            if not line or line.startswith(("#", "-")):
                continue
            # The name ends at the first version operator, extras bracket,
            # environment marker, direct-reference "@" or whitespace.
            pkg = re.split(r"[=<>!~;\[@\s]", line, maxsplit=1)[0]
            if pkg and "://" not in pkg:
                techs.add((pkg, "python-package"))

    # Python: pyproject.toml.  Only quoted requirement strings that carry a
    # version specifier are matched; bare names are not picked up.
    pyproject = _read(repo_path, "pyproject.toml")
    if pyproject:
        techs.add(("Python", "language"))
        for m in re.findall(r'"([a-zA-Z0-9_-]+)(?:[=<>!~]|$)', pyproject):
            techs.add((m, "python-package"))

    # Node.js
    pkg_json = _read(repo_path, "package.json")
    if pkg_json:
        techs.add(("Node.js", "runtime"))
        try:
            manifest = json.loads(pkg_json)
        except json.JSONDecodeError:
            manifest = None  # malformed manifest: keep the runtime, skip deps
        if isinstance(manifest, dict):
            for section in ("dependencies", "devDependencies"):
                deps = manifest.get(section)
                # A null or non-object section has nothing to enumerate.
                if isinstance(deps, dict):
                    for dep in deps:
                        techs.add((dep, "npm-package"))

    # Rust: only keys inside a "*dependencies" table are crates; keys of
    # [package], [features], [profile.*] etc. are not.
    cargo = _read(repo_path, "Cargo.toml")
    if cargo:
        techs.add(("Rust", "language"))
        section = ""
        for line in cargo.splitlines():
            line = line.strip()
            header = re.match(r"\[+\s*([^\]]+?)\s*\]+", line)
            if header:
                section = header.group(1)
                # "[dependencies.foo]" declares crate "foo" as its own table.
                table = re.match(
                    r"(?:.*\.)?(?:dev-|build-)?dependencies\.([A-Za-z0-9_-]+)$",
                    section,
                )
                if table:
                    techs.add((table.group(1), "rust-crate"))
                continue
            if section.endswith("dependencies"):
                key = re.match(r"([A-Za-z0-9_-]+)\s*=", line)
                if key:
                    techs.add((key.group(1), "rust-crate"))

    # Go: module paths named by "require" directives (single-line or block).
    gomod = _read(repo_path, "go.mod")
    if gomod:
        techs.add(("Go", "language"))
        in_require = False
        for raw in gomod.splitlines():
            line = raw.split("//", 1)[0].strip()  # drop "// indirect" etc.
            if not line:
                continue
            if in_require:
                if line == ")":
                    in_require = False
                    continue
                module = line.split()[0]
            elif line.startswith("require"):
                rest = line[len("require"):].strip()
                if rest == "(":
                    in_require = True
                    continue
                if not rest:
                    continue
                module = rest.split()[0]
            else:
                continue
            segments = [s for s in module.split("/") if s]
            # A trailing "/v2", "/v3", ... is a major-version suffix, not the name.
            if len(segments) > 1 and re.fullmatch(r"v\d+", segments[-1]):
                segments.pop()
            if segments:
                techs.add((segments[-1], "go-module"))

    # Docker: same Compose filenames that extract_services() reads.
    docker_files = ("Dockerfile", "docker-compose.yml", "docker-compose.yaml", "compose.yml")
    if any(_exists(repo_path, name) for name in docker_files):
        techs.add(("Docker", "infrastructure"))

    # Kubernetes: any YAML file whose first 500 characters look like a manifest.
    for manifest_path in root.rglob("*.yaml"):
        try:
            with manifest_path.open("r", errors="ignore") as fh:
                head = fh.read(500)
        except OSError:
            continue  # directory named *.yaml, broken symlink, unreadable file
        if "apiVersion:" in head and "kind:" in head:
            techs.add(("Kubernetes", "infrastructure"))
            break

    # Terraform
    if any(root.rglob("*.tf")):
        techs.add(("Terraform", "infrastructure"))

    # Store in graph (sorted so writes and the return value are deterministic).
    ordered = sorted(techs)
    for name, category in ordered:
        graph.query(
            "MERGE (t:Tech {name: $name}) SET t.category = $cat",
            params={"name": name, "cat": category},
        )
        graph.query(
            """MATCH (r:Repo {name: $repo}), (t:Tech {name: $tech})
               MERGE (r)-[:USES_TECH]->(t)""",
            params={"repo": repo_name, "tech": name},
        )

    return ordered
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _strip_quotes(token: str) -> str:
    """Trim surrounding whitespace and quote characters from *token*."""
    return token.strip().strip("\"'")


def _parse_compose(content: str):
    """Scan Compose YAML text for service names and ``depends_on`` edges.

    This is an indentation-based scanner (no YAML dependency).  Indent widths
    are learned from the first nested line at each level rather than assumed.

    Returns:
        ``(services, edges)`` where *services* is a list of unique service
        names in file order and *edges* is a list of ``(service, dependency)``.
    """
    services = []
    edges = []
    section_indent = None  # column of "services:" while inside that section
    svc_indent = None      # column of service names
    prop_indent = None     # column of the current service's properties
    dep_indent = None      # column of keys inside a mapping-style depends_on
    current = None
    in_depends = False

    for raw in content.splitlines():
        stripped = raw.strip()
        if not stripped or stripped.startswith("#"):
            continue
        indent = len(raw) - len(raw.lstrip())
        code = re.sub(r"\s+#.*$", "", stripped)  # drop a trailing comment

        if section_indent is None:
            if code == "services:":
                section_indent = indent
                svc_indent = prop_indent = dep_indent = current = None
                in_depends = False
            continue

        if indent <= section_indent:
            # A sibling key (volumes:, networks:, ...) ends the services section.
            section_indent = None
            current = None
            in_depends = False
            continue

        if svc_indent is None:
            svc_indent = indent
        if indent == svc_indent:
            current = None
            prop_indent = dep_indent = None
            in_depends = False
            if code.endswith(":") and not code.startswith("-"):
                name = _strip_quotes(code[:-1])
                if name:
                    current = name
                    if name not in services:
                        services.append(name)
            continue
        if current is None or indent < svc_indent:
            continue

        if prop_indent is None:
            prop_indent = indent

        if in_depends:
            # List form: "- db" (items may sit at the key's own column).
            if code.startswith("- ") and indent >= prop_indent:
                dep = _strip_quotes(code[2:])
                if dep:
                    edges.append((current, dep))
                continue
            # Mapping form: "db:" followed by deeper "condition: ..." lines.
            if indent > prop_indent:
                if dep_indent is None:
                    dep_indent = indent
                if indent == dep_indent and ":" in code:
                    dep = _strip_quotes(code.split(":", 1)[0])
                    if dep:
                        edges.append((current, dep))
                continue
            in_depends = False  # back at property level: depends_on is over

        if indent == prop_indent and code.startswith("depends_on:"):
            inline = code[len("depends_on:"):].strip()
            if not inline:
                in_depends = True
                dep_indent = None
            elif inline.startswith("[") and inline.endswith("]"):
                # Flow form: depends_on: [db, redis]
                for item in inline[1:-1].split(","):
                    dep = _strip_quotes(item)
                    if dep:
                        edges.append((current, dep))

    return services, edges


def _k8s_workload_names(content: str):
    """Return ``metadata.name`` of each Deployment/StatefulSet document in *content*."""
    names = []
    for doc in re.split(r"^---\s*$", content, flags=re.MULTILINE):
        if "kind: Deployment" not in doc and "kind: StatefulSet" not in doc:
            continue
        in_metadata = False
        child_indent = None
        for raw in doc.splitlines():
            stripped = raw.strip()
            if not stripped or stripped.startswith("#"):
                continue
            indent = len(raw) - len(raw.lstrip())
            if not in_metadata:
                in_metadata = indent == 0 and stripped == "metadata:"
                continue
            if indent == 0:
                break  # left the metadata block without finding a name
            if child_indent is None:
                child_indent = indent
            # Only a direct child of metadata counts; nested label keys do not.
            if indent == child_indent and stripped.startswith("name:"):
                value = _strip_quotes(stripped[len("name:"):].split(" #", 1)[0])
                if value:
                    names.append(value)
                break
    return names


def extract_services(repo_path: str, graph):
    """Build the service dependency graph from Compose files and K8s manifests.

    Compose files (``docker-compose.yml``, ``docker-compose.yaml``,
    ``compose.yml``) in the repository root contribute ``Service`` nodes and
    ``DEPENDS_ON`` edges taken from each service's ``depends_on``.  ``*.yaml``
    files anywhere below the root contribute one ``Service`` per
    Deployment/StatefulSet document found in their first 2000 characters.
    Each service is linked to the ``Repo`` node named after the last component
    of *repo_path* via ``DEFINES_SERVICE``.

    Args:
        repo_path: Path to the repository root.
        graph: Graph handle exposing ``query(cypher, params=...)``.

    Returns:
        List of service names in discovery order, without duplicates.
    """
    root = Path(repo_path)
    repo_name = root.name
    services = []

    def record(name, source):
        # Merge the node, tag where it came from, and attach it to the repo.
        if name not in services:
            services.append(name)
        graph.query(
            "MERGE (s:Service {name: $name}) SET s.source = $source",
            params={"name": name, "source": source},
        )
        graph.query(
            """MATCH (r:Repo {name: $repo}), (s:Service {name: $svc})
               MERGE (r)-[:DEFINES_SERVICE]->(s)""",
            params={"repo": repo_name, "svc": name},
        )

    # Docker Compose
    for compose_file in ("docker-compose.yml", "docker-compose.yaml", "compose.yml"):
        content = _read(repo_path, compose_file)
        if not content:
            continue
        compose_services, edges = _parse_compose(content)
        for svc in compose_services:
            record(svc, "docker-compose")
        for svc, dep in edges:
            # The dependency may be defined elsewhere; make sure the node exists.
            graph.query(
                "MERGE (s:Service {name: $name})",
                params={"name": dep},
            )
            graph.query(
                """MATCH (a:Service {name: $svc}), (b:Service {name: $dep})
                   MERGE (a)-[:DEPENDS_ON]->(b)""",
                params={"svc": svc, "dep": dep},
            )

    # K8s Deployments/StatefulSets
    for yaml_file in root.rglob("*.yaml"):
        try:
            with yaml_file.open("r", errors="ignore") as fh:
                content = fh.read(2000)
        except OSError:
            continue  # directory named *.yaml, broken symlink, unreadable file
        for svc_name in _k8s_workload_names(content):
            record(svc_name, "kubernetes")

    return services
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def extract_conventions(repo_path: str, graph):
    """Infer naming, layout and design conventions of a repository.

    Three heuristics are applied: the dominant file-name style among code
    files, the presence of well-known top-level directories, and simple
    substring checks on the first 3000 characters of up to 50 Python files.
    Each convention is merged as a ``Convention`` node and linked via
    ``FOLLOWS`` to the ``Repo`` node named after the last component of
    *repo_path*.

    Args:
        repo_path: Path to the repository root.
        graph: Graph handle exposing ``query(cypher, params=...)``.

    Returns:
        A sorted list of unique ``(name, category)`` tuples.
    """
    root = Path(repo_path)
    repo_name = root.name
    skip_dirs = {".git", "node_modules", "__pycache__", ".venv", "target"}
    code_suffixes = {".py", ".ts", ".js", ".rs", ".go"}
    conventions = set()

    def in_scope(path):
        # Match skip directories against the path *inside* the repo so that an
        # ancestor directory named e.g. "target" does not hide every file.
        return skip_dirs.isdisjoint(path.relative_to(root).parts)

    # File naming convention
    stems = [
        p.stem
        for p in root.rglob("*")
        if p.suffix in code_suffixes and in_scope(p) and p.is_file()
    ]
    if stems:
        total = len(stems)
        snake = sum(1 for s in stems if "_" in s)
        kebab = sum(1 for s in stems if "-" in s)
        camel = sum(1 for s in stems if re.match(r'^[a-z]+[A-Z]', s))
        pascal = sum(1 for s in stems if re.match(r'^[A-Z][a-z]+[A-Z]', s))

        # At most one file-naming convention is reported; snake_case needs a
        # majority, the others a 30% share.
        if snake / total > 0.5:
            conventions.add(("snake_case files", "file-naming"))
        elif kebab / total > 0.3:
            conventions.add(("kebab-case files", "file-naming"))
        elif pascal / total > 0.3:
            conventions.add(("PascalCase files", "file-naming"))
        elif camel / total > 0.3:
            conventions.add(("camelCase files", "file-naming"))

    # Folder structure patterns (visible top-level directories only)
    dirs = {d.name for d in root.iterdir() if d.is_dir() and not d.name.startswith(".")}

    if "src" in dirs:
        conventions.add(("src/ directory", "structure"))
    if "lib" in dirs:
        conventions.add(("lib/ directory", "structure"))
    if "tests" in dirs or "test" in dirs:
        conventions.add(("separate test directory", "testing"))
    if "docs" in dirs:
        conventions.add(("docs/ directory", "documentation"))
    if "infra" in dirs or "deploy" in dirs or "k8s" in dirs:
        conventions.add(("infra as code", "infrastructure"))

    # Detect patterns from code: sample up to 50 readable, in-scope files.
    scanned = 0
    for py_file in root.rglob("*.py"):
        if scanned >= 50:
            break
        if not in_scope(py_file):
            continue
        try:
            with py_file.open("r", errors="ignore") as fh:
                content = fh.read(3000)
        except OSError:
            continue  # directory named *.py, broken symlink, unreadable file
        scanned += 1
        if "class" in content and "Repository" in content:
            conventions.add(("Repository pattern", "design-pattern"))
        if "@app.route" in content or "@router" in content:
            conventions.add(("Route decorators", "design-pattern"))
        if "class" in content and ("Mixin" in content or "Base" in content):
            conventions.add(("Mixin/Base classes", "design-pattern"))

    # Sorted so graph writes and the return value are deterministic.
    ordered = sorted(conventions)

    # Store in graph
    for name, category in ordered:
        graph.query(
            "MERGE (c:Convention {name: $name}) SET c.category = $cat",
            params={"name": name, "cat": category},
        )
        graph.query(
            """MATCH (r:Repo {name: $repo}), (c:Convention {name: $conv})
               MERGE (r)-[:FOLLOWS]->(c)""",
            params={"repo": repo_name, "conv": name},
        )

    return ordered
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def extract_all(repo_path: str, graph):
    """Run every extraction pass on a repo and report how many items each found.

    The passes run in a fixed order: tech stack, services, conventions.

    Returns:
        Dict with the keys ``techs``, ``services`` and ``conventions``, each
        mapped to the length of the corresponding pass's result.
    """
    passes = (
        ("techs", extract_tech_stack),
        ("services", extract_services),
        ("conventions", extract_conventions),
    )
    summary = {}
    for label, run_pass in passes:
        summary[label] = len(run_pass(repo_path, graph))
    return summary
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _read(repo_path: str, filename: str) -> str | None:
|
|
246
|
+
p = os.path.join(repo_path, filename)
|
|
247
|
+
if os.path.exists(p):
|
|
248
|
+
with open(p, "r", errors="ignore") as f:
|
|
249
|
+
return f.read()
|
|
250
|
+
return None
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _exists(repo_path: str, filename: str) -> bool:
|
|
254
|
+
return os.path.exists(os.path.join(repo_path, filename))
|
|
File without changes
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Git repository parser - indexes files and extracts graph relationships."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from git import Repo
|
|
8
|
+
from rich.progress import Progress
|
|
9
|
+
|
|
10
|
+
from .. import chunking, embeddings, store
|
|
11
|
+
from ..intelligence import extract_all
|
|
12
|
+
|
|
13
|
+
# Lower-case file suffixes (with the leading dot) that are eligible for indexing.
INDEXABLE_EXTENSIONS = {
    # source code
    ".py", ".ts", ".tsx", ".js", ".rs", ".go", ".java",
    # prose
    ".md", ".markdown", ".txt",
    # structured data / configuration
    ".yaml", ".yml", ".json", ".toml", ".cfg", ".ini",
    # scripts and container recipes
    ".sh", ".bash", ".dockerfile",
}

# Path components that exclude a file from indexing wherever they appear.
SKIP_DIRS = {
    ".git",
    "node_modules",
    "target",
    "dist",
    "build",
    "__pycache__",
    ".venv",
    "venv",
}

# Files larger than this many bytes are not indexed.
MAX_FILE_SIZE = 500 * 1024  # 512,000 bytes
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def index_repo(repo_path: str, graph=None, branch: str = "HEAD", on_progress=None):
    """Index an entire git repository into the knowledge graph.

    Steps: register the repo, record the authors of the last 100 commits on
    *branch*, index every tracked file accepted by ``_should_index``, then run
    the intelligence extraction passes.

    Args:
        repo_path: Path to the repository working tree (``~`` is expanded).
        graph: Graph handle; obtained from ``store.get_graph()`` when ``None``.
        branch: Revision whose history is scanned for authors.
        on_progress: Optional callback invoked after each file as
            ``on_progress(done, total, filepath)``.

    Returns:
        Dict with ``repo`` (name), ``files_indexed`` (number of files that
        were attempted, including ones skipped on error) and ``intelligence``
        (result of ``extract_all``).
    """
    # Imported here so the module's import block stays untouched.
    from rich.markup import escape

    # Normalise first: Path(".").name is "", which would register a nameless repo.
    repo_path = os.path.abspath(os.path.expanduser(repo_path))
    repo = Repo(repo_path)
    repo_name = Path(repo_path).name

    if graph is None:
        graph = store.get_graph()

    store.init_schema(graph)
    store.upsert_repo(graph, repo_name, repo_path)

    # Extract authors from recent commits
    authors = {
        (commit.author.name, commit.author.email)
        for commit in repo.iter_commits(branch, max_count=100)
    }
    for name, email in authors:
        store.upsert_person(graph, name, email)

    # Get tracked files.  "-z" gives NUL-separated, unquoted paths; without it
    # git C-quotes names containing non-ASCII or special characters and those
    # files can never be opened.
    tracked = [p for p in repo.git.ls_files("-z").split("\0") if p]
    indexable = [f for f in tracked if _should_index(f)]
    total = len(indexable)

    with Progress(disable=not sys.stdout.isatty()) as progress:
        task = progress.add_task(f"Indexing {repo_name}", total=total)
        for i, filepath in enumerate(indexable):
            full_path = os.path.join(repo_path, filepath)
            try:
                _index_file(graph, full_path, filepath, repo_name, repo)
            except Exception as e:
                # Best effort: one bad file must not abort the run.  Escape the
                # interpolated text so "[Errno 2]" or brackets in a path are
                # printed literally instead of being parsed as Rich markup.
                progress.console.print(
                    f"  [yellow]skip {escape(filepath)}: {escape(str(e))}[/]"
                )
            progress.advance(task)
            if on_progress:
                on_progress(i + 1, total, filepath)

    # Run intelligence extraction
    intel = extract_all(repo_path, graph)

    return {"repo": repo_name, "files_indexed": total, "intelligence": intel}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _should_index(filepath: str) -> bool:
    """Decide whether a repo-relative path is eligible for indexing.

    A path is rejected when any of its components is listed in ``SKIP_DIRS``
    or when its lower-cased suffix is not in ``INDEXABLE_EXTENSIONS``.
    """
    path = Path(filepath)
    if not SKIP_DIRS.isdisjoint(path.parts):
        return False
    return path.suffix.lower() in INDEXABLE_EXTENSIONS
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _index_file(graph, full_path: str, relative_path: str, repo_name: str, repo: Repo):
    """Index a single file: chunk, embed, store with relationships.

    Files larger than ``MAX_FILE_SIZE``, blank files and files that produce no
    chunks are silently skipped.

    Args:
        graph: Graph handle passed through to the ``store`` helpers.
        full_path: Filesystem path used to read the file.
        relative_path: Repo-relative path used as the document identifier.
        repo_name: Name of the ``Repo`` node the document is linked to.
        repo: GitPython repository, used to look up the file's last author.
    """
    if os.path.getsize(full_path) > MAX_FILE_SIZE:
        return

    with open(full_path, "r", errors="ignore") as f:
        content = f.read()

    if not content.strip():
        return

    ext = Path(full_path).suffix.lower()
    chunks = chunking.chunk_file(content, ext)
    if not chunks:
        return

    # Embed all chunks
    vectors = embeddings.embed_batch(chunks)

    # Store document node
    store.upsert_document(graph, relative_path, ext.lstrip("."), {"title": relative_path})
    store.link_document_to_repo(graph, relative_path, repo_name)

    # Get last author for this file
    try:
        last_commit = next(repo.iter_commits(paths=relative_path, max_count=1))
        store.link_person_authored(graph, last_commit.author.email, relative_path)
    except StopIteration:
        pass  # tracked but never committed: no author to link

    # Same prose suffixes the markdown parser indexes as "docs"; everything
    # else indexed from a repository is tagged "code".
    source_type = "docs" if ext in (".md", ".markdown", ".txt") else "code"

    # Store chunks with embeddings
    for i, (chunk_text, vector) in enumerate(zip(chunks, vectors)):
        cid = chunking.chunk_id(relative_path, i)
        store.upsert_chunk(
            graph, cid, chunk_text, vector,
            {"source": relative_path, "source_type": source_type},
        )
        store.link_chunk_to_document(graph, cid, relative_path)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Markdown file/directory parser."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from rich.progress import Progress
|
|
8
|
+
|
|
9
|
+
from .. import chunking, embeddings, store
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def index_directory(directory: str, graph=None, patterns: list[str] = None):
    """Index markdown/text files found recursively under a directory.

    Args:
        directory: Directory to scan (``~`` is expanded).
        graph: Graph handle; obtained from ``store.get_graph()`` when ``None``.
        patterns: Glob patterns matched recursively; defaults to
            ``*.md``, ``*.markdown`` and ``*.txt``.

    Returns:
        Dict with ``directory`` and ``files_indexed`` (number of files that
        were attempted, including ones skipped on error).
    """
    # Imported here so the module's import block stays untouched.
    from rich.markup import escape

    directory = os.path.expanduser(directory)
    if graph is None:
        graph = store.get_graph()

    store.init_schema(graph)
    patterns = patterns or ["*.md", "*.markdown", "*.txt"]

    # Collect each regular file once, even when several patterns match it;
    # directories whose names match a pattern are not files to index.
    files = []
    seen = set()
    for pattern in patterns:
        for match in Path(directory).rglob(pattern):
            if match in seen or not match.is_file():
                continue
            seen.add(match)
            files.append(match)

    with Progress(disable=not sys.stdout.isatty()) as progress:
        task = progress.add_task(f"Indexing {directory}", total=len(files))
        for filepath in files:
            try:
                _index_markdown(graph, str(filepath), directory)
            except Exception as e:
                # Best effort: report and continue.  Escape interpolated text so
                # brackets in paths/messages are not parsed as Rich markup.
                progress.console.print(
                    f"  [yellow]skip {escape(str(filepath))}: {escape(str(e))}[/]"
                )
            progress.advance(task)

    return {"directory": directory, "files_indexed": len(files)}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _index_markdown(graph, filepath: str, base_dir: str):
    """Chunk, embed and store one markdown file.

    The document is identified by its path relative to *base_dir*.  Blank
    files and files that produce no chunks are skipped without touching the
    graph.
    """
    with open(filepath, "r", errors="ignore") as handle:
        body = handle.read()
    if not body.strip():
        return

    doc_id = os.path.relpath(filepath, base_dir)
    pieces = chunking.chunk_markdown(body)
    if not pieces:
        return

    embedded = embeddings.embed_batch(pieces)
    store.upsert_document(graph, doc_id, "markdown", {"title": doc_id})

    # Pair every chunk with its vector; the position doubles as the chunk index.
    for position, (piece, vec) in enumerate(zip(pieces, embedded)):
        chunk_key = chunking.chunk_id(doc_id, position)
        meta = {"source": doc_id, "source_type": "docs"}
        store.upsert_chunk(graph, chunk_key, piece, vec, meta)
        store.link_chunk_to_document(graph, chunk_key, doc_id)
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""MCP Server exposing knowledge base tools for AI agents."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from mcp.server import Server
|
|
6
|
+
from mcp.server.stdio import stdio_server
|
|
7
|
+
from mcp.types import TextContent, Tool
|
|
8
|
+
|
|
9
|
+
from . import embeddings, store
|
|
10
|
+
from .parsers import git_repo, markdown
|
|
11
|
+
|
|
12
|
+
server = Server("knowledge-master")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@server.list_tools()
async def list_tools() -> list[Tool]:
    """Describe the tools this server offers, each with a JSON schema for its arguments."""

    def object_schema(properties, required=None):
        # Build an "object" schema; "required" is only emitted when given.
        schema = {"type": "object", "properties": properties}
        if required is not None:
            schema["required"] = required
        return schema

    search = Tool(
        name="search",
        description="Semantic search across the knowledge base. Returns relevant chunks with source info and graph context (author, repo, related docs).",
        inputSchema=object_schema(
            {
                "query": {"type": "string", "description": "Natural language search query"},
                "top_k": {"type": "integer", "default": 10, "description": "Number of results"},
                "source_type": {"type": "string", "enum": ["code", "docs", "email", "infra"], "description": "Filter by source type"},
                "use_graph": {"type": "boolean", "default": True, "description": "Include graph context (author, repo relationships)"},
            },
            ["query"],
        ),
    )

    index_repo = Tool(
        name="index_repo",
        description="Index a git repository into the knowledge graph. Parses code files, extracts authors, builds relationships.",
        inputSchema=object_schema(
            {
                "path": {"type": "string", "description": "Path to git repository"},
                "branch": {"type": "string", "default": "HEAD", "description": "Branch to index"},
            },
            ["path"],
        ),
    )

    index_directory = Tool(
        name="index_directory",
        description="Index markdown/text files from a directory.",
        inputSchema=object_schema(
            {
                "path": {"type": "string", "description": "Directory path to index"},
                "patterns": {"type": "array", "items": {"type": "string"}, "description": "Glob patterns (default: *.md, *.txt)"},
            },
            ["path"],
        ),
    )

    get_status = Tool(
        name="get_status",
        description="Get knowledge base statistics: number of chunks, documents, repos indexed.",
        inputSchema=object_schema({}),
    )

    blast_radius = Tool(
        name="blast_radius",
        description="Show what depends on a target (service, tech, or file). Returns all entities that would be affected by changing the target.",
        inputSchema=object_schema(
            {
                "target": {"type": "string", "description": "Service name, technology, or file to check dependencies for"},
            },
            ["target"],
        ),
    )

    check_conventions = Tool(
        name="check_conventions",
        description="Check if a repo or path follows the detected coding conventions (naming, structure, patterns). Returns pass/fail for each convention.",
        inputSchema=object_schema(
            {
                "path": {"type": "string", "description": "Path to repo or directory to check"},
            },
            ["path"],
        ),
    )

    return [search, index_repo, index_directory, get_status, blast_radius, check_conventions]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@server.call_tool()
async def call_tool(name: str, arguments: dict) -> list[TextContent]:
    """Dispatch an MCP tool call and return its result as JSON text.

    Args:
        name: Tool name as advertised by ``list_tools``.
        arguments: Tool arguments; required keys are read with ``[]`` and so
            raise ``KeyError`` when missing.

    Returns:
        A single-element list holding a ``TextContent`` with the JSON-encoded
        result, or an ``Unknown tool: ...`` message for unrecognised names.
    """
    graph = store.get_graph()

    if name == "search":
        query = arguments["query"]
        top_k = arguments.get("top_k", 10)
        use_graph = arguments.get("use_graph", True)
        filters = {}
        if arguments.get("source_type"):
            filters["source_type"] = arguments["source_type"]

        query_vector = embeddings.embed(query)

        if use_graph:
            # NOTE(review): the source_type filter is only forwarded on the
            # plain vector path below; confirm whether graph_context_search
            # can accept filters too.
            results = store.graph_context_search(graph, query_vector, top_k)
        else:
            results = store.vector_search(graph, query_vector, top_k, filters)

        return [TextContent(type="text", text=json.dumps(results, indent=2, default=str))]

    elif name == "index_repo":
        result = git_repo.index_repo(arguments["path"], graph, arguments.get("branch", "HEAD"))
        return [TextContent(type="text", text=json.dumps(result))]

    elif name == "index_directory":
        result = markdown.index_directory(
            arguments["path"], graph, arguments.get("patterns")
        )
        return [TextContent(type="text", text=json.dumps(result))]

    elif name == "get_status":
        stats = store.get_stats(graph)
        return [TextContent(type="text", text=json.dumps(stats))]

    elif name == "blast_radius":
        target = arguments["target"]
        # Try Service: anything that reaches the target within three hops.
        # The pattern must be bound to `path`, otherwise relationships(path)
        # refers to an undefined variable and the query fails.
        result = graph.query(
            """MATCH (t:Service {name: $name})
               OPTIONAL MATCH path = (other)-[*1..3]->(t)
               WHERE other <> t
               RETURN labels(other)[0] AS type, other.name AS name, type(last(relationships(path))) AS rel""",
            params={"name": target},
        )
        if not result.result_set or all(r[1] is None for r in result.result_set):
            # Try Tech: repos that declare the technology.
            result = graph.query(
                """MATCH (t:Tech {name: $name})
                   OPTIONAL MATCH (r:Repo)-[:USES_TECH]->(t)
                   RETURN 'Repo' AS type, r.name AS name, 'USES_TECH' AS rel""",
                params={"name": target},
            )
        # OPTIONAL MATCH yields a null row when nothing matched; drop those.
        affected = [{"type": r[0], "name": r[1], "relationship": r[2]}
                    for r in (result.result_set or []) if r[1]]
        output = {"target": target, "affected_count": len(affected), "affected": affected}
        return [TextContent(type="text", text=json.dumps(output, indent=2))]

    elif name == "check_conventions":
        from pathlib import Path as P
        path = str(P(arguments["path"]).expanduser().resolve())
        repo_name = P(path).name
        result = graph.query(
            """MATCH (r:Repo)-[:FOLLOWS]->(c:Convention)
               WHERE r.name = $name
               RETURN c.name, c.category""",
            params={"name": repo_name},
        )
        if not result.result_set:
            # Path is not an indexed repo: check against every known convention.
            result = graph.query("MATCH (c:Convention) RETURN c.name, c.category")

        checks = []
        for conv_name, category in (result.result_set or []):
            passed = _check_convention_simple(path, conv_name)
            checks.append({"convention": conv_name, "category": category, "passed": passed})

        return [TextContent(type="text", text=json.dumps({"path": path, "checks": checks}, indent=2))]

    return [TextContent(type="text", text=f"Unknown tool: {name}")]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _check_convention_simple(path: str, convention: str) -> bool:
|
|
168
|
+
"""Check a single convention."""
|
|
169
|
+
from pathlib import Path as P
|
|
170
|
+
p = P(path)
|
|
171
|
+
if convention == "src/ directory":
|
|
172
|
+
return (p / "src").is_dir()
|
|
173
|
+
elif convention == "separate test directory":
|
|
174
|
+
return (p / "tests").is_dir() or (p / "test").is_dir()
|
|
175
|
+
elif convention == "docs/ directory":
|
|
176
|
+
return (p / "docs").is_dir()
|
|
177
|
+
elif convention == "snake_case files":
|
|
178
|
+
files = [f for f in p.rglob("*.py") if ".venv" not in str(f)]
|
|
179
|
+
return not any("-" in f.stem for f in files)
|
|
180
|
+
return True
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
async def run():
    """Run the MCP server over the stdio transport."""
    async with stdio_server() as streams:
        reader, writer = streams
        init_options = server.create_initialization_options()
        await server.run(reader, writer, init_options)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def main():
    """Synchronous entry point: execute ``run()`` to completion with asyncio."""
    from asyncio import run as run_until_complete

    run_until_complete(run())
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
if __name__ == "__main__":
|
|
194
|
+
main()
|