knowledge-master 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/PKG-INFO +47 -3
  2. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/README.md +45 -2
  3. knowledge_master-0.2.0/knowledge_master/api.py +92 -0
  4. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/cli.py +168 -48
  5. knowledge_master-0.2.0/knowledge_master/connectors.py +134 -0
  6. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/embeddings.py +7 -3
  7. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/intelligence.py +98 -42
  8. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/parsers/git_repo.py +5 -1
  9. knowledge_master-0.2.0/knowledge_master/rerank.py +60 -0
  10. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/server.py +1 -1
  11. knowledge_master-0.2.0/knowledge_master/static_analysis.py +141 -0
  12. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/store.py +15 -3
  13. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/web.py +33 -4
  14. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master.egg-info/PKG-INFO +47 -3
  15. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master.egg-info/SOURCES.txt +8 -1
  16. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master.egg-info/requires.txt +1 -0
  17. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/pyproject.toml +2 -1
  18. knowledge_master-0.2.0/tests/test_api.py +42 -0
  19. knowledge_master-0.2.0/tests/test_cli.py +35 -0
  20. knowledge_master-0.2.0/tests/test_static_analysis.py +48 -0
  21. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/LICENSE +0 -0
  22. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/__init__.py +0 -0
  23. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/__main__.py +0 -0
  24. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/chunking.py +0 -0
  25. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/parsers/__init__.py +0 -0
  26. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/parsers/markdown.py +0 -0
  27. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master/watcher.py +0 -0
  28. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master.egg-info/dependency_links.txt +0 -0
  29. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master.egg-info/entry_points.txt +0 -0
  30. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/knowledge_master.egg-info/top_level.txt +0 -0
  31. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/setup.cfg +0 -0
  32. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/tests/test_chunking.py +0 -0
  33. {knowledge_master-0.1.0 → knowledge_master-0.2.0}/tests/test_intelligence.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowledge-master
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Local-first knowledge graph for developers. Your AI agent's permanent memory.
5
5
  Author: Milenko Mitrovic
6
6
  License: MIT
@@ -27,6 +27,7 @@ Requires-Dist: gitpython<4.0,>=3.1.0
27
27
  Requires-Dist: rich<15.0,>=14.0.0
28
28
  Requires-Dist: fastapi<1.0,>=0.115.0
29
29
  Requires-Dist: uvicorn<1.0,>=0.34.0
30
+ Requires-Dist: pyyaml>=6.0
30
31
  Provides-Extra: office
31
32
  Requires-Dist: python-docx<2.0,>=1.1.0; extra == "office"
32
33
  Requires-Dist: openpyxl<4.0,>=3.1.0; extra == "office"
@@ -41,6 +42,10 @@ Dynamic: license-file
41
42
  **Your codebase's memory.** A local knowledge graph that gives AI agents real understanding of your architecture — not just text search.
42
43
 
43
44
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
45
+ ![Status: Alpha](https://img.shields.io/badge/Status-Alpha-orange)
46
+ ![Python 3.11+](https://img.shields.io/badge/Python-3.11+-blue)
47
+
48
+ > ⚠️ **Alpha software.** Core features work (search, graph, CLI, MCP server) but some capabilities are early-stage. See [Feature Status](#feature-status) below.
44
49
 
45
50
  ---
46
51
 
@@ -210,9 +215,11 @@ Your AI agent gets these tools:
210
215
  | `km start` | Boot Docker containers + pull embedding model |
211
216
  | `km stop` | Stop containers |
212
217
  | `km index <path>` | Index a git repo or docs directory |
213
- | `km search <query>` | Semantic search with graph context |
214
- | `km blast-radius <target>` | Show dependencies and affected entities |
218
+ | `km search <query>` | Semantic search with re-ranking |
219
+ | `km blast-radius <target>` | Multi-layer dependency analysis (imports → services → people) |
220
+ | `km who-owns <file>` | File ownership from git blame (weighted by recency) |
215
221
  | `km check-conventions <path>` | Verify code follows detected patterns |
222
+ | `km connect <source>` | Pull from external MCP (email, Slack) |
216
223
  | `km list` | Show indexed repos, techs, stats |
217
224
  | `km remove <name>` | Remove a source from the knowledge base |
218
225
  | `km serve` | Start web UI at http://127.0.0.1:9999 |
@@ -231,6 +238,26 @@ When you index a repo, Knowledge Master detects:
231
238
  | **People** | Git commit authors and file ownership |
232
239
  | **Code structure** | Functions, classes, chunked by AST-aware boundaries |
233
240
 
241
+ ## Feature Status
242
+
243
+ | Feature | Status | Notes |
244
+ |---|---|---|
245
+ | Semantic search + re-ranking | ✅ Stable | Core retrieval works well |
246
+ | Knowledge graph (FalkorDB) | ✅ Stable | Node/edge storage, vector index |
247
+ | CLI commands | ✅ Stable | All commands functional |
248
+ | MCP server | ✅ Stable | search, blast_radius, check_conventions |
249
+ | Web UI + graph viz | ✅ Stable | htmx + D3, no build step |
250
+ | Git repo indexing | ✅ Stable | Parses code, extracts authors |
251
+ | Tech stack detection | ⚡ Basic | Regex over dependency files — works for common cases |
252
+ | Service topology | ⚡ Basic | docker-compose parsing — limited YAML support |
253
+ | Convention detection | ⚡ Basic | Folder structure + file naming patterns |
254
+ | Blast radius | ⚡ Basic | Graph traversal on stored edges — doesn't trace imports/calls |
255
+ | Email connector (ms-365) | 🧪 Experimental | Works but requires ms-365-mcp setup |
256
+ | Re-ranking | 🧪 Experimental | Novel approach, not benchmarked against cross-encoders |
257
+ | Incremental indexing | 🧪 Experimental | File watcher + git hooks, needs more testing |
258
+
259
+ **Legend:** ✅ Stable — ⚡ Basic (works, limited scope) — 🧪 Experimental (may change)
260
+
234
261
  ## Comparison
235
262
 
236
263
  | Feature | Knowledge Master | Generic RAG | GitHub Copilot | Glean |
@@ -259,6 +286,23 @@ python -m knowledge_master.server
259
286
  python -m knowledge_master.cli status
260
287
  ```
261
288
 
289
+ ## Security
290
+
291
+ Knowledge Master runs **entirely on your machine**. No data leaves localhost.
292
+
293
+ - All ports bound to `127.0.0.1` (not accessible from LAN)
294
+ - Ollama runs locally — no cloud API calls
295
+ - MCP server uses stdio (no network exposure)
296
+ - Optional API key auth for REST endpoints
297
+
298
+ ```bash
299
+ # Enable API key auth
300
+ export KM_API_KEY=$(openssl rand -hex 32)
301
+ km serve
302
+ ```
303
+
304
+ See [SECURITY.md](SECURITY.md) for full security model, risks, and hardening guide.
305
+
262
306
  ## Troubleshooting
263
307
 
264
308
  | Issue | Fix |
@@ -3,6 +3,10 @@
3
3
  **Your codebase's memory.** A local knowledge graph that gives AI agents real understanding of your architecture — not just text search.
4
4
 
5
5
  [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
6
+ ![Status: Alpha](https://img.shields.io/badge/Status-Alpha-orange)
7
+ ![Python 3.11+](https://img.shields.io/badge/Python-3.11+-blue)
8
+
9
+ > ⚠️ **Alpha software.** Core features work (search, graph, CLI, MCP server) but some capabilities are early-stage. See [Feature Status](#feature-status) below.
6
10
 
7
11
  ---
8
12
 
@@ -172,9 +176,11 @@ Your AI agent gets these tools:
172
176
  | `km start` | Boot Docker containers + pull embedding model |
173
177
  | `km stop` | Stop containers |
174
178
  | `km index <path>` | Index a git repo or docs directory |
175
- | `km search <query>` | Semantic search with graph context |
176
- | `km blast-radius <target>` | Show dependencies and affected entities |
179
+ | `km search <query>` | Semantic search with re-ranking |
180
+ | `km blast-radius <target>` | Multi-layer dependency analysis (imports → services → people) |
181
+ | `km who-owns <file>` | File ownership from git blame (weighted by recency) |
177
182
  | `km check-conventions <path>` | Verify code follows detected patterns |
183
+ | `km connect <source>` | Pull from external MCP (email, Slack) |
178
184
  | `km list` | Show indexed repos, techs, stats |
179
185
  | `km remove <name>` | Remove a source from the knowledge base |
180
186
  | `km serve` | Start web UI at http://127.0.0.1:9999 |
@@ -193,6 +199,26 @@ When you index a repo, Knowledge Master detects:
193
199
  | **People** | Git commit authors and file ownership |
194
200
  | **Code structure** | Functions, classes, chunked by AST-aware boundaries |
195
201
 
202
+ ## Feature Status
203
+
204
+ | Feature | Status | Notes |
205
+ |---|---|---|
206
+ | Semantic search + re-ranking | ✅ Stable | Core retrieval works well |
207
+ | Knowledge graph (FalkorDB) | ✅ Stable | Node/edge storage, vector index |
208
+ | CLI commands | ✅ Stable | All commands functional |
209
+ | MCP server | ✅ Stable | search, blast_radius, check_conventions |
210
+ | Web UI + graph viz | ✅ Stable | htmx + D3, no build step |
211
+ | Git repo indexing | ✅ Stable | Parses code, extracts authors |
212
+ | Tech stack detection | ⚡ Basic | Regex over dependency files — works for common cases |
213
+ | Service topology | ⚡ Basic | docker-compose parsing — limited YAML support |
214
+ | Convention detection | ⚡ Basic | Folder structure + file naming patterns |
215
+ | Blast radius | ⚡ Basic | Graph traversal on stored edges — doesn't trace imports/calls |
216
+ | Email connector (ms-365) | 🧪 Experimental | Works but requires ms-365-mcp setup |
217
+ | Re-ranking | 🧪 Experimental | Novel approach, not benchmarked against cross-encoders |
218
+ | Incremental indexing | 🧪 Experimental | File watcher + git hooks, needs more testing |
219
+
220
+ **Legend:** ✅ Stable — ⚡ Basic (works, limited scope) — 🧪 Experimental (may change)
221
+
196
222
  ## Comparison
197
223
 
198
224
  | Feature | Knowledge Master | Generic RAG | GitHub Copilot | Glean |
@@ -221,6 +247,23 @@ python -m knowledge_master.server
221
247
  python -m knowledge_master.cli status
222
248
  ```
223
249
 
250
+ ## Security
251
+
252
+ Knowledge Master runs **entirely on your machine**. No data leaves localhost.
253
+
254
+ - All ports bound to `127.0.0.1` (not accessible from LAN)
255
+ - Ollama runs locally — no cloud API calls
256
+ - MCP server uses stdio (no network exposure)
257
+ - Optional API key auth for REST endpoints
258
+
259
+ ```bash
260
+ # Enable API key auth
261
+ export KM_API_KEY=$(openssl rand -hex 32)
262
+ km serve
263
+ ```
264
+
265
+ See [SECURITY.md](SECURITY.md) for full security model, risks, and hardening guide.
266
+
224
267
  ## Troubleshooting
225
268
 
226
269
  | Issue | Fix |
@@ -0,0 +1,92 @@
1
+ """REST API — JSON endpoints for external tool integration."""
2
+
3
+ from pathlib import Path
4
+
5
+ from fastapi import APIRouter
6
+
7
+ from . import embeddings, store
8
+ from .parsers import git_repo, markdown
9
+
10
+ router = APIRouter(prefix="/api/v1")
11
+
12
+
13
+ @router.get("/search")
14
+ async def search(q: str, top_k: int = 10, source_type: str = None):
15
+ """Semantic search across the knowledge base."""
16
+ graph = store.get_graph()
17
+ vec = embeddings.embed(q)
18
+ results = store.graph_context_search(graph, vec, top_k, query=q)
19
+ if source_type:
20
+ results = [r for r in results if r.get("source_type") == source_type]
21
+ return {"query": q, "results": results}
22
+
23
+
24
+ @router.get("/blast-radius/{target}")
25
+ async def blast_radius(target: str):
26
+ """Show what depends on a target."""
27
+ graph = store.get_graph()
28
+ # Try Service
29
+ result = graph.query(
30
+ """MATCH (t:Service {name: $name})
31
+ OPTIONAL MATCH (other)-[*1..3]->(t)
32
+ WHERE other <> t
33
+ RETURN labels(other)[0] AS type, other.name AS name""",
34
+ params={"name": target},
35
+ )
36
+ if not result.result_set or all(r[1] is None for r in result.result_set):
37
+ # Try Tech
38
+ result = graph.query(
39
+ """MATCH (t:Tech {name: $name})
40
+ OPTIONAL MATCH (r:Repo)-[:USES_TECH]->(t)
41
+ RETURN 'Repo' AS type, r.name AS name""",
42
+ params={"name": target},
43
+ )
44
+ affected = [{"type": r[0], "name": r[1]} for r in (result.result_set or []) if r[1]]
45
+ return {"target": target, "affected_count": len(affected), "affected": affected}
46
+
47
+
48
+ @router.get("/conventions/check")
49
+ async def check_conventions(path: str = "."):
50
+ """Check conventions for a path."""
51
+ path = str(Path(path).expanduser().resolve())
52
+ repo_name = Path(path).name
53
+ graph = store.get_graph()
54
+
55
+ result = graph.query(
56
+ "MATCH (r:Repo)-[:FOLLOWS]->(c:Convention) WHERE r.name = $name RETURN c.name, c.category",
57
+ params={"name": repo_name},
58
+ )
59
+ if not result.result_set:
60
+ result = graph.query("MATCH (c:Convention) RETURN c.name, c.category")
61
+
62
+ from .cli import _check_convention
63
+ checks = []
64
+ for conv_name, category in (result.result_set or []):
65
+ passed = _check_convention(path, conv_name)
66
+ checks.append({"convention": conv_name, "category": category, "passed": passed})
67
+ return {"path": path, "checks": checks}
68
+
69
+
70
+ @router.post("/index")
71
+ async def index_source(path: str, type: str = "auto"):
72
+ """Index a repo or directory."""
73
+ path = str(Path(path).expanduser().resolve())
74
+ if not Path(path).exists():
75
+ return {"error": f"Path not found: {path}"}
76
+
77
+ graph = store.get_graph()
78
+ store.init_schema(graph)
79
+ resolved_type = type if type != "auto" else ("repo" if (Path(path) / ".git").exists() else "docs")
80
+
81
+ if resolved_type == "repo":
82
+ result = git_repo.index_repo(path, graph)
83
+ else:
84
+ result = markdown.index_directory(path, graph)
85
+ return result
86
+
87
+
88
+ @router.get("/status")
89
+ async def status():
90
+ """Knowledge base stats."""
91
+ graph = store.get_graph()
92
+ return store.get_stats(graph)
@@ -108,7 +108,7 @@ def search(
108
108
  """Semantic search across the knowledge base."""
109
109
  graph = store.get_graph()
110
110
  vec = embeddings.embed(query)
111
- results = store.graph_context_search(graph, vec, top_k)
111
+ results = store.graph_context_search(graph, vec, top_k, query=query)
112
112
 
113
113
  table = Table(title=f"Results for: {query}")
114
114
  table.add_column("Score", width=6)
@@ -134,62 +134,135 @@ def search(
134
134
 
135
135
  @app.command()
136
136
  def blast_radius(
137
- target: str = typer.Argument(..., help="Service, file, or tech name to check"),
138
- depth: int = typer.Option(3, "--depth", "-d", help="Traversal depth"),
137
+ target: str = typer.Argument(..., help="Service, file, function, or tech name"),
138
+ depth: int = typer.Option(4, "--depth", "-d", help="Traversal depth"),
139
139
  ):
140
- """Show what depends on a target — the blast radius of changing it."""
140
+ """Show what depends on a target — multi-layer blast radius analysis."""
141
141
  graph = store.get_graph()
142
+ results = _compute_blast_radius(graph, target, depth)
142
143
 
143
- # Try as Service first
144
- result = graph.query(
145
- """MATCH (target:Service {name: $name})
146
- OPTIONAL MATCH path = (other)-[*1..3]->(target)
147
- WHERE other <> target
148
- RETURN labels(other)[0] AS type, other.name AS name,
149
- length(path) AS distance, type(last(relationships(path))) AS rel
150
- ORDER BY distance""",
151
- params={"name": target},
152
- )
153
-
154
- if not result.result_set:
155
- # Try as Tech
156
- result = graph.query(
157
- """MATCH (target:Tech {name: $name})
158
- OPTIONAL MATCH (r:Repo)-[:USES_TECH]->(target)
159
- RETURN 'Repo' AS type, r.name AS name, 1 AS distance, 'USES_TECH' AS rel""",
160
- params={"name": target},
161
- )
162
-
163
- if not result.result_set:
164
- # Try as file/document
165
- result = graph.query(
166
- """MATCH (target:Document) WHERE target.path CONTAINS $name
167
- OPTIONAL MATCH (c:Chunk)-[:PART_OF]->(target)
168
- OPTIONAL MATCH (p:Person)-[:AUTHORED]->(target)
169
- OPTIONAL MATCH (target)-[:IN_REPO]->(r:Repo)
170
- RETURN 'Repo' AS type, r.name AS name, 1 AS distance, 'CONTAINS' AS rel
171
- UNION
172
- MATCH (target:Document) WHERE target.path CONTAINS $name
173
- OPTIONAL MATCH (p:Person)-[:AUTHORED]->(target)
174
- RETURN 'Person' AS type, p.name AS name, 1 AS distance, 'AUTHORED' AS rel""",
175
- params={"name": target},
176
- )
177
-
178
- if not result.result_set or all(r[1] is None for r in result.result_set):
144
+ if not results:
179
145
  console.print(f"[yellow]No dependencies found for:[/] {target}")
180
- console.print("[dim]Try: a service name, technology, or file path[/]")
146
+ console.print("[dim]Try: a file path, function name, service, or technology[/]")
181
147
  return
182
148
 
183
149
  tree = Tree(f"[bold red]💥 Blast radius: {target}[/]")
184
- seen = set()
185
- for node_type, name, distance, rel in result.result_set:
186
- if name and name not in seen:
187
- seen.add(name)
188
- icon = {"Repo": "📦", "Service": "⚙️", "Person": "👤", "Document": "📄", "Tech": "🔧"}.get(node_type, "•")
189
- tree.add(f"{icon} [bold]{name}[/] [dim]({node_type}, via {rel})[/]")
150
+
151
+ # Group by confidence
152
+ definite = [r for r in results if r["confidence"] == "definite"]
153
+ likely = [r for r in results if r["confidence"] == "likely"]
154
+ possible = [r for r in results if r["confidence"] == "possible"]
155
+
156
+ if definite:
157
+ branch = tree.add("[bold]Definite impact[/]")
158
+ for r in definite:
159
+ icon = _icon(r["type"])
160
+ branch.add(f"{icon} [bold]{r['name']}[/] [dim]({r['type']}, {r['rel']})[/]")
161
+
162
+ if likely:
163
+ branch = tree.add("[yellow]Likely affected[/]")
164
+ for r in likely:
165
+ icon = _icon(r["type"])
166
+ branch.add(f"{icon} {r['name']} [dim]({r['type']}, {r['rel']})[/]")
167
+
168
+ if possible:
169
+ branch = tree.add("[dim]Possibly affected[/]")
170
+ for r in possible:
171
+ icon = _icon(r["type"])
172
+ branch.add(f"{icon} {r['name']} [dim]({r['type']}, {r['rel']})[/]")
190
173
 
191
174
  console.print(tree)
192
- console.print(f"\n[dim]{len(seen)} entities affected[/]")
175
+ console.print(f"\n[dim]{len(results)} entities: {len(definite)} definite, {len(likely)} likely, {len(possible)} possible[/]")
176
+
177
+
178
+ def _compute_blast_radius(graph, target: str, depth: int = 4) -> list[dict]:
179
+ """Multi-layer blast radius: Symbol → File → Service → Person."""
180
+ results = []
181
+ seen = set()
182
+
183
+ # Layer 1: File-level imports (who imports this file?)
184
+ r = graph.query(
185
+ """MATCH (src:Document)-[:IMPORTS]->(dst:Document)
186
+ WHERE dst.path CONTAINS $name
187
+ RETURN 'Document' AS type, src.path AS name, 'IMPORTS' AS rel""",
188
+ params={"name": target},
189
+ )
190
+ for row in (r.result_set or []):
191
+ if row[1] and row[1] not in seen:
192
+ seen.add(row[1])
193
+ results.append({"type": row[0], "name": row[1], "rel": row[2], "confidence": "definite"})
194
+
195
+ # Layer 1b: Symbol-level (who defines/uses this function?)
196
+ r = graph.query(
197
+ """MATCH (f:Function {name: $name})-[:DEFINED_IN]->(d:Document)
198
+ OPTIONAL MATCH (importer:Document)-[:IMPORTS]->(d)
199
+ RETURN 'Document' AS type, importer.path AS name, 'IMPORTS function' AS rel""",
200
+ params={"name": target},
201
+ )
202
+ for row in (r.result_set or []):
203
+ if row[1] and row[1] not in seen:
204
+ seen.add(row[1])
205
+ results.append({"type": row[0], "name": row[1], "rel": row[2], "confidence": "definite"})
206
+
207
+ # Layer 2: Service-level (which service owns affected files?)
208
+ affected_files = [r["name"] for r in results if r["type"] == "Document"]
209
+ affected_files.append(target) # include the target itself
210
+
211
+ r = graph.query(
212
+ """MATCH (d:Document)-[:IN_REPO]->(repo:Repo)-[:DEFINES_SERVICE]->(svc:Service)
213
+ WHERE any(f IN $files WHERE d.path CONTAINS f)
214
+ RETURN 'Service' AS type, svc.name AS name, 'owns affected file' AS rel""",
215
+ params={"files": affected_files},
216
+ )
217
+ for row in (r.result_set or []):
218
+ if row[1] and row[1] not in seen:
219
+ seen.add(row[1])
220
+ results.append({"type": row[0], "name": row[1], "rel": row[2], "confidence": "likely"})
221
+
222
+ # Layer 2b: Services that depend on affected services
223
+ affected_services = [r["name"] for r in results if r["type"] == "Service"]
224
+ if affected_services:
225
+ r = graph.query(
226
+ """MATCH (upstream:Service)-[:DEPENDS_ON]->(downstream:Service)
227
+ WHERE downstream.name IN $services
228
+ RETURN 'Service' AS type, upstream.name AS name, 'DEPENDS_ON' AS rel""",
229
+ params={"services": affected_services},
230
+ )
231
+ for row in (r.result_set or []):
232
+ if row[1] and row[1] not in seen:
233
+ seen.add(row[1])
234
+ results.append({"type": row[0], "name": row[1], "rel": row[2], "confidence": "likely"})
235
+
236
+ # Layer 3: Tech-level
237
+ r = graph.query(
238
+ """MATCH (t:Tech {name: $name})
239
+ OPTIONAL MATCH (repo:Repo)-[:USES_TECH]->(t)
240
+ RETURN 'Repo' AS type, repo.name AS name, 'USES_TECH' AS rel""",
241
+ params={"name": target},
242
+ )
243
+ for row in (r.result_set or []):
244
+ if row[1] and row[1] not in seen:
245
+ seen.add(row[1])
246
+ results.append({"type": row[0], "name": row[1], "rel": row[2], "confidence": "possible"})
247
+
248
+ # Layer 4: People (who authored affected files?)
249
+ r = graph.query(
250
+ """MATCH (p:Person)-[:AUTHORED]->(d:Document)
251
+ WHERE any(f IN $files WHERE d.path = f)
252
+ RETURN 'Person' AS type, p.name AS name, 'AUTHORED affected file' AS rel""",
253
+ params={"files": affected_files},
254
+ )
255
+ for row in (r.result_set or []):
256
+ if row[1] and row[1] not in seen:
257
+ seen.add(row[1])
258
+ results.append({"type": row[0], "name": row[1], "rel": row[2], "confidence": "possible"})
259
+
260
+ return results
261
+
262
+
263
+ def _icon(node_type: str) -> str:
264
+ return {"Repo": "📦", "Service": "⚙️", "Person": "👤", "Document": "📄",
265
+ "Tech": "🔧", "Function": "🔧", "Class": "🏗️"}.get(node_type, "•")
193
266
 
194
267
 
195
268
  @app.command()
@@ -313,6 +386,33 @@ def remove(source: str = typer.Argument(..., help="Repo name or doc path to remo
313
386
  console.print(f"[yellow]Not found:[/] {source}")
314
387
 
315
388
 
389
+
390
+ @app.command()
391
+ def connect(
392
+ source: str = typer.Argument(..., help="Source to pull from: outlook, slack, notion, or custom"),
393
+ command: str = typer.Option(None, "--command", "-c", help="Custom MCP server command"),
394
+ tool: str = typer.Option(None, "--tool", "-t", help="Tool name to call on the MCP server"),
395
+ ):
396
+ """Pull and index data from an external MCP server (email, Slack, etc.)."""
397
+ from .connectors import sync_pull_and_index, add_custom_source, SOURCES
398
+
399
+ if command and tool:
400
+ add_custom_source(source, command.split(), tool)
401
+
402
+ if source not in SOURCES:
403
+ console.print(f"[yellow]Unknown source:[/] {source}")
404
+ console.print(f"[dim]Available: {', '.join(SOURCES.keys())}[/]")
405
+ console.print("[dim]Or use --command and --tool for custom MCP servers[/]")
406
+ raise typer.Exit(1)
407
+
408
+ console.print(f"[bold blue]Connecting to {source}...[/]")
409
+ try:
410
+ result = sync_pull_and_index(source)
411
+ console.print(f"[green]✓ Done![/] {json.dumps(result)}")
412
+ except Exception as e:
413
+ console.print(f"[red]✗ Failed:[/] {e}")
414
+ raise typer.Exit(1)
415
+
316
416
  @app.command()
317
417
  def status():
318
418
  """Check system health."""
@@ -340,5 +440,25 @@ def serve(port: int = typer.Option(9999, help="Port for web UI")):
340
440
  uvicorn.run(create_app(), host="127.0.0.1", port=port)
341
441
 
342
442
 
443
+ @app.command(name="who-owns")
444
+ def who_owns(file: str = typer.Argument(..., help="File path to check ownership")):
445
+ """Show who owns a file based on git blame analysis."""
446
+ graph = store.get_graph()
447
+ result = graph.query(
448
+ """MATCH (p:Person)-[r:OWNS]->(d:Document)
449
+ WHERE d.path CONTAINS $file
450
+ RETURN p.name, r.weight, d.path
451
+ ORDER BY r.weight DESC LIMIT 1""",
452
+ params={"file": file},
453
+ )
454
+ if result.result_set:
455
+ name, weight, path = result.result_set[0]
456
+ console.print(f"[bold]{path}[/]")
457
+ console.print(f" Owner: [green]{name}[/] (weight: {weight:.2f})")
458
+ else:
459
+ console.print(f"[yellow]No ownership data for:[/] {file}")
460
+ console.print("[dim]Run 'km index <repo>' first to extract ownership.[/]")
461
+
462
+
343
463
  if __name__ == "__main__":
344
464
  app()
@@ -0,0 +1,134 @@
1
+ """MCP Connector — index data from external MCP servers (email, Slack, etc.)."""
2
+
3
+ import asyncio
4
+ import json
5
+ from dataclasses import dataclass
6
+
7
+ from . import chunking, embeddings, store
8
+
9
+
10
+ @dataclass
11
+ class MCPSource:
12
+ """Configuration for an external MCP server to pull data from."""
13
+ name: str
14
+ command: list[str]
15
+ tool_name: str # which tool to call to get data
16
+ tool_args: dict # arguments to pass
17
+ source_type: str # email, slack, docs, etc.
18
+
19
+
20
+ # Pre-configured sources — commands must be installed separately
21
+ SOURCES = {
22
+ "outlook": MCPSource(
23
+ name="Microsoft 365 Emails",
24
+ command=["npx", "@subzone81/ms-365-mcp", "--preset", "mail"],
25
+ tool_name="list-mail-messages",
26
+ tool_args={"top": 50},
27
+ source_type="email",
28
+ ),
29
+ "slack": MCPSource(
30
+ name="Slack Messages",
31
+ command=["npx", "@modelcontextprotocol/server-slack"],
32
+ tool_name="slack_search_messages",
33
+ tool_args={"query": ""},
34
+ source_type="slack",
35
+ ),
36
+ }
37
+
38
+
39
+ async def pull_and_index(source: MCPSource, graph=None):
40
+ """Connect to an MCP server, pull data, and index it into our graph."""
41
+ from mcp import ClientSession
42
+ from mcp.client.stdio import stdio_client, StdioServerParameters
43
+
44
+ if graph is None:
45
+ graph = store.get_graph()
46
+ store.init_schema(graph)
47
+
48
+ params = StdioServerParameters(command=source.command[0], args=source.command[1:])
49
+
50
+ async with stdio_client(params) as (read, write):
51
+ async with ClientSession(read, write) as session:
52
+ await session.initialize()
53
+
54
+ # Call the tool to get data
55
+ result = await session.call_tool(source.tool_name, source.tool_args)
56
+
57
+ items = _parse_mcp_result(result)
58
+ indexed = 0
59
+
60
+ for item in items:
61
+ text = item.get("text", item.get("content", item.get("body", "")))
62
+ if not text or len(text.strip()) < 20:
63
+ continue
64
+
65
+ title = item.get("subject", item.get("title", item.get("name", "")))
66
+ author = item.get("from", item.get("author", item.get("user", "")))
67
+ source_id = item.get("id", item.get("url", title))
68
+
69
+ # Chunk and embed
70
+ chunks = chunking.chunk_text(text)
71
+ vectors = embeddings.embed_batch(chunks)
72
+
73
+ # Store document
74
+ doc_path = f"{source.source_type}/{source_id}"
75
+ store.upsert_document(graph, doc_path, source.source_type, {"title": title})
76
+
77
+ # Store person if we have author info
78
+ if author:
79
+ email = author if "@" in author else ""
80
+ store.upsert_person(graph, author, email)
81
+ store.link_person_authored(graph, email or author, doc_path)
82
+
83
+ # Store chunks
84
+ for i, (chunk_text, vector) in enumerate(zip(chunks, vectors)):
85
+ cid = chunking.chunk_id(doc_path, i)
86
+ store.upsert_chunk(graph, cid, chunk_text, vector,
87
+ {"source": doc_path, "source_type": source.source_type})
88
+ store.link_chunk_to_document(graph, cid, doc_path)
89
+
90
+ indexed += 1
91
+
92
+ return {"source": source.name, "items_indexed": indexed}
93
+
94
+
95
+ def _parse_mcp_result(result) -> list[dict]:
96
+ """Parse MCP tool result into a list of items."""
97
+ items = []
98
+ for content in result.content:
99
+ if hasattr(content, "text"):
100
+ try:
101
+ data = json.loads(content.text)
102
+ if isinstance(data, list):
103
+ items.extend(data)
104
+ elif isinstance(data, dict):
105
+ if "results" in data:
106
+ items.extend(data["results"])
107
+ elif "messages" in data:
108
+ items.extend(data["messages"])
109
+ elif "items" in data:
110
+ items.extend(data["items"])
111
+ else:
112
+ items.append(data)
113
+ except json.JSONDecodeError:
114
+ # Plain text — treat as single item
115
+ items.append({"text": content.text, "title": "mcp-result"})
116
+ return items
117
+
118
+
119
+ def sync_pull_and_index(source_key: str, graph=None):
120
+ """Synchronous wrapper for CLI usage."""
121
+ if source_key not in SOURCES:
122
+ available = ", ".join(SOURCES.keys())
123
+ raise ValueError(f"Unknown source: {source_key}. Available: {available}")
124
+ source = SOURCES[source_key]
125
+ return asyncio.run(pull_and_index(source, graph))
126
+
127
+
128
+ def add_custom_source(name: str, command: list[str], tool_name: str,
129
+ tool_args: dict = None, source_type: str = "external"):
130
+ """Register a custom MCP source."""
131
+ SOURCES[name] = MCPSource(
132
+ name=name, command=command, tool_name=tool_name,
133
+ tool_args=tool_args or {}, source_type=source_type,
134
+ )
@@ -1,13 +1,17 @@
1
1
  """Embedding client using Ollama local models."""
2
2
 
3
- import ollama
3
+ from ollama import Client
4
4
 
5
5
  MODEL = "nomic-embed-text"
6
+ TIMEOUT = 30 # seconds
7
+
8
+ # Create client with timeout
9
+ _client = Client(timeout=TIMEOUT)
6
10
 
7
11
 
8
12
  def embed(text: str) -> list[float]:
9
13
  """Embed a single text string, returns vector."""
10
- response = ollama.embed(model=MODEL, input=text)
14
+ response = _client.embed(model=MODEL, input=text)
11
15
  return response["embeddings"][0]
12
16
 
13
17
 
@@ -16,6 +20,6 @@ def embed_batch(texts: list[str], batch_size: int = 64) -> list[list[float]]:
16
20
  vectors = []
17
21
  for i in range(0, len(texts), batch_size):
18
22
  batch = texts[i : i + batch_size]
19
- response = ollama.embed(model=MODEL, input=batch)
23
+ response = _client.embed(model=MODEL, input=batch)
20
24
  vectors.extend(response["embeddings"])
21
25
  return vectors