knowledge-master 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,4 @@
1
+ """Allow running as: python -m knowledge_master"""
2
# Pull in the Typer application defined in the CLI module and hand control
# to it right away; this file runs when the package is launched with ``-m``.
from .cli import app

app()
@@ -0,0 +1,106 @@
1
+ """Smart chunking engine - splits text by structure (headings, functions, paragraphs)."""
2
+
3
+ import hashlib
4
+ import re
5
+
6
+
7
def chunk_id(source: str, index: int) -> str:
    """Return a deterministic chunk ID derived from a source path and position.

    The ID is the hex MD5 digest of ``"<source>:<index>"``. MD5 serves only as
    a stable fingerprint here, not as a security primitive, so it is requested
    with ``usedforsecurity=False``; this keeps the function working on
    FIPS-restricted Python builds where a plain ``hashlib.md5()`` is blocked.
    The resulting digest is unchanged.

    Args:
        source: Identifier (e.g. path) of the originating document.
        index: Position of the chunk within that source.

    Returns:
        A 32-character lowercase hexadecimal string.
    """
    payload = f"{source}:{index}".encode()
    return hashlib.md5(payload, usedforsecurity=False).hexdigest()
10
+
11
+
12
def chunk_markdown(text: str, max_tokens: int = 512) -> list[str]:
    """Split markdown into chunks along heading boundaries.

    The text is cut in front of every level 1-3 heading (``#`` to ``###`` at
    the start of a line). A section whose whitespace-separated word count fits
    within ``max_tokens`` becomes one chunk. A longer section is split on blank
    lines and its paragraphs are packed greedily into chunks of at most
    ``max_tokens`` words. A single paragraph that is itself longer than
    ``max_tokens`` is emitted whole rather than cut mid-paragraph.

    Args:
        text: Markdown source.
        max_tokens: Upper bound on words per chunk ("tokens" are approximated
            by ``str.split()`` words).

    Returns:
        The non-empty chunks, stripped of surrounding whitespace, in document
        order.
    """
    sections = re.split(r"(?=^#{1,3}\s)", text, flags=re.MULTILINE)
    chunks: list[str] = []
    for section in sections:
        section = section.strip()
        if not section:
            continue
        if len(section.split()) <= max_tokens:
            chunks.append(section)
            continue

        # Section is too long: pack its paragraphs. The word count of the
        # buffer is tracked incrementally; counting ``(buffer + para).split()``
        # would fuse the last word of the buffer with the first word of the
        # paragraph and under-count by one, letting chunks exceed the limit.
        buffer = ""
        buffer_tokens = 0
        for para in section.split("\n\n"):
            para_tokens = len(para.split())
            if buffer and buffer_tokens + para_tokens > max_tokens:
                chunks.append(buffer.strip())
                buffer, buffer_tokens = para, para_tokens
            else:
                buffer = buffer + "\n\n" + para if buffer else para
                buffer_tokens += para_tokens
        if buffer.strip():
            chunks.append(buffer.strip())
    return chunks
35
+
36
+
37
def chunk_code(text: str, language: str = "", max_tokens: int = 400) -> list[str]:
    """Split source code into chunks along top-level definition boundaries.

    For ``python``, ``typescript``, ``rust`` and ``go`` the text is cut in
    front of lines that start a top-level definition. Any other language value
    falls back to cutting on runs of three or more newlines. The resulting
    parts are then packed greedily so that each chunk holds at most
    ``max_tokens`` whitespace-separated words; a single part that is longer
    than the limit is emitted whole.

    Args:
        text: Source code to split.
        language: Language name selecting the boundary pattern.
        max_tokens: Upper bound on words per chunk.

    Returns:
        The non-empty chunks, stripped of surrounding whitespace, in file
        order. Parts merged into one chunk are joined by a blank line.
    """
    # Zero-width lookaheads: the split happens *before* the keyword so each
    # definition keeps its own header line.
    patterns = {
        "python": r"(?=^(?:def |class |async def ))",
        "typescript": r"(?=^(?:export |function |class |const \w+ = ))",
        "rust": r"(?=^(?:fn |pub fn |impl |struct |enum ))",
        "go": r"(?=^(?:func ))",
    }
    pattern = patterns.get(language)
    if pattern:
        parts = re.split(pattern, text, flags=re.MULTILINE)
    else:
        # Fall back to splitting by blank lines / large gaps
        parts = re.split(r"\n{3,}", text)

    chunks: list[str] = []
    buffer = ""
    buffer_tokens = 0
    for part in parts:
        part = part.strip()
        if not part:
            continue
        # Count words separately and add them up; concatenating buffer and
        # part first would fuse two words at the seam and under-count.
        part_tokens = len(part.split())
        if buffer and buffer_tokens + part_tokens > max_tokens:
            chunks.append(buffer.strip())
            buffer, buffer_tokens = part, part_tokens
        else:
            buffer = buffer + "\n\n" + part if buffer else part
            buffer_tokens += part_tokens
    if buffer.strip():
        chunks.append(buffer.strip())
    return chunks
67
+
68
+
69
def chunk_text(text: str, max_tokens: int = 512) -> list[str]:
    """Chunk plain text by packing blank-line-separated paragraphs.

    Paragraphs are appended to the current chunk until adding the next one
    would push it past ``max_tokens`` whitespace-separated words; the chunk is
    then flushed and a new one started. A single paragraph longer than the
    limit is emitted whole.

    Args:
        text: Text to split.
        max_tokens: Upper bound on words per chunk.

    Returns:
        The non-empty chunks, stripped of surrounding whitespace, in order.
    """
    chunks: list[str] = []
    buffer = ""
    buffer_tokens = 0
    for para in text.split("\n\n"):
        # Word counts are summed per paragraph; counting the raw
        # concatenation would fuse two words at the seam and under-count.
        para_tokens = len(para.split())
        if buffer and buffer_tokens + para_tokens > max_tokens:
            flushed = buffer.strip()
            # A buffer holding only whitespace must not become an empty chunk.
            if flushed:
                chunks.append(flushed)
            buffer, buffer_tokens = para, para_tokens
        else:
            buffer = buffer + "\n\n" + para if buffer else para
            buffer_tokens += para_tokens
    if buffer.strip():
        chunks.append(buffer.strip())
    return chunks
83
+
84
+
85
# File extension (with leading dot, lowercase) -> language name used to pick
# a chunking strategy.
LANGUAGE_MAP = {
    # Source code
    ".py": "python",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".js": "javascript",
    ".rs": "rust",
    ".go": "go",
    ".java": "java",
    # Documentation
    ".md": "markdown",
    ".markdown": "markdown",
}
96
+
97
+
98
def chunk_file(text: str, extension: str, max_tokens: int = 512) -> list[str]:
    """Route text to the chunker that matches a file extension.

    The extension is normalised before the ``LANGUAGE_MAP`` lookup: surrounding
    whitespace is dropped, it is lower-cased, and a missing leading dot is
    added, so ``".MD"``, ``"md"`` and ``".md"`` all select the markdown
    chunker. Extensions mapped to a non-markdown language go to
    ``chunk_code``; anything unmapped is chunked as plain text.

    Args:
        text: File contents.
        extension: File extension, with or without the leading dot.
        max_tokens: Word limit per chunk, forwarded to the selected chunker.

    Returns:
        The list of chunk strings produced by the selected chunker.
    """
    ext = extension.strip().lower()
    if ext and not ext.startswith("."):
        ext = "." + ext

    lang = LANGUAGE_MAP.get(ext, "")
    if lang == "markdown":
        return chunk_markdown(text, max_tokens)
    if lang:
        return chunk_code(text, lang, max_tokens)
    return chunk_text(text, max_tokens)
@@ -0,0 +1,344 @@
1
+ """CLI for knowledge-master — simple commands for indexing, searching, and managing."""
2
+
3
+ import json
4
+ import subprocess
5
+ from pathlib import Path
6
+
7
+ import typer
8
+ from rich.console import Console
9
+ from rich.table import Table
10
+ from rich.tree import Tree
11
+
12
+ from . import embeddings, store
13
+ from .parsers import git_repo, markdown
14
+
15
# Typer application; each ``@app.command()`` in this module registers a
# sub-command on it.
app = typer.Typer(
    name="km",
    help="Knowledge Master — your codebase's memory",
    no_args_is_help=True,
)

# Shared Rich console used for all terminal output of the CLI.
console = Console()

# Directory that contains this package; docker-compose.yml is looked up here.
PROJECT_DIR = Path(__file__).parent.parent
19
+
20
+
21
@app.command()
def start():
    """Start Knowledge Master (Docker containers + Ollama model).

    Steps, in order: verify that Docker responds, bring the compose stack up
    and wait for it, make sure the ``nomic-embed-text`` Ollama model is
    available (pulling it if needed), and initialise the graph schema.

    Raises:
        typer.Exit: With code 1 when Docker is not running, Docker Compose
            reports an error, Ollama is not installed, or pulling the
            embedding model fails.
    """
    # Local import: keeps this command self-contained; raw subprocess output
    # must be escaped before it is embedded in Rich markup.
    from rich.markup import escape

    compose_file = PROJECT_DIR / "docker-compose.yml"
    compose_cmd = ["docker", "compose", "-f", str(compose_file)]

    console.print("[bold]Starting Knowledge Master...[/]\n")

    # Check Docker
    try:
        subprocess.run(["docker", "info"], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        console.print("[red]✗ Docker is not running.[/] Please start Docker first.")
        raise typer.Exit(1)

    # Start containers
    console.print(" [dim]Starting FalkorDB + Postgres...[/]")
    result = subprocess.run([*compose_cmd, "up", "-d"], capture_output=True, text=True)
    if result.returncode != 0 and "error" in result.stderr.lower():
        console.print(f"[red]✗ Docker Compose failed:[/] {escape(result.stderr.strip())}")
        raise typer.Exit(1)

    # Wait for healthy. The outcome is reported honestly instead of always
    # claiming success; a failure here is only a warning so that Compose
    # versions without ``--wait`` support keep working as before.
    waited = subprocess.run([*compose_cmd, "up", "--wait"], capture_output=True, text=True)
    if waited.returncode == 0:
        console.print(" [green]✓[/] Containers running")
    else:
        console.print(" [yellow]![/] Containers started, but waiting for them to become healthy failed")
        detail = waited.stderr.strip()
        if detail:
            console.print(detail, markup=False)

    # Check/pull Ollama model
    console.print(" [dim]Checking embedding model...[/]")
    try:
        result = subprocess.run(["ollama", "list"], capture_output=True, text=True)
        if "nomic-embed-text" not in result.stdout:
            console.print(" [dim]Pulling nomic-embed-text...[/]")
            subprocess.run(["ollama", "pull", "nomic-embed-text"], check=True)
        console.print(" [green]✓[/] Embedding model ready")
    except FileNotFoundError:
        console.print("[red]✗ Ollama not found.[/] Install from https://ollama.com")
        raise typer.Exit(1)
    except subprocess.CalledProcessError as exc:
        # ``ollama pull`` exited non-zero; report it instead of a traceback.
        console.print(
            f"[red]✗ Failed to pull nomic-embed-text[/] (exit code {exc.returncode}). "
            "Is the Ollama server running?"
        )
        raise typer.Exit(1)

    # Init schema
    graph = store.get_graph()
    store.init_schema(graph)
    console.print(" [green]✓[/] Graph schema initialized")

    console.print("\n[bold green]Knowledge Master is ready![/]")
    console.print(" • Index a repo: [cyan]km index ~/path/to/repo[/]")
    console.print(" • Search: [cyan]km search \"your query\"[/]")
    console.print(" • Web UI: [cyan]km serve[/]")
    console.print(" • Graph viz: [cyan]http://127.0.0.1:9999/graph[/]")
69
+
70
+
71
@app.command()
def stop():
    """Stop Knowledge Master containers.

    Runs ``docker compose down`` against the project's compose file and
    reports success only when the command actually succeeded.

    Raises:
        typer.Exit: With code 1 when the ``docker`` executable is missing or
            ``docker compose down`` exits with a non-zero status.
    """
    compose_file = PROJECT_DIR / "docker-compose.yml"
    try:
        result = subprocess.run(
            ["docker", "compose", "-f", str(compose_file), "down"],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        console.print("[red]✗ Docker not found.[/] Is it installed and on your PATH?")
        raise typer.Exit(1)

    if result.returncode != 0:
        console.print("[red]✗ Failed to stop containers:[/]")
        # Raw stderr may contain square brackets; print it without markup.
        console.print(result.stderr.strip(), markup=False)
        raise typer.Exit(1)

    console.print("[green]✓[/] Knowledge Master stopped")
77
+
78
+
79
@app.command()
def index(
    path: str = typer.Argument(..., help="Path to git repo or directory"),
    type: str = typer.Option("auto", "--type", "-t", help="Source type: auto, repo, docs"),
):
    """Index a git repo or directory of documents.

    With ``--type auto`` the path is treated as a repo when it contains a
    ``.git`` entry and as a docs directory otherwise. The result returned by
    the indexer is printed as JSON.

    Raises:
        typer.BadParameter: When ``--type`` is not one of auto, repo, docs
            (previously any unknown value was silently indexed as docs).
        typer.Exit: With code 1 when the path does not exist.
    """
    if type not in ("auto", "repo", "docs"):
        raise typer.BadParameter(
            f"unknown source type {type!r}; expected auto, repo or docs",
            param_hint="--type",
        )

    resolved = Path(path).expanduser().resolve()
    if not resolved.exists():
        console.print("[red]✗ Path does not exist:[/]")
        console.print(str(resolved), markup=False)
        raise typer.Exit(1)
    path = str(resolved)

    if type == "auto":
        type = "repo" if (resolved / ".git").exists() else "docs"

    graph = store.get_graph()
    store.init_schema(graph)

    if type == "repo":
        console.print(f"[bold blue]Indexing git repo:[/] {path}")
        result = git_repo.index_repo(path, graph)
    else:
        console.print(f"[bold blue]Indexing docs:[/] {path}")
        result = markdown.index_directory(path, graph)

    console.print(f"[green]✓ Done![/] {json.dumps(result)}")
101
+
102
+
103
@app.command()
def search(
    query: str = typer.Argument(..., help="Search query"),
    top_k: int = typer.Option(10, "--top", "-n", help="Number of results"),
):
    """Semantic search across the knowledge base.

    The query is embedded, passed to ``store.graph_context_search`` and the
    hits are rendered as a table with score, source, context (repo/author)
    and a preview of up to 100 characters of the chunk text.
    """
    # Local import: indexed text and paths routinely contain square brackets
    # (e.g. ``arr[i]``), which Rich would otherwise parse as markup tags and
    # either swallow or reject with a MarkupError.
    from rich.markup import escape

    graph = store.get_graph()
    vec = embeddings.embed(query)
    results = store.graph_context_search(graph, vec, top_k)

    table = Table(title=f"Results for: {escape(query)}")
    table.add_column("Score", width=6)
    table.add_column("Source", max_width=50)
    table.add_column("Context", width=25)
    table.add_column("Preview", max_width=80)

    for r in results:
        ctx_parts = []
        if r.get("repo"):
            ctx_parts.append(f"repo:{r['repo']}")
        if r.get("author"):
            ctx_parts.append(f"by:{r['author']}")

        text = r.get("text") or ""
        # Only mark the preview as truncated when something was cut off.
        preview = text[:100] + "..." if len(text) > 100 else text

        table.add_row(
            # A missing or null score is shown as 0.000 instead of crashing.
            f"{r.get('score') or 0:.3f}",
            escape(str(r.get("source") or "")),
            escape(", ".join(ctx_parts)),
            escape(preview),
        )

    console.print(table)
133
+
134
+
135
@app.command()
def blast_radius(
    target: str = typer.Argument(..., help="Service, file, or tech name to check"),
    depth: int = typer.Option(3, "--depth", "-d", help="Traversal depth"),
):
    """Show what depends on a target — the blast radius of changing it.

    The target is looked up as a ``Service`` first (following incoming
    relationships up to ``depth`` hops), then as a ``Tech`` (repos using it),
    then as a ``Document`` whose path contains the target (its repo and
    authors). The first lookup that returns rows is rendered as a tree.
    ``depth`` only affects the ``Service`` lookup; the other two are fixed
    one-hop queries.
    """
    graph = store.get_graph()

    # The upper bound of a variable-length pattern is interpolated into the
    # query text rather than passed as ``$param``. ``depth`` is an int
    # (enforced by Typer), so interpolation is safe; it is clamped to at
    # least one hop. Previously the option was accepted but ignored (the
    # query hard-coded ``*1..3``).
    max_hops = max(1, int(depth))

    # Try as Service first
    result = graph.query(
        f"""MATCH (target:Service {{name: $name}})
        OPTIONAL MATCH path = (other)-[*1..{max_hops}]->(target)
        WHERE other <> target
        RETURN labels(other)[0] AS type, other.name AS name,
        length(path) AS distance, type(last(relationships(path))) AS rel
        ORDER BY distance""",
        params={"name": target},
    )

    if not result.result_set:
        # Try as Tech
        result = graph.query(
            """MATCH (target:Tech {name: $name})
            OPTIONAL MATCH (r:Repo)-[:USES_TECH]->(target)
            RETURN 'Repo' AS type, r.name AS name, 1 AS distance, 'USES_TECH' AS rel""",
            params={"name": target},
        )

    if not result.result_set:
        # Try as file/document
        result = graph.query(
            """MATCH (target:Document) WHERE target.path CONTAINS $name
            OPTIONAL MATCH (c:Chunk)-[:PART_OF]->(target)
            OPTIONAL MATCH (p:Person)-[:AUTHORED]->(target)
            OPTIONAL MATCH (target)-[:IN_REPO]->(r:Repo)
            RETURN 'Repo' AS type, r.name AS name, 1 AS distance, 'CONTAINS' AS rel
            UNION
            MATCH (target:Document) WHERE target.path CONTAINS $name
            OPTIONAL MATCH (p:Person)-[:AUTHORED]->(target)
            RETURN 'Person' AS type, p.name AS name, 1 AS distance, 'AUTHORED' AS rel""",
            params={"name": target},
        )

    # OPTIONAL MATCH yields a row of nulls when the target exists but nothing
    # points at it, so "all names are None" also means "no dependencies".
    if not result.result_set or all(r[1] is None for r in result.result_set):
        console.print(f"[yellow]No dependencies found for:[/] {target}")
        console.print("[dim]Try: a service name, technology, or file path[/]")
        return

    icons = {"Repo": "📦", "Service": "⚙️", "Person": "👤", "Document": "📄", "Tech": "🔧"}
    tree = Tree(f"[bold red]💥 Blast radius: {target}[/]")
    seen = set()
    for node_type, name, _distance, rel in result.result_set:
        # Each entity is listed once, at its first appearance in the results.
        if name and name not in seen:
            seen.add(name)
            icon = icons.get(node_type, "•")
            tree.add(f"{icon} [bold]{name}[/] [dim]({node_type}, via {rel})[/]")

    console.print(tree)
    console.print(f"\n[dim]{len(seen)} entities affected[/]")
193
+
194
+
195
@app.command()
def check_conventions(
    path: str = typer.Argument(".", help="Path to check against conventions"),
):
    """Check if a file or repo follows detected conventions.

    Conventions linked to the repo (matched by directory name or resolved
    path) are loaded from the graph; when none are linked, every known
    convention is used. Each one is evaluated against the path, the outcome
    is printed (passes first, then violations), and the command exits with
    code 1 if anything is violated.
    """
    path = str(Path(path).expanduser().resolve())
    graph = store.get_graph()

    # Get conventions for this repo (or all if not found)
    rows = graph.query(
        """MATCH (r:Repo)-[:FOLLOWS]->(c:Convention)
        WHERE r.name = $name OR r.path = $path
        RETURN c.name, c.category""",
        params={"name": Path(path).name, "path": path},
    ).result_set

    if not rows:
        # Fall back to all conventions
        rows = graph.query("MATCH (c:Convention) RETURN c.name, c.category").result_set

    if not rows:
        console.print("[yellow]No conventions detected yet.[/] Index a repo first.")
        return

    console.print(f"[bold]Checking conventions for:[/] {path}\n")

    # Evaluate every convention once, then partition by outcome.
    outcomes = [
        (conv_name, category, _check_convention(path, conv_name))
        for conv_name, category in rows
    ]
    passes = [(name, cat) for name, cat, ok in outcomes if ok]
    violations = [(name, cat) for name, cat, ok in outcomes if not ok]

    for name, cat in passes:
        console.print(f" [green]✓[/] {name} [dim]({cat})[/]")
    for name, cat in violations:
        console.print(f" [red]✗[/] {name} [dim]({cat})[/]")

    if not violations:
        console.print(f"\n[green]All {len(passes)} conventions pass ✓[/]")
        return

    console.print(f"\n[red]{len(violations)} convention(s) violated[/]")
    raise typer.Exit(1)
241
+
242
+
243
def _convention_files(root: Path, suffixes: tuple[str, ...]) -> list[Path]:
    """Collect paths under *root* ending in one of *suffixes*, skipping vendored trees."""
    found = []
    for suffix in suffixes:
        for candidate in root.rglob(f"*{suffix}"):
            # Inspect only the components below ``root`` so that the location
            # of the repo itself can never exclude every file.
            parts = candidate.relative_to(root).parts
            if any(part.startswith(".venv") or part == "node_modules" for part in parts):
                continue
            found.append(candidate)
    return found


def _check_convention(path: str, convention: str) -> bool:
    """Check a single convention against a path.

    Supported conventions are the directory-layout checks (``src/``, a test
    directory, ``docs/``, infra directories) and the two file-naming checks.
    The naming checks ignore files inside ``.venv*`` and ``node_modules``
    directories, and the kebab-case check does not count Python dunder
    modules such as ``__init__.py``, whose names are fixed by the language.

    Args:
        path: Directory to inspect.
        convention: Convention name as stored in the graph.

    Returns:
        ``True`` when the convention holds, when there are no files to judge,
        or when the convention is unknown and therefore cannot be verified.
    """
    root = Path(path)

    if convention == "src/ directory":
        return (root / "src").is_dir()
    if convention == "separate test directory":
        return (root / "tests").is_dir() or (root / "test").is_dir()
    if convention == "docs/ directory":
        return (root / "docs").is_dir()
    if convention == "infra as code":
        return any((root / name).is_dir() for name in ("infra", "deploy", "k8s"))

    if convention == "snake_case files":
        files = _convention_files(root, (".py", ".ts", ".rs"))
        return not any(
            "-" in f.stem and not f.stem.startswith(".") for f in files
        )

    if convention == "kebab-case files":
        files = _convention_files(root, (".py", ".ts"))
        return not any(
            "_" in f.stem
            and not (f.stem.startswith("__") and f.stem.endswith("__"))
            for f in files
        )

    # Default: can't verify, assume pass
    return True
270
+
271
+
272
@app.command(name="list")
def list_sources():
    """List all indexed sources and stats.

    Prints the chunk, document and repo counts from ``store.get_stats``,
    followed by every ``Repo`` node and every ``Tech`` node whose category is
    ``language`` or ``infrastructure`` (each section only when non-empty).
    """
    graph = store.get_graph()
    stats = store.get_stats(graph)

    console.print("\n[bold]Knowledge Base Stats[/]")
    for label, key in (("Chunks", "chunks"), ("Documents", "documents"), ("Repos", "repos")):
        console.print(f" {label}: {stats[key]}")

    repo_rows = graph.query("MATCH (r:Repo) RETURN r.name, r.path").result_set
    if repo_rows:
        console.print("\n[bold]Repos:[/]")
        for repo_name, repo_path in repo_rows:
            shown_name = repo_name or "(unnamed)"
            console.print(f" • {shown_name} — {repo_path}")

    tech_rows = graph.query(
        "MATCH (t:Tech) WHERE t.category = 'language' OR t.category = 'infrastructure' RETURN t.name, t.category"
    ).result_set
    if tech_rows:
        console.print("\n[bold]Stack:[/]")
        for tech_name, tech_category in tech_rows:
            console.print(f" • {tech_name} ({tech_category})")
294
+
295
+
296
@app.command()
def remove(source: str = typer.Argument(..., help="Repo name or doc path to remove")):
    """Remove an indexed source and all its chunks.

    Looks up a ``Repo`` node by name and deletes it together with its
    documents, their chunks and the repo's outgoing relationships.

    NOTE(review): the help text mentions doc paths, but only ``Repo`` nodes
    are matched here — confirm whether document removal is intended.
    """
    graph = store.get_graph()

    # Decide "found" from the repo itself. Inferring it from the number of
    # deleted chunks reported "Not found" for a repo that had no chunks, even
    # though the repo had just been deleted.
    found = graph.query(
        "MATCH (r:Repo {name: $name}) RETURN count(r)",
        params={"name": source},
    )
    repo_count = found.result_set[0][0] if found.result_set else 0
    if not repo_count:
        console.print(f"[yellow]Not found:[/] {source}")
        return

    # DISTINCT: the OPTIONAL MATCH on (r)-[e]->() multiplies the rows, so a
    # plain count(c) would report each chunk once per outgoing relationship.
    result = graph.query(
        """MATCH (r:Repo {name: $name})
        OPTIONAL MATCH (d:Document)-[:IN_REPO]->(r)
        OPTIONAL MATCH (c:Chunk)-[:PART_OF]->(d)
        OPTIONAL MATCH (r)-[e]->()
        DELETE c, d, e, r
        RETURN count(DISTINCT c)""",
        params={"name": source},
    )
    deleted = result.result_set[0][0] if result.result_set else 0
    console.print(f"[green]✓ Removed[/] {source} ({deleted} chunks)")
314
+
315
+
316
@app.command()
def status():
    """Check system health.

    Probes the graph store (by fetching stats) and the embedding backend (by
    embedding a test string) and prints one line per backend. A failing probe
    is reported with its exception message; the other probe still runs.
    """
    probes = (
        ("FalkorDB", "connected", lambda: store.get_stats(store.get_graph())),
        ("Ollama", "ready", lambda: embeddings.embed("test")),
    )
    for label, ok_word, probe in probes:
        try:
            probe()
            console.print(f"[green]✓[/] {label}: {ok_word}")
        except Exception as e:
            console.print(f"[red]✗[/] {label}: {e}")
331
+
332
+
333
@app.command()
def serve(port: int = typer.Option(9999, help="Port for web UI")):
    """Start the web UI.

    Announces the URL and then runs the web application with uvicorn, bound
    to the loopback interface on the requested port.
    """
    # Imported lazily so the web stack is only loaded for this command.
    from .web import create_app
    import uvicorn

    host = "127.0.0.1"
    console.print(f"[bold green]Knowledge Master UI[/] → http://{host}:{port}")
    web_app = create_app()
    uvicorn.run(web_app, host=host, port=port)
341
+
342
+
343
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    app()
@@ -0,0 +1,21 @@
1
+ """Embedding client using Ollama local models."""
2
+
3
+ import ollama
4
+
5
# Name of the Ollama model used for every embedding request in this module.
MODEL = "nomic-embed-text"
6
+
7
+
8
def embed(text: str) -> list[float]:
    """Return the embedding vector for one text string.

    Sends the text to Ollama using ``MODEL`` and returns the first entry of
    the ``"embeddings"`` list in the response.
    """
    vectors = ollama.embed(model=MODEL, input=text)["embeddings"]
    return vectors[0]
12
+
13
+
14
def embed_batch(texts: list[str], batch_size: int = 64) -> list[list[float]]:
    """Embed multiple texts, sending them to Ollama in batches.

    Args:
        texts: Strings to embed. An empty list yields an empty result without
            contacting Ollama.
        batch_size: Maximum number of texts per request; must be at least 1.

    Returns:
        One vector per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value
            previously made the loop run zero times and silently returned
            ``[]`` for non-empty input.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    vectors: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        response = ollama.embed(model=MODEL, input=batch)
        vectors.extend(response["embeddings"])
    return vectors