knowledge-master 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowledge_master/__init__.py +0 -0
- knowledge_master/__main__.py +4 -0
- knowledge_master/chunking.py +106 -0
- knowledge_master/cli.py +344 -0
- knowledge_master/embeddings.py +21 -0
- knowledge_master/intelligence.py +254 -0
- knowledge_master/parsers/__init__.py +0 -0
- knowledge_master/parsers/git_repo.py +115 -0
- knowledge_master/parsers/markdown.py +58 -0
- knowledge_master/server.py +194 -0
- knowledge_master/store.py +164 -0
- knowledge_master/watcher.py +104 -0
- knowledge_master/web.py +568 -0
- knowledge_master-0.1.0.dist-info/METADATA +275 -0
- knowledge_master-0.1.0.dist-info/RECORD +19 -0
- knowledge_master-0.1.0.dist-info/WHEEL +5 -0
- knowledge_master-0.1.0.dist-info/entry_points.txt +3 -0
- knowledge_master-0.1.0.dist-info/licenses/LICENSE +21 -0
- knowledge_master-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""FalkorDB graph store - nodes, edges, vector search, and graph traversal."""
|
|
2
|
+
|
|
3
|
+
from falkordb import FalkorDB
|
|
4
|
+
|
|
5
|
+
# Name of the FalkorDB graph that get_graph() selects.
GRAPH_NAME = "knowledge"
|
|
6
|
+
|
|
7
|
+
# Vector dimension for nomic-embed-text
|
|
8
|
+
# The Chunk vector index created in init_schema() uses this same dimension.
VECTOR_DIM = 768
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_graph(host: str = "localhost", port: int = 6379):
    """Connect to FalkorDB and return a handle to the knowledge graph.

    Args:
        host: Hostname of the FalkorDB server.
        port: Port of the FalkorDB server.

    Returns:
        The graph selected by ``GRAPH_NAME`` on a new ``FalkorDB`` client.
    """
    return FalkorDB(host=host, port=port).select_graph(GRAPH_NAME)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def init_schema(graph):
    """Create the vector, full-text and lookup indexes used by the store.

    Each statement is attempted independently and a failure never aborts the
    others, so the function can be called on every start-up. Failures (for
    example because the index may already exist) are logged at debug level
    instead of being dropped silently.

    Args:
        graph: Graph handle exposing ``query(cypher)``, e.g. the object
            returned by ``get_graph``.
    """
    import logging

    log = logging.getLogger(__name__)

    statements = (
        # Vector index. The dimension is taken from VECTOR_DIM so the index
        # and the module constant cannot drift apart.
        f"CREATE VECTOR INDEX FOR (c:Chunk) ON (c.embedding) OPTIONS {{dimension: {VECTOR_DIM}, similarityFunction: 'cosine'}}",
        # Full-text index
        "CREATE FULLTEXT INDEX FOR (c:Chunk) ON (c.text)",
        # Regular indexes for lookups
        "CREATE INDEX FOR (d:Document) ON (d.path)",
        "CREATE INDEX FOR (r:Repo) ON (r.name)",
        "CREATE INDEX FOR (p:Person) ON (p.email)",
        "CREATE INDEX FOR (f:File) ON (f.path)",
    )
    for statement in statements:
        try:
            graph.query(statement)
        except Exception as exc:
            # Deliberately best-effort: keep going, but leave a trace so a
            # genuinely broken statement can be diagnosed.
            log.debug("Schema statement skipped (%s): %s", exc, statement)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def upsert_chunk(graph, chunk_id: str, text: str, embedding: list[float], metadata: dict):
    """Create the Chunk node with id ``chunk_id`` or refresh it in place.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)``.
        chunk_id: Value of the node's ``id`` property, used as the MERGE key.
        text: Stored as the node's ``text`` property.
        embedding: Stored as the node's ``embedding`` via ``vecf32``.
        metadata: Dict whose optional ``"source"`` and ``"source_type"``
            entries are copied onto the node; missing entries are stored as
            empty strings.
    """
    bindings = {
        "id": chunk_id,
        "text": text,
        "embedding": embedding,
        "source": metadata.get("source", ""),
        "source_type": metadata.get("source_type", ""),
    }
    cypher = """MERGE (c:Chunk {id: $id})
           SET c.text = $text, c.embedding = vecf32($embedding),
               c.source = $source, c.source_type = $source_type,
               c.indexed_at = timestamp()"""
    graph.query(cypher, params=bindings)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def upsert_document(graph, path: str, doc_type: str, metadata: dict):
    """Create the Document node keyed by ``path`` or refresh it in place.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)``.
        path: Value of the node's ``path`` property, used as the MERGE key.
        doc_type: Stored as the node's ``type`` property.
        metadata: Dict whose optional ``"title"`` entry becomes the node's
            title; a missing entry is stored as an empty string.
    """
    bindings = {"path": path, "type": doc_type, "title": metadata.get("title", "")}
    cypher = """MERGE (d:Document {path: $path})
           SET d.type = $type, d.title = $title, d.indexed_at = timestamp()"""
    graph.query(cypher, params=bindings)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def upsert_repo(graph, name: str, path: str):
    """Create the Repo node keyed by ``name`` or update its stored ``path``.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)``.
        name: Value of the node's ``name`` property, used as the MERGE key.
        path: Stored as the node's ``path`` property.
    """
    bindings = {"name": name, "path": path}
    graph.query("MERGE (r:Repo {name: $name}) SET r.path = $path", params=bindings)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def upsert_person(graph, name: str, email: str):
    """Create the Person node keyed by ``email`` or update its stored ``name``.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)``.
        name: Stored as the node's ``name`` property.
        email: Value of the node's ``email`` property, used as the MERGE key.
    """
    bindings = {"name": name, "email": email}
    graph.query("MERGE (p:Person {email: $email}) SET p.name = $name", params=bindings)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def link_chunk_to_document(graph, chunk_id: str, doc_path: str):
    """Ensure a ``PART_OF`` edge runs from a chunk to its document.

    Both endpoints are looked up with MATCH, so nothing is created when
    either node is absent.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)``.
        chunk_id: ``id`` of the Chunk node.
        doc_path: ``path`` of the Document node.
    """
    bindings = {"chunk_id": chunk_id, "doc_path": doc_path}
    cypher = """MATCH (c:Chunk {id: $chunk_id}), (d:Document {path: $doc_path})
           MERGE (c)-[:PART_OF]->(d)"""
    graph.query(cypher, params=bindings)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def link_document_to_repo(graph, doc_path: str, repo_name: str):
    """Ensure an ``IN_REPO`` edge runs from a document to its repository.

    Both endpoints are looked up with MATCH, so nothing is created when
    either node is absent.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)``.
        doc_path: ``path`` of the Document node.
        repo_name: ``name`` of the Repo node.
    """
    bindings = {"doc_path": doc_path, "repo_name": repo_name}
    cypher = """MATCH (d:Document {path: $doc_path}), (r:Repo {name: $repo_name})
           MERGE (d)-[:IN_REPO]->(r)"""
    graph.query(cypher, params=bindings)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def link_person_authored(graph, email: str, doc_path: str):
    """Ensure an ``AUTHORED`` edge runs from a person to a document.

    Both endpoints are looked up with MATCH, so nothing is created when
    either node is absent.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)``.
        email: ``email`` of the Person node.
        doc_path: ``path`` of the Document node.
    """
    bindings = {"email": email, "doc_path": doc_path}
    cypher = """MATCH (p:Person {email: $email}), (d:Document {path: $doc_path})
           MERGE (p)-[:AUTHORED]->(d)"""
    graph.query(cypher, params=bindings)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def vector_search(graph, query_embedding: list[float], top_k: int = 10, filters: dict = None):
    """Query the Chunk vector index for the neighbours of ``query_embedding``.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)`` whose
            result has a ``result_set`` of rows.
        query_embedding: Query vector, handed to the index through ``vecf32``.
        top_k: Number of neighbours requested from the vector index.
        filters: Optional dict. Only a truthy ``"source_type"`` entry is
            honoured; it restricts results to chunks with that exact
            ``source_type``. The restriction is applied to the rows the index
            call yields, so fewer than ``top_k`` results can come back.

    Returns:
        A list of dicts with keys ``id``, ``text``, ``source``,
        ``source_type`` and ``score``, in the order the query returns them.
    """
    bindings = {"embedding": query_embedding, "top_k": top_k}
    filter_clause = ""

    wanted_type = filters.get("source_type") if filters else None
    if wanted_type:
        filter_clause = "WHERE c.source_type = $source_type"
        bindings["source_type"] = wanted_type

    # NOTE(review): rows are sorted by descending score. If FalkorDB reports
    # `score` as a distance (smaller = closer) this puts the best match last
    # -- confirm against the FalkorDB vector index documentation.
    cypher = f"""CALL db.idx.vector.queryNodes('Chunk', 'embedding', $top_k, vecf32($embedding))
            YIELD node AS c, score
            {filter_clause}
            RETURN c.id AS id, c.text AS text, c.source AS source,
                   c.source_type AS source_type, score
            ORDER BY score DESC"""
    rows = graph.query(cypher, params=bindings).result_set

    hits = []
    for row in rows:
        hits.append(
            {
                "id": row[0],
                "text": row[1],
                "source": row[2],
                "source_type": row[3],
                "score": row[4],
            }
        )
    return hits
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def graph_context_search(graph, query_embedding: list[float], top_k: int = 5):
    """Hybrid search: vector lookup plus graph traversal for related context.

    The nearest chunks are fetched from the Chunk vector index and each one is
    enriched with its document, that document's repository and the document's
    author(s). Every hop is its own OPTIONAL MATCH, so a document that is not
    linked to a repository still contributes its path and author instead of
    being nulled out together with the missing repo.

    Args:
        graph: Graph handle exposing ``query(cypher, params=...)`` whose
            result has a ``result_set`` of rows.
        query_embedding: Query vector, handed to the index through ``vecf32``.
        top_k: Number of neighbours requested from the vector index.

    Returns:
        A list of dicts with keys ``text``, ``source``, ``score``,
        ``doc_path``, ``repo`` and ``author``. Context fields are ``None``
        when the corresponding node is absent. A chunk is repeated once per
        matching author row.
    """
    # NOTE(review): rows are sorted by descending score. If FalkorDB reports
    # `score` as a distance (smaller = closer) this puts the best match last
    # -- confirm against the FalkorDB vector index documentation.
    cypher = """CALL db.idx.vector.queryNodes('Chunk', 'embedding', $top_k, vecf32($embedding))
           YIELD node AS c, score
           OPTIONAL MATCH (c)-[:PART_OF]->(d:Document)
           OPTIONAL MATCH (d)-[:IN_REPO]->(r:Repo)
           OPTIONAL MATCH (p:Person)-[:AUTHORED]->(d)
           RETURN c.text AS text, c.source AS source, score,
                  d.path AS doc_path, r.name AS repo, p.name AS author
           ORDER BY score DESC"""
    result = graph.query(
        cypher,
        params={"embedding": query_embedding, "top_k": top_k},
    )
    return [
        {
            "text": row[0],
            "source": row[1],
            "score": row[2],
            "doc_path": row[3],
            "repo": row[4],
            "author": row[5],
        }
        for row in result.result_set
    ]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def get_stats(graph):
    """Count the Chunk, Document and Repo nodes in the graph.

    The Document and Repo counts use OPTIONAL MATCH. With a plain MATCH, a
    label that has no nodes produces no rows, the grouped ``count`` that
    follows then yields nothing, and every statistic would be reported as 0
    even though chunks exist.

    Args:
        graph: Graph handle exposing ``query(cypher)`` whose result has a
            ``result_set`` of rows.

    Returns:
        A dict with integer counts under ``chunks``, ``documents`` and
        ``repos``. All three are 0 if the query returns no row.
    """
    result = graph.query(
        """MATCH (c:Chunk) WITH count(c) AS chunks
           OPTIONAL MATCH (d:Document) WITH chunks, count(d) AS docs
           OPTIONAL MATCH (r:Repo) WITH chunks, docs, count(r) AS repos
           RETURN chunks, docs, repos"""
    )
    rows = result.result_set
    chunks, docs, repos = rows[0] if rows else (0, 0, 0)
    return {"chunks": chunks, "documents": docs, "repos": repos}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Incremental indexing — git hook and file watcher support."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from . import store
|
|
8
|
+
from .parsers.git_repo import _index_file, _should_index, INDEXABLE_EXTENSIONS
|
|
9
|
+
|
|
10
|
+
from git import Repo
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def index_changed_files(repo_path: str, since: str = "HEAD~1"):
    """Index only the files that changed since a given git ref.

    The changed paths come from ``git diff --name-only <since>`` and are
    filtered through ``_should_index``. A path that still exists on disk is
    re-indexed with ``_index_file``; a path that no longer exists is treated
    as deleted and its Document node and attached chunks are removed.

    Listing and per-file indexing are best-effort: a failure is logged as a
    warning and processing continues, instead of being swallowed silently.

    Args:
        repo_path: Path to the repository working tree (``~`` is expanded and
            the path is resolved).
        since: Git ref to diff against.

    Returns:
        A dict with ``repo`` (directory name of the repository), ``changed``
        (number of indexable paths reported by the diff, deletions included)
        and ``indexed`` (number of files indexed without error).
    """
    import logging

    log = logging.getLogger(__name__)

    repo_path = str(Path(repo_path).expanduser().resolve())
    repo = Repo(repo_path)
    repo_name = Path(repo_path).name
    graph = store.get_graph()
    store.init_schema(graph)
    store.upsert_repo(graph, repo_name, repo_path)

    # Get changed files
    try:
        diff_output = repo.git.diff("--name-only", since)
        changed = [f for f in diff_output.splitlines() if _should_index(f)]
    except Exception as exc:
        # For instance `since` may not resolve. Fall back to "nothing
        # changed", but say so.
        log.warning("Could not list files changed since %s in %s: %s", since, repo_path, exc)
        changed = []

    indexed = 0
    for filepath in changed:
        full_path = os.path.join(repo_path, filepath)
        if os.path.exists(full_path):
            try:
                _index_file(graph, full_path, filepath, repo_name, repo)
                indexed += 1
            except Exception as exc:
                # One bad file must not stop the rest of the batch.
                log.warning("Failed to index %s: %s", full_path, exc)
        else:
            # File was deleted -- remove its document and chunks.
            # NOTE(review): assumes Document nodes are keyed by the
            # repo-relative path; confirm against _index_file.
            graph.query(
                """MATCH (d:Document {path: $path})
                   OPTIONAL MATCH (c:Chunk)-[:PART_OF]->(d)
                   DELETE c, d""",
                params={"path": filepath},
            )

    return {"repo": repo_name, "changed": len(changed), "indexed": indexed}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def install_git_hook(repo_path: str):
    """Install a post-commit git hook that re-indexes the repository.

    The hook runs ``km index <repo> --type repo`` in the background with
    stderr discarded. The ``km`` script next to the running interpreter is
    used when it exists; otherwise the hook falls back to
    ``<python> -m knowledge_master.cli``. Every path is shell-quoted, so
    repositories or interpreters whose path contains spaces or shell
    metacharacters produce a working hook.

    Any existing ``post-commit`` hook is overwritten.

    Args:
        repo_path: Path to the repository working tree (``~`` is expanded and
            the path is resolved).

    Returns:
        The path of the written hook file, as a string.

    Raises:
        OSError: If ``<repo>/.git`` is missing or is not a directory, or the
            hook cannot be written.
    """
    import shlex

    repo_root = Path(repo_path).expanduser().resolve()
    hook_dir = repo_root / ".git" / "hooks"
    hook_file = hook_dir / "post-commit"

    # Find km executable
    km_bin = Path(sys.executable).parent / "km"
    if km_bin.exists():
        command = shlex.quote(str(km_bin))
    else:
        command = f"{shlex.quote(sys.executable)} -m knowledge_master.cli"

    hook_content = f"""#!/bin/sh
# Knowledge Master — auto-index on commit
{command} index {shlex.quote(str(repo_root))} --type repo 2>/dev/null &
"""

    # Create only the hooks directory itself: a missing `.git` must still
    # fail rather than turn an arbitrary directory into a fake repository.
    hook_dir.mkdir(exist_ok=True)
    hook_file.write_text(hook_content, encoding="utf-8")
    hook_file.chmod(0o755)
    return str(hook_file)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def watch_directory(path: str, callback=None):
    """Watch a directory for changes and re-index (polling, cross-platform).

    Every two seconds the tree under ``path`` is scanned for files matching
    ``INDEXABLE_EXTENSIONS`` (anything under ``.git`` or ``.venv`` is
    skipped) and modification times are compared with the previous scan.

    The loop has no exit condition; it runs until an exception such as
    ``KeyboardInterrupt`` propagates out of it.

    Args:
        path: Directory to watch (``~`` is expanded and the path is resolved).
        callback: Optional callable invoked as ``callback(changed, deleted)``
            with two lists of absolute path strings whenever something
            changed. Without a callback a summary is printed and, if the
            directory contains ``.git``, ``index_changed_files(path,
            "HEAD~1")`` is run.
    """
    import time

    root = Path(path).expanduser().resolve()
    path = str(root)

    def scan():
        """Map every indexable file under the root to its current mtime."""
        current = {}
        for ext in INDEXABLE_EXTENSIONS:
            for f in root.rglob(f"*{ext}"):
                if ".git" in f.parts or ".venv" in f.parts:
                    continue
                try:
                    current[str(f)] = f.stat().st_mtime
                except OSError:
                    # The file vanished (or became unreadable) between the
                    # directory walk and the stat call. Treat it as absent
                    # instead of crashing the watcher.
                    continue
        return current

    last_mtimes = scan()
    print(f"Watching {path} for changes... (Ctrl+C to stop)")

    while True:
        time.sleep(2)
        current = scan()
        # New files have no previous mtime, so they count as changed too.
        changed = [f for f, mtime in current.items() if last_mtimes.get(f) != mtime]
        deleted = [f for f in last_mtimes if f not in current]

        if changed or deleted:
            if callback:
                callback(changed, deleted)
            else:
                print(f"  Changed: {len(changed)}, Deleted: {len(deleted)}")
                if (root / ".git").exists():
                    result = index_changed_files(path, "HEAD~1")
                    print(f"  Indexed: {result}")

        last_mtimes = current
|