knowledge-master 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
+ """FalkorDB graph store - nodes, edges, vector search, and graph traversal."""
2
+
3
+ from falkordb import FalkorDB
4
+
5
# Name of the FalkorDB graph selected by get_graph().
GRAPH_NAME = "knowledge"

# Embedding width (number of floats per vector) for nomic-embed-text.
VECTOR_DIM = 768
9
+
10
+
11
def get_graph(host: str = "localhost", port: int = 6379):
    """Connect to FalkorDB and return a handle to the knowledge graph.

    Args:
        host: Hostname of the FalkorDB server.
        port: TCP port of the FalkorDB server.

    Returns:
        The graph object obtained by selecting ``GRAPH_NAME`` on the client.
    """
    client = FalkorDB(host=host, port=port)
    knowledge_graph = client.select_graph(GRAPH_NAME)
    return knowledge_graph
15
+
16
+
17
def init_schema(graph):
    """Create the vector, full-text, and lookup indexes used by the store.

    Every statement is issued independently and on a best-effort basis: a
    failure (typically because the index already exists) is logged at debug
    level and does not prevent the remaining statements from running.

    Args:
        graph: Object exposing ``query(cypher)``.

    Returns:
        None.
    """
    import logging

    logger = logging.getLogger(__name__)

    statements = (
        # Vector index. The dimension must stay in sync with VECTOR_DIM.
        "CREATE VECTOR INDEX FOR (c:Chunk) ON (c.embedding) OPTIONS {dimension: 768, similarityFunction: 'cosine'}",
        # Full-text index.
        "CREATE FULLTEXT INDEX FOR (c:Chunk) ON (c.text)",
        # Regular indexes for lookups.
        "CREATE INDEX FOR (d:Document) ON (d.path)",
        "CREATE INDEX FOR (r:Repo) ON (r.name)",
        "CREATE INDEX FOR (p:Person) ON (p.email)",
        "CREATE INDEX FOR (f:File) ON (f.path)",
    )
    for statement in statements:
        try:
            graph.query(statement)
        except Exception as exc:
            # Still best-effort, but leave a trace so that genuine failures
            # (bad syntax, server down) are not completely invisible.
            logger.debug("Index statement skipped (%s): %s", exc, statement)
35
+
36
+
37
def upsert_chunk(graph, chunk_id: str, text: str, embedding: list[float], metadata: dict):
    """Create the Chunk node identified by *chunk_id*, or refresh it.

    The node's text, embedding, source, source type, and indexing timestamp
    are overwritten on every call.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        chunk_id: Value of the chunk's ``id`` property (the MERGE key).
        text: Chunk text.
        embedding: Embedding vector, stored through ``vecf32``.
        metadata: Mapping read for ``"source"`` and ``"source_type"``; a
            missing key is stored as an empty string.
    """
    bindings = {
        "id": chunk_id,
        "text": text,
        "embedding": embedding,
        "source": metadata.get("source", ""),
        "source_type": metadata.get("source_type", ""),
    }
    cypher = """MERGE (c:Chunk {id: $id})
        SET c.text = $text, c.embedding = vecf32($embedding),
        c.source = $source, c.source_type = $source_type,
        c.indexed_at = timestamp()"""
    graph.query(cypher, params=bindings)
52
+
53
+
54
def upsert_document(graph, path: str, doc_type: str, metadata: dict):
    """Create the Document node for *path*, or refresh it if it exists.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        path: Value of the document's ``path`` property (the MERGE key).
        doc_type: Stored as the node's ``type`` property.
        metadata: Mapping read for ``"title"``; defaults to an empty string.
    """
    bindings = {
        "path": path,
        "type": doc_type,
        "title": metadata.get("title", ""),
    }
    cypher = """MERGE (d:Document {path: $path})
        SET d.type = $type, d.title = $title, d.indexed_at = timestamp()"""
    graph.query(cypher, params=bindings)
61
+
62
+
63
def upsert_repo(graph, name: str, path: str):
    """Create the Repo node called *name*, or update its stored path.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        name: Value of the repo's ``name`` property (the MERGE key).
        path: Stored as the node's ``path`` property.
    """
    cypher = "MERGE (r:Repo {name: $name}) SET r.path = $path"
    bindings = {"name": name, "path": path}
    graph.query(cypher, params=bindings)
69
+
70
+
71
def upsert_person(graph, name: str, email: str):
    """Create the Person node keyed by *email*, or update its name.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        name: Stored as the node's ``name`` property.
        email: Value of the person's ``email`` property (the MERGE key).
    """
    cypher = "MERGE (p:Person {email: $email}) SET p.name = $name"
    bindings = {"name": name, "email": email}
    graph.query(cypher, params=bindings)
77
+
78
+
79
def link_chunk_to_document(graph, chunk_id: str, doc_path: str):
    """Ensure a PART_OF edge exists from a chunk to its document.

    Both nodes are looked up with MATCH, so nothing is created when either
    one is missing.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        chunk_id: ``id`` property of the Chunk node.
        doc_path: ``path`` property of the Document node.
    """
    bindings = {"chunk_id": chunk_id, "doc_path": doc_path}
    cypher = """MATCH (c:Chunk {id: $chunk_id}), (d:Document {path: $doc_path})
        MERGE (c)-[:PART_OF]->(d)"""
    graph.query(cypher, params=bindings)
86
+
87
+
88
def link_document_to_repo(graph, doc_path: str, repo_name: str):
    """Ensure an IN_REPO edge exists from a document to its repo.

    Both nodes are looked up with MATCH, so nothing is created when either
    one is missing.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        doc_path: ``path`` property of the Document node.
        repo_name: ``name`` property of the Repo node.
    """
    bindings = {"doc_path": doc_path, "repo_name": repo_name}
    cypher = """MATCH (d:Document {path: $doc_path}), (r:Repo {name: $repo_name})
        MERGE (d)-[:IN_REPO]->(r)"""
    graph.query(cypher, params=bindings)
95
+
96
+
97
def link_person_authored(graph, email: str, doc_path: str):
    """Ensure an AUTHORED edge exists from a person to a document.

    Both nodes are looked up with MATCH, so nothing is created when either
    one is missing.

    Args:
        graph: Object exposing ``query(cypher, params=...)``.
        email: ``email`` property of the Person node.
        doc_path: ``path`` property of the Document node.
    """
    bindings = {"email": email, "doc_path": doc_path}
    cypher = """MATCH (p:Person {email: $email}), (d:Document {path: $doc_path})
        MERGE (p)-[:AUTHORED]->(d)"""
    graph.query(cypher, params=bindings)
104
+
105
+
106
def vector_search(graph, query_embedding: list[float], top_k: int = 10, filters: dict = None):
    """Run a vector-index query over Chunk nodes and return the hits.

    Args:
        graph: Object exposing ``query(cypher, params=...)`` whose result
            has a ``result_set`` of rows.
        query_embedding: Embedding of the search query.
        top_k: Number of neighbours requested from the vector index.
        filters: Optional mapping; only a truthy ``"source_type"`` entry is
            used, adding a WHERE clause on the yielded nodes. The clause is
            applied to the index results, so fewer than *top_k* rows may
            come back.

    Returns:
        A list of dicts with keys ``id``, ``text``, ``source``,
        ``source_type`` and ``score``, in the order returned by the query
        (``ORDER BY score DESC``).
    """
    bindings = {"embedding": query_embedding, "top_k": top_k}

    where = ""
    wants_source_type = bool(filters) and bool(filters.get("source_type"))
    if wants_source_type:
        where = "WHERE c.source_type = $source_type"
        bindings["source_type"] = filters["source_type"]

    cypher = f"""CALL db.idx.vector.queryNodes('Chunk', 'embedding', $top_k, vecf32($embedding))
        YIELD node AS c, score
        {where}
        RETURN c.id AS id, c.text AS text, c.source AS source,
        c.source_type AS source_type, score
        ORDER BY score DESC"""
    outcome = graph.query(cypher, params=bindings)

    columns = ("id", "text", "source", "source_type", "score")
    hits = []
    for row in outcome.result_set:
        hits.append({key: row[pos] for pos, key in enumerate(columns)})
    return hits
128
+
129
+
130
def graph_context_search(graph, query_embedding: list[float], top_k: int = 5):
    """Vector search over chunks, enriched with document, repo and author.

    After the vector-index lookup, each chunk is optionally joined to its
    Document (PART_OF), that document's Repo (IN_REPO) and any Person who
    AUTHORED the document. The joins are OPTIONAL MATCHes, so the related
    fields are ``None`` when no such path exists.

    Args:
        graph: Object exposing ``query(cypher, params=...)`` whose result
            has a ``result_set`` of rows.
        query_embedding: Embedding of the search query.
        top_k: Number of neighbours requested from the vector index.

    Returns:
        A list of dicts with keys ``text``, ``source``, ``score``,
        ``doc_path``, ``repo`` and ``author``.
    """
    bindings = {"embedding": query_embedding, "top_k": top_k}
    cypher = """CALL db.idx.vector.queryNodes('Chunk', 'embedding', $top_k, vecf32($embedding))
        YIELD node AS c, score
        OPTIONAL MATCH (c)-[:PART_OF]->(d:Document)-[:IN_REPO]->(r:Repo)
        OPTIONAL MATCH (p:Person)-[:AUTHORED]->(d)
        RETURN c.text AS text, c.source AS source, score,
        d.path AS doc_path, r.name AS repo, p.name AS author
        ORDER BY score DESC"""
    outcome = graph.query(cypher, params=bindings)

    columns = ("text", "source", "score", "doc_path", "repo", "author")
    hits = []
    for row in outcome.result_set:
        hits.append({key: row[pos] for pos, key in enumerate(columns)})
    return hits
153
+
154
+
155
def get_stats(graph):
    """Return node counts for chunks, documents, and repos.

    Args:
        graph: Object exposing ``query(cypher)`` whose result has a
            ``result_set`` of rows.

    Returns:
        A dict with integer counts under ``"chunks"``, ``"documents"`` and
        ``"repos"``. All three are 0 if the query returns no rows.
    """
    # The Document and Repo lookups must be OPTIONAL: a plain MATCH on a
    # label with no nodes yields zero rows, and the grouped count() that
    # follows would then drop the row carrying the counts gathered so far,
    # making every statistic read as 0.
    result = graph.query(
        """MATCH (c:Chunk) WITH count(c) AS chunks
        OPTIONAL MATCH (d:Document) WITH chunks, count(d) AS docs
        OPTIONAL MATCH (r:Repo) WITH chunks, docs, count(r) AS repos
        RETURN chunks, docs, repos"""
    )
    rows = result.result_set
    row = rows[0] if rows else [0, 0, 0]
    return {"chunks": row[0], "documents": row[1], "repos": row[2]}
@@ -0,0 +1,104 @@
1
+ """Incremental indexing — git hook and file watcher support."""
2
+
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from . import store
8
+ from .parsers.git_repo import _index_file, _should_index, INDEXABLE_EXTENSIONS
9
+
10
+ from git import Repo
11
+
12
+
13
def index_changed_files(repo_path: str, since: str = "HEAD~1"):
    """Index only the files that changed since a given git ref.

    The repo node is upserted, ``git diff --name-only <since>`` is used to
    list changed paths, and each path accepted by ``_should_index`` is
    either re-indexed (if it still exists on disk) or has its Document node
    and attached chunks deleted (if it no longer exists).

    Failures are best-effort: if the diff cannot be computed nothing is
    indexed, and a file that fails to index is skipped. Both cases are
    logged as warnings instead of being dropped silently.

    Args:
        repo_path: Path to the git working tree; ``~`` is expanded and the
            path is resolved.
        since: Git ref to diff against.

    Returns:
        A dict with ``"repo"`` (directory name of the repo), ``"changed"``
        (number of indexable changed paths) and ``"indexed"`` (number of
        files indexed successfully).
    """
    import logging

    logger = logging.getLogger(__name__)

    repo_path = str(Path(repo_path).expanduser().resolve())
    repo = Repo(repo_path)
    repo_name = Path(repo_path).name
    graph = store.get_graph()
    store.init_schema(graph)
    store.upsert_repo(graph, repo_name, repo_path)

    # Get changed files
    try:
        diff_output = repo.git.diff("--name-only", since)
        changed = [f for f in diff_output.splitlines() if _should_index(f)]
    except Exception:
        # E.g. the ref does not exist (a repo with a single commit has no
        # HEAD~1). Treat as "nothing changed" but say why.
        logger.warning(
            "Could not list files changed since %r in %s; nothing indexed",
            since,
            repo_path,
            exc_info=True,
        )
        changed = []

    indexed = 0
    for filepath in changed:
        full_path = os.path.join(repo_path, filepath)
        if os.path.exists(full_path):
            try:
                _index_file(graph, full_path, filepath, repo_name, repo)
            except Exception:
                # One bad file must not abort the whole run.
                logger.warning("Failed to index %s", full_path, exc_info=True)
            else:
                indexed += 1
        else:
            # File was deleted — remove its chunks
            graph.query(
                """MATCH (d:Document {path: $path})
                OPTIONAL MATCH (c:Chunk)-[:PART_OF]->(d)
                DELETE c, d""",
                params={"path": filepath},
            )

    return {"repo": repo_name, "changed": len(changed), "indexed": indexed}
48
+
49
+
50
def install_git_hook(repo_path: str):
    """Install a post-commit git hook that triggers indexing of the repo.

    The hook runs the ``km`` executable found next to the current Python
    interpreter, or falls back to ``<python> -m knowledge_master.cli`` when
    no such executable exists. The command is started in the background with
    stderr discarded. Any existing ``post-commit`` hook is overwritten.

    Args:
        repo_path: Path to the git working tree; ``~`` is expanded and the
            path is resolved.

    Returns:
        The path of the written hook file, as a string.

    Raises:
        OSError: If the hook cannot be written, e.g. ``.git`` does not exist.
    """
    import shlex

    repo_path = str(Path(repo_path).expanduser().resolve())
    hook_dir = Path(repo_path) / ".git" / "hooks"
    hook_file = hook_dir / "post-commit"

    # Find km executable. Every path is shell-quoted because the hook is
    # interpreted by /bin/sh, where spaces or metacharacters in an install
    # or repo path would otherwise split or alter the command.
    km_bin = Path(sys.executable).parent / "km"
    if km_bin.exists():
        km_cmd = shlex.quote(str(km_bin))
    else:
        km_cmd = f"{shlex.quote(sys.executable)} -m knowledge_master.cli"

    hook_content = (
        "#!/bin/sh\n"
        "# Knowledge Master — auto-index on commit\n"
        f"{km_cmd} index {shlex.quote(repo_path)} --type repo 2>/dev/null &\n"
    )

    # The hooks directory can be absent (e.g. a custom init template); it is
    # created without parents so a missing .git still raises.
    hook_dir.mkdir(exist_ok=True)
    # Explicit encoding: the hook text contains a non-ASCII dash.
    hook_file.write_text(hook_content, encoding="utf-8")
    hook_file.chmod(0o755)
    return str(hook_file)
69
+
70
+
71
def watch_directory(path: str, callback=None, interval: float = 2.0):
    """Poll a directory for file changes and re-index (polling is used for
    cross-platform support).

    Files whose suffix is in ``INDEXABLE_EXTENSIONS`` are tracked by
    modification time; anything under a ``.git`` or ``.venv`` directory is
    ignored. This function loops forever and only ends when an exception
    (such as ``KeyboardInterrupt``) propagates.

    Args:
        path: Directory to watch; ``~`` is expanded and the path is resolved.
        callback: Optional callable invoked as ``callback(changed, deleted)``
            with two lists of path strings whenever something changed. When
            omitted, a summary is printed and, if the directory contains
            ``.git``, ``index_changed_files(path, "HEAD~1")`` is run.
        interval: Seconds to sleep between scans.
    """
    import time

    path = str(Path(path).expanduser().resolve())

    def scan():
        """Return ``{path: mtime}`` for every watched file present."""
        current = {}
        for ext in INDEXABLE_EXTENSIONS:
            for f in Path(path).rglob(f"*{ext}"):
                if ".git" in f.parts or ".venv" in f.parts:
                    continue
                try:
                    current[str(f)] = f.stat().st_mtime
                except OSError:
                    # The file vanished (or became unreadable) between the
                    # directory listing and the stat call; treat it as absent
                    # rather than crashing the watcher.
                    continue
        return current

    last_mtimes = scan()
    print(f"Watching {path} for changes... (Ctrl+C to stop)")

    while True:
        time.sleep(interval)
        current = scan()
        # New files have no previous mtime, so they count as changed too.
        changed = [f for f, mtime in current.items() if last_mtimes.get(f) != mtime]
        deleted = [f for f in last_mtimes if f not in current]

        if changed or deleted:
            if callback:
                callback(changed, deleted)
            else:
                print(f"  Changed: {len(changed)}, Deleted: {len(deleted)}")
                if (Path(path) / ".git").exists():
                    result = index_changed_files(path, "HEAD~1")
                    print(f"  Indexed: {result}")

        last_mtimes = current