nelgraph 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nelgraph/__init__.py +185 -0
- nelgraph/cli.py +144 -0
- nelgraph/community/__init__.py +1 -0
- nelgraph/community/detector.py +68 -0
- nelgraph/community/summarizer.py +208 -0
- nelgraph/config.py +79 -0
- nelgraph/core/__init__.py +1 -0
- nelgraph/core/init_pipeline.py +341 -0
- nelgraph/core/sync_pipeline.py +212 -0
- nelgraph/docker-compose.yml +24 -0
- nelgraph/embeddings/__init__.py +1 -0
- nelgraph/embeddings/chroma_client.py +222 -0
- nelgraph/embeddings/embedder.py +92 -0
- nelgraph/extractors/__init__.py +1 -0
- nelgraph/extractors/llm_extractor.py +182 -0
- nelgraph/extractors/testing_enricher.py +327 -0
- nelgraph/graph/__init__.py +1 -0
- nelgraph/graph/builder.py +274 -0
- nelgraph/graph/neo4j_client.py +161 -0
- nelgraph/graph/schema.py +43 -0
- nelgraph/initialize_graph.py +174 -0
- nelgraph/knowledge_base.py +255 -0
- nelgraph/parsers/__init__.py +1 -0
- nelgraph/parsers/ast_parser.py +11 -0
- nelgraph/parsers/base_parser.py +529 -0
- nelgraph/parsers/doc_parser.py +44 -0
- nelgraph/parsers/git_parser.py +105 -0
- nelgraph/parsers/php_parser.py +396 -0
- nelgraph/query/__init__.py +1 -0
- nelgraph/query/engine.py +157 -0
- nelgraph/updater/__init__.py +1 -0
- nelgraph/updater/git_hook.py +109 -0
- nelgraph/updater/watcher.py +70 -0
- nelgraph-1.0.0.dist-info/METADATA +94 -0
- nelgraph-1.0.0.dist-info/RECORD +37 -0
- nelgraph-1.0.0.dist-info/WHEEL +4 -0
- nelgraph-1.0.0.dist-info/entry_points.txt +2 -0
nelgraph/__init__.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
GraphRAG Knowledge Base — Internal Python Module
|
|
3
|
+
|
|
4
|
+
Quickest way to get started:
|
|
5
|
+
|
|
6
|
+
import nelgraph as graphrag
|
|
7
|
+
graphrag.configure(codebase_path="/path/to/project", openrouter_api_key="sk-...")
|
|
8
|
+
graphrag.run_init()
|
|
9
|
+
|
|
10
|
+
ctx = graphrag.get_function_context("processOrder")
|
|
11
|
+
snap = graphrag.get_snapshot()
|
|
12
|
+
changes = graphrag.get_changes("abc123f")
|
|
13
|
+
graphrag.mark_tested("processOrder")
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
# --- Public API ---
|
|
17
|
+
from nelgraph.knowledge_base import (
|
|
18
|
+
get_function_context,
|
|
19
|
+
get_snapshot,
|
|
20
|
+
get_changes,
|
|
21
|
+
mark_tested,
|
|
22
|
+
search,
|
|
23
|
+
run_init,
|
|
24
|
+
run_sync,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
__version__ = "1.0.0"
|
|
28
|
+
|
|
29
|
+
# Names exported by ``from nelgraph import *``: the helpers re-exported from
# ``nelgraph.knowledge_base`` plus the functions defined in this module.
__all__ = [
    "configure",
    "status",
    "get_function_context",
    "get_snapshot",
    "get_changes",
    "mark_tested",
    "search",
    "run_init",
    "run_sync",
]
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def configure(
    codebase_path: str = None,
    openrouter_api_key: str = None,
    neo4j_uri: str = None,
    neo4j_password: str = None,
    neo4j_user: str = None,
    llm_model: str = None,
    embedding_model: str = None,
    embedding_dimensions: int = None,
):
    """
    Configure nelgraph from code instead of a ``.env`` file.

    Call this BEFORE using any other function of the package. Only arguments
    with a truthy value are applied; every other setting keeps its current
    value in ``nelgraph.config``.

    Args:
        codebase_path: Path to the codebase to analyze. It is made absolute,
            normalised to forward slashes, and the paths derived from it
            (``.graphrag_data``, Neo4j data/logs, ChromaDB, sync state file)
            are recomputed.
        openrouter_api_key: OpenRouter API key. Setting it also resets the
            cached AI clients so they are rebuilt with the new key.
        neo4j_uri: Neo4j connection URI.
        neo4j_password: Neo4j password.
        neo4j_user: Neo4j username.
        llm_model: OpenRouter model ID used for LLM enrichment.
        embedding_model: OpenRouter model ID used for embeddings.
        embedding_dimensions: Size of the embedding vectors.

    Example:
        import nelgraph
        nelgraph.configure(
            codebase_path="/home/user/opensourcepos",
            openrouter_api_key="sk-or-...",
        )
    """
    import os
    import nelgraph.config as _cfg

    if codebase_path:
        codebase_path = os.path.abspath(codebase_path).replace("\\", "/")
        _cfg.CODEBASE_PATH = codebase_path
        os.environ["CODEBASE_PATH"] = codebase_path

        # Recalculate every path that is derived from the codebase location.
        data_dir = os.path.join(codebase_path, ".graphrag_data").replace("\\", "/")
        _cfg.GRAPHRAG_DATA_DIR = data_dir
        _cfg.NEO4J_DATA_DIR = os.path.join(data_dir, "neo4j", "data").replace("\\", "/")
        _cfg.NEO4J_LOGS_DIR = os.path.join(data_dir, "neo4j", "logs").replace("\\", "/")
        _cfg.CHROMA_PATH = os.path.join(data_dir, "chromadb").replace("\\", "/")
        _cfg.SYNC_STATE_PATH = os.path.join(data_dir, "sync_state.json").replace("\\", "/")

    if openrouter_api_key:
        _cfg.OPENROUTER_API_KEY = openrouter_api_key
        os.environ["OPENROUTER_API_KEY"] = openrouter_api_key
        # Reset lazy clients so they pick up the new key.
        _reset_ai_clients()

    neo4j_settings_changed = False

    if neo4j_uri:
        _cfg.NEO4J_URI = neo4j_uri
        os.environ["NEO4J_URI"] = neo4j_uri
        neo4j_settings_changed = True

    if neo4j_password:
        _cfg.NEO4J_PASSWORD = neo4j_password
        os.environ["NEO4J_PASSWORD"] = neo4j_password
        neo4j_settings_changed = True

    if neo4j_user:
        _cfg.NEO4J_USER = neo4j_user
        neo4j_settings_changed = True

    if neo4j_settings_changed:
        # Drop the cached Neo4j client whenever ANY connection setting changed
        # (not only the password), so a client created earlier is not reused
        # with a stale URI or user.
        import nelgraph.graph.neo4j_client as _nc
        _nc._client = None

    if llm_model:
        _cfg.LLM_MODEL = llm_model

    if embedding_model:
        _cfg.EMBEDDING_MODEL = embedding_model

    if embedding_dimensions:
        _cfg.EMBEDDING_DIMENSIONS = embedding_dimensions
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _reset_ai_clients():
    """Clear every lazily created OpenAI client so it is rebuilt with the new config.

    Each listed module keeps its client in a module-level variable; setting
    that variable to ``None`` forces the next use to create a fresh client.
    The reset is best-effort: a module that fails to import is skipped so that
    one unavailable component does not block the others.
    """
    import importlib

    # (module path, name of the module-level variable holding the cached client)
    cached_clients = (
        ("nelgraph.embeddings.embedder", "_openai_client"),
        ("nelgraph.extractors.testing_enricher", "_client_ai"),
        ("nelgraph.extractors.llm_extractor", "_client"),
        ("nelgraph.community.summarizer", "_client_ai"),
    )

    for module_name, attr_name in cached_clients:
        try:
            setattr(importlib.import_module(module_name), attr_name, None)
        except Exception:
            # Deliberately broad: importing these modules pulls in optional
            # third-party packages, and a failure there must not abort the reset.
            continue
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def status() -> dict:
    """
    Return the current state of the graph.

    Neo4j does not have to be running: if the query fails for any reason the
    report simply keeps ``"neo4j": "offline"`` and zero counts.

    Returns:
        {
            "neo4j": "connected" | "offline",
            "codebase_path": "...",
            "last_sync": "...",
            "total_functions": 0,
            "enriched_functions": 0,
        }
    """
    import nelgraph.config as _cfg
    from nelgraph.initialize_graph import _load_sync_state

    # Start from the "offline" picture and fill in whatever can be obtained.
    report = {
        "neo4j": "offline",
        "codebase_path": _cfg.CODEBASE_PATH,
        "last_sync": None,
        "total_functions": 0,
        "enriched_functions": 0,
    }

    saved_state = _load_sync_state()
    if saved_state:
        report["last_sync"] = saved_state.get("last_sync_time")

    try:
        from nelgraph.graph.neo4j_client import get_client

        # A function counts as "enriched" once it has a how_it_works property.
        rows = get_client().run("""
            OPTIONAL MATCH (f:Function) WITH count(f) as total
            OPTIONAL MATCH (f2:Function) WHERE f2.how_it_works IS NOT NULL
            RETURN total, count(f2) as enriched
        """)
        if rows:
            report["neo4j"] = "connected"
            first_row = rows[0]
            report["total_functions"] = first_row["total"]
            report["enriched_functions"] = first_row["enriched"]
    except Exception:
        # Any failure here is reported as "offline" rather than raised.
        pass

    return report
|
nelgraph/cli.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
import click
|
|
2
|
+
from rich.console import Console
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import requests
|
|
6
|
+
import nelgraph
|
|
7
|
+
|
|
8
|
+
# On Windows, switch both standard streams to UTF-8 (best-effort).
# NOTE(review): presumably needed because this CLI prints non-ASCII symbols
# such as "✓" and "→" — confirm.
if sys.platform.startswith("win"):
    try:
        for _stream in (sys.stdout, sys.stderr):
            _stream.reconfigure(encoding="utf-8")
    except Exception:
        pass

# Shared rich console used by every command in this module.
console = Console()
|
|
16
|
+
|
|
17
|
+
def _check_for_updates():
    """Print an upgrade hint when PyPI reports a version other than the installed one.

    Best-effort: every failure (network error, unexpected JSON, import
    problem) is ignored so the CLI keeps working.
    """
    try:
        # The 2-second timeout keeps a slow network from blocking the CLI.
        response = requests.get("https://pypi.org/pypi/nelgraph/json", timeout=2)
        published = response.json()["info"]["version"]
        from nelgraph import __version__ as installed

        if published == installed:
            return
        console.print(
            f"[yellow]Update available: {installed} → {published}[/yellow]\n"
            f"Run: [bold]pip install --upgrade nelgraph[/bold]"
        )
    except Exception:
        pass
|
|
30
|
+
|
|
31
|
+
@click.group()
def main():
    """nelgraph — Codebase Knowledge Graph & Semantic Search CLI"""
    # The group callback only performs the (best-effort) update check;
    # the actual work lives in the subcommands registered on this group.
    _check_for_updates()
|
|
35
|
+
|
|
36
|
+
def _install_git_hook(codebase_path: str):
    """Install a Git ``post-commit`` hook that runs ``nelgraph sync --silent`` in the background.

    Args:
        codebase_path: Root directory of the project; its ``.git/hooks``
            directory receives the hook.

    Behaviour:
        * Returns silently when ``<codebase_path>/.git`` is not a directory.
        * An existing ``post-commit`` hook that does not already invoke
          nelgraph is left untouched and a warning is printed, so a hook the
          user wrote is never destroyed.
        * Any I/O error is reported as a warning instead of being raised.
    """
    import stat

    git_dir = os.path.join(codebase_path, ".git")
    # isdir rather than exists: in Git worktrees and submodules ".git" is a
    # plain file, and creating "hooks" underneath it would raise.
    if not os.path.isdir(git_dir):
        return

    hook_command = "nelgraph sync --silent &"
    # Simple shell hook to run nelgraph sync in background silently.
    hook_content = (
        "#!/bin/sh\n"
        "# Auto-sync graph in background after commit\n"
        f"{hook_command}\n"
    )

    try:
        hooks_dir = os.path.join(git_dir, "hooks")
        os.makedirs(hooks_dir, exist_ok=True)
        hook_path = os.path.join(hooks_dir, "post-commit")

        if os.path.exists(hook_path):
            with open(hook_path, "r", encoding="utf-8", errors="replace") as f:
                existing = f.read()
            if hook_command not in existing:
                console.print(
                    "[yellow]⚠ Warning: A Git post-commit hook already exists and was left unchanged. "
                    f"Add '{hook_command}' to {hook_path} to enable auto-sync.[/yellow]"
                )
                return
            # The hook already calls nelgraph: keep its content as it is.
        else:
            with open(hook_path, "w", newline="\n", encoding="utf-8") as f:
                f.write(hook_content)

        # Make hook executable.
        mode = os.stat(hook_path).st_mode
        os.chmod(hook_path, mode | stat.S_IEXEC)
        console.print("[green]✓ Git post-commit hook installed successfully.[/green]")
    except Exception as e:
        console.print(f"[yellow]⚠ Warning: Could not install Git post-commit hook: {e}[/yellow]")
|
|
59
|
+
|
|
60
|
+
@main.command()
@click.option("--key", help="OpenRouter API key")
@click.option("--path", default=".", help="Path to codebase (default: current dir)")
def init(key, path):
    """
    Khởi tạo GraphRAG cho project hiện tại.
    Tạo .env, start Neo4j Docker, parse + enrich toàn bộ codebase.
    """
    # The docstring above is shown by click as the command's help text, so it
    # is kept verbatim. In English: "Initialize GraphRAG for the current
    # project. Create .env, start Neo4j Docker, parse + enrich the whole
    # codebase."
    from dotenv import load_dotenv

    abs_path = os.path.abspath(path).replace("\\", "/")

    # Load env from target path if it exists, so an already stored key is found.
    env_path = os.path.join(abs_path, ".env")
    if os.path.exists(env_path):
        load_dotenv(env_path)

    # Key precedence: --key option, then environment/.env, then interactive prompt.
    api_key = key or os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        api_key = click.prompt("OpenRouter API key", hide_input=True)

    # Write or update .env in the target directory, keeping unrelated lines.
    existing_lines = []
    if os.path.exists(env_path):
        with open(env_path, "r", encoding="utf-8") as f:
            existing_lines = f.readlines()

    updated_lines = _merge_env_lines(
        existing_lines,
        {"OPENROUTER_API_KEY": api_key, "CODEBASE_PATH": abs_path},
    )
    with open(env_path, "w", encoding="utf-8") as f:
        f.writelines(updated_lines)

    console.print(f"[green]✓ Configured {env_path}[/green]")

    # Programmatically configure nelgraph.
    nelgraph.configure(codebase_path=abs_path, openrouter_api_key=api_key)

    # Run full initialization pipeline.
    nelgraph.run_init()

    # Install git post-commit hook.
    _install_git_hook(abs_path)


def _merge_env_lines(lines, updates):
    """Return the lines of a ``.env`` file with every key in *updates* set to its value.

    Lines that start with ``KEY=`` (ignoring surrounding whitespace) are
    replaced in place; keys not present are appended in the order of
    *updates*. All other lines are kept unchanged.
    """
    merged = []
    seen = set()
    for line in lines:
        stripped = line.strip()
        for env_key, env_value in updates.items():
            if stripped.startswith(f"{env_key}="):
                merged.append(f"{env_key}={env_value}\n")
                seen.add(env_key)
                break
        else:
            merged.append(line)

    missing = [env_key for env_key in updates if env_key not in seen]
    # If the file did not end with a newline, terminate its last line first;
    # otherwise the first appended entry would be glued onto that line.
    if missing and merged and not merged[-1].endswith("\n"):
        merged[-1] += "\n"
    merged.extend(f"{env_key}={updates[env_key]}\n" for env_key in missing)
    return merged
|
|
117
|
+
|
|
118
|
+
@main.command()
@click.option("--silent", is_flag=True, help="Run silently without printing to stdout")
def sync(silent):
    """Sync thủ công — parse files đã thay đổi kể từ lần sync cuối."""
    # The docstring is click's help text and is kept verbatim. In English:
    # "Manual sync — parse files changed since the last sync."
    if silent:
        # Send everything written to stdout and stderr to the null device.
        for stream_name in ("stdout", "stderr"):
            setattr(sys, stream_name, open(os.devnull, 'w'))

    # Run sync pipeline.
    nelgraph.run_sync()
|
|
129
|
+
|
|
130
|
+
@main.command()
def status():
    """Xem trạng thái graph hiện tại."""
    # The docstring is click's help text and is kept verbatim. In English:
    # "Show the current graph status."
    # Imported here rather than at module level, so it is only loaded when
    # this command actually runs.
    from nelgraph.initialize_graph import run_status as show_status

    show_status()
|
|
136
|
+
|
|
137
|
+
@main.command()
def watch():
    """Chạy file watcher — tự sync khi có file thay đổi."""
    # The docstring is click's help text and is kept verbatim. In English:
    # "Run the file watcher — sync automatically when files change."
    # Imported here rather than at module level, so it is only loaded when
    # this command actually runs.
    from nelgraph.updater.watcher import start_watcher as run_watcher

    run_watcher()
|
|
142
|
+
|
|
143
|
+
if __name__ == "__main__":
|
|
144
|
+
main()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# GraphRAG Community Detection and Summarization Package
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import networkx as nx
|
|
2
|
+
import igraph as ig
|
|
3
|
+
import leidenalg
|
|
4
|
+
from nelgraph.graph.neo4j_client import get_client
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def build_networkx_graph() -> nx.Graph:
    """Load the Neo4j graph into an undirected NetworkX graph for graph algorithms.

    Only nodes that have a ``name`` property are included, and only
    relationships whose two endpoints both have one. Nodes are keyed by their
    Neo4j element id and carry ``name`` and ``label`` attributes (the first
    Neo4j label, or ``"Unknown"`` when the node has none); edges carry the
    relationship type as ``rel_type``.
    """
    db = get_client()
    graph = nx.Graph()

    # Nodes
    node_rows = db.run("MATCH (n) WHERE n.name IS NOT NULL RETURN elementId(n) as id, labels(n) as labels, n.name as name")
    for row in node_rows:
        primary_label = row["labels"][0] if row["labels"] else "Unknown"
        graph.add_node(row["id"], name=row["name"], label=primary_label)

    # Edges (direction is dropped because the target graph is undirected)
    edge_rows = db.run("MATCH (a)-[r]->(b) WHERE a.name IS NOT NULL AND b.name IS NOT NULL RETURN elementId(a) as from_id, elementId(b) as to_id, type(r) as rel_type")
    for row in edge_rows:
        graph.add_edge(row["from_id"], row["to_id"], rel_type=row["rel_type"])

    print(f"[Community] NetworkX graph: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")
    return graph
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def detect_communities() -> dict:
    """
    Run the Leiden algorithm (igraph + leidenalg) on the Neo4j graph.

    The graph is loaded through ``build_networkx_graph()``, converted to an
    igraph graph, partitioned with ``ModularityVertexPartition``, and the
    resulting community id is written to each node's ``community_id`` property
    in Neo4j.

    Returns:
        Mapping of Neo4j element id -> community id; empty when the graph has
        no nodes.

    NOTE(review): no seed is passed to ``find_partition``, so community ids
    are presumably not reproducible between runs — confirm if that matters.
    """
    G = build_networkx_graph()
    if G.number_of_nodes() == 0:
        print("[Community] Empty graph, skipping community detection.")
        return {}

    # igraph addresses vertices by consecutive integers, so map every Neo4j
    # element id to its position in nx_nodes.
    nx_nodes = list(G.nodes())
    vertex_index = {node: idx for idx, node in enumerate(nx_nodes)}

    ig_graph = ig.Graph()
    ig_graph.add_vertices(len(nx_nodes))

    # Insert all edges with one add_edges() call instead of one add_edge()
    # call per edge, which is much slower on large graphs.
    edge_list = [(vertex_index[u], vertex_index[v]) for u, v in G.edges()]
    if edge_list:
        ig_graph.add_edges(edge_list)

    # Run Leiden algorithm.
    partition = leidenalg.find_partition(ig_graph, leidenalg.ModularityVertexPartition)

    # Build result mapping: neo4j element id -> community id.
    result = {
        nx_nodes[member_idx]: community_id
        for community_id, members in enumerate(partition)
        for member_idx in members
    }

    print(f"[Community] Detected {len(partition)} communities.")

    # Save community IDs to Neo4j.
    client = get_client()
    for node_id, community_id in result.items():
        client.run("""
            MATCH (n) WHERE elementId(n) = $node_id
            SET n.community_id = $community_id
        """, {"node_id": node_id, "community_id": community_id})

    return result
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
import openai
|
|
2
|
+
import re
|
|
3
|
+
from nelgraph.graph.neo4j_client import get_client
|
|
4
|
+
|
|
5
|
+
# Lazily created OpenAI-compatible client shared by this module; None until first use.
_client_ai = None


def _get_client_ai():
    """Return the module-wide OpenAI client, creating it on first use.

    The client is built from ``OPENROUTER_API_KEY`` and ``OPENROUTER_BASE_URL``
    as they are in ``nelgraph.config`` at the time of the first call, and is
    then cached in ``_client_ai``.
    """
    global _client_ai
    if _client_ai is not None:
        return _client_ai

    import nelgraph.config as config

    _client_ai = openai.OpenAI(
        api_key=config.OPENROUTER_API_KEY,
        base_url=config.OPENROUTER_BASE_URL,
    )
    return _client_ai
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_community_members(community_id: int) -> list[dict]:
    """Return up to 50 named nodes that belong to the given community.

    Args:
        community_id: Value of the ``community_id`` node property to match.

    Returns:
        One dict per node with the keys ``type`` (the node's first label, or
        ``"Unknown"`` when it has none), ``name`` and ``description``.
    """
    rows = get_client().run("""
        MATCH (n) WHERE n.community_id = $cid AND n.name IS NOT NULL
        RETURN labels(n) as labels, n.name as name, n.description as description
        LIMIT 50
    """, {"cid": community_id})

    return [
        {
            # A node may carry no label at all; avoid an IndexError in that case.
            "type": row["labels"][0] if row["labels"] else "Unknown",
            "name": row["name"],
            "description": row["description"],
        }
        for row in rows
    ]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def summarize_community(community_id: int) -> str:
    """Ask the LLM for a brief summary of one community.

    Args:
        community_id: Community whose members are summarized.

    Returns:
        The summary text with surrounding whitespace removed, or ``""`` when
        the community has no members or the model returns no text.
    """
    members = get_community_members(community_id)
    if not members:
        return ""

    # Only the first 30 members go into the prompt.
    members_text = "\n".join([f"- [{m['type']}] {m['name']}: {m['description'] or ''}" for m in members[:30]])

    prompt = f"""You are summarizing a cluster of related code elements for a developer knowledge graph.

Community members:
{members_text}

Write a 2-3 sentence summary of this community that answers:
1. What is the main purpose/theme of this group?
2. What are the key elements?
3. Any notable risks, tasks, or decisions?

Keep it under 200 words. Be specific, not generic."""

    import nelgraph.config as config
    response = _get_client_ai().chat.completions.create(
        model=config.LLM_MODEL,
        max_tokens=300,
        messages=[{"role": "user", "content": prompt}]
    )

    # The message content may be None; treat that as an empty summary
    # instead of failing on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def infer_community_name(community_id: int, summary: str) -> str:
    """Ask the LLM for a short (2-4 word) name for a community.

    Args:
        community_id: Community id; used for the fallback name.
        summary: Community summary the name is derived from.

    Returns:
        The name returned by the model, or ``"Community <community_id>"`` when
        the model returns no text.
    """
    import nelgraph.config as config
    response = _get_client_ai().chat.completions.create(
        model=config.LLM_MODEL,
        max_tokens=20,
        messages=[{"role": "user", "content": f"Give a 2-4 word name for this code community. Return ONLY the name:\n\n{summary}"}]
    )

    # The message content may be None; fall back to a generic name rather
    # than failing on .strip() or returning a blank name.
    content = response.choices[0].message.content
    return (content or "").strip() or f"Community {community_id}"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def summarize_all_communities():
    """Summarize every community and save the result to Neo4j and ChromaDB.

    For each distinct ``community_id`` found in Neo4j:

    * communities with fewer than 3 members get a template name and summary
      (no LLM call);
    * larger communities are named and summarized by the LLM; if that call
      fails or returns nothing, a generic name and summary are used and the
      failure is printed;
    * a ``Community`` node is merged in Neo4j and ``BELONGS_TO`` edges are
      created from its members.

    Finally all summaries are embedded in slices of 50 and upserted into the
    ``community_summaries`` ChromaDB collection, which is recreated on every
    run.
    """
    client = get_client()

    import chromadb
    import nelgraph.config as config
    from nelgraph.config import CHROMA_PATH
    from nelgraph.embeddings.embedder import embed_texts

    chroma = chromadb.PersistentClient(path=CHROMA_PATH)
    try:
        chroma.delete_collection("community_summaries")
    except Exception:
        # Best-effort cleanup; presumably fails when the collection does not exist yet.
        pass
    comm_collection = chroma.get_or_create_collection("community_summaries")

    # Get list of community IDs.
    result = client.run("MATCH (n) WHERE n.community_id IS NOT NULL RETURN DISTINCT n.community_id as cid ORDER BY cid")
    community_ids = [r["cid"] for r in result]

    print(f"[Community] Summarizing {len(community_ids)} communities...")

    llm_count = 0
    auto_count = 0

    batch_ids, batch_docs, batch_metas = [], [], []

    for cid in community_ids:
        members = get_community_members(cid)
        if not members:
            continue

        size = len(members)
        if size < 3:
            # Auto summarization for small communities.
            if size == 1:
                name = f"Node: {members[0]['name']}"
                summary = f"Isolated cluster containing element: {members[0]['name']} ({members[0]['type']})."
            else:
                name = f"Pair: {members[0]['name']} & {members[1]['name']}"
                summary = f"Small cluster containing elements: {members[0]['name']} ({members[0]['type']}) and {members[1]['name']} ({members[1]['type']})."
            auto_count += 1
        else:
            # LLM summarization for significant communities.
            members_text = "\n".join([f"- [{m['type']}] {m['name']}: {m['description'] or ''}" for m in members[:30]])
            prompt = f"""You are summarizing a cluster of related code elements for a developer knowledge graph.

Community members:
{members_text}

Task:
1. Write a 2-3 sentence summary of this community that describes its main purpose/theme, key elements, and any notable risks, tasks, or decisions.
2. Provide a short, 2-4 word name for this community.

Return your response in the following format:
NAME: <your 2-4 word name>
SUMMARY: <your 2-3 sentence summary>"""

            try:
                response = _get_client_ai().chat.completions.create(
                    model=config.LLM_MODEL,
                    max_tokens=350,
                    messages=[{"role": "user", "content": prompt}]
                )
                text = (response.choices[0].message.content or "").strip()
                if not text:
                    # An empty reply would otherwise be stored as an empty summary.
                    raise ValueError("LLM returned an empty reply")
                name, summary = _parse_community_reply(text, cid)
            except Exception as e:
                # Keep going with a generic description, but say why.
                print(f"  Community {cid}: LLM summarization failed ({e}); using fallback summary.")
                name = f"Community {cid}"
                summary = f"Cluster of related code elements including {members[0]['name']}."
            llm_count += 1

        # Create Community node in Neo4j.
        client.run("""
            MERGE (c:Community {id: $cid})
            SET c.name = $name, c.summary = $summary
        """, {"cid": cid, "name": name, "summary": summary})

        # Create BELONGS_TO edges.
        client.run("""
            MATCH (c:Community {id: $cid})
            MATCH (n) WHERE n.community_id = $cid AND NOT n:Community
            MERGE (n)-[:BELONGS_TO]->(c)
        """, {"cid": cid})

        # Queue for ChromaDB embedding.
        batch_ids.append(str(cid))
        batch_docs.append(summary)
        batch_metas.append({"id": cid, "name": name})

        if size < 3:
            print(f"  Community {cid} (Auto): '{name}'")
        else:
            print(f"  Community {cid} (LLM): '{name}'")

    # Embed and upsert in ChromaDB using batches.
    if batch_docs:
        print(f"[Chroma] Embedding {len(batch_docs)} community summaries...")
        try:
            all_vectors = []
            for start in range(0, len(batch_docs), 50):
                all_vectors.extend(embed_texts(batch_docs[start:start + 50]))

            comm_collection.upsert(
                ids=batch_ids,
                documents=batch_docs,
                metadatas=batch_metas,
                embeddings=all_vectors
            )
            print("[Chroma] All community summaries embedded.")
        except Exception as e:
            print(f"[Chroma] Error embedding community summaries: {e}")

    print(f"[Community] Summarization done. LLM calls: {llm_count}, Auto: {auto_count}.")


def _parse_community_reply(text: str, cid) -> tuple[str, str]:
    """Split an LLM reply in ``NAME: ... / SUMMARY: ...`` form into ``(name, summary)``.

    Falls back to ``"Community <cid>"`` when no usable name is found, and to
    the non-NAME lines (or the whole reply) when there is no SUMMARY marker.
    """
    name = f"Community {cid}"
    name_match = re.search(r"NAME:\s*(.*)", text, re.IGNORECASE)
    if name_match:
        # Drop quotes, asterisks and backticks left over from markdown formatting.
        candidate = name_match.group(1).strip().strip('"\'*` ')
        if candidate:
            name = candidate

    summary_match = re.search(r"SUMMARY:\s*([\s\S]*)", text, re.IGNORECASE)
    if summary_match:
        # "**SUMMARY:** text" leaves the closing "**" in front of the text.
        summary = summary_match.group(1).lstrip("* ").strip()
    else:
        # Fallback parser: everything except the NAME line.
        kept = [line.strip() for line in text.split("\n") if line.strip()]
        summary = " ".join(line for line in kept if not line.upper().startswith("NAME:"))

    return name, summary or text
|
nelgraph/config.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from dotenv import load_dotenv
|
|
3
|
+
|
|
4
|
+
# .env lookup: first the current working directory, then the directory that
# contains this module.
load_dotenv()
config_dir = os.path.dirname(os.path.abspath(__file__))
load_dotenv(os.path.join(config_dir, ".env"))

PROJECT_NAME = os.getenv("PROJECT_NAME", "GraphRAG-Project")

# Target project to index. An unset or empty CODEBASE_PATH means the current
# working directory; relative paths are resolved against it as well.
CODEBASE_PATH = os.getenv("CODEBASE_PATH") or "."
if not os.path.isabs(CODEBASE_PATH):
    CODEBASE_PATH = os.path.abspath(CODEBASE_PATH)
# Always use forward slashes, also on Windows.
CODEBASE_PATH = CODEBASE_PATH.replace("\\", "/")

# GraphRAG data directory — by default ".graphrag_data" inside the target codebase.
GRAPHRAG_DATA_DIR = (
    os.getenv("GRAPHRAG_DATA_DIR") or os.path.join(CODEBASE_PATH, ".graphrag_data")
).replace("\\", "/")
|
|
29
|
+
|
|
30
|
+
# File extension -> language name used for AST parsing.
SUPPORTED_LANGUAGES = {
    ".py": "python",
    ".js": "javascript",
    ".ts": "typescript",
    ".jsx": "javascript",
    ".tsx": "typescript",
    ".php": "php",
}

# Directory names that are skipped while parsing.
IGNORE_DIRS = {
    ".git",
    "node_modules",
    "__pycache__",
    ".venv",
    "venv",
    "dist",
    "build",
    ".next",
    ".cursor",
    ".claude",
    ".codex",
    ".gemini",
    ".ai-log",
}

# Also skip the directory this tool lives in, unless that directory is the
# codebase being indexed.
tool_dir_name = os.path.basename(config_dir)
if tool_dir_name and os.path.abspath(CODEBASE_PATH) != os.path.abspath(config_dir):
    IGNORE_DIRS.add(tool_dir_name)
|
|
45
|
+
|
|
46
|
+
# --- OpenRouter API (used for both LLM and embeddings) ---
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"

# --- Model IDs on OpenRouter ---
LLM_MODEL = os.getenv("LLM_MODEL", "deepseek/deepseek-v4-flash")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "openai/text-embedding-3-large")
EMBEDDING_DIMENSIONS = int(os.getenv("EMBEDDING_DIMENSIONS", "512"))
ENRICH_MIN_COMPLEXITY = int(os.getenv("ENRICH_MIN_COMPLEXITY", "2"))

# --- Neo4j ---
import sys

NEO4J_URI = os.getenv("NEO4J_URI", "bolt://127.0.0.1:7687")
# On Windows a "localhost" host is rewritten to 127.0.0.1.
if sys.platform == "win32" and "localhost" in NEO4J_URI:
    NEO4J_URI = NEO4J_URI.replace("localhost", "127.0.0.1")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "graphrag123")

# Neo4j data directories (for Docker volume mounts).
NEO4J_DATA_DIR = os.path.join(GRAPHRAG_DATA_DIR, "neo4j", "data").replace("\\", "/")
NEO4J_LOGS_DIR = os.path.join(GRAPHRAG_DATA_DIR, "neo4j", "logs").replace("\\", "/")

# --- ChromaDB: CHROMA_PATH from the environment, else "chromadb" in the data directory ---
CHROMA_PATH = (
    os.getenv("CHROMA_PATH") or os.path.join(GRAPHRAG_DATA_DIR, "chromadb")
).replace("\\", "/")

# --- Sync state file for incremental updates ---
SYNC_STATE_PATH = os.path.join(GRAPHRAG_DATA_DIR, "sync_state.json").replace("\\", "/")

# --- GitHub (optional) ---
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN", "")
GITHUB_REPO = os.getenv("GITHUB_REPO", "")  # format: "owner/repo"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# GraphRAG Core package
|