codecompass-mcp 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codecompass_mcp-2.0.0.dist-info/METADATA +368 -0
- codecompass_mcp-2.0.0.dist-info/RECORD +28 -0
- codecompass_mcp-2.0.0.dist-info/WHEEL +5 -0
- codecompass_mcp-2.0.0.dist-info/entry_points.txt +3 -0
- codecompass_mcp-2.0.0.dist-info/licenses/LICENSE +21 -0
- codecompass_mcp-2.0.0.dist-info/top_level.txt +6 -0
- config.py +16 -0
- graph/__init__.py +0 -0
- graph/cli.py +13 -0
- graph/code_graph_client.py +485 -0
- graph/code_query_cli.py +504 -0
- graph/mcp_server.py +280 -0
- graph/setup.py +255 -0
- ingestion/__init__.py +0 -0
- ingestion/chunker.py +70 -0
- ingestion/code_normalizer.py +158 -0
- ingestion/code_parser.py +709 -0
- ingestion/entity_resolver.py +179 -0
- ingestion/file_watcher.py +165 -0
- ingestion/graph_writer.py +17 -0
- ingestion/hierarchy_builder.py +148 -0
- ingestion/reader_agent.py +135 -0
- main.py +306 -0
- models/__init__.py +0 -0
- models/code_types.py +35 -0
- models/types.py +45 -0
- utils/__init__.py +0 -0
- utils/formatting.py +24 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import anthropic
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
from dotenv import load_dotenv
|
|
7
|
+
from graph.neo4j_client import Neo4jClient
|
|
8
|
+
|
|
9
|
+
# Load variables from a .env file before the API client below is constructed.
load_dotenv(override=True)
# Module-level Anthropic client, created once at import time and reused by
# resolve_entities for every chunk request.
_client = anthropic.Anthropic()

# Chunk size: how many node names to show Haiku per pass.
# Larger = fewer API calls but risks missing cross-chunk duplicates.
CHUNK_SIZE = 80

# System prompt sent with every duplicate-detection request. It asks for a
# JSON object of the form {"groups": [[canonical, duplicate, ...], ...]},
# which is the shape resolve_entities reads via .get("groups").
RESOLVER_SYSTEM = """You are a knowledge graph entity resolver.

Given a list of entity names from a knowledge graph, identify groups of names that clearly refer to the SAME real-world entity or concept.

Examples of duplicates:
- "knowledge tracing" and "Knowledge Tracing System" (same concept, different verbosity)
- "BKT" and "Bayesian Knowledge Tracing" (acronym + full name)
- "student model" and "Student Model" (casing only)

Rules:
- Be CONSERVATIVE — only group names you are highly confident are the same entity
- The first name in each group becomes the canonical (kept) name — pick the clearest, most complete form
- Do not group names that are merely related (e.g. "Python" and "Python library" are NOT the same)

Return ONLY valid JSON, no other text:
{"groups": [["canonical_name", "duplicate1", "duplicate2"], ...]}

If no clear duplicates exist, return {"groups": []}"""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def resolve_entities(graph: Neo4jClient, dry_run: bool = False) -> int:
    """
    Identify and merge duplicate entity nodes in the graph.

    Strategy (Facade over Haiku + Neo4j):
    1. Fetch all node names
    2. Chunk into groups of CHUNK_SIZE and ask Haiku to find duplicates
    3. Collect all duplicate groups across chunks
    4. Merge each group: re-point relationships to canonical node, delete duplicates

    Args:
        graph: Client providing get_all_node_names() and merge_nodes().
        dry_run: When True, print the planned merges but do not modify the graph.

    Returns the number of nodes merged (0 on dry_run).
    """
    all_nodes = graph.get_all_node_names()
    if len(all_nodes) < 2:
        print("[resolver] fewer than 2 nodes — nothing to resolve.")
        return 0

    print(f"[resolver] scanning {len(all_nodes)} nodes for duplicates...", flush=True)

    all_groups: list[list[str]] = []

    for i in range(0, len(all_nodes), CHUNK_SIZE):
        chunk_no = i // CHUNK_SIZE + 1
        chunk = all_nodes[i : i + CHUNK_SIZE]
        node_list = "\n".join(f"- {n['name']} ({n['type']})" for n in chunk)

        response = _client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=1000,
            system=RESOLVER_SYSTEM,
            messages=[{"role": "user", "content": f"Entity names to resolve:\n{node_list}"}],
        )

        try:
            raw = response.content[0].text.strip()
            # Strip an optional Markdown code fence around the JSON payload.
            raw = re.sub(r"^```[a-z]*\n?", "", raw, flags=re.MULTILINE)
            raw = re.sub(r"```$", "", raw.strip(), flags=re.MULTILINE)
            groups = json.loads(raw).get("groups", [])
        except (json.JSONDecodeError, IndexError, AttributeError) as e:
            # AttributeError covers a reply that is valid JSON but not an
            # object (no .get) and a first content block without .text.
            print(f" [resolver] parse error on chunk {chunk_no}: {e}")
            continue

        if not isinstance(groups, list):
            # e.g. {"groups": null} — treat as "no duplicates in this chunk".
            groups = []

        # Filter out singleton or empty groups, and groups with non-string
        # members (they could never match a node name and may be unhashable).
        groups = [
            g
            for g in groups
            if isinstance(g, list) and len(g) >= 2 and all(isinstance(name, str) for name in g)
        ]
        if groups:
            print(f" chunk {chunk_no}: found {len(groups)} duplicate group(s)", flush=True)
            all_groups.extend(groups)

    if not all_groups:
        print("[resolver] no duplicates found.")
        return 0

    name_to_node = {n["name"]: n for n in all_nodes}
    # IDs already scheduled for removal. A node that has been merged away must
    # not be reused later as a canonical target or merged a second time.
    merged_ids: set = set()
    merged_count = 0
    tag = "[dry-run] " if dry_run else ""

    for group in all_groups:
        canonical_name, *duplicates = group

        canonical_node = name_to_node.get(canonical_name)
        if not canonical_node or canonical_node["id"] in merged_ids:
            # Canonical name doesn't exist (or was itself merged away) — skip
            continue

        dup_ids = []
        for dup_name in duplicates:
            dup_node = name_to_node.get(dup_name)
            if not dup_node:
                continue
            dup_id = dup_node["id"]
            if dup_id == canonical_node["id"] or dup_id in merged_ids or dup_id in dup_ids:
                continue
            print(f" {tag}'{dup_name}' → '{canonical_name}'")
            dup_ids.append(dup_id)

        if not dup_ids:
            continue

        merged_ids.update(dup_ids)
        if not dry_run:
            graph.merge_nodes(canonical_node["id"], canonical_name, dup_ids)
            merged_count += len(dup_ids)

    return merged_count
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def resolve_dump(graph: Neo4jClient, out_file: str) -> None:
    """
    Phase 1 of native resolve: export every node name to a JSON file.

    The file is meant to be read by Claude Code, which produces a groups file
    for the apply phase.

    Usage:
        python main.py resolve --native --dump /tmp/nodes.json
        # → Claude Code reads /tmp/nodes.json, writes /tmp/groups.json
        python main.py resolve --native --apply /tmp/groups.json

    Args:
        graph: Client providing get_all_node_names().
        out_file: Path of the JSON file to (over)write.
    """
    nodes = graph.get_all_node_names()
    with open(out_file, "w") as handle:
        json.dump(nodes, handle, indent=2)

    # Summary followed by copy-pasteable instructions for the next phase.
    guidance = (
        f"[resolver] {len(nodes)} nodes written to: {out_file}",
        "",
        "Next step — ask Claude Code:",
        f' "Read {out_file}, find duplicate entity names, write groups to /tmp/resolve_groups.json"',
        "",
        "Then apply:",
        " python main.py resolve --native --apply /tmp/resolve_groups.json",
    )
    for line in guidance:
        print(line)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def resolve_apply(graph: Neo4jClient, groups_file: str, dry_run: bool = False) -> int:
    """
    Phase 2 of native resolve: load a groups JSON file produced by Claude Code
    and merge the duplicates.

    groups.json format:
        [["canonical_name", "duplicate1", "duplicate2"], ...]

    The wrapped form {"groups": [[...], ...]} is accepted as well.

    Args:
        graph: Client providing get_all_node_names() and merge_nodes().
        groups_file: Path to the JSON groups file.
        dry_run: When True, print the planned merges but do not modify the graph.

    Returns:
        The number of nodes merged (0 on dry_run).
    """
    # JSON interchange files are UTF-8; do not depend on the locale encoding.
    with open(groups_file, encoding="utf-8") as f:
        loaded = json.load(f)

    if isinstance(loaded, dict):
        loaded = loaded.get("groups", [])
    if not isinstance(loaded, list):
        loaded = []

    # Keep only well-formed groups: a list of at least two string names.
    all_groups = [
        g
        for g in loaded
        if isinstance(g, list) and len(g) >= 2 and all(isinstance(name, str) for name in g)
    ]
    if not all_groups:
        print("[resolver] groups file is empty — nothing to merge.")
        return 0

    all_nodes = graph.get_all_node_names()
    name_to_node = {n["name"]: n for n in all_nodes}
    # IDs already scheduled for removal. A node that has been merged away must
    # not be reused later as a canonical target or merged a second time.
    merged_ids: set = set()
    merged_count = 0
    tag = "[dry-run] " if dry_run else ""

    for group in all_groups:
        canonical_name, *duplicates = group
        canonical_node = name_to_node.get(canonical_name)
        if not canonical_node:
            print(f" [skip] canonical not found: {canonical_name!r}")
            continue
        if canonical_node["id"] in merged_ids:
            print(f" [skip] canonical already merged away: {canonical_name!r}")
            continue

        dup_ids = []
        for dup_name in duplicates:
            dup_node = name_to_node.get(dup_name)
            if not dup_node:
                continue
            dup_id = dup_node["id"]
            if dup_id == canonical_node["id"] or dup_id in merged_ids or dup_id in dup_ids:
                continue
            print(f" {tag}'{dup_name}' → '{canonical_name}'")
            dup_ids.append(dup_id)

        if not dup_ids:
            continue

        merged_ids.update(dup_ids)
        if not dry_run:
            graph.merge_nodes(canonical_node["id"], canonical_name, dup_ids)
            merged_count += len(dup_ids)

    return merged_count
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Incremental file watcher — keeps the code graph fresh without full re-ingestion.
|
|
2
|
+
|
|
3
|
+
Uses the Observer pattern via watchdog. File system events fire callbacks
|
|
4
|
+
that trigger targeted re-ingestion of only the changed file. The watcher
|
|
5
|
+
is decoupled from the ingestion logic — it only knows how to detect changes
|
|
6
|
+
and hand off file paths.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
watcher = FileWatcher(project_root, project_name, client, file_id_map)
|
|
10
|
+
watcher.start()
|
|
11
|
+
# ... runs until KeyboardInterrupt or watcher.stop()
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
import time
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Optional
|
|
20
|
+
|
|
21
|
+
from watchdog.events import FileSystemEventHandler, FileSystemEvent
|
|
22
|
+
from watchdog.observers import Observer
|
|
23
|
+
|
|
24
|
+
from graph.code_graph_client import CodeGraphClient
|
|
25
|
+
from ingestion.code_parser import parse_file, SUPPORTED_EXTENSIONS
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def pid_file_path(project_name: str) -> str:
    """Build the PID-file location used by the watcher process of *project_name*."""
    return "/tmp/codecompass_watcher_{}.pid".format(project_name)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Event handler
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
class _CodeFileEventHandler(FileSystemEventHandler):
    """Handles FS events for source files and triggers incremental re-ingestion.

    Directory events are ignored. File events are translated into graph
    updates through the injected client.
    """

    def __init__(
        self,
        project_root: str,
        project_name: str,
        client: CodeGraphClient,
        file_id_map: dict[str, str],
    ) -> None:
        """Store the project context used when translating events.

        Args:
            project_root: Root directory; event paths are made relative to it.
            project_name: Project identifier passed to every client call.
            client: Graph client used to delete and write triples.
            file_id_map: Mapping {rel_path: neo4j_file_node_id}.
        """
        super().__init__()
        self._project_root = project_root
        self._project_name = project_name
        self._client = client
        self._file_id_map = file_id_map  # {rel_path: neo4j_file_node_id}

    @staticmethod
    def _is_supported(path: str) -> bool:
        """Return True when *path* has a supported extension (case-insensitive)."""
        return Path(path).suffix.lower() in SUPPORTED_EXTENSIONS

    def on_modified(self, event: FileSystemEvent) -> None:
        """Re-ingest a modified file."""
        if event.is_directory:
            return
        self._handle_change(event.src_path)

    def on_created(self, event: FileSystemEvent) -> None:
        """Ingest a newly created file."""
        if event.is_directory:
            return
        self._handle_change(event.src_path)

    def on_deleted(self, event: FileSystemEvent) -> None:
        """Remove a deleted file from the graph."""
        if event.is_directory:
            return
        rel_path = os.path.relpath(event.src_path, self._project_root)
        self._remove_file_from_graph(rel_path)

    def on_moved(self, event: FileSystemEvent) -> None:
        """Treat a move as removal of the source path plus a change at the destination."""
        if event.is_directory:
            return
        # Use the same lower-cased suffix test as _handle_change so that a
        # file ingested as e.g. "Foo.PY" is also removed when it is moved.
        if self._is_supported(event.src_path):
            self._remove_file_from_graph(os.path.relpath(event.src_path, self._project_root))
        if self._is_supported(event.dest_path):
            self._handle_change(event.dest_path)

    # ------------------------------------------------------------------
    # Core delta logic
    # ------------------------------------------------------------------

    def _handle_change(self, abs_path: str) -> None:
        """Re-parse a changed file and apply only the delta to Neo4j."""
        if not self._is_supported(abs_path):
            return

        rel_path = os.path.relpath(abs_path, self._project_root)
        print(f"[file_watcher] changed: {rel_path}")

        # Parse first: if parsing raises, the existing triples stay in place
        # instead of leaving the file with no entities at all.
        new_triples = parse_file(abs_path, self._project_root)

        # Delete stale entity nodes (File node stays — file still exists)
        self._client.delete_file_triples(rel_path, self._project_name)

        if not new_triples:
            return

        # NOTE(review): paths missing from the map (e.g. files created after
        # the map was built) are written with an empty file node id — confirm
        # that write_code_triple handles "" as intended.
        file_node_id = self._file_id_map.get(rel_path, "")
        for triple in new_triples:
            self._client.write_code_triple(triple, file_node_id, self._project_name)

        print(f"[file_watcher] wrote {len(new_triples)} triples for {rel_path}")

    def _remove_file_from_graph(self, rel_path: str) -> None:
        """Purge File node and all entity nodes for a deleted or moved file."""
        print(f"[file_watcher] removed: {rel_path}")
        self._client.delete_file(rel_path, self._project_name)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ---------------------------------------------------------------------------
|
|
112
|
+
# Public watcher class
|
|
113
|
+
# ---------------------------------------------------------------------------
|
|
114
|
+
|
|
115
|
+
class FileWatcher:
    """Watches a project root and keeps its code graph incrementally updated."""

    def __init__(
        self,
        project_root: str,
        project_name: str,
        client: CodeGraphClient,
        file_id_map: dict[str, str],
    ) -> None:
        """Create the event handler and schedule a recursive watch on *project_root*.

        Args:
            project_root: Directory to watch (recursively).
            project_name: Project identifier; also selects the PID file name.
            client: Graph client handed to the event handler.
            file_id_map: Mapping {rel_path: file_node_id} handed to the handler.
        """
        self._project_root = project_root
        self._pid_file = pid_file_path(project_name)
        self._stopped = False
        self._handler = _CodeFileEventHandler(
            project_root=project_root,
            project_name=project_name,
            client=client,
            file_id_map=file_id_map,
        )
        self._observer = Observer()
        self._observer.schedule(self._handler, project_root, recursive=True)

    def start(self) -> None:
        """Start watching. Blocks until stop() is called or KeyboardInterrupt."""
        self._stopped = False
        self._observer.start()
        self._write_pid()
        print(f"[file_watcher] watching {self._project_root} — Ctrl-C to stop")
        try:
            while self._observer.is_alive():
                time.sleep(1)
        except KeyboardInterrupt:
            pass
        finally:
            # Always clean up, including when the observer thread exited on
            # its own; otherwise a stale PID file is left behind.
            self.stop()

    def stop(self) -> None:
        """Stop the observer and wait for it to finish. Safe to call repeatedly."""
        if self._stopped:
            return
        self._stopped = True
        self._observer.stop()
        # Joining a thread that was never started raises RuntimeError.
        if self._observer.is_alive():
            self._observer.join()
        self._remove_pid()
        print("[file_watcher] stopped")

    def _write_pid(self) -> None:
        """Record the current process ID in the PID file (best effort)."""
        try:
            with open(self._pid_file, "w") as f:
                f.write(str(os.getpid()))
        except OSError as exc:
            # Best effort: watching still works without a PID file, but say so.
            print(f"[file_watcher] warning: could not write PID file {self._pid_file}: {exc}")

    def _remove_pid(self) -> None:
        """Delete the PID file; a missing or unremovable file is ignored."""
        try:
            os.unlink(self._pid_file)
        except OSError:
            pass
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from tqdm import tqdm
|
|
2
|
+
from graph.neo4j_client import Neo4jClient
|
|
3
|
+
from models.types import Triple
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def write_triples(client: Neo4jClient, triples: list[Triple]) -> int:
    """Persist triples through *client*, skipping repeats of (from, rel_type, to).

    Returns the number of triples actually written.
    """
    written_keys: set[tuple] = set()
    for item in tqdm(triples, desc="Writing to Neo4j", unit="triple"):
        signature = (item.entity_from.id, item.relation.type, item.entity_to.id)
        if signature not in written_keys:
            written_keys.add(signature)
            client.write_triple(item)
    # One key is recorded per write, so the set size is the write count.
    return len(written_keys)
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Hierarchy builder — walks a repo and writes the Project → Folder → File skeleton to Neo4j.
|
|
2
|
+
|
|
3
|
+
This runs before code_parser so every file has a node to attach entities to.
|
|
4
|
+
No API calls — purely local filesystem traversal.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
import uuid
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from models.code_types import FileNode, FolderNode
|
|
14
|
+
|
|
15
|
+
# Directory names skipped during traversal
# (compared against each directory's own name while walking, at any depth).
_SKIP_DIRS = {
    ".git", "node_modules", "__pycache__", ".venv", "venv",
    "dist", "build", ".mypy_cache", ".pytest_cache",
    "coverage", "tmp", "cache", ".nx", "lcov-report",
}

# Supported source file extensions (mirrors code_parser.SUPPORTED_EXTENSIONS)
# Compared against lower-cased file suffixes, so entries are lower-case and
# include the leading dot.
_SOURCE_EXTENSIONS = {".py", ".js", ".ts", ".tsx", ".html", ".css", ".scss"}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def build_hierarchy(project_root: str, project_name: str, client) -> dict[str, str]:
    """Write the Project → Folder → File skeleton for *project_root* to the graph.

    Directories listed in _SKIP_DIRS are pruned, and only files whose
    lower-cased suffix is in _SOURCE_EXTENSIONS get a File node.

    Args:
        project_root: Absolute path to the repo.
        project_name: Human-readable project identifier (e.g. "frontend").
        client: Graph client providing the merge_* methods called below.

    Returns:
        Mapping of {relative_file_path: node_id} for every File node written,
        so the caller can attach entity nodes to the correct File nodes.
    """
    root_node_id = _stable_id(f"project:{project_name}")
    client.merge_project_node(root_node_id, project_name, project_root)

    path_to_id: dict[str, str] = {}

    for current_dir, subdirs, files in os.walk(project_root):
        # Prune in place so os.walk never descends into skipped directories.
        subdirs[:] = [name for name in subdirs if name not in _SKIP_DIRS]

        rel_dir = os.path.relpath(current_dir, project_root)

        if rel_dir == ".":
            # Files directly under the root hang off the Project node.
            container_id = root_node_id
        else:
            folder = _make_folder_node(rel_dir, project_root)
            folder_id = _stable_id(f"folder:{project_name}:{folder.path}")
            owner_id = _parent_id(folder, project_name, root_node_id)
            client.merge_folder_node(folder_id, folder, project_name)
            client.merge_contains_edge(owner_id, folder_id)
            container_id = _stable_id(f"folder:{project_name}:{rel_dir}")

        source_files = (
            fname for fname in files if Path(fname).suffix.lower() in _SOURCE_EXTENSIONS
        )
        for fname in source_files:
            rel_path = os.path.relpath(os.path.join(current_dir, fname), project_root)
            file_node = _make_file_node(rel_path, project_root)
            file_id = _stable_id(f"file:{project_name}:{file_node.path}")

            client.merge_file_node(file_id, file_node, project_name)
            client.merge_contains_edge(container_id, file_id)
            path_to_id[rel_path] = file_id

    return path_to_id
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def collect_file_nodes(project_root: str) -> list[FileNode]:
    """List a FileNode for each supported source file below *project_root*.

    Nothing is written to the graph; handy for dry-run inspection or for
    passing file lists to other pipeline stages.
    """
    found: list[FileNode] = []
    for current_dir, subdirs, files in os.walk(project_root):
        # Prune skipped directories in place so they are never visited.
        subdirs[:] = [name for name in subdirs if name not in _SKIP_DIRS]
        found.extend(
            _make_file_node(
                os.path.relpath(os.path.join(current_dir, fname), project_root),
                project_root,
            )
            for fname in files
            if Path(fname).suffix.lower() in _SOURCE_EXTENSIONS
        )
    return found
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def collect_folder_nodes(project_root: str) -> list[FolderNode]:
    """List a FolderNode for each non-skipped directory below *project_root* (root excluded)."""
    folders: list[FolderNode] = []
    for current_dir, subdirs, _files in os.walk(project_root):
        # Prune skipped directories in place so they are never visited.
        subdirs[:] = [name for name in subdirs if name not in _SKIP_DIRS]
        rel_dir = os.path.relpath(current_dir, project_root)
        if rel_dir == ".":
            continue
        folders.append(_make_folder_node(rel_dir, project_root))
    return folders
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
106
|
+
# Internal helpers
|
|
107
|
+
# ---------------------------------------------------------------------------
|
|
108
|
+
|
|
109
|
+
def _make_file_node(rel_path: str, project_root: str) -> FileNode:
    """Describe the file at *rel_path* as a FileNode.

    depth is the number of path components in rel_path (the file name
    included); project_root is accepted but not used.
    """
    parsed = Path(rel_path)
    return FileNode(
        path=rel_path,
        name=parsed.name,
        extension=parsed.suffix.lower(),
        depth=len(parsed.parts),
    )
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _make_folder_node(rel_dir: str, project_root: str) -> FolderNode:
    """Describe the directory at *rel_dir* as a FolderNode.

    depth is the number of path components in rel_dir; project_root is
    accepted but not used.
    """
    parsed = Path(rel_dir)
    return FolderNode(
        path=rel_dir,
        name=parsed.name,
        depth=len(parsed.parts),
    )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _parent_id(folder: FolderNode, project_name: str, project_id: str) -> str:
    """Resolve the node ID of the container directly above *folder*.

    Top-level folders belong to the project node; any other folder belongs
    to the folder node derived from its parent path.
    """
    parent = Path(folder.path).parent
    if str(parent) != ".":
        return _stable_id(f"folder:{project_name}:{parent}")
    return project_id
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def get_file_id_map(project_name: str, client) -> dict[str, str]:
    """Rebuild the {relative_path: node_id} map from File nodes already in the graph.

    Used by load-triples to attach entity nodes to File nodes written during
    a previous build_hierarchy call. Each record returned by
    client.get_file_nodes() is read through its "path" and "id" keys.
    """
    mapping: dict[str, str] = {}
    for record in client.get_file_nodes(project_name):
        rel_path = record["path"]
        mapping[rel_path] = record["id"]
    return mapping
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _stable_id(key: str) -> str:
    """Derive a deterministic UUID string from *key* (equal keys give equal IDs)."""
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, key)
    return str(derived)
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
import anthropic
|
|
2
|
+
import json
|
|
3
|
+
import asyncio
|
|
4
|
+
import uuid
|
|
5
|
+
import re
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
7
|
+
from dotenv import load_dotenv
|
|
8
|
+
from tqdm.asyncio import tqdm
|
|
9
|
+
from tqdm import tqdm as tqdm_sync
|
|
10
|
+
from models.types import Entity, Relation, Triple
|
|
11
|
+
|
|
12
|
+
# Load variables from a .env file before the API client below is constructed.
load_dotenv(override=True)
# Sync client — each call is run in a thread pool to avoid blocking the event loop
_client = anthropic.Anthropic()

# Cached system prompt block — same every call, so cache it after the first hit
# The prompt asks for a JSON object with "entities" and "relations" lists,
# which are the two keys _extract_triples_sync reads from the reply.
_EXTRACTION_SYSTEM = [
    {
        "type": "text",
        "text": """You are a knowledge graph extraction agent.
Given a text chunk, extract all meaningful entities and the relationships between them.

Rules:
- Entities: concrete nouns, concepts, people, places, events, systems, components
- Relations: verbs or relationship types connecting entities (CAUSES, DEPENDS_ON, HAS_COMPONENT, USED_BY, etc.)
- Weight: confidence in the relation from 0.0 to 1.0
- Be selective — only extract clear, meaningful relationships
- Use consistent entity names (no duplicates with different casing)

Return ONLY valid JSON, no other text:
{
"entities": [
{"name": "Entity Name", "type": "Concept|Person|Place|Event|System", "description": "brief description"}
],
"relations": [
{"from": "Entity A", "to": "Entity B", "type": "RELATION_TYPE", "weight": 0.9, "description": "brief explanation"}
]
}""",
        "cache_control": {"type": "ephemeral"},
    }
]

# Expose plain text for any code that imports EXTRACTION_SYSTEM by name
EXTRACTION_SYSTEM = _EXTRACTION_SYSTEM[0]["text"]

# How many Haiku calls to run concurrently.
# Haiku rate limits are generous — 15 keeps throughput high without hitting 429s.
MAX_CONCURRENT = 15
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _normalise_entity_name(value) -> str:
    """Collapse whitespace and strip surrounding punctuation; '' for non-strings."""
    if not isinstance(value, str):
        return ""
    return re.sub(r"\s+", " ", value.strip()).strip(".,;:()")


def _extract_triples_sync(chunk: str) -> list[Triple]:
    """Blocking extraction — called from a thread pool.

    Sends *chunk* to the model, parses the JSON reply and converts it into
    Triple objects. Malformed replies and malformed individual items are
    skipped rather than raised.

    Args:
        chunk: Text to extract entities and relations from.

    Returns:
        One Triple per relation whose endpoints were both declared as
        entities; an empty list when the reply cannot be parsed.
    """
    response = _client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=4096,
        system=_EXTRACTION_SYSTEM,
        messages=[{"role": "user", "content": f"Extract knowledge graph from:\n\n{chunk}"}],
    )

    # Bound before the try so the error message below can always show it,
    # even when reading response.content[0] is what failed.
    raw = ""
    try:
        raw = response.content[0].text
        # Strip an optional Markdown code fence around the JSON payload.
        raw = re.sub(r"^```[a-z]*\n?", "", raw.strip(), flags=re.MULTILINE)
        raw = re.sub(r"```$", "", raw.strip(), flags=re.MULTILINE)
        data = json.loads(raw.strip())
    except (json.JSONDecodeError, IndexError, AttributeError) as e:
        print(f"[reader_agent] JSON parse failed: {e} | raw: {raw[:200]!r}")
        return []

    if not isinstance(data, dict):
        print(f"[reader_agent] JSON parse failed: expected a JSON object | raw: {raw[:200]!r}")
        return []

    entity_map: dict[str, Entity] = {}
    for e in data.get("entities") or []:
        if not isinstance(e, dict):
            continue
        # Normalise: strip whitespace, collapse internal spaces, strip trailing punctuation
        name = _normalise_entity_name(e.get("name", ""))
        if not name:
            continue
        # The ID is derived from the lower-cased name, so casing variants
        # of one name share an ID.
        eid = str(uuid.uuid5(uuid.NAMESPACE_DNS, name.lower()))
        entity_map[name] = Entity(
            id=eid,
            name=name,
            type=e.get("type", "Concept"),
            description=e.get("description"),
            source_chunk=chunk[:100],
        )

    def _lookup(ref):
        """Find the entity for a relation endpoint: exact name first, then normalised."""
        if not isinstance(ref, str):
            return None
        return entity_map.get(ref) or entity_map.get(_normalise_entity_name(ref))

    triples: list[Triple] = []
    for r in data.get("relations") or []:
        if not isinstance(r, dict):
            continue
        from_entity = _lookup(r.get("from", ""))
        to_entity = _lookup(r.get("to", ""))
        if not from_entity or not to_entity:
            continue
        try:
            weight = float(r.get("weight", 0.8))
        except (TypeError, ValueError):
            # null or non-numeric weight: fall back to the default confidence.
            weight = 0.8
        relation = Relation(
            from_id=from_entity.id,
            to_id=to_entity.id,
            type=r.get("type", "RELATED_TO"),
            weight=weight,
            description=r.get("description"),
        )
        triples.append(Triple(from_entity, relation, to_entity))

    return triples
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def extract_triples_parallel_sync(chunks: list[str], max_workers: int = MAX_CONCURRENT) -> list[Triple]:
    """
    Extract triples from every chunk concurrently on a thread pool.

    Used for mid-session ingest_source where we're already in a sync context.
    Results are gathered in completion order; a chunk whose extraction raises
    is reported and skipped.
    """
    collected: list[Triple] = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Keyed by future; the value records the chunk's position.
        pending = {}
        for position, text in enumerate(chunks):
            pending[pool.submit(_extract_triples_sync, text)] = position
        progress = tqdm_sync(as_completed(pending), total=len(pending), desc="Extracting", unit="chunk")
        for done in progress:
            try:
                batch = done.result()
            except Exception as e:
                print(f"[reader_agent] chunk failed: {e}")
            else:
                collected.extend(batch)
    return collected
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
async def extract_triples(chunk: str) -> list[Triple]:
    """Async wrapper: run the blocking extraction for *chunk* in a worker thread."""
    triples = await asyncio.to_thread(_extract_triples_sync, chunk)
    return triples
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
async def ingest_chunks_parallel(
    chunks: list[str], max_concurrent: int = MAX_CONCURRENT
) -> list[Triple]:
    """Run extraction on all chunks with bounded concurrency.

    A chunk whose extraction raises is reported and contributes no triples,
    matching extract_triples_parallel_sync, so one failed API call does not
    discard the results of every other chunk.

    Args:
        chunks: Text chunks to extract from.
        max_concurrent: Upper bound on simultaneous extractions; values
            below 1 are treated as 1.

    Returns:
        All extracted triples, flattened in chunk order.
    """
    # A semaphore created with 0 would block every task forever.
    semaphore = asyncio.Semaphore(max(1, max_concurrent))

    async def bounded_extract(chunk: str) -> list[Triple]:
        async with semaphore:
            try:
                return await extract_triples(chunk)
            except Exception as e:
                print(f"[reader_agent] chunk failed: {e}")
                return []

    tasks = [bounded_extract(c) for c in chunks]
    results = await tqdm.gather(*tasks, desc="Extracting triples", unit="chunk")
    return [triple for batch in results for triple in batch]
|