codecompass-mcp 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,179 @@
1
+ import anthropic
2
+ import json
3
+ import re
4
+ import subprocess
5
+ import sys
6
+ from dotenv import load_dotenv
7
+ from graph.neo4j_client import Neo4jClient
8
+
9
+ load_dotenv(override=True)
10
+ _client = anthropic.Anthropic()
11
+
12
+ # Chunk size: how many node names to show Haiku per pass.
13
+ # Larger = fewer API calls but risks missing cross-chunk duplicates.
14
+ CHUNK_SIZE = 80
15
+
16
+ RESOLVER_SYSTEM = """You are a knowledge graph entity resolver.
17
+
18
+ Given a list of entity names from a knowledge graph, identify groups of names that clearly refer to the SAME real-world entity or concept.
19
+
20
+ Examples of duplicates:
21
+ - "knowledge tracing" and "Knowledge Tracing System" (same concept, different verbosity)
22
+ - "BKT" and "Bayesian Knowledge Tracing" (acronym + full name)
23
+ - "student model" and "Student Model" (casing only)
24
+
25
+ Rules:
26
+ - Be CONSERVATIVE — only group names you are highly confident are the same entity
27
+ - The first name in each group becomes the canonical (kept) name — pick the clearest, most complete form
28
+ - Do not group names that are merely related (e.g. "Python" and "Python library" are NOT the same)
29
+
30
+ Return ONLY valid JSON, no other text:
31
+ {"groups": [["canonical_name", "duplicate1", "duplicate2"], ...]}
32
+
33
+ If no clear duplicates exist, return {"groups": []}"""
34
+
35
+
36
def resolve_entities(graph: Neo4jClient, dry_run: bool = False) -> int:
    """
    Identify and merge duplicate entity nodes in the graph.

    Strategy (Facade over Haiku + Neo4j):
      1. Fetch all node names via ``graph.get_all_node_names()``
      2. Chunk into groups of CHUNK_SIZE and ask Haiku to find duplicates
      3. Collect all duplicate groups across chunks
      4. Hand each group to ``graph.merge_nodes(canonical_id, canonical_name, dup_ids)``

    Each node is merged at most once per run: a duplicate that was already
    folded into a canonical node by an earlier group is skipped, and a group
    whose canonical node was itself merged away earlier is skipped entirely.

    Args:
        graph: Client exposing ``get_all_node_names()`` and ``merge_nodes()``.
        dry_run: When True, only print the planned merges; nothing is written.

    Returns the number of nodes merged (0 on dry_run).
    """
    all_nodes = graph.get_all_node_names()
    if len(all_nodes) < 2:
        print("[resolver] fewer than 2 nodes — nothing to resolve.")
        return 0

    print(f"[resolver] scanning {len(all_nodes)} nodes for duplicates...", flush=True)

    all_groups: list[list[str]] = []

    for i in range(0, len(all_nodes), CHUNK_SIZE):
        chunk = all_nodes[i : i + CHUNK_SIZE]
        chunk_no = i // CHUNK_SIZE + 1
        node_list = "\n".join(f"- {n['name']} ({n['type']})" for n in chunk)

        response = _client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=1000,
            system=RESOLVER_SYSTEM,
            messages=[{"role": "user", "content": f"Entity names to resolve:\n{node_list}"}],
        )

        try:
            raw = response.content[0].text.strip()
            # Strip optional Markdown code fences around the JSON payload.
            raw = re.sub(r"^```[a-z]*\n?", "", raw, flags=re.MULTILINE)
            raw = re.sub(r"```$", "", raw.strip(), flags=re.MULTILINE)
            # AttributeError covers a top-level JSON value that is not an
            # object (no .get) and a first content block without .text.
            groups = json.loads(raw).get("groups", [])
        except (json.JSONDecodeError, IndexError, AttributeError) as e:
            print(f" [resolver] parse error on chunk {chunk_no}: {e}")
            continue

        if not isinstance(groups, list):
            groups = []
        # Keep only well-formed groups: a list of at least two string names.
        # Non-string members could be unhashable and break the name lookup.
        groups = [
            g
            for g in groups
            if isinstance(g, list)
            and len(g) >= 2
            and all(isinstance(name, str) for name in g)
        ]
        if groups:
            print(f" chunk {chunk_no}: found {len(groups)} duplicate group(s)", flush=True)
            all_groups.extend(groups)

    if not all_groups:
        print("[resolver] no duplicates found.")
        return 0

    name_to_node = {n["name"]: n for n in all_nodes}
    merged_ids = set()  # ids already scheduled for merging (assumes hashable ids)
    merged_count = 0
    tag = "[dry-run] " if dry_run else ""

    for group in all_groups:
        canonical_name, *duplicates = group

        canonical_node = name_to_node.get(canonical_name)
        if not canonical_node:
            # Canonical name itself doesn't exist — skip
            continue
        canonical_id = canonical_node["id"]
        if canonical_id in merged_ids:
            # The canonical was merged into another node by an earlier group,
            # so it can no longer serve as a merge target.
            continue

        dup_ids = []
        for dup_name in duplicates:
            dup_node = name_to_node.get(dup_name)
            if not dup_node:
                continue
            dup_id = dup_node["id"]
            if dup_id == canonical_id or dup_id in merged_ids:
                continue
            print(f" {tag}'{dup_name}' → '{canonical_name}'")
            dup_ids.append(dup_id)
            merged_ids.add(dup_id)

        if dup_ids and not dry_run:
            graph.merge_nodes(canonical_id, canonical_name, dup_ids)
            merged_count += len(dup_ids)

    return merged_count
114
+
115
+
116
def resolve_dump(graph: Neo4jClient, out_file: str) -> None:
    """
    Phase 1 of native resolve: dump every node returned by
    ``graph.get_all_node_names()`` to ``out_file`` as indented JSON, then
    print the follow-up instructions for the manual Claude Code step.

    Usage:
        python main.py resolve --native --dump /tmp/nodes.json
        # → Claude Code reads /tmp/nodes.json, writes /tmp/groups.json
        python main.py resolve --native --apply /tmp/groups.json
    """
    nodes = graph.get_all_node_names()
    serialised = json.dumps(nodes, indent=2)
    with open(out_file, "w") as handle:
        handle.write(serialised)

    # One entry per printed line; empty strings are the blank separator lines.
    messages = (
        f"[resolver] {len(nodes)} nodes written to: {out_file}",
        "",
        "Next step — ask Claude Code:",
        f' "Read {out_file}, find duplicate entity names, write groups to /tmp/resolve_groups.json"',
        "",
        "Then apply:",
        " python main.py resolve --native --apply /tmp/resolve_groups.json",
    )
    for message in messages:
        print(message)
136
+
137
+
138
def resolve_apply(graph: Neo4jClient, groups_file: str, dry_run: bool = False) -> int:
    """
    Phase 2 of native resolve: load a groups JSON file produced by Claude Code
    and merge the duplicates.

    groups.json format (either form is accepted):
        [["canonical_name", "duplicate1", "duplicate2"], ...]
        {"groups": [["canonical_name", "duplicate1", "duplicate2"], ...]}

    Groups that are not lists of at least two strings are ignored. Each node
    is merged at most once: a duplicate already folded into a canonical node
    by an earlier group is skipped, and a group whose canonical node was
    itself merged away earlier is skipped with a message.

    Args:
        graph: Client exposing ``get_all_node_names()`` and ``merge_nodes()``.
        groups_file: Path of the JSON file to read.
        dry_run: When True, only print the planned merges; nothing is written.

    Returns:
        The number of nodes merged (0 on dry_run).
    """
    with open(groups_file, encoding="utf-8") as f:
        payload = json.load(f)

    # Accept the {"groups": [...]} wrapper as well as a bare list.
    if isinstance(payload, dict):
        payload = payload.get("groups", [])
    if not isinstance(payload, list):
        payload = []

    all_groups = [
        g
        for g in payload
        if isinstance(g, list)
        and len(g) >= 2
        and all(isinstance(name, str) for name in g)
    ]
    if not all_groups:
        print("[resolver] groups file is empty — nothing to merge.")
        return 0

    all_nodes = graph.get_all_node_names()
    name_to_node = {n["name"]: n for n in all_nodes}
    merged_ids = set()  # ids already scheduled for merging (assumes hashable ids)
    merged_count = 0
    tag = "[dry-run] " if dry_run else ""

    for group in all_groups:
        canonical_name, *duplicates = group
        canonical_node = name_to_node.get(canonical_name)
        if not canonical_node:
            print(f" [skip] canonical not found: {canonical_name!r}")
            continue
        canonical_id = canonical_node["id"]
        if canonical_id in merged_ids:
            # Merged into another node by an earlier group; not a valid target.
            print(f" [skip] canonical already merged: {canonical_name!r}")
            continue

        dup_ids = []
        for dup_name in duplicates:
            dup_node = name_to_node.get(dup_name)
            if not dup_node:
                continue
            dup_id = dup_node["id"]
            if dup_id == canonical_id or dup_id in merged_ids:
                continue
            print(f" {tag}'{dup_name}' → '{canonical_name}'")
            dup_ids.append(dup_id)
            merged_ids.add(dup_id)

        if dup_ids and not dry_run:
            graph.merge_nodes(canonical_id, canonical_name, dup_ids)
            merged_count += len(dup_ids)

    return merged_count
@@ -0,0 +1,165 @@
1
+ """Incremental file watcher — keeps the code graph fresh without full re-ingestion.
2
+
3
+ Uses the Observer pattern via watchdog. File system events fire callbacks
4
+ that trigger targeted re-ingestion of only the changed file. The watcher
5
+ is decoupled from the ingestion logic — it only knows how to detect changes
6
+ and hand off file paths.
7
+
8
+ Usage:
9
+ watcher = FileWatcher(project_root, project_name, client, file_id_map)
10
+ watcher.start()
11
+ # ... runs until KeyboardInterrupt or watcher.stop()
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import time
18
+ from pathlib import Path
19
+ from typing import Optional
20
+
21
+ from watchdog.events import FileSystemEventHandler, FileSystemEvent
22
+ from watchdog.observers import Observer
23
+
24
+ from graph.code_graph_client import CodeGraphClient
25
+ from ingestion.code_parser import parse_file, SUPPORTED_EXTENSIONS
26
+
27
+
28
def pid_file_path(project_name: str) -> str:
    """Build the PID-file location used by the watcher process of *project_name*.

    The file lives directly under ``/tmp`` and embeds the project name so
    that each project gets its own PID file.
    """
    template = "/tmp/codecompass_watcher_{}.pid"
    return template.format(project_name)
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Event handler
35
+ # ---------------------------------------------------------------------------
36
+
37
class _CodeFileEventHandler(FileSystemEventHandler):
    """Handles FS events for source files and triggers incremental re-ingestion.

    Only files whose lower-cased suffix is in SUPPORTED_EXTENSIONS are acted
    on; the same check is applied to every event type so that created,
    modified, deleted and moved files are filtered consistently.
    """

    def __init__(
        self,
        project_root: str,
        project_name: str,
        client: CodeGraphClient,
        file_id_map: dict[str, str],
    ) -> None:
        super().__init__()
        self._project_root = project_root
        self._project_name = project_name
        self._client = client
        self._file_id_map = file_id_map  # {rel_path: neo4j_file_node_id}

    @staticmethod
    def _is_supported(path: str) -> bool:
        """Return True when *path* has a supported (case-insensitive) suffix."""
        return Path(path).suffix.lower() in SUPPORTED_EXTENSIONS

    def on_modified(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        self._handle_change(event.src_path)

    def on_created(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        self._handle_change(event.src_path)

    def on_deleted(self, event: FileSystemEvent) -> None:
        # Ignore directories and files that were never candidates for ingestion.
        if event.is_directory or not self._is_supported(event.src_path):
            return
        rel_path = os.path.relpath(event.src_path, self._project_root)
        self._remove_file_from_graph(rel_path)

    def on_moved(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        # A move is handled as "delete the old path" + "ingest the new path";
        # either side is skipped when its extension is not supported.
        if self._is_supported(event.src_path):
            src_rel = os.path.relpath(event.src_path, self._project_root)
            self._remove_file_from_graph(src_rel)
        if self._is_supported(event.dest_path):
            self._handle_change(event.dest_path)

    # ------------------------------------------------------------------
    # Core delta logic
    # ------------------------------------------------------------------

    def _handle_change(self, abs_path: str) -> None:
        """Re-parse a changed file and replace its triples in Neo4j.

        The file is parsed before anything is deleted, so an exception raised
        by ``parse_file`` leaves the previously stored triples untouched.
        """
        if not self._is_supported(abs_path):
            return

        rel_path = os.path.relpath(abs_path, self._project_root)
        print(f"[file_watcher] changed: {rel_path}")

        new_triples = parse_file(abs_path, self._project_root)

        # Delete stale entity nodes (File node stays — file still exists)
        self._client.delete_file_triples(rel_path, self._project_name)

        if not new_triples:
            return

        # NOTE(review): paths missing from the map (e.g. files created after
        # the map was built) are written with an empty file node id — confirm
        # that write_code_triple tolerates this.
        file_node_id = self._file_id_map.get(rel_path, "")
        for triple in new_triples:
            self._client.write_code_triple(triple, file_node_id, self._project_name)

        print(f"[file_watcher] wrote {len(new_triples)} triples for {rel_path}")

    def _remove_file_from_graph(self, rel_path: str) -> None:
        """Ask the client to delete the graph data of a deleted or moved file."""
        print(f"[file_watcher] removed: {rel_path}")
        self._client.delete_file(rel_path, self._project_name)
109
+
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # Public watcher class
113
+ # ---------------------------------------------------------------------------
114
+
115
class FileWatcher:
    """Watches a project root and keeps its code graph incrementally updated.

    The watcher owns a watchdog ``Observer`` scheduled recursively on
    ``project_root`` and a PID file (see ``pid_file_path``) that exists while
    the watcher is running.
    """

    def __init__(
        self,
        project_root: str,
        project_name: str,
        client: CodeGraphClient,
        file_id_map: dict[str, str],
    ) -> None:
        self._project_root = project_root
        self._pid_file = pid_file_path(project_name)
        self._stopped = False  # makes stop() safe to call more than once
        self._handler = _CodeFileEventHandler(
            project_root=project_root,
            project_name=project_name,
            client=client,
            file_id_map=file_id_map,
        )
        self._observer = Observer()
        self._observer.schedule(self._handler, project_root, recursive=True)

    def start(self) -> None:
        """Start watching. Blocks until stop() is called or KeyboardInterrupt.

        Cleanup (observer shutdown and PID file removal) runs on every exit
        path, including the observer thread dying on its own or an unexpected
        exception, so no stale PID file is left behind.
        """
        self._observer.start()
        self._write_pid()
        print(f"[file_watcher] watching {self._project_root} — Ctrl-C to stop")
        try:
            while self._observer.is_alive():
                time.sleep(1)
        except KeyboardInterrupt:
            pass
        finally:
            self.stop()

    def stop(self) -> None:
        """Stop the observer and wait for it to finish. Idempotent."""
        if self._stopped:
            return
        self._stopped = True
        self._observer.stop()
        # Joining a thread that was never started raises RuntimeError, so only
        # join while the observer thread is actually running.
        if self._observer.is_alive():
            self._observer.join()
        self._remove_pid()
        print("[file_watcher] stopped")

    def _write_pid(self) -> None:
        """Best-effort: record this process's PID; failures are only reported."""
        try:
            with open(self._pid_file, "w") as f:
                f.write(str(os.getpid()))
        except OSError as exc:
            print(f"[file_watcher] could not write PID file {self._pid_file}: {exc}")

    def _remove_pid(self) -> None:
        """Best-effort removal of the PID file; a missing file is not an error."""
        try:
            os.unlink(self._pid_file)
        except OSError:
            pass
@@ -0,0 +1,17 @@
1
+ from tqdm import tqdm
2
+ from graph.neo4j_client import Neo4jClient
3
+ from models.types import Triple
4
+
5
+
6
def write_triples(client: Neo4jClient, triples: list[Triple]) -> int:
    """Persist triples through ``client.write_triple``, skipping repeats.

    Two triples count as the same when their
    (entity_from.id, relation.type, entity_to.id) keys match; only the first
    occurrence is written. Returns how many triples were written.
    """
    emitted: set[tuple] = set()
    for item in tqdm(triples, desc="Writing to Neo4j", unit="triple"):
        signature = (item.entity_from.id, item.relation.type, item.entity_to.id)
        if signature not in emitted:
            emitted.add(signature)
            client.write_triple(item)
    # Every recorded signature corresponds to exactly one write.
    return len(emitted)
@@ -0,0 +1,148 @@
1
+ """Hierarchy builder — walks a repo and writes the Project → Folder → File skeleton to Neo4j.
2
+
3
+ This runs before code_parser so every file has a node to attach entities to.
4
+ No API calls — purely local filesystem traversal.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import uuid
11
+ from pathlib import Path
12
+
13
+ from models.code_types import FileNode, FolderNode
14
+
15
+ # Directory names skipped during traversal
16
+ _SKIP_DIRS = {
17
+ ".git", "node_modules", "__pycache__", ".venv", "venv",
18
+ "dist", "build", ".mypy_cache", ".pytest_cache",
19
+ "coverage", "tmp", "cache", ".nx", "lcov-report",
20
+ }
21
+
22
+ # Supported source file extensions (mirrors code_parser.SUPPORTED_EXTENSIONS)
23
+ _SOURCE_EXTENSIONS = {".py", ".js", ".ts", ".tsx", ".html", ".css", ".scss"}
24
+
25
+
26
def build_hierarchy(project_root: str, project_name: str, client) -> dict[str, str]:
    """Walk project_root and write Project → Folder → File nodes to Neo4j.

    Directories named in ``_SKIP_DIRS`` are pruned from the walk and only
    files whose lower-cased suffix is in ``_SOURCE_EXTENSIONS`` get a File
    node. Every folder and file is linked to its parent with a contains edge.

    Args:
        project_root: Absolute path to the repo.
        project_name: Human-readable project identifier (e.g. "frontend").
        client: CodeGraphClient connected to the project's database.

    Returns:
        A mapping of {relative_file_path: neo4j_node_id} so the caller can
        attach entity nodes to the correct File nodes.
    """
    project_id = _stable_id(f"project:{project_name}")
    client.merge_project_node(project_id, project_name, project_root)

    path_to_id: dict[str, str] = {}

    for current_dir, subdirs, files in os.walk(project_root):
        # In-place pruning stops os.walk from descending into skipped dirs.
        subdirs[:] = [name for name in subdirs if name not in _SKIP_DIRS]

        rel_dir = os.path.relpath(current_dir, project_root)
        at_root = rel_dir == "."

        if not at_root:
            folder = _make_folder_node(rel_dir, project_root)
            folder_id = _stable_id(f"folder:{project_name}:{folder.path}")
            parent_id = _parent_id(folder, project_name, project_id)
            client.merge_folder_node(folder_id, folder, project_name)
            client.merge_contains_edge(parent_id, folder_id)

        source_files = (
            name for name in files
            if Path(name).suffix.lower() in _SOURCE_EXTENSIONS
        )
        for filename in source_files:
            rel_path = os.path.relpath(os.path.join(current_dir, filename), project_root)
            file = _make_file_node(rel_path, project_root)
            file_id = _stable_id(f"file:{project_name}:{file.path}")

            # Files directly under the root hang off the Project node.
            parent_node_id = (
                project_id
                if at_root
                else _stable_id(f"folder:{project_name}:{rel_dir}")
            )

            client.merge_file_node(file_id, file, project_name)
            client.merge_contains_edge(parent_node_id, file_id)
            path_to_id[rel_path] = file_id

    return path_to_id
75
+
76
+
77
def collect_file_nodes(project_root: str) -> list[FileNode]:
    """Gather a FileNode for each supported source file below project_root.

    Directories listed in ``_SKIP_DIRS`` are not descended into. Handy for
    dry-run inspection or for feeding file lists to other pipeline stages.
    """
    collected: list[FileNode] = []
    for current_dir, subdirs, files in os.walk(project_root):
        # Prune in place so os.walk never enters skipped directories.
        subdirs[:] = [name for name in subdirs if name not in _SKIP_DIRS]
        collected.extend(
            _make_file_node(
                os.path.relpath(os.path.join(current_dir, name), project_root),
                project_root,
            )
            for name in files
            if Path(name).suffix.lower() in _SOURCE_EXTENSIONS
        )
    return collected
92
+
93
+
94
def collect_folder_nodes(project_root: str) -> list[FolderNode]:
    """Gather a FolderNode for each non-skipped directory below project_root.

    The root directory itself is not included.
    """
    collected: list[FolderNode] = []
    for current_dir, subdirs, _files in os.walk(project_root):
        # Prune in place so os.walk never enters skipped directories.
        subdirs[:] = [name for name in subdirs if name not in _SKIP_DIRS]
        rel_dir = os.path.relpath(current_dir, project_root)
        if rel_dir == ".":
            continue
        collected.append(_make_folder_node(rel_dir, project_root))
    return collected
103
+
104
+
105
+ # ---------------------------------------------------------------------------
106
+ # Internal helpers
107
+ # ---------------------------------------------------------------------------
108
+
109
def _make_file_node(rel_path: str, project_root: str) -> FileNode:
    """Build the FileNode for *rel_path* (a path relative to the project root).

    ``depth`` is the number of path components in *rel_path*; ``project_root``
    is accepted for signature symmetry but not used here.
    """
    parsed = Path(rel_path)
    return FileNode(
        path=rel_path,
        name=parsed.name,
        extension=parsed.suffix.lower(),
        depth=len(parsed.parts),
    )
117
+
118
+
119
def _make_folder_node(rel_dir: str, project_root: str) -> FolderNode:
    """Build the FolderNode for *rel_dir* (a path relative to the project root).

    ``depth`` is the number of path components in *rel_dir*; ``project_root``
    is accepted for signature symmetry but not used here.
    """
    parsed = Path(rel_dir)
    return FolderNode(
        path=rel_dir,
        name=parsed.name,
        depth=len(parsed.parts),
    )
126
+
127
+
128
def _parent_id(folder: FolderNode, project_name: str, project_id: str) -> str:
    """Resolve the node ID of the container directly above *folder*.

    A top-level folder is parented by the project node; any other folder is
    parented by the folder node derived from its parent path.
    """
    parent = Path(folder.path).parent
    is_top_level = str(parent) == "."
    return project_id if is_top_level else _stable_id(f"folder:{project_name}:{parent}")
134
+
135
+
136
def get_file_id_map(project_name: str, client) -> dict[str, str]:
    """Rebuild the {relative_path: node_id} map from the existing graph.

    Reads the records returned by ``client.get_file_nodes(project_name)`` and
    indexes each record's ``id`` by its ``path``. Used by load-triples to
    attach entity nodes to File nodes written by an earlier build_hierarchy
    call.
    """
    mapping: dict[str, str] = {}
    for record in client.get_file_nodes(project_name):
        mapping[record["path"]] = record["id"]
    return mapping
144
+
145
+
146
def _stable_id(key: str) -> str:
    """Derive a deterministic ID for *key*.

    The ID is the string form of the UUIDv5 of *key* in the DNS namespace,
    so equal keys always map to equal IDs.
    """
    derived = uuid.uuid5(uuid.NAMESPACE_DNS, key)
    return str(derived)
@@ -0,0 +1,135 @@
1
+ import anthropic
2
+ import json
3
+ import asyncio
4
+ import uuid
5
+ import re
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from dotenv import load_dotenv
8
+ from tqdm.asyncio import tqdm
9
+ from tqdm import tqdm as tqdm_sync
10
+ from models.types import Entity, Relation, Triple
11
+
12
+ load_dotenv(override=True)
13
+ # Sync client — each call is run in a thread pool to avoid blocking the event loop
14
+ _client = anthropic.Anthropic()
15
+
16
+ # Cached system prompt block — same every call, so cache it after the first hit
17
+ _EXTRACTION_SYSTEM = [
18
+ {
19
+ "type": "text",
20
+ "text": """You are a knowledge graph extraction agent.
21
+ Given a text chunk, extract all meaningful entities and the relationships between them.
22
+
23
+ Rules:
24
+ - Entities: concrete nouns, concepts, people, places, events, systems, components
25
+ - Relations: verbs or relationship types connecting entities (CAUSES, DEPENDS_ON, HAS_COMPONENT, USED_BY, etc.)
26
+ - Weight: confidence in the relation from 0.0 to 1.0
27
+ - Be selective — only extract clear, meaningful relationships
28
+ - Use consistent entity names (no duplicates with different casing)
29
+
30
+ Return ONLY valid JSON, no other text:
31
+ {
32
+ "entities": [
33
+ {"name": "Entity Name", "type": "Concept|Person|Place|Event|System", "description": "brief description"}
34
+ ],
35
+ "relations": [
36
+ {"from": "Entity A", "to": "Entity B", "type": "RELATION_TYPE", "weight": 0.9, "description": "brief explanation"}
37
+ ]
38
+ }""",
39
+ "cache_control": {"type": "ephemeral"},
40
+ }
41
+ ]
42
+
43
+ # Expose plain text for any code that imports EXTRACTION_SYSTEM by name
44
+ EXTRACTION_SYSTEM = _EXTRACTION_SYSTEM[0]["text"]
45
+
46
+ # How many Haiku calls to run concurrently.
47
+ # Haiku rate limits are generous — 15 keeps throughput high without hitting 429s.
48
+ MAX_CONCURRENT = 15
49
+
50
+
51
def _extract_triples_sync(chunk: str) -> list[Triple]:
    """Blocking extraction — called from a thread pool.

    Sends *chunk* to the model, parses the JSON reply and converts it into
    Triple objects. Entity IDs are UUIDv5 values of the lower-cased,
    normalised entity name. Relations whose endpoints are not among the
    returned entities are dropped.

    Returns:
        The extracted triples, or an empty list when the reply cannot be
        parsed as a JSON object.
    """
    response = _client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=4096,
        system=_EXTRACTION_SYSTEM,
        messages=[{"role": "user", "content": f"Extract knowledge graph from:\n\n{chunk}"}],
    )

    # Bound before the try so the error message below can always reference it,
    # even when response.content is empty and indexing raises IndexError.
    raw = ""
    try:
        raw = response.content[0].text
        # Strip optional Markdown code fences around the JSON payload.
        raw = re.sub(r"^```[a-z]*\n?", "", raw.strip(), flags=re.MULTILINE)
        raw = re.sub(r"```$", "", raw.strip(), flags=re.MULTILINE)
        data = json.loads(raw.strip())
    except (json.JSONDecodeError, IndexError) as err:
        print(f"[reader_agent] JSON parse failed: {err} | raw: {raw[:200]!r}")
        return []

    if not isinstance(data, dict):
        print(f"[reader_agent] JSON parse failed: expected a JSON object | raw: {raw[:200]!r}")
        return []

    entity_map: dict[str, Entity] = {}
    for item in data.get("entities") or []:
        if not isinstance(item, dict):
            continue
        name = _normalise_entity_name(item.get("name", ""))
        if not name:
            continue
        eid = str(uuid.uuid5(uuid.NAMESPACE_DNS, name.lower()))
        entity_map[name] = Entity(
            id=eid,
            name=name,
            type=item.get("type", "Concept"),
            description=item.get("description"),
            source_chunk=chunk[:100],
        )

    triples: list[Triple] = []
    for rel in data.get("relations") or []:
        if not isinstance(rel, dict):
            continue
        # Endpoints are normalised exactly like entity names so that a
        # relation still resolves when the model's spelling differs only in
        # whitespace or trailing punctuation.
        from_entity = entity_map.get(_normalise_entity_name(rel.get("from", "")))
        to_entity = entity_map.get(_normalise_entity_name(rel.get("to", "")))
        if not from_entity or not to_entity:
            continue
        try:
            weight = float(rel.get("weight", 0.8))
        except (TypeError, ValueError):
            # A null or non-numeric weight falls back to the default instead
            # of discarding every triple extracted from this chunk.
            weight = 0.8
        relation = Relation(
            from_id=from_entity.id,
            to_id=to_entity.id,
            type=rel.get("type", "RELATED_TO"),
            weight=weight,
            description=rel.get("description"),
        )
        triples.append(Triple(from_entity, relation, to_entity))

    return triples


def _normalise_entity_name(value) -> str:
    """Normalise an entity name; returns "" for non-string or empty input.

    Strips surrounding whitespace, collapses internal whitespace runs to one
    space, strips leading/trailing ``.,;:()`` and any whitespace this exposes.
    """
    if not isinstance(value, str):
        return ""
    return re.sub(r"\s+", " ", value.strip()).strip(".,;:()").strip()
100
+
101
+
102
def extract_triples_parallel_sync(chunks: list[str], max_workers: int = MAX_CONCURRENT) -> list[Triple]:
    """
    Extract triples from every chunk concurrently on a thread pool.

    Intended for mid-session ingest_source, where the caller is already in a
    sync context. Results are gathered in completion order; a chunk whose
    extraction raises is reported and skipped.
    """
    collected: list[Triple] = []
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {
            pool.submit(_extract_triples_sync, text): index
            for index, text in enumerate(chunks)
        }
        progress = tqdm_sync(
            as_completed(pending),
            total=len(pending),
            desc="Extracting",
            unit="chunk",
        )
        for finished in progress:
            try:
                collected += finished.result()
            except Exception as exc:
                print(f"[reader_agent] chunk failed: {exc}")
    return collected
116
+
117
+
118
async def extract_triples(chunk: str) -> list[Triple]:
    """Awaitable form of ``_extract_triples_sync``.

    The blocking call is handed to ``asyncio.to_thread`` so the event loop
    stays free while the request is in flight.
    """
    pending_call = asyncio.to_thread(_extract_triples_sync, chunk)
    return await pending_call
121
+
122
+
123
async def ingest_chunks_parallel(
    chunks: list[str], max_concurrent: int = MAX_CONCURRENT
) -> list[Triple]:
    """Extract triples from all chunks, at most *max_concurrent* at a time.

    Returns one flat list holding the triples of every chunk, in chunk order.
    """
    gate = asyncio.Semaphore(max_concurrent)

    async def _guarded(text: str) -> list[Triple]:
        # The semaphore caps how many extractions are in flight at once.
        async with gate:
            return await extract_triples(text)

    batches = await tqdm.gather(
        *(_guarded(text) for text in chunks),
        desc="Extracting triples",
        unit="chunk",
    )

    flattened: list[Triple] = []
    for batch in batches:
        flattened.extend(batch)
    return flattened