ontosight-codegraph 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,589 @@
1
+ """Read CodeGraph SQLite index and build call-graph subgraphs for OntoSight."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import sqlite3
7
+ from collections import deque
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Callable, Iterable, List, Optional, Sequence, Set, Tuple
11
+
12
+ from pydantic import BaseModel, Field
13
+
14
# Node kinds that are never offered as seeds or graph members on their own.
EXCLUDED_NODE_KINDS = frozenset({"parameter", "import", "export"})

# Value of ``edges.kind`` that marks a call relationship.
CALL_EDGE_KIND = "calls"

# Appended to the "index not found" error so users know how to build the index.
INIT_HINT = "Run: npx @colbymchenry/codegraph init -i"

# Shared column list for every query that is materialised into CodeSymbolNode.
# Callers append their own WHERE / ORDER BY / LIMIT clauses.
NODE_SELECT = """
SELECT id, kind, name, qualified_name, file_path, language,
start_line, end_line, signature, docstring
FROM nodes
"""
23
+
24
+
25
class CodeGraphNotFoundError(FileNotFoundError):
    """Signal that the project has no `.codegraph/codegraph.db` index file.

    Subclasses ``FileNotFoundError`` so generic missing-file handlers
    still catch it.
    """
27
+
28
+
29
class CodeSymbolNode(BaseModel):
    """One symbol row of the CodeGraph ``nodes`` table.

    Field names mirror the columns selected by ``NODE_SELECT``.
    """

    # Identity and classification.
    id: str
    kind: str
    name: str
    qualified_name: str

    # Location of the symbol in the indexed project.
    file_path: str
    language: str
    start_line: int
    end_line: int = 0  # 0 when the index has no end line for the symbol

    # Optional descriptive text stored with the symbol.
    signature: Optional[str] = None
    docstring: Optional[str] = None
42
+
43
+
44
class CodeCallEdge(BaseModel):
    """A directed "calls" relationship from one symbol to another."""

    id: str  # synthetic identifier built by the store from source, target and rowid
    source_id: str  # id of the calling symbol
    target_id: str  # id of the called symbol
    line: Optional[int] = None  # ``edges.line`` value; may be absent
51
+
52
+
53
@dataclass(frozen=True)
class SubgraphResult:
    """Immutable bundle returned by ``load_call_subgraph``.

    Carries the extracted nodes and call edges together with the metadata
    a CLI or viewer needs to describe how the subgraph was selected.
    """

    # Graph payload.
    nodes: List[CodeSymbolNode]
    edges: List[CodeCallEdge]

    # True when expansion stopped early because the node budget was reached.
    truncated: bool

    # Human-readable description of the filters / seeding mode used.
    filter_summary: str

    # Sorted distinct languages of the returned nodes.
    languages: List[str]

    # Ids of the symbols the expansion started from.
    seed_ids: List[str]
63
+
64
+
65
def resolve_codegraph_db(project_path: Path) -> Path:
    """Locate the CodeGraph SQLite index belonging to *project_path*.

    Args:
        project_path: Project root; it is resolved to an absolute path first.

    Returns:
        ``<resolved root>/.codegraph/codegraph.db``.

    Raises:
        CodeGraphNotFoundError: If that path is not an existing regular file.
            The message includes the command that creates the index.
    """
    candidate = project_path.resolve() / ".codegraph" / "codegraph.db"
    if candidate.is_file():
        return candidate
    raise CodeGraphNotFoundError(
        f"CodeGraph index not found at {candidate}. {INIT_HINT}"
    )
74
+
75
+
76
def _row_to_node(row: sqlite3.Row) -> CodeSymbolNode:
    """Convert a ``nodes`` row (columns as in ``NODE_SELECT``) into a model."""
    copied_columns = (
        "id",
        "kind",
        "name",
        "qualified_name",
        "file_path",
        "language",
        "start_line",
        "signature",
        "docstring",
    )
    values = {column: row[column] for column in copied_columns}
    # A NULL (or zero) end_line falls back to 0, the model's default.
    values["end_line"] = row["end_line"] or 0
    return CodeSymbolNode(**values)
89
+
90
+
91
def _normalize_path_prefix(path_filter: Optional[str]) -> Optional[str]:
    """Turn a user-supplied path filter into a slash-separated prefix.

    Backslashes become forward slashes and leading/trailing slashes are
    removed. Returns ``None`` when the filter is missing or nothing is
    left after cleaning.
    """
    if path_filter:
        prefix = path_filter.replace("\\", "/").strip("/")
        if prefix:
            return prefix
    return None
96
+
97
+
98
def _tokenize_task(task: str) -> List[str]:
    """Split a free-text task description into distinct search tokens.

    The text is lower-cased and scanned for identifier-like words (a letter
    or underscore followed by letters, digits or underscores). Words shorter
    than two characters are dropped. Each token is returned once, in order
    of first appearance, so a repeated word does not trigger a repeated
    query or extra weight in the caller's scoring.

    Args:
        task: Free-text description, e.g. ``"fix parser bug"``.

    Returns:
        Ordered list of unique lower-case tokens; empty if none qualify.
    """
    # The previous pattern had a second alternative ``[A-Za-z]{3,}`` that could
    # never match: any position where it applies is already consumed by the
    # identifier alternative, which is tried first.
    words = re.findall(r"[A-Za-z_][A-Za-z0-9_]*", task.lower())
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return [word for word in dict.fromkeys(words) if len(word) >= 2]
101
+
102
+
103
class CodeGraphStore:
    """Query helper around a CodeGraph SQLite database.

    Every statement issued by this class is a ``SELECT``. The connection is
    opened in ``__init__`` and released by ``close()``; the class is also a
    context manager that closes the connection on exit.

    User-supplied text used in ``LIKE`` patterns is escaped so ``%``, ``_``
    and ``\\`` match literally, and id lists are bound in bounded chunks so
    queries stay under SQLite's bound-parameter limit.
    """

    # Upper bound on ids bound into a single ``IN (...)`` list. Kept well
    # below 999, the default parameter limit of older SQLite builds.
    _MAX_SQL_VARIABLES = 500

    # SQL fragment appended after each ``LIKE ?``; the escape character is a
    # single backslash (the Python literal below contains one backslash).
    _LIKE_ESCAPE_SQL = " ESCAPE '\\'"

    def __init__(self, db_path: Path):
        """Open a connection to *db_path* with name-addressable rows."""
        self.db_path = db_path
        self._conn = sqlite3.connect(str(db_path))
        self._conn.row_factory = sqlite3.Row

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()

    def __enter__(self) -> "CodeGraphStore":
        return self

    def __exit__(self, *args: object) -> None:
        self.close()

    @staticmethod
    def _escape_like(text: str) -> str:
        """Escape LIKE metacharacters so *text* matches literally.

        The backslash is escaped first so the escapes added for ``%`` and
        ``_`` are not themselves doubled.
        """
        return text.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

    @classmethod
    def _chunks(cls, values: Iterable[str]) -> Iterable[List[str]]:
        """Yield *values* in lists of at most ``_MAX_SQL_VARIABLES`` items."""
        ordered = list(values)
        for start in range(0, len(ordered), cls._MAX_SQL_VARIABLES):
            yield ordered[start:start + cls._MAX_SQL_VARIABLES]

    def _path_clause(self, path_filter: Optional[str]) -> Tuple[str, List[str]]:
        """Return the SQL fragment and parameters for a file-path prefix.

        Returns ``("", [])`` when no usable filter is given.
        """
        prefix = _normalize_path_prefix(path_filter)
        if not prefix:
            return "", []
        # Escape the prefix: otherwise "_" in e.g. "my_pkg" matches any char.
        pattern = f"{self._escape_like(prefix)}%"
        return f" AND file_path LIKE ?{self._LIKE_ESCAPE_SQL}", [pattern]

    def _eligible_node_clause(self) -> str:
        """Return an ``AND kind NOT IN (...)`` fragment for excluded kinds.

        The caller must bind ``list(EXCLUDED_NODE_KINDS)`` for the
        placeholders.
        """
        placeholders = ",".join("?" * len(EXCLUDED_NODE_KINDS))
        return f" AND kind NOT IN ({placeholders})"

    def fetch_nodes_by_ids(self, node_ids: Set[str]) -> List[CodeSymbolNode]:
        """Load the symbols whose id is in *node_ids* (unknown ids are skipped)."""
        if not node_ids:
            return []
        nodes: List[CodeSymbolNode] = []
        for chunk in self._chunks(node_ids):
            placeholders = ",".join("?" * len(chunk))
            query = f"{NODE_SELECT} WHERE id IN ({placeholders})"
            rows = self._conn.execute(query, chunk).fetchall()
            nodes.extend(_row_to_node(row) for row in rows)
        return nodes

    def fetch_call_edges_for_nodes(self, node_ids: Set[str]) -> List[CodeCallEdge]:
        """Load call edges whose source AND target are both in *node_ids*."""
        if not node_ids:
            return []
        wanted = set(node_ids)
        edges: List[CodeCallEdge] = []
        for chunk in self._chunks(wanted):
            placeholders = ",".join("?" * len(chunk))
            query = f"""
                SELECT rowid AS edge_rowid, source, target, line
                FROM edges
                WHERE kind = ?
                  AND source IN ({placeholders})
            """
            params: List[object] = [CALL_EDGE_KIND, *chunk]
            for row in self._conn.execute(query, params).fetchall():
                # The target side is checked here instead of in SQL so each
                # statement binds only one bounded id list.
                if row["target"] not in wanted:
                    continue
                edges.append(
                    CodeCallEdge(
                        id=f"call:{row['source']}:{row['target']}:{row['edge_rowid']}",
                        source_id=row["source"],
                        target_id=row["target"],
                        line=row["line"],
                    )
                )
        return edges

    def _load_adjacency(
        self, path_filter: Optional[str]
    ) -> Tuple[dict[str, Set[str]], dict[str, int]]:
        """Build undirected adjacency and in-degree for call edges within path filter.

        Returns:
            ``(adjacency, in_degree)``, both keyed by every eligible node id
            (kind not excluded, path matching). ``adjacency`` ignores edge
            direction; ``in_degree`` counts incoming call edges. Both are
            empty when no node is eligible.
        """
        path_sql, path_params = self._path_clause(path_filter)
        kind_params = list(EXCLUDED_NODE_KINDS)
        node_query = f"""
            SELECT id FROM nodes
            WHERE 1=1{self._eligible_node_clause()}{path_sql}
        """
        allowed = {
            row["id"]
            for row in self._conn.execute(node_query, kind_params + path_params).fetchall()
        }
        if not allowed:
            return {}, {}

        adjacency: dict[str, Set[str]] = {nid: set() for nid in allowed}
        in_degree: dict[str, int] = {nid: 0 for nid in allowed}

        # Filter endpoints in Python: binding every eligible id twice into
        # IN (...) lists exceeds SQLite's parameter limit on larger projects.
        edge_query = "SELECT source, target FROM edges WHERE kind = ?"
        for row in self._conn.execute(edge_query, (CALL_EDGE_KIND,)).fetchall():
            src, tgt = row["source"], row["target"]
            if src not in allowed or tgt not in allowed:
                continue
            adjacency[src].add(tgt)
            adjacency[tgt].add(src)
            in_degree[tgt] += 1

        return adjacency, in_degree

    def find_symbol_seeds(self, symbol: str, path_filter: Optional[str]) -> List[str]:
        """Return up to 10 node ids matching *symbol*.

        An exact match on ``name`` or ``qualified_name`` is tried first; only
        when it finds nothing is a literal substring match used.
        """
        path_sql, path_params = self._path_clause(path_filter)
        kind_params = list(EXCLUDED_NODE_KINDS)
        eligible = self._eligible_node_clause()

        exact_query = f"""
            SELECT id FROM nodes
            WHERE (name = ? OR qualified_name = ?)
            {eligible}{path_sql}
            ORDER BY start_line
            LIMIT 10
        """
        params: List[object] = [symbol, symbol, *kind_params, *path_params]
        rows = self._conn.execute(exact_query, params).fetchall()
        if rows:
            return [row["id"] for row in rows]

        esc = self._LIKE_ESCAPE_SQL
        like_query = f"""
            SELECT id FROM nodes
            WHERE (name LIKE ?{esc} OR qualified_name LIKE ?{esc})
            {eligible}{path_sql}
            ORDER BY start_line
            LIMIT 10
        """
        like = f"%{self._escape_like(symbol)}%"
        params = [like, like, *kind_params, *path_params]
        rows = self._conn.execute(like_query, params).fetchall()
        return [row["id"] for row in rows]

    def find_task_seeds(self, task: str, path_filter: Optional[str], limit: int = 5) -> List[str]:
        """Rank symbols by how well they match the words of *task*.

        Each token is searched (case-insensitively) in name, qualified name
        and docstring, at most 50 rows per token. Every returned row adds the
        number of task tokens found in its text to that node's score. The
        *limit* best ids are returned, ties broken by id. When the task
        yields no tokens the whole string is treated as a symbol name.
        """
        tokens = _tokenize_task(task)
        if not tokens:
            return self.find_symbol_seeds(task, path_filter)

        scores: dict[str, int] = {}
        path_sql, path_params = self._path_clause(path_filter)
        kind_params = list(EXCLUDED_NODE_KINDS)
        esc = self._LIKE_ESCAPE_SQL
        query = f"""
            SELECT id, name, qualified_name, docstring FROM nodes
            WHERE (LOWER(name) LIKE ?{esc} OR LOWER(qualified_name) LIKE ?{esc}
                   OR LOWER(COALESCE(docstring, '')) LIKE ?{esc})
            {self._eligible_node_clause()}{path_sql}
            LIMIT 50
        """

        for token in tokens:
            pattern = f"%{self._escape_like(token)}%"
            params: List[object] = [pattern, pattern, pattern, *kind_params, *path_params]
            for row in self._conn.execute(query, params).fetchall():
                nid = row["id"]
                text = " ".join(
                    filter(
                        None,
                        [row["name"], row["qualified_name"], row["docstring"] or ""],
                    )
                ).lower()
                score = sum(1 for t in tokens if t in text)
                scores[nid] = scores.get(nid, 0) + score

        ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
        return [nid for nid, _ in ranked[:limit]]

    def pick_auto_seeds(self, path_filter: Optional[str], limit: int = 3) -> List[str]:
        """Pick the most-called eligible symbols as default seeds.

        Returns up to *limit* ids with the highest positive in-degree; if
        none of the top entries has incoming calls, the single first-ranked
        node is returned; with no eligible nodes the result is empty.
        """
        _, in_degree = self._load_adjacency(path_filter)
        if not in_degree:
            return []
        ranked = sorted(in_degree.items(), key=lambda item: (-item[1], item[0]))
        return [nid for nid, deg in ranked[:limit] if deg > 0] or [ranked[0][0]]

    def search_symbols(
        self, query: str, path_filter: Optional[str], limit: int = 10
    ) -> List[CodeSymbolNode]:
        """Search symbols by literal substring of name, qualified name,
        signature or docstring.

        Results are ordered exact-name matches first, then name-prefix
        matches, then the rest; ties are ordered by ``start_line``.
        """
        path_sql, path_params = self._path_clause(path_filter)
        kind_params = list(EXCLUDED_NODE_KINDS)
        exact = query.strip()
        escaped = self._escape_like(exact)
        pattern = f"%{escaped}%"
        esc = self._LIKE_ESCAPE_SQL
        sql = f"""
            {NODE_SELECT}
            WHERE (name LIKE ?{esc} OR qualified_name LIKE ?{esc}
                   OR COALESCE(signature, '') LIKE ?{esc}
                   OR COALESCE(docstring, '') LIKE ?{esc})
            {self._eligible_node_clause()}{path_sql}
            ORDER BY
                CASE WHEN name = ? THEN 0
                     WHEN name LIKE ?{esc} THEN 1
                     ELSE 2 END,
                start_line
            LIMIT ?
        """
        # Parameter order mirrors placeholder order in the statement above.
        params: List[object] = [
            pattern,
            pattern,
            pattern,
            pattern,
            *kind_params,
            *path_params,
            exact,
            f"{escaped}%",
            limit,
        ]
        rows = self._conn.execute(sql, params).fetchall()
        return [_row_to_node(r) for r in rows]

    def expand_subgraph(
        self,
        seed_ids: Sequence[str],
        *,
        path_filter: Optional[str],
        hops: int,
        max_nodes: int,
    ) -> Tuple[Set[str], bool]:
        """Breadth-first expansion from the seeds over undirected call edges.

        Args:
            seed_ids: Starting node ids.
            path_filter: Restricts which nodes and edges are traversed.
            hops: Maximum distance from a seed.
            max_nodes: Node budget; expansion stops when another node would
                exceed it.

        Returns:
            ``(node_ids, truncated)``. ``truncated`` is True only when the
            budget stopped the walk. If no seed is part of the filtered
            graph, the seeds themselves are returned untruncated.
        """
        adjacency, _ = self._load_adjacency(path_filter)
        if not adjacency:
            return set(seed_ids), False

        visited: Set[str] = set()
        queue: deque[Tuple[str, int]] = deque()

        for seed in seed_ids:
            if seed in adjacency:
                visited.add(seed)
                queue.append((seed, 0))

        if not visited:
            # Seeds may be outside path filter; include them alone.
            return set(seed_ids), False

        while queue:
            node_id, depth = queue.popleft()
            if depth >= hops:
                continue
            for neighbor in adjacency.get(node_id, ()):
                if neighbor in visited:
                    continue
                if len(visited) >= max_nodes:
                    # An unvisited neighbour exists but the budget is spent.
                    return visited, True
                visited.add(neighbor)
                queue.append((neighbor, depth + 1))

        return visited, False
341
+
342
+
343
def _build_filter_summary(
    *,
    path_filter: Optional[str],
    symbol: Optional[str],
    task: Optional[str],
    seed_ids: Sequence[str],
) -> str:
    """Describe, in one comma-separated line, how a subgraph was selected.

    The optional ``path=...`` part comes first, followed by exactly one
    selector chosen in priority order: symbol, task, explicit seed count,
    or ``auto-seed`` when none of those is present.
    """
    if symbol:
        selector = f"symbol={symbol}"
    elif task:
        selector = f"task={task!r}"
    elif seed_ids:
        selector = f"seeds={len(seed_ids)}"
    else:
        selector = "auto-seed"

    pieces: List[str] = [f"path={path_filter}"] if path_filter else []
    pieces.append(selector)
    return ", ".join(pieces)
362
+
363
+
364
def load_call_subgraph(
    project_path: Path,
    *,
    path_filter: Optional[str] = None,
    symbol: Optional[str] = None,
    task: Optional[str] = None,
    hops: int = 2,
    max_nodes: int = 200,
) -> SubgraphResult:
    """Load a bounded call-graph subgraph from CodeGraph.

    Seeds come from *symbol* if given, else from *task*, else from the
    store's automatic choice. The graph is then expanded *hops* steps from
    the seeds, limited by *max_nodes*.

    Args:
        project_path: Project root containing ``.codegraph/codegraph.db``.
        path_filter: Optional file-path prefix restricting the graph.
        symbol: Symbol name to start from (mutually exclusive with *task*).
        task: Free-text description used to find seed symbols.
        hops: Expansion depth; must be >= 0.
        max_nodes: Node budget for the expansion; must be >= 1.

    Returns:
        The extracted nodes and edges plus selection metadata. The result is
        empty when automatic seeding finds no eligible symbol.

    Raises:
        ValueError: On invalid *hops* / *max_nodes*, when both *symbol* and
            *task* are given, or when the requested symbol/task matches
            nothing.
        CodeGraphNotFoundError: When the index database is missing.
    """
    if hops < 0:
        raise ValueError("hops must be >= 0")
    if max_nodes < 1:
        raise ValueError("max_nodes must be >= 1")
    if symbol and task:
        raise ValueError("Specify only one of symbol or task")

    db_path = resolve_codegraph_db(project_path)

    with CodeGraphStore(db_path) as store:
        if symbol:
            seed_ids = store.find_symbol_seeds(symbol, path_filter)
            if not seed_ids:
                raise ValueError(f"No symbol matching {symbol!r} in CodeGraph index")
        elif task:
            seed_ids = store.find_task_seeds(task, path_filter)
            if not seed_ids:
                raise ValueError(f"No symbols matching task {task!r} in CodeGraph index")
        else:
            seed_ids = store.pick_auto_seeds(path_filter)

        # The summary depends only on the inputs and the seeds, so it is
        # built once and shared by both return paths.
        summary = _build_filter_summary(
            path_filter=path_filter,
            symbol=symbol,
            task=task,
            seed_ids=seed_ids,
        )

        if not seed_ids:
            return SubgraphResult(
                nodes=[],
                edges=[],
                truncated=False,
                filter_summary=summary,
                languages=[],
                seed_ids=[],
            )

        node_ids, truncated = store.expand_subgraph(
            seed_ids,
            path_filter=path_filter,
            hops=hops,
            max_nodes=max_nodes,
        )
        # Always include seeds even if path filter excluded them from adjacency.
        node_ids.update(seed_ids)

        fetched_nodes = store.fetch_nodes_by_ids(node_ids)
        call_edges = store.fetch_call_edges_for_nodes(node_ids)

        # Drop excluded kinds unless they are call endpoints.
        endpoint_ids: Set[str] = set()
        for edge in call_edges:
            endpoint_ids.add(edge.source_id)
            endpoint_ids.add(edge.target_id)
        nodes = [
            node
            for node in fetched_nodes
            if not (node.kind in EXCLUDED_NODE_KINDS and node.id not in endpoint_ids)
        ]

        # Keep only edges whose two endpoints both survived the node filter.
        kept_ids = {node.id for node in nodes}
        edges = [
            edge
            for edge in call_edges
            if edge.source_id in kept_ids and edge.target_id in kept_ids
        ]
        languages = sorted({node.language for node in nodes if node.language})

        return SubgraphResult(
            nodes=nodes,
            edges=edges,
            truncated=truncated,
            filter_summary=summary,
            languages=languages,
            seed_ids=list(seed_ids),
        )
449
+
450
+
451
# Maps user-facing query keys to load_call_subgraph keyword names. Keys not
# listed here are passed through lower-cased.
_QUERY_KEY_ALIASES = {
    "path": "path_filter",
    "max-nodes": "max_nodes",
    "max_nodes": "max_nodes",
}


def parse_codegraph_query(query: str) -> dict:
    """Parse CLI-style CodeGraph query strings into load_call_subgraph kwargs.

    Supported forms:
        symbol=load_call_subgraph path=hyperextract/ hops=3 max_nodes=500
        symbol:load_call_subgraph task="fix parser bug"

    Grammar: whitespace-separated ``key=value`` or ``key:value`` pairs.
    Keys are case-insensitive and end at whitespace, ``=`` or ``:``. A value
    is either quoted (single or double quotes; ``\\\\`` and an escaped quote
    character are decoded, other backslashes are kept verbatim) or bare, in
    which case it runs to the next whitespace and may itself contain ``=``
    or ``:`` (``symbol=Foo::bar``, ``path=C:/src``). An unterminated quote
    extends to the end of the input.

    Args:
        query: The raw query string; ``None``/blank yields ``{}``.

    Returns:
        Dict of canonical keys to values; ``hops`` and ``max_nodes`` are
        converted to ``int``. A repeated key keeps its last value.

    Raises:
        ValueError: If a key is not followed by ``=``/``:``, an integer
            field is not an integer, or both ``symbol`` and ``task`` appear.
    """
    if not query or not query.strip():
        return {}

    text = query.strip()
    length = len(text)
    result: dict = {}
    i = 0

    def skip_ws() -> None:
        nonlocal i
        while i < length and text[i].isspace():
            i += 1

    def read_key() -> str:
        nonlocal i
        start = i
        while i < length and not text[i].isspace() and text[i] not in "=:":
            i += 1
        return text[start:i]

    def read_bare_value() -> str:
        # Only whitespace ends a bare value; "=" and ":" are ordinary
        # characters here so qualified names and drive letters survive.
        nonlocal i
        start = i
        while i < length and not text[i].isspace():
            i += 1
        return text[start:i]

    def read_quoted() -> str:
        nonlocal i
        quote = text[i]
        i += 1
        chars: List[str] = []
        while i < length and text[i] != quote:
            if text[i] == "\\" and i + 1 < length:
                escaped = text[i + 1]
                if escaped != quote and escaped != "\\":
                    # Unknown escape: keep the backslash (e.g. Windows paths).
                    chars.append("\\")
                chars.append(escaped)
                i += 2
                continue
            chars.append(text[i])
            i += 1
        if i < length:
            i += 1  # consume the closing quote
        return "".join(chars)

    while i < length:
        skip_ws()
        if i >= length:
            break

        key = read_key()
        if not key:
            # Stray separator with no key in front of it: skip it.
            i += 1
            continue

        skip_ws()
        if i >= length or text[i] not in "=:":
            raise ValueError(f"Expected '=' or ':' after key {key!r}")
        i += 1
        skip_ws()

        if i < length and text[i] in "\"'":
            value = read_quoted()
        else:
            value = read_bare_value()

        canonical = _QUERY_KEY_ALIASES.get(key.lower(), key.lower())
        if canonical in ("hops", "max_nodes"):
            try:
                result[canonical] = int(value)
            except ValueError as exc:
                raise ValueError(f"Invalid integer for {key}: {value!r}") from exc
        else:
            result[canonical] = value

    if "symbol" in result and "task" in result:
        raise ValueError("Specify only one of symbol or task")
    return result
533
+
534
+
535
def merge_query_params(
    *,
    query: Optional[str] = None,
    path_filter: Optional[str] = None,
    symbol: Optional[str] = None,
    task: Optional[str] = None,
    hops: Optional[int] = None,
    max_nodes: Optional[int] = None,
    default_path_filter: Optional[str] = None,
) -> dict:
    """Combine a query string with explicit fields; explicit fields win.

    The parsed *query* forms the base. Every explicit argument that is not
    ``None`` overrides the same key. *default_path_filter* is used only when
    neither the explicit argument nor the query supplies a path filter.

    Raises:
        ValueError: If the merged result has both a symbol and a task, or if
            the query string itself is invalid.
    """
    merged: dict = {}
    if query and query.strip():
        merged.update(parse_codegraph_query(query))

    if path_filter is not None:
        merged["path_filter"] = path_filter
    elif default_path_filter and "path_filter" not in merged:
        merged["path_filter"] = default_path_filter

    explicit_fields = (
        ("symbol", symbol),
        ("task", task),
        ("hops", hops),
        ("max_nodes", max_nodes),
    )
    for key, value in explicit_fields:
        if value is not None:
            merged[key] = value

    if merged.get("symbol") and merged.get("task"):
        raise ValueError("Specify only one of symbol or task")
    return merged
566
+
567
+
568
def make_search_callback(
    project_path: Path,
    path_filter: Optional[str] = None,
    *,
    top_k: int = 10,
) -> Callable[[str], Tuple[List[CodeSymbolNode], List[CodeCallEdge]]]:
    """Build a search function for OntoSight on top of CodeGraph symbol search.

    The returned callable takes a query string and returns the *top_k* best
    matching symbols plus the call edges that connect those symbols to each
    other. The database is located and opened on every call, so nothing is
    touched until the callback is actually invoked.
    """

    def search(query: str) -> Tuple[List[CodeSymbolNode], List[CodeCallEdge]]:
        with CodeGraphStore(resolve_codegraph_db(project_path)) as store:
            matches = store.search_symbols(query, path_filter, limit=top_k)
            match_ids = {node.id for node in matches}
            # Keep only edges that stay inside the result set.
            internal_edges = []
            for edge in store.fetch_call_edges_for_nodes(match_ids):
                if edge.source_id in match_ids and edge.target_id in match_ids:
                    internal_edges.append(edge)
        return matches, internal_edges

    return search