codegraph-ai 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codegraph/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """CodeScope: Graph + Vector Code Intelligence powered by neug and zvec."""
codegraph/__main__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Allow ``python -m codegraph`` to invoke the CLI."""
2
+
3
+ from codegraph.cli import main
4
+
5
+ main()
@@ -0,0 +1 @@
1
+ """CodeScope language adapters for parsing source code via tree-sitter."""
@@ -0,0 +1,38 @@
1
+ """Abstract base class for language-specific source code adapters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ from codegraph.models import ParseResult
8
+
9
+
10
class BaseAdapter(ABC):
    """Contract for adapters that turn one source file into a ``ParseResult``."""

    @abstractmethod
    def language_name(self) -> str:
        """Give the canonical name of the handled language (``'python'``, ``'c'``, ...)."""

    @abstractmethod
    def supported_extensions(self) -> list[str]:
        """List the file extensions claimed by this adapter, e.g. ``['.py']``."""

    @abstractmethod
    def parse_file(self, source: bytes, file_path: str) -> ParseResult:
        """Turn the raw *source* bytes into structured code elements.

        Parameters
        ----------
        source:
            Unmodified bytes read from the source file.
        file_path:
            Path relative to the repository root; it is used when building
            unique IDs for the extracted elements.
        """

    def can_handle(self, file_path: str) -> bool:
        """Tell whether *file_path* ends with one of the supported extensions."""
        for extension in self.supported_extensions():
            if file_path.endswith(extension):
                return True
        return False
@@ -0,0 +1,520 @@
1
+ """C source code adapter using tree-sitter.
2
+
3
+ Handles ``.c`` and ``.h`` files.
4
+ Extracts:
5
+ - Function definitions (including kernel macro-wrapped patterns)
6
+ - Function calls (direct and via field expressions like ``ptr->method()``)
7
+ - ``#include`` directives (system and local)
8
+ - ``struct``/``union``/``enum`` definitions as Class nodes
9
+ - ``static`` scoping for translation-unit-local functions
10
+ - Kernel macros: ``SYSCALL_DEFINE*``, ``EXPORT_SYMBOL*``,
11
+ ``module_init``/``module_exit``
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+
18
+ from tree_sitter_language_pack import get_parser
19
+
20
+ from codegraph.adapters.base import BaseAdapter
21
+ from codegraph.models import (
22
+ CallInfo,
23
+ ParsedClass,
24
+ ParsedFunction,
25
+ ParsedImport,
26
+ ParseResult,
27
+ )
28
+
29
# Matches syscall-definition macro names that end in exactly one digit,
# e.g. ``SYSCALL_DEFINE3``.
_SYSCALL_RE = re.compile(r"^SYSCALL_DEFINE\d$")
# Macro names whose identifier arguments are recorded as exported symbols.
_EXPORT_MACROS = frozenset({"EXPORT_SYMBOL", "EXPORT_SYMBOL_GPL"})
# Macro names whose identifier arguments are recorded as module entry points.
_MODULE_MACROS = frozenset({"module_init", "module_exit"})

# Sentinel stored in ``ParsedFunction.class_name`` to flag a ``static`` function.
_STATIC_MARKER = "__static__"
34
+
35
+
36
+ def _node_text(node) -> str:
37
+ return node.text.decode("utf-8") if node and node.text else ""
38
+
39
+
40
+ def _extract_c_doc(node) -> str:
41
+ """Extract a preceding block comment (``/* ... */``) as documentation."""
42
+ prev = node.prev_sibling
43
+ if prev is None or prev.type != "comment":
44
+ return ""
45
+ text = _node_text(prev)
46
+ if not text.startswith("/*"):
47
+ return ""
48
+ text = text[2:]
49
+ if text.endswith("*/"):
50
+ text = text[:-2]
51
+ lines = text.splitlines()
52
+ cleaned: list[str] = []
53
+ for line in lines:
54
+ line = line.strip()
55
+ if line.startswith("*"):
56
+ line = line[1:].strip()
57
+ if line:
58
+ cleaned.append(line)
59
+ return " ".join(cleaned)
60
+
61
+
62
+ def _build_c_signature(type_text: str, declarator_text: str) -> str:
63
+ return f"{type_text} {declarator_text}"
64
+
65
+
66
+ def _get_function_name(func_declarator_node) -> str:
67
+ """Walk down nested declarators to find the identifier name."""
68
+ node = func_declarator_node
69
+ while node is not None:
70
+ decl = node.child_by_field_name("declarator")
71
+ if decl is None:
72
+ return _node_text(node)
73
+ if decl.type == "identifier":
74
+ return _node_text(decl)
75
+ if decl.type == "function_declarator":
76
+ node = decl
77
+ continue
78
+ if decl.type in ("pointer_declarator", "parenthesized_declarator"):
79
+ inner = decl.child_by_field_name("declarator")
80
+ if inner and inner.type == "identifier":
81
+ return _node_text(inner)
82
+ if inner and inner.type == "function_declarator":
83
+ node = inner
84
+ continue
85
+ return _node_text(decl)
86
+ return _node_text(decl)
87
+ return "unknown"
88
+
89
+
90
+ def _collect_calls(node, calls: list[CallInfo]) -> None:
91
+ """Recursively collect function calls from a C AST subtree."""
92
+ if node.type == "call_expression":
93
+ func = node.child_by_field_name("function")
94
+ if func:
95
+ if func.type == "field_expression":
96
+ obj_node = func.child_by_field_name("argument")
97
+ field_node = func.child_by_field_name("field")
98
+ receiver = _node_text(obj_node) if obj_node else None
99
+ callee = _node_text(field_node) if field_node else _node_text(func)
100
+ calls.append(CallInfo(
101
+ callee_name=callee,
102
+ receiver=receiver,
103
+ raw_expression=_node_text(func),
104
+ ))
105
+ else:
106
+ callee = _node_text(func)
107
+ calls.append(CallInfo(
108
+ callee_name=callee,
109
+ receiver=None,
110
+ raw_expression=callee,
111
+ ))
112
+ for child in node.children:
113
+ _collect_calls(child, calls)
114
+
115
+
116
class CAdapter(BaseAdapter):
    """Extract functions, structs, calls and includes from C files.

    Parsing is delegated to the tree-sitter ``c`` parser returned by
    ``get_parser``.  The methods below walk the resulting syntax tree and turn
    the nodes of interest into ``ParsedFunction``, ``ParsedClass`` and
    ``ParsedImport`` records.
    """

    # Node types that are recorded as ``ParsedClass`` entries.
    _STRUCT_LIKE_TYPES = ("struct_specifier", "union_specifier", "enum_specifier")
    # Preprocessor nodes whose children are walked like top-level items.
    _PREPROC_CONTAINER_TYPES = (
        "preproc_ifdef", "preproc_if", "preproc_else",
        "preproc_elif", "preproc_function_def",
    )

    def __init__(self) -> None:
        self._parser = get_parser("c")

    def language_name(self) -> str:
        """Return ``"c"``."""
        return "c"

    def supported_extensions(self) -> list[str]:
        """Return the handled extensions: ``.c`` and ``.h``."""
        return [".c", ".h"]

    def parse_file(self, source: bytes, file_path: str) -> ParseResult:
        """Parse *source* and return its functions, struct-like types and includes.

        Parameters
        ----------
        source:
            Raw bytes of the C file.
        file_path:
            Path used to build qualified names (``"<file_path>:<name>"``).

        Returns
        -------
        ParseResult
            De-duplicated functions and classes plus all includes found.
        """
        tree = self._parser.parse(source)
        root = tree.root_node

        functions: list[ParsedFunction] = []
        classes: list[ParsedClass] = []
        imports: list[ParsedImport] = []

        exported_symbols: set[str] = set()
        # Filled in by the walk; not consumed by this method.
        module_entry_points: set[str] = set()

        self._walk(
            root, file_path, functions, classes, imports,
            exported_symbols, module_entry_points,
        )

        self._apply_export_overrides(functions, exported_symbols)

        # Deduplicate functions from #ifdef/#else branches -- keep the one
        # with the largest body (most likely the real implementation vs stub).
        functions = self._keep_largest(
            functions, lambda fn: fn.end_line - fn.start_line,
        )
        # Same policy for struct-like types: a forward declaration or a bare
        # ``typedef struct foo foo_t;`` reference seen first must not shadow
        # the real definition that appears later in the file.  The member
        # count breaks ties between single-line entries.
        classes = self._keep_largest(
            classes,
            lambda cls: (cls.end_line - cls.start_line, len(cls.method_names)),
        )

        return ParseResult(functions=functions, classes=classes, imports=imports)

    @staticmethod
    def _keep_largest(items: list, size) -> list:
        """Keep one item per ``file_path:name`` key, preferring the largest.

        *size* maps an item to a comparable value.  A later item replaces an
        earlier one with the same key only when its size is strictly greater,
        so ties keep the first occurrence.  Survivors keep their relative
        order from *items*.
        """
        best: dict[str, int] = {}
        for idx, item in enumerate(items):
            key = f"{item.file_path}:{item.name}"
            prev_idx = best.get(key)
            if prev_idx is None or size(item) > size(items[prev_idx]):
                best[key] = idx
        return [items[i] for i in sorted(best.values())]

    def _walk(
        self,
        node,
        file_path: str,
        functions: list[ParsedFunction],
        classes: list[ParsedClass],
        imports: list[ParsedImport],
        exported_symbols: set[str],
        module_entry_points: set[str],
    ) -> None:
        """Dispatch each direct child of *node* to the matching extractor.

        Preprocessor container nodes are walked recursively so that items
        inside conditional blocks are picked up as well.  Results are
        appended to the output collections passed in.
        """
        children = list(node.children)
        for index, child in enumerate(children):
            kind = child.type

            if kind == "function_definition":
                self._extract_function(child, file_path, functions)

            elif kind == "declaration":
                self._extract_declaration(child, file_path, classes)

            elif kind in self._STRUCT_LIKE_TYPES:
                self._extract_struct_like(child, file_path, classes)

            elif kind == "type_definition":
                self._extract_typedef(child, file_path, classes)

            elif kind == "preproc_include":
                self._extract_include(child, file_path, imports)

            elif kind == "expression_statement":
                # The sibling list and index let the handler look at the
                # nodes that follow the statement.
                self._handle_expression_stmt(
                    child, children, index, file_path,
                    functions, exported_symbols, module_entry_points,
                )

            elif kind in self._PREPROC_CONTAINER_TYPES:
                self._walk(
                    child, file_path, functions, classes, imports,
                    exported_symbols, module_entry_points,
                )

    def _extract_function(
        self,
        node,
        file_path: str,
        functions: list[ParsedFunction],
    ) -> None:
        """Append a ``ParsedFunction`` for a ``function_definition`` node.

        Nothing is appended when the node has no declarator.  ``static``
        functions are flagged by storing ``_STATIC_MARKER`` in ``class_name``.
        """
        is_static = any(
            child.type == "storage_class_specifier"
            and _node_text(child).strip() == "static"
            for child in node.children
        )

        type_node = node.child_by_field_name("type")
        decl_node = node.child_by_field_name("declarator")
        body_node = node.child_by_field_name("body")

        if decl_node is None:
            return

        name = _get_function_name(decl_node)
        type_text = _node_text(type_node) if type_node else ""
        sig = _build_c_signature(type_text, _node_text(decl_node))

        doc = _extract_c_doc(node)

        calls: list[CallInfo] = []
        if body_node:
            _collect_calls(body_node, calls)

        functions.append(
            ParsedFunction(
                name=name,
                qualified_name=f"{file_path}:{name}",
                signature=sig,
                file_path=file_path,
                # Tree-sitter rows are 0-based; stored lines are 1-based.
                start_line=node.start_point[0] + 1,
                end_line=node.end_point[0] + 1,
                doc_comment=doc,
                call_names=[c.callee_name for c in calls],
                calls=calls,
                class_name=_STATIC_MARKER if is_static else None,
            )
        )

    def _extract_declaration(
        self,
        node,
        file_path: str,
        classes: list[ParsedClass],
    ) -> None:
        """Handle top-level ``declaration`` nodes that may contain struct/union/enum.

        Only extracts struct-like types that contain a body (i.e., definitions,
        not bare type references in variable declarations).
        """
        type_node = node.child_by_field_name("type")
        if type_node is None or type_node.type not in self._STRUCT_LIKE_TYPES:
            return
        if type_node.child_by_field_name("body") is not None:
            self._extract_struct_like(type_node, file_path, classes)

    @staticmethod
    def _member_names(body_node) -> list[str]:
        """Return field declarator texts and enumerator names found in *body_node*.

        Returns an empty list when *body_node* is missing.
        """
        names: list[str] = []
        if not body_node:
            return names
        for child in body_node.children:
            if child.type == "field_declaration":
                member = child.child_by_field_name("declarator")
            elif child.type == "enumerator":
                member = child.child_by_field_name("name")
            else:
                continue
            if member:
                names.append(_node_text(member))
        return names

    def _append_class(
        self,
        display_name: str,
        span_node,
        body_node,
        file_path: str,
        classes: list[ParsedClass],
    ) -> None:
        """Append a ``ParsedClass`` named *display_name* spanning *span_node*."""
        classes.append(
            ParsedClass(
                name=display_name,
                qualified_name=f"{file_path}:{display_name}",
                file_path=file_path,
                # Tree-sitter rows are 0-based; stored lines are 1-based.
                start_line=span_node.start_point[0] + 1,
                end_line=span_node.end_point[0] + 1,
                method_names=self._member_names(body_node),
                base_classes=[],
            )
        )

    def _extract_struct_like(
        self,
        node,
        file_path: str,
        classes: list[ParsedClass],
    ) -> None:
        """Extract a named struct/union/enum as a ParsedClass.

        Anonymous specifiers (no ``name`` child) are ignored.  The recorded
        name is ``"<kind> <name>"``, e.g. ``"struct foo"``.
        """
        name_node = node.child_by_field_name("name")
        if name_node is None:
            return

        kind = node.type.replace("_specifier", "")  # struct, union, enum
        self._append_class(
            f"{kind} {_node_text(name_node)}",
            node,
            node.child_by_field_name("body"),
            file_path,
            classes,
        )

    def _extract_typedef(
        self,
        node,
        file_path: str,
        classes: list[ParsedClass],
    ) -> None:
        """Handle ``typedef struct { ... } name_t;``.

        A typedef of a named struct/union/enum is recorded under the tag name.
        An anonymous one is recorded under the typedef name and spans the
        whole typedef.  Typedefs of other types are ignored.
        """
        type_node = node.child_by_field_name("type")
        decl_node = node.child_by_field_name("declarator")

        if type_node is None or type_node.type not in self._STRUCT_LIKE_TYPES:
            return

        if type_node.child_by_field_name("name"):
            self._extract_struct_like(type_node, file_path, classes)
            return

        typedef_name = _node_text(decl_node) if decl_node else None
        if typedef_name:
            kind = type_node.type.replace("_specifier", "")
            self._append_class(
                f"{kind} {typedef_name}",
                node,
                type_node.child_by_field_name("body"),
                file_path,
                classes,
            )

    def _extract_include(
        self,
        node,
        file_path: str,
        imports: list[ParsedImport],
    ) -> None:
        """Append a ``ParsedImport`` for a ``#include`` directive.

        ``<...>`` includes are recorded with ``is_relative=False`` and
        ``"..."`` includes with ``is_relative=True``.  Directives with any
        other kind of path node are ignored.
        """
        path_node = node.child_by_field_name("path")
        if path_node is None:
            return

        if path_node.type == "system_lib_string":
            target = _node_text(path_node).strip("<>")
            is_relative = False
        elif path_node.type == "string_literal":
            content = next(
                (c for c in path_node.children if c.type == "string_content"),
                None,
            )
            if content is not None:
                target = _node_text(content)
            else:
                # No content child: fall back to stripping the quotes by hand.
                target = _node_text(path_node).strip('"')
            is_relative = True
        else:
            return

        imports.append(
            ParsedImport(
                source_path=file_path,
                target_module=target,
                imported_names=[],
                is_relative=is_relative,
            )
        )

    @staticmethod
    def _identifier_args(args) -> list[str]:
        """Return the text of every named ``identifier`` child of *args*."""
        if not args:
            return []
        return [
            _node_text(arg)
            for arg in args.children
            if arg.is_named and arg.type == "identifier"
        ]

    def _handle_expression_stmt(
        self,
        stmt_node,
        siblings: list,
        index: int,
        file_path: str,
        functions: list[ParsedFunction],
        exported_symbols: set[str],
        module_entry_points: set[str],
    ) -> None:
        """Handle expression statements that may be kernel macro invocations.

        ``SYSCALL_DEFINE<n>`` calls yield a function record; identifier
        arguments of the export macros are added to *exported_symbols* and
        those of the module macros to *module_entry_points*.
        """
        for child in stmt_node.children:
            if child.type != "call_expression":
                continue
            func = child.child_by_field_name("function")
            if func is None:
                continue
            args = child.child_by_field_name("arguments")

            macro_name = _node_text(func)

            if _SYSCALL_RE.match(macro_name):
                self._extract_syscall(
                    child, siblings, index, file_path, functions,
                )
            elif macro_name in _EXPORT_MACROS:
                exported_symbols.update(self._identifier_args(args))
            elif macro_name in _MODULE_MACROS:
                module_entry_points.update(self._identifier_args(args))

    def _extract_syscall(
        self,
        call_node,
        siblings: list,
        index: int,
        file_path: str,
        functions: list[ParsedFunction],
    ) -> None:
        """Create a Function node for ``SYSCALL_DEFINE*(name, ...){ body }``.

        The function is named ``sys_<first argument>``.  Its body is the first
        ``compound_statement`` among the two siblings that follow the macro
        statement; without one the function spans the macro line only and
        has no calls.
        """
        args = call_node.child_by_field_name("arguments")
        if args is None:
            return

        first_arg = next(
            (_node_text(child) for child in args.children if child.is_named),
            None,
        )
        if not first_arg:
            return

        sys_name = f"sys_{first_arg}"

        body_node = next(
            (
                sibling
                for sibling in siblings[index + 1:index + 3]
                if sibling.type == "compound_statement"
            ),
            None,
        )

        calls: list[CallInfo] = []
        if body_node:
            _collect_calls(body_node, calls)

        # Tree-sitter rows are 0-based; stored lines are 1-based.
        start_line = call_node.start_point[0] + 1
        end_line = (body_node.end_point[0] + 1) if body_node else start_line

        functions.append(
            ParsedFunction(
                name=sys_name,
                qualified_name=f"{file_path}:{sys_name}",
                signature=_node_text(call_node),
                file_path=file_path,
                start_line=start_line,
                end_line=end_line,
                doc_comment="",
                call_names=[c.callee_name for c in calls],
                calls=calls,
                class_name=None,
            )
        )

    @staticmethod
    def _apply_export_overrides(
        functions: list[ParsedFunction],
        exported_symbols: set[str],
    ) -> None:
        """If a static function is also EXPORT_SYMBOL'd, remove the static marker.

        The matching ``ParsedFunction`` objects are modified in place.
        """
        if not exported_symbols:
            return
        for fn in functions:
            if fn.class_name == _STATIC_MARKER and fn.name in exported_symbols:
                fn.class_name = None