codegraph-gen 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,349 @@
1
+ import logging
2
+ from pathlib import Path
3
+ import tree_sitter
4
+ from codegraph_gen.parser.base import BaseParser, ExtractionResult, NodeSchema, EdgeSchema
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ class CCppParser(BaseParser):
10
def __init__(self, lang_module):
    """Create a tree-sitter parser from a grammar module.

    Args:
        lang_module: Object exposing ``language()``; the value it returns
            is wrapped in ``tree_sitter.Language``.
    """
    grammar = tree_sitter.Language(lang_module.language())
    self.language = grammar
    self.parser = tree_sitter.Parser(grammar)
13
+
14
def _get_declarator_name(self, node, source: bytes) -> str:
    """Return the name buried inside a declarator subtree.

    Name-like nodes yield their own source text. Wrapper declarators
    (pointer, reference, parenthesized, array, function) are unwrapped
    through their ``declarator`` field. Anything else, including a wrapper
    with no ``declarator`` field, falls back to the first child that
    yields a non-empty name.

    Args:
        node: Declarator node, or a falsy value.
        source: Bytes that the node offsets index into.

    Returns:
        The decoded name, or ``""`` when none is found.
    """
    if not node:
        return ""

    kind = node.type

    # Leaf-like nodes: the node text is the name itself.
    if kind in (
        "identifier",
        "field_identifier",
        "destructor_name",
        "qualified_identifier",
        "operator_name",
    ):
        raw = source[node.start_byte : node.end_byte]
        return raw.decode("utf-8", errors="replace")

    # Wrappers: follow the ``declarator`` field when it is present.
    if kind in (
        "pointer_declarator",
        "reference_declarator",
        "parenthesized_declarator",
        "array_declarator",
        "function_declarator",
    ):
        inner = node.child_by_field_name("declarator")
        if inner:
            return self._get_declarator_name(inner, source)

    # Fallback: first child (in order) that produces a name.
    for candidate in node.children:
        found = self._get_declarator_name(candidate, source)
        if found:
            return found
    return ""
44
+
45
def _get_docstring(self, node, source: bytes) -> str:
    """Collect the run of comments that directly precede ``node``.

    Walks ``prev_sibling`` links while the sibling is a comment node,
    removes the comment delimiters, and joins the pieces in source order.

    Block comments (``/* ... */``) are handled before any ``/`` stripping.
    Stripping the leading slash first would turn ``/*`` into ``*`` and
    leave a stray asterisk in the result. The ``*`` gutter that commonly
    decorates each line of a multi-line block comment is removed too.

    Args:
        node: Node whose leading comments are wanted.
        source: Bytes that the node offsets index into.

    Returns:
        The cleaned comment text, one comment per line group, or ``""``
        when no comment precedes the node.
    """
    comments = []
    prev = node.prev_sibling
    while prev and prev.type in ("comment", "line_comment", "block_comment"):
        text = (
            source[prev.start_byte : prev.end_byte]
            .decode("utf-8", errors="replace")
            .strip()
        )
        if text.startswith("/*"):
            inner = text[2:]
            if inner.endswith("*/"):
                inner = inner[:-2]
            # Drop per-line "*" decoration (covers "/**" openers as well).
            pieces = [line.strip().lstrip("*").strip() for line in inner.splitlines()]
            clean_text = "\n".join(pieces).strip()
        else:
            # Line comment: "//", "///" and similar all reduce to the text.
            clean_text = text.lstrip("/").strip()
        comments.append(clean_text)
        prev = prev.prev_sibling

    # Siblings were visited bottom-up; restore top-down order.
    return "\n".join(reversed(comments))
65
+
66
def _get_signature(self, node, source: bytes) -> str:
    """Return the declaration header of ``node`` without its body.

    When the node has a ``body`` field, the signature is the text from the
    start of the node up to the start of the body, with a trailing ``{``
    removed. Otherwise it is the first line of the node text.

    Args:
        node: Definition node.
        source: Bytes that the node offsets index into.

    Returns:
        The signature text.
    """
    body = node.child_by_field_name("body")
    if not body:
        whole = source[node.start_byte : node.end_byte].decode(
            "utf-8", errors="replace"
        )
        return whole.split("\n")[0]

    header = source[node.start_byte : body.start_byte]
    header = header.decode("utf-8", errors="replace").strip()
    return header[:-1].strip() if header.endswith("{") else header
83
+
84
def parse_file(self, file_path: Path, workspace_dir: Path) -> ExtractionResult:
    """Extract graph nodes and edges from one C/C++ source file.

    The file is parsed with ``self.parser`` and the tree is walked
    depth-first. The walk emits:

    * one ``file`` node whose id is the path relative to ``workspace_dir``;
    * a node plus a ``contains`` edge for every named namespace and every
      named class/struct/union/enum that has a body;
    * a node plus a ``contains`` edge for every function definition;
    * ``inherits`` edges for base-class clauses, ``imports`` edges for
      ``#include`` directives and ``calls`` edges for call expressions.
      Their targets are raw source text, not resolved ids.

    Symbol ids have the form ``<rel_path>::<dotted.path>``. ``::`` inside a
    qualified name is always rewritten to ``.``, for specifiers as well as
    for functions. This keeps ids built from a nested declaration and ids
    built from a qualified name identical. An out-of-line definition such
    as ``Foo::bar`` is first resolved against the enclosing scopes, so a
    class declared in the surrounding namespace is found, and only then
    against the file level.

    Args:
        file_path: File to parse; read as raw bytes.
        workspace_dir: Root that ``file_path`` is made relative to. The
            ``Path.relative_to`` call is not guarded here.

    Returns:
        The populated ``ExtractionResult``, or an empty one when the file
        cannot be read.
    """
    try:
        source = file_path.read_bytes()
    except Exception as e:
        logger.error("Error reading file %s: %s", file_path, e)
        return ExtractionResult()

    root = self.parser.parse(source).root_node

    rel_path = str(file_path.relative_to(workspace_dir))
    result = ExtractionResult()
    # Ids of every node emitted so far; used to decide whether an
    # out-of-line method can be attached to its class.
    defined_ids = set()

    # Add file node
    file_node_id = rel_path
    result.nodes.append(
        NodeSchema(
            id=file_node_id,
            label=file_path.name,
            type="file",
            source_file=rel_path,
            line_start=1,
            line_end=len(source.splitlines()) or 1,
            signature=f"file {file_path.name}",
            docstring=self._get_docstring(root, source),
        )
    )
    defined_ids.add(file_node_id)

    # Stack of (symbol id, symbol type) for the scopes currently open.
    scope_stack = [(file_node_id, "file")]

    specifier_types = {
        "class_specifier": "class",
        "struct_specifier": "struct",
        "union_specifier": "union",
        "enum_specifier": "enum",
        "namespace_definition": "namespace",
    }

    def text_of(n) -> str:
        return source[n.start_byte : n.end_byte].decode("utf-8", errors="replace")

    def get_current_parent_id():
        return scope_stack[-1][0] if scope_stack else file_node_id

    def find_base_type(n):
        """Return the text of the first type-like node in ``n``'s subtree."""
        if n.type in ("type_identifier", "qualified_identifier", "template_type"):
            return text_of(n).strip()
        for c in n.children:
            found = find_base_type(c)
            if found:
                return found
        return None

    def resolve_class_id(class_path: str) -> str:
        """Map a dotted class path to an id, preferring enclosing scopes."""
        for scope_id, scope_type in reversed(scope_stack):
            if scope_type == "file":
                break
            candidate = f"{scope_id}.{class_path}"
            if candidate in defined_ids:
                return candidate
        return f"{rel_path}::{class_path}"

    def walk(node):
        if node.type == "ERROR" or getattr(node, "is_error", False):
            logger.debug("Skipping syntax error node in C/C++ AST: %s", node)
            return

        node_type = node.type
        pushed_scope = False

        if node_type in specifier_types:
            # A type specifier without a body is a reference or forward
            # declaration, not a definition: only descend into it.
            if (
                node_type != "namespace_definition"
                and not node.child_by_field_name("body")
            ):
                for child in node.children:
                    walk(child)
                return

            name_node = node.child_by_field_name("name")
            name = text_of(name_node).strip() if name_node else ""

            if not name:
                # Anonymous specifier: contributes no symbol of its own.
                for child in node.children:
                    walk(child)
                return

            parent_id = get_current_parent_id()
            if "::" in name:
                # Qualified name: normalise to the dotted form used by
                # every other id so later lookups can match it.
                dotted = name.replace("::", ".")
                symbol_id = f"{rel_path}::{dotted}"
            else:
                parent_parts = parent_id.split("::", 1)
                if len(parent_parts) > 1:
                    symbol_id = f"{rel_path}::{parent_parts[1]}.{name}"
                else:
                    symbol_id = f"{rel_path}::{name}"

            sym_type = specifier_types[node_type]

            result.nodes.append(
                NodeSchema(
                    id=symbol_id,
                    label=name,
                    type=sym_type,
                    source_file=rel_path,
                    line_start=node.start_point[0] + 1,
                    line_end=node.end_point[0] + 1,
                    signature=self._get_signature(node, source),
                    docstring=self._get_docstring(node, source),
                )
            )
            defined_ids.add(symbol_id)

            result.edges.append(
                EdgeSchema(source=parent_id, target=symbol_id, relation="contains")
            )

            # Handle base classes inheritance
            for child in node.children:
                if child.type != "base_class_clause":
                    continue
                for sub in child.children:
                    base_name = find_base_type(sub)
                    if base_name:
                        result.edges.append(
                            EdgeSchema(
                                source=symbol_id,
                                target=base_name,
                                relation="inherits",
                            )
                        )

            scope_stack.append((symbol_id, sym_type))
            pushed_scope = True

        elif node_type == "function_definition":
            declarator = node.child_by_field_name("declarator")
            func_name = self._get_declarator_name(declarator, source)

            if func_name:
                parent_id = get_current_parent_id()
                parent_type = scope_stack[-1][1] if scope_stack else "file"

                if "::" in func_name:
                    # Out-of-line definition such as ``Foo::bar``.
                    class_part, method_part = func_name.rsplit("::", 1)
                    class_id = resolve_class_id(class_part.replace("::", "."))
                    method_id = f"{class_id}.{method_part}"
                    sym_type = "method"
                    func_label = method_part
                    # Attach to the class only when this file defines it.
                    owner_id = class_id if class_id in defined_ids else file_node_id
                elif parent_type in ("class", "struct", "union", "namespace"):
                    method_id = f"{parent_id}.{func_name}"
                    sym_type = "method" if parent_type != "namespace" else "function"
                    func_label = func_name
                    owner_id = parent_id
                else:
                    method_id = f"{rel_path}::{func_name}"
                    sym_type = "function"
                    func_label = func_name
                    owner_id = parent_id

                result.edges.append(
                    EdgeSchema(source=owner_id, target=method_id, relation="contains")
                )
                result.nodes.append(
                    NodeSchema(
                        id=method_id,
                        label=func_label,
                        type=sym_type,
                        source_file=rel_path,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        signature=self._get_signature(node, source),
                        docstring=self._get_docstring(node, source),
                    )
                )
                defined_ids.add(method_id)

                scope_stack.append((method_id, sym_type))
                pushed_scope = True

        elif node_type == "preproc_include":
            path_node = node.child_by_field_name("path")
            if not path_node:
                for child in node.children:
                    if child.type in ("string_literal", "system_lib_string"):
                        path_node = child
                        break
            if path_node:
                include_path = text_of(path_node).strip('"<>')
                result.edges.append(
                    EdgeSchema(
                        source=file_node_id, target=include_path, relation="imports"
                    )
                )

        elif node_type == "call_expression":
            func_node = node.child_by_field_name("function")
            if func_node:
                callee_name = text_of(func_node).strip()
                caller_id = get_current_parent_id()
                result.edges.append(
                    EdgeSchema(source=caller_id, target=callee_name, relation="calls")
                )

        for child in node.children:
            walk(child)

        if pushed_scope:
            scope_stack.pop()

    walk(root)
    return result
336
+
337
+
338
class CParser(CCppParser):
    """``CCppParser`` configured with the ``tree_sitter_c`` grammar."""

    def __init__(self):
        # Imported here rather than at module level, so the grammar package
        # is only needed once a CParser is actually constructed.
        import tree_sitter_c as c_grammar

        super().__init__(c_grammar)
343
+
344
+
345
class CppParser(CCppParser):
    """``CCppParser`` configured with the ``tree_sitter_cpp`` grammar."""

    def __init__(self):
        # Imported here rather than at module level, so the grammar package
        # is only needed once a CppParser is actually constructed.
        import tree_sitter_cpp as cpp_grammar

        super().__init__(cpp_grammar)
@@ -0,0 +1,268 @@
1
+ import logging
2
+ from pathlib import Path
3
+ import tree_sitter
4
+ import tree_sitter_go
5
+ from codegraph_gen.parser.base import BaseParser, ExtractionResult, NodeSchema, EdgeSchema
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class GoParser(BaseParser):
11
def __init__(self):
    """Create a tree-sitter parser bound to the Go grammar."""
    go_grammar = tree_sitter.Language(tree_sitter_go.language())
    self.language = go_grammar
    self.parser = tree_sitter.Parser(go_grammar)
14
+
15
def _get_docstring(self, node, source: bytes) -> str:
    """Find the comments immediately preceding ``node``.

    Walks ``prev_sibling`` links while the sibling is a comment node,
    strips the comment delimiters, and joins the texts in source order.
    Block comments (``/* ... */``) lose both delimiters. Line comments
    lose every leading ``/``.

    Args:
        node: Node whose leading comments are wanted.
        source: Bytes that the node offsets index into.

    Returns:
        The cleaned comment text, or ``""`` when no comment precedes the
        node.
    """
    comments = []
    prev = node.prev_sibling
    while prev and prev.type in ("comment", "line_comment"):
        text = (
            source[prev.start_byte : prev.end_byte]
            .decode("utf-8", errors="replace")
            .strip()
        )
        if text.startswith("/*"):
            # Must be checked before stripping "/", otherwise the opener
            # degrades to "*" and the closer is left in place.
            text = text[2:]
            if text.endswith("*/"):
                text = text[:-2]
        else:
            # str.lstrip takes a character set, so a single "/" is the
            # honest spelling of what the former lstrip("//") did.
            text = text.lstrip("/")
        comments.append(text.strip())
        prev = prev.prev_sibling

    # Siblings were visited bottom-up; restore top-down order.
    return "\n".join(reversed(comments))
32
+
33
def _get_signature(self, node, source: bytes) -> str:
    """Return the declaration header of ``node`` without its body.

    With a ``body`` field the result is the text before the body, stripped,
    with a trailing ``{`` removed. Without one, it is the first line of
    the node text.

    Args:
        node: Declaration node.
        source: Bytes that the node offsets index into.

    Returns:
        The signature text.
    """
    body = node.child_by_field_name("body")
    stop = body.start_byte if body else node.end_byte
    text = source[node.start_byte : stop].decode("utf-8", errors="replace")

    if not body:
        first_line, _, _ = text.partition("\n")
        return first_line

    text = text.strip()
    if text.endswith("{"):
        text = text[:-1].strip()
    return text
47
+
48
def parse_file(self, file_path: Path, workspace_dir: Path) -> ExtractionResult:
    """Extract graph nodes and edges from one Go source file.

    The file is parsed with ``self.parser`` and the tree is walked
    depth-first. The walk emits:

    * one ``file`` node whose id is the path relative to ``workspace_dir``;
    * a node plus a ``contains`` edge from the file for every ``type_spec``
      (``interface`` when it holds an ``interface_type`` child, ``struct``
      otherwise) and for every function declaration;
    * a ``method`` node for every method declaration, contained by
      ``<rel_path>::<ReceiverType>``;
    * ``imports`` edges carrying an ``import_map`` of local name to package
      name, and ``calls`` edges whose target is the raw callee text.

    Receiver types are normalised to the bare type name: a leading ``*``
    (pointer receiver) and any ``[...]`` type arguments are removed. The
    method id and its ``contains`` edge then refer to the same id that the
    type declaration produced.

    Args:
        file_path: File to parse; read as raw bytes.
        workspace_dir: Root that ``file_path`` is made relative to. The
            ``Path.relative_to`` call is not guarded here.

    Returns:
        The populated ``ExtractionResult``, or an empty one when the file
        cannot be read.
    """
    try:
        source = file_path.read_bytes()
    except Exception as e:
        logger.error("Error reading file %s: %s", file_path, e)
        return ExtractionResult()

    root = self.parser.parse(source).root_node

    rel_path = str(file_path.relative_to(workspace_dir))
    result = ExtractionResult()

    # Add file node
    file_node_id = rel_path
    result.nodes.append(
        NodeSchema(
            id=file_node_id,
            label=file_path.name,
            type="file",
            source_file=rel_path,
            line_start=1,
            line_end=len(source.splitlines()) or 1,
            signature=f"package {file_path.parent.name or 'main'}",
            docstring=self._get_docstring(root, source),
        )
    )

    def text_of(n) -> str:
        return source[n.start_byte : n.end_byte].decode("utf-8", errors="replace")

    def get_receiver_type(method_node) -> str | None:
        """Return the bare receiver type name of a method declaration."""
        receiver = method_node.child_by_field_name("receiver")
        if not receiver:
            return None
        for child in receiver.children:
            if child.type != "parameter_declaration":
                continue
            type_node = child.child_by_field_name("type")
            if type_node:
                raw_type = text_of(type_node).strip()
                # "*Server" -> "Server", "*List[T]" -> "List", so the
                # result matches the id built from the type declaration.
                return raw_type.lstrip("*").split("[", 1)[0].strip()
        return None

    def enclosing_callable_id(call_node) -> str:
        """Return the id of the nearest enclosing function or method."""
        curr = call_node.parent
        while curr:
            if curr.type in ("function_declaration", "method_declaration"):
                c_name_node = curr.child_by_field_name("name")
                if c_name_node:
                    c_name = text_of(c_name_node)
                    if curr.type == "method_declaration":
                        r_type = get_receiver_type(curr)
                        if r_type:
                            return f"{rel_path}::{r_type}.{c_name}"
                    return f"{rel_path}::{c_name}"
                break
            curr = curr.parent
        # Calls outside any named declaration belong to the file itself.
        return file_node_id

    def walk(node):
        if node.type == "ERROR" or getattr(node, "is_error", False):
            logger.debug("Skipping syntax error node in Go AST: %s", node)
            return

        node_type = node.type

        if node_type == "type_declaration":
            for child in node.children:
                if child.type != "type_spec":
                    continue
                name_node = child.child_by_field_name("name")
                if not name_node:
                    continue
                type_name = text_of(name_node)
                type_id = f"{rel_path}::{type_name}"

                sym_type = "struct"
                if any(tc.type == "interface_type" for tc in child.children):
                    sym_type = "interface"

                result.nodes.append(
                    NodeSchema(
                        id=type_id,
                        label=type_name,
                        type=sym_type,
                        source_file=rel_path,
                        line_start=child.start_point[0] + 1,
                        line_end=child.end_point[0] + 1,
                        signature=f"type {type_name} {sym_type}",
                        # Comments are looked up on the whole declaration,
                        # not on the individual spec.
                        docstring=self._get_docstring(node, source),
                    )
                )
                result.edges.append(
                    EdgeSchema(
                        source=file_node_id,
                        target=type_id,
                        relation="contains",
                    )
                )

        elif node_type == "function_declaration":
            name_node = node.child_by_field_name("name")
            if name_node:
                func_name = text_of(name_node)
                func_id = f"{rel_path}::{func_name}"

                result.nodes.append(
                    NodeSchema(
                        id=func_id,
                        label=func_name,
                        type="function",
                        source_file=rel_path,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        signature=self._get_signature(node, source),
                        docstring=self._get_docstring(node, source),
                    )
                )
                result.edges.append(
                    EdgeSchema(
                        source=file_node_id, target=func_id, relation="contains"
                    )
                )

        elif node_type == "method_declaration":
            name_node = node.child_by_field_name("name")
            if name_node:
                method_name = text_of(name_node)
                receiver_type = get_receiver_type(node)

                if receiver_type:
                    parent_id = f"{rel_path}::{receiver_type}"
                    method_id = f"{parent_id}.{method_name}"
                else:
                    # No usable receiver: treat like a file-level symbol.
                    parent_id = file_node_id
                    method_id = f"{rel_path}::{method_name}"

                result.nodes.append(
                    NodeSchema(
                        id=method_id,
                        label=method_name,
                        type="method",
                        source_file=rel_path,
                        line_start=node.start_point[0] + 1,
                        line_end=node.end_point[0] + 1,
                        signature=self._get_signature(node, source),
                        docstring=self._get_docstring(node, source),
                    )
                )
                result.edges.append(
                    EdgeSchema(
                        source=parent_id, target=method_id, relation="contains"
                    )
                )

        elif node_type == "import_spec":
            path_node = node.child_by_field_name("path")
            if path_node:
                import_path = text_of(path_node).strip("\"'")
                pkg_name = import_path.split("/")[-1]

                # Maps the name used in this file to the package name.
                import_map = {}
                name_node = node.child_by_field_name("name")
                if name_node:
                    local_name = text_of(name_node)
                    if local_name == ".":
                        import_map["*"] = "*"
                    else:
                        import_map[local_name] = pkg_name
                else:
                    import_map[pkg_name] = pkg_name

                result.edges.append(
                    EdgeSchema(
                        source=file_node_id,
                        target=import_path,
                        relation="imports",
                        import_map=import_map,
                    )
                )

        elif node_type == "call_expression":
            func_node = node.child_by_field_name("function")
            if func_node:
                callee_name = text_of(func_node)
                result.edges.append(
                    EdgeSchema(
                        source=enclosing_callable_id(node),
                        target=callee_name,
                        relation="calls",
                    )
                )

        for child in node.children:
            walk(child)

    walk(root)
    return result