codegraph-gen 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,387 @@
1
+ import logging
2
+ from pathlib import Path
3
+ import tree_sitter
4
+ import tree_sitter_kotlin
5
+ from codegraph_gen.parser.base import BaseParser, ExtractionResult, NodeSchema, EdgeSchema
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class KotlinParser(BaseParser):
11
def __init__(self):
    """Create the Kotlin grammar handle and a parser bound to it.

    The grammar object is stored on ``self.language`` and the parser built
    from that same object is stored on ``self.parser``.
    """
    kotlin_grammar = tree_sitter.Language(tree_sitter_kotlin.language())
    self.language = kotlin_grammar
    self.parser = tree_sitter.Parser(kotlin_grammar)
14
+
15
+ def _get_docstring(self, node, source: bytes) -> str:
16
+ """Finds comments immediately preceding the node."""
17
+ docstring = ""
18
+ prev = node.prev_sibling
19
+ comments = []
20
+ while prev and prev.type in ("comment", "line_comment", "block_comment"):
21
+ comment_text = source[prev.start_byte : prev.end_byte].decode(
22
+ "utf-8", errors="replace"
23
+ )
24
+ # Strip comment markers (//, /*, /**, *)
25
+ clean_text = (
26
+ comment_text.strip()
27
+ .lstrip("/*")
28
+ .rstrip("*/")
29
+ .lstrip("*")
30
+ .lstrip("/")
31
+ .strip()
32
+ )
33
+ comments.append(clean_text)
34
+ prev = prev.prev_sibling
35
+
36
+ if comments:
37
+ docstring = "\n".join(reversed(comments))
38
+ return docstring
39
+
40
+ def _get_signature(self, node, source: bytes) -> str:
41
+ body = None
42
+ for child in node.children:
43
+ if child.type in (
44
+ "class_body",
45
+ "function_body",
46
+ "block",
47
+ ):
48
+ body = child
49
+ break
50
+ if body:
51
+ end_byte = body.start_byte
52
+ sig_bytes = source[node.start_byte : end_byte]
53
+ sig = sig_bytes.decode("utf-8", errors="replace").strip()
54
+ if sig.endswith("{"):
55
+ sig = sig[:-1].strip()
56
+ return sig
57
+ return (
58
+ source[node.start_byte : node.end_byte]
59
+ .decode("utf-8", errors="replace")
60
+ .split("\n")[0]
61
+ )
62
+
63
def parse_file(self, file_path: Path, workspace_dir: Path) -> ExtractionResult:
    """Extract graph nodes and edges from one Kotlin source file.

    The file is parsed with ``self.parser`` and its syntax tree is visited in
    pre-order. The result contains:

    * one ``file`` node whose id is the path relative to ``workspace_dir``;
    * a ``class``/``interface`` node per named class or object declaration
      (id ``"<rel_path>::<Name>"``), plus ``inherits`` edges to the first
      identifier of each delegation specifier's user type;
    * a ``method`` node (id ``"<parent_id>.<name>"``) for functions whose
      enclosing scope is a class or interface, otherwise a ``function`` node
      (id ``"<rel_path>::<name>"``), each with a ``local_bindings`` mapping of
      parameter/property names to type names;
    * ``contains`` edges from the enclosing scope to each declaration;
    * ``imports`` edges from the file node, carrying an ``import_map``;
    * ``calls`` edges from the enclosing scope to the callee text.

    Subtrees rooted at syntax-error nodes are skipped. All traversals use
    explicit stacks rather than recursion, so a deeply nested syntax tree
    cannot exhaust the interpreter's recursion limit.

    Args:
        file_path: Path of the Kotlin file to parse.
        workspace_dir: Directory that ``file_path`` is made relative to.

    Returns:
        The populated ``ExtractionResult``; an empty one if the file cannot
        be read (the error is logged).

    Raises:
        ValueError: From ``Path.relative_to`` when ``file_path`` is not
            located under ``workspace_dir``.
    """
    try:
        source = file_path.read_bytes()
    except Exception as e:
        # Deliberately broad: an unreadable file is reported and skipped.
        logger.error("Error reading file %s: %s", file_path, e)
        return ExtractionResult()

    root = self.parser.parse(source).root_node

    rel_path = str(file_path.relative_to(workspace_dir))
    result = ExtractionResult()

    # Add file node
    file_node_id = rel_path
    result.nodes.append(
        NodeSchema(
            id=file_node_id,
            label=file_path.name,
            type="file",
            source_file=rel_path,
            line_start=1,
            line_end=len(source.splitlines()) or 1,
            signature=f"package {file_path.stem}",
            docstring=self._get_docstring(root, source),
        )
    )

    # Declarations that open their own scope; local-binding collection must
    # not look inside them.
    nested_declarations = (
        "function_declaration",
        "class_declaration",
        "object_declaration",
    )

    # Innermost enclosing scope is last. The file entry is never popped, so
    # the stack is never empty.
    scope_stack = [(file_node_id, "file")]

    def text_of(n) -> str:
        """Decode the source text covered by a node."""
        return source[n.start_byte : n.end_byte].decode("utf-8", errors="replace")

    def first_child(n, *kinds):
        """Return the first direct child whose type is in ``kinds``, or None."""
        return next((c for c in n.children if c.type in kinds), None)

    def preorder(start, prune=()):
        """Yield ``start`` and its descendants in pre-order.

        Children whose type is in ``prune`` are not yielded or descended into.
        """
        pending = [start]
        while pending:
            current = pending.pop()
            yield current
            # Reversed so the leftmost child is popped (visited) first.
            pending.extend(
                c for c in reversed(current.children) if c.type not in prune
            )

    def named_type(n):
        """Return the type name stated directly by ``n``, or None if none."""
        if n.type == "user_type":
            ident = first_child(n, "identifier")
            if ident is not None:
                return text_of(ident)
        elif n.type == "call_expression":
            callee = n.child_by_field_name("constructor") or first_child(
                n, "identifier"
            )
            if callee:
                return text_of(callee)
        return None

    def extract_type_from_kt_node(start):
        """Return the first non-empty type name found in pre-order, or None."""
        pending = [start]
        while pending:
            current = pending.pop()
            found = named_type(current)
            if found is None:
                pending.extend(reversed(current.children))
            elif found:
                return found
            # An empty name is neither returned nor descended into.
        return None

    def collect_local_bindings(func_node) -> dict:
        """Map parameter and property names in a function to type names."""
        bindings = {}
        for n in preorder(func_node, prune=nested_declarations):
            if n.type == "parameter":
                ident = first_child(n, "identifier")
                declared = first_child(n, "user_type")
                if ident is not None and declared is not None:
                    type_name = extract_type_from_kt_node(declared)
                    if type_name:
                        bindings[text_of(ident)] = type_name
            elif n.type == "property_declaration":
                var_decl = first_child(n, "variable_declaration")
                if var_decl is None:
                    continue
                ident = first_child(var_decl, "identifier")
                if ident is None:
                    continue
                declared = first_child(var_decl, "user_type")
                if declared is not None:
                    # An explicit type annotation wins over the initializer.
                    type_name = extract_type_from_kt_node(declared)
                else:
                    initializer = first_child(n, "call_expression")
                    type_name = (
                        extract_type_from_kt_node(initializer)
                        if initializer is not None
                        else None
                    )
                if type_name:
                    bindings[text_of(ident)] = type_name
        return bindings

    def visit_class(node):
        """Record a class/object declaration; return its scope entry or None."""
        name_node = node.child_by_field_name("name")
        if name_node is None:
            return None

        class_name = text_of(name_node)
        parent_id = scope_stack[-1][0]
        class_id = f"{rel_path}::{class_name}"

        if node.type == "class_declaration":
            is_interface = any(c.type == "interface" for c in node.children)
            sym_type = "interface" if is_interface else "class"
        else:
            sym_type = "class"  # Map object declaration to class

        result.nodes.append(
            NodeSchema(
                id=class_id,
                label=class_name,
                type=sym_type,
                source_file=rel_path,
                line_start=node.start_point[0] + 1,
                line_end=node.end_point[0] + 1,
                signature=self._get_signature(node, source),
                docstring=self._get_docstring(node, source),
            )
        )
        result.edges.append(
            EdgeSchema(source=parent_id, target=class_id, relation="contains")
        )

        # Check inheritance / delegation specifiers
        for child in node.children:
            if child.type != "delegation_specifiers":
                continue
            for spec in child.children:
                if spec.type != "delegation_specifier":
                    continue
                user_type_node = next(
                    (n for n in preorder(spec) if n.type == "user_type"), None
                )
                if user_type_node is None:
                    continue
                id_node = first_child(user_type_node, "identifier")
                if id_node is not None:
                    result.edges.append(
                        EdgeSchema(
                            source=class_id,
                            target=text_of(id_node),
                            relation="inherits",
                        )
                    )

        return (class_id, sym_type)

    def visit_function(node):
        """Record a function declaration; return its scope entry or None."""
        name_node = node.child_by_field_name("name")
        if name_node is None:
            return None

        func_name = text_of(name_node)
        parent_id, parent_type = scope_stack[-1]

        if parent_type in ("class", "interface"):
            func_id = f"{parent_id}.{func_name}"
            sym_type = "method"
        else:
            func_id = f"{rel_path}::{func_name}"
            sym_type = "function"

        local_bindings = collect_local_bindings(node)

        result.nodes.append(
            NodeSchema(
                id=func_id,
                label=func_name,
                type=sym_type,
                source_file=rel_path,
                line_start=node.start_point[0] + 1,
                line_end=node.end_point[0] + 1,
                signature=self._get_signature(node, source),
                docstring=self._get_docstring(node, source),
                local_bindings=local_bindings,
            )
        )
        result.edges.append(
            EdgeSchema(source=parent_id, target=func_id, relation="contains")
        )

        return (func_id, sym_type)

    def visit_import(node):
        """Record an ``imports`` edge from the file node."""
        qual_id_node = first_child(node, "qualified_identifier")
        if qual_id_node is None:
            return

        target = text_of(qual_id_node)
        children = node.children
        is_wildcard = any(c.type == "*" for c in children)

        # The alias, if any, is the identifier right after the "as" token.
        alias = None
        as_idx = next((i for i, c in enumerate(children) if c.type == "as"), -1)
        if as_idx != -1 and as_idx + 1 < len(children):
            alias_node = children[as_idx + 1]
            if alias_node.type == "identifier":
                alias = text_of(alias_node)

        if is_wildcard:
            import_map = {"*": "*"}
        else:
            last_part = target.split(".")[-1]
            import_map = {(alias or last_part): last_part}

        result.edges.append(
            EdgeSchema(
                source=file_node_id,
                target=target,
                relation="imports",
                import_map=import_map,
            )
        )

    def visit_call(node):
        """Record a ``calls`` edge from the enclosing scope."""
        func_node = first_child(node, "identifier", "navigation_expression")
        if func_node is not None:
            result.edges.append(
                EdgeSchema(
                    source=scope_stack[-1][0],
                    target=text_of(func_node),
                    relation="calls",
                )
            )

    # Pre-order walk with an explicit stack. A (node, True) entry marks the
    # point where every descendant of a scope-opening node has been handled.
    pending = [(root, False)]
    while pending:
        node, leaving_scope = pending.pop()
        if leaving_scope:
            scope_stack.pop()
            continue

        if node.type == "ERROR" or getattr(node, "is_error", False):
            logger.debug("Skipping syntax error node in Kotlin AST: %s", node)
            continue

        node_type = node.type
        scope_entry = None
        if node_type in ("class_declaration", "object_declaration"):
            scope_entry = visit_class(node)
        elif node_type == "function_declaration":
            scope_entry = visit_function(node)
        elif node_type == "import":
            visit_import(node)
        elif node_type == "call_expression":
            visit_call(node)

        if scope_entry is not None:
            scope_stack.append(scope_entry)
            # Pushed before the children so it is popped after all of them.
            pending.append((node, True))

        pending.extend((child, False) for child in reversed(node.children))

    return result