codegraph-gen 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_gen/__init__.py +0 -0
- codegraph_gen/__main__.py +311 -0
- codegraph_gen/ai.py +77 -0
- codegraph_gen/analyzer.py +100 -0
- codegraph_gen/builder.py +747 -0
- codegraph_gen/cluster.py +116 -0
- codegraph_gen/config.py +76 -0
- codegraph_gen/detect.py +59 -0
- codegraph_gen/engine.py +367 -0
- codegraph_gen/parser/__init__.py +27 -0
- codegraph_gen/parser/base.py +38 -0
- codegraph_gen/parser/cpp.py +349 -0
- codegraph_gen/parser/go.py +268 -0
- codegraph_gen/parser/javascript.py +370 -0
- codegraph_gen/parser/kotlin.py +387 -0
- codegraph_gen/parser/python.py +415 -0
- codegraph_gen/parser/rust.py +497 -0
- codegraph_gen/parser/swift.py +327 -0
- codegraph_gen/py.typed +0 -0
- codegraph_gen/renderer.py +498 -0
- codegraph_gen/writer.py +97 -0
- codegraph_gen-0.2.0.dist-info/METADATA +169 -0
- codegraph_gen-0.2.0.dist-info/RECORD +25 -0
- codegraph_gen-0.2.0.dist-info/WHEEL +4 -0
- codegraph_gen-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import tree_sitter
|
|
4
|
+
import tree_sitter_kotlin
|
|
5
|
+
from codegraph_gen.parser.base import BaseParser, ExtractionResult, NodeSchema, EdgeSchema
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class KotlinParser(BaseParser):
    """Extract graph nodes and edges from Kotlin source files using tree-sitter.

    ``parse_file`` emits one ``file`` node per source file plus ``class``,
    ``interface``, ``method`` and ``function`` nodes, connected by
    ``contains``, ``inherits``, ``imports`` and ``calls`` edges.
    """

    # Node types treated as comments when collecting documentation.
    _COMMENT_TYPES = ("comment", "line_comment", "block_comment")
    # Child node types that mark where a declaration's signature ends.
    _BODY_TYPES = ("class_body", "function_body", "block")
    # Declarations that produce class-like graph nodes.
    _CLASS_DECL_TYPES = ("class_declaration", "object_declaration")
    # Declarations whose subtrees are skipped while collecting local bindings.
    _NESTED_DECL_TYPES = (
        "function_declaration",
        "class_declaration",
        "object_declaration",
    )

    def __init__(self):
        """Load the Kotlin grammar and create a parser bound to it."""
        self.language = tree_sitter.Language(tree_sitter_kotlin.language())
        self.parser = tree_sitter.Parser(self.language)

    # ------------------------------------------------------------------
    # Small syntax-tree helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _node_text(node, source: bytes) -> str:
        """Return the source text covered by ``node``.

        Bytes that are not valid UTF-8 are replaced rather than raising.
        """
        return source[node.start_byte : node.end_byte].decode(
            "utf-8", errors="replace"
        )

    @staticmethod
    def _first_child(node, *types):
        """Return the first direct child whose type is in ``types``, or None."""
        return next((c for c in node.children if c.type in types), None)

    @staticmethod
    def _last_identifier(node):
        """Return the last direct ``identifier`` child of ``node``, or None.

        The last identifier is used (not the first) so that a dotted type
        reference such as ``a.b.Foo`` resolves to ``Foo`` instead of the
        leading package segment. For a single-identifier node the result is
        the same as taking the first one.
        """
        found = None
        for child in node.children:
            if child.type == "identifier":
                found = child
        return found

    def _first_user_type(self, node):
        """Return the first ``user_type`` node in a depth-first walk, or None."""
        if node.type == "user_type":
            return node
        for child in node.children:
            found = self._first_user_type(child)
            if found:
                return found
        return None

    @staticmethod
    def _clean_comment(raw: str) -> str:
        """Strip Kotlin comment delimiters from a single comment's text.

        Handles ``//`` line comments and ``/* ... */`` / ``/** ... */`` block
        comments. For block comments the leading ``*`` gutter of every line
        is removed as well. Only real delimiters are stripped, so comment
        text that itself ends in ``/`` or ``*`` is preserved.
        """
        text = raw.strip()
        if text.startswith("/*"):
            inner = text[2:]
            if inner.endswith("*/"):
                # Also drop stars glued to the terminator ("**/" banners).
                inner = inner[:-2].rstrip("*")
            cleaned = []
            for line in inner.splitlines():
                line = line.strip()
                if line.startswith("*"):
                    # KDoc opener remainder or the per-line " * " gutter.
                    line = line.lstrip("*").strip()
                cleaned.append(line)
            return "\n".join(cleaned).strip()
        if text.startswith("//"):
            return text.lstrip("/").strip()
        return text

    # ------------------------------------------------------------------
    # Documentation / signature extraction
    # ------------------------------------------------------------------

    def _get_docstring(self, node, source: bytes) -> str:
        """Return the comments immediately preceding ``node``.

        Walks backwards over consecutive comment siblings, strips their
        delimiters and joins them in source order with newlines.

        Args:
            node: Syntax node whose preceding siblings are inspected.
            source: Raw bytes of the file the node belongs to.

        Returns:
            The cleaned comment text, or an empty string when the node is
            not preceded by a comment.
        """
        comments = []
        prev = node.prev_sibling
        while prev is not None and prev.type in self._COMMENT_TYPES:
            comments.append(self._clean_comment(self._node_text(prev, source)))
            prev = prev.prev_sibling
        # Collected nearest-first; restore source order.
        return "\n".join(reversed(comments))

    def _get_signature(self, node, source: bytes) -> str:
        """Return the declaration header of ``node`` without its body.

        The signature is the text from the start of the node up to its first
        body child (class body, function body or block), with a trailing
        ``{`` removed. When the node has no body child, the first line of the
        node's text is returned.

        Args:
            node: Declaration node.
            source: Raw bytes of the file the node belongs to.

        Returns:
            The signature text with surrounding whitespace removed.
        """
        body = self._first_child(node, *self._BODY_TYPES)
        if body is None:
            # Body-less declaration: fall back to its first line. Stripping
            # removes a stray "\r" left behind on CRLF files.
            return self._node_text(node, source).split("\n", 1)[0].strip()

        sig = (
            source[node.start_byte : body.start_byte]
            .decode("utf-8", errors="replace")
            .strip()
        )
        if sig.endswith("{"):
            sig = sig[:-1].strip()
        return sig

    # ------------------------------------------------------------------
    # Local type-binding inference
    # ------------------------------------------------------------------

    def _infer_type_name(self, node, source: bytes):
        """Return a type name derived from ``node``, or None.

        A ``user_type`` yields its (last) identifier; a ``call_expression``
        yields the text of its ``constructor`` field or, failing that, of its
        first identifier child. Any other node is searched depth-first.
        """
        named = None
        if node.type == "user_type":
            named = self._last_identifier(node)
        elif node.type == "call_expression":
            named = node.child_by_field_name("constructor") or self._first_child(
                node, "identifier"
            )
        if named is not None:
            return self._node_text(named, source)

        for child in node.children:
            found = self._infer_type_name(child, source)
            if found:
                return found
        return None

    def _collect_local_bindings(self, node, source: bytes, bindings: dict) -> None:
        """Record ``variable name -> type name`` pairs found under ``node``.

        Parameters contribute their declared type. Property declarations
        contribute their declared type or, when none is written, the type
        inferred from a call-expression initializer. Nested function, class
        and object declarations are not descended into.

        Args:
            node: Subtree root to scan (initially a function declaration).
            source: Raw bytes of the file the node belongs to.
            bindings: Dictionary updated in place with the bindings found.
        """
        if node.type == "parameter":
            id_node = self._first_child(node, "identifier")
            type_node = self._first_child(node, "user_type")
            if id_node is not None and type_node is not None:
                type_name = self._infer_type_name(type_node, source)
                if type_name:
                    bindings[self._node_text(id_node, source)] = type_name
        elif node.type == "property_declaration":
            var_decl = self._first_child(node, "variable_declaration")
            initializer = self._first_child(node, "call_expression")
            if var_decl is not None:
                id_node = self._first_child(var_decl, "identifier")
                type_node = self._first_child(var_decl, "user_type")
                # An explicit type annotation wins over the initializer.
                type_source = type_node if type_node is not None else initializer
                if id_node is not None and type_source is not None:
                    type_name = self._infer_type_name(type_source, source)
                    if type_name:
                        bindings[self._node_text(id_node, source)] = type_name

        for child in node.children:
            if child.type not in self._NESTED_DECL_TYPES:
                self._collect_local_bindings(child, source, bindings)

    # ------------------------------------------------------------------
    # Per-node-type handlers used by parse_file
    # ------------------------------------------------------------------

    def _add_class(self, node, source: bytes, rel_path: str, parent_id: str, result):
        """Emit the node and edges for a class, interface or object declaration.

        Appends the symbol node, a ``contains`` edge from ``parent_id`` and
        one ``inherits`` edge per delegation specifier that names a type.

        Returns:
            ``(class_id, sym_type)`` to push as the new scope, or None when
            the declaration has no ``name`` field.
        """
        name_node = node.child_by_field_name("name")
        if name_node is None:
            return None

        children = node.children
        class_name = self._node_text(name_node, source)
        class_id = f"{rel_path}::{class_name}"
        # Object declarations are mapped to "class"; only class declarations
        # carrying an `interface` keyword child become interfaces.
        is_interface = node.type == "class_declaration" and any(
            c.type == "interface" for c in children
        )
        sym_type = "interface" if is_interface else "class"

        result.nodes.append(
            NodeSchema(
                id=class_id,
                label=class_name,
                type=sym_type,
                source_file=rel_path,
                line_start=node.start_point[0] + 1,
                line_end=node.end_point[0] + 1,
                signature=self._get_signature(node, source),
                docstring=self._get_docstring(node, source),
            )
        )
        result.edges.append(
            EdgeSchema(source=parent_id, target=class_id, relation="contains")
        )

        # Inheritance / delegation specifiers.
        for child in children:
            if child.type != "delegation_specifiers":
                continue
            for spec in child.children:
                if spec.type != "delegation_specifier":
                    continue
                user_type = self._first_user_type(spec)
                if user_type is None:
                    continue
                id_node = self._last_identifier(user_type)
                if id_node is None:
                    continue
                result.edges.append(
                    EdgeSchema(
                        source=class_id,
                        target=self._node_text(id_node, source),
                        relation="inherits",
                    )
                )

        return class_id, sym_type

    def _add_function(
        self,
        node,
        source: bytes,
        rel_path: str,
        parent_id: str,
        parent_type: str,
        result,
    ):
        """Emit the node and ``contains`` edge for a function declaration.

        A function whose enclosing scope is a class or interface is recorded
        as a ``method`` with id ``<parent_id>.<name>``; any other function is
        a ``function`` with id ``<rel_path>::<name>``.

        Returns:
            ``(func_id, sym_type)`` to push as the new scope, or None when
            the declaration has no ``name`` field.
        """
        name_node = node.child_by_field_name("name")
        if name_node is None:
            return None

        func_name = self._node_text(name_node, source)
        if parent_type in ("class", "interface"):
            func_id = f"{parent_id}.{func_name}"
            sym_type = "method"
        else:
            func_id = f"{rel_path}::{func_name}"
            sym_type = "function"

        local_bindings = {}
        self._collect_local_bindings(node, source, local_bindings)

        result.nodes.append(
            NodeSchema(
                id=func_id,
                label=func_name,
                type=sym_type,
                source_file=rel_path,
                line_start=node.start_point[0] + 1,
                line_end=node.end_point[0] + 1,
                signature=self._get_signature(node, source),
                docstring=self._get_docstring(node, source),
                local_bindings=local_bindings,
            )
        )
        result.edges.append(
            EdgeSchema(source=parent_id, target=func_id, relation="contains")
        )
        return func_id, sym_type

    def _add_import(self, node, source: bytes, file_node_id: str, result) -> None:
        """Emit an ``imports`` edge from the file node for an import node.

        The edge's ``import_map`` is ``{"*": "*"}`` for wildcard imports,
        ``{alias: name}`` for aliased imports and ``{name: name}`` otherwise,
        where ``name`` is the last dotted segment of the imported path.
        """
        qualified = self._first_child(node, "qualified_identifier")
        if qualified is None:
            return

        target = self._node_text(qualified, source)
        children = node.children

        # The alias is the identifier directly following the first `as`.
        alias = None
        for idx, child in enumerate(children):
            if child.type == "as":
                if idx + 1 < len(children) and children[idx + 1].type == "identifier":
                    alias = self._node_text(children[idx + 1], source)
                break

        if any(c.type == "*" for c in children):
            import_map = {"*": "*"}
        else:
            last_part = target.split(".")[-1]
            import_map = {alias or last_part: last_part}

        result.edges.append(
            EdgeSchema(
                source=file_node_id,
                target=target,
                relation="imports",
                import_map=import_map,
            )
        )

    def _add_call(self, node, source: bytes, caller_id: str, result) -> None:
        """Emit a ``calls`` edge from the current scope for a call expression.

        The edge target is the raw text of the first identifier or
        navigation-expression child of the call.
        """
        callee = self._first_child(node, "identifier", "navigation_expression")
        if callee is None:
            return
        result.edges.append(
            EdgeSchema(
                source=caller_id,
                target=self._node_text(callee, source),
                relation="calls",
            )
        )

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def parse_file(self, file_path: Path, workspace_dir: Path) -> ExtractionResult:
        """Parse one Kotlin file and return the nodes and edges found in it.

        Args:
            file_path: Path of the Kotlin source file to read.
            workspace_dir: Directory that ``file_path`` is made relative to;
                the relative path is used as the file node id and as the
                prefix of every symbol id.

        Returns:
            An ``ExtractionResult`` holding the file node, symbol nodes and
            edges. If the file cannot be read, the error is logged and an
            empty result is returned.

        Raises:
            ValueError: If ``file_path`` is not located under
                ``workspace_dir`` (raised by ``Path.relative_to``).
        """
        try:
            source = file_path.read_bytes()
        except OSError as e:
            logger.error("Error reading file %s: %s", file_path, e)
            return ExtractionResult()

        root = self.parser.parse(source).root_node
        rel_path = str(file_path.relative_to(workspace_dir))
        result = ExtractionResult()

        # File node. NOTE(review): the root node has no preceding sibling, so
        # the docstring here is expected to be empty; a file header comment
        # is not captured by this call.
        file_node_id = rel_path
        result.nodes.append(
            NodeSchema(
                id=file_node_id,
                label=file_path.name,
                type="file",
                source_file=rel_path,
                line_start=1,
                line_end=len(source.splitlines()) or 1,
                signature=f"package {file_path.stem}",
                docstring=self._get_docstring(root, source),
            )
        )

        # Stack of (node id, symbol type) for the enclosing scopes; the file
        # entry is never popped, so the stack is never empty.
        scope_stack = [(file_node_id, "file")]

        def walk(node):
            if node.type == "ERROR" or getattr(node, "is_error", False):
                # Lazy %-formatting: the node is only rendered when debug
                # logging is actually enabled.
                logger.debug("Skipping syntax error node in Kotlin AST: %s", node)
                return

            parent_id, parent_type = scope_stack[-1]
            node_type = node.type
            new_scope = None

            if node_type in self._CLASS_DECL_TYPES:
                new_scope = self._add_class(node, source, rel_path, parent_id, result)
            elif node_type == "function_declaration":
                new_scope = self._add_function(
                    node, source, rel_path, parent_id, parent_type, result
                )
            elif node_type == "import":
                self._add_import(node, source, file_node_id, result)
            elif node_type == "call_expression":
                self._add_call(node, source, parent_id, result)

            if new_scope is not None:
                scope_stack.append(new_scope)

            for child in node.children:
                walk(child)

            if new_scope is not None:
                scope_stack.pop()

        walk(root)
        return result
|