codegraph-gen 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_gen/__init__.py +0 -0
- codegraph_gen/__main__.py +311 -0
- codegraph_gen/ai.py +77 -0
- codegraph_gen/analyzer.py +100 -0
- codegraph_gen/builder.py +747 -0
- codegraph_gen/cluster.py +116 -0
- codegraph_gen/config.py +76 -0
- codegraph_gen/detect.py +59 -0
- codegraph_gen/engine.py +367 -0
- codegraph_gen/parser/__init__.py +27 -0
- codegraph_gen/parser/base.py +38 -0
- codegraph_gen/parser/cpp.py +349 -0
- codegraph_gen/parser/go.py +268 -0
- codegraph_gen/parser/javascript.py +370 -0
- codegraph_gen/parser/kotlin.py +387 -0
- codegraph_gen/parser/python.py +415 -0
- codegraph_gen/parser/rust.py +497 -0
- codegraph_gen/parser/swift.py +327 -0
- codegraph_gen/py.typed +0 -0
- codegraph_gen/renderer.py +498 -0
- codegraph_gen/writer.py +97 -0
- codegraph_gen-0.2.0.dist-info/METADATA +169 -0
- codegraph_gen-0.2.0.dist-info/RECORD +25 -0
- codegraph_gen-0.2.0.dist-info/WHEEL +4 -0
- codegraph_gen-0.2.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,370 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import tree_sitter
|
|
4
|
+
import tree_sitter_javascript
|
|
5
|
+
import tree_sitter_typescript
|
|
6
|
+
from codegraph_gen.parser.base import BaseParser, ExtractionResult, NodeSchema, EdgeSchema
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class JavaScriptParser(BaseParser):
|
|
12
|
+
def __init__(self):
    """Create and cache one tree-sitter language and parser per dialect.

    Three grammars are loaded: JavaScript, TypeScript and TSX.  Each is
    wrapped in a ``tree_sitter.Language`` and given its own
    ``tree_sitter.Parser`` so the objects are built once per instance and
    reused for every file.
    """
    # Load the raw grammars and wrap them, in JS -> TS -> TSX order.
    javascript_grammar = tree_sitter_javascript.language()
    self.js_lang = tree_sitter.Language(javascript_grammar)

    typescript_grammar = tree_sitter_typescript.language_typescript()
    self.ts_lang = tree_sitter.Language(typescript_grammar)

    tsx_grammar = tree_sitter_typescript.language_tsx()
    self.tsx_lang = tree_sitter.Language(tsx_grammar)

    # One parser per language.
    self.js_parser = tree_sitter.Parser(self.js_lang)
    self.ts_parser = tree_sitter.Parser(self.ts_lang)
    self.tsx_parser = tree_sitter.Parser(self.tsx_lang)
|
|
23
|
+
|
|
24
|
+
def _get_docstring(self, node, source: bytes) -> str:
|
|
25
|
+
"""Finds comments immediately preceding the node."""
|
|
26
|
+
# Tree-sitter doesn't always attach comments to nodes, but we can look for
|
|
27
|
+
# sibling nodes of type 'comment' that end right before this node starts.
|
|
28
|
+
docstring = ""
|
|
29
|
+
prev = node.prev_sibling
|
|
30
|
+
comments = []
|
|
31
|
+
while prev and prev.type in ("comment", "line_comment", "block_comment"):
|
|
32
|
+
comment_text = source[prev.start_byte : prev.end_byte].decode(
|
|
33
|
+
"utf-8", errors="replace"
|
|
34
|
+
)
|
|
35
|
+
# Strip comment markers
|
|
36
|
+
clean_text = (
|
|
37
|
+
comment_text.strip().lstrip("/*").rstrip("*/").lstrip("*").strip()
|
|
38
|
+
)
|
|
39
|
+
comments.append(clean_text)
|
|
40
|
+
prev = prev.prev_sibling
|
|
41
|
+
|
|
42
|
+
if comments:
|
|
43
|
+
docstring = "\n".join(reversed(comments))
|
|
44
|
+
return docstring
|
|
45
|
+
|
|
46
|
+
def _get_signature(self, node, source: bytes) -> str:
|
|
47
|
+
body = node.child_by_field_name("body")
|
|
48
|
+
if body:
|
|
49
|
+
end_byte = body.start_byte
|
|
50
|
+
sig_bytes = source[node.start_byte : end_byte]
|
|
51
|
+
sig = sig_bytes.decode("utf-8", errors="replace").strip()
|
|
52
|
+
# Trim trailing open curly brace
|
|
53
|
+
if sig.endswith("{"):
|
|
54
|
+
sig = sig[:-1].strip()
|
|
55
|
+
return sig
|
|
56
|
+
return (
|
|
57
|
+
source[node.start_byte : node.end_byte]
|
|
58
|
+
.decode("utf-8", errors="replace")
|
|
59
|
+
.split("\n")[0]
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
def parse_file(self, file_path: Path, workspace_dir: Path) -> ExtractionResult:
    """Extract graph nodes and edges from one JavaScript/TypeScript file.

    The TSX grammar is used for ``.tsx``, the TypeScript grammar for
    ``.ts``/``.cts``/``.mts`` and the JavaScript grammar for every other
    suffix.  The returned result contains:

    * one ``file`` node for the module;
    * ``class`` / ``interface`` nodes, with ``inherits`` edges to the
      names found in their ``extends`` clauses;
    * ``function`` / ``method`` nodes carrying ``local_bindings``
      (variable name -> type name, taken from TypeScript annotations and
      ``new`` initialisers);
    * ``contains`` edges from the enclosing scope to every symbol;
    * ``imports`` edges from the file to each import source, with an
      ``import_map`` of local name -> imported name (``"default"`` for a
      default import, ``"*"`` for a namespace import);
    * ``calls`` edges from the enclosing scope to the callee's source
      text, for both call and ``new`` expressions.

    Subtrees rooted at a syntax-error node are skipped.

    Args:
        file_path: File to parse.
        workspace_dir: Directory ``file_path`` is made relative to; the
            relative path is the file node id and the prefix of symbol ids.

    Returns:
        The populated ``ExtractionResult``, or an empty one when the file
        cannot be read.

    Raises:
        ValueError: If ``file_path`` is not located under ``workspace_dir``.
    """
    try:
        source = file_path.read_bytes()
    except Exception as e:
        # Best effort: an unreadable file yields an empty result.
        logger.error("Error reading file %s: %s", file_path, e)
        return ExtractionResult()

    suffix = file_path.suffix.lower()
    if suffix == ".tsx":
        parser = self.tsx_parser
    elif suffix in (".ts", ".cts", ".mts"):
        parser = self.ts_parser
    else:
        parser = self.js_parser

    root = parser.parse(source).root_node
    rel_path = str(file_path.relative_to(workspace_dir))
    result = ExtractionResult()
    file_node_id = rel_path

    # Node types that open their own scope; local bindings are not
    # collected across these boundaries.
    nested_scopes = ("function_declaration", "method_definition", "class_declaration")
    # Children of a class/interface declaration that hold its base types.
    # The TypeScript grammar wraps bases in `extends_clause` (classes, inside
    # `class_heritage`) or `extends_type_clause` (interfaces); the JavaScript
    # grammar puts the base expression directly under `class_heritage`.
    heritage_clauses = (
        "class_heritage",
        "interface_heritage",
        "extends_clause",
        "extends_type_clause",
    )
    base_name_types = (
        "identifier",
        "nested_identifier",
        "type_identifier",
        "nested_type_identifier",
        "member_expression",
    )

    def text(n) -> str:
        # Source text covered by a syntax node.
        return source[n.start_byte : n.end_byte].decode("utf-8", errors="replace")

    def iter_bases(clause):
        # Yield the nodes naming the types extended by a heritage clause.
        # `implements` clauses are deliberately not followed.
        for sub in clause.children:
            if sub.type in base_name_types:
                yield sub
            elif sub.type == "generic_type":
                named = sub.child_by_field_name("name")
                if named:
                    yield named
            elif sub.type == "extends_clause":
                yield from iter_bases(sub)

    def extract_type(ts_node):
        # Best-effort simple type name for a type / `new` expression node.
        kind = ts_node.type
        if kind in ("type_identifier", "property_identifier"):
            return text(ts_node)
        if kind == "nested_type_identifier":
            # `ns.Type` -> prefer the right-most component.
            for child in reversed(ts_node.children):
                if child.type in ("type_identifier", "identifier"):
                    return extract_type(child)
        elif kind == "generic_type":
            named = ts_node.child_by_field_name("name") or (
                ts_node.children[0] if ts_node.children else None
            )
            if named:
                return extract_type(named)
        elif kind == "new_expression":
            ctor = ts_node.child_by_field_name("constructor")
            if ctor:
                if ctor.type == "identifier":
                    return text(ctor)
                if ctor.type == "member_expression":
                    prop = ctor.child_by_field_name("property")
                    if prop:
                        return text(prop)
        # Fallback (also covers `type_annotation`): first child that
        # resolves to a name wins.
        for child in ts_node.children:
            found = extract_type(child)
            if found:
                return found
        return None

    def collect_bindings(n, bindings):
        # Record `name -> type` for typed parameters and local variables.
        if n.type in ("required_parameter", "optional_parameter"):
            pattern = n.child_by_field_name("pattern")
            annotation = n.child_by_field_name("type")
            if pattern and pattern.type == "identifier" and annotation:
                inferred = extract_type(annotation)
                if inferred:
                    bindings[text(pattern)] = inferred
        elif n.type == "variable_declarator":
            name_node = n.child_by_field_name("name")
            if name_node and name_node.type == "identifier":
                annotation = n.child_by_field_name("type")
                value = n.child_by_field_name("value")
                inferred = None
                if annotation:
                    # An explicit annotation takes precedence over the
                    # initialiser, even when no name can be extracted.
                    inferred = extract_type(annotation)
                elif value and value.type == "new_expression":
                    inferred = extract_type(value)
                if inferred:
                    bindings[text(name_node)] = inferred

        for child in n.children:
            if child.type not in nested_scopes:
                collect_bindings(child, bindings)

    # Add file node
    result.nodes.append(
        NodeSchema(
            id=file_node_id,
            label=file_path.name,
            type="file",
            source_file=rel_path,
            line_start=1,
            line_end=len(source.splitlines()) or 1,
            signature=f"module {file_path.name}",
            # NOTE(review): the root node presumably has no previous
            # sibling, so this likely always yields "" - confirm.
            docstring=self._get_docstring(root, source),
        )
    )

    # Stack of (node id, node type); the file scope is never popped.
    scope_stack = [(file_node_id, "file")]

    def declare_class(node):
        # Emit a class/interface node; return its scope entry or None.
        name_node = node.child_by_field_name("name")
        if not name_node:
            return None
        class_name = text(name_node)
        parent_id = scope_stack[-1][0]
        class_id = f"{rel_path}::{class_name}"
        sym_type = "class" if node.type == "class_declaration" else "interface"

        result.nodes.append(
            NodeSchema(
                id=class_id,
                label=class_name,
                type=sym_type,
                source_file=rel_path,
                line_start=node.start_point[0] + 1,
                line_end=node.end_point[0] + 1,
                signature=self._get_signature(node, source),
                docstring=self._get_docstring(node, source),
            )
        )
        result.edges.append(
            EdgeSchema(source=parent_id, target=class_id, relation="contains")
        )

        # Inheritance: heritage / extends clause
        for child in node.children:
            if child.type in heritage_clauses:
                for base in iter_bases(child):
                    result.edges.append(
                        EdgeSchema(
                            source=class_id,
                            target=text(base),
                            relation="inherits",
                        )
                    )
        return (class_id, sym_type)

    def declare_function(node):
        # Emit a function/method node; return its scope entry or None.
        name_node = node.child_by_field_name("name")
        if not name_node:
            return None
        func_name = text(name_node)
        parent_id, parent_type = scope_stack[-1]

        if parent_type in ("class", "interface"):
            func_id = f"{parent_id}.{func_name}"
            sym_type = "method"
        else:
            func_id = f"{rel_path}::{func_name}"
            sym_type = "function"

        local_bindings = {}
        collect_bindings(node, local_bindings)

        result.nodes.append(
            NodeSchema(
                id=func_id,
                label=func_name,
                type=sym_type,
                source_file=rel_path,
                line_start=node.start_point[0] + 1,
                line_end=node.end_point[0] + 1,
                signature=self._get_signature(node, source),
                docstring=self._get_docstring(node, source),
                local_bindings=local_bindings,
            )
        )
        result.edges.append(
            EdgeSchema(source=parent_id, target=func_id, relation="contains")
        )
        return (func_id, sym_type)

    def record_import(node):
        # Emit an `imports` edge for one import statement.
        source_node = node.child_by_field_name("source")
        if not source_node:
            return
        import_path = text(source_node).strip("\"'")

        import_map = {}
        clause = next((c for c in node.children if c.type == "import_clause"), None)
        if clause:
            for part in clause.children:
                if part.type == "identifier":
                    import_map[text(part)] = "default"
                elif part.type == "namespace_import":
                    alias = next(
                        (s for s in part.children if s.type == "identifier"), None
                    )
                    if alias:
                        import_map[text(alias)] = "*"
                elif part.type == "named_imports":
                    for spec in part.children:
                        if spec.type != "import_specifier":
                            continue
                        name_node = spec.child_by_field_name("name")
                        if not name_node:
                            continue
                        imported = text(name_node)
                        alias_node = spec.child_by_field_name("alias")
                        local = text(alias_node) if alias_node else imported
                        import_map[local] = imported

        result.edges.append(
            EdgeSchema(
                source=file_node_id,
                target=import_path,
                relation="imports",
                import_map=import_map,
            )
        )

    def record_call(node):
        # Emit a `calls` edge for a call or `new` expression.
        # `call_expression` names its callee `function`, whereas
        # `new_expression` names it `constructor`.
        field = "function" if node.type == "call_expression" else "constructor"
        callee = node.child_by_field_name(field)
        if callee:
            result.edges.append(
                EdgeSchema(
                    source=scope_stack[-1][0],
                    target=text(callee),
                    relation="calls",
                )
            )

    def walk(node):
        if node.type == "ERROR" or getattr(node, "is_error", False):
            logger.debug("Skipping syntax error node in JS/TS AST: %s", node)
            return

        kind = node.type
        scope = None
        if kind in ("class_declaration", "interface_declaration"):
            scope = declare_class(node)
        elif kind in ("function_declaration", "method_definition"):
            scope = declare_function(node)
        elif kind == "import_statement":
            record_import(node)
        elif kind in ("call_expression", "new_expression"):
            record_call(node)

        # Children are visited inside the scope this node opened, if any.
        if scope:
            scope_stack.append(scope)
        for child in node.children:
            walk(child)
        if scope:
            scope_stack.pop()

    walk(root)
    return result
|