codegraph-gen 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,370 @@
1
+ import logging
2
+ from pathlib import Path
3
+ import tree_sitter
4
+ import tree_sitter_javascript
5
+ import tree_sitter_typescript
6
+ from codegraph_gen.parser.base import BaseParser, ExtractionResult, NodeSchema, EdgeSchema
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
class JavaScriptParser(BaseParser):
    """Build graph nodes and edges from JavaScript and TypeScript source files.

    Each file is parsed with tree-sitter (JavaScript, TypeScript or TSX
    grammar, chosen by file suffix) and walked once.  The walk emits:

    * one ``file`` node per parsed file;
    * ``class`` / ``interface`` / ``function`` / ``method`` nodes, each linked
      to its enclosing scope by a ``contains`` edge;
    * ``inherits`` edges for ``extends`` clauses;
    * ``imports`` edges carrying a map of the locally bound names;
    * ``calls`` edges for call and ``new`` expressions.

    Targets of ``inherits``, ``imports`` and ``calls`` edges are raw source
    text (a name, expression or module specifier), not resolved node ids.
    """

    _COMMENT_TYPES = ("comment", "line_comment", "block_comment")
    _CLASS_LIKE = ("class_declaration", "interface_declaration")
    _FUNCTION_LIKE = ("function_declaration", "method_definition")
    _CALL_LIKE = ("call_expression", "new_expression")
    # Node types whose text is accepted as the name of a base class/interface.
    _BASE_NAME_TYPES = (
        "identifier",
        "nested_identifier",
        "member_expression",
        "type_identifier",
        "nested_type_identifier",
    )
    # Declarations that open their own scope; local-binding collection for an
    # enclosing function must not descend into them.
    _NESTED_SCOPE_TYPES = (
        "function_declaration",
        "method_definition",
        "class_declaration",
    )

    def __init__(self):
        # Build one parser per grammar up front so parse_file() can reuse them.
        self.js_lang = tree_sitter.Language(tree_sitter_javascript.language())
        self.ts_lang = tree_sitter.Language(
            tree_sitter_typescript.language_typescript()
        )
        self.tsx_lang = tree_sitter.Language(tree_sitter_typescript.language_tsx())

        self.js_parser = tree_sitter.Parser(self.js_lang)
        self.ts_parser = tree_sitter.Parser(self.ts_lang)
        self.tsx_parser = tree_sitter.Parser(self.tsx_lang)

    # ------------------------------------------------------------------
    # Small text helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _text(node, source: bytes) -> str:
        """Return the source text covered by *node*, decoded as UTF-8."""
        return source[node.start_byte : node.end_byte].decode(
            "utf-8", errors="replace"
        )

    @staticmethod
    def _clean_comment(text: str) -> str:
        """Return *text* without its ``//`` or ``/* ... */`` delimiters.

        Only the delimiters themselves are removed, so comment content that
        happens to start or end with ``/`` or ``*`` (for example a URL with a
        trailing slash) is preserved.  For block comments the leading ``*``
        gutter of each line (JSDoc style) is dropped as well.
        """
        text = text.strip()
        if text.startswith("//"):
            # Also covers '///' style comments: drop every leading slash.
            return text.lstrip("/").strip()
        if text.startswith("/*"):
            closed = len(text) >= 4 and text.endswith("*/")
            inner = text[2:-2] if closed else text[2:]
            lines = [line.strip().lstrip("*").strip() for line in inner.splitlines()]
            return "\n".join(lines).strip()
        return text

    def _preceding_comments(self, node, source: bytes) -> list:
        """Return cleaned comment siblings directly before *node*, in source order."""
        collected = []
        prev = node.prev_sibling
        while prev and prev.type in self._COMMENT_TYPES:
            collected.append(self._clean_comment(self._text(prev, source)))
            prev = prev.prev_sibling
        collected.reverse()
        return collected

    def _get_docstring(self, node, source: bytes) -> str:
        """Return the comments preceding *node*, joined with newlines.

        The run of consecutive comment siblings that precedes the node is
        collected (no blank-line adjacency check is made).  If the node has no
        such comments and is wrapped in an ``export_statement``, the comments
        preceding that export statement are used instead, because in
        ``export class X {}`` the declaration's previous sibling is the
        ``export`` keyword rather than the comment.

        Returns an empty string when no comment is found.
        """
        comments = self._preceding_comments(node, source)
        if not comments:
            parent = node.parent
            if parent is not None and parent.type == "export_statement":
                comments = self._preceding_comments(parent, source)
        return "\n".join(comments)

    def _get_module_docstring(self, root, source: bytes) -> str:
        """Return the comments that open the file, joined with newlines.

        The root node has no siblings, so ``_get_docstring(root, ...)`` can
        never find anything; the leading comment children of the root are
        inspected here instead.
        """
        leading = []
        for child in root.children:
            if child.type not in self._COMMENT_TYPES:
                break
            leading.append(self._clean_comment(self._text(child, source)))
        return "\n".join(leading)

    def _get_signature(self, node, source: bytes) -> str:
        """Return the declaration header of *node* without its body.

        When the node has a ``body`` field, the signature is everything from
        the start of the node up to the body, with a trailing ``{`` removed.
        Otherwise the first line of the node's text is returned.
        """
        body = node.child_by_field_name("body")
        if body:
            sig = (
                source[node.start_byte : body.start_byte]
                .decode("utf-8", errors="replace")
                .strip()
            )
            if sig.endswith("{"):
                sig = sig[:-1].strip()
            return sig
        return self._text(node, source).split("\n")[0]

    # ------------------------------------------------------------------
    # Type / binding extraction
    # ------------------------------------------------------------------

    def _extract_type_name(self, ts_node, source: bytes):
        """Return the simple type name referenced by *ts_node*, or ``None``.

        Handles plain and nested type identifiers, generic types (the name
        before the type arguments) and ``new`` expressions (the constructor
        name, or the property of a member-expression constructor).  Any other
        node is searched depth-first and the first name found is returned.
        """
        kind = ts_node.type
        if kind in ("type_identifier", "property_identifier"):
            return self._text(ts_node, source)

        if kind == "nested_type_identifier":
            # Prefer the right-most component: `ns.Type` -> `Type`.
            for child in reversed(ts_node.children):
                if child.type in ("type_identifier", "identifier"):
                    return self._extract_type_name(child, source)
        elif kind == "generic_type":
            name_node = ts_node.child_by_field_name("name") or (
                ts_node.children[0] if ts_node.children else None
            )
            if name_node:
                return self._extract_type_name(name_node, source)
        elif kind == "new_expression":
            ctor = ts_node.child_by_field_name("constructor")
            if ctor:
                if ctor.type == "identifier":
                    return self._text(ctor, source)
                if ctor.type == "member_expression":
                    prop = ctor.child_by_field_name("property")
                    if prop:
                        return self._text(prop, source)

        # Generic fall-through (also covers `type_annotation` wrappers).
        for child in ts_node.children:
            found = self._extract_type_name(child, source)
            if found:
                return found
        return None

    def _collect_local_bindings(self, node, source: bytes, bindings: dict) -> None:
        """Record ``variable name -> type name`` pairs found under *node*.

        Typed parameters and variable declarators contribute a binding; an
        untyped declarator initialised with ``new X(...)`` is bound to ``X``.
        Nested function, method and class declarations are not descended
        into.  *bindings* is updated in place.
        """
        if node.type in ("required_parameter", "optional_parameter"):
            pattern = node.child_by_field_name("pattern")
            type_node = node.child_by_field_name("type")
            if pattern and pattern.type == "identifier" and type_node:
                type_name = self._extract_type_name(type_node, source)
                if type_name:
                    bindings[self._text(pattern, source)] = type_name
        elif node.type == "variable_declarator":
            name_node = node.child_by_field_name("name")
            value_node = node.child_by_field_name("value")
            type_node = node.child_by_field_name("type")
            if name_node and name_node.type == "identifier":
                # An explicit annotation wins; the initializer is only
                # consulted when no annotation is present.
                type_name = None
                if type_node:
                    type_name = self._extract_type_name(type_node, source)
                elif value_node and value_node.type == "new_expression":
                    type_name = self._extract_type_name(value_node, source)
                if type_name:
                    bindings[self._text(name_node, source)] = type_name

        for child in node.children:
            if child.type not in self._NESTED_SCOPE_TYPES:
                self._collect_local_bindings(child, source, bindings)

    def _iter_base_names(self, decl_node, source: bytes):
        """Yield the source text of each base named in an ``extends`` clause.

        In the JavaScript grammar the base expression is a direct child of
        ``class_heritage``.  The TypeScript grammar nests it one level deeper
        (``extends_clause`` inside ``class_heritage`` for classes, and an
        ``extends_type_clause`` directly under interface declarations), so
        those clauses are inspected too.  ``implements`` clauses are ignored.
        """
        for child in decl_node.children:
            if child.type in ("class_heritage", "interface_heritage"):
                clauses = [child]
                clauses.extend(
                    c for c in child.children if c.type == "extends_clause"
                )
            elif child.type == "extends_type_clause":
                clauses = [child]
            else:
                continue

            for clause in clauses:
                for sub in clause.children:
                    if sub.type == "generic_type":
                        # `extends Base<T>`: keep only the name part.
                        sub = sub.child_by_field_name("name") or sub
                    if sub.type in self._BASE_NAME_TYPES:
                        yield self._text(sub, source)

    # ------------------------------------------------------------------
    # Per-node handlers used by parse_file()
    # ------------------------------------------------------------------

    def _add_class(self, node, source: bytes, rel_path: str, parent_id: str, result):
        """Emit the node and edges for a class or interface declaration.

        Returns the ``(id, type)`` scope entry to push for the declaration's
        children, or ``None`` when the declaration has no name.
        """
        name_node = node.child_by_field_name("name")
        if not name_node:
            return None

        class_name = self._text(name_node, source)
        class_id = f"{rel_path}::{class_name}"
        sym_type = "class" if node.type == "class_declaration" else "interface"

        result.nodes.append(
            NodeSchema(
                id=class_id,
                label=class_name,
                type=sym_type,
                source_file=rel_path,
                line_start=node.start_point[0] + 1,
                line_end=node.end_point[0] + 1,
                signature=self._get_signature(node, source),
                docstring=self._get_docstring(node, source),
            )
        )
        result.edges.append(
            EdgeSchema(source=parent_id, target=class_id, relation="contains")
        )
        for base_name in self._iter_base_names(node, source):
            result.edges.append(
                EdgeSchema(source=class_id, target=base_name, relation="inherits")
            )
        return (class_id, sym_type)

    def _add_function(self, node, source: bytes, rel_path: str, scope, result):
        """Emit the node and ``contains`` edge for a function or method.

        *scope* is the ``(id, type)`` entry of the enclosing scope.  A
        declaration directly inside a class or interface becomes a ``method``
        with id ``<parent id>.<name>``; anything else becomes a ``function``
        with id ``<rel_path>::<name>``.

        Returns the ``(id, type)`` scope entry to push, or ``None`` when the
        declaration has no name.
        """
        name_node = node.child_by_field_name("name")
        if not name_node:
            return None

        func_name = self._text(name_node, source)
        parent_id, parent_type = scope
        if parent_type in ("class", "interface"):
            func_id = f"{parent_id}.{func_name}"
            sym_type = "method"
        else:
            func_id = f"{rel_path}::{func_name}"
            sym_type = "function"

        local_bindings = {}
        self._collect_local_bindings(node, source, local_bindings)

        result.nodes.append(
            NodeSchema(
                id=func_id,
                label=func_name,
                type=sym_type,
                source_file=rel_path,
                line_start=node.start_point[0] + 1,
                line_end=node.end_point[0] + 1,
                signature=self._get_signature(node, source),
                docstring=self._get_docstring(node, source),
                local_bindings=local_bindings,
            )
        )
        result.edges.append(
            EdgeSchema(source=parent_id, target=func_id, relation="contains")
        )
        return (func_id, sym_type)

    def _add_import(self, node, source: bytes, file_node_id: str, result) -> None:
        """Emit an ``imports`` edge for an ``import`` statement.

        The edge's ``import_map`` maps each locally bound name to what it
        refers to in the imported module: ``"default"`` for a default import,
        ``"*"`` for a namespace import, and the exported name for a named
        import (keyed by the alias when one is given).
        """
        source_node = node.child_by_field_name("source")
        if not source_node:
            return

        import_path = self._text(source_node, source).strip("\"'")
        import_map = {}

        clause = next(
            (c for c in node.children if c.type == "import_clause"), None
        )
        for item in clause.children if clause else ():
            if item.type == "identifier":
                import_map[self._text(item, source)] = "default"
            elif item.type == "namespace_import":
                for sub in item.children:
                    if sub.type == "identifier":
                        import_map[self._text(sub, source)] = "*"
                        break
            elif item.type == "named_imports":
                for spec in item.children:
                    if spec.type != "import_specifier":
                        continue
                    name_node = spec.child_by_field_name("name")
                    alias_node = spec.child_by_field_name("alias")
                    if not name_node:
                        continue
                    name = self._text(name_node, source)
                    local = self._text(alias_node, source) if alias_node else name
                    import_map[local] = name

        result.edges.append(
            EdgeSchema(
                source=file_node_id,
                target=import_path,
                relation="imports",
                import_map=import_map,
            )
        )

    def _add_call(self, node, source: bytes, caller_id: str, result) -> None:
        """Emit a ``calls`` edge from *caller_id* to the callee expression text.

        ``call_expression`` exposes its callee through the ``function`` field,
        whereas ``new_expression`` uses the ``constructor`` field.
        """
        field = "constructor" if node.type == "new_expression" else "function"
        callee = node.child_by_field_name(field)
        if callee:
            result.edges.append(
                EdgeSchema(
                    source=caller_id,
                    target=self._text(callee, source),
                    relation="calls",
                )
            )

    def _parser_for(self, suffix: str):
        """Return the cached parser matching a lower-cased file suffix."""
        if suffix == ".tsx":
            return self.tsx_parser
        if suffix in (".ts", ".cts", ".mts"):
            return self.ts_parser
        return self.js_parser

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def parse_file(self, file_path: Path, workspace_dir: Path) -> ExtractionResult:
        """Parse one JS/TS file and return the nodes and edges found in it.

        Args:
            file_path: File to parse.  The grammar is chosen from its suffix:
                ``.tsx`` -> TSX, ``.ts``/``.cts``/``.mts`` -> TypeScript,
                anything else -> JavaScript.
            workspace_dir: Root used to compute the file's relative path,
                which doubles as the file node id and as the prefix of every
                symbol id.

        Returns:
            The extraction result.  If the file cannot be read, the error is
            logged and an empty result is returned.

        Raises:
            ValueError: If *file_path* is not located under *workspace_dir*.
        """
        try:
            source = file_path.read_bytes()
        except OSError as e:
            logger.error("Error reading file %s: %s", file_path, e)
            return ExtractionResult()

        tree = self._parser_for(file_path.suffix.lower()).parse(source)
        root = tree.root_node

        rel_path = str(file_path.relative_to(workspace_dir))
        result = ExtractionResult()

        file_node_id = rel_path
        result.nodes.append(
            NodeSchema(
                id=file_node_id,
                label=file_path.name,
                type="file",
                source_file=rel_path,
                line_start=1,
                line_end=len(source.splitlines()) or 1,
                signature=f"module {file_path.name}",
                docstring=self._get_module_docstring(root, source),
            )
        )

        # Stack of (node id, node type) for the enclosing scopes.  The file
        # entry is never popped, so the stack is never empty.
        scope_stack = [(file_node_id, "file")]

        def walk(node):
            if node.type == "ERROR" or getattr(node, "is_error", False):
                # Nothing below a syntax error is trusted; skip the subtree.
                logger.debug("Skipping syntax error node in JS/TS AST: %s", node)
                return

            node_type = node.type
            new_scope = None
            if node_type in self._CLASS_LIKE:
                new_scope = self._add_class(
                    node, source, rel_path, scope_stack[-1][0], result
                )
            elif node_type in self._FUNCTION_LIKE:
                new_scope = self._add_function(
                    node, source, rel_path, scope_stack[-1], result
                )
            elif node_type == "import_statement":
                self._add_import(node, source, file_node_id, result)
            elif node_type in self._CALL_LIKE:
                self._add_call(node, source, scope_stack[-1][0], result)

            if new_scope is not None:
                scope_stack.append(new_scope)
            for child in node.children:
                walk(child)
            if new_scope is not None:
                scope_stack.pop()

        walk(root)
        return result