context-mcp-server 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,222 +1,438 @@
1
1
  """
2
- ast_extractor.py — extract nodes from code files using tree-sitter AST.
3
-
4
- Falls back to regex if tree-sitter grammars aren't installed.
5
- Each node: { id, name, type, file, line, docstring?, calls?, imports? }
2
+ ast_extractor.py — extract AST nodes from code files.
3
+ Tries tree-sitter first; falls back to regex if grammar not installed.
6
4
  """
7
5
 
6
+ from __future__ import annotations
8
7
  import re
9
8
  from pathlib import Path
9
+ from typing import Any
10
10
 
11
+ try:
12
+ from tree_sitter import Language, Parser, Node as TSNode
13
+ _TS_AVAILABLE = True
14
+ except ImportError:
15
+ _TS_AVAILABLE = False
11
16
 
12
- # ── tree-sitter setup (optional — graceful fallback) ─────────────────────────
13
-
14
- def _try_load_ts():
15
- try:
16
- import tree_sitter_python as tspython
17
- import tree_sitter_javascript as tsjavascript
18
- from tree_sitter import Language, Parser
19
- PY_LANG = Language(tspython.language())
20
- JS_LANG = Language(tsjavascript.language())
21
- return {"python": (PY_LANG, Parser(PY_LANG)), "javascript": (JS_LANG, Parser(JS_LANG))}
22
- except ImportError:
23
- return {}
24
-
25
- _TS_PARSERS = _try_load_ts()
26
-
27
-
28
- # ── tree-sitter queries ───────────────────────────────────────────────────────
29
-
30
- _PY_CLASS_QUERY = """
31
- (class_definition name: (identifier) @name) @class
32
- """
33
-
34
- _PY_FUNC_QUERY = """
35
- (function_definition name: (identifier) @name) @func
36
- """
17
+ # ── Language registry ─────────────────────────────────────────────────────────
18
+ # pkg: importable package name; fn: name of the function in that package whose result is passed to Language()
19
+ _REGISTRY: dict[str, dict] = {
20
+ "python": {
21
+ "pkg": "tree_sitter_python", "fn": "language",
22
+ "ext": {".py", ".pyw"},
23
+ "function_types": {"function_definition"},
24
+ "class_types": {"class_definition"},
25
+ "import_types": {"import_statement", "import_from_statement"},
26
+ "call_types": {"call"},
27
+ "name_field": "name",
28
+ "call_field": "function",
29
+ },
30
+ "javascript": {
31
+ "pkg": "tree_sitter_javascript", "fn": "language",
32
+ "ext": {".js", ".mjs", ".cjs", ".jsx"},
33
+ "function_types": {
34
+ "function_declaration", "function_expression",
35
+ "arrow_function", "method_definition",
36
+ "generator_function_declaration",
37
+ },
38
+ "class_types": {"class_declaration", "class_expression"},
39
+ "import_types": {"import_statement"},
40
+ "call_types": {"call_expression"},
41
+ "name_field": "name",
42
+ "call_field": "function",
43
+ },
44
+ "typescript": {
45
+ "pkg": "tree_sitter_typescript", "fn": "language_typescript",
46
+ "ext": {".ts", ".mts", ".cts"},
47
+ "function_types": {
48
+ "function_declaration", "function_expression",
49
+ "arrow_function", "method_definition",
50
+ "method_signature", "abstract_method_signature",
51
+ },
52
+ "class_types": {
53
+ "class_declaration", "class_expression",
54
+ "interface_declaration", "type_alias_declaration",
55
+ },
56
+ "import_types": {"import_statement"},
57
+ "call_types": {"call_expression"},
58
+ "name_field": "name",
59
+ "call_field": "function",
60
+ },
61
+ "tsx": {
62
+ "pkg": "tree_sitter_typescript", "fn": "language_tsx",
63
+ "ext": {".tsx"},
64
+ "function_types": {
65
+ "function_declaration", "function_expression",
66
+ "arrow_function", "method_definition",
67
+ },
68
+ "class_types": {"class_declaration", "interface_declaration"},
69
+ "import_types": {"import_statement"},
70
+ "call_types": {"call_expression"},
71
+ "name_field": "name",
72
+ "call_field": "function",
73
+ },
74
+ "go": {
75
+ "pkg": "tree_sitter_go", "fn": "language",
76
+ "ext": {".go"},
77
+ "function_types": {"function_declaration", "method_declaration"},
78
+ "class_types": {"type_declaration"},
79
+ "import_types": {"import_declaration"},
80
+ "call_types": {"call_expression"},
81
+ "name_field": "name",
82
+ "call_field": "function",
83
+ },
84
+ "rust": {
85
+ "pkg": "tree_sitter_rust", "fn": "language",
86
+ "ext": {".rs"},
87
+ "function_types": {"function_item"},
88
+ "class_types": {"struct_item", "enum_item", "trait_item", "impl_item"},
89
+ "import_types": {"use_declaration"},
90
+ "call_types": {"call_expression"},
91
+ "name_field": "name",
92
+ "call_field": "function",
93
+ },
94
+ "java": {
95
+ "pkg": "tree_sitter_java", "fn": "language",
96
+ "ext": {".java"},
97
+ "function_types": {"method_declaration", "constructor_declaration"},
98
+ "class_types": {
99
+ "class_declaration", "interface_declaration",
100
+ "enum_declaration", "annotation_type_declaration",
101
+ },
102
+ "import_types": {"import_declaration"},
103
+ "call_types": {"method_invocation"},
104
+ "name_field": "name",
105
+ "call_field": "name",
106
+ },
107
+ "kotlin": {
108
+ "pkg": "tree_sitter_kotlin", "fn": "language",
109
+ "ext": {".kt"},
110
+ "function_types": {"function_declaration", "anonymous_function"},
111
+ "class_types": {"class_declaration", "interface_declaration", "object_declaration"},
112
+ "import_types": {"import_header"},
113
+ "call_types": {"call_expression"},
114
+ "name_field": "simple_identifier",
115
+ "call_field": "call_suffix",
116
+ },
117
+ "c": {
118
+ "pkg": "tree_sitter_c", "fn": "language",
119
+ "ext": {".c", ".h"},
120
+ "function_types": {"function_definition"},
121
+ "class_types": {"struct_specifier", "enum_specifier", "union_specifier"},
122
+ "import_types": {"preproc_include"},
123
+ "call_types": {"call_expression"},
124
+ "name_field": "declarator",
125
+ "call_field": "function",
126
+ },
127
+ "cpp": {
128
+ "pkg": "tree_sitter_cpp", "fn": "language",
129
+ "ext": {".cpp", ".cc", ".cxx", ".hpp", ".hh"},
130
+ "function_types": {"function_definition"},
131
+ "class_types": {
132
+ "class_specifier", "struct_specifier",
133
+ "enum_specifier", "namespace_definition",
134
+ },
135
+ "import_types": {"preproc_include"},
136
+ "call_types": {"call_expression"},
137
+ "name_field": "declarator",
138
+ "call_field": "function",
139
+ },
140
+ "csharp": {
141
+ "pkg": "tree_sitter_c_sharp", "fn": "language",
142
+ "ext": {".cs"},
143
+ "function_types": {"method_declaration", "constructor_declaration", "local_function_statement"},
144
+ "class_types": {
145
+ "class_declaration", "interface_declaration",
146
+ "struct_declaration", "enum_declaration", "record_declaration",
147
+ },
148
+ "import_types": {"using_directive"},
149
+ "call_types": {"invocation_expression"},
150
+ "name_field": "name",
151
+ "call_field": "expression",
152
+ },
153
+ "ruby": {
154
+ "pkg": "tree_sitter_ruby", "fn": "language",
155
+ "ext": {".rb", ".rake"},
156
+ "function_types": {"method", "singleton_method"},
157
+ "class_types": {"class", "module"},
158
+ "import_types": set(),
159
+ "call_types": {"call"},
160
+ "name_field": "name",
161
+ "call_field": "method",
162
+ },
163
+ "php": {
164
+ "pkg": "tree_sitter_php", "fn": "language",
165
+ "ext": {".php"},
166
+ "function_types": {"function_definition", "method_declaration"},
167
+ "class_types": {"class_declaration", "interface_declaration", "trait_declaration"},
168
+ "import_types": {"namespace_use_declaration"},
169
+ "call_types": {"function_call_expression", "member_call_expression"},
170
+ "name_field": "name",
171
+ "call_field": "function",
172
+ },
173
+ "swift": {
174
+ "pkg": "tree_sitter_swift", "fn": "language",
175
+ "ext": {".swift"},
176
+ "function_types": {"function_declaration"},
177
+ "class_types": {
178
+ "class_declaration", "struct_declaration",
179
+ "protocol_declaration", "extension_declaration",
180
+ },
181
+ "import_types": {"import_declaration"},
182
+ "call_types": {"call_expression"},
183
+ "name_field": "name",
184
+ "call_field": "function",
185
+ },
186
+ "lua": {
187
+ "pkg": "tree_sitter_lua", "fn": "language",
188
+ "ext": {".lua", ".luau"},
189
+ "function_types": {"function_declaration", "local_function"},
190
+ "class_types": set(),
191
+ "import_types": set(),
192
+ "call_types": {"function_call"},
193
+ "name_field": "name",
194
+ "call_field": "name",
195
+ },
196
+ "dart": {
197
+ "pkg": "tree_sitter_dart", "fn": "language",
198
+ "ext": {".dart"},
199
+ "function_types": {"function_signature", "method_signature"},
200
+ "class_types": {"class_definition", "mixin_declaration"},
201
+ "import_types": {"import_or_export"},
202
+ "call_types": {"invocation_expression"},
203
+ "name_field": "name",
204
+ "call_field": "function_expression",
205
+ },
206
+ }
37
207
 
38
- _JS_CLASS_QUERY = """
39
- (class_declaration name: (identifier) @name) @class
40
- """
208
+ # Extension → language config lookup
209
+ _EXT_TO_LANG: dict[str, dict] = {}
210
+ for _lang, _cfg in _REGISTRY.items():
211
+ for _ext in _cfg["ext"]:
212
+ _EXT_TO_LANG[_ext] = _cfg
41
213
 
42
- _JS_FUNC_QUERY = """
43
- [
44
- (function_declaration name: (identifier) @name)
45
- (method_definition name: (property_identifier) @name)
46
- ] @func
47
- """
214
+ # ── Grammar cache ─────────────────────────────────────────────────────────────
215
+ _GRAMMAR_CACHE: dict[str, Any] = {}
48
216
 
49
217
 
50
- def _ts_extract(source: bytes, lang_key: str, rel_path: str) -> list:
51
- parsers = _TS_PARSERS
52
- if lang_key not in parsers:
218
def _get_language(cfg: dict) -> "Language | None":
    """Return the tree-sitter ``Language`` for a registry entry, or ``None``.

    The result is memoized in ``_GRAMMAR_CACHE`` under ``"<pkg>.<fn>"``.
    Failures are memoized too (as ``None``), so a grammar that could not be
    loaded once is not retried on later calls.

    Args:
        cfg: A registry entry providing ``"pkg"`` (module to import) and
            ``"fn"`` (name of the attribute on that module to call).

    Returns:
        The loaded ``Language``, or ``None`` when loading raised anything.
    """
    import importlib

    cache_key = f"{cfg['pkg']}.{cfg['fn']}"
    try:
        # A cached None is a valid answer, so a plain .get() would not do.
        return _GRAMMAR_CACHE[cache_key]
    except KeyError:
        pass

    try:
        module = importlib.import_module(cfg["pkg"])
        factory = getattr(module, cfg["fn"])
        loaded = Language(factory())
    except Exception:
        # Best-effort: a missing package, a missing attribute, or tree_sitter
        # itself being unavailable all mean "no grammar for this language".
        loaded = None

    _GRAMMAR_CACHE[cache_key] = loaded
    return loaded
231
+
232
+
233
+ # ── Tree walker ───────────────────────────────────────────────────────────────
234
+
235
+ def _walk(node: "TSNode", target_types: set[str]):
236
+ if node.type in target_types:
237
+ yield node
238
+ for child in node.children:
239
+ yield from _walk(child, target_types)
240
+
241
+
242
+ def _get_name(node: "TSNode", name_field: str) -> str | None:
243
+ named = node.child_by_field_name(name_field)
244
+ if named:
245
+ return named.text.decode("utf-8", errors="ignore").strip()
246
+ for child in node.children:
247
+ if child.type in {"identifier", "name", "simple_identifier",
248
+ "property_identifier", "type_identifier"}:
249
+ return child.text.decode("utf-8", errors="ignore").strip()
250
+ return None
251
+
252
+
253
+ def _get_call_name(node: "TSNode", call_field: str) -> str | None:
254
+ func = node.child_by_field_name(call_field)
255
+ if not func:
256
+ return None
257
+ text = func.text.decode("utf-8", errors="ignore").strip()
258
+ return text.split(".")[-1] if "." in text else text
259
+
260
+
261
def _find_enclosing_function(
    node: "TSNode",
    function_types: set[str],
    name_field: str,
) -> str | None:
    """Return the name of the nearest function-like ancestor of ``node``.

    Walks up the ``parent`` links starting from ``node``'s parent (``node``
    itself is never considered) and stops at the first ancestor whose type is
    in ``function_types``.

    Args:
        node: The node whose enclosing function is wanted.
        function_types: Node type names that count as functions.
        name_field: Grammar field name passed through to ``_get_name``.

    Returns:
        Whatever ``_get_name`` returns for that ancestor (possibly ``None``),
        or ``None`` when no ancestor is a function.
    """
    ancestor = node.parent
    while ancestor and ancestor.type not in function_types:
        ancestor = ancestor.parent
    return _get_name(ancestor, name_field) if ancestor else None
272
+
273
+
274
+ def _extract_with_treesitter(source: bytes, rel_path: str, cfg: dict) -> list[dict]:
275
+ lang = _get_language(cfg)
276
+ if lang is None:
53
277
  return []
54
- lang, parser = parsers[lang_key]
55
- tree = parser.parse(source)
56
278
 
57
- nodes = []
58
- lines = source.decode("utf-8", errors="replace").splitlines()
279
+ parser = Parser(lang)
280
+ tree = parser.parse(source)
281
+ root = tree.root_node
282
+
283
+ nodes: list[dict] = []
284
+ seen: set[str] = set()
59
285
 
60
- def _node(kind, name, line):
61
- return {
62
- "id": f"{rel_path}::{kind}::{name}",
286
+ def _add(name: str, ntype: str, line: int):
287
+ if not name or name in seen:
288
+ return
289
+ seen.add(name)
290
+ nodes.append({
291
+ "id": f"{rel_path}::{ntype}::{name}",
63
292
  "name": name,
64
- "type": kind,
293
+ "type": ntype,
65
294
  "file": rel_path,
66
- "line": line,
67
- }
68
-
69
- def _iter_captures(query, root):
70
- """Yield (capture_name, tree_node) pairs; compatible with tree-sitter >=0.20."""
71
- try:
72
- # tree-sitter >= 0.22: matches() returns list of (pattern_idx, {name: [Node]})
73
- for _pat_idx, caps in query.matches(root):
74
- for cap_name, cap_nodes in caps.items():
75
- for n in (cap_nodes if isinstance(cap_nodes, list) else [cap_nodes]):
76
- yield cap_name, n
77
- except Exception:
78
- pass
79
-
80
- # Classes
81
- try:
82
- query = lang.query(_PY_CLASS_QUERY if lang_key == "python" else _JS_CLASS_QUERY)
83
- for cap_name, node in _iter_captures(query, tree.root_node):
84
- if cap_name == "name" and node.type == "identifier":
85
- nodes.append(_node("class", node.text.decode(), node.start_point[0] + 1))
86
- except Exception:
87
- pass
295
+ "line": line + 1,
296
+ })
88
297
 
89
- # Functions
90
- try:
91
- query = lang.query(_PY_FUNC_QUERY if lang_key == "python" else _JS_FUNC_QUERY)
92
- for cap_name, node in _iter_captures(query, tree.root_node):
93
- if cap_name == "name" and node.type in ("identifier", "property_identifier"):
94
- nodes.append(_node("function", node.text.decode(), node.start_point[0] + 1))
95
- except Exception:
96
- pass
298
+ for node in _walk(root, cfg["function_types"]):
299
+ name = _get_name(node, cfg["name_field"])
300
+ if name:
301
+ _add(name, "function", node.start_point[0])
302
+
303
+ for node in _walk(root, cfg["class_types"]):
304
+ name = _get_name(node, cfg["name_field"])
305
+ if name:
306
+ _add(name, "class", node.start_point[0])
97
307
 
98
308
  return nodes
99
309
 
100
310
 
101
- # ── Regex fallback ────────────────────────────────────────────────────────────
311
+ # ── Regex fallbacks ───────────────────────────────────────────────────────────
102
312
 
103
- _PATTERNS = {
313
+ _REGEX_PATTERNS: dict[str, dict[str, str | None]] = {
104
314
  "python": {
105
- "class": re.compile(r"^class\s+(\w+)", re.MULTILINE),
106
- "function": re.compile(r"^def\s+(\w+)", re.MULTILINE),
107
- "import": re.compile(r"^(?:import|from)\s+([\w.]+)", re.MULTILINE),
315
+ "function": r"^(?:async\s+)?def\s+([a-zA-Z_]\w*)\s*\(",
316
+ "class": r"^class\s+([a-zA-Z_]\w*)\s*[:\(]",
108
317
  },
109
318
  "javascript": {
110
- "class": re.compile(r"\bclass\s+(\w+)", re.MULTILINE),
111
- "function": re.compile(r"\bfunction\s+(\w+)", re.MULTILINE),
112
- "import": re.compile(r"^import\s+.*?from\s+['\"](.+?)['\"]", re.MULTILINE),
319
+ "function": r"(?:function\s+([a-zA-Z_$]\w*)|([a-zA-Z_$]\w*)\s*[:=]\s*(?:async\s+)?(?:function|\([^)]*\)\s*=>))",
320
+ "class": r"class\s+([a-zA-Z_$]\w*)",
321
+ },
322
+ "typescript": {
323
+ "function": r"(?:function\s+([a-zA-Z_$]\w*)|([a-zA-Z_$]\w*)\s*[:=]\s*(?:async\s+)?(?:function|\([^)]*\)\s*=>))",
324
+ "class": r"(?:class|interface)\s+([a-zA-Z_$]\w*)",
113
325
  },
114
326
  "go": {
115
- "function": re.compile(r"^func\s+(?:\(\w+\s+\*?\w+\)\s+)?(\w+)", re.MULTILINE),
116
- "struct": re.compile(r"^type\s+(\w+)\s+struct", re.MULTILINE),
117
- "import": re.compile(r'"([\w./]+)"', re.MULTILINE),
327
+ "function": r"^func\s+(?:\([^)]*\)\s+)?([a-zA-Z_]\w*)\s*\(",
328
+ "class": r"^type\s+([a-zA-Z_]\w*)\s+(?:struct|interface)",
118
329
  },
119
330
  "rust": {
120
- "function": re.compile(r"^(?:pub\s+)?fn\s+(\w+)", re.MULTILINE),
121
- "struct": re.compile(r"^(?:pub\s+)?struct\s+(\w+)", re.MULTILINE),
122
- "import": re.compile(r"^use\s+([\w:]+)", re.MULTILINE),
331
+ "function": r"^(?:pub\s+)?(?:async\s+)?fn\s+([a-zA-Z_]\w*)",
332
+ "class": r"^(?:pub\s+)?(?:struct|enum|trait|impl)\s+([a-zA-Z_]\w*)",
123
333
  },
124
334
  "java": {
125
- "class": re.compile(r"\bclass\s+(\w+)", re.MULTILINE),
126
- "function": re.compile(r"(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+(\w+)\s*\(", re.MULTILINE),
127
- "import": re.compile(r"^import\s+([\w.]+);", re.MULTILINE),
335
+ "function": r"(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+([a-zA-Z_]\w*)\s*\(",
336
+ "class": r"(?:class|interface|enum)\s+([a-zA-Z_]\w*)",
337
+ },
338
+ "c": {
339
+ "function": r"^[a-zA-Z_][\w\s\*]+\s+([a-zA-Z_]\w*)\s*\([^;]*\)\s*\{",
340
+ "class": r"^(?:struct|enum|union)\s+([a-zA-Z_]\w*)",
341
+ },
342
+ "cpp": {
343
+ "function": r"(?:[\w:~]+\s+)+([a-zA-Z_]\w*)\s*\([^;]*\)\s*(?:const\s*)?\{",
344
+ "class": r"(?:class|struct|enum|namespace)\s+([a-zA-Z_]\w*)",
128
345
  },
129
346
  "ruby": {
130
- "class": re.compile(r"^class\s+(\w+)", re.MULTILINE),
131
- "function": re.compile(r"^\s*def\s+(\w+)", re.MULTILINE),
347
+ "function": r"^\s*def\s+([a-zA-Z_]\w*[?!]?)",
348
+ "class": r"^\s*(?:class|module)\s+([A-Z]\w*)",
349
+ },
350
+ "csharp": {
351
+ "function": r"(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+([a-zA-Z_]\w*)\s*\(",
352
+ "class": r"(?:class|interface|struct|enum|record)\s+([a-zA-Z_]\w*)",
353
+ },
354
+ "php": {
355
+ "function": r"^\s*(?:public|private|protected|static|\s)*function\s+([a-zA-Z_]\w*)",
356
+ "class": r"^\s*(?:abstract\s+)?(?:class|interface|trait)\s+([a-zA-Z_]\w*)",
132
357
  },
133
- "sql": {
134
- "table": re.compile(r"CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?[`\"]?(\w+)[`\"]?", re.IGNORECASE),
135
- "index": re.compile(r"CREATE\s+(?:UNIQUE\s+)?INDEX\s+\w+\s+ON\s+[`\"]?(\w+)[`\"]?", re.IGNORECASE),
358
+ "swift": {
359
+ "function": r"^\s*(?:public|private|internal|open|\s)*func\s+([a-zA-Z_]\w*)",
360
+ "class": r"^\s*(?:public|private|internal|open|\s)*(?:class|struct|protocol|extension|enum)\s+([a-zA-Z_]\w*)",
136
361
  },
137
- # config files: no node extraction — file node created by scanner
362
+ "lua": {
363
+ "function": r"(?:local\s+)?function\s+([a-zA-Z_]\w*)",
364
+ "class": None,
365
+ },
366
+ }
367
+
368
+ _EXT_TO_LANG_NAME: dict[str, str] = {
369
+ ".py": "python", ".pyw": "python",
370
+ ".js": "javascript", ".mjs": "javascript", ".jsx": "javascript",
371
+ ".ts": "typescript", ".tsx": "typescript",
372
+ ".go": "go",
373
+ ".rs": "rust",
374
+ ".java": "java", ".kt": "java",
375
+ ".c": "c", ".h": "c",
376
+ ".cpp": "cpp", ".cc": "cpp", ".cxx": "cpp", ".hpp": "cpp", ".hh": "cpp",
377
+ ".cs": "csharp",
378
+ ".rb": "ruby", ".rake": "ruby",
379
+ ".php": "php",
380
+ ".swift": "swift",
381
+ ".lua": "lua", ".luau": "lua",
138
382
  }
139
383
 
140
384
 
141
- def _ext_to_lang(ext: str) -> str:
142
- return {
143
- ".py": "python", ".pyw": "python",
144
- ".js": "javascript", ".mjs": "javascript", ".cjs": "javascript",
145
- ".jsx": "javascript", ".ts": "javascript", ".tsx": "javascript",
146
- ".go": "go", ".rs": "rust",
147
- ".java": "java", ".rb": "ruby",
148
- ".sql": "sql",
149
- ".yaml": "config", ".yml": "config", ".toml": "config",
150
- ".env": "config", ".ini": "config", ".cfg": "config",
151
- }.get(ext, "")
152
-
153
-
154
- def _regex_extract(text: str, lang: str, rel_path: str) -> list:
155
- patterns = _PATTERNS.get(lang, {})
156
- children = []
157
- for kind, pat in patterns.items():
158
- if kind == "import":
159
- continue
160
- for m in pat.finditer(text):
161
- line = text[:m.start()].count("\n") + 1
162
- children.append({
163
- "id": f"{rel_path}::{kind}::{m.group(1)}",
164
- "name": m.group(1),
165
- "type": kind,
166
- "file": rel_path,
167
- "line": line,
168
- })
169
-
170
- imp_pat = patterns.get("import")
171
- imports = [m.group(1) for m in imp_pat.finditer(text)][:30] if imp_pat else []
172
-
173
- return _wrap_in_module(rel_path, children, imports)
174
-
175
-
176
- # ── Module wrapper ───────────────────────────────────────────────────────────
177
-
178
- def _wrap_in_module(rel_path: str, children: list, imports: list) -> list:
385
def _extract_with_regex(source: str, rel_path: str, ext: str) -> list[dict]:
    """Extract definition nodes from source text with per-line regexes.

    The language is chosen from ``ext`` (case-insensitive) via
    ``_EXT_TO_LANG_NAME``; its patterns come from ``_REGEX_PATTERNS``. Each
    line is searched on its own, so patterns never match across lines. The
    first non-empty capture group of a match is taken as the name.

    Names are de-duplicated across all node types: only the first occurrence
    of a given name in the file is reported.

    Args:
        source: Decoded file contents.
        rel_path: Path recorded in each node's ``file`` and ``id``.
        ext: File extension including the leading dot.

    Returns:
        Node dicts with ``id``, ``name``, ``type``, ``file`` and 1-based
        ``line``; empty when the extension has no regex patterns.
    """
    language = _EXT_TO_LANG_NAME.get(ext.lower())
    if not language or language not in _REGEX_PATTERNS:
        return []

    # Drop node types that have no pattern for this language (value None).
    active = [
        (kind, regex)
        for kind, regex in _REGEX_PATTERNS[language].items()
        if regex
    ]

    found: list[dict] = []
    emitted: set[str] = set()

    for line_no, text in enumerate(source.splitlines(), 1):
        for kind, regex in active:
            hit = re.search(regex, text)
            if hit is None:
                continue
            # Alternation patterns put the name in whichever group matched.
            name = next((grp for grp in hit.groups() if grp), None)
            if not name or name in emitted:
                continue
            emitted.add(name)
            found.append({
                "id": f"{rel_path}::{kind}::{name}",
                "name": name,
                "type": kind,
                "file": rel_path,
                "line": line_no,
            })
    return found
411
+
412
+
413
+ # ── Public interface ──────────────────────────────────────────────────────────
414
+
415
+ def extract(abs_path: str, rel_path: str) -> list[dict]:
179
416
  """
180
- Create a module node for the file, link all child nodes to it via defined_in.
181
- The module node carries the imports so builder can create file-to-file edges.
417
+ Extract AST nodes from a code file.
418
+ Tries tree-sitter first; falls back to regex if grammar not installed.
182
419
  """
183
- from pathlib import Path as _Path
184
- stem = _Path(rel_path).stem
185
- mod_id = f"{rel_path}::module::{stem}"
186
- module = {
187
- "id": mod_id,
188
- "name": stem,
189
- "type": "module",
190
- "file": rel_path,
191
- "line": 1,
192
- "imports": imports,
193
- }
194
- for child in children:
195
- child["relations"] = [{"id": mod_id, "relation": "defined-in", "confidence": "EXTRACTED"}]
196
- return [module] + children
197
-
198
-
199
- # ── Public API ────────────────────────────────────────────────────────────────
200
-
201
- def extract(abs_path: str, rel_path: str) -> list:
202
- """Extract nodes from a code/sql/config file. Returns list of node dicts."""
203
420
  ext = Path(abs_path).suffix.lower()
204
- lang = _ext_to_lang(ext)
205
- if not lang:
206
- return []
421
+ cfg = _EXT_TO_LANG.get(ext)
207
422
 
208
423
  try:
209
- raw = open(abs_path, "rb").read()
210
- text = raw.decode("utf-8", errors="replace")
211
- except OSError:
424
+ source_bytes = Path(abs_path).read_bytes()
425
+ except Exception:
212
426
  return []
213
427
 
214
- # tree-sitter for Python and JS/TS if available
215
- if lang in ("python", "javascript") and lang in _TS_PARSERS:
216
- children = _ts_extract(raw, lang, rel_path)
217
- if children:
218
- imp_pat = _PATTERNS.get(lang, {}).get("import")
219
- imports = [m.group(1) for m in imp_pat.finditer(text)][:30] if imp_pat else []
220
- return _wrap_in_module(rel_path, children, imports)
428
+ if _TS_AVAILABLE and cfg:
429
+ nodes = _extract_with_treesitter(source_bytes, rel_path, cfg)
430
+ if nodes:
431
+ return nodes
432
+
433
+ try:
434
+ source_text = source_bytes.decode("utf-8", errors="ignore")
435
+ except Exception:
436
+ return []
221
437
 
222
- return _regex_extract(text, lang, rel_path)
438
+ return _extract_with_regex(source_text, rel_path, ext)