repomap-cli 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
repomap/parser.py ADDED
@@ -0,0 +1,1697 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Repo Map Parser — Tree-sitter Analysis Layer
4
+ ==============================================
5
+ 负责代码解析、符号提取、import/export 绑定提取。
6
+
7
+ 此模块独立于引擎层,可被单独使用进行代码分析。
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import re
14
+ from collections import defaultdict
15
+ from typing import Any
16
+
17
+ from . import JSImportBinding, JSExportBinding, Symbol
18
+
19
+ logger = logging.getLogger("repomap")
20
+
21
+ # ═══════════════════════════════════════════════════════════════════════════════
22
+ # Tree-sitter Queries(内嵌,无需外部 .scm 文件)
23
+ # ═══════════════════════════════════════════════════════════════════════════════
24
+
25
# Embedded tree-sitter query sources, keyed first by language name and then by
# query type ("function", "class", "import", "call", and for some languages
# "anonymous_function" / "http_route").  TreeSitterAdapter._precompile_queries
# compiles each source string for every language whose parser was loaded.
#
# Capture conventions relied on by TreeSitterAdapter.extract_symbols:
#   @name          -> the identifier node of the symbol
#   @definition.X  -> the enclosing definition node; X becomes the symbol kind
#
# NOTE(review): in every "http_route" query (and the ruby "import" query) the
# #match? / #eq? predicates are written after the pattern's closing
# parenthesis rather than inside it -- confirm that tree-sitter applies them
# to the preceding pattern as intended.
QUERIES: dict[str, dict[str, str]] = {
    "python": {
        "function": """
            (function_definition name: (identifier) @name) @definition.function
            (decorated_definition (function_definition name: (identifier) @name)) @definition.function
            (class_definition body: (block (function_definition name: (identifier) @name))) @definition.method
            (assignment left: (identifier) @name right: (lambda)) @definition.lambda
        """,
        "class": """
            (class_definition name: (identifier) @name) @definition.class
            (decorated_definition (class_definition name: (identifier) @name)) @definition.class
        """,
        "import": """
            (import_statement name: (dotted_name) @name)
            (import_statement name: (aliased_import name: (dotted_name) @name))
            (import_from_statement module_name: (dotted_name) @name)
            (import_from_statement module_name: (relative_import) @name)
        """,
        "call": """
            (call function: (identifier) @name) @reference.call
            (call function: (attribute attribute: (identifier) @name)) @reference.call
        """,
        "http_route": """
            ;; FastAPI: @app.get("/path") or @router.post("/path")
            (decorated_definition
              (decorator
                (call
                  function: (attribute
                    object: (identifier) @_obj
                    attribute: (identifier) @method)
                  arguments: (argument_list (string) @path)))
              definition: (function_definition name: (identifier) @handler))
            (#match? @_obj "^(app|router|api)$")
            (#match? @method "^(get|post|put|delete|patch|head|options)$")
        """,
    },
    "javascript": {
        "function": """
            (function_declaration name: (identifier) @name) @definition.function
            (variable_declarator name: (identifier) @name value: (arrow_function)) @definition.function
            (variable_declarator name: (identifier) @name value: (function_expression)) @definition.function
            (method_definition name: (property_identifier) @name) @definition.method
        """,
        "anonymous_function": """
            (arrow_function) @definition.anonymous_function
            (function_expression) @definition.anonymous_function
        """,
        "class": """
            (class_declaration name: (identifier) @name) @definition.class
        """,
        "import": """
            (import_statement source: (string) @source)
            (import_specifier name: (identifier) @name)
            (import_clause (identifier) @name)
        """,
        "call": """
            (call_expression function: (identifier) @name) @reference.call
            (call_expression function: (member_expression property: (property_identifier) @name)) @reference.call
        """,
        "http_route": """
            ;; Express: app.get("/path", handler) / router.post("/path", handler)
            (call_expression
              function: (member_expression
                object: (identifier) @_router
                property: (property_identifier) @method)
              arguments: (arguments
                (string) @path
                .
                [(identifier) @handler (arrow_function) @handler (function_expression) @handler]))
            (#match? @_router "^(app|router)$")
            (#match? @method "^(get|post|put|delete|patch|use|all)$")
        """,
    },
    # TypeScript: node names differ when the dedicated binding is used, and when
    # falling back to the JS parser TS-specific syntax produces ERROR nodes, so
    # only the generic patterns supported by both parsers are kept here.
    "typescript": {
        "function": """
            (function_declaration name: (identifier) @name) @definition.function
            (variable_declarator name: (identifier) @name value: (arrow_function)) @definition.function
            (method_definition name: (property_identifier) @name) @definition.method
        """,
        "anonymous_function": """
            (arrow_function) @definition.anonymous_function
            (function_expression) @definition.anonymous_function
        """,
        "class": """
            (class_declaration name: (_) @name) @definition.class
        """,
        "import": """
            (import_statement source: (string) @source)
            (import_specifier name: (identifier) @name)
            (import_clause (identifier) @name)
        """,
        "call": """
            (call_expression function: (identifier) @name) @reference.call
            (call_expression function: (member_expression property: (property_identifier) @name)) @reference.call
        """,
        "http_route": """
            ;; Express: app.get("/path", handler) / router.post("/path", handler)
            (call_expression
              function: (member_expression
                object: (identifier) @_router
                property: (property_identifier) @method)
              arguments: (arguments
                (string) @path
                .
                [(identifier) @handler (arrow_function) @handler (function_expression) @handler]))
            (#match? @_router "^(app|router)$")
            (#match? @method "^(get|post|put|delete|patch|use|all)$")
        """,
    },
    "go": {
        "function": """
            (function_declaration name: (identifier) @name) @definition.function
            (method_declaration name: (field_identifier) @name) @definition.method
        """,
        "class": """
            (type_spec name: (type_identifier) @name type: (struct_type)) @definition.struct
            (type_spec name: (type_identifier) @name type: (interface_type)) @definition.interface
        """,
        "import": """
            (import_spec path: (interpreted_string_literal) @path)
        """,
        "call": """
            (call_expression function: (identifier) @name) @reference.call
            (call_expression function: (selector_expression field: (field_identifier) @name)) @reference.call
        """,
    },
    "rust": {
        "function": """
            (function_item name: (identifier) @name) @definition.function
            (function_signature_item name: (identifier) @name) @definition.trait_method
        """,
        "class": """
            (struct_item name: (type_identifier) @name) @definition.struct
            (enum_item name: (type_identifier) @name) @definition.enum
            (trait_item name: (type_identifier) @name) @definition.trait
            (impl_item type: (type_identifier) @name) @definition.impl
            (type_item name: (type_identifier) @name) @definition.type
            (mod_item name: (identifier) @name) @definition.module
        """,
        # The in-query comments below read, in order: capture the module part of
        # `use crate::module::Item`; of `use crate::module::{A, B}`; the module in
        # `use module::Item`; `extern crate name;`; and `use module;`.
        # NOTE(review): the first and third patterns are textually identical --
        # confirm whether one of them was meant to match a different shape.
        "import": """
            ; 捕获 use crate::module::Item 中的 module 部分
            (use_declaration
              argument: (scoped_identifier
                path: (identifier) @path
                name: (identifier) @name))
            ; 捕获 use crate::module::{A, B} 中的 module 部分
            (use_declaration
              argument: (scoped_use_list
                path: (identifier) @path))
            ; 捕获 use module::Item 中的 module
            (use_declaration
              argument: (scoped_identifier
                path: (identifier) @path
                name: (identifier) @name))
            ; 捕获 extern crate name;
            (extern_crate_declaration name: (identifier) @name)
            ; 捕获 use module;
            (use_declaration argument: (identifier) @name)
        """,
        "call": """
            (call_expression function: (identifier) @name) @reference.call
            (call_expression function: (field_expression field: (field_identifier) @name)) @reference.call
            (call_expression function: (scoped_identifier name: (identifier) @name)) @reference.call
        """,
        "http_route": """
            ;; Axum: .route("/path", get(handler))
            (call_expression
              function: (field_expression
                field: (field_identifier) @_method_name)
              arguments: (arguments
                (string_literal) @path
                (call_expression
                  function: (identifier) @http_method
                  arguments: (arguments (identifier) @handler))))
            (#eq? @_method_name "route")
            (#match? @http_method "^(get|post|put|delete|patch|head|options)$")
        """,
    },
    "c": {
        "function": """
            (function_definition
              declarator: (function_declarator
                declarator: (identifier) @name)) @definition.function
        """,
        "class": """
            (struct_specifier name: (type_identifier) @name) @definition.struct
            (union_specifier name: (type_identifier) @name) @definition.union
            (enum_specifier name: (type_identifier) @name) @definition.enum
        """,
        "import": """
            (preproc_include path: (_) @path)
        """,
        "call": """
            (call_expression function: (identifier) @name) @reference.call
        """,
    },
    "java": {
        "function": """
            (method_declaration name: (identifier) @name) @definition.method
            (constructor_declaration name: (identifier) @name) @definition.method
        """,
        "class": """
            (class_declaration name: (identifier) @name) @definition.class
            (interface_declaration name: (identifier) @name) @definition.interface
            (enum_declaration name: (identifier) @name) @definition.enum
        """,
        "import": """
            (import_declaration (scoped_identifier) @name)
            (import_declaration (identifier) @name)
        """,
        "call": """
            (method_invocation name: (identifier) @name) @reference.call
        """,
        "http_route": """
            ;; Spring Boot: @GetMapping("/path") / @PostMapping("/path")
            (annotation
              name: (identifier) @method
              arguments: (annotation_argument_list
                (element_value_pair
                  value: (string_literal) @path)))
            (#match? @method "^(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)$")
        """,
    },
    "kotlin": {
        "function": """
            (function_declaration name: (simple_identifier) @name) @definition.function
        """,
        "class": """
            (class_declaration name: (type_identifier) @name) @definition.class
            (object_declaration name: (type_identifier) @name) @definition.object
            (interface_declaration name: (type_identifier) @name) @definition.interface
        """,
        "import": """
            (import_header (identifier) @name)
        """,
        "call": """
            (call_expression (simple_identifier) @name) @reference.call
            (call_expression (navigation_expression (simple_identifier) @name)) @reference.call
        """,
    },
    "swift": {
        "function": """
            (function_declaration name: (simple_identifier) @name) @definition.function
        """,
        "class": """
            (class_declaration name: (type_identifier) @name) @definition.class
            (struct_declaration name: (type_identifier) @name) @definition.struct
            (enum_declaration name: (type_identifier) @name) @definition.enum
            (protocol_declaration name: (type_identifier) @name) @definition.protocol
        """,
        "import": """
            (import_declaration (identifier) @name)
        """,
        "call": """
            (call_expression (simple_identifier) @name) @reference.call
            (call_expression (navigation_expression (simple_identifier) @name)) @reference.call
        """,
    },
    "cpp": {
        "function": """
            (function_definition
              declarator: (function_declarator
                declarator: [(identifier) (qualified_identifier)] @name)) @definition.function
        """,
        "class": """
            (class_specifier name: (type_identifier) @name) @definition.class
            (struct_specifier name: (type_identifier) @name) @definition.struct
            (enum_specifier name: (type_identifier) @name) @definition.enum
        """,
        "import": """
            (preproc_include path: (_) @path)
        """,
        "call": """
            (call_expression function: [(identifier) (qualified_identifier)] @name) @reference.call
        """,
    },
    "c_sharp": {
        "function": """
            (method_declaration name: (identifier) @name) @definition.method
            (local_function_statement name: (identifier) @name) @definition.function
        """,
        "class": """
            (class_declaration name: (identifier) @name) @definition.class
            (interface_declaration name: (identifier) @name) @definition.interface
            (struct_declaration name: (identifier) @name) @definition.struct
            (enum_declaration name: (identifier) @name) @definition.enum
        """,
        "import": """
            (using_directive name: [(identifier) (qualified_name)] @name)
        """,
        "call": """
            (invocation_expression function: (identifier) @name) @reference.call
            (invocation_expression function: (member_access_expression name: (identifier) @name)) @reference.call
        """,
    },
    "php": {
        "function": """
            (function_definition name: (name) @name) @definition.function
            (method_declaration name: (name) @name) @definition.method
        """,
        "class": """
            (class_declaration name: (name) @name) @definition.class
            (interface_declaration name: (name) @name) @definition.interface
            (trait_declaration name: (name) @name) @definition.trait
            (enum_declaration name: (name) @name) @definition.enum
        """,
        "import": """
            (namespace_use_declaration (qualified_name) @name)
        """,
        "call": """
            (function_call_expression function: (name) @name) @reference.call
            (member_call_expression name: (name) @name) @reference.call
        """,
    },
    "ruby": {
        "function": """
            (method name: (identifier) @name) @definition.method
            (singleton_method name: (identifier) @name) @definition.method
        """,
        "class": """
            (class name: (constant) @name) @definition.class
            (module name: (constant) @name) @definition.module
        """,
        "import": """
            (call method: (identifier) @_method arguments: (argument_list (string) @path))
            (#match? @_method "^(require|require_relative|load)$")
        """,
        "call": """
            (call method: (identifier) @name) @reference.call
        """,
    },
    # No queries for these: TreeSitterAdapter.extract_symbols routes them to the
    # dedicated _extract_html_symbols / _extract_css_symbols /
    # _extract_json_symbols helpers instead.
    "html": {},
    "css": {},
    "json": {},
}
# TSX shares the TypeScript query set (same dict object, not a copy).
QUERIES["tsx"] = QUERIES["typescript"]
363
+
364
# Maps a file extension (lower-case, including the leading dot) to the
# language key used by QUERIES and by TreeSitterAdapter.parsers.
EXT_TO_LANG: dict[str, str] = {
    ".py": "python",
    ".pyi": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".mts": "typescript",
    ".cts": "typescript",
    ".go": "go",
    ".rs": "rust",
    ".html": "html",
    ".htm": "html",
    ".css": "css",
    ".json": "json",
    ".c": "c",
    # Plain ".h" headers are routed to the C grammar, not the C++ one.
    ".h": "c",
    ".cpp": "cpp",
    ".cc": "cpp",
    ".cxx": "cpp",
    ".hpp": "cpp",
    ".hh": "cpp",
    ".java": "java",
    ".kt": "kotlin",
    ".kts": "kotlin",
    ".swift": "swift",
    ".cs": "c_sharp",
    ".php": "php",
    ".phtml": "php",
    ".rb": "ruby",
}
397
+
398
+ # ═══════════════════════════════════════════════════════════════════════════════
399
+ # Tree-sitter 适配层
400
+ # ═══════════════════════════════════════════════════════════════════════════════
401
+
402
+
403
+ class TreeSitterAdapter:
404
+ """
405
+ 封装 tree-sitter 多语言解析。
406
+ ‑ 兼容 tree-sitter 0.20 ~ 0.25+(捕获结果格式差异)
407
+ ‑ 懒加载语言绑定,未安装的静默跳过
408
+ """
409
+
410
+ def __init__(self) -> None:
411
+ self.parsers: dict[str, Any] = {}
412
+ # lang -> query_type -> compiled Query
413
+ self._queries: dict[str, dict[str, Any]] = {}
414
+ self._init_parsers()
415
+
416
+ # ── 初始化 ─────────────────────────────────────────────────────────────────
417
+
418
+ def _init_parsers(self) -> None:
419
+ """加载各语言 parser,并预编译 queries。"""
420
+ bindings = {
421
+ "python": ("tree_sitter_python", "language"),
422
+ "javascript": ("tree_sitter_javascript", "language"),
423
+ "go": ("tree_sitter_go", "language"),
424
+ "rust": ("tree_sitter_rust", "language"),
425
+ "html": ("tree_sitter_html", "language"),
426
+ "css": ("tree_sitter_css", "language"),
427
+ "json": ("tree_sitter_json", "language"),
428
+ }
429
+
430
+ # 动态导入,失败则跳过
431
+ for lang, (module, attr) in bindings.items():
432
+ try:
433
+ mod = __import__(module)
434
+ lang_fn = getattr(mod, attr)
435
+ from tree_sitter import Language, Parser # type: ignore
436
+ self.parsers[lang] = Parser(Language(lang_fn()))
437
+ logger.debug(f"Parser loaded: {lang}")
438
+ except Exception as e:
439
+ logger.debug(f"Parser unavailable [{lang}]: {e}")
440
+
441
+ # C
442
+ try:
443
+ from tree_sitter_c import language as lang_c
444
+ from tree_sitter import Language, Parser
445
+ self.parsers["c"] = Parser(Language(lang_c()))
446
+ logger.debug("Parser loaded: c")
447
+ except Exception as e:
448
+ logger.debug(f"Parser unavailable [c]: {e}")
449
+
450
+ # Java
451
+ try:
452
+ from tree_sitter_java import language as lang_java
453
+ from tree_sitter import Language, Parser
454
+ self.parsers["java"] = Parser(Language(lang_java()))
455
+ logger.debug("Parser loaded: java")
456
+ except Exception as e:
457
+ logger.debug(f"Parser unavailable [java]: {e}")
458
+
459
+ # Kotlin — 可选依赖,未安装时静默跳过
460
+ try:
461
+ from tree_sitter_kotlin import language as lang_kotlin
462
+ from tree_sitter import Language, Parser
463
+ self.parsers["kotlin"] = Parser(Language(lang_kotlin()))
464
+ logger.debug("Parser loaded: kotlin")
465
+ except Exception as e:
466
+ logger.debug(f"Parser unavailable [kotlin]: {e}")
467
+
468
+ # Swift — 可选依赖,未安装时静默跳过
469
+ try:
470
+ from tree_sitter_swift import language as lang_swift
471
+ from tree_sitter import Language, Parser
472
+ self.parsers["swift"] = Parser(Language(lang_swift()))
473
+ logger.debug("Parser loaded: swift")
474
+ except Exception as e:
475
+ logger.debug(f"Parser unavailable [swift]: {e}")
476
+
477
+ # C++ — 可选依赖,未安装时静默跳过
478
+ try:
479
+ from tree_sitter_cpp import language as lang_cpp
480
+ from tree_sitter import Language, Parser
481
+ self.parsers["cpp"] = Parser(Language(lang_cpp()))
482
+ logger.debug("Parser loaded: cpp")
483
+ except Exception as e:
484
+ logger.debug(f"Parser unavailable [cpp]: {e}")
485
+
486
+ # C# — 可选依赖,未安装时静默跳过
487
+ try:
488
+ from tree_sitter_c_sharp import language as lang_csharp
489
+ from tree_sitter import Language, Parser
490
+ self.parsers["c_sharp"] = Parser(Language(lang_csharp()))
491
+ logger.debug("Parser loaded: c_sharp")
492
+ except Exception as e:
493
+ logger.debug(f"Parser unavailable [c_sharp]: {e}")
494
+
495
+ # PHP — 可选依赖,未安装时静默跳过
496
+ try:
497
+ from tree_sitter_php import language as lang_php
498
+ from tree_sitter import Language, Parser
499
+ self.parsers["php"] = Parser(Language(lang_php()))
500
+ logger.debug("Parser loaded: php")
501
+ except Exception as e:
502
+ logger.debug(f"Parser unavailable [php]: {e}")
503
+
504
+ # Ruby — 可选依赖,未安装时静默跳过
505
+ try:
506
+ from tree_sitter_ruby import language as lang_ruby
507
+ from tree_sitter import Language, Parser
508
+ self.parsers["ruby"] = Parser(Language(lang_ruby()))
509
+ logger.debug("Parser loaded: ruby")
510
+ except Exception as e:
511
+ logger.debug(f"Parser unavailable [ruby]: {e}")
512
+
513
+ # TypeScript / TSX:优先专用绑定,TypeScript 回退到 JavaScript parser,TSX 不回退以避免误解析 JSX。
514
+ try:
515
+ from tree_sitter_typescript import language_typescript, language_tsx # type: ignore
516
+ from tree_sitter import Language, Parser # type: ignore
517
+ self.parsers["typescript"] = Parser(Language(language_typescript()))
518
+ self.parsers["tsx"] = Parser(Language(language_tsx()))
519
+ logger.debug("Parser loaded: typescript (dedicated)")
520
+ logger.debug("Parser loaded: tsx (dedicated)")
521
+ except Exception:
522
+ try:
523
+ from tree_sitter_typescript import language_typescript # type: ignore
524
+ from tree_sitter import Language, Parser # type: ignore
525
+ self.parsers["typescript"] = Parser(Language(language_typescript()))
526
+ logger.debug("Parser loaded: typescript (dedicated)")
527
+ except Exception:
528
+ if "javascript" in self.parsers:
529
+ self.parsers["typescript"] = self.parsers["javascript"]
530
+ logger.debug("Parser loaded: typescript (fallback to javascript)")
531
+
532
+ # 预编译 queries —— 只对已加载的语言
533
+ self._precompile_queries()
534
+
535
def _precompile_queries(self) -> None:
    """Compile every QUERIES entry for each language that has a parser.

    Needs the ``Query`` class from tree-sitter (>= 0.21).  When it cannot be
    imported a warning is logged and no queries are compiled.  A query that
    fails to compile is logged as a warning and left out; the remaining
    queries are still compiled.
    """
    try:
        from tree_sitter import Query  # type: ignore
    except ImportError:
        logger.warning("tree-sitter Query class not available (requires >=0.21), queries disabled")
        return

    for language_name, query_sources in QUERIES.items():
        if language_name in self.parsers:
            # Register the language even if none of its queries compile.
            compiled = self._queries[language_name] = {}
            parser = self.parsers[language_name]
            for query_type, source in query_sources.items():
                try:
                    compiled[query_type] = Query(parser.language, source)
                except Exception as exc:
                    logger.warning(f"Query compile failed [{language_name}/{query_type}]: {exc}")
557
+
558
+ # ── 公开接口 ────────────────────────────────────────────────────────────────
559
+
560
+ def parse(self, content: bytes, lang: str) -> Any | None:
561
+ parser = self.parsers.get(lang)
562
+ if not parser:
563
+ return None
564
+
565
+ # 内容大小限制(防止内存溢出)
566
+ MAX_PARSE_SIZE = 10 * 1024 * 1024 # 10MB
567
+ if len(content) > MAX_PARSE_SIZE:
568
+ logger.warning(f"File too large for parsing ({len(content)} bytes > {MAX_PARSE_SIZE}), skipping")
569
+ return None
570
+
571
+ # 检测异常内容模式(可能导致解析器崩溃)
572
+ try:
573
+ # 检查是否包含可能导致解析器栈溢出的极端嵌套模式
574
+ content_str = content.decode('utf-8', errors='ignore')
575
+ # 检测极端深度的括号嵌套(可能导致递归溢出)
576
+ max_nesting = 0
577
+ current_nesting = 0
578
+ for char in content_str[:100000]: # 只检查前 100KB
579
+ if char in '({[<':
580
+ current_nesting += 1
581
+ max_nesting = max(max_nesting, current_nesting)
582
+ elif char in ')}]>':
583
+ current_nesting -= 1
584
+ # 如果嵌套深度超过 1000,可能触发解析器栈溢出
585
+ if max_nesting > 1000:
586
+ logger.warning(f"Extreme nesting detected ({max_nesting} levels), skipping file to prevent parser crash")
587
+ return None
588
+ except Exception:
589
+ pass # 解码失败继续尝试解析
590
+
591
+ try:
592
+ return parser.parse(content)
593
+ except RecursionError:
594
+ logger.warning(f"Parser recursion limit exceeded for {lang}, skipping file")
595
+ return None
596
+ except MemoryError:
597
+ logger.warning(f"Parser out of memory for {lang}, skipping file")
598
+ return None
599
+ except Exception as e:
600
+ logger.debug(f"Parse error [{lang}]: {e}")
601
+ return None
602
+
603
+ def extract_symbols(self, tree: Any, lang: str, file: str, content: bytes) -> list[Symbol]:
604
+ """从 AST 提取函数 / 类等符号定义。"""
605
+ if lang == "html":
606
+ return self._extract_html_symbols(tree, file)
607
+ if lang == "css":
608
+ return self._extract_css_symbols(tree, file)
609
+ if lang == "json":
610
+ return self._extract_json_symbols(tree, file)
611
+
612
+ symbols_by_id: dict[str, Symbol] = {}
613
+ root = tree.root_node
614
+
615
+ for qtype in ("function", "class"):
616
+ query = self._queries.get(lang, {}).get(qtype)
617
+ if not query:
618
+ continue
619
+
620
+ captures = self._run_query(query, root)
621
+ name_nodes: list[Any] = []
622
+ def_nodes: list[tuple[Any, str]] = []
623
+
624
+ for cap_name, node in captures:
625
+ if cap_name == "name":
626
+ name_nodes.append(node)
627
+ elif "definition" in cap_name or "export" in cap_name:
628
+ def_nodes.append((node, cap_name))
629
+
630
+ for name_node in name_nodes:
631
+ matching_defs = [
632
+ (def_node, def_cap)
633
+ for def_node, def_cap in def_nodes
634
+ if self._within(name_node, def_node)
635
+ ]
636
+ matching_defs.sort(
637
+ key=lambda item: (
638
+ (item[0].end_point[0] - item[0].start_point[0], item[0].end_point[1] - item[0].start_point[1]),
639
+ item[0].start_point[0],
640
+ item[0].start_point[1],
641
+ )
642
+ )
643
+ for def_node, def_cap in matching_defs:
644
+ kind = def_cap.split(".")[-1] if "." in def_cap else def_cap
645
+ vis = "exported" if "export" in def_cap else "public"
646
+ name = self._text(name_node)
647
+ if not name:
648
+ break
649
+ # Python: _ 前缀视为 private
650
+ if lang == "python" and name.startswith("_") and not name.startswith("__"):
651
+ vis = "private"
652
+ sym_id = f"{file}::{name}::{name_node.start_point[0] + 1}"
653
+ symbols_by_id[sym_id] = Symbol(
654
+ id=sym_id,
655
+ name=name,
656
+ kind=kind,
657
+ file=file,
658
+ line=name_node.start_point[0] + 1,
659
+ end_line=def_node.end_point[0] + 1,
660
+ col=name_node.start_point[1],
661
+ visibility=vis,
662
+ docstring=self._docstring(def_node, lang),
663
+ signature=self._signature(def_node, lang),
664
+ )
665
+ break
666
+
667
+ for symbol in self._extract_exported_function_expression_symbols(tree, lang, file):
668
+ symbols_by_id.setdefault(symbol.id, symbol)
669
+
670
+ for symbol in self._extract_object_literal_method_symbols(tree, lang, file):
671
+ symbols_by_id.setdefault(symbol.id, symbol)
672
+
673
+ for symbol in self._extract_anonymous_symbols(tree, lang, file):
674
+ symbols_by_id.setdefault(symbol.id, symbol)
675
+
676
+ return sorted(
677
+ symbols_by_id.values(),
678
+ key=lambda symbol: (
679
+ symbol.file,
680
+ symbol.line,
681
+ symbol.end_line,
682
+ symbol.col,
683
+ symbol.name,
684
+ symbol.kind,
685
+ ),
686
+ )
687
+
688
+ def _extract_exported_function_expression_symbols(self, tree: Any, lang: str, file: str) -> list[Symbol]:
689
+ if lang not in ("javascript", "typescript", "tsx"):
690
+ return []
691
+ symbols_by_id: dict[str, Symbol] = {}
692
+ for node in self._walk_tree(tree.root_node):
693
+ if node.type not in {"function_expression", "arrow_function"}:
694
+ continue
695
+ if not self._is_exported_anonymous_expression(node):
696
+ continue
697
+ explicit_name = self._declaration_primary_name(node)
698
+ if explicit_name:
699
+ name = explicit_name
700
+ elif self._is_export_default(node):
701
+ name = self._export_default_name(node)
702
+ else:
703
+ name = self._anonymous_symbol_name(node)
704
+ line = node.start_point[0] + 1
705
+ symbol_id = f"{file}::{name}::{line}"
706
+ symbols_by_id[symbol_id] = Symbol(
707
+ id=symbol_id,
708
+ name=name,
709
+ kind="anonymous_function",
710
+ file=file,
711
+ line=line,
712
+ end_line=node.end_point[0] + 1,
713
+ col=node.start_point[1],
714
+ visibility="private",
715
+ signature=self._signature(node, lang),
716
+ )
717
+ return sorted(symbols_by_id.values(), key=lambda symbol: (symbol.file, symbol.line, symbol.col, symbol.name))
718
+
719
+ def _is_export_default(self, node: Any) -> bool:
720
+ current = getattr(node, "parent", None)
721
+ depth = 0
722
+ while current is not None and depth < 4:
723
+ if current.type == "export_statement":
724
+ return self._first_child_of_type(current, "default") is not None
725
+ current = getattr(current, "parent", None)
726
+ depth += 1
727
+ return False
728
+
729
+ def _extract_object_literal_method_symbols(self, tree: Any, lang: str, file: str) -> list[Symbol]:
730
+ if lang not in ("javascript", "typescript", "tsx"):
731
+ return []
732
+ symbols_by_id: dict[str, Symbol] = {}
733
+ for node in self._walk_tree(tree.root_node):
734
+ if node.type != "pair":
735
+ continue
736
+ value_node = node.child_by_field_name("value")
737
+ if value_node is None or value_node.type not in {"arrow_function", "function_expression"}:
738
+ continue
739
+ key_node = node.child_by_field_name("key")
740
+ if key_node is None:
741
+ for child in node.children:
742
+ if child.type in {"property_identifier", "identifier", "string"}:
743
+ key_node = child
744
+ break
745
+ name = self._identifier_text(key_node) or (self._string_literal_value(key_node) if key_node and key_node.type == "string" else "")
746
+ if not name:
747
+ continue
748
+ line = key_node.start_point[0] + 1
749
+ symbol_id = f"{file}::{name}::{line}"
750
+ symbols_by_id[symbol_id] = Symbol(
751
+ id=symbol_id,
752
+ name=name,
753
+ kind="method",
754
+ file=file,
755
+ line=line,
756
+ end_line=value_node.end_point[0] + 1,
757
+ col=key_node.start_point[1],
758
+ visibility="public",
759
+ signature=self._signature(value_node, lang),
760
+ )
761
+ return sorted(symbols_by_id.values(), key=lambda symbol: (symbol.file, symbol.line, symbol.col, symbol.name))
762
+
763
+ def _extract_anonymous_symbols(self, tree: Any, lang: str, file: str) -> list[Symbol]:
764
+ if lang not in ("javascript", "typescript", "tsx"):
765
+ return []
766
+
767
+ anonymous_symbols: dict[str, Symbol] = {}
768
+ for node in self._walk_tree(tree.root_node):
769
+ if node.type not in {"arrow_function", "function_expression"}:
770
+ continue
771
+ if self._has_named_owner(node) and not self._is_exported_anonymous_expression(node):
772
+ continue
773
+ if node.end_point[0] <= node.start_point[0] and not self._is_exported_anonymous_expression(node):
774
+ continue
775
+
776
+ explicit_name = self._declaration_primary_name(node)
777
+ if explicit_name is not None and not self._is_exported_anonymous_expression(node):
778
+ continue
779
+ line = node.start_point[0] + 1
780
+
781
+ # 尝试从上下文推断更有意义的名字
782
+ name = explicit_name or self._contextual_anonymous_name(node)
783
+
784
+ symbol_id = f"{file}::{name}::{line}"
785
+ anonymous_symbols[symbol_id] = Symbol(
786
+ id=symbol_id,
787
+ name=name,
788
+ kind="anonymous_function",
789
+ file=file,
790
+ line=line,
791
+ end_line=node.end_point[0] + 1,
792
+ col=node.start_point[1],
793
+ visibility="private",
794
+ signature=self._signature(node, lang),
795
+ )
796
+
797
+ return list(anonymous_symbols.values())
798
+
799
+ def _contextual_anonymous_name(self, node: Any) -> str:
800
+ """从父节点上下文推断匿名函数名(JSX handler / Hook callback 等)。"""
801
+ parent = getattr(node, "parent", None)
802
+ if parent is None:
803
+ return self._anonymous_symbol_name(node)
804
+
805
+ # JSX 属性: onClick={() => ...} → onClick_handler@L24
806
+ if parent.type == "jsx_expression":
807
+ grandparent = getattr(parent, "parent", None)
808
+ if grandparent is not None and grandparent.type == "jsx_attribute":
809
+ prop_name = ""
810
+ for child in grandparent.children:
811
+ if child.type == "property_identifier":
812
+ prop_name = self._text(child)
813
+ break
814
+ if prop_name:
815
+ return f"<{prop_name}_handler@{node.start_point[0] + 1}>"
816
+
817
+ # 调用参数: useEffect(() => ...) → useEffect_callback@L24
818
+ if parent.type == "arguments":
819
+ grandparent = getattr(parent, "parent", None)
820
+ if grandparent is not None and grandparent.type == "call_expression":
821
+ func_node = grandparent.child_by_field_name("function")
822
+ if func_node is not None:
823
+ func_name = self._text(func_node)
824
+ if func_name and len(func_name) <= 40:
825
+ return f"<{func_name}_callback@{node.start_point[0] + 1}>"
826
+
827
+ # 数组方法回调: arr.map(() => ...) → map_callback@L24
828
+ if parent.type == "arguments":
829
+ grandparent = getattr(parent, "parent", None)
830
+ if grandparent is not None and grandparent.type == "call_expression":
831
+ func_node = grandparent.child_by_field_name("function")
832
+ if func_node is not None and func_node.type == "member_expression":
833
+ prop_node = func_node.child_by_field_name("property")
834
+ if prop_node is not None:
835
+ method_name = self._text(prop_node)
836
+ if method_name in {"map", "filter", "reduce", "forEach", "find", "some", "every", "sort", "flatMap"}:
837
+ return f"<{method_name}_callback@{node.start_point[0] + 1}>"
838
+
839
+ return self._anonymous_symbol_name(node)
840
+
841
+ def _is_exported_anonymous_expression(self, node: Any) -> bool:
842
+ current = getattr(node, "parent", None)
843
+ depth = 0
844
+ while current is not None and depth < 4:
845
+ if current.type == "export_statement" and self._first_child_of_type(current, "default") is not None:
846
+ return True
847
+ if current.type == "assignment_expression":
848
+ left_node = current.child_by_field_name("left")
849
+ if left_node is not None and self._commonjs_export_target(left_node) is not None:
850
+ return True
851
+ current = getattr(current, "parent", None)
852
+ depth += 1
853
+ return False
854
+
855
+ def _has_named_owner(self, node: Any) -> bool:
856
+ current = getattr(node, "parent", None)
857
+ depth = 0
858
+ while current is not None and depth < 4:
859
+ if current.type in {"function_declaration", "method_definition"}:
860
+ return True
861
+ if current.type == "pair":
862
+ value_node = current.child_by_field_name("value")
863
+ key_node = current.child_by_field_name("key")
864
+ if value_node is node and key_node is not None:
865
+ return True
866
+ if current.type == "variable_declarator":
867
+ for child in current.children:
868
+ if child.type == "identifier":
869
+ return True
870
+ current = getattr(current, "parent", None)
871
+ depth += 1
872
+ return False
873
+
874
def extract_imports(self, tree: Any, lang: str) -> list[tuple[str, int]]:
    """Return the unique ``(module_text, line)`` pairs captured by the import query.

    Lines are 1-based. An empty list is returned when ``lang`` has no import
    query. The result is ordered by line, then by text.
    """
    import_query = self._queries.get(lang, {}).get("import")
    if not import_query:
        return []

    captures = self._run_query(import_query, tree.root_node)
    found: set[tuple[str, int]] = set()

    if lang != "rust":
        # JS-family queries capture several things; only the module source counts.
        source_only = lang in ("javascript", "typescript", "tsx")
        for capture, node in captures:
            if source_only and capture != "source":
                continue
            module_text = self._text(node).strip("\"'")
            if module_text:
                found.add((module_text, node.start_point[0] + 1))
    else:
        # Rust needs special handling: on a given line a ``path`` capture
        # (the module) takes precedence over any ``name`` captures.
        path_at: dict[int, str] = {}
        names_at: dict[int, list[str]] = {}
        for capture, node in captures:
            row = node.start_point[0] + 1
            if capture == "path":
                path_at[row] = self._text(node)
            elif capture == "name":
                names_at.setdefault(row, []).append(self._text(node))
        for row, names in names_at.items():
            if row not in path_at:
                found.update((name, row) for name in names)
        found.update((path, row) for row, path in path_at.items())

    return sorted(found, key=lambda pair: (pair[1], pair[0]))
908
+
909
+ @staticmethod
910
+ def _call_reference_kind(node: Any) -> str:
911
+ parent = getattr(node, "parent", None)
912
+ while parent is not None:
913
+ if parent.type in {"call_expression", "call"}:
914
+ function_node = parent.child_by_field_name("function")
915
+ if function_node is not None and function_node.type in {"member_expression", "field_expression", "selector_expression", "attribute"}:
916
+ return "member"
917
+ return "direct"
918
+ parent = getattr(parent, "parent", None)
919
+ return "direct"
920
+
921
def _extract_html_symbols(self, tree: Any, file: str) -> list[Symbol]:
    """Build one ``element`` symbol per HTML element that has a named start tag.

    Symbols are named ``<tag>``; repeated ``(name, line)`` pairs get a ``#N``
    suffix so that ids stay unique. The result is sorted by file, line, column
    and name.
    """
    collected: dict[str, Symbol] = {}
    occurrences: dict[tuple[str, int], int] = {}
    for element in self._walk_tree(tree.root_node):
        if element.type != "element":
            continue
        opening = self._first_child_of_type(element, "start_tag")
        if opening is None:
            continue
        tag_node = next((part for part in opening.children if part.type == "tag_name"), None)
        tag = self._text(tag_node) if tag_node is not None else None
        if not tag:
            continue

        row = element.start_point[0] + 1
        label = f"<{tag}>"
        slot = (label, row)
        count = occurrences.get(slot, 0) + 1
        occurrences[slot] = count
        if count > 1:
            label = f"{label}#{count}"

        ident = f"{file}::{label}::{row}"
        collected[ident] = Symbol(
            id=ident,
            name=label,
            kind="element",
            file=file,
            line=row,
            end_line=element.end_point[0] + 1,
            col=element.start_point[1],
            visibility="public",
            signature=label,
        )

    def order(symbol: Symbol) -> tuple:
        return (symbol.file, symbol.line, symbol.col, symbol.name)

    return sorted(collected.values(), key=order)
956
+
957
def _extract_css_symbols(self, tree: Any, file: str) -> list[Symbol]:
    """Build one symbol per CSS selector node found in the tree.

    Nodes of type ``class_selector``, ``id_selector``, ``tag_name`` and
    ``nesting_selector`` are considered. The symbol kind is ``class_selector``
    for text starting with ``.``, ``id_selector`` for ``#``, otherwise
    ``selector``. Repeated ``(text, line)`` pairs get a ``#N`` name suffix.
    """
    wanted_types = {"class_selector", "id_selector", "tag_name", "nesting_selector"}
    collected: dict[str, Symbol] = {}
    occurrences: dict[tuple[str, int], int] = {}
    for candidate in self._walk_tree(tree.root_node):
        if candidate.type not in wanted_types:
            continue
        selector = self._text(candidate).strip()
        if not selector:
            continue

        row = candidate.start_point[0] + 1
        if selector.startswith("."):
            kind = "class_selector"
        elif selector.startswith("#"):
            kind = "id_selector"
        else:
            kind = "selector"

        slot = (selector, row)
        count = occurrences.get(slot, 0) + 1
        occurrences[slot] = count
        label = selector if count == 1 else f"{selector}#{count}"

        ident = f"{file}::{label}::{row}"
        collected[ident] = Symbol(
            id=ident,
            name=label,
            kind=kind,
            file=file,
            line=row,
            end_line=candidate.end_point[0] + 1,
            col=candidate.start_point[1],
            visibility="public",
            signature=selector,
        )

    def order(symbol: Symbol) -> tuple:
        return (symbol.file, symbol.line, symbol.col, symbol.name)

    return sorted(collected.values(), key=order)
989
+
990
def _extract_json_symbols(self, tree: Any, file: str) -> list[Symbol]:
    """Build one ``json_key`` symbol per ``pair`` node that has a non-empty key.

    The key text is unquoted via ``_string_literal_value``. Repeated
    ``(key, line)`` pairs get a ``#N`` name suffix; the signature is always the
    double-quoted key.
    """
    collected: dict[str, Symbol] = {}
    occurrences: dict[tuple[str, int], int] = {}
    for entry in self._walk_tree(tree.root_node):
        if entry.type != "pair":
            continue
        key_node = entry.child_by_field_name("key")
        if key_node is None:
            continue
        key_name = self._string_literal_value(key_node)
        if not key_name:
            continue

        row = entry.start_point[0] + 1
        slot = (key_name, row)
        count = occurrences.get(slot, 0) + 1
        occurrences[slot] = count
        label = key_name if count == 1 else f"{key_name}#{count}"

        ident = f"{file}::{label}::{row}"
        collected[ident] = Symbol(
            id=ident,
            name=label,
            kind="json_key",
            file=file,
            line=row,
            end_line=entry.end_point[0] + 1,
            col=entry.start_point[1],
            visibility="public",
            signature=f'"{key_name}"',
        )

    def order(symbol: Symbol) -> tuple:
        return (symbol.file, symbol.line, symbol.col, symbol.name)

    return sorted(collected.values(), key=order)
1019
+
1020
def extract_js_ts_import_bindings(
    self,
    content: bytes,
    lang: str,
    tree: Any | None = None,
) -> list[JSImportBinding]:
    """Extract JS/TS import bindings.

    ES ``import`` statements are read from the root's direct children only,
    while CommonJS ``require`` declarators are searched throughout the tree.
    ``content`` is parsed only when no ``tree`` is supplied. Returns an empty
    list for other languages or when no tree is available.
    """
    if lang not in ("javascript", "typescript", "tsx"):
        return []
    ast = tree or self.parse(content, lang)
    if not ast:
        return []

    root = ast.root_node
    collected: dict[tuple[str, str, str, int, str], JSImportBinding] = {}
    for statement in root.children:
        if statement.type == "import_statement":
            self._collect_es_import_bindings(statement, collected)
    for candidate in self._walk_tree(root):
        if candidate.type == "variable_declarator":
            self._collect_commonjs_import_bindings(candidate, collected)

    def order(binding: JSImportBinding) -> tuple:
        return (binding.line, binding.module, binding.local_name, binding.imported_name, binding.kind)

    return sorted(collected.values(), key=order)
1043
+
1044
def extract_js_ts_export_bindings(
    self,
    content: bytes,
    lang: str,
    tree: Any | None = None,
) -> list[JSExportBinding]:
    """Extract JS/TS export bindings.

    ES ``export`` statements are read from the root's direct children only,
    while CommonJS export assignments are searched throughout the tree.
    ``content`` is parsed only when no ``tree`` is supplied. Returns an empty
    list for other languages or when no tree is available.
    """
    if lang not in ("javascript", "typescript", "tsx"):
        return []
    ast = tree or self.parse(content, lang)
    if not ast:
        return []

    collected: dict[tuple[str, str | None, str | None, int, str], JSExportBinding] = {}

    def add_binding(exported_name: str, source_name: str | None, module: str | None, line: int, kind: str) -> None:
        # Keyed on every field, so identical bindings collapse into one entry.
        collected[(exported_name, source_name, module, line, kind)] = JSExportBinding(
            exported_name=exported_name,
            source_name=source_name,
            module=module,
            line=line,
            kind=kind,
        )

    root = ast.root_node
    for statement in root.children:
        if statement.type == "export_statement":
            self._collect_es_export_bindings(statement, add_binding)
    for candidate in self._walk_tree(root):
        if candidate.type == "assignment_expression":
            self._collect_commonjs_export_bindings(candidate, add_binding)

    def order(binding: JSExportBinding) -> tuple:
        return (
            binding.line,
            binding.exported_name,
            binding.source_name or "",
            binding.module or "",
            binding.kind,
        )

    return sorted(collected.values(), key=order)
1084
+
1085
+ def _collect_es_import_bindings(
1086
+ self,
1087
+ node: Any,
1088
+ bindings: dict[tuple[str, str, str, int, str], JSImportBinding],
1089
+ ) -> None:
1090
+ module = self._module_literal_from_statement(node)
1091
+ if not module:
1092
+ return
1093
+ line = node.start_point[0] + 1
1094
+ import_clause = self._first_child_of_type(node, "import_clause")
1095
+ if not import_clause:
1096
+ return
1097
+ for child in import_clause.children:
1098
+ if child.type == "identifier":
1099
+ self._add_import_binding(bindings, child.text.decode("utf-8"), "default", module, line, "default")
1100
+ elif child.type == "named_imports":
1101
+ for specifier in child.children:
1102
+ if specifier.type != "import_specifier":
1103
+ continue
1104
+ source_node = specifier.child_by_field_name("name")
1105
+ alias_node = specifier.child_by_field_name("alias")
1106
+ source_name = self._identifier_text(source_node)
1107
+ local_name = self._identifier_text(alias_node) or source_name
1108
+ if source_name and local_name:
1109
+ self._add_import_binding(bindings, local_name, source_name, module, line, "named")
1110
+ elif child.type == "namespace_import":
1111
+ local_name = self._last_identifier(child)
1112
+ if local_name:
1113
+ self._add_import_binding(bindings, local_name, "*", module, line, "namespace")
1114
+
1115
+ def _collect_commonjs_import_bindings(
1116
+ self,
1117
+ node: Any,
1118
+ bindings: dict[tuple[str, str, str, int, str], JSImportBinding],
1119
+ ) -> None:
1120
+ value_node = node.child_by_field_name("value")
1121
+ module = self._require_call_module(value_node)
1122
+ if not module:
1123
+ return
1124
+ name_node = node.child_by_field_name("name")
1125
+ if not name_node:
1126
+ return
1127
+ line = node.start_point[0] + 1
1128
+ if name_node.type == "identifier":
1129
+ self._add_import_binding(bindings, name_node.text.decode("utf-8"), "default", module, line, "default")
1130
+ return
1131
+ if name_node.type != "object_pattern":
1132
+ return
1133
+ for child in name_node.children:
1134
+ if child.type in {"shorthand_property_identifier_pattern", "identifier"}:
1135
+ name = child.text.decode("utf-8")
1136
+ self._add_import_binding(bindings, name, name, module, line, "named")
1137
+ elif child.type == "pair_pattern":
1138
+ source_name = self._identifier_text(child.child_by_field_name("key"))
1139
+ local_name = self._identifier_text(child.child_by_field_name("value"))
1140
+ if source_name and local_name:
1141
+ self._add_import_binding(bindings, local_name, source_name, module, line, "named")
1142
+
1143
+ def _collect_es_export_bindings(
1144
+ self,
1145
+ node: Any,
1146
+ add_binding: Any,
1147
+ ) -> None:
1148
+ line = node.start_point[0] + 1
1149
+ module = self._module_literal_from_statement(node)
1150
+ has_default = self._first_child_of_type(node, "default") is not None
1151
+ namespace_export = self._first_child_of_type(node, "namespace_export")
1152
+ export_clause = self._first_child_of_type(node, "export_clause")
1153
+ declaration = self._export_declaration_node(node)
1154
+
1155
+ if namespace_export is not None and module:
1156
+ exported_name = self._last_identifier(namespace_export)
1157
+ if exported_name:
1158
+ add_binding(exported_name, "*", module, line, "namespace")
1159
+ return
1160
+
1161
+ if self._first_child_of_type(node, "*") is not None and module:
1162
+ add_binding("*", "*", module, line, "wildcard")
1163
+ return
1164
+
1165
+ if export_clause is not None:
1166
+ kind = "reexport" if module else "local"
1167
+ for specifier in export_clause.children:
1168
+ if specifier.type != "export_specifier":
1169
+ continue
1170
+ source_name = self._identifier_text(specifier.child_by_field_name("name"))
1171
+ exported_name = self._identifier_text(specifier.child_by_field_name("alias")) or source_name
1172
+ if source_name and exported_name:
1173
+ add_binding(exported_name, source_name, module, line, kind)
1174
+ return
1175
+
1176
+ if has_default:
1177
+ source_name = self._export_default_source_name(node, declaration)
1178
+ if source_name:
1179
+ add_binding("default", source_name, None, line, "local")
1180
+ return
1181
+
1182
+ for exported_name in self._exported_names_from_declaration(declaration):
1183
+ add_binding(exported_name, exported_name, None, line, "local")
1184
+
1185
+ def _collect_commonjs_export_bindings(
1186
+ self,
1187
+ node: Any,
1188
+ add_binding: Any,
1189
+ ) -> None:
1190
+ target_node = node.child_by_field_name("left")
1191
+ value_node = node.child_by_field_name("right")
1192
+ if target_node is None or value_node is None or target_node.type != "member_expression":
1193
+ return
1194
+ export_target = self._commonjs_export_target(target_node)
1195
+ if export_target is None:
1196
+ return
1197
+ line = node.start_point[0] + 1
1198
+ if export_target == "default":
1199
+ if value_node.type == "object":
1200
+ for child in value_node.children:
1201
+ if child.type == "shorthand_property_identifier":
1202
+ name = child.text.decode("utf-8")
1203
+ add_binding(name, name, None, line, "local")
1204
+ elif child.type == "pair":
1205
+ exported_name = self._identifier_text(child.child_by_field_name("key"))
1206
+ source_name = self._identifier_text(child.child_by_field_name("value")) or self._expression_binding_name(child.child_by_field_name("value"))
1207
+ if exported_name and source_name:
1208
+ add_binding(exported_name, source_name, None, line, "local")
1209
+ return
1210
+ source_name = self._expression_binding_name(value_node)
1211
+ if source_name:
1212
+ add_binding("default", source_name, None, line, "local")
1213
+ return
1214
+ source_name = self._expression_binding_name(value_node)
1215
+ if source_name:
1216
+ add_binding(export_target, source_name, None, line, "local")
1217
+
1218
def _add_import_binding(
    self,
    bindings: dict[tuple[str, str, str, int, str], JSImportBinding],
    local_name: str,
    imported_name: str,
    module: str,
    line: int,
    kind: str,
) -> None:
    """Store a ``JSImportBinding`` in ``bindings``, keyed by the tuple of all its fields.

    An identical binding recorded twice therefore occupies a single entry.
    """
    record = JSImportBinding(local_name, imported_name, module, line, kind)
    bindings[(local_name, imported_name, module, line, kind)] = record
1229
+
1230
+ def _module_literal_from_statement(self, node: Any) -> str | None:
1231
+ for child in node.children:
1232
+ if child.type == "string":
1233
+ return self._string_literal_value(child)
1234
+ return None
1235
+
1236
+ def _require_call_module(self, node: Any | None) -> str | None:
1237
+ if node is None or node.type != "call_expression":
1238
+ return None
1239
+ function_node = node.child_by_field_name("function")
1240
+ arguments_node = node.child_by_field_name("arguments")
1241
+ if function_node is None or function_node.type != "identifier":
1242
+ return None
1243
+ if function_node.text.decode("utf-8") != "require" or arguments_node is None:
1244
+ return None
1245
+ for child in arguments_node.children:
1246
+ if child.type == "string":
1247
+ return self._string_literal_value(child)
1248
+ return None
1249
+
1250
+ def _commonjs_export_target(self, node: Any) -> str | None:
1251
+ object_node = node.child_by_field_name("object")
1252
+ property_node = node.child_by_field_name("property")
1253
+ if object_node is None or property_node is None:
1254
+ return None
1255
+ if object_node.type == "identifier" and object_node.text.decode("utf-8") == "exports":
1256
+ return property_node.text.decode("utf-8")
1257
+ if object_node.type == "member_expression":
1258
+ inner_object = object_node.child_by_field_name("object")
1259
+ inner_property = object_node.child_by_field_name("property")
1260
+ if (
1261
+ inner_object is not None
1262
+ and inner_property is not None
1263
+ and inner_object.type == "identifier"
1264
+ and inner_object.text.decode("utf-8") == "module"
1265
+ and inner_property.type == "property_identifier"
1266
+ and inner_property.text.decode("utf-8") == "exports"
1267
+ ):
1268
+ return property_node.text.decode("utf-8")
1269
+ if (
1270
+ object_node.type == "identifier"
1271
+ and property_node.type == "property_identifier"
1272
+ and object_node.text.decode("utf-8") == "module"
1273
+ and property_node.text.decode("utf-8") == "exports"
1274
+ ):
1275
+ return "default"
1276
+ return None
1277
+
1278
+ def _export_declaration_node(self, node: Any) -> Any | None:
1279
+ for child in node.children:
1280
+ if child.type in {
1281
+ "function_declaration",
1282
+ "class_declaration",
1283
+ "lexical_declaration",
1284
+ "interface_declaration",
1285
+ "type_alias_declaration",
1286
+ "enum_declaration",
1287
+ }:
1288
+ return child
1289
+ return None
1290
+
1291
+ def _export_default_source_name(self, node: Any, declaration: Any | None) -> str | None:
1292
+ if declaration is not None:
1293
+ return self._declaration_primary_name(declaration)
1294
+ for child in node.children:
1295
+ if child.type in {"export", "default", ";"}:
1296
+ continue
1297
+ source_name = self._expression_binding_name(child)
1298
+ if source_name:
1299
+ return source_name
1300
+ return None
1301
+
1302
+ def _exported_names_from_declaration(self, declaration: Any | None) -> list[str]:
1303
+ if declaration is None:
1304
+ return []
1305
+ if declaration.type == "lexical_declaration":
1306
+ names: list[str] = []
1307
+ for child in declaration.children:
1308
+ if child.type != "variable_declarator":
1309
+ continue
1310
+ name_node = child.child_by_field_name("name")
1311
+ if name_node is not None and name_node.type == "identifier":
1312
+ names.append(name_node.text.decode("utf-8"))
1313
+ return names
1314
+ primary_name = self._declaration_primary_name(declaration)
1315
+ return [primary_name] if primary_name else []
1316
+
1317
+ def _declaration_primary_name(self, declaration: Any) -> str | None:
1318
+ for field_name in ("name",):
1319
+ target = declaration.child_by_field_name(field_name)
1320
+ if target is not None:
1321
+ return target.text.decode("utf-8")
1322
+ for child in declaration.children:
1323
+ if child.type in {"identifier", "type_identifier"}:
1324
+ return child.text.decode("utf-8")
1325
+ return None
1326
+
1327
+ def _expression_binding_name(self, node: Any | None) -> str | None:
1328
+ if node is None:
1329
+ return None
1330
+ if node.type in {"identifier", "property_identifier", "type_identifier"}:
1331
+ return node.text.decode("utf-8")
1332
+ if node.type in {"function_declaration", "class_declaration", "function_expression"}:
1333
+ return self._declaration_primary_name(node) or self._anonymous_symbol_name(node)
1334
+ if node.type == "arrow_function":
1335
+ return self._anonymous_symbol_name(node)
1336
+ return None
1337
+
1338
+ @staticmethod
1339
+ def _anonymous_symbol_name(node: Any) -> str:
1340
+ return f"<anonymous@{node.start_point[0] + 1}>"
1341
+
1342
+ @staticmethod
1343
+ def _export_default_name(node: Any) -> str:
1344
+ """为 export default 无名字的函数/类生成可读名。"""
1345
+ line = node.start_point[0] + 1
1346
+ kind = node.type.replace("_expression", "").replace("_declaration", "")
1347
+ return f"<default_export_{kind}@{line}>"
1348
+
1349
+ def _string_literal_value(self, node: Any) -> str:
1350
+ return self._text(node).strip("\"'`")
1351
+
1352
+ def _first_child_of_type(self, node: Any, node_type: str) -> Any | None:
1353
+ for child in node.children:
1354
+ if child.type == node_type:
1355
+ return child
1356
+ return None
1357
+
1358
+ def _last_identifier(self, node: Any) -> str | None:
1359
+ identifiers = [
1360
+ child.text.decode("utf-8")
1361
+ for child in node.children
1362
+ if child.type in {"identifier", "property_identifier", "type_identifier"}
1363
+ ]
1364
+ return identifiers[-1] if identifiers else None
1365
+
1366
+ def _identifier_text(self, node: Any | None) -> str | None:
1367
+ if node is None:
1368
+ return None
1369
+ if node.type in {"identifier", "property_identifier", "type_identifier", "shorthand_property_identifier", "shorthand_property_identifier_pattern"}:
1370
+ return node.text.decode("utf-8")
1371
+ return None
1372
+
1373
+ def _walk_tree(self, root: Any) -> list[Any]:
1374
+ nodes = [root]
1375
+ result: list[Any] = []
1376
+ while nodes:
1377
+ current = nodes.pop()
1378
+ result.append(current)
1379
+ nodes.extend(reversed(current.children))
1380
+ return result
1381
+
1382
def extract_calls(self, tree: Any, lang: str) -> list[tuple[str, int, str]]:
    """Return the unique ``(callee_text, line, reference_kind)`` triples of the call query.

    Only ``name`` captures with non-empty text are used. Lines are 1-based and
    the result is ordered by line, name, then kind. An empty list is returned
    when ``lang`` has no call query.
    """
    call_query = self._queries.get(lang, {}).get("call")
    if not call_query:
        return []
    unique: set[tuple[str, int, str]] = set()
    for capture, node in self._run_query(call_query, tree.root_node):
        if capture != "name":
            continue
        callee = self._text(node)
        if not callee:
            continue
        unique.add((callee, node.start_point[0] + 1, self._call_reference_kind(node)))
    return sorted(unique, key=lambda call: (call[1], call[0], call[2]))
1394
+
1395
def extract_http_routes(self, tree: Any, lang: str, file: str) -> list[Any]:
    """Extract HTTP route definitions from the AST.

    Supported frameworks: FastAPI (Python), Express (JS/TS), Axum (Rust).
    The route inventory only reports strictly matched production route
    definitions, so that test DSLs, logging calls and ordinary Array/Option
    style calls are not mistaken for HTTP routes.

    Returns an empty list for files rejected by ``_should_skip_route_file`` and
    for languages without an ``http_route`` query. Routes are sorted by file,
    line, method, path and handler.
    """
    from . import HttpRoute

    if self._should_skip_route_file(file):
        return []

    route_query = self._queries.get(lang, {}).get("http_route")
    if not route_query:
        return []

    candidates = (
        self._http_route_from_captures(match, lang, file)
        for match in self._run_query_matches(route_query, tree.root_node)
    )
    found: list[HttpRoute] = [route for route in candidates if route is not None]
    found.sort(key=lambda route: (route.file, route.line, route.method, route.path, route.handler))
    return found
1421
+
1422
+ def _http_route_from_captures(self, captures: dict[str, list[Any]], lang: str, file: str) -> Any | None:
1423
+ from . import HttpRoute
1424
+
1425
+ path_node = self._first_capture(captures, "path")
1426
+ handler_node = self._first_capture(captures, "handler")
1427
+ if path_node is None or handler_node is None:
1428
+ return None
1429
+
1430
+ method_node = self._first_capture(captures, "method") or self._first_capture(captures, "http_method")
1431
+ method = (self._text(method_node) if method_node is not None else "").lower()
1432
+ if not method:
1433
+ return None
1434
+
1435
+ if lang == "python":
1436
+ obj = self._text(self._first_capture(captures, "_obj"))
1437
+ if obj not in {"app", "router", "api"} or method not in {"get", "post", "put", "delete", "patch", "head", "options"}:
1438
+ return None
1439
+ framework = "fastapi"
1440
+ elif lang in ("javascript", "typescript", "tsx"):
1441
+ router = self._text(self._first_capture(captures, "_router"))
1442
+ if router not in {"app", "router"} or method not in {"get", "post", "put", "delete", "patch", "use", "all"}:
1443
+ return None
1444
+ if method in {"describe", "test", "it", "expect", "log", "some", "map", "filter", "find", "reduce", "foreach"}:
1445
+ return None
1446
+ framework = "express"
1447
+ elif lang == "rust":
1448
+ method_name = self._text(self._first_capture(captures, "_method_name"))
1449
+ if method_name != "route" or method not in {"get", "post", "put", "delete", "patch", "head", "options"}:
1450
+ return None
1451
+ if method in {"some", "ok", "err", "is_some", "unwrap", "map", "filter"}:
1452
+ return None
1453
+ framework = "axum"
1454
+ else:
1455
+ return None
1456
+
1457
+ path = self._string_literal_value(path_node)
1458
+ if not path:
1459
+ return None
1460
+ handler_name = self._route_handler_name(handler_node)
1461
+ if not handler_name:
1462
+ return None
1463
+
1464
+ return HttpRoute(
1465
+ method=method.upper(),
1466
+ path=path,
1467
+ handler=handler_name,
1468
+ file=file,
1469
+ line=handler_node.start_point[0] + 1,
1470
+ framework=framework,
1471
+ )
1472
+
1473
def _run_query_matches(self, query: Any, root: Any) -> list[dict[str, list[Any]]]:
    """Return captures grouped per tree-sitter match.

    Grouping by match avoids stitching a route together from captures that
    belong to different matches. Each result maps a capture name to a list of
    nodes. When the match API is unavailable, fails, or yields nothing usable,
    the flat capture list from ``_run_query`` is grouped by start line instead.
    """
    try:
        from tree_sitter import QueryCursor  # type: ignore
        cursor = QueryCursor(query)
        grouped: list[dict[str, list[Any]]] = []
        if hasattr(cursor, "matches"):
            for entry in cursor.matches(root):
                # Expected shape: (pattern_index, {capture_name: node(s)}).
                if not (isinstance(entry, (list, tuple)) and len(entry) == 2):
                    continue
                capture_map = entry[1]
                if not (isinstance(capture_map, dict) and capture_map):
                    continue
                grouped.append(
                    {
                        label: (hits if isinstance(hits, list) else [hits])
                        for label, hits in capture_map.items()
                    }
                )
        if grouped:
            return grouped
    except Exception as e:
        logger.debug(f"Query match run error: {e}")

    # Compatibility with older runtimes: when only a flat capture list is
    # available, group coarsely by capture start line; callers still validate
    # each group strictly afterwards.
    by_row: dict[int, dict[str, list[Any]]] = {}
    for label, node in self._run_query(query, root):
        by_row.setdefault(node.start_point[0], {}).setdefault(label, []).append(node)
    return [by_row[row] for row in sorted(by_row)]
1502
+
1503
+ @staticmethod
1504
+ def _first_capture(captures: dict[str, list[Any]], name: str) -> Any | None:
1505
+ nodes = captures.get(name) or []
1506
+ return nodes[0] if nodes else None
1507
+
1508
+ @staticmethod
1509
+ def _should_skip_route_file(file: str) -> bool:
1510
+ normalized = file.replace("\\", "/")
1511
+ parts = {part.lower() for part in normalized.split("/")}
1512
+ if parts & {"e2e", "tests", "__tests__"}:
1513
+ return True
1514
+ name = normalized.rsplit("/", 1)[-1].lower()
1515
+ return bool(re.search(r"(_test\.rs|\.(test|spec)\.(js|jsx|ts|tsx|mjs|cjs|mts|cts))$", name))
1516
+
1517
+ def _route_handler_name(self, node: Any) -> str:
1518
+ explicit = self._identifier_text(node)
1519
+ if explicit:
1520
+ return explicit
1521
+ if node.type in {"arrow_function", "function_expression", "lambda"}:
1522
+ return self._anonymous_symbol_name(node)
1523
+ return self._text(node)
1524
+
1525
+ def _parse_import_specifiers(self, spec: str, module: str, line: int) -> list[JSImportBinding]:
1526
+ bindings: list[JSImportBinding] = []
1527
+ remaining = spec.strip()
1528
+ if not remaining:
1529
+ return bindings
1530
+
1531
+ if remaining.startswith("{"):
1532
+ named_text = remaining[1:remaining.rfind("}")]
1533
+ for imported_name, local_name in self._parse_named_clause(named_text):
1534
+ bindings.append(JSImportBinding(local_name, imported_name, module, line, "named"))
1535
+ return bindings
1536
+
1537
+ if remaining.startswith("*"):
1538
+ namespace_match = re.match(r"\*\s+as\s+([A-Za-z_$][\w$]*)", remaining)
1539
+ if namespace_match:
1540
+ bindings.append(
1541
+ JSImportBinding(
1542
+ local_name=namespace_match.group(1),
1543
+ imported_name="*",
1544
+ module=module,
1545
+ line=line,
1546
+ kind="namespace",
1547
+ )
1548
+ )
1549
+ return bindings
1550
+
1551
+ default_part = remaining
1552
+ rest = ""
1553
+ if "," in remaining:
1554
+ default_part, rest = remaining.split(",", 1)
1555
+ default_name = default_part.strip()
1556
+ if default_name and re.fullmatch(r"[A-Za-z_$][\w$]*", default_name):
1557
+ bindings.append(
1558
+ JSImportBinding(
1559
+ local_name=default_name,
1560
+ imported_name="default",
1561
+ module=module,
1562
+ line=line,
1563
+ kind="default",
1564
+ )
1565
+ )
1566
+ rest = rest.strip()
1567
+ if rest.startswith("{") and "}" in rest:
1568
+ named_text = rest[1:rest.rfind("}")]
1569
+ for imported_name, local_name in self._parse_named_clause(named_text):
1570
+ bindings.append(JSImportBinding(local_name, imported_name, module, line, "named"))
1571
+ elif rest.startswith("*"):
1572
+ namespace_match = re.match(r"\*\s+as\s+([A-Za-z_$][\w$]*)", rest)
1573
+ if namespace_match:
1574
+ bindings.append(
1575
+ JSImportBinding(
1576
+ local_name=namespace_match.group(1),
1577
+ imported_name="*",
1578
+ module=module,
1579
+ line=line,
1580
+ kind="namespace",
1581
+ )
1582
+ )
1583
+ return bindings
1584
+
1585
+ def _parse_named_clause(self, text: str) -> list[tuple[str, str]]:
1586
+ pairs: list[tuple[str, str]] = []
1587
+ for raw_item in text.split(","):
1588
+ item = raw_item.strip()
1589
+ if not item:
1590
+ continue
1591
+ item = re.sub(r"^type\s+", "", item)
1592
+ parts = re.split(r"\s+as\s+", item, maxsplit=1)
1593
+ if len(parts) == 2:
1594
+ source_name, exported_name = parts[0].strip(), parts[1].strip()
1595
+ else:
1596
+ source_name = exported_name = item
1597
+ if re.fullmatch(r"[A-Za-z_$][\w$]*", source_name) and re.fullmatch(r"[A-Za-z_$][\w$]*", exported_name):
1598
+ pairs.append((source_name, exported_name))
1599
+ return pairs
1600
+
1601
+ def _parse_commonjs_object_clause(self, text: str) -> list[tuple[str, str]]:
1602
+ pairs: list[tuple[str, str]] = []
1603
+ for raw_item in text.split(","):
1604
+ item = raw_item.strip()
1605
+ if not item:
1606
+ continue
1607
+ parts = [part.strip() for part in item.split(":", 1)]
1608
+ if len(parts) == 2:
1609
+ source_name, local_name = parts
1610
+ else:
1611
+ source_name = local_name = parts[0]
1612
+ if re.fullmatch(r"[A-Za-z_$][\w$]*", source_name) and re.fullmatch(r"[A-Za-z_$][\w$]*", local_name):
1613
+ pairs.append((source_name, local_name))
1614
+ return pairs
1615
+
1616
+ @staticmethod
1617
+ def _line_number(text: str, offset: int) -> int:
1618
+ return text.count("\n", 0, offset) + 1
1619
+
1620
+ # ── 内部辅助 ────────────────────────────────────────────────────────────────
1621
+
1622
def _run_query(self, query: Any, root: Any) -> list[tuple[str, Any]]:
    """
    Run a tree-sitter query and return captures as a uniform list[(cap_name, Node)].

    Requires tree-sitter >= 0.22 (uses QueryCursor). Any failure is logged at
    debug level and results in an empty list.
    """
    try:
        from tree_sitter import QueryCursor  # type: ignore
        raw = QueryCursor(query).captures(root)

        if isinstance(raw, dict):
            # Newer format: dict[cap_name, list[Node]]
            return [
                (label, node)
                for label, hits in raw.items()
                for node in (hits if isinstance(hits, list) else [hits])
            ]
        # Older format: list[(Node, cap_name)]
        return [
            (item[1], item[0])
            for item in raw
            if isinstance(item, (list, tuple)) and len(item) == 2
        ]
    except Exception as e:
        logger.debug(f"Query run error: {e}")
        return []
1650
+
1651
+ @staticmethod
1652
+ def _within(child: Any, parent: Any) -> bool:
1653
+ return (child.start_point >= parent.start_point and
1654
+ child.end_point <= parent.end_point)
1655
+
1656
+ @staticmethod
1657
+ def _text(node: Any) -> str:
1658
+ return node.text.decode("utf-8") if getattr(node, "text", None) else ""
1659
+
1660
+ def _docstring(self, node: Any, lang: str) -> str:
1661
+ if not node:
1662
+ return ""
1663
+ try:
1664
+ if lang == "python":
1665
+ for child in node.children:
1666
+ if child.type == "expression_statement":
1667
+ for sub in child.children:
1668
+ if sub.type == "string":
1669
+ return self._text(sub).strip("\"'` \n")
1670
+ elif lang in ("javascript", "typescript", "go", "rust"):
1671
+ prev = getattr(node, "prev_sibling", None)
1672
+ if prev and "comment" in prev.type:
1673
+ return self._text(prev).lstrip("/* \n").rstrip("*/ \n")
1674
+ except Exception:
1675
+ pass
1676
+ return ""
1677
+
1678
+ def _signature(self, node: Any, lang: str) -> str:
1679
+ if not node:
1680
+ return ""
1681
+ try:
1682
+ first_line = self._text(node).split("\n")[0]
1683
+ patterns = {
1684
+ "python": r"(?:async\s+)?def\s+\w+\s*\([^)]*\)(?:\s*->\s*[^:]+)?",
1685
+ "javascript": r"(?:async\s+)?(?:function\s+\w+|(?:const|let|var)\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)\s*=>)",
1686
+ "typescript": r"(?:async\s+)?(?:function\s+\w+|(?:const|let|var)\s+\w+\s*=\s*(?:async\s*)?\([^)]*\)(?:\s*:\s*\S+)?\s*=>)",
1687
+ "rust": r"(?:pub\s+)?(?:async\s+)?fn\s+\w+(?:<[^>]*>)?\s*\([^)]*\)(?:\s*->\s*[^{]+)?",
1688
+ "go": r"func\s+(?:\([^)]+\)\s+)?\w+\s*\([^)]*\)(?:\s*\([^)]*\))?(?:\s*[^{]+)?",
1689
+ }
1690
+ pat = patterns.get(lang, "")
1691
+ if pat:
1692
+ m = re.search(pat, first_line)
1693
+ if m:
1694
+ return m.group(0).strip()
1695
+ except Exception:
1696
+ pass
1697
+ return ""