@haaaiawd/anws 2.0.4 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,706 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ extract_ast.py — 多语言代码仓库 AST 结构提取器
4
+
5
+ 用途:基于 Tree-sitter 提取代码仓库的模块/类/函数结构,输出 JSON 到 stdout
6
+ 支持:Python, JavaScript, TypeScript, TSX, Java, Go, Rust, C#, C/C++, Kotlin, Ruby, Swift, PHP, Lua ...
7
+ 用法:python extract_ast.py <repo_path> [--max-nodes 500]
8
+ """
9
+
10
+ import sys
11
+ import json
12
+ import argparse
13
+ from pathlib import Path
14
+ from typing import Any, Optional, cast
15
+
16
+
17
# Path components that exclude an entry from both the file tree listing and
# AST collection (matched against every part of the repo-relative path).
EXCLUDE_DIRS = {
    '.git',
    '__pycache__',
    '.venv',
    'venv',
    'node_modules',
    'dist',
    'build',
    '.mypy_cache',
    '.pytest_cache',
    'site-packages',
    '.nexus-map',
    '.tox',
    '.eggs',
    'target',
    'cmake-build-debug',
    '.vs',
    'out',
    '_build',
    'vendor',
    '.ruff_cache',
    '.godot',
    '.idea',
    '.vscode',
    '.nox',
}

# File-name suffixes that are always skipped (tested with str.endswith).
EXCLUDE_FILE_SUFFIXES = ('.import', '.vulkan.cache')

# ── Built-in language configuration: loaded from languages.json next to this script ──
_LANGUAGES_JSON = Path(__file__).with_name('languages.json')
27
+
28
+
29
def _load_builtin_languages() -> tuple[dict[str, str], dict[str, dict[str, str]], dict[str, str]]:
    """Load the built-in language tables from ``languages.json``.

    Returns:
        A 3-tuple ``(extensions, queries, unsupported)``:

        * ``extensions`` - the ``extensions`` section of the file.
        * ``queries`` - the ``queries`` section, normalized so that every
          language has both a ``'struct'`` and an ``'imports'`` key
          (missing ones default to ``''``).
        * ``unsupported`` - the ``unsupported_extensions`` section.

    Exits the process with status 1, after writing an ``[ERROR]`` line to
    stderr, when the file cannot be read or decoded, is not valid JSON, or
    its root value is not a JSON object.
    """
    try:
        data = json.loads(_LANGUAGES_JSON.read_text(encoding='utf-8'))
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
        # OSError covers FileNotFoundError as well as permission / is-a-directory
        # failures; UnicodeDecodeError is a ValueError and needs its own entry.
        sys.stderr.write(f"[ERROR] Failed to load {_LANGUAGES_JSON}: {exc}\n")
        sys.exit(1)

    if not isinstance(data, dict):
        # A list/scalar root would otherwise crash below with AttributeError.
        sys.stderr.write(
            f"[ERROR] Failed to load {_LANGUAGES_JSON}: root value is not an object\n"
        )
        sys.exit(1)

    extensions: dict[str, str] = data.get('extensions', {})
    raw_queries: dict[str, dict[str, str]] = data.get('queries', {})
    unsupported: dict[str, str] = data.get('unsupported_extensions', {})

    # Normalize queries: make sure every language has both expected keys.
    queries: dict[str, dict[str, str]] = {
        lang: {
            'struct': parts.get('struct', ''),
            'imports': parts.get('imports', ''),
        }
        for lang, parts in raw_queries.items()
    }

    return extensions, queries, unsupported
50
+
51
+
52
# Built-in tables are loaded once, at import time, and treated as read-only
# defaults; callers copy them before applying customizations.
BUILTIN_EXTENSION_MAP, BUILTIN_LANG_QUERIES, BUILTIN_KNOWN_UNSUPPORTED_EXTENSIONS = _load_builtin_languages()
55
+
56
+
57
def _should_skip_path(repo_path: Path, path: Path) -> bool:
    """Tell whether *path* is excluded from listings and AST collection.

    A path is skipped when any component of its repo-relative path is in
    ``EXCLUDE_DIRS``, or when it is a regular file whose name ends with one
    of ``EXCLUDE_FILE_SUFFIXES``.

    Raises:
        ValueError: if *path* is not located under *repo_path*.
    """
    relative_parts = path.relative_to(repo_path).parts
    if not EXCLUDE_DIRS.isdisjoint(relative_parts):
        return True
    # str.endswith accepts the whole tuple of suffixes in one call.
    return path.is_file() and path.name.endswith(EXCLUDE_FILE_SUFFIXES)
64
+
65
+
66
def write_filtered_file_tree(repo_path: Path, output_path: Path) -> None:
    """Write a sorted, filtered listing of *repo_path* to *output_path*.

    Every entry below *repo_path* that ``_should_skip_path`` does not reject
    is written on its own line as a POSIX-style relative path; directories
    get a trailing ``/``. Parent directories of *output_path* are created as
    needed, and the file is written as UTF-8 (empty when nothing is listed).
    """
    entries = [
        candidate.relative_to(repo_path).as_posix() + ('/' if candidate.is_dir() else '')
        for candidate in sorted(repo_path.rglob('*'))
        if not _should_skip_path(repo_path, candidate)
    ]

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # One newline-terminated line per entry; no entries means an empty file.
    output_path.write_text(''.join(f'{entry}\n' for entry in entries), encoding='utf-8')
77
+
78
def _normalize_extension(ext: str) -> str:
    """Return *ext* trimmed, lower-cased and with a leading dot.

    Raises:
        ValueError: if *ext* is empty or whitespace only.
    """
    cleaned = ext.strip().lower()
    if cleaned == '':
        raise ValueError('extension must not be empty')
    return cleaned if cleaned.startswith('.') else '.' + cleaned
85
+
86
+
87
def _copy_lang_queries(source: dict[str, dict[str, str]]) -> dict[str, dict[str, str]]:
    """Return a copy of *source* holding only the ``struct``/``imports`` queries.

    Each language gets a fresh inner dict, so mutating the result never
    touches *source*. Missing query keys default to ``''``; any other keys
    are dropped.
    """
    copied: dict[str, dict[str, str]] = {}
    for lang, query_parts in source.items():
        copied[lang] = {key: query_parts.get(key, '') for key in ('struct', 'imports')}
    return copied
95
+
96
+
97
+ def _apply_cli_customizations(
98
+ cli_extensions: list[str] | None,
99
+ cli_queries: list[list[str]] | None,
100
+ ) -> tuple[
101
+ dict[str, str],
102
+ dict[str, dict[str, str]],
103
+ list[str],
104
+ dict[str, str],
105
+ ]:
106
+ """
107
+ 从命令行参数应用语言自定义(--add-extension 和 --add-query)。
108
+ 返回 (extension_override, query_override, warnings)
109
+ """
110
+ extension_override: dict[str, str] = {}
111
+ query_override: dict[str, dict[str, str]] = {}
112
+ warnings: list[str] = []
113
+ custom_query_languages: dict[str, str] = {}
114
+
115
+ if cli_extensions:
116
+ for item in cli_extensions:
117
+ if '=' not in item:
118
+ warnings.append(f'ignored invalid extension mapping {item!r}, expected EXT=LANG')
119
+ continue
120
+ ext_part, lang_part = item.split('=', 1)
121
+ try:
122
+ ext = _normalize_extension(ext_part)
123
+ lang = lang_part.strip().lower()
124
+ if not lang:
125
+ warnings.append(f'ignored empty language name for extension {ext_part!r}')
126
+ continue
127
+ extension_override[ext] = lang
128
+ except ValueError as e:
129
+ warnings.append(f'ignored invalid extension {ext_part!r}: {e}')
130
+ continue
131
+
132
+ if cli_queries:
133
+ for query_item in cli_queries:
134
+ if len(query_item) != 3:
135
+ warnings.append(f'ignored malformed query: expected 3 parts, got {len(query_item)}')
136
+ continue
137
+ lang, query_type, query_str = query_item
138
+ lang = lang.strip().lower()
139
+ if not lang:
140
+ warnings.append('ignored empty language name in query')
141
+ continue
142
+ if query_type not in ('struct', 'imports'):
143
+ warnings.append(f'ignored unknown query type {query_type!r} for language {lang!r}')
144
+ continue
145
+
146
+ if lang not in query_override:
147
+ query_override[lang] = {'struct': '', 'imports': ''}
148
+ query_override[lang][query_type] = query_str
149
+ custom_query_languages[lang] = '<cli>'
150
+
151
+ return extension_override, query_override, warnings, custom_query_languages
152
+
153
+
154
def _read_language_config(resolved_path: Path, warnings: list[str]) -> Optional[dict[str, Any]]:
    """Read and parse a ``--language-config`` file.

    Returns the parsed JSON object, or ``None`` after appending an
    explanatory message to *warnings* when the file is missing, unreadable,
    not valid UTF-8/JSON, or its root value is not an object.
    """
    try:
        config_data = json.loads(resolved_path.read_text(encoding='utf-8'))
    except FileNotFoundError:
        warnings.append(f'language config not found: {resolved_path}')
        return None
    except json.JSONDecodeError as exc:
        warnings.append(f'language config parse error in {resolved_path}: {exc}')
        return None
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly or a non-UTF-8 file would crash the run.
        warnings.append(f'language config read error in {resolved_path}: {exc}')
        return None

    if not isinstance(config_data, dict):
        warnings.append(f'language config ignored because root value is not an object: {resolved_path}')
        return None
    return config_data


def _config_extension_pairs(
    section: Any,
    section_name: str,
    warnings: list[str],
) -> list[tuple[str, str]]:
    """Return normalized ``(extension, language)`` pairs from one config section.

    Entries whose key or value is not a string, or whose language is blank,
    are skipped. Extensions that fail normalization are skipped and reported
    in *warnings*. A section that is not a dict yields no pairs.
    """
    pairs: list[tuple[str, str]] = []
    if not isinstance(section, dict):
        return pairs
    for raw_ext, raw_lang in section.items():
        if not (isinstance(raw_ext, str) and isinstance(raw_lang, str) and raw_lang.strip()):
            continue
        try:
            ext = _normalize_extension(raw_ext)
        except ValueError as exc:
            warnings.append(f'ignored invalid extension {raw_ext!r} in {section_name}: {exc}')
            continue
        pairs.append((ext, raw_lang.strip().lower()))
    return pairs


def _load_language_customizations(
    repo_path: Path,
    explicit_config_path: Optional[str],
    cli_extension_override: dict[str, str],
    cli_query_override: dict[str, dict[str, str]],
    cli_warnings: list[str],
    cli_custom_query_languages: dict[str, str],
) -> tuple[
    dict[str, str],
    dict[str, dict[str, str]],
    dict[str, str],
    list[str],
    list[str],
    dict[str, str],
]:
    """Build the effective language tables from built-ins, CLI flags and a config file.

    Precedence (highest first): ``--language-config`` file, then
    ``--add-extension`` / ``--add-query`` values, then the built-in tables.

    Args:
        repo_path: Base directory for a relative *explicit_config_path*.
        explicit_config_path: Value of ``--language-config``, or ``None``.
        cli_extension_override: Extension -> language map from the CLI.
        cli_query_override: Language -> ``{'struct', 'imports'}`` map from the
            CLI. Only non-empty parts override an existing language's query.
        cli_warnings: Warnings already produced while parsing CLI values.
        cli_custom_query_languages: Languages that received CLI queries.

    Returns:
        ``(extension_map, lang_queries, known_unsupported_extensions, warnings,
        loaded_config_paths, custom_query_languages)``. A config file that
        cannot be loaded only adds a warning; the CLI/built-in tables are
        still returned.
    """
    extension_map = dict(BUILTIN_EXTENSION_MAP)
    lang_queries = _copy_lang_queries(BUILTIN_LANG_QUERIES)
    known_unsupported_extensions = dict(BUILTIN_KNOWN_UNSUPPORTED_EXTENSIONS)
    warnings: list[str] = list(cli_warnings)
    loaded_config_paths: list[str] = []
    custom_query_languages: dict[str, str] = dict(cli_custom_query_languages)

    # Step 1: merge the CLI overrides on top of the built-in tables.
    extension_map.update(cli_extension_override)
    for lang, query_parts in cli_query_override.items():
        # A fresh dict for new languages keeps the caller's dict un-aliased.
        merged = lang_queries.setdefault(lang, {'struct': '', 'imports': ''})
        for query_type in ('struct', 'imports'):
            # Only parts that were actually provided replace the current query.
            if query_parts.get(query_type):
                merged[query_type] = query_parts[query_type]

    # Step 2: apply the --language-config file, which has the highest priority.
    if explicit_config_path:
        config_path = Path(explicit_config_path)
        resolved_path = config_path if config_path.is_absolute() else (repo_path / config_path)
        config_data = _read_language_config(resolved_path, warnings)

        if config_data is not None:
            loaded_config_paths.append(str(resolved_path))

            # Extensions mapped here stop being "known unsupported".
            for ext, lang in _config_extension_pairs(
                config_data.get('extensions', {}), 'extensions', warnings
            ):
                extension_map[ext] = lang
                known_unsupported_extensions.pop(ext, None)

            # A config entry replaces both queries of the language at once.
            queries = config_data.get('queries', {})
            if isinstance(queries, dict):
                for raw_lang, raw_query_parts in queries.items():
                    if not (
                        isinstance(raw_lang, str)
                        and raw_lang.strip()
                        and isinstance(raw_query_parts, dict)
                    ):
                        continue
                    struct_query = raw_query_parts.get('struct', '')
                    imports_query = raw_query_parts.get('imports', '')
                    if isinstance(struct_query, str) and isinstance(imports_query, str):
                        lang = raw_lang.strip().lower()
                        lang_queries[lang] = {
                            'struct': struct_query,
                            'imports': imports_query,
                        }
                        custom_query_languages[lang] = str(resolved_path)

            # Extensions declared unsupported here are removed from the active map.
            for ext, lang in _config_extension_pairs(
                config_data.get('unsupported_extensions', {}), 'unsupported_extensions', warnings
            ):
                known_unsupported_extensions[ext] = lang
                extension_map.pop(ext, None)

    return (
        extension_map,
        lang_queries,
        known_unsupported_extensions,
        warnings,
        loaded_config_paths,
        custom_query_languages,
    )
260
+
261
+
262
+
263
def _load_languages(
    extension_map: dict[str, str],
    lang_queries: dict[str, dict[str, str]],
    requested: Optional[list[str]] = None,
) -> dict[str, Any]:
    """Load Tree-sitter language objects and return them keyed by language name.

    ``tree_sitter_language_pack`` is used when it can be imported; otherwise
    the function falls back to ``tree_sitter_python``, which only serves
    ``'python'``. Languages are taken from *requested* when given, else from
    every language named in *extension_map* or *lang_queries*. Languages
    whose lookup raises ``LookupError`` are skipped silently.

    Exits the process with status 1 when neither package is importable or
    when no language at all could be loaded.
    """
    try:
        from tree_sitter_language_pack import get_language as pack_get_language
    except ImportError:
        pack_get_language = None

    if pack_get_language is not None:
        def get_language(name: str) -> Any:
            return pack_get_language(cast(Any, name))
    else:
        # Fallback: the single-language Python package only.
        try:
            import tree_sitter_python
            from tree_sitter import Language
        except ImportError:
            sys.stderr.write(
                "[ERROR] 缺少 tree-sitter 语言支持。\n"
                "请运行: pip install tree-sitter-language-pack\n"
            )
            sys.exit(1)

        def get_language(name: str) -> Any:
            if name != 'python':
                raise LookupError(name)
            return Language(tree_sitter_python.language())

    wanted = requested or sorted({*extension_map.values(), *lang_queries})
    languages: dict[str, Any] = {}
    for lang_name in wanted:
        try:
            languages[lang_name] = get_language(lang_name)
        except LookupError:
            # No parser for this language (KeyError is a LookupError too);
            # skip it and let the caller report unavailable languages.
            continue

    if not languages:
        sys.stderr.write("[ERROR] 没有可用的语言解析器,请安装 tree-sitter-language-pack\n")
        sys.exit(1)
    return languages
307
+
308
+
309
def _file_module_id(repo_path: Path, file_path: Path) -> str:
    """Turn a file path into a dot-separated module ID relative to *repo_path*.

    Examples:
        ``src/nexus/api/routes.py`` -> ``src.nexus.api.routes``
        ``src/core/parser.hpp``     -> ``src.core.parser``
        ``pkg/__init__.py``         -> ``pkg``

    Only the last extension of the file name is removed. A top-level
    ``__init__`` file keeps the ID ``__init__`` because there is no package
    path to fold it into.
    """
    relative_parts = file_path.relative_to(repo_path).parts
    stem = Path(relative_parts[-1]).stem  # file name without its extension
    package_parts = list(relative_parts[:-1])

    # Python special case: a package's __init__ is identified by the package path.
    if stem == '__init__' and package_parts:
        return '.'.join(package_parts)
    return '.'.join(package_parts + [stem])
322
+
323
+
324
+
325
+
326
def extract_file(
    repo_path: Path,
    file_path: Path,
    lang_name: str,
    language: Any,
    lang_queries: dict[str, dict[str, str]],
) -> tuple[list[dict], list[dict], list[str]]:
    """Parse one source file and return ``(nodes, edges, errors)``.

    Args:
        repo_path: Repository root; node paths and module IDs are relative to it.
        file_path: The file to parse.
        lang_name: Language name used to look up queries in *lang_queries*.
        language: Tree-sitter language object passed to the parser and queries.
        lang_queries: Language -> ``{'struct': ..., 'imports': ...}`` query text.

    Returns:
        * ``nodes`` - one ``Module`` node for the file, followed by ``Class``
          and ``Function`` nodes found by the ``struct`` query.
        * ``edges`` - ``contains`` edges (parent -> child) and ``imports``
          edges (module -> imported name as written in the source).
        * ``errors`` - human-readable messages. Read or parse failures return
          early with empty nodes/edges; query failures are recorded and keep
          whatever was collected before the failure.
    """
    from tree_sitter import Parser as TSParser, Query, QueryCursor

    nodes: list[dict] = []
    edges: list[dict] = []
    errors: list[str] = []

    try:
        source = file_path.read_bytes()
    except OSError as e:
        errors.append(f"{file_path}: read error: {e}")
        return nodes, edges, errors

    try:
        parser = TSParser(language)
        tree = parser.parse(source)
    except Exception as e:
        # Deliberately broad: one unparsable file must not abort the whole run.
        errors.append(f"{file_path}: parse error: {e}")
        return nodes, edges, errors

    # as_posix() yields forward slashes on every platform.
    rel_path = file_path.relative_to(repo_path).as_posix()
    module_id = _file_module_id(repo_path, file_path)
    line_count = source.count(b'\n') + 1

    # Module node (file level).
    nodes.append({
        'id': module_id,
        'type': 'Module',
        'label': module_id.split('.')[-1],
        'path': rel_path,
        'lines': line_count,
        'lang': lang_name,
    })

    queries = lang_queries.get(lang_name, {})

    # ── Structure: classes / functions ──────────────────────────────
    struct_q_text = queries.get('struct', '')
    if struct_q_text.strip():
        try:
            struct_query = Query(language, struct_q_text)
            # (start_byte, end_byte, node_id) of every class seen so far.
            class_ranges: list[tuple[int, int, str]] = []

            for _pattern_idx, captures in QueryCursor(struct_query).matches(tree.root_node):
                # A match is a class match when any capture name mentions 'class'.
                is_class = any('class' in capture_name for capture_name in captures)
                def_key = 'class.def' if is_class else 'func.def'
                name_key = 'class.name' if is_class else 'func.name'

                def_nodes = captures.get(def_key, [])
                name_nodes = captures.get(name_key, [])
                if not def_nodes or not name_nodes:
                    continue

                def_node = def_nodes[0]
                name_node = name_nodes[0]
                name = source[name_node.start_byte:name_node.end_byte].decode('utf-8', 'replace')

                if is_class:
                    node_id = f"{module_id}.{name}"
                    nodes.append({
                        'id': node_id,
                        'type': 'Class',
                        'label': name,
                        'path': rel_path,
                        'parent': module_id,
                        'start_line': def_node.start_point[0] + 1,
                        'end_line': def_node.end_point[0] + 1,
                    })
                    class_ranges.append((def_node.start_byte, def_node.end_byte, node_id))
                    edges.append({'source': module_id, 'target': node_id, 'type': 'contains'})
                else:
                    # Attach the function to the innermost enclosing class
                    # (smallest byte span), so methods of nested classes are
                    # not attributed to the outer class.
                    enclosing = [
                        (cls_end - cls_start, cls_id)
                        for cls_start, cls_end, cls_id in class_ranges
                        if cls_start <= def_node.start_byte and def_node.end_byte <= cls_end
                    ]
                    if enclosing:
                        parent_id = min(enclosing, key=lambda item: item[0])[1]
                    else:
                        parent_id = module_id

                    node_id = f"{parent_id}.{name}"
                    nodes.append({
                        'id': node_id,
                        'type': 'Function',
                        'label': name,
                        'path': rel_path,
                        'parent': parent_id,
                        'start_line': def_node.start_point[0] + 1,
                        'end_line': def_node.end_point[0] + 1,
                    })
                    edges.append({'source': parent_id, 'target': node_id, 'type': 'contains'})

        except Exception as e:
            # Query text may come from user configuration; report, don't crash.
            errors.append(f"{file_path}: struct query error: {e}")

    # ── Imports: 'imports' edges ────────────────────────────────────
    import_q_text = queries.get('imports', '')
    if import_q_text.strip():
        try:
            import_query = Query(language, import_q_text)
            for _pattern_idx, captures in QueryCursor(import_query).matches(tree.root_node):
                for mod_node in captures.get('mod', []):
                    raw_target = source[mod_node.start_byte:mod_node.end_byte].decode('utf-8', 'replace')
                    # Drop surrounding quotes / angle brackets / spaces.
                    target = raw_target.strip('"\'<> ')
                    if target:
                        edges.append({'source': module_id, 'target': target, 'type': 'imports'})
        except Exception as e:
            errors.append(f"{file_path}: import query error: {e}")

    return nodes, edges, errors
439
+
440
+
441
def collect_source_files(
    repo_path: Path,
    languages: dict[str, Any],
    extension_map: dict[str, str],
    known_unsupported_extensions: dict[str, str],
) -> tuple[list[tuple[Path, str]], dict[str, int], dict[str, int], dict[str, int]]:
    """Collect the repository's source files, skipping excluded paths.

    Files are classified by their lower-cased suffix.

    Returns:
        * ``[(file_path, lang_name)]`` sorted by path - files whose language
          is mapped in *extension_map* and has a loaded parser in *languages*.
        * ``supported_file_counts`` - ``{lang_name: file_count}`` for those files.
        * ``known_unsupported_file_counts`` - counts for suffixes found only
          in *known_unsupported_extensions*.
        * ``configured_but_unavailable_file_counts`` - counts for mapped
          languages that have no entry in *languages*.
    """
    parsable: list[tuple[Path, str]] = []
    supported_file_counts: dict[str, int] = {}
    known_unsupported_file_counts: dict[str, int] = {}
    configured_but_unavailable_file_counts: dict[str, int] = {}

    for candidate in repo_path.rglob('*'):
        if not candidate.is_file() or _should_skip_path(repo_path, candidate):
            continue

        suffix = candidate.suffix.lower()
        lang = extension_map.get(suffix)
        if lang:
            # A mapped extension is never also counted as "known unsupported".
            if lang in languages:
                parsable.append((candidate, lang))
                tally = supported_file_counts
            else:
                tally = configured_but_unavailable_file_counts
            tally[lang] = tally.get(lang, 0) + 1
            continue

        unsupported_lang = known_unsupported_extensions.get(suffix)
        if unsupported_lang:
            known_unsupported_file_counts[unsupported_lang] = (
                known_unsupported_file_counts.get(unsupported_lang, 0) + 1
            )

    parsable = sorted(parsable, key=lambda entry: entry[0])
    return (
        parsable,
        supported_file_counts,
        known_unsupported_file_counts,
        configured_but_unavailable_file_counts,
    )
490
+
491
+
492
+
493
def apply_max_nodes(
    nodes: list[dict],
    edges: list[dict],
    max_nodes: int,
) -> tuple[list[dict], list[dict], bool, int]:
    """Trim the graph to roughly *max_nodes* nodes, dropping Function nodes first.

    ``Module`` and ``Class`` nodes are always kept, even if they alone exceed
    *max_nodes*. ``Function`` nodes fill the remaining slots in their
    original order.

    Returns:
        ``(filtered_nodes, filtered_edges, truncated, truncated_count)``.
        When nothing has to be dropped the inputs are returned unchanged with
        ``truncated=False``. Otherwise kept nodes are ordered Module/Class
        first, then the surviving functions, and ``truncated_count`` is the
        number of Function nodes removed. ``imports`` edges are always kept
        because their targets are external names, not node IDs. Every other
        edge is kept only when both of its endpoints survived, so the result
        never contains an edge pointing at a removed node.
    """
    if len(nodes) <= max_nodes:
        return nodes, edges, False, 0

    priority_nodes = [n for n in nodes if n['type'] in ('Module', 'Class')]
    func_nodes = [n for n in nodes if n['type'] == 'Function']

    # Clamp at zero: priority nodes alone may already exceed the budget.
    remaining_slots = max(max_nodes - len(priority_nodes), 0)
    kept_funcs = func_nodes[:remaining_slots]
    kept_nodes = priority_nodes + kept_funcs
    truncated_count = len(func_nodes) - len(kept_funcs)

    kept_ids = {n['id'] for n in kept_nodes}
    kept_edges = [
        e for e in edges
        if e['type'] == 'imports'
        or (e['source'] in kept_ids and e['target'] in kept_ids)
    ]
    return kept_nodes, kept_edges, True, truncated_count
523
+
524
+
525
def _format_file_counts(counts: dict[str, int]) -> str:
    """Render ``{lang: count}`` as ``"lang (N files), ..."`` sorted by language."""
    return ', '.join(f"{lang} ({count} files)" for lang, count in sorted(counts.items()))


def main() -> None:
    """Command-line entry point: extract the repository graph and print it as JSON.

    Steps, in order: parse arguments, validate the repository path,
    optionally write the filtered file tree, merge language configuration,
    load parsers, collect and parse source files, apply the node limit, and
    print the result (nodes, edges, stats, optional ``_errors`` and
    ``warnings``) to stdout. Diagnostics go to stderr; a missing repository
    path exits with status 1.
    """
    arg_parser = argparse.ArgumentParser(
        description='Extract AST structure from a multi-language repository'
    )
    arg_parser.add_argument('repo_path', help='Target repository path')
    arg_parser.add_argument(
        '--max-nodes',
        type=int,
        default=500,
        help='Max nodes in output (default: 500). Truncates Function nodes first.',
    )
    arg_parser.add_argument(
        '--add-extension',
        action='append',
        dest='add_extensions',
        metavar='EXT=LANG',
        help='Add extension mapping, e.g., .templ=templ. Can be used multiple times.',
    )
    arg_parser.add_argument(
        '--add-query',
        action='append',
        dest='add_queries',
        nargs=3,
        metavar=('LANG', 'TYPE', 'QUERY'),
        help='Add/override a query for a language. TYPE is "struct" or "imports". Can be used multiple times.',
    )
    arg_parser.add_argument(
        '--language-config',
        help='Optional JSON file that adds or overrides extension mappings and tree-sitter queries. Useful for complex configurations.',
    )
    arg_parser.add_argument(
        '--file-tree-out',
        help='Optional output path for a filtered file tree (e.g. .nexus-map/raw/file_tree.txt). Uses the same exclude rules as AST collection.',
    )
    args = arg_parser.parse_args()

    repo_path = Path(args.repo_path).resolve()
    if not repo_path.exists():
        sys.stderr.write(f"[ERROR] repo_path not found: {repo_path}\n")
        sys.exit(1)
    if not (repo_path / '.git').exists():
        sys.stderr.write(f"[WARNING] .git not found in {repo_path}, may not be a git repo\n")

    if args.file_tree_out:
        # Relative output paths are interpreted relative to the repository.
        tree_target = Path(args.file_tree_out)
        tree_target = tree_target if tree_target.is_absolute() else repo_path / tree_target
        write_filtered_file_tree(repo_path, tree_target.resolve())

    # CLI customizations come back in the order the config loader expects:
    # (extension override, query override, warnings, custom query languages).
    cli_overrides = _apply_cli_customizations(args.add_extensions, args.add_queries)

    # Load and merge the configuration layers.
    (
        extension_map,
        lang_queries,
        known_unsupported_extensions,
        config_warnings,
        loaded_config_paths,
        custom_query_languages,
    ) = _load_language_customizations(repo_path, args.language_config, *cli_overrides)

    languages = _load_languages(extension_map, lang_queries)
    (
        source_files,
        supported_file_counts,
        known_unsupported_file_counts,
        configured_but_unavailable_file_counts,
    ) = collect_source_files(repo_path, languages, extension_map, known_unsupported_extensions)

    if not source_files:
        sys.stderr.write(f"[WARNING] No supported source files found in {repo_path}\n")

    languages_with_structural_queries = sorted(
        lang for lang, query_parts in lang_queries.items()
        if query_parts.get('struct', '').strip()
    )

    all_nodes: list[dict] = []
    all_edges: list[dict] = []
    all_errors: list[str] = []
    detected_langs: set[str] = set()
    module_only_file_counts: dict[str, int] = {}
    total_lines = 0

    for file_path, lang_name in source_files:
        file_nodes, file_edges, file_errors = extract_file(
            repo_path, file_path, lang_name, languages[lang_name], lang_queries
        )
        all_nodes.extend(file_nodes)
        all_edges.extend(file_edges)
        all_errors.extend(file_errors)
        if lang_name not in languages_with_structural_queries:
            module_only_file_counts[lang_name] = module_only_file_counts.get(lang_name, 0) + 1
        if file_nodes:
            # The first node of a parsed file is its Module node.
            detected_langs.add(lang_name)
            total_lines += file_nodes[0].get('lines', 0)

    final_nodes, final_edges, truncated, truncated_count = apply_max_nodes(
        all_nodes, all_edges, args.max_nodes
    )

    # Coverage warnings, emitted in a fixed order after the config warnings.
    warnings: list[str] = list(config_warnings)
    coverage_warnings = (
        (
            known_unsupported_file_counts,
            "known unsupported languages present; downstream outputs must mark inferred sections explicitly: ",
        ),
        (
            configured_but_unavailable_file_counts,
            'some configured languages were detected in source files but no parser could be loaded: ',
        ),
        (
            module_only_file_counts,
            "some languages were parsed with module-only coverage because no structural query template is bundled: ",
        ),
    )
    for counts, prefix in coverage_warnings:
        if counts:
            warnings.append(prefix + _format_file_counts(counts))

    if loaded_config_paths:
        config_summary = ', '.join(loaded_config_paths)
        warnings.append(f'custom language configuration loaded: {config_summary}')

    stats = {
        'total_files': len(source_files),
        'total_lines': total_lines,
        'parse_errors': len(all_errors),
        'truncated': truncated,
        'truncated_nodes': truncated_count,
        'supported_file_counts': supported_file_counts,
        'languages_with_structural_queries': languages_with_structural_queries,
        'languages_with_custom_queries': sorted(custom_query_languages.keys()),
        'module_only_file_counts': module_only_file_counts,
        'known_unsupported_file_counts': known_unsupported_file_counts,
        'configured_but_unavailable_file_counts': configured_but_unavailable_file_counts,
        'custom_language_config_paths': loaded_config_paths,
    }
    result: dict[str, Any] = {
        'languages': sorted(detected_langs),
        'stats': stats,
        'nodes': final_nodes,
        'edges': final_edges,
    }

    # Optional sections are only present when non-empty; at most 20 errors are echoed.
    if all_errors:
        result['_errors'] = all_errors[:20]
    if warnings:
        result['warnings'] = warnings

    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == '__main__':
    main()
706
+