jarvis-ai-assistant 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jarvis/__init__.py +1 -1
- jarvis/jarvis_agent/__init__.py +15 -4
- jarvis/jarvis_agent/agent_manager.py +3 -0
- jarvis/jarvis_agent/jarvis.py +44 -14
- jarvis/jarvis_agent/run_loop.py +6 -1
- jarvis/jarvis_agent/task_planner.py +1 -0
- jarvis/jarvis_c2rust/__init__.py +13 -0
- jarvis/jarvis_c2rust/cli.py +405 -0
- jarvis/jarvis_c2rust/collector.py +209 -0
- jarvis/jarvis_c2rust/library_replacer.py +933 -0
- jarvis/jarvis_c2rust/llm_module_agent.py +1265 -0
- jarvis/jarvis_c2rust/scanner.py +1671 -0
- jarvis/jarvis_c2rust/transpiler.py +1236 -0
- jarvis/jarvis_code_agent/code_agent.py +144 -18
- jarvis/jarvis_data/config_schema.json +8 -3
- jarvis/jarvis_tools/cli/main.py +1 -0
- jarvis/jarvis_tools/execute_script.py +1 -1
- jarvis/jarvis_tools/read_code.py +11 -1
- jarvis/jarvis_tools/read_symbols.py +129 -0
- jarvis/jarvis_tools/registry.py +9 -1
- jarvis/jarvis_utils/config.py +14 -4
- jarvis/jarvis_utils/git_utils.py +39 -0
- jarvis/jarvis_utils/utils.py +13 -5
- {jarvis_ai_assistant-0.5.1.dist-info → jarvis_ai_assistant-0.6.0.dist-info}/METADATA +13 -1
- {jarvis_ai_assistant-0.5.1.dist-info → jarvis_ai_assistant-0.6.0.dist-info}/RECORD +29 -21
- {jarvis_ai_assistant-0.5.1.dist-info → jarvis_ai_assistant-0.6.0.dist-info}/entry_points.txt +2 -0
- {jarvis_ai_assistant-0.5.1.dist-info → jarvis_ai_assistant-0.6.0.dist-info}/WHEEL +0 -0
- {jarvis_ai_assistant-0.5.1.dist-info → jarvis_ai_assistant-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {jarvis_ai_assistant-0.5.1.dist-info → jarvis_ai_assistant-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1671 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
使用 libclang 的 C/C++ 函数扫描器和调用图提取器。
|
|
5
|
+
|
|
6
|
+
设计决策:
|
|
7
|
+
- 解析器: clang.cindex (libclang),用于生成包含精确类型和位置的健壮 C/C++ AST。
|
|
8
|
+
|
|
9
|
+
JSONL 文件
|
|
10
|
+
- symbols_raw.jsonl
|
|
11
|
+
原始扫描产物:每个符号(函数或类型)一个 JSON 对象,统一模式:
|
|
12
|
+
字段:
|
|
13
|
+
- id (int)
|
|
14
|
+
- category (str): "function" | "type"
|
|
15
|
+
- name (str)
|
|
16
|
+
- qualified_name (str)
|
|
17
|
+
- signature (str) # 函数签名;类型则可选或为空
|
|
18
|
+
- return_type (str) # 函数返回类型;类型则可选或为空
|
|
19
|
+
- params (list[{name, type}]) # 函数参数;类型则可选或为空
|
|
20
|
+
- kind (str) # 类型种类: struct/class/union/enum/typedef/type_alias
|
|
21
|
+
- underlying_type (str) # 针对 typedef/type_alias;其他为空
|
|
22
|
+
- ref (list[str]) # 统一的引用:被调用的函数或引用的类型
|
|
23
|
+
- file (str)
|
|
24
|
+
- start_line (int), start_col (int), end_line (int), end_col (int)
|
|
25
|
+
- language (str)
|
|
26
|
+
- created_at (str, ISO-like), updated_at (str, ISO-like)
|
|
27
|
+
- symbols.jsonl
|
|
28
|
+
经过裁剪/评估后的符号表(由 prune 子命令或人工整理生成),用于后续转译与规划
|
|
29
|
+
- meta.json
|
|
30
|
+
{
|
|
31
|
+
"functions": N,
|
|
32
|
+
"types": M,
|
|
33
|
+
"symbols": N+M,
|
|
34
|
+
"generated_at": "...",
|
|
35
|
+
"schema_version": 1,
|
|
36
|
+
"source_root": "<abs path>"
|
|
37
|
+
}
|
|
38
|
+
用法:
|
|
39
|
+
python -m jarvis.jarvis_c2rust.scanner --root /path/to/scan
|
|
40
|
+
|
|
41
|
+
注意:
|
|
42
|
+
- 如果存在 compile_commands.json 文件,将会用它来提高解析准确性。
|
|
43
|
+
- 如果找不到 libclang,将引发一个信息丰富的错误,并提示设置环境变量:
|
|
44
|
+
- LIBCLANG_PATH (目录) 或 CLANG_LIBRARY_FILE (完整路径)
|
|
45
|
+
- LLVM_HOME (包含 lib/libclang.so 的前缀)
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
from __future__ import annotations
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
import json
|
|
52
|
+
import os
|
|
53
|
+
|
|
54
|
+
import sys
|
|
55
|
+
import time
|
|
56
|
+
from dataclasses import dataclass
|
|
57
|
+
from pathlib import Path
|
|
58
|
+
from typing import Any, Dict, Iterable, List, Optional, Set
|
|
59
|
+
import typer
|
|
60
|
+
import shutil
|
|
61
|
+
|
|
62
|
+
# ---------------------------
|
|
63
|
+
# libclang loader
|
|
64
|
+
# ---------------------------
|
|
65
|
+
def _try_import_libclang() -> Any:
    """
    Load clang.cindex and support libclang 16-21 (inclusive).

    Resolution order:
      1) Respect CLANG_LIBRARY_FILE (must be one of 16-21)
      2) Respect LIBCLANG_PATH (pick libclang from that dir and verify major 16-21)
      3) Respect LLVM_HOME/lib/libclang.*
      4) Probe common locations for versions 16-21

    Returns:
        The ``clang.cindex`` module, configured to use a supported libclang.

    Raises:
        RuntimeError: if the Python bindings cannot be imported, if either the
            bindings or libclang are outside 16-21, or if no usable libclang
            shared library can be located.
    """
    SUPPORTED_MAJORS = {16, 17, 18, 19, 20, 21}

    try:
        from clang import cindex  # type: ignore
    except Exception as e:
        raise RuntimeError(
            "导入 clang.cindex 失败。本工具支持 clang 16-21。\n"
            "修复方法:\n"
            "- pip install 'clang>=16,<22'\n"
            "- 确保已安装 libclang (16-21) (例如,apt install llvm-21 clang-21 libclang-21-dev)\n"
            "- 设置环境变量 CLANG_LIBRARY_FILE 指向匹配的共享库,或 LIBCLANG_PATH 指向其目录。"
        ) from e

    # Verify Python clang bindings major version (if available)
    py_major: Optional[int] = None
    try:
        import clang as _clang  # type: ignore
        import re as _re
        v = getattr(_clang, "__version__", None)
        if v:
            # BUG FIX: the pattern was r"(\\d+)", which matches a literal
            # backslash followed by 'd' and therefore never matched a real
            # version string, silently disabling the bindings-version guard.
            m = _re.match(r"(\d+)", str(v))
            if m:
                py_major = int(m.group(1))
    except Exception:
        py_major = None

    # If version is known and not in supported set, fail; if unknown (None),
    # proceed and rely on libclang probing.
    if py_major is not None and py_major not in SUPPORTED_MAJORS:
        raise RuntimeError(
            f"Python 'clang' 绑定的主版本必须是 {sorted(SUPPORTED_MAJORS)} 中的一个。\n"
            "修复方法:\n"
            "- pip install --upgrade 'clang>=16,<22'"
        )

    # Helper to probe libclang major version via clang_getClangVersion().
    def _probe_major_from_lib(path: str) -> Optional[int]:
        try:
            import ctypes
            import re as _re

            class CXString(ctypes.Structure):
                _fields_ = [("data", ctypes.c_void_p), ("private_flags", ctypes.c_uint)]

            lib = ctypes.CDLL(path)
            # Ensure correct ctypes signatures to avoid mis-parsing strings
            lib.clang_getClangVersion.restype = CXString
            lib.clang_getCString.argtypes = [CXString]
            lib.clang_getCString.restype = ctypes.c_char_p
            lib.clang_disposeString.argtypes = [CXString]
            s = lib.clang_getClangVersion()
            cstr = lib.clang_getCString(s)  # returns const char*
            try:
                ver = cstr.decode("utf-8", "ignore") if cstr else ""
            except Exception:
                # Fallback if restype not honored by platform
                ver = ctypes.cast(cstr, ctypes.c_char_p).value.decode("utf-8", "ignore") if cstr else ""
            lib.clang_disposeString(s)
            if ver:
                m = _re.search(r"clang version (\d+)", ver)
                if m:
                    return int(m.group(1))
        except Exception:
            return None
        return None

    # Set the library file only when its probed major version is supported.
    def _ensure_supported_and_set(lib_path: str) -> bool:
        major = _probe_major_from_lib(lib_path)
        if major in SUPPORTED_MAJORS:
            try:
                cindex.Config.set_library_file(lib_path)
                return True
            except Exception:
                return False
        return False

    # 1) CLANG_LIBRARY_FILE
    lib_file = os.environ.get("CLANG_LIBRARY_FILE")
    if lib_file and Path(lib_file).exists():
        if _ensure_supported_and_set(lib_file):
            return cindex
        else:
            raise RuntimeError(
                f"环境变量 CLANG_LIBRARY_FILE 指向 '{lib_file}', 但它不是 libclang 16-21 版本。\n"
                "请将其设置为受支持的 libclang (例如 /usr/lib/llvm-21/lib/libclang.so 或匹配的版本)。"
            )

    # 2) LIBCLANG_PATH
    lib_dir = os.environ.get("LIBCLANG_PATH")
    if lib_dir and Path(lib_dir).exists():
        base = Path(lib_dir)
        candidates: List[Path] = []

        # Versioned shared libraries
        for maj in (21, 20, 19, 18, 17, 16):
            candidates.append(base / f"libclang.so.{maj}")
        # Generic names
        candidates.extend([
            base / "libclang.so",     # Linux
            base / "libclang.dylib",  # macOS
            base / "libclang.dll",    # Windows
        ])
        for cand in candidates:
            if cand.exists() and _ensure_supported_and_set(str(cand)):
                return cindex
        # If a directory is given but no valid supported version found, error out explicitly
        raise RuntimeError(
            f"环境变量 LIBCLANG_PATH={lib_dir} 不包含 libclang 16-21。\n"
            "期望找到 libclang.so.[16-21] (Linux) 或来自 llvm@16..@21 的 libclang.dylib (macOS)。"
        )

    # 3) LLVM_HOME
    llvm_home = os.environ.get("LLVM_HOME")
    if llvm_home:
        p = Path(llvm_home) / "lib"
        candidates = []
        for maj in (21, 20, 19, 18, 17, 16):
            candidates.append(p / f"libclang.so.{maj}")
        candidates.extend([
            p / "libclang.so",
            p / "libclang.dylib",
            p / "libclang.dll",
        ])
        for cand in candidates:
            if cand.exists() and _ensure_supported_and_set(str(cand)):
                return cindex

    # 4) Common locations for versions 16-21
    import platform as _platform
    sys_name = _platform.system()
    path_candidates: List[Path] = []
    if sys_name == "Linux":
        for maj in (21, 20, 19, 18, 17, 16):
            path_candidates.extend([
                Path(f"/usr/lib/llvm-{maj}/lib/libclang.so.{maj}"),
                Path(f"/usr/lib/llvm-{maj}/lib/libclang.so"),
            ])
        # Generic fallbacks
        path_candidates.extend([
            Path("/usr/local/lib/libclang.so.21"),
            Path("/usr/local/lib/libclang.so.20"),
            Path("/usr/local/lib/libclang.so.19"),
            Path("/usr/local/lib/libclang.so.18"),
            Path("/usr/local/lib/libclang.so.17"),
            Path("/usr/local/lib/libclang.so.16"),
            Path("/usr/local/lib/libclang.so"),
            Path("/usr/lib/libclang.so.21"),
            Path("/usr/lib/libclang.so.20"),
            Path("/usr/lib/libclang.so.19"),
            Path("/usr/lib/libclang.so.18"),
            Path("/usr/lib/libclang.so.17"),
            Path("/usr/lib/libclang.so.16"),
            Path("/usr/lib/libclang.so"),
        ])
    elif sys_name == "Darwin":
        # Homebrew llvm@N formulas
        for maj in (21, 20, 19, 18, 17, 16):
            path_candidates.append(Path(f"/opt/homebrew/opt/llvm@{maj}/lib/libclang.dylib"))
            path_candidates.append(Path(f"/usr/local/opt/llvm@{maj}/lib/libclang.dylib"))
        # Generic llvm formula path (may be symlinked to a specific version)
        path_candidates.extend([
            Path("/opt/homebrew/opt/llvm/lib/libclang.dylib"),
            Path("/usr/local/opt/llvm/lib/libclang.dylib"),
        ])
    else:
        # Best-effort on other systems (Windows)
        path_candidates = [
            Path("C:/Program Files/LLVM/bin/libclang.dll"),
        ]

    # Include additional globbed candidates for distributions that install
    # versioned sonames like libclang.so.21.1.4
    try:
        extra_glob_dirs = [
            Path("/usr/lib"),
            Path("/usr/local/lib"),
            Path("/lib"),
            Path("/usr/lib64"),
            Path("/lib64"),
            Path("/usr/lib/x86_64-linux-gnu"),
        ]
        extra_globs: List[Path] = []
        for d in extra_glob_dirs:
            try:
                extra_globs.extend(d.glob("libclang.so.*"))
            except Exception:
                pass
        # Deduplicate while preserving order (Path is hashable)
        seen = set()
        merged_candidates: List[Path] = []
        for p in list(path_candidates) + extra_globs:
            if p not in seen:
                merged_candidates.append(p)
                seen.add(p)
    except Exception:
        merged_candidates = list(path_candidates)

    for cand in merged_candidates:
        if cand.exists() and _ensure_supported_and_set(str(cand)):
            return cindex

    # Final fallback: try using system default resolution without explicitly setting the library file.
    # Some distributions (e.g., Arch) place libclang in standard linker paths (/usr/lib/libclang.so),
    # which clang.cindex can locate without Config.set_library_file.
    try:
        _ = cindex.Index.create()
        return cindex
    except Exception:
        pass

    # If we got here, we failed to locate a supported libclang 16-21
    raise RuntimeError(
        "未能定位到 libclang 16-21。本工具支持 clang 16-21 版本。\n"
        "修复选项:\n"
        "- 在 Ubuntu/Debian 上: sudo apt-get install -y llvm-21 clang-21 libclang-21-dev (或 20/19/18/17/16)。\n"
        "- 在 macOS (Homebrew) 上: brew install llvm@21 (或 @20/@19/@18/@17/@16)。\n"
        "- 在 Arch Linux 上: 确保 clang 提供了 /usr/lib/libclang.so (通常是这样) 或显式设置 CLANG_LIBRARY_FILE。\n"
        "- 然后设置环境变量 (如果未自动检测到):\n"
        "  export CLANG_LIBRARY_FILE=/usr/lib/llvm-21/lib/libclang.so  # Linux (请调整版本)\n"
        "  export CLANG_LIBRARY_FILE=/opt/homebrew/opt/llvm@21/lib/libclang.dylib  # macOS (请调整版本)\n"
    )
|
|
292
|
+
# ---------------------------
|
|
293
|
+
# Data structures
|
|
294
|
+
# ---------------------------
|
|
295
|
+
@dataclass
class FunctionInfo:
    """One function (or method) definition found by the scanner.

    Mirrors a ``category: "function"`` record in ``symbols_raw.jsonl``;
    positions come from the libclang cursor extent (1-based lines/columns).
    """

    name: str  # unqualified spelling of the function
    qualified_name: str  # '::'-joined enclosing scopes plus the name
    signature: str  # libclang displayname (falls back to the bare name)
    return_type: str  # empty for constructors/destructors
    params: List[Dict[str, str]]  # one {"name": ..., "type": ...} per parameter
    calls: List[str]  # names of functions invoked inside the body
    file: str  # path of the file containing the definition
    start_line: int  # extent start (1-based)
    start_col: int  # extent start column (1-based)
    end_line: int  # extent end (1-based)
    end_col: int  # extent end column (1-based)
    language: str  # e.g. "C" / "CXX" (see lang_from_cursor)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# ---------------------------
|
|
316
|
+
# Compile commands loader
|
|
317
|
+
# ---------------------------
|
|
318
|
+
def find_compile_commands(start: Path) -> Optional[Path]:
    """Walk upward from *start* and return the first compile_commands.json found.

    The search covers *start* itself (resolved) and every ancestor up to and
    including the filesystem root; returns None when no database exists.
    """
    here = start.resolve()
    for directory in (here, *here.parents):
        candidate = directory / "compile_commands.json"
        if candidate.exists():
            return candidate
    return None
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def load_compile_commands(cc_path: Path) -> Dict[str, List[str]]:
    """
    Load compile_commands.json and return a mapping:
      file(abs path str) -> compile args (list[str], without compiler executable)

    Entries with an empty or missing "file" field are skipped.  Flags that
    confuse libclang (-c, and -o/-MF together with their operands) are
    stripped from the argument list.  Returns {} on any read/parse failure.
    """
    import shlex  # hoisted out of the loop; only needed for "command"-style entries

    try:
        data = json.loads(cc_path.read_text(encoding="utf-8"))
    except Exception:
        return {}
    # A compilation database must be a JSON array of entry objects.
    if not isinstance(data, list):
        return {}

    mapping: Dict[str, List[str]] = {}
    for entry in data:
        if not isinstance(entry, dict):
            continue
        raw_file = entry.get("file", "")
        # BUG FIX: the old code tested the Path object (`if not file_path:`),
        # which is always truthy -- Path("").resolve() is the CWD -- so
        # entries without a file were silently mapped under the CWD.
        if not raw_file:
            continue
        file_path = Path(raw_file).resolve()
        if "arguments" in entry and isinstance(entry["arguments"], list):
            # "arguments" usually includes the compiler as argv[0]
            args = entry["arguments"][1:] if entry["arguments"] else []
        else:
            # fallback to splitting the "command" string
            cmd = entry.get("command", "")
            parts = shlex.split(cmd) if cmd else []
            args = parts[1:] if parts else []

        # Clean args: drop compile-only/output flags that confuse libclang
        cleaned: List[str] = []
        skip_next = False
        for a in args:
            if skip_next:
                skip_next = False
                continue
            if a in ("-c",):
                continue
            if a in ("-o", "-MF"):
                skip_next = True  # also drop the flag's operand
                continue
            if a.startswith("-o"):
                continue
            cleaned.append(a)
        mapping[str(file_path)] = cleaned
    return mapping
|
|
376
|
+
|
|
377
|
+
# ---------------------------
|
|
378
|
+
# File discovery
|
|
379
|
+
# ---------------------------
|
|
380
|
+
SOURCE_EXTS: Set[str] = {
    ".c", ".cc", ".cpp", ".cxx", ".C",
    ".h", ".hh", ".hpp", ".hxx",
}


def iter_source_files(root: Path) -> Iterable[Path]:
    """Yield the resolved path of every C/C++ source or header file under *root*."""
    yield from (
        entry.resolve()
        for entry in root.rglob("*")
        if entry.is_file() and entry.suffix in SOURCE_EXTS
    )
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
# ---------------------------
|
|
394
|
+
# AST utilities
|
|
395
|
+
# ---------------------------
|
|
396
|
+
def get_qualified_name(cursor) -> str:
    """Build a '::'-qualified name for *cursor* by walking its semantic parents.

    Scopes are collected outward until the translation unit (exclusive);
    unnamed scopes are skipped.  Returns the bare spelling (or "") when the
    cursor has no named enclosing scope.
    """
    scopes = []
    node = cursor.semantic_parent
    while node is not None and node.kind.name != "TRANSLATION_UNIT":
        spelling = node.spelling
        if spelling:
            scopes.append(spelling)
        node = node.semantic_parent
    prefix = "::".join(reversed(scopes))
    if not prefix:
        return cursor.spelling or ""
    return f"{prefix}::{cursor.spelling}"
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def collect_params(cursor) -> List[Dict[str, str]]:
    """Return a {"name", "type"} dict for each parameter declared on *cursor*.

    Parameters appear as PARM_DECL children in libclang; a type spelling that
    is missing or raises is recorded as "".
    """
    collected: List[Dict[str, str]] = []
    for child in cursor.get_children():
        if child.kind.name != "PARM_DECL":
            continue
        try:
            type_spelling = child.type.spelling or ""
        except Exception:
            type_spelling = ""
        collected.append({"name": child.spelling or "", "type": type_spelling})
    return collected
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
def collect_calls(cursor) -> List[str]:
    """
    Collect the names of functions called anywhere inside *cursor*'s subtree.

    Names are appended in pre-order traversal order; a qualified name is
    preferred when the call's referenced declaration is resolvable, otherwise
    the call expression's displayname is used.  Unnamed calls are dropped.
    """
    found: List[str] = []

    def _descend(node) -> None:
        for child in node.get_children():
            if child.kind.name == "CALL_EXPR":
                try:
                    target = child.referenced
                    if target is not None and target.spelling:
                        # Prefer the fully qualified spelling when derivable.
                        callee = get_qualified_name(target) or target.spelling
                    else:
                        callee = child.displayname or ""
                except Exception:
                    callee = child.displayname or ""
                if callee:
                    found.append(callee)
            _descend(child)

    _descend(cursor)
    return found
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
def is_function_like(cursor) -> bool:
    """True when *cursor* declares something callable: a free function, a
    C++ method, a constructor/destructor, or a function template."""
    function_kinds = {
        "FUNCTION_DECL",
        "CXX_METHOD",
        "CONSTRUCTOR",
        "DESTRUCTOR",
        "FUNCTION_TEMPLATE",
    }
    return cursor.kind.name in function_kinds
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def lang_from_cursor(cursor) -> str:
    """Return the cursor's language name, falling back to an extension guess.

    When libclang cannot report a language, ".c" files are labelled "C",
    any other extension "CXX", and a cursor with no file "UNKNOWN".
    """
    try:
        return str(cursor.language.name)
    except Exception:
        pass
    # libclang could not tell us; guess from the file extension instead.
    location_file = cursor.location.file
    if location_file is None:
        return "UNKNOWN"
    suffix = os.path.splitext(str(location_file))[1].lower()
    return "C" if suffix == ".c" else "CXX"
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
# ---------------------------
|
|
480
|
+
# Scanner core
|
|
481
|
+
# ---------------------------
|
|
482
|
+
def scan_file(cindex, file_path: Path, args: List[str]) -> List[FunctionInfo]:
    """Parse *file_path* with libclang and return every function defined in it.

    Only definitions whose location resolves to *file_path* itself are kept
    (declarations pulled in from headers are ignored).  Failures while
    extracting a single node are swallowed so one bad node cannot abort the
    whole scan.
    """
    translation_unit = cindex.Index.create().parse(
        str(file_path),
        args=args,
        options=0,  # parse bodies too, so call sites are visible
    )
    collected: List[FunctionInfo] = []

    def _record(node) -> None:
        """Build and store a FunctionInfo for *node*; skip it on any error."""
        try:
            try:
                result_type = node.result_type.spelling  # absent for ctors/dtors
            except Exception:
                result_type = ""
            span = node.extent
            collected.append(
                FunctionInfo(
                    name=node.spelling or "",
                    qualified_name=get_qualified_name(node),
                    signature=node.displayname or (node.spelling or ""),
                    return_type=result_type,
                    params=collect_params(node),
                    calls=collect_calls(node),
                    file=str(file_path),
                    start_line=span.start.line,
                    start_col=span.start.column,
                    end_line=span.end.line,
                    end_col=span.end.column,
                    language=lang_from_cursor(node),
                )
            )
        except Exception:
            # Be robust: a single malformed node must not stop the scan.
            pass

    def _walk(node) -> None:
        # Only definitions located in this very file are of interest.
        if is_function_like(node) and node.is_definition():
            where = node.location.file
            if where is not None and Path(where.name).resolve() == file_path.resolve():
                _record(node)
        for child in node.get_children():
            _walk(child)

    _walk(translation_unit.cursor)
    return collected
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def scan_directory(scan_root: Path, db_path: Optional[Path] = None) -> Path:
|
|
539
|
+
"""
|
|
540
|
+
Scan a directory for C/C++ symbols and store results into JSONL/JSON.
|
|
541
|
+
|
|
542
|
+
Returns the path to symbols_raw.jsonl.
|
|
543
|
+
- symbols_raw.jsonl: one JSON object per symbol (category: function/type),原始扫描产物
|
|
544
|
+
- symbols.jsonl: 与原始产物等价的初始基线(便于未执行 prune 时直接进入后续流程)
|
|
545
|
+
- meta.json: summary counts and timestamp
|
|
546
|
+
"""
|
|
547
|
+
scan_root = scan_root.resolve()
|
|
548
|
+
out_dir = scan_root / ".jarvis" / "c2rust"
|
|
549
|
+
out_dir.mkdir(parents=True, exist_ok=True)
|
|
550
|
+
|
|
551
|
+
# JSONL/JSON outputs (symbols only)
|
|
552
|
+
symbols_raw_jsonl = out_dir / "symbols_raw.jsonl"
|
|
553
|
+
symbols_curated_jsonl = out_dir / "symbols.jsonl"
|
|
554
|
+
meta_json = out_dir / "meta.json"
|
|
555
|
+
|
|
556
|
+
# Prepare libclang
|
|
557
|
+
cindex = _try_import_libclang()
|
|
558
|
+
# Fallback safeguard: if loader returned None, try importing directly
|
|
559
|
+
if cindex is None:
|
|
560
|
+
try:
|
|
561
|
+
from clang import cindex as _ci # type: ignore
|
|
562
|
+
cindex = _ci
|
|
563
|
+
except Exception as e:
|
|
564
|
+
raise RuntimeError(f"Failed to load libclang bindings: {e}")
|
|
565
|
+
|
|
566
|
+
# Preflight check: verify libclang/python bindings compatibility before scanning
|
|
567
|
+
try:
|
|
568
|
+
_ = cindex.Index.create()
|
|
569
|
+
except Exception as e:
|
|
570
|
+
msg = str(e)
|
|
571
|
+
if "undefined symbol" in msg:
|
|
572
|
+
# Try to suggest a better libclang candidate that contains the missing symbol
|
|
573
|
+
def _has_symbol(lib_path: str, symbol: str) -> bool:
|
|
574
|
+
try:
|
|
575
|
+
import ctypes
|
|
576
|
+
lib = ctypes.CDLL(lib_path)
|
|
577
|
+
getattr(lib, symbol)
|
|
578
|
+
return True
|
|
579
|
+
except Exception:
|
|
580
|
+
return False
|
|
581
|
+
|
|
582
|
+
# Build candidate search dirs (Linux/macOS)
|
|
583
|
+
import platform as _platform
|
|
584
|
+
sys_name = _platform.system()
|
|
585
|
+
candidates: List[str] = []
|
|
586
|
+
if sys_name == "Linux":
|
|
587
|
+
candidates = [
|
|
588
|
+
"/usr/lib/llvm-21/lib/libclang.so",
|
|
589
|
+
"/usr/lib/llvm-20/lib/libclang.so",
|
|
590
|
+
"/usr/lib/llvm-19/lib/libclang.so",
|
|
591
|
+
"/usr/lib/llvm-18/lib/libclang.so",
|
|
592
|
+
"/usr/lib/llvm-17/lib/libclang.so",
|
|
593
|
+
"/usr/lib/llvm-16/lib/libclang.so",
|
|
594
|
+
"/usr/lib/libclang.so",
|
|
595
|
+
"/usr/local/lib/libclang.so",
|
|
596
|
+
]
|
|
597
|
+
elif sys_name == "Darwin":
|
|
598
|
+
# Homebrew locations
|
|
599
|
+
candidates = [
|
|
600
|
+
"/opt/homebrew/opt/llvm/lib/libclang.dylib",
|
|
601
|
+
"/usr/local/opt/llvm/lib/libclang.dylib",
|
|
602
|
+
]
|
|
603
|
+
|
|
604
|
+
good = [p for p in candidates if Path(p).exists() and _has_symbol(p, "clang_getOffsetOfBase")]
|
|
605
|
+
hint = ""
|
|
606
|
+
if good:
|
|
607
|
+
hint = f"\n建议的包含所需符号的库:\n export CLANG_LIBRARY_FILE={good[0]}\n然后重新运行: jarvis-c2rust scan -r {scan_root}"
|
|
608
|
+
|
|
609
|
+
typer.secho(
|
|
610
|
+
"[c2rust-scanner] 检测到 libclang/python 绑定不匹配 (未定义符号)。"
|
|
611
|
+
f"\n详情: {msg}"
|
|
612
|
+
"\n这通常意味着您的 Python 'clang' 绑定版本高于已安装的 libclang。"
|
|
613
|
+
"\n修复选项:\n"
|
|
614
|
+
"- 安装/更新 libclang 以匹配您 Python 'clang' 的主版本 (例如 16-21)。\n"
|
|
615
|
+
"- 或将 Python 'clang' 版本固定为与系统 libclang 匹配 (例如 pip install 'clang>=16,<22')。\n"
|
|
616
|
+
"- 或设置 CLANG_LIBRARY_FILE 指向匹配的 libclang 共享库。\n"
|
|
617
|
+
f"{hint}",
|
|
618
|
+
fg=typer.colors.RED,
|
|
619
|
+
err=True,
|
|
620
|
+
)
|
|
621
|
+
raise typer.Exit(code=2)
|
|
622
|
+
else:
|
|
623
|
+
# Other initialization errors: surface and exit
|
|
624
|
+
typer.secho(f"[c2rust-scanner] libclang 初始化失败: {e}", fg=typer.colors.RED, err=True)
|
|
625
|
+
raise typer.Exit(code=2)
|
|
626
|
+
|
|
627
|
+
# compile_commands
|
|
628
|
+
cc_file = find_compile_commands(scan_root)
|
|
629
|
+
cc_args_map: Dict[str, List[str]] = {}
|
|
630
|
+
if cc_file:
|
|
631
|
+
cc_args_map = load_compile_commands(cc_file)
|
|
632
|
+
|
|
633
|
+
# default args: at least include root dir to help header resolution
|
|
634
|
+
default_args = ["-I", str(scan_root)]
|
|
635
|
+
|
|
636
|
+
files = list(iter_source_files(scan_root))
|
|
637
|
+
total_files = len(files)
|
|
638
|
+
print(f"[c2rust-scanner] 正在扫描 {scan_root} 目录下的 {total_files} 个文件")
|
|
639
|
+
|
|
640
|
+
scanned = 0
|
|
641
|
+
total_functions = 0
|
|
642
|
+
total_types = 0
|
|
643
|
+
|
|
644
|
+
# JSONL writers
|
|
645
|
+
now_ts = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
|
|
646
|
+
sym_id_seq = 1
|
|
647
|
+
|
|
648
|
+
def _fn_record(fn: FunctionInfo, id_val: int) -> Dict[str, Any]:
|
|
649
|
+
return {
|
|
650
|
+
"id": id_val,
|
|
651
|
+
"name": fn.name,
|
|
652
|
+
"qualified_name": fn.qualified_name,
|
|
653
|
+
"signature": fn.signature,
|
|
654
|
+
"return_type": fn.return_type,
|
|
655
|
+
"params": fn.params,
|
|
656
|
+
"ref": fn.calls, # unified field: referenced functions/types
|
|
657
|
+
"file": fn.file,
|
|
658
|
+
"start_line": fn.start_line,
|
|
659
|
+
"start_col": fn.start_col,
|
|
660
|
+
"end_line": fn.end_line,
|
|
661
|
+
"end_col": fn.end_col,
|
|
662
|
+
"language": fn.language,
|
|
663
|
+
"created_at": now_ts,
|
|
664
|
+
"updated_at": now_ts,
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
def _tp_record(tp: TypeInfo, id_val: int) -> Dict[str, Any]:
|
|
668
|
+
# For types, 'ref' 表示引用到的类型集合;当前最小实现:若为typedef/alias则包含 underlying_type
|
|
669
|
+
refs: List[str] = []
|
|
670
|
+
if tp.underlying_type:
|
|
671
|
+
try:
|
|
672
|
+
s = str(tp.underlying_type).strip()
|
|
673
|
+
if s:
|
|
674
|
+
refs.append(s)
|
|
675
|
+
except Exception:
|
|
676
|
+
pass
|
|
677
|
+
return {
|
|
678
|
+
"id": id_val,
|
|
679
|
+
"name": tp.name,
|
|
680
|
+
"qualified_name": tp.qualified_name,
|
|
681
|
+
"kind": tp.kind,
|
|
682
|
+
"underlying_type": tp.underlying_type,
|
|
683
|
+
"ref": refs,
|
|
684
|
+
"file": tp.file,
|
|
685
|
+
"start_line": tp.start_line,
|
|
686
|
+
"start_col": tp.start_col,
|
|
687
|
+
"end_line": tp.end_line,
|
|
688
|
+
"end_col": tp.end_col,
|
|
689
|
+
"language": tp.language,
|
|
690
|
+
"created_at": now_ts,
|
|
691
|
+
"updated_at": now_ts,
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
# Unified symbol records (functions and types)
|
|
695
|
+
def _sym_record_from_function(fn: FunctionInfo, id_val: int) -> Dict[str, Any]:
|
|
696
|
+
return {
|
|
697
|
+
"id": id_val,
|
|
698
|
+
"category": "function",
|
|
699
|
+
"name": fn.name,
|
|
700
|
+
"qualified_name": fn.qualified_name,
|
|
701
|
+
"signature": fn.signature,
|
|
702
|
+
"return_type": fn.return_type,
|
|
703
|
+
"params": fn.params,
|
|
704
|
+
"ref": fn.calls,
|
|
705
|
+
"file": fn.file,
|
|
706
|
+
"start_line": fn.start_line,
|
|
707
|
+
"start_col": fn.start_col,
|
|
708
|
+
"end_line": fn.end_line,
|
|
709
|
+
"end_col": fn.end_col,
|
|
710
|
+
"language": fn.language,
|
|
711
|
+
"created_at": now_ts,
|
|
712
|
+
"updated_at": now_ts,
|
|
713
|
+
}
|
|
714
|
+
|
|
715
|
+
def _sym_record_from_type(tp: TypeInfo, id_val: int) -> Dict[str, Any]:
|
|
716
|
+
refs_t: List[str] = []
|
|
717
|
+
if tp.underlying_type:
|
|
718
|
+
try:
|
|
719
|
+
s = str(tp.underlying_type).strip()
|
|
720
|
+
if s:
|
|
721
|
+
refs_t.append(s)
|
|
722
|
+
except Exception:
|
|
723
|
+
pass
|
|
724
|
+
return {
|
|
725
|
+
"id": id_val,
|
|
726
|
+
"category": "type",
|
|
727
|
+
"name": tp.name,
|
|
728
|
+
"qualified_name": tp.qualified_name,
|
|
729
|
+
"kind": tp.kind,
|
|
730
|
+
"underlying_type": tp.underlying_type,
|
|
731
|
+
"ref": refs_t,
|
|
732
|
+
"file": tp.file,
|
|
733
|
+
"start_line": tp.start_line,
|
|
734
|
+
"start_col": tp.start_col,
|
|
735
|
+
"end_line": tp.end_line,
|
|
736
|
+
"end_col": tp.end_col,
|
|
737
|
+
"language": tp.language,
|
|
738
|
+
"created_at": now_ts,
|
|
739
|
+
"updated_at": now_ts,
|
|
740
|
+
}
|
|
741
|
+
|
|
742
|
+
# Open JSONL file (symbols only)
|
|
743
|
+
f_sym = symbols_raw_jsonl.open("w", encoding="utf-8")
|
|
744
|
+
try:
|
|
745
|
+
for p in files:
|
|
746
|
+
# prefer compile_commands args if available
|
|
747
|
+
args = cc_args_map.get(str(p), default_args)
|
|
748
|
+
try:
|
|
749
|
+
funcs = scan_file(cindex, p, args)
|
|
750
|
+
except Exception as e:
|
|
751
|
+
# If we hit undefined symbol, it's a libclang/python bindings mismatch; abort with guidance
|
|
752
|
+
msg = str(e)
|
|
753
|
+
if "undefined symbol" in msg:
|
|
754
|
+
def _has_symbol(lib_path: str, symbol: str) -> bool:
|
|
755
|
+
try:
|
|
756
|
+
import ctypes
|
|
757
|
+
lib = ctypes.CDLL(lib_path)
|
|
758
|
+
getattr(lib, symbol)
|
|
759
|
+
return True
|
|
760
|
+
except Exception:
|
|
761
|
+
return False
|
|
762
|
+
|
|
763
|
+
import platform as _platform
|
|
764
|
+
sys_name = _platform.system()
|
|
765
|
+
candidates: List[str] = []
|
|
766
|
+
if sys_name == "Linux":
|
|
767
|
+
candidates = [
|
|
768
|
+
"/usr/lib/llvm-20/lib/libclang.so",
|
|
769
|
+
"/usr/lib/llvm-19/lib/libclang.so",
|
|
770
|
+
"/usr/lib/llvm-18/lib/libclang.so",
|
|
771
|
+
"/usr/lib/libclang.so",
|
|
772
|
+
"/usr/local/lib/libclang.so",
|
|
773
|
+
]
|
|
774
|
+
elif sys_name == "Darwin":
|
|
775
|
+
candidates = [
|
|
776
|
+
"/opt/homebrew/opt/llvm/lib/libclang.dylib",
|
|
777
|
+
"/usr/local/opt/llvm/lib/libclang.dylib",
|
|
778
|
+
]
|
|
779
|
+
|
|
780
|
+
good = [lp for lp in candidates if Path(lp).exists() and _has_symbol(lp, "clang_getOffsetOfBase")]
|
|
781
|
+
hint = ""
|
|
782
|
+
if good:
|
|
783
|
+
hint = f"\n建议的包含所需符号的库:\n export CLANG_LIBRARY_FILE={good[0]}\n然后重新运行: jarvis-c2rust scan -r {scan_root}"
|
|
784
|
+
|
|
785
|
+
typer.secho(
|
|
786
|
+
"[c2rust-scanner] 解析期间检测到 libclang/python 绑定不匹配 (未定义符号)。"
|
|
787
|
+
f"\n详情: {msg}"
|
|
788
|
+
"\n这通常意味着您的 Python 'clang' 绑定版本高于已安装的 libclang。"
|
|
789
|
+
"\n修复选项:\n"
|
|
790
|
+
"- 安装/更新 libclang 以匹配您 Python 'clang' 的主版本 (例如 19/20)。\n"
|
|
791
|
+
"- 或将 Python 'clang' 版本固定为与系统 libclang 匹配 (例如 pip install 'clang==18.*')。\n"
|
|
792
|
+
"- 或设置 CLANG_LIBRARY_FILE 指向匹配的 libclang 共享库。\n"
|
|
793
|
+
f"{hint}",
|
|
794
|
+
fg=typer.colors.RED,
|
|
795
|
+
err=True,
|
|
796
|
+
)
|
|
797
|
+
raise typer.Exit(code=2)
|
|
798
|
+
|
|
799
|
+
# Try without args as fallback for regular parse errors
|
|
800
|
+
try:
|
|
801
|
+
funcs = scan_file(cindex, p, [])
|
|
802
|
+
except Exception:
|
|
803
|
+
print(f"[c2rust-scanner] 解析 {p} 失败: {e}", file=sys.stderr)
|
|
804
|
+
continue
|
|
805
|
+
|
|
806
|
+
# Write JSONL
|
|
807
|
+
for fn in funcs:
|
|
808
|
+
# write unified symbol record
|
|
809
|
+
srec = _sym_record_from_function(fn, sym_id_seq)
|
|
810
|
+
f_sym.write(json.dumps(srec, ensure_ascii=False) + "\n")
|
|
811
|
+
# increase sequences
|
|
812
|
+
sym_id_seq += 1
|
|
813
|
+
total_functions += len(funcs)
|
|
814
|
+
|
|
815
|
+
# Scan types in this file
|
|
816
|
+
try:
|
|
817
|
+
types = scan_types_file(cindex, p, args)
|
|
818
|
+
except Exception:
|
|
819
|
+
try:
|
|
820
|
+
types = scan_types_file(cindex, p, [])
|
|
821
|
+
except Exception:
|
|
822
|
+
types = []
|
|
823
|
+
|
|
824
|
+
for t in types:
|
|
825
|
+
# write unified symbol record
|
|
826
|
+
srec_t = _sym_record_from_type(t, sym_id_seq)
|
|
827
|
+
f_sym.write(json.dumps(srec_t, ensure_ascii=False) + "\n")
|
|
828
|
+
# increase sequences
|
|
829
|
+
sym_id_seq += 1
|
|
830
|
+
total_types += len(types)
|
|
831
|
+
|
|
832
|
+
scanned += 1
|
|
833
|
+
if scanned % 20 == 0 or scanned == total_files:
|
|
834
|
+
print(f"[c2rust-scanner] 进度: {scanned}/{total_files} 个文件, {total_functions} 个函数, {total_types} 个类型")
|
|
835
|
+
finally:
|
|
836
|
+
try:
|
|
837
|
+
f_sym.close()
|
|
838
|
+
except Exception:
|
|
839
|
+
pass
|
|
840
|
+
|
|
841
|
+
# Write meta.json
|
|
842
|
+
meta = {
|
|
843
|
+
"functions": total_functions,
|
|
844
|
+
"types": total_types,
|
|
845
|
+
"symbols": total_functions + total_types,
|
|
846
|
+
"generated_at": now_ts,
|
|
847
|
+
"schema_version": 1,
|
|
848
|
+
"source_root": str(scan_root),
|
|
849
|
+
}
|
|
850
|
+
try:
|
|
851
|
+
meta_json.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
852
|
+
except Exception:
|
|
853
|
+
pass
|
|
854
|
+
|
|
855
|
+
print(f"[c2rust-scanner] 完成。收集到的函数: {total_functions}, 类型: {total_types}, 符号: {total_functions + total_types}")
|
|
856
|
+
print(f"[c2rust-scanner] JSONL 已写入: {symbols_raw_jsonl} (原始符号)")
|
|
857
|
+
# 同步生成基线 symbols.jsonl(与 raw 等价),便于后续流程仅基于 symbols.jsonl 运行
|
|
858
|
+
try:
|
|
859
|
+
shutil.copy2(symbols_raw_jsonl, symbols_curated_jsonl)
|
|
860
|
+
print(f"[c2rust-scanner] JSONL 基线已写入: {symbols_curated_jsonl} (用于后续流程)")
|
|
861
|
+
except Exception as _e:
|
|
862
|
+
typer.secho(f"[c2rust-scanner] 生成 symbols.jsonl 失败: {_e}", fg=typer.colors.RED, err=True)
|
|
863
|
+
raise
|
|
864
|
+
print(f"[c2rust-scanner] 元数据已写入: {meta_json}")
|
|
865
|
+
return symbols_raw_jsonl
|
|
866
|
+
|
|
867
|
+
# ---------------------------
|
|
868
|
+
# Type scanning
|
|
869
|
+
# ---------------------------
|
|
870
|
+
@dataclass
class TypeInfo:
    """Location and identity metadata for one C/C++ type declaration.

    Produced by scan_types_file(); fields mirror the shape of the unified
    symbol records written to the symbols JSONL files.
    """

    # Unqualified spelling of the declaration (may be empty for anonymous types).
    name: str
    # Fully qualified name (enclosing scopes joined, e.g. with '::').
    qualified_name: str
    # Lower-cased libclang cursor kind, e.g. "struct_decl", "typedef_decl".
    kind: str
    # For typedef/alias declarations: spelling of the aliased type; "" otherwise.
    underlying_type: str
    # Path of the source file the declaration was found in.
    file: str
    # 1-based source extent of the declaration.
    start_line: int
    start_col: int
    end_line: int
    end_col: int
    # Source language as reported by lang_from_cursor (e.g. C vs C++).
    language: str
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
# Cursor-kind names (clang.cindex CursorKind .name values) that count as
# type declarations worth collecting during the type scan.
TYPE_KINDS: Set[str] = {
    "STRUCT_DECL",
    "UNION_DECL",
    "ENUM_DECL",
    "CXX_RECORD_DECL",  # C++ class/struct/union
    "TYPEDEF_DECL",
    "TYPE_ALIAS_DECL",
}
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
def scan_types_file(cindex, file_path: Path, args: List[str]) -> List[TypeInfo]:
    """Parse *file_path* with libclang and collect type declarations defined in it.

    Only declarations physically located in *file_path* are collected; nodes
    from included headers are skipped, but their subtrees are still walked so
    nothing nested is missed.

    Args:
        cindex: the configured clang.cindex module.
        file_path: source file to parse.
        args: compiler arguments for the translation unit.

    Returns:
        TypeInfo entries in pre-order AST traversal order.
    """
    index = cindex.Index.create()
    tu = index.parse(
        str(file_path),
        args=args,
        options=0,
    )
    types: List[TypeInfo] = []
    # Hoist the resolved target path out of the visitor: resolve() touches the
    # filesystem and visit() runs once per AST node.
    target_path = file_path.resolve()

    def visit(node):
        kind = node.kind.name
        # Filter by file: skip nodes outside the scanned file but keep descending.
        loc_file = node.location.file
        if loc_file is None or Path(loc_file.name).resolve() != target_path:
            for ch in node.get_children():
                visit(ch)
            return

        if kind in TYPE_KINDS:
            # Records/enums must be full definitions; typedef/alias inherently are.
            need_def = kind in {"STRUCT_DECL", "UNION_DECL", "ENUM_DECL", "CXX_RECORD_DECL"}
            if (not need_def) or node.is_definition():
                try:
                    name = node.spelling or ""
                    qualified_name = get_qualified_name(node)
                    underlying = ""
                    if kind in {"TYPEDEF_DECL", "TYPE_ALIAS_DECL"}:
                        try:
                            underlying = node.underlying_typedef_type.spelling or ""
                        except Exception:
                            underlying = ""
                    extent = node.extent
                    start_line = extent.start.line
                    start_col = extent.start.column
                    end_line = extent.end.line
                    end_col = extent.end.column
                    language = lang_from_cursor(node)
                    ti = TypeInfo(
                        name=name,
                        qualified_name=qualified_name,
                        kind=kind.lower(),
                        underlying_type=underlying,
                        file=str(file_path),
                        start_line=start_line,
                        start_col=start_col,
                        end_line=end_line,
                        end_col=end_col,
                        language=language,
                    )
                    types.append(ti)
                except Exception:
                    # Best-effort: a single malformed node must not abort the scan.
                    pass

        for ch in node.get_children():
            visit(ch)

    visit(tu.cursor)
    return types
|
|
954
|
+
|
|
955
|
+
|
|
956
|
+
# ---------------------------
|
|
957
|
+
# CLI and DOT export
|
|
958
|
+
# ---------------------------
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
def generate_dot_from_db(db_path: Path, out_path: Optional[Path]) -> Path:
    """Generate a global reference dependency graph (DOT) from symbols.jsonl.

    *db_path* may be a .jsonl file, a project directory (resolved to
    <dir>/.jarvis/c2rust/symbols.jsonl), or anything else (falls back to
    ./.jarvis/c2rust/symbols.jsonl).  When *out_path* is None the graph is
    written next to the symbol table as global_refgraph.dot.

    Fix: the original annotated the return as None and the parameter as a
    bare Path, although it returns the written file path and None-checks
    out_path; annotations now match the actual behavior.

    Returns:
        The path of the written DOT file.

    Raises:
        FileNotFoundError: if the symbol table cannot be located.
    """
    def _resolve_symbols_jsonl_path(hint: Path) -> Path:
        p = Path(hint)
        # Accept a .jsonl file passed in directly.
        if p.is_file() and p.suffix.lower() == ".jsonl":
            return p
        # Standard per-project location: <dir>/.jarvis/c2rust/symbols.jsonl
        if p.is_dir():
            prefer = p / ".jarvis" / "c2rust" / "symbols.jsonl"
            return prefer
        # Default: <cwd>/.jarvis/c2rust/symbols.jsonl
        return Path(".") / ".jarvis" / "c2rust" / "symbols.jsonl"

    sjsonl = _resolve_symbols_jsonl_path(db_path)
    if not sjsonl.exists():
        raise FileNotFoundError(f"未找到 symbols.jsonl: {sjsonl}")

    # Load symbols (functions and types), unified handling (no category filtering)
    by_id: Dict[int, Dict[str, Any]] = {}
    name_to_id: Dict[str, int] = {}
    adj_names: Dict[int, List[str]] = {}
    with open(sjsonl, "r", encoding="utf-8") as f:
        idx = 0
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            idx += 1
            # Fall back to the parse-order index when a record has no usable id.
            fid = int(obj.get("id") or idx)
            nm = obj.get("name") or ""
            qn = obj.get("qualified_name") or ""
            sig = obj.get("signature") or ""
            refs = obj.get("ref")
            if not isinstance(refs, list):
                refs = []
            refs = [c for c in refs if isinstance(c, str) and c]

            by_id[fid] = {"name": nm, "qname": qn, "sig": sig}
            if nm:
                name_to_id.setdefault(nm, fid)
            if qn:
                name_to_id.setdefault(qn, fid)
            adj_names[fid] = refs

    # Convert name-based adjacency to id-based adjacency (internal edges only)
    adj_ids: Dict[int, List[int]] = {}
    all_ids: List[int] = sorted(by_id.keys())
    for src in all_ids:
        internal: List[int] = []
        for target in adj_names.get(src, []):
            tid = name_to_id.get(target)
            if tid is not None and tid != src:
                internal.append(tid)
        try:
            # Deduplicate while preserving first-seen order.
            internal = list(dict.fromkeys(internal))
        except Exception:
            internal = sorted(list(set(internal)))
        adj_ids[src] = internal

    def base_label(fid: int) -> str:
        """Node label: qualified name (or name), plus signature when distinct."""
        meta = by_id.get(fid, {})
        base = meta.get("qname") or meta.get("name") or f"sym_{fid}"
        sig = meta.get("sig") or ""
        if sig and sig != base:
            return f"{base}\\n{sig}"
        return base

    # Prepare output path
    if out_path is None:
        out_path = sjsonl.parent / "global_refgraph.dot"
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Write global DOT
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("digraph refgraph {\n")
        f.write(" rankdir=LR;\n")
        f.write(" graph [fontsize=10];\n")
        f.write(" node [fontsize=10];\n")
        f.write(" edge [fontsize=9];\n")

        # Nodes
        for fid in all_ids:
            lbl = base_label(fid)
            safe_label = lbl.replace("\\", "\\\\").replace('"', '\\"')
            f.write(f' n{fid} [label="{safe_label}", shape=box];\n')

        # Edges
        for src in all_ids:
            for dst in adj_ids.get(src, []):
                f.write(f" n{src} -> n{dst};\n")

        f.write("}\n")

    return out_path
|
|
1061
|
+
|
|
1062
|
+
|
|
1063
|
+
def find_root_function_ids(db_path: Path) -> List[int]:
    """Return ids of symbols that no other symbol references.

    Reads symbols.jsonl (or the given .jsonl path); strictly uses the
    ``ref`` field and treats functions and types uniformly.
    """

    def _locate(hint: Path) -> Path:
        candidate = Path(hint)
        if candidate.is_file() and candidate.suffix.lower() == ".jsonl":
            return candidate
        if candidate.is_dir():
            return candidate / ".jarvis" / "c2rust" / "symbols.jsonl"
        # Default: project-local .jarvis/c2rust/symbols.jsonl
        return Path(".") / ".jarvis" / "c2rust" / "symbols.jsonl"

    table = _locate(db_path)
    if not table.exists():
        raise FileNotFoundError(f"未找到 symbols.jsonl: {table}")

    # Parse every well-formed record as (id, name, qualified_name, refs).
    entries: List[Any] = []
    with open(table, "r", encoding="utf-8") as fh:
        seq = 0
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                rec = json.loads(raw)
            except Exception:
                continue
            seq += 1
            sid = int(rec.get("id") or seq)
            nm = rec.get("name") or ""
            qn = rec.get("qualified_name") or ""
            rf = rec.get("ref")
            if not isinstance(rf, list):
                rf = []
            entries.append((sid, nm, qn, [x for x in rf if isinstance(x, str) and x]))

    # First occurrence of a name/qualified-name wins the id mapping.
    lookup: Dict[str, int] = {}
    ids: Set[int] = set()
    for sid, nm, qn, _ in entries:
        sid = int(sid)
        ids.add(sid)
        if isinstance(nm, str) and nm:
            lookup.setdefault(nm, sid)
        if isinstance(qn, str) and qn:
            lookup.setdefault(qn, sid)

    # Anything referenced by a different symbol is not a root.
    referenced: Set[int] = set()
    for sid, _nm, _qn, rf in entries:
        for ref_name in rf:
            hit = lookup.get(ref_name)
            if hit is not None and hit != sid:
                referenced.add(hit)

    return sorted(ids - referenced)
|
|
1123
|
+
|
|
1124
|
+
|
|
1125
|
+
def compute_translation_order_jsonl(db_path: Path, out_path: Optional[Path] = None) -> Path:
    """
    Compute translation order on reference graph and write order to JSONL.
    Data source: symbols.jsonl (or provided .jsonl path), strictly using ref field and including all symbols.
    Output:
    Each line is a JSON object:
    {
    "step": int,
    "ids": [symbol_id, ...],
    "group": bool,
    "roots": [root_id],  # root this step is attributed to (empty if residual)
    "created_at": "YYYY-MM-DDTHH:MM:SS"
    }
    """
    def _resolve_symbols_jsonl_path(hint: Path) -> Path:
        # Accept a .jsonl file directly, a directory (standard location),
        # or fall back to the project-local default.
        p = Path(hint)
        if p.is_file() and p.suffix.lower() == ".jsonl":
            return p
        if p.is_dir():
            prefer = p / ".jarvis" / "c2rust" / "symbols.jsonl"
            return prefer
        return Path(".") / ".jarvis" / "c2rust" / "symbols.jsonl"

    fjsonl = _resolve_symbols_jsonl_path(db_path)
    if not fjsonl.exists():
        raise FileNotFoundError(f"未找到 symbols.jsonl: {fjsonl}")

    # Load symbols and build name-based adjacency from ref
    by_id: Dict[int, Dict[str, Any]] = {}
    name_to_id: Dict[str, int] = {}
    adj_names: Dict[int, List[str]] = {}
    with open(fjsonl, "r", encoding="utf-8") as f:
        idx = 0
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            idx += 1
            # Fall back to the parse-order index when a record lacks an id.
            fid = int(obj.get("id") or idx)
            nm = obj.get("name") or ""
            qn = obj.get("qualified_name") or ""
            refs = obj.get("ref")
            if not isinstance(refs, list):
                refs = []
            refs = [r for r in refs if isinstance(r, str) and r]
            by_id[fid] = {
                "name": nm,
                "qname": qn,
                "cat": (obj.get("category") or ""),
                "file": obj.get("file") or "",
                "start_line": obj.get("start_line"),
                "end_line": obj.get("end_line"),
                "start_col": obj.get("start_col"),
                "end_col": obj.get("end_col"),
                "language": obj.get("language") or "",
                "record": obj,  # embed full symbol record for order file self-containment
            }
            if nm:
                name_to_id.setdefault(nm, fid)
            if qn:
                name_to_id.setdefault(qn, fid)
            adj_names[fid] = refs

    # Convert to id-based adjacency (internal edges only)
    adj_ids: Dict[int, List[int]] = {}
    all_ids: List[int] = sorted(by_id.keys())
    for src in all_ids:
        internal: List[int] = []
        for target in adj_names.get(src, []):
            tid = name_to_id.get(target)
            if tid is not None and tid != src:
                internal.append(tid)
        try:
            # Deduplicate while preserving first-seen order.
            internal = list(dict.fromkeys(internal))
        except Exception:
            internal = sorted(list(set(internal)))
        adj_ids[src] = internal

    # Roots by incoming degree (no incoming)
    try:
        roots = find_root_function_ids(fjsonl)
    except Exception:
        roots = []

    # Tarjan SCC: groups mutually-recursive symbols into one component so a
    # cycle can be emitted as a single translation step.
    index_counter = 0
    stack: List[int] = []
    onstack: Set[int] = set()
    indices: Dict[int, int] = {}
    lowlinks: Dict[int, int] = {}
    sccs: List[List[int]] = []

    def strongconnect(v: int) -> None:
        # Standard recursive Tarjan; lowlink tracks the smallest index
        # reachable from v's DFS subtree.
        nonlocal index_counter, stack
        indices[v] = index_counter
        lowlinks[v] = index_counter
        index_counter += 1
        stack.append(v)
        onstack.add(v)

        for w in adj_ids.get(v, []):
            if w not in indices:
                strongconnect(w)
                lowlinks[v] = min(lowlinks[v], lowlinks[w])
            elif w in onstack:
                lowlinks[v] = min(lowlinks[v], indices[w])

        if lowlinks[v] == indices[v]:
            # v is the root of an SCC: pop the whole component off the stack.
            comp: List[int] = []
            while True:
                w = stack.pop()
                onstack.discard(w)
                comp.append(w)
                if w == v:
                    break
            sccs.append(sorted(comp))

    for node in all_ids:
        if node not in indices:
            strongconnect(node)

    # Component DAG (reversed: dependency -> dependent) for leaves-first order
    id2comp: Dict[int, int] = {}
    for i, comp in enumerate(sccs):
        for nid in comp:
            id2comp[nid] = i

    comp_count = len(sccs)
    comp_rev_adj: Dict[int, Set[int]] = {i: set() for i in range(comp_count)}
    indeg: Dict[int, int] = {i: 0 for i in range(comp_count)}
    for u in all_ids:
        cu = id2comp[u]
        for v in adj_ids.get(u, []):
            cv = id2comp[v]
            if cu != cv:
                if cu not in comp_rev_adj[cv]:
                    comp_rev_adj[cv].add(cu)
    for cv, succs in comp_rev_adj.items():
        for cu in succs:
            indeg[cu] += 1

    # Kahn on reversed DAG
    from collections import deque
    q = deque(sorted([i for i in range(comp_count) if indeg[i] == 0]))
    comp_order: List[int] = []
    while q:
        c = q.popleft()
        comp_order.append(c)
        for nxt in sorted(comp_rev_adj.get(c, set())):
            indeg[nxt] -= 1
            if indeg[nxt] == 0:
                q.append(nxt)

    # Defensive fallback: should only trigger if the component graph was not
    # fully drained (e.g. unexpected residual cycles among components).
    if len(comp_order) < comp_count:
        remaining = [i for i in range(comp_count) if i not in comp_order]
        comp_order.extend(sorted(remaining))

    # Emit steps by root priority
    emitted: Set[int] = set()
    steps: List[Dict[str, Any]] = []
    now_ts = time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())

    # precompute reachability per root
    def _reachable(start_id: int) -> Set[int]:
        # Iterative DFS over the id-based adjacency.
        visited: Set[int] = set()
        stack2: List[int] = [start_id]
        visited.add(start_id)
        while stack2:
            s = stack2.pop()
            for v in adj_ids.get(s, []):
                if v not in visited:
                    visited.add(v)
                    stack2.append(v)
        return visited

    root_reach: Dict[int, Set[int]] = {rid: _reachable(rid) for rid in roots}

    def _emit_for_root(root_id: Optional[int]) -> None:
        # Emit order per root follows leaves-first (on reversed component DAG),
        # but delay entry functions (e.g., main) to the end if they are singleton components.
        # root_id is None for the residual pass over symbols unreachable from any root.
        reach = root_reach.get(root_id, set()) if root_id is not None else None

        def _is_entry(nid: int) -> bool:
            meta = by_id.get(nid, {})
            nm = str(meta.get("name") or "").lower()
            qn = str(meta.get("qname") or "").lower()
            # Configurable delayed entry symbols via env:
            # - JARVIS_C2RUST_DELAY_ENTRY_SYMBOLS
            # - JARVIS_C2RUST_DELAY_ENTRIES
            # - C2RUST_DELAY_ENTRIES
            entries_env = os.environ.get("JARVIS_C2RUST_DELAY_ENTRY_SYMBOLS") or \
                os.environ.get("JARVIS_C2RUST_DELAY_ENTRIES") or \
                os.environ.get("C2RUST_DELAY_ENTRIES") or ""
            entries_set = set()
            if entries_env:
                try:
                    import re as _re
                    parts = _re.split(r"[,\s;]+", entries_env.strip())
                except Exception:
                    parts = [p.strip() for p in entries_env.replace(";", ",").split(",")]
                entries_set = {p.strip().lower() for p in parts if p and p.strip()}
            # If configured, use the provided entries; otherwise fallback to default 'main'
            if entries_set:
                return (nm in entries_set) or (qn in entries_set)
            return nm == "main" or qn == "main" or qn.endswith("::main")

        delayed_entries: List[int] = []

        for comp_idx in comp_order:
            comp_nodes = sccs[comp_idx]
            selected: List[int] = []
            # Select nodes for this component, deferring entry (main) if safe to do so
            for nid in comp_nodes:
                if nid in emitted:
                    continue
                if reach is not None and nid not in reach:
                    continue
                # Skip type symbols in order emission (types don't require translation steps)
                meta_n = by_id.get(nid, {})
                if str(meta_n.get("cat") or "") == "type":
                    continue
                # Only delay entry when the SCC is a singleton to avoid breaking intra-SCC semantics
                if _is_entry(nid) and len(comp_nodes) == 1:
                    delayed_entries.append(nid)
                else:
                    selected.append(nid)

            if selected:
                for nid in selected:
                    emitted.add(nid)
                syms: List[str] = []
                for nid in sorted(selected):
                    meta = by_id.get(nid, {})
                    label = meta.get("qname") or meta.get("name") or f"sym_{nid}"
                    syms.append(label)
                roots_labels: List[str] = []
                if root_id is not None:
                    meta_r = by_id.get(root_id, {})
                    rlabel = meta_r.get("qname") or meta_r.get("name") or f"sym_{root_id}"
                    roots_labels = [rlabel]
                steps.append({
                    "step": len(steps) + 1,
                    "ids": sorted(selected),
                    "items": [by_id.get(nid, {}).get("record") for nid in sorted(selected) if isinstance(by_id.get(nid, {}).get("record"), dict)],
                    "symbols": syms,
                    "group": len(syms) > 1,
                    "roots": roots_labels,
                    "created_at": now_ts,
                })

        # Emit delayed entry functions as the final step for this root
        if delayed_entries:
            for nid in delayed_entries:
                emitted.add(nid)
            syms: List[str] = []
            for nid in sorted(delayed_entries):
                meta = by_id.get(nid, {})
                label = meta.get("qname") or meta.get("name") or f"sym_{nid}"
                syms.append(label)
            roots_labels: List[str] = []
            if root_id is not None:
                meta_r = by_id.get(root_id, {})
                rlabel = meta_r.get("qname") or meta_r.get("name") or f"sym_{root_id}"
                roots_labels = [rlabel]
            steps.append({
                "step": len(steps) + 1,
                "ids": sorted(delayed_entries),
                "items": [by_id.get(nid, {}).get("record") for nid in sorted(delayed_entries) if isinstance(by_id.get(nid, {}).get("record"), dict)],
                "symbols": syms,
                "group": len(syms) > 1,
                "roots": roots_labels,
                "created_at": now_ts,
            })

    # Largest root subtrees first, then a residual pass for unreachable symbols.
    for rid in sorted(roots, key=lambda r: len(root_reach.get(r, set())), reverse=True):
        _emit_for_root(rid)
    _emit_for_root(None)

    if out_path is None:
        # Choose the output filename from the input symbol table:
        # - symbols_raw.jsonl -> translation_order_raw.jsonl (raw order from the scan stage)
        # - anything else (e.g. symbols.jsonl / curated) -> translation_order.jsonl (default)
        base = "translation_order.jsonl"
        try:
            name = Path(fjsonl).name.lower()
            if "symbols_raw.jsonl" in name:
                base = "translation_order_raw.jsonl"
        except Exception:
            pass
        out_path = fjsonl.parent / base
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Purge redundant fields before writing (keep ids and records; drop symbols/items)
    try:
        # Keep items (full symbol records incl. replacement info); only drop
        # the redundant textual symbol labels.
        steps = [dict((k, v) for k, v in st.items() if k not in ("symbols",)) for st in steps]
    except Exception:
        pass
    with open(out_path, "w", encoding="utf-8") as fo:
        for st in steps:
            fo.write(json.dumps(st, ensure_ascii=False) + "\n")
    return out_path
|
|
1431
|
+
|
|
1432
|
+
|
|
1433
|
+
def export_root_subgraphs_to_dir(db_path: Path, out_dir: Path) -> List[Path]:
    """Write one per-root reference-subgraph DOT file into *out_dir*.

    Functions and types are treated uniformly; external references (names
    not resolvable to a scanned symbol) become dashed ellipse nodes.
    Returns the list of DOT files written.
    """

    def _locate_table(hint: Path) -> Path:
        cand = Path(hint)
        if cand.is_file() and cand.suffix.lower() == ".jsonl":
            return cand
        if cand.is_dir():
            return cand / ".jarvis" / "c2rust" / "symbols.jsonl"
        return Path(".") / ".jarvis" / "c2rust" / "symbols.jsonl"

    table = _locate_table(db_path)
    if not table.exists():
        raise FileNotFoundError(f"未找到 symbols.jsonl: {table}")

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Unified symbol metadata, name -> id lookup, and name-based adjacency.
    meta_by_id: Dict[int, Dict[str, str]] = {}
    id_by_name: Dict[str, int] = {}
    refs_by_id: Dict[int, List[str]] = {}

    with open(table, "r", encoding="utf-8") as fh:
        seq = 0
        for raw in fh:
            raw = raw.strip()
            if not raw:
                continue
            try:
                rec = json.loads(raw)
            except Exception:
                continue
            seq += 1
            # Include every symbol; fall back to the parse index for the id.
            sid = int(rec.get("id") or seq)
            nm = rec.get("name") or ""
            qn = rec.get("qualified_name") or ""
            sig = rec.get("signature") or ""
            ref_list = rec.get("ref")
            if not isinstance(ref_list, list):
                ref_list = []

            meta_by_id[sid] = {"name": nm, "qname": qn, "sig": sig}
            if nm:
                id_by_name.setdefault(nm, sid)
            if qn:
                id_by_name.setdefault(qn, sid)
            refs_by_id[sid] = [r for r in ref_list if isinstance(r, str) and r]

    def _label_for(sid: int) -> str:
        info = meta_by_id.get(sid, {})
        text = info.get("qname") or info.get("name") or f"sym_{sid}"
        sig = info.get("sig") or ""
        return f"{text}\\n{sig}" if sig and sig != text else text

    def _safe_name(s: str) -> str:
        if not s:
            return "root"
        s = s.replace("::", "__")
        return "".join(ch if ch.isalnum() or ch in ("_", "-") else "_" for ch in s)[:120]

    written: List[Path] = []

    for rid in find_root_function_ids(db_path):
        # Depth-first walk over internal references starting at this root.
        reach: Set[int] = set()
        pending: List[int] = [rid]
        reach.add(rid)
        while pending:
            cur = pending.pop()
            for callee in refs_by_id.get(cur, []):
                cid = id_by_name.get(callee)
                if cid is not None and cid not in reach:
                    reach.add(cid)
                    pending.append(cid)

        # Collect node labels and edges (internal -> internal/external).
        labels: Dict[str, str] = {}
        ext_by_name: Dict[str, str] = {}
        n_ext = 0
        edge_set = set()

        node_of = {sid: f"n{sid}" for sid in reach}

        for sid in reach:
            labels[node_of[sid]] = _label_for(sid)

        for src in reach:
            src_node = node_of[src]
            for callee in refs_by_id.get(src, []):
                cid = id_by_name.get(callee)
                if cid is not None and cid in reach:
                    edge_set.add((src_node, node_of[cid]))
                else:
                    dst = ext_by_name.get(callee)
                    if dst is None:
                        dst = f"ext{n_ext}"
                        n_ext += 1
                        ext_by_name[callee] = dst
                        labels[dst] = callee
                    edge_set.add((src_node, dst))

        # Write the subgraph DOT file for this root.
        root_info = meta_by_id.get(rid, {})
        root_base = root_info.get("qname") or root_info.get("name") or f"sym_{rid}"
        dot_path = out_dir / f"subgraph_root_{rid}_{_safe_name(root_base)}.dot"
        with open(dot_path, "w", encoding="utf-8") as fh:
            fh.write("digraph refgraph_sub {\n")
            fh.write(" rankdir=LR;\n")
            fh.write(" graph [fontsize=10];\n")
            fh.write(" node [fontsize=10];\n")
            fh.write(" edge [fontsize=9];\n")

            # Nodes: dashed gray ellipses for external references, boxes otherwise.
            for node_id, text in labels.items():
                escaped = text.replace("\\", "\\\\").replace('"', '\\"')
                if node_id.startswith("ext"):
                    fh.write(f' {node_id} [label="{escaped}", shape=ellipse, style=dashed, color=gray50, fontcolor=gray30];\n')
                else:
                    fh.write(f' {node_id} [label="{escaped}", shape=box];\n')

            # Edges, in deterministic sorted order.
            for a, b in sorted(edge_set):
                fh.write(f" {a} -> {b};\n")

            fh.write("}\n")

        written.append(dot_path)

    return written
|
|
1570
|
+
|
|
1571
|
+
|
|
1572
|
+
# ---------------------------
|
|
1573
|
+
# Third-party replacement evaluation
|
|
1574
|
+
# ---------------------------
|
|
1575
|
+
|
|
1576
|
+
def run_scan(
    dot: Optional[Path] = None,
    only_dot: bool = False,
    subgraphs_dir: Optional[Path] = None,
    only_subgraphs: bool = False,
    png: bool = False,
) -> None:
    """Scan C/C++ functions and persist results to JSONL; optionally emit graphs.

    Runs a full directory scan unless one of the ``only_*`` flags is set, in
    which case it only regenerates visualization output from the existing
    curated symbol table. A global call-graph DOT file is written when ``dot``
    is given, per-root subgraph DOT files when ``subgraphs_dir`` is given, and
    each DOT file is additionally rendered to PNG when ``png`` is True
    (requires the Graphviz ``dot`` executable on PATH).

    Args:
        dot: Output path for the global DOT graph, or None to skip it.
        only_dot: Skip rescanning; only regenerate the global DOT output
            (requires ``dot`` to be set).
        subgraphs_dir: Output directory for per-root subgraph DOT files, or
            None to skip them.
        only_subgraphs: Skip rescanning; only regenerate subgraphs (requires
            ``subgraphs_dir`` to be set).
        png: Also render every generated DOT file to PNG.

    Raises:
        typer.Exit: code 1 on scan or DOT/PNG generation failure; code 2 on
            missing prerequisites in only-generate mode.
    """
    root = Path(".")
    # Curated symbol table produced by a prior scan. (The raw table,
    # symbols_raw.jsonl, lives alongside it but is not read here.)
    data_path_curated = root / ".jarvis" / "c2rust" / "symbols.jsonl"

    def _render_dot_to_png(dot_file: Path, png_out: Optional[Path] = None) -> Path:
        """Render *dot_file* to PNG via Graphviz 'dot'; return the PNG path.

        When ``png_out`` is None the PNG is written next to the DOT file with
        a ``.png`` suffix. Raises RuntimeError if Graphviz is unavailable or
        rendering fails.
        """
        from shutil import which
        import subprocess

        exe = which("dot")
        if not exe:
            raise RuntimeError("在 PATH 中未找到 Graphviz 'dot'。请安装 graphviz 并确保 'dot' 可用。")
        dot_file = Path(dot_file)
        png_out = dot_file.with_suffix(".png") if png_out is None else Path(png_out)
        png_out.parent.mkdir(parents=True, exist_ok=True)
        try:
            subprocess.run([exe, "-Tpng", str(dot_file), "-o", str(png_out)], check=True)
        except FileNotFoundError:
            # 'which' can race with PATH changes; keep an explicit message.
            raise RuntimeError("未找到 Graphviz 'dot' 可执行文件。")
        except subprocess.CalledProcessError as e:
            raise RuntimeError(f"'dot' 渲染 {dot_file} 为 PNG 失败: {e}")
        return png_out

    if not (only_dot or only_subgraphs):
        try:
            scan_directory(root)
        except Exception as e:
            typer.secho(f"[c2rust-scanner] 错误: {e}", fg=typer.colors.RED, err=True)
            raise typer.Exit(code=1)
    else:
        # Only-generate mode (no rescan): validate inputs; the optional
        # DOT/subgraph output below is based solely on the existing
        # symbols.jsonl — no translation order is computed here.
        if not data_path_curated.exists():
            typer.secho(f"[c2rust-scanner] 未找到数据: {data_path_curated}", fg=typer.colors.RED, err=True)
            raise typer.Exit(code=2)
        if only_dot and dot is None:
            typer.secho("[c2rust-scanner] --only-dot 需要 --dot 来指定输出文件", fg=typer.colors.RED, err=True)
            raise typer.Exit(code=2)
        if only_subgraphs and subgraphs_dir is None:
            typer.secho("[c2rust-scanner] --only-subgraphs 需要 --subgraphs-dir 来指定输出目录", fg=typer.colors.RED, err=True)
            raise typer.Exit(code=2)

    # Global DOT graph, if requested (always from the curated symbol table).
    if dot is not None:
        try:
            generate_dot_from_db(data_path_curated, dot)
            typer.secho(f"[c2rust-scanner] DOT 文件已写入: {dot}", fg=typer.colors.GREEN)
            if png:
                png_path = _render_dot_to_png(dot)
                typer.secho(f"[c2rust-scanner] PNG 文件已写入: {png_path}", fg=typer.colors.GREEN)
        except Exception as e:
            typer.secho(f"[c2rust-scanner] 写入 DOT/PNG 失败: {e}", fg=typer.colors.RED, err=True)
            raise typer.Exit(code=1)

    # Per-root subgraphs, if requested (also from the curated symbol table).
    if subgraphs_dir is not None:
        try:
            files = export_root_subgraphs_to_dir(data_path_curated, subgraphs_dir)
            if png:
                png_count = 0
                for dp in files:
                    # Fail fast on the first PNG rendering error so issues
                    # surface through the outer error handler.
                    _render_dot_to_png(dp)
                    png_count += 1
                typer.secho(
                    f"[c2rust-scanner] 根节点子图已写入: {len(files)} 个 DOT 文件和 {png_count} 个 PNG 文件 -> {subgraphs_dir}",
                    fg=typer.colors.GREEN,
                )
            else:
                typer.secho(
                    f"[c2rust-scanner] 根节点子图已写入: {len(files)} 个文件 -> {subgraphs_dir}",
                    fg=typer.colors.GREEN,
                )
        except Exception as e:
            typer.secho(f"[c2rust-scanner] 写入子图 DOT/PNG 失败: {e}", fg=typer.colors.RED, err=True)
            raise typer.Exit(code=1)