ghostcode 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ghostcode/__init__.py +3 -0
- ghostcode/audit/__init__.py +0 -0
- ghostcode/audit/logger.py +149 -0
- ghostcode/cli.py +986 -0
- ghostcode/config.py +187 -0
- ghostcode/mapping/__init__.py +0 -0
- ghostcode/mapping/encryption.py +143 -0
- ghostcode/mapping/ghost_map.py +222 -0
- ghostcode/mapping/token_generator.py +78 -0
- ghostcode/parsers/__init__.py +0 -0
- ghostcode/parsers/base.py +66 -0
- ghostcode/parsers/cpp_parser.py +341 -0
- ghostcode/parsers/python_parser.py +397 -0
- ghostcode/reveal/__init__.py +0 -0
- ghostcode/reveal/code_revealer.py +374 -0
- ghostcode/reveal/diff_analyzer.py +426 -0
- ghostcode/reveal/explanation_translator.py +214 -0
- ghostcode/risk_report.py +467 -0
- ghostcode/transformers/__init__.py +0 -0
- ghostcode/transformers/comment_anonymizer.py +95 -0
- ghostcode/transformers/comment_stripper.py +60 -0
- ghostcode/transformers/isolator.py +312 -0
- ghostcode/transformers/literal_scrubber.py +452 -0
- ghostcode/transformers/multi_file.py +99 -0
- ghostcode/transformers/symbol_renamer.py +64 -0
- ghostcode/utils/__init__.py +0 -0
- ghostcode/utils/clipboard.py +52 -0
- ghostcode/utils/stdlib_registry.py +221 -0
- ghostcode-0.5.0.dist-info/METADATA +92 -0
- ghostcode-0.5.0.dist-info/RECORD +33 -0
- ghostcode-0.5.0.dist-info/WHEEL +5 -0
- ghostcode-0.5.0.dist-info/entry_points.txt +2 -0
- ghostcode-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
"""C++ AST parser using libclang.
|
|
2
|
+
|
|
3
|
+
Strategy: Two-pass approach.
|
|
4
|
+
Pass 1 (AST): Walk the clang AST to identify which symbols are user-defined.
|
|
5
|
+
Collect their names, kinds, and scopes.
|
|
6
|
+
Pass 2 (Token): Scan ALL tokens in the source file. For each identifier token,
|
|
7
|
+
check if it matches a known user-defined symbol. If yes, record
|
|
8
|
+
its exact byte offset.
|
|
9
|
+
|
|
10
|
+
This two-pass approach is more robust than trying to get exact offsets from the
|
|
11
|
+
AST alone, because libclang's reference cursors can miss implicit references
|
|
12
|
+
(e.g., member access via implicit 'this->') or report imprecise locations.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import subprocess
|
|
18
|
+
from clang.cindex import (
|
|
19
|
+
Config,
|
|
20
|
+
CursorKind,
|
|
21
|
+
Index,
|
|
22
|
+
TokenKind,
|
|
23
|
+
TranslationUnit,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
from .base import BaseParser, Comment, ParseResult, Symbol, SymbolLocation
|
|
27
|
+
|
|
28
|
+
# Map from CursorKind to our simplified kind string. Every cursor kind that
# can introduce a user-definable name has an entry here.
KIND_MAP = {
    # Values
    CursorKind.VAR_DECL: "variable",
    CursorKind.PARM_DECL: "parameter",
    # Callables (constructors and destructors are labelled as methods)
    CursorKind.FUNCTION_DECL: "function",
    CursorKind.CXX_METHOD: "method",
    CursorKind.CONSTRUCTOR: "method",
    CursorKind.DESTRUCTOR: "method",
    # Aggregates and their members
    CursorKind.CLASS_DECL: "class",
    CursorKind.STRUCT_DECL: "struct",
    CursorKind.FIELD_DECL: "field",
    CursorKind.ENUM_DECL: "enum",
    CursorKind.ENUM_CONSTANT_DECL: "enum_constant",
    # Scopes and aliases
    CursorKind.NAMESPACE: "namespace",
    CursorKind.TYPEDEF_DECL: "typedef",
    CursorKind.TYPE_ALIAS_DECL: "type_alias",
    # Templates reuse the label of the entity they produce
    CursorKind.CLASS_TEMPLATE: "class",
    CursorKind.FUNCTION_TEMPLATE: "function",
}

# Cursor kinds that represent user-definable symbols: exactly the kinds
# that KIND_MAP has a label for.
USER_SYMBOL_KINDS = set(KIND_MAP)
|
|
67
|
+
|
|
68
|
+
# Path prefixes treated as system (non-user) code on macOS. A header whose
# absolute path starts with one of these is never scanned for user symbols.
SYSTEM_PATHS = (
    # Unix-style system directories
    "/usr/include", "/usr/lib", "/usr/local/include",
    # Apple developer tools
    "/Library/Developer", "/Applications/Xcode.app",
    # Package-manager installs
    "/opt/homebrew", "/usr/local/Cellar",
)
|
|
78
|
+
|
|
79
|
+
# Names that must never be renamed: C++ keywords plus a handful of common
# builtin / standard-library identifiers. Kept as whitespace-separated words
# so the list is easy to scan and extend.
CPP_KEYWORDS = set(
    (
        # Reserved words, alternative operator spellings, contextual keywords
        "alignas alignof and and_eq asm auto bitand "
        "bitor bool break case catch char char8_t "
        "char16_t char32_t class compl concept const "
        "consteval constexpr constinit const_cast continue "
        "co_await co_return co_yield decltype default delete "
        "do double dynamic_cast else enum explicit export "
        "extern false float for friend goto if inline "
        "int long mutable namespace new noexcept not "
        "not_eq nullptr operator or or_eq private protected "
        "public register reinterpret_cast requires return short "
        "signed sizeof static static_assert static_cast struct "
        "switch template this thread_local throw true try "
        "typedef typeid typename union unsigned using "
        "virtual void volatile wchar_t while xor xor_eq "
        "override final "
        # Common builtins
        "main argc argv NULL size_t ptrdiff_t "
        "int8_t int16_t int32_t int64_t "
        "uint8_t uint16_t uint32_t uint64_t "
        "string cout cin cerr endl std"
    ).split()
)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _find_libclang():
|
|
105
|
+
"""Attempt to locate libclang on macOS."""
|
|
106
|
+
common_paths = [
|
|
107
|
+
"/opt/homebrew/opt/llvm/lib/libclang.dylib",
|
|
108
|
+
"/usr/local/opt/llvm/lib/libclang.dylib",
|
|
109
|
+
"/Library/Developer/CommandLineTools/usr/lib/libclang.dylib",
|
|
110
|
+
"/Applications/Xcode.app/Contents/Developer/Toolchains/"
|
|
111
|
+
"XcodeDefault.xctoolchain/usr/lib/libclang.dylib",
|
|
112
|
+
]
|
|
113
|
+
for path in common_paths:
|
|
114
|
+
if os.path.exists(path):
|
|
115
|
+
return path
|
|
116
|
+
return None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _get_sdk_path() -> str | None:
|
|
120
|
+
"""Get the macOS SDK path for C++ stdlib headers."""
|
|
121
|
+
try:
|
|
122
|
+
result = subprocess.run(
|
|
123
|
+
["xcrun", "--show-sdk-path"],
|
|
124
|
+
capture_output=True, text=True, timeout=5,
|
|
125
|
+
)
|
|
126
|
+
if result.returncode == 0:
|
|
127
|
+
return result.stdout.strip()
|
|
128
|
+
except (subprocess.TimeoutExpired, FileNotFoundError):
|
|
129
|
+
pass
|
|
130
|
+
return None
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _get_clang_args() -> list[str]:
    """Build clang args including proper SDK include paths.

    Returns:
        The base flags ``-std=c++17 -x c++``; when an SDK path is available,
        ``-isysroot`` and two ``-I`` include directories inside that SDK are
        appended.
    """
    args = ["-std=c++17", "-x", "c++"]
    sdk = _get_sdk_path()
    if sdk:
        args.extend([
            "-isysroot", sdk,
            f"-I{sdk}/usr/include/c++/v1",
            f"-I{sdk}/usr/include",
        ])
    return args
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class CppParser(BaseParser):
|
|
147
|
+
"""C++ AST parser using libclang.
|
|
148
|
+
|
|
149
|
+
Two-pass strategy:
|
|
150
|
+
Pass 1: AST walk to discover user-defined symbol names and kinds.
|
|
151
|
+
Pass 2: Token scan to find every occurrence with exact byte offsets.
|
|
152
|
+
"""
|
|
153
|
+
|
|
154
|
+
def __init__(self):
    """Configure the clang bindings and create the parsing index.

    A library file is only set when one was located and the bindings have
    not already loaded a library.
    """
    if not Config.loaded:
        located = _find_libclang()
        if located:
            Config.set_library_file(located)
    self._index = Index.create()
    # Absolute path of the file currently being parsed; set by parse().
    self._source_file: str = ""
|
|
160
|
+
|
|
161
|
+
def parse(self, file_path: str) -> ParseResult:
    """Parse one C++ file and collect its user symbols and comments.

    Args:
        file_path: Path of the C++ source file to analyse.

    Returns:
        A ParseResult holding the discovered symbols (with every occurrence
        found in the text), the comments, the source text, the original
        ``file_path`` and the warnings list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    self._source_file = os.path.abspath(file_path)

    # Decode explicitly as UTF-8 instead of relying on the platform locale,
    # so the same file yields the same text (and offsets) everywhere.
    # NOTE(review): the regex pass yields character offsets while comment
    # offsets come from libclang (presumably byte offsets); the two only
    # coincide for ASCII-only sources -- confirm what consumers expect.
    with open(file_path, encoding="utf-8") as f:
        source_code = f.read()

    tu = self._index.parse(
        file_path,
        args=_get_clang_args(),
        options=TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD,
    )

    # Pass 1: Walk AST to discover user-defined symbols
    user_symbols: dict[str, Symbol] = {}
    warnings: list[dict] = []
    self._discover_symbols(tu.cursor, user_symbols, warnings)

    # Pass 2: Scan source for all occurrences of discovered symbols
    self._find_all_occurrences(source_code, user_symbols)

    # Extract comments
    comments = self._extract_comments(tu, source_code)

    return ParseResult(
        symbols=list(user_symbols.values()),
        comments=comments,
        source_code=source_code,
        file_path=file_path,
        warnings=warnings,
    )
|
|
191
|
+
|
|
192
|
+
def _discover_symbols(self, cursor, symbols: dict, warnings: list):
    """Pass 1: recursively collect user-defined symbol names from the AST.

    Subtrees rooted in a system header or in any file other than the one
    being parsed are skipped entirely. Each accepted name is stored once in
    ``symbols`` (first declaration wins), keyed by its simple name so the
    token pass can match on it. ``warnings`` is only passed down to the
    recursive calls.
    """
    origin = cursor.location.file
    if origin:
        origin_path = str(origin)
        if self._is_system_header(origin_path):
            return
        if os.path.abspath(origin_path) != self._source_file:
            return

    name = cursor.spelling if cursor.kind in USER_SYMBOL_KINDS else None
    recordable = (
        bool(name)
        and not name.startswith("operator")
        and name not in CPP_KEYWORDS
        # Constructors/destructors share the class name, which is already
        # covered by the class symbol itself.
        and cursor.kind not in (CursorKind.CONSTRUCTOR, CursorKind.DESTRUCTOR)
        and self._is_in_source_file(cursor)
    )
    if recordable and name not in symbols:
        symbols[name] = Symbol(
            name=name,
            kind=KIND_MAP.get(cursor.kind, "variable"),
            scope=self._get_scope(cursor),
        )

    for child in cursor.get_children():
        self._discover_symbols(child, symbols, warnings)
|
|
222
|
+
|
|
223
|
+
def _find_all_occurrences(self, source_code: str, symbols: dict):
    """Pass 2: Find every occurrence of each user symbol in source code.

    Uses word-boundary regex to find exact positions. This catches ALL
    references including implicit this->, initializer lists, and any
    other context the AST walk might miss.

    Matches inside a string literal, on an #include line, or directly
    qualified by ``std::`` are ignored. Every remaining match is appended
    to the symbol's ``locations`` with 1-based line and column numbers and
    offsets expressed as indices into ``source_code``.

    Args:
        source_code: Full text of the file being parsed.
        symbols: Mapping of simple name to Symbol; updated in place.
    """
    import bisect  # only this method needs it

    # Index of the first character of every line, computed once so each
    # match is located by binary search instead of re-counting newlines
    # from the start of the file.
    line_starts = [0]
    line_starts.extend(
        index + 1 for index, ch in enumerate(source_code) if ch == "\n"
    )

    for name, symbol in symbols.items():
        pattern = re.compile(r"\b" + re.escape(name) + r"\b")
        for match in pattern.finditer(source_code):
            offset = match.start()

            # Skip if this is inside a string literal or #include
            if self._is_inside_string(source_code, offset):
                continue
            if self._is_inside_include(source_code, offset):
                continue
            # Skip if preceded by :: from std namespace (e.g., std::vector)
            if self._is_std_qualified(source_code, offset):
                continue

            # Number of line starts at or before the offset == 1-based line.
            line = bisect.bisect_right(line_starts, offset)
            symbol.locations.append(SymbolLocation(
                file=self._source_file,
                line=line,
                col=offset - line_starts[line - 1] + 1,
                offset=offset,
                end_offset=match.end(),
            ))
|
|
255
|
+
|
|
256
|
+
def _is_inside_string(self, source: str, offset: int) -> bool:
|
|
257
|
+
"""Check if an offset is inside a string literal."""
|
|
258
|
+
# Find the line containing this offset
|
|
259
|
+
line_start = source.rfind("\n", 0, offset) + 1
|
|
260
|
+
line_end = source.find("\n", offset)
|
|
261
|
+
if line_end == -1:
|
|
262
|
+
line_end = len(source)
|
|
263
|
+
line = source[line_start:line_end]
|
|
264
|
+
pos_in_line = offset - line_start
|
|
265
|
+
|
|
266
|
+
# Count unescaped quotes before this position
|
|
267
|
+
in_string = False
|
|
268
|
+
quote_char = None
|
|
269
|
+
i = 0
|
|
270
|
+
while i < pos_in_line:
|
|
271
|
+
ch = line[i]
|
|
272
|
+
if not in_string:
|
|
273
|
+
if ch in ('"', "'"):
|
|
274
|
+
in_string = True
|
|
275
|
+
quote_char = ch
|
|
276
|
+
else:
|
|
277
|
+
if ch == "\\" :
|
|
278
|
+
i += 1 # skip escaped char
|
|
279
|
+
elif ch == quote_char:
|
|
280
|
+
in_string = False
|
|
281
|
+
i += 1
|
|
282
|
+
|
|
283
|
+
return in_string
|
|
284
|
+
|
|
285
|
+
def _is_inside_include(self, source: str, offset: int) -> bool:
|
|
286
|
+
"""Check if an offset is on a #include line."""
|
|
287
|
+
line_start = source.rfind("\n", 0, offset) + 1
|
|
288
|
+
line_end = source.find("\n", offset)
|
|
289
|
+
if line_end == -1:
|
|
290
|
+
line_end = len(source)
|
|
291
|
+
line = source[line_start:line_end].strip()
|
|
292
|
+
return line.startswith("#include")
|
|
293
|
+
|
|
294
|
+
def _is_std_qualified(self, source: str, offset: int) -> bool:
|
|
295
|
+
"""Check if the identifier is preceded by 'std::'."""
|
|
296
|
+
# Look for 'std::' immediately before the identifier
|
|
297
|
+
prefix_start = max(0, offset - 5)
|
|
298
|
+
prefix = source[prefix_start:offset]
|
|
299
|
+
return prefix.endswith("std::")
|
|
300
|
+
|
|
301
|
+
def _get_scope(self, cursor) -> str:
    """Return the '::'-joined names of the cursor's enclosing scopes.

    Semantic parents are followed upwards until the translation unit (or a
    missing parent) is reached; parents with an empty spelling are left out.
    The outermost scope comes first.
    """
    chain = []
    node = cursor.semantic_parent
    while node and node.kind != CursorKind.TRANSLATION_UNIT:
        label = node.spelling
        if label:
            # Walking inner-to-outer, so prepend to end up outermost-first.
            chain.insert(0, label)
        node = node.semantic_parent
    return "::".join(chain)
|
|
310
|
+
|
|
311
|
+
def _is_in_source_file(self, cursor) -> bool:
|
|
312
|
+
"""Check if cursor is in the file being parsed."""
|
|
313
|
+
if not cursor.location.file:
|
|
314
|
+
return False
|
|
315
|
+
return os.path.abspath(str(cursor.location.file)) == self._source_file
|
|
316
|
+
|
|
317
|
+
def is_user_defined(self, name: str, **kwargs) -> bool:
    """Decide whether ``name`` should be treated as a user-defined symbol.

    Protected names (``CPP_KEYWORDS``) are never user-defined. When a
    ``cursor`` keyword argument with a known file is supplied, symbols
    located in a system header are rejected as well. Everything else is
    accepted.
    """
    if name in CPP_KEYWORDS:
        return False

    cursor = kwargs.get("cursor")
    origin = cursor.location.file if cursor else None
    return not (origin and self._is_system_header(str(origin)))
|
|
325
|
+
|
|
326
|
+
def _is_system_header(self, filepath: str) -> bool:
    """Return True when ``filepath`` lies under one of ``SYSTEM_PATHS``.

    The comparison is done on whole path components, so a sibling directory
    that merely shares a textual prefix with a system root (for example
    ``/usr/library_x`` versus ``/usr/lib``) is not mistaken for system code.

    Args:
        filepath: Path of the file to classify; made absolute before the
            comparison.
    """
    abspath = os.path.abspath(filepath)
    return any(
        abspath == root or abspath.startswith(root + os.sep)
        for root in SYSTEM_PATHS
    )
|
|
329
|
+
|
|
330
|
+
def _extract_comments(self, tu, source_code: str) -> list[Comment]:
    """Collect every comment token of the translation unit.

    Each comment is reported with the start/end offsets of its extent and
    the line it begins on, in token order. ``source_code`` is accepted for
    signature compatibility but not consulted.
    """
    comment_tokens = (
        tok for tok in tu.cursor.get_tokens()
        if tok.kind == TokenKind.COMMENT
    )
    return [
        Comment(
            offset=tok.extent.start.offset,
            end_offset=tok.extent.end.offset,
            line=tok.location.line,
        )
        for tok in comment_tokens
    ]
|