ghostcode 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,341 @@
1
+ """C++ AST parser using libclang.
2
+
3
+ Strategy: Two-pass approach.
4
+ Pass 1 (AST): Walk the clang AST to identify which symbols are user-defined.
5
+ Collect their names, kinds, and scopes.
6
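+ Pass 2 (Text): Scan the raw source text with a word-boundary regex for each
7
+ known user-defined symbol name. Every match that is not inside a
8
+ string literal, on an #include line, or qualified with std:: is
9
+ recorded with its exact offset in the decoded source text.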
9
+
10
+ This two-pass approach is more robust than trying to get exact offsets from the
11
+ AST alone, because libclang's reference cursors can miss implicit references
12
+ (e.g., member access via implicit 'this->') or report imprecise locations.
13
+ """
14
+
15
+ import os
16
+ import re
17
+ import subprocess
18
+ from clang.cindex import (
19
+ Config,
20
+ CursorKind,
21
+ Index,
22
+ TokenKind,
23
+ TranslationUnit,
24
+ )
25
+
26
+ from .base import BaseParser, Comment, ParseResult, Symbol, SymbolLocation
27
+
28
# Simplified kind string reported for each libclang cursor kind that can
# declare a user-defined symbol. Several cursor kinds intentionally share a
# string (e.g. constructors and destructors are both reported as "method").
KIND_MAP = {
    # Variables and parameters
    CursorKind.VAR_DECL: "variable",
    CursorKind.PARM_DECL: "parameter",
    # Callables
    CursorKind.FUNCTION_DECL: "function",
    CursorKind.CXX_METHOD: "method",
    CursorKind.CONSTRUCTOR: "method",
    CursorKind.DESTRUCTOR: "method",
    # Record types and their members
    CursorKind.CLASS_DECL: "class",
    CursorKind.STRUCT_DECL: "struct",
    CursorKind.FIELD_DECL: "field",
    # Enumerations
    CursorKind.ENUM_DECL: "enum",
    CursorKind.ENUM_CONSTANT_DECL: "enum_constant",
    # Namespaces and type aliases
    CursorKind.NAMESPACE: "namespace",
    CursorKind.TYPEDEF_DECL: "typedef",
    CursorKind.TYPE_ALIAS_DECL: "type_alias",
    # Templates are reported as the entity they produce
    CursorKind.CLASS_TEMPLATE: "class",
    CursorKind.FUNCTION_TEMPLATE: "function",
}

# Cursor kinds that represent user-definable symbols: exactly the kinds that
# have an entry in KIND_MAP.
USER_SYMBOL_KINDS = set(KIND_MAP)
67
+
68
# Absolute path prefixes treated as system locations (common on macOS).
# A file whose absolute path starts with one of these is considered a system
# header rather than user code.
SYSTEM_PATHS = (
    # Conventional Unix locations
    "/usr/include",
    "/usr/lib",
    "/usr/local/include",
    # Apple developer tooling
    "/Library/Developer",
    "/Applications/Xcode.app",
    # Homebrew installs
    "/opt/homebrew",
    "/usr/local/Cellar",
)
78
+
79
# Names that must never be renamed: the C++ keywords and alternative operator
# spellings, the contextual keywords "override"/"final", and a handful of
# common builtin / standard-library names.
CPP_KEYWORDS = set(
    """
    alignas alignof and and_eq asm auto bitand
    bitor bool break case catch char char8_t
    char16_t char32_t class compl concept const
    consteval constexpr constinit const_cast continue
    co_await co_return co_yield decltype default delete
    do double dynamic_cast else enum explicit export
    extern false float for friend goto if inline
    int long mutable namespace new noexcept not
    not_eq nullptr operator or or_eq private protected
    public register reinterpret_cast requires return short
    signed sizeof static static_assert static_cast struct
    switch template this thread_local throw true try
    typedef typeid typename union unsigned using
    virtual void volatile wchar_t while xor xor_eq
    override final
    """.split()
) | set(
    # Common builtins
    """
    main argc argv NULL size_t ptrdiff_t
    int8_t int16_t int32_t int64_t
    uint8_t uint16_t uint32_t uint64_t
    string cout cin cerr endl std
    """.split()
)
102
+
103
+
104
def _find_libclang():
    """Return the path of the first libclang.dylib found on macOS.

    Checks a fixed list of well-known install locations in order of
    preference and returns the first one that exists on disk, or ``None``
    when none of them does.
    """
    candidates = (
        # Homebrew LLVM
        "/opt/homebrew/opt/llvm/lib/libclang.dylib",
        "/usr/local/opt/llvm/lib/libclang.dylib",
        # Apple Command Line Tools
        "/Library/Developer/CommandLineTools/usr/lib/libclang.dylib",
        # Full Xcode install
        "/Applications/Xcode.app/Contents/Developer/Toolchains/"
        "XcodeDefault.xctoolchain/usr/lib/libclang.dylib",
    )
    return next(
        (candidate for candidate in candidates if os.path.exists(candidate)),
        None,
    )
117
+
118
+
119
+ def _get_sdk_path() -> str | None:
120
+ """Get the macOS SDK path for C++ stdlib headers."""
121
+ try:
122
+ result = subprocess.run(
123
+ ["xcrun", "--show-sdk-path"],
124
+ capture_output=True, text=True, timeout=5,
125
+ )
126
+ if result.returncode == 0:
127
+ return result.stdout.strip()
128
+ except (subprocess.TimeoutExpired, FileNotFoundError):
129
+ pass
130
+ return None
131
+
132
+
133
def _get_clang_args() -> list[str]:
    """Build the clang command-line arguments used for parsing.

    The result always selects C++17 (``-std=c++17 -x c++``). When an SDK path
    is available from ``_get_sdk_path()``, it is added as the sysroot together
    with the SDK's libc++ and C include directories.
    """
    language_args = ["-std=c++17", "-x", "c++"]

    sdk = _get_sdk_path()
    if not sdk:
        return language_args

    sdk_args = [
        "-isysroot", sdk,
        f"-I{sdk}/usr/include/c++/v1",
        f"-I{sdk}/usr/include",
    ]
    return language_args + sdk_args
144
+
145
+
146
class CppParser(BaseParser):
    """C++ parser built on libclang.

    Two-pass strategy:
      Pass 1: walk the clang AST to discover the names and kinds of the
              symbols declared in the file being parsed.
      Pass 2: scan the source text with word-boundary regexes to record
              every occurrence of those names.

    Symbols are keyed by their simple (unqualified) name, so same-named
    declarations in different scopes share one ``Symbol``; the first one
    encountered in the AST walk supplies its kind and scope.

    NOTE(review): symbol offsets are indices into the decoded source string,
    whereas comment offsets are taken from libclang token extents (presumably
    byte offsets into the file). The two can disagree for sources containing
    non-ASCII characters or CRLF line endings -- confirm against consumers.
    """

    # Matches the start of an include directive, allowing whitespace before
    # and after the '#' (both "#include" and "#  include" are valid C/C++).
    _INCLUDE_RE = re.compile(r"\s*#[ \t]*include")

    def __init__(self):
        """Create the libclang index, configuring the library path if needed."""
        libclang_path = _find_libclang()
        # Only set the library file while libclang has not been loaded yet.
        if libclang_path and not Config.loaded:
            Config.set_library_file(libclang_path)
        self._index = Index.create()
        # Absolute path of the file currently being parsed (set by parse()).
        self._source_file: str = ""

    def parse(self, file_path: str) -> ParseResult:
        """Parse a C++ source file.

        Args:
            file_path: Path of the source file to parse.

        Returns:
            A ``ParseResult`` holding the discovered symbols (with all of
            their occurrences), the comments, the source text, the path as
            given, and the (currently always empty) warnings list.
        """
        self._source_file = os.path.abspath(file_path)

        # Read with an explicit encoding so the result does not depend on the
        # platform's locale default.
        with open(file_path, encoding="utf-8") as f:
            source_code = f.read()

        tu = self._index.parse(
            file_path,
            args=_get_clang_args(),
            options=TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD,
        )

        # Pass 1: walk the AST to discover user-defined symbols.
        user_symbols: dict[str, Symbol] = {}
        warnings: list[dict] = []
        self._discover_symbols(tu.cursor, user_symbols, warnings)

        # Pass 2: scan the source for all occurrences of those symbols.
        self._find_all_occurrences(source_code, user_symbols)

        comments = self._extract_comments(tu, source_code)

        return ParseResult(
            symbols=list(user_symbols.values()),
            comments=comments,
            source_code=source_code,
            file_path=file_path,
            warnings=warnings,
        )

    def _discover_symbols(self, cursor, symbols: dict, warnings: list):
        """Pass 1: recursively collect user-defined symbol names.

        Args:
            cursor: libclang cursor to inspect (its children are visited too).
            symbols: Mapping of simple name -> ``Symbol``; updated in place.
                An existing entry for a name is never replaced.
            warnings: Passed through to recursive calls; not written to here.
        """
        location_file = cursor.location.file
        if location_file:
            filepath = str(location_file)
            # Prune whole subtrees that live in system headers or in any file
            # other than the one being parsed.
            if self._is_system_header(filepath):
                return
            if os.path.abspath(filepath) != self._source_file:
                return

        if self._is_renamable_declaration(cursor):
            name = cursor.spelling
            if name not in symbols:
                symbols[name] = Symbol(
                    name=name,
                    kind=KIND_MAP.get(cursor.kind, "variable"),
                    scope=self._get_scope(cursor),
                )

        for child in cursor.get_children():
            self._discover_symbols(child, symbols, warnings)

    def _is_renamable_declaration(self, cursor) -> bool:
        """Return True if ``cursor`` declares a symbol that should be tracked."""
        if cursor.kind not in USER_SYMBOL_KINDS:
            return False
        # Constructors and destructors get no symbol of their own: the class
        # name symbol already covers them.
        if cursor.kind in (CursorKind.CONSTRUCTOR, CursorKind.DESTRUCTOR):
            return False
        name = cursor.spelling
        if not name or name in CPP_KEYWORDS or self._is_operator_name(name):
            return False
        return self._is_in_source_file(cursor)

    @staticmethod
    def _is_operator_name(name: str) -> bool:
        """Return True for operator-function spellings such as ``operator+``.

        An ordinary identifier that merely begins with the letters
        "operator" (e.g. ``operatorCount``) is not an operator name.
        """
        prefix = "operator"
        if not name.startswith(prefix):
            return False
        rest = name[len(prefix):]
        # An identifier character right after "operator" means this is just a
        # longer identifier, not an operator function.
        return not rest or not (rest[0].isalnum() or rest[0] == "_")

    def _find_all_occurrences(self, source_code: str, symbols: dict):
        """Pass 2: record every occurrence of each user symbol in the source.

        Each symbol name is searched with a word-boundary regex. Matches
        inside a string literal, on an ``#include`` line, or directly
        preceded by ``std::`` are skipped. Every remaining match is appended
        to ``symbol.locations`` with a 1-based line and column and its
        start/end offsets in ``source_code``.
        """
        from bisect import bisect_right

        # Offset of the first character of every line, computed once so that
        # line/column lookup per match is a binary search instead of a
        # rescan of the whole prefix.
        line_starts = [0]
        line_starts.extend(m.end() for m in re.finditer("\n", source_code))

        for name, symbol in symbols.items():
            pattern = re.compile(r"\b" + re.escape(name) + r"\b")
            for match in pattern.finditer(source_code):
                offset = match.start()

                if self._is_inside_string(source_code, offset):
                    continue
                if self._is_inside_include(source_code, offset):
                    continue
                if self._is_std_qualified(source_code, offset):
                    continue

                # Number of line starts at or before offset == 1-based line.
                line = bisect_right(line_starts, offset)
                symbol.locations.append(SymbolLocation(
                    file=self._source_file,
                    line=line,
                    col=offset - line_starts[line - 1] + 1,
                    offset=offset,
                    end_offset=match.end(),
                ))

    @staticmethod
    def _line_containing(source: str, offset: int) -> tuple[int, str]:
        """Return ``(line_start_offset, line_text)`` for the line at ``offset``.

        The returned text excludes the terminating newline.
        """
        start = source.rfind("\n", 0, offset) + 1
        end = source.find("\n", offset)
        if end == -1:
            end = len(source)
        return start, source[start:end]

    def _is_inside_string(self, source: str, offset: int) -> bool:
        """Check if an offset is inside a string or character literal.

        Only the text of the current line before ``offset`` is examined:
        quotes are tracked from the start of the line, honouring backslash
        escapes inside a literal.

        NOTE(review): because the scan is line-local and treats every quote
        as a literal delimiter, literals spanning lines, raw strings, digit
        separators (``1'000``) and apostrophes in comments are not handled.
        """
        line_start, line = self._line_containing(source, offset)
        limit = offset - line_start

        open_quote = None  # delimiter of the literal we are inside, if any
        i = 0
        while i < limit:
            ch = line[i]
            if open_quote is None:
                if ch in ('"', "'"):
                    open_quote = ch
            elif ch == "\\":
                i += 1  # skip the escaped character
            elif ch == open_quote:
                open_quote = None
            i += 1

        return open_quote is not None

    def _is_inside_include(self, source: str, offset: int) -> bool:
        """Check if an offset is on an ``#include`` line.

        Whitespace before and after the ``#`` is accepted.
        """
        _, line = self._line_containing(source, offset)
        return self._INCLUDE_RE.match(line) is not None

    def _is_std_qualified(self, source: str, offset: int) -> bool:
        """Check if the identifier at ``offset`` is directly preceded by ``std::``.

        ``std`` must be a whole identifier: ``mystd::x`` is not treated as
        std-qualified.
        """
        qualifier = "std::"
        if not source.endswith(qualifier, 0, offset):
            return False
        before = offset - len(qualifier) - 1
        if before < 0:
            return True
        prev = source[before]
        return not (prev.isalnum() or prev == "_")

    def _get_scope(self, cursor) -> str:
        """Get the qualified scope of a cursor.

        Walks the semantic parents up to (not including) the translation
        unit and joins their spellings, outermost first, with ``::``.
        Parents with an empty spelling are skipped.
        """
        parts = []
        parent = cursor.semantic_parent
        while parent and parent.kind != CursorKind.TRANSLATION_UNIT:
            if parent.spelling:
                parts.append(parent.spelling)
            parent = parent.semantic_parent
        return "::".join(reversed(parts))

    def _is_in_source_file(self, cursor) -> bool:
        """Check if the cursor is located in the file being parsed."""
        location_file = cursor.location.file
        if not location_file:
            return False
        return os.path.abspath(str(location_file)) == self._source_file

    def is_user_defined(self, name: str, **kwargs) -> bool:
        """Return whether ``name`` is considered a user-defined symbol.

        Args:
            name: Identifier to classify.
            **kwargs: Optional ``cursor``; when given and located in a system
                header, the name is not user-defined.

        Returns:
            False for names in ``CPP_KEYWORDS`` or whose cursor is in a
            system header; True otherwise.
        """
        if name in CPP_KEYWORDS:
            return False
        cursor = kwargs.get("cursor")
        if cursor and cursor.location.file:
            if self._is_system_header(str(cursor.location.file)):
                return False
        return True

    def _is_system_header(self, filepath: str) -> bool:
        """Return True if the absolute form of ``filepath`` starts with a system prefix.

        NOTE(review): this is a plain string-prefix test against
        ``SYSTEM_PATHS``, so e.g. ``/usr/lib64/...`` also matches ``/usr/lib``.
        """
        abspath = os.path.abspath(filepath)
        return any(abspath.startswith(sp) for sp in SYSTEM_PATHS)

    def _extract_comments(self, tu, source_code: str) -> list[Comment]:
        """Extract all comments using libclang tokenization.

        Args:
            tu: Parsed translation unit whose tokens are scanned.
            source_code: Unused; kept for interface compatibility.

        Returns:
            One ``Comment`` per comment token, carrying the token extent's
            start/end offsets and the token's line number.
        """
        comments = []
        for token in tu.cursor.get_tokens():
            if token.kind != TokenKind.COMMENT:
                continue
            extent = token.extent
            comments.append(Comment(
                offset=extent.start.offset,
                end_offset=extent.end.offset,
                line=token.location.line,
            ))
        return comments