ghostcode 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,397 @@
1
+ """Python AST parser using the built-in ast module.
2
+
3
+ Walks the Python AST to identify user-defined symbols (functions, classes,
4
+ variables, parameters, attributes) and distinguishes them from stdlib/builtins
5
+ and framework symbols.
6
+
7
+ Uses the same two-pass strategy as the C++ parser:
8
+ Pass 1: AST walk to discover user-defined symbol names.
9
+ Pass 2: Regex scan to find every occurrence with exact character offsets.
10
+
11
+ Special handling:
12
+ - Keeps dunder methods (__init__, __str__, etc.)
13
+ - Keeps decorator names
14
+ - Tracks imports to avoid renaming imported modules
15
+ - String correlation: flags string literals matching symbol names
16
+ """
17
+
18
+ import ast
19
+ import builtins
20
+ import keyword
21
+ import os
22
+ import re
23
+
24
+ from .base import BaseParser, Comment, ParseResult, Symbol, SymbolLocation
25
+
26
# Names that must never be renamed: everything exposed by the builtins module,
# every language keyword, plus the explicitly listed names below (many of which
# are not attributes of the builtins module, e.g. "self" or "__init__").
PYTHON_BUILTINS = set().union(
    dir(builtins),
    keyword.kwlist,
    # Conventional receiver names and core type helpers
    ("self", "cls", "super", "type", "object"),
    # Everyday builtin functions
    (
        "print", "len", "range", "enumerate", "zip", "map", "filter",
        "isinstance", "issubclass", "hasattr", "getattr", "setattr", "delattr",
        "property", "staticmethod", "classmethod",
        "open", "input",
    ),
    # Builtin types
    (
        "str", "int", "float", "bool", "list", "dict",
        "set", "tuple", "bytes", "bytearray", "memoryview",
    ),
    # Singleton constants
    ("None", "True", "False", "Ellipsis", "NotImplemented"),
    # Module-level special names
    ("__name__", "__main__", "__file__", "__doc__", "__all__"),
    # Object lifecycle, representation, comparison and hashing
    (
        "__init__", "__del__", "__repr__", "__str__", "__bytes__",
        "__format__", "__lt__", "__le__", "__eq__", "__ne__", "__gt__",
        "__ge__", "__hash__", "__bool__",
    ),
    # Attribute access, descriptors and class machinery
    (
        "__getattr__", "__getattribute__",
        "__setattr__", "__delattr__", "__dir__", "__get__", "__set__",
        "__delete__", "__init_subclass__", "__set_name__", "__slots__",
        "__dict__", "__weakref__", "__class__", "__bases__", "__mro__",
        "__subclasses__",
    ),
    # Callable, container and iteration protocols
    (
        "__call__", "__len__", "__length_hint__",
        "__getitem__", "__setitem__", "__delitem__", "__missing__",
        "__iter__", "__next__", "__reversed__", "__contains__",
    ),
    # Arithmetic and bitwise operators
    (
        "__add__", "__radd__", "__iadd__", "__sub__", "__rsub__",
        "__mul__", "__rmul__", "__imul__", "__truediv__", "__floordiv__",
        "__mod__", "__pow__", "__and__", "__or__", "__xor__",
        "__lshift__", "__rshift__", "__neg__", "__pos__", "__abs__",
        "__invert__",
    ),
    # Context-manager and async protocols
    (
        "__enter__", "__exit__", "__await__", "__aiter__",
        "__anext__", "__aenter__", "__aexit__",
    ),
)
52
+
53
# Well-known framework names (base classes, fields, helpers, hooks) that are
# excluded from renaming. Grouped by the framework they come from; a name that
# appears in more than one group is simply stored once in the resulting set.
FRAMEWORK_SYMBOLS = set().union(
    # Django
    (
        "models", "Model", "Form", "View", "CharField", "IntegerField",
        "FloatField", "BooleanField", "DateTimeField", "ForeignKey",
        "ManyToManyField", "OneToOneField", "Manager", "QuerySet",
    ),
    # Flask
    (
        "Flask", "Blueprint", "request", "Response", "jsonify",
        "render_template", "redirect", "url_for", "abort",
    ),
    # FastAPI
    ("FastAPI", "APIRouter", "Depends", "HTTPException", "Body", "Query"),
    # SQLAlchemy
    (
        "Column", "String", "Integer", "Float", "Boolean", "DateTime",
        "ForeignKey", "relationship", "backref", "Base",
    ),
    # Pydantic
    ("BaseModel", "Field", "validator"),
    # PyTorch
    (
        "Module", "Tensor", "nn", "optim", "DataLoader", "Dataset",
        "forward", "backward",
    ),
    # Common test frameworks
    # NOTE(review): "test_" is only ever compared by exact set membership in
    # this module, so it protects a symbol literally named "test_" and nothing
    # else - presumably a prefix match was intended; confirm.
    ("TestCase", "setUp", "tearDown", "test_"),
)
75
+
76
+
77
+ class PythonParser(BaseParser):
78
+ """Python AST parser.
79
+
80
+ Two-pass strategy:
81
+ Pass 1: ast.NodeVisitor to discover user-defined symbol names.
82
+ Pass 2: regex token scan to find all occurrences with byte offsets.
83
+ """
84
+
85
+ def __init__(self):
86
+ self._source_file: str = ""
87
+ self._imports: set[str] = set()
88
+
89
def parse(self, file_path: str) -> ParseResult:
    """Parse one Python file and report its renameable symbols and comments.

    Args:
        file_path: Path of the Python source file to analyse.

    Returns:
        A ParseResult holding the discovered symbols (each with its recorded
        occurrences), the extracted comments, the decoded source text, the
        path exactly as it was passed in, and the warnings list.

    Raises:
        OSError: If the file cannot be opened.
        SyntaxError: If the source is not valid Python (raised by ast.parse,
            or by tokenize.open for an invalid encoding declaration).
    """
    # Local import: the module's import block does not bring in tokenize.
    import tokenize

    self._source_file = os.path.abspath(file_path)

    # tokenize.open decodes the file the way the interpreter would (BOM or
    # PEP 263 coding cookie, UTF-8 otherwise) instead of relying on the
    # platform's locale encoding, which mis-decodes or rejects non-ASCII
    # source on some systems.
    with tokenize.open(file_path) as source_file:
        source_code = source_file.read()

    tree = ast.parse(source_code, filename=file_path)

    # Pass 1: discover symbol names. Imports are collected first because
    # is_user_defined() consults self._imports while symbols are discovered.
    user_symbols: dict[str, Symbol] = {}
    warnings: list[dict] = []
    self._imports = set()

    self._collect_imports(tree)
    self._discover_symbols(tree, user_symbols, warnings)

    # Pass 2: record every textual occurrence of each discovered symbol.
    self._find_all_occurrences(source_code, user_symbols)

    comments = self._extract_comments(source_code)

    return ParseResult(
        symbols=list(user_symbols.values()),
        comments=comments,
        source_code=source_code,
        file_path=file_path,
        warnings=warnings,
    )
118
+
119
+ def _collect_imports(self, tree: ast.AST):
120
+ """Collect all imported names so we don't rename them."""
121
+ for node in ast.walk(tree):
122
+ if isinstance(node, ast.Import):
123
+ for alias in node.names:
124
+ name = alias.asname if alias.asname else alias.name
125
+ self._imports.add(name)
126
+ # Also add top-level module
127
+ self._imports.add(alias.name.split(".")[0])
128
+ elif isinstance(node, ast.ImportFrom):
129
+ if node.module:
130
+ self._imports.add(node.module.split(".")[0])
131
+ for alias in node.names:
132
+ name = alias.asname if alias.asname else alias.name
133
+ self._imports.add(name)
134
+
135
def _discover_symbols(self, tree: ast.AST, symbols: dict,
                      warnings: list, scope: str = ""):
    """Register user-defined names found among the direct children of *tree*.

    Only immediate child nodes are inspected; function and class bodies are
    reached by recursing with an extended ``outer::inner`` scope string. A
    name is registered the first time it is seen and never overwritten.

    Args:
        tree: Node whose direct children are examined.
        symbols: Mapping of name -> Symbol, updated in place.
        warnings: Passed through to recursive calls; nothing here appends to it.
        scope: Qualified scope of *tree* ("" at module level).
    """
    for child in ast.iter_child_nodes(tree):
        if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
            func_name = child.name
            inner_scope = f"{scope}::{func_name}" if scope else func_name

            # Any function defined inside a non-empty scope is labelled "method".
            if self.is_user_defined(func_name) and func_name not in symbols:
                symbols[func_name] = Symbol(
                    name=func_name,
                    kind="method" if scope else "function",
                    scope=scope,
                )

            # Gather every parameter name: regular, positional-only and
            # keyword-only first, then *args and **kwargs when present.
            signature = child.args
            param_names = [
                param.arg
                for param in (signature.args + signature.posonlyargs
                              + signature.kwonlyargs)
            ]
            for variadic in (signature.vararg, signature.kwarg):
                if variadic:
                    param_names.append(variadic.arg)

            for param_name in param_names:
                if self.is_user_defined(param_name) and param_name not in symbols:
                    symbols[param_name] = Symbol(
                        name=param_name, kind="parameter", scope=inner_scope,
                    )

            # Nested definitions first, then every assignment in the body.
            self._discover_symbols(child, symbols, warnings, inner_scope)
            self._collect_assignments(child, symbols, warnings, inner_scope)

        elif isinstance(child, ast.ClassDef):
            class_name = child.name
            if self.is_user_defined(class_name) and class_name not in symbols:
                symbols[class_name] = Symbol(
                    name=class_name, kind="class", scope=scope,
                )

            inner_scope = f"{scope}::{class_name}" if scope else class_name
            self._discover_symbols(child, symbols, warnings, inner_scope)

        elif isinstance(child, ast.AnnAssign) and child.target:
            # Annotated assignment such as "model_name: str"; only plain-name
            # targets are registered, as kind "field".
            target = child.target
            if isinstance(target, ast.Name):
                field_name = target.id
                if self.is_user_defined(field_name) and field_name not in symbols:
                    symbols[field_name] = Symbol(
                        name=field_name, kind="field", scope=scope,
                    )

        elif isinstance(child, ast.Assign):
            # Plain assignment directly in this scope.
            self._process_assignment(child, symbols, scope)
200
+
201
def _collect_assignments(self, func_node: ast.AST, symbols: dict,
                         warnings: list, scope: str):
    """Register names assigned anywhere underneath *func_node*.

    The whole subtree is walked, so assignments in nested blocks (and nested
    definitions) are seen as well. Plain assignments are delegated to
    _process_assignment; annotated and augmented assignments register their
    target as a "variable" when it is a simple name. Existing entries in
    *symbols* are never replaced. *warnings* is accepted but not used here.
    """
    for node in ast.walk(func_node):
        if isinstance(node, ast.Assign):
            self._process_assignment(node, symbols, scope)
            continue

        if not isinstance(node, (ast.AnnAssign, ast.AugAssign)):
            continue

        target = node.target
        if not isinstance(target, ast.Name):
            continue

        var_name = target.id
        if self.is_user_defined(var_name) and var_name not in symbols:
            symbols[var_name] = Symbol(
                name=var_name, kind="variable", scope=scope,
            )
221
+
222
def _process_assignment(self, node: ast.Assign, symbols: dict, scope: str):
    """Register the names bound by one assignment statement.

    Every target of the statement is examined:
      * a plain name becomes a "variable";
      * ``self.attr`` / ``cls.attr`` registers ``attr`` as a "field";
      * tuple/list unpacking is followed recursively, including nested
        patterns such as ``a, (b, c) = ...`` and starred targets such as
        ``head, *rest = ...``.

    Names rejected by is_user_defined, and names already present in
    *symbols*, are left alone.

    Args:
        node: The assignment statement.
        symbols: Mapping of name -> Symbol, updated in place.
        scope: Qualified scope recorded on newly created symbols.
    """

    def register(name, kind):
        if self.is_user_defined(name) and name not in symbols:
            symbols[name] = Symbol(name=name, kind=kind, scope=scope)

    def visit(target):
        if isinstance(target, ast.Name):
            register(target.id, "variable")
        elif isinstance(target, (ast.Tuple, ast.List)):
            # Recurse so nested patterns and attribute targets inside an
            # unpacking are not silently skipped.
            for element in target.elts:
                visit(element)
        elif isinstance(target, ast.Starred):
            # "*rest" wraps the real target in a Starred node.
            visit(target.value)
        elif isinstance(target, ast.Attribute):
            owner = target.value
            # Only attributes set on the conventional receivers are collected.
            if isinstance(owner, ast.Name) and owner.id in ("self", "cls"):
                register(target.attr, "field")

    for target in node.targets:
        visit(target)
248
+
249
def _find_all_occurrences(self, source_code: str, symbols: dict):
    """Append a SymbolLocation to each symbol for every usable occurrence.

    Each symbol name is searched as a whole word. A hit is ignored when it
    lies inside a string literal, inside a ``#`` comment, on an import line,
    or on a decorator line (checked in that order). Recorded offsets are
    indices into *source_code*; line and column numbers are 1-based.
    """
    skip_checks = (
        self._is_inside_string,
        self._is_inside_comment,
        self._is_on_import_line,
        self._is_decorator,
    )

    for name, symbol in symbols.items():
        whole_word = re.compile(r"\b" + re.escape(name) + r"\b")

        for hit in whole_word.finditer(source_code):
            start, stop = hit.span()
            line_no = source_code.count("\n", 0, start) + 1

            if any(check(source_code, start) for check in skip_checks):
                continue

            # rfind yields -1 on the first line, which makes the column 1-based there too.
            previous_newline = source_code.rfind("\n", 0, start)
            symbol.locations.append(SymbolLocation(
                file=self._source_file,
                line=line_no,
                col=start - previous_newline,
                offset=start,
                end_offset=stop,
            ))
278
+
279
+ def _is_inside_string(self, source: str, offset: int) -> bool:
280
+ """Check if offset is inside a string literal."""
281
+ # Check for triple-quoted strings first
282
+ before = source[:offset]
283
+ # Count triple quotes
284
+ for triple in ['"""', "'''"]:
285
+ count = before.count(triple)
286
+ if count % 2 == 1:
287
+ return True
288
+
289
+ # Check single-line strings
290
+ line_start = before.rfind("\n") + 1
291
+ line_prefix = source[line_start:offset]
292
+ in_str = False
293
+ quote_ch = None
294
+ i = 0
295
+ while i < len(line_prefix):
296
+ ch = line_prefix[i]
297
+ if not in_str:
298
+ if ch in ('"', "'"):
299
+ in_str = True
300
+ quote_ch = ch
301
+ else:
302
+ if ch == "\\":
303
+ i += 1
304
+ elif ch == quote_ch:
305
+ in_str = False
306
+ i += 1
307
+ return in_str
308
+
309
+ def _is_inside_comment(self, source: str, offset: int) -> bool:
310
+ """Check if offset is inside a # comment."""
311
+ line_start = source.rfind("\n", 0, offset) + 1
312
+ line_prefix = source[line_start:offset]
313
+ # Check if there's an unquoted # before this position
314
+ in_str = False
315
+ quote_ch = None
316
+ for ch in line_prefix:
317
+ if not in_str:
318
+ if ch == "#":
319
+ return True
320
+ if ch in ('"', "'"):
321
+ in_str = True
322
+ quote_ch = ch
323
+ else:
324
+ if ch == quote_ch:
325
+ in_str = False
326
+ return False
327
+
328
+ def _is_on_import_line(self, source: str, offset: int) -> bool:
329
+ """Check if offset is on an import line."""
330
+ line_start = source.rfind("\n", 0, offset) + 1
331
+ line_end = source.find("\n", offset)
332
+ if line_end == -1:
333
+ line_end = len(source)
334
+ line = source[line_start:line_end].strip()
335
+ return line.startswith("import ") or line.startswith("from ")
336
+
337
+ def _is_decorator(self, source: str, offset: int) -> bool:
338
+ """Check if offset is on a decorator line."""
339
+ line_start = source.rfind("\n", 0, offset) + 1
340
+ line = source[line_start:offset + 50].strip()
341
+ return line.startswith("@")
342
+
343
def is_user_defined(self, name: str, **kwargs) -> bool:
    """Tell whether *name* is a user-chosen identifier that may be renamed.

    A name is rejected when it is empty, listed in PYTHON_BUILTINS, recorded
    in self._imports, listed in FRAMEWORK_SYMBOLS, a dunder (starts and ends
    with two underscores), or the single throwaway underscore. Extra keyword
    arguments are accepted and ignored.
    """
    if not name:
        return False

    is_reserved = (
        name in PYTHON_BUILTINS
        or name in self._imports
        or name in FRAMEWORK_SYMBOLS
    )
    is_dunder = name.startswith("__") and name.endswith("__")
    is_throwaway = name == "_"
    return not (is_reserved or is_dunder or is_throwaway)
358
+
359
def _extract_comments(self, source_code: str) -> list[Comment]:
    """Collect every ``#`` comment and every docstring in *source_code*.

    Line comments are taken from the tokenizer's COMMENT tokens, so a ``#``
    inside a string literal is never mistaken for a comment and a real
    comment that follows such a literal on the same line is still found.
    Docstrings are the leading string-expression statements of the module,
    classes and functions; each is reported as the span of whole lines it
    occupies (from the start of its first line to the end of its last line,
    newline included, clamped to the end of the text).

    Args:
        source_code: Full text of a Python source file.

    Returns:
        Comment entries for the line comments (in source order) followed by
        the docstrings (in ast.walk order). Offsets are indices into
        *source_code*; line numbers are 1-based.

    Raises:
        SyntaxError: If *source_code* is not valid Python.
    """
    # Local imports: the module's import block does not bring these in.
    import io
    import tokenize

    # Parse first so invalid source fails with SyntaxError before tokenizing.
    tree = ast.parse(source_code)

    # line_starts[i] is the offset of 0-indexed line i; the extra final entry
    # is the offset just past the last line's (possibly absent) newline.
    line_starts = [0]
    for text_line in source_code.split("\n"):
        line_starts.append(line_starts[-1] + len(text_line) + 1)

    comments = []

    # Line comments
    for token in tokenize.generate_tokens(io.StringIO(source_code).readline):
        if token.type != tokenize.COMMENT:
            continue
        row, col = token.start
        start = line_starts[row - 1] + col
        comments.append(Comment(
            offset=start,
            end_offset=start + len(token.string),
            line=row,
        ))

    # Docstrings (string constants that are the first statement of a body)
    documented = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Module)
    for node in ast.walk(tree):
        if not isinstance(node, documented):
            continue
        body = node.body
        if not (body and isinstance(body[0], ast.Expr)
                and isinstance(body[0].value, ast.Constant)
                and isinstance(body[0].value.value, str)):
            continue

        doc_node = body[0]
        start_offset = line_starts[doc_node.lineno - 1]
        # Clamp: the last line may have no trailing newline.
        end_offset = min(line_starts[doc_node.end_lineno], len(source_code))
        comments.append(Comment(
            offset=start_offset,
            end_offset=end_offset,
            line=doc_node.lineno,
        ))

    return comments
File without changes