codegraph-nav 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codegraph_nav/__init__.py +194 -0
- codegraph_nav/ast_grep_analyzer.py +448 -0
- codegraph_nav/cli.py +223 -0
- codegraph_nav/code_navigator.py +1328 -0
- codegraph_nav/code_search.py +1009 -0
- codegraph_nav/colors.py +209 -0
- codegraph_nav/completions.py +354 -0
- codegraph_nav/dart_analyzer.py +301 -0
- codegraph_nav/dependency_graph.py +814 -0
- codegraph_nav/domain/__init__.py +20 -0
- codegraph_nav/domain/routes.py +337 -0
- codegraph_nav/domain/schemas.py +229 -0
- codegraph_nav/domain/tags.py +87 -0
- codegraph_nav/exporters.py +563 -0
- codegraph_nav/go_analyzer.py +273 -0
- codegraph_nav/graph/__init__.py +72 -0
- codegraph_nav/graph/builder.py +409 -0
- codegraph_nav/graph/communities.py +402 -0
- codegraph_nav/graph/flows.py +311 -0
- codegraph_nav/graph/query.py +380 -0
- codegraph_nav/graph/schema.py +266 -0
- codegraph_nav/graph/search.py +257 -0
- codegraph_nav/graph/store.py +517 -0
- codegraph_nav/hints.py +195 -0
- codegraph_nav/import_resolver.py +891 -0
- codegraph_nav/js_ts_analyzer.py +564 -0
- codegraph_nav/line_reader.py +664 -0
- codegraph_nav/mcp/__init__.py +39 -0
- codegraph_nav/mcp/__main__.py +5 -0
- codegraph_nav/mcp/server.py +2228 -0
- codegraph_nav/py.typed +2 -0
- codegraph_nav/ruby_analyzer.py +259 -0
- codegraph_nav/rust_analyzer.py +379 -0
- codegraph_nav/token_efficient_renderer.py +743 -0
- codegraph_nav/watcher.py +382 -0
- codegraph_nav-0.1.0.dist-info/METADATA +487 -0
- codegraph_nav-0.1.0.dist-info/RECORD +41 -0
- codegraph_nav-0.1.0.dist-info/WHEEL +5 -0
- codegraph_nav-0.1.0.dist-info/entry_points.txt +4 -0
- codegraph_nav-0.1.0.dist-info/licenses/LICENSE +21 -0
- codegraph_nav-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1328 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Code Mapper - Generates a structural map/graph of a codebase for token-efficient navigation.
|
|
3
|
+
|
|
4
|
+
This module creates a lightweight index of functions, classes, methods, and their
|
|
5
|
+
relationships within a codebase. The generated index can be used for quick symbol
|
|
6
|
+
lookup without reading entire files.
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
Command line usage:
|
|
10
|
+
$ codegraph-nav scan /path/to/project -o .codegraph.json
|
|
11
|
+
|
|
12
|
+
Python API usage:
|
|
13
|
+
>>> mapper = CodeNavigator('/path/to/project')
|
|
14
|
+
>>> code_map = mapper.scan()
|
|
15
|
+
>>> print(code_map['stats'])
|
|
16
|
+
{'files_processed': 142, 'symbols_found': 1847, 'errors': 0}
|
|
17
|
+
|
|
18
|
+
Attributes:
|
|
19
|
+
LANGUAGE_EXTENSIONS: Dict mapping language names to file extensions.
|
|
20
|
+
DEFAULT_IGNORE_PATTERNS: List of patterns to ignore when scanning.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import ast
|
|
25
|
+
import fnmatch
|
|
26
|
+
import json
|
|
27
|
+
import os
|
|
28
|
+
import re
|
|
29
|
+
import subprocess
|
|
30
|
+
import sys
|
|
31
|
+
import time
|
|
32
|
+
from dataclasses import dataclass
|
|
33
|
+
from datetime import datetime
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Any, Protocol
|
|
36
|
+
|
|
37
|
+
from .colors import get_colors
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class _Analyzer(Protocol):
    """Duck-typed contract shared by the per-language analyzers.

    Any object that offers a zero-argument ``analyze()`` method returning a
    list of ``Symbol`` objects satisfies this protocol.
    """

    def analyze(self) -> list["Symbol"]:
        """Return the symbols extracted by this analyzer."""
        ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
__version__ = "0.1.0"
|
|
47
|
+
|
|
48
|
+
# Language identifier -> list of file suffixes (lower-case, with leading dot).
# Built from a compact table; each value is a fresh list of suffix strings.
LANGUAGE_EXTENSIONS = {
    language: suffixes.split()
    for language, suffixes in (
        ("python", ".py"),
        ("javascript", ".js .jsx .mjs"),
        ("typescript", ".ts .tsx"),
        ("java", ".java"),
        ("go", ".go"),
        ("rust", ".rs"),
        ("c", ".c .h"),
        ("cpp", ".cpp .hpp .cc .hh .cxx"),
        ("ruby", ".rb"),
        ("php", ".php"),
        ("dart", ".dart"),
    )
}
|
|
62
|
+
|
|
63
|
+
DEFAULT_IGNORE_PATTERNS = [
    # Build artifacts and dependencies
    *(
        "node_modules", "__pycache__", "venv", "env", "dist", "build",
        ".next", "coverage", ".nyc_output", "*.min.js", "*.bundle.js",
        ".tox", "eggs", "*.egg-info", ".pytest_cache", "vendor", "target",
        "bin", "obj",
    ),
    # Dart/Flutter build artifacts and generated code
    *(".dart_tool", "*.g.dart", "*.freezed.dart", "*.gr.dart"),
    # Version control
    *(".git", ".svn", ".hg"),
    # IDE settings
    *(".idea", ".vscode"),
    # Environment files - ALL variants (SECURITY: prevents exposure of secrets)
    *(
        ".env", ".env.*", ".env.local", ".env.*.local", ".env.production*",
        ".env.development*", ".envrc", "*.env",
    ),
    # Credentials and secrets (SECURITY)
    *(
        "secrets*", "*secret*", "*secrets*", "*credential*", "*credentials*",
        ".aws", ".gcp", ".ssh", ".gnupg",
    ),
    # Keys and certificates (SECURITY)
    *(
        "*.pem", "*.key", "*.p8", "*.p12", "*.pfx", "id_rsa*", "id_ed25519*",
        "id_ecdsa*", "*.crt", "*.cer",
    ),
    # Config files with potential secrets (SECURITY)
    *(".npmrc", ".pypirc", ".netrc", "config/database.yml", "config/secrets.yml"),
    # API keys and tokens
    *("*apikey*", "*api_key*", "*token*"),
]
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass
|
|
140
|
+
class Symbol:
|
|
141
|
+
"""Represents a code symbol (function, class, method, etc.).
|
|
142
|
+
|
|
143
|
+
Attributes:
|
|
144
|
+
name: The symbol's name (e.g., 'process_payment').
|
|
145
|
+
type: The symbol type ('function', 'class', 'method', 'variable', 'import').
|
|
146
|
+
file_path: Relative path to the file containing the symbol.
|
|
147
|
+
line_start: Starting line number (1-indexed).
|
|
148
|
+
line_end: Ending line number (1-indexed, inclusive).
|
|
149
|
+
signature: Function/class signature (e.g., 'def foo(x: int) -> str').
|
|
150
|
+
docstring: First few lines of docstring, if present.
|
|
151
|
+
parent: For methods, the containing class name.
|
|
152
|
+
dependencies: List of symbols this symbol calls/uses.
|
|
153
|
+
decorators: List of decorator names applied to this symbol.
|
|
154
|
+
|
|
155
|
+
Example:
|
|
156
|
+
>>> symbol = Symbol(
|
|
157
|
+
... name='process_payment',
|
|
158
|
+
... type='function',
|
|
159
|
+
... file_path='src/billing.py',
|
|
160
|
+
... line_start=45,
|
|
161
|
+
... line_end=89,
|
|
162
|
+
... signature='def process_payment(user_id: int, amount: Decimal)'
|
|
163
|
+
... )
|
|
164
|
+
"""
|
|
165
|
+
|
|
166
|
+
name: str
|
|
167
|
+
type: str
|
|
168
|
+
file_path: str
|
|
169
|
+
line_start: int
|
|
170
|
+
line_end: int
|
|
171
|
+
signature: str | None = None
|
|
172
|
+
docstring: str | None = None
|
|
173
|
+
parent: str | None = None
|
|
174
|
+
dependencies: list[str] | None = None
|
|
175
|
+
decorators: list[str] | None = None
|
|
176
|
+
truncated: bool = False # True if symbol exceeded max line limit during analysis
|
|
177
|
+
|
|
178
|
+
def __post_init__(self):
|
|
179
|
+
"""Initialize mutable default values."""
|
|
180
|
+
if self.dependencies is None:
|
|
181
|
+
self.dependencies = []
|
|
182
|
+
if self.decorators is None:
|
|
183
|
+
self.decorators = []
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
class PythonAnalyzer(ast.NodeVisitor):
    """Analyzes Python files using AST for accurate symbol extraction.

    The source is parsed with the standard ``ast`` module and walked as a
    ``NodeVisitor``; every class, function and async function definition is
    recorded as a ``Symbol``.

    Attributes:
        file_path: Path to the file being analyzed.
        source: Source code content.
        lines: List of source lines.
        symbols: Extracted symbols.
        current_class: Name of class currently being visited (for method detection).
        imports: List of imported modules/names.

    Example:
        >>> analyzer = PythonAnalyzer('example.py', 'def greet(name): return name')
        >>> analyzer.analyze()[0].name
        'greet'
    """

    # Errors ast.unparse may raise on malformed or extremely deep AST nodes.
    _UNPARSE_ERRORS = (TypeError, AttributeError, RecursionError, ValueError)

    def __init__(self, file_path: str, source: str):
        """Initialize the Python analyzer.

        Args:
            file_path: Relative path to the file.
            source: Source code content.
        """
        self.file_path = file_path
        self.source = source
        self.lines = source.split("\n")
        self.symbols: list[Symbol] = []
        self.current_class: str | None = None
        self.imports: list[str] = []

    def _unparse(self, node) -> str | None:
        """Return source text for *node*, or None when it cannot be rendered."""
        try:
            return ast.unparse(node)
        except self._UNPARSE_ERRORS:
            return None

    def _format_arg(self, arg) -> str:
        """Render one ``ast.arg`` as ``name`` or ``name: annotation``."""
        if not arg.annotation:
            return arg.arg
        annotation = self._unparse(arg.annotation)
        # An unrenderable annotation is dropped rather than failing the symbol.
        return arg.arg if annotation is None else f"{arg.arg}: {annotation}"

    def get_line_end(self, node) -> int:
        """Get the end line of an AST node.

        Args:
            node: An AST node.

        Returns:
            ``end_lineno`` when the node carries one; otherwise the end line of
            the last statement in its body; otherwise the node's own line.
        """
        if hasattr(node, "end_lineno") and node.end_lineno:
            return int(node.end_lineno)
        if hasattr(node, "body") and node.body:
            return self.get_line_end(node.body[-1])
        return int(node.lineno)

    def get_signature(self, node) -> str:
        """Extract function/method signature from an AST node.

        Every parameter kind is rendered: positional-only (followed by ``/``),
        regular, ``*args`` (or a bare ``*`` before keyword-only parameters),
        keyword-only and ``**kwargs``. Default values are omitted to keep the
        signature compact.

        Args:
            node: A FunctionDef or AsyncFunctionDef AST node.

        Returns:
            String representation of the function signature, or an empty
            string when *node* is not a function definition.

        Example:
            >>> # For 'async def foo(x: int, *rest, flag=False) -> str:'
            >>> analyzer.get_signature(node)
            'async def foo(x: int, *rest, flag) -> str'
        """
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            return ""

        spec = node.args
        params = [self._format_arg(arg) for arg in spec.posonlyargs]
        if spec.posonlyargs:
            params.append("/")
        params.extend(self._format_arg(arg) for arg in spec.args)
        if spec.vararg:
            params.append("*" + self._format_arg(spec.vararg))
        elif spec.kwonlyargs:
            # A bare "*" separates keyword-only parameters when no *args exists.
            params.append("*")
        params.extend(self._format_arg(arg) for arg in spec.kwonlyargs)
        if spec.kwarg:
            params.append("**" + self._format_arg(spec.kwarg))

        returns = ""
        if node.returns:
            rendered = self._unparse(node.returns)
            if rendered is not None:
                returns = f" -> {rendered}"

        prefix = "async " if isinstance(node, ast.AsyncFunctionDef) else ""
        return f"{prefix}def {node.name}({', '.join(params)}){returns}"

    def get_decorators(self, node) -> list[str]:
        """Extract decorator names from an AST node.

        Args:
            node: An AST node with decorator_list attribute.

        Returns:
            List of decorator expressions as source text. A decorator that
            cannot be unparsed falls back to its bare name when it is a simple
            ``ast.Name``; otherwise it is skipped.
        """
        decorators = []
        for dec in node.decorator_list:
            rendered = self._unparse(dec)
            if rendered is not None:
                decorators.append(rendered)
            elif isinstance(dec, ast.Name):
                decorators.append(dec.id)
        return decorators

    def get_docstring(self, node) -> str | None:
        """Extract docstring from an AST node, truncated for efficiency.

        Args:
            node: An AST node that may have a docstring.

        Returns:
            The docstring, cut to its first 3 lines plus "..." when longer,
            or None if there is no (non-empty) docstring.
        """
        doc = ast.get_docstring(node)
        if not doc:
            return None
        doc_lines = doc.split("\n")
        if len(doc_lines) > 3:
            return "\n".join(doc_lines[:3]) + "..."
        return doc

    def visit_Import(self, node):
        """Record every module named in an ``import`` statement."""
        for alias in node.names:
            self.imports.append(alias.name)
        self.generic_visit(node)

    def visit_ImportFrom(self, node):
        """Record ``module.name`` for every name in a ``from ... import``."""
        module = node.module or ""
        for alias in node.names:
            self.imports.append(f"{module}.{alias.name}")
        self.generic_visit(node)

    def visit_ClassDef(self, node):
        """Record a class symbol, then visit its body with the class as context."""
        bases = [
            rendered
            for base in node.bases
            if (rendered := self._unparse(base)) is not None
        ]

        signature = f"class {node.name}"
        if bases:
            signature += f"({', '.join(bases)})"

        self.symbols.append(
            Symbol(
                name=node.name,
                type="class",
                file_path=self.file_path,
                line_start=node.lineno,
                line_end=self.get_line_end(node),
                signature=signature,
                docstring=self.get_docstring(node),
                decorators=self.get_decorators(node),
            )
        )

        # Restore the previous class afterwards so nested classes unwind correctly.
        old_class = self.current_class
        self.current_class = node.name
        self.generic_visit(node)
        self.current_class = old_class

    def visit_FunctionDef(self, node):
        """Visit a function definition."""
        self._visit_function(node)

    def visit_AsyncFunctionDef(self, node):
        """Visit an async function definition."""
        self._visit_function(node)

    def _visit_function(self, node):
        """Process a function or async function definition.

        The symbol is a "method" whenever a class is currently being visited,
        otherwise a "function". Dependencies are the names of everything
        called anywhere inside the node (plain names and attribute names).

        Args:
            node: A FunctionDef or AsyncFunctionDef AST node.
        """
        symbol_type = "method" if self.current_class else "function"

        calls = set()
        for child in ast.walk(node):
            if not isinstance(child, ast.Call):
                continue
            if isinstance(child.func, ast.Name):
                calls.add(child.func.id)
            elif isinstance(child.func, ast.Attribute):
                calls.add(child.func.attr)

        self.symbols.append(
            Symbol(
                name=node.name,
                type=symbol_type,
                file_path=self.file_path,
                line_start=node.lineno,
                line_end=self.get_line_end(node),
                signature=self.get_signature(node),
                docstring=self.get_docstring(node),
                parent=self.current_class,
                # Sorted so the generated index is identical from run to run
                # (plain set iteration order varies with string hashing).
                dependencies=sorted(calls),
                decorators=self.get_decorators(node),
            )
        )
        self.generic_visit(node)

    def analyze(self) -> list[Symbol]:
        """Parse and analyze the file.

        Parse failures are reported on stderr instead of being raised, so one
        bad file does not abort a scan. Symbols collected before a failure
        are still returned.

        Returns:
            List of Symbol objects found in the file.
        """
        try:
            tree = ast.parse(self.source)
            self.visit(tree)
        except SyntaxError as e:
            print(f"Syntax error in {self.file_path}: {e}", file=sys.stderr)
        except (ValueError, RecursionError) as e:
            # ValueError: NUL bytes in the source on older Python versions;
            # RecursionError: pathologically deep nesting.
            print(f"Could not analyze {self.file_path}: {e}", file=sys.stderr)
        return self.symbols
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
class GenericAnalyzer:
    """Regex-based analyzer for non-Python languages.

    Provides heuristic symbol detection for JavaScript, TypeScript, Java, Go,
    Ruby, Rust and Dart using regular expression patterns. Less accurate than
    AST analysis but works across multiple languages.

    Attributes:
        PATTERNS: Dict of regex patterns for each supported language.
        file_path: Path to the file being analyzed.
        source: Source code content.
        language: The programming language of the file.
        lines: List of source lines.

    Example:
        >>> source = 'function greet(name) { return "Hello, " + name; }'
        >>> analyzer = GenericAnalyzer('example.js', source, 'javascript')
        >>> symbols = analyzer.analyze()
    """

    PATTERNS = {
        "javascript": {
            "function": r"(?:async\s+)?function\s+(\w+)\s*\([^)]*\)",
            "arrow": r"(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\([^)]*\)\s*=>",
            "class": r"class\s+(\w+)(?:\s+extends\s+\w+)?",
            # The lookahead keeps control-flow statements such as
            # "if (x) {" or "for (...) {" from being reported as methods.
            "method": (
                r"(?:async\s+)?\b(?!(?:if|for|while|switch|catch|with|function|return)\b)"
                r"(\w+)\s*\([^)]*\)\s*{"
            ),
        },
        "typescript": {
            "function": r"(?:async\s+)?function\s+(\w+)\s*(?:<[^>]*>)?\s*\([^)]*\)",
            "interface": r"interface\s+(\w+)",
            "type": r"type\s+(\w+)\s*=",
            "class": r"class\s+(\w+)(?:\s+extends\s+\w+)?(?:\s+implements\s+\w+)?",
        },
        "java": {
            "class": r"(?:public|private|protected)?\s*class\s+(\w+)",
            "interface": r"interface\s+(\w+)",
            "method": r"(?:public|private|protected)?\s*(?:static\s+)?(?:\w+(?:<[^>]*>)?)\s+(\w+)\s*\([^)]*\)",
        },
        "go": {
            "function": r"func\s+(\w+)\s*(?:\[[^\]]*\])?\s*\(",
            "method": r"func\s+\([^)]+\)\s+(\w+)\s*(?:\[[^\]]*\])?\s*\(",
            "struct": r"type\s+(\w+)\s+struct",
            "interface": r"type\s+(\w+)\s+interface",
            "type_alias": r"type\s+(\w+)\s+(?!struct\b|interface\b)\w+",
        },
        "ruby": {
            "function": r"^[ \t]*def\s+(?!self\.)(\w+[!?=]?)",
            "class": r"^[ \t]*class\s+([A-Z]\w*)",
            "module": r"^[ \t]*module\s+([A-Z]\w*)",
        },
        "rust": {
            "function": r"(?:pub\s+)?(?:async\s+)?fn\s+(\w+)",
            "struct": r"(?:pub\s+)?struct\s+(\w+)",
            "impl": r"impl(?:<[^>]*>)?\s+(\w+)",
            "trait": r"(?:pub\s+)?trait\s+(\w+)",
            "enum": r"(?:pub\s+)?enum\s+(\w+)",
        },
        "dart": {
            "class": r"(?:abstract\s+)?class\s+(\w+)",
            "mixin": r"^[ \t]*mixin\s+(\w+)",
            "enum": r"enum\s+(\w+)\s*\{",
            "extension": r"extension\s+(\w+)\s+on\s+\w+",
            "function": r"^[ \t]*(?!(?:if|for|while|switch|catch|return|do|else|throw|new|await|assert|yield)\b)(?:Future(?:<[^>]+>)?|void|String|int|double|bool|num|dynamic|Widget|List(?:<[^>]+>)?|Map(?:<[^>]+>)?|Set(?:<[^>]+>)?|Iterable(?:<[^>]+>)?|Stream(?:<[^>]+>)?|[A-Z]\w*\??)\s+(\w+)\s*\([^)]*\)\s*(?:async\s*\*?\s*)?\{",
        },
    }

    # Maximum lines to scan for a symbol's end before giving up
    MAX_SYMBOL_LINES = 500

    # Languages that use 'end' keyword instead of braces for block termination
    KEYWORD_END_LANGUAGES = {"ruby"}

    # Keywords that open a new block in end-based languages
    _END_OPENERS = (
        "def ",
        "class ",
        "module ",
        "do",
        "if ",
        "unless ",
        "while ",
        "until ",
        "for ",
        "begin",
        "case ",
    )

    # Opener keyword at the start of a line, matched as a whole word so that
    # identifiers like "done", "document" or "beginning" do not count.
    _OPENER_RE = re.compile(
        r"^(?:" + "|".join(kw.strip() for kw in _END_OPENERS) + r")\b"
    )
    # Block opened at the end of a line: "items.each do |item|" / "loop do".
    _TRAILING_DO_RE = re.compile(r"\bdo(?:\s*\|[^|]*\|)?\s*$")
    # Line that closes a block: "end", "end.map", "end)" - but not "end_time" or "end:".
    _END_RE = re.compile(r"^end(?![\w:?!])")
    # Opener closed again on the same line: "def x; end", "if a then b end".
    _INLINE_END_RE = re.compile(r"(?:;|\s)end$")

    def __init__(self, file_path: str, source: str, language: str):
        """Initialize the generic analyzer.

        Args:
            file_path: Relative path to the file.
            source: Source code content.
            language: Programming language identifier (a key of PATTERNS).
        """
        self.file_path = file_path
        self.source = source
        self.language = language
        self.lines = source.split("\n")

    @classmethod
    def _keyword_depth_delta(cls, stripped: str) -> int:
        """Return how one stripped line changes the keyword/end nesting depth."""
        if not stripped or stripped.startswith("#"):
            return 0
        if cls._END_RE.match(stripped):
            return -1
        if cls._OPENER_RE.match(stripped) or cls._TRAILING_DO_RE.search(stripped):
            # A block opened and closed on the same line leaves depth unchanged.
            return 0 if cls._INLINE_END_RE.search(stripped) else 1
        return 0

    def _find_keyword_end(self, line_num: int) -> tuple[int, bool]:
        """Locate the ``end`` closing the block declared on *line_num*.

        Returns:
            ``(line_end, truncated)``; ``line_end`` falls back to *line_num*
            when no matching ``end`` exists in the file.
        """
        depth = self._keyword_depth_delta(self.lines[line_num - 1].strip())
        if depth <= 0:
            # One-line definition such as "def x; end".
            return line_num, False
        for i, line in enumerate(self.lines[line_num:], start=line_num + 1):
            depth += self._keyword_depth_delta(line.strip())
            if depth <= 0:
                return i, False
            if i > line_num + self.MAX_SYMBOL_LINES:
                return i, True
        return line_num, False

    def _find_brace_end(self, line_num: int) -> tuple[int, bool]:
        """Locate the line where the brace block starting at *line_num* closes.

        Braces inside strings and comments are counted too, so this is a
        heuristic.

        Returns:
            ``(line_end, truncated)``; ``line_end`` falls back to *line_num*
            when the file ends before the block closes.
        """
        balance = 0
        opened = False
        for i, line in enumerate(self.lines[line_num - 1 :], start=line_num):
            balance += line.count("{") - line.count("}")
            if "{" in line:
                opened = True
            if opened and balance <= 0:
                return i, False
            if not opened and line.rstrip().endswith(";"):
                # Brace-less declaration ("type X = string;", a prototype):
                # stop here instead of swallowing the next braced block.
                return i, False
            if i > line_num + self.MAX_SYMBOL_LINES:
                return i, True
        return line_num, False

    def analyze(self) -> list[Symbol]:
        """Analyze the file using regex patterns.

        Each pattern of the configured language is run over the whole source.
        Every match becomes a ``Symbol`` whose name is the first capture
        group and whose signature is the matched text (at most 100 chars).

        Returns:
            List of Symbol objects found in the file (empty for languages
            without patterns).
        """
        symbols: list[Symbol] = []
        patterns = self.PATTERNS.get(self.language, {})
        if self.language in self.KEYWORD_END_LANGUAGES:
            find_end = self._find_keyword_end
        else:
            find_end = self._find_brace_end

        for symbol_type, pattern in patterns.items():
            for match in re.finditer(pattern, self.source, re.MULTILINE):
                matched = match.group(0)
                # Patterns beginning with optional whitespace can start on a
                # preceding blank line; anchor the line number on the first
                # non-whitespace character of the match.
                first_char = match.start() + len(matched) - len(matched.lstrip())
                line_num = self.source.count("\n", 0, first_char) + 1

                line_end, was_truncated = find_end(line_num)

                symbols.append(
                    Symbol(
                        name=match.group(1),
                        type=symbol_type,
                        file_path=self.file_path,
                        line_start=line_num,
                        line_end=line_end,
                        signature=matched.strip()[:100],
                        truncated=was_truncated,
                    )
                )

        return symbols
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
class GitIntegration:
|
|
590
|
+
"""Git integration utilities for the code mapper.
|
|
591
|
+
|
|
592
|
+
Provides methods to get git-tracked files, parse .gitignore,
|
|
593
|
+
and find changes since a specific commit.
|
|
594
|
+
|
|
595
|
+
Attributes:
|
|
596
|
+
root_path: Path to the git repository root.
|
|
597
|
+
available: Whether git is available and this is a git repo.
|
|
598
|
+
|
|
599
|
+
Example:
|
|
600
|
+
>>> git = GitIntegration('/path/to/repo')
|
|
601
|
+
>>> if git.available:
|
|
602
|
+
... tracked_files = git.get_tracked_files()
|
|
603
|
+
... print(f"Found {len(tracked_files)} tracked files")
|
|
604
|
+
"""
|
|
605
|
+
|
|
606
|
+
def __init__(self, root_path: Path):
|
|
607
|
+
"""Initialize git integration.
|
|
608
|
+
|
|
609
|
+
Args:
|
|
610
|
+
root_path: Path to the repository root.
|
|
611
|
+
"""
|
|
612
|
+
self.root_path = root_path
|
|
613
|
+
self.available = self._check_git_available()
|
|
614
|
+
|
|
615
|
+
def _check_git_available(self) -> bool:
|
|
616
|
+
"""Check if git is available and this is a git repository."""
|
|
617
|
+
try:
|
|
618
|
+
result = subprocess.run(
|
|
619
|
+
["git", "rev-parse", "--git-dir"],
|
|
620
|
+
cwd=self.root_path,
|
|
621
|
+
capture_output=True,
|
|
622
|
+
text=True,
|
|
623
|
+
timeout=5,
|
|
624
|
+
)
|
|
625
|
+
return result.returncode == 0
|
|
626
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
|
627
|
+
return False
|
|
628
|
+
|
|
629
|
+
def get_tracked_files(self) -> set[str]:
|
|
630
|
+
"""Get all files tracked by git.
|
|
631
|
+
|
|
632
|
+
Returns:
|
|
633
|
+
Set of relative file paths tracked by git.
|
|
634
|
+
"""
|
|
635
|
+
if not self.available:
|
|
636
|
+
return set()
|
|
637
|
+
|
|
638
|
+
try:
|
|
639
|
+
result = subprocess.run(
|
|
640
|
+
["git", "ls-files"],
|
|
641
|
+
cwd=self.root_path,
|
|
642
|
+
capture_output=True,
|
|
643
|
+
text=True,
|
|
644
|
+
timeout=30,
|
|
645
|
+
)
|
|
646
|
+
if result.returncode == 0:
|
|
647
|
+
return set(result.stdout.strip().split("\n")) if result.stdout.strip() else set()
|
|
648
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
|
649
|
+
pass
|
|
650
|
+
return set()
|
|
651
|
+
|
|
652
|
+
def get_gitignore_patterns(self) -> list[str]:
|
|
653
|
+
"""Parse .gitignore and return patterns.
|
|
654
|
+
|
|
655
|
+
Returns:
|
|
656
|
+
List of gitignore patterns.
|
|
657
|
+
"""
|
|
658
|
+
patterns = []
|
|
659
|
+
gitignore_path = self.root_path / ".gitignore"
|
|
660
|
+
|
|
661
|
+
if gitignore_path.exists():
|
|
662
|
+
try:
|
|
663
|
+
content = gitignore_path.read_text(encoding="utf-8")
|
|
664
|
+
for line in content.splitlines():
|
|
665
|
+
line = line.strip()
|
|
666
|
+
# Skip comments and empty lines
|
|
667
|
+
if line and not line.startswith("#"):
|
|
668
|
+
patterns.append(line)
|
|
669
|
+
except Exception:
|
|
670
|
+
pass
|
|
671
|
+
|
|
672
|
+
return patterns
|
|
673
|
+
|
|
674
|
+
def get_files_changed_since(self, commit: str) -> set[str]:
|
|
675
|
+
"""Get files that changed since a specific commit.
|
|
676
|
+
|
|
677
|
+
Args:
|
|
678
|
+
commit: Safe git reference (hash, branch, tag, HEAD~N).
|
|
679
|
+
|
|
680
|
+
Returns:
|
|
681
|
+
Set of relative file paths that have changed.
|
|
682
|
+
|
|
683
|
+
Raises:
|
|
684
|
+
ValueError: If commit contains unsafe characters.
|
|
685
|
+
"""
|
|
686
|
+
if not re.fullmatch(r"[a-zA-Z0-9][a-zA-Z0-9_.~^/@{}\-]*", commit):
|
|
687
|
+
raise ValueError(f"Invalid git reference: {commit}")
|
|
688
|
+
|
|
689
|
+
if not self.available:
|
|
690
|
+
return set()
|
|
691
|
+
|
|
692
|
+
try:
|
|
693
|
+
result = subprocess.run(
|
|
694
|
+
["git", "diff", "--name-only", commit, "HEAD"],
|
|
695
|
+
cwd=self.root_path,
|
|
696
|
+
capture_output=True,
|
|
697
|
+
text=True,
|
|
698
|
+
timeout=30,
|
|
699
|
+
)
|
|
700
|
+
if result.returncode == 0:
|
|
701
|
+
return set(result.stdout.strip().split("\n")) if result.stdout.strip() else set()
|
|
702
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
|
703
|
+
pass
|
|
704
|
+
return set()
|
|
705
|
+
|
|
706
|
+
def get_uncommitted_changes(self) -> set[str]:
|
|
707
|
+
"""Get files with uncommitted changes.
|
|
708
|
+
|
|
709
|
+
Returns:
|
|
710
|
+
Set of relative file paths with uncommitted changes.
|
|
711
|
+
"""
|
|
712
|
+
if not self.available:
|
|
713
|
+
return set()
|
|
714
|
+
|
|
715
|
+
try:
|
|
716
|
+
# Get both staged and unstaged changes
|
|
717
|
+
result = subprocess.run(
|
|
718
|
+
["git", "status", "--porcelain"],
|
|
719
|
+
cwd=self.root_path,
|
|
720
|
+
capture_output=True,
|
|
721
|
+
text=True,
|
|
722
|
+
timeout=30,
|
|
723
|
+
)
|
|
724
|
+
if result.returncode == 0:
|
|
725
|
+
files = set()
|
|
726
|
+
for line in result.stdout.strip().split("\n"):
|
|
727
|
+
if line and len(line) > 3:
|
|
728
|
+
# Format: "XY filename" where XY is status
|
|
729
|
+
files.add(line[3:].strip())
|
|
730
|
+
return files
|
|
731
|
+
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
|
|
732
|
+
pass
|
|
733
|
+
return set()
|
|
734
|
+
|
|
735
|
+
|
|
736
|
+
class CodeNavigator:
|
|
737
|
+
"""Main class for mapping a codebase to create a searchable index.
|
|
738
|
+
|
|
739
|
+
Scans a directory tree, analyzes source files, and generates a JSON index
|
|
740
|
+
containing all symbols, their locations, signatures, and dependencies.
|
|
741
|
+
|
|
742
|
+
Attributes:
|
|
743
|
+
root_path: Absolute path to the codebase root.
|
|
744
|
+
ignore_patterns: List of patterns to skip during scanning.
|
|
745
|
+
symbols: List of all discovered symbols.
|
|
746
|
+
file_hashes: Dict mapping file paths to content hashes.
|
|
747
|
+
stats: Dict with processing statistics.
|
|
748
|
+
|
|
749
|
+
Example:
|
|
750
|
+
>>> mapper = CodeNavigator('/path/to/project')
|
|
751
|
+
>>> code_map = mapper.scan()
|
|
752
|
+
>>> print(f"Found {code_map['stats']['symbols_found']} symbols")
|
|
753
|
+
Found 1847 symbols
|
|
754
|
+
|
|
755
|
+
>>> # Save to file
|
|
756
|
+
>>> import json
|
|
757
|
+
>>> with open('.codegraph.json', 'w') as f:
|
|
758
|
+
... json.dump(code_map, f)
|
|
759
|
+
"""
|
|
760
|
+
|
|
761
|
+
def __init__(
    self,
    root_path: str,
    ignore_patterns: list[str] | None = None,
    git_only: bool = False,
    use_gitignore: bool = False,
):
    """Initialize the code mapper.

    Args:
        root_path: Path to the root directory to scan.
        ignore_patterns: Additional patterns to ignore. Merged with defaults,
            so the built-in secret/credential exclusions always stay active.
        git_only: If True, only scan files tracked by git.
        use_gitignore: If True, also ignore patterns from .gitignore.
    """
    self.root_path = Path(root_path).resolve()
    # Defaults first, then caller-supplied extras; dict.fromkeys removes
    # duplicates while keeping first-seen order.
    self.ignore_patterns = list(
        dict.fromkeys([*DEFAULT_IGNORE_PATTERNS, *(ignore_patterns or [])])
    )
    self.git_only = git_only
    self.use_gitignore = use_gitignore
    self.symbols: list[Symbol] = []
    self.file_hashes: dict[str, str] = {}
    self.stats = {"files_processed": 0, "symbols_found": 0, "errors": 0}
    self._existing_map: dict[str, Any] | None = None

    # Initialize git integration
    self._git = GitIntegration(self.root_path)
    self._git_tracked_files: set[str] | None = None

    # Add gitignore patterns if requested
    if self.use_gitignore and self._git.available:
        self.ignore_patterns.extend(self._git.get_gitignore_patterns())

    # Cache git tracked files if git_only mode
    if self.git_only and self._git.available:
        self._git_tracked_files = self._git.get_tracked_files()
|
|
797
|
+
|
|
798
|
+
def should_ignore(self, path: Path) -> bool:
    """Check if a path should be ignored during scanning.

    Patterns are matched against whole path components, taken relative to
    ``self.root_path`` when the path lies inside it. A pattern such as
    ``env`` therefore ignores a directory or file named ``env`` but not
    ``environment.py``, and the location of the project on disk never
    causes a match.

    * A pattern without ``/`` is an ``fnmatch`` glob tested against every
      component of the path.
    * A pattern containing ``/`` (e.g. ``config/secrets.yml``) must match
      a run of consecutive components, segment by segment.

    Args:
        path: Path to check.

    Returns:
        True if the path matches any ignore pattern.
    """
    try:
        relative = path.relative_to(self.root_path)
    except ValueError:
        # Not under the scan root: fall back to the path as given.
        relative = path
    parts = relative.parts

    for raw_pattern in self.ignore_patterns:
        # Leading/trailing slashes (gitignore style "build/") carry no
        # meaning for component matching.
        pattern = raw_pattern.strip("/")
        if not pattern:
            continue

        if "/" not in pattern:
            if any(fnmatch.fnmatch(part, pattern) for part in parts):
                return True
            continue

        segments = pattern.split("/")
        width = len(segments)
        for start in range(len(parts) - width + 1):
            window = parts[start : start + width]
            if all(fnmatch.fnmatch(part, seg) for part, seg in zip(window, segments)):
                return True

    return False
|
|
817
|
+
|
|
818
|
+
def _is_git_tracked(self, file_path: Path) -> bool:
    """Check if a file is tracked by git.

    Args:
        file_path: Absolute path to the file.

    Returns:
        True if the file is git-tracked, or if git_only mode is disabled or
        no tracked-file set has been cached. False if the file is not in the
        tracked set or does not live under ``self.root_path``.
    """
    if not self.git_only or self._git_tracked_files is None:
        return True

    try:
        rel = file_path.relative_to(self.root_path)
    except ValueError:
        # The file is outside the scan root, so it cannot be in the set.
        return False

    tracked = self._git_tracked_files
    # git reports paths with forward slashes on every platform, while
    # str(rel) uses the OS separator. Try the POSIX form first and keep the
    # native form as a fallback in case the set was normalized upstream.
    return rel.as_posix() in tracked or str(rel) in tracked
|
|
835
|
+
|
|
836
|
+
def get_language(self, file_path: Path) -> str | None:
    """Map a file's extension to a language identifier.

    The suffix is lower-cased and looked up in ``LANGUAGE_EXTENSIONS``; the
    first language whose extension collection contains it wins.

    Args:
        file_path: Path to the file.

    Returns:
        Language identifier string, or None if not recognized.
    """
    suffix = file_path.suffix.lower()
    candidates = (
        language
        for language, known_suffixes in LANGUAGE_EXTENSIONS.items()
        if suffix in known_suffixes
    )
    return next(candidates, None)
|
|
850
|
+
|
|
851
|
+
def hash_file(self, content: str) -> str:
    """Generate a hash for file content.

    Thin wrapper that delegates to the package-level
    ``compute_content_hash`` helper.

    Args:
        content: File content string.

    Returns:
        Whatever ``compute_content_hash`` returns for ``content``. It was
        previously documented as a 12-character MD5 hash; that cannot be
        verified from this method, so confirm against the helper.
    """
    # Imported at call time rather than at module import time; presumably
    # to avoid a circular import with the package __init__ - TODO confirm.
    from . import compute_content_hash

    return compute_content_hash(content)
|
|
863
|
+
|
|
864
|
+
def analyze_file(self, file_path: Path) -> list[Symbol]:
    """Extract the symbols defined in one file.

    The file is read as UTF-8 (undecodable bytes are dropped), its content
    hash is recorded in ``self.file_hashes`` under the root-relative path,
    and a language-specific analyzer is chosen from the file extension.
    Any exception is counted in ``self.stats["errors"]``, reported on
    stderr, and turned into an empty result.

    Args:
        file_path: Path to the file to analyze.

    Returns:
        List of Symbol objects found in the file; empty when the language
        is not recognized or analysis fails.
    """
    try:
        with open(file_path, encoding="utf-8", errors="ignore") as handle:
            source_text = handle.read()

        relative = str(file_path.relative_to(self.root_path))
        # The hash is stored before the language check, so the file is
        # recorded even when no analyzer applies.
        self.file_hashes[relative] = self.hash_file(source_text)

        lang = self.get_language(file_path)
        if not lang:
            return []

        suffix = file_path.suffix.lower()
        analyzer: _Analyzer
        if lang == "python":
            analyzer = PythonAnalyzer(relative, source_text)
        elif lang == "javascript":
            from .js_ts_analyzer import JavaScriptAnalyzer

            analyzer = JavaScriptAnalyzer(relative, source_text, is_jsx=suffix == ".jsx")
        elif lang == "typescript":
            from .js_ts_analyzer import TypeScriptAnalyzer

            analyzer = TypeScriptAnalyzer(relative, source_text, is_tsx=suffix == ".tsx")
        elif lang == "ruby":
            from .ruby_analyzer import RubyAnalyzer

            analyzer = RubyAnalyzer(relative, source_text)
        elif lang == "go":
            from .go_analyzer import GoAnalyzer

            analyzer = GoAnalyzer(relative, source_text)
        elif lang == "rust":
            from .rust_analyzer import RustAnalyzer

            analyzer = RustAnalyzer(relative, source_text)
        elif lang == "dart":
            from .dart_analyzer import DartAnalyzer

            analyzer = DartAnalyzer(relative, source_text)
        else:
            # Recognized language without a dedicated analyzer.
            analyzer = GenericAnalyzer(relative, source_text, lang)

        return analyzer.analyze()

    except Exception as e:
        # Best effort: one unreadable or unparsable file must not abort
        # the whole scan.
        self.stats["errors"] += 1
        print(f"Error analyzing {file_path}: {e}", file=sys.stderr)
        return []
|
|
921
|
+
|
|
922
|
+
# Maximum wall-clock time allowed for a scan operation (seconds).
SCAN_TIMEOUT = 30

def scan(self) -> dict[str, Any]:
    """Scan the entire codebase and generate a code map.

    Walks ``self.root_path``, pruning ignored directories, and analyzes
    every non-ignored file with a recognized language (restricted to
    git-tracked files when git_only mode is active). Collected symbols,
    file hashes and stats are reset at the start, so calling ``scan()``
    more than once on the same instance does not duplicate results.

    Returns:
        Dict containing the complete code map with files, index, and stats.
        ``stats`` includes 'scan_timeout': True if the operation was cut
        short after ``SCAN_TIMEOUT`` seconds.

    Example:
        >>> mapper = CodeNavigator('/my/project')
        >>> result = mapper.scan()
        >>> print(result.keys())
        dict_keys(['version', 'root', 'generated_at', 'stats', 'files', 'index'])
    """
    mode = "git-tracked files" if self.git_only else "codebase"
    print(f"Scanning {mode} at: {self.root_path}", file=sys.stderr)

    if self.git_only:
        if not self._git.available:
            print("Warning: git not available, scanning all files", file=sys.stderr)
        elif self._git_tracked_files:
            print(f" Git tracked files: {len(self._git_tracked_files)}", file=sys.stderr)

    # Start from a clean slate: without this a second scan() on the same
    # instance would append to the previous results and double every symbol.
    self.symbols = []
    self.file_hashes = {}
    self.stats = {"files_processed": 0, "symbols_found": 0, "errors": 0}

    scan_start = time.monotonic()
    timed_out = False

    for root, dirs, files in os.walk(self.root_path):
        if time.monotonic() - scan_start > self.SCAN_TIMEOUT:
            timed_out = True
            break

        root_dir = Path(root)
        # Prune in place so os.walk does not descend into ignored directories.
        dirs[:] = [d for d in dirs if not self.should_ignore(root_dir / d)]

        for file in files:
            # Also check the deadline per file: one very large flat
            # directory could otherwise run far past the limit.
            if time.monotonic() - scan_start > self.SCAN_TIMEOUT:
                timed_out = True
                break

            file_path = root_dir / file
            if self.should_ignore(file_path):
                continue

            # Skip if not git-tracked (when git_only mode is enabled)
            if not self._is_git_tracked(file_path):
                continue

            if self.get_language(file_path):
                self.symbols.extend(self.analyze_file(file_path))
                self.stats["files_processed"] += 1

        if timed_out:
            break

    if timed_out:
        print("Warning: scan timed out, returning partial results", file=sys.stderr)
        self.stats["scan_timeout"] = True

    self.stats["symbols_found"] = len(self.symbols)
    return self.generate_map()
|
|
976
|
+
|
|
977
|
+
def get_current_file_hash(self, file_path: Path) -> str | None:
|
|
978
|
+
"""Get the hash of a file's current content without full analysis.
|
|
979
|
+
|
|
980
|
+
Args:
|
|
981
|
+
file_path: Path to the file.
|
|
982
|
+
|
|
983
|
+
Returns:
|
|
984
|
+
Hash string, or None if file cannot be read.
|
|
985
|
+
"""
|
|
986
|
+
try:
|
|
987
|
+
with open(file_path, encoding="utf-8", errors="ignore") as f:
|
|
988
|
+
content = f.read()
|
|
989
|
+
return self.hash_file(content)
|
|
990
|
+
except Exception:
|
|
991
|
+
return None
|
|
992
|
+
|
|
993
|
+
def scan_incremental(self, existing_map_path: str) -> dict[str, Any]:
    """Incrementally update an existing code map.

    Only re-analyzes files whose content hash differs from the one stored
    in the existing map (or that are new). Symbols of unchanged files are
    rebuilt from the stored map. If the existing map cannot be read or is
    not a JSON object with a ``files`` object, a full ``scan()`` is
    performed instead. Files are filtered the same way as in ``scan()``
    (ignore patterns and git_only mode); symlinks are additionally skipped.

    Args:
        existing_map_path: Path to the existing .codegraph.json file.

    Returns:
        Dict containing the updated code map.

    Example:
        >>> mapper = CodeNavigator('/my/project')
        >>> result = mapper.scan_incremental('.codegraph.json')
        >>> print(result['stats'])
        {'files_processed': 5, 'files_unchanged': 137, 'files_added': 2, ...}
    """
    # Load existing map - only the 'files' dict is needed for comparison,
    # so the rest of the (possibly large) map is released right away.
    try:
        with open(existing_map_path, encoding="utf-8") as f:
            existing_map = json.load(f)
        if not isinstance(existing_map, dict):
            raise ValueError("map root is not a JSON object")
        existing_files = existing_map.get("files", {})
        if not isinstance(existing_files, dict):
            raise ValueError("'files' is not a JSON object")
        del existing_map  # Explicit cleanup of the full map
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers
        # json.JSONDecodeError, UnicodeDecodeError and the shape checks.
        print(f"Cannot load existing map ({e}), performing full scan", file=sys.stderr)
        return self.scan()

    print(f"Incremental scan at: {self.root_path}", file=sys.stderr)
    print(f"Existing map has {len(existing_files)} files", file=sys.stderr)

    # Drop results of any earlier scan on this instance so they are not
    # merged into the new map.
    self.symbols = []
    self.file_hashes = {}

    # Initialize incremental stats
    self.stats = {
        "files_processed": 0,
        "files_unchanged": 0,
        "files_added": 0,
        "files_modified": 0,
        "files_deleted": 0,
        "symbols_found": 0,
        "errors": 0,
    }

    # Track which files we've seen in current scan
    current_files: dict[str, str] = {}  # rel_path -> hash

    # First pass: collect all current files and their hashes.
    # Files may be deleted/modified during the walk (TOCTOU); this is
    # handled by catching exceptions and re-checking before analysis.
    for root, dirs, files in os.walk(self.root_path):
        root_dir = Path(root)
        dirs[:] = [d for d in dirs if not self.should_ignore(root_dir / d)]

        for file in files:
            file_path = root_dir / file
            if self.should_ignore(file_path):
                continue

            # Skip symlinks to prevent symlink attacks
            try:
                if file_path.is_symlink():
                    continue
            except OSError:
                continue

            # Honor git_only mode exactly like scan() does; otherwise an
            # incremental update would pull untracked files into the map.
            if not self._is_git_tracked(file_path):
                continue

            if not self.get_language(file_path):
                continue

            rel_path = str(file_path.relative_to(self.root_path))
            try:
                current_hash = self.get_current_file_hash(file_path)
            except OSError:
                # File disappeared or became inaccessible during scan
                continue
            if current_hash:
                current_files[rel_path] = current_hash

    # Categorize files
    unchanged_files: list[str] = []
    modified_files: list[str] = []
    added_files: list[str] = []

    for rel_path, current_hash in current_files.items():
        previous = existing_files.get(rel_path)
        if previous is None:
            added_files.append(rel_path)
        elif current_hash == previous.get("hash", ""):
            unchanged_files.append(rel_path)
        else:
            modified_files.append(rel_path)

    # Files in existing map but not in current scan = deleted
    deleted_files = [f for f in existing_files if f not in current_files]

    print(f" Unchanged: {len(unchanged_files)}", file=sys.stderr)
    print(f" Modified: {len(modified_files)}", file=sys.stderr)
    print(f" Added: {len(added_files)}", file=sys.stderr)
    print(f" Deleted: {len(deleted_files)}", file=sys.stderr)

    # Preserve unchanged files' symbols
    for rel_path in unchanged_files:
        file_info = existing_files[rel_path]
        self.file_hashes[rel_path] = file_info.get("hash", "")

        # Convert stored symbols back to Symbol objects
        for sym_data in file_info.get("symbols", []):
            self.symbols.append(
                Symbol(
                    name=sym_data["name"],
                    type=sym_data["type"],
                    file_path=rel_path,
                    line_start=sym_data["lines"][0],
                    line_end=sym_data["lines"][1],
                    signature=sym_data.get("signature"),
                    docstring=sym_data.get("docstring"),
                    parent=sym_data.get("parent"),
                    dependencies=sym_data.get("deps") or [],
                    decorators=sym_data.get("decorators") or [],
                    truncated=sym_data.get("truncated", False),
                )
            )

    self.stats["files_unchanged"] = len(unchanged_files)

    # Analyze modified and added files.
    # TOCTOU mitigation: files may have changed or been deleted between
    # the hash check and analysis, so each one is re-checked first.
    for rel_path in modified_files + added_files:
        file_path = self.root_path / rel_path
        try:
            # Check file still exists and is a regular file (not symlink)
            if not file_path.is_file() or file_path.is_symlink():
                print(
                    f" Skipping {rel_path}: file no longer exists or is symlink",
                    file=sys.stderr,
                )
                self.stats["errors"] += 1
                continue

            self.symbols.extend(self.analyze_file(file_path))
            self.stats["files_processed"] += 1
        except OSError as e:
            # File became inaccessible between hash check and analysis
            print(f" Skipping {rel_path}: {e}", file=sys.stderr)
            self.stats["errors"] += 1
            continue

    self.stats["files_added"] = len(added_files)
    self.stats["files_modified"] = len(modified_files)
    self.stats["files_deleted"] = len(deleted_files)
    self.stats["symbols_found"] = len(self.symbols)

    return self.generate_map()
|
|
1147
|
+
|
|
1148
|
+
def generate_map(self) -> dict[str, Any]:
    """Assemble the code map from the collected hashes and symbols.

    Every file with a recorded hash gets an entry (even with no symbols);
    each symbol is appended to its file's entry and to a case-insensitive
    name index. At most the first ten dependencies of a symbol are stored.

    Returns:
        Dict with version, root, timestamp, stats, files map, and symbol index.
    """
    # Seed with every analyzed file so symbol-less files are still listed.
    files_map: dict[str, dict[str, Any]] = {
        rel_path: {"hash": digest, "symbols": []}
        for rel_path, digest in self.file_hashes.items()
    }
    symbol_index: dict[str, list[dict[str, Any]]] = {}

    for sym in self.symbols:
        entry = files_map.setdefault(
            sym.file_path,
            {"hash": self.file_hashes.get(sym.file_path, ""), "symbols": []},
        )

        record: dict[str, Any] = {
            "name": sym.name,
            "type": sym.type,
            "lines": [sym.line_start, sym.line_end],
            "signature": sym.signature,
            "docstring": sym.docstring,
            "parent": sym.parent,
            "deps": sym.dependencies[:10] if sym.dependencies else None,
            "decorators": sym.decorators if sym.decorators else None,
        }
        # Only include truncated flag when True (keeps output compact)
        if sym.truncated:
            record["truncated"] = True
        entry["symbols"].append(record)

        # Index by lower-cased name so lookups are case-insensitive.
        symbol_index.setdefault(sym.name.lower(), []).append(
            {
                "file": sym.file_path,
                "type": sym.type,
                "lines": [sym.line_start, sym.line_end],
                "parent": sym.parent,
            }
        )

    return {
        "version": "1.0",
        "root": str(self.root_path),
        "generated_at": datetime.now().isoformat(),
        "stats": self.stats,
        "files": files_map,
        "index": symbol_index,
    }
|
|
1206
|
+
|
|
1207
|
+
|
|
1208
|
+
def add_map_arguments(parser: argparse.ArgumentParser) -> None:
    """Register the map command's options on ``parser``.

    Adds the positional ``path``, the ``-o/--output`` and ``-i/--ignore``
    options, and the boolean switches ``--incremental``, ``--git-only``,
    ``--use-gitignore``, ``--compact`` and ``--no-color``.

    Args:
        parser: The argument parser to add arguments to.
    """
    parser.add_argument("path", help="Path to the codebase root directory")
    parser.add_argument(
        "-o",
        "--output",
        default=".codegraph.json",
        help="Output file path (default: .codegraph.json)",
    )
    parser.add_argument("-i", "--ignore", nargs="*", help="Additional patterns to ignore")

    # All remaining options are plain on/off switches; registration order
    # is kept because it determines the order shown in --help.
    switches = (
        ("--incremental", "Only update changed files (requires existing map)"),
        ("--git-only", "Only scan files tracked by git"),
        ("--use-gitignore", "Also ignore patterns from .gitignore"),
        ("--compact", "Output compact JSON (default: pretty-printed)"),
        ("--no-color", "Disable colored output"),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)
|
|
1241
|
+
|
|
1242
|
+
|
|
1243
|
+
def run_map(args: argparse.Namespace) -> None:
    """Run the map command for already-parsed arguments.

    Builds a ``CodeNavigator`` for ``args.path``, performs a full or
    incremental scan, writes the resulting map as JSON to the output path
    (relative paths are resolved against ``args.path``), prints a
    human-readable report to stderr and a JSON summary to stdout.

    Args:
        args: Parsed command-line arguments.
    """

    def report(message: str) -> None:
        # Human-readable progress goes to stderr; stdout carries only JSON.
        print(message, file=sys.stderr)

    patterns = DEFAULT_IGNORE_PATTERNS.copy()
    if args.ignore:
        patterns.extend(args.ignore)

    # getattr keeps this usable with namespaces that lack the newer flags.
    mapper = CodeNavigator(
        args.path,
        patterns,
        git_only=getattr(args, "git_only", False),
        use_gitignore=getattr(args, "use_gitignore", False),
    )

    output_path = args.output
    if not os.path.isabs(output_path):
        output_path = os.path.join(args.path, output_path)

    # Use incremental scan if requested and existing map exists
    wants_incremental = getattr(args, "incremental", False)
    if wants_incremental and os.path.exists(output_path):
        code_map = mapper.scan_incremental(output_path)
    else:
        if wants_incremental:
            report(f"No existing map at {output_path}, performing full scan")
        code_map = mapper.scan()

    with open(output_path, "w", encoding="utf-8") as f:
        if args.compact:
            json.dump(code_map, f, separators=(",", ":"))
        else:
            json.dump(code_map, f, indent=2)

    c = get_colors(no_color=args.no_color)
    stats = code_map["stats"]

    # The presence of 'files_unchanged' marks an incremental scan.
    if "files_unchanged" in stats:
        report(f"\n{c.success('✓')} Code map updated: {c.cyan(output_path)}")
        report(f" Unchanged: {c.dim(str(stats['files_unchanged']))}")
        report(f" Modified: {c.yellow(str(stats['files_modified']))}")
        report(f" Added: {c.green(str(stats['files_added']))}")
        report(f" Deleted: {c.magenta(str(stats['files_deleted']))}")
        report(f" Total symbols: {c.green(str(stats['symbols_found']))}")
    else:
        report(f"\n{c.success('✓')} Code map generated: {c.cyan(output_path)}")
        report(f" Files processed: {c.green(str(stats['files_processed']))}")
        report(f" Symbols found: {c.green(str(stats['symbols_found']))}")

    summary = {"output": output_path, "stats": stats}
    if args.compact:
        print(json.dumps(summary, separators=(",", ":")))
    else:
        print(json.dumps(summary, indent=2))
|
|
1305
|
+
|
|
1306
|
+
|
|
1307
|
+
def main():
    """Entry point for running the code mapper from the command line.

    Builds the argument parser (map options plus ``-v/--version``), parses
    ``sys.argv`` and hands the result to ``run_map``.

    Usage:
        codegraph-nav scan /path/to/project [-o OUTPUT] [-i IGNORE...] [--compact]

    Example:
        $ codegraph-nav scan /my/project -o .codegraph.json
    """
    cli = argparse.ArgumentParser(
        description="Generate a code map for token-efficient navigation",
        epilog="Example: codegraph-nav scan /my/project -o .codegraph.json",
    )
    add_map_arguments(cli)
    cli.add_argument("-v", "--version", action="version", version=f"%(prog)s {__version__}")

    run_map(cli.parse_args())


# Only run the CLI when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|