codebase-retrieval-context-engine 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
- corbell/__init__.py +6 -0
- corbell/cli/__init__.py +1 -0
- corbell/cli/commands/__init__.py +1 -0
- corbell/cli/commands/index.py +86 -0
- corbell/cli/commands/query.py +71 -0
- corbell/cli/main.py +57 -0
- corbell/core/__init__.py +1 -0
- corbell/core/constants.py +52 -0
- corbell/core/embeddings/__init__.py +6 -0
- corbell/core/embeddings/base.py +68 -0
- corbell/core/embeddings/extractor.py +201 -0
- corbell/core/embeddings/factory.py +48 -0
- corbell/core/embeddings/model.py +401 -0
- corbell/core/embeddings/search_cache.py +95 -0
- corbell/core/embeddings/sqlite_store.py +271 -0
- corbell/core/gitignore.py +76 -0
- corbell/core/graph/__init__.py +1 -0
- corbell/core/graph/builder.py +696 -0
- corbell/core/graph/method_graph.py +1077 -0
- corbell/core/graph/providers/__init__.py +6 -0
- corbell/core/graph/providers/aws_patterns.py +62 -0
- corbell/core/graph/providers/azure_patterns.py +64 -0
- corbell/core/graph/providers/gcp_patterns.py +59 -0
- corbell/core/graph/schema.py +175 -0
- corbell/core/graph/sqlite_store.py +500 -0
- corbell/core/indexing/__init__.py +1 -0
- corbell/core/indexing/builder.py +608 -0
- corbell/core/indexing/lock.py +150 -0
- corbell/core/indexing/tracker.py +245 -0
- corbell/core/llm_client.py +677 -0
- corbell/core/mcp/__init__.py +1 -0
- corbell/core/mcp/server.py +214 -0
- corbell/core/query/__init__.py +1 -0
- corbell/core/query/diagnostics.py +38 -0
- corbell/core/query/engine.py +321 -0
- corbell/core/query/enhancer.py +102 -0
- corbell/core/query/formatter.py +98 -0
- corbell/core/query/graph_expander.py +284 -0
- corbell/core/query/merger.py +171 -0
- corbell/core/query/reranker.py +131 -0
- corbell/core/workspace.py +408 -0
|
@@ -0,0 +1,1077 @@
|
|
|
1
|
+
"""Method-call AST graph builder.
|
|
2
|
+
|
|
3
|
+
Builds method-level call graphs from source code using tree-sitter for accurate
|
|
4
|
+
multi-language parsing. Falls back to Python ``ast`` for Python files when
|
|
5
|
+
tree-sitter is unavailable, and to lightweight regex for other languages.
|
|
6
|
+
|
|
7
|
+
Supported languages (via tree-sitter):
|
|
8
|
+
    Python, JavaScript, TypeScript, TSX, JSX, Go, Java, C#, Rust, Ruby, PHP
|
|
9
|
+
|
|
10
|
+
Install tree-sitter grammars:
|
|
11
|
+
pip install "corbell[treesitter]"
|
|
12
|
+
# or individually:
|
|
13
|
+
pip install tree-sitter tree-sitter-python tree-sitter-javascript \\
|
|
14
|
+
tree-sitter-typescript tree-sitter-go tree-sitter-java
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import ast
|
|
20
|
+
import re
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
24
|
+
|
|
25
|
+
from corbell.core.graph.schema import DependencyEdge, GraphStore, MethodNode
|
|
26
|
+
from corbell.core.gitignore import load_gitignore
|
|
27
|
+
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# Tree-sitter setup (optional dependency)
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
try:
|
|
33
|
+
import tree_sitter # noqa: F401
|
|
34
|
+
from tree_sitter import Language, Parser as TSParser
|
|
35
|
+
_TS_AVAILABLE = True
|
|
36
|
+
except ImportError:
|
|
37
|
+
_TS_AVAILABLE = False
|
|
38
|
+
|
|
39
|
+
# Mapping: our language name -> tree-sitter grammar module name
# (the grammar factory attribute is picked inside _get_ts_parser)
|
|
40
|
+
_TS_MODULES: Dict[str, str] = {
|
|
41
|
+
"python": "tree_sitter_python",
|
|
42
|
+
"javascript": "tree_sitter_javascript",
|
|
43
|
+
"typescript": "tree_sitter_typescript",
|
|
44
|
+
"tsx": "tree_sitter_typescript", # same package, different grammar fn
|
|
45
|
+
"go": "tree_sitter_go",
|
|
46
|
+
"java": "tree_sitter_java",
|
|
47
|
+
"csharp": "tree_sitter_c_sharp",
|
|
48
|
+
"rust": "tree_sitter_rust",
|
|
49
|
+
"ruby": "tree_sitter_ruby",
|
|
50
|
+
"php": "tree_sitter_php",
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
# Which AST node types to treat as function/method definitions per language
|
|
54
|
+
_TS_TARGET_NODES: Dict[str, Set[str]] = {
|
|
55
|
+
"python": {
|
|
56
|
+
"function_definition",
|
|
57
|
+
"async_function_definition",
|
|
58
|
+
},
|
|
59
|
+
"javascript": {
|
|
60
|
+
"function_declaration",
|
|
61
|
+
"function_expression",
|
|
62
|
+
"generator_function_declaration",
|
|
63
|
+
"arrow_function",
|
|
64
|
+
"method_definition",
|
|
65
|
+
},
|
|
66
|
+
"typescript": {
|
|
67
|
+
"function_declaration",
|
|
68
|
+
"function_expression",
|
|
69
|
+
"generator_function_declaration",
|
|
70
|
+
"arrow_function",
|
|
71
|
+
"method_definition",
|
|
72
|
+
"ambient_declaration", # declare function ...
|
|
73
|
+
},
|
|
74
|
+
"tsx": {
|
|
75
|
+
"function_declaration",
|
|
76
|
+
"function_expression",
|
|
77
|
+
"generator_function_declaration",
|
|
78
|
+
"arrow_function",
|
|
79
|
+
"method_definition",
|
|
80
|
+
},
|
|
81
|
+
"go": {
|
|
82
|
+
"function_declaration",
|
|
83
|
+
"method_declaration",
|
|
84
|
+
},
|
|
85
|
+
"java": {
|
|
86
|
+
"method_declaration",
|
|
87
|
+
"constructor_declaration",
|
|
88
|
+
},
|
|
89
|
+
"csharp": {
|
|
90
|
+
"method_declaration",
|
|
91
|
+
"constructor_declaration",
|
|
92
|
+
"local_function_statement",
|
|
93
|
+
},
|
|
94
|
+
"rust": {
|
|
95
|
+
"function_item",
|
|
96
|
+
},
|
|
97
|
+
"ruby": {
|
|
98
|
+
"method",
|
|
99
|
+
"singleton_method",
|
|
100
|
+
},
|
|
101
|
+
"php": {
|
|
102
|
+
"function_definition",
|
|
103
|
+
"method_declaration",
|
|
104
|
+
},
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
# Child field names that hold the identifier for each language's function node
|
|
108
|
+
_TS_NAME_FIELDS: Dict[str, List[str]] = {
|
|
109
|
+
"python": ["name"],
|
|
110
|
+
"javascript": ["name"],
|
|
111
|
+
"typescript": ["name"],
|
|
112
|
+
"go": ["name"],
|
|
113
|
+
"java": ["name"],
|
|
114
|
+
"csharp": ["name"],
|
|
115
|
+
"rust": ["name"],
|
|
116
|
+
"ruby": ["name"],
|
|
117
|
+
"php": ["name"],
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
_SKIP_DIRS = {
|
|
121
|
+
".git", "__pycache__", "node_modules", "venv", "env", ".venv", "tests", "__tests__",
|
|
122
|
+
".pytest_cache", "dist", "build", "coverage", ".next", ".nuxt",
|
|
123
|
+
".svelte-kit", ".cache", "out", "__tests__", ".turbo", ".vercel",
|
|
124
|
+
"storybook-static", ".storybook",
|
|
125
|
+
}
|
|
126
|
+
_EXT_LANG = {
|
|
127
|
+
".py": "python",
|
|
128
|
+
".js": "javascript",
|
|
129
|
+
".ts": "typescript",
|
|
130
|
+
".tsx": "tsx", # tsx uses a separate tree-sitter grammar (language_tsx)
|
|
131
|
+
".jsx": "javascript",
|
|
132
|
+
".go": "go",
|
|
133
|
+
".java": "java",
|
|
134
|
+
".cs": "csharp",
|
|
135
|
+
".rs": "rust",
|
|
136
|
+
".rb": "ruby",
|
|
137
|
+
".php": "php",
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
# ---------------------------------------------------------------------------
|
|
141
|
+
# Call site node types per language (for extracting function calls)
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
_TS_CALL_SITE_NODES: Dict[str, Set[str]] = {
|
|
145
|
+
"python": {"call"},
|
|
146
|
+
"javascript": {"call_expression", "new_expression"},
|
|
147
|
+
"typescript": {"call_expression", "new_expression"},
|
|
148
|
+
"tsx": {"call_expression", "new_expression"},
|
|
149
|
+
"go": {"call_expression"},
|
|
150
|
+
"java": {"method_invocation", "object_creation_expression"},
|
|
151
|
+
"csharp": {"invocation_expression", "object_creation_expression"},
|
|
152
|
+
"rust": {"call_expression", "macro_invocation"},
|
|
153
|
+
"ruby": {"call"},
|
|
154
|
+
"php": {"function_call_expression", "member_call_expression", "scoped_call_expression", "object_creation_expression"},
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
# ---------------------------------------------------------------------------
|
|
158
|
+
# Builtin blocklist — filter high-noise language builtins from call graph
|
|
159
|
+
# ---------------------------------------------------------------------------
|
|
160
|
+
|
|
161
|
+
_BUILTIN_BLOCKLIST: Dict[str, Set[str]] = {
|
|
162
|
+
"python": {
|
|
163
|
+
"print", "len", "range", "enumerate", "zip", "map", "filter",
|
|
164
|
+
"sorted", "reversed", "list", "dict", "set", "tuple", "str",
|
|
165
|
+
"int", "float", "bool", "bytes", "type", "isinstance", "issubclass",
|
|
166
|
+
"hasattr", "getattr", "setattr", "delattr", "super", "object",
|
|
167
|
+
"open", "repr", "hash", "id", "hex", "oct", "bin", "abs", "round",
|
|
168
|
+
"min", "max", "sum", "all", "any", "next", "iter", "vars",
|
|
169
|
+
"format", "input", "exec", "eval", "compile", "globals", "locals",
|
|
170
|
+
"staticmethod", "classmethod", "property", "append", "extend",
|
|
171
|
+
"items", "keys", "values", "get", "update", "pop", "copy", "join",
|
|
172
|
+
"split", "strip", "replace", "startswith", "endswith", "decode",
|
|
173
|
+
"encode", "lower", "upper", "format_map",
|
|
174
|
+
},
|
|
175
|
+
"javascript": {
|
|
176
|
+
"console", "log", "error", "warn", "info", "debug", "assert",
|
|
177
|
+
"setTimeout", "setInterval", "clearTimeout", "clearInterval",
|
|
178
|
+
"setImmediate", "clearImmediate", "queueMicrotask",
|
|
179
|
+
"Promise", "resolve", "reject", "then", "catch", "finally", "all",
|
|
180
|
+
"fetch", "JSON", "parse", "stringify", "Math", "Date", "Array",
|
|
181
|
+
"Object", "String", "Number", "Boolean", "Symbol", "BigInt",
|
|
182
|
+
"parseInt", "parseFloat", "isNaN", "isFinite", "encodeURIComponent",
|
|
183
|
+
"decodeURIComponent", "encodeURI", "decodeURI", "require",
|
|
184
|
+
"map", "filter", "reduce", "forEach", "find", "findIndex",
|
|
185
|
+
"push", "pop", "shift", "unshift", "splice", "slice", "join",
|
|
186
|
+
"toString", "valueOf", "hasOwnProperty", "includes", "indexOf",
|
|
187
|
+
"addEventListener", "removeEventListener", "emit", "on", "off",
|
|
188
|
+
"next", "return", "throw", "keys", "values", "entries", "assign",
|
|
189
|
+
"useState", "useEffect", "useContext", "useRef", "useMemo",
|
|
190
|
+
"useCallback", "useReducer", "useLayoutEffect", "createContext",
|
|
191
|
+
"createElement", "render", "it", "describe", "expect", "test",
|
|
192
|
+
"beforeEach", "afterEach", "beforeAll", "afterAll", "jest",
|
|
193
|
+
},
|
|
194
|
+
"go": {
|
|
195
|
+
"make", "len", "cap", "append", "copy", "delete", "close",
|
|
196
|
+
"panic", "recover", "print", "println", "new", "real", "imag",
|
|
197
|
+
"Errorf", "Sprintf", "Printf", "Println", "Fprintf", "Scanf",
|
|
198
|
+
"Error", "String", "Format", "Marshal", "Unmarshal",
|
|
199
|
+
"Fatal", "Fatalf", "Log", "Logf",
|
|
200
|
+
},
|
|
201
|
+
"java": {
|
|
202
|
+
"println", "print", "printf", "format", "toString", "hashCode",
|
|
203
|
+
"equals", "compareTo", "length", "size", "isEmpty", "contains",
|
|
204
|
+
"add", "get", "put", "remove", "clear", "iterator", "next",
|
|
205
|
+
"append", "insert", "delete", "substring", "charAt", "indexOf",
|
|
206
|
+
"parseInt", "parseLong", "parseDouble", "parseFloat",
|
|
207
|
+
"valueOf", "of", "ofNullable", "orElse", "isPresent", "get",
|
|
208
|
+
"stream", "collect", "toList", "toMap", "filter", "map",
|
|
209
|
+
"forEach", "anyMatch", "allMatch", "findFirst",
|
|
210
|
+
},
|
|
211
|
+
"csharp": {
|
|
212
|
+
"WriteLine", "Write", "ToString", "Equals", "GetHashCode", "GetType",
|
|
213
|
+
"ReferenceEquals", "Parse", "TryParse", "Format", "Join", "Concat",
|
|
214
|
+
"IsNullOrEmpty", "IsNullOrWhiteSpace", "Select", "Where", "ToList",
|
|
215
|
+
"ToArray", "FirstOrDefault", "Any", "All", "Count", "Max", "Min",
|
|
216
|
+
"Sum", "Add", "Remove", "Clear", "Contains", "IndexOf", "Substring",
|
|
217
|
+
},
|
|
218
|
+
"rust": {
|
|
219
|
+
"println", "print", "format", "panic", "unwrap", "expect",
|
|
220
|
+
"clone", "to_string", "into", "from", "as_ref", "as_mut",
|
|
221
|
+
"len", "is_empty", "push", "pop", "insert", "remove", "clear",
|
|
222
|
+
"iter", "iter_mut", "into_iter", "map", "filter", "collect",
|
|
223
|
+
"any", "all", "find", "Ok", "Err", "Some", "None",
|
|
224
|
+
},
|
|
225
|
+
"ruby": {
|
|
226
|
+
"puts", "print", "p", "printf", "sprintf", "raise", "fail",
|
|
227
|
+
"require", "require_relative", "include", "extend", "prepend",
|
|
228
|
+
"to_s", "to_i", "to_f", "to_a", "to_h", "to_sym", "class",
|
|
229
|
+
"is_a?", "kind_of?", "instance_of?", "respond_to?", "nil?",
|
|
230
|
+
"empty?", "length", "size", "push", "pop", "shift", "unshift",
|
|
231
|
+
"map", "select", "reject", "reduce", "inject", "each", "find",
|
|
232
|
+
},
|
|
233
|
+
"php": {
|
|
234
|
+
"echo", "print", "print_r", "var_dump", "var_export", "printf",
|
|
235
|
+
"sprintf", "die", "exit", "isset", "empty", "unset", "count",
|
|
236
|
+
"sizeof", "array_push", "array_pop", "array_shift", "array_unshift",
|
|
237
|
+
"array_map", "array_filter", "array_reduce", "array_keys", "array_values",
|
|
238
|
+
"in_array", "explode", "implode", "str_replace", "substr", "strlen",
|
|
239
|
+
"strpos", "strtolower", "strtoupper", "trim", "json_encode", "json_decode",
|
|
240
|
+
"Exception", "RuntimeException", "InvalidArgumentException",
|
|
241
|
+
},
|
|
242
|
+
}
|
|
243
|
+
# Add typescript as alias of javascript builtins
|
|
244
|
+
_BUILTIN_BLOCKLIST["typescript"] = _BUILTIN_BLOCKLIST["javascript"]
|
|
245
|
+
_BUILTIN_BLOCKLIST["tsx"] = _BUILTIN_BLOCKLIST["javascript"]
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# ---------------------------------------------------------------------------
|
|
249
|
+
# Parser cache
|
|
250
|
+
# ---------------------------------------------------------------------------
|
|
251
|
+
|
|
252
|
+
_parser_cache: Dict[str, Any] = {} # lang -> TSParser | None
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def _get_ts_parser(lang: str) -> Optional[Any]:
    """Look up, or lazily create, the tree-sitter parser for *lang*.

    Outcomes are memoised in ``_parser_cache`` (failures are stored as
    ``None``), so each grammar import is attempted at most once per process.

    Args:
        lang: Internal language key such as ``"python"`` or ``"tsx"``.

    Returns:
        A parser instance, or ``None`` when tree-sitter is not importable, no
        grammar module is registered for *lang*, or loading the grammar failed.
    """
    if not _TS_AVAILABLE:
        return None
    try:
        return _parser_cache[lang]
    except KeyError:
        pass

    grammar_module = _TS_MODULES.get(lang)
    built = None
    if grammar_module:
        try:
            ts_mod = __import__(grammar_module)
            # Some grammar packages expose a dedicated factory instead of (or
            # next to) the generic ``language()``:
            #   language_typescript() / language_tsx() / language_php()
            dedicated = {
                "tsx": "language_tsx",
                "typescript": "language_typescript",
                "php": "language_php",
            }.get(lang)
            if dedicated is not None and hasattr(ts_mod, dedicated):
                factory = getattr(ts_mod, dedicated)
            elif hasattr(ts_mod, "language"):
                factory = ts_mod.language
            else:
                raise AttributeError(f"No language() callable in {grammar_module}")
            built = TSParser(Language(factory()))
        except Exception:
            # Best effort: any import/ABI problem simply disables tree-sitter
            # for this language and the caller falls back to another analyzer.
            built = None

    _parser_cache[lang] = built
    return built
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# ---------------------------------------------------------------------------
|
|
290
|
+
# Main builder
|
|
291
|
+
# ---------------------------------------------------------------------------
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
class MethodGraphBuilder:
|
|
295
|
+
"""Extract method nodes and call edges, store in GraphStore."""
|
|
296
|
+
|
|
297
|
+
def __init__(self, graph_store: GraphStore) -> None:
    """Bind the builder to the store it writes into.

    Args:
        graph_store: Destination for the method nodes and call edges this
            builder produces (via ``upsert_node`` / ``upsert_edge``).
    """
    self.store = graph_store
|
|
299
|
+
|
|
300
|
+
def build_for_service(self, service_id: str, repo_path: Path) -> Dict[str, Any]:
    """Index every supported source file under *repo_path* into the store.

    Each candidate file is handed to ``_analyze_file``. The methods found are
    persisted as ``MethodNode`` objects and the calls resolved by
    ``_build_call_graph`` as ``DependencyEdge`` objects of kind
    ``"method_call"``.

    Args:
        service_id: Identifier for the owning service.
        repo_path: Root directory of the repository to scan.

    Returns:
        Summary dict with ``methods``, ``calls``, ``files_scanned`` and
        ``ts_available``.
    """
    root = Path(repo_path)
    ignore_spec = load_gitignore(root)

    methods_by_id: Dict[str, Dict] = {}
    call_records: List[Dict] = []
    scanned = 0

    for path in root.rglob("*"):
        if not path.is_file():
            continue
        relative = path.relative_to(repo_path)
        # Skip when ANY component of the relative path is a skip name. Whole
        # components are compared, so a name like 'corbel' never matches by
        # substring.
        if not _SKIP_DIRS.isdisjoint(relative.parts):
            continue
        # The gitignore matcher is given forward-slash paths.
        if ignore_spec.match_file(str(relative).replace("\\", "/")):
            continue
        language = _EXT_LANG.get(path.suffix)
        if not language:
            continue

        scanned += 1
        analysis = self._analyze_file(path, service_id, language)
        # Later files overwrite earlier entries that share the same method id.
        methods_by_id.update((m["id"], m) for m in analysis["methods"])
        call_records += analysis["calls"]

    # Persist method nodes
    for method_id, info in methods_by_id.items():
        start_line = info.get("line_number", 0)
        self.store.upsert_node(
            MethodNode(
                id=method_id,
                repo=str(repo_path),
                file_path=info["file_path"],
                class_name=info.get("class_name"),
                method_name=info["name"],
                signature=info.get("signature", info["name"]),
                docstring=info.get("docstring"),
                line_start=start_line,
                line_end=info.get("line_end", start_line),
                service_id=service_id,
                typed_signature=info.get("typed_signature"),
            )
        )

    # Resolve and persist call edges
    edges = self._build_call_graph(methods_by_id, call_records)
    for caller_id, callee_id, meta in edges:
        edge = DependencyEdge(
            source_id=caller_id,
            target_id=callee_id,
            kind="method_call",
            metadata=meta,
        )
        self.store.upsert_edge(edge)

    summary: Dict[str, Any] = {
        "methods": len(methods_by_id),
        "calls": len(edges),
        "files_scanned": scanned,
        "ts_available": _TS_AVAILABLE,
    }
    return summary
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
# ------------------------------------------------------------------ #
|
|
377
|
+
# Dispatch #
|
|
378
|
+
# ------------------------------------------------------------------ #
|
|
379
|
+
|
|
380
|
+
def _analyze_file(self, fp: Path, service_id: str, lang: str) -> Dict:
|
|
381
|
+
try:
|
|
382
|
+
content = fp.read_text(encoding="utf-8", errors="ignore")
|
|
383
|
+
except Exception:
|
|
384
|
+
return {"methods": [], "calls": []}
|
|
385
|
+
|
|
386
|
+
# 1. Try tree-sitter
|
|
387
|
+
parser = _get_ts_parser(lang)
|
|
388
|
+
if parser is not None:
|
|
389
|
+
return self._analyze_with_tree_sitter(fp, content, service_id, lang, parser)
|
|
390
|
+
|
|
391
|
+
# 2. Python-specific fallback: stdlib ast (accurate)
|
|
392
|
+
if lang == "python":
|
|
393
|
+
return self._analyze_python_ast(fp, content, service_id)
|
|
394
|
+
|
|
395
|
+
# 3. Last resort: regex (JS/TS/Go/Java when tree-sitter is absent)
|
|
396
|
+
return self._analyze_regex_fallback(fp, content, service_id, lang)
|
|
397
|
+
|
|
398
|
+
def _make_method_id(self, service_id: str, fp: Path, full_name: str) -> str:
|
|
399
|
+
return f"{service_id}::{fp.name}::{full_name}"
|
|
400
|
+
|
|
401
|
+
# ------------------------------------------------------------------ #
|
|
402
|
+
# Tree-sitter analyzer (all languages) #
|
|
403
|
+
# ------------------------------------------------------------------ #
|
|
404
|
+
|
|
405
|
+
def _analyze_with_tree_sitter(
|
|
406
|
+
self,
|
|
407
|
+
fp: Path,
|
|
408
|
+
content: str,
|
|
409
|
+
service_id: str,
|
|
410
|
+
lang: str,
|
|
411
|
+
parser: Any,
|
|
412
|
+
) -> Dict:
|
|
413
|
+
"""Parse *content* with tree-sitter and extract method nodes + call sites."""
|
|
414
|
+
methods: List[Dict] = []
|
|
415
|
+
calls: List[Dict] = []
|
|
416
|
+
|
|
417
|
+
try:
|
|
418
|
+
tree = parser.parse(bytes(content, "utf-8"))
|
|
419
|
+
except Exception:
|
|
420
|
+
return {"methods": [], "calls": []}
|
|
421
|
+
|
|
422
|
+
target_node_types = _TS_TARGET_NODES.get(lang, set())
|
|
423
|
+
call_site_types = _TS_CALL_SITE_NODES.get(lang, set())
|
|
424
|
+
builtins = _BUILTIN_BLOCKLIST.get(lang, set())
|
|
425
|
+
def _node_name(node) -> Optional[str]:
|
|
426
|
+
"""Extract the identifier name from a function/method node."""
|
|
427
|
+
# 1. Try matching identifier child that is exactly the "name" field
|
|
428
|
+
for child in node.children:
|
|
429
|
+
if child.type == "identifier" and child == node.child_by_field_name("name"):
|
|
430
|
+
return child.text.decode("utf-8", errors="ignore")
|
|
431
|
+
# 2. Try via the "name" field directly (PHP uses node type "name")
|
|
432
|
+
name_field = node.child_by_field_name("name")
|
|
433
|
+
if name_field is not None:
|
|
434
|
+
return name_field.text.decode("utf-8", errors="ignore")
|
|
435
|
+
# 3. Fall back to first identifier child
|
|
436
|
+
for child in node.children:
|
|
437
|
+
if child.type == "identifier":
|
|
438
|
+
return child.text.decode("utf-8", errors="ignore")
|
|
439
|
+
return None
|
|
440
|
+
|
|
441
|
+
def _receiver_or_class(node) -> Optional[str]:
|
|
442
|
+
"""For Go method_declaration, extract the receiver type name."""
|
|
443
|
+
recv = node.child_by_field_name("receiver")
|
|
444
|
+
if recv:
|
|
445
|
+
for sub in recv.children:
|
|
446
|
+
if sub.type in ("type_identifier", "pointer_type", "qualified_type"):
|
|
447
|
+
return sub.text.decode("utf-8", errors="ignore").lstrip("*")
|
|
448
|
+
return None
|
|
449
|
+
|
|
450
|
+
def _extract_callee_name(node) -> Optional[str]:
|
|
451
|
+
"""Extract the called function/method name from a call site node."""
|
|
452
|
+
if lang == "python":
|
|
453
|
+
func = node.child_by_field_name("function")
|
|
454
|
+
if func is None:
|
|
455
|
+
return None
|
|
456
|
+
if func.type == "identifier":
|
|
457
|
+
return func.text.decode("utf-8", errors="ignore")
|
|
458
|
+
if func.type == "attribute":
|
|
459
|
+
attr = func.child_by_field_name("attribute")
|
|
460
|
+
if attr:
|
|
461
|
+
return attr.text.decode("utf-8", errors="ignore")
|
|
462
|
+
elif lang in ("javascript", "typescript", "tsx"):
|
|
463
|
+
if node.type == "new_expression":
|
|
464
|
+
# new MyClass(...) — get the constructor name
|
|
465
|
+
ctor = node.child_by_field_name("constructor")
|
|
466
|
+
if ctor and ctor.type == "identifier":
|
|
467
|
+
return ctor.text.decode("utf-8", errors="ignore")
|
|
468
|
+
return None
|
|
469
|
+
func = node.child_by_field_name("function")
|
|
470
|
+
if func is None:
|
|
471
|
+
return None
|
|
472
|
+
if func.type == "identifier":
|
|
473
|
+
return func.text.decode("utf-8", errors="ignore")
|
|
474
|
+
if func.type in ("member_expression", "subscript_expression"):
|
|
475
|
+
prop = func.child_by_field_name("property")
|
|
476
|
+
if prop:
|
|
477
|
+
return prop.text.decode("utf-8", errors="ignore")
|
|
478
|
+
elif lang == "go":
|
|
479
|
+
func = node.child_by_field_name("function")
|
|
480
|
+
if func is None:
|
|
481
|
+
return None
|
|
482
|
+
if func.type == "identifier":
|
|
483
|
+
return func.text.decode("utf-8", errors="ignore")
|
|
484
|
+
if func.type == "selector_expression":
|
|
485
|
+
field = func.child_by_field_name("field")
|
|
486
|
+
if field:
|
|
487
|
+
return field.text.decode("utf-8", errors="ignore")
|
|
488
|
+
elif lang == "java":
|
|
489
|
+
if node.type == "object_creation_expression":
|
|
490
|
+
type_node = node.child_by_field_name("type")
|
|
491
|
+
if type_node:
|
|
492
|
+
return type_node.text.decode("utf-8", errors="ignore")
|
|
493
|
+
return None
|
|
494
|
+
name = node.child_by_field_name("name")
|
|
495
|
+
if name:
|
|
496
|
+
return name.text.decode("utf-8", errors="ignore")
|
|
497
|
+
elif lang == "csharp":
|
|
498
|
+
if node.type == "object_creation_expression":
|
|
499
|
+
t = node.child_by_field_name("type")
|
|
500
|
+
if t:
|
|
501
|
+
return t.text.decode("utf-8", errors="ignore")
|
|
502
|
+
return None
|
|
503
|
+
func = node.child_by_field_name("function")
|
|
504
|
+
if func is None:
|
|
505
|
+
return None
|
|
506
|
+
if func.type == "identifier":
|
|
507
|
+
return func.text.decode("utf-8", errors="ignore")
|
|
508
|
+
if func.type == "member_access_expression":
|
|
509
|
+
name = func.child_by_field_name("name")
|
|
510
|
+
if name:
|
|
511
|
+
return name.text.decode("utf-8", errors="ignore")
|
|
512
|
+
elif lang == "rust":
|
|
513
|
+
func = node.child_by_field_name("function")
|
|
514
|
+
if func:
|
|
515
|
+
if func.type in ("identifier", "scoped_identifier"):
|
|
516
|
+
return func.text.decode("utf-8", errors="ignore")
|
|
517
|
+
elif func.type == "field_expression":
|
|
518
|
+
field = func.child_by_field_name("field")
|
|
519
|
+
if field:
|
|
520
|
+
return field.text.decode("utf-8", errors="ignore")
|
|
521
|
+
elif node.type == "macro_invocation":
|
|
522
|
+
mac = node.child_by_field_name("macro")
|
|
523
|
+
if mac:
|
|
524
|
+
return mac.text.decode("utf-8", errors="ignore")
|
|
525
|
+
elif lang == "ruby":
|
|
526
|
+
method = node.child_by_field_name("method")
|
|
527
|
+
if method:
|
|
528
|
+
return method.text.decode("utf-8", errors="ignore")
|
|
529
|
+
elif lang == "php":
|
|
530
|
+
if node.type == "object_creation_expression":
|
|
531
|
+
cls = node.child_by_field_name("class")
|
|
532
|
+
if cls:
|
|
533
|
+
return cls.text.decode("utf-8", errors="ignore")
|
|
534
|
+
return None
|
|
535
|
+
name_node = node.child_by_field_name("name")
|
|
536
|
+
if name_node:
|
|
537
|
+
return name_node.text.decode("utf-8", errors="ignore")
|
|
538
|
+
return None
|
|
539
|
+
|
|
540
|
+
def _extract_typed_signature(node) -> str:
    """Build a typed signature string like ``validate(token: str) -> bool``.

    Parameters are read from the node's ``parameters`` field and rendered per
    language (``name: type`` for JS/TS, Python and Rust; ``type name`` for
    Java, C# and PHP; ``names type`` for Go). A return type is appended as
    ``-> type`` when the node exposes one.

    Fixes over the original:
      * Plain JavaScript parameters are bare ``identifier`` /
        ``assignment_pattern`` nodes with no ``pattern``/``name`` field and
        were silently dropped; they are now included.
      * Go parameter types outside the hard-coded node-type list fall back to
        the declaration's ``type`` field, and the Go return type is read from
        the ``result`` field (Go nodes have no ``return_type``).

    Returns:
        ``name(params)`` or ``name(params) -> ret``; the name is ``"?"`` when
        it cannot be determined.
    """

    def _text(n) -> str:
        return n.text.decode("utf-8", "ignore")

    name = _node_name(node) or "?"
    params_node = node.child_by_field_name("parameters")
    param_strs: List[str] = []

    if params_node:
        for param in params_node.named_children:
            if lang in ("javascript", "typescript", "tsx"):
                pattern = (
                    param.child_by_field_name("pattern")
                    or param.child_by_field_name("name")
                )
                if not pattern:
                    # Untyped JS: the parameter node itself is the pattern.
                    if param.type == "identifier":
                        pattern = param
                    elif param.type == "assignment_pattern":
                        pattern = param.child_by_field_name("left")
                type_ann = param.child_by_field_name("type")
                pname = _text(pattern) if pattern else ""
                if type_ann:
                    # The annotation text includes its leading ':'.
                    raw_t = _text(type_ann).strip().lstrip(":").strip()
                    param_strs.append(f"{pname}: {raw_t}" if pname else raw_t)
                elif pname:
                    param_strs.append(pname)

            elif lang == "python":
                if param.type in ("typed_parameter", "typed_default_parameter"):
                    pname = ""
                    ptype = ""
                    for child in param.children:
                        if child.type == "identifier" and not pname:
                            pname = _text(child)
                        elif child.type == "type":
                            ptype = _text(child)
                    param_strs.append(f"{pname}: {ptype}" if ptype else pname)
                elif param.type in (
                    "identifier", "list_splat_pattern", "dictionary_splat_pattern"
                ):
                    param_strs.append(_text(param))
                elif param.type == "default_parameter":
                    n = param.child_by_field_name("name")
                    if n:
                        param_strs.append(_text(n))

            elif lang == "go":
                pnames: List[str] = []
                ptype = ""
                for child in param.children:
                    if child.type == "identifier":
                        pnames.append(_text(child))
                    elif child.type in (
                        "type_identifier", "pointer_type", "qualified_type",
                        "slice_type", "array_type", "map_type", "interface_type",
                    ):
                        ptype = _text(child)
                if not ptype:
                    # Covers func/chan/generic types missing from the list above.
                    declared = param.child_by_field_name("type")
                    if declared:
                        ptype = _text(declared)
                if pnames:
                    param_strs.append(
                        f"{' '.join(pnames)} {ptype}".strip() if ptype else " ".join(pnames)
                    )

            elif lang == "java":
                pname_node = param.child_by_field_name("name")
                ptype_node = param.child_by_field_name("type")
                if pname_node and ptype_node:
                    param_strs.append(f"{_text(ptype_node)} {_text(pname_node)}")

            elif lang == "csharp":
                pname_node = param.child_by_field_name("name")
                ptype_node = param.child_by_field_name("type")
                if pname_node and ptype_node:
                    param_strs.append(f"{_text(ptype_node)} {_text(pname_node)}")
                elif param.type == "parameter":
                    param_strs.append(_text(param))

            elif lang == "rust":
                pat = param.child_by_field_name("pattern")
                typ = param.child_by_field_name("type")
                if pat and typ:
                    param_strs.append(f"{_text(pat)}: {_text(typ)}")
                else:
                    # e.g. `self` / `&mut self`, kept verbatim
                    param_strs.append(_text(param))

            elif lang == "ruby":
                if param.type in ("identifier", "keyword_parameter", "optional_parameter"):
                    param_strs.append(_text(param))

            elif lang == "php":
                pname_node = param.child_by_field_name("name")
                ptype_node = param.child_by_field_name("type")
                pstr = ""
                if ptype_node:
                    pstr += _text(ptype_node) + " "
                if pname_node:
                    pstr += _text(pname_node)
                if pstr:
                    param_strs.append(pstr.strip())

    params_str = ", ".join(param_strs)

    # Return type
    ret_node = node.child_by_field_name("return_type")
    if not ret_node and lang == "go":
        ret_node = node.child_by_field_name("result")
    if ret_node:
        ret_raw = _text(ret_node).strip()
        # lstrip takes a character SET: this removes any leading run of
        # ':', '-' and '>' (the TS ': T' / '-> T' prefixes), then whitespace.
        ret_clean = ret_raw.lstrip(":->").strip().lstrip(">:").strip()
        if ret_clean:
            return f"{name}({params_str}) -> {ret_clean}"
    return f"{name}({params_str})"
|
|
652
|
+
|
|
653
|
+
def traverse(
    node,
    enclosing_class: Optional[str] = None,
    parent=None,
    enclosing_method_id: Optional[str] = None,
) -> None:
    """Walk the syntax tree depth-first, recording method definitions and call sites.

    ``enclosing_class`` is the name of the nearest class-like ancestor,
    ``parent`` is the node whose child loop issued this call, and
    ``enclosing_method_id`` is the id of the nearest recorded method
    ancestor; call sites are only recorded while that id is set.
    Results are appended to the enclosing scope's ``methods`` and ``calls``.
    """
    kind = node.type

    # Class-like containers: descend with the container's name as class context.
    if kind in {"class_declaration", "class_definition",
                "struct_type", "type_declaration",
                "interface_declaration"}:
        ident = node.child_by_field_name("name")
        container = ident.text.decode("utf-8", errors="ignore") if ident else None
        class_ctx = container or enclosing_class
        for sub in node.children:
            traverse(
                sub,
                enclosing_class=class_ctx,
                parent=node,
                enclosing_method_id=enclosing_method_id,
            )
        return

    # Descendants inherit the enclosing method unless this node defines one.
    method_for_children = enclosing_method_id

    if kind in target_node_types:
        fn_name = _node_name(node)

        # Go methods are grouped under their receiver type.
        owner = enclosing_class
        if lang == "go" and kind == "method_declaration":
            owner = _receiver_or_class(node) or owner

        # Unnamed arrow/function expressions borrow the name of the
        # variable declarator they are assigned to.
        unnamed_kinds = {"arrow_function", "function_expression", "generator_function"}
        if (
            fn_name is None
            and kind in unnamed_kinds
            and parent
            and parent.type == "variable_declarator"
        ):
            declared = parent.child_by_field_name("name")
            if declared:
                fn_name = declared.text.decode("utf-8", errors="ignore")

        if fn_name:
            lowered = fn_name.lower()
            if lowered.startswith("test_") or "mock" in lowered:
                # Test/mock methods are skipped together with their subtree.
                return

            qualified = f"{owner}.{fn_name}" if owner else fn_name
            method_id = self._make_method_id(service_id, fp, qualified)

            # Python only: a leading string expression in the body is the docstring.
            doc: Optional[str] = None
            if lang == "python" and node.children:
                body = node.child_by_field_name("body")
                if body and body.children:
                    head = body.children[0]
                    if head.type == "expression_statement":
                        literal = head.children[0] if head.children else None
                        if literal and literal.type == "string":
                            raw_doc = literal.text.decode("utf-8", errors="ignore")
                            doc = raw_doc.strip("\"'")

            typed = _extract_typed_signature(node)

            methods.append({
                "id": method_id,
                "name": fn_name,
                "full_name": qualified,
                "class_name": owner,
                "file_path": str(fp),
                "line_number": node.start_point[0] + 1,  # tree-sitter rows are 0-based
                "line_end": node.end_point[0] + 1,
                "signature": fn_name,  # plain name (backward compat)
                "typed_signature": typed,  # full typed form
                "docstring": doc,
                "service_id": service_id,
            })
            method_for_children = method_id  # children see this node as their enclosing method

    elif call_site_types and kind in call_site_types and enclosing_method_id:
        # Call site inside a recorded method.
        target = _extract_callee_name(node)
        if target and target not in builtins:
            calls.append({
                "caller_id": enclosing_method_id,
                "callee_name": target,
                "line_number": node.start_point[0] + 1,
            })

    for sub in node.children:
        traverse(
            sub,
            enclosing_class=enclosing_class,
            parent=node,
            enclosing_method_id=method_for_children,
        )
|
|
754
|
+
|
|
755
|
+
traverse(tree.root_node)
|
|
756
|
+
return {"methods": methods, "calls": calls}
|
|
757
|
+
|
|
758
|
+
# ------------------------------------------------------------------ #
|
|
759
|
+
# Python ast fallback #
|
|
760
|
+
# ------------------------------------------------------------------ #
|
|
761
|
+
|
|
762
|
+
def _analyze_python_ast(self, fp: Path, content: str, service_id: str) -> Dict:
|
|
763
|
+
"""Use Python's stdlib ast for accurate extraction when tree-sitter is absent."""
|
|
764
|
+
methods: List[Dict] = []
|
|
765
|
+
calls: List[Dict] = []
|
|
766
|
+
|
|
767
|
+
try:
|
|
768
|
+
tree = ast.parse(content, filename=str(fp))
|
|
769
|
+
except SyntaxError:
|
|
770
|
+
return {"methods": [], "calls": []}
|
|
771
|
+
|
|
772
|
+
class _Visitor(ast.NodeVisitor):
|
|
773
|
+
def __init__(self_inner):
|
|
774
|
+
self_inner.current_class: Optional[str] = None
|
|
775
|
+
self_inner.current_method_id: Optional[str] = None
|
|
776
|
+
|
|
777
|
+
def visit_ClassDef(self_inner, node):
|
|
778
|
+
old = self_inner.current_class
|
|
779
|
+
self_inner.current_class = node.name
|
|
780
|
+
self_inner.generic_visit(node)
|
|
781
|
+
self_inner.current_class = old
|
|
782
|
+
|
|
783
|
+
def _visit_func(self_inner, node):
|
|
784
|
+
mname = node.name
|
|
785
|
+
# Skip test and mock methods
|
|
786
|
+
lower_name = mname.lower()
|
|
787
|
+
if lower_name.startswith("test_") or "mock" in lower_name:
|
|
788
|
+
return
|
|
789
|
+
|
|
790
|
+
full = (
|
|
791
|
+
f"{self_inner.current_class}.{mname}"
|
|
792
|
+
if self_inner.current_class else mname
|
|
793
|
+
)
|
|
794
|
+
mid = self._make_method_id(service_id, fp, full)
|
|
795
|
+
|
|
796
|
+
sig_parts = [a.arg for a in node.args.args]
|
|
797
|
+
sig = f"def {mname}({', '.join(sig_parts)})"
|
|
798
|
+
docstring = ast.get_docstring(node)
|
|
799
|
+
|
|
800
|
+
line_end = max(
|
|
801
|
+
(getattr(n, "end_lineno", node.lineno) for n in ast.walk(node)),
|
|
802
|
+
default=node.lineno,
|
|
803
|
+
)
|
|
804
|
+
methods.append({
|
|
805
|
+
"id": mid,
|
|
806
|
+
"name": mname,
|
|
807
|
+
"full_name": full,
|
|
808
|
+
"class_name": self_inner.current_class,
|
|
809
|
+
"file_path": str(fp),
|
|
810
|
+
"line_number": node.lineno,
|
|
811
|
+
"line_end": line_end,
|
|
812
|
+
"is_async": isinstance(node, ast.AsyncFunctionDef),
|
|
813
|
+
"signature": sig,
|
|
814
|
+
"docstring": docstring,
|
|
815
|
+
"service_id": service_id,
|
|
816
|
+
})
|
|
817
|
+
|
|
818
|
+
old_mid = self_inner.current_method_id
|
|
819
|
+
self_inner.current_method_id = mid
|
|
820
|
+
self_inner.generic_visit(node)
|
|
821
|
+
self_inner.current_method_id = old_mid
|
|
822
|
+
|
|
823
|
+
visit_FunctionDef = _visit_func
|
|
824
|
+
visit_AsyncFunctionDef = _visit_func
|
|
825
|
+
|
|
826
|
+
def visit_Call(self_inner, node):
|
|
827
|
+
if not self_inner.current_method_id:
|
|
828
|
+
self_inner.generic_visit(node)
|
|
829
|
+
return
|
|
830
|
+
callee: Optional[str] = None
|
|
831
|
+
if isinstance(node.func, ast.Name):
|
|
832
|
+
callee = node.func.id
|
|
833
|
+
elif isinstance(node.func, ast.Attribute):
|
|
834
|
+
callee = node.func.attr
|
|
835
|
+
if callee:
|
|
836
|
+
calls.append({
|
|
837
|
+
"caller_id": self_inner.current_method_id,
|
|
838
|
+
"callee_name": callee,
|
|
839
|
+
"line_number": node.lineno,
|
|
840
|
+
})
|
|
841
|
+
self_inner.generic_visit(node)
|
|
842
|
+
|
|
843
|
+
_Visitor().visit(tree)
|
|
844
|
+
return {"methods": methods, "calls": calls}
|
|
845
|
+
|
|
846
|
+
# ------------------------------------------------------------------ #
|
|
847
|
+
# Regex fallback (JS/TS/Go/Java when tree-sitter absent) #
|
|
848
|
+
# ------------------------------------------------------------------ #
|
|
849
|
+
|
|
850
|
+
def _analyze_regex_fallback(
|
|
851
|
+
self, fp: Path, content: str, service_id: str, lang: str
|
|
852
|
+
) -> Dict:
|
|
853
|
+
"""Minimal regex extraction used only when tree-sitter grammars are missing."""
|
|
854
|
+
if lang in ("javascript", "typescript", "tsx"):
|
|
855
|
+
return self._regex_js(fp, content, service_id)
|
|
856
|
+
if lang == "go":
|
|
857
|
+
return self._regex_go(fp, content, service_id)
|
|
858
|
+
if lang == "java":
|
|
859
|
+
return self._regex_java(fp, content, service_id)
|
|
860
|
+
if lang == "csharp":
|
|
861
|
+
return self._regex_csharp(fp, content, service_id)
|
|
862
|
+
if lang == "rust":
|
|
863
|
+
return self._regex_rust(fp, content, service_id)
|
|
864
|
+
if lang == "ruby":
|
|
865
|
+
return self._regex_ruby(fp, content, service_id)
|
|
866
|
+
if lang == "php":
|
|
867
|
+
return self._regex_php(fp, content, service_id)
|
|
868
|
+
return {"methods": [], "calls": []}
|
|
869
|
+
|
|
870
|
+
# --- JS/TS regex (used only as last-resort fallback) ---
|
|
871
|
+
|
|
872
|
+
def _regex_js(self, fp: Path, content: str, service_id: str) -> Dict:
|
|
873
|
+
methods: List[Dict] = []
|
|
874
|
+
lines = content.splitlines()
|
|
875
|
+
current_class: Optional[str] = None
|
|
876
|
+
KEYWORDS = {
|
|
877
|
+
"if", "else", "for", "while", "switch", "catch", "try", "return",
|
|
878
|
+
"new", "typeof", "instanceof", "import", "export", "from", "class",
|
|
879
|
+
"extends", "implements", "interface", "type", "enum", "declare",
|
|
880
|
+
"public", "private", "protected", "static", "async", "await",
|
|
881
|
+
}
|
|
882
|
+
PATTERNS: List[Tuple[re.Pattern, str]] = [
|
|
883
|
+
(re.compile(r"^\s*export\s+default\s+(?:async\s+)?function\s*([\w$]*)\s*[<(]"), "default_fn"),
|
|
884
|
+
(re.compile(r"^\s*export\s+(?:async\s+)?function\s+([\w$]+)\s*[<(]"), "exported_fn"),
|
|
885
|
+
(re.compile(r"^\s*(?:export\s+)?async\s+function\s+([\w$]+)\s*[<(]"), "async_fn"),
|
|
886
|
+
(re.compile(r"^\s*(?:export\s+)?function\s+([\w$]+)\s*[<(]"), "fn"),
|
|
887
|
+
(re.compile(r"^\s*export\s+(?:const|let|var)\s+([\w$]+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[\w$]+)\s*(?::[^=]+)?=>"), "exported_arrow"),
|
|
888
|
+
(re.compile(r"^\s*(?:const|let|var)\s+([\w$]+)\s*=\s*(?:async\s+)?(?:\([^)]*\)|[\w$]+)\s*(?::[^=>]+)?=>"), "arrow"),
|
|
889
|
+
(re.compile(r"^\s*(?:(?:public|private|protected|static|abstract|override|async|readonly)\s+)*"
|
|
890
|
+
r"([\w$]+)\s*[<(][^)]*\)\s*(?::[^{]+)?\s*\{"), "class_method"),
|
|
891
|
+
]
|
|
892
|
+
class_pat = re.compile(r"^\s*(?:export\s+)?(?:abstract\s+)?class\s+([\w$]+)")
|
|
893
|
+
for lnum, line in enumerate(lines, 1):
|
|
894
|
+
cm = class_pat.match(line)
|
|
895
|
+
if cm:
|
|
896
|
+
current_class = cm.group(1)
|
|
897
|
+
for pat, kind in PATTERNS:
|
|
898
|
+
m = pat.match(line)
|
|
899
|
+
if not m:
|
|
900
|
+
continue
|
|
901
|
+
raw = m.group(1) if m.lastindex and m.group(1) else None
|
|
902
|
+
if raw is None:
|
|
903
|
+
raw = fp.stem if kind == "default_fn" else None
|
|
904
|
+
if not raw or raw in KEYWORDS:
|
|
905
|
+
continue
|
|
906
|
+
# Skip test and mock methods
|
|
907
|
+
lower_name = raw.lower()
|
|
908
|
+
if lower_name.startswith("test_") or "mock" in lower_name:
|
|
909
|
+
continue
|
|
910
|
+
|
|
911
|
+
full = f"{current_class}.{raw}" if (current_class and kind == "class_method") else raw
|
|
912
|
+
mid = self._make_method_id(service_id, fp, full)
|
|
913
|
+
methods.append({
|
|
914
|
+
"id": mid, "name": raw, "full_name": full,
|
|
915
|
+
"class_name": current_class if kind == "class_method" else None,
|
|
916
|
+
"file_path": str(fp), "line_number": lnum, "line_end": lnum,
|
|
917
|
+
"signature": raw, "docstring": None, "service_id": service_id,
|
|
918
|
+
})
|
|
919
|
+
break
|
|
920
|
+
return {"methods": methods, "calls": []}
|
|
921
|
+
|
|
922
|
+
def _regex_go(self, fp: Path, content: str, service_id: str) -> Dict:
|
|
923
|
+
methods: List[Dict] = []
|
|
924
|
+
pat = re.compile(r"^func\s+(?:\([^)]+\)\s+)?(\w+)\s*\(")
|
|
925
|
+
for lnum, line in enumerate(content.splitlines(), 1):
|
|
926
|
+
m = pat.match(line)
|
|
927
|
+
if m:
|
|
928
|
+
mname = m.group(1)
|
|
929
|
+
# Skip test and mock methods
|
|
930
|
+
lower_name = mname.lower()
|
|
931
|
+
if lower_name.startswith("test_") or "mock" in lower_name:
|
|
932
|
+
continue
|
|
933
|
+
|
|
934
|
+
mid = self._make_method_id(service_id, fp, mname)
|
|
935
|
+
methods.append({
|
|
936
|
+
"id": mid, "name": mname, "full_name": mname,
|
|
937
|
+
"class_name": None, "file_path": str(fp),
|
|
938
|
+
"line_number": lnum, "line_end": lnum,
|
|
939
|
+
"signature": mname, "docstring": None, "service_id": service_id,
|
|
940
|
+
})
|
|
941
|
+
return {"methods": methods, "calls": []}
|
|
942
|
+
|
|
943
|
+
def _regex_java(self, fp: Path, content: str, service_id: str) -> Dict:
|
|
944
|
+
methods: List[Dict] = []
|
|
945
|
+
pat = re.compile(
|
|
946
|
+
r"(?:public|private|protected|static|\s)+[\w<>\[\]]+\s+(\w+)\s*\([^)]*\)\s*\{?"
|
|
947
|
+
)
|
|
948
|
+
skip = {"if", "for", "while", "switch", "catch", "class"}
|
|
949
|
+
for lnum, line in enumerate(content.splitlines(), 1):
|
|
950
|
+
m = pat.search(line)
|
|
951
|
+
if m and m.group(1) not in skip and "class " not in line:
|
|
952
|
+
mname = m.group(1)
|
|
953
|
+
# Skip test and mock methods
|
|
954
|
+
lower_name = mname.lower()
|
|
955
|
+
if lower_name.startswith("test_") or "mock" in lower_name:
|
|
956
|
+
continue
|
|
957
|
+
|
|
958
|
+
mid = self._make_method_id(service_id, fp, mname)
|
|
959
|
+
methods.append({
|
|
960
|
+
"id": mid, "name": mname, "full_name": mname,
|
|
961
|
+
"class_name": None, "file_path": str(fp),
|
|
962
|
+
"line_number": lnum, "line_end": lnum,
|
|
963
|
+
"signature": mname, "docstring": None, "service_id": service_id,
|
|
964
|
+
})
|
|
965
|
+
return {"methods": methods, "calls": []}
|
|
966
|
+
|
|
967
|
+
def _regex_csharp(self, fp: Path, content: str, service_id: str) -> Dict:
|
|
968
|
+
methods: List[Dict] = []
|
|
969
|
+
pat = re.compile(
|
|
970
|
+
r"(?:public|private|protected|internal|static|async|\s)+[\w<>\[\]]+\s+(\w+)\s*\([^)]*\)\s*\{?"
|
|
971
|
+
)
|
|
972
|
+
skip = {"if", "for", "while", "switch", "catch", "class"}
|
|
973
|
+
for lnum, line in enumerate(content.splitlines(), 1):
|
|
974
|
+
m = pat.search(line)
|
|
975
|
+
if m and m.group(1) not in skip and "class " not in line:
|
|
976
|
+
mname = m.group(1)
|
|
977
|
+
lower_name = mname.lower()
|
|
978
|
+
if lower_name.startswith("test") or "mock" in lower_name:
|
|
979
|
+
continue
|
|
980
|
+
mid = self._make_method_id(service_id, fp, mname)
|
|
981
|
+
methods.append({
|
|
982
|
+
"id": mid, "name": mname, "full_name": mname,
|
|
983
|
+
"class_name": None, "file_path": str(fp),
|
|
984
|
+
"line_number": lnum, "line_end": lnum,
|
|
985
|
+
"signature": mname, "docstring": None, "service_id": service_id,
|
|
986
|
+
})
|
|
987
|
+
return {"methods": methods, "calls": []}
|
|
988
|
+
|
|
989
|
+
def _regex_rust(self, fp: Path, content: str, service_id: str) -> Dict:
|
|
990
|
+
methods: List[Dict] = []
|
|
991
|
+
pat = re.compile(r"^\s*(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*\(")
|
|
992
|
+
for lnum, line in enumerate(content.splitlines(), 1):
|
|
993
|
+
m = pat.match(line)
|
|
994
|
+
if m:
|
|
995
|
+
mname = m.group(1)
|
|
996
|
+
lower_name = mname.lower()
|
|
997
|
+
if lower_name.startswith("test") or "mock" in lower_name:
|
|
998
|
+
continue
|
|
999
|
+
mid = self._make_method_id(service_id, fp, mname)
|
|
1000
|
+
methods.append({
|
|
1001
|
+
"id": mid, "name": mname, "full_name": mname,
|
|
1002
|
+
"class_name": None, "file_path": str(fp),
|
|
1003
|
+
"line_number": lnum, "line_end": lnum,
|
|
1004
|
+
"signature": mname, "docstring": None, "service_id": service_id,
|
|
1005
|
+
})
|
|
1006
|
+
return {"methods": methods, "calls": []}
|
|
1007
|
+
|
|
1008
|
+
def _regex_ruby(self, fp: Path, content: str, service_id: str) -> Dict:
|
|
1009
|
+
methods: List[Dict] = []
|
|
1010
|
+
pat = re.compile(r"^\s*def\s+(?:self\.)?(\w+)")
|
|
1011
|
+
for lnum, line in enumerate(content.splitlines(), 1):
|
|
1012
|
+
m = pat.match(line)
|
|
1013
|
+
if m:
|
|
1014
|
+
mname = m.group(1)
|
|
1015
|
+
lower_name = mname.lower()
|
|
1016
|
+
if lower_name.startswith("test_") or "mock" in lower_name:
|
|
1017
|
+
continue
|
|
1018
|
+
mid = self._make_method_id(service_id, fp, mname)
|
|
1019
|
+
methods.append({
|
|
1020
|
+
"id": mid, "name": mname, "full_name": mname,
|
|
1021
|
+
"class_name": None, "file_path": str(fp),
|
|
1022
|
+
"line_number": lnum, "line_end": lnum,
|
|
1023
|
+
"signature": mname, "docstring": None, "service_id": service_id,
|
|
1024
|
+
})
|
|
1025
|
+
return {"methods": methods, "calls": []}
|
|
1026
|
+
|
|
1027
|
+
def _regex_php(self, fp: Path, content: str, service_id: str) -> Dict:
|
|
1028
|
+
methods: List[Dict] = []
|
|
1029
|
+
pat = re.compile(r"^\s*(?:(?:public|private|protected|static|final)\s+)*function\s+(\w+)\s*\(")
|
|
1030
|
+
for lnum, line in enumerate(content.splitlines(), 1):
|
|
1031
|
+
m = pat.match(line)
|
|
1032
|
+
if m:
|
|
1033
|
+
mname = m.group(1)
|
|
1034
|
+
lower_name = mname.lower()
|
|
1035
|
+
if lower_name.startswith("test") or "mock" in lower_name:
|
|
1036
|
+
continue
|
|
1037
|
+
mid = self._make_method_id(service_id, fp, mname)
|
|
1038
|
+
methods.append({
|
|
1039
|
+
"id": mid, "name": mname, "full_name": mname,
|
|
1040
|
+
"class_name": None, "file_path": str(fp),
|
|
1041
|
+
"line_number": lnum, "line_end": lnum,
|
|
1042
|
+
"signature": mname, "docstring": None, "service_id": service_id,
|
|
1043
|
+
})
|
|
1044
|
+
return {"methods": methods, "calls": []}
|
|
1045
|
+
|
|
1046
|
+
# ------------------------------------------------------------------ #
|
|
1047
|
+
# Call graph resolution #
|
|
1048
|
+
# ------------------------------------------------------------------ #
|
|
1049
|
+
|
|
1050
|
+
def _build_call_graph(
|
|
1051
|
+
self, all_methods: Dict[str, Dict], all_calls: List[Dict]
|
|
1052
|
+
) -> List[Tuple[str, str, Dict]]:
|
|
1053
|
+
"""Match call names to method IDs → (caller, callee, meta) triples."""
|
|
1054
|
+
name_to_ids: Dict[str, Set[str]] = defaultdict(set)
|
|
1055
|
+
for mid, info in all_methods.items():
|
|
1056
|
+
name_to_ids[info["name"]].add(mid)
|
|
1057
|
+
if info.get("full_name") and info["full_name"] != info["name"]:
|
|
1058
|
+
name_to_ids[info["full_name"]].add(mid)
|
|
1059
|
+
|
|
1060
|
+
seen: Set[Tuple[str, str]] = set()
|
|
1061
|
+
result = []
|
|
1062
|
+
skip = {"if", "for", "while", "return", "try", "except", "catch", "with", "else", "elif"}
|
|
1063
|
+
for call in all_calls:
|
|
1064
|
+
caller_id = call["caller_id"]
|
|
1065
|
+
callee_name = call.get("callee_name", "")
|
|
1066
|
+
if callee_name in skip:
|
|
1067
|
+
continue
|
|
1068
|
+
for callee_id in name_to_ids.get(callee_name, set()):
|
|
1069
|
+
if caller_id == callee_id:
|
|
1070
|
+
continue
|
|
1071
|
+
key = (caller_id, callee_id)
|
|
1072
|
+
if key not in seen:
|
|
1073
|
+
seen.add(key)
|
|
1074
|
+
result.append(
|
|
1075
|
+
(caller_id, callee_id, {"line": call.get("line_number")})
|
|
1076
|
+
)
|
|
1077
|
+
return result
|