flurryx-code-memory 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_memory/__init__.py +1 -0
- code_memory/claims/__init__.py +32 -0
- code_memory/claims/extractor.py +325 -0
- code_memory/claims/indexer.py +258 -0
- code_memory/claims/resolver.py +186 -0
- code_memory/claims/store.py +424 -0
- code_memory/cli.py +1192 -0
- code_memory/config.py +268 -0
- code_memory/embed/__init__.py +224 -0
- code_memory/embed/cache.py +204 -0
- code_memory/embed/m3.py +174 -0
- code_memory/embed/ollama.py +92 -0
- code_memory/embed/tei.py +106 -0
- code_memory/episodic/__init__.py +3 -0
- code_memory/episodic/sqlite_store.py +278 -0
- code_memory/extractor/__init__.py +3 -0
- code_memory/extractor/csproj.py +166 -0
- code_memory/extractor/dll.py +385 -0
- code_memory/extractor/gitignore.py +162 -0
- code_memory/extractor/nuget.py +275 -0
- code_memory/extractor/sanity.py +124 -0
- code_memory/extractor/sln.py +108 -0
- code_memory/extractor/treesitter.py +1172 -0
- code_memory/graph/__init__.py +3 -0
- code_memory/graph/falkor_store.py +740 -0
- code_memory/mcp_server.py +1816 -0
- code_memory/metrics.py +260 -0
- code_memory/orchestrator/__init__.py +13 -0
- code_memory/orchestrator/git_delta.py +211 -0
- code_memory/orchestrator/ingest_state.py +71 -0
- code_memory/orchestrator/pipeline.py +1478 -0
- code_memory/orchestrator/reset.py +130 -0
- code_memory/orchestrator/resolver.py +825 -0
- code_memory/orchestrator/retrieve.py +505 -0
- code_memory/resilience.py +73 -0
- code_memory/sync/__init__.py +20 -0
- code_memory/sync/autostart/__init__.py +42 -0
- code_memory/sync/autostart/base.py +106 -0
- code_memory/sync/autostart/launchd.py +115 -0
- code_memory/sync/autostart/schtasks.py +155 -0
- code_memory/sync/autostart/systemd.py +113 -0
- code_memory/sync/hooks.py +164 -0
- code_memory/sync/safety.py +65 -0
- code_memory/sync/snapshot.py +461 -0
- code_memory/sync/store.py +399 -0
- code_memory/sync/sync.py +405 -0
- code_memory/sync/watcher.py +320 -0
- code_memory/vector/__init__.py +3 -0
- code_memory/vector/qdrant_store.py +302 -0
- flurryx_code_memory-0.4.0.dist-info/METADATA +26 -0
- flurryx_code_memory-0.4.0.dist-info/RECORD +53 -0
- flurryx_code_memory-0.4.0.dist-info/WHEEL +4 -0
- flurryx_code_memory-0.4.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,1172 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from functools import lru_cache
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from tree_sitter import Language, Node, Parser
|
|
9
|
+
from tree_sitter_language_pack import get_language
|
|
10
|
+
|
|
11
|
+
# File extension (lower-case, leading dot included) -> grammar name handed to
# the tree-sitter language pack.
LANG_BY_EXT: dict[str, str] = {
    # TypeScript / JavaScript
    ".ts": "typescript", ".tsx": "tsx",
    ".js": "javascript", ".jsx": "javascript",
    ".mjs": "javascript", ".cjs": "javascript",
    # Python
    ".py": "python",
    # .NET ecosystem
    ".cs": "csharp",
    ".cshtml": "razor", ".razor": "razor",
    ".vb": "vb",
    ".fs": "fsharp", ".fsi": "fsharp", ".fsx": "fsharp",
    # PHP — ``.phtml`` is the legacy template extension still used by
    # Laravel/Zend/WordPress for view files mixing PHP + HTML; same grammar.
    ".php": "php", ".phtml": "php",
}

# Node types recorded as symbols (definitions) while walking the tree.
SYMBOL_NODE_TYPES = {
    # TypeScript / JavaScript / Python
    "function_declaration", "function_definition", "method_definition",
    "class_declaration", "class_definition",
    # TypeScript ``abstract class`` parses as its own node type; missing
    # it makes Angular clean-arch ports invisible to the graph, which
    # in turn leaves every ``inject(Port)`` edge unresolved.
    "abstract_class_declaration", "abstract_method_signature",
    "arrow_function", "export_statement",
    # C# / Razor (Razor embeds C#)
    "method_declaration", "interface_declaration", "struct_declaration",
    "record_declaration", "enum_declaration", "constructor_declaration",
    "delegate_declaration", "property_declaration",
    # VB.NET
    "class_block", "module_block", "namespace_block",
    # F#
    "function_or_value_defn", "type_definition", "method_or_prop_defn",
    "named_module",
    # PHP — trait_declaration is the only one not already covered by
    # the C#/TS/Py names above (class_declaration, interface_declaration,
    # enum_declaration, method_declaration, function_definition are reused).
    "trait_declaration",
}

# Node types recorded as call sites.
CALL_NODE_TYPES = {
    "call_expression", "call", "invocation_expression",
    "invocation",  # VB
    # C# / VB / Razor: ``new Foo()`` parses as ``object_creation_expression``
    # rather than ``invocation_expression``. Without this, calls to
    # constructors (factories, DI registrations, ``new Builder().X()``) never
    # become CALLS edges, which is the #1 reason the call graph looks empty
    # on real .NET codebases.
    "object_creation_expression",
    # PHP — function call (``foo($x)``), instance method (``$obj->bar($x)``),
    # static method (``Foo::baz($x)``). PHP ``new Foo()`` is also
    # ``object_creation_expression`` (shared name).
    "function_call_expression", "member_call_expression",
    "scoped_call_expression",
}
|
|
88
|
+
|
|
89
|
+
# Nodes that carry a type expression via a field named "type" (or "returns").
|
|
90
|
+
# When walking, look up these fields and harvest every identifier inside the
|
|
91
|
+
# type subtree. Covers C# parameter/field/property/variable/cast/typeof/is/as
|
|
92
|
+
# plus TypeScript/JS type annotations.
|
|
93
|
+
TYPE_FIELD_NODE_TYPES = {
    # C# declarations
    "parameter",
    "variable_declaration",
    "property_declaration",
    "field_declaration",
    "event_declaration",
    "indexer_declaration",
    "delegate_declaration",
    "method_declaration",
    # C# expressions referencing a type. ``as_expression`` is also the
    # TypeScript node name, so it is listed once here for both languages
    # (a set literal silently drops a repeated entry anyway).
    "cast_expression",
    "as_expression",
    "is_expression",
    "typeof_expression",
    "sizeof_expression",
    "default_expression",
    "array_creation_expression",
    "stack_alloc_array_creation_expression",
    # TS / JS
    "type_annotation",
    "type_alias_declaration",
    "satisfies_expression",
}
|
|
107
|
+
|
|
108
|
+
# Nodes whose direct (non-punctuation) children ARE type expressions —
|
|
109
|
+
# walk every child as a type tree. ``base_list`` (`class X : Foo, IBar`),
|
|
110
|
+
# generic arguments, and constraint clauses fall here.
|
|
111
|
+
TYPE_CHILDREN_NODE_TYPES = {
    # C#
    "base_list",
    "type_argument_list",  # generics
    "type_parameter_constraints_clause",  # `where T : Foo`
    "tuple_type",  # `(int, Foo)` — walk for Foo
    "tuple_element",
    # TS / JS
    "type_arguments",  # generics
    "implements_clause",  # `implements Foo, Bar`
    "extends_clause",  # `extends Foo`
    "extends_type_clause",  # interface extends
    "heritage_clause",  # class heritage
    # PHP
    "base_clause",  # ``extends Bar``
    "class_interface_clause",  # ``implements I1, I2``
}

# PHP type-expression wrapper nodes — recurse to find inner ``name``s.
# ``primitive_type`` (``int``, ``string``, ``array``, ...) is skipped at
# the top of ``_collect_type_refs`` because primitives carry no graph
# value; they'd otherwise pollute "who touches type X" queries.
_PHP_TYPE_WRAPPER_NODE_TYPES = {
    "named_type",
    "optional_type",  # ``?Foo``
    "union_type",  # ``Foo|Bar``
    "intersection_type",  # ``Foo&Bar``
    "disjunctive_normal_form_type",  # PHP 8.2 ``(Foo&Bar)|Baz``
}

# Parent nodes whose children include a PHP type expression in a
# positional slot (no ``type`` field). Walk each child whose type is a
# wrapper and collect the references. This is in addition to
# ``TYPE_FIELD_NODE_TYPES`` (field-name lookup) so C#/TS keep working.
_PHP_TYPED_PARENT_NODE_TYPES = {
    "property_declaration",
    "simple_parameter",
    "variadic_parameter",
    "property_promotion_parameter",
    "method_declaration",  # return type after ``:``
    "function_definition",  # free-function return type
}

# Primitive / language-built-in type tokens — never emit as a reference.
# These usually appear as `predefined_type` nodes (skipped structurally) but
# some grammars emit them as bare identifiers in odd positions.
_PRIMITIVE_TYPE_NAMES: frozenset[str] = frozenset(
    (
        # C#
        "void", "bool", "byte", "sbyte", "short", "ushort", "int", "uint",
        "long", "ulong", "float", "double", "decimal", "char", "string",
        "object", "dynamic", "var", "nint", "nuint",
        # TS/JS
        "any", "unknown", "never", "number", "boolean", "undefined", "null",
        "this", "symbol", "bigint",
    )
)

# Statement / directive node types recorded as imports.
IMPORT_NODE_TYPES = {
    "import_statement",
    "import_from_statement",
    "using_directive",  # C#
    "razor_using_directive",  # Razor
    "imports_statement",  # VB
    "import_decl",  # F#
    "namespace_use_declaration",  # PHP ``use Foo\Bar;``
}

# Razor / Blazor ``@inject TypeName Member`` directives. Each one is
# a DI dependency declaration that we want as a graph edge from the
# file to the injected type.
INJECT_NODE_TYPES = {"razor_inject_directive"}
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
@dataclass
class Symbol:
    """One named definition (function, class, method, ...) found in a file."""

    # Identifier text of the definition.
    name: str
    # Tree-sitter node type of the defining node (e.g. ``class_declaration``).
    kind: str
    # 1-based first and last source lines covered by the node.
    start_line: int
    end_line: int
    # Source text of the whole defining node.
    snippet: str
    # Dot-joined enclosing namespaces, or ``None`` outside any namespace.
    namespace: str | None = None
    # ``True`` when the declaration carries a ``partial`` modifier.
    partial: bool = False
    # Number of declared parameters for callable kinds
    # (method_declaration, function_declaration, ...). Stays ``None`` for
    # non-callable kinds (class_declaration, etc.) and when no
    # parameter-list child could be located.
    param_count: int | None = None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@dataclass(frozen=True)
class Call:
    """A single call site, ``name(args)``, together with its argument count.

    ``arity`` lets the resolver disambiguate overloads: when several
    definitions share one name (the classic C# / Java overload pattern),
    the one whose parameter count matches is preferred.

    ``receiver_type`` holds the inferred type of the receiver. It is
    filled in for TS ``this.<field>.<method>()`` calls when the field's
    type can be read from a member initializer or annotation, so the
    resolver can narrow ``<method>`` to methods defined on that type.
    Without it, every Angular use case's call to its port collapses to an
    ambiguous bare identifier.
    """

    name: str
    arity: int
    receiver_type: str | None = None
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
@dataclass
class ExtractedFile:
    """Everything harvested from one parsed source file."""

    path: str
    lang: str
    # Definitions found in the file.
    symbols: list[Symbol] = field(default_factory=list)
    # Imported module / namespace names.
    imports: list[str] = field(default_factory=list)
    # Call sites, in walk order.
    calls: list[Call] = field(default_factory=list)
    # DI declarations: injected type names (Razor ``@inject TypeName Member``).
    # Populated for ``.razor`` / ``.cshtml`` files; empty for other languages.
    injects: list[str] = field(default_factory=list)
    # Names used in type position: base lists (`class X : IFoo`), parameter
    # types, field/property types, generic args, type constraints,
    # cast/is/as/typeof targets, etc. Powers "who touches type X" queries
    # (callers + refs).
    references: list[str] = field(default_factory=list)
    # Decoded file contents.
    source: str = ""
    # ``True`` when the file was detected as auto-generated.
    generated: bool = False
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
@lru_cache(maxsize=16)
def _parser_for(lang: str) -> Parser:
    """Return a parser for grammar ``lang``, memoised per grammar name."""
    return Parser(get_language(lang))
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def lang_for(path: str | Path) -> str | None:
    """Return the grammar name for ``path``'s extension, or ``None``.

    The extension is matched case-insensitively against ``LANG_BY_EXT``.
    """
    extension = Path(path).suffix.lower()
    return LANG_BY_EXT.get(extension)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
MAX_FILE_BYTES = 500_000  # skip files larger than ~500KB (bundles, minified)
MAX_LINE_LEN = 2000  # likely minified if any line is longer than this
MINIFIED_SNIFF_BYTES = 4096  # size of the leading window inspected for the minified heuristic
MINIFIED_AVG_LINE = 200  # avg line length above this in sniff window => minified

# Substrings that, when present in the first ~2KB of a file, mark it as
# auto-generated. These are case-insensitive contains checks.
GENERATED_HEADER_MARKERS = (
    "@generated",
    "auto-generated",
    "autogenerated",
    "code generated by",
    "do not edit",
    "this file was generated",
    "generated by openapi",
    "generated by swagger",
    "generated by ng-openapi-gen",
    "generated by openapi-generator",
)

# Path segments / suffixes that indicate generated output.
_GENERATED_PATH_PARTS = ("generated", "__generated__", "openapi-gen", "swagger-gen")
_GENERATED_PATH_SUFFIXES = (".generated.ts", ".generated.js", ".g.ts", ".g.dart")


def _has_generated_header(sample: str) -> bool:
    """Return ``True`` when the first 2048 characters contain a generated-code marker."""
    head = sample[:2048].lower()
    return any(marker in head for marker in GENERATED_HEADER_MARKERS)


def _has_generated_path(path: Path) -> bool:
    """Return ``True`` when a path segment or the file name suffix marks generated output."""
    segments = {part.lower() for part in path.parts}
    if not segments.isdisjoint(_GENERATED_PATH_PARTS):
        return True
    # ``str.endswith`` accepts a tuple of candidate suffixes.
    return path.name.lower().endswith(_GENERATED_PATH_SUFFIXES)


def looks_generated(path: str | Path, sample: str) -> bool:
    """Detect auto-generated code by path heuristics or header markers."""
    return _has_generated_path(Path(path)) or _has_generated_header(sample)


def looks_minified(sample: str) -> bool:
    """Detect minified / pre-bundled JS without parsing.

    ``sample`` is the leading window of the file. Triggers when:

    - any line in the window exceeds ``MAX_LINE_LEN``, or
    - the average line length within the window exceeds
      ``MINIFIED_AVG_LINE``.

    A window with no line break at all is a single line, so the average
    rule decides: one giant line is flagged, while a short one-line file
    without a trailing newline (e.g. a barrel ``export * from './x'``) is
    not. Previously every newline-free sample was flagged, which silently
    dropped such files from extraction.

    Returns ``False`` for an empty sample.
    """
    if not sample:
        return False
    lines = sample.splitlines()
    if any(len(line) > MAX_LINE_LEN for line in lines):
        return True
    # ``lines`` is non-empty for a non-empty sample; ``max`` is defensive.
    return len(sample) / max(len(lines), 1) > MINIFIED_AVG_LINE
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def extract_file(path: str | Path) -> ExtractedFile | None:
    """Parse ``path`` and return its symbols, imports, calls and references.

    Returns ``None`` (the file is skipped) when:

    - the extension has no entry in ``LANG_BY_EXT``;
    - the file cannot be stat'ed or read (``OSError``) — the read is
      guarded the same way as the size check, so a file that becomes
      unreadable or disappears mid-scan is skipped rather than raising;
    - the file is larger than ``MAX_FILE_BYTES``;
    - its leading window looks minified / bundled.
    """
    p = Path(path)
    lang = lang_for(p)
    if lang is None:
        return None
    try:
        if p.stat().st_size > MAX_FILE_BYTES:
            return None
        raw = p.read_bytes()
    except OSError:
        return None
    # Strip a UTF-8 BOM if present so tree-sitter's byte offsets line up
    # with our slicing buffer. Some Windows-authored C# files ship one.
    if raw.startswith(b"\xef\xbb\xbf"):
        raw = raw[3:]
    source = raw.decode("utf-8", errors="replace")
    if looks_minified(source[:MINIFIED_SNIFF_BYTES]):
        return None  # minified / bundled
    tree = _parser_for(lang).parse(raw)
    ex = ExtractedFile(
        path=str(p.resolve()),
        lang=lang,
        source=source,
        generated=looks_generated(p, source),
    )
    # The walker slices ``raw`` (bytes), not ``source``, because node
    # offsets are byte offsets.
    _walk(tree.root_node, raw, ex, ns_stack=[], class_stack=[])
    return ex
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# C# block-scoped ``namespace Foo { ... }``: pushed while the block's
# children are walked, popped on exit.
_BLOCK_NAMESPACE_NODE_TYPES = {"namespace_declaration"}

# C# 10 file-scoped ``namespace Foo;``. One per file by spec; it applies
# to *everything after it*, so it is pushed and never popped.
_FILE_SCOPED_NAMESPACE_NODE_TYPES = {"file_scoped_namespace_declaration"}

# PHP ``namespace X;`` (file-scoped, persists for the rest of the file) vs
# ``namespace X { ... }`` (block, scopes only its body). Tree-sitter emits
# the same ``namespace_definition`` node for both — we disambiguate by
# checking for a ``compound_statement`` child.
_PHP_NAMESPACE_NODE_TYPE = "namespace_definition"

# Symbol kinds that can carry a ``partial`` modifier in C#. Partial
# classes / structs / interfaces / records get merged into a single
# logical entity in the graph; non-partial symbols stay file-scoped.
_PARTIAL_CAPABLE_KINDS = {
    "class_declaration", "struct_declaration",
    "interface_declaration", "record_declaration",
}

# Symbol kinds that take parameters — their arity is recorded for the
# resolver's overload disambiguation tier. Non-callable kinds
# (classes, modules, enums) skip the count.
_CALLABLE_KINDS = {
    "function_declaration", "function_definition",
    "method_definition", "method_declaration",
    "constructor_declaration", "delegate_declaration",
    "arrow_function",
    "function_or_value_defn",  # F#
}
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _is_partial_modifier(node: Node, source: bytes) -> bool:
    """``True`` when this is a ``modifier`` node whose text is ``partial``."""
    # Short-circuits: the node text is only sliced for ``modifier`` nodes.
    return node.type == "modifier" and _slice(source, node).strip() == "partial"
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _has_partial_modifier(node: Node, source: bytes) -> bool:
    """``True`` when any direct child of ``node`` is a ``partial`` modifier."""
    for child in node.children:
        if _is_partial_modifier(child, source):
            return True
    return False
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
def _namespace_name(node: Node, source: bytes) -> str | None:
    """Return the name text of a namespace declaration (C#/PHP), or ``None``.

    The first direct child that is a name node wins; its source text is
    returned unchanged.
    """
    # ``namespace_name`` is PHP's wrapper for ``Foo\Bar\Baz``;
    # ``qualified_name`` is C# / PHP ``use`` clauses.
    name_types = ("qualified_name", "identifier", "namespace_name")
    name_node = next((c for c in node.children if c.type in name_types), None)
    if name_node is None:
        return None
    return _slice(source, name_node)
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
_CLASS_DECL_NODE_TYPES = frozenset(
    {"class_declaration", "abstract_class_declaration", "class"}
)

# Punctuation / keyword tokens skipped when the children of a
# ``TYPE_CHILDREN_NODE_TYPES`` node are walked as type expressions.
_TYPE_LIST_SEPARATOR_TOKENS = frozenset(
    {
        ",", ":", "(", ")", "<", ">",
        "where", "extends", "implements",
        "|", "&",  # PHP union/intersection separators
    }
)


def _walk(
    node: Node,
    source: bytes,
    ex: ExtractedFile,
    ns_stack: list[str],
    class_stack: list[dict[str, str]],
) -> None:
    """Recursively harvest ``node`` and its descendants into ``ex``.

    For each node, in order: open class / namespace scopes, record the
    node as a symbol, import, inject or call (at most one of these),
    collect type-position references, recurse into the children, then
    close whatever block scopes this node opened.

    Args:
        node: Current tree-sitter node.
        source: The parsed byte buffer (node offsets are byte offsets).
        ex: Accumulator that receives symbols, imports, injects, calls
            and references.
        ns_stack: Enclosing namespace names, outermost first. Mutated in
            place; file-scoped namespaces stay on it for the rest of the
            walk.
        class_stack: One mapping per enclosing class body, as returned by
            ``_ts_class_field_types``; it is looked up by the field name
            that ``_this_field_receiver`` reports to obtain a call's
            ``receiver_type``.
    """
    kind = node.type

    pushed_class = False
    if kind in _CLASS_DECL_NODE_TYPES:
        body = next((c for c in node.children if c.type == "class_body"), None)
        if body is not None:
            class_stack.append(_ts_class_field_types(body, source))
            pushed_class = True

    pushed_ns = _enter_namespace(node, source, ns_stack)

    if kind in SYMBOL_NODE_TYPES:
        name = _symbol_name(node, source)
        if name:
            ex.symbols.append(
                Symbol(
                    name=name,
                    kind=kind,
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1,
                    snippet=_slice(source, node),
                    namespace=".".join(ns_stack) if ns_stack else None,
                    partial=(
                        kind in _PARTIAL_CAPABLE_KINDS
                        and _has_partial_modifier(node, source)
                    ),
                    param_count=(
                        _param_count(node) if kind in _CALLABLE_KINDS else None
                    ),
                )
            )
    elif kind in IMPORT_NODE_TYPES:
        if kind == "namespace_use_declaration":
            # PHP allows multiple clauses per statement; emit each FQCN.
            ex.imports.extend(_php_use_imports(node, source))
        else:
            module = _import_module(node, source)
            if module:
                ex.imports.append(module)
    elif kind in INJECT_NODE_TYPES:
        injected = _inject_type(node, source)
        if injected:
            ex.injects.append(injected)
    elif kind in CALL_NODE_TYPES:
        _record_call(node, source, ex, class_stack)

    _collect_node_type_refs(node, source, ex.references)

    for child in node.children:
        _walk(child, source, ex, ns_stack, class_stack)

    if pushed_ns:
        ns_stack.pop()
    if pushed_class:
        class_stack.pop()


def _enter_namespace(node: Node, source: bytes, ns_stack: list[str]) -> bool:
    """Push the namespace declared by ``node``; return ``True`` if it must be popped.

    Block namespaces (C# ``namespace Foo { }``, PHP ``namespace X { }``)
    are popped by the caller once the node's children have been walked.
    File-scoped namespaces stay on the stack for the rest of the file.
    """
    kind = node.type
    if kind in _BLOCK_NAMESPACE_NODE_TYPES:
        ns = _namespace_name(node, source)
        if ns:
            ns_stack.append(ns)
            return True
        return False
    if kind in _FILE_SCOPED_NAMESPACE_NODE_TYPES:
        # C# 10 file-scoped namespace scopes the rest of the file.
        # Push and never pop within this walk — there is at most one.
        ns = _namespace_name(node, source)
        if ns:
            ns_stack.append(ns)
        return False
    if kind == _PHP_NAMESPACE_NODE_TYPE:
        ns = _namespace_name(node, source)
        if not ns:
            return False
        if any(c.type == "compound_statement" for c in node.children):
            # ``namespace X { ... }`` — push+pop so symbols outside the
            # braces stay unqualified.
            ns_stack.append(ns)
            return True
        # ``namespace X;`` — the rest of the file (sibling nodes under
        # ``program``) inherits it. PHP allows several such declarations
        # per file and each one REPLACES the previous namespace, so drop
        # the earlier entry instead of stacking "A.B". Only done at the
        # top level, where no ancestor has a pending pop on the stack.
        parent = node.parent
        if parent is not None and parent.type == "program":
            ns_stack.clear()
        ns_stack.append(ns)
        return False
    return False


def _record_call(
    node: Node,
    source: bytes,
    ex: ExtractedFile,
    class_stack: list[dict[str, str]],
) -> None:
    """Record a call-site node as either an inject or a call on ``ex``."""
    # Angular DI: ``inject(Token)`` becomes an INJECTS edge instead
    # of a (stoplisted) CALL. Without this, the entire DI graph for
    # Angular 14+ codebases is invisible.
    token = _angular_inject_token(node, source)
    if token:
        ex.injects.append(token)
        return
    callee = _callee_name(node, source)
    if not callee:
        return
    receiver_type: str | None = None
    if class_stack:
        # Named ``receiver_field`` (not ``field``) so the module-level
        # ``dataclasses.field`` import is not shadowed.
        receiver_field = _this_field_receiver(node, source)
        if receiver_field:
            receiver_type = class_stack[-1].get(receiver_field)
    ex.calls.append(
        Call(name=callee, arity=_call_arity(node), receiver_type=receiver_type)
    )


def _children_between_parens(node: Node) -> list[Node]:
    """Return ``node``'s direct children located after ``(`` and before the first ``)``."""
    inside: list[Node] = []
    opened = False
    for child in node.children:
        if child.type == "(":
            opened = True
        elif child.type == ")":
            break
        elif opened:
            inside.append(child)
    return inside


def _collect_node_type_refs(node: Node, source: bytes, out: list[str]) -> None:
    """Append to ``out`` the type names referenced directly by ``node``."""
    kind = node.type

    if kind in TYPE_FIELD_NODE_TYPES:
        # ``method_declaration`` exposes the return type via ``returns``
        # in some grammars; everything else uses ``type``.
        type_node = node.child_by_field_name("type") or node.child_by_field_name(
            "returns"
        )
        if type_node is not None:
            _collect_type_refs(type_node, source, out)

    if kind in TYPE_CHILDREN_NODE_TYPES:
        for child in node.children:
            if child.type not in _TYPE_LIST_SEPARATOR_TOKENS:
                _collect_type_refs(child, source, out)

    if kind in _PHP_TYPED_PARENT_NODE_TYPES:
        # PHP property/parameter/return types are positional children
        # (no ``type`` field). The wrapper-only filter keeps us from
        # over-walking unrelated children like ``visibility_modifier``
        # or ``variable_name`` that share the parent node.
        for child in node.children:
            if child.type in _PHP_TYPE_WRAPPER_NODE_TYPES:
                _collect_type_refs(child, source, out)

    # C# pattern / cast / typeof: the type child is also collected
    # positionally, independent of any ``type`` field handled above.
    if kind == "cast_expression":
        # `(Type)expr` — type sits between `(` and `)`.
        for child in _children_between_parens(node):
            _collect_type_refs(child, source, out)
    elif kind in {"as_expression", "is_expression"}:
        # `value as Type` / `value is Type` — type follows the keyword.
        keyword = "as" if kind == "as_expression" else "is"
        seen_keyword = False
        for child in node.children:
            if seen_keyword:
                _collect_type_refs(child, source, out)
            elif child.type == keyword:
                seen_keyword = True
    elif kind == "is_pattern_expression":
        # `value is Pattern` — for each pattern child, the first
        # identifier-bearing sub-node is the type name.
        for child in node.children:
            if child.type not in {
                "declaration_pattern", "type_pattern", "recursive_pattern",
            }:
                continue
            for sub in child.children:
                if sub.type in {
                    "identifier", "type_identifier", "qualified_name", "generic_name",
                }:
                    _collect_type_refs(sub, source, out)
                    break
    elif kind in {"typeof_expression", "sizeof_expression", "default_expression"}:
        # `typeof(Type)` — type between the parens.
        for child in _children_between_parens(node):
            if child.type != ",":
                _collect_type_refs(child, source, out)
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
def _slice(source: bytes, node: Node) -> str:
    """Decode the text covered by ``node`` from the parsed byte buffer.

    Tree-sitter offsets are *byte* offsets into the buffer it parsed, not
    character offsets. Applying them to a Python ``str`` silently chops
    identifiers as soon as the file contains a non-ASCII byte (e.g.
    French C# with accents), so the slice is taken on ``bytes`` and
    decoded afterwards.
    """
    start, end = node.start_byte, node.end_byte
    return source[start:end].decode("utf-8", errors="replace")
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
# Node types whose name is located by searching the whole subtree for the
# first identifier instead of inspecting direct children only.
_FSHARP_DEEP_NAME_NODES = {"function_or_value_defn", "type_definition"}
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
def _first_identifier_deep(node: Node, source: bytes) -> str | None:
    """BFS for the first identifier-bearing token inside ``node``.

    Children are examined level by level, left to right; the text of the
    first ``identifier`` / ``type_identifier`` node met is returned.
    ``node`` itself is never a candidate. Returns ``None`` when the
    subtree holds no such node.
    """
    # Function-scope import: ``deque.popleft`` is O(1), whereas the
    # ``list.pop(0)`` it replaces shifts the whole queue on every dequeue.
    from collections import deque

    pending = deque([node])
    while pending:
        current = pending.popleft()
        for child in current.children:
            if child.type in {"identifier", "type_identifier"}:
                return _slice(source, child)
            pending.append(child)
    return None
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
def _symbol_name(node: Node, source: bytes) -> str | None:
    """Return the declared name of a symbol node, or ``None`` if none is found.

    Lookup order: the grammar's ``name`` field; a deep identifier search
    for the node types in ``_FSHARP_DEEP_NAME_NODES``; otherwise the first
    direct identifier-like child.
    """
    named = node.child_by_field_name("name")
    if named is not None:
        return _slice(source, named)
    if node.type in _FSHARP_DEEP_NAME_NODES:
        return _first_identifier_deep(node, source)
    identifier_types = ("identifier", "type_identifier", "property_identifier")
    direct = next((c for c in node.children if c.type in identifier_types), None)
    return None if direct is None else _slice(source, direct)
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def _php_use_imports(node: Node, source: bytes) -> list[str]:
    """Return every name imported by one PHP ``use`` statement.

    Covers the single-clause (``use Foo\\Bar;``), multi-clause
    (``use A\\B, C\\D;``) and aliased (``use A\\B as Alias;``) forms. Only
    the first name-bearing child of each clause is kept, so the alias is
    dropped — the graph tracks what the file *imports*, not the local
    rebinding name. A plain ``use Alias;`` (no backslash) yields the bare
    ``name`` child.
    """
    imported: list[str] = []
    for clause in node.children:
        if clause.type != "namespace_use_clause":
            continue
        target = next(
            (sub for sub in clause.children if sub.type in {"qualified_name", "name"}),
            None,
        )
        if target is not None:
            imported.append(_slice(source, target).strip())
    return imported
|
|
651
|
+
|
|
652
|
+
|
|
653
|
+
def _import_module(node: Node, source: bytes) -> str | None:
    """Return the module / namespace named by an import node, or ``None``.

    The ``module_name`` field (then ``name``) is preferred. Python
    ``from X import Y`` and ``from .X import Y`` expose the module via
    ``module_name``; without that lookup the first ``dotted_name`` child
    would win, which for ``from ..pkg.mod import Sym`` is ``Sym`` (the
    imported name), not the module — the graph would file the import
    under the wrong key and ``importers <module>`` would miss every
    relative caller. Otherwise the first direct child of a known
    name-bearing type is used. Surrounding quote characters are stripped
    from the result.
    """
    target = node.child_by_field_name("module_name") or node.child_by_field_name("name")
    if target is None:
        name_bearing = {
            "string",
            "string_fragment",
            "dotted_name",
            "module_name",
            "relative_import",  # Python ``..pkg.mod``
            "qualified_name",
            "namespace_name",  # VB
            "long_identifier",  # F#
            "identifier",
        }
        target = next((c for c in node.children if c.type in name_bearing), None)
    if target is None:
        return None
    return _slice(source, target).strip("'\"")
|
|
677
|
+
|
|
678
|
+
|
|
679
|
+
_PARAMETER_LIST_TYPES = {
|
|
680
|
+
"parameter_list",
|
|
681
|
+
"formal_parameters",
|
|
682
|
+
"parameters", # F# / Python
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
_PARAMETER_NODE_TYPES = {
|
|
686
|
+
"parameter",
|
|
687
|
+
"required_parameter",
|
|
688
|
+
"optional_parameter",
|
|
689
|
+
"rest_parameter",
|
|
690
|
+
"typed_parameter",
|
|
691
|
+
"typed_default_parameter",
|
|
692
|
+
"default_parameter",
|
|
693
|
+
"identifier", # F# value bindings expose bare identifiers
|
|
694
|
+
# PHP
|
|
695
|
+
"simple_parameter",
|
|
696
|
+
"variadic_parameter",
|
|
697
|
+
"property_promotion_parameter", # PHP 8 ctor promotion
|
|
698
|
+
}
|
|
699
|
+
|
|
700
|
+
|
|
701
|
+
def _param_count(node: Node) -> int | None:
    """Return how many parameters a callable declaration declares.

    The first direct child whose type is in ``_PARAMETER_LIST_TYPES`` is
    taken as the parameter list; its direct children whose type is in
    ``_PARAMETER_NODE_TYPES`` are counted. Anything else in the list
    (punctuation such as ``(``, ``)``, ``,``) is ignored.

    Returns ``None`` — not ``0`` — when the node has no parameter-list
    child, so the caller can leave ``param_count`` unset instead of
    recording a misleading zero.
    """
    param_list = next(
        (kid for kid in node.children if kid.type in _PARAMETER_LIST_TYPES),
        None,
    )
    if param_list is None:
        return None
    return sum(1 for entry in param_list.children if entry.type in _PARAMETER_NODE_TYPES)
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
def _call_arity(node: Node) -> int:
    """Count the arguments passed at a call site.

    The first direct child of type ``argument_list`` or ``arguments`` is
    taken as the argument list. Every direct child of that list counts as
    one argument except punctuation (``(``, ``)``, ``,``, ``{``, ``}``) and
    comment nodes. Grammars that wrap arguments (``argument``,
    ``spread_element``) and grammars that emit bare expression children
    (e.g. Python) are therefore handled the same way.

    Comments are skipped because tree-sitter attaches them as ordinary
    children wherever they occur; counting them would inflate the arity of
    a call such as ``f(a,  # why\n  b)`` from 2 to 3.

    Returns ``0`` when no argument list child exists — this matches what
    tree-sitter reports for property/field references parsed as
    ``invocation_expression`` (rare, but happens in C# generated code).
    """
    not_an_argument = {
        "(", ")", ",", "{", "}",
        # Comment node names used by the supported grammars.
        "comment", "line_comment", "block_comment",
    }
    for child in node.children:
        if child.type in {"argument_list", "arguments"}:
            return sum(1 for sub in child.children if sub.type not in not_an_argument)
    return 0
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def _collect_type_refs(node: Node, source: bytes, out: list[str]) -> None:
    """Append every type name referenced inside a type expression to ``out``.

    The subtree rooted at ``node`` is walked recursively; ``out`` is
    mutated in place and nothing is returned. Per node type:

    - ``predefined_type`` / ``implicit_type`` / ``this_type`` /
      ``primitive_type`` → ignored (no graph value)
    - ``identifier`` / ``type_identifier`` / ``name`` → emit the text unless
      it is listed in ``_PRIMITIVE_TYPE_NAMES``
    - PHP wrapper types (``_PHP_TYPE_WRAPPER_NODE_TYPES``) → recurse into
      every child except the separating punctuation
    - ``qualified_name`` → recurse into the right-most name-bearing child
    - ``generic_name`` → emit the generic's own name, then recurse into its
      type arguments
    - anything else (nullable / array / pointer / tuple wrappers, ...) →
      recurse into all children
    """
    kind = node.type

    if kind in {"predefined_type", "implicit_type", "this_type", "primitive_type"}:
        return

    if kind in {"identifier", "type_identifier", "name"}:
        # ``name`` is PHP's identifier node; accepting it here lets PHP type
        # positions (``named_type``, ``base_clause`` children, etc.) surface
        # as references the same way C#/TS identifiers do.
        text = _slice(source, node).strip()
        if text and text not in _PRIMITIVE_TYPE_NAMES:
            out.append(text)
        return

    if kind in _PHP_TYPE_WRAPPER_NODE_TYPES:
        # ``?Foo`` / ``Foo|Bar`` / ``Foo&Bar`` / ``(A&B)|C`` — visit the
        # alternatives, not the punctuation between them.
        for part in node.children:
            if part.type not in {"?", "|", "&", "(", ")"}:
                _collect_type_refs(part, source, out)
        return

    if kind == "qualified_name":
        # ``Foo.Bar.Baz`` — only the right-most type-bearing child matters;
        # the segments to its left are usually namespaces. That child may be
        # a plain identifier (``Foo.Bar``), a ``generic_name``
        # (``Foo.Bar.List<T>``), or a nested ``qualified_name`` when the
        # grammar builds a left-leaning tree.
        segment_types = {
            "identifier",
            "type_identifier",
            "generic_name",
            "qualified_name",
            "name",  # PHP: trailing segment of ``App\Repo\UserRepo``
        }
        tail = next(
            (part for part in reversed(node.children) if part.type in segment_types),
            None,
        )
        if tail is not None:
            _collect_type_refs(tail, source, out)
        return

    if kind == "generic_name":
        # ``List<int, Foo>`` — the first identifier child is the generic's
        # own name (`List`); the type arguments are handled afterwards.
        head = next(
            (part for part in node.children if part.type in {"identifier", "type_identifier"}),
            None,
        )
        if head is not None:
            text = _slice(source, head).strip()
            if text and text not in _PRIMITIVE_TYPE_NAMES:
                out.append(text)
        for part in node.children:
            if part.type not in {"type_argument_list", "type_arguments"}:
                continue
            for arg in part.children:
                if arg.type not in {"<", ">", ","}:
                    _collect_type_refs(arg, source, out)
        return

    # Wrapper / composite type nodes — the names live somewhere below.
    for part in node.children:
        _collect_type_refs(part, source, out)
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
# Node types recognised as a class body.
_CLASS_BODY_NODE_TYPES = frozenset(("class_body", "object_type"))

# Class-body member node types that ``_ts_class_field_types`` reads a
# field name and type from.
_TS_FIELD_DECL_TYPES = frozenset(
    (
        "abstract_method_signature",
        "property_definition",
        "property_signature",
        "public_field_definition",
    )
)
|
|
820
|
+
|
|
821
|
+
|
|
822
|
+
def _ts_class_field_types(body: Node, source: bytes) -> dict[str, str]:
    """Build a ``field_name → type_name`` map for a TS class body.

    Two kinds of direct children of ``body`` contribute entries:

    * Field declarations (any type in ``_TS_FIELD_DECL_TYPES``). The type
      comes from the field's annotation (``private foo: Bar``) — the most
      reliable signal — and, failing that, from an ``inject(Token)``
      initializer (Angular 14+ DI), so an injected port still surfaces its
      type when no annotation is written.
    * The ``constructor`` method. Parameters carrying an accessibility
      modifier (``constructor(private foo: Bar) {}``) are TypeScript
      "parameter properties" and are recorded as fields. Without this scan,
      Angular services written in the older constructor-injection style
      would stay invisible to receiver-type resolution.

    Members whose name or type cannot be determined are skipped. A later
    entry with the same name overwrites an earlier one.
    """
    field_types: dict[str, str] = {}

    for member in body.children:
        if member.type in _TS_FIELD_DECL_TYPES:
            # Prefer the grammar's ``name`` field; fall back to the first
            # ``property_identifier`` child.
            label = member.child_by_field_name("name")
            if label is None:
                label = next(
                    (part for part in member.children if part.type == "property_identifier"),
                    None,
                )
            field_name = _slice(source, label) if label is not None else None
            if not field_name:
                continue
            resolved = _ts_field_type_from_annotation(member, source)
            if resolved is None:
                resolved = _ts_field_type_from_inject_init(member, source)
            if resolved:
                field_types[field_name] = resolved
            continue

        if member.type != "method_definition":
            continue

        # Constructor parameter properties live on the formal_parameters.
        method_label = member.child_by_field_name("name")
        method_name = _slice(source, method_label) if method_label else None
        if method_name != "constructor":
            continue

        param_list = member.child_by_field_name("parameters")
        if param_list is None:
            param_list = next(
                (part for part in member.children if part.type == "formal_parameters"),
                None,
            )
        if param_list is None:
            continue

        for param in param_list.children:
            if param.type not in {"required_parameter", "optional_parameter"}:
                continue
            # An accessibility modifier (private/public/protected) is what
            # turns a ctor parameter into a field; plain parameters stay in
            # local scope and are ignored.
            if all(part.type != "accessibility_modifier" for part in param.children):
                continue
            ident = next(
                (part for part in param.children if part.type == "identifier"),
                None,
            )
            if ident is None:
                continue
            param_name = _slice(source, ident)
            if not param_name:
                continue
            resolved = _ts_field_type_from_annotation(param, source)
            if resolved:
                field_types[param_name] = resolved

    return field_types
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
def _ts_field_type_from_annotation(node: Node, source: bytes) -> str | None:
    """Read ``: <Type>`` annotation off a field / param node.

    Looks at the direct children of ``node`` for a ``type_annotation`` and,
    inside it, returns the source text of the first ``type_identifier`` /
    ``identifier``. For a ``generic_type`` the first such identifier inside
    it is returned, i.e. the generic's base name without its type
    arguments. Returns ``None`` when there is no annotation or no
    identifier can be read from it.
    """
    for child in node.children:
        if child.type == "type_annotation":
            for sub in child.children:
                if sub.type in {"type_identifier", "identifier"}:
                    return _slice(source, sub)
                if sub.type == "generic_type":
                    # Only the base name is wanted; stop at the first
                    # identifier and ignore the type arguments.
                    for inner in sub.children:
                        if inner.type in {"type_identifier", "identifier"}:
                            return _slice(source, inner)
            # NOTE(review): read as "annotation found but unreadable, give
            # up" — confirm this ``return`` sits at the annotation level.
            return None
    return None
|
|
908
|
+
|
|
909
|
+
|
|
910
|
+
def _ts_field_type_from_inject_init(node: Node, source: bytes) -> str | None:
    """Return the DI token of a field's ``= inject(Token)`` initializer.

    Only the first direct ``call_expression`` child of ``node`` is
    considered; it is handed to ``_angular_inject_token`` and that result
    is returned as-is. Returns ``None`` when the field has no
    ``call_expression`` child.
    """
    initializer = next(
        (kid for kid in node.children if kid.type == "call_expression"),
        None,
    )
    if initializer is None:
        return None
    return _angular_inject_token(initializer, source)
|
|
916
|
+
|
|
917
|
+
|
|
918
|
+
def _this_field_receiver(node: Node, source: bytes) -> str | None:
    """Return ``<field>`` for a call whose callee is ``this.<field>.<method>``.

    The callee (``function`` field, else ``callee`` field) must be a
    ``member_expression`` whose object is itself a ``member_expression`` of
    the exact shape ``this.<property_identifier>``. Every other receiver
    shape — chained calls, computed members, bare identifiers — yields
    ``None``: too ambiguous for the receiver-type table to help.
    """
    callee = node.child_by_field_name("function") or node.child_by_field_name("callee")
    if callee is None or callee.type != "member_expression":
        return None

    receiver = callee.child_by_field_name("object")
    if receiver is None or receiver.type != "member_expression":
        return None

    base = receiver.child_by_field_name("object")
    field = receiver.child_by_field_name("property")
    rooted_at_this = base is not None and base.type == "this"
    plain_property = field is not None and field.type == "property_identifier"
    if not (rooted_at_this and plain_property):
        return None
    return _slice(source, field)
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
def _angular_inject_token(node: Node, source: bytes) -> str | None:
    """Extract the DI token from an Angular ``inject(Token)`` call.

    Angular 14+ replaced constructor-DI with the ``inject()`` primitive.
    Without this hook the call gets filtered by ``CALLEE_STOPLIST`` and the
    DI graph for any Angular codebase disappears entirely. Only call sites
    whose function text is literally ``inject`` (optionally followed by
    generic arguments) are accepted, to avoid capturing user-defined
    functions of the same name in module scope.

    Returns the trailing identifier of the first argument that
    ``_last_identifier`` can resolve, or ``None`` when the call is not an
    ``inject`` call, has no argument list, or has no resolvable argument.
    """
    callee = node.child_by_field_name("function") or node.child_by_field_name("callee")
    if callee is None:
        return None

    # ``inject<Token>`` parses as the bare identifier in the function
    # field; cutting at ``<`` is defensive and keeps qualified forms
    # (``foo.inject``) out.
    callee_text = _slice(source, callee).strip()
    if callee_text.partition("<")[0] != "inject":
        return None

    arg_list = next(
        (kid for kid in node.children if kid.type in {"arguments", "argument_list"}),
        None,
    )
    if arg_list is None:
        return None

    for arg in arg_list.children:
        if arg.type in {"(", ")", ","}:
            continue
        # Some grammars wrap each arg in `argument`; look at what is inside
        # the wrapper rather than at the wrapper itself.
        candidates = arg.children if arg.type == "argument" else (arg,)
        for candidate in candidates:
            token = _last_identifier(_slice(source, candidate).strip())
            if token:
                return token
    return None
|
|
978
|
+
|
|
979
|
+
|
|
980
|
+
def _inject_type(node: Node, source: bytes) -> str | None:
    """Pull the injected type name out of a Razor ``@inject`` directive.

    Grammar: ``@inject <Type> <Member>``. Tree-sitter wraps the
    `<Type> <Member>` pair in a ``variable_declaration``; the type is
    the first ``identifier`` / ``qualified_name`` / ``generic_name``
    child. We capture the **type name only** — for ``ILogger<Foo>``
    that's ``ILogger`` (the resolver matches by bare identifier;
    generic parameters live at the call site, not in the graph).

    Returns ``None`` when ``node`` has no ``variable_declaration`` child
    or no type name can be read from it.
    """
    for child in node.children:
        if child.type == "variable_declaration":
            for sub in child.children:
                # NOTE(review): a ``qualified_name`` is returned as its full
                # source text, not reduced to the trailing identifier —
                # confirm the resolver copes with dotted names.
                if sub.type in {"identifier", "qualified_name", "type_identifier"}:
                    return _slice(source, sub)
                if sub.type == "generic_name":
                    # Drop the ``<T, ...>`` tail by finding the first
                    # plain identifier under it.
                    for inner in sub.children:
                        if inner.type in {"identifier", "type_identifier"}:
                            return _slice(source, inner)
            # NOTE(review): read as "only the first variable_declaration is
            # examined" — confirm the nesting level of this ``break``.
            break
    return None
|
|
1003
|
+
|
|
1004
|
+
|
|
1005
|
+
# Callee names belonging to the stdlib, frameworks, RxJS or Angular DI.
# They are dropped at extract time and never become CALLS edges: being so
# frequent, they would drown "who calls X" queries in noise.
CALLEE_STOPLIST: frozenset[str] = frozenset(
    {
        # --- JS builtins: global objects and constructors ---
        "console",
        "JSON",
        "Math",
        "Object",
        "Array",
        "Promise",
        "Number",
        "String",
        "Boolean",
        "Date",
        "RegExp",
        "Symbol",
        "Map",
        "Set",
        # --- JS builtins: global functions ---
        "parseInt",
        "parseFloat",
        "isNaN",
        "isFinite",
        "setTimeout",
        "setInterval",
        "clearTimeout",
        "clearInterval",
        "fetch",
        "structuredClone",
        "queueMicrotask",
        # --- Angular DI / lifecycle ---
        "inject",
        "Injectable",
        "Component",
        "Directive",
        "Pipe",
        "NgModule",
        "Input",
        "Output",
        "ViewChild",
        "ContentChild",
        "HostListener",
        "HostBinding",
        # --- RxJS operators commonly chained via .pipe() ---
        "pipe",
        "subscribe",
        "map",
        "filter",
        "tap",
        "switchMap",
        "mergeMap",
        "concatMap",
        "exhaustMap",
        "catchError",
        "take",
        "takeUntil",
        "first",
        "of",
        "from",
        "EMPTY",
        "throwError",
        "combineLatest",
        "forkJoin",
        "BehaviorSubject",
        "Subject",
        "ReplaySubject",
        # --- Generic test helpers ---
        "describe",
        "it",
        "test",
        "expect",
        "beforeEach",
        "afterEach",
        "beforeAll",
        "afterAll",
        "jest",
        "vi",
        "spyOn",
    }
)
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
def _callee_name(node: Node, source: bytes) -> str | None:
    """Return the trailing identifier of a call expression's callee.

    ``foo()`` → ``foo``; ``this.svc.method()`` → ``method``;
    ``a.b.c()`` → ``c``. Computed (``a[b]()``) and chained (``f()()``)
    callees collapse to ``None`` — too ambiguous to resolve.

    ``None`` is also returned for any name in :data:`CALLEE_STOPLIST`, so
    those callees never enter the graph as noise.
    """
    # Field lookup order matters:
    # * ``type`` — ``new Foo()`` exposes the constructor target here; without
    #   it the first-child fallback would land on the ``new`` keyword and
    #   miss every constructor invocation.
    # * ``function`` / ``callee`` — plain calls.
    # * ``name`` — PHP ``member_call_expression`` and
    #   ``scoped_call_expression`` expose the method name here. Without it,
    #   ``$this->repo->byId(...)`` would fall through to the first child,
    #   the ``member_access_expression`` text (``$this->repo``), which
    #   ``_last_identifier`` rejects because of the ``->`` separator — every
    #   PHP method call would disappear from the call graph.
    target = None
    for field in ("type", "function", "callee", "name"):
        target = node.child_by_field_name(field)
        if target:
            break

    if target is None and node.type == "object_creation_expression":
        # PHP ``new Foo()`` / ``new App\Foo()`` — children carry no field
        # names. The first child is the ``new`` keyword and the class name
        # follows, so pick the first name-like child instead of letting the
        # first-child fallback report ``new`` for every PHP ctor call.
        target = next(
            (
                kid
                for kid in node.children
                if kid.type in {"name", "identifier", "qualified_name", "type_identifier"}
            ),
            None,
        )

    if target is None and node.children:
        target = node.children[0]
    if target is None:
        return None

    callee_text = _slice(source, target).split("(")[0].strip()
    ident = _last_identifier(callee_text)
    if ident is not None and ident not in CALLEE_STOPLIST:
        return ident
    return None
|
|
1078
|
+
|
|
1079
|
+
|
|
1080
|
+
# One identifier: a letter, ``_`` or ``$`` followed by word characters or ``$``.
_IDENT_RE = re.compile(r"[A-Za-z_$][\w$]*")


def _last_identifier(expr: str) -> str | None:
    """Return the final identifier of a (possibly dotted) expression.

    ``this.foo.bar``     → ``bar``
    ``MyClass.staticFn`` → ``staticFn``
    ``foo``              → ``foo``
    ``App\\Repo\\UserRepo`` → ``UserRepo`` (PHP namespace separator)
    ``arr[i]``           → ``None`` (computed)
    ``f()``              → ``None`` (chained call; shouldn't normally hit)

    ``None`` is also returned when the trailing segment is empty or is not
    a single identifier according to ``_IDENT_RE``.
    """
    # A trailing bracket or parenthesis means a computed member or a call.
    if expr.endswith(("]", ")")):
        return None
    # PHP fully-qualified names separate namespaces with ``\``; treating it
    # like ``.`` lets one cut isolate the trailing class/method identifier.
    tail = expr.replace("\\", ".").rpartition(".")[2].strip()
    if not tail:
        return None
    return tail if _IDENT_RE.fullmatch(tail) else None
|
|
1104
|
+
|
|
1105
|
+
|
|
1106
|
+
# Directory names skipped by default when ``Extractor`` walks a tree.
DEFAULT_IGNORE_DIRS: tuple[str, ...] = (
    ".git", "node_modules", ".venv", "venv",
    "dist", "build", ".next", ".nuxt", "out", "coverage",
    ".turbo", ".cache",
    "__pycache__", ".mypy_cache", ".pytest_cache", ".ruff_cache",
    "target",
    # Angular / Vite / Nx / Yarn / Parcel / SvelteKit caches and tarballs
    ".angular", ".nx", ".yarn", ".parcel-cache", ".svelte-kit",
    "bower_components", "vendor", "tmp",
    # .NET build output / IDE caches
    "bin", "obj", "packages", "TestResults", ".vs", "artifacts",
)
|
|
1141
|
+
|
|
1142
|
+
|
|
1143
|
+
class Extractor:
    """Convenience wrapper to walk a directory.

    Attributes are plain and public:

    * ``ignore_dirs`` — directory names whose contents are skipped.
    * ``respect_gitignore`` — when true, paths matched by the
      ``GitignoreMatcher`` built for the walked root are skipped as well.
    """

    def __init__(
        self,
        ignore_dirs: tuple[str, ...] = DEFAULT_IGNORE_DIRS,
        *,
        respect_gitignore: bool = True,
    ) -> None:
        self.ignore_dirs = ignore_dirs
        self.respect_gitignore = respect_gitignore

    def walk(self, root: str | Path):
        """Yield the extraction result of every eligible file under ``root``.

        ``root`` is resolved to an absolute path and searched recursively.
        A file is skipped when

        * one of the directories between ``root`` and the file is named in
          ``ignore_dirs``,
        * ``respect_gitignore`` is set and the matcher reports a match, or
        * ``extract_file`` returns ``None`` for it.

        Ignored directories are still traversed by ``rglob``; their files
        are filtered out afterwards.
        """
        from .gitignore import GitignoreMatcher

        root_path = Path(root).resolve()
        matcher = (
            GitignoreMatcher.from_root(root_path) if self.respect_gitignore else None
        )
        ignore_set = set(self.ignore_dirs)
        for p in root_path.rglob("*"):
            if not p.is_file():
                continue
            # Only directory components *below* the root are compared.
            # Testing the absolute path would let an ancestor of the root
            # (``/tmp/repo``, ``~/build/repo``, ...) match an ignore name
            # and silently exclude the whole tree; including the final
            # component would apply directory names to files.
            enclosing_dirs = p.relative_to(root_path).parent.parts
            if any(part in ignore_set for part in enclosing_dirs):
                continue
            if matcher is not None and matcher.match(p, is_dir=False):
                continue
            ex = extract_file(p)
            if ex is not None:
                yield ex
|