cerebro-code-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cerebro/indexer.py ADDED
@@ -0,0 +1,854 @@
1
+ """Static indexer: hash files, extract symbols + imports, resolve dependency edges.
2
+
3
+ No LLM is involved here — this layer is deterministic, fast, and free. It is the
4
+ structural map (plan layers 1 and 5). Summaries (layer 2) are written separately
5
+ by the chat sessions via summaries.record().
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import json
11
+ import posixpath
12
+ import re
13
+ import threading
14
+ from datetime import datetime, timezone
15
+ from pathlib import Path
16
+
17
+ from . import config as cfg
18
+ from . import db
19
+ from . import tsconfig
20
+
21
+ try:
22
+ from tree_sitter_language_pack import get_parser
23
+ except Exception: # pragma: no cover - import guard for environments w/o the pack
24
+ get_parser = None
25
+
26
# Parser objects from tree-sitter (a Rust/pyo3 binding) cannot be sent across
# threads, and sharing one cache between threads panics. FastMCP dispatches sync
# tools onto a worker thread pool, hence one parser cache per thread.
_PARSER_TLS = threading.local()


def _parser(lang: str):
    """Return this thread's cached parser for ``lang``, or None if unavailable.

    None is returned when the language pack could not be imported, or when
    ``get_parser`` raised for this language. A failed lookup is cached as None,
    so it is not retried on the same thread.
    """
    if get_parser is None:
        return None
    per_thread = getattr(_PARSER_TLS, "parsers", None)
    if per_thread is None:
        per_thread = _PARSER_TLS.parsers = {}
    if lang in per_thread:
        return per_thread[lang]
    try:
        loaded = get_parser(lang)
    except Exception:
        loaded = None
    per_thread[lang] = loaded
    return loaded
45
+
46
+
47
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat(timespec="seconds")
49
+
50
+
51
def file_hash(path: Path) -> str:
    """Return the SHA-1 hex digest of the file at ``path``.

    The file is read in 64 KiB chunks so large files are never held in memory
    whole.
    """
    digest = hashlib.sha1()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
57
+
58
+
59
+ # --- disk diff ---------------------------------------------------------------
60
+
61
def disk_state(config: cfg.Config) -> dict[str, str]:
    """Hash every indexable file and return {relative path: content hash}.

    Files that raise OSError while being hashed are left out of the result.
    """
    hashes: dict[str, str] = {}
    for rel, abs_path in config.iter_files():
        try:
            digest = file_hash(abs_path)
        except OSError:
            continue
        hashes[rel] = digest
    return hashes
70
+
71
+
72
def diff(conn, disk: dict[str, str]) -> dict[str, list[str]]:
    """Compare on-disk hashes with the hashes stored in the DB.

    Returns a dict with three sorted path lists: ``new`` (on disk only),
    ``changed`` (present in both with different hashes) and ``deleted``
    (stored only).
    """
    stored = db.stored_hashes(conn)
    added, modified = [], []
    for path, digest in disk.items():
        if path not in stored:
            added.append(path)
        elif digest != stored[path]:
            modified.append(path)
    removed = [path for path in stored if path not in disk]
    added.sort()
    modified.sort()
    removed.sort()
    return {"new": added, "changed": modified, "deleted": removed}
78
+
79
+
80
+ # --- tree-sitter node accessors ----------------------------------------------
81
+ # tree-sitter-language-pack ships a binding whose Node members are *methods*
82
+ # (node.kind(), node.start_byte()) rather than the properties of the standard
83
+ # py-tree-sitter package (node.type, node.start_byte). These helpers normalize
84
+ # both so the extraction logic stays clean and version-resilient.
85
+
86
+ def _attr(node, *names):
87
+ for name in names:
88
+ if hasattr(node, name):
89
+ v = getattr(node, name)
90
+ return v() if callable(v) else v
91
+ raise AttributeError(f"node has none of {names}")
92
+
93
+
94
+ def _kind(n) -> str:
95
+ return _attr(n, "kind", "type")
96
+
97
+
98
+ def _child_count(n) -> int:
99
+ return _attr(n, "child_count")
100
+
101
+
102
+ def _children(n):
103
+ return [n.child(i) for i in range(_child_count(n))]
104
+
105
+
106
+ def _field(n, name):
107
+ return n.child_by_field_name(name)
108
+
109
+
110
+ def _line(n) -> int:
111
+ pos = _attr(n, "start_position", "start_point")
112
+ return getattr(pos, "row", 0) + 1
113
+
114
+
115
+ def _text(src: bytes, n) -> str:
116
+ # tree-sitter reports *byte* offsets; slice the UTF-8 bytes, never the str,
117
+ # or multi-byte chars (em-dashes, emoji) shift every later offset.
118
+ return src[_attr(n, "start_byte") : _attr(n, "end_byte")].decode("utf-8", "ignore")
119
+
120
+
121
+ def _root(tree):
122
+ return _attr(tree, "root_node")
123
+
124
+
125
+ def _do_parse(parser, text: str):
126
+ try:
127
+ return parser.parse(text) # language-pack binding wants str
128
+ except TypeError:
129
+ return parser.parse(text.encode("utf-8")) # standard binding wants bytes
130
+
131
+
132
+ def _walk(node):
133
+ stack = [node]
134
+ while stack:
135
+ n = stack.pop()
136
+ yield n
137
+ stack.extend(_children(n))
138
+
139
+
140
+ # --- symbol / import extraction ----------------------------------------------
141
+
142
+ _DEF_TYPES = {
143
+ "python": {"function_definition": "function", "class_definition": "class"},
144
+ "javascript": {
145
+ "function_declaration": "function",
146
+ "generator_function_declaration": "function",
147
+ "class_declaration": "class",
148
+ "method_definition": "method",
149
+ },
150
+ }
151
+ _ARROW_VALUES = ("arrow_function", "function", "function_expression")
152
+
153
+
154
+ def _base_lang(lang: str) -> str:
155
+ """Collapse a tree-sitter language name to the extractor family that handles
156
+ it. typescript/tsx/javascript all share the JS extractor; dart and python
157
+ each have their own."""
158
+ if lang == "python":
159
+ return "python"
160
+ if lang == "dart":
161
+ return "dart"
162
+ return "javascript"
163
+ # Identifier node kinds we count as a *use* of a name (a reference). Definition
164
+ # sites are excluded separately, so what remains is genuine usage: calls, JSX
165
+ # tags, type annotations, value reads, object shorthand.
166
+ _IDENT_KINDS = {
167
+ "identifier", "property_identifier", "type_identifier", "shorthand_property_identifier",
168
+ }
169
+
170
+
171
def _signature(src: bytes | str, node) -> str:
    """Return the first source line of ``node``, stripped and capped at 160 chars.

    ``src`` is normally the UTF-8 *bytes* of the file, because node offsets are
    byte offsets. A ``str`` is accepted too and is encoded first, so the slice
    is always taken on bytes.
    """
    if isinstance(src, str):
        src = src.encode("utf-8")
    text = _text(src, node)
    first_line = text.splitlines()[0] if text else ""
    return first_line.strip()[:160]
175
+
176
+
177
def extract(lang: str, src):
    """Return (symbols, imports, calls, refs) for one source file.

    symbols: (kind, name, line, signature). imports: language-specific descriptors.
    calls: (enclosing_symbol|None, callee_name, line). refs: sorted distinct names
    USED in the file; identifier nodes that are definition/binding sites are not
    counted as uses.

    ``src`` may be ``bytes`` or ``str``. Four empty lists are returned when no
    parser is available for ``lang``.
    """
    src_str = src.decode("utf-8", "ignore") if isinstance(src, bytes) else src
    # Always slice the exact bytes the parser saw. If the input bytes held
    # invalid UTF-8, the lossy decode dropped them, and slicing the original
    # bytes with the parser's offsets would drift after the first bad byte.
    src_bytes = src_str.encode("utf-8")
    parser = _parser(lang)
    if parser is None:
        return [], [], [], []
    root = _root(_do_parse(parser, src_str))
    base = _base_lang(lang)
    if base == "dart":
        return _dart_extract(root, src_bytes)
    def_types = _DEF_TYPES[base]
    call_kinds = {"call"} if base == "python" else {"call_expression", "new_expression"}

    symbols, calls = [], []
    refs: set[str] = set()
    def_spans: set[int] = set()  # byte offsets of definition-name nodes (not uses)
    stack = [(root, None)]  # (node, enclosing definition name)
    while stack:
        n, enc = stack.pop()
        k = _kind(n)
        new_enc = enc
        kind = def_types.get(k)
        if kind:
            name_node = _field(n, "name")
            if name_node is not None:
                nm = _text(src_bytes, name_node)
                symbols.append((kind, nm, _line(n), _signature(src_bytes, n)))
                def_spans.add(_attr(name_node, "start_byte"))
                new_enc = nm
        # const foo = () => {...}  /  const Bar = function(){}
        elif k == "variable_declarator":
            value = _field(n, "value")
            name_node = _field(n, "name")
            if name_node is not None:
                def_spans.add(_attr(name_node, "start_byte"))  # a binding, not a use
                if value is not None and _kind(value) in _ARROW_VALUES:
                    nm = _text(src_bytes, name_node)
                    symbols.append(("function", nm, _line(n), _signature(src_bytes, n)))
                    new_enc = nm
        elif k in call_kinds:
            cn = _callee_name(src_bytes, n)
            if cn:
                # Attributed to the definition enclosing the call node itself.
                calls.append((enc, cn, _line(n)))
        if k in _IDENT_KINDS and _attr(n, "start_byte") not in def_spans:
            refs.add(_text(src_bytes, n))
        for c in _children(n):
            stack.append((c, new_enc))

    imports = _py_imports(root, src_bytes) if base == "python" else _js_imports(root, src_bytes)
    return symbols, imports, calls, sorted(refs)
233
+
234
+
235
def _callee_name(src, call):
    """Best-effort callee name of a call / new expression (the rightmost name).

    Returns None when the callee is missing or is not a plain identifier, a
    Python ``attribute`` or a JS ``member_expression``.
    """
    callee = _field(call, "function") or _field(call, "constructor")
    if callee is None:
        return None
    callee_kind = _kind(callee)
    if callee_kind in ("identifier", "property_identifier"):
        return _text(src, callee)
    # python a.b.c keeps the last name in field 'attribute'; js a.b.c in 'property'
    name_field = {"attribute": "attribute", "member_expression": "property"}.get(callee_kind)
    if name_field is None:
        return None
    name_node = _field(callee, name_field)
    return None if name_node is None else _text(src, name_node)
250
+
251
+
252
def _aliased_name(src, node):
    """Return the text of the name an ``aliased_import`` refers to, or None.

    The ``name`` field is used when present; otherwise the first
    ``dotted_name`` child.
    """
    target = _field(node, "name")
    if not target:
        target = next(
            (child for child in _children(node) if _kind(child) == "dotted_name"), None
        )
    if target is None:
        return None
    return _text(src, target)
257
+
258
+
259
def _py_imports(root, src):
    """Return list of (level, module): level 0 == absolute, level N == N leading dots.

    For `from <module> import <names>` each imported name is also emitted as a
    candidate submodule (joined to the module). That is what lets
    `from . import config` resolve to config.py: tree-sitter keeps the dots in
    module_name and the name `config` in a separate child.
    """
    found = []
    for node in _walk(root):
        kind = _kind(node)
        if kind == "import_statement":
            for child in _children(node):
                child_kind = _kind(child)
                if child_kind == "dotted_name":
                    found.append((0, _text(src, child)))
                elif child_kind == "aliased_import":
                    alias_target = _aliased_name(src, child)
                    if alias_target:
                        found.append((0, alias_target))
            continue
        if kind != "import_from_statement":
            continue
        module_node = _field(node, "module_name")
        if module_node is None:
            continue
        raw = _text(src, module_node)
        module = raw.lstrip(".")
        level = len(raw) - len(module)
        found.append((level, module))
        module_span = (_attr(module_node, "start_byte"), _attr(module_node, "end_byte"))
        for child in _children(node):
            # Skip the module_name node itself; only the imported names follow.
            if (_attr(child, "start_byte"), _attr(child, "end_byte")) == module_span:
                continue
            child_kind = _kind(child)
            if child_kind == "dotted_name":
                imported = _text(src, child)
            elif child_kind == "aliased_import":
                imported = _aliased_name(src, child)
            else:
                imported = None
            if imported:
                found.append((level, f"{module}.{imported}" if module else imported))
    return found
300
+
301
+
302
def _is_type_only(stmt) -> bool:
    """True when a TS import/export contributes no runtime value, so its edge is
    elided after compilation.

    That is either `import type {...}` (a `type` keyword directly under the
    statement) or a named clause in which every specifier is `type`-qualified
    (`import { type A, type B }`). A default/namespace binding, a side-effect
    import, or any untyped specifier makes it a real runtime edge.
    """
    top = _children(stmt)
    for child in top:
        if _kind(child) == "type":  # `import type { ... } from ...`
            return True
    specifiers = []
    for clause in top:
        if _kind(clause) != "import_clause":
            continue
        for part in _children(clause):
            part_kind = _kind(part)
            if part_kind == "named_imports":
                specifiers += [s for s in _children(part) if _kind(s) == "import_specifier"]
            elif part_kind in ("identifier", "namespace_import"):
                return False  # default / `* as ns` binding is a runtime value
    if not specifiers:
        return False  # side-effect import, or a re-export we can't prove type-only
    for spec in specifiers:
        if not any(_kind(g) == "type" for g in _children(spec)):
            return False
    return True
324
+
325
+
326
def _js_imports(root, src):
    """Return list of (source, is_type) pairs for a JS/TS tree.

    ``source`` is a specifier such as './foo' or 'react'. ``is_type`` is True
    for type-only TS imports (no runtime edge). Static import/export, CommonJS
    `require(...)` and dynamic `import(...)` (e.g. the
    `dynamic(() => import('./X'))` lazy-load pattern) are all captured; the
    last two are always runtime edges.
    """
    quote_chars = "\"'`"
    found = []
    for node in _walk(root):
        kind = _kind(node)
        if kind in ("import_statement", "export_statement"):
            source = _field(node, "source")
            if source is not None:
                found.append((_text(src, source).strip(quote_chars), _is_type_only(node)))
            continue
        if kind != "call_expression":
            continue
        callee = _field(node, "function")
        if callee is None:
            continue
        # require('x') or dynamic import('x') (function node is the `import` kw)
        if _text(src, callee) != "require" and _kind(callee) != "import":
            continue
        args = _field(node, "arguments")
        if args is None:
            continue
        for arg in _children(args):
            if _kind(arg) == "string":
                found.append((_text(src, arg).strip(quote_chars), False))
    return found
350
+
351
+
352
+ # --- Dart / Flutter extraction -----------------------------------------------
353
+ # tree-sitter-dart shape (verified against the language pack grammar):
354
+ # top-level function -> `function_signature` (name field); body is a sibling
355
+ # method in a class -> `method_signature` wrapping a `function_signature`
356
+ # class / enum -> `class_definition` / `enum_declaration` (name field)
357
+ # mixin -> `mixin_declaration` (no name field; plain identifier)
358
+ # extension -> `extension_declaration` (may be anonymous -> no symbol)
359
+ # call `Foo(...)` -> an `identifier`/`type_identifier` immediately followed
360
+ # by a `selector` whose first child is `argument_part`
361
+ _DART_CONTAINER_DEFS = {
362
+ "class_definition": "class",
363
+ "mixin_declaration": "class",
364
+ "enum_declaration": "enum",
365
+ "extension_declaration": "extension",
366
+ }
367
+ _DART_IDENT_KINDS = {"identifier", "type_identifier"}
368
+
369
+
370
def _dart_name_node(node):
    """Return the name node of a Dart container declaration, or None.

    The `name` field is used when present (class/enum/named extension);
    otherwise the first plain `identifier` child (mixin). Anonymous extensions
    have neither, so None is returned and no symbol is emitted — the
    `type_identifier` of the extended type is never picked up.
    """
    named = _field(node, "name")
    if named is not None:
        return named
    for child in _children(node):
        if _kind(child) == "identifier":
            return child
    return None
379
+
380
+
381
def _dart_imports(root, src):
    """Return the raw URI string of every import/export/part directive.

    Examples: 'package:app/x.dart', '../models/user.dart', 'user.g.dart'.
    Exports re-expose a file's API and `part` pulls a file into the library, so
    both are real edges. `part of` is the reverse pointer (part -> library) and
    is skipped to avoid a cycle against the `part` edge the library declares.
    """
    directive_kinds = ("library_import", "library_export", "part_directive")
    uris = []
    for node in _walk(root):
        if _kind(node) not in directive_kinds:
            continue
        # Only the first `uri` node met while walking the directive is used.
        for inner in _walk(node):
            if _kind(inner) == "uri":
                uris.append(_text(src, inner).strip().strip("\"'"))
                break
    return uris
394
+
395
+
396
# Inner signature kinds found inside a class-body member wrapper, mapped to the
# symbol kind they produce. A member with a body (or an abstract one) is wrapped
# in a `method_signature`; a field or body-less constructor in a `declaration`.
_DART_SIG_KINDS = dict(
    function_signature="method",
    constructor_signature="constructor",
    constant_constructor_signature="constructor",
    factory_constructor_signature="constructor",
    getter_signature="method",
    setter_signature="method",
)


def _dart_member_symbol(node, src):
    """Resolve a class-body member (`method_signature` or `declaration`) to a
    (kind, name, name_nodes) triple.

    The name is the *simple* call-site name — the last plain identifier of the
    signature — so `Product.named` -> 'named', the default ctor -> the class
    name, a getter -> its property. The original author notes this keeps
    cerebro_callers and dead_symbols working, since the refs and calls tables
    store unqualified names. Returns (None, None, ()) for fields and other
    declarations that define no callable symbol.
    """
    nothing = (None, None, ())
    signature = None
    for child in _children(node):
        if _kind(child) in _DART_SIG_KINDS:
            signature = child
            break
    if signature is None:
        return nothing
    sig_kind = _kind(signature)
    if sig_kind == "function_signature":
        name_node = _field(signature, "name")
        if name_node is None:
            return nothing
        return "method", _text(src, name_node), (name_node,)
    # ctor / getter / setter: name is the signature's last direct identifier
    # (`Product.named` -> [Product, named] -> 'named'; getter -> [total]).
    identifiers = tuple(c for c in _children(signature) if _kind(c) == "identifier")
    if not identifiers:
        return nothing
    return _DART_SIG_KINDS[sig_kind], _text(src, identifiers[-1]), identifiers
429
+
430
+
431
def _dart_extract(root, src):
    """Walk a Dart tree and return the same (symbols, imports, calls, refs)
    tuple as the python/js extractor.

    Dart gets its own walk because its grammar differs too much: methods wrap a
    nested function_signature, and calls are identifier+selector rather than a
    call_expression node.
    """
    member_wrappers = ("method_signature", "declaration")
    top_level_lists = ("static_final_declaration_list", "initialized_identifier_list")
    top_level_items = ("static_final_declaration", "initialized_identifier")

    def first_identifier(parent):
        # First direct child that is a plain `identifier`, or None.
        return next((g for g in _children(parent) if _kind(g) == "identifier"), None)

    symbols, calls = [], []
    refs: set[str] = set()
    def_spans: set[int] = set()  # byte offsets of definition-name nodes (not uses)
    todo = [(root, None, None)]  # (node, parent_kind, enclosing definition name)
    while todo:
        node, parent_kind, enclosing = todo.pop()
        kind = _kind(node)
        inner = enclosing
        # Members (methods, constructors, getters, setters) live in a
        # method_signature (has a body / abstract) or a declaration (field or
        # body-less ctor); _dart_member_symbol sorts out which and skips fields.
        if kind in member_wrappers:
            member_kind, member_name, member_nodes = _dart_member_symbol(node, src)
            if member_kind:
                symbols.append((member_kind, member_name, _line(node), _signature(src, node)))
                for name_part in member_nodes:
                    def_spans.add(_attr(name_part, "start_byte"))
                inner = member_name
        elif kind == "function_signature" and parent_kind not in member_wrappers:
            fn_name = _field(node, "name")  # top-level fn; member sigs handled above
            if fn_name is not None:
                label = _text(src, fn_name)
                symbols.append(("function", label, _line(node), _signature(src, node)))
                def_spans.add(_attr(fn_name, "start_byte"))
                inner = label
        elif kind in _DART_CONTAINER_DEFS:
            container_name = _dart_name_node(node)
            if container_name is not None:
                label = _text(src, container_name)
                symbols.append((_DART_CONTAINER_DEFS[kind], label, _line(node), _signature(src, node)))
                def_spans.add(_attr(container_name, "start_byte"))
                inner = label
        elif kind == "type_alias":  # typedef Json = ...; -> name is the first type_identifier
            alias_name = next((c for c in _children(node) if _kind(c) == "type_identifier"), None)
            if alias_name is not None:
                symbols.append(("typedef", _text(src, alias_name), _line(node), _signature(src, node)))
                def_spans.add(_attr(alias_name, "start_byte"))
        elif kind in top_level_lists and parent_kind == "program":
            # Top-level const/final/var (Riverpod providers, theme constants, etc.).
            # The same node kinds nest under `declaration` for class fields, which
            # the parent_kind == 'program' guard excludes.
            decls = [c for c in _children(node) if _kind(c) in top_level_items]
            names = [first_identifier(decl) for decl in decls]
            for decl, var_name in zip(decls, names):
                if var_name is not None:
                    symbols.append(("variable", _text(src, var_name), _line(decl), _signature(src, decl)))
                    def_spans.add(_attr(var_name, "start_byte"))
            # Attribute the initializer's calls to the variable when it's the only
            # one declared (`final x = Provider(...)` -> x calls Provider).
            if len(names) == 1 and names[0] is not None:
                inner = _text(src, names[0])
        elif kind == "enum_constant":
            member = first_identifier(node)
            if member is not None:
                symbols.append(("enum_member", _text(src, member), _line(node), _signature(src, node)))
                def_spans.add(_attr(member, "start_byte"))

        # --- call sites, all read from a node's direct children -------------
        #   bare call    Foo(...)      -> identifier + selector(argument_part)
        #   method call  obj.foo(...)  -> selector(.foo) + selector(argument_part)
        #   cascade      obj..foo(...) -> cascade_section{ cascade_selector + argument_part }
        # A `selector` only carries args when its first child is `argument_part`
        # (a `.name` field access is an unconditional_assignable_selector instead),
        # which is what tells `obj.foo(...)` (call) apart from `obj.foo` (read).
        kids = _children(node)
        if kind == "cascade_section" and any(_kind(c) == "argument_part" for c in kids):
            cascade_sel = next((c for c in kids if _kind(c) == "cascade_selector"), None)
            cascade_name = first_identifier(cascade_sel) if cascade_sel else None
            if cascade_name is not None:
                calls.append((enclosing, _text(src, cascade_name), _line(cascade_name)))
        # Pair each child with its right-hand neighbour; the last child has none
        # and so can never be a call target.
        for child, follower in zip(kids, kids[1:]):
            if _kind(follower) != "selector":
                continue
            follower_kids = _children(follower)
            if not follower_kids or _kind(follower_kids[0]) != "argument_part":
                continue
            callee = None
            child_kind = _kind(child)
            if child_kind in _DART_IDENT_KINDS:  # Foo(...), setState(...)
                callee = child
            elif child_kind == "selector":  # obj.foo(...): name is in the preceding .foo selector
                sel_kids = _children(child)
                head = sel_kids[0] if sel_kids else None
                if head is not None and _kind(head) == "unconditional_assignable_selector":
                    callee = first_identifier(head)
            if callee is not None:
                calls.append((enclosing, _text(src, callee), _line(callee)))

        if kind in _DART_IDENT_KINDS and _attr(node, "start_byte") not in def_spans:
            refs.add(_text(src, node))

        # A function/method body is a *sibling* of its signature in Dart, so its
        # calls would otherwise be attributed to the enclosing class (or to None
        # at top level). Pair each signature with the following body so calls
        # inside are attributed to that function/method.
        body_owner = {}
        pending = None
        for index, child in enumerate(kids):
            child_kind = _kind(child)
            if child_kind == "function_signature":
                sig_name = _field(child, "name")
                pending = _text(src, sig_name) if sig_name is not None else None
            elif child_kind in member_wrappers:
                pending = _dart_member_symbol(child, src)[1]
            elif child_kind == "function_body":
                if pending is not None:
                    body_owner[index] = pending
                pending = None
        for index, child in enumerate(kids):
            todo.append((child, kind, body_owner.get(index, inner)))
    return symbols, _dart_imports(root, src), calls, sorted(refs)
541
+
542
+
543
+ # --- import resolution (raw import -> repo-relative path) --------------------
544
+
545
def _resolve_python(level, module, importer_rel, known: set[str]):
    """Resolve a Python import to a repo-relative file in ``known``, or None.

    ``level`` is the number of leading dots (0 == absolute). Relative imports
    are anchored at the importer's directory; absolute imports may also match
    any known file ending in the module path.
    """
    segments = [seg for seg in module.split(".") if seg]
    if level <= 0:
        return _first_existing(["/".join(segments)], known, suffix_ok=True)
    importer_dir = posixpath.dirname(importer_rel)
    package_parts = importer_dir.split("/") if importer_dir else []
    # level 1 = importer's own package; each extra dot climbs one more.
    keep = len(package_parts) - (level - 1)
    if keep < 0:
        return None
    return _first_existing(["/".join(package_parts[:keep] + segments)], known)
558
+
559
+
560
+ def _first_existing(stems, known: set[str], suffix_ok: bool = False):
561
+ for stem in stems:
562
+ cands = [stem + ".py", stem + "/__init__.py"]
563
+ for c in cands:
564
+ if c in known:
565
+ return c
566
+ if suffix_ok:
567
+ for c in cands:
568
+ tail = "/" + c
569
+ hit = next((k for k in known if k.endswith(tail)), None)
570
+ if hit:
571
+ return hit
572
+ return None
573
+
574
+
575
+ _JS_EXTS = [".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs"]
576
+
577
+
578
+ def _resolve_fs(target: str, known: set[str]):
579
+ """Resolve a repo-relative module path (no extension) to a real file, trying
580
+ JS/TS extensions and an index file in a directory."""
581
+ cands = [target] + [target + e for e in _JS_EXTS]
582
+ cands += [target + "/index" + e for e in _JS_EXTS]
583
+ for c in cands:
584
+ if c in known:
585
+ return c
586
+ return None
587
+
588
+
589
def _resolve_js(source, importer_rel, known: set[str]):
    """Resolve a relative JS/TS specifier against the importer's directory.

    Returns None for anything not starting with '.'; aliases are handled by the
    caller.
    """
    if source.startswith("."):
        joined = posixpath.join(posixpath.dirname(importer_rel), source)
        return _resolve_fs(posixpath.normpath(joined), known)
    return None  # relative imports only; aliases handled separately
595
+
596
+
597
+ def _resolve_dart(uri: str, importer_rel: str, known: set[str], pkg_roots: dict):
598
+ """Resolve a Dart import URI to a repo-relative file.
599
+ 'dart:...' -> SDK, no edge.
600
+ 'package:p/sub' -> p's pubspec dir + '/lib/' + sub, when p is a local package.
601
+ anything else -> a path relative to the importer. Dart treats a bare
602
+ 'src/x.dart' as relative (unlike a JS bare specifier,
603
+ which means node_modules)."""
604
+ if uri.startswith("dart:"):
605
+ return None
606
+ if uri.startswith("package:"):
607
+ pkg, _, sub = uri[len("package:"):].partition("/")
608
+ base = pkg_roots.get(pkg)
609
+ if base is None or not sub:
610
+ return None
611
+ target = posixpath.normpath(posixpath.join(base, "lib", sub))
612
+ return target if target in known else None
613
+ importer_dir = posixpath.dirname(importer_rel)
614
+ target = posixpath.normpath(posixpath.join(importer_dir, uri))
615
+ return target if target in known else None
616
+
617
+
618
def resolve_imports(lang, imports, importer_rel, known: set[str], alias_configs=None, dart_pkgs=None) -> dict:
    """Resolve raw imports to a {dst_path: kind} map.

    kind is 'type' only when EVERY import resolving to that target is
    type-only; a single runtime import makes the edge 'import', since the
    target is then loaded at runtime. Self-edges and unresolved imports are
    dropped. Dart edges are always 'import'.
    """
    family = _base_lang(lang)
    if family == "dart":
        dart_edges = {}
        pkg_roots = dart_pkgs or {}
        for uri in imports:
            target = _resolve_dart(uri, importer_rel, known, pkg_roots)
            if target and target != importer_rel:
                dart_edges[target] = "import"
        return dart_edges

    runtime, type_only = set(), set()
    for entry in imports:
        is_type = False
        if family == "python":
            level, module = entry
            target = _resolve_python(level, module, importer_rel, known)
        else:
            source, is_type = entry
            if source.startswith("."):
                target = _resolve_js(source, importer_rel, known)
            else:
                # bare import: try tsconfig/jsconfig path aliases (@/..., ~/..., etc.)
                target = None
                for expanded in tsconfig.expand(source, importer_rel, alias_configs or []):
                    target = _resolve_fs(expanded, known)
                    if target:
                        break
        if not target or target == importer_rel:
            continue
        bucket = type_only if is_type else runtime
        bucket.add(target)

    edges = {dst: "import" for dst in runtime}
    for dst in type_only:
        # A target also imported as a value is already 'import' and stays so.
        edges.setdefault(dst, "type")
    return edges
653
+
654
+
655
+ # --- framework entrypoints (loaded by tooling, not by import) ----------------
656
+
657
# A source-file token inside a package.json script command, e.g. the
# `src/database/seeder.ts` in `ts-node ... src/database/seeder.ts`.
_SCRIPT_FILE_RE = re.compile(r"[\w./@-]+\.(?:ts|tsx|js|jsx|mjs|cjs)\b")


def script_entrypoints(config, known: set[str]) -> set[str]:
    """Repo-relative source files invoked by a package.json `scripts` command.

    These are run by tooling (`npm run seed` -> `ts-node src/database/seeder.ts`),
    not imported by other code, so the dependency graph never sees an edge into
    them — without this they masquerade as dead code in orphans().

    Args:
        config: object whose ``root`` is the repository root path.
        known: repo-relative paths of all indexed files.

    Returns:
        The subset of ``known`` named by some script command, each token being
        resolved relative to the directory of its package.json. Unreadable or
        malformed package.json files are skipped rather than raising.
    """
    out: set[str] = set()
    for rel in known:
        if posixpath.basename(rel) != "package.json":
            continue
        try:
            data = json.loads((config.root / rel).read_text(encoding="utf-8", errors="ignore"))
        except (OSError, ValueError, RecursionError):
            # Unreadable file, invalid JSON, or pathologically nested JSON.
            continue
        # Valid JSON need not be an object; `[]` or `"x"` has no `.get`.
        if not isinstance(data, dict):
            continue
        scripts = data.get("scripts")
        if not isinstance(scripts, dict):
            continue
        pkg_dir = posixpath.dirname(rel)
        for cmd in scripts.values():
            if not isinstance(cmd, str):
                continue
            for token in _SCRIPT_FILE_RE.findall(cmd):
                target = posixpath.normpath(posixpath.join(pkg_dir, token))
                if target in known:
                    out.add(target)
    return out
688
+
689
+
690
# The value of a pubspec.yaml top-level `name:` key (the Dart package name). A
# regex is used instead of a YAML parser because only this one scalar is needed.
_DART_PKG_NAME_RE = re.compile(r"(?m)^name:[ \t]*['\"]?([A-Za-z_][A-Za-z0-9_]*)")


def dart_package_roots(config, known: set[str]) -> dict:
    """Map each Dart package name to the repo-relative dir holding its pubspec.yaml.

    This lets `package:<name>/x.dart` imports resolve to `<dir>/lib/x.dart`. A
    Flutter monorepo/polyrepo can declare several packages, hence a map rather
    than one name. A pubspec.yaml that cannot be read, or has no matching
    `name:` line, contributes nothing.
    """
    roots: dict[str, str] = {}
    pubspecs = (rel for rel in known if posixpath.basename(rel) == "pubspec.yaml")
    for rel in pubspecs:
        try:
            content = (config.root / rel).read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        found = _DART_PKG_NAME_RE.search(content)
        if found is None:
            continue
        roots[found.group(1)] = posixpath.dirname(rel)
    return roots
711
+
712
+
713
+ # --- reindex (apply changes to the DB) --------------------------------------
714
+
715
def _index_one(config, conn, rel, file_hash_val, known, alias_configs, stamp, src=None, dart_pkgs=None):
    """Index a single file: store its row, symbols, dependency edges, calls and refs.

    Args:
        config: provides ``root`` and ``lang_for(rel)``.
        conn: database connection handed to the ``db`` helpers.
        rel: repo-relative path of the file.
        file_hash_val: content hash recorded for the file.
        known: set of repo-relative paths that imports may resolve to.
        alias_configs: tsconfig/jsconfig alias configs passed to resolve_imports.
        stamp: timestamp string recorded with the file row.
        src: file content as bytes if the caller already read it; read from
            disk when None.
        dart_pkgs: Dart package-name -> directory map passed to resolve_imports.

    The file is skipped silently (nothing written) if it cannot be read or
    stat'ed, e.g. because it disappeared between the directory walk and now.
    """
    abs_path = config.root / rel
    try:
        if src is None:
            src = abs_path.read_bytes()
        # stat() can fail just like the read if the file vanished meanwhile.
        stat = abs_path.stat()
    except OSError:
        return
    lang = config.lang_for(rel)
    db.upsert_file(conn, rel, lang, file_hash_val, stat.st_mtime, stat.st_size, stamp)
    symbols, imports, calls, refs = ([], [], [], [])
    if lang:
        symbols, imports, calls, refs = extract(lang, src)
    db.replace_symbols(conn, rel, symbols)
    # resolve_imports returns a {dst: kind} dict, so "no edges" is an empty
    # dict too; this keeps the argument type the same for every file.
    edges = resolve_imports(lang, imports, rel, known, alias_configs, dart_pkgs) if lang else {}
    db.replace_edges(conn, rel, edges)
    db.replace_calls(conn, rel, calls)
    db.replace_refs(conn, rel, refs)
737
+
738
+
739
def reindex(config: cfg.Config, conn, paths: list[str] | None = None, force: bool = False) -> dict:
    """Bring the index up to date with disk and return summary counts.

    Only new/changed files are re-extracted, unless ``force`` is True, in which
    case every file on disk is re-extracted (useful after an extractor upgrade;
    per the original author, summaries/notes/embeddings are preserved). When
    ``paths`` is given, extraction is limited to targets listed in it. Files
    missing from disk are always forgotten.
    """
    disk = disk_state(config)
    known = set(disk)
    delta = diff(conn, disk)
    alias_configs = tsconfig.load_alias_configs(config)
    dart_pkgs = dart_package_roots(config, known)

    if force:
        targets = sorted(known)
    else:
        targets = delta["new"] + delta["changed"]
    if paths is not None:
        wanted = set(paths)
        targets = [rel for rel in targets if rel in wanted]

    stamp = now_iso()
    for rel in targets:
        _index_one(config, conn, rel, disk[rel], known, alias_configs, stamp, dart_pkgs=dart_pkgs)

    for rel in delta["deleted"]:
        db.forget_file(conn, rel)

    # Record package.json script entrypoints so orphans() doesn't flag them as
    # dead. The tree was already walked above, so only package.json files are
    # re-read here.
    entrypoints_json = json.dumps(sorted(script_entrypoints(config, known)))
    conn.execute(
        "INSERT INTO meta(key,value) VALUES('script_entrypoints',?) "
        "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
        (entrypoints_json,),
    )
    conn.execute(
        "INSERT INTO meta(key,value) VALUES('last_reindex',?) "
        "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
        (stamp,),
    )
    conn.commit()

    summary = {"indexed": len(targets)}
    for bucket in ("new", "changed", "deleted"):
        summary[bucket] = len(delta[bucket])
    summary["total_files"] = len(known)
    return summary
781
+
782
+
783
def reindex_paths(config: cfg.Config, conn, rels: list[str]) -> dict:
    """Incrementally reindex specific files WITHOUT walking/hashing the whole tree.

    Used by the post-edit hook so a single save stays cheap on large monorepos.
    Edges resolve against the already-indexed file set plus the given files
    that exist on disk.

    Args:
        config: provides ``root`` and ``is_ignored``.
        conn: database connection.
        rels: repo-relative paths to refresh.

    Returns:
        ``{"touched": <files re-indexed or forgotten>, "files": len(rels)}``.
    """
    alias_configs = tsconfig.load_alias_configs(config)
    known = set(db.stored_hashes(conn))
    # Only files that still exist may be import targets. A path in this batch
    # that was deleted is dropped from the set, so files indexed in the same
    # batch cannot resolve an edge to something forgotten below.
    for rel in rels:
        if (config.root / rel).exists():
            known.add(rel)
        else:
            known.discard(rel)
    dart_pkgs = dart_package_roots(config, known)
    stamp = now_iso()
    touched = 0
    for rel in rels:
        abs_path = config.root / rel
        if not abs_path.exists():
            db.forget_file(conn, rel)
            touched += 1
            continue
        if config.is_ignored(abs_path):
            continue
        try:
            src = abs_path.read_bytes()
        except OSError:
            continue
        h = hashlib.sha1(src).hexdigest()
        prev = conn.execute("SELECT hash FROM files WHERE path=?", (rel,)).fetchone()
        if prev and prev["hash"] == h:
            continue  # content unchanged since it was last indexed
        _index_one(config, conn, rel, h, known, alias_configs, stamp, src=src, dart_pkgs=dart_pkgs)
        touched += 1
    # Editing a package.json can change which files are script entrypoints; refresh
    # the cached set so orphans() stays accurate without needing a full reindex.
    if any(posixpath.basename(r) == "package.json" for r in rels):
        conn.execute(
            "INSERT INTO meta(key,value) VALUES('script_entrypoints',?) "
            "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
            (json.dumps(sorted(script_entrypoints(config, known))),),
        )
    conn.commit()
    return {"touched": touched, "files": len(rels)}
821
+
822
+
823
+ def _to_rel(config, arg: str) -> str | None:
824
+ p = Path(arg)
825
+ if p.is_absolute():
826
+ try:
827
+ return p.resolve().relative_to(config.root.resolve()).as_posix()
828
+ except ValueError:
829
+ return None
830
+ return posixpath.normpath(arg)
831
+
832
+
833
def main():
    """`cerebro-index` entry point.

    With file arguments an incremental update of just those files is done;
    with none, a full reindex (``--force`` re-extracts every file). The result
    summary is printed as indented JSON.
    """
    import json
    import sys

    config = cfg.Config.load()
    conn = db.connect(config.db_path)
    raw_args = sys.argv[1:]
    force = "--force" in raw_args
    file_args = [arg for arg in raw_args if arg != "--force"]
    if not file_args:
        result = reindex(config, conn, force=force)
        result["mode"] = "full-force" if force else "full"
    else:
        rels = []
        for arg in file_args:
            rel = _to_rel(config, arg)
            if rel:
                rels.append(rel)
        result = reindex_paths(config, conn, rels)
        result["mode"] = "incremental"
    result["root"] = str(config.root)
    print(json.dumps(result, indent=2))


if __name__ == "__main__":
    main()