agentforge-graph 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. agentforge_graph/__init__.py +6 -0
  2. agentforge_graph/chunking/__init__.py +12 -0
  3. agentforge_graph/chunking/cast.py +159 -0
  4. agentforge_graph/chunking/chunk.py +19 -0
  5. agentforge_graph/chunking/tokens.py +15 -0
  6. agentforge_graph/cli.py +607 -0
  7. agentforge_graph/config.py +259 -0
  8. agentforge_graph/core/__init__.py +54 -0
  9. agentforge_graph/core/conformance.py +270 -0
  10. agentforge_graph/core/contracts.py +163 -0
  11. agentforge_graph/core/kinds.py +68 -0
  12. agentforge_graph/core/models.py +134 -0
  13. agentforge_graph/core/provenance.py +62 -0
  14. agentforge_graph/core/symbols.py +116 -0
  15. agentforge_graph/embed/__init__.py +28 -0
  16. agentforge_graph/embed/base.py +22 -0
  17. agentforge_graph/embed/bedrock.py +85 -0
  18. agentforge_graph/embed/fake.py +34 -0
  19. agentforge_graph/embed/openai.py +67 -0
  20. agentforge_graph/embed/pipeline.py +184 -0
  21. agentforge_graph/embed/registry.py +66 -0
  22. agentforge_graph/embed/report.py +15 -0
  23. agentforge_graph/enrich/__init__.py +70 -0
  24. agentforge_graph/enrich/anthropic.py +38 -0
  25. agentforge_graph/enrich/anthropic_client.py +109 -0
  26. agentforge_graph/enrich/bedrock.py +24 -0
  27. agentforge_graph/enrich/bedrock_client.py +115 -0
  28. agentforge_graph/enrich/bedrock_summarizer.py +23 -0
  29. agentforge_graph/enrich/claude.py +172 -0
  30. agentforge_graph/enrich/enricher.py +108 -0
  31. agentforge_graph/enrich/governs.py +173 -0
  32. agentforge_graph/enrich/governs_enricher.py +152 -0
  33. agentforge_graph/enrich/heuristics.py +224 -0
  34. agentforge_graph/enrich/judge.py +63 -0
  35. agentforge_graph/enrich/registry.py +133 -0
  36. agentforge_graph/enrich/report.py +60 -0
  37. agentforge_graph/enrich/summarizer.py +62 -0
  38. agentforge_graph/enrich/summary_enricher.py +211 -0
  39. agentforge_graph/enrich/taxonomy.py +38 -0
  40. agentforge_graph/frameworks/__init__.py +29 -0
  41. agentforge_graph/frameworks/base.py +75 -0
  42. agentforge_graph/frameworks/detect.py +124 -0
  43. agentforge_graph/frameworks/extractor.py +63 -0
  44. agentforge_graph/frameworks/orm.py +93 -0
  45. agentforge_graph/frameworks/packs/_js_ast.py +56 -0
  46. agentforge_graph/frameworks/packs/_python_ast.py +157 -0
  47. agentforge_graph/frameworks/packs/django/__init__.py +240 -0
  48. agentforge_graph/frameworks/packs/django/models.scm +7 -0
  49. agentforge_graph/frameworks/packs/express/__init__.py +133 -0
  50. agentforge_graph/frameworks/packs/express/routes.scm +8 -0
  51. agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
  52. agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
  53. agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
  54. agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
  55. agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
  56. agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
  57. agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
  58. agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
  59. agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
  60. agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
  61. agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
  62. agentforge_graph/frameworks/registry.py +44 -0
  63. agentforge_graph/ingest/__init__.py +30 -0
  64. agentforge_graph/ingest/codegraph.py +847 -0
  65. agentforge_graph/ingest/extractor.py +353 -0
  66. agentforge_graph/ingest/incremental/__init__.py +25 -0
  67. agentforge_graph/ingest/incremental/detect.py +118 -0
  68. agentforge_graph/ingest/incremental/dirty.py +61 -0
  69. agentforge_graph/ingest/incremental/indexer.py +218 -0
  70. agentforge_graph/ingest/incremental/meta.py +72 -0
  71. agentforge_graph/ingest/incremental/ports.py +39 -0
  72. agentforge_graph/ingest/pack.py +160 -0
  73. agentforge_graph/ingest/packs/__init__.py +34 -0
  74. agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
  75. agentforge_graph/ingest/packs/cpp/references.scm +15 -0
  76. agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
  77. agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
  78. agentforge_graph/ingest/packs/csharp/references.scm +12 -0
  79. agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
  80. agentforge_graph/ingest/packs/go/__init__.py +38 -0
  81. agentforge_graph/ingest/packs/go/references.scm +12 -0
  82. agentforge_graph/ingest/packs/go/structure.scm +64 -0
  83. agentforge_graph/ingest/packs/java/__init__.py +35 -0
  84. agentforge_graph/ingest/packs/java/references.scm +12 -0
  85. agentforge_graph/ingest/packs/java/structure.scm +38 -0
  86. agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
  87. agentforge_graph/ingest/packs/javascript/references.scm +11 -0
  88. agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
  89. agentforge_graph/ingest/packs/php/__init__.py +35 -0
  90. agentforge_graph/ingest/packs/php/references.scm +15 -0
  91. agentforge_graph/ingest/packs/php/structure.scm +44 -0
  92. agentforge_graph/ingest/packs/python/__init__.py +25 -0
  93. agentforge_graph/ingest/packs/python/references.scm +14 -0
  94. agentforge_graph/ingest/packs/python/structure.scm +57 -0
  95. agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
  96. agentforge_graph/ingest/packs/ruby/references.scm +12 -0
  97. agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
  98. agentforge_graph/ingest/packs/rust/__init__.py +39 -0
  99. agentforge_graph/ingest/packs/rust/references.scm +12 -0
  100. agentforge_graph/ingest/packs/rust/structure.scm +46 -0
  101. agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
  102. agentforge_graph/ingest/packs/typescript/references.scm +11 -0
  103. agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
  104. agentforge_graph/ingest/pipeline.py +134 -0
  105. agentforge_graph/ingest/report.py +84 -0
  106. agentforge_graph/ingest/resolver.py +467 -0
  107. agentforge_graph/ingest/source.py +79 -0
  108. agentforge_graph/knowledge/__init__.py +28 -0
  109. agentforge_graph/knowledge/adr.py +136 -0
  110. agentforge_graph/knowledge/commits.py +152 -0
  111. agentforge_graph/knowledge/ingest.py +312 -0
  112. agentforge_graph/knowledge/mentions.py +71 -0
  113. agentforge_graph/knowledge/report.py +32 -0
  114. agentforge_graph/main.py +21 -0
  115. agentforge_graph/providers.py +36 -0
  116. agentforge_graph/repomap/__init__.py +14 -0
  117. agentforge_graph/repomap/rank.py +161 -0
  118. agentforge_graph/repomap/render.py +55 -0
  119. agentforge_graph/repomap/repomap.py +66 -0
  120. agentforge_graph/retrieve/__init__.py +21 -0
  121. agentforge_graph/retrieve/pack.py +76 -0
  122. agentforge_graph/retrieve/rerank.py +251 -0
  123. agentforge_graph/retrieve/retriever.py +286 -0
  124. agentforge_graph/retrieve/scoring.py +36 -0
  125. agentforge_graph/serve/__init__.py +19 -0
  126. agentforge_graph/serve/engine.py +204 -0
  127. agentforge_graph/serve/http_runner.py +133 -0
  128. agentforge_graph/serve/server.py +110 -0
  129. agentforge_graph/serve/tools.py +307 -0
  130. agentforge_graph/store/__init__.py +32 -0
  131. agentforge_graph/store/_rowmap.py +102 -0
  132. agentforge_graph/store/errors.py +22 -0
  133. agentforge_graph/store/facade.py +89 -0
  134. agentforge_graph/store/kuzu_store.py +380 -0
  135. agentforge_graph/store/lance_store.py +146 -0
  136. agentforge_graph/store/neo4j_store.py +294 -0
  137. agentforge_graph/store/pgvector_store.py +170 -0
  138. agentforge_graph/store/registry.py +45 -0
  139. agentforge_graph/temporal/__init__.py +36 -0
  140. agentforge_graph/temporal/backfill.py +338 -0
  141. agentforge_graph/temporal/events.py +82 -0
  142. agentforge_graph/temporal/index.py +190 -0
  143. agentforge_graph/temporal/mining.py +190 -0
  144. agentforge_graph/temporal/recorder.py +114 -0
  145. agentforge_graph/temporal/store.py +282 -0
  146. agentforge_graph-0.3.2.dist-info/METADATA +291 -0
  147. agentforge_graph-0.3.2.dist-info/RECORD +151 -0
  148. agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
  149. agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
  150. agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
  151. agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
@@ -0,0 +1,353 @@
1
+ """``TreeSitterExtractor`` — pass 1 of ingestion (feat-002).
2
+
3
+ File-isolated: parses one file and emits its ``FileSubgraph`` — definition
4
+ nodes (File/Class/Function/Method) with ``CONTAINS`` edges, plus imports and
5
+ call sites recorded as node *attrs* (not edges — their targets may live in
6
+ other files, which pass 1 may not read). The graph-only resolver (pass 2)
7
+ turns those attrs into ``IMPORTS``/``CALLS`` edges.
8
+
9
+ Parsing uses the standalone ``tree_sitter`` package driven by a grammar from
10
+ ``tree-sitter-language-pack`` (``Parser(get_language(...))``, never
11
+ ``get_parser()`` — see the framework note on the ABI split).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import hashlib
17
+ import re
18
+ import textwrap
19
+ from collections import defaultdict
20
+ from dataclasses import dataclass, field
21
+ from functools import cache
22
+ from typing import Any
23
+
24
+ from tree_sitter import Language, Parser, Query, QueryCursor
25
+ from tree_sitter import Node as TSNode
26
+ from tree_sitter_language_pack import get_language
27
+
28
+ from agentforge_graph.core import (
29
+ Descriptor,
30
+ Edge,
31
+ EdgeKind,
32
+ Extractor,
33
+ FileSubgraph,
34
+ NodeKind,
35
+ Provenance,
36
+ SourceFile,
37
+ SymbolID,
38
+ )
39
+ from agentforge_graph.core import (
40
+ Node as GraphNode,
41
+ )
42
+
43
+ from .pack import LanguagePack
44
+
45
# Definition kinds that get an overload disambiguator and a method-style descriptor.
_CALLABLE = {NodeKind.FUNCTION, NodeKind.METHOD}
# Definition kinds whose directly nested functions are re-kinded as methods.
_METHOD_OWNERS = {NodeKind.CLASS, NodeKind.INTERFACE}
47
+
48
+
49
@cache
def _language(grammar: str) -> Language:
    """Return the tree-sitter ``Language`` for ``grammar``.

    Wrapped in ``functools.cache`` so each grammar name is looked up at most
    once per process, however many extractors are constructed.
    """
    language = get_language(grammar)
    return language
52
+
53
+
54
@dataclass
class _Def:
    """A captured definition, pre-symbol-id.

    Created by the structure pass with only ``ts_id``/``node``/``kind``/``name``
    set; the remaining fields are filled in afterwards (scope linking, symbol-id
    assignment, and the per-definition side tables for bases, receivers and
    docstrings).
    """

    ts_id: int  # the tree-sitter node's id, used as the lookup key throughout
    node: TSNode  # the tree-sitter node captured as the definition
    kind: NodeKind  # may be promoted from FUNCTION to METHOD during scope linking
    name: str  # source text of the definition's `name` capture
    enclosing: int | None = None  # ts id of the nearest enclosing def
    symbol_id: str = ""  # empty until symbol ids are assigned
    bases: list[str] = field(default_factory=list)  # superclass names (INHERITS)
    recv_var: str = ""  # Go: a method's receiver variable name (`s` in `func (s *T)`)
    recv_type: str = ""  # Go: a method's receiver type name (`T`)
    docstring: str = ""  # the symbol's docstring, cleaned (DESCRIBES, feat-010)
68
+
69
+
70
+ def _text(node: TSNode, src: bytes) -> str:
71
+ return src[node.start_byte : node.end_byte].decode("utf-8", errors="replace")
72
+
73
+
74
+ def _span(node: TSNode) -> tuple[int, int]:
75
+ return (node.start_point[0] + 1, node.end_point[0] + 1)
76
+
77
+
78
def _signature(node: TSNode, src: bytes) -> str:
    """Return the first source line of ``node`` with surrounding whitespace removed.

    For a definition node this is its header line. An empty node text yields
    an empty string.
    """
    text = _text(node, src)
    if not text:
        return ""
    header = text.splitlines()[0]
    return header.strip()
82
+
83
+
84
+ _STR_PREFIX = re.compile(r"^[rbfuRBFU]{0,2}")
85
+ _JSDOC_LINE = re.compile(r"^\s*\*?\s?") # a JSDoc/Javadoc line's leading ` * `
86
+
87
+
88
+ def _clean_docstring(raw: str) -> str:
89
+ """The docstring body: strip a Python string literal's prefix + quotes, OR a
90
+ ``/** … */`` JSDoc/Javadoc comment's markers + per-line ``*``; then dedent."""
91
+ s = raw.strip()
92
+ if s.startswith("/*"): # JSDoc / Javadoc block comment
93
+ s = s[2:]
94
+ if s.startswith("*"): # the second `*` of `/**`
95
+ s = s[1:]
96
+ if s.endswith("*/"):
97
+ s = s[:-2]
98
+ return "\n".join(_JSDOC_LINE.sub("", ln) for ln in s.splitlines()).strip()
99
+ s = _STR_PREFIX.sub("", s, count=1)
100
+ for q in ('"""', "'''", '"', "'"):
101
+ if s.startswith(q) and s.endswith(q) and len(s) >= 2 * len(q):
102
+ s = s[len(q) : -len(q)]
103
+ break
104
+ return textwrap.dedent(s).strip()
105
+
106
+
107
class TreeSitterExtractor(Extractor):
    """Extracts a ``FileSubgraph`` from one source file, in isolation.

    One instance is bound to a single ``LanguagePack``: the grammar, the
    structure query and the reference query are compiled once in ``__init__``
    and reused for every ``extract`` call.
    """

    def __init__(self, pack: LanguagePack, repo: str, commit: str = "") -> None:
        """Bind the extractor to ``pack`` and compile its queries.

        Args:
            pack: supplies the grammar name, language slug, descriptor rules and
                the two tree-sitter query sources.
            repo: repository identifier embedded in every symbol id.
            commit: commit recorded in the provenance of emitted nodes/edges.
        """
        self.pack = pack
        self.repo = repo
        self.commit = commit
        self.name = f"tree-sitter-{pack.language}"
        self._lang = _language(pack.grammar)
        self._parser = Parser(self._lang)
        self._structure_q = Query(self._lang, pack.structure_queries)
        self._reference_q = Query(self._lang, pack.reference_queries)

    def extract(self, file: SourceFile) -> FileSubgraph:
        """Parse ``file`` and build its subgraph.

        Emits one FILE node, one node per captured definition with a
        ``CONTAINS`` edge from its enclosing definition (or from the file when
        there is none), and, for each definition with a non-empty docstring, a
        DOC_CHUNK node with a ``DESCRIBES`` edge to the symbol. Imports, the
        default export, the namespace and call references are stored as node
        attrs, not edges. Nodes and edges are sorted before being returned.
        """
        src = file.text.encode("utf-8")
        root = self._parser.parse(src).root_node
        prov = Provenance.parsed(self.name, self.commit)
        # the file's own id: a symbol id with an empty descriptor
        file_id = SymbolID.for_symbol(self.pack.lang_slug, self.repo, file.path, "")

        defs, imports, default_export, namespace = self._structure(root, src)
        # symbol ids must exist before the reference pass, which keys refs by them
        self._assign_symbol_ids(defs, file.path)
        by_tsid = {d.ts_id: d for d in defs}
        refs = self._references(root, src, by_tsid, file_id)

        nodes: list[GraphNode] = []
        # only non-empty facts are recorded, so absent keys mean "nothing found"
        file_attrs: dict[str, Any] = {}
        if imports:
            file_attrs["imports"] = imports
        if default_export:
            file_attrs["default_export"] = default_export
        if namespace:
            file_attrs["namespace"] = namespace  # PHP/Java/C# package (FQN resolution)
        if file_id in refs:
            # calls made outside any captured definition are owned by the file
            file_attrs["refs"] = refs[file_id]
        nodes.append(
            GraphNode(
                id=file_id,
                kind=NodeKind.FILE,
                name=file.path.rsplit("/", 1)[-1],
                provenance=prov,
                attrs=file_attrs,
            )
        )

        edges: list[Edge] = []
        for d in defs:
            attrs: dict[str, Any] = {"signature": _signature(d.node, src)}
            if d.symbol_id in refs:
                attrs["refs"] = refs[d.symbol_id]
            if d.bases:  # INHERITS: superclass names, resolved in pass 2
                attrs["bases"] = d.bases
            if d.recv_var:  # Go: receiver var/type, for receiver self-calls (pass 2)
                attrs["recv_var"] = d.recv_var
                attrs["recv_type"] = d.recv_type
            nodes.append(
                GraphNode(
                    id=d.symbol_id,
                    kind=d.kind,
                    name=d.name,
                    span=_span(d.node),
                    provenance=prov,
                    attrs=attrs,
                )
            )
            # parent is the nearest enclosing captured definition, else the file
            parent_id = by_tsid[d.enclosing].symbol_id if d.enclosing in by_tsid else file_id
            edges.append(
                Edge(src=parent_id, dst=d.symbol_id, kind=EdgeKind.CONTAINS, provenance=prov)
            )
            # docstring -> a DocChunk that DESCRIBES the symbol (feat-010), so the
            # docstring prose is embeddable + searchable, attached to its symbol.
            if d.docstring:
                # the chunk's id is the symbol's descriptor with "docstring." appended
                desc = SymbolID.parse(d.symbol_id).descriptor + "docstring."
                doc_id = SymbolID.for_symbol(self.pack.lang_slug, self.repo, file.path, desc)
                nodes.append(
                    GraphNode(
                        id=doc_id,
                        kind=NodeKind.DOC_CHUNK,
                        name=d.name,
                        provenance=prov,
                        attrs={
                            "path": file.path,
                            "heading": d.name,
                            "text": d.docstring,
                            "describes": d.symbol_id,
                            "content_hash": hashlib.sha256(d.docstring.encode()).hexdigest(),
                        },
                    )
                )
                edges.append(
                    Edge(src=doc_id, dst=d.symbol_id, kind=EdgeKind.DESCRIBES, provenance=prov)
                )

        # nodes with a falsy span sort as (0, 0); the id breaks ties
        # NOTE(review): the FILE and DOC_CHUNK nodes are built without a span —
        # presumably it defaults to None; confirm against the Node model.
        nodes.sort(key=lambda n: (n.span or (0, 0), n.id))
        edges.sort(key=lambda e: (e.src, e.dst, e.kind.value))
        return FileSubgraph(
            path=file.path, content_hash=file.content_hash, nodes=nodes, edges=edges
        )

    # --- structure pass -------------------------------------------------

    def _structure(
        self, root: TSNode, src: bytes
    ) -> tuple[list[_Def], list[dict[str, Any]], str, str]:
        """Run the structure query and collect everything it captures.

        Each query match is classified by the capture names it contains, first
        hit wins: a ``def.*`` capture, then ``base.name``, ``recv.var``,
        ``docstring``, ``import``, ``namespace``, ``export``. Bases, receivers
        and docstrings are gathered into side tables keyed by tree-sitter node
        id and attached to their definitions after the loop, so the order in
        which matches arrive does not matter.

        Returns:
            ``(defs, imports, default_export, namespace)``; the two strings are
            empty when nothing was captured. ``defs`` already have their
            ``enclosing`` links (and method promotion) applied.
        """
        defs: list[_Def] = []
        imports: list[dict[str, Any]] = []
        default_export = ""  # CommonJS `module.exports = <name>` (BUG-006)
        namespace = ""  # PHP/Java/C# package declaration (FQN import resolution)
        class_bases: dict[int, list[str]] = defaultdict(list)  # class node id -> base names
        method_recv: dict[int, tuple[str, str]] = {}  # method node id -> (recv var, recv type)
        docstrings: dict[int, str] = {}  # def node id -> cleaned docstring (DESCRIBES)
        rules = self.pack.descriptor_rules
        for _pattern, caps in QueryCursor(self._structure_q).matches(root):
            def_cap = next((c for c in caps if c.startswith("def.")), None)
            if def_cap is not None:
                kind = rules.kind_for(def_cap)
                names = caps.get("name")
                # skip captures the pack has no kind for, and anonymous definitions
                if kind is None or not names:
                    continue
                node = caps[def_cap][0]
                defs.append(_Def(ts_id=node.id, node=node, kind=kind, name=_text(names[0], src)))
            elif "base.name" in caps:
                # a base class of a class definition (INHERITS); one match per base
                cls = caps.get("base.def")
                if cls:
                    class_bases[cls[0].id].extend(_text(b, src) for b in caps["base.name"])
            elif "recv.var" in caps:
                # Go: a method's receiver `(s *T)` — bind the var name + type
                meth, rvar, rtype = caps.get("recv.method"), caps["recv.var"], caps.get("recv.type")
                if meth and rtype:
                    method_recv[meth[0].id] = (_text(rvar[0], src), _text(rtype[0], src))
            elif "docstring" in caps:
                # a def/class docstring or JSDoc comment — DESCRIBES the symbol
                owner = caps.get("doc.owner")
                if owner:
                    docstrings[owner[0].id] = _clean_docstring(_text(caps["docstring"][0], src))
            elif "import" in caps:
                mods = caps.get("import.module", [])
                dflt = caps.get("import.default")
                imports.append(
                    {
                        "module": _text(mods[0], src) if mods else "",
                        "names": [_text(n, src) for n in caps.get("import.name", [])],
                        # CommonJS default require binding: `const x = require(...)`
                        "default": _text(dflt[0], src) if dflt else "",
                        "line": caps["import"][0].start_point[0] + 1,
                    }
                )
            elif "namespace" in caps:
                ns = caps.get("namespace")
                if ns:
                    # a later namespace match overwrites an earlier one
                    namespace = _text(ns[0], src)
            elif "export" in caps:
                ed = caps.get("export.default")
                if ed:
                    default_export = _text(ed[0], src)
        # attach the side tables now that every definition is known
        for d in defs:
            if d.ts_id in class_bases:
                d.bases = class_bases[d.ts_id]
            if d.ts_id in method_recv:
                d.recv_var, d.recv_type = method_recv[d.ts_id]
            if d.ts_id in docstrings:
                d.docstring = docstrings[d.ts_id]
        self._link_scopes(defs)
        return defs, imports, default_export, namespace

    def _link_scopes(self, defs: list[_Def]) -> None:
        """Set each definition's ``enclosing`` link, in place.

        Walks up the syntax tree from each definition to the first ancestor that
        is itself a captured definition (``None`` when there is none). A
        FUNCTION whose nearest enclosing definition is a class or interface is
        re-kinded as a METHOD.
        """
        idset = {d.ts_id for d in defs}
        by_tsid = {d.ts_id: d for d in defs}
        for d in defs:
            anc = d.node.parent
            while anc is not None and anc.id not in idset:
                anc = anc.parent
            d.enclosing = anc.id if anc is not None else None
            # a function whose nearest enclosing def is a class is a method
            if (
                d.kind is NodeKind.FUNCTION
                and d.enclosing is not None
                and by_tsid[d.enclosing].kind in _METHOD_OWNERS
            ):
                d.kind = NodeKind.METHOD

    def _assign_symbol_ids(self, defs: list[_Def], path: str) -> None:
        """Compute and store ``symbol_id`` on every definition, in place.

        A definition's descriptor is the concatenation of the suffixes of its
        enclosing chain, outermost first, ending with its own. Callables also
        carry a disambiguator: their 0-based position among same-named
        callables in the same enclosing scope, in source order.
        """
        by_tsid = {d.ts_id: d for d in defs}
        # overload disambiguator: nth same-named callable in the same scope (source order)
        counter: dict[tuple[int | None, str], int] = defaultdict(int)
        disamb: dict[int, int] = {}
        for d in sorted(defs, key=lambda d: d.node.start_byte):
            if d.kind in _CALLABLE:
                key = (d.enclosing, d.name)
                disamb[d.ts_id] = counter[key]
                counter[key] += 1
        for d in defs:
            # collect d and its enclosing definitions, innermost first
            chain: list[_Def] = []
            cur: _Def | None = d
            while cur is not None:
                chain.append(cur)
                cur = by_tsid.get(cur.enclosing) if cur.enclosing is not None else None
            chain.reverse()  # outermost first, matching descriptor order
            # non-callables have no disambiguator entry and pass 0
            descriptor = "".join(self._suffix(x, disamb.get(x.ts_id, 0)) for x in chain)
            d.symbol_id = SymbolID.for_symbol(self.pack.lang_slug, self.repo, path, descriptor)

    @staticmethod
    def _suffix(d: _Def, disambiguator: int) -> str:
        """Return the descriptor fragment for one definition in a chain.

        Classes and interfaces use the type form, functions and methods the
        method form (with ``disambiguator``), and every other kind the term form.
        """
        if d.kind in (NodeKind.CLASS, NodeKind.INTERFACE):
            return Descriptor.type(d.name)
        if d.kind in _CALLABLE:
            return Descriptor.method(d.name, disambiguator)
        return Descriptor.term(d.name)

    # --- reference pass -------------------------------------------------

    def _references(
        self, root: TSNode, src: bytes, by_tsid: dict[int, _Def], file_id: str
    ) -> dict[str, list[dict[str, Any]]]:
        """Run the reference query and group call sites by owning symbol.

        Args:
            root: the parsed file's root node.
            src: the encoded file, for slicing capture text.
            by_tsid: captured definitions by tree-sitter node id; their
                ``symbol_id`` must already be assigned.
            file_id: owner used for calls outside every captured definition.

        Returns:
            ``{owner symbol id: [ref, ...]}`` where each ref has ``name`` and
            ``line`` (1-based) and, when a receiver was captured, ``recv``.
            Owners with no calls are absent.
        """
        idset = set(by_tsid)
        # Keyed by the call node so a bare + receiver-capturing pattern that both
        # match the same call (Java/Ruby, where one node type serves `f()` and
        # `recv.f()`) yield ONE ref — the receiver merged in. Distinct-node-type
        # grammars (Py/TS/JS/C#/Rust/PHP/C++) never collide, so this is a no-op
        # for them; insertion order preserves source order.
        owner_of: dict[int, str] = {}
        ref_of: dict[int, dict[str, Any]] = {}
        for _pattern, caps in QueryCursor(self._reference_q).matches(root):
            if "call" not in caps:
                continue
            callees = caps.get("call.callee")
            if not callees:
                continue
            call_node = caps["call"][0]
            ref = ref_of.get(call_node.id)
            if ref is None:
                # first sighting of this call: find its owner and create the ref
                anc = call_node.parent
                while anc is not None and anc.id not in idset:
                    anc = anc.parent
                owner_of[call_node.id] = by_tsid[anc.id].symbol_id if anc is not None else file_id
                ref = {"name": _text(callees[0], src), "line": call_node.start_point[0] + 1}
                ref_of[call_node.id] = ref
            # BUG-006: the receiver of an attribute call (`recv.f()`), when the pack
            # captures it — lets the resolver bind `self.f()`/`this.f()` to the
            # enclosing class's method and refuse to guess for other receivers.
            recv = caps.get("call.recv")
            # the first receiver captured for a call wins; later matches don't overwrite
            if recv and "recv" not in ref:
                ref["recv"] = _text(recv[0], src)
        refs: dict[str, list[dict[str, Any]]] = defaultdict(list)
        for cid, ref in ref_of.items():
            refs[owner_of[cid]].append(ref)
        # hand back a plain dict so lookups of unknown owners don't create entries
        return dict(refs)
@@ -0,0 +1,25 @@
1
+ """Incremental indexing (feat-004): re-index only the diff.
2
+
3
+ A thin coordination layer over the feat-002/003 primitives —
4
+ ``ChangeDetector`` diffs the working tree against the ``IndexMeta`` manifest,
5
+ ``IncrementalIndexer`` applies the resulting ``ChangeSet`` (delete → re-extract
6
+ → scoped re-resolve), and ``DirtySet`` records what each enricher must redo.
7
+ Zero ``agentforge`` imports (ADR-0001).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from .detect import ChangeDetector, ChangeSet, DetectResult
13
+ from .dirty import DirtySet
14
+ from .indexer import IncrementalIndexer
15
+ from .meta import IndexMeta, pack_fingerprint
16
+
17
+ __all__ = [
18
+ "ChangeDetector",
19
+ "ChangeSet",
20
+ "DetectResult",
21
+ "DirtySet",
22
+ "IncrementalIndexer",
23
+ "IndexMeta",
24
+ "pack_fingerprint",
25
+ ]
@@ -0,0 +1,118 @@
1
+ """``ChangeDetector`` — diff the working tree against the indexed manifest.
2
+
3
+ The **content hash is the source of truth**: we walk the working tree once,
4
+ hash every indexable file, and diff that against ``IndexMeta.files``. This is
5
+ correct regardless of git state (dirty working tree, shallow clone, detached
6
+ HEAD, rebase) and naturally catches uncommitted edits — the common case for an
7
+ agent mid-flight. Git is then consulted *best-effort* only to promote a
8
+ matching delete+add pair into a rename (nicer reporting); if git disagrees or
9
+ is absent, the hash diff stands and a move simply reads as delete + add
10
+ (accepted at 0.2, spec §3).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ import subprocess
17
+ from pathlib import Path
18
+
19
+ from pydantic import BaseModel, Field
20
+
21
+ from agentforge_graph.ingest.pack import PackRegistry
22
+ from agentforge_graph.ingest.source import RepoSource
23
+
24
+ from .meta import IndexMeta
25
+
26
+
27
class ChangeSet(BaseModel):
    """Files that changed since the last index, classified."""

    # Repository-relative paths, one bucket per kind of change.
    added: list[str] = Field(default_factory=list)
    modified: list[str] = Field(default_factory=list)
    deleted: list[str] = Field(default_factory=list)
    renamed: list[tuple[str, str]] = Field(default_factory=list)  # (old, new)

    def is_empty(self) -> bool:
        """True when no bucket holds any path."""
        buckets = (self.added, self.modified, self.deleted, self.renamed)
        return not any(buckets)

    def touched_paths(self) -> list[str]:
        """Files to (re)extract: added, modified, and the new side of renames."""
        paths = set(self.added)
        paths.update(self.modified)
        paths.update(new for _old, new in self.renamed)
        return sorted(paths)

    def removed_paths(self) -> list[str]:
        """Files to delete from the store: deleted, and the old side of renames."""
        paths = set(self.deleted)
        paths.update(old for old, _new in self.renamed)
        return sorted(paths)

    def changed_paths(self) -> list[str]:
        """Every path the diff touches on either side — the re-resolve seed."""
        either_side = set(self.touched_paths())
        either_side.update(self.removed_paths())
        return sorted(either_side)
49
+
50
+
51
# What ``ChangeDetector.detect`` returns: the classified diff plus the manifest
# it was computed from.
class DetectResult(BaseModel):
    changes: ChangeSet  # the diff of the working tree against the prior manifest
    file_hashes: dict[str, str]  # the fresh, full path -> content_hash manifest
54
+
55
+
56
class ChangeDetector:
    """Diffs the working tree against the indexed manifest by content hash.

    ``repo_path`` is the directory handed to ``git -C`` for the best-effort
    rename lookup; the hash diff itself never consults git.
    """

    def __init__(self, repo_path: str | Path = ".") -> None:
        self.repo_path = repo_path

    async def detect(
        self, source: RepoSource, meta: IndexMeta, registry: PackRegistry
    ) -> DetectResult:
        """Classify every indexable file as added, modified, deleted or renamed.

        Args:
            source: yields the working tree's indexable files with their hashes.
            meta: the prior index manifest (``files`` and ``indexed_commit``).
            registry: passed through to ``source.iter_files``.

        Returns:
            The ``ChangeSet`` together with the fresh path -> hash manifest.
        """
        current = await asyncio.to_thread(self._current_hashes, source, registry)
        prior = meta.files
        added = [p for p in current if p not in prior]
        modified = [p for p in current if p in prior and current[p] != prior[p]]
        deleted = [p for p in prior if p not in current]
        changes = ChangeSet(added=sorted(added), modified=sorted(modified), deleted=sorted(deleted))
        # The rename lookup shells out to git and blocks until it exits, so it
        # runs in a worker thread — the same treatment as the hash walk above —
        # instead of stalling the event loop.
        await asyncio.to_thread(self._refine_renames, changes, meta.indexed_commit)
        return DetectResult(changes=changes, file_hashes=current)

    @staticmethod
    def _current_hashes(source: RepoSource, registry: PackRegistry) -> dict[str, str]:
        """Walk ``source`` once and return ``{path: content_hash}``."""
        return {sf.path: sf.content_hash for sf in source.iter_files(registry)}

    def _refine_renames(self, changes: ChangeSet, base_commit: str) -> None:
        """Best-effort: if git reports a committed rename old->new and our hash
        diff independently saw `old` deleted and `new` added, collapse the pair
        into a rename. Purely cosmetic — the indexer treats a rename as
        delete(old)+add(new) anyway (§3), so a miss here changes nothing.

        Mutates ``changes`` in place. Does nothing when ``base_commit`` is empty.
        """
        if not base_commit:
            return
        # sets for O(1) membership; kept in step with the lists so a path that
        # git mentions twice can only be consumed once
        added = set(changes.added)
        deleted = set(changes.deleted)
        for old, new in self._git_renames(base_commit):
            if old in deleted and new in added:
                changes.renamed.append((old, new))
                changes.deleted.remove(old)
                changes.added.remove(new)
                deleted.discard(old)
                added.discard(new)

    def _git_renames(self, base_commit: str) -> list[tuple[str, str]]:
        """Return the ``(old, new)`` renames git sees between ``base_commit`` and HEAD.

        Never raises for git problems: a missing git binary, a non-repository
        directory, an unknown commit or undecodable output all yield ``[]``
        (or simply fewer pairs), because the caller treats this as optional.
        """
        if base_commit.startswith("-"):
            # a commit id never starts with '-'; refuse to let a corrupted
            # manifest value be parsed by git as a command-line option
            return []
        try:
            out = subprocess.run(
                [
                    "git",
                    # report non-ASCII paths verbatim instead of C-quoted and
                    # octal-escaped, so they compare equal to the manifest's paths
                    # (paths containing tabs, newlines or quotes are still quoted
                    # by git and will simply not match)
                    "-c",
                    "core.quotePath=false",
                    "-C",
                    str(self.repo_path),
                    "diff",
                    "--name-status",
                    "-M",
                    "--diff-filter=R",
                    base_commit,
                    "HEAD",
                ],
                capture_output=True,
                text=True,
                # decode explicitly: the locale codec could raise
                # UnicodeDecodeError, which the handler below would not catch
                encoding="utf-8",
                errors="replace",
                check=True,
            )
        except (subprocess.SubprocessError, OSError):
            return []
        pairs: list[tuple[str, str]] = []
        for line in out.stdout.splitlines():
            # rename rows look like "R<score>\t<old>\t<new>"
            parts = line.split("\t")
            if len(parts) == 3 and parts[0].startswith("R"):
                pairs.append((parts[1], parts[2]))
        return pairs
@@ -0,0 +1,61 @@
1
+ """``DirtySet`` — the one staleness API every enricher reads (feat-004).
2
+
3
+ When an incremental refresh changes a file, the symbols it touched (plus their
4
+ 1-hop neighbours) are *dirtied* for every registered consumer — ``embeddings``
5
+ now, ``summaries`` / ``pattern-tags`` / ``routes`` as feat-010/011/012 land.
6
+ Each consumer drains its own cursor at its own cadence and marks the ids clean,
7
+ so no enricher reinvents "what changed since I last ran". Persisted to
8
+ ``.ckg/dirty.json`` as ``{consumer: [symbol_id, ...]}`` — a side file, so a
9
+ consumer cursor update never rewrites the index manifest (``meta.json``).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from pathlib import Path
16
+
17
_DIRTY = "dirty.json"


class DirtySet:
    """Per-consumer lists of dirty symbol ids, persisted as JSON under ``root``.

    The on-disk shape is ``{consumer: [symbol_id, ...]}``. State is read once at
    construction and rewritten in full after every mutation.
    """

    # Known enrichment consumers: embeddings (feat-005), patterns + summaries (feat-012).
    DEFAULT_CONSUMERS = ["embeddings", "patterns", "summaries"]

    def __init__(self, root: str | Path, consumers: list[str] | None = None) -> None:
        self._path = Path(root) / _DIRTY
        # a missing or empty `consumers` falls back to the defaults
        chosen = consumers if consumers else self.DEFAULT_CONSUMERS
        self._consumers = list(chosen)
        self._state: dict[str, list[str]] = self._load()

    def _load(self) -> dict[str, list[str]]:
        """Read the persisted state, or start empty when the file is absent."""
        if self._path.exists():
            stored = json.loads(self._path.read_text())
            return {consumer: list(ids) for consumer, ids in stored.items()}
        return {}

    def _save(self) -> None:
        """Write the state to a sibling temp file, then move it over the target."""
        self._path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(self._state, indent=2, sort_keys=True)
        scratch = self._path.with_name(_DIRTY + ".tmp")
        scratch.write_text(payload)
        scratch.replace(self._path)

    async def add(self, ids: list[str]) -> None:
        """Mark ``ids`` dirty for every registered consumer.

        Ids a consumer already holds are skipped and repeats within ``ids`` are
        collapsed; new ids are appended in the order given. An empty ``ids`` is
        a no-op and does not touch the file.
        """
        if not ids:
            return
        for consumer in self._consumers:
            pending = self._state.setdefault(consumer, [])
            known = set(pending)
            # dict.fromkeys drops repeats within `ids` while keeping their order
            pending.extend(i for i in dict.fromkeys(ids) if i not in known)
        self._save()

    async def dirty_for(self, consumer: str) -> list[str]:
        """Return a copy of ``consumer``'s dirty ids (empty if it has none)."""
        return list(self._state.get(consumer, []))

    async def mark_clean(self, consumer: str, ids: list[str]) -> None:
        """Remove ``ids`` from ``consumer``'s dirty list and persist the result."""
        cleaned = set(ids)
        remaining = [i for i in self._state.get(consumer, []) if i not in cleaned]
        self._state[consumer] = remaining
        self._save()