agentforge-graph 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_graph/__init__.py +6 -0
- agentforge_graph/chunking/__init__.py +12 -0
- agentforge_graph/chunking/cast.py +159 -0
- agentforge_graph/chunking/chunk.py +19 -0
- agentforge_graph/chunking/tokens.py +15 -0
- agentforge_graph/cli.py +607 -0
- agentforge_graph/config.py +259 -0
- agentforge_graph/core/__init__.py +54 -0
- agentforge_graph/core/conformance.py +270 -0
- agentforge_graph/core/contracts.py +163 -0
- agentforge_graph/core/kinds.py +68 -0
- agentforge_graph/core/models.py +134 -0
- agentforge_graph/core/provenance.py +62 -0
- agentforge_graph/core/symbols.py +116 -0
- agentforge_graph/embed/__init__.py +28 -0
- agentforge_graph/embed/base.py +22 -0
- agentforge_graph/embed/bedrock.py +85 -0
- agentforge_graph/embed/fake.py +34 -0
- agentforge_graph/embed/openai.py +67 -0
- agentforge_graph/embed/pipeline.py +184 -0
- agentforge_graph/embed/registry.py +66 -0
- agentforge_graph/embed/report.py +15 -0
- agentforge_graph/enrich/__init__.py +70 -0
- agentforge_graph/enrich/anthropic.py +38 -0
- agentforge_graph/enrich/anthropic_client.py +109 -0
- agentforge_graph/enrich/bedrock.py +24 -0
- agentforge_graph/enrich/bedrock_client.py +115 -0
- agentforge_graph/enrich/bedrock_summarizer.py +23 -0
- agentforge_graph/enrich/claude.py +172 -0
- agentforge_graph/enrich/enricher.py +108 -0
- agentforge_graph/enrich/governs.py +173 -0
- agentforge_graph/enrich/governs_enricher.py +152 -0
- agentforge_graph/enrich/heuristics.py +224 -0
- agentforge_graph/enrich/judge.py +63 -0
- agentforge_graph/enrich/registry.py +133 -0
- agentforge_graph/enrich/report.py +60 -0
- agentforge_graph/enrich/summarizer.py +62 -0
- agentforge_graph/enrich/summary_enricher.py +211 -0
- agentforge_graph/enrich/taxonomy.py +38 -0
- agentforge_graph/frameworks/__init__.py +29 -0
- agentforge_graph/frameworks/base.py +75 -0
- agentforge_graph/frameworks/detect.py +124 -0
- agentforge_graph/frameworks/extractor.py +63 -0
- agentforge_graph/frameworks/orm.py +93 -0
- agentforge_graph/frameworks/packs/_js_ast.py +56 -0
- agentforge_graph/frameworks/packs/_python_ast.py +157 -0
- agentforge_graph/frameworks/packs/django/__init__.py +240 -0
- agentforge_graph/frameworks/packs/django/models.scm +7 -0
- agentforge_graph/frameworks/packs/express/__init__.py +133 -0
- agentforge_graph/frameworks/packs/express/routes.scm +8 -0
- agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
- agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
- agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
- agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
- agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
- agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
- agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
- agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
- agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
- agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
- agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
- agentforge_graph/frameworks/registry.py +44 -0
- agentforge_graph/ingest/__init__.py +30 -0
- agentforge_graph/ingest/codegraph.py +847 -0
- agentforge_graph/ingest/extractor.py +353 -0
- agentforge_graph/ingest/incremental/__init__.py +25 -0
- agentforge_graph/ingest/incremental/detect.py +118 -0
- agentforge_graph/ingest/incremental/dirty.py +61 -0
- agentforge_graph/ingest/incremental/indexer.py +218 -0
- agentforge_graph/ingest/incremental/meta.py +72 -0
- agentforge_graph/ingest/incremental/ports.py +39 -0
- agentforge_graph/ingest/pack.py +160 -0
- agentforge_graph/ingest/packs/__init__.py +34 -0
- agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
- agentforge_graph/ingest/packs/cpp/references.scm +15 -0
- agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
- agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
- agentforge_graph/ingest/packs/csharp/references.scm +12 -0
- agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
- agentforge_graph/ingest/packs/go/__init__.py +38 -0
- agentforge_graph/ingest/packs/go/references.scm +12 -0
- agentforge_graph/ingest/packs/go/structure.scm +64 -0
- agentforge_graph/ingest/packs/java/__init__.py +35 -0
- agentforge_graph/ingest/packs/java/references.scm +12 -0
- agentforge_graph/ingest/packs/java/structure.scm +38 -0
- agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
- agentforge_graph/ingest/packs/javascript/references.scm +11 -0
- agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
- agentforge_graph/ingest/packs/php/__init__.py +35 -0
- agentforge_graph/ingest/packs/php/references.scm +15 -0
- agentforge_graph/ingest/packs/php/structure.scm +44 -0
- agentforge_graph/ingest/packs/python/__init__.py +25 -0
- agentforge_graph/ingest/packs/python/references.scm +14 -0
- agentforge_graph/ingest/packs/python/structure.scm +57 -0
- agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
- agentforge_graph/ingest/packs/ruby/references.scm +12 -0
- agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
- agentforge_graph/ingest/packs/rust/__init__.py +39 -0
- agentforge_graph/ingest/packs/rust/references.scm +12 -0
- agentforge_graph/ingest/packs/rust/structure.scm +46 -0
- agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
- agentforge_graph/ingest/packs/typescript/references.scm +11 -0
- agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
- agentforge_graph/ingest/pipeline.py +134 -0
- agentforge_graph/ingest/report.py +84 -0
- agentforge_graph/ingest/resolver.py +467 -0
- agentforge_graph/ingest/source.py +79 -0
- agentforge_graph/knowledge/__init__.py +28 -0
- agentforge_graph/knowledge/adr.py +136 -0
- agentforge_graph/knowledge/commits.py +152 -0
- agentforge_graph/knowledge/ingest.py +312 -0
- agentforge_graph/knowledge/mentions.py +71 -0
- agentforge_graph/knowledge/report.py +32 -0
- agentforge_graph/main.py +21 -0
- agentforge_graph/providers.py +36 -0
- agentforge_graph/repomap/__init__.py +14 -0
- agentforge_graph/repomap/rank.py +161 -0
- agentforge_graph/repomap/render.py +55 -0
- agentforge_graph/repomap/repomap.py +66 -0
- agentforge_graph/retrieve/__init__.py +21 -0
- agentforge_graph/retrieve/pack.py +76 -0
- agentforge_graph/retrieve/rerank.py +251 -0
- agentforge_graph/retrieve/retriever.py +286 -0
- agentforge_graph/retrieve/scoring.py +36 -0
- agentforge_graph/serve/__init__.py +19 -0
- agentforge_graph/serve/engine.py +204 -0
- agentforge_graph/serve/http_runner.py +133 -0
- agentforge_graph/serve/server.py +110 -0
- agentforge_graph/serve/tools.py +307 -0
- agentforge_graph/store/__init__.py +32 -0
- agentforge_graph/store/_rowmap.py +102 -0
- agentforge_graph/store/errors.py +22 -0
- agentforge_graph/store/facade.py +89 -0
- agentforge_graph/store/kuzu_store.py +380 -0
- agentforge_graph/store/lance_store.py +146 -0
- agentforge_graph/store/neo4j_store.py +294 -0
- agentforge_graph/store/pgvector_store.py +170 -0
- agentforge_graph/store/registry.py +45 -0
- agentforge_graph/temporal/__init__.py +36 -0
- agentforge_graph/temporal/backfill.py +338 -0
- agentforge_graph/temporal/events.py +82 -0
- agentforge_graph/temporal/index.py +190 -0
- agentforge_graph/temporal/mining.py +190 -0
- agentforge_graph/temporal/recorder.py +114 -0
- agentforge_graph/temporal/store.py +282 -0
- agentforge_graph-0.3.2.dist-info/METADATA +291 -0
- agentforge_graph-0.3.2.dist-info/RECORD +151 -0
- agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
- agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
- agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
- agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
"""``TreeSitterExtractor`` — pass 1 of ingestion (feat-002).
|
|
2
|
+
|
|
3
|
+
File-isolated: parses one file and emits its ``FileSubgraph`` — definition
|
|
4
|
+
nodes (File/Class/Function/Method) with ``CONTAINS`` edges, plus imports and
|
|
5
|
+
call sites recorded as node *attrs* (not edges — their targets may live in
|
|
6
|
+
other files, which pass 1 may not read). The graph-only resolver (pass 2)
|
|
7
|
+
turns those attrs into ``IMPORTS``/``CALLS`` edges.
|
|
8
|
+
|
|
9
|
+
Parsing uses the standalone ``tree_sitter`` package driven by a grammar from
|
|
10
|
+
``tree-sitter-language-pack`` (``Parser(get_language(...))``, never
|
|
11
|
+
``get_parser()`` — see the framework note on the ABI split).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import hashlib
|
|
17
|
+
import re
|
|
18
|
+
import textwrap
|
|
19
|
+
from collections import defaultdict
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from functools import cache
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from tree_sitter import Language, Parser, Query, QueryCursor
|
|
25
|
+
from tree_sitter import Node as TSNode
|
|
26
|
+
from tree_sitter_language_pack import get_language
|
|
27
|
+
|
|
28
|
+
from agentforge_graph.core import (
|
|
29
|
+
Descriptor,
|
|
30
|
+
Edge,
|
|
31
|
+
EdgeKind,
|
|
32
|
+
Extractor,
|
|
33
|
+
FileSubgraph,
|
|
34
|
+
NodeKind,
|
|
35
|
+
Provenance,
|
|
36
|
+
SourceFile,
|
|
37
|
+
SymbolID,
|
|
38
|
+
)
|
|
39
|
+
from agentforge_graph.core import (
|
|
40
|
+
Node as GraphNode,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
from .pack import LanguagePack
|
|
44
|
+
|
|
45
|
+
_CALLABLE = {NodeKind.FUNCTION, NodeKind.METHOD}
|
|
46
|
+
_METHOD_OWNERS = {NodeKind.CLASS, NodeKind.INTERFACE}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@cache
def _language(grammar: str) -> Language:
    """Look up the tree-sitter ``Language`` for ``grammar``.

    Wrapped in ``functools.cache`` so repeated requests for the same grammar
    name reuse the first result instead of calling ``get_language`` again.
    """
    language = get_language(grammar)
    return language
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
|
|
55
|
+
class _Def:
|
|
56
|
+
"""A captured definition, pre-symbol-id."""
|
|
57
|
+
|
|
58
|
+
ts_id: int
|
|
59
|
+
node: TSNode
|
|
60
|
+
kind: NodeKind
|
|
61
|
+
name: str
|
|
62
|
+
enclosing: int | None = None # ts id of the nearest enclosing def
|
|
63
|
+
symbol_id: str = ""
|
|
64
|
+
bases: list[str] = field(default_factory=list) # superclass names (INHERITS)
|
|
65
|
+
recv_var: str = "" # Go: a method's receiver variable name (`s` in `func (s *T)`)
|
|
66
|
+
recv_type: str = "" # Go: a method's receiver type name (`T`)
|
|
67
|
+
docstring: str = "" # the symbol's docstring, cleaned (DESCRIBES, feat-010)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _text(node: TSNode, src: bytes) -> str:
|
|
71
|
+
return src[node.start_byte : node.end_byte].decode("utf-8", errors="replace")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _span(node: TSNode) -> tuple[int, int]:
|
|
75
|
+
return (node.start_point[0] + 1, node.end_point[0] + 1)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _signature(node: TSNode, src: bytes) -> str:
    """Return the first line of the node's source text, stripped.

    For a definition node this is its header line (the ``def``/``class``
    line). A node with no text yields an empty string.
    """
    body = _text(node, src)
    if not body:
        return ""
    header = body.splitlines()[0]
    return header.strip()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
_STR_PREFIX = re.compile(r"^[rbfuRBFU]{0,2}")
|
|
85
|
+
_JSDOC_LINE = re.compile(r"^\s*\*?\s?") # a JSDoc/Javadoc line's leading ` * `
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _clean_docstring(raw: str) -> str:
|
|
89
|
+
"""The docstring body: strip a Python string literal's prefix + quotes, OR a
|
|
90
|
+
``/** … */`` JSDoc/Javadoc comment's markers + per-line ``*``; then dedent."""
|
|
91
|
+
s = raw.strip()
|
|
92
|
+
if s.startswith("/*"): # JSDoc / Javadoc block comment
|
|
93
|
+
s = s[2:]
|
|
94
|
+
if s.startswith("*"): # the second `*` of `/**`
|
|
95
|
+
s = s[1:]
|
|
96
|
+
if s.endswith("*/"):
|
|
97
|
+
s = s[:-2]
|
|
98
|
+
return "\n".join(_JSDOC_LINE.sub("", ln) for ln in s.splitlines()).strip()
|
|
99
|
+
s = _STR_PREFIX.sub("", s, count=1)
|
|
100
|
+
for q in ('"""', "'''", '"', "'"):
|
|
101
|
+
if s.startswith(q) and s.endswith(q) and len(s) >= 2 * len(q):
|
|
102
|
+
s = s[len(q) : -len(q)]
|
|
103
|
+
break
|
|
104
|
+
return textwrap.dedent(s).strip()
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class TreeSitterExtractor(Extractor):
    """Extracts a ``FileSubgraph`` from one source file, in isolation.

    One extractor is bound to one ``LanguagePack``: the pack supplies the
    grammar name, the structure and reference queries, the descriptor rules
    and the language slug used in symbol ids. ``extract`` runs a structure
    pass (definitions, imports, namespace, default export, docstrings) and a
    reference pass (call sites), then assembles nodes and edges.
    """

    def __init__(self, pack: LanguagePack, repo: str, commit: str = "") -> None:
        """Build the parser and compile the pack's two queries.

        Args:
            pack: Language pack providing grammar, queries and descriptor rules.
            repo: Repository identifier embedded in every symbol id.
            commit: Commit recorded in the provenance of emitted nodes/edges.
        """
        self.pack = pack
        self.repo = repo
        self.commit = commit
        self.name = f"tree-sitter-{pack.language}"
        self._lang = _language(pack.grammar)
        self._parser = Parser(self._lang)
        self._structure_q = Query(self._lang, pack.structure_queries)
        self._reference_q = Query(self._lang, pack.reference_queries)

    def extract(self, file: SourceFile) -> FileSubgraph:
        """Parse ``file`` and return its nodes and edges.

        Emits one FILE node, one node per captured definition with a
        ``CONTAINS`` edge from its nearest enclosing definition (or from the
        file), and, for each definition with a docstring, a DOC_CHUNK node
        with a ``DESCRIBES`` edge. Imports, default export, namespace and call
        references are stored as node attrs, not edges.
        """
        src = file.text.encode("utf-8")
        root = self._parser.parse(src).root_node
        prov = Provenance.parsed(self.name, self.commit)
        # The file's own id uses an empty descriptor.
        file_id = SymbolID.for_symbol(self.pack.lang_slug, self.repo, file.path, "")

        defs, imports, default_export, namespace = self._structure(root, src)
        # Symbol ids must exist before the reference pass: refs are keyed by them.
        self._assign_symbol_ids(defs, file.path)
        by_tsid = {d.ts_id: d for d in defs}
        refs = self._references(root, src, by_tsid, file_id)

        nodes: list[GraphNode] = []
        # Only non-empty facts are recorded on the file node.
        file_attrs: dict[str, Any] = {}
        if imports:
            file_attrs["imports"] = imports
        if default_export:
            file_attrs["default_export"] = default_export
        if namespace:
            file_attrs["namespace"] = namespace  # PHP/Java/C# package (FQN resolution)
        if file_id in refs:
            # calls made outside any captured definition are owned by the file
            file_attrs["refs"] = refs[file_id]
        nodes.append(
            GraphNode(
                id=file_id,
                kind=NodeKind.FILE,
                name=file.path.rsplit("/", 1)[-1],
                provenance=prov,
                attrs=file_attrs,
            )
        )

        edges: list[Edge] = []
        for d in defs:
            attrs: dict[str, Any] = {"signature": _signature(d.node, src)}
            if d.symbol_id in refs:
                attrs["refs"] = refs[d.symbol_id]
            if d.bases:  # INHERITS: superclass names, resolved in pass 2
                attrs["bases"] = d.bases
            if d.recv_var:  # Go: receiver var/type, for receiver self-calls (pass 2)
                attrs["recv_var"] = d.recv_var
                attrs["recv_type"] = d.recv_type
            nodes.append(
                GraphNode(
                    id=d.symbol_id,
                    kind=d.kind,
                    name=d.name,
                    span=_span(d.node),
                    provenance=prov,
                    attrs=attrs,
                )
            )
            # Parent is the nearest enclosing captured def, else the file itself.
            parent_id = by_tsid[d.enclosing].symbol_id if d.enclosing in by_tsid else file_id
            edges.append(
                Edge(src=parent_id, dst=d.symbol_id, kind=EdgeKind.CONTAINS, provenance=prov)
            )
            # docstring -> a DocChunk that DESCRIBES the symbol (feat-010), so the
            # docstring prose is embeddable + searchable, attached to its symbol.
            if d.docstring:
                # The chunk's descriptor is the symbol's descriptor plus "docstring.".
                desc = SymbolID.parse(d.symbol_id).descriptor + "docstring."
                doc_id = SymbolID.for_symbol(self.pack.lang_slug, self.repo, file.path, desc)
                nodes.append(
                    GraphNode(
                        id=doc_id,
                        kind=NodeKind.DOC_CHUNK,
                        name=d.name,
                        provenance=prov,
                        attrs={
                            "path": file.path,
                            "heading": d.name,
                            "text": d.docstring,
                            "describes": d.symbol_id,
                            "content_hash": hashlib.sha256(d.docstring.encode()).hexdigest(),
                        },
                    )
                )
                edges.append(
                    Edge(src=doc_id, dst=d.symbol_id, kind=EdgeKind.DESCRIBES, provenance=prov)
                )

        # Sort for a stable output order; a node with a falsy span sorts as (0, 0).
        nodes.sort(key=lambda n: (n.span or (0, 0), n.id))
        edges.sort(key=lambda e: (e.src, e.dst, e.kind.value))
        return FileSubgraph(
            path=file.path, content_hash=file.content_hash, nodes=nodes, edges=edges
        )

    # --- structure pass -------------------------------------------------

    def _structure(
        self, root: TSNode, src: bytes
    ) -> tuple[list[_Def], list[dict[str, Any]], str, str]:
        """Run the structure query and collect the file's definitions and facts.

        Each query match is classified by the first capture family it
        contains, checked in this order: ``def.*``, ``base.name``,
        ``recv.var``, ``docstring``, ``import``, ``namespace``, ``export``.

        Returns:
            ``(defs, imports, default_export, namespace)``. ``defs`` already
            carry bases, Go receiver info, docstrings and enclosing-scope
            links; ``default_export`` and ``namespace`` hold the last value
            captured, or ``""`` if none was.
        """
        defs: list[_Def] = []
        imports: list[dict[str, Any]] = []
        default_export = ""  # CommonJS `module.exports = <name>` (BUG-006)
        namespace = ""  # PHP/Java/C# package declaration (FQN import resolution)
        class_bases: dict[int, list[str]] = defaultdict(list)  # class node id -> base names
        method_recv: dict[int, tuple[str, str]] = {}  # method node id -> (recv var, recv type)
        docstrings: dict[int, str] = {}  # def node id -> cleaned docstring (DESCRIBES)
        rules = self.pack.descriptor_rules
        for _pattern, caps in QueryCursor(self._structure_q).matches(root):
            def_cap = next((c for c in caps if c.startswith("def.")), None)
            if def_cap is not None:
                kind = rules.kind_for(def_cap)
                names = caps.get("name")
                # skip captures the pack maps to no kind, and unnamed definitions
                if kind is None or not names:
                    continue
                node = caps[def_cap][0]
                defs.append(_Def(ts_id=node.id, node=node, kind=kind, name=_text(names[0], src)))
            elif "base.name" in caps:
                # a base class of a class definition (INHERITS); one match per base
                cls = caps.get("base.def")
                if cls:
                    class_bases[cls[0].id].extend(_text(b, src) for b in caps["base.name"])
            elif "recv.var" in caps:
                # Go: a method's receiver `(s *T)` — bind the var name + type
                meth, rvar, rtype = caps.get("recv.method"), caps["recv.var"], caps.get("recv.type")
                if meth and rtype:
                    method_recv[meth[0].id] = (_text(rvar[0], src), _text(rtype[0], src))
            elif "docstring" in caps:
                # a def/class docstring or JSDoc comment — DESCRIBES the symbol
                owner = caps.get("doc.owner")
                if owner:
                    docstrings[owner[0].id] = _clean_docstring(_text(caps["docstring"][0], src))
            elif "import" in caps:
                mods = caps.get("import.module", [])
                dflt = caps.get("import.default")
                imports.append(
                    {
                        "module": _text(mods[0], src) if mods else "",
                        "names": [_text(n, src) for n in caps.get("import.name", [])],
                        # CommonJS default require binding: `const x = require(...)`
                        "default": _text(dflt[0], src) if dflt else "",
                        "line": caps["import"][0].start_point[0] + 1,
                    }
                )
            elif "namespace" in caps:
                ns = caps.get("namespace")
                if ns:
                    namespace = _text(ns[0], src)
            elif "export" in caps:
                ed = caps.get("export.default")
                if ed:
                    default_export = _text(ed[0], src)
        # Side facts were keyed by tree-sitter node id because their matches can
        # arrive independently of the def match; attach them to the defs now.
        for d in defs:
            if d.ts_id in class_bases:
                d.bases = class_bases[d.ts_id]
            if d.ts_id in method_recv:
                d.recv_var, d.recv_type = method_recv[d.ts_id]
            if d.ts_id in docstrings:
                d.docstring = docstrings[d.ts_id]
        self._link_scopes(defs)
        return defs, imports, default_export, namespace

    def _link_scopes(self, defs: list[_Def]) -> None:
        """Set each def's ``enclosing`` and promote class-level functions to methods.

        ``enclosing`` becomes the ts id of the nearest syntax-tree ancestor
        that is itself a captured definition, or ``None`` when there is none.
        Mutates the ``_Def`` objects in place.
        """
        idset = {d.ts_id for d in defs}
        by_tsid = {d.ts_id: d for d in defs}
        for d in defs:
            # climb the syntax tree until we reach another captured definition
            anc = d.node.parent
            while anc is not None and anc.id not in idset:
                anc = anc.parent
            d.enclosing = anc.id if anc is not None else None
            # a function whose nearest enclosing def is a class is a method
            if (
                d.kind is NodeKind.FUNCTION
                and d.enclosing is not None
                and by_tsid[d.enclosing].kind in _METHOD_OWNERS
            ):
                d.kind = NodeKind.METHOD

    def _assign_symbol_ids(self, defs: list[_Def], path: str) -> None:
        """Fill in ``symbol_id`` on every def.

        The descriptor is the concatenation of one suffix per definition on
        the chain from the outermost enclosing def down to the def itself.
        Callables additionally carry a disambiguator counting earlier
        same-named callables in the same enclosing scope.
        """
        by_tsid = {d.ts_id: d for d in defs}
        # overload disambiguator: nth same-named callable in the same scope (source order)
        counter: dict[tuple[int | None, str], int] = defaultdict(int)
        disamb: dict[int, int] = {}
        for d in sorted(defs, key=lambda d: d.node.start_byte):
            if d.kind in _CALLABLE:
                key = (d.enclosing, d.name)
                disamb[d.ts_id] = counter[key]
                counter[key] += 1
        for d in defs:
            # walk outward through the enclosing links, then flip to outermost-first
            chain: list[_Def] = []
            cur: _Def | None = d
            while cur is not None:
                chain.append(cur)
                cur = by_tsid.get(cur.enclosing) if cur.enclosing is not None else None
            chain.reverse()
            descriptor = "".join(self._suffix(x, disamb.get(x.ts_id, 0)) for x in chain)
            d.symbol_id = SymbolID.for_symbol(self.pack.lang_slug, self.repo, path, descriptor)

    @staticmethod
    def _suffix(d: _Def, disambiguator: int) -> str:
        """Return the descriptor fragment for one definition.

        Classes and interfaces use ``Descriptor.type``, functions and methods
        use ``Descriptor.method`` (with the disambiguator), and every other
        kind uses ``Descriptor.term``.
        """
        if d.kind in (NodeKind.CLASS, NodeKind.INTERFACE):
            return Descriptor.type(d.name)
        if d.kind in _CALLABLE:
            return Descriptor.method(d.name, disambiguator)
        return Descriptor.term(d.name)

    # --- reference pass -------------------------------------------------

    def _references(
        self, root: TSNode, src: bytes, by_tsid: dict[int, _Def], file_id: str
    ) -> dict[str, list[dict[str, Any]]]:
        """Run the reference query and group call sites by their owner.

        Args:
            root: Root of the parsed tree.
            src: The file's bytes, used to read capture text.
            by_tsid: Captured definitions keyed by tree-sitter node id; their
                ``symbol_id`` must already be assigned.
            file_id: Owner used for calls outside every captured definition.

        Returns:
            A mapping of owner symbol id to its call refs. Each ref has
            ``name`` and ``line``, plus ``recv`` when a receiver was captured.
        """
        idset = set(by_tsid)
        # Keyed by the call node so a bare + receiver-capturing pattern that both
        # match the same call (Java/Ruby, where one node type serves `f()` and
        # `recv.f()`) yield ONE ref — the receiver merged in. Distinct-node-type
        # grammars (Py/TS/JS/C#/Rust/PHP/C++) never collide, so this is a no-op
        # for them; insertion order preserves source order.
        owner_of: dict[int, str] = {}
        ref_of: dict[int, dict[str, Any]] = {}
        for _pattern, caps in QueryCursor(self._reference_q).matches(root):
            if "call" not in caps:
                continue
            callees = caps.get("call.callee")
            if not callees:
                continue
            call_node = caps["call"][0]
            ref = ref_of.get(call_node.id)
            if ref is None:
                # first time this call node is seen: find its owning definition
                anc = call_node.parent
                while anc is not None and anc.id not in idset:
                    anc = anc.parent
                owner_of[call_node.id] = by_tsid[anc.id].symbol_id if anc is not None else file_id
                ref = {"name": _text(callees[0], src), "line": call_node.start_point[0] + 1}
                ref_of[call_node.id] = ref
            # BUG-006: the receiver of an attribute call (`recv.f()`), when the pack
            # captures it — lets the resolver bind `self.f()`/`this.f()` to the
            # enclosing class's method and refuse to guess for other receivers.
            recv = caps.get("call.recv")
            if recv and "recv" not in ref:
                ref["recv"] = _text(recv[0], src)
        refs: dict[str, list[dict[str, Any]]] = defaultdict(list)
        for cid, ref in ref_of.items():
            refs[owner_of[cid]].append(ref)
        # hand back a plain dict so later lookups cannot create empty entries
        return dict(refs)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Incremental indexing (feat-004): re-index only the diff.
|
|
2
|
+
|
|
3
|
+
A thin coordination layer over the feat-002/003 primitives —
|
|
4
|
+
``ChangeDetector`` diffs the working tree against the ``IndexMeta`` manifest,
|
|
5
|
+
``IncrementalIndexer`` applies the resulting ``ChangeSet`` (delete → re-extract
|
|
6
|
+
→ scoped re-resolve), and ``DirtySet`` records what each enricher must redo.
|
|
7
|
+
Zero ``agentforge`` imports (ADR-0001).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .detect import ChangeDetector, ChangeSet, DetectResult
|
|
13
|
+
from .dirty import DirtySet
|
|
14
|
+
from .indexer import IncrementalIndexer
|
|
15
|
+
from .meta import IndexMeta, pack_fingerprint
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"ChangeDetector",
|
|
19
|
+
"ChangeSet",
|
|
20
|
+
"DetectResult",
|
|
21
|
+
"DirtySet",
|
|
22
|
+
"IncrementalIndexer",
|
|
23
|
+
"IndexMeta",
|
|
24
|
+
"pack_fingerprint",
|
|
25
|
+
]
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""``ChangeDetector`` — diff the working tree against the indexed manifest.
|
|
2
|
+
|
|
3
|
+
The **content hash is the source of truth**: we walk the working tree once,
|
|
4
|
+
hash every indexable file, and diff that against ``IndexMeta.files``. This is
|
|
5
|
+
correct regardless of git state (dirty working tree, shallow clone, detached
|
|
6
|
+
HEAD, rebase) and naturally catches uncommitted edits — the common case for an
|
|
7
|
+
agent mid-flight. Git is then consulted *best-effort* only to promote a
|
|
8
|
+
matching delete+add pair into a rename (nicer reporting); if git disagrees or
|
|
9
|
+
is absent, the hash diff stands and a move simply reads as delete + add
|
|
10
|
+
(accepted at 0.2, spec §3).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import subprocess
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel, Field
|
|
20
|
+
|
|
21
|
+
from agentforge_graph.ingest.pack import PackRegistry
|
|
22
|
+
from agentforge_graph.ingest.source import RepoSource
|
|
23
|
+
|
|
24
|
+
from .meta import IndexMeta
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ChangeSet(BaseModel):
    """The classified set of file paths that differ from the last index."""

    added: list[str] = Field(default_factory=list)
    modified: list[str] = Field(default_factory=list)
    deleted: list[str] = Field(default_factory=list)
    renamed: list[tuple[str, str]] = Field(default_factory=list)  # (old, new)

    def is_empty(self) -> bool:
        """Report whether every bucket is empty."""
        return not any((self.added, self.modified, self.deleted, self.renamed))

    def touched_paths(self) -> list[str]:
        """Paths to (re)extract: added, modified, and each rename's new path.

        Returned sorted and without duplicates.
        """
        paths = set(self.added)
        paths.update(self.modified)
        paths.update(new for _old, new in self.renamed)
        return sorted(paths)

    def removed_paths(self) -> list[str]:
        """Paths to drop from the store: deleted, and each rename's old path.

        Returned sorted and without duplicates.
        """
        paths = set(self.deleted)
        paths.update(old for old, _new in self.renamed)
        return sorted(paths)

    def changed_paths(self) -> list[str]:
        """Union of ``touched_paths`` and ``removed_paths`` — the re-resolve seed."""
        return sorted(set(self.touched_paths()).union(self.removed_paths()))
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DetectResult(BaseModel):
    """Result of ``ChangeDetector.detect``: the classified diff plus the
    working tree's current hash manifest."""

    # files added / modified / deleted / renamed relative to the prior manifest
    changes: ChangeSet
    file_hashes: dict[str, str]  # the fresh, full path -> content_hash manifest
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ChangeDetector:
    """Diff the working tree against the indexed manifest by content hash.

    The hash diff decides what is added, modified or deleted. Git is then
    asked, best-effort, which delete+add pairs were renames; any git failure
    leaves the hash diff as it is.
    """

    # Upper bound, in seconds, for the optional `git diff` call. Rename
    # refinement is cosmetic, so a slow or hung git must not stall detection.
    _GIT_TIMEOUT_S = 30.0

    def __init__(self, repo_path: str | Path = ".") -> None:
        """Remember the repository directory handed to ``git -C``."""
        self.repo_path = repo_path

    async def detect(
        self, source: RepoSource, meta: IndexMeta, registry: PackRegistry
    ) -> DetectResult:
        """Hash the working tree and classify it against ``meta.files``.

        Args:
            source: Provides the indexable files and their content hashes.
            meta: The prior manifest (``files``) and the commit it was built at.
            registry: Passed through to ``source.iter_files``.

        Returns:
            The classified ``ChangeSet`` together with the fresh path -> hash map.
        """
        current = await asyncio.to_thread(self._current_hashes, source, registry)
        prior = meta.files
        added = [p for p in current if p not in prior]
        modified = [p for p in current if p in prior and current[p] != prior[p]]
        deleted = [p for p in prior if p not in current]
        changes = ChangeSet(added=sorted(added), modified=sorted(modified), deleted=sorted(deleted))
        # `git diff` is a blocking subprocess: run it off the event loop, just
        # like the hash walk above, so other tasks keep running meanwhile.
        await asyncio.to_thread(self._refine_renames, changes, meta.indexed_commit)
        return DetectResult(changes=changes, file_hashes=current)

    @staticmethod
    def _current_hashes(source: RepoSource, registry: PackRegistry) -> dict[str, str]:
        """Return ``{path: content_hash}`` for every file ``source`` yields."""
        return {sf.path: sf.content_hash for sf in source.iter_files(registry)}

    def _refine_renames(self, changes: ChangeSet, base_commit: str) -> None:
        """Best-effort: if git reports a committed rename old->new and our hash
        diff independently saw `old` deleted and `new` added, collapse the pair
        into a rename. Purely cosmetic — the indexer treats a rename as
        delete(old)+add(new) anyway (§3), so a miss here changes nothing.

        Mutates ``changes`` in place; does nothing when ``base_commit`` is empty.
        """
        if not base_commit:
            return
        added = set(changes.added)
        deleted = set(changes.deleted)
        for old, new in self._git_renames(base_commit):
            if old in deleted and new in added:
                changes.renamed.append((old, new))
                changes.deleted.remove(old)
                changes.added.remove(new)
                # keep the lookup sets in step so a path is consumed only once
                deleted.discard(old)
                added.discard(new)

    def _git_renames(self, base_commit: str) -> list[tuple[str, str]]:
        """Return the ``(old, new)`` renames git sees between ``base_commit`` and HEAD.

        Returns an empty list when git is missing, fails, or exceeds the timeout.
        """
        try:
            out = subprocess.run(
                [
                    "git",
                    "-C",
                    str(self.repo_path),
                    "diff",
                    "--name-status",
                    # NUL-delimited output: without -z git C-quotes non-ASCII or
                    # unusual paths, which would never match the hash diff.
                    "-z",
                    "-M",
                    "--diff-filter=R",
                    base_commit,
                    "HEAD",
                ],
                capture_output=True,
                text=True,
                # -z emits raw path bytes, so decode explicitly and tolerate
                # undecodable names instead of raising on them.
                encoding="utf-8",
                errors="surrogateescape",
                check=True,
                timeout=self._GIT_TIMEOUT_S,
            )
        except (subprocess.SubprocessError, OSError):
            # covers a missing git binary, a non-zero exit, and the timeout
            return []
        return self._parse_renames(out.stdout)

    @staticmethod
    def _parse_renames(stdout: str) -> list[tuple[str, str]]:
        """Parse ``git diff --name-status -z`` output into ``(old, new)`` pairs.

        Records are NUL-separated: a status field followed by one path, or by
        two paths for rename (``R…``) and copy (``C…``) entries. Only renames
        are returned.
        """
        pairs: list[tuple[str, str]] = []
        fields = iter(stdout.split("\0"))
        for status in fields:
            if not status:
                continue  # the empty field after the final NUL
            if status[0] in "RC":
                old = next(fields, "")
                new = next(fields, "")
                if status[0] == "R" and old and new:
                    pairs.append((old, new))
            else:
                next(fields, None)  # single-path record: skip its path
        return pairs
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""``DirtySet`` — the one staleness API every enricher reads (feat-004).
|
|
2
|
+
|
|
3
|
+
When an incremental refresh changes a file, the symbols it touched (plus their
|
|
4
|
+
1-hop neighbours) are *dirtied* for every registered consumer — ``embeddings``
|
|
5
|
+
now, ``summaries`` / ``pattern-tags`` / ``routes`` as feat-010/011/012 land.
|
|
6
|
+
Each consumer drains its own cursor at its own cadence and marks the ids clean,
|
|
7
|
+
so no enricher reinvents "what changed since I last ran". Persisted to
|
|
8
|
+
``.ckg/dirty.json`` as ``{consumer: [symbol_id, ...]}`` — a side file, so a
|
|
9
|
+
consumer cursor update never rewrites the index manifest (``meta.json``).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
_DIRTY = "dirty.json"


class DirtySet:
    """Per-consumer lists of symbol ids that still need reprocessing.

    State is a ``{consumer: [symbol_id, ...]}`` mapping kept in memory and
    written to ``<root>/dirty.json`` after every mutation.
    """

    # Known enrichment consumers: embeddings (feat-005), patterns + summaries (feat-012).
    DEFAULT_CONSUMERS = ["embeddings", "patterns", "summaries"]

    def __init__(self, root: str | Path, consumers: list[str] | None = None) -> None:
        """Bind to ``<root>/dirty.json`` and load any existing state.

        ``consumers`` falls back to ``DEFAULT_CONSUMERS`` when it is ``None``
        or empty.
        """
        self._path = Path(root) / _DIRTY
        chosen = consumers if consumers else self.DEFAULT_CONSUMERS
        self._consumers = list(chosen)
        self._state: dict[str, list[str]] = self._load()

    def _load(self) -> dict[str, list[str]]:
        """Read the persisted mapping, or return an empty one if the file is absent."""
        if self._path.exists():
            raw = json.loads(self._path.read_text())
            return {consumer: list(ids) for consumer, ids in raw.items()}
        return {}

    def _save(self) -> None:
        """Write the state to a sibling ``.tmp`` file, then move it over the target."""
        self._path.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(self._state, indent=2, sort_keys=True)
        scratch = self._path.with_name(_DIRTY + ".tmp")
        scratch.write_text(payload)
        scratch.replace(self._path)

    async def add(self, ids: list[str]) -> None:
        """Queue ``ids`` for every registered consumer.

        Ids a consumer already holds are skipped; new ones are appended in the
        order given. An empty ``ids`` is a no-op and writes nothing.
        """
        if not ids:
            return
        for consumer in self._consumers:
            pending = self._state.setdefault(consumer, [])
            known = set(pending)
            for symbol_id in ids:
                if symbol_id in known:
                    continue
                known.add(symbol_id)
                pending.append(symbol_id)
        self._save()

    async def dirty_for(self, consumer: str) -> list[str]:
        """Return a copy of the ids currently dirty for ``consumer``."""
        pending = self._state.get(consumer, [])
        return list(pending)

    async def mark_clean(self, consumer: str, ids: list[str]) -> None:
        """Remove ``ids`` from ``consumer``'s dirty list and persist the result."""
        cleaned = set(ids)
        remaining = [sid for sid in self._state.get(consumer, []) if sid not in cleaned]
        self._state[consumer] = remaining
        self._save()
|