agentforge-graph 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_graph/__init__.py +6 -0
- agentforge_graph/chunking/__init__.py +12 -0
- agentforge_graph/chunking/cast.py +159 -0
- agentforge_graph/chunking/chunk.py +19 -0
- agentforge_graph/chunking/tokens.py +15 -0
- agentforge_graph/cli.py +607 -0
- agentforge_graph/config.py +259 -0
- agentforge_graph/core/__init__.py +54 -0
- agentforge_graph/core/conformance.py +270 -0
- agentforge_graph/core/contracts.py +163 -0
- agentforge_graph/core/kinds.py +68 -0
- agentforge_graph/core/models.py +134 -0
- agentforge_graph/core/provenance.py +62 -0
- agentforge_graph/core/symbols.py +116 -0
- agentforge_graph/embed/__init__.py +28 -0
- agentforge_graph/embed/base.py +22 -0
- agentforge_graph/embed/bedrock.py +85 -0
- agentforge_graph/embed/fake.py +34 -0
- agentforge_graph/embed/openai.py +67 -0
- agentforge_graph/embed/pipeline.py +184 -0
- agentforge_graph/embed/registry.py +66 -0
- agentforge_graph/embed/report.py +15 -0
- agentforge_graph/enrich/__init__.py +70 -0
- agentforge_graph/enrich/anthropic.py +38 -0
- agentforge_graph/enrich/anthropic_client.py +109 -0
- agentforge_graph/enrich/bedrock.py +24 -0
- agentforge_graph/enrich/bedrock_client.py +115 -0
- agentforge_graph/enrich/bedrock_summarizer.py +23 -0
- agentforge_graph/enrich/claude.py +172 -0
- agentforge_graph/enrich/enricher.py +108 -0
- agentforge_graph/enrich/governs.py +173 -0
- agentforge_graph/enrich/governs_enricher.py +152 -0
- agentforge_graph/enrich/heuristics.py +224 -0
- agentforge_graph/enrich/judge.py +63 -0
- agentforge_graph/enrich/registry.py +133 -0
- agentforge_graph/enrich/report.py +60 -0
- agentforge_graph/enrich/summarizer.py +62 -0
- agentforge_graph/enrich/summary_enricher.py +211 -0
- agentforge_graph/enrich/taxonomy.py +38 -0
- agentforge_graph/frameworks/__init__.py +29 -0
- agentforge_graph/frameworks/base.py +75 -0
- agentforge_graph/frameworks/detect.py +124 -0
- agentforge_graph/frameworks/extractor.py +63 -0
- agentforge_graph/frameworks/orm.py +93 -0
- agentforge_graph/frameworks/packs/_js_ast.py +56 -0
- agentforge_graph/frameworks/packs/_python_ast.py +157 -0
- agentforge_graph/frameworks/packs/django/__init__.py +240 -0
- agentforge_graph/frameworks/packs/django/models.scm +7 -0
- agentforge_graph/frameworks/packs/express/__init__.py +133 -0
- agentforge_graph/frameworks/packs/express/routes.scm +8 -0
- agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
- agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
- agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
- agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
- agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
- agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
- agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
- agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
- agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
- agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
- agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
- agentforge_graph/frameworks/registry.py +44 -0
- agentforge_graph/ingest/__init__.py +30 -0
- agentforge_graph/ingest/codegraph.py +847 -0
- agentforge_graph/ingest/extractor.py +353 -0
- agentforge_graph/ingest/incremental/__init__.py +25 -0
- agentforge_graph/ingest/incremental/detect.py +118 -0
- agentforge_graph/ingest/incremental/dirty.py +61 -0
- agentforge_graph/ingest/incremental/indexer.py +218 -0
- agentforge_graph/ingest/incremental/meta.py +72 -0
- agentforge_graph/ingest/incremental/ports.py +39 -0
- agentforge_graph/ingest/pack.py +160 -0
- agentforge_graph/ingest/packs/__init__.py +34 -0
- agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
- agentforge_graph/ingest/packs/cpp/references.scm +15 -0
- agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
- agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
- agentforge_graph/ingest/packs/csharp/references.scm +12 -0
- agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
- agentforge_graph/ingest/packs/go/__init__.py +38 -0
- agentforge_graph/ingest/packs/go/references.scm +12 -0
- agentforge_graph/ingest/packs/go/structure.scm +64 -0
- agentforge_graph/ingest/packs/java/__init__.py +35 -0
- agentforge_graph/ingest/packs/java/references.scm +12 -0
- agentforge_graph/ingest/packs/java/structure.scm +38 -0
- agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
- agentforge_graph/ingest/packs/javascript/references.scm +11 -0
- agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
- agentforge_graph/ingest/packs/php/__init__.py +35 -0
- agentforge_graph/ingest/packs/php/references.scm +15 -0
- agentforge_graph/ingest/packs/php/structure.scm +44 -0
- agentforge_graph/ingest/packs/python/__init__.py +25 -0
- agentforge_graph/ingest/packs/python/references.scm +14 -0
- agentforge_graph/ingest/packs/python/structure.scm +57 -0
- agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
- agentforge_graph/ingest/packs/ruby/references.scm +12 -0
- agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
- agentforge_graph/ingest/packs/rust/__init__.py +39 -0
- agentforge_graph/ingest/packs/rust/references.scm +12 -0
- agentforge_graph/ingest/packs/rust/structure.scm +46 -0
- agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
- agentforge_graph/ingest/packs/typescript/references.scm +11 -0
- agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
- agentforge_graph/ingest/pipeline.py +134 -0
- agentforge_graph/ingest/report.py +84 -0
- agentforge_graph/ingest/resolver.py +467 -0
- agentforge_graph/ingest/source.py +79 -0
- agentforge_graph/knowledge/__init__.py +28 -0
- agentforge_graph/knowledge/adr.py +136 -0
- agentforge_graph/knowledge/commits.py +152 -0
- agentforge_graph/knowledge/ingest.py +312 -0
- agentforge_graph/knowledge/mentions.py +71 -0
- agentforge_graph/knowledge/report.py +32 -0
- agentforge_graph/main.py +21 -0
- agentforge_graph/providers.py +36 -0
- agentforge_graph/repomap/__init__.py +14 -0
- agentforge_graph/repomap/rank.py +161 -0
- agentforge_graph/repomap/render.py +55 -0
- agentforge_graph/repomap/repomap.py +66 -0
- agentforge_graph/retrieve/__init__.py +21 -0
- agentforge_graph/retrieve/pack.py +76 -0
- agentforge_graph/retrieve/rerank.py +251 -0
- agentforge_graph/retrieve/retriever.py +286 -0
- agentforge_graph/retrieve/scoring.py +36 -0
- agentforge_graph/serve/__init__.py +19 -0
- agentforge_graph/serve/engine.py +204 -0
- agentforge_graph/serve/http_runner.py +133 -0
- agentforge_graph/serve/server.py +110 -0
- agentforge_graph/serve/tools.py +307 -0
- agentforge_graph/store/__init__.py +32 -0
- agentforge_graph/store/_rowmap.py +102 -0
- agentforge_graph/store/errors.py +22 -0
- agentforge_graph/store/facade.py +89 -0
- agentforge_graph/store/kuzu_store.py +380 -0
- agentforge_graph/store/lance_store.py +146 -0
- agentforge_graph/store/neo4j_store.py +294 -0
- agentforge_graph/store/pgvector_store.py +170 -0
- agentforge_graph/store/registry.py +45 -0
- agentforge_graph/temporal/__init__.py +36 -0
- agentforge_graph/temporal/backfill.py +338 -0
- agentforge_graph/temporal/events.py +82 -0
- agentforge_graph/temporal/index.py +190 -0
- agentforge_graph/temporal/mining.py +190 -0
- agentforge_graph/temporal/recorder.py +114 -0
- agentforge_graph/temporal/store.py +282 -0
- agentforge_graph-0.3.2.dist-info/METADATA +291 -0
- agentforge_graph-0.3.2.dist-info/RECORD +151 -0
- agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
- agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
- agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
- agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
"""History backfill (feat-009 chunk 4, ``ckg index --history N``).
|
|
2
|
+
|
|
3
|
+
Seeds the evolution log for code that predates temporal adoption by *replaying*
|
|
4
|
+
the last ``N`` commits oldest→newest through the **existing** incremental
|
|
5
|
+
pipeline against a **throwaway** graph store, feeding the real sidecar recorder
|
|
6
|
+
at each step. The HEAD index and the embeddings are never touched — backfill
|
|
7
|
+
writes lifecycle events only (design §4.6).
|
|
8
|
+
|
|
9
|
+
- File content at each historical commit is read from git (``git ls-tree`` +
|
|
10
|
+
``git show <commit>:<path>``) via :class:`GitBlobSource` — **no checkout
|
|
11
|
+
churn**, the working tree is left alone.
|
|
12
|
+
- The per-step diff is ``git diff --name-status -M <parent> <commit>``.
|
|
13
|
+
- Churn/authorship mining is **skipped** during replay (it is a HEAD-time
|
|
14
|
+
signal, mined by chunk 2; replaying it would clobber HEAD aggregates with
|
|
15
|
+
stale values). Only ``OPENED``/``CLOSED`` are recorded.
|
|
16
|
+
- **Resumable**: the oldest covered commit is stored as ``backfilled_through``;
|
|
17
|
+
a re-run whose requested range is already covered is a no-op. Events are
|
|
18
|
+
idempotent (unique per symbol/commit/event), so a partial run re-runs safely.
|
|
19
|
+
|
|
20
|
+
The accuracy this buys: a symbol's earliest ``OPENED`` event becomes its true
|
|
21
|
+
introduction commit (within the backfilled horizon), so ``history().introduced``
|
|
22
|
+
is no longer window-bounded for pre-existing code.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import hashlib
|
|
28
|
+
import subprocess
|
|
29
|
+
import tempfile
|
|
30
|
+
from collections.abc import Iterable, Iterator
|
|
31
|
+
from pathlib import Path, PurePosixPath
|
|
32
|
+
|
|
33
|
+
from pydantic import BaseModel
|
|
34
|
+
|
|
35
|
+
from agentforge_graph.core import SourceFile
|
|
36
|
+
from agentforge_graph.ingest.source import RepoSource
|
|
37
|
+
from agentforge_graph.store import Store
|
|
38
|
+
|
|
39
|
+
from .recorder import build_recorder, seed_symbols
|
|
40
|
+
from .store import TemporalStore
|
|
41
|
+
|
|
42
|
+
_FULL = -1 # `--history full` sentinel
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class BackfillReport(BaseModel):
    """Summary of a single ``run_backfill`` invocation.

    ``ran`` is ``False`` when the run was skipped; ``reason`` then carries the
    explanation and the counters keep their zero defaults.
    """

    ran: bool
    commits: int = 0  # size of the replayed commit list (baseline included)
    events_before: int = 0  # sidecar event count sampled before the replay
    events_after: int = 0  # sidecar event count sampled after the replay
    backfilled_through: str = ""  # oldest commit covered by the log
    reason: str = ""  # why the run was skipped (empty when it ran)

    @property
    def events_added(self) -> int:
        """Net number of events written by the run, never negative."""
        delta = self.events_after - self.events_before
        return delta if delta > 0 else 0
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def parse_history(value: str | int | None) -> int:
    """Normalise the ``--history`` argument to a commit count.

    - ``None`` → 0 (no backfill).
    - ``"full"`` (any case, surrounding whitespace ignored) → ``_FULL``.
    - an int, or a string that parses as one → that many commits; negative
      values clamp to 0. The ``_FULL`` sentinel itself passes through so an
      already-normalised value round-trips unchanged.
    - any other string → 0.
    """
    if value is None:
        return 0
    if isinstance(value, int):
        # Clamp like the string branch below so -5 and "-5" agree; only the
        # sentinel is allowed to stay negative.
        return value if value == _FULL else max(value, 0)
    if value.strip().lower() == "full":
        return _FULL
    try:
        return max(int(value), 0)
    except ValueError:
        return 0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# --- a git-blob source: file content at a specific commit -----------------
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class GitBlobSource(RepoSource):
    """A ``RepoSource`` that yields the indexable files of a specific *commit's*
    tree (read from git), not the working tree. ``restrict`` limits the read to
    a known path set (an incremental step only needs its touched files), so
    per-step cost is bounded — the working tree is never touched.

    Paths are listed with ``git ls-tree -z`` so that names git would otherwise
    C-quote (non-ASCII bytes, quotes, tabs, newlines) come back verbatim and
    can be matched against ``restrict`` and handed to ``git show`` unchanged.
    """

    def __init__(
        self,
        root: str | Path,
        commit: str,
        *,
        exclude: list[str],
        include: list[str] | None = None,
        max_file_kb: int = 512,
        restrict: set[str] | None = None,
    ) -> None:
        """Bind the source to ``commit`` of the repository at ``root``.

        ``include``/``exclude``/``max_file_kb`` are forwarded to ``RepoSource``;
        ``restrict``, when given, is the only set of repo-relative paths that
        ``iter_files`` will consider.
        """
        super().__init__(root, include=include, exclude=exclude, max_file_kb=max_file_kb)
        self.commit = commit
        self._restrict = restrict

    def iter_files(self, registry: object) -> Iterator[SourceFile]:
        """Yield a ``SourceFile`` for every tree entry that passes the
        restrict / exclude / include filters, has a language pack for its
        extension, can be read from git, and is within the size limit.

        Oversized files are noted in ``self.skipped`` (reset on each call);
        unreadable blobs are dropped silently.
        """
        self.skipped = []
        for rel in self._tree_paths():
            if self._restrict is not None and rel not in self._restrict:
                continue
            if self._is_excluded(rel) or not self._is_included(rel):
                continue
            pack = registry.for_extension(PurePosixPath(rel).suffix)  # type: ignore[attr-defined]
            if pack is None:
                continue
            raw = self._blob(rel)
            if raw is None:
                continue
            if len(raw) > self.max_file_kb * 1024:
                self.skipped.append(f"{rel} (> {self.max_file_kb}KB)")
                continue
            yield SourceFile(
                path=rel,
                text=raw.decode("utf-8", errors="replace"),
                language=pack.lang_slug,
                content_hash=hashlib.sha256(raw).hexdigest(),
            )

    def _tree_paths(self) -> list[str]:
        """Every path in the commit's tree (recursive); ``[]`` if git fails."""
        try:
            out = subprocess.run(
                ["git", "-C", str(self.root), "ls-tree", "-r", "--name-only", "-z", self.commit],
                capture_output=True,
                check=True,
            )
        except (subprocess.SubprocessError, OSError):
            return []
        # -z output is raw bytes with NUL terminators. A name that is not valid
        # UTF-8 decodes with replacement characters, will not resolve in
        # `git show`, and is therefore skipped rather than crashing the run.
        listing = out.stdout.decode("utf-8", errors="replace")
        return [p for p in listing.split("\0") if p]

    def _blob(self, rel: str) -> bytes | None:
        """Raw content of ``rel`` at ``self.commit``; ``None`` if git fails."""
        try:
            out = subprocess.run(
                ["git", "-C", str(self.root), "show", f"{self.commit}:{rel}"],
                capture_output=True,
                check=True,
            )
        except (subprocess.SubprocessError, OSError):
            return None
        return out.stdout
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# --- a recorder that records lifecycle but skips churn --------------------
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class _LifecycleOnly:
|
|
147
|
+
"""Wraps the real recorder; forwards open/close/flush but no-ops
|
|
148
|
+
``record_churn`` so replay never clobbers HEAD churn aggregates."""
|
|
149
|
+
|
|
150
|
+
def __init__(self, inner: object) -> None:
|
|
151
|
+
self._inner = inner
|
|
152
|
+
|
|
153
|
+
def open(self, symbol_ids: Iterable[str], at: str, ts: int) -> None:
|
|
154
|
+
self._inner.open(symbol_ids, at, ts) # type: ignore[attr-defined]
|
|
155
|
+
|
|
156
|
+
def close(self, symbol_ids: Iterable[str], at: str, ts: int) -> None:
|
|
157
|
+
self._inner.close(symbol_ids, at, ts) # type: ignore[attr-defined]
|
|
158
|
+
|
|
159
|
+
async def record_churn(self, *args: object, **kwargs: object) -> None:
|
|
160
|
+
return None
|
|
161
|
+
|
|
162
|
+
async def flush(self) -> None:
|
|
163
|
+
await self._inner.flush() # type: ignore[attr-defined]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# --- git helpers ----------------------------------------------------------
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _git(root: str | Path, *args: str) -> str | None:
|
|
170
|
+
try:
|
|
171
|
+
out = subprocess.run(
|
|
172
|
+
["git", "-C", str(root), *args], capture_output=True, text=True, check=True
|
|
173
|
+
)
|
|
174
|
+
except (subprocess.SubprocessError, OSError):
|
|
175
|
+
return None
|
|
176
|
+
return out.stdout
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _commit_list(root: str | Path, history: int) -> list[str]:
    """Commit shas to replay, ordered oldest→newest.

    With ``history == _FULL`` the walk is unbounded (back to the root commit);
    otherwise the newest ``history + 1`` commits are taken — one baseline plus
    ``history`` diff steps. An empty list means git produced no output.
    """
    limit = [] if history == _FULL else ["-n", str(history + 1)]
    listing = _git(root, "rev-list", "--reverse", *limit, "HEAD")
    if not listing:
        return []
    return [sha for sha in listing.splitlines() if sha]
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _commit_ts(root: str | Path, commit: str) -> int:
    """Committer timestamp (``%ct``, epoch seconds) of ``commit``.

    Falls back to 0 when git fails, prints nothing, or prints something that
    is not an integer.
    """
    stamp = _git(root, "show", "-s", "--format=%ct", commit)
    if not stamp:
        return 0
    try:
        return int(stamp.strip())
    except ValueError:
        return 0
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _is_ancestor(root: str | Path, a: str, b: str) -> bool:
|
|
199
|
+
"""True if commit ``a`` is an ancestor of (or equal to) ``b``."""
|
|
200
|
+
try:
|
|
201
|
+
return (
|
|
202
|
+
subprocess.run(
|
|
203
|
+
["git", "-C", str(root), "merge-base", "--is-ancestor", a, b],
|
|
204
|
+
capture_output=True,
|
|
205
|
+
).returncode
|
|
206
|
+
== 0
|
|
207
|
+
)
|
|
208
|
+
except (subprocess.SubprocessError, OSError):
|
|
209
|
+
return False
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _changeset(root: str | Path, parent: str, commit: str, registry: object) -> object:
    """A feat-004 ``ChangeSet`` from ``git diff --name-status -M -z`` between
    two commits, restricted to indexable files.

    ``-z`` makes git emit NUL-terminated fields with pathnames verbatim.
    Without it, names containing non-ASCII bytes or special characters are
    C-quoted, fail the extension lookup, and silently drop out of the result.

    A path is indexable when ``registry.for_extension`` knows its suffix; a
    rename is kept when either side is indexable. If git fails the result is
    an empty ``ChangeSet``.
    """
    from agentforge_graph.ingest.incremental import ChangeSet

    added: list[str] = []
    modified: list[str] = []
    deleted: list[str] = []
    renamed: list[tuple[str, str]] = []

    def indexable(p: str) -> bool:
        return registry.for_extension(PurePosixPath(p).suffix) is not None  # type: ignore[attr-defined]

    try:
        # Bytes, not text: raw pathnames are not guaranteed to decode under the
        # locale encoding, and a decode error here would abort the backfill.
        raw = subprocess.run(
            ["git", "-C", str(root), "diff", "--name-status", "-M", "-z", parent, commit],
            capture_output=True,
            check=True,
        ).stdout
    except (subprocess.SubprocessError, OSError):
        raw = b""

    # Record layout: "<status>\0<path>\0", or "<status>\0<old>\0<new>\0" for
    # renames/copies. Walk the fields with one iterator so a record's paths
    # are consumed together and the stream stays aligned.
    fields = iter(raw.decode("utf-8", errors="replace").split("\0"))
    for code in fields:
        if not code:
            continue  # empty tail after the final terminator
        if code.startswith(("R", "C")):
            old, new = next(fields, ""), next(fields, "")
            # Copies are consumed to stay in sync but not reported.
            if code.startswith("R") and old and new and (indexable(old) or indexable(new)):
                renamed.append((old, new))
            continue
        path = next(fields, "")
        if not path or not indexable(path):
            continue
        if code.startswith("A"):
            added.append(path)
        elif code.startswith("M"):
            modified.append(path)
        elif code.startswith("D"):
            deleted.append(path)
    return ChangeSet(
        added=sorted(added),
        modified=sorted(modified),
        deleted=sorted(deleted),
        renamed=renamed,
    )
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
async def _open_temp_store(tmp: Path) -> Store:
    """Open a disposable embedded store under ``tmp``.

    Always uses the ``kuzu`` graph driver and the ``lancedb`` vector driver with
    a default ``StoreConfig``, whatever backend the real configuration names —
    the replay runs locally and the store is thrown away afterwards.
    """
    from agentforge_graph.config import StoreConfig
    from agentforge_graph.store.registry import graph_driver, vector_driver

    kuzu = graph_driver("kuzu")
    graph = await kuzu.open(tmp / "graph.kuzu")
    lance = vector_driver("lancedb")
    vectors = await lance.open(tmp / "vectors.lance")
    return Store(graph, vectors, StoreConfig())
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
async def run_backfill(
    repo_path: str | Path,
    config: str | Path | None,
    history: int,
    *,
    languages: str | list[str] | None = None,
) -> BackfillReport:
    """Replay ``history`` commits into the evolution log.

    The oldest commit in range is ingested in full into a temporary store and
    its symbols are seeded; each later commit is then applied as an incremental
    refresh driven by the diff against its predecessor in the list. On success
    the oldest commit is saved as ``backfilled_through``.

    Returns a report; ``ran=False`` with a ``reason`` when the run is skipped
    (``history == 0``, temporal disabled, fewer than two commits available, or
    the requested range is already covered by a previous run).
    """
    from agentforge_graph.config import IngestConfig, StoreConfig, TemporalConfig
    from agentforge_graph.ingest.codegraph import _registry_for
    from agentforge_graph.ingest.incremental import IncrementalIndexer
    from agentforge_graph.ingest.pipeline import IngestPipeline

    if history == 0:
        return BackfillReport(ran=False, reason="history=0 (nothing to backfill)")
    temporal_cfg = TemporalConfig.load(config)
    if not temporal_cfg.enabled:
        return BackfillReport(ran=False, reason="temporal disabled")

    shas = _commit_list(repo_path, history)
    if len(shas) < 2:  # a baseline plus at least one diff step is required
        return BackfillReport(ran=False, reason="not a git repo or too few commits")

    store_cfg = StoreConfig.load(config)
    sidecar_root = Path(repo_path) / store_cfg.path
    tstore = TemporalStore.open(sidecar_root)
    oldest = shas[0]
    covered = await tstore.get_meta("backfilled_through")
    # A stored cursor at or before the requested start means nothing is missing.
    if covered and _is_ancestor(repo_path, covered, oldest):
        return BackfillReport(ran=False, reason="already backfilled", backfilled_through=covered)

    ingest_cfg = IngestConfig.load(config)
    registry = _registry_for(ingest_cfg.languages if languages is None else languages)
    repo = Path(repo_path).resolve().name
    exclude = ingest_cfg.exclude
    max_kb = ingest_cfg.max_file_kb

    recorder = build_recorder(str(sidecar_root))
    lifecycle = _LifecycleOnly(recorder)
    events_before = await tstore.count_events()

    with tempfile.TemporaryDirectory() as scratch:
        store = await _open_temp_store(Path(scratch))
        try:
            # Baseline: full ingest of the oldest commit, then seed the log
            # through the unwrapped recorder for everything alive there.
            baseline = GitBlobSource(repo_path, oldest, exclude=exclude, max_file_kb=max_kb)
            await IngestPipeline(repo=repo, commit=oldest).run(baseline, store.graph, registry)
            await seed_symbols(store.graph, recorder, oldest, _commit_ts(repo_path, oldest))

            # Steps: one incremental refresh per consecutive commit pair.
            for before, after in zip(shas, shas[1:], strict=False):
                delta = _changeset(repo_path, before, after, registry)
                if delta.is_empty():  # type: ignore[attr-defined]
                    continue
                touched = set(delta.touched_paths())  # type: ignore[attr-defined]
                step_source = GitBlobSource(
                    repo_path, after, exclude=exclude, max_file_kb=max_kb, restrict=touched
                )
                stepper = IncrementalIndexer(
                    store,
                    step_source,
                    registry,
                    repo,
                    commit=after,
                    dirty=None,
                    recorder=lifecycle,
                    commit_ts=_commit_ts(repo_path, after),
                )
                await stepper.refresh(delta)  # type: ignore[arg-type]
        finally:
            await store.close()

    # Reached only when the replay finished without raising.
    await tstore.set_meta("backfilled_through", oldest)
    return BackfillReport(
        ran=True,
        commits=len(shas),
        events_before=events_before,
        events_after=await tstore.count_events(),
        backfilled_through=oldest,
    )
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Value types for the temporal evolution log (feat-009).
|
|
2
|
+
|
|
3
|
+
An ``Event`` is one lifecycle record for a symbol (or, later, an edge): it was
|
|
4
|
+
``opened`` (first observed / re-introduced) or ``closed`` (removed) at a commit,
|
|
5
|
+
or ``succeeds`` another symbol (rename lineage). These are *commit-validity*
|
|
6
|
+
facts — when something was true in the repo — not ingestion-time facts (the
|
|
7
|
+
design's bi-temporal-lite scope; see design-009 §3).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from enum import StrEnum
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, ConfigDict
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class EventKind(StrEnum):
|
|
18
|
+
OPENED = "opened" # symbol first observed / re-introduced at `commit`
|
|
19
|
+
CLOSED = "closed" # symbol removed at `commit`
|
|
20
|
+
SUCCEEDS = "succeeds" # `symbol_id` is the successor of `ref` (rename lineage)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Entity(StrEnum):
|
|
24
|
+
NODE = "node"
|
|
25
|
+
EDGE = "edge"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Event(BaseModel):
    """A single immutable, append-only record in the evolution log."""

    model_config = ConfigDict(frozen=True)

    symbol_id: str
    event: EventKind
    commit: str
    # Commit time in epoch seconds (the git helpers in this package read `%ct`,
    # the committer date); 0 if unknown / non-git.
    ts: int = 0
    entity: Entity = Entity.NODE
    # SUCCEEDS only: the prior symbol id this one supersedes.
    ref: str | None = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# --- read-side value types (chunk 3 read APIs) ----------------------------
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class Author(BaseModel):
    """One contributor to a symbol's span, with the number of commits they
    made to it inside the mined window."""

    model_config = ConfigDict(frozen=True)

    name: str
    commits: int
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Change(BaseModel):
    """A symbol that changed since a reference commit — the element type of
    the list ``changed_since`` returns."""

    model_config = ConfigDict(frozen=True)

    symbol_id: str
    path: str
    # The EventKind value for a lifecycle change (e.g. "opened" / "closed"),
    # or "modified" for a mined modification.
    kind: str
    commit: str
    ts: int
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class SymbolHistory(BaseModel):
    """Evolution summary for one symbol: introduction and last-change commits,
    30/90-day churn, authors, and the raw lifecycle events. Read from the
    sidecar (+ the current graph for the live span)."""

    model_config = ConfigDict(frozen=True)

    symbol_id: str
    # Commit sha; taken from the earliest OPENED event when one exists,
    # otherwise from the mined aggregate.
    introduced: str = ""
    introduced_ts: int = 0
    last_changed: str = ""
    last_changed_ts: int = 0
    churn_30d: int = 0
    churn_90d: int = 0
    authors: list[Author] = []
    events: list[Event] = []
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""``TemporalIndex`` — the read side of the evolution log (feat-009 chunks 3+5).
|
|
2
|
+
|
|
3
|
+
Answers the questions an agent asks after a regression — *when was this
|
|
4
|
+
introduced, who owns it, how much does it churn, what changed since <ref>, what
|
|
5
|
+
did this look like as_of <commit>* — from the sidecar (``TemporalStore``) plus
|
|
6
|
+
the current graph. Pure reads; no mutation, no embedding.
|
|
7
|
+
|
|
8
|
+
`introduced` prefers the chunk-1 ``OPENED`` event (the exact birth commit when
|
|
9
|
+
the symbol was added during the temporal era) and falls back to the mined
|
|
10
|
+
aggregate's window-bounded estimate otherwise (design §4.5 known limitation).
|
|
11
|
+
|
|
12
|
+
``alive_at(C)`` reconstructs the set of symbols valid at commit ``C`` by
|
|
13
|
+
replaying the log: a symbol is alive iff the *last* lifecycle event at or before
|
|
14
|
+
``C`` is ``OPENED`` (design §4.7). This tolerates the spurious ``OPENED`` the
|
|
15
|
+
full-index seed stamps at HEAD — that event is *after* any historical ``C``, so
|
|
16
|
+
it never leaks into an as_of reconstruction.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import subprocess
|
|
22
|
+
from collections import defaultdict
|
|
23
|
+
from fnmatch import fnmatch
|
|
24
|
+
|
|
25
|
+
from agentforge_graph.core import GraphStore, SymbolID
|
|
26
|
+
|
|
27
|
+
from .events import Author, Change, Event, EventKind, SymbolHistory
|
|
28
|
+
from .store import TemporalStore
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class TemporalError(Exception):
    """Raised when a temporal query has no honest answer — for example an
    ``as_of`` commit older than the retention horizon — instead of silently
    returning a wrong one."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class TemporalIndex:
    """Read-only queries over the evolution log.

    Combines the sidecar ``TemporalStore`` with git lookups rooted at
    ``repo_root``. ``retention_commits`` > 0 bounds how far back ``alive_at``
    will answer; 0 (the default) means unbounded. The ``graph`` handle is
    stored but not read by the methods below.
    """

    def __init__(
        self,
        store: TemporalStore,
        graph: GraphStore,
        repo_root: str = "",
        retention_commits: int = 0,
    ) -> None:
        self._store = store
        self._graph = graph
        self._root = repo_root
        self._retention = retention_commits

    async def history(self, symbol_id: str) -> SymbolHistory:
        """Assemble the ``SymbolHistory`` for ``symbol_id`` from its events and
        its mined aggregate (either may be absent)."""
        events = await self._store.events_for(symbol_id)
        agg = await self._store.aggregate_for(symbol_id)

        # introduced: earliest OPENED event (exact) wins over the mined estimate.
        opened = [e for e in events if e.event is EventKind.OPENED]
        if opened:
            first = min(opened, key=lambda e: e.ts)
            introduced, introduced_ts = first.commit, first.ts
        elif agg is not None:
            introduced, introduced_ts = agg.introduced_sha, agg.introduced_ts
        else:
            introduced, introduced_ts = "", 0

        # last_changed: the most recent of any event or the mined last_changed
        # (on a timestamp tie the later candidate, and then the aggregate, wins).
        last, last_ts = "", 0
        for e in events:
            if e.ts >= last_ts:
                last, last_ts = e.commit, e.ts
        if agg is not None and agg.last_changed_ts >= last_ts:
            last, last_ts = agg.last_changed_sha, agg.last_changed_ts

        authors = [Author(name=n, commits=c) for n, c in (agg.top_authors if agg else [])]
        return SymbolHistory(
            symbol_id=symbol_id,
            introduced=introduced,
            introduced_ts=introduced_ts,
            last_changed=last,
            last_changed_ts=last_ts,
            churn_30d=agg.churn_30d if agg else 0,
            churn_90d=agg.churn_90d if agg else 0,
            authors=authors,
            events=events,
        )

    async def authors(self, symbol_id: str) -> list[Author]:
        """The aggregate's ``top_authors`` as ``Author`` values; ``[]`` when
        the symbol has no aggregate."""
        agg = await self._store.aggregate_for(symbol_id)
        return [Author(name=n, commits=c) for n, c in (agg.top_authors if agg else [])]

    async def churn(self, symbol_id: str, window_days: int = 90) -> int:
        """Churn for the symbol: the 30-day bucket when ``window_days <= 30``,
        otherwise the 90-day bucket; 0 when there is no aggregate."""
        agg = await self._store.aggregate_for(symbol_id)
        if agg is None:
            return 0
        return agg.churn_30d if window_days <= 30 else agg.churn_90d

    async def changed_since(self, ref: str, scope: str | None = None) -> list[Change]:
        """Symbols with recorded activity after ``ref`` (a commit-ish), newest
        first. Logged events and mined modifications both count; ``scope``
        keeps only paths matching the glob or prefix.

        Raises ``ValueError`` when ``ref`` cannot be resolved to a time.
        """
        since_ts = self._resolve_ts(ref)
        changes: dict[str, Change] = {}
        # logged events after the ref — the precise kind; a later event for the
        # same symbol replaces an earlier one
        for e in await self._store.all_events():
            if e.ts > since_ts:
                changes[e.symbol_id] = Change(
                    symbol_id=e.symbol_id,
                    path=SymbolID.parse(e.symbol_id).path,
                    kind=e.event.value,
                    commit=e.commit,
                    ts=e.ts,
                )
        # mined modifications after the ref (don't overwrite a precise lifecycle)
        for agg in await self._store.all_aggregates():
            if agg.last_changed_ts > since_ts and agg.symbol_id not in changes:
                changes[agg.symbol_id] = Change(
                    symbol_id=agg.symbol_id,
                    path=SymbolID.parse(agg.symbol_id).path,
                    kind="modified",
                    commit=agg.last_changed_sha,
                    ts=agg.last_changed_ts,
                )
        out = [c for c in changes.values() if _in_scope(c.path, scope)]
        out.sort(key=lambda c: (-c.ts, c.symbol_id))
        return out

    async def alive_at(self, commit: str) -> set[str]:
        """The set of symbol ids valid at ``commit`` — reconstructed by replaying
        the log (design §4.7). Raises ``TemporalError`` when ``commit`` is older
        than the retention horizon (its closed events may have been pruned, so
        the answer would be silently wrong), and ``ValueError`` when ``commit``
        cannot be resolved to a time.
        """
        ts = self._resolve_ts(commit)
        horizon = self._horizon_ts()
        if horizon and ts < horizon:
            raise TemporalError(
                f"{commit} is beyond the retention horizon ({self._retention} commits)"
            )
        # Only OPENED/CLOSED change aliveness. SUCCEEDS is rename lineage: if it
        # were replayed, one logged after a symbol's OPENED would become that
        # symbol's "last event" and wrongly mark it dead.
        lifecycle = (EventKind.OPENED, EventKind.CLOSED)
        by_sym: dict[str, list[Event]] = defaultdict(list)
        for e in await self._store.all_events():  # ordered by (ts, rowid)
            if e.ts <= ts and e.event in lifecycle:
                by_sym[e.symbol_id].append(e)
        # alive iff the last lifecycle event at/before C opened (not closed) it
        return {sid for sid, evs in by_sym.items() if evs[-1].event is EventKind.OPENED}

    # --- internals --------------------------------------------------------

    def _horizon_ts(self) -> int:
        """Committer time of ``HEAD~retention_commits`` (the oldest commit still
        in retention), or 0 when retention is unbounded or that commit cannot
        be resolved (e.g. history is shorter)."""
        if self._retention <= 0:
            return 0
        sha = self._git("rev-parse", f"HEAD~{self._retention}")
        return self._commit_ts(sha.strip()) if sha else 0

    def _commit_ts(self, ref: str) -> int:
        """Committer time (``%ct``, epoch seconds) of ``ref``; 0 on any failure."""
        out = self._git("show", "-s", "--format=%ct", ref)
        try:
            return int(out.strip()) if out else 0
        except ValueError:
            return 0

    def _git(self, *args: str) -> str | None:
        """Run ``git -C <repo_root> <args...>``; stdout as text, or ``None``
        when git cannot be launched or exits non-zero."""
        try:
            return subprocess.run(
                ["git", "-C", self._root, *args],
                capture_output=True,
                text=True,
                check=True,
            ).stdout
        except (subprocess.SubprocessError, OSError):
            return None

    def _resolve_ts(self, ref: str) -> int:
        """Committer time (epoch s) of ``ref``. Accepts a raw epoch int too, so
        the API is testable without a working tree.

        Raises ``ValueError`` when ``ref`` cannot be resolved, or when it starts
        with ``-`` and would be read by git as a command-line option.
        """
        # isdecimal() rather than isdigit(): every decimal string parses with
        # int(), while isdigit() also admits characters such as "²" that do not.
        if ref.isdecimal():
            return int(ref)
        if ref.startswith("-"):
            # The ref comes from the caller; never let it reach git's option
            # parser (e.g. "--output=<file>").
            raise ValueError(f"cannot resolve ref {ref!r} to a commit time")
        try:
            out = subprocess.run(
                ["git", "-C", self._root, "show", "-s", "--format=%ct", ref],
                capture_output=True,
                text=True,
                check=True,
            )
            return int(out.stdout.strip())
        except (subprocess.SubprocessError, OSError, ValueError) as exc:
            raise ValueError(f"cannot resolve ref {ref!r} to a commit time") from exc
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _in_scope(path: str, scope: str | None) -> bool:
|
|
188
|
+
if not scope:
|
|
189
|
+
return True
|
|
190
|
+
return path.startswith(scope) or fnmatch(path, scope)
|