agentforge-graph 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_graph/__init__.py +6 -0
- agentforge_graph/chunking/__init__.py +12 -0
- agentforge_graph/chunking/cast.py +159 -0
- agentforge_graph/chunking/chunk.py +19 -0
- agentforge_graph/chunking/tokens.py +15 -0
- agentforge_graph/cli.py +607 -0
- agentforge_graph/config.py +259 -0
- agentforge_graph/core/__init__.py +54 -0
- agentforge_graph/core/conformance.py +270 -0
- agentforge_graph/core/contracts.py +163 -0
- agentforge_graph/core/kinds.py +68 -0
- agentforge_graph/core/models.py +134 -0
- agentforge_graph/core/provenance.py +62 -0
- agentforge_graph/core/symbols.py +116 -0
- agentforge_graph/embed/__init__.py +28 -0
- agentforge_graph/embed/base.py +22 -0
- agentforge_graph/embed/bedrock.py +85 -0
- agentforge_graph/embed/fake.py +34 -0
- agentforge_graph/embed/openai.py +67 -0
- agentforge_graph/embed/pipeline.py +184 -0
- agentforge_graph/embed/registry.py +66 -0
- agentforge_graph/embed/report.py +15 -0
- agentforge_graph/enrich/__init__.py +70 -0
- agentforge_graph/enrich/anthropic.py +38 -0
- agentforge_graph/enrich/anthropic_client.py +109 -0
- agentforge_graph/enrich/bedrock.py +24 -0
- agentforge_graph/enrich/bedrock_client.py +115 -0
- agentforge_graph/enrich/bedrock_summarizer.py +23 -0
- agentforge_graph/enrich/claude.py +172 -0
- agentforge_graph/enrich/enricher.py +108 -0
- agentforge_graph/enrich/governs.py +173 -0
- agentforge_graph/enrich/governs_enricher.py +152 -0
- agentforge_graph/enrich/heuristics.py +224 -0
- agentforge_graph/enrich/judge.py +63 -0
- agentforge_graph/enrich/registry.py +133 -0
- agentforge_graph/enrich/report.py +60 -0
- agentforge_graph/enrich/summarizer.py +62 -0
- agentforge_graph/enrich/summary_enricher.py +211 -0
- agentforge_graph/enrich/taxonomy.py +38 -0
- agentforge_graph/frameworks/__init__.py +29 -0
- agentforge_graph/frameworks/base.py +75 -0
- agentforge_graph/frameworks/detect.py +124 -0
- agentforge_graph/frameworks/extractor.py +63 -0
- agentforge_graph/frameworks/orm.py +93 -0
- agentforge_graph/frameworks/packs/_js_ast.py +56 -0
- agentforge_graph/frameworks/packs/_python_ast.py +157 -0
- agentforge_graph/frameworks/packs/django/__init__.py +240 -0
- agentforge_graph/frameworks/packs/django/models.scm +7 -0
- agentforge_graph/frameworks/packs/express/__init__.py +133 -0
- agentforge_graph/frameworks/packs/express/routes.scm +8 -0
- agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
- agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
- agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
- agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
- agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
- agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
- agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
- agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
- agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
- agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
- agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
- agentforge_graph/frameworks/registry.py +44 -0
- agentforge_graph/ingest/__init__.py +30 -0
- agentforge_graph/ingest/codegraph.py +847 -0
- agentforge_graph/ingest/extractor.py +353 -0
- agentforge_graph/ingest/incremental/__init__.py +25 -0
- agentforge_graph/ingest/incremental/detect.py +118 -0
- agentforge_graph/ingest/incremental/dirty.py +61 -0
- agentforge_graph/ingest/incremental/indexer.py +218 -0
- agentforge_graph/ingest/incremental/meta.py +72 -0
- agentforge_graph/ingest/incremental/ports.py +39 -0
- agentforge_graph/ingest/pack.py +160 -0
- agentforge_graph/ingest/packs/__init__.py +34 -0
- agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
- agentforge_graph/ingest/packs/cpp/references.scm +15 -0
- agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
- agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
- agentforge_graph/ingest/packs/csharp/references.scm +12 -0
- agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
- agentforge_graph/ingest/packs/go/__init__.py +38 -0
- agentforge_graph/ingest/packs/go/references.scm +12 -0
- agentforge_graph/ingest/packs/go/structure.scm +64 -0
- agentforge_graph/ingest/packs/java/__init__.py +35 -0
- agentforge_graph/ingest/packs/java/references.scm +12 -0
- agentforge_graph/ingest/packs/java/structure.scm +38 -0
- agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
- agentforge_graph/ingest/packs/javascript/references.scm +11 -0
- agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
- agentforge_graph/ingest/packs/php/__init__.py +35 -0
- agentforge_graph/ingest/packs/php/references.scm +15 -0
- agentforge_graph/ingest/packs/php/structure.scm +44 -0
- agentforge_graph/ingest/packs/python/__init__.py +25 -0
- agentforge_graph/ingest/packs/python/references.scm +14 -0
- agentforge_graph/ingest/packs/python/structure.scm +57 -0
- agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
- agentforge_graph/ingest/packs/ruby/references.scm +12 -0
- agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
- agentforge_graph/ingest/packs/rust/__init__.py +39 -0
- agentforge_graph/ingest/packs/rust/references.scm +12 -0
- agentforge_graph/ingest/packs/rust/structure.scm +46 -0
- agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
- agentforge_graph/ingest/packs/typescript/references.scm +11 -0
- agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
- agentforge_graph/ingest/pipeline.py +134 -0
- agentforge_graph/ingest/report.py +84 -0
- agentforge_graph/ingest/resolver.py +467 -0
- agentforge_graph/ingest/source.py +79 -0
- agentforge_graph/knowledge/__init__.py +28 -0
- agentforge_graph/knowledge/adr.py +136 -0
- agentforge_graph/knowledge/commits.py +152 -0
- agentforge_graph/knowledge/ingest.py +312 -0
- agentforge_graph/knowledge/mentions.py +71 -0
- agentforge_graph/knowledge/report.py +32 -0
- agentforge_graph/main.py +21 -0
- agentforge_graph/providers.py +36 -0
- agentforge_graph/repomap/__init__.py +14 -0
- agentforge_graph/repomap/rank.py +161 -0
- agentforge_graph/repomap/render.py +55 -0
- agentforge_graph/repomap/repomap.py +66 -0
- agentforge_graph/retrieve/__init__.py +21 -0
- agentforge_graph/retrieve/pack.py +76 -0
- agentforge_graph/retrieve/rerank.py +251 -0
- agentforge_graph/retrieve/retriever.py +286 -0
- agentforge_graph/retrieve/scoring.py +36 -0
- agentforge_graph/serve/__init__.py +19 -0
- agentforge_graph/serve/engine.py +204 -0
- agentforge_graph/serve/http_runner.py +133 -0
- agentforge_graph/serve/server.py +110 -0
- agentforge_graph/serve/tools.py +307 -0
- agentforge_graph/store/__init__.py +32 -0
- agentforge_graph/store/_rowmap.py +102 -0
- agentforge_graph/store/errors.py +22 -0
- agentforge_graph/store/facade.py +89 -0
- agentforge_graph/store/kuzu_store.py +380 -0
- agentforge_graph/store/lance_store.py +146 -0
- agentforge_graph/store/neo4j_store.py +294 -0
- agentforge_graph/store/pgvector_store.py +170 -0
- agentforge_graph/store/registry.py +45 -0
- agentforge_graph/temporal/__init__.py +36 -0
- agentforge_graph/temporal/backfill.py +338 -0
- agentforge_graph/temporal/events.py +82 -0
- agentforge_graph/temporal/index.py +190 -0
- agentforge_graph/temporal/mining.py +190 -0
- agentforge_graph/temporal/recorder.py +114 -0
- agentforge_graph/temporal/store.py +282 -0
- agentforge_graph-0.3.2.dist-info/METADATA +291 -0
- agentforge_graph-0.3.2.dist-info/RECORD +151 -0
- agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
- agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
- agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
- agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Churn / authorship mining (feat-009 chunk 2).
|
|
2
|
+
|
|
3
|
+
A symbol's *churn* and *authorship* are ranking + ownership signals
|
|
4
|
+
(design-009 §4.5): how much it has moved lately and who has been editing it.
|
|
5
|
+
We mine them from ``git log`` over a bounded window and attribute each diff
|
|
6
|
+
hunk to the symbol(s) whose **current span** overlaps the hunk's new-line
|
|
7
|
+
range. Attribution is approximate *by design* — historical line numbers drift
|
|
8
|
+
from current spans — which is fine for a ranking signal (it is never asserted
|
|
9
|
+
as provenance).
|
|
10
|
+
|
|
11
|
+
One batched ``git log -U0`` call per refresh covers all touched paths; the
|
|
12
|
+
window (default 90d, derived from the commit's author time so results are
|
|
13
|
+
deterministic in tests) bounds cost. Output is a small, bounded
|
|
14
|
+
``SymbolAggregate`` per symbol — never a per-commit fact (design §4.10).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import re
|
|
20
|
+
import subprocess
|
|
21
|
+
from collections import defaultdict
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from datetime import UTC, datetime
|
|
24
|
+
|
|
25
|
+
# Seconds in one day; every window cut-off is computed in epoch seconds.
_DAY = 24 * 60 * 60

# Unified-diff hunk header. Group 1 is the new-side start line, group 2 the
# optional new-side line count; the old side is matched but not captured.
_HUNK = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")

# First character of every commit-header line in the ``git log`` format string,
# so a header cannot be confused with patch text.
_SOH = "\x01"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class SymbolAggregate:
    """Immutable churn/authorship rollup for a single symbol (design §4.4 aggregates)."""

    symbol_id: str
    churn_30d: int
    churn_90d: int
    top_authors: list[tuple[str, int]]  # (name, commits), ≤3, desc by commits
    introduced_sha: str
    introduced_ts: int
    last_changed_sha: str
    last_changed_ts: int

    def attrs(self) -> dict[str, object]:
        """Return the denormalised view written onto a node's ``attrs``
        (design §4.0), so ``ckg_symbol`` surfaces it with no join.

        Authors become ``{"name", "commits"}`` dicts and the two commit shas
        are exposed under the shorter keys ``introduced`` / ``last_changed``.
        """
        authors = []
        for name, commits in self.top_authors:
            authors.append({"name": name, "commits": commits})
        return dict(
            churn_30d=self.churn_30d,
            churn_90d=self.churn_90d,
            top_authors=authors,
            introduced=self.introduced_sha,
            introduced_ts=self.introduced_ts,
            last_changed=self.last_changed_sha,
            last_changed_ts=self.last_changed_ts,
        )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
class _Acc:
    """Mutable running totals for one symbol, filled in while the log is walked."""

    churn_30d: int = 0
    churn_90d: int = 0
    # author name -> shas of the commits by that author that touched the symbol
    authors: dict[str, set[str]] = field(default_factory=lambda: defaultdict(set))
    # (ts, sha) of the oldest commit seen; the huge ts means "none seen yet"
    first: tuple[int, str] = (2**62, "")
    # (ts, sha) of the newest commit seen; -1 means "none seen yet"
    last: tuple[int, str] = (-1, "")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class ChurnMiner:
|
|
69
|
+
"""Mines churn/authorship for a set of files over a commit-time window."""
|
|
70
|
+
|
|
71
|
+
def __init__(self, repo_root: str, *, now_ts: int, window_days: int = 90) -> None:
|
|
72
|
+
self._root = repo_root
|
|
73
|
+
self._now = now_ts
|
|
74
|
+
self._window = window_days
|
|
75
|
+
|
|
76
|
+
def mine(
|
|
77
|
+
self, spans_by_path: dict[str, list[tuple[str, tuple[int, int]]]]
|
|
78
|
+
) -> list[SymbolAggregate]:
|
|
79
|
+
"""Attribute windowed churn/authorship to the symbols in ``spans_by_path``
|
|
80
|
+
(``path -> [(symbol_id, (start_line, end_line)), …]``)."""
|
|
81
|
+
paths = [p for p, syms in spans_by_path.items() if syms]
|
|
82
|
+
if not paths or self._now <= 0:
|
|
83
|
+
return []
|
|
84
|
+
log = self._git_log(paths)
|
|
85
|
+
if log is None:
|
|
86
|
+
return []
|
|
87
|
+
acc: dict[str, _Acc] = defaultdict(_Acc)
|
|
88
|
+
cut30 = self._now - 30 * _DAY
|
|
89
|
+
cut90 = self._now - 90 * _DAY
|
|
90
|
+
for sha, ts, author, path, new_start, count, delta in self._hunks(log):
|
|
91
|
+
for sid in self._overlapping(spans_by_path.get(path, []), new_start, count):
|
|
92
|
+
a = acc[sid]
|
|
93
|
+
a.churn_90d += delta if ts >= cut90 else 0
|
|
94
|
+
a.churn_30d += delta if ts >= cut30 else 0
|
|
95
|
+
a.authors[author].add(sha)
|
|
96
|
+
if ts < a.first[0]:
|
|
97
|
+
a.first = (ts, sha)
|
|
98
|
+
if ts > a.last[0]:
|
|
99
|
+
a.last = (ts, sha)
|
|
100
|
+
return [self._aggregate(sid, a) for sid, a in sorted(acc.items())]
|
|
101
|
+
|
|
102
|
+
# --- internals --------------------------------------------------------
|
|
103
|
+
|
|
104
|
+
def _git_log(self, paths: list[str]) -> str | None:
|
|
105
|
+
since = datetime.fromtimestamp(max(self._now - self._window * _DAY, 0), tz=UTC).strftime(
|
|
106
|
+
"%Y-%m-%d"
|
|
107
|
+
)
|
|
108
|
+
try:
|
|
109
|
+
out = subprocess.run(
|
|
110
|
+
[
|
|
111
|
+
"git",
|
|
112
|
+
"-C",
|
|
113
|
+
self._root,
|
|
114
|
+
"log",
|
|
115
|
+
"--no-renames",
|
|
116
|
+
"--no-color",
|
|
117
|
+
"-U0",
|
|
118
|
+
f"--since={since}",
|
|
119
|
+
f"--format={_SOH}%H%x09%ct%x09%an",
|
|
120
|
+
"--",
|
|
121
|
+
*paths,
|
|
122
|
+
],
|
|
123
|
+
capture_output=True,
|
|
124
|
+
text=True,
|
|
125
|
+
check=True,
|
|
126
|
+
)
|
|
127
|
+
except (subprocess.SubprocessError, OSError):
|
|
128
|
+
return None
|
|
129
|
+
return out.stdout
|
|
130
|
+
|
|
131
|
+
def _hunks(self, log: str): # type: ignore[no-untyped-def]
|
|
132
|
+
"""Yield ``(sha, ts, author, path, new_start, line_count, churn_delta)`` per
|
|
133
|
+
hunk. ``churn_delta`` = added + deleted lines for the hunk."""
|
|
134
|
+
sha = author = path = ""
|
|
135
|
+
ts = 0
|
|
136
|
+
for line in log.splitlines():
|
|
137
|
+
if line.startswith(_SOH):
|
|
138
|
+
parts = line[1:].split("\t")
|
|
139
|
+
if len(parts) == 3:
|
|
140
|
+
sha, ts_s, author = parts
|
|
141
|
+
ts = int(ts_s) if ts_s.isdigit() else 0
|
|
142
|
+
path = ""
|
|
143
|
+
continue
|
|
144
|
+
if line.startswith("diff --git "):
|
|
145
|
+
path = ""
|
|
146
|
+
continue
|
|
147
|
+
if line.startswith("+++ b/"):
|
|
148
|
+
path = line[6:]
|
|
149
|
+
continue
|
|
150
|
+
if line.startswith("@@"):
|
|
151
|
+
m = _HUNK.match(line)
|
|
152
|
+
if not m or not path:
|
|
153
|
+
continue
|
|
154
|
+
new_start = int(m.group(1))
|
|
155
|
+
added = int(m.group(2)) if m.group(2) is not None else 1
|
|
156
|
+
deleted = self._removed(line)
|
|
157
|
+
yield sha, ts, author, path, new_start, added, added + deleted
|
|
158
|
+
|
|
159
|
+
@staticmethod
|
|
160
|
+
def _removed(hunk: str) -> int:
|
|
161
|
+
m = re.match(r"^@@ -\d+(?:,(\d+))? \+", hunk)
|
|
162
|
+
if not m:
|
|
163
|
+
return 0
|
|
164
|
+
return int(m.group(1)) if m.group(1) is not None else 1
|
|
165
|
+
|
|
166
|
+
@staticmethod
|
|
167
|
+
def _overlapping(
|
|
168
|
+
syms: list[tuple[str, tuple[int, int]]], new_start: int, count: int
|
|
169
|
+
) -> list[str]:
|
|
170
|
+
lo = new_start
|
|
171
|
+
hi = new_start + max(count, 1) - 1
|
|
172
|
+
return [sid for sid, (s, e) in syms if not (e < lo or s > hi)]
|
|
173
|
+
|
|
174
|
+
def _aggregate(self, sid: str, a: _Acc) -> SymbolAggregate:
|
|
175
|
+
top = sorted(
|
|
176
|
+
((name, len(shas)) for name, shas in a.authors.items()),
|
|
177
|
+
key=lambda t: (-t[1], t[0]),
|
|
178
|
+
)[:3]
|
|
179
|
+
intro_ts, intro_sha = a.first if a.first[1] else (0, "")
|
|
180
|
+
last_ts, last_sha = a.last if a.last[1] else (0, "")
|
|
181
|
+
return SymbolAggregate(
|
|
182
|
+
symbol_id=sid,
|
|
183
|
+
churn_30d=a.churn_30d,
|
|
184
|
+
churn_90d=a.churn_90d,
|
|
185
|
+
top_authors=top,
|
|
186
|
+
introduced_sha=intro_sha,
|
|
187
|
+
introduced_ts=intro_ts,
|
|
188
|
+
last_changed_sha=last_sha,
|
|
189
|
+
last_changed_ts=last_ts,
|
|
190
|
+
)
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""``SqliteTemporalRecorder`` — the write port the indexer drives (feat-009).
|
|
2
|
+
|
|
3
|
+
The ``IncrementalIndexer`` calls ``open``/``close`` as it applies a diff (it is
|
|
4
|
+
the only writer that sees both the old and new state of a file); the recorder
|
|
5
|
+
buffers those into ``Event``s and writes them in a single transaction on
|
|
6
|
+
``flush()`` at end-of-refresh — mirroring how ``IndexMeta`` is saved last, so a
|
|
7
|
+
crash leaves a consistent log. ``open``/``close`` are sync (buffering); only
|
|
8
|
+
``flush`` touches SQLite.
|
|
9
|
+
|
|
10
|
+
Structurally satisfies ``ingest.incremental.ports.TemporalRecorder`` so the
|
|
11
|
+
deterministic ``ingest`` layer depends on a Protocol, not on ``temporal``.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections.abc import Iterable
|
|
17
|
+
|
|
18
|
+
from agentforge_graph.core import GraphQuery, GraphStore, NodeKind, SymbolID
|
|
19
|
+
|
|
20
|
+
from .events import Entity, Event, EventKind
|
|
21
|
+
from .mining import ChurnMiner
|
|
22
|
+
from .store import TemporalStore
|
|
23
|
+
|
|
24
|
+
# Passed as the ``limit`` of graph queries that are meant to return every symbol.
_ALL = 10**7

# Node kinds that receive lifecycle events and churn attribution.
_SYMBOL_KINDS = (NodeKind.CLASS, NodeKind.FUNCTION, NodeKind.METHOD)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class SqliteTemporalRecorder:
    """Buffers node lifecycle events and hands them to a ``TemporalStore``.

    ``open``/``close`` only append to an in-memory buffer; ``flush`` is the
    single place where the buffered events are written to the store.
    """

    def __init__(self, store: TemporalStore) -> None:
        self._store = store
        self._buf: list[Event] = []

    def open(self, symbol_ids: Iterable[str], at: str, ts: int) -> None:
        """Buffer one OPENED node event per symbol id, at commit ``at`` / time ``ts``."""
        self._buf.extend(
            Event(symbol_id=sid, event=EventKind.OPENED, commit=at, ts=ts, entity=Entity.NODE)
            for sid in symbol_ids
        )

    def close(self, symbol_ids: Iterable[str], at: str, ts: int) -> None:
        """Buffer one CLOSED node event per symbol id, at commit ``at`` / time ``ts``."""
        self._buf.extend(
            Event(symbol_id=sid, event=EventKind.CLOSED, commit=at, ts=ts, entity=Entity.NODE)
            for sid in symbol_ids
        )

    async def record_churn(
        self,
        graph: GraphStore,
        repo_root: str,
        paths: Iterable[str],
        commit: str,
        commit_ts: int,
    ) -> None:
        """Mine churn/authorship for ``paths``, persist aggregates, and
        denormalise them onto the matching node ``attrs`` (design §4.5).

        Cheap on a small diff; a no-op when nothing maps or the commit time is
        unknown (non-git). ``commit`` is accepted for the caller's convenience
        but is not read here; only ``commit_ts`` anchors the mining window.
        """
        if commit_ts <= 0:
            return
        spans = await self._spans_by_path(graph, set(paths))
        if not spans:
            return
        aggs = ChurnMiner(repo_root, now_ts=commit_ts).mine(spans)
        if not aggs:
            return
        await self._store.upsert_aggregates(aggs)
        for agg in aggs:
            await graph.set_attrs(agg.symbol_id, agg.attrs())

    @staticmethod
    async def _spans_by_path(
        graph: GraphStore, paths: set[str]
    ) -> dict[str, list[tuple[str, tuple[int, int]]]]:
        """``path -> [(symbol_id, span), …]`` for the code symbols in ``paths``
        that carry a span (the attribution targets)."""
        out: dict[str, list[tuple[str, tuple[int, int]]]] = {}
        # Every symbol node is fetched and then filtered by path on this side.
        nodes = (await graph.query(GraphQuery(kinds=list(_SYMBOL_KINDS), limit=_ALL))).nodes
        for n in nodes:
            if n.span is None:
                continue
            path = SymbolID.parse(n.id).path
            if path in paths:
                out.setdefault(path, []).append((n.id, n.span))
        return out

    async def flush(self) -> None:
        """Write every buffered event to the store and empty the buffer.

        The buffer is detached before the write so events buffered while the
        write is in flight are not sent twice. If the write fails, the detached
        events are put back at the front of the buffer and the error is
        re-raised, so a later ``flush`` can retry instead of silently losing
        them.
        """
        if not self._buf:
            return
        events, self._buf = self._buf, []
        try:
            await self._store.record(events)
        except BaseException:
            # Includes cancellation: keep the events, ahead of anything that was
            # buffered in the meantime, then let the error propagate.
            self._buf[:0] = events
            raise
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def build_recorder(root: str) -> SqliteTemporalRecorder:
    """Return a recorder backed by the sidecar opened under ``root`` (the
    ``.ckg`` dir)."""
    store = TemporalStore.open(root)
    return SqliteTemporalRecorder(store)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
async def seed_symbols(
    graph: GraphStore,
    recorder: SqliteTemporalRecorder,
    commit: str,
    ts: int,
    repo_root: str = "",
) -> None:
    """Anchor every code symbol currently in the graph at ``commit``.

    Used after a full index: an OPENED event is buffered for each symbol and
    flushed, so 'introduced' points at the index commit. When ``repo_root`` is
    given, churn/authorship is then mined for every path those symbols live in,
    so a fresh index already carries the ranking signal. Idempotent: re-indexing
    the same commit re-opens the same events (deduped by the store).
    """
    everything = GraphQuery(kinds=list(_SYMBOL_KINDS), limit=_ALL)
    result = await graph.query(everything)
    symbols = result.nodes

    symbol_ids = [node.id for node in symbols]
    recorder.open(symbol_ids, commit, ts)
    await recorder.flush()

    if not repo_root:
        return
    touched = {SymbolID.parse(node.id).path for node in symbols}
    await recorder.record_churn(graph, repo_root, touched, commit, ts)
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""``TemporalStore`` — the append-only evolution log (feat-009).
|
|
2
|
+
|
|
3
|
+
A stdlib-``sqlite3`` sidecar at ``.ckg/temporal.db``, deliberately *separate*
|
|
4
|
+
from the graph/vector stores: it keeps the current-graph hot path and both
|
|
5
|
+
store adapters untouched (design-009 §4.2), is trivially prunable, and is absent
|
|
6
|
+
for non-git / temporal-off repos. Writes are append-only and idempotent per
|
|
7
|
+
``(symbol_id, commit, event, ref)`` so a crashed-then-retried refresh stays
|
|
8
|
+
consistent.
|
|
9
|
+
|
|
10
|
+
Chunk 1 implements the ``events`` table (node lifecycle); the ``aggregates``
|
|
11
|
+
table (churn/authorship, chunk 2) is created here and written by ``upsert_aggregates``.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import asyncio
|
|
17
|
+
import json
|
|
18
|
+
import sqlite3
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from .events import Entity, Event, EventKind
|
|
23
|
+
from .mining import SymbolAggregate
|
|
24
|
+
|
|
25
|
+
# File name of the SQLite sidecar inside the root directory given to ``open``.
_DB = "temporal.db"

# DDL run, in order, each time the store is opened. Every statement is
# ``IF NOT EXISTS``, so running it against an existing database changes nothing.
_SCHEMA = [
    # events: one row per lifecycle fact.
    """CREATE TABLE IF NOT EXISTS events (
        symbol_id TEXT NOT NULL,
        entity TEXT NOT NULL,
        event TEXT NOT NULL,
        commit_sha TEXT NOT NULL,
        ts INTEGER NOT NULL,
        ref TEXT
    )""",
    # Idempotency: recording the same lifecycle fact twice is a no-op. COALESCE
    # makes a NULL ref take part in uniqueness (SQLite treats NULLs as distinct).
    """CREATE UNIQUE INDEX IF NOT EXISTS events_unique
        ON events(symbol_id, commit_sha, event, COALESCE(ref, ''))""",
    "CREATE INDEX IF NOT EXISTS events_by_symbol ON events(symbol_id)",
    "CREATE INDEX IF NOT EXISTS events_by_ts ON events(ts)",
    # aggregates: periodic, bounded churn/authorship rollups (chunk 2).
    """CREATE TABLE IF NOT EXISTS aggregates (
        symbol_id TEXT PRIMARY KEY,
        churn_30d INTEGER,
        churn_90d INTEGER,
        top_authors TEXT,
        introduced_sha TEXT,
        introduced_ts INTEGER,
        last_changed_sha TEXT,
        last_changed_ts INTEGER
    )""",
    # meta: small key/value side-table (chunk 4 backfill cursor, etc.).
    "CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT)",
]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class TemporalStore:
    """Embedded SQLite evolution log. Opened once per index/refresh; each
    operation uses its own short-lived connection (SQLite connections are not
    shareable across threads, and ops run via ``asyncio.to_thread``)."""

    def __init__(self, path: Path) -> None:
        self._path = path

    @property
    def path(self) -> Path:
        """Location of the SQLite file this store reads and writes."""
        return self._path

    @classmethod
    def open(cls, root: str | Path) -> TemporalStore:
        """Create (if needed) the sidecar under ``root`` (the ``.ckg`` dir),
        make sure the schema exists, and return a store bound to it."""
        db_path = Path(root) / _DB
        db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(str(db_path))
        try:
            for statement in _SCHEMA:
                conn.execute(statement)
            conn.commit()
        finally:
            conn.close()
        return cls(db_path)

    def _with_conn(self, work: Any) -> Any:
        """Open a fresh connection, return ``work(conn)``, and always close it."""
        conn = sqlite3.connect(str(self._path))
        try:
            return work(conn)
        finally:
            conn.close()

    # --- events (node lifecycle) ------------------------------------------

    async def record(self, events: list[Event]) -> int:
        """Append events; return the number newly inserted (duplicates ignored)."""
        if events:
            return await asyncio.to_thread(self._record_sync, events)
        return 0

    def _record_sync(self, events: list[Event]) -> int:
        def insert(conn: sqlite3.Connection) -> int:
            cursor = conn.executemany(
                "INSERT OR IGNORE INTO events"
                "(symbol_id, entity, event, commit_sha, ts, ref) VALUES (?, ?, ?, ?, ?, ?)",
                [
                    (ev.symbol_id, ev.entity.value, ev.event.value, ev.commit, ev.ts, ev.ref)
                    for ev in events
                ],
            )
            conn.commit()
            written = cursor.rowcount
            return 0 if written is None else written

        return self._with_conn(insert)

    async def events_for(self, symbol_id: str) -> list[Event]:
        """All events for one symbol, oldest first."""
        return await asyncio.to_thread(self._events_for_sync, symbol_id)

    def _events_for_sync(self, symbol_id: str) -> list[Event]:
        def select(conn: sqlite3.Connection) -> list[Any]:
            cursor = conn.execute(
                "SELECT symbol_id, entity, event, commit_sha, ts, ref FROM events "
                "WHERE symbol_id = ? ORDER BY ts, rowid",
                (symbol_id,),
            )
            return cursor.fetchall()

        # Rows are converted only after the connection has been closed.
        return [_row_to_event(row) for row in self._with_conn(select)]

    async def all_events(self) -> list[Event]:
        """Every event, oldest first (test/inspection helper)."""
        return await asyncio.to_thread(self._all_events_sync)

    def _all_events_sync(self) -> list[Event]:
        def select(conn: sqlite3.Connection) -> list[Any]:
            cursor = conn.execute(
                "SELECT symbol_id, entity, event, commit_sha, ts, ref FROM events "
                "ORDER BY ts, rowid"
            )
            return cursor.fetchall()

        return [_row_to_event(row) for row in self._with_conn(select)]

    # --- meta key/value (chunk 4 resume cursor) ---------------------------

    async def get_meta(self, key: str) -> str | None:
        """Return the value stored under ``key``, or ``None`` when absent."""
        return await asyncio.to_thread(self._get_meta_sync, key)

    def _get_meta_sync(self, key: str) -> str | None:
        def select(conn: sqlite3.Connection) -> Any:
            return conn.execute("SELECT value FROM meta WHERE key = ?", (key,)).fetchone()

        found = self._with_conn(select)
        if found:
            return found[0]
        return None

    async def set_meta(self, key: str, value: str) -> None:
        """Store ``value`` under ``key``, replacing any previous value."""
        await asyncio.to_thread(self._set_meta_sync, key, value)

    def _set_meta_sync(self, key: str, value: str) -> None:
        def write(conn: sqlite3.Connection) -> None:
            conn.execute("INSERT OR REPLACE INTO meta(key, value) VALUES (?, ?)", (key, value))
            conn.commit()

        self._with_conn(write)

    async def count_events(self) -> int:
        """Total events in the log (for ``ckg status``)."""
        return await asyncio.to_thread(self._count_events_sync)

    def _count_events_sync(self) -> int:
        def count(conn: sqlite3.Connection) -> int:
            return int(conn.execute("SELECT COUNT(*) FROM events").fetchone()[0])

        return self._with_conn(count)

    # --- aggregates (churn / authorship, chunk 2) -------------------------

    async def upsert_aggregates(self, aggs: list[SymbolAggregate]) -> int:
        """Insert/replace churn/authorship rollups; return rows written."""
        if aggs:
            return await asyncio.to_thread(self._upsert_aggregates_sync, aggs)
        return 0

    def _upsert_aggregates_sync(self, aggs: list[SymbolAggregate]) -> int:
        def write(conn: sqlite3.Connection) -> int:
            cursor = conn.executemany(
                "INSERT OR REPLACE INTO aggregates"
                "(symbol_id, churn_30d, churn_90d, top_authors,"
                " introduced_sha, introduced_ts, last_changed_sha, last_changed_ts)"
                " VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                [
                    (
                        agg.symbol_id,
                        agg.churn_30d,
                        agg.churn_90d,
                        # authors are stored as a JSON list of {"name", "commits"}
                        json.dumps([{"name": n, "commits": c} for n, c in agg.top_authors]),
                        agg.introduced_sha,
                        agg.introduced_ts,
                        agg.last_changed_sha,
                        agg.last_changed_ts,
                    )
                    for agg in aggs
                ],
            )
            conn.commit()
            written = cursor.rowcount
            return 0 if written is None else written

        return self._with_conn(write)

    async def all_aggregates(self) -> list[SymbolAggregate]:
        """Every stored rollup (for ``changed_since`` scans)."""
        return await asyncio.to_thread(self._all_aggregates_sync)

    def _all_aggregates_sync(self) -> list[SymbolAggregate]:
        def select(conn: sqlite3.Connection) -> list[Any]:
            cursor = conn.execute(
                "SELECT symbol_id, churn_30d, churn_90d, top_authors,"
                " introduced_sha, introduced_ts, last_changed_sha, last_changed_ts"
                " FROM aggregates"
            )
            return cursor.fetchall()

        return [_row_to_aggregate(row) for row in self._with_conn(select)]

    async def aggregate_for(self, symbol_id: str) -> SymbolAggregate | None:
        """The stored rollup for one symbol, or ``None``."""
        return await asyncio.to_thread(self._aggregate_for_sync, symbol_id)

    def _aggregate_for_sync(self, symbol_id: str) -> SymbolAggregate | None:
        def select(conn: sqlite3.Connection) -> Any:
            cursor = conn.execute(
                "SELECT symbol_id, churn_30d, churn_90d, top_authors,"
                " introduced_sha, introduced_ts, last_changed_sha, last_changed_ts"
                " FROM aggregates WHERE symbol_id = ?",
                (symbol_id,),
            )
            return cursor.fetchone()

        found = self._with_conn(select)
        if found:
            return _row_to_aggregate(found)
        return None

    async def prune(self, before_ts: int) -> int:
        """Delete CLOSED events older than ``before_ts`` (retention horizon).
        OPENED events are kept (they anchor 'introduced'); full retention math
        lands in chunk 5. Returns rows removed."""
        return await asyncio.to_thread(self._prune_sync, before_ts)

    def _prune_sync(self, before_ts: int) -> int:
        def delete(conn: sqlite3.Connection) -> int:
            cursor = conn.execute(
                "DELETE FROM events WHERE event = ? AND ts < ?",
                (EventKind.CLOSED.value, before_ts),
            )
            conn.commit()
            removed = cursor.rowcount
            return 0 if removed is None else removed

        return self._with_conn(delete)
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _row_to_event(r: tuple[Any, ...]) -> Event:
    """Rebuild an ``Event`` from a row selected as
    ``(symbol_id, entity, event, commit_sha, ts, ref)``; the stored entity and
    event strings are turned back into their enum members."""
    entity = Entity(r[1])
    kind = EventKind(r[2])
    return Event(symbol_id=r[0], entity=entity, event=kind, commit=r[3], ts=r[4], ref=r[5])
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _row_to_aggregate(r: tuple[Any, ...]) -> SymbolAggregate:
    """Rebuild a ``SymbolAggregate`` from an ``aggregates`` row.

    The row is expected in the order ``(symbol_id, churn_30d, churn_90d,
    top_authors, introduced_sha, introduced_ts, last_changed_sha,
    last_changed_ts)``. Every column except ``symbol_id`` is nullable in the
    schema, so each one is given a neutral default (``0``, ``""`` or ``[]``);
    that includes the two churn counters, which would otherwise surface as
    ``None`` in an ``int`` field.
    """
    # top_authors is stored as a JSON list of {"name", "commits"} objects.
    authors = [(a["name"], a["commits"]) for a in json.loads(r[3] or "[]")]
    return SymbolAggregate(
        symbol_id=r[0],
        churn_30d=r[1] or 0,
        churn_90d=r[2] or 0,
        top_authors=authors,
        introduced_sha=r[4] or "",
        introduced_ts=r[5] or 0,
        last_changed_sha=r[6] or "",
        last_changed_ts=r[7] or 0,
    )
|