agentforge-graph 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_graph/__init__.py +6 -0
- agentforge_graph/chunking/__init__.py +12 -0
- agentforge_graph/chunking/cast.py +159 -0
- agentforge_graph/chunking/chunk.py +19 -0
- agentforge_graph/chunking/tokens.py +15 -0
- agentforge_graph/cli.py +607 -0
- agentforge_graph/config.py +259 -0
- agentforge_graph/core/__init__.py +54 -0
- agentforge_graph/core/conformance.py +270 -0
- agentforge_graph/core/contracts.py +163 -0
- agentforge_graph/core/kinds.py +68 -0
- agentforge_graph/core/models.py +134 -0
- agentforge_graph/core/provenance.py +62 -0
- agentforge_graph/core/symbols.py +116 -0
- agentforge_graph/embed/__init__.py +28 -0
- agentforge_graph/embed/base.py +22 -0
- agentforge_graph/embed/bedrock.py +85 -0
- agentforge_graph/embed/fake.py +34 -0
- agentforge_graph/embed/openai.py +67 -0
- agentforge_graph/embed/pipeline.py +184 -0
- agentforge_graph/embed/registry.py +66 -0
- agentforge_graph/embed/report.py +15 -0
- agentforge_graph/enrich/__init__.py +70 -0
- agentforge_graph/enrich/anthropic.py +38 -0
- agentforge_graph/enrich/anthropic_client.py +109 -0
- agentforge_graph/enrich/bedrock.py +24 -0
- agentforge_graph/enrich/bedrock_client.py +115 -0
- agentforge_graph/enrich/bedrock_summarizer.py +23 -0
- agentforge_graph/enrich/claude.py +172 -0
- agentforge_graph/enrich/enricher.py +108 -0
- agentforge_graph/enrich/governs.py +173 -0
- agentforge_graph/enrich/governs_enricher.py +152 -0
- agentforge_graph/enrich/heuristics.py +224 -0
- agentforge_graph/enrich/judge.py +63 -0
- agentforge_graph/enrich/registry.py +133 -0
- agentforge_graph/enrich/report.py +60 -0
- agentforge_graph/enrich/summarizer.py +62 -0
- agentforge_graph/enrich/summary_enricher.py +211 -0
- agentforge_graph/enrich/taxonomy.py +38 -0
- agentforge_graph/frameworks/__init__.py +29 -0
- agentforge_graph/frameworks/base.py +75 -0
- agentforge_graph/frameworks/detect.py +124 -0
- agentforge_graph/frameworks/extractor.py +63 -0
- agentforge_graph/frameworks/orm.py +93 -0
- agentforge_graph/frameworks/packs/_js_ast.py +56 -0
- agentforge_graph/frameworks/packs/_python_ast.py +157 -0
- agentforge_graph/frameworks/packs/django/__init__.py +240 -0
- agentforge_graph/frameworks/packs/django/models.scm +7 -0
- agentforge_graph/frameworks/packs/express/__init__.py +133 -0
- agentforge_graph/frameworks/packs/express/routes.scm +8 -0
- agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
- agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
- agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
- agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
- agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
- agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
- agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
- agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
- agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
- agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
- agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
- agentforge_graph/frameworks/registry.py +44 -0
- agentforge_graph/ingest/__init__.py +30 -0
- agentforge_graph/ingest/codegraph.py +847 -0
- agentforge_graph/ingest/extractor.py +353 -0
- agentforge_graph/ingest/incremental/__init__.py +25 -0
- agentforge_graph/ingest/incremental/detect.py +118 -0
- agentforge_graph/ingest/incremental/dirty.py +61 -0
- agentforge_graph/ingest/incremental/indexer.py +218 -0
- agentforge_graph/ingest/incremental/meta.py +72 -0
- agentforge_graph/ingest/incremental/ports.py +39 -0
- agentforge_graph/ingest/pack.py +160 -0
- agentforge_graph/ingest/packs/__init__.py +34 -0
- agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
- agentforge_graph/ingest/packs/cpp/references.scm +15 -0
- agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
- agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
- agentforge_graph/ingest/packs/csharp/references.scm +12 -0
- agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
- agentforge_graph/ingest/packs/go/__init__.py +38 -0
- agentforge_graph/ingest/packs/go/references.scm +12 -0
- agentforge_graph/ingest/packs/go/structure.scm +64 -0
- agentforge_graph/ingest/packs/java/__init__.py +35 -0
- agentforge_graph/ingest/packs/java/references.scm +12 -0
- agentforge_graph/ingest/packs/java/structure.scm +38 -0
- agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
- agentforge_graph/ingest/packs/javascript/references.scm +11 -0
- agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
- agentforge_graph/ingest/packs/php/__init__.py +35 -0
- agentforge_graph/ingest/packs/php/references.scm +15 -0
- agentforge_graph/ingest/packs/php/structure.scm +44 -0
- agentforge_graph/ingest/packs/python/__init__.py +25 -0
- agentforge_graph/ingest/packs/python/references.scm +14 -0
- agentforge_graph/ingest/packs/python/structure.scm +57 -0
- agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
- agentforge_graph/ingest/packs/ruby/references.scm +12 -0
- agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
- agentforge_graph/ingest/packs/rust/__init__.py +39 -0
- agentforge_graph/ingest/packs/rust/references.scm +12 -0
- agentforge_graph/ingest/packs/rust/structure.scm +46 -0
- agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
- agentforge_graph/ingest/packs/typescript/references.scm +11 -0
- agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
- agentforge_graph/ingest/pipeline.py +134 -0
- agentforge_graph/ingest/report.py +84 -0
- agentforge_graph/ingest/resolver.py +467 -0
- agentforge_graph/ingest/source.py +79 -0
- agentforge_graph/knowledge/__init__.py +28 -0
- agentforge_graph/knowledge/adr.py +136 -0
- agentforge_graph/knowledge/commits.py +152 -0
- agentforge_graph/knowledge/ingest.py +312 -0
- agentforge_graph/knowledge/mentions.py +71 -0
- agentforge_graph/knowledge/report.py +32 -0
- agentforge_graph/main.py +21 -0
- agentforge_graph/providers.py +36 -0
- agentforge_graph/repomap/__init__.py +14 -0
- agentforge_graph/repomap/rank.py +161 -0
- agentforge_graph/repomap/render.py +55 -0
- agentforge_graph/repomap/repomap.py +66 -0
- agentforge_graph/retrieve/__init__.py +21 -0
- agentforge_graph/retrieve/pack.py +76 -0
- agentforge_graph/retrieve/rerank.py +251 -0
- agentforge_graph/retrieve/retriever.py +286 -0
- agentforge_graph/retrieve/scoring.py +36 -0
- agentforge_graph/serve/__init__.py +19 -0
- agentforge_graph/serve/engine.py +204 -0
- agentforge_graph/serve/http_runner.py +133 -0
- agentforge_graph/serve/server.py +110 -0
- agentforge_graph/serve/tools.py +307 -0
- agentforge_graph/store/__init__.py +32 -0
- agentforge_graph/store/_rowmap.py +102 -0
- agentforge_graph/store/errors.py +22 -0
- agentforge_graph/store/facade.py +89 -0
- agentforge_graph/store/kuzu_store.py +380 -0
- agentforge_graph/store/lance_store.py +146 -0
- agentforge_graph/store/neo4j_store.py +294 -0
- agentforge_graph/store/pgvector_store.py +170 -0
- agentforge_graph/store/registry.py +45 -0
- agentforge_graph/temporal/__init__.py +36 -0
- agentforge_graph/temporal/backfill.py +338 -0
- agentforge_graph/temporal/events.py +82 -0
- agentforge_graph/temporal/index.py +190 -0
- agentforge_graph/temporal/mining.py +190 -0
- agentforge_graph/temporal/recorder.py +114 -0
- agentforge_graph/temporal/store.py +282 -0
- agentforge_graph-0.3.2.dist-info/METADATA +291 -0
- agentforge_graph-0.3.2.dist-info/RECORD +151 -0
- agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
- agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
- agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
- agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""``ADRParser`` — parse an architecture decision record markdown file into a
|
|
2
|
+
format-neutral ``ParsedADR`` (feat-010 MVP).
|
|
3
|
+
|
|
4
|
+
Tolerant of MADR (YAML frontmatter), Nygard, and adr-tools layouts: read a
|
|
5
|
+
frontmatter block if present, else scan headings/lines for the title, status,
|
|
6
|
+
date, and supersedes link. A file that yields no recognisable ADR shape still
|
|
7
|
+
becomes a ``Decision`` titled from its filename — degrade, never drop (spec §8).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import PurePosixPath
|
|
15
|
+
|
|
16
|
+
import yaml
|
|
17
|
+
|
|
18
|
+
# Lifecycle states a decision may be in; the parser maps any other value to "proposed".
_STATUSES = {"proposed", "accepted", "superseded", "deprecated", "rejected"}
# First ISO-style calendar date (YYYY-MM-DD) found in free text.
_DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
# Matches "Supersedes ADR-0007", "supersede 0007", "supersedes adr7".
# It does NOT match the reverse-direction phrase "Superseded by ADR-0012".
_SUPERSEDES_RE = re.compile(r"supersedes?\s+(?:adr-?)?(\d+)", re.IGNORECASE)
# First run of digits in a string (applied to filename stems and frontmatter values).
_ADR_NUM_RE = re.compile(r"(\d+)")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class DocSection:
    """One markdown section: a heading plus the text that follows it."""

    heading: str  # heading text without the leading '#' markers ("" for a preamble)
    text: str  # section body with surrounding whitespace stripped
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class ParsedADR:
    """Format-neutral result of parsing one architecture decision record."""

    title: str  # frontmatter title, first heading, or a filename-derived fallback
    status: str = "proposed"  # one of _STATUSES; "proposed" when nothing recognisable is found
    date: str = ""  # frontmatter date, else the first YYYY-MM-DD in the body, else ""
    adr_id: str = ""  # e.g. "ADR-0012"; ADRParser builds it from the first number in the filename stem
    supersedes_num: str = ""  # the numeric id of a superseded ADR, if any
    body: str = ""  # markdown text with any frontmatter block removed
    sections: list[DocSection] = field(default_factory=list)  # body split at heading lines
    well_formed: bool = True  # False → fell back to filename title
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _split_frontmatter(text: str) -> tuple[dict[str, object], str]:
    """Split a leading YAML frontmatter block off a markdown document.

    Frontmatter is recognised only when the first line is exactly ``---`` and a
    later line is exactly ``---`` (trailing whitespace tolerated). A prefix
    match would also accept ``----`` rules or ``---foo`` lines.

    Args:
        text: Full file contents.

    Returns:
        ``(mapping, body)``. ``mapping`` is the parsed frontmatter, or ``{}``
        when there is none. ``body`` is the text after the closing delimiter,
        or the *unchanged* ``text`` whenever the leading block is missing,
        unterminated, invalid YAML, or not a mapping — so content between two
        horizontal rules is never silently dropped.
    """
    lines = text.splitlines(keepends=True)
    if not lines or lines[0].rstrip() != "---":
        return {}, text
    for idx in range(1, len(lines)):
        if lines[idx].rstrip() == "---":
            break
    else:
        # no closing delimiter: treat the whole file as body
        return {}, text
    block = "".join(lines[1:idx])
    remainder = "".join(lines[idx + 1 :]).lstrip("\r\n")
    try:
        data = yaml.safe_load(block)
    except yaml.YAMLError:
        return {}, text
    if data is None:
        # an empty frontmatter block is valid; there is simply nothing in it
        return {}, remainder
    if not isinstance(data, dict):
        # a scalar/list is prose between rules, not frontmatter — keep it in the body
        return {}, text
    return data, remainder
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _sections(body: str) -> list[DocSection]:
    """Split markdown into sections at heading lines.

    A heading is a line that starts with ``#`` at column 0 and is not inside a
    fenced code block (``` or ~~~). Without the fence check, shell or Python
    comments in code samples would be mistaken for headings. Text before the
    first heading becomes a section with an empty heading.

    Args:
        body: Markdown text (frontmatter already removed, if any).

    Returns:
        Sections in document order; entries with neither heading nor text are
        omitted.
    """
    sections: list[DocSection] = []
    heading = ""
    buf: list[str] = []
    fence = ""  # the marker that opened the current code fence, "" when outside one
    for line in body.splitlines():
        stripped = line.lstrip()
        if fence:
            # inside a fence everything is content; only the same marker closes it
            if stripped.startswith(fence):
                fence = ""
            buf.append(line)
            continue
        if stripped.startswith(("```", "~~~")):
            fence = stripped[:3]
            buf.append(line)
            continue
        if line.startswith("#"):
            if buf or heading:
                sections.append(DocSection(heading=heading, text="\n".join(buf).strip()))
            heading = line.lstrip("#").strip()
            buf = []
        else:
            buf.append(line)
    if buf or heading:
        sections.append(DocSection(heading=heading, text="\n".join(buf).strip()))
    return [s for s in sections if s.text or s.heading]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _first_heading_title(body: str) -> str:
|
|
78
|
+
for line in body.splitlines():
|
|
79
|
+
s = line.strip()
|
|
80
|
+
if s.startswith("#"):
|
|
81
|
+
return s.lstrip("#").strip()
|
|
82
|
+
return ""
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _status_from(text: str) -> str:
    """Find an ADR status word in ``text``.

    Explicit declarations are tried first, so a status word that merely
    appears in the title or prose (e.g. "# Deprecated API removal") cannot
    shadow the real status:

    1. a ``Status: accepted`` line (heading markers, bullets and emphasis
       around ``Status`` are tolerated), or the first non-blank line under a
       heading whose text is exactly ``Status``;
    2. otherwise, the first line containing any status word as a bare token.

    Args:
        text: Markdown body, or any free text such as a frontmatter value.

    Returns:
        A member of ``_STATUSES``, or ``""`` when none is found.
    """
    lines = text.splitlines()

    def _first_status(fragment: str) -> str:
        # split on non-letters so "Accepted." and "**accepted**" are recognised
        for token in re.split(r"[^a-z]+", fragment.lower()):
            if token in _STATUSES:
                return token
        return ""

    # Pass 1: explicit "Status:" lines and "## Status" sections.
    under_status_heading = False
    for raw in lines:
        stripped = raw.strip()
        if not stripped:
            continue
        declared = re.match(r"^[#>*+\-_\s]*status[*_\s]*:(.*)$", stripped, re.IGNORECASE)
        if declared:
            found = _first_status(declared.group(1))
            if found:
                return found
            under_status_heading = False
            continue
        if stripped.startswith("#"):
            under_status_heading = stripped.lstrip("#").strip().lower() == "status"
            continue
        if under_status_heading:
            found = _first_status(stripped)
            if found:
                return found
            # only the first non-blank line under the heading is examined
            under_status_heading = False

    # Pass 2: bare status word anywhere, first matching line wins.
    for raw in lines:
        low = raw.strip().lower().lstrip("#").strip()
        low = low.removeprefix("status:").strip()
        for token in re.split(r"[\s,]+", low):
            if token in _STATUSES:
                return token
    return ""
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class ADRParser:
    """Parse ADR markdown (MADR frontmatter, Nygard, adr-tools) into a ``ParsedADR``."""

    name = "adr-parser"  # identifier callers record as the provenance source

    def parse(self, path: str, text: str) -> ParsedADR:
        """Parse one ADR file.

        Args:
            path: Repo-relative POSIX path of the file; its stem supplies the
                ADR number and the fallback title.
            text: Full file contents.

        Returns:
            A ``ParsedADR``. Unrecognisable input still yields a record titled
            from the filename, with ``well_formed=False``.
        """
        fm, body = _split_frontmatter(text)
        stem = PurePosixPath(path).stem
        num_match = _ADR_NUM_RE.search(stem)
        adr_id = f"ADR-{int(num_match.group(1)):04d}" if num_match else ""

        title = str(fm.get("title") or _first_heading_title(body) or "").strip()
        well_formed = bool(title)
        if not title:
            title = stem.replace("-", " ").replace("_", " ").strip() or path

        # A frontmatter status is authoritative when present. Values such as
        # "superseded by ADR-0005" are tokenised rather than degraded to
        # "proposed"; the body is consulted only when frontmatter has no status.
        fm_status = str(fm.get("status") or "").strip().lower()
        if fm_status:
            status = fm_status if fm_status in _STATUSES else _status_from(fm_status)
        else:
            status = _status_from(body)
        status = status if status in _STATUSES else "proposed"

        date = str(fm.get("date") or "").strip()
        if not date:
            m = _DATE_RE.search(body)
            date = m.group(1) if m else ""

        # Only "supersedes" names an ADR that THIS one replaces. "superseded-by"
        # points the other way (the ADR replacing this one), so reading it here
        # would invert the SUPERSEDES relation.
        supersedes_num = ""
        fm_sup = fm.get("supersedes")
        if fm_sup:
            m = _ADR_NUM_RE.search(str(fm_sup))
            supersedes_num = str(int(m.group(1))) if m else ""
        if not supersedes_num:
            m = _SUPERSEDES_RE.search(body)
            supersedes_num = str(int(m.group(1))) if m else ""

        return ParsedADR(
            title=title,
            status=status,
            date=date,
            adr_id=adr_id,
            supersedes_num=supersedes_num,
            body=body,
            sections=_sections(body),
            well_formed=well_formed,
        )
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""``CommitIngestor`` — turn meaningful git commit messages into graph facts
|
|
2
|
+
(feat-010 follow-up).
|
|
3
|
+
|
|
4
|
+
A commit whose subject is a *conventional commit* (``feat:`` / ``fix:`` / …) or
|
|
5
|
+
carries an *issue reference* (``#123`` / ``PROJ-45``) is high-signal: it records
|
|
6
|
+
*why* a change was made. We scan the last ``limit`` non-merge commits and ingest the qualifying subjects
|
|
7
|
+
as ``DocChunk``s that ``DESCRIBES`` the in-repo files they touched — so "why did
|
|
8
|
+
the retry logic change?" can reach the commit and the code it touched.
|
|
9
|
+
|
|
10
|
+
Git is read via ``git log`` (subprocess; no ``agentforge`` import — the knowledge
|
|
11
|
+
package stays deterministic, ADR-0001). Commit chunks are keyed by sha and added
|
|
12
|
+
idempotently (a re-index skips shas already present); they are immutable, so there
|
|
13
|
+
is no per-file GC.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import hashlib
|
|
19
|
+
import re
|
|
20
|
+
import subprocess
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
from agentforge_graph.core import (
|
|
24
|
+
Edge,
|
|
25
|
+
EdgeKind,
|
|
26
|
+
GraphQuery,
|
|
27
|
+
GraphStore,
|
|
28
|
+
Node,
|
|
29
|
+
NodeKind,
|
|
30
|
+
Provenance,
|
|
31
|
+
SymbolID,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
_ALL = 10_000_000  # passed as the GraphQuery limit when every matching node is wanted
_DOC_LANG = "doc"  # SymbolID language slug used for commit chunk ids
_COMMITS_PATH = "<commits>"  # synthetic SymbolID path namespace for commit chunks
_RS = "\x1e"  # record separator between commits
_US = "\x1f"  # field separator within a commit's header line

# conventional commit: `type(scope)?!: summary`
_CONVENTIONAL = re.compile(
    r"^(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)(\([^)]*\))?!?:",
    re.IGNORECASE,
)
# an issue reference: `#123` or `PROJ-45`
# NOTE(review): the second form also matches tokens such as "UTF-8" or "SHA-256" — confirm this is acceptable
_ISSUE_REF = re.compile(r"(#\d+|\b[A-Z][A-Z0-9]+-\d+\b)")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _qualifies(subject: str) -> bool:
    """Tell whether a commit subject is worth ingesting.

    A subject qualifies when it begins (after trimming) with a conventional
    commit prefix, or when it contains an issue reference anywhere.
    """
    if _CONVENTIONAL.match(subject.strip()) is not None:
        return True
    return _ISSUE_REF.search(subject) is not None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class CommitIngestor:
    """Turn qualifying git commit subjects into ``DocChunk`` nodes that
    ``DESCRIBES`` the indexed files each commit touched."""

    def __init__(self, repo: str, root: str | Path, commit: str = "", limit: int = 50) -> None:
        """
        Args:
            repo: Repository identifier embedded in generated SymbolIDs.
            root: Directory passed to ``git -C``.
            commit: Commit recorded in the provenance of emitted facts.
            limit: How many recent non-merge commits to scan (clamped to >= 1).
        """
        self.repo = repo
        self.root = str(root)
        self.commit = commit
        self.limit = max(1, limit)

    async def ingest(self, store: GraphStore) -> int:
        """Add one chunk per new, qualifying commit that touched an indexed file.

        Commits already present in the store (matched by sha), commits whose
        subject does not qualify, and commits touching no indexed file are
        skipped.

        Returns:
            The number of commit chunks added.
        """
        commits = self._git_log()
        if not commits:
            return 0
        path_index = await self._path_index(store)
        existing = await self._existing_shas(store)
        prov = Provenance.parsed("commit-ingestor", self.commit)
        facts: list[Node | Edge] = []
        count = 0
        for sha, ts, author, subject, files in commits:
            if sha in existing or not _qualifies(subject):
                continue
            targets = [path_index[f] for f in files if f in path_index]
            if not targets:  # touched no in-repo code → nothing to describe
                continue
            chunk_id = SymbolID.for_symbol(
                _DOC_LANG, self.repo, _COMMITS_PATH, f"commit({sha[:12]})."
            )
            facts.append(
                Node(
                    id=chunk_id,
                    kind=NodeKind.DOC_CHUNK,
                    name=subject[:80],
                    attrs={
                        "path": _COMMITS_PATH,
                        "heading": subject[:80],
                        "text": subject,
                        "doc_source": "commit",
                        "commit": sha,
                        "author": author,
                        "ts": ts,
                        "content_hash": hashlib.sha256(f"{sha}:{subject}".encode()).hexdigest(),
                    },
                    provenance=prov,
                )
            )
            # sorted + de-duplicated so the emitted edge order is deterministic
            for target in sorted(set(targets)):
                facts.append(
                    Edge(src=chunk_id, dst=target, kind=EdgeKind.DESCRIBES, provenance=prov)
                )
            count += 1
        if facts:
            await store.add(facts)
        return count

    async def _path_index(self, store: GraphStore) -> dict[str, str]:
        """Map each FILE node's SymbolID path to its node id."""
        return {
            SymbolID.parse(n.id).path: n.id
            for n in (await store.query(GraphQuery(kinds=[NodeKind.FILE], limit=_ALL))).nodes
        }

    async def _existing_shas(self, store: GraphStore) -> set[str]:
        """Collect the ``commit`` attr of every stored chunk whose ``doc_source`` is "commit"."""
        nodes = (await store.query(GraphQuery(kinds=[NodeKind.DOC_CHUNK], limit=_ALL))).nodes
        return {
            str(n.attrs.get("commit", "")) for n in nodes if n.attrs.get("doc_source") == "commit"
        }

    def _git_log(self) -> list[tuple[str, int, str, str, list[str]]]:
        """Last ``limit`` non-merge commits as (sha, committer_ts, author, subject,
        touched files). Subject-only keeps the ``--name-only`` parse unambiguous;
        full-body ingestion is a refinement.

        Returns ``[]`` when git is unavailable or the command fails.
        """
        try:
            out = subprocess.run(
                [
                    "git",
                    "-C",
                    self.root,
                    # git C-quotes non-ASCII paths by default, which would never
                    # match the path index; ask for them verbatim instead
                    "-c",
                    "core.quotePath=false",
                    "log",
                    f"-n{self.limit}",
                    "--no-merges",
                    "--no-color",
                    "--name-only",
                    f"--format={_RS}%H{_US}%ct{_US}%an{_US}%s",
                ],
                capture_output=True,
                text=True,
                # decode as UTF-8 regardless of locale; a stray non-UTF-8 byte in an
                # author name or subject must not abort the run with UnicodeDecodeError
                encoding="utf-8",
                errors="replace",
                check=True,
            )
        except (subprocess.SubprocessError, OSError):
            return []
        commits: list[tuple[str, int, str, str, list[str]]] = []
        for block in out.stdout.split(_RS):
            block = block.strip("\n")
            if not block:
                continue
            # first line is the formatted header, the rest are touched file names
            head, _, rest = block.partition("\n")
            parts = head.split(_US)
            if len(parts) < 4:
                continue
            sha, ts_s, author, subject = parts[0], parts[1], parts[2], parts[3]
            files = [ln for ln in rest.splitlines() if ln.strip()]
            commits.append((sha, int(ts_s) if ts_s.isdigit() else 0, author, subject, files))
        return commits
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
"""``KnowledgeIngestor`` — turn ADR markdown into graph facts (feat-010 MVP).
|
|
2
|
+
|
|
3
|
+
Each ADR becomes its own ``FileSubgraph`` keyed by its path: a ``Decision``
|
|
4
|
+
node, body ``DocChunk`` nodes (``CONTAINS``-linked; not embedded at MVP),
|
|
5
|
+
``GOVERNS`` edges to the code it unambiguously mentions, and a ``SUPERSEDES``
|
|
6
|
+
edge to the ADR it replaces. Upserting per ADR means edits/deletes ride the
|
|
7
|
+
store's per-file machinery (feat-004) with no ChangeDetector change. Runs after
|
|
8
|
+
code indexing so the mention indices see current code; re-runs each index, and
|
|
9
|
+
GCs decisions whose ADR file is gone.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
import re
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from agentforge_graph.core import (
|
|
19
|
+
Edge,
|
|
20
|
+
EdgeKind,
|
|
21
|
+
FileSubgraph,
|
|
22
|
+
GraphQuery,
|
|
23
|
+
GraphStore,
|
|
24
|
+
Node,
|
|
25
|
+
NodeKind,
|
|
26
|
+
Provenance,
|
|
27
|
+
SymbolID,
|
|
28
|
+
)
|
|
29
|
+
from agentforge_graph.core.symbols import normalize_path
|
|
30
|
+
|
|
31
|
+
from .adr import ADRParser, _sections
|
|
32
|
+
from .mentions import extract_mentions, resolve_mentions
|
|
33
|
+
from .report import KnowledgeStats
|
|
34
|
+
|
|
35
|
+
_ALL = 10_000_000  # passed as the GraphQuery limit when every matching node is wanted
_DOC_LANG = "doc"  # SymbolID lang slug — keeps decision ids in their own namespace
# node kinds whose names are indexed as targets for symbol mentions
_SYMBOL_KINDS = {NodeKind.CLASS, NodeKind.FUNCTION, NodeKind.METHOD}
# first run of digits in a filename stem (the ADR number)
_NUM_RE = re.compile(r"(\d+)")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class KnowledgeIngestor:
    """Ingest ADR and general markdown files into the graph store, one
    ``FileSubgraph`` per file."""

    def __init__(self, repo: str, commit: str = "") -> None:
        """
        Args:
            repo: Repository identifier embedded in generated SymbolIDs.
            commit: Commit recorded in the provenance of emitted facts.
        """
        self.repo = repo
        self.commit = commit
        self.parser = ADRParser()

    async def ingest(
        self,
        store: GraphStore,
        repo_path: str | Path,
        adr_globs: list[str],
        code_exts: set[str],
        doc_globs: list[str] | None = None,
    ) -> KnowledgeStats:
        """Index ADRs (and, when ``doc_globs`` is given, general docs) under ``repo_path``.

        Args:
            store: Graph store to query and write.
            repo_path: Repository root the glob patterns are evaluated against.
            adr_globs: Glob patterns selecting ADR files.
            code_exts: Extensions that mark a token as a code path when
                extracting mentions.
            doc_globs: Optional glob patterns selecting general docs.

        Returns:
            Counters describing what was indexed and resolved.
        """
        root = Path(repo_path)
        files = self._discover(root, adr_globs)
        adr_paths = {rel for rel, _ in files}

        # GC decisions whose ADR file vanished
        for path in await self._existing_decision_paths(store):
            if path not in adr_paths:
                await store.delete_file(path)

        stats = KnowledgeStats()
        doc_globs = doc_globs or []
        # both the ADR and the general-doc passes need the code indices, and the
        # doc pass must still run (to GC vanished docs) even when there are no ADRs.
        if not files and not doc_globs:
            return stats

        path_index, name_index = await self._code_indices(store)

        if files:
            num_to_decision = {self._adr_num(rel): self._decision_id(rel) for rel, _ in files}
            prov = Provenance.parsed(self.parser.name, self.commit)
            built: list[tuple[FileSubgraph, bool]] = []
            for rel, text in files:
                sg, has_supersedes, resolved, unresolved = self._build(
                    rel, text, prov, path_index, name_index, num_to_decision, code_exts
                )
                built.append((sg, has_supersedes))
                stats.decisions_indexed += 1
                stats.governs_resolved += resolved
                stats.mentions_unresolved += unresolved
            # round A: land every Decision/DocChunk/GOVERNS (no SUPERSEDES yet)
            for sg, _ in built:
                await store.upsert(self._without(sg, EdgeKind.SUPERSEDES))
            # round B: re-upsert ADRs that supersede, now that all Decisions exist
            for sg, has_supersedes in built:
                if has_supersedes:
                    await store.upsert(sg)

        if doc_globs:
            await self._ingest_docs(
                store, root, doc_globs, adr_paths, path_index, name_index, code_exts, stats
            )
        return stats

    # --- discovery & indices ---------------------------------------------

    # Index/landing/template pages that live under the ADR globs but are not
    # decisions (BUG-003).
    _NON_ADR_STEMS = {"readme", "index", "template", "_template", "0000-template"}

    @classmethod
    def _discover(cls, root: Path, adr_globs: list[str]) -> list[tuple[str, str]]:
        """Return ``(relative posix path, text)`` for each ADR file, sorted by path.

        Non-files and stems listed in ``_NON_ADR_STEMS`` are skipped; a file
        matched by several patterns is read once.
        """
        seen: dict[str, str] = {}
        for pattern in adr_globs:
            for path in sorted(root.glob(pattern)):
                if not path.is_file() or path.stem.lower() in cls._NON_ADR_STEMS:
                    continue
                rel = path.relative_to(root).as_posix()
                if rel not in seen:
                    seen[rel] = path.read_text(encoding="utf-8", errors="replace")
        return sorted(seen.items())

    async def _existing_decision_paths(self, store: GraphStore) -> set[str]:
        """SymbolID paths of every DECISION node currently in the store."""
        nodes = (await store.query(GraphQuery(kinds=[NodeKind.DECISION], limit=_ALL))).nodes
        return {SymbolID.parse(n.id).path for n in nodes}

    async def _code_indices(self, store: GraphStore) -> tuple[dict[str, str], dict[str, list[str]]]:
        """Build the lookup tables used to resolve mentions.

        Returns:
            ``(path_index, name_index)``: FILE path → node id, and symbol name →
            ids of all class/function/method nodes with that name.
        """
        path_index: dict[str, str] = {}
        name_index: dict[str, list[str]] = {}
        for n in (await store.query(GraphQuery(limit=_ALL))).nodes:
            if n.kind is NodeKind.FILE:
                path_index[SymbolID.parse(n.id).path] = n.id
            elif n.kind in _SYMBOL_KINDS:
                name_index.setdefault(n.name, []).append(n.id)
        return path_index, name_index

    # --- building one ADR subgraph ---------------------------------------

    def _decision_id(self, rel: str) -> str:
        """SymbolID of the Decision node for the ADR at ``rel``."""
        return SymbolID.for_symbol(_DOC_LANG, self.repo, rel, "decision.")

    @staticmethod
    def _adr_num(rel: str) -> str:
        """First number in the file stem without leading zeros, or ``""`` if none."""
        m = _NUM_RE.search(Path(rel).stem)
        return str(int(m.group(1))) if m else ""

    def _build(
        self,
        rel: str,
        text: str,
        prov: Provenance,
        path_index: dict[str, str],
        name_index: dict[str, list[str]],
        num_to_decision: dict[str, str],
        code_exts: set[str],
    ) -> tuple[FileSubgraph, bool, int, int]:
        """Build the subgraph for one ADR.

        Returns:
            ``(subgraph, has_supersedes, resolved_count, unresolved_count)``.
            ``has_supersedes`` is True when a SUPERSEDES edge to another known
            ADR was emitted.
        """
        adr = self.parser.parse(rel, text)
        decision_id = self._decision_id(rel)
        nodes: list[Node] = [
            Node(
                id=decision_id,
                kind=NodeKind.DECISION,
                name=adr.title,
                attrs={
                    "title": adr.title,
                    "status": adr.status,
                    "date": adr.date,
                    "adr_id": adr.adr_id,
                    "path": normalize_path(rel),
                    "well_formed": adr.well_formed,
                },
                provenance=prov,
            )
        ]
        edges: list[Edge] = []
        for i, section in enumerate(adr.sections):
            chunk_id = SymbolID.for_symbol(_DOC_LANG, self.repo, rel, f"docchunk({i}).")
            nodes.append(
                Node(
                    id=chunk_id,
                    kind=NodeKind.DOC_CHUNK,
                    name=section.heading or f"section{i}",
                    attrs={
                        "path": normalize_path(rel),
                        "heading": section.heading,
                        "text": section.text,
                        "seq": i,
                        # hash of the embeddable text (heading + body) — lets the
                        # embed pass detect changed doc chunks (feat-010 follow-up).
                        "content_hash": hashlib.sha256(
                            f"{section.heading}\n{section.text}".encode()
                        ).hexdigest(),
                    },
                    provenance=prov,
                )
            )
            edges.append(
                Edge(src=decision_id, dst=chunk_id, kind=EdgeKind.CONTAINS, provenance=prov)
            )

        mentions = extract_mentions(adr.body, code_exts)
        targets, unresolved = resolve_mentions(mentions, path_index, name_index)
        for target in sorted(targets):
            edges.append(Edge(src=decision_id, dst=target, kind=EdgeKind.GOVERNS, provenance=prov))

        has_supersedes = False
        if adr.supersedes_num and adr.supersedes_num in num_to_decision:
            superseded = num_to_decision[adr.supersedes_num]
            if superseded != decision_id:  # an ADR cannot supersede itself
                edges.append(
                    Edge(
                        src=decision_id,
                        dst=superseded,
                        kind=EdgeKind.SUPERSEDES,
                        provenance=prov,
                    )
                )
                has_supersedes = True

        content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
        sg = FileSubgraph(path=rel, content_hash=content_hash, nodes=nodes, edges=edges)
        return sg, has_supersedes, len(targets), unresolved

    @staticmethod
    def _without(sg: FileSubgraph, kind: EdgeKind) -> FileSubgraph:
        """Copy of ``sg`` with every edge of ``kind`` removed."""
        return sg.model_copy(update={"edges": [e for e in sg.edges if e.kind is not kind]})

    # --- general docs (doc_globs) ----------------------------------------

    @classmethod
    def _discover_docs(
        cls, root: Path, doc_globs: list[str], adr_paths: set[str]
    ) -> list[tuple[str, str]]:
        """Markdown docs under ``doc_globs``, minus files already ingested as ADRs.
        Unlike ADR discovery, README/index pages ARE kept — they're general docs."""
        seen: dict[str, str] = {}
        for pattern in doc_globs:
            for path in sorted(root.glob(pattern)):
                if not path.is_file():
                    continue
                rel = path.relative_to(root).as_posix()
                if rel in adr_paths or rel in seen:
                    continue
                seen[rel] = path.read_text(encoding="utf-8", errors="replace")
        return sorted(seen.items())

    async def _existing_doc_paths(self, store: GraphStore) -> set[str]:
        """SymbolID paths of stored DocChunks whose ``doc_source`` is "doc"."""
        nodes = (await store.query(GraphQuery(kinds=[NodeKind.DOC_CHUNK], limit=_ALL))).nodes
        return {SymbolID.parse(n.id).path for n in nodes if n.attrs.get("doc_source") == "doc"}

    async def _ingest_docs(
        self,
        store: GraphStore,
        root: Path,
        doc_globs: list[str],
        adr_paths: set[str],
        path_index: dict[str, str],
        name_index: dict[str, list[str]],
        code_exts: set[str],
        stats: KnowledgeStats,
    ) -> None:
        """Upsert general docs and delete stored chunks that no longer have a source.

        Updates ``stats.docs_indexed`` and ``stats.describes_resolved`` in place.
        """
        doc_files = self._discover_docs(root, doc_globs, adr_paths)
        current = {rel for rel, _ in doc_files}
        existing = await self._existing_doc_paths(store)
        # GC general-doc DocChunks whose source file vanished (per-file like ADRs)
        for path in existing:
            if path not in current:
                await store.delete_file(path)
        prov = Provenance.parsed("doc-ingestor", self.commit)
        for rel, text in doc_files:
            sg, resolved = self._build_doc(rel, text, prov, path_index, name_index, code_exts)
            if not sg.nodes:  # an empty/section-less doc contributes nothing
                # the file still exists, so the GC above kept it; if it had
                # content on an earlier run, drop those now-stale chunks
                if rel in existing:
                    await store.delete_file(rel)
                continue
            await store.upsert(sg)
            stats.docs_indexed += 1
            stats.describes_resolved += resolved

    def _build_doc(
        self,
        rel: str,
        text: str,
        prov: Provenance,
        path_index: dict[str, str],
        name_index: dict[str, list[str]],
        code_exts: set[str],
    ) -> tuple[FileSubgraph, int]:
        """A general doc → one DocChunk per markdown section, each ``DESCRIBES`` the
        code it unambiguously mentions (no Decision; docs describe, ADRs govern).

        Returns:
            ``(subgraph, resolved)`` where ``resolved`` is the number of
            DESCRIBES edges emitted.
        """
        nodes: list[Node] = []
        edges: list[Edge] = []
        resolved = 0
        for i, section in enumerate(_sections(text)):
            chunk_id = SymbolID.for_symbol(_DOC_LANG, self.repo, rel, f"docchunk({i}).")
            body = f"{section.heading}\n{section.text}"
            nodes.append(
                Node(
                    id=chunk_id,
                    kind=NodeKind.DOC_CHUNK,
                    name=section.heading or f"section{i}",
                    attrs={
                        "path": normalize_path(rel),
                        "heading": section.heading,
                        "text": section.text,
                        "seq": i,
                        "doc_source": "doc",  # distinguishes general docs from ADR chunks
                        "content_hash": hashlib.sha256(body.encode()).hexdigest(),
                    },
                    provenance=prov,
                )
            )
            mentions = extract_mentions(body, code_exts)
            targets, _unresolved = resolve_mentions(mentions, path_index, name_index)
            for target in sorted(targets):
                edges.append(
                    Edge(src=chunk_id, dst=target, kind=EdgeKind.DESCRIBES, provenance=prov)
                )
                resolved += 1
        content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
        return FileSubgraph(path=rel, content_hash=content_hash, nodes=nodes, edges=edges), resolved
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""Extract code mentions from an ADR body and resolve them — precisely — to
|
|
2
|
+
graph nodes (feat-010). Only **unambiguous** matches become ``GOVERNS`` edges;
|
|
3
|
+
ambiguous or unresolved mentions are counted, never guessed (ADR-0004)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
|
|
10
|
+
from agentforge_graph.core.symbols import normalize_path
|
|
11
|
+
|
|
12
|
+
# Inline code span: the text between a pair of backticks.
_BACKTICK_RE = re.compile(r"`([^`]+)`")
# A qualified name: identifiers joined by '.' or '#', e.g. app.auth.login,
# Auth#login, PaymentService. No spaces, at least one identifier char.
_QUALNAME_RE = re.compile(r"^[A-Za-z_][\w]*(?:[.#][A-Za-z_][\w]*)*$")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class Mentions:
    """Code references extracted from a document, split by how they resolve."""

    paths: set[str] = field(default_factory=set)  # normalised repo-relative paths
    names: set[str] = field(default_factory=set)  # bare symbol names (last segment)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _looks_like_path(token: str, code_exts: set[str]) -> bool:
|
|
25
|
+
return "/" in token and any(token.endswith(ext) for ext in code_exts)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _leaf_name(qualname: str) -> str:
|
|
29
|
+
return re.split(r"[.#]", qualname.strip())[-1]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_mentions(body: str, code_exts: set[str]) -> Mentions:
    """Collect the code paths and symbol names mentioned in ``body``.

    Backtick spans are read as a path (contains ``/`` and ends with a code
    extension) or else as a qualified name, of which only the last segment is
    kept. Path-like tokens written in plain prose are picked up as well.
    """
    found = Mentions()

    # backtick code spans: paths or qualified names
    for raw_span in _BACKTICK_RE.findall(body):
        candidate = raw_span.strip()
        if _looks_like_path(candidate, code_exts):
            found.paths.add(normalize_path(candidate))
            continue
        if _QUALNAME_RE.match(candidate):
            found.names.add(_leaf_name(candidate))

    # bare path-like tokens anywhere (e.g. mentioned in prose without backticks)
    alternation = "|".join(re.escape(ext.lstrip(".")) for ext in sorted(code_exts))
    if not alternation:
        return found
    bare_path_re = re.compile(rf"[\w./-]+\.(?:{alternation})\b")
    found.paths.update(
        normalize_path(hit) for hit in bare_path_re.findall(body) if "/" in hit
    )
    return found
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def resolve_mentions(
    mentions: Mentions,
    path_index: dict[str, str],
    name_index: dict[str, list[str]],
) -> tuple[set[str], int]:
    """Resolve mentions to node ids without ever guessing.

    A path resolves to the id stored under that exact key in ``path_index``.
    A name resolves only when ``name_index`` lists exactly one candidate for
    it. Everything else is counted as unresolved.

    Returns:
        ``(resolved node ids, number of unresolved mentions)``.
    """
    resolved: set[str] = set()
    misses = 0

    for mentioned_path in mentions.paths:
        node_id = path_index.get(mentioned_path)
        if node_id is None:
            misses += 1
            continue
        resolved.add(node_id)

    for symbol in mentions.names:
        matches = name_index.get(symbol, [])
        if len(matches) != 1:
            # zero candidates (unknown) or several (ambiguous)
            misses += 1
            continue
        resolved.add(matches[0])

    return resolved, misses
|