agentforge-graph 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. agentforge_graph/__init__.py +6 -0
  2. agentforge_graph/chunking/__init__.py +12 -0
  3. agentforge_graph/chunking/cast.py +159 -0
  4. agentforge_graph/chunking/chunk.py +19 -0
  5. agentforge_graph/chunking/tokens.py +15 -0
  6. agentforge_graph/cli.py +607 -0
  7. agentforge_graph/config.py +259 -0
  8. agentforge_graph/core/__init__.py +54 -0
  9. agentforge_graph/core/conformance.py +270 -0
  10. agentforge_graph/core/contracts.py +163 -0
  11. agentforge_graph/core/kinds.py +68 -0
  12. agentforge_graph/core/models.py +134 -0
  13. agentforge_graph/core/provenance.py +62 -0
  14. agentforge_graph/core/symbols.py +116 -0
  15. agentforge_graph/embed/__init__.py +28 -0
  16. agentforge_graph/embed/base.py +22 -0
  17. agentforge_graph/embed/bedrock.py +85 -0
  18. agentforge_graph/embed/fake.py +34 -0
  19. agentforge_graph/embed/openai.py +67 -0
  20. agentforge_graph/embed/pipeline.py +184 -0
  21. agentforge_graph/embed/registry.py +66 -0
  22. agentforge_graph/embed/report.py +15 -0
  23. agentforge_graph/enrich/__init__.py +70 -0
  24. agentforge_graph/enrich/anthropic.py +38 -0
  25. agentforge_graph/enrich/anthropic_client.py +109 -0
  26. agentforge_graph/enrich/bedrock.py +24 -0
  27. agentforge_graph/enrich/bedrock_client.py +115 -0
  28. agentforge_graph/enrich/bedrock_summarizer.py +23 -0
  29. agentforge_graph/enrich/claude.py +172 -0
  30. agentforge_graph/enrich/enricher.py +108 -0
  31. agentforge_graph/enrich/governs.py +173 -0
  32. agentforge_graph/enrich/governs_enricher.py +152 -0
  33. agentforge_graph/enrich/heuristics.py +224 -0
  34. agentforge_graph/enrich/judge.py +63 -0
  35. agentforge_graph/enrich/registry.py +133 -0
  36. agentforge_graph/enrich/report.py +60 -0
  37. agentforge_graph/enrich/summarizer.py +62 -0
  38. agentforge_graph/enrich/summary_enricher.py +211 -0
  39. agentforge_graph/enrich/taxonomy.py +38 -0
  40. agentforge_graph/frameworks/__init__.py +29 -0
  41. agentforge_graph/frameworks/base.py +75 -0
  42. agentforge_graph/frameworks/detect.py +124 -0
  43. agentforge_graph/frameworks/extractor.py +63 -0
  44. agentforge_graph/frameworks/orm.py +93 -0
  45. agentforge_graph/frameworks/packs/_js_ast.py +56 -0
  46. agentforge_graph/frameworks/packs/_python_ast.py +157 -0
  47. agentforge_graph/frameworks/packs/django/__init__.py +240 -0
  48. agentforge_graph/frameworks/packs/django/models.scm +7 -0
  49. agentforge_graph/frameworks/packs/express/__init__.py +133 -0
  50. agentforge_graph/frameworks/packs/express/routes.scm +8 -0
  51. agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
  52. agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
  53. agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
  54. agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
  55. agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
  56. agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
  57. agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
  58. agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
  59. agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
  60. agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
  61. agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
  62. agentforge_graph/frameworks/registry.py +44 -0
  63. agentforge_graph/ingest/__init__.py +30 -0
  64. agentforge_graph/ingest/codegraph.py +847 -0
  65. agentforge_graph/ingest/extractor.py +353 -0
  66. agentforge_graph/ingest/incremental/__init__.py +25 -0
  67. agentforge_graph/ingest/incremental/detect.py +118 -0
  68. agentforge_graph/ingest/incremental/dirty.py +61 -0
  69. agentforge_graph/ingest/incremental/indexer.py +218 -0
  70. agentforge_graph/ingest/incremental/meta.py +72 -0
  71. agentforge_graph/ingest/incremental/ports.py +39 -0
  72. agentforge_graph/ingest/pack.py +160 -0
  73. agentforge_graph/ingest/packs/__init__.py +34 -0
  74. agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
  75. agentforge_graph/ingest/packs/cpp/references.scm +15 -0
  76. agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
  77. agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
  78. agentforge_graph/ingest/packs/csharp/references.scm +12 -0
  79. agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
  80. agentforge_graph/ingest/packs/go/__init__.py +38 -0
  81. agentforge_graph/ingest/packs/go/references.scm +12 -0
  82. agentforge_graph/ingest/packs/go/structure.scm +64 -0
  83. agentforge_graph/ingest/packs/java/__init__.py +35 -0
  84. agentforge_graph/ingest/packs/java/references.scm +12 -0
  85. agentforge_graph/ingest/packs/java/structure.scm +38 -0
  86. agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
  87. agentforge_graph/ingest/packs/javascript/references.scm +11 -0
  88. agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
  89. agentforge_graph/ingest/packs/php/__init__.py +35 -0
  90. agentforge_graph/ingest/packs/php/references.scm +15 -0
  91. agentforge_graph/ingest/packs/php/structure.scm +44 -0
  92. agentforge_graph/ingest/packs/python/__init__.py +25 -0
  93. agentforge_graph/ingest/packs/python/references.scm +14 -0
  94. agentforge_graph/ingest/packs/python/structure.scm +57 -0
  95. agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
  96. agentforge_graph/ingest/packs/ruby/references.scm +12 -0
  97. agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
  98. agentforge_graph/ingest/packs/rust/__init__.py +39 -0
  99. agentforge_graph/ingest/packs/rust/references.scm +12 -0
  100. agentforge_graph/ingest/packs/rust/structure.scm +46 -0
  101. agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
  102. agentforge_graph/ingest/packs/typescript/references.scm +11 -0
  103. agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
  104. agentforge_graph/ingest/pipeline.py +134 -0
  105. agentforge_graph/ingest/report.py +84 -0
  106. agentforge_graph/ingest/resolver.py +467 -0
  107. agentforge_graph/ingest/source.py +79 -0
  108. agentforge_graph/knowledge/__init__.py +28 -0
  109. agentforge_graph/knowledge/adr.py +136 -0
  110. agentforge_graph/knowledge/commits.py +152 -0
  111. agentforge_graph/knowledge/ingest.py +312 -0
  112. agentforge_graph/knowledge/mentions.py +71 -0
  113. agentforge_graph/knowledge/report.py +32 -0
  114. agentforge_graph/main.py +21 -0
  115. agentforge_graph/providers.py +36 -0
  116. agentforge_graph/repomap/__init__.py +14 -0
  117. agentforge_graph/repomap/rank.py +161 -0
  118. agentforge_graph/repomap/render.py +55 -0
  119. agentforge_graph/repomap/repomap.py +66 -0
  120. agentforge_graph/retrieve/__init__.py +21 -0
  121. agentforge_graph/retrieve/pack.py +76 -0
  122. agentforge_graph/retrieve/rerank.py +251 -0
  123. agentforge_graph/retrieve/retriever.py +286 -0
  124. agentforge_graph/retrieve/scoring.py +36 -0
  125. agentforge_graph/serve/__init__.py +19 -0
  126. agentforge_graph/serve/engine.py +204 -0
  127. agentforge_graph/serve/http_runner.py +133 -0
  128. agentforge_graph/serve/server.py +110 -0
  129. agentforge_graph/serve/tools.py +307 -0
  130. agentforge_graph/store/__init__.py +32 -0
  131. agentforge_graph/store/_rowmap.py +102 -0
  132. agentforge_graph/store/errors.py +22 -0
  133. agentforge_graph/store/facade.py +89 -0
  134. agentforge_graph/store/kuzu_store.py +380 -0
  135. agentforge_graph/store/lance_store.py +146 -0
  136. agentforge_graph/store/neo4j_store.py +294 -0
  137. agentforge_graph/store/pgvector_store.py +170 -0
  138. agentforge_graph/store/registry.py +45 -0
  139. agentforge_graph/temporal/__init__.py +36 -0
  140. agentforge_graph/temporal/backfill.py +338 -0
  141. agentforge_graph/temporal/events.py +82 -0
  142. agentforge_graph/temporal/index.py +190 -0
  143. agentforge_graph/temporal/mining.py +190 -0
  144. agentforge_graph/temporal/recorder.py +114 -0
  145. agentforge_graph/temporal/store.py +282 -0
  146. agentforge_graph-0.3.2.dist-info/METADATA +291 -0
  147. agentforge_graph-0.3.2.dist-info/RECORD +151 -0
  148. agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
  149. agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
  150. agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
  151. agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
@@ -0,0 +1,136 @@
1
+ """``ADRParser`` — parse an architecture decision record markdown file into a
2
+ format-neutral ``ParsedADR`` (feat-010 MVP).
3
+
4
+ Tolerant of MADR (YAML frontmatter), Nygard, and adr-tools layouts: read a
5
+ frontmatter block if present, else scan headings/lines for the title, status,
6
+ date, and supersedes link. A file that yields no recognisable ADR shape still
7
+ becomes a ``Decision`` titled from its filename — degrade, never drop (spec §8).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from dataclasses import dataclass, field
14
+ from pathlib import PurePosixPath
15
+
16
+ import yaml
17
+
18
+ _STATUSES = {"proposed", "accepted", "superseded", "deprecated", "rejected"}
19
+ _DATE_RE = re.compile(r"\b(\d{4}-\d{2}-\d{2})\b")
20
+ # Matches "Supersedes ADR-0007" and "supersede 0007". Does NOT match "Superseded by ADR-0012" (that is the reverse direction).
21
+ _SUPERSEDES_RE = re.compile(r"supersedes?\s+(?:adr-?)?(\d+)", re.IGNORECASE)
22
+ _ADR_NUM_RE = re.compile(r"(\d+)")
23
+
24
+
25
@dataclass
class DocSection:
    """One markdown section: a heading and the text that follows it."""

    # Heading text with the leading '#' markers removed; "" for text that
    # precedes the first heading.
    heading: str
    # Section body with surrounding whitespace stripped.
    text: str
29
+
30
+
31
@dataclass
class ParsedADR:
    """Format-neutral result of parsing one architecture decision record."""

    title: str
    status: str = "proposed"
    date: str = ""
    # e.g. "ADR-0012" (from filename number or frontmatter)
    adr_id: str = ""
    # The numeric id of a superseded ADR, if any.
    supersedes_num: str = ""
    body: str = ""
    sections: list[DocSection] = field(default_factory=list)
    # False means the title fell back to the filename.
    well_formed: bool = True
41
+
42
+
43
+ def _split_frontmatter(text: str) -> tuple[dict[str, object], str]:
44
+ if not text.startswith("---"):
45
+ return {}, text
46
+ parts = text.split("\n", 1)
47
+ rest = parts[1] if len(parts) > 1 else ""
48
+ end = rest.find("\n---")
49
+ if end == -1:
50
+ return {}, text
51
+ block = rest[:end]
52
+ remainder = rest[end + len("\n---") :].lstrip("\n")
53
+ try:
54
+ data = yaml.safe_load(block)
55
+ except yaml.YAMLError:
56
+ return {}, text
57
+ return (data if isinstance(data, dict) else {}), remainder
58
+
59
+
60
def _sections(body: str) -> list[DocSection]:
    """Split markdown ``body`` into sections, one per heading line.

    Text before the first heading becomes a section with an empty heading.
    Lines inside a fenced code block (opened by ``` or ~~~) are never treated
    as headings, so a ``# comment`` in an embedded shell or Python snippet
    stays part of the surrounding section instead of starting a bogus one.

    Returns the sections in document order, omitting any that have neither a
    heading nor text.
    """
    sections: list[DocSection] = []
    heading = ""
    buf: list[str] = []
    fence = ""  # marker that opened the code block we are inside, "" if none

    def flush() -> None:
        # Close the section being accumulated, if it has any content.
        if buf or heading:
            sections.append(DocSection(heading=heading, text="\n".join(buf).strip()))

    for line in body.splitlines():
        marker = line.lstrip()[:3]
        if fence:
            buf.append(line)
            if marker == fence:  # the matching closing fence
                fence = ""
        elif marker in ("```", "~~~"):
            fence = marker
            buf.append(line)
        elif line.startswith("#"):
            flush()
            heading = line.lstrip("#").strip()
            buf = []
        else:
            buf.append(line)
    flush()
    return [s for s in sections if s.text or s.heading]
75
+
76
+
77
+ def _first_heading_title(body: str) -> str:
78
+ for line in body.splitlines():
79
+ s = line.strip()
80
+ if s.startswith("#"):
81
+ return s.lstrip("#").strip()
82
+ return ""
83
+
84
+
85
def _status_from(text: str) -> str:
    """Return the first recognised ADR status found in ``text``, or ``""``.

    Explicit status statements take precedence: a ``Status: accepted`` style
    line (optionally bulleted or emphasised), a ``## Status`` heading, and the
    lines of the section under such a heading. Only when none of those names a
    status does the scan fall back to any line of the text, so a status word
    that merely appears in the title or prose no longer shadows the real one.

    Tokens are compared after stripping surrounding punctuation and markdown
    emphasis, so ``Accepted.`` and ``**Accepted**`` are recognised.
    """

    def scan(lines: list[str]) -> str:
        for low in lines:
            for raw_token in re.split(r"[\s,]+", low):
                token = raw_token.strip("*_`'\".;:!?()[]")
                if token in _STATUSES:
                    return token
        return ""

    def is_status_label(s: str) -> bool:
        # "status", "status:", "status**: ..." but not "statuses" or "statusbar"
        return s.startswith("status") and not s[6:7].isalnum()

    explicit: list[str] = []
    everything: list[str] = []
    in_status_section = False
    for raw in text.splitlines():
        low = raw.strip().lower()
        if low.startswith("#"):
            low = low.lstrip("#").strip()
            in_status_section = is_status_label(low)
            if in_status_section:
                explicit.append(low)  # covers "## Status: accepted"
        elif is_status_label(low.lstrip("*_->` ")) or in_status_section:
            explicit.append(low)
        everything.append(low)
    return scan(explicit) or scan(everything)
94
+
95
+
96
class ADRParser:
    """Parse one ADR markdown file into a format-neutral ``ParsedADR``."""

    # Parser name; ``KnowledgeIngestor`` passes it to ``Provenance.parsed``.
    name = "adr-parser"

    def parse(self, path: str, text: str) -> ParsedADR:
        """Parse the ADR at ``path`` whose content is ``text``.

        Frontmatter values win over values scanned from the body. A file with
        no title in either place is still returned, titled from its filename
        and flagged ``well_formed=False``.
        """
        fm, body = _split_frontmatter(text)
        stem = PurePosixPath(path).stem
        num_match = _ADR_NUM_RE.search(stem)
        adr_id = f"ADR-{int(num_match.group(1)):04d}" if num_match else ""

        title = str(fm.get("title") or _first_heading_title(body) or "").strip()
        well_formed = bool(title)
        if not title:
            title = stem.replace("-", " ").replace("_", " ").strip() or path

        # A frontmatter status is authoritative, but may be a phrase such as
        # "superseded by ADR-0005"; pick the status word out of it rather than
        # degrading the whole value to "proposed".
        fm_status = str(fm.get("status") or "").strip()
        status = (_status_from(fm_status) if fm_status else _status_from(body)) or "proposed"

        date = str(fm.get("date") or "").strip()
        if not date:
            m = _DATE_RE.search(body)
            date = m.group(1) if m else ""

        # Only "supersedes" names an ADR that THIS one replaces. A
        # "superseded-by" key points the other way (at the replacement), and
        # storing it here would produce an inverted SUPERSEDES edge.
        supersedes_num = ""
        fm_sup = fm.get("supersedes")
        if fm_sup:
            m = _ADR_NUM_RE.search(str(fm_sup))
            supersedes_num = str(int(m.group(1))) if m else ""
        if not supersedes_num:
            m = _SUPERSEDES_RE.search(body)
            supersedes_num = str(int(m.group(1))) if m else ""

        return ParsedADR(
            title=title,
            status=status,
            date=date,
            adr_id=adr_id,
            supersedes_num=supersedes_num,
            body=body,
            sections=_sections(body),
            well_formed=well_formed,
        )
@@ -0,0 +1,152 @@
1
+ """``CommitIngestor`` — turn meaningful git commit messages into graph facts
2
+ (feat-010 follow-up).
3
+
4
+ A commit whose subject is a *conventional commit* (``feat:`` / ``fix:`` / …) or
5
+ carries an *issue reference* (``#123`` / ``PROJ-45``) is high-signal: it records
6
+ *why* a change was made. We ingest the subjects of the last ``limit`` such commits
7
+ as ``DocChunk``s that ``DESCRIBES`` the in-repo files they touched — so "why did
8
+ the retry logic change?" can reach the commit and the code it touched.
9
+
10
+ Git is read via ``git log`` (subprocess; no ``agentforge`` import — the knowledge
11
+ package stays deterministic, ADR-0001). Commit chunks are keyed by sha and added
12
+ idempotently (a re-index skips shas already present); they are immutable, so there
13
+ is no per-file GC.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import hashlib
19
+ import re
20
+ import subprocess
21
+ from pathlib import Path
22
+
23
+ from agentforge_graph.core import (
24
+ Edge,
25
+ EdgeKind,
26
+ GraphQuery,
27
+ GraphStore,
28
+ Node,
29
+ NodeKind,
30
+ Provenance,
31
+ SymbolID,
32
+ )
33
+
34
+ _ALL = 10_000_000
35
+ _DOC_LANG = "doc"
36
+ _COMMITS_PATH = "<commits>" # synthetic SymbolID path namespace for commit chunks
37
+ _RS = "\x1e" # record separator between commits
38
+ _US = "\x1f" # field separator within a commit's header line
39
+
40
+ # conventional commit: `type(scope)?!: summary`
41
+ _CONVENTIONAL = re.compile(
42
+ r"^(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)(\([^)]*\))?!?:",
43
+ re.IGNORECASE,
44
+ )
45
+ # an issue reference: `#123` or `PROJ-45`
46
+ _ISSUE_REF = re.compile(r"(#\d+|\b[A-Z][A-Z0-9]+-\d+\b)")
47
+
48
+
49
def _qualifies(subject: str) -> bool:
    """Tell whether a commit subject is high-signal enough to ingest.

    True when the stripped subject starts with a conventional-commit prefix,
    or when the subject contains an issue reference anywhere.
    """
    if _CONVENTIONAL.match(subject.strip()):
        return True
    return _ISSUE_REF.search(subject) is not None
51
+
52
+
53
class CommitIngestor:
    """Ingest qualifying git commit subjects as ``DocChunk`` nodes that
    ``DESCRIBES`` the indexed files each commit touched."""

    def __init__(self, repo: str, root: str | Path, commit: str = "", limit: int = 50) -> None:
        """Remember the repo slug, the working-tree ``root`` handed to
        ``git -C``, the commit recorded in provenance, and how many recent
        commits to inspect (clamped to at least 1)."""
        self.repo = repo
        self.root = str(root)
        self.commit = commit
        self.limit = max(1, limit)

    async def ingest(self, store: GraphStore) -> int:
        """Add one chunk per new qualifying commit; return how many were added.

        A commit is skipped when its sha is already stored, when its subject
        does not qualify, or when none of its files is a known FILE node.
        """
        commits = self._git_log()
        if not commits:
            return 0
        path_index = await self._path_index(store)
        existing = await self._existing_shas(store)
        prov = Provenance.parsed("commit-ingestor", self.commit)
        facts: list[Node | Edge] = []
        count = 0
        for sha, ts, author, subject, files in commits:
            if sha in existing or not _qualifies(subject):
                continue
            targets = [path_index[f] for f in files if f in path_index]
            if not targets:  # touched no in-repo code, so nothing to describe
                continue
            chunk_id = SymbolID.for_symbol(
                _DOC_LANG, self.repo, _COMMITS_PATH, f"commit({sha[:12]})."
            )
            facts.append(
                Node(
                    id=chunk_id,
                    kind=NodeKind.DOC_CHUNK,
                    name=subject[:80],
                    attrs={
                        "path": _COMMITS_PATH,
                        "heading": subject[:80],
                        "text": subject,
                        "doc_source": "commit",
                        "commit": sha,
                        "author": author,
                        "ts": ts,
                        "content_hash": hashlib.sha256(f"{sha}:{subject}".encode()).hexdigest(),
                    },
                    provenance=prov,
                )
            )
            facts.extend(
                Edge(src=chunk_id, dst=target, kind=EdgeKind.DESCRIBES, provenance=prov)
                for target in sorted(set(targets))
            )
            count += 1
        if facts:
            await store.add(facts)
        return count

    async def _path_index(self, store: GraphStore) -> dict[str, str]:
        """Map each FILE node's SymbolID path to its node id."""
        return {
            SymbolID.parse(n.id).path: n.id
            for n in (await store.query(GraphQuery(kinds=[NodeKind.FILE], limit=_ALL))).nodes
        }

    async def _existing_shas(self, store: GraphStore) -> set[str]:
        """Collect the ``commit`` attr of every stored commit-sourced chunk."""
        nodes = (await store.query(GraphQuery(kinds=[NodeKind.DOC_CHUNK], limit=_ALL))).nodes
        return {
            str(n.attrs.get("commit", "")) for n in nodes if n.attrs.get("doc_source") == "commit"
        }

    def _git_log(self) -> list[tuple[str, int, str, str, list[str]]]:
        """Last ``limit`` non-merge commits as (sha, author_ts, author, subject,
        touched files). Subject-only keeps the ``--name-only`` parse unambiguous;
        full-body ingestion is a refinement.

        Returns ``[]`` when git is unavailable or the command fails.
        """
        try:
            out = subprocess.run(
                [
                    "git",
                    "-C",
                    self.root,
                    # Emit non-ASCII paths verbatim instead of C-quoted octal
                    # escapes, so they can match the path index.
                    "-c",
                    "core.quotePath=false",
                    "log",
                    f"-n{self.limit}",
                    "--no-merges",
                    "--no-color",
                    "--name-only",
                    f"--format={_RS}%H{_US}%ct{_US}%an{_US}%s",
                ],
                capture_output=True,
                text=True,
                # Decode explicitly: the locale codec would raise an uncaught
                # UnicodeDecodeError on a commit with undecodable bytes.
                encoding="utf-8",
                errors="replace",
                check=True,
            )
        except (subprocess.SubprocessError, OSError):
            return []
        return self._parse_log(out.stdout)

    @staticmethod
    def _parse_log(stdout: str) -> list[tuple[str, int, str, str, list[str]]]:
        """Parse the record-separated ``git log`` output built by ``_git_log``."""
        commits: list[tuple[str, int, str, str, list[str]]] = []
        for block in stdout.split(_RS):
            block = block.strip("\n")
            if not block:
                continue
            head, _, rest = block.partition("\n")
            # maxsplit keeps a subject intact even if it contains the separator
            parts = head.split(_US, 3)
            if len(parts) < 4:
                continue
            sha, ts_s, author, subject = parts
            files = [ln for ln in rest.splitlines() if ln.strip()]
            commits.append((sha, int(ts_s) if ts_s.isdigit() else 0, author, subject, files))
        return commits
@@ -0,0 +1,312 @@
1
+ """``KnowledgeIngestor`` — turn ADR markdown into graph facts (feat-010 MVP).
2
+
3
+ Each ADR becomes its own ``FileSubgraph`` keyed by its path: a ``Decision``
4
+ node, body ``DocChunk`` nodes (``CONTAINS``-linked; not embedded at MVP),
5
+ ``GOVERNS`` edges to the code it unambiguously mentions, and a ``SUPERSEDES``
6
+ edge to the ADR it replaces. Upserting per ADR means edits/deletes ride the
7
+ store's per-file machinery (feat-004) with no ChangeDetector change. Runs after
8
+ code indexing so the mention indices see current code; re-runs each index, and
9
+ GCs decisions whose ADR file is gone.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+ import re
16
+ from pathlib import Path
17
+
18
+ from agentforge_graph.core import (
19
+ Edge,
20
+ EdgeKind,
21
+ FileSubgraph,
22
+ GraphQuery,
23
+ GraphStore,
24
+ Node,
25
+ NodeKind,
26
+ Provenance,
27
+ SymbolID,
28
+ )
29
+ from agentforge_graph.core.symbols import normalize_path
30
+
31
+ from .adr import ADRParser, _sections
32
+ from .mentions import extract_mentions, resolve_mentions
33
+ from .report import KnowledgeStats
34
+
35
+ _ALL = 10_000_000
36
+ _DOC_LANG = "doc" # SymbolID lang slug — keeps decision ids in their own namespace
37
+ _SYMBOL_KINDS = {NodeKind.CLASS, NodeKind.FUNCTION, NodeKind.METHOD}
38
+ _NUM_RE = re.compile(r"(\d+)")
39
+
40
+
41
class KnowledgeIngestor:
    """Turn ADR markdown, and optionally general docs, into graph facts.

    Each ADR becomes one ``FileSubgraph`` keyed by its path (a ``Decision``
    node, ``DocChunk`` nodes, ``GOVERNS`` and ``SUPERSEDES`` edges). General
    docs become ``DocChunk`` nodes with ``DESCRIBES`` edges.
    """

    def __init__(self, repo: str, commit: str = "") -> None:
        self.repo = repo
        self.commit = commit
        self.parser = ADRParser()

    async def ingest(
        self,
        store: GraphStore,
        repo_path: str | Path,
        adr_globs: list[str],
        code_exts: set[str],
        doc_globs: list[str] | None = None,
    ) -> KnowledgeStats:
        """Index the ADRs matching ``adr_globs`` and the docs matching
        ``doc_globs`` under ``repo_path``; return the run's counters."""
        root = Path(repo_path)
        files = self._discover(root, adr_globs)
        adr_paths = {rel for rel, _ in files}

        # GC decisions whose ADR file vanished
        for path in await self._existing_decision_paths(store):
            if path not in adr_paths:
                await store.delete_file(path)

        stats = KnowledgeStats()
        doc_globs = doc_globs or []
        # both the ADR and the general-doc passes need the code indices, and the
        # doc pass must still run (to GC vanished docs) even when there are no ADRs.
        if not files and not doc_globs:
            return stats

        path_index, name_index = await self._code_indices(store)

        if files:
            num_to_decision = {self._adr_num(rel): self._decision_id(rel) for rel, _ in files}
            prov = Provenance.parsed(self.parser.name, self.commit)
            built: list[tuple[FileSubgraph, bool]] = []
            for rel, text in files:
                sg, has_supersedes, resolved, unresolved = self._build(
                    rel, text, prov, path_index, name_index, num_to_decision, code_exts
                )
                built.append((sg, has_supersedes))
                stats.decisions_indexed += 1
                stats.governs_resolved += resolved
                stats.mentions_unresolved += unresolved
            # round A: land every Decision/DocChunk/GOVERNS (no SUPERSEDES yet)
            for sg, _ in built:
                await store.upsert(self._without(sg, EdgeKind.SUPERSEDES))
            # round B: re-upsert ADRs that supersede, now that all Decisions exist
            for sg, has_supersedes in built:
                if has_supersedes:
                    await store.upsert(sg)

        if doc_globs:
            await self._ingest_docs(
                store, root, doc_globs, adr_paths, path_index, name_index, code_exts, stats
            )
        return stats

    # --- discovery & indices ---------------------------------------------

    # Index/landing/template pages that live under the ADR globs but are not
    # decisions (BUG-003).
    _NON_ADR_STEMS = {"readme", "index", "template", "_template", "0000-template"}

    @classmethod
    def _discover(cls, root: Path, adr_globs: list[str]) -> list[tuple[str, str]]:
        """Return sorted ``(repo-relative posix path, text)`` pairs for every
        file matching ``adr_globs``, skipping non-ADR stems and duplicates."""
        seen: dict[str, str] = {}
        for pattern in adr_globs:
            for path in sorted(root.glob(pattern)):
                if not path.is_file() or path.stem.lower() in cls._NON_ADR_STEMS:
                    continue
                rel = path.relative_to(root).as_posix()
                if rel not in seen:
                    seen[rel] = path.read_text(encoding="utf-8", errors="replace")
        return sorted(seen.items())

    async def _existing_decision_paths(self, store: GraphStore) -> set[str]:
        """SymbolID paths of every ``Decision`` node currently in the store."""
        nodes = (await store.query(GraphQuery(kinds=[NodeKind.DECISION], limit=_ALL))).nodes
        return {SymbolID.parse(n.id).path for n in nodes}

    async def _code_indices(self, store: GraphStore) -> tuple[dict[str, str], dict[str, list[str]]]:
        """Build ``path -> FILE id`` and ``symbol name -> [ids]`` lookups."""
        path_index: dict[str, str] = {}
        name_index: dict[str, list[str]] = {}
        for n in (await store.query(GraphQuery(limit=_ALL))).nodes:
            if n.kind is NodeKind.FILE:
                path_index[SymbolID.parse(n.id).path] = n.id
            elif n.kind in _SYMBOL_KINDS:
                name_index.setdefault(n.name, []).append(n.id)
        return path_index, name_index

    # --- building one ADR subgraph ---------------------------------------

    def _decision_id(self, rel: str) -> str:
        """SymbolID of the ``Decision`` node for the ADR at ``rel``."""
        return SymbolID.for_symbol(_DOC_LANG, self.repo, rel, "decision.")

    @staticmethod
    def _adr_num(rel: str) -> str:
        """First number in the file stem, without leading zeros ("" if none)."""
        m = _NUM_RE.search(Path(rel).stem)
        return str(int(m.group(1))) if m else ""

    def _build(
        self,
        rel: str,
        text: str,
        prov: Provenance,
        path_index: dict[str, str],
        name_index: dict[str, list[str]],
        num_to_decision: dict[str, str],
        code_exts: set[str],
    ) -> tuple[FileSubgraph, bool, int, int]:
        """Build one ADR's subgraph.

        Returns ``(subgraph, has_supersedes, governs edge count, unresolved
        mention count)``.
        """
        adr = self.parser.parse(rel, text)
        decision_id = self._decision_id(rel)
        nodes: list[Node] = [
            Node(
                id=decision_id,
                kind=NodeKind.DECISION,
                name=adr.title,
                attrs={
                    "title": adr.title,
                    "status": adr.status,
                    "date": adr.date,
                    "adr_id": adr.adr_id,
                    "path": normalize_path(rel),
                    "well_formed": adr.well_formed,
                },
                provenance=prov,
            )
        ]
        edges: list[Edge] = []
        for i, section in enumerate(adr.sections):
            chunk_id = SymbolID.for_symbol(_DOC_LANG, self.repo, rel, f"docchunk({i}).")
            nodes.append(
                Node(
                    id=chunk_id,
                    kind=NodeKind.DOC_CHUNK,
                    name=section.heading or f"section{i}",
                    attrs={
                        "path": normalize_path(rel),
                        "heading": section.heading,
                        "text": section.text,
                        "seq": i,
                        # hash of the embeddable text (heading + body) — lets the
                        # embed pass detect changed doc chunks (feat-010 follow-up).
                        "content_hash": hashlib.sha256(
                            f"{section.heading}\n{section.text}".encode()
                        ).hexdigest(),
                    },
                    provenance=prov,
                )
            )
            edges.append(
                Edge(src=decision_id, dst=chunk_id, kind=EdgeKind.CONTAINS, provenance=prov)
            )

        mentions = extract_mentions(adr.body, code_exts)
        targets, unresolved = resolve_mentions(mentions, path_index, name_index)
        for target in sorted(targets):
            edges.append(Edge(src=decision_id, dst=target, kind=EdgeKind.GOVERNS, provenance=prov))

        has_supersedes = False
        if adr.supersedes_num and adr.supersedes_num in num_to_decision:
            superseded = num_to_decision[adr.supersedes_num]
            if superseded != decision_id:  # an ADR cannot supersede itself
                edges.append(
                    Edge(
                        src=decision_id,
                        dst=superseded,
                        kind=EdgeKind.SUPERSEDES,
                        provenance=prov,
                    )
                )
                has_supersedes = True

        content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
        sg = FileSubgraph(path=rel, content_hash=content_hash, nodes=nodes, edges=edges)
        return sg, has_supersedes, len(targets), unresolved

    @staticmethod
    def _without(sg: FileSubgraph, kind: EdgeKind) -> FileSubgraph:
        """Copy of ``sg`` with every edge of ``kind`` removed."""
        return sg.model_copy(update={"edges": [e for e in sg.edges if e.kind is not kind]})

    # --- general docs (doc_globs) ----------------------------------------

    @classmethod
    def _discover_docs(
        cls, root: Path, doc_globs: list[str], adr_paths: set[str]
    ) -> list[tuple[str, str]]:
        """Markdown docs under ``doc_globs``, minus files already ingested as ADRs.
        Unlike ADR discovery, README/index pages ARE kept — they're general docs."""
        seen: dict[str, str] = {}
        for pattern in doc_globs:
            for path in sorted(root.glob(pattern)):
                if not path.is_file():
                    continue
                rel = path.relative_to(root).as_posix()
                if rel in adr_paths or rel in seen:
                    continue
                seen[rel] = path.read_text(encoding="utf-8", errors="replace")
        return sorted(seen.items())

    async def _existing_doc_paths(self, store: GraphStore) -> set[str]:
        """SymbolID paths of the stored chunks tagged ``doc_source == "doc"``."""
        nodes = (await store.query(GraphQuery(kinds=[NodeKind.DOC_CHUNK], limit=_ALL))).nodes
        return {SymbolID.parse(n.id).path for n in nodes if n.attrs.get("doc_source") == "doc"}

    async def _ingest_docs(
        self,
        store: GraphStore,
        root: Path,
        doc_globs: list[str],
        adr_paths: set[str],
        path_index: dict[str, str],
        name_index: dict[str, list[str]],
        code_exts: set[str],
        stats: KnowledgeStats,
    ) -> None:
        """Index general docs and garbage-collect chunks of docs that are gone.

        Updates ``stats.docs_indexed`` and ``stats.describes_resolved`` in place.
        """
        doc_files = self._discover_docs(root, doc_globs, adr_paths)
        current = {rel for rel, _ in doc_files}
        existing = await self._existing_doc_paths(store)
        # GC general-doc DocChunks whose source file vanished (per-file like ADRs)
        for path in sorted(existing - current):
            await store.delete_file(path)
        prov = Provenance.parsed("doc-ingestor", self.commit)
        for rel, text in doc_files:
            sg, resolved = self._build_doc(rel, text, prov, path_index, name_index, code_exts)
            if not sg.nodes:  # an empty/section-less doc contributes nothing
                # If the doc used to have content, drop its chunks; otherwise
                # they would stay in the store although the text is gone.
                if rel in existing:
                    await store.delete_file(rel)
                continue
            await store.upsert(sg)
            stats.docs_indexed += 1
            stats.describes_resolved += resolved

    def _build_doc(
        self,
        rel: str,
        text: str,
        prov: Provenance,
        path_index: dict[str, str],
        name_index: dict[str, list[str]],
        code_exts: set[str],
    ) -> tuple[FileSubgraph, int]:
        """A general doc → one DocChunk per markdown section, each ``DESCRIBES`` the
        code it unambiguously mentions (no Decision; docs describe, ADRs govern).

        Returns the subgraph and the number of ``DESCRIBES`` edges created.
        """
        nodes: list[Node] = []
        edges: list[Edge] = []
        resolved = 0
        for i, section in enumerate(_sections(text)):
            chunk_id = SymbolID.for_symbol(_DOC_LANG, self.repo, rel, f"docchunk({i}).")
            body = f"{section.heading}\n{section.text}"
            nodes.append(
                Node(
                    id=chunk_id,
                    kind=NodeKind.DOC_CHUNK,
                    name=section.heading or f"section{i}",
                    attrs={
                        "path": normalize_path(rel),
                        "heading": section.heading,
                        "text": section.text,
                        "seq": i,
                        "doc_source": "doc",  # distinguishes general docs from ADR chunks
                        "content_hash": hashlib.sha256(body.encode()).hexdigest(),
                    },
                    provenance=prov,
                )
            )
            mentions = extract_mentions(body, code_exts)
            targets, _unresolved = resolve_mentions(mentions, path_index, name_index)
            for target in sorted(targets):
                edges.append(
                    Edge(src=chunk_id, dst=target, kind=EdgeKind.DESCRIBES, provenance=prov)
                )
                resolved += 1
        content_hash = hashlib.sha256(text.encode("utf-8")).hexdigest()
        return FileSubgraph(path=rel, content_hash=content_hash, nodes=nodes, edges=edges), resolved
@@ -0,0 +1,71 @@
1
+ """Extract code mentions from an ADR body and resolve them — precisely — to
2
+ graph nodes (feat-010). Only **unambiguous** matches become ``GOVERNS`` edges;
3
+ ambiguous or unresolved mentions are counted, never guessed (ADR-0004)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import re
8
+ from dataclasses import dataclass, field
9
+
10
+ from agentforge_graph.core.symbols import normalize_path
11
+
12
+ _BACKTICK_RE = re.compile(r"`([^`]+)`")
13
+ # A qualified name: identifiers joined by '.' or '#', e.g. app.auth.login,
14
+ # Auth#login, PaymentService. No spaces, at least one identifier char.
15
+ _QUALNAME_RE = re.compile(r"^[A-Za-z_][\w]*(?:[.#][A-Za-z_][\w]*)*$")
16
+
17
+
18
@dataclass
class Mentions:
    """Code references pulled out of a document body."""

    # normalised repo-relative paths
    paths: set[str] = field(default_factory=set)
    # bare symbol names (last segment)
    names: set[str] = field(default_factory=set)
22
+
23
+
24
+ def _looks_like_path(token: str, code_exts: set[str]) -> bool:
25
+ return "/" in token and any(token.endswith(ext) for ext in code_exts)
26
+
27
+
28
+ def _leaf_name(qualname: str) -> str:
29
+ return re.split(r"[.#]", qualname.strip())[-1]
30
+
31
+
32
def extract_mentions(body: str, code_exts: set[str]) -> Mentions:
    """Collect the code paths and symbol names mentioned in ``body``.

    Backtick spans contribute a path (when they contain ``/`` and end with a
    code extension) or the leaf of a qualified name. Path-like tokens in plain
    prose are picked up as well, provided they contain a ``/``.

    A backticked bare file name such as ``utils.py`` is skipped. It has no
    directory, so it cannot be matched as a path, and reading it as a
    qualified name would contribute the bogus symbol name ``py``.
    """
    m = Mentions()
    file_suffixes = tuple("." + e.lstrip(".") for e in code_exts if e.lstrip("."))
    # backtick code spans: paths or qualified names
    for span in _BACKTICK_RE.findall(body):
        token = span.strip()
        if _looks_like_path(token, code_exts):
            m.paths.add(normalize_path(token))
        elif file_suffixes and token.endswith(file_suffixes):
            continue  # bare file name: ambiguous, so never guess
        elif _QUALNAME_RE.match(token):
            m.names.add(_leaf_name(token))
    # bare path-like tokens anywhere (e.g. mentioned in prose without backticks)
    ext_alt = "|".join(re.escape(e.lstrip(".")) for e in sorted(code_exts))
    if ext_alt:
        for token in re.findall(rf"[\w./-]+\.(?:{ext_alt})\b", body):
            if "/" in token:
                m.paths.add(normalize_path(token))
    return m
48
+
49
+
50
def resolve_mentions(
    mentions: Mentions,
    path_index: dict[str, str],
    name_index: dict[str, list[str]],
) -> tuple[set[str], int]:
    """Resolve mentions to node ids; return ``(target ids, unresolved count)``.

    A path resolves only by exact lookup in ``path_index``. A name resolves
    only when ``name_index`` lists exactly one candidate for it; a name with
    no candidate or with several is counted as unresolved, never guessed.
    """
    found: set[str] = set()
    misses = 0

    for mentioned_path in mentions.paths:
        hit = path_index.get(mentioned_path)
        if hit is None:
            misses += 1
            continue
        found.add(hit)

    for mentioned_name in mentions.names:
        ids = name_index.get(mentioned_name, [])
        if len(ids) != 1:  # 0 (unknown) or >1 (ambiguous)
            misses += 1
            continue
        found.add(ids[0])

    return found, misses
+ return targets, unresolved