agentforge-graph 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. agentforge_graph/__init__.py +6 -0
  2. agentforge_graph/chunking/__init__.py +12 -0
  3. agentforge_graph/chunking/cast.py +159 -0
  4. agentforge_graph/chunking/chunk.py +19 -0
  5. agentforge_graph/chunking/tokens.py +15 -0
  6. agentforge_graph/cli.py +607 -0
  7. agentforge_graph/config.py +259 -0
  8. agentforge_graph/core/__init__.py +54 -0
  9. agentforge_graph/core/conformance.py +270 -0
  10. agentforge_graph/core/contracts.py +163 -0
  11. agentforge_graph/core/kinds.py +68 -0
  12. agentforge_graph/core/models.py +134 -0
  13. agentforge_graph/core/provenance.py +62 -0
  14. agentforge_graph/core/symbols.py +116 -0
  15. agentforge_graph/embed/__init__.py +28 -0
  16. agentforge_graph/embed/base.py +22 -0
  17. agentforge_graph/embed/bedrock.py +85 -0
  18. agentforge_graph/embed/fake.py +34 -0
  19. agentforge_graph/embed/openai.py +67 -0
  20. agentforge_graph/embed/pipeline.py +184 -0
  21. agentforge_graph/embed/registry.py +66 -0
  22. agentforge_graph/embed/report.py +15 -0
  23. agentforge_graph/enrich/__init__.py +70 -0
  24. agentforge_graph/enrich/anthropic.py +38 -0
  25. agentforge_graph/enrich/anthropic_client.py +109 -0
  26. agentforge_graph/enrich/bedrock.py +24 -0
  27. agentforge_graph/enrich/bedrock_client.py +115 -0
  28. agentforge_graph/enrich/bedrock_summarizer.py +23 -0
  29. agentforge_graph/enrich/claude.py +172 -0
  30. agentforge_graph/enrich/enricher.py +108 -0
  31. agentforge_graph/enrich/governs.py +173 -0
  32. agentforge_graph/enrich/governs_enricher.py +152 -0
  33. agentforge_graph/enrich/heuristics.py +224 -0
  34. agentforge_graph/enrich/judge.py +63 -0
  35. agentforge_graph/enrich/registry.py +133 -0
  36. agentforge_graph/enrich/report.py +60 -0
  37. agentforge_graph/enrich/summarizer.py +62 -0
  38. agentforge_graph/enrich/summary_enricher.py +211 -0
  39. agentforge_graph/enrich/taxonomy.py +38 -0
  40. agentforge_graph/frameworks/__init__.py +29 -0
  41. agentforge_graph/frameworks/base.py +75 -0
  42. agentforge_graph/frameworks/detect.py +124 -0
  43. agentforge_graph/frameworks/extractor.py +63 -0
  44. agentforge_graph/frameworks/orm.py +93 -0
  45. agentforge_graph/frameworks/packs/_js_ast.py +56 -0
  46. agentforge_graph/frameworks/packs/_python_ast.py +157 -0
  47. agentforge_graph/frameworks/packs/django/__init__.py +240 -0
  48. agentforge_graph/frameworks/packs/django/models.scm +7 -0
  49. agentforge_graph/frameworks/packs/express/__init__.py +133 -0
  50. agentforge_graph/frameworks/packs/express/routes.scm +8 -0
  51. agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
  52. agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
  53. agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
  54. agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
  55. agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
  56. agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
  57. agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
  58. agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
  59. agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
  60. agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
  61. agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
  62. agentforge_graph/frameworks/registry.py +44 -0
  63. agentforge_graph/ingest/__init__.py +30 -0
  64. agentforge_graph/ingest/codegraph.py +847 -0
  65. agentforge_graph/ingest/extractor.py +353 -0
  66. agentforge_graph/ingest/incremental/__init__.py +25 -0
  67. agentforge_graph/ingest/incremental/detect.py +118 -0
  68. agentforge_graph/ingest/incremental/dirty.py +61 -0
  69. agentforge_graph/ingest/incremental/indexer.py +218 -0
  70. agentforge_graph/ingest/incremental/meta.py +72 -0
  71. agentforge_graph/ingest/incremental/ports.py +39 -0
  72. agentforge_graph/ingest/pack.py +160 -0
  73. agentforge_graph/ingest/packs/__init__.py +34 -0
  74. agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
  75. agentforge_graph/ingest/packs/cpp/references.scm +15 -0
  76. agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
  77. agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
  78. agentforge_graph/ingest/packs/csharp/references.scm +12 -0
  79. agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
  80. agentforge_graph/ingest/packs/go/__init__.py +38 -0
  81. agentforge_graph/ingest/packs/go/references.scm +12 -0
  82. agentforge_graph/ingest/packs/go/structure.scm +64 -0
  83. agentforge_graph/ingest/packs/java/__init__.py +35 -0
  84. agentforge_graph/ingest/packs/java/references.scm +12 -0
  85. agentforge_graph/ingest/packs/java/structure.scm +38 -0
  86. agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
  87. agentforge_graph/ingest/packs/javascript/references.scm +11 -0
  88. agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
  89. agentforge_graph/ingest/packs/php/__init__.py +35 -0
  90. agentforge_graph/ingest/packs/php/references.scm +15 -0
  91. agentforge_graph/ingest/packs/php/structure.scm +44 -0
  92. agentforge_graph/ingest/packs/python/__init__.py +25 -0
  93. agentforge_graph/ingest/packs/python/references.scm +14 -0
  94. agentforge_graph/ingest/packs/python/structure.scm +57 -0
  95. agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
  96. agentforge_graph/ingest/packs/ruby/references.scm +12 -0
  97. agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
  98. agentforge_graph/ingest/packs/rust/__init__.py +39 -0
  99. agentforge_graph/ingest/packs/rust/references.scm +12 -0
  100. agentforge_graph/ingest/packs/rust/structure.scm +46 -0
  101. agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
  102. agentforge_graph/ingest/packs/typescript/references.scm +11 -0
  103. agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
  104. agentforge_graph/ingest/pipeline.py +134 -0
  105. agentforge_graph/ingest/report.py +84 -0
  106. agentforge_graph/ingest/resolver.py +467 -0
  107. agentforge_graph/ingest/source.py +79 -0
  108. agentforge_graph/knowledge/__init__.py +28 -0
  109. agentforge_graph/knowledge/adr.py +136 -0
  110. agentforge_graph/knowledge/commits.py +152 -0
  111. agentforge_graph/knowledge/ingest.py +312 -0
  112. agentforge_graph/knowledge/mentions.py +71 -0
  113. agentforge_graph/knowledge/report.py +32 -0
  114. agentforge_graph/main.py +21 -0
  115. agentforge_graph/providers.py +36 -0
  116. agentforge_graph/repomap/__init__.py +14 -0
  117. agentforge_graph/repomap/rank.py +161 -0
  118. agentforge_graph/repomap/render.py +55 -0
  119. agentforge_graph/repomap/repomap.py +66 -0
  120. agentforge_graph/retrieve/__init__.py +21 -0
  121. agentforge_graph/retrieve/pack.py +76 -0
  122. agentforge_graph/retrieve/rerank.py +251 -0
  123. agentforge_graph/retrieve/retriever.py +286 -0
  124. agentforge_graph/retrieve/scoring.py +36 -0
  125. agentforge_graph/serve/__init__.py +19 -0
  126. agentforge_graph/serve/engine.py +204 -0
  127. agentforge_graph/serve/http_runner.py +133 -0
  128. agentforge_graph/serve/server.py +110 -0
  129. agentforge_graph/serve/tools.py +307 -0
  130. agentforge_graph/store/__init__.py +32 -0
  131. agentforge_graph/store/_rowmap.py +102 -0
  132. agentforge_graph/store/errors.py +22 -0
  133. agentforge_graph/store/facade.py +89 -0
  134. agentforge_graph/store/kuzu_store.py +380 -0
  135. agentforge_graph/store/lance_store.py +146 -0
  136. agentforge_graph/store/neo4j_store.py +294 -0
  137. agentforge_graph/store/pgvector_store.py +170 -0
  138. agentforge_graph/store/registry.py +45 -0
  139. agentforge_graph/temporal/__init__.py +36 -0
  140. agentforge_graph/temporal/backfill.py +338 -0
  141. agentforge_graph/temporal/events.py +82 -0
  142. agentforge_graph/temporal/index.py +190 -0
  143. agentforge_graph/temporal/mining.py +190 -0
  144. agentforge_graph/temporal/recorder.py +114 -0
  145. agentforge_graph/temporal/store.py +282 -0
  146. agentforge_graph-0.3.2.dist-info/METADATA +291 -0
  147. agentforge_graph-0.3.2.dist-info/RECORD +151 -0
  148. agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
  149. agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
  150. agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
  151. agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
@@ -0,0 +1,34 @@
1
+ """Deterministic, dependency-free embedder for tests and CI — no creds, no
2
+ network. Same text always yields the same L2-normalized vector, so retrieval
3
+ tests are reproducible."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import hashlib
8
+ import math
9
+ import struct
10
+
11
+ from .base import Embedder, InputType
12
+
13
+
14
class FakeEmbedder(Embedder):
    """Offline embedder whose vectors are derived purely from SHA-256 digests.

    The same text always produces the same vector, so results are repeatable
    without credentials or network access.
    """

    def __init__(self, dim: int = 256) -> None:
        self.name = "fake"
        self.dim = dim

    async def embed(
        self, texts: list[str], input_type: InputType = "document"
    ) -> list[list[float]]:
        """Return one vector per input text, in input order.

        ``input_type`` is accepted for interface compatibility and not used.
        """
        vectors: list[list[float]] = []
        for text in texts:
            vectors.append(self._vector(text))
        return vectors

    def _vector(self, text: str) -> list[float]:
        """Expand ``text`` into ``self.dim`` floats and divide by their L2 norm.

        Bytes come from SHA-256 over the UTF-8 text plus a 4-byte big-endian
        counter; each 4-byte big-endian word becomes one component. A zero
        norm (e.g. ``dim == 0``) is replaced by 1.0 to avoid dividing by zero.
        """
        need = self.dim * 4  # four bytes (one uint32) per component
        digest_len = hashlib.sha256().digest_size
        # Number of digests required to cover ``need`` bytes (ceil division).
        rounds = -(-need // digest_len) if need > 0 else 0
        stream = b"".join(
            hashlib.sha256(text.encode("utf-8") + counter.to_bytes(4, "big")).digest()
            for counter in range(rounds)
        )
        words = struct.unpack(f">{self.dim}I", stream[:need])
        vals = [(w / 2**32) * 2.0 - 1.0 for w in words]  # finite, in [-1, 1)
        norm = math.sqrt(sum(v * v for v in vals)) or 1.0
        return [v / norm for v in vals]
@@ -0,0 +1,67 @@
1
+ """``OpenAIEmbedder`` — OpenAI (and OpenAI-compatible) embeddings (ENH-003
2
+ phase 2; the most-requested non-AWS path, and the **local model** path).
3
+
4
+ Lazy-imports the ``openai`` SDK (the ``openai`` extra); synchronous calls run on
5
+ a worker thread, mirroring ``BedrockEmbedder``. Setting ``embed.base_url`` points
6
+ the same adapter at any OpenAI-compatible server — a local Ollama
7
+ (``http://localhost:11434/v1``), vLLM, LM Studio, or a gateway — so "bring your
8
+ own / run it locally" is a config line, not a new adapter.
9
+
10
+ ``text-embedding-3-*`` models support arbitrary output ``dimensions``; ``dim``
11
+ is passed through. Credentials come from ``OPENAI_API_KEY`` (the SDK default)
12
+ unless ``api_key_env`` overrides it. Imports nothing from ``agentforge``
13
+ (ADR-0001).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import os
20
+ from typing import Any
21
+
22
+ from .base import Embedder, InputType
23
+
24
+
25
class OpenAIEmbedder(Embedder):
    """Embedder that calls the OpenAI embeddings endpoint through the SDK.

    The ``openai`` package is imported on first use. Requests are made with the
    synchronous client on a worker thread (``asyncio.to_thread``). When
    ``base_url`` is non-empty it is passed to the client, so the same adapter
    can target an OpenAI-compatible server.
    """

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        dim: int = 1536,
        batch_size: int = 96,
        base_url: str = "",
        api_key_env: str = "OPENAI_API_KEY",
    ) -> None:
        """Store the configuration; no client is created yet.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1. A negative value
                would make the batching ``range`` empty and silently drop every
                input; zero would only fail later, inside ``embed``.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.name = f"openai:{model}"
        self.model = model
        self.dim = dim
        self.batch_size = batch_size
        self.base_url = base_url
        self.api_key_env = api_key_env
        self._client: Any = None

    def _openai(self) -> Any:
        """Return the cached SDK client, building it on first call."""
        if self._client is None:
            import openai

            kwargs: dict[str, Any] = {}
            key = os.environ.get(self.api_key_env)
            if key:
                kwargs["api_key"] = key
            elif self.base_url and not os.environ.get("OPENAI_API_KEY"):
                # The SDK refuses to construct a client without any API key,
                # but a local OpenAI-compatible server typically ignores it.
                # Supply a placeholder only when a custom endpoint is set and
                # no key is available from either environment variable.
                kwargs["api_key"] = "not-needed"
            if self.base_url:
                kwargs["base_url"] = self.base_url
            self._client = openai.OpenAI(**kwargs)
        return self._client

    async def embed(
        self, texts: list[str], input_type: InputType = "document"
    ) -> list[list[float]]:
        """Embed ``texts`` in batches of ``batch_size``; output follows input order.

        ``input_type`` is ignored — OpenAI embeddings are symmetric.
        """
        out: list[list[float]] = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start : start + self.batch_size]
            # The SDK call blocks, so run it off the event loop.
            out.extend(await asyncio.to_thread(self._invoke, batch))
        return out

    def _invoke(self, batch: list[str]) -> list[list[float]]:
        """Make one blocking embeddings request and return plain float lists."""
        resp = self._openai().embeddings.create(
            model=self.model, input=batch, dimensions=self.dim
        )
        return [[float(x) for x in item.embedding] for item in resp.data]
@@ -0,0 +1,184 @@
1
+ """``EmbedPipeline`` — chunk the indexed code and embed the chunks.
2
+
3
+ Per file: pull its symbol nodes from the graph, chunk them, write `CHUNK`
4
+ nodes + `CHUNK_OF` edges, embed the chunk texts, and upsert vectors. Coarse
5
+ incrementality at 0.1: if a file's chunk-hash set is unchanged, skip
6
+ re-embedding (saves cost); otherwise clean-replace the file's chunk vectors.
7
+ feat-004 will scope this to a DirtySet.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ from pathlib import Path
14
+
15
+ from agentforge_graph.chunking import Chunker
16
+ from agentforge_graph.core import (
17
+ Edge,
18
+ EdgeKind,
19
+ Embedded,
20
+ GraphQuery,
21
+ Node,
22
+ NodeKind,
23
+ Provenance,
24
+ SymbolID,
25
+ )
26
+ from agentforge_graph.ingest import PackRegistry, RepoSource
27
+ from agentforge_graph.store import Store
28
+
29
+ from .base import Embedder
30
+ from .report import EmbedReport
31
+
32
+ _ALL = 10_000_000
33
+
34
+
35
class EmbedPipeline:
    """Chunk indexed source files, record the chunks in the graph, and store
    one embedding vector per chunk.

    For each file the pipeline reads the file's nodes from the graph, chunks
    the non-chunk nodes, and compares the chunks' content hashes with those of
    the chunk nodes already stored. An identical hash set skips the file;
    otherwise the chunks are embedded and the file's vectors are replaced.
    """

    def __init__(self, chunker: Chunker, embedder: Embedder, commit: str = "") -> None:
        self.chunker = chunker
        self.embedder = embedder
        self.commit = commit
        self.name = "cast-chunker"  # passed to ``Provenance.parsed`` with the commit

    async def run(
        self,
        store: Store,
        source: RepoSource,
        registry: PackRegistry,
        only_paths: set[str] | None = None,
        doc_root: Path | None = None,
    ) -> EmbedReport:
        """Embed the indexed code. When ``only_paths`` is given (feat-004: the
        files a refresh dirtied), only those files are re-chunked/embedded;
        otherwise every file is visited (the chunk-hash skip still avoids
        re-embedding unchanged files).

        After the code pass, doc chunks are embedded via ``_embed_docs`` using
        ``doc_root`` for its manifest. Returns the populated ``EmbedReport``.
        """
        report = EmbedReport(model=self.embedder.name, dim=self.embedder.dim)
        prov = Provenance.parsed(self.name, self.commit)

        for sf in source.iter_files(registry):
            if only_paths is not None and sf.path not in only_paths:
                continue
            found = await store.graph.query(GraphQuery(path_prefix=sf.path, limit=_ALL))
            # The query filters by path prefix (presumably also matching longer
            # paths), so keep only nodes whose id parses to exactly this path.
            nodes_for_path = [n for n in found.nodes if SymbolID.parse(n.id).path == sf.path]
            symbols = [n for n in nodes_for_path if n.kind is not NodeKind.CHUNK]
            if not symbols:
                continue
            chunks = self.chunker.chunk(sf, symbols)
            if not chunks:
                continue
            report.files += 1
            report.chunks += len(chunks)

            prior = {
                n.attrs.get("content_hash") for n in nodes_for_path if n.kind is NodeKind.CHUNK
            }
            if prior and prior == {c.content_hash for c in chunks}:
                report.skipped_unchanged += 1
                continue

            # Embed BEFORE touching the stores. The chunk nodes written below
            # carry the content hashes that drive the skip check above; if they
            # were stored (and old vectors deleted) before a failing embed call,
            # later runs would see matching hashes and never embed this file.
            vectors = await self.embedder.embed([c.text for c in chunks], input_type="document")

            repo = SymbolID.parse(symbols[0].id).repo
            file_id = SymbolID.for_symbol(sf.language, repo, sf.path, "")
            await store.graph.add(self._chunk_graph_items(chunks, file_id, prov))

            # NOTE(review): a failure in the two vector-store calls below would
            # still leave the hash markers in place; writing the graph items
            # last would close that window if the stores allow that ordering.
            await store.vectors.delete_where({"path": sf.path})  # clean-replace this file
            await store.vectors.upsert(
                [
                    Embedded(
                        ref=ch.id,
                        vector=vec,
                        kind=NodeKind.CHUNK,
                        attrs={
                            "path": ch.path,
                            "span": list(ch.span),
                            "symbol_ids": ch.symbol_ids,
                            "source_type": "code",  # vs "doc" (feat-010) — lets
                            "model": self.embedder.name,  # retrieval tell them apart
                        },
                    )
                    for ch, vec in zip(chunks, vectors, strict=True)
                ]
            )
            report.embedded += len(chunks)

        report.doc_chunks = await self._embed_docs(store, doc_root)
        return report

    @staticmethod
    def _chunk_graph_items(chunks, file_id, prov: Provenance) -> list[Node | Edge]:
        """Build the ``CHUNK`` node and ``CHUNK_OF`` edges for every chunk.

        A chunk with no ``symbol_ids`` is linked to ``file_id`` instead.
        """
        items: list[Node | Edge] = []
        for ch in chunks:
            items.append(
                Node(
                    id=ch.id,
                    kind=NodeKind.CHUNK,
                    name=f"chunk{ch.seq}",
                    span=ch.span,
                    attrs={
                        "path": ch.path,
                        "token_count": ch.token_count,
                        "content_hash": ch.content_hash,
                        "seq": ch.seq,
                        "code": ch.code,  # carried for retrieval rendering (feat-006)
                    },
                    provenance=prov,
                )
            )
            items.extend(
                Edge(src=ch.id, dst=target, kind=EdgeKind.CHUNK_OF, provenance=prov)
                for target in (ch.symbol_ids or [file_id])
            )
        return items

    async def _embed_docs(self, store: Store, doc_root: Path | None = None) -> int:
        """Embed ADR/doc ``DocChunk`` prose so an architectural query surfaces the
        governing decision / documented symbol (feat-010). A ``source_type: doc``
        tag keeps these distinct from code chunks. Incremental: a fingerprint of all
        doc chunks (ids + content hashes + embedder) is recorded under ``doc_root``;
        when it is unchanged the whole pass is skipped (no API calls). On any change
        it clean-replaces every doc vector (the simple, orphan-safe path for the
        small doc set).

        Returns the number of doc chunks embedded (0 when skipped or none exist).
        """
        docs = (await store.graph.query(GraphQuery(kinds=[NodeKind.DOC_CHUNK], limit=_ALL))).nodes
        manifest = (doc_root / "doc_embed.hash") if doc_root is not None else None
        if not docs:
            await store.vectors.delete_where({"kind": NodeKind.DOC_CHUNK.value})
            if manifest is not None and manifest.exists():
                manifest.unlink()
            return 0
        fp_body = "".join(
            f"{n.id}|{n.attrs.get('content_hash', '')};" for n in sorted(docs, key=lambda z: z.id)
        )
        fingerprint = hashlib.sha256(
            f"{self.embedder.name}:{self.embedder.dim}:{fp_body}".encode()
        ).hexdigest()
        if (
            manifest is not None
            and manifest.exists()
            and manifest.read_text().strip() == fingerprint
        ):
            return 0  # docs unchanged since the last embed → skip the re-embed

        # Embed first: if the API call fails, the existing doc vectors and the
        # manifest are left exactly as they were.
        texts = [f"{n.attrs.get('heading', '')}\n{n.attrs.get('text', '')}".strip() for n in docs]
        vectors = await self.embedder.embed(texts, input_type="document")

        # Invalidate the manifest before the destructive replace. Otherwise a
        # failure between delete and upsert would leave the OLD fingerprint on
        # disk, and reverting the docs to that state would skip the pass while
        # the vectors are missing.
        if manifest is not None:
            manifest.unlink(missing_ok=True)

        # clean-replace via the DocChunk kind (a filterable vector column) — this
        # also GCs vectors for docs/ADRs that were removed since the last embed.
        await store.vectors.delete_where({"kind": NodeKind.DOC_CHUNK.value})
        await store.vectors.upsert(
            [
                Embedded(
                    ref=n.id,
                    vector=vec,
                    kind=NodeKind.DOC_CHUNK,
                    attrs={
                        "path": n.attrs.get("path", ""),
                        "source_type": "doc",
                        "heading": n.attrs.get("heading", ""),
                        "model": self.embedder.name,
                    },
                )
                for n, vec in zip(docs, vectors, strict=True)
            ]
        )
        if manifest is not None:
            manifest.parent.mkdir(parents=True, exist_ok=True)
            manifest.write_text(fingerprint)
        return len(docs)
@@ -0,0 +1,66 @@
1
+ """Resolve an ``Embedder`` from ``EmbedConfig`` via the provider registry.
2
+
3
+ Built-ins (``fake``, ``bedrock``, ``openai``) are registered below; third-party
4
+ embedders register out-of-tree under the ``agentforge_graph.embedder_providers``
5
+ entry-point group (``pip install`` + one ``embed.driver`` line, no core change).
6
+ Each live driver lazy-imports its SDK so the base/fake path needs neither boto3
7
+ nor openai. ``openai`` also covers OpenAI-compatible local servers via
8
+ ``embed.base_url`` (ENH-003 phase 2).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from collections.abc import Callable
14
+
15
+ from agentforge_graph.config import EmbedConfig
16
+ from agentforge_graph.providers import resolve_provider
17
+
18
+ from .base import Embedder
19
+
20
+ EMBEDDER_GROUP = "agentforge_graph.embedder_providers"
21
+
22
+ # A builder takes the parsed ``embed:`` block and returns a ready Embedder.
23
+ EmbedderBuilder = Callable[[EmbedConfig], Embedder]
24
+
25
+
26
def _build_fake(cfg: EmbedConfig) -> Embedder:
    """Builder for the ``fake`` driver; only ``cfg.dim`` is read."""
    from .fake import FakeEmbedder as _FakeEmbedder

    embedder = _FakeEmbedder(dim=cfg.dim)
    return embedder
30
+
31
+
32
def _build_bedrock(cfg: EmbedConfig) -> Embedder:
    """Builder for the ``bedrock`` driver, configured from the ``embed:`` block."""
    # Deferred import: boto3 is only required when this driver is selected.
    from .bedrock import BedrockEmbedder

    options = {
        "model": cfg.model,
        "region": cfg.region,
        "dim": cfg.dim,
        "batch_size": cfg.batch_size,
        # A falsy value (e.g. an empty string) is passed on as ``None``.
        "assume_role_arn": cfg.assume_role_arn or None,
    }
    return BedrockEmbedder(**options)
42
+
43
+
44
def _build_openai(cfg: EmbedConfig) -> Embedder:
    """Builder for the ``openai`` driver, configured from the ``embed:`` block."""
    # Deferred import: the openai SDK is only required when this driver is selected.
    from .openai import OpenAIEmbedder

    options = {
        "model": cfg.model,
        "dim": cfg.dim,
        "batch_size": cfg.batch_size,
        "base_url": cfg.base_url,
        # Fall back to the SDK's usual variable name when none is configured.
        "api_key_env": cfg.api_key_env or "OPENAI_API_KEY",
    }
    return OpenAIEmbedder(**options)
54
+
55
+
56
# Driver name → builder for the embedders shipped in this package.
_EMBEDDER_BUILTINS: dict[str, EmbedderBuilder] = dict(
    fake=_build_fake,
    bedrock=_build_bedrock,
    openai=_build_openai,
)
61
+
62
+
63
def embedder_from_config(cfg: EmbedConfig) -> Embedder:
    """Return the ``Embedder`` for ``cfg.driver``.

    The builder is looked up with ``resolve_provider`` (given the built-in
    table and the entry-point group name) and then called with ``cfg``.
    """
    build = resolve_provider(
        cfg.driver,
        _EMBEDDER_BUILTINS,
        EMBEDDER_GROUP,
        role="embedder",
    )
    embedder = build(cfg)
    return embedder
@@ -0,0 +1,15 @@
1
+ """Result type for an embedding run."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
class EmbedReport(BaseModel):
    # Counters describing one embedding run; every field defaults to zero/empty.

    # Files that produced at least one chunk.
    files: int = 0
    # Chunks produced across those files.
    chunks: int = 0
    # Chunks that were actually embedded.
    embedded: int = 0
    # Files whose chunk set was unchanged (hash-skip).
    skipped_unchanged: int = 0
    # ADR/doc DocChunks embedded for semantic search (feat-010).
    doc_chunks: int = 0
    # Embedder name and vector dimension the run used.
    model: str = ""
    dim: int = 0
@@ -0,0 +1,70 @@
1
+ """LLM enrichment (feat-012): turn the code graph into a knowledge graph.
2
+
3
+ MVP: **design-pattern tagging** — deterministic structural heuristics nominate
4
+ candidates, a budgeted LLM judge confirms them, and confirmed verdicts become
5
+ ``TAGGED`` edges to a fixed v1 ``PatternTag`` taxonomy with honest ``llm``
6
+ provenance + confidence + rationale. The judge is injectable
7
+ (``ScriptedJudge`` for tests, ``BedrockClaudeJudge`` live), so all orchestration
8
+ is deterministic. This is the framework layer (ADR-0001: ``enrich`` may import
9
+ ``agentforge``); never runs implicitly (``ckg enrich`` only).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from .enricher import PatternTagEnricher
15
+ from .governs import (
16
+ ClaudeGovernsMatcher,
17
+ GovernsCandidate,
18
+ GovernsMatch,
19
+ GovernsMatcher,
20
+ ScriptedMatcher,
21
+ )
22
+ from .governs_enricher import DecisionGovernsInferencer
23
+ from .heuristics import Candidate, PatternHeuristics
24
+ from .judge import PatternJudge, ScriptedJudge, Verdict
25
+ from .registry import (
26
+ JUDGE_GROUP,
27
+ SUMMARIZER_GROUP,
28
+ governs_matcher_from_config,
29
+ judge_from_config,
30
+ summarizer_from_config,
31
+ )
32
+ from .report import EnrichReport, GovernsReport, SummaryInfo, SummaryReport, TaggedInfo
33
+ from .summarizer import FileContext, ScriptedSummarizer, Summarizer, Summary
34
+ from .summary_enricher import SummaryEnricher, repo_node_id, summary_id
35
+ from .taxonomy import TAXONOMY_V1, is_pattern, pattern_tag_id
36
+
37
+ __all__ = [
38
+ "PatternTagEnricher",
39
+ "PatternHeuristics",
40
+ "Candidate",
41
+ "PatternJudge",
42
+ "ScriptedJudge",
43
+ "Verdict",
44
+ "judge_from_config",
45
+ "summarizer_from_config",
46
+ "governs_matcher_from_config",
47
+ "DecisionGovernsInferencer",
48
+ "GovernsMatcher",
49
+ "ScriptedMatcher",
50
+ "ClaudeGovernsMatcher",
51
+ "GovernsCandidate",
52
+ "GovernsMatch",
53
+ "GovernsReport",
54
+ "JUDGE_GROUP",
55
+ "SUMMARIZER_GROUP",
56
+ "EnrichReport",
57
+ "TaggedInfo",
58
+ "SummaryReport",
59
+ "SummaryInfo",
60
+ "SummaryEnricher",
61
+ "Summarizer",
62
+ "ScriptedSummarizer",
63
+ "Summary",
64
+ "FileContext",
65
+ "summary_id",
66
+ "repo_node_id",
67
+ "TAXONOMY_V1",
68
+ "is_pattern",
69
+ "pattern_tag_id",
70
+ ]
@@ -0,0 +1,38 @@
1
+ """``AnthropicClaudeJudge`` / ``AnthropicClaudeSummarizer`` — the **direct
2
+ Anthropic API** enrichers (ENH-003 phase 2; the non-AWS Claude path).
3
+
4
+ Thin endpoint adapters: the judging/summary logic is the provider-neutral
5
+ ``ClaudeJudge`` / ``ClaudeSummarizer`` (``claude.py``); these wire it to an
6
+ ``AnthropicClient`` transport. Pick them with ``enrich.provider: anthropic`` and
7
+ set ``ANTHROPIC_API_KEY`` (``enrich.model`` may stay the Bedrock default — the
8
+ id is normalised to its API form). Tests use the ``Scripted*`` variants.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from .anthropic_client import AnthropicClient
14
+ from .claude import ClaudeJudge, ClaudeSummarizer
15
+
16
+
17
class AnthropicClaudeJudge(ClaudeJudge):
    """``ClaudeJudge`` wired to an ``AnthropicClient`` transport."""

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        api_key_env: str = "ANTHROPIC_API_KEY",
        base_url: str = "",
        max_tokens: int = 512,
    ) -> None:
        transport = AnthropicClient(
            model=model,
            api_key_env=api_key_env,
            base_url=base_url,
            max_tokens=max_tokens,
        )
        # Pass on the transport's own ``model`` attribute rather than the raw
        # argument, so the base class sees the id the transport will send.
        super().__init__(transport, transport.model)
27
+
28
+
29
class AnthropicClaudeSummarizer(ClaudeSummarizer):
    """``ClaudeSummarizer`` wired to an ``AnthropicClient`` transport."""

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        api_key_env: str = "ANTHROPIC_API_KEY",
        base_url: str = "",
        max_tokens: int = 400,
    ) -> None:
        transport = AnthropicClient(
            model=model,
            api_key_env=api_key_env,
            base_url=base_url,
            max_tokens=max_tokens,
        )
        # Pass on the transport's own ``model`` attribute rather than the raw
        # argument, so the base class sees the id the transport will send.
        super().__init__(transport, transport.model)
@@ -0,0 +1,109 @@
1
+ """Shared **direct Anthropic API** Claude client for the enrichers (ENH-003
2
+ phase 2 — the non-AWS path).
3
+
4
+ Mirrors ``BedrockClient``: one lazily-built ``anthropic.Anthropic`` client, a
5
+ synchronous Messages call run on a worker thread, cost accumulated from token
6
+ usage. Returns ``resp.model_dump()`` — the same ``content``/``usage`` dict shape
7
+ Bedrock returns — so ``ClaudeJudge`` / ``ClaudeSummarizer`` parse it unchanged.
8
+
9
+ The ``anthropic`` SDK ships with the base install (pulled by
10
+ ``agentforge-anthropic[anthropic]``); it is imported lazily so the
11
+ scripted/offline path never needs it. Credentials come from ``ANTHROPIC_API_KEY``
12
+ (the SDK's default env var) unless overridden.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import os
19
+ from typing import Any
20
+
21
+ from .bedrock_client import price_for # provider-neutral USD-per-token table
22
+
23
# Bedrock ids carry an inference-profile prefix (``us.``/``eu.``/``global.``/…)
# and a ``-v<N>:0`` version suffix; the direct API wants the bare model id.
# Normalising lets the same ``enrich.model`` default work on either provider.
_PROFILE_PREFIXES = ("us.", "eu.", "apac.", "us-gov.", "global.", "jp.", "au.")


def api_model_id(model: str) -> str:
    """Map a Bedrock model id to its Anthropic-API equivalent (idempotent on an
    id that is already an API id). ``us.anthropic.claude-haiku-4-5-20251001-v1:0``
    → ``claude-haiku-4-5-20251001``.

    Three things are removed, each only if present: one leading
    inference-profile prefix from ``_PROFILE_PREFIXES``, the ``anthropic.``
    vendor prefix, and a trailing ``-v<digits>:0`` version suffix (so ``-v2:0``
    is handled as well as ``-v1:0``). Anything else is returned unchanged.
    """
    m = model
    for prefix in _PROFILE_PREFIXES:
        if m.startswith(prefix):
            m = m[len(prefix) :]
            break  # at most one profile prefix is stripped
    m = m.removeprefix("anthropic.")
    head, sep, tail = m.rpartition("-v")
    if sep:
        major, colon, minor = tail.partition(":")
        # Only the ``-v<N>:0`` form is treated as a version suffix; other
        # shapes such as ``-v2:1`` are left alone, as before.
        if colon and major.isdigit() and minor == "0":
            m = head
    return m
43
+
44
+
45
class AnthropicClient:
    """Transport that sends one Messages request per ``invoke`` call and keeps
    a running cost total in ``cost_usd``.

    The ``anthropic`` SDK client is created on first use. The blocking SDK call
    runs on a worker thread, and the response is returned as the dict produced
    by ``model_dump()``.
    """

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        api_key_env: str = "ANTHROPIC_API_KEY",
        base_url: str = "",
        max_tokens: int = 512,
    ) -> None:
        self.model = api_model_id(model)  # accept Bedrock-style ids too
        self.api_key_env = api_key_env
        self.base_url = base_url
        self.max_tokens = max_tokens
        self._client: Any = None
        self.cost_usd = 0.0

    def _anthropic(self) -> Any:
        """Return the cached SDK client, building it on first call.

        ``api_key`` / ``base_url`` are passed only when set, so the SDK's own
        defaults apply otherwise.
        """
        if self._client is None:
            import anthropic

            kwargs: dict[str, Any] = {}
            key = os.environ.get(self.api_key_env)
            if key:
                kwargs["api_key"] = key
            if self.base_url:
                kwargs["base_url"] = self.base_url
            self._client = anthropic.Anthropic(**kwargs)
        return self._client

    @staticmethod
    def _usage_cost(
        usage: dict[str, Any] | None,
        usd_in_per_mtok: float,
        usd_out_per_mtok: float,
    ) -> float:
        """Return the USD cost of one call from its ``usage`` block.

        The rates are treated as prices per million tokens: the weighted token
        sum is divided by 1_000_000. A missing or ``None`` usage block, or a
        missing or ``None`` token count, contributes zero instead of raising.
        """
        usage = usage or {}
        tokens_in = usage.get("input_tokens") or 0
        tokens_out = usage.get("output_tokens") or 0
        return (tokens_in * usd_in_per_mtok + tokens_out * usd_out_per_mtok) / 1_000_000

    async def invoke(
        self,
        system: str,
        user: str,
        tools: list[dict[str, Any]] | None = None,
        tool_name: str | None = None,
    ) -> dict[str, Any]:
        """One Messages call; accumulates cost from usage. Returns the raw
        payload (``content`` blocks + ``usage``)."""
        payload = await asyncio.to_thread(self._invoke, system, user, tools, tool_name)
        usd_in, usd_out = price_for(self.model)
        self.cost_usd += self._usage_cost(payload.get("usage"), usd_in, usd_out)
        return payload

    def _invoke(
        self,
        system: str,
        user: str,
        tools: list[dict[str, Any]] | None,
        tool_name: str | None,
    ) -> dict[str, Any]:
        """Blocking request: build the Messages arguments, call the SDK, and
        return ``model_dump()`` of the response.

        ``tools`` and a forced ``tool_choice`` are included only when given.
        """
        kwargs: dict[str, Any] = {
            "model": self.model,
            "max_tokens": self.max_tokens,
            "system": system,
            "messages": [{"role": "user", "content": user}],
        }
        if tools is not None:
            kwargs["tools"] = tools
        if tool_name is not None:
            kwargs["tool_choice"] = {"type": "tool", "name": tool_name}
        resp = self._anthropic().messages.create(**kwargs)
        result: dict[str, Any] = resp.model_dump()
        return result
@@ -0,0 +1,24 @@
1
+ """``BedrockClaudeJudge`` — the pattern judge on AWS Bedrock (feat-012).
2
+
3
+ Thin endpoint adapter: the judging logic (prompts, forced ``submit_verdicts``
4
+ tool call, verdict parsing, cost) lives in the provider-neutral ``ClaudeJudge``
5
+ (``claude.py``); this just wires it to a Bedrock transport (``BedrockClient``).
6
+ The Anthropic-API sibling is ``AnthropicClaudeJudge`` (``anthropic.py``). Tests
7
+ use the ``ScriptedJudge`` instead.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from .bedrock_client import BedrockClient
13
+ from .claude import ClaudeJudge
14
+
15
+
16
class BedrockClaudeJudge(ClaudeJudge):
    """``ClaudeJudge`` wired to a ``BedrockClient`` transport."""

    def __init__(
        self,
        model: str = "us.anthropic.claude-haiku-4-5-20251001-v1:0",
        region: str = "us-east-1",
        assume_role_arn: str | None = None,
        max_tokens: int = 512,
    ) -> None:
        transport = BedrockClient(model, region, assume_role_arn, max_tokens)
        # The base class receives the transport and the model id as given.
        super().__init__(transport, model)