agentforge-graph 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_graph/__init__.py +6 -0
- agentforge_graph/chunking/__init__.py +12 -0
- agentforge_graph/chunking/cast.py +159 -0
- agentforge_graph/chunking/chunk.py +19 -0
- agentforge_graph/chunking/tokens.py +15 -0
- agentforge_graph/cli.py +607 -0
- agentforge_graph/config.py +259 -0
- agentforge_graph/core/__init__.py +54 -0
- agentforge_graph/core/conformance.py +270 -0
- agentforge_graph/core/contracts.py +163 -0
- agentforge_graph/core/kinds.py +68 -0
- agentforge_graph/core/models.py +134 -0
- agentforge_graph/core/provenance.py +62 -0
- agentforge_graph/core/symbols.py +116 -0
- agentforge_graph/embed/__init__.py +28 -0
- agentforge_graph/embed/base.py +22 -0
- agentforge_graph/embed/bedrock.py +85 -0
- agentforge_graph/embed/fake.py +34 -0
- agentforge_graph/embed/openai.py +67 -0
- agentforge_graph/embed/pipeline.py +184 -0
- agentforge_graph/embed/registry.py +66 -0
- agentforge_graph/embed/report.py +15 -0
- agentforge_graph/enrich/__init__.py +70 -0
- agentforge_graph/enrich/anthropic.py +38 -0
- agentforge_graph/enrich/anthropic_client.py +109 -0
- agentforge_graph/enrich/bedrock.py +24 -0
- agentforge_graph/enrich/bedrock_client.py +115 -0
- agentforge_graph/enrich/bedrock_summarizer.py +23 -0
- agentforge_graph/enrich/claude.py +172 -0
- agentforge_graph/enrich/enricher.py +108 -0
- agentforge_graph/enrich/governs.py +173 -0
- agentforge_graph/enrich/governs_enricher.py +152 -0
- agentforge_graph/enrich/heuristics.py +224 -0
- agentforge_graph/enrich/judge.py +63 -0
- agentforge_graph/enrich/registry.py +133 -0
- agentforge_graph/enrich/report.py +60 -0
- agentforge_graph/enrich/summarizer.py +62 -0
- agentforge_graph/enrich/summary_enricher.py +211 -0
- agentforge_graph/enrich/taxonomy.py +38 -0
- agentforge_graph/frameworks/__init__.py +29 -0
- agentforge_graph/frameworks/base.py +75 -0
- agentforge_graph/frameworks/detect.py +124 -0
- agentforge_graph/frameworks/extractor.py +63 -0
- agentforge_graph/frameworks/orm.py +93 -0
- agentforge_graph/frameworks/packs/_js_ast.py +56 -0
- agentforge_graph/frameworks/packs/_python_ast.py +157 -0
- agentforge_graph/frameworks/packs/django/__init__.py +240 -0
- agentforge_graph/frameworks/packs/django/models.scm +7 -0
- agentforge_graph/frameworks/packs/express/__init__.py +133 -0
- agentforge_graph/frameworks/packs/express/routes.scm +8 -0
- agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
- agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
- agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
- agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
- agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
- agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
- agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
- agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
- agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
- agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
- agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
- agentforge_graph/frameworks/registry.py +44 -0
- agentforge_graph/ingest/__init__.py +30 -0
- agentforge_graph/ingest/codegraph.py +847 -0
- agentforge_graph/ingest/extractor.py +353 -0
- agentforge_graph/ingest/incremental/__init__.py +25 -0
- agentforge_graph/ingest/incremental/detect.py +118 -0
- agentforge_graph/ingest/incremental/dirty.py +61 -0
- agentforge_graph/ingest/incremental/indexer.py +218 -0
- agentforge_graph/ingest/incremental/meta.py +72 -0
- agentforge_graph/ingest/incremental/ports.py +39 -0
- agentforge_graph/ingest/pack.py +160 -0
- agentforge_graph/ingest/packs/__init__.py +34 -0
- agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
- agentforge_graph/ingest/packs/cpp/references.scm +15 -0
- agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
- agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
- agentforge_graph/ingest/packs/csharp/references.scm +12 -0
- agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
- agentforge_graph/ingest/packs/go/__init__.py +38 -0
- agentforge_graph/ingest/packs/go/references.scm +12 -0
- agentforge_graph/ingest/packs/go/structure.scm +64 -0
- agentforge_graph/ingest/packs/java/__init__.py +35 -0
- agentforge_graph/ingest/packs/java/references.scm +12 -0
- agentforge_graph/ingest/packs/java/structure.scm +38 -0
- agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
- agentforge_graph/ingest/packs/javascript/references.scm +11 -0
- agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
- agentforge_graph/ingest/packs/php/__init__.py +35 -0
- agentforge_graph/ingest/packs/php/references.scm +15 -0
- agentforge_graph/ingest/packs/php/structure.scm +44 -0
- agentforge_graph/ingest/packs/python/__init__.py +25 -0
- agentforge_graph/ingest/packs/python/references.scm +14 -0
- agentforge_graph/ingest/packs/python/structure.scm +57 -0
- agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
- agentforge_graph/ingest/packs/ruby/references.scm +12 -0
- agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
- agentforge_graph/ingest/packs/rust/__init__.py +39 -0
- agentforge_graph/ingest/packs/rust/references.scm +12 -0
- agentforge_graph/ingest/packs/rust/structure.scm +46 -0
- agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
- agentforge_graph/ingest/packs/typescript/references.scm +11 -0
- agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
- agentforge_graph/ingest/pipeline.py +134 -0
- agentforge_graph/ingest/report.py +84 -0
- agentforge_graph/ingest/resolver.py +467 -0
- agentforge_graph/ingest/source.py +79 -0
- agentforge_graph/knowledge/__init__.py +28 -0
- agentforge_graph/knowledge/adr.py +136 -0
- agentforge_graph/knowledge/commits.py +152 -0
- agentforge_graph/knowledge/ingest.py +312 -0
- agentforge_graph/knowledge/mentions.py +71 -0
- agentforge_graph/knowledge/report.py +32 -0
- agentforge_graph/main.py +21 -0
- agentforge_graph/providers.py +36 -0
- agentforge_graph/repomap/__init__.py +14 -0
- agentforge_graph/repomap/rank.py +161 -0
- agentforge_graph/repomap/render.py +55 -0
- agentforge_graph/repomap/repomap.py +66 -0
- agentforge_graph/retrieve/__init__.py +21 -0
- agentforge_graph/retrieve/pack.py +76 -0
- agentforge_graph/retrieve/rerank.py +251 -0
- agentforge_graph/retrieve/retriever.py +286 -0
- agentforge_graph/retrieve/scoring.py +36 -0
- agentforge_graph/serve/__init__.py +19 -0
- agentforge_graph/serve/engine.py +204 -0
- agentforge_graph/serve/http_runner.py +133 -0
- agentforge_graph/serve/server.py +110 -0
- agentforge_graph/serve/tools.py +307 -0
- agentforge_graph/store/__init__.py +32 -0
- agentforge_graph/store/_rowmap.py +102 -0
- agentforge_graph/store/errors.py +22 -0
- agentforge_graph/store/facade.py +89 -0
- agentforge_graph/store/kuzu_store.py +380 -0
- agentforge_graph/store/lance_store.py +146 -0
- agentforge_graph/store/neo4j_store.py +294 -0
- agentforge_graph/store/pgvector_store.py +170 -0
- agentforge_graph/store/registry.py +45 -0
- agentforge_graph/temporal/__init__.py +36 -0
- agentforge_graph/temporal/backfill.py +338 -0
- agentforge_graph/temporal/events.py +82 -0
- agentforge_graph/temporal/index.py +190 -0
- agentforge_graph/temporal/mining.py +190 -0
- agentforge_graph/temporal/recorder.py +114 -0
- agentforge_graph/temporal/store.py +282 -0
- agentforge_graph-0.3.2.dist-info/METADATA +291 -0
- agentforge_graph-0.3.2.dist-info/RECORD +151 -0
- agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
- agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
- agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
- agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Deterministic, dependency-free embedder for tests and CI — no creds, no
|
|
2
|
+
network. Same text always yields the same L2-normalized vector, so retrieval
|
|
3
|
+
tests are reproducible."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import hashlib
|
|
8
|
+
import math
|
|
9
|
+
import struct
|
|
10
|
+
|
|
11
|
+
from .base import Embedder, InputType
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FakeEmbedder(Embedder):
    """Offline embedder that derives a unit-length vector from SHA-256 digests.

    The output depends only on the input text and ``dim``, so the same text
    always maps to the same vector. ``input_type`` is accepted for interface
    compatibility and has no effect on the result.
    """

    def __init__(self, dim: int = 256) -> None:
        """Record the vector length; the embedder reports itself as ``"fake"``."""
        self.name = "fake"
        self.dim = dim

    async def embed(
        self, texts: list[str], input_type: InputType = "document"
    ) -> list[list[float]]:
        """Return one deterministic vector per entry of ``texts``, in order."""
        vectors: list[list[float]] = []
        for text in texts:
            vectors.append(self._vector(text))
        return vectors

    def _vector(self, text: str) -> list[float]:
        """Hash ``text`` into ``self.dim`` floats and scale them to unit L2 norm."""
        wanted = self.dim * 4  # four bytes per big-endian uint32 component
        digests: list[bytes] = []
        gathered = 0
        block_no = 0
        # Counter-mode hashing: sha256(text || counter) until enough bytes exist.
        while gathered < wanted:
            digest = hashlib.sha256(text.encode("utf-8") + block_no.to_bytes(4, "big")).digest()
            digests.append(digest)
            gathered += len(digest)
            block_no += 1
        raw = b"".join(digests)[:wanted]

        components: list[float] = []
        for word in struct.unpack(f">{self.dim}I", raw):
            components.append((word / 2**32) * 2.0 - 1.0)  # finite, in [-1, 1)

        # ``or 1.0`` keeps the division defined when the squared sum is zero.
        length = math.sqrt(sum(c * c for c in components)) or 1.0
        return [c / length for c in components]
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""``OpenAIEmbedder`` — OpenAI (and OpenAI-compatible) embeddings (ENH-003
|
|
2
|
+
phase 2; the most-requested non-AWS path, and the **local model** path).
|
|
3
|
+
|
|
4
|
+
Lazy-imports the ``openai`` SDK (the ``openai`` extra); synchronous calls run on
|
|
5
|
+
a worker thread, mirroring ``BedrockEmbedder``. Setting ``embed.base_url`` points
|
|
6
|
+
the same adapter at any OpenAI-compatible server — a local Ollama
|
|
7
|
+
(``http://localhost:11434/v1``), vLLM, LM Studio, or a gateway — so "bring your
|
|
8
|
+
own / run it locally" is a config line, not a new adapter.
|
|
9
|
+
|
|
10
|
+
``text-embedding-3-*`` models support arbitrary output ``dimensions``; ``dim``
|
|
11
|
+
is passed through. Credentials come from ``OPENAI_API_KEY`` (the SDK default)
|
|
12
|
+
unless ``api_key_env`` overrides it. Imports nothing from ``agentforge``
|
|
13
|
+
(ADR-0001).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import os
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
from .base import Embedder, InputType
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class OpenAIEmbedder(Embedder):
    """Embedder backed by the ``openai`` SDK's embeddings endpoint.

    The SDK client is created on first use. Each batch is sent with a blocking
    SDK call that is run through ``asyncio.to_thread``.
    """

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        dim: int = 1536,
        batch_size: int = 96,
        base_url: str = "",
        api_key_env: str = "OPENAI_API_KEY",
    ) -> None:
        """Store the configuration; no SDK import or network call happens here.

        Args:
            model: Model id sent with every request.
            dim: Value passed as ``dimensions`` on every request.
            batch_size: Number of texts sent per request.
            base_url: When non-empty, passed to the SDK client as ``base_url``.
            api_key_env: Name of the environment variable read for the API key.
        """
        self._client: Any = None
        self.api_key_env = api_key_env
        self.base_url = base_url
        self.batch_size = batch_size
        self.dim = dim
        self.model = model
        self.name = f"openai:{model}"

    def _openai(self) -> Any:
        """Return the cached SDK client, constructing it on the first call."""
        if self._client is not None:
            return self._client

        import openai  # lazy: the SDK is only needed once a request is made

        options: dict[str, Any] = {}
        api_key = os.environ.get(self.api_key_env)
        # Only pass values that are actually set so the SDK's own defaults
        # apply otherwise.
        if api_key:
            options["api_key"] = api_key
        if self.base_url:
            options["base_url"] = self.base_url
        self._client = openai.OpenAI(**options)
        return self._client

    async def embed(
        self, texts: list[str], input_type: InputType = "document"
    ) -> list[list[float]]:
        """Embed ``texts`` in batches of ``batch_size``, preserving input order.

        ``input_type`` is not forwarded to the API.
        """
        vectors: list[list[float]] = []
        for start in range(0, len(texts), self.batch_size):
            window = texts[start : start + self.batch_size]
            # Batches are awaited one at a time; the blocking call runs off-loop.
            embedded = await asyncio.to_thread(self._invoke, window)
            vectors.extend(embedded)
        return vectors

    def _invoke(self, batch: list[str]) -> list[list[float]]:
        """Send one blocking embeddings request and return its vectors as floats."""
        client = self._openai()
        response = client.embeddings.create(model=self.model, input=batch, dimensions=self.dim)
        rows: list[list[float]] = []
        for item in response.data:
            rows.append([float(x) for x in item.embedding])
        return rows
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""``EmbedPipeline`` — chunk the indexed code and embed the chunks.
|
|
2
|
+
|
|
3
|
+
Per file: pull its symbol nodes from the graph, chunk them, write `CHUNK`
|
|
4
|
+
nodes + `CHUNK_OF` edges, embed the chunk texts, and upsert vectors. Coarse
|
|
5
|
+
incrementality at 0.1: if a file's chunk-hash set is unchanged, skip
|
|
6
|
+
re-embedding (saves cost); otherwise clean-replace the file's chunk vectors.
|
|
7
|
+
feat-004 scopes this to the files a refresh dirtied (``only_paths``).
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from agentforge_graph.chunking import Chunker
|
|
16
|
+
from agentforge_graph.core import (
|
|
17
|
+
Edge,
|
|
18
|
+
EdgeKind,
|
|
19
|
+
Embedded,
|
|
20
|
+
GraphQuery,
|
|
21
|
+
Node,
|
|
22
|
+
NodeKind,
|
|
23
|
+
Provenance,
|
|
24
|
+
SymbolID,
|
|
25
|
+
)
|
|
26
|
+
from agentforge_graph.ingest import PackRegistry, RepoSource
|
|
27
|
+
from agentforge_graph.store import Store
|
|
28
|
+
|
|
29
|
+
from .base import Embedder
|
|
30
|
+
from .report import EmbedReport
|
|
31
|
+
|
|
32
|
+
_ALL = 10_000_000
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class EmbedPipeline:
    """Chunk indexed files, write chunk nodes/edges, and embed the chunk texts.

    Both passes (code chunks in ``run`` and doc chunks in ``_embed_docs``) call
    the embedder *before* they modify the graph or the vector store, so a
    failing embed call leaves the previously stored state untouched.
    """

    def __init__(self, chunker: Chunker, embedder: Embedder, commit: str = "") -> None:
        """Keep the collaborators; ``name`` and ``commit`` feed the provenance."""
        self.chunker = chunker
        self.embedder = embedder
        self.commit = commit
        self.name = "cast-chunker"

    async def run(
        self,
        store: Store,
        source: RepoSource,
        registry: PackRegistry,
        only_paths: set[str] | None = None,
        doc_root: Path | None = None,
    ) -> EmbedReport:
        """Embed the indexed code, then the doc chunks.

        Args:
            store: Facade exposing ``graph`` (query/add) and ``vectors``
                (delete_where/upsert).
            source: Provides the files to visit via ``iter_files(registry)``.
            registry: Passed through to ``source.iter_files``.
            only_paths: When given (feat-004: the files a refresh dirtied), only
                those files are re-chunked/embedded; otherwise every file is
                visited (the chunk-hash skip still avoids re-embedding
                unchanged files).
            doc_root: Directory for the doc-embedding fingerprint file; passed
                to ``_embed_docs``.

        Returns:
            An ``EmbedReport`` with per-run counters.
        """
        report = EmbedReport(model=self.embedder.name, dim=self.embedder.dim)
        prov = Provenance.parsed(self.name, self.commit)

        for sf in source.iter_files(registry):
            if only_paths is not None and sf.path not in only_paths:
                continue

            found = await store.graph.query(GraphQuery(path_prefix=sf.path, limit=_ALL))
            # The query is by path prefix; keep only nodes whose parsed path
            # equals this file's path.
            nodes_for_path = [n for n in found.nodes if SymbolID.parse(n.id).path == sf.path]
            symbols = [n for n in nodes_for_path if n.kind is not NodeKind.CHUNK]
            if not symbols:
                continue
            chunks = self.chunker.chunk(sf, symbols)
            if not chunks:
                continue
            report.files += 1
            report.chunks += len(chunks)

            # Hash-skip: identical chunk-hash sets mean nothing needs re-embedding.
            prior = {
                n.attrs.get("content_hash") for n in nodes_for_path if n.kind is NodeKind.CHUNK
            }
            if prior and prior == {c.content_hash for c in chunks}:
                report.skipped_unchanged += 1
                continue

            # Embed BEFORE touching either store. The CHUNK nodes written below
            # carry the content hashes that drive the skip above; if they were
            # written (and the old vectors deleted) ahead of a failing embed
            # call, the next run would hash-skip this file with no vectors.
            vectors = await self.embedder.embed([c.text for c in chunks], input_type="document")

            repo = SymbolID.parse(symbols[0].id).repo
            file_id = SymbolID.for_symbol(sf.language, repo, sf.path, "")
            graph_items: list[Node | Edge] = []
            for ch in chunks:
                graph_items.append(
                    Node(
                        id=ch.id,
                        kind=NodeKind.CHUNK,
                        name=f"chunk{ch.seq}",
                        span=ch.span,
                        attrs={
                            "path": ch.path,
                            "token_count": ch.token_count,
                            "content_hash": ch.content_hash,
                            "seq": ch.seq,
                            "code": ch.code,  # carried for retrieval rendering (feat-006)
                        },
                        provenance=prov,
                    )
                )
                # A chunk with no symbol ids is attached to the file-level id.
                graph_items.extend(
                    Edge(src=ch.id, dst=target, kind=EdgeKind.CHUNK_OF, provenance=prov)
                    for target in (ch.symbol_ids or [file_id])
                )
            # NOTE(review): CHUNK nodes from an earlier chunking of this file are
            # not removed here — presumably handled elsewhere; confirm.
            await store.graph.add(graph_items)

            await store.vectors.delete_where({"path": sf.path})  # clean-replace this file
            await store.vectors.upsert(
                [
                    Embedded(
                        ref=ch.id,
                        vector=vec,
                        kind=NodeKind.CHUNK,
                        attrs={
                            "path": ch.path,
                            "span": list(ch.span),
                            "symbol_ids": ch.symbol_ids,
                            # "code" vs "doc" (feat-010) lets retrieval tell them apart
                            "source_type": "code",
                            "model": self.embedder.name,
                        },
                    )
                    for ch, vec in zip(chunks, vectors, strict=True)
                ]
            )
            report.embedded += len(chunks)

        report.doc_chunks = await self._embed_docs(store, doc_root)
        return report

    async def _embed_docs(self, store: Store, doc_root: Path | None = None) -> int:
        """Embed ADR/doc ``DocChunk`` prose (feat-010) and return how many were embedded.

        A ``source_type: doc`` tag keeps these vectors distinct from code
        chunks. Incremental: a fingerprint of all doc chunks (ids + content
        hashes + embedder name/dim) is stored in ``doc_root / "doc_embed.hash"``;
        when it is unchanged the whole pass is skipped and 0 is returned. On any
        change every doc vector is clean-replaced. With no doc chunks at all,
        existing doc vectors and the fingerprint file are removed.
        """
        docs = (await store.graph.query(GraphQuery(kinds=[NodeKind.DOC_CHUNK], limit=_ALL))).nodes
        manifest = (doc_root / "doc_embed.hash") if doc_root is not None else None
        if not docs:
            await store.vectors.delete_where({"kind": NodeKind.DOC_CHUNK.value})
            if manifest is not None and manifest.exists():
                manifest.unlink()
            return 0

        # Sorted by id so the fingerprint does not depend on query order.
        fp_body = "".join(
            f"{n.id}|{n.attrs.get('content_hash', '')};" for n in sorted(docs, key=lambda z: z.id)
        )
        fingerprint = hashlib.sha256(
            f"{self.embedder.name}:{self.embedder.dim}:{fp_body}".encode()
        ).hexdigest()
        if (
            manifest is not None
            and manifest.exists()
            and manifest.read_text().strip() == fingerprint
        ):
            return 0  # docs unchanged since the last embed → skip the re-embed

        # Embed first: if the embedder raises, the existing doc vectors are
        # still in place instead of having been deleted already.
        texts = [f"{n.attrs.get('heading', '')}\n{n.attrs.get('text', '')}".strip() for n in docs]
        vectors = await self.embedder.embed(texts, input_type="document")

        # clean-replace via the DocChunk kind (a filterable vector column) — this
        # also GCs vectors for docs/ADRs that were removed since the last embed.
        await store.vectors.delete_where({"kind": NodeKind.DOC_CHUNK.value})
        await store.vectors.upsert(
            [
                Embedded(
                    ref=n.id,
                    vector=vec,
                    kind=NodeKind.DOC_CHUNK,
                    attrs={
                        "path": n.attrs.get("path", ""),
                        "source_type": "doc",
                        "heading": n.attrs.get("heading", ""),
                        "model": self.embedder.name,
                    },
                )
                for n, vec in zip(docs, vectors, strict=True)
            ]
        )
        # The fingerprint is written last, so a failure above forces a retry.
        if manifest is not None:
            manifest.parent.mkdir(parents=True, exist_ok=True)
            manifest.write_text(fingerprint)
        return len(docs)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Resolve an ``Embedder`` from ``EmbedConfig`` via the provider registry.
|
|
2
|
+
|
|
3
|
+
Built-ins (``fake``, ``bedrock``, ``openai``) are registered below; third-party
|
|
4
|
+
embedders register out-of-tree under the ``agentforge_graph.embedder_providers``
|
|
5
|
+
entry-point group (``pip install`` + one ``embed.driver`` line, no core change).
|
|
6
|
+
Each live driver lazy-imports its SDK so the base/fake path needs neither boto3
|
|
7
|
+
nor openai. ``openai`` also covers OpenAI-compatible local servers via
|
|
8
|
+
``embed.base_url`` (ENH-003 phase 2).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from collections.abc import Callable
|
|
14
|
+
|
|
15
|
+
from agentforge_graph.config import EmbedConfig
|
|
16
|
+
from agentforge_graph.providers import resolve_provider
|
|
17
|
+
|
|
18
|
+
from .base import Embedder
|
|
19
|
+
|
|
20
|
+
EMBEDDER_GROUP = "agentforge_graph.embedder_providers"
|
|
21
|
+
|
|
22
|
+
# A builder takes the parsed ``embed:`` block and returns a ready Embedder.
|
|
23
|
+
EmbedderBuilder = Callable[[EmbedConfig], Embedder]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _build_fake(cfg: EmbedConfig) -> Embedder:
    """Build the ``fake`` driver: a ``FakeEmbedder`` sized by ``cfg.dim``."""
    from .fake import FakeEmbedder

    embedder = FakeEmbedder(dim=cfg.dim)
    return embedder
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _build_bedrock(cfg: EmbedConfig) -> Embedder:
    """Build the ``bedrock`` driver from the ``embed:`` config block.

    An empty/falsy ``cfg.assume_role_arn`` is passed on as ``None``.
    """
    from .bedrock import BedrockEmbedder  # lazy: only needs boto3 on this path

    options = {
        "model": cfg.model,
        "region": cfg.region,
        "dim": cfg.dim,
        "batch_size": cfg.batch_size,
        "assume_role_arn": cfg.assume_role_arn or None,
    }
    return BedrockEmbedder(**options)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _build_openai(cfg: EmbedConfig) -> Embedder:
    """Build the ``openai`` driver from the ``embed:`` config block.

    An empty/falsy ``cfg.api_key_env`` falls back to ``"OPENAI_API_KEY"``.
    """
    from .openai import OpenAIEmbedder  # lazy: only needs the openai SDK on this path

    options = {
        "model": cfg.model,
        "dim": cfg.dim,
        "batch_size": cfg.batch_size,
        "base_url": cfg.base_url,
        "api_key_env": cfg.api_key_env or "OPENAI_API_KEY",
    }
    return OpenAIEmbedder(**options)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
_EMBEDDER_BUILTINS: dict[str, EmbedderBuilder] = {
|
|
57
|
+
"fake": _build_fake,
|
|
58
|
+
"bedrock": _build_bedrock,
|
|
59
|
+
"openai": _build_openai,
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def embedder_from_config(cfg: EmbedConfig) -> Embedder:
    """Look up the builder for ``cfg.driver`` and return the embedder it builds.

    The lookup is delegated to ``resolve_provider``, which is given the
    built-in table and the ``EMBEDDER_GROUP`` entry-point group name.
    """
    make_embedder = resolve_provider(
        cfg.driver,
        _EMBEDDER_BUILTINS,
        EMBEDDER_GROUP,
        role="embedder",
    )
    embedder = make_embedder(cfg)
    return embedder
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Result type for an embedding run."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EmbedReport(BaseModel):
    """Counters and embedder identity describing one embedding run.

    Every field has a zero/empty default, so a fresh report can be built with
    only the fields known up front and incremented as the run progresses.
    """

    # Per-run counters for the code pass.
    files: int = 0
    chunks: int = 0
    embedded: int = 0
    # Files whose chunk set was unchanged (hash-skip).
    skipped_unchanged: int = 0
    # ADR/doc DocChunks embedded for semantic search (feat-010).
    doc_chunks: int = 0
    # Identity of the embedder that produced the vectors.
    model: str = ""
    dim: int = 0
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""LLM enrichment (feat-012): turn the code graph into a knowledge graph.
|
|
2
|
+
|
|
3
|
+
MVP: **design-pattern tagging** — deterministic structural heuristics nominate
|
|
4
|
+
candidates, a budgeted LLM judge confirms them, and confirmed verdicts become
|
|
5
|
+
``TAGGED`` edges to a fixed v1 ``PatternTag`` taxonomy with honest ``llm``
|
|
6
|
+
provenance + confidence + rationale. The judge is injectable
|
|
7
|
+
(``ScriptedJudge`` for tests, ``BedrockClaudeJudge`` live), so all orchestration
|
|
8
|
+
is deterministic. This is the framework layer (ADR-0001: ``enrich`` may import
|
|
9
|
+
``agentforge``); never runs implicitly (``ckg enrich`` only).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from .enricher import PatternTagEnricher
|
|
15
|
+
from .governs import (
|
|
16
|
+
ClaudeGovernsMatcher,
|
|
17
|
+
GovernsCandidate,
|
|
18
|
+
GovernsMatch,
|
|
19
|
+
GovernsMatcher,
|
|
20
|
+
ScriptedMatcher,
|
|
21
|
+
)
|
|
22
|
+
from .governs_enricher import DecisionGovernsInferencer
|
|
23
|
+
from .heuristics import Candidate, PatternHeuristics
|
|
24
|
+
from .judge import PatternJudge, ScriptedJudge, Verdict
|
|
25
|
+
from .registry import (
|
|
26
|
+
JUDGE_GROUP,
|
|
27
|
+
SUMMARIZER_GROUP,
|
|
28
|
+
governs_matcher_from_config,
|
|
29
|
+
judge_from_config,
|
|
30
|
+
summarizer_from_config,
|
|
31
|
+
)
|
|
32
|
+
from .report import EnrichReport, GovernsReport, SummaryInfo, SummaryReport, TaggedInfo
|
|
33
|
+
from .summarizer import FileContext, ScriptedSummarizer, Summarizer, Summary
|
|
34
|
+
from .summary_enricher import SummaryEnricher, repo_node_id, summary_id
|
|
35
|
+
from .taxonomy import TAXONOMY_V1, is_pattern, pattern_tag_id
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"PatternTagEnricher",
|
|
39
|
+
"PatternHeuristics",
|
|
40
|
+
"Candidate",
|
|
41
|
+
"PatternJudge",
|
|
42
|
+
"ScriptedJudge",
|
|
43
|
+
"Verdict",
|
|
44
|
+
"judge_from_config",
|
|
45
|
+
"summarizer_from_config",
|
|
46
|
+
"governs_matcher_from_config",
|
|
47
|
+
"DecisionGovernsInferencer",
|
|
48
|
+
"GovernsMatcher",
|
|
49
|
+
"ScriptedMatcher",
|
|
50
|
+
"ClaudeGovernsMatcher",
|
|
51
|
+
"GovernsCandidate",
|
|
52
|
+
"GovernsMatch",
|
|
53
|
+
"GovernsReport",
|
|
54
|
+
"JUDGE_GROUP",
|
|
55
|
+
"SUMMARIZER_GROUP",
|
|
56
|
+
"EnrichReport",
|
|
57
|
+
"TaggedInfo",
|
|
58
|
+
"SummaryReport",
|
|
59
|
+
"SummaryInfo",
|
|
60
|
+
"SummaryEnricher",
|
|
61
|
+
"Summarizer",
|
|
62
|
+
"ScriptedSummarizer",
|
|
63
|
+
"Summary",
|
|
64
|
+
"FileContext",
|
|
65
|
+
"summary_id",
|
|
66
|
+
"repo_node_id",
|
|
67
|
+
"TAXONOMY_V1",
|
|
68
|
+
"is_pattern",
|
|
69
|
+
"pattern_tag_id",
|
|
70
|
+
]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""``AnthropicClaudeJudge`` / ``AnthropicClaudeSummarizer`` — the **direct
|
|
2
|
+
Anthropic API** enrichers (ENH-003 phase 2; the non-AWS Claude path).
|
|
3
|
+
|
|
4
|
+
Thin endpoint adapters: the judging/summary logic is the provider-neutral
|
|
5
|
+
``ClaudeJudge`` / ``ClaudeSummarizer`` (``claude.py``); these wire it to an
|
|
6
|
+
``AnthropicClient`` transport. Pick them with ``enrich.provider: anthropic`` and
|
|
7
|
+
set ``ANTHROPIC_API_KEY`` (``enrich.model`` may stay the Bedrock default — the
|
|
8
|
+
id is normalised to its API form). Tests use the ``Scripted*`` variants.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from .anthropic_client import AnthropicClient
|
|
14
|
+
from .claude import ClaudeJudge, ClaudeSummarizer
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AnthropicClaudeJudge(ClaudeJudge):
    """``ClaudeJudge`` wired to an ``AnthropicClient`` transport."""

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        api_key_env: str = "ANTHROPIC_API_KEY",
        base_url: str = "",
        max_tokens: int = 512,
    ) -> None:
        """Create the transport and hand it to the base judge.

        The base class receives the client's ``model`` attribute rather than
        the raw ``model`` argument.
        """
        transport = AnthropicClient(
            model=model,
            api_key_env=api_key_env,
            base_url=base_url,
            max_tokens=max_tokens,
        )
        super().__init__(transport, transport.model)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class AnthropicClaudeSummarizer(ClaudeSummarizer):
    """``ClaudeSummarizer`` wired to an ``AnthropicClient`` transport."""

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        api_key_env: str = "ANTHROPIC_API_KEY",
        base_url: str = "",
        max_tokens: int = 400,
    ) -> None:
        """Create the transport and hand it to the base summarizer.

        The base class receives the client's ``model`` attribute rather than
        the raw ``model`` argument.
        """
        transport = AnthropicClient(
            model=model,
            api_key_env=api_key_env,
            base_url=base_url,
            max_tokens=max_tokens,
        )
        super().__init__(transport, transport.model)
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Shared **direct Anthropic API** Claude client for the enrichers (ENH-003
|
|
2
|
+
phase 2 — the non-AWS path).
|
|
3
|
+
|
|
4
|
+
Mirrors ``BedrockClient``: one lazily-built ``anthropic.Anthropic`` client, a
|
|
5
|
+
synchronous Messages call run on a worker thread, cost accumulated from token
|
|
6
|
+
usage. Returns ``resp.model_dump()`` — the same ``content``/``usage`` dict shape
|
|
7
|
+
Bedrock returns — so ``ClaudeJudge`` / ``ClaudeSummarizer`` parse it unchanged.
|
|
8
|
+
|
|
9
|
+
The ``anthropic`` SDK ships with the base install (pulled by
|
|
10
|
+
``agentforge-anthropic[anthropic]``); it is imported lazily so the
|
|
11
|
+
scripted/offline path never needs it. Credentials come from ``ANTHROPIC_API_KEY``
|
|
12
|
+
(the SDK's default env var) unless overridden.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import os
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from .bedrock_client import price_for # provider-neutral USD-per-token table
|
|
22
|
+
|
|
23
|
+
# Bedrock ids carry an inference-profile prefix (``us.``/``eu.``/…) and a
|
|
24
|
+
# ``-v1:0`` suffix; the direct API wants the bare model id. Normalising lets the
|
|
25
|
+
# same ``enrich.model`` default work on either provider.
|
|
26
|
+
_PROFILE_PREFIXES = ("us.", "eu.", "apac.", "us-gov.", "global.")


def api_model_id(model: str) -> str:
    """Map a Bedrock model id to its Anthropic-API equivalent.

    Strips, in order: one leading inference-profile prefix (see
    ``_PROFILE_PREFIXES``), a leading ``anthropic.`` vendor prefix, and a
    trailing Bedrock version tag of the form ``-v<digits>:<digits>`` (for
    example ``-v1:0`` or ``-v2:0``). Idempotent on an id that is already an API
    id. ``us.anthropic.claude-haiku-4-5-20251001-v1:0`` →
    ``claude-haiku-4-5-20251001``.

    Args:
        model: A Bedrock model / inference-profile id, or a bare API model id.

    Returns:
        The id with the Bedrock-specific decorations removed.
    """
    m = model
    for prefix in _PROFILE_PREFIXES:
        if m.startswith(prefix):
            m = m[len(prefix) :]
            break  # at most one profile prefix is removed
    if m.startswith("anthropic."):
        m = m[len("anthropic.") :]

    # Accept any numeric version tag rather than only the literal "-v1:0", so
    # ids such as "...-20241022-v2:0" are normalised too.
    stem, marker, tag = m.rpartition("-v")
    major, colon, minor = tag.partition(":")
    if marker and colon and major.isdigit() and minor.isdigit():
        m = stem
    return m
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AnthropicClient:
    """Lazily-built ``anthropic.Anthropic`` wrapper that tracks accumulated cost.

    ``invoke`` runs the blocking Messages call through ``asyncio.to_thread`` and
    adds the call's cost, derived from its token usage, to ``cost_usd``.
    """

    def __init__(
        self,
        model: str = "claude-haiku-4-5-20251001",
        api_key_env: str = "ANTHROPIC_API_KEY",
        base_url: str = "",
        max_tokens: int = 512,
    ) -> None:
        """Store the configuration; the SDK client is not created here.

        Args:
            model: Model id; normalised through ``api_model_id`` before storing.
            api_key_env: Name of the environment variable read for the API key.
            base_url: When non-empty, passed to the SDK client as ``base_url``.
            max_tokens: ``max_tokens`` sent with every Messages call.
        """
        self.model = api_model_id(model)
        self.api_key_env = api_key_env
        self.base_url = base_url
        self.max_tokens = max_tokens
        self._client: Any = None
        self.cost_usd = 0.0

    def _anthropic(self) -> Any:
        """Return the cached SDK client, constructing it on the first call."""
        if self._client is not None:
            return self._client

        import anthropic  # lazy: only needed once a live call is made

        options: dict[str, Any] = {}
        api_key = os.environ.get(self.api_key_env)
        # Only pass values that are actually set so the SDK's own defaults
        # apply otherwise.
        if api_key:
            options["api_key"] = api_key
        if self.base_url:
            options["base_url"] = self.base_url
        self._client = anthropic.Anthropic(**options)
        return self._client

    async def invoke(
        self,
        system: str,
        user: str,
        tools: list[dict[str, Any]] | None = None,
        tool_name: str | None = None,
    ) -> dict[str, Any]:
        """Make one Messages call, add its cost to ``cost_usd``, return the payload.

        The returned dict is the raw response dump (``content`` blocks +
        ``usage``).
        """
        response = await asyncio.to_thread(self._invoke, system, user, tools, tool_name)
        rate_in, rate_out = price_for(self.model)
        usage = response.get("usage", {})
        tokens_in = usage.get("input_tokens", 0)
        tokens_out = usage.get("output_tokens", 0)
        # The weighted token total is divided by 1,000,000 — presumably the
        # rates are USD per million tokens; confirm against ``price_for``.
        self.cost_usd += (tokens_in * rate_in + tokens_out * rate_out) / 1_000_000
        return response

    def _invoke(
        self,
        system: str,
        user: str,
        tools: list[dict[str, Any]] | None,
        tool_name: str | None,
    ) -> dict[str, Any]:
        """Blocking Messages call; returns ``model_dump()`` of the SDK response."""
        request: dict[str, Any] = {
            "model": self.model,
            "max_tokens": self.max_tokens,
            "system": system,
            "messages": [{"role": "user", "content": user}],
        }
        if tools is not None:
            request["tools"] = tools
        if tool_name is not None:
            # Force the model to answer through the named tool.
            request["tool_choice"] = {"type": "tool", "name": tool_name}
        response = self._anthropic().messages.create(**request)
        dumped: dict[str, Any] = response.model_dump()
        return dumped
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""``BedrockClaudeJudge`` — the pattern judge on AWS Bedrock (feat-012).
|
|
2
|
+
|
|
3
|
+
Thin endpoint adapter: the judging logic (prompts, forced ``submit_verdicts``
|
|
4
|
+
tool call, verdict parsing, cost) lives in the provider-neutral ``ClaudeJudge``
|
|
5
|
+
(``claude.py``); this just wires it to a Bedrock transport (``BedrockClient``).
|
|
6
|
+
The Anthropic-API sibling is ``AnthropicClaudeJudge`` (``anthropic.py``). Tests
|
|
7
|
+
use the ``ScriptedJudge`` instead.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .bedrock_client import BedrockClient
|
|
13
|
+
from .claude import ClaudeJudge
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BedrockClaudeJudge(ClaudeJudge):
    """``ClaudeJudge`` wired to a ``BedrockClient`` transport."""

    def __init__(
        self,
        model: str = "us.anthropic.claude-haiku-4-5-20251001-v1:0",
        region: str = "us-east-1",
        assume_role_arn: str | None = None,
        max_tokens: int = 512,
    ) -> None:
        """Create the Bedrock transport and hand it, with ``model``, to the base judge."""
        transport = BedrockClient(model, region, assume_role_arn, max_tokens)
        super().__init__(transport, model)
|