agentforge-graph 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_graph/__init__.py +6 -0
- agentforge_graph/chunking/__init__.py +12 -0
- agentforge_graph/chunking/cast.py +159 -0
- agentforge_graph/chunking/chunk.py +19 -0
- agentforge_graph/chunking/tokens.py +15 -0
- agentforge_graph/cli.py +607 -0
- agentforge_graph/config.py +259 -0
- agentforge_graph/core/__init__.py +54 -0
- agentforge_graph/core/conformance.py +270 -0
- agentforge_graph/core/contracts.py +163 -0
- agentforge_graph/core/kinds.py +68 -0
- agentforge_graph/core/models.py +134 -0
- agentforge_graph/core/provenance.py +62 -0
- agentforge_graph/core/symbols.py +116 -0
- agentforge_graph/embed/__init__.py +28 -0
- agentforge_graph/embed/base.py +22 -0
- agentforge_graph/embed/bedrock.py +85 -0
- agentforge_graph/embed/fake.py +34 -0
- agentforge_graph/embed/openai.py +67 -0
- agentforge_graph/embed/pipeline.py +184 -0
- agentforge_graph/embed/registry.py +66 -0
- agentforge_graph/embed/report.py +15 -0
- agentforge_graph/enrich/__init__.py +70 -0
- agentforge_graph/enrich/anthropic.py +38 -0
- agentforge_graph/enrich/anthropic_client.py +109 -0
- agentforge_graph/enrich/bedrock.py +24 -0
- agentforge_graph/enrich/bedrock_client.py +115 -0
- agentforge_graph/enrich/bedrock_summarizer.py +23 -0
- agentforge_graph/enrich/claude.py +172 -0
- agentforge_graph/enrich/enricher.py +108 -0
- agentforge_graph/enrich/governs.py +173 -0
- agentforge_graph/enrich/governs_enricher.py +152 -0
- agentforge_graph/enrich/heuristics.py +224 -0
- agentforge_graph/enrich/judge.py +63 -0
- agentforge_graph/enrich/registry.py +133 -0
- agentforge_graph/enrich/report.py +60 -0
- agentforge_graph/enrich/summarizer.py +62 -0
- agentforge_graph/enrich/summary_enricher.py +211 -0
- agentforge_graph/enrich/taxonomy.py +38 -0
- agentforge_graph/frameworks/__init__.py +29 -0
- agentforge_graph/frameworks/base.py +75 -0
- agentforge_graph/frameworks/detect.py +124 -0
- agentforge_graph/frameworks/extractor.py +63 -0
- agentforge_graph/frameworks/orm.py +93 -0
- agentforge_graph/frameworks/packs/_js_ast.py +56 -0
- agentforge_graph/frameworks/packs/_python_ast.py +157 -0
- agentforge_graph/frameworks/packs/django/__init__.py +240 -0
- agentforge_graph/frameworks/packs/django/models.scm +7 -0
- agentforge_graph/frameworks/packs/express/__init__.py +133 -0
- agentforge_graph/frameworks/packs/express/routes.scm +8 -0
- agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
- agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
- agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
- agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
- agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
- agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
- agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
- agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
- agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
- agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
- agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
- agentforge_graph/frameworks/registry.py +44 -0
- agentforge_graph/ingest/__init__.py +30 -0
- agentforge_graph/ingest/codegraph.py +847 -0
- agentforge_graph/ingest/extractor.py +353 -0
- agentforge_graph/ingest/incremental/__init__.py +25 -0
- agentforge_graph/ingest/incremental/detect.py +118 -0
- agentforge_graph/ingest/incremental/dirty.py +61 -0
- agentforge_graph/ingest/incremental/indexer.py +218 -0
- agentforge_graph/ingest/incremental/meta.py +72 -0
- agentforge_graph/ingest/incremental/ports.py +39 -0
- agentforge_graph/ingest/pack.py +160 -0
- agentforge_graph/ingest/packs/__init__.py +34 -0
- agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
- agentforge_graph/ingest/packs/cpp/references.scm +15 -0
- agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
- agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
- agentforge_graph/ingest/packs/csharp/references.scm +12 -0
- agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
- agentforge_graph/ingest/packs/go/__init__.py +38 -0
- agentforge_graph/ingest/packs/go/references.scm +12 -0
- agentforge_graph/ingest/packs/go/structure.scm +64 -0
- agentforge_graph/ingest/packs/java/__init__.py +35 -0
- agentforge_graph/ingest/packs/java/references.scm +12 -0
- agentforge_graph/ingest/packs/java/structure.scm +38 -0
- agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
- agentforge_graph/ingest/packs/javascript/references.scm +11 -0
- agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
- agentforge_graph/ingest/packs/php/__init__.py +35 -0
- agentforge_graph/ingest/packs/php/references.scm +15 -0
- agentforge_graph/ingest/packs/php/structure.scm +44 -0
- agentforge_graph/ingest/packs/python/__init__.py +25 -0
- agentforge_graph/ingest/packs/python/references.scm +14 -0
- agentforge_graph/ingest/packs/python/structure.scm +57 -0
- agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
- agentforge_graph/ingest/packs/ruby/references.scm +12 -0
- agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
- agentforge_graph/ingest/packs/rust/__init__.py +39 -0
- agentforge_graph/ingest/packs/rust/references.scm +12 -0
- agentforge_graph/ingest/packs/rust/structure.scm +46 -0
- agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
- agentforge_graph/ingest/packs/typescript/references.scm +11 -0
- agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
- agentforge_graph/ingest/pipeline.py +134 -0
- agentforge_graph/ingest/report.py +84 -0
- agentforge_graph/ingest/resolver.py +467 -0
- agentforge_graph/ingest/source.py +79 -0
- agentforge_graph/knowledge/__init__.py +28 -0
- agentforge_graph/knowledge/adr.py +136 -0
- agentforge_graph/knowledge/commits.py +152 -0
- agentforge_graph/knowledge/ingest.py +312 -0
- agentforge_graph/knowledge/mentions.py +71 -0
- agentforge_graph/knowledge/report.py +32 -0
- agentforge_graph/main.py +21 -0
- agentforge_graph/providers.py +36 -0
- agentforge_graph/repomap/__init__.py +14 -0
- agentforge_graph/repomap/rank.py +161 -0
- agentforge_graph/repomap/render.py +55 -0
- agentforge_graph/repomap/repomap.py +66 -0
- agentforge_graph/retrieve/__init__.py +21 -0
- agentforge_graph/retrieve/pack.py +76 -0
- agentforge_graph/retrieve/rerank.py +251 -0
- agentforge_graph/retrieve/retriever.py +286 -0
- agentforge_graph/retrieve/scoring.py +36 -0
- agentforge_graph/serve/__init__.py +19 -0
- agentforge_graph/serve/engine.py +204 -0
- agentforge_graph/serve/http_runner.py +133 -0
- agentforge_graph/serve/server.py +110 -0
- agentforge_graph/serve/tools.py +307 -0
- agentforge_graph/store/__init__.py +32 -0
- agentforge_graph/store/_rowmap.py +102 -0
- agentforge_graph/store/errors.py +22 -0
- agentforge_graph/store/facade.py +89 -0
- agentforge_graph/store/kuzu_store.py +380 -0
- agentforge_graph/store/lance_store.py +146 -0
- agentforge_graph/store/neo4j_store.py +294 -0
- agentforge_graph/store/pgvector_store.py +170 -0
- agentforge_graph/store/registry.py +45 -0
- agentforge_graph/temporal/__init__.py +36 -0
- agentforge_graph/temporal/backfill.py +338 -0
- agentforge_graph/temporal/events.py +82 -0
- agentforge_graph/temporal/index.py +190 -0
- agentforge_graph/temporal/mining.py +190 -0
- agentforge_graph/temporal/recorder.py +114 -0
- agentforge_graph/temporal/store.py +282 -0
- agentforge_graph-0.3.2.dist-info/METADATA +291 -0
- agentforge_graph-0.3.2.dist-info/RECORD +151 -0
- agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
- agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
- agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
- agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""The locked ABCs every later feature plugs into.
|
|
2
|
+
|
|
3
|
+
- ``Extractor`` — turns a file into a ``FileSubgraph`` (feat-002, feat-011).
|
|
4
|
+
- ``GraphStore`` — persists subgraphs and enrichment facts, answers
|
|
5
|
+
queries and neighborhood walks (feat-003 adapters).
|
|
6
|
+
- ``Enricher`` — derives new nodes/edges from the existing graph
|
|
7
|
+
(feat-010/012).
|
|
8
|
+
|
|
9
|
+
Signatures only; implementations ship with their owning features. The
|
|
10
|
+
constructor/method surface here is the stable contract — additions are
|
|
11
|
+
minor bumps, removals/renames are major. See ADR-0001 (layering: this
|
|
12
|
+
module imports nothing from ``agentforge``).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
from typing import Any, Literal
|
|
19
|
+
|
|
20
|
+
from .kinds import EdgeKind
|
|
21
|
+
from .models import (
|
|
22
|
+
Edge,
|
|
23
|
+
Embedded,
|
|
24
|
+
FileSubgraph,
|
|
25
|
+
GraphQuery,
|
|
26
|
+
Node,
|
|
27
|
+
QueryResult,
|
|
28
|
+
ScoredRef,
|
|
29
|
+
SourceFile,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Direction of a 1-hop edge walk, relative to the queried node:
# "out" = the node is the edge's src, "in" = the node is the edge's dst,
# "both" = either end.
Direction = Literal["out", "in", "both"]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Extractor(ABC):
    """Produces a ``FileSubgraph`` from a single file, in isolation.

    Extraction must not read other files (per-file isolation is what
    makes feat-004 incremental); cross-file edges are emitted as
    candidate references and resolved in a later pass.
    """

    # Identifier of the concrete extractor.
    # NOTE(review): presumably what ends up in ``Provenance.extractor`` — confirm.
    name: str

    @abstractmethod
    def extract(self, file: SourceFile) -> FileSubgraph:
        """Return the subgraph extracted from ``file`` alone.

        Synchronous, unlike the store/enricher contracts in this module.
        """
        ...
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class GraphStore(ABC):
    """Persistence + query contract. feat-003 ships the adapters.

    Every method is an abstract coroutine; this class only fixes the
    signatures and the behaviour each adapter must provide.
    """

    @abstractmethod
    async def upsert(self, subgraph: FileSubgraph) -> None:
        """Insert/replace all nodes & edges for ``subgraph.path``.

        Must be transactional: delete the prior content for that path,
        then add the new content.
        """

    @abstractmethod
    async def add(self, items: list[Node | Edge]) -> None:
        """Persist facts not tied to a single file.

        Used for enrichment output and resolved cross-file edges. These
        survive ``delete_file`` of code files.
        """

    @abstractmethod
    async def delete_file(self, path: str) -> None:
        """Remove everything previously upserted for ``path``."""

    @abstractmethod
    async def clear_resolved(self, paths: list[str]) -> None:
        """Delete resolved-provenance edges whose ``origin_path`` is in ``paths``.

        This is the inverse of a scoped re-resolve (feat-004). Parsed
        nodes/edges are untouched. Also garbage-collects external
        ``PACKAGE`` nodes left with no inbound edge, so an incremental
        re-resolve converges to the same graph a full re-index would
        produce.
        """

    @abstractmethod
    async def clear_outgoing(self, src_ids: list[str], kind: EdgeKind) -> None:
        """Delete edges of ``kind`` whose ``src`` is in ``src_ids``.

        Lets an enricher (feat-012) re-derive a symbol's facts
        idempotently (re-tag without duplicating
        ``TAGGED``/``SUMMARIZES`` edges).
        """

    @abstractmethod
    async def query(self, q: GraphQuery) -> QueryResult:
        """Exact-match node lookup with the flat ``GraphQuery`` filter."""

    @abstractmethod
    async def neighbors(
        self,
        node_id: str,
        kinds: list[EdgeKind] | None = None,
        depth: int = 1,
    ) -> list[Node]:
        """Nodes reachable from ``node_id`` within ``depth`` hops.

        Only edges of ``kinds`` are followed, in either direction.
        NOTE(review): ``kinds=None`` presumably means "any edge kind" —
        confirm against the adapters.
        """

    @abstractmethod
    async def get(self, node_id: str) -> Node | None:
        """Fetch a node by id, or ``None``."""

    @abstractmethod
    async def set_attrs(self, node_id: str, attrs: dict[str, Any]) -> None:
        """Merge ``attrs`` into an existing node's ``attrs``.

        A partial update: other fields, including the file-ownership
        ``origin_path`` that drives ``delete_file``, are untouched. No-op
        if the node is absent. This is the denormalisation channel for
        derived facts (feat-009 churn/authorship) that must not detach a
        file-owned node from its file.
        """

    @abstractmethod
    async def adjacent(
        self,
        node_id: str,
        kinds: list[EdgeKind] | None = None,
        direction: Direction = "both",
    ) -> list[Edge]:
        """The 1-hop edges touching ``node_id``.

        ``direction`` selects which end ``node_id`` sits on (``out``: it
        is the src; ``in``: it is the dst; ``both``). ``kinds`` optionally
        filters by edge kind. Returns full ``Edge`` objects, so the caller
        sees each edge's kind, direction and provenance (feat-006
        retrieval scoring).
        """

    @abstractmethod
    async def close(self) -> None:
        """Release resources. Safe to call more than once."""
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class VectorStore(ABC):
    """Vector persistence + similarity search.

    feat-003 ships the LanceDB adapter; feat-005 produces the
    ``Embedded`` items it stores. A peer of ``GraphStore`` — the
    ``Store`` facade (feat-003) owns one of each and joins them
    (vector hit -> graph expansion) for retrieval (feat-006).
    """

    @abstractmethod
    async def upsert(self, items: list[Embedded]) -> None:
        """Insert/replace vectors keyed by ``Embedded.ref``."""

    @abstractmethod
    async def search(
        self,
        vector: list[float],
        k: int,
        filter: dict[str, Any] | None = None,
    ) -> list[ScoredRef]:
        """Return the top-``k`` nearest refs to ``vector``.

        ``filter`` optionally constrains hits by attribute
        (e.g. ``{"kind": "Chunk"}``). The parameter name shadows the
        ``filter`` builtin, but it is part of the locked signature.
        """

    @abstractmethod
    async def delete_where(self, filter: dict[str, Any]) -> None:
        """Drop vectors matching ``filter`` (feat-004 invalidation)."""

    @abstractmethod
    async def close(self) -> None:
        """Release resources. Safe to call more than once."""
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class Enricher(ABC):
    """Derives new nodes/edges from the existing graph (feat-010/012).

    Returns the facts it derived; the caller persists them via
    ``GraphStore.add``. Derived facts must carry ``source=llm``
    provenance with a confidence (ADR-0004).
    """

    # Identifier of the concrete enricher.
    name: str

    @abstractmethod
    async def enrich(self, store: GraphStore) -> list[Node | Edge]:
        """Read from ``store`` and return the derived nodes/edges.

        Per the class contract the result is returned rather than
        written; persisting it is the caller's job.
        """
        ...
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Node and edge kind vocabularies for the code knowledge graph.
|
|
2
|
+
|
|
3
|
+
The full vocabulary is locked at 0.1 — including the higher-level kinds
|
|
4
|
+
whose producers ship later (feat-010/011/012) — so stores and queries
|
|
5
|
+
handle every kind from day one and no schema migration is needed when a
|
|
6
|
+
later producer lands. See ADR-0005.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from enum import StrEnum
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class NodeKind(StrEnum):
    """Every node kind the graph may contain. Locked at 0.1 (ADR-0005).

    A ``StrEnum``: each member is also a ``str`` equal to its value
    (e.g. ``NodeKind.CLASS == "Class"``).
    """

    # --- structural (produced by feat-002) ---
    REPOSITORY = "Repository"
    PACKAGE = "Package"
    FILE = "File"
    CLASS = "Class"
    INTERFACE = "Interface"
    FUNCTION = "Function"
    METHOD = "Method"
    VARIABLE = "Variable"
    TYPE_ALIAS = "TypeAlias"

    # --- retrieval (feat-005 / feat-010) ---
    CHUNK = "Chunk"
    DOC_CHUNK = "DocChunk"

    # --- higher-level: reserved now, produced later (ADR-0005) ---
    DECISION = "Decision"  # feat-010
    ROUTE = "Route"  # feat-011
    DATA_MODEL = "DataModel"  # feat-011
    SERVICE = "Service"  # feat-011
    SUMMARY = "Summary"  # feat-012
    PATTERN_TAG = "PatternTag"  # feat-012
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class EdgeKind(StrEnum):
    """Every edge kind the graph may contain. Locked at 0.1 (ADR-0005).

    A ``StrEnum``: each member is also a ``str`` equal to its value
    (e.g. ``EdgeKind.CALLS == "CALLS"``).
    """

    # --- structural (feat-002) ---
    CONTAINS = "CONTAINS"
    IMPORTS = "IMPORTS"
    CALLS = "CALLS"
    INHERITS = "INHERITS"
    IMPLEMENTS = "IMPLEMENTS"
    REFERENCES = "REFERENCES"

    # --- retrieval / docs (feat-005 / feat-010) ---
    CHUNK_OF = "CHUNK_OF"
    DESCRIBES = "DESCRIBES"

    # --- decisions (feat-010) ---
    GOVERNS = "GOVERNS"
    SUPERSEDES = "SUPERSEDES"

    # --- framework (feat-011) ---
    HANDLED_BY = "HANDLED_BY"
    INJECTED_INTO = "INJECTED_INTO"
    HAS_FIELD = "HAS_FIELD"
    RELATES_TO = "RELATES_TO"

    # --- enrichment (feat-012) ---
    SUMMARIZES = "SUMMARIZES"
    TAGGED = "TAGGED"
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Value types for the graph: nodes, edges, the per-file subgraph, and
|
|
2
|
+
the minimal query/result shapes.
|
|
3
|
+
|
|
4
|
+
``Node``/``Edge`` validate their IDs and (via ``Provenance``) their
|
|
5
|
+
attribution at construction, so the graph cannot hold a malformed or
|
|
6
|
+
unattributed fact. ``FileSubgraph`` is the unit of ingestion *and*
|
|
7
|
+
deletion — keyed by ``(path, content_hash)`` — which is what makes
|
|
8
|
+
incremental indexing (feat-004) a thin layer. See ADR-0003/0004.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
16
|
+
|
|
17
|
+
from .kinds import EdgeKind, NodeKind
|
|
18
|
+
from .provenance import Provenance, Source
|
|
19
|
+
from .symbols import SymbolID
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _require_symbol_id(value: str) -> str:
    """Return ``value`` unchanged after checking it is a well-formed symbol ID.

    Raises:
        ValueError: propagated from ``SymbolID.parse`` when ``value`` is
            malformed.
    """
    # Parsed purely for validation; the structured result is discarded.
    _ = SymbolID.parse(value)
    return value
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class SourceFile(BaseModel):
    """A single file handed to an ``Extractor``.

    Immutable (``frozen=True``).
    """

    model_config = ConfigDict(frozen=True)

    path: str  # repo-relative, posix
    text: str  # the file's contents as text
    language: str  # language identifier for the file
    content_hash: str  # sha256 of the file bytes
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Node(BaseModel):
    """A typed entity in the graph.

    ``id`` is validated as a symbol ID at construction, and
    ``provenance`` is required, so a node cannot be malformed or
    unattributed.
    """

    id: str  # a SymbolID string
    kind: NodeKind
    name: str
    span: tuple[int, int] | None = None  # (start_line, end_line), 1-based
    attrs: dict[str, Any] = Field(default_factory=dict)  # fresh dict per instance
    provenance: Provenance

    @field_validator("id")
    @classmethod
    def _check_id(cls, v: str) -> str:
        """Reject an ``id`` that does not parse as a symbol ID."""
        return _require_symbol_id(v)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Edge(BaseModel):
    """A typed relationship between two symbols.

    Both endpoints are validated as symbol IDs at construction, and
    ``provenance`` is required.
    """

    src: str  # symbol ID of the source endpoint
    dst: str  # symbol ID of the destination endpoint
    kind: EdgeKind
    attrs: dict[str, Any] = Field(default_factory=dict)  # fresh dict per instance
    provenance: Provenance
    # The file whose content produced this edge (the import/call site's file).
    # Lets resolver-produced edges be invalidated per file on an incremental
    # re-resolve (feat-004). Empty for file-parsed edges, where the store
    # stamps the owning file's path at upsert time.
    origin_path: str = ""

    @field_validator("src", "dst")
    @classmethod
    def _check_endpoint(cls, v: str) -> str:
        """Reject an endpoint that does not parse as a symbol ID."""
        return _require_symbol_id(v)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class FileSubgraph(BaseModel):
    """Everything extracted from one file — the ingestion/deletion unit."""

    path: str  # repo-relative, posix
    content_hash: str  # hash of the file content this subgraph was extracted from
    nodes: list[Node] = Field(default_factory=list)  # fresh list per instance
    edges: list[Edge] = Field(default_factory=list)  # fresh list per instance
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class GraphQuery(BaseModel):
    """A minimal, flat node filter (0.1).

    Graph traversal lives in ``GraphStore.neighbors``, not here. New
    filter fields arrive as minor-version additions. Every filter
    defaults to ``None``; only ``limit`` is validated.
    """

    kinds: list[NodeKind] | None = None
    name: str | None = None  # exact match
    path_prefix: str | None = None
    edge_kind: EdgeKind | None = None
    min_source: Source | None = None  # provenance floor
    limit: int = 100

    @field_validator("limit")
    @classmethod
    def _positive_limit(cls, v: int) -> int:
        """Accept only a strictly positive ``limit``."""
        if v > 0:
            return v
        raise ValueError("limit must be > 0")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class QueryResult(BaseModel):
    """The result of a ``GraphStore.query``."""

    nodes: list[Node] = Field(default_factory=list)  # fresh list per instance
    edges: list[Edge] = Field(default_factory=list)  # fresh list per instance
    truncated: bool = False  # True if `limit` clipped the result
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class Embedded(BaseModel):
    """A vector plus the symbol/chunk it represents.

    This is the ``VectorStore`` write unit. ``ref`` is the id of the node
    the vector stands in for (a Chunk, DocChunk, Summary…); the producer
    is feat-005. An empty ``vector`` is rejected at construction.
    """

    ref: str  # symbol/chunk id this vector represents
    vector: list[float]
    kind: NodeKind
    attrs: dict[str, Any] = Field(default_factory=dict)

    @field_validator("vector")
    @classmethod
    def _non_empty(cls, v: list[float]) -> list[float]:
        """Accept only a vector with at least one component."""
        if v:
            return v
        raise ValueError("vector must be non-empty")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class ScoredRef(BaseModel):
    """A vector-search hit: a ref and its similarity score.

    Higher score = closer. feat-006 expands these into a graph
    neighborhood.
    """

    ref: str  # the hit's id
    score: float  # similarity; higher = closer
    attrs: dict[str, Any] = Field(default_factory=dict)  # fresh dict per instance
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Provenance — attribution carried by every node and edge.
|
|
2
|
+
|
|
3
|
+
Every fact in the graph records where it came from, so an agent can
|
|
4
|
+
tell parsed ground truth from a heuristic resolution or an LLM guess,
|
|
5
|
+
and rank/filter accordingly. Enforced at construction so no
|
|
6
|
+
unattributed fact can exist. See ADR-0004.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from enum import StrEnum
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, model_validator
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Source(StrEnum):
    """How a fact was derived, in decreasing reliability.

    A ``StrEnum``: each member is also a ``str`` equal to its value
    (e.g. ``Source.LLM == "llm"``).
    """

    PARSED = "parsed"  # straight from the syntax tree
    RESOLVED = "resolved"  # upgraded via the import graph / resolver
    LLM = "llm"  # generated by a model (summaries, pattern tags, inferred links)
    MANUAL = "manual"  # human-asserted
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Provenance(BaseModel):
    """Attribution for a single node or edge.

    Immutable (``frozen=True``). Construct directly or through the
    ``parsed`` / ``resolved`` / ``manual`` / ``llm`` factories; either
    way the confidence rules are checked after the fields are set.
    """

    model_config = ConfigDict(frozen=True)

    source: Source
    extractor: str  # producer name + version, e.g. "tree-sitter-python@0.23"
    commit: str = ""  # git sha the fact was derived at; "" if non-git / unstaged
    confidence: float = 1.0  # < 1.0 only meaningful for source=llm

    @model_validator(mode="after")
    def _check_confidence(self) -> Provenance:
        """Validate ``confidence`` against its range and against ``source``.

        Raises:
            ValueError: if ``confidence`` is outside ``[0.0, 1.0]``, or
                differs from ``1.0`` on a non-LLM fact.
        """
        score = self.confidence
        # The chained comparison is kept so a NaN is rejected as well.
        if not 0.0 <= score <= 1.0:
            raise ValueError("confidence must be in [0.0, 1.0]")
        # Only model-generated facts may be anything other than certain.
        if score != 1.0 and self.source is not Source.LLM:
            raise ValueError("confidence < 1.0 is only valid for source=llm")
        return self

    @classmethod
    def parsed(cls, extractor: str, commit: str = "") -> Provenance:
        """Provenance for a fact read straight from the syntax tree."""
        return cls(extractor=extractor, commit=commit, source=Source.PARSED)

    @classmethod
    def resolved(cls, extractor: str, commit: str = "") -> Provenance:
        """Provenance for a fact produced by the resolver."""
        return cls(extractor=extractor, commit=commit, source=Source.RESOLVED)

    @classmethod
    def manual(cls, extractor: str, commit: str = "") -> Provenance:
        """Provenance for a human-asserted fact."""
        return cls(extractor=extractor, commit=commit, source=Source.MANUAL)

    @classmethod
    def llm(cls, extractor: str, confidence: float, commit: str = "") -> Provenance:
        """Provenance for a model-generated fact; ``confidence`` is required."""
        return cls(extractor=extractor, commit=commit, source=Source.LLM, confidence=confidence)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Symbol identity — stable, human-readable, deterministic node IDs.
|
|
2
|
+
|
|
3
|
+
A symbol ID is a single string derived from
|
|
4
|
+
``(scheme, lang, repo, path, descriptor)``. It has no global counters
|
|
5
|
+
and no ordering constraints, so per-file extraction can run in any order
|
|
6
|
+
and merge, and the same symbol keeps its ID across commits — the
|
|
7
|
+
property incremental indexing (feat-004) and history (feat-009) depend
|
|
8
|
+
on. The descriptor grammar is SCIP-derived. See ADR-0003 and
|
|
9
|
+
``docs/design/design-001-core-contracts-module.md`` §4.4.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict
|
|
17
|
+
|
|
18
|
+
# Scheme tag: the first field of every symbol ID; ``SymbolID.parse``
# rejects any other value.
SCHEME = "ckg"
# Separator between the five ID fields. Occurrences inside a field are
# percent-escaped by ``_encode`` so the ID still splits cleanly.
_FIELD_SEP = " "
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _encode(field: str) -> str:
    """Percent-escape ``%`` and the field separator so IDs round-trip."""
    # Escape the escape character first; doing it second would re-escape
    # the "%" of every "%20" just produced.
    escaped = field.replace("%", "%25")
    return escaped.replace(_FIELD_SEP, "%20")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _decode(field: str) -> str:
    """Reverse ``_encode``: turn ``%20`` / ``%25`` back into separator / ``%``."""
    # %20 before %25 so an escaped escape (%2520) decodes back to "%20".
    with_separators = field.replace("%20", _FIELD_SEP)
    return with_separators.replace("%25", "%")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def normalize_path(path: str) -> str:
    """Repo-relative, posix-separated, no leading ``./`` or ``/``.

    Ensures the same file yields the same ID on any OS. Backslashes
    become ``/``, then leading ``./`` and ``/`` prefixes are stripped
    repeatedly, in whatever order they occur, so a mixed prefix such as
    ``/./src`` normalizes fully. Interior segments (``a/./b``, ``a//b``,
    ``..``) are left untouched.
    """
    p = path.replace("\\", "/")
    # Strip one prefix at a time until neither form remains; a single
    # pass of each would leave "./" behind in inputs like "/./a".
    while True:
        if p.startswith("./"):
            p = p[2:]
        elif p.startswith("/"):
            p = p[1:]
        else:
            return p
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ParsedSymbol(BaseModel):
    """The structured form of a symbol ID.

    Immutable (``frozen=True``). Holds the five decoded fields, in the
    order they appear in the ID string.
    """

    model_config = ConfigDict(frozen=True)

    scheme: str  # always ``SCHEME`` when produced by ``SymbolID.parse``
    lang: str
    repo: str
    path: str
    descriptor: str
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class SymbolID:
    """Format and parse symbol-ID strings. Stateless.

    An ID is the five fields ``scheme lang repo path descriptor``, each
    percent-escaped and joined by a single space.
    """

    SCHEME = SCHEME

    @staticmethod
    def for_symbol(lang: str, repo: str, path: str, descriptor: str) -> str:
        """Build the ID string for a symbol; ``path`` is normalized first."""
        fields = (SCHEME, lang, repo, normalize_path(path), descriptor)
        return _FIELD_SEP.join(map(_encode, fields))

    @staticmethod
    def parse(symbol_id: str) -> ParsedSymbol:
        """Split and decode ``symbol_id`` into its five fields.

        Raises:
            ValueError: if the string does not have exactly five fields,
                or its scheme is not ``SCHEME``.
        """
        encoded = symbol_id.split(_FIELD_SEP)
        if len(encoded) != 5:
            raise ValueError(
                f"malformed symbol id (expected 5 space-separated fields): {symbol_id!r}"
            )
        scheme, lang, repo, path, descriptor = map(_decode, encoded)
        if scheme != SCHEME:
            raise ValueError(f"unknown symbol-id scheme {scheme!r} (expected {SCHEME!r})")
        return ParsedSymbol(
            scheme=scheme,
            lang=lang,
            repo=repo,
            path=path,
            descriptor=descriptor,
        )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class Descriptor:
    """Builders for the SCIP-derived descriptor segments.

    Segments compose by concatenation — a method on a class is
    ``Descriptor.type("Auth") + Descriptor.method("login")`` →
    ``"Auth#login()."``. Language packs (feat-002) map AST nodes to
    these; core only owns the string format.
    """

    @staticmethod
    def namespace(name: str) -> str:
        """Namespace segment: ``name/``."""
        return f"{name}/"

    @staticmethod
    def type(name: str) -> str:
        """Type segment: ``name#``.

        The method name shadows the ``type`` builtin inside this class
        body only; it is part of the public surface and is kept as is.
        """
        return f"{name}#"

    @staticmethod
    def term(name: str) -> str:
        """Term segment: ``name.``."""
        return f"{name}."

    @staticmethod
    def method(name: str, disambiguator: int = 0) -> str:
        """A method/function. ``disambiguator`` n>=1 marks the nth overload.

        Returns ``name().`` when ``disambiguator`` is 0, otherwise
        ``name(+n)().``.

        Raises:
            ValueError: if ``disambiguator`` is negative.
        """
        if disambiguator < 0:
            raise ValueError("disambiguator must be >= 0")
        suffix = f"(+{disambiguator})" if disambiguator else ""
        return f"{name}{suffix}()."

    @staticmethod
    def local(seed: str) -> str:
        """Descriptor for an anonymous/local symbol with no stable name.

        ``seed`` should be derived from the symbol's position within its
        nearest *named* ancestor so edits above the ancestor don't shift
        it. Inherently less stable than named symbols (ADR-0003 §risks).

        Returns ``local(<8 hex chars>)``, the first eight hex digits of
        the SHA-1 of the UTF-8 encoded seed.
        """
        # SHA-1 is only a short deterministic fingerprint here, not a
        # security primitive. usedforsecurity=False keeps it usable on
        # restricted (e.g. FIPS) builds; the digest is unchanged.
        digest = hashlib.sha1(seed.encode("utf-8"), usedforsecurity=False).hexdigest()[:8]
        return f"local({digest})"
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""agentforge_graph.embed — chunk embedding (feat-005).
|
|
2
|
+
|
|
3
|
+
Default real backend is AWS Bedrock Cohere embed-v4 (`BedrockEmbedder`);
|
|
4
|
+
`OpenAIEmbedder` is the non-AWS / local-server path (ENH-003 phase 2); tests/CI
|
|
5
|
+
use the deterministic `FakeEmbedder`. Imports nothing from ``agentforge``
|
|
6
|
+
(ADR-0001); each driver's SDK (boto3 / openai) is lazy-imported in its module.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .base import Embedder, InputType
|
|
12
|
+
from .bedrock import BedrockEmbedder
|
|
13
|
+
from .fake import FakeEmbedder
|
|
14
|
+
from .openai import OpenAIEmbedder
|
|
15
|
+
from .pipeline import EmbedPipeline
|
|
16
|
+
from .registry import embedder_from_config
|
|
17
|
+
from .report import EmbedReport
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"Embedder",
|
|
21
|
+
"InputType",
|
|
22
|
+
"FakeEmbedder",
|
|
23
|
+
"BedrockEmbedder",
|
|
24
|
+
"OpenAIEmbedder",
|
|
25
|
+
"EmbedPipeline",
|
|
26
|
+
"EmbedReport",
|
|
27
|
+
"embedder_from_config",
|
|
28
|
+
]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""The ``Embedder`` contract. Implementations: ``FakeEmbedder`` (CI default,
|
|
2
|
+
deterministic), ``BedrockEmbedder`` (Cohere embed-v4) and ``OpenAIEmbedder`` (the non-AWS / local-server path). Imports nothing
|
|
3
|
+
from ``agentforge`` (ADR-0001)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from abc import ABC, abstractmethod
|
|
8
|
+
from typing import Literal
|
|
9
|
+
|
|
10
|
+
# What is being embedded: a stored "document" or a search "query".
InputType = Literal["document", "query"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Embedder(ABC):
    """Contract for turning texts into embedding vectors."""

    # Identifier of the concrete embedder.
    name: str
    # NOTE(review): presumably the length of every returned vector — confirm
    # against the implementations.
    dim: int

    @abstractmethod
    async def embed(
        self, texts: list[str], input_type: InputType = "document"
    ) -> list[list[float]]:
        """Embed ``texts``.

        ``input_type`` distinguishes stored documents from search queries
        (asymmetric models use it; symmetric ones ignore it).
        """
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""``BedrockEmbedder`` — AWS Bedrock Cohere embed-v4 via boto3.
|
|
2
|
+
|
|
3
|
+
boto3 is imported lazily (only this driver needs it; it lives in the
|
|
4
|
+
``bedrock`` extra). Synchronous Bedrock calls run on a worker thread.
|
|
5
|
+
Supports an optional STS assume-role (the CI path); otherwise the default
|
|
6
|
+
AWS credential chain (a developer's configured CLI). Voyage is *not* on
|
|
7
|
+
Bedrock — see memory `embeddings-bedrock`.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import json
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from .base import Embedder, InputType
|
|
17
|
+
|
|
18
|
+
# Translates the driver-neutral ``InputType`` into the ``input_type`` value
# sent in the Bedrock request body.
_INPUT_MAP: dict[str, str] = {"document": "search_document", "query": "search_query"}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BedrockEmbedder(Embedder):
    """``Embedder`` backed by a Cohere embedding model on AWS Bedrock.

    The boto3 client is created lazily on first use. Texts are sent in
    batches of ``batch_size``, one batch at a time; each synchronous
    ``invoke_model`` call runs on a worker thread so the event loop is
    not blocked.
    """

    def __init__(
        self,
        model: str = "cohere.embed-v4:0",
        region: str = "us-east-1",
        dim: int = 1024,
        batch_size: int = 96,
        assume_role_arn: str | None = None,
    ) -> None:
        """Store the configuration; no AWS call is made here.

        Args:
            model: Bedrock model id passed as ``modelId``.
            region: AWS region for both the STS and Bedrock clients.
            dim: sent to the model as ``output_dimension``.
            batch_size: maximum number of texts per request.
            assume_role_arn: if set, the Bedrock client uses credentials
                obtained by assuming this role; otherwise boto3's default
                credential chain applies.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        # A negative step makes ``range`` in ``embed`` empty, silently
        # returning [] for non-empty input; zero only fails later, inside
        # ``embed``. Reject both at configuration time.
        if batch_size <= 0:
            raise ValueError("batch_size must be > 0")
        self.name = f"bedrock:{model}"
        self.model = model
        self.region = region
        self.dim = dim
        self.batch_size = batch_size
        self.assume_role_arn = assume_role_arn
        self._client: Any = None

    def _bedrock(self) -> Any:
        """Return the cached ``bedrock-runtime`` client, creating it on first call."""
        if self._client is None:
            # Lazy import: only this driver needs boto3.
            import boto3

            if self.assume_role_arn:
                sts = boto3.client("sts", region_name=self.region)
                creds = sts.assume_role(RoleArn=self.assume_role_arn, RoleSessionName="ckg-embed")[
                    "Credentials"
                ]
                # NOTE(review): these credentials are fetched once and cached
                # with the client; presumably they expire with the STS
                # session — confirm long-lived processes are handled.
                self._client = boto3.client(
                    "bedrock-runtime",
                    region_name=self.region,
                    aws_access_key_id=creds["AccessKeyId"],
                    aws_secret_access_key=creds["SecretAccessKey"],
                    aws_session_token=creds["SessionToken"],
                )
            else:
                self._client = boto3.client("bedrock-runtime", region_name=self.region)
        return self._client

    async def embed(
        self, texts: list[str], input_type: InputType = "document"
    ) -> list[list[float]]:
        """Embed ``texts`` in order; returns ``[]`` for empty input.

        Batches are awaited one after another, so the result preserves
        the order of ``texts``.
        """
        out: list[list[float]] = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i : i + self.batch_size]
            out.extend(await asyncio.to_thread(self._invoke, batch, input_type))
        return out

    def _invoke(self, batch: list[str], input_type: InputType) -> list[list[float]]:
        """Send one batch to Bedrock (blocking) and return its vectors.

        Raises:
            KeyError: if ``input_type`` is not a key of ``_INPUT_MAP``.
        """
        body = json.dumps(
            {
                "texts": batch,
                "input_type": _INPUT_MAP[input_type],
                "embedding_types": ["float"],
                "output_dimension": self.dim,
            }
        )
        resp = self._bedrock().invoke_model(
            modelId=self.model,
            contentType="application/json",
            accept="application/json",
            body=body,
        )
        payload = json.loads(resp["body"].read())
        embeddings = payload["embeddings"]
        # Accept both response shapes: a dict keyed by embedding type, or
        # a bare list of vectors.
        floats = embeddings["float"] if isinstance(embeddings, dict) else embeddings
        return [[float(x) for x in vec] for vec in floats]
|