agentforge-graph 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (151) hide show
  1. agentforge_graph/__init__.py +6 -0
  2. agentforge_graph/chunking/__init__.py +12 -0
  3. agentforge_graph/chunking/cast.py +159 -0
  4. agentforge_graph/chunking/chunk.py +19 -0
  5. agentforge_graph/chunking/tokens.py +15 -0
  6. agentforge_graph/cli.py +607 -0
  7. agentforge_graph/config.py +259 -0
  8. agentforge_graph/core/__init__.py +54 -0
  9. agentforge_graph/core/conformance.py +270 -0
  10. agentforge_graph/core/contracts.py +163 -0
  11. agentforge_graph/core/kinds.py +68 -0
  12. agentforge_graph/core/models.py +134 -0
  13. agentforge_graph/core/provenance.py +62 -0
  14. agentforge_graph/core/symbols.py +116 -0
  15. agentforge_graph/embed/__init__.py +28 -0
  16. agentforge_graph/embed/base.py +22 -0
  17. agentforge_graph/embed/bedrock.py +85 -0
  18. agentforge_graph/embed/fake.py +34 -0
  19. agentforge_graph/embed/openai.py +67 -0
  20. agentforge_graph/embed/pipeline.py +184 -0
  21. agentforge_graph/embed/registry.py +66 -0
  22. agentforge_graph/embed/report.py +15 -0
  23. agentforge_graph/enrich/__init__.py +70 -0
  24. agentforge_graph/enrich/anthropic.py +38 -0
  25. agentforge_graph/enrich/anthropic_client.py +109 -0
  26. agentforge_graph/enrich/bedrock.py +24 -0
  27. agentforge_graph/enrich/bedrock_client.py +115 -0
  28. agentforge_graph/enrich/bedrock_summarizer.py +23 -0
  29. agentforge_graph/enrich/claude.py +172 -0
  30. agentforge_graph/enrich/enricher.py +108 -0
  31. agentforge_graph/enrich/governs.py +173 -0
  32. agentforge_graph/enrich/governs_enricher.py +152 -0
  33. agentforge_graph/enrich/heuristics.py +224 -0
  34. agentforge_graph/enrich/judge.py +63 -0
  35. agentforge_graph/enrich/registry.py +133 -0
  36. agentforge_graph/enrich/report.py +60 -0
  37. agentforge_graph/enrich/summarizer.py +62 -0
  38. agentforge_graph/enrich/summary_enricher.py +211 -0
  39. agentforge_graph/enrich/taxonomy.py +38 -0
  40. agentforge_graph/frameworks/__init__.py +29 -0
  41. agentforge_graph/frameworks/base.py +75 -0
  42. agentforge_graph/frameworks/detect.py +124 -0
  43. agentforge_graph/frameworks/extractor.py +63 -0
  44. agentforge_graph/frameworks/orm.py +93 -0
  45. agentforge_graph/frameworks/packs/_js_ast.py +56 -0
  46. agentforge_graph/frameworks/packs/_python_ast.py +157 -0
  47. agentforge_graph/frameworks/packs/django/__init__.py +240 -0
  48. agentforge_graph/frameworks/packs/django/models.scm +7 -0
  49. agentforge_graph/frameworks/packs/express/__init__.py +133 -0
  50. agentforge_graph/frameworks/packs/express/routes.scm +8 -0
  51. agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
  52. agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
  53. agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
  54. agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
  55. agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
  56. agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
  57. agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
  58. agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
  59. agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
  60. agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
  61. agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
  62. agentforge_graph/frameworks/registry.py +44 -0
  63. agentforge_graph/ingest/__init__.py +30 -0
  64. agentforge_graph/ingest/codegraph.py +847 -0
  65. agentforge_graph/ingest/extractor.py +353 -0
  66. agentforge_graph/ingest/incremental/__init__.py +25 -0
  67. agentforge_graph/ingest/incremental/detect.py +118 -0
  68. agentforge_graph/ingest/incremental/dirty.py +61 -0
  69. agentforge_graph/ingest/incremental/indexer.py +218 -0
  70. agentforge_graph/ingest/incremental/meta.py +72 -0
  71. agentforge_graph/ingest/incremental/ports.py +39 -0
  72. agentforge_graph/ingest/pack.py +160 -0
  73. agentforge_graph/ingest/packs/__init__.py +34 -0
  74. agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
  75. agentforge_graph/ingest/packs/cpp/references.scm +15 -0
  76. agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
  77. agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
  78. agentforge_graph/ingest/packs/csharp/references.scm +12 -0
  79. agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
  80. agentforge_graph/ingest/packs/go/__init__.py +38 -0
  81. agentforge_graph/ingest/packs/go/references.scm +12 -0
  82. agentforge_graph/ingest/packs/go/structure.scm +64 -0
  83. agentforge_graph/ingest/packs/java/__init__.py +35 -0
  84. agentforge_graph/ingest/packs/java/references.scm +12 -0
  85. agentforge_graph/ingest/packs/java/structure.scm +38 -0
  86. agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
  87. agentforge_graph/ingest/packs/javascript/references.scm +11 -0
  88. agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
  89. agentforge_graph/ingest/packs/php/__init__.py +35 -0
  90. agentforge_graph/ingest/packs/php/references.scm +15 -0
  91. agentforge_graph/ingest/packs/php/structure.scm +44 -0
  92. agentforge_graph/ingest/packs/python/__init__.py +25 -0
  93. agentforge_graph/ingest/packs/python/references.scm +14 -0
  94. agentforge_graph/ingest/packs/python/structure.scm +57 -0
  95. agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
  96. agentforge_graph/ingest/packs/ruby/references.scm +12 -0
  97. agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
  98. agentforge_graph/ingest/packs/rust/__init__.py +39 -0
  99. agentforge_graph/ingest/packs/rust/references.scm +12 -0
  100. agentforge_graph/ingest/packs/rust/structure.scm +46 -0
  101. agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
  102. agentforge_graph/ingest/packs/typescript/references.scm +11 -0
  103. agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
  104. agentforge_graph/ingest/pipeline.py +134 -0
  105. agentforge_graph/ingest/report.py +84 -0
  106. agentforge_graph/ingest/resolver.py +467 -0
  107. agentforge_graph/ingest/source.py +79 -0
  108. agentforge_graph/knowledge/__init__.py +28 -0
  109. agentforge_graph/knowledge/adr.py +136 -0
  110. agentforge_graph/knowledge/commits.py +152 -0
  111. agentforge_graph/knowledge/ingest.py +312 -0
  112. agentforge_graph/knowledge/mentions.py +71 -0
  113. agentforge_graph/knowledge/report.py +32 -0
  114. agentforge_graph/main.py +21 -0
  115. agentforge_graph/providers.py +36 -0
  116. agentforge_graph/repomap/__init__.py +14 -0
  117. agentforge_graph/repomap/rank.py +161 -0
  118. agentforge_graph/repomap/render.py +55 -0
  119. agentforge_graph/repomap/repomap.py +66 -0
  120. agentforge_graph/retrieve/__init__.py +21 -0
  121. agentforge_graph/retrieve/pack.py +76 -0
  122. agentforge_graph/retrieve/rerank.py +251 -0
  123. agentforge_graph/retrieve/retriever.py +286 -0
  124. agentforge_graph/retrieve/scoring.py +36 -0
  125. agentforge_graph/serve/__init__.py +19 -0
  126. agentforge_graph/serve/engine.py +204 -0
  127. agentforge_graph/serve/http_runner.py +133 -0
  128. agentforge_graph/serve/server.py +110 -0
  129. agentforge_graph/serve/tools.py +307 -0
  130. agentforge_graph/store/__init__.py +32 -0
  131. agentforge_graph/store/_rowmap.py +102 -0
  132. agentforge_graph/store/errors.py +22 -0
  133. agentforge_graph/store/facade.py +89 -0
  134. agentforge_graph/store/kuzu_store.py +380 -0
  135. agentforge_graph/store/lance_store.py +146 -0
  136. agentforge_graph/store/neo4j_store.py +294 -0
  137. agentforge_graph/store/pgvector_store.py +170 -0
  138. agentforge_graph/store/registry.py +45 -0
  139. agentforge_graph/temporal/__init__.py +36 -0
  140. agentforge_graph/temporal/backfill.py +338 -0
  141. agentforge_graph/temporal/events.py +82 -0
  142. agentforge_graph/temporal/index.py +190 -0
  143. agentforge_graph/temporal/mining.py +190 -0
  144. agentforge_graph/temporal/recorder.py +114 -0
  145. agentforge_graph/temporal/store.py +282 -0
  146. agentforge_graph-0.3.2.dist-info/METADATA +291 -0
  147. agentforge_graph-0.3.2.dist-info/RECORD +151 -0
  148. agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
  149. agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
  150. agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
  151. agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
@@ -0,0 +1,163 @@
1
+ """The locked ABCs every later feature plugs into.
2
+
3
+ - ``Extractor`` — turns a file into a ``FileSubgraph`` (feat-002, feat-011).
4
+ - ``GraphStore`` — persists subgraphs and enrichment facts, answers
5
+ queries and neighborhood walks (feat-003 adapters).
6
+ - ``VectorStore`` — persists vectors and answers similarity search (feat-003).
7
+ - ``Enricher`` — derives new nodes/edges from the existing graph (feat-010/012).
8
+
9
+ Signatures only; implementations ship with their owning features. The
10
+ constructor/method surface here is the stable contract — additions are
11
+ minor bumps, removals/renames are major. See ADR-0001 (layering: this
12
+ module imports nothing from ``agentforge``).
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from abc import ABC, abstractmethod
18
+ from typing import Any, Literal
19
+
20
+ from .kinds import EdgeKind
21
+ from .models import (
22
+ Edge,
23
+ Embedded,
24
+ FileSubgraph,
25
+ GraphQuery,
26
+ Node,
27
+ QueryResult,
28
+ ScoredRef,
29
+ SourceFile,
30
+ )
31
+
32
# Which end of a 1-hop edge the anchor node occupies: "out" means the node is
# the edge's src, "in" means it is the dst.
Direction = Literal["out", "in", "both"]
34
+
35
+
36
class Extractor(ABC):
    """Turn one source file into its ``FileSubgraph``, independently of others.

    An implementation must not read any other file: per-file isolation is
    what makes feat-004 incremental. Cross-file edges are emitted as
    candidate references and resolved in a later pass.
    """

    # Declared only; no value is assigned in this base class.
    name: str

    @abstractmethod
    def extract(self, file: SourceFile) -> FileSubgraph:
        """Return the subgraph extracted from ``file``."""
48
+
49
+
50
class GraphStore(ABC):
    """The persistence and query contract; the adapters ship with feat-003."""

    @abstractmethod
    async def upsert(self, subgraph: FileSubgraph) -> None:
        """Replace everything stored for ``subgraph.path`` in one transaction.

        Prior content for that path is deleted and the subgraph's nodes and
        edges are added.
        """

    @abstractmethod
    async def add(self, items: list[Node | Edge]) -> None:
        """Store facts that do not belong to one file.

        Used for enrichment output and resolved cross-file edges. These
        survive a ``delete_file`` of code files.
        """

    @abstractmethod
    async def delete_file(self, path: str) -> None:
        """Drop everything that was previously upserted for ``path``."""

    @abstractmethod
    async def clear_resolved(self, paths: list[str]) -> None:
        """Remove resolved-provenance edges whose ``origin_path`` is in ``paths``.

        This is the inverse of a scoped re-resolve (feat-004); parsed nodes
        and edges are left alone. External ``PACKAGE`` nodes left with no
        inbound edge are garbage-collected as well, so an incremental
        re-resolve converges to the graph a full re-index would produce.
        """

    @abstractmethod
    async def clear_outgoing(self, src_ids: list[str], kind: EdgeKind) -> None:
        """Remove edges of ``kind`` whose ``src`` is one of ``src_ids``.

        Lets an enricher (feat-012) re-derive a symbol's facts idempotently,
        i.e. re-tag without duplicating ``TAGGED``/``SUMMARIZES`` edges.
        """

    @abstractmethod
    async def query(self, q: GraphQuery) -> QueryResult:
        """Look up nodes by exact match using the flat ``GraphQuery`` filter."""

    @abstractmethod
    async def neighbors(
        self,
        node_id: str,
        kinds: list[EdgeKind] | None = None,
        depth: int = 1,
    ) -> list[Node]:
        """Return nodes reachable from ``node_id`` within ``depth`` hops.

        Only edges of ``kinds`` are followed, in either direction.
        """

    @abstractmethod
    async def get(self, node_id: str) -> Node | None:
        """Return the node with this id, or ``None``."""

    @abstractmethod
    async def set_attrs(self, node_id: str, attrs: dict[str, Any]) -> None:
        """Merge ``attrs`` into the ``attrs`` of an existing node.

        A partial update: every other field stays as it was, including the
        file-ownership ``origin_path`` that drives ``delete_file``. Does
        nothing when the node is absent. This is the denormalisation channel
        for derived facts (feat-009 churn/authorship) that must not detach a
        file-owned node from its file.
        """

    @abstractmethod
    async def adjacent(
        self,
        node_id: str,
        kinds: list[EdgeKind] | None = None,
        direction: Direction = "both",
    ) -> list[Edge]:
        """Return the 1-hop edges that touch ``node_id``.

        ``direction`` selects edges where the node is the src (``out``), the
        dst (``in``), or either (``both``); ``kinds`` optionally filters by
        edge kind. Full ``Edge`` objects come back, so the caller sees each
        edge's kind, direction and provenance (feat-006 retrieval scoring).
        """

    @abstractmethod
    async def close(self) -> None:
        """Release resources. Calling this more than once is safe."""
122
+
123
+
124
class VectorStore(ABC):
    """Vector persistence and similarity search.

    feat-003 ships the LanceDB adapter and feat-005 produces the ``Embedded``
    items stored here. This is a peer of ``GraphStore``: the ``Store`` facade
    (feat-003) owns one of each and joins them (vector hit -> graph
    expansion) for retrieval (feat-006).
    """

    @abstractmethod
    async def upsert(self, items: list[Embedded]) -> None:
        """Insert or replace vectors, keyed by ``Embedded.ref``."""

    @abstractmethod
    async def search(
        self,
        vector: list[float],
        k: int,
        filter: dict[str, Any] | None = None,
    ) -> list[ScoredRef]:
        """Return the ``k`` nearest refs to ``vector``.

        ``filter`` optionally constrains hits by attribute, e.g.
        ``{"kind": "Chunk"}``.
        """

    @abstractmethod
    async def delete_where(self, filter: dict[str, Any]) -> None:
        """Remove the vectors that match ``filter`` (feat-004 invalidation)."""

    @abstractmethod
    async def close(self) -> None:
        """Release resources. Calling this more than once is safe."""
151
+
152
+
153
class Enricher(ABC):
    """Derive new nodes/edges from the graph that already exists (feat-010/012).

    ``enrich`` hands back the derived facts and the caller persists them
    through ``GraphStore.add``. Derived facts must carry ``source=llm``
    provenance with a confidence (ADR-0004).
    """

    # Declared only; no value is assigned in this base class.
    name: str

    @abstractmethod
    async def enrich(self, store: GraphStore) -> list[Node | Edge]:
        """Return the facts derived from ``store``."""
@@ -0,0 +1,68 @@
1
+ """Node and edge kind vocabularies for the code knowledge graph.
2
+
3
+ The full vocabulary is locked at 0.1 — including the higher-level kinds
4
+ whose producers ship later (feat-010/011/012) — so stores and queries
5
+ handle every kind from day one and no schema migration is needed when a
6
+ later producer lands. See ADR-0005.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from enum import StrEnum
12
+
13
+
14
+ class NodeKind(StrEnum):
15
+ """Every node kind the graph may contain. Locked at 0.1 (ADR-0005)."""
16
+
17
+ # --- structural (produced by feat-002) ---
18
+ REPOSITORY = "Repository"
19
+ PACKAGE = "Package"
20
+ FILE = "File"
21
+ CLASS = "Class"
22
+ INTERFACE = "Interface"
23
+ FUNCTION = "Function"
24
+ METHOD = "Method"
25
+ VARIABLE = "Variable"
26
+ TYPE_ALIAS = "TypeAlias"
27
+
28
+ # --- retrieval (feat-005 / feat-010) ---
29
+ CHUNK = "Chunk"
30
+ DOC_CHUNK = "DocChunk"
31
+
32
+ # --- higher-level: reserved now, produced later (ADR-0005) ---
33
+ DECISION = "Decision" # feat-010
34
+ ROUTE = "Route" # feat-011
35
+ DATA_MODEL = "DataModel" # feat-011
36
+ SERVICE = "Service" # feat-011
37
+ SUMMARY = "Summary" # feat-012
38
+ PATTERN_TAG = "PatternTag" # feat-012
39
+
40
+
41
+ class EdgeKind(StrEnum):
42
+ """Every edge kind the graph may contain. Locked at 0.1 (ADR-0005)."""
43
+
44
+ # --- structural (feat-002) ---
45
+ CONTAINS = "CONTAINS"
46
+ IMPORTS = "IMPORTS"
47
+ CALLS = "CALLS"
48
+ INHERITS = "INHERITS"
49
+ IMPLEMENTS = "IMPLEMENTS"
50
+ REFERENCES = "REFERENCES"
51
+
52
+ # --- retrieval / docs (feat-005 / feat-010) ---
53
+ CHUNK_OF = "CHUNK_OF"
54
+ DESCRIBES = "DESCRIBES"
55
+
56
+ # --- decisions (feat-010) ---
57
+ GOVERNS = "GOVERNS"
58
+ SUPERSEDES = "SUPERSEDES"
59
+
60
+ # --- framework (feat-011) ---
61
+ HANDLED_BY = "HANDLED_BY"
62
+ INJECTED_INTO = "INJECTED_INTO"
63
+ HAS_FIELD = "HAS_FIELD"
64
+ RELATES_TO = "RELATES_TO"
65
+
66
+ # --- enrichment (feat-012) ---
67
+ SUMMARIZES = "SUMMARIZES"
68
+ TAGGED = "TAGGED"
@@ -0,0 +1,134 @@
1
+ """Value types for the graph: nodes, edges, the per-file subgraph, and
2
+ the minimal query/result shapes.
3
+
4
+ ``Node``/``Edge`` validate their IDs and (via ``Provenance``) their
5
+ attribution at construction, so the graph cannot hold a malformed or
6
+ unattributed fact. ``FileSubgraph`` is the unit of ingestion *and*
7
+ deletion — keyed by ``(path, content_hash)`` — which is what makes
8
+ incremental indexing (feat-004) a thin layer. See ADR-0003/0004.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from typing import Any
14
+
15
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
16
+
17
+ from .kinds import EdgeKind, NodeKind
18
+ from .provenance import Provenance, Source
19
+ from .symbols import SymbolID
20
+
21
+
22
def _require_symbol_id(value: str) -> str:
    """Return ``value`` unchanged once it has parsed as a symbol ID.

    ``SymbolID.parse`` raises ``ValueError`` for a malformed ID; that error
    is not caught here.
    """
    SymbolID.parse(value)
    return value
25
+
26
+
27
class SourceFile(BaseModel):
    """One file, as it is handed to an ``Extractor``. Instances are frozen."""

    model_config = ConfigDict(frozen=True)

    # Repo-relative, posix-separated path.
    path: str
    text: str
    language: str
    # sha256 of the file bytes.
    content_hash: str
36
+
37
+
38
class Node(BaseModel):
    """A typed entity in the graph; its ``id`` must be a well-formed symbol ID."""

    # A SymbolID string, checked by the validator below.
    id: str
    kind: NodeKind
    name: str
    # (start_line, end_line), 1-based; ``None`` when no span is given.
    span: tuple[int, int] | None = None
    attrs: dict[str, Any] = Field(default_factory=dict)
    provenance: Provenance

    @field_validator("id")
    @classmethod
    def _check_id(cls, v: str) -> str:
        """Reject an ``id`` that does not parse as a symbol ID."""
        checked = _require_symbol_id(v)
        return checked
52
+
53
+
54
class Edge(BaseModel):
    """A typed relationship from symbol ``src`` to symbol ``dst``."""

    src: str
    dst: str
    kind: EdgeKind
    attrs: dict[str, Any] = Field(default_factory=dict)
    provenance: Provenance
    # Path of the file whose content produced this edge (the file holding the
    # import/call site). It allows resolver-produced edges to be invalidated
    # per file on an incremental re-resolve (feat-004). Left empty for
    # file-parsed edges: there the store stamps the owning file's path at
    # upsert time.
    origin_path: str = ""

    @field_validator("src", "dst")
    @classmethod
    def _check_endpoint(cls, v: str) -> str:
        """Reject an endpoint that does not parse as a symbol ID."""
        checked = _require_symbol_id(v)
        return checked
72
+
73
+
74
class FileSubgraph(BaseModel):
    """All nodes and edges extracted from one file.

    This is the unit of both ingestion and deletion.
    """

    # Repo-relative, posix-separated path.
    path: str
    content_hash: str
    nodes: list[Node] = Field(default_factory=list)
    edges: list[Edge] = Field(default_factory=list)
81
+
82
+
83
class GraphQuery(BaseModel):
    """A minimal, flat filter over nodes (0.1).

    Traversal is not expressed here; it lives in ``GraphStore.neighbors``.
    New fields arrive by minor bump.
    """

    kinds: list[NodeKind] | None = None
    # Matched exactly.
    name: str | None = None
    path_prefix: str | None = None
    edge_kind: EdgeKind | None = None
    # Provenance floor.
    min_source: Source | None = None
    limit: int = 100

    @field_validator("limit")
    @classmethod
    def _positive_limit(cls, v: int) -> int:
        """Accept only a strictly positive ``limit``."""
        if v > 0:
            return v
        raise ValueError("limit must be > 0")
100
+
101
+
102
class QueryResult(BaseModel):
    """What a ``GraphStore.query`` call returns."""

    nodes: list[Node] = Field(default_factory=list)
    edges: list[Edge] = Field(default_factory=list)
    # True when `limit` clipped the result.
    truncated: bool = False
108
+
109
+
110
class Embedded(BaseModel):
    """A vector together with the node it stands in for.

    This is the ``VectorStore`` write unit. ``ref`` holds the id of the
    represented node (a Chunk, DocChunk, Summary…); feat-005 produces these.
    """

    # Id of the symbol/chunk this vector represents.
    ref: str
    vector: list[float]
    kind: NodeKind
    attrs: dict[str, Any] = Field(default_factory=dict)

    @field_validator("vector")
    @classmethod
    def _non_empty(cls, v: list[float]) -> list[float]:
        """Reject an empty vector."""
        if v:
            return v
        raise ValueError("vector must be non-empty")
126
+
127
+
128
class ScoredRef(BaseModel):
    """One vector-search hit: a ref plus its similarity score.

    A higher score means a closer match. feat-006 expands these hits into a
    graph neighborhood.
    """

    ref: str
    score: float
    attrs: dict[str, Any] = Field(default_factory=dict)
@@ -0,0 +1,62 @@
1
+ """Provenance — attribution carried by every node and edge.
2
+
3
+ Every fact in the graph records where it came from, so an agent can
4
+ tell parsed ground truth from a heuristic resolution or an LLM guess,
5
+ and rank/filter accordingly. Enforced at construction so no
6
+ unattributed fact can exist. See ADR-0004.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from enum import StrEnum
12
+
13
+ from pydantic import BaseModel, ConfigDict, model_validator
14
+
15
+
16
+ class Source(StrEnum):
17
+ """How a fact was derived, in decreasing reliability."""
18
+
19
+ PARSED = "parsed" # straight from the syntax tree
20
+ RESOLVED = "resolved" # upgraded via the import graph / resolver
21
+ LLM = "llm" # generated by a model (summaries, pattern tags, inferred links)
22
+ MANUAL = "manual" # human-asserted
23
+
24
+
25
class Provenance(BaseModel):
    """Attribution attached to one node or edge. Instances are frozen.

    ``confidence`` must lie in ``[0.0, 1.0]``, and any value other than 1.0
    is accepted only when ``source`` is ``Source.LLM``; a violation raises
    ``ValueError`` during validation.
    """

    model_config = ConfigDict(frozen=True)

    source: Source
    # Producer name + version, e.g. "tree-sitter-python@0.23".
    extractor: str
    # Git sha the fact was derived at; "" if non-git / unstaged.
    commit: str = ""
    # Values below 1.0 are only meaningful for source=llm.
    confidence: float = 1.0

    @model_validator(mode="after")
    def _check_confidence(self) -> Provenance:
        """Enforce the confidence range, then the LLM-only rule."""
        score = self.confidence
        if not 0.0 <= score <= 1.0:
            raise ValueError("confidence must be in [0.0, 1.0]")
        if score != 1.0 and self.source is not Source.LLM:
            raise ValueError("confidence < 1.0 is only valid for source=llm")
        return self

    @classmethod
    def parsed(cls, extractor: str, commit: str = "") -> Provenance:
        """Build a ``Source.PARSED`` provenance with the default confidence."""
        return cls(extractor=extractor, commit=commit, source=Source.PARSED)

    @classmethod
    def resolved(cls, extractor: str, commit: str = "") -> Provenance:
        """Build a ``Source.RESOLVED`` provenance with the default confidence."""
        return cls(extractor=extractor, commit=commit, source=Source.RESOLVED)

    @classmethod
    def manual(cls, extractor: str, commit: str = "") -> Provenance:
        """Build a ``Source.MANUAL`` provenance with the default confidence."""
        return cls(extractor=extractor, commit=commit, source=Source.MANUAL)

    @classmethod
    def llm(cls, extractor: str, confidence: float, commit: str = "") -> Provenance:
        """Build a ``Source.LLM`` provenance carrying an explicit confidence."""
        fields = {
            "source": Source.LLM,
            "extractor": extractor,
            "confidence": confidence,
            "commit": commit,
        }
        return cls(**fields)
@@ -0,0 +1,116 @@
1
+ """Symbol identity — stable, human-readable, deterministic node IDs.
2
+
3
+ A symbol ID is a single string derived from
4
+ ``(scheme, lang, repo, path, descriptor)``. It has no global counters
5
+ and no ordering constraints, so per-file extraction can run in any order
6
+ and merge, and the same symbol keeps its ID across commits — the
7
+ property incremental indexing (feat-004) and history (feat-009) depend
8
+ on. The descriptor grammar is SCIP-derived. See ADR-0003 and
9
+ ``docs/design/design-001-core-contracts-module.md`` §4.4.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+
16
+ from pydantic import BaseModel, ConfigDict
17
+
18
# Scheme tag that forms the first field of every symbol ID.
SCHEME = "ckg"
# Separator between the fields of a symbol ID.
_FIELD_SEP = " "
20
+
21
+
22
def _encode(field: str) -> str:
    """Escape ``field`` so that a symbol ID round-trips through parsing.

    The escape character ``%`` is rewritten first, then the field separator.
    """
    percent_escaped = field.replace("%", "%25")
    return percent_escaped.replace(_FIELD_SEP, "%20")
25
+
26
+
27
def _decode(field: str) -> str:
    """Undo the escaping applied when the field was encoded."""
    # Order matters: restore separators (%20) before escape chars (%25), so
    # that an escaped escape ("%2520") comes back as the literal text "%20".
    separators_restored = field.replace("%20", _FIELD_SEP)
    return separators_restored.replace("%25", "%")
30
+
31
+
32
def normalize_path(path: str) -> str:
    """Return ``path`` repo-relative, posix-separated, with no leading ``./`` or ``/``.

    Ensures the same file yields the same ID on any OS. Backslashes become
    forward slashes, then leading ``/`` and ``./`` prefixes are removed
    repeatedly until neither remains, so mixed prefixes such as ``/./a``
    are fully stripped. Only the start of the path is touched; interior
    ``./`` or ``//`` segments are left as they are.
    """
    p = path.replace("\\", "/")
    while True:
        # Strip both kinds of prefix, then go round again: removing one kind
        # can expose the other (e.g. "/./a" -> "./a" -> "a").
        stripped = p.lstrip("/")
        while stripped.startswith("./"):
            stripped = stripped[2:]
        if stripped == p:
            return p
        p = stripped
41
+
42
+
43
class ParsedSymbol(BaseModel):
    """A symbol ID split into its five fields. Instances are frozen."""

    model_config = ConfigDict(frozen=True)

    scheme: str
    lang: str
    repo: str
    path: str
    descriptor: str
53
+
54
+
55
class SymbolID:
    """Stateless helpers that build and parse symbol-ID strings."""

    SCHEME = SCHEME

    @staticmethod
    def for_symbol(lang: str, repo: str, path: str, descriptor: str) -> str:
        """Build the ID string for a symbol.

        ``path`` is normalized first; each field is then escaped and the five
        fields are joined with the field separator.
        """
        fields = (SCHEME, lang, repo, normalize_path(path), descriptor)
        return _FIELD_SEP.join(map(_encode, fields))

    @staticmethod
    def parse(symbol_id: str) -> ParsedSymbol:
        """Split ``symbol_id`` into its unescaped fields.

        Raises:
            ValueError: if the string does not have exactly five fields, or
                its scheme is not the expected one.
        """
        pieces = symbol_id.split(_FIELD_SEP)
        if len(pieces) != 5:
            raise ValueError(
                f"malformed symbol id (expected 5 space-separated fields): {symbol_id!r}"
            )
        scheme, lang, repo, path, descriptor = map(_decode, pieces)
        if scheme != SCHEME:
            raise ValueError(f"unknown symbol-id scheme {scheme!r} (expected {SCHEME!r})")
        return ParsedSymbol(
            scheme=scheme,
            lang=lang,
            repo=repo,
            path=path,
            descriptor=descriptor,
        )
76
+
77
+
78
class Descriptor:
    """Builders for the SCIP-derived descriptor segments.

    Segments are combined by plain concatenation: a method on a class is
    ``Descriptor.type("Auth") + Descriptor.method("login")``, which gives
    ``"Auth#login()."``. Language packs (feat-002) map AST nodes onto these;
    core owns only the string format.
    """

    @staticmethod
    def namespace(name: str) -> str:
        """Return the namespace segment, ``name`` followed by ``/``."""
        return name + "/"

    @staticmethod
    def type(name: str) -> str:
        """Return the type segment, ``name`` followed by ``#``."""
        return name + "#"

    @staticmethod
    def term(name: str) -> str:
        """Return the term segment, ``name`` followed by ``.``."""
        return name + "."

    @staticmethod
    def method(name: str, disambiguator: int = 0) -> str:
        """Return the segment for a method/function.

        A ``disambiguator`` n>=1 marks the nth overload and adds ``(+n)``
        after the name; 0 adds nothing.

        Raises:
            ValueError: if ``disambiguator`` is negative.
        """
        if disambiguator < 0:
            raise ValueError("disambiguator must be >= 0")
        if not disambiguator:
            return f"{name}()."
        return f"{name}(+{disambiguator})()."

    @staticmethod
    def local(seed: str) -> str:
        """Return the descriptor for an anonymous/local symbol without a stable name.

        The result embeds the first 8 hex characters of the SHA-1 of ``seed``.
        ``seed`` should be derived from the symbol's position inside its
        nearest *named* ancestor, so that edits above that ancestor do not
        shift it. Inherently less stable than named symbols (ADR-0003 §risks).
        """
        seed_bytes = seed.encode("utf-8")
        short_hash = hashlib.sha1(seed_bytes).hexdigest()[:8]
        return "local(" + short_hash + ")"
@@ -0,0 +1,28 @@
1
+ """agentforge_graph.embed — chunk embedding (feat-005).
2
+
3
+ Default real backend is AWS Bedrock Cohere embed-v4 (`BedrockEmbedder`);
4
+ `OpenAIEmbedder` is the non-AWS / local-server path (ENH-003 phase 2); tests/CI
5
+ use the deterministic `FakeEmbedder`. Imports nothing from ``agentforge``
6
+ (ADR-0001); each driver's SDK (boto3 / openai) is lazy-imported in its module.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .base import Embedder, InputType
12
+ from .bedrock import BedrockEmbedder
13
+ from .fake import FakeEmbedder
14
+ from .openai import OpenAIEmbedder
15
+ from .pipeline import EmbedPipeline
16
+ from .registry import embedder_from_config
17
+ from .report import EmbedReport
18
+
19
+ __all__ = [
20
+ "Embedder",
21
+ "InputType",
22
+ "FakeEmbedder",
23
+ "BedrockEmbedder",
24
+ "OpenAIEmbedder",
25
+ "EmbedPipeline",
26
+ "EmbedReport",
27
+ "embedder_from_config",
28
+ ]
@@ -0,0 +1,22 @@
1
+ """The ``Embedder`` contract. Implementations: ``FakeEmbedder`` (CI default,
2
+ deterministic), ``BedrockEmbedder`` (Cohere embed-v4) and ``OpenAIEmbedder``
3
+ (non-AWS / local-server path). Imports nothing from ``agentforge`` (ADR-0001)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Literal
9
+
10
# Whether a text being embedded is a stored document or a search query.
InputType = Literal["document", "query"]
11
+
12
+
13
class Embedder(ABC):
    """Abstract base for text embedders."""

    # Both attributes are declared only; no value is assigned in this class.
    name: str
    dim: int

    @abstractmethod
    async def embed(
        self, texts: list[str], input_type: InputType = "document"
    ) -> list[list[float]]:
        """Embed ``texts`` and return the vectors.

        ``input_type`` tells stored documents apart from search queries:
        asymmetric models make use of it, symmetric ones ignore it.
        """
@@ -0,0 +1,85 @@
1
+ """``BedrockEmbedder`` — AWS Bedrock Cohere embed-v4 via boto3.
2
+
3
+ boto3 is imported lazily (only this driver needs it; it lives in the
4
+ ``bedrock`` extra). Synchronous Bedrock calls run on a worker thread.
5
+ Supports an optional STS assume-role (the CI path); otherwise the default
6
+ AWS credential chain (a developer's configured CLI). Voyage is *not* on
7
+ Bedrock — see memory `embeddings-bedrock`.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import json
14
+ from typing import Any
15
+
16
+ from .base import Embedder, InputType
17
+
18
# Maps the embedder-level input type to the "input_type" value placed in the
# Bedrock request body.
_INPUT_MAP: dict[str, str] = {
    "document": "search_document",
    "query": "search_query",
}
19
+
20
+
21
class BedrockEmbedder(Embedder):
    """Embed text through an AWS Bedrock embedding model using boto3.

    The boto3 client is created lazily on first use and then reused. When
    ``assume_role_arn`` is set, the client is built from credentials obtained
    with an STS ``assume_role`` call; otherwise boto3's default credential
    handling is used.

    NOTE(review): the assumed-role credentials are fetched once, when the
    client is first created, and this class never refreshes them — confirm
    that the session lifetime covers the longest expected run.
    """

    def __init__(
        self,
        model: str = "cohere.embed-v4:0",
        region: str = "us-east-1",
        dim: int = 1024,
        batch_size: int = 96,
        assume_role_arn: str | None = None,
    ) -> None:
        """Store the configuration; no AWS call is made here.

        Args:
            model: Bedrock model id, also used to form ``name``.
            region: AWS region for the STS and Bedrock clients.
            dim: Value sent as ``output_dimension`` with every request.
            batch_size: Maximum number of texts per ``invoke_model`` call.
            assume_role_arn: Role to assume via STS before creating the
                Bedrock client, or ``None`` to skip the assume-role step.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1. A negative size
                would otherwise make ``embed`` silently return no vectors.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be >= 1")
        self.name = f"bedrock:{model}"
        self.model = model
        self.region = region
        self.dim = dim
        self.batch_size = batch_size
        self.assume_role_arn = assume_role_arn
        self._client: Any = None

    def _bedrock(self) -> Any:
        """Return the cached ``bedrock-runtime`` client, creating it if needed."""
        if self._client is None:
            # Imported lazily: only this driver needs boto3.
            import boto3

            if self.assume_role_arn:
                sts = boto3.client("sts", region_name=self.region)
                creds = sts.assume_role(
                    RoleArn=self.assume_role_arn, RoleSessionName="ckg-embed"
                )["Credentials"]
                self._client = boto3.client(
                    "bedrock-runtime",
                    region_name=self.region,
                    aws_access_key_id=creds["AccessKeyId"],
                    aws_secret_access_key=creds["SecretAccessKey"],
                    aws_session_token=creds["SessionToken"],
                )
            else:
                self._client = boto3.client("bedrock-runtime", region_name=self.region)
        return self._client

    async def embed(
        self, texts: list[str], input_type: InputType = "document"
    ) -> list[list[float]]:
        """Embed ``texts`` in batches of at most ``batch_size``.

        Batches are sent one after another; each synchronous Bedrock call
        runs on a worker thread via ``asyncio.to_thread``. The returned
        vectors are the per-batch results concatenated in batch order. An
        empty ``texts`` yields an empty list without calling Bedrock.
        """
        out: list[list[float]] = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i : i + self.batch_size]
            out.extend(await asyncio.to_thread(self._invoke, batch, input_type))
        return out

    def _invoke(self, batch: list[str], input_type: InputType) -> list[list[float]]:
        """Send one batch to Bedrock and return its vectors as lists of floats.

        Raises:
            KeyError: if ``input_type`` has no entry in ``_INPUT_MAP`` or the
                response payload has no ``"embeddings"`` key.
        """
        body = json.dumps(
            {
                "texts": batch,
                "input_type": _INPUT_MAP[input_type],
                "embedding_types": ["float"],
                "output_dimension": self.dim,
            }
        )
        resp = self._bedrock().invoke_model(
            modelId=self.model,
            contentType="application/json",
            accept="application/json",
            body=body,
        )
        payload = json.loads(resp["body"].read())
        embeddings = payload["embeddings"]
        # "embeddings" is either a dict keyed by embedding type (the floats
        # are then under "float") or already a plain list of vectors.
        floats = embeddings["float"] if isinstance(embeddings, dict) else embeddings
        return [[float(x) for x in vec] for vec in floats]