agentforge-graph 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_graph/__init__.py +6 -0
- agentforge_graph/chunking/__init__.py +12 -0
- agentforge_graph/chunking/cast.py +159 -0
- agentforge_graph/chunking/chunk.py +19 -0
- agentforge_graph/chunking/tokens.py +15 -0
- agentforge_graph/cli.py +607 -0
- agentforge_graph/config.py +259 -0
- agentforge_graph/core/__init__.py +54 -0
- agentforge_graph/core/conformance.py +270 -0
- agentforge_graph/core/contracts.py +163 -0
- agentforge_graph/core/kinds.py +68 -0
- agentforge_graph/core/models.py +134 -0
- agentforge_graph/core/provenance.py +62 -0
- agentforge_graph/core/symbols.py +116 -0
- agentforge_graph/embed/__init__.py +28 -0
- agentforge_graph/embed/base.py +22 -0
- agentforge_graph/embed/bedrock.py +85 -0
- agentforge_graph/embed/fake.py +34 -0
- agentforge_graph/embed/openai.py +67 -0
- agentforge_graph/embed/pipeline.py +184 -0
- agentforge_graph/embed/registry.py +66 -0
- agentforge_graph/embed/report.py +15 -0
- agentforge_graph/enrich/__init__.py +70 -0
- agentforge_graph/enrich/anthropic.py +38 -0
- agentforge_graph/enrich/anthropic_client.py +109 -0
- agentforge_graph/enrich/bedrock.py +24 -0
- agentforge_graph/enrich/bedrock_client.py +115 -0
- agentforge_graph/enrich/bedrock_summarizer.py +23 -0
- agentforge_graph/enrich/claude.py +172 -0
- agentforge_graph/enrich/enricher.py +108 -0
- agentforge_graph/enrich/governs.py +173 -0
- agentforge_graph/enrich/governs_enricher.py +152 -0
- agentforge_graph/enrich/heuristics.py +224 -0
- agentforge_graph/enrich/judge.py +63 -0
- agentforge_graph/enrich/registry.py +133 -0
- agentforge_graph/enrich/report.py +60 -0
- agentforge_graph/enrich/summarizer.py +62 -0
- agentforge_graph/enrich/summary_enricher.py +211 -0
- agentforge_graph/enrich/taxonomy.py +38 -0
- agentforge_graph/frameworks/__init__.py +29 -0
- agentforge_graph/frameworks/base.py +75 -0
- agentforge_graph/frameworks/detect.py +124 -0
- agentforge_graph/frameworks/extractor.py +63 -0
- agentforge_graph/frameworks/orm.py +93 -0
- agentforge_graph/frameworks/packs/_js_ast.py +56 -0
- agentforge_graph/frameworks/packs/_python_ast.py +157 -0
- agentforge_graph/frameworks/packs/django/__init__.py +240 -0
- agentforge_graph/frameworks/packs/django/models.scm +7 -0
- agentforge_graph/frameworks/packs/express/__init__.py +133 -0
- agentforge_graph/frameworks/packs/express/routes.scm +8 -0
- agentforge_graph/frameworks/packs/fastapi/__init__.py +210 -0
- agentforge_graph/frameworks/packs/fastapi/depends.scm +6 -0
- agentforge_graph/frameworks/packs/fastapi/routes.scm +10 -0
- agentforge_graph/frameworks/packs/flask/__init__.py +143 -0
- agentforge_graph/frameworks/packs/flask/routes.scm +11 -0
- agentforge_graph/frameworks/packs/nestjs/__init__.py +205 -0
- agentforge_graph/frameworks/packs/nestjs/routes.scm +6 -0
- agentforge_graph/frameworks/packs/spring/__init__.py +267 -0
- agentforge_graph/frameworks/packs/spring/routes.scm +6 -0
- agentforge_graph/frameworks/packs/sqlalchemy/__init__.py +250 -0
- agentforge_graph/frameworks/packs/sqlalchemy/models.scm +7 -0
- agentforge_graph/frameworks/registry.py +44 -0
- agentforge_graph/ingest/__init__.py +30 -0
- agentforge_graph/ingest/codegraph.py +847 -0
- agentforge_graph/ingest/extractor.py +353 -0
- agentforge_graph/ingest/incremental/__init__.py +25 -0
- agentforge_graph/ingest/incremental/detect.py +118 -0
- agentforge_graph/ingest/incremental/dirty.py +61 -0
- agentforge_graph/ingest/incremental/indexer.py +218 -0
- agentforge_graph/ingest/incremental/meta.py +72 -0
- agentforge_graph/ingest/incremental/ports.py +39 -0
- agentforge_graph/ingest/pack.py +160 -0
- agentforge_graph/ingest/packs/__init__.py +34 -0
- agentforge_graph/ingest/packs/cpp/__init__.py +35 -0
- agentforge_graph/ingest/packs/cpp/references.scm +15 -0
- agentforge_graph/ingest/packs/cpp/structure.scm +49 -0
- agentforge_graph/ingest/packs/csharp/__init__.py +35 -0
- agentforge_graph/ingest/packs/csharp/references.scm +12 -0
- agentforge_graph/ingest/packs/csharp/structure.scm +45 -0
- agentforge_graph/ingest/packs/go/__init__.py +38 -0
- agentforge_graph/ingest/packs/go/references.scm +12 -0
- agentforge_graph/ingest/packs/go/structure.scm +64 -0
- agentforge_graph/ingest/packs/java/__init__.py +35 -0
- agentforge_graph/ingest/packs/java/references.scm +12 -0
- agentforge_graph/ingest/packs/java/structure.scm +38 -0
- agentforge_graph/ingest/packs/javascript/__init__.py +34 -0
- agentforge_graph/ingest/packs/javascript/references.scm +11 -0
- agentforge_graph/ingest/packs/javascript/structure.scm +166 -0
- agentforge_graph/ingest/packs/php/__init__.py +35 -0
- agentforge_graph/ingest/packs/php/references.scm +15 -0
- agentforge_graph/ingest/packs/php/structure.scm +44 -0
- agentforge_graph/ingest/packs/python/__init__.py +25 -0
- agentforge_graph/ingest/packs/python/references.scm +14 -0
- agentforge_graph/ingest/packs/python/structure.scm +57 -0
- agentforge_graph/ingest/packs/ruby/__init__.py +37 -0
- agentforge_graph/ingest/packs/ruby/references.scm +12 -0
- agentforge_graph/ingest/packs/ruby/structure.scm +37 -0
- agentforge_graph/ingest/packs/rust/__init__.py +39 -0
- agentforge_graph/ingest/packs/rust/references.scm +12 -0
- agentforge_graph/ingest/packs/rust/structure.scm +46 -0
- agentforge_graph/ingest/packs/typescript/__init__.py +31 -0
- agentforge_graph/ingest/packs/typescript/references.scm +11 -0
- agentforge_graph/ingest/packs/typescript/structure.scm +99 -0
- agentforge_graph/ingest/pipeline.py +134 -0
- agentforge_graph/ingest/report.py +84 -0
- agentforge_graph/ingest/resolver.py +467 -0
- agentforge_graph/ingest/source.py +79 -0
- agentforge_graph/knowledge/__init__.py +28 -0
- agentforge_graph/knowledge/adr.py +136 -0
- agentforge_graph/knowledge/commits.py +152 -0
- agentforge_graph/knowledge/ingest.py +312 -0
- agentforge_graph/knowledge/mentions.py +71 -0
- agentforge_graph/knowledge/report.py +32 -0
- agentforge_graph/main.py +21 -0
- agentforge_graph/providers.py +36 -0
- agentforge_graph/repomap/__init__.py +14 -0
- agentforge_graph/repomap/rank.py +161 -0
- agentforge_graph/repomap/render.py +55 -0
- agentforge_graph/repomap/repomap.py +66 -0
- agentforge_graph/retrieve/__init__.py +21 -0
- agentforge_graph/retrieve/pack.py +76 -0
- agentforge_graph/retrieve/rerank.py +251 -0
- agentforge_graph/retrieve/retriever.py +286 -0
- agentforge_graph/retrieve/scoring.py +36 -0
- agentforge_graph/serve/__init__.py +19 -0
- agentforge_graph/serve/engine.py +204 -0
- agentforge_graph/serve/http_runner.py +133 -0
- agentforge_graph/serve/server.py +110 -0
- agentforge_graph/serve/tools.py +307 -0
- agentforge_graph/store/__init__.py +32 -0
- agentforge_graph/store/_rowmap.py +102 -0
- agentforge_graph/store/errors.py +22 -0
- agentforge_graph/store/facade.py +89 -0
- agentforge_graph/store/kuzu_store.py +380 -0
- agentforge_graph/store/lance_store.py +146 -0
- agentforge_graph/store/neo4j_store.py +294 -0
- agentforge_graph/store/pgvector_store.py +170 -0
- agentforge_graph/store/registry.py +45 -0
- agentforge_graph/temporal/__init__.py +36 -0
- agentforge_graph/temporal/backfill.py +338 -0
- agentforge_graph/temporal/events.py +82 -0
- agentforge_graph/temporal/index.py +190 -0
- agentforge_graph/temporal/mining.py +190 -0
- agentforge_graph/temporal/recorder.py +114 -0
- agentforge_graph/temporal/store.py +282 -0
- agentforge_graph-0.3.2.dist-info/METADATA +291 -0
- agentforge_graph-0.3.2.dist-info/RECORD +151 -0
- agentforge_graph-0.3.2.dist-info/WHEEL +4 -0
- agentforge_graph-0.3.2.dist-info/entry_points.txt +3 -0
- agentforge_graph-0.3.2.dist-info/licenses/LICENSE +202 -0
- agentforge_graph-0.3.2.dist-info/licenses/NOTICE +14 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""agentforge_graph.chunking — AST-aware (cAST) chunking of code symbols
|
|
2
|
+
into retrieval units linked back to the graph (feat-005). Deterministic;
|
|
3
|
+
imports nothing from ``agentforge`` (ADR-0001).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from .cast import CASTChunker, Chunker
|
|
9
|
+
from .chunk import Chunk
|
|
10
|
+
from .tokens import estimate_tokens
|
|
11
|
+
|
|
12
|
+
__all__ = ["Chunk", "Chunker", "CASTChunker", "estimate_tokens"]
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""``CASTChunker`` — AST-aware chunking via split-then-merge over the symbol
|
|
2
|
+
spans feat-002 already extracted (no re-parse). Partitions a file's lines
|
|
3
|
+
into contiguous chunks that honour symbol boundaries: a symbol that fits the
|
|
4
|
+
budget is never split and never fused with another; oversized symbols recurse
|
|
5
|
+
into nested children (a class → per-method chunks) and finally line windows;
|
|
6
|
+
small inter-symbol gaps (imports, module code) merge up to the budget.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
from abc import ABC, abstractmethod
|
|
13
|
+
|
|
14
|
+
from agentforge_graph.core import Node, SourceFile, SymbolID
|
|
15
|
+
|
|
16
|
+
from .chunk import Chunk
|
|
17
|
+
from .tokens import estimate_tokens
|
|
18
|
+
|
|
19
|
+
_Range = tuple[int, int]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Chunker(ABC):
    """Abstract interface for turning one source file into retrieval chunks."""

    @abstractmethod
    def chunk(self, file: SourceFile, symbols: list[Node]) -> list[Chunk]:
        """Return the chunks for ``file``.

        Args:
            file: The source file to split.
            symbols: The file's symbol nodes; their spans drive the split.

        Returns:
            The chunks produced for the file.
        """
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class CASTChunker(Chunker):
    """Split-then-merge chunker driven by already-extracted symbol line spans.

    A file's lines are partitioned into 1-based inclusive line ranges: a
    symbol whose source fits ``max_tokens`` becomes a single range; a larger
    symbol is broken up along its directly nested symbols and, failing that,
    into line windows; neighbouring ranges that touch no symbol are merged
    while the result still fits ``max_tokens``. Sizes are measured with
    ``estimate_tokens``.
    """

    def __init__(self, max_tokens: int = 512, min_tokens: int = 64) -> None:
        """Store the token budgets.

        Args:
            max_tokens: Upper bound consulted when splitting and merging.
            min_tokens: Stored and mixed into every chunk's ``content_hash``;
                no method of this class reads it for anything else.
        """
        self.max_tokens = max_tokens
        self.min_tokens = min_tokens

    def chunk(self, file: SourceFile, symbols: list[Node]) -> list[Chunk]:
        """Partition ``file`` into chunks that respect symbol boundaries.

        Args:
            file: Source file; its ``text``, ``path`` and ``language`` are read.
            symbols: Symbol nodes of the file. Nodes without a span are ignored.

        Returns:
            Chunks in file order, numbered by ``seq``. Empty when the file has
            no lines or no symbol carries a span.
        """
        lines = file.text.splitlines()
        n = len(lines)
        spanned = [s for s in symbols if s.span is not None]
        if not spanned or n == 0:
            return []
        repo = SymbolID.parse(spanned[0].id).repo
        lang = file.language

        # Top-level symbols are those not strictly contained in another one.
        toplevel = [s for s in spanned if not any(_contains(o, s) for o in spanned)]
        toplevel.sort(key=lambda s: _span(s)[0])

        ranges: list[_Range] = []
        self._emit_sequence(1, n, toplevel, lines, spanned, ranges)

        # Whitespace-only ranges carry nothing worth embedding.
        ranges = [(a, b) for (a, b) in ranges if self._slice(lines, a, b).strip()]
        ranges = self._merge_gaps(ranges, lines, spanned)

        chunks: list[Chunk] = []
        for seq, (a, b) in enumerate(ranges):
            code = self._slice(lines, a, b)
            sym_ids = [s.id for s in spanned if _overlaps((a, b), _span(s))]
            text = f"{file.path} | {self._qualify(sym_ids)}\n{code}"
            # The chunker parameters are part of the key, so changing the
            # budgets yields a different hash for the same text.
            content_hash = hashlib.sha256(
                f"{text}|{self.max_tokens}|{self.min_tokens}".encode()
            ).hexdigest()
            chunks.append(
                Chunk(
                    id=SymbolID.for_symbol(lang, repo, file.path, f"chunk({seq})."),
                    text=text,
                    code=code,
                    token_count=estimate_tokens(code),
                    path=file.path,
                    span=(a, b),
                    content_hash=content_hash,
                    symbol_ids=sym_ids,
                    seq=seq,
                )
            )
        return chunks

    # --- range production -----------------------------------------------

    def _emit_sequence(
        self,
        start: int,
        end: int,
        ordered: list[Node],
        lines: list[str],
        symbols: list[Node],
        out: list[_Range],
    ) -> None:
        """Emit ranges for lines ``start..end`` given sibling symbols in order.

        Lines between siblings go through ``_window``; each sibling goes
        through ``_emit_symbol``. Siblings sharing lines (identical spans, or
        one ending on the line where the next begins) neither contain one
        another nor are disjoint, so the cursor is enforced here: a sibling
        already fully covered is skipped and a partially covered one starts at
        the cursor. Without this the same lines would be emitted twice.
        """
        cursor = start
        for sym in ordered:
            sym_start, sym_end = _span(sym)
            if sym_end < cursor:
                continue  # every line of this symbol is already in a range
            sym_start = max(sym_start, cursor)
            if sym_start > cursor:
                self._window(cursor, sym_start - 1, lines, out)  # gap before symbol
            self._emit_symbol(sym_start, sym_end, lines, symbols, out)
            cursor = sym_end + 1
        if cursor <= end:
            self._window(cursor, end, lines, out)

    def _emit_symbol(
        self, start: int, end: int, lines: list[str], symbols: list[Node], out: list[_Range]
    ) -> None:
        """Append ranges covering lines ``start..end`` of one symbol to ``out``.

        A span within budget is appended whole. Otherwise the span is divided
        along the symbols nested directly inside it (recursively), with the
        lines around them windowed; a span with no nested symbol is windowed.
        """
        if estimate_tokens(self._slice(lines, start, end)) <= self.max_tokens:
            out.append((start, end))
            return
        within = [
            s
            for s in symbols
            if start <= _span(s)[0] and _span(s)[1] <= end and _span(s) != (start, end)
        ]
        # Keep only the outermost nested symbols; deeper ones are reached by
        # the recursion.
        direct = [c for c in within if not any(o is not c and _contains(o, c) for o in within)]
        direct.sort(key=lambda s: _span(s)[0])
        if not direct:  # leaf symbol still too big -> line windows
            self._window(start, end, lines, out)
            return
        self._emit_sequence(start, end, direct, lines, symbols, out)

    def _window(self, start: int, end: int, lines: list[str], out: list[_Range]) -> None:
        """Append greedy line windows covering ``start..end`` to ``out``.

        A window is closed just before the line that would push it over
        ``max_tokens``. Lines are never split, so a single line above the
        budget forms a window of its own.
        """
        acc = start
        for ln in range(start, end + 1):
            if ln > acc and estimate_tokens(self._slice(lines, acc, ln)) > self.max_tokens:
                out.append((acc, ln - 1))
                acc = ln
        out.append((acc, end))

    def _merge_gaps(
        self, ranges: list[_Range], lines: list[str], symbols: list[Node]
    ) -> list[_Range]:
        """Fuse consecutive gap ranges while the fused range fits the budget.

        Args:
            ranges: Ranges in file order.
            lines: The file's lines.
            symbols: Spanned symbols used to tell gaps from symbol ranges.

        Returns:
            A new list of ranges; ranges touching a symbol are passed through.
        """
        spans = [_span(s) for s in symbols]  # looked up once, not per range

        def is_gap(r: _Range) -> bool:
            # a gap overlaps no symbol — so function-body windows are NOT gaps
            return not any(_overlaps(r, sp) for sp in spans)

        out: list[_Range] = []
        for r in ranges:
            if out and is_gap(out[-1]) and is_gap(r):
                merged = (out[-1][0], r[1])
                if estimate_tokens(self._slice(lines, *merged)) <= self.max_tokens:
                    out[-1] = merged
                    continue
            out.append(r)
        return out

    # --- helpers --------------------------------------------------------

    @staticmethod
    def _slice(lines: list[str], a: int, b: int) -> str:
        """Return lines ``a..b`` (1-based, inclusive) joined with newlines."""
        return "\n".join(lines[a - 1 : b])

    @staticmethod
    def _qualify(symbol_ids: list[str]) -> str:
        """Return the descriptor of the first symbol id, or ``"module"``.

        ``"module"`` is used when there are no ids or the descriptor is empty.
        """
        if not symbol_ids:
            return "module"
        return SymbolID.parse(symbol_ids[0]).descriptor or "module"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _span(node: Node) -> tuple[int, int]:
|
|
149
|
+
assert node.span is not None
|
|
150
|
+
return node.span
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _contains(outer: Node, inner: Node) -> bool:
    """Tell whether ``outer``'s span strictly encloses ``inner``'s span.

    Identical spans do not count as containment.
    """
    outer_span = _span(outer)
    inner_span = _span(inner)
    starts_inside = outer_span[0] <= inner_span[0]
    ends_inside = inner_span[1] <= outer_span[1]
    return starts_inside and ends_inside and outer_span != inner_span
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _overlaps(a: tuple[int, int], b: tuple[int, int]) -> bool:
|
|
159
|
+
return not (a[1] < b[0] or b[1] < a[0])
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""The ``Chunk`` value — a retrieval artifact distinct from the symbol nodes
|
|
2
|
+
it covers (the chunk↔symbol separation that lets a vector hit expand into the
|
|
3
|
+
graph; feat-006). Converts to a ``CHUNK`` node + ``CHUNK_OF`` edges."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Chunk(BaseModel):
    """One retrieval unit cut from a source file, with links to its symbols."""

    # SymbolID with a chunk(<seq>). descriptor on the file path
    id: str
    # embedding text: "<path> | <symbol>\n<code>"
    text: str
    # raw source slice (for display)
    code: str
    token_count: int
    path: str
    # 1-based inclusive line range
    span: tuple[int, int]
    # sha256(text + chunker params) — the vector key
    content_hash: str
    # CHUNK_OF targets
    symbol_ids: list[str] = Field(default_factory=list)
    # order within the file
    seq: int
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Token budgeting for the chunker.
|
|
2
|
+
|
|
3
|
+
A fast, model-independent heuristic — exactness doesn't matter, only that
|
|
4
|
+
budgeting and the boundary tests use the *same* estimate. A real tokenizer
|
|
5
|
+
is a drop-in replacement behind this function (ADR-0007 risk note).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def estimate_tokens(text: str) -> int:
    """Estimate the token count of ``text``.

    The estimate is the larger of one token per four characters and one token
    per whitespace-separated word. Empty or whitespace-only text counts as 0.
    """
    words = text.split()
    if not words:
        # No words means the text is empty or whitespace only.
        return 0
    # With at least one word the result is already >= 1.
    return max(len(text) // 4, len(words))
|