agentforge-core 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentforge_core/__init__.py +228 -0
- agentforge_core/_bm25.py +132 -0
- agentforge_core/config/__init__.py +62 -0
- agentforge_core/config/loader.py +239 -0
- agentforge_core/config/module_schemas.py +208 -0
- agentforge_core/config/schema.py +424 -0
- agentforge_core/contracts/__init__.py +52 -0
- agentforge_core/contracts/auth.py +33 -0
- agentforge_core/contracts/chat.py +118 -0
- agentforge_core/contracts/embedding.py +71 -0
- agentforge_core/contracts/evaluator.py +56 -0
- agentforge_core/contracts/finding.py +39 -0
- agentforge_core/contracts/graph_store.py +180 -0
- agentforge_core/contracts/guardrails.py +129 -0
- agentforge_core/contracts/llm.py +152 -0
- agentforge_core/contracts/memory.py +113 -0
- agentforge_core/contracts/migrator.py +120 -0
- agentforge_core/contracts/renderer.py +57 -0
- agentforge_core/contracts/reranker.py +91 -0
- agentforge_core/contracts/strategy.py +70 -0
- agentforge_core/contracts/task.py +73 -0
- agentforge_core/contracts/tool.py +71 -0
- agentforge_core/contracts/vector_store.py +151 -0
- agentforge_core/migrations/__init__.py +14 -0
- agentforge_core/migrations/discover.py +77 -0
- agentforge_core/migrations/template.py +34 -0
- agentforge_core/observability/__init__.py +18 -0
- agentforge_core/observability/tracing.py +37 -0
- agentforge_core/production/__init__.py +77 -0
- agentforge_core/production/budget.py +134 -0
- agentforge_core/production/exceptions.py +136 -0
- agentforge_core/production/fallback.py +321 -0
- agentforge_core/production/log_filter.py +49 -0
- agentforge_core/production/log_format.py +117 -0
- agentforge_core/production/run_context.py +108 -0
- agentforge_core/py.typed +0 -0
- agentforge_core/resolver/__init__.py +38 -0
- agentforge_core/resolver/discover.py +145 -0
- agentforge_core/resolver/resolve.py +168 -0
- agentforge_core/testing/__init__.py +45 -0
- agentforge_core/testing/conformance.py +1138 -0
- agentforge_core/values/__init__.py +103 -0
- agentforge_core/values/auth.py +20 -0
- agentforge_core/values/chat.py +131 -0
- agentforge_core/values/claim.py +30 -0
- agentforge_core/values/graph.py +136 -0
- agentforge_core/values/guardrails.py +49 -0
- agentforge_core/values/manifest.py +129 -0
- agentforge_core/values/messages.py +153 -0
- agentforge_core/values/module.py +40 -0
- agentforge_core/values/pipeline.py +43 -0
- agentforge_core/values/retrieval.py +53 -0
- agentforge_core/values/state.py +118 -0
- agentforge_core/values/vector.py +59 -0
- agentforge_core-0.2.1.dist-info/METADATA +66 -0
- agentforge_core-0.2.1.dist-info/RECORD +58 -0
- agentforge_core-0.2.1.dist-info/WHEEL +4 -0
- agentforge_core-0.2.1.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,1138 @@
|
|
|
1
|
+
"""Conformance suites for `agentforge-core` ABCs.
|
|
2
|
+
|
|
3
|
+
Every shipped or third-party driver must pass these suites. They are
|
|
4
|
+
exposed as functions (not pytest collections) so they can be invoked
|
|
5
|
+
from any test runner by passing in a ready-to-use store / client.
|
|
6
|
+
|
|
7
|
+
Usage in a driver's tests:
|
|
8
|
+
|
|
9
|
+
import pytest
|
|
10
|
+
from agentforge_core.testing import run_memory_conformance
|
|
11
|
+
from my_pkg import MyMemoryStore
|
|
12
|
+
|
|
13
|
+
@pytest.mark.asyncio
|
|
14
|
+
async def test_my_driver_conforms() -> None:
|
|
15
|
+
async with MyMemoryStore.from_url("...") as store:
|
|
16
|
+
await run_memory_conformance(store)
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import itertools
|
|
22
|
+
import math
|
|
23
|
+
import typing
|
|
24
|
+
from collections.abc import AsyncIterator, Awaitable, Callable
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
from agentforge_core.contracts.chat import ChatHistoryStore, HistoryTruncationStrategy
|
|
28
|
+
from agentforge_core.contracts.embedding import EmbeddingClient
|
|
29
|
+
from agentforge_core.contracts.graph_store import GraphStore
|
|
30
|
+
from agentforge_core.contracts.guardrails import (
|
|
31
|
+
InputValidator,
|
|
32
|
+
OutputValidator,
|
|
33
|
+
ToolCallGate,
|
|
34
|
+
)
|
|
35
|
+
from agentforge_core.contracts.memory import MemoryStore
|
|
36
|
+
from agentforge_core.contracts.reranker import Reranker
|
|
37
|
+
from agentforge_core.contracts.strategy import ReasoningStrategy
|
|
38
|
+
from agentforge_core.contracts.task import Task
|
|
39
|
+
from agentforge_core.contracts.tool import Tool
|
|
40
|
+
from agentforge_core.contracts.vector_store import VectorStore
|
|
41
|
+
from agentforge_core.values.claim import Claim
|
|
42
|
+
from agentforge_core.values.graph import (
|
|
43
|
+
GraphEdge,
|
|
44
|
+
GraphNode,
|
|
45
|
+
GraphPattern,
|
|
46
|
+
GraphSegment,
|
|
47
|
+
)
|
|
48
|
+
from agentforge_core.values.guardrails import ValidationResult
|
|
49
|
+
from agentforge_core.values.state import AgentState, StepKind
|
|
50
|
+
from agentforge_core.values.vector import VectorItem, VectorMatch
|
|
51
|
+
|
|
52
|
+
# Computed once at import time from the type arguments of `StepKind`, so the
# strategy suite can do a set-membership check on every emitted `step.kind`.
# NOTE(review): presumably `StepKind` is a `Literal[...]` alias of strings —
# confirm in `agentforge_core.values.state`.
_VALID_STEP_KINDS: frozenset[str] = frozenset(typing.get_args(StepKind))
"""Closed enum mirror of `StepKind`. Used by `run_strategy_conformance`."""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _claim(
    *,
    project: str = "p1",
    agent: str = "a1",
    run_id: str = "run-x",
    category: str = "finding",
    payload: dict[str, object] | None = None,
) -> Claim:
    """Build a `Claim` fixture for the memory conformance cases.

    Every argument is keyword-only and defaulted, so a test case only spells
    out the one field it wants to vary.

    Args:
        project: Value for `Claim.project`.
        agent: Value for `Claim.agent`.
        run_id: Value for `Claim.run_id`.
        category: Value for `Claim.category`.
        payload: Value for `Claim.payload`; when `None`, a fresh `{"v": 1}`
            dict is used instead.

    Returns:
        A newly constructed `Claim`.
    """
    # Built per call (not as a default argument) so no dict is shared
    # between claims.
    effective_payload = {"v": 1} if payload is None else payload
    return Claim(
        run_id=run_id,
        project=project,
        agent=agent,
        category=category,
        payload=effective_payload,
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
async def _collect(it: AsyncIterator[Claim]) -> list[Claim]:
|
|
74
|
+
return [c async for c in it]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# Each `delete()` scenario in `_run_delete_conformance` writes exactly two
# claims that match the filter it then deletes by, so this is the count the
# store is expected to report back.
_EXPECTED_DELETE_COUNT = 2
"""Magic-number constant for the `delete()` conformance cases."""

# NOTE(review): the two chat-history constants below are consumed by a suite
# further down the file — presumably the expected number of turns for two
# different session ids; confirm against that suite.
_EXPECTED_CHAT_TURNS_SID = 2
"""Magic-number constant for the chat-history conformance cases."""

_EXPECTED_CHAT_TURNS_SID_B = 1
"""Magic-number constant for the chat-history conformance cases."""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
async def _run_delete_conformance(store: MemoryStore) -> None:
    """Exercise the feat-017 `MemoryStore.delete()` contract against `store`.

    Kept separate from `run_memory_conformance` so that function stays under
    ruff's PLR0915 statement cap.

    Checks, in order:

    1. `delete()` with no filters raises `ModuleError` (defence against a
       silent total wipe).
    2. `delete(run_id=...)` reports the number of claims removed and leaves
       claims from other runs in place.
    3. `delete(category=...)` reports the number of claims removed, empties
       that category, and leaves other categories in place.

    The surviving "keeper" claims written by cases 2 and 3 are not removed
    afterwards.

    Args:
        store: The store under test.

    Raises:
        AssertionError: a contract was violated.
    """
    from agentforge_core.production.exceptions import ModuleError  # noqa: PLC0415

    # No-filter refuses (defence against silent total wipe).
    try:
        await store.delete()
    except ModuleError:
        pass
    else:
        raise AssertionError("delete() with no filters must raise ModuleError")

    # delete(run_id=...) only removes matching claims; accurate count.
    purge_run = "run-purge"
    keeper_run = "run-keep"
    await store.put(_claim(run_id=purge_run, category="purge-me"))
    await store.put(_claim(run_id=purge_run, category="purge-me"))
    await store.put(_claim(run_id=keeper_run, category="purge-me"))
    removed_run = await store.delete(run_id=purge_run)
    assert removed_run == _EXPECTED_DELETE_COUNT, (
        f"delete(run_id={purge_run!r}) must return count; got {removed_run}"
    )
    remaining = await store.query(category="purge-me")
    # `all()` is vacuously true on an empty list, so a store that wiped the
    # keeper as well would slip through without this presence check.
    assert remaining, "delete(run_id=...) must not remove claims from other runs"
    assert all(c.run_id == keeper_run for c in remaining), (
        "delete(run_id=...) must leave non-matching claims behind"
    )

    # delete(category=...) clears the whole category.
    cat_marker = "ephemeral-step"
    await store.put(_claim(category=cat_marker))
    await store.put(_claim(category=cat_marker))
    await store.put(_claim(category="something-else"))
    removed_cat = await store.delete(category=cat_marker)
    assert removed_cat == _EXPECTED_DELETE_COUNT, (
        f"delete(category={cat_marker!r}) must report accurate count; got {removed_cat}"
    )
    after_cat = await store.query(category=cat_marker)
    assert after_cat == [], "delete(category=...) must clear all claims of that category"
    # The unrelated category was written precisely so over-deletion is
    # detectable; verify it is still there.
    untouched = await store.query(category="something-else")
    assert untouched, "delete(category=...) must not remove claims of other categories"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
async def run_memory_conformance(store: MemoryStore) -> None:
    """Run the full MemoryStore conformance suite against `store`.

    The store should be empty when this is called. The suite does NOT clean
    up after itself: the claims written by the numbered cases below stay in
    the store, and only the `delete()` cases remove (some of) what they
    write. Callers that reuse the store must reset it themselves.

    Cases, in order: put/get round-trip, `get` of an unknown id, unfiltered
    `query`, `query` filtered by project / agent / category / run_id,
    `query(limit=...)`, `supersede`, `stream`, `capabilities` / `supports`,
    and finally the `delete()` cases in `_run_delete_conformance`.

    Args:
        store: The store under test.

    Raises:
        AssertionError: a contract was violated.
    """
    # 1. put + get roundtrip
    c1 = _claim(category="finding")
    cid = await store.put(c1)
    assert cid == c1.id, "put() must return the claim's id"
    fetched = await store.get(cid)
    assert fetched is not None, "get() must return the persisted claim"
    assert fetched.id == c1.id

    # 2. get returns None for unknown id
    missing = await store.get("01HX-NONEXISTENT")
    assert missing is None, "get() of an unknown id must return None"

    # 3. query with no filters returns at least the claim we put
    all_results = await store.query()
    assert any(c.id == cid for c in all_results), (
        "query() with no filters must include the put claim"
    )

    # 4. query filters by project
    other_project = _claim(project="other-project")
    await store.put(other_project)
    only_p1 = await store.query(project="p1")
    assert any(c.id == cid for c in only_p1)
    assert all(c.project == "p1" for c in only_p1), (
        "query(project=...) must filter results to that project"
    )

    # 5-7 mirror case 4: besides excluding non-matching claims, the result
    # must actually contain the matching claim `c1` — otherwise a store that
    # returns nothing for any filter would pass the `all(...)` checks.

    # 5. query filters by agent
    other_agent = _claim(agent="other-agent")
    await store.put(other_agent)
    only_a1 = await store.query(agent="a1")
    assert any(c.id == cid for c in only_a1), "query(agent=...) must include matching claims"
    assert all(c.agent == "a1" for c in only_a1), (
        "query(agent=...) must filter results to that agent"
    )

    # 6. query filters by category
    decision = _claim(category="decision")
    await store.put(decision)
    only_findings = await store.query(category="finding")
    assert any(c.id == cid for c in only_findings), (
        "query(category=...) must include matching claims"
    )
    assert all(c.category == "finding" for c in only_findings), (
        "query(category=...) must filter results to that category"
    )

    # 7. query filters by run_id
    other_run = _claim(run_id="run-y")
    await store.put(other_run)
    only_run_x = await store.query(run_id="run-x")
    assert any(c.id == cid for c in only_run_x), (
        "query(run_id=...) must include matching claims"
    )
    assert all(c.run_id == "run-x" for c in only_run_x), (
        "query(run_id=...) must filter results to that run_id"
    )

    # 8. query respects limit
    limited = await store.query(limit=1)
    assert len(limited) <= 1, "query(limit=N) must return at most N claims"

    # 9. supersede chains old → new
    new_claim = _claim(payload={"v": 2})
    new_id = await store.supersede(cid, new_claim)
    assert new_id == new_claim.id
    refetched = await store.get(new_id)
    assert refetched is not None
    assert refetched.supersedes == cid, "supersede() must set supersedes link on the new claim"

    # 10. stream yields claims
    streamed = await _collect(store.stream(project="p1"))
    assert len(streamed) >= 1, "stream() must yield matching claims"
    assert all(c.project == "p1" for c in streamed)

    # 11. capabilities() returns a set
    caps = store.capabilities()
    assert isinstance(caps, set)

    # 12. supports() reflects capabilities()
    if caps:
        sample = next(iter(caps))
        assert store.supports(sample) is True
    assert store.supports("definitely-not-a-capability-2026") is False

    # 13-15. delete() — feat-017. Tested separately so the main
    # conformance function stays under PLR0915's statement cap.
    await _run_delete_conformance(store)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# ----------------------------------------------------------------------
|
|
222
|
+
# Strategy conformance — feat-002.
|
|
223
|
+
# ----------------------------------------------------------------------
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
async def run_strategy_conformance(
    strategy: ReasoningStrategy,
    *,
    state_factory: Callable[[], AgentState],
    pre_run: Callable[[AgentState], None | Awaitable[None]] | None = None,
) -> None:
    """Run the shared `ReasoningStrategy` conformance suite.

    Builds one state via `state_factory`, optionally lets `pre_run` prepare
    it, calls `strategy.run(state)` once and then checks the result.

    Args:
        strategy: A constructed strategy instance.
        state_factory: Builds the `AgentState` handed to the strategy (with
            `RuntimeContext` bound on `state.metadata` if the strategy needs
            one — the framework runtime does this; tests must do it
            explicitly).
        pre_run: Optional sync or async callable invoked on the freshly built
            `AgentState` before `strategy.run()` (e.g. to seed findings or
            steps). If it returns an awaitable, that awaitable is awaited.

    Verifies these invariants of `ReasoningStrategy.run`:

    1. Returns the same `AgentState` instance it was given.
    2. Populates `state.steps` with at least one step.
    3. Every emitted step's `kind` is a valid `StepKind` value.
    4. `step.iteration` is monotonically non-decreasing across the run.
    5. Every emitted step has non-negative `tokens_in`, `tokens_out`,
       `cost_usd`, `duration_ms`.

    Raises:
        AssertionError: a contract was violated.
    """
    import inspect  # noqa: PLC0415

    state = state_factory()
    if pre_run is not None:
        outcome = pre_run(state)
        # `inspect.isawaitable` also recognises generator-based coroutines,
        # which a bare `hasattr(outcome, "__await__")` check would miss.
        if inspect.isawaitable(outcome):
            await outcome

    result = await strategy.run(state)

    # 1. Returns the same instance
    assert result is state, (
        "ReasoningStrategy.run must return the same AgentState instance "
        "it received (state mutation, not replacement)."
    )

    # 2. Populates state.steps
    assert len(state.steps) >= 1, (
        "ReasoningStrategy.run must append at least one Step to state.steps before returning."
    )

    # 3. Every step.kind is valid
    for step in state.steps:
        assert step.kind in _VALID_STEP_KINDS, (
            f"step.kind={step.kind!r} is not a valid StepKind. "
            f"Valid kinds: {sorted(_VALID_STEP_KINDS)}"
        )

    # 4. step.iteration monotonic non-decreasing. The -1 seed means the
    # first step passes for any iteration >= -1.
    last_iter = -1
    for step in state.steps:
        assert step.iteration >= last_iter, (
            f"step.iteration must be monotonically non-decreasing; "
            f"saw {step.iteration} after {last_iter}."
        )
        last_iter = step.iteration

    # 5. Non-negative cost / token / duration fields (Pydantic
    # already enforces ge=0; this is defence-in-depth)
    for step in state.steps:
        assert step.tokens_in >= 0, "step.tokens_in must be non-negative"
        assert step.tokens_out >= 0, "step.tokens_out must be non-negative"
        assert step.cost_usd >= 0.0, "step.cost_usd must be non-negative"
        assert step.duration_ms >= 0, "step.duration_ms must be non-negative"
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# ----------------------------------------------------------------------
|
|
302
|
+
# Embedding conformance — feat-003.
|
|
303
|
+
# ----------------------------------------------------------------------
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
async def run_embedding_conformance(client: EmbeddingClient) -> None:
    """Run the shared `EmbeddingClient` conformance suite.

    Checks performed, in order:

    1. `dimensions()` returns an `int` that is at least 1. (The contract
       also expects this to work without a network round-trip; the suite
       cannot observe that part.)
    2. `embed([])` raises `ValueError`.
    3. `embed(texts)` returns exactly one vector per input text.
    4. Every returned vector has length `dimensions()`, and
       `response.dimensions` agrees with it.
    5. `usage.input_tokens >= 0` and `usage.output_tokens == 0`.
    6. `cost_usd >= 0`.
    7. `model` and `provider` are truthy (non-empty).
    8. `supports()` returns `False` for a made-up capability name.

    The helper is async because drivers may issue a real (or mocked)
    network call. Arranging fixtures (e.g. injecting a fake AWS session) is
    the calling test's job.

    Args:
        client: A constructed `EmbeddingClient` instance, ready to use.

    Raises:
        AssertionError: a contract was violated.
    """
    # 1. declared vector width
    width = client.dimensions()
    assert isinstance(width, int), "dimensions() must return an int"
    assert width >= 1, f"dimensions() must be >= 1, got {width}"

    # 2. an empty batch must be rejected with ValueError
    rejected_empty_batch = False
    try:
        await client.embed([])
    except ValueError:
        rejected_empty_batch = True
    assert rejected_empty_batch, "embed([]) must raise ValueError on empty input"

    # 3-7. one real round-trip, then inspect the response
    inputs = ["hello", "world", "agentforge"]
    reply = await client.embed(inputs)
    vector_count = len(reply.vectors)
    input_count = len(inputs)
    assert vector_count == input_count, (
        f"embed() must return one vector per input text; "
        f"got {vector_count} vectors for {input_count} texts."
    )
    for position, vector in enumerate(reply.vectors):
        actual = len(vector)
        assert actual == width, (
            f"vector {position} has length {actual} but dimensions() declared {width}"
        )
    assert reply.dimensions == width, (
        f"response.dimensions ({reply.dimensions}) must match client.dimensions() ({width})"
    )
    usage = reply.usage
    assert usage.input_tokens >= 0
    assert usage.output_tokens == 0, (
        f"embedding responses must report output_tokens=0; got {usage.output_tokens}."
    )
    assert reply.cost_usd >= 0.0
    assert reply.model, "EmbeddingResponse.model must be non-empty"
    assert reply.provider, "EmbeddingResponse.provider must be non-empty"

    # 8. unknown capability names must be reported as unsupported
    assert client.supports("definitely-not-a-capability-2026") is False
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# ----------------------------------------------------------------------
|
|
374
|
+
# Vector store conformance — feat-007.
|
|
375
|
+
# ----------------------------------------------------------------------
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
async def run_vector_conformance(store: VectorStore) -> None:
    """Run the shared `VectorStore` conformance suite.

    The store must be empty when this is called; the three items upserted
    here are deleted again before the function returns.

    Verifies these invariants of `VectorStore`:

    1. `dimensions()` returns an `int` that is at least 1.
    2. `upsert` of an item whose vector length differs from `dimensions()`
       raises `ValueError`.
    3. `search` returns matches sorted by score descending, with scores in
       `[0, 1]`.
    4. `search`'s top hit on a query identical to an upserted vector is
       that item, with score within 1e-3 of 1.0.
    5. `upsert` is write-through: re-upserting an existing id produces no
       duplicate ids in search results.
    6. `delete` returns the count of items actually removed; unknown ids
       are silently dropped, and an empty id list returns 0.
    7. `filter_metadata` restricts results to matching items and returns at
       least one of them.
    8. `search(limit=0)` raises `ValueError`.
    9. `supports()` returns `False` for a made-up capability name.

    Drivers may issue real network calls; the suite is async. Tests are
    responsible for arranging fixtures (e.g. running Postgres) before
    calling this helper.

    Args:
        store: The store under test.

    Raises:
        AssertionError: a contract was violated.
    """
    _ITEM_COUNT = 3  # noqa: N806 — local constant in this function only

    # 1. declared dimensionality
    dim = store.dimensions()
    assert isinstance(dim, int), "dimensions() must return an int"
    assert dim >= 1, f"dimensions() must be >= 1, got {dim}"

    # 2. dimension-mismatch on upsert
    bad = VectorItem(id="bad", vector=tuple([0.1] * (dim + 1)), text="bad", metadata={})
    raised_dim_error = False
    try:
        await store.upsert([bad])
    except ValueError:
        raised_dim_error = True
    assert raised_dim_error, "upsert with mismatched vector length must raise ValueError"

    # 3-5. happy-path upsert + search
    # NOTE: `_unit_vector` puts its peak at index `seed % dim`, so for
    # dim < 3 at least two of these items share the same vector and the
    # exact-match-first assertion below can tie.
    items = [
        VectorItem(
            id=f"id-{i}",
            vector=tuple(_unit_vector(dim, seed=i)),
            text=f"text {i}",
            metadata={"category": "doc" if i < 2 else "note", "n": i},  # noqa: PLR2004
        )
        for i in range(_ITEM_COUNT)
    ]
    await store.upsert(items)

    # Searching with the same vector as item-0 should put item-0 first.
    results = await store.search(items[0].vector, limit=_ITEM_COUNT)
    assert len(results) == _ITEM_COUNT, f"expected {_ITEM_COUNT} results, got {len(results)}"
    # Sorted by score descending, all in [0, 1]
    for prev, nxt in itertools.pairwise(results):
        assert prev.score >= nxt.score, f"results not sorted desc: {prev.score} before {nxt.score}"
    for r in results:
        assert 0.0 <= r.score <= 1.0, f"score out of range: {r.score}"
    assert results[0].id == "id-0", (
        f"top result must be the exact-match upsert, got {results[0].id!r}"
    )
    score_tolerance = 1e-3
    assert abs(results[0].score - 1.0) < score_tolerance, (
        f"exact-match score must be ~1.0, got {results[0].score}"
    )

    # 5. write-through: replace id-0 and search again
    replacement = VectorItem(
        id="id-0",
        vector=tuple(_unit_vector(dim, seed=99)),
        text="replaced",
        metadata={"category": "doc", "n": 0},
    )
    await store.upsert([replacement])
    after = await store.search(items[0].vector, limit=10)
    # No two results may share an id.
    seen_ids = [r.id for r in after]
    assert len(seen_ids) == len(set(seen_ids)), (
        f"upsert must replace prior records, but got duplicate ids: {seen_ids}"
    )

    # 7. metadata filtering
    filtered = await store.search(items[0].vector, limit=10, filter_metadata={"category": "doc"})
    # id-0 and id-1 both carry category="doc"; without this check a store
    # that returns nothing for any filter would pass the loop vacuously.
    assert filtered, "filter_metadata broken: no results for a filter that matches stored items"
    for r in filtered:
        assert r.metadata.get("category") == "doc", (
            f"filter_metadata broken: returned {r.metadata!r}"
        )

    # 8. limit < 1 raises
    raised_limit_error = False
    try:
        await store.search(items[0].vector, limit=0)
    except ValueError:
        raised_limit_error = True
    assert raised_limit_error, "search(limit=0) must raise ValueError"

    # 6. delete: known + unknown ids
    deleted = await store.delete([item.id for item in items] + ["never-existed"])
    assert deleted == _ITEM_COUNT, (
        f"delete should report {_ITEM_COUNT} actual removals "
        f"(the {_ITEM_COUNT} we upserted), got {deleted}"
    )
    # Empty list returns 0
    assert await store.delete([]) == 0

    # 9. supports honesty
    assert store.supports("definitely-not-a-capability-2026") is False
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def _unit_vector(dim: int, *, seed: int) -> list[float]:
|
|
495
|
+
"""Build a deterministic unit vector for conformance tests.
|
|
496
|
+
|
|
497
|
+
Returns a one-hot-like vector with the seed-th component set high
|
|
498
|
+
and a small uniform background, then L2-normalised so cosine
|
|
499
|
+
similarity computations are stable across drivers.
|
|
500
|
+
"""
|
|
501
|
+
raw = [0.01] * dim
|
|
502
|
+
raw[seed % dim] = 1.0
|
|
503
|
+
norm = math.sqrt(sum(x * x for x in raw))
|
|
504
|
+
return [x / norm for x in raw]
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
# ----------------------------------------------------------------------
|
|
508
|
+
# Graph store conformance — feat-009.
|
|
509
|
+
# ----------------------------------------------------------------------
|
|
510
|
+
|
|
511
|
+
# Named constants used by the graph conformance suite. Kept module-
|
|
512
|
+
# private; they're only meaningful inside the assertions below.
|
|
513
|
+
_GRAPH_PATH_LEN_TWO = 2  # (n0)-[e]->(n1) — one segment = two nodes
_GRAPH_DEPTH_TWO = 2  # paper:3 -> paper:2 -> paper:1
# `year` property written by the idempotent `add_node` re-upsert check and
# then read back to prove the properties were replaced.
_EXPECTED_YEAR = 2017
|
|
516
|
+
|
|
517
|
+
|
|
518
|
+
async def run_graph_conformance(store: GraphStore) -> None:
    """Run the shared `GraphStore` conformance suite.

    The store must be empty on entry and is intended to be empty again on
    return (removing the nodes and edges created here is the job of the
    delete phase).

    The suite runs five phases in a fixed order, each building on the graph
    state left by the previous one: single-node round-trip, seeding a small
    citation chain, query checks, delete checks, capability checks.

    Invariants of `GraphStore` the suite is meant to cover:

    1. `add_node` is idempotent (re-adding the same id replaces the prior
       `properties` rather than appending or erroring).
    2. `get_node(id)` returns the most-recent node, or `None` if absent.
    3. `add_edge` rejects edges referencing unknown nodes (`ValueError`).
    4. `add_edge` is idempotent on `(src, dst, edge_type)`.
    5. `get_edges(id, direction=...)` honours the direction filter and the
       optional `edge_type` filter.
    6. `match()` finds a single-segment pattern and returns paths of
       length 2 (one edge, two nodes).
    7. `match(limit=...)` caps results.
    8. `traverse()` respects `max_depth` and never returns paths longer
       than `max_depth` edges.
    9. `delete_node(cascade=False)` raises if the node has incident edges;
       `cascade=True` removes them.
    10. `delete_edge` returns False on unknown triples and True on known
        ones.
    11. `supports()` is honest about unknown capabilities.

    Args:
        store: The store under test.

    Raises:
        AssertionError: a contract was violated.
    """
    # Order matters: later phases query and delete what earlier ones wrote.
    async_phases = (
        _graph_round_trip_invariants,
        _graph_seed_citation_chain,
        _graph_query_invariants,
        _graph_delete_invariants,
    )
    for phase in async_phases:
        await phase(store)
    # The capability phase is synchronous, so it is called directly.
    _graph_capability_invariants(store)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
async def _graph_round_trip_invariants(store: GraphStore) -> None:
    """Check single-node round-trip, upsert and dangling-edge behaviour.

    Leaves node `paper:1` in the store (with `topic` and `year` properties)
    for the phases that follow.

    Args:
        store: The store under test.

    Raises:
        AssertionError: a contract was violated.
    """
    node_id = "paper:1"
    await store.add_node(GraphNode(id=node_id, labels=("Doc",), properties={"topic": "ml"}))

    # What was written must come back.
    stored = await store.get_node(node_id)
    assert stored is not None, "get_node must return the persisted node"
    assert stored.id == node_id
    assert stored.properties.get("topic") == "ml"

    # A lookup miss is signalled with None rather than an exception.
    absent = await store.get_node("paper:never")
    assert absent is None, "get_node of an unknown id must return None"

    # Re-adding the same id with an extra property must make that property
    # visible on the next read.
    updated_properties = {"topic": "ml", "year": _EXPECTED_YEAR}
    await store.add_node(GraphNode(id=node_id, labels=("Doc",), properties=updated_properties))
    reloaded = await store.get_node(node_id)
    assert reloaded is not None
    assert reloaded.properties.get("year") == _EXPECTED_YEAR, (
        "add_node must replace properties on idempotent upsert"
    )

    # An edge whose source node does not exist must be refused. The edge is
    # built inside the `try` so a ValueError from either step counts.
    rejected_dangling_edge = False
    try:
        await store.add_edge(GraphEdge(src="ghost", dst=node_id, edge_type="CITES"))
    except ValueError:
        rejected_dangling_edge = True
    assert rejected_dangling_edge, "add_edge must raise ValueError on unknown endpoint"
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
async def _graph_seed_citation_chain(store: GraphStore) -> None:
    """Seed the chain paper:3 -> paper:2 -> paper:1 and check edge upsert.

    Assumes `paper:1` already exists in the store. Adds `paper:2` and
    `paper:3`, links them with `CITES` edges, then re-adds one edge with a
    property to confirm it is not duplicated.

    Args:
        store: The store under test.

    Raises:
        AssertionError: a contract was violated.
    """
    for node_id, topic in (("paper:2", "ml"), ("paper:3", "bio")):
        await store.add_node(GraphNode(id=node_id, labels=("Doc",), properties={"topic": topic}))

    for citing, cited in (("paper:2", "paper:1"), ("paper:3", "paper:2")):
        await store.add_edge(GraphEdge(src=citing, dst=cited, edge_type="CITES"))

    # Same (src, dst, edge_type) triple again, this time with a property:
    # the store must still hold exactly one such edge afterwards.
    await store.add_edge(
        GraphEdge(src="paper:2", dst="paper:1", edge_type="CITES", properties={"weight": 0.9})
    )
    outgoing = await store.get_edges("paper:2", direction="out")
    edges_to_paper1 = sum(1 for edge in outgoing if edge.dst == "paper:1")
    assert edges_to_paper1 == 1, (
        "add_edge must be idempotent on (src, dst, edge_type)"
    )
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
async def _graph_query_invariants(store: GraphStore) -> None:
    """Check the read-side operations against the seeded citation chain.

    Covers ``get_edges`` (direction and edge-type filters), ``match``
    (single-segment pattern and ``limit``) and ``traverse`` (depth cap,
    reachability of ``paper:1`` from ``paper:3``, unknown start node).

    Raises:
        AssertionError: any of the checked invariants does not hold.
    """
    # --- get_edges -------------------------------------------------------
    outgoing = await store.get_edges("paper:2", direction="out")
    for edge in outgoing:
        assert edge.src == "paper:2", "direction=out filter broken"

    incoming = await store.get_edges("paper:1", direction="in")
    for edge in incoming:
        assert edge.dst == "paper:1", "direction=in filter broken"
    assert "paper:2" in [edge.src for edge in incoming]

    typed = await store.get_edges("paper:2", edge_type="CITES", direction="out")
    for edge in typed:
        assert edge.edge_type == "CITES"

    # --- match -----------------------------------------------------------
    cites_segment = GraphSegment(src_label="Doc", edge_type="CITES", dst_label="Doc")
    pattern = GraphPattern(segments=(cites_segment,))
    found = await store.match(pattern, limit=10)
    assert len(found) >= 1, "match should find at least one CITES edge"
    for hit in found:
        assert len(hit.nodes) == _GRAPH_PATH_LEN_TWO, (
            "single-segment match must return length-2 paths"
        )
        assert len(hit.edges) == 1
        assert hit.edges[0].edge_type == "CITES"

    at_most_one = await store.match(pattern, limit=1)
    assert len(at_most_one) <= 1

    # --- traverse --------------------------------------------------------
    one_hop = await store.traverse("paper:3", max_depth=1)
    for walk in one_hop:
        assert len(walk.edges) <= 1, f"max_depth=1 must not return path with {len(walk.edges)} edges"

    two_hop = await store.traverse("paper:3", max_depth=_GRAPH_DEPTH_TWO)
    # Generator keeps the short-circuit: stop at the first path ending on paper:1.
    assert any(walk.nodes[-1].id == "paper:1" for walk in two_hop), (
        "traverse(max_depth=2) from paper:3 must reach paper:1"
    )
    for walk in two_hop:
        assert len(walk.edges) <= _GRAPH_DEPTH_TWO

    from_unknown = await store.traverse("ghost-node", max_depth=_GRAPH_DEPTH_TWO)
    assert from_unknown == [], "traverse from unknown node must return empty list"
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
async def _graph_delete_invariants(store: GraphStore) -> None:
|
|
650
|
+
"""Cascade and unknown-triple delete invariants. Empties the store."""
|
|
651
|
+
raised_cascade = False
|
|
652
|
+
try:
|
|
653
|
+
await store.delete_node("paper:2", cascade=False)
|
|
654
|
+
except ValueError:
|
|
655
|
+
raised_cascade = True
|
|
656
|
+
assert raised_cascade, "delete_node with cascade=False must raise on connected node"
|
|
657
|
+
|
|
658
|
+
deleted = await store.delete_node("paper:2", cascade=True)
|
|
659
|
+
assert deleted is True
|
|
660
|
+
assert await store.get_node("paper:2") is None
|
|
661
|
+
assert (await store.get_edges("paper:1", direction="in")) == []
|
|
662
|
+
|
|
663
|
+
# Unknown triple returns False, not raise.
|
|
664
|
+
assert await store.delete_edge("paper:3", "paper:never", edge_type="CITES") is False
|
|
665
|
+
|
|
666
|
+
# Empty the store fully.
|
|
667
|
+
await store.delete_node("paper:1", cascade=True)
|
|
668
|
+
await store.delete_node("paper:3", cascade=True)
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def _graph_capability_invariants(store: GraphStore) -> None:
|
|
672
|
+
"""capabilities() / supports() honesty."""
|
|
673
|
+
caps = store.capabilities()
|
|
674
|
+
assert isinstance(caps, set)
|
|
675
|
+
assert store.supports("definitely-not-a-capability-2026") is False
|
|
676
|
+
if caps:
|
|
677
|
+
sample = next(iter(caps))
|
|
678
|
+
assert store.supports(sample) is True
|
|
679
|
+
|
|
680
|
+
|
|
681
|
+
# ======================================================================
|
|
682
|
+
# Guardrail conformance — feat-018.
|
|
683
|
+
# ======================================================================
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
async def run_input_validator_conformance(
    validator: InputValidator,
    *,
    benign: str = "What is the weather today?",
    obvious_violation: str | None = None,
) -> None:
    """Exercise an ``InputValidator`` against the shared conformance checks.

    Checks, in order:

    1. ``validator.name`` and ``validator.description`` are both ``str``.
    2. Validating ``benign`` yields a ``ValidationResult`` whose ``passed``
       flag is truthy.
    3. Only when ``obvious_violation`` is given: validating it yields a
       ``ValidationResult`` whose ``passed`` flag is falsy.

    Each ``validate`` call receives its own ``{"run_id": "conformance"}``
    context dict.

    Raises:
        AssertionError: one of the checks above failed.
    """
    declared_name = getattr(validator, "name", None)
    assert isinstance(declared_name, str), "name must be a str ClassVar"
    declared_description = getattr(validator, "description", None)
    assert isinstance(declared_description, str), "description must be a str ClassVar"

    benign_outcome = await validator.validate(benign, {"run_id": "conformance"})
    assert isinstance(benign_outcome, ValidationResult), "validate() must return ValidationResult"
    assert benign_outcome.passed, (
        f"benign input must pass; got violations {list(benign_outcome.violations)!r}"
    )

    if obvious_violation is None:
        return

    flagged = await validator.validate(obvious_violation, {"run_id": "conformance"})
    assert isinstance(flagged, ValidationResult)
    assert not flagged.passed, (
        f"obvious-violation input must fail; validator {validator.name!r} returned passed=True"
    )
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
async def run_output_validator_conformance(
    validator: OutputValidator,
    *,
    benign: str = "The weather is nice today.",
    obvious_violation: str | None = None,
) -> None:
    """Exercise an ``OutputValidator`` against the shared conformance checks.

    Mirrors ``run_input_validator_conformance``:

    1. ``validator.name`` and ``validator.description`` are both ``str``.
    2. Validating ``benign`` yields a ``ValidationResult`` whose ``passed``
       flag is truthy.
    3. Only when ``obvious_violation`` is given: validating it yields a
       ``ValidationResult`` whose ``passed`` flag is falsy.

    NOTE(review): redaction is *not* verified here. Nothing in this
    helper inspects ``redacted_content``; callers that need that
    guarantee must assert it themselves.

    Raises:
        AssertionError: one of the checks above failed.
    """
    assert isinstance(getattr(validator, "name", None), str), "name must be a str ClassVar"
    assert isinstance(getattr(validator, "description", None), str), (
        "description must be a str ClassVar"
    )

    result = await validator.validate(benign, {"run_id": "conformance"})
    assert isinstance(result, ValidationResult), "validate() must return ValidationResult"
    assert result.passed, f"benign output must pass; got {list(result.violations)!r}"

    if obvious_violation is not None:
        bad = await validator.validate(obvious_violation, {"run_id": "conformance"})
        assert isinstance(bad, ValidationResult), "validate() must return ValidationResult"
        assert not bad.passed, "obvious-violation output must fail"
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
async def run_tool_gate_conformance(
    gate: ToolCallGate,
    *,
    benign_tool: Tool,
    benign_tool_name: str,
    forbidden_tool: Tool | None = None,
    forbidden_tool_name: str | None = None,
) -> None:
    """Exercise a ``ToolCallGate`` against the shared conformance checks.

    Checks, in order:

    1. ``gate.name`` and ``gate.description`` are both ``str``.
    2. ``authorize`` for the benign tool returns a ``ValidationResult``
       (only the type is checked, not the verdict).
    3. Only when *both* ``forbidden_tool`` and ``forbidden_tool_name`` are
       given: ``authorize`` for that tool returns a ``ValidationResult``
       whose ``passed`` flag is falsy.

    Every ``authorize`` call is made with two empty dicts as its trailing
    arguments.

    Raises:
        AssertionError: one of the checks above failed.
    """
    for attribute in ("name", "description"):
        assert isinstance(getattr(gate, attribute, None), str)

    benign_verdict = await gate.authorize(benign_tool_name, benign_tool, {}, {})
    assert isinstance(benign_verdict, ValidationResult)

    # The denial check needs the tool and its name; skip if either is absent.
    if forbidden_tool is None or forbidden_tool_name is None:
        return

    verdict = await gate.authorize(forbidden_tool_name, forbidden_tool, {}, {})
    assert isinstance(verdict, ValidationResult)
    assert not verdict.passed, (
        f"gate {gate.name!r} must deny {forbidden_tool_name!r}; got passed=True"
    )
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
# ----------------------------------------------------------------------
|
|
765
|
+
# Task conformance — feat-015.
|
|
766
|
+
# ----------------------------------------------------------------------
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
async def run_task_conformance(
    task: Task,
    *,
    context: dict[str, object] | None = None,
) -> None:
    """Exercise a ``Task`` against the shared conformance checks.

    Class-level attributes are read from ``type(task)``; then ``run`` is
    awaited once with ``context`` (or a fresh empty dict when omitted).

    Checks, in order:

    1. ``name`` is a non-empty ``str``.
    2. ``cost_estimate_usd`` is an ``int``/``float`` that is ``>= 0``.
    3. ``timeout_s`` is an ``int``/``float`` that is ``> 0``.
    4. ``depends_on`` is a ``tuple`` whose entries are all ``str``.
    5. ``run(context)`` returns a ``list`` (an empty list is accepted).
    6. Every returned finding exposes ``severity``, ``category`` and
       ``message``, each of them a ``str``.

    Raises:
        AssertionError: one of the checks above failed.
    """
    task_cls = type(task)

    task_name = task_cls.name
    assert isinstance(task_name, str), "Task.name must be a string"
    assert task_name, "Task.name must be non-empty"

    cost = task_cls.cost_estimate_usd
    assert isinstance(cost, (int, float)), "Task.cost_estimate_usd must be numeric"
    assert float(cost) >= 0.0, "Task.cost_estimate_usd must be non-negative"

    timeout = task_cls.timeout_s
    assert isinstance(timeout, (int, float)), "Task.timeout_s must be numeric"
    assert float(timeout) > 0.0, "Task.timeout_s must be positive"

    dependencies = task_cls.depends_on
    assert isinstance(dependencies, tuple), (
        f"Task.depends_on must be a tuple, got {type(dependencies).__name__}"
    )
    assert all(isinstance(entry, str) for entry in dependencies), (
        "Task.depends_on entries must be strings"
    )

    run_context = {} if context is None else context
    findings = await task.run(run_context)
    assert isinstance(findings, list), (
        f"Task.run must return a list of findings, got {type(findings).__name__}"
    )

    # Per finding: presence check first, then type check, one attribute at a time.
    required_attributes = ("severity", "category", "message")
    for finding in findings:
        for attribute in required_attributes:
            assert hasattr(finding, attribute), f"finding must have a '{attribute}' attribute"
            assert isinstance(getattr(finding, attribute), str), (
                f"finding.{attribute} must be a string"
            )
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
# ----------------------------------------------------------------------
|
|
814
|
+
# Chat conformance — feat-020.
|
|
815
|
+
# ----------------------------------------------------------------------
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
async def run_chat_history_conformance(store: ChatHistoryStore) -> None:
    """Validate that a `ChatHistoryStore` honours the locked contract.

    The store must be empty when this is called and is left empty
    when the function returns.

    Two randomly named sessions are created. The first receives a user
    turn and an assistant turn, the second a single user turn. The
    numbered steps in the body then check append/load ordering, counts,
    session isolation, role and limit filters, session listing, metadata
    update, owner filtering, session deletion, expiry and capability
    reporting.

    Raises:
        AssertionError: a contract was violated.
    """
    from datetime import UTC, datetime  # noqa: PLC0415
    from uuid import uuid4  # noqa: PLC0415

    from agentforge_core.values.chat import ChatTurn  # noqa: PLC0415

    sid = f"conf-{uuid4().hex[:8]}"
    sid_b = f"conf-{uuid4().hex[:8]}"

    def _turn(session: str, role: str, content: str, **kw: Any) -> ChatTurn:
        """Build a turn with a fresh id and the current UTC timestamp."""
        return ChatTurn(
            id=uuid4().hex,
            session_id=session,
            role=role,  # type: ignore[arg-type]
            content=content,
            timestamp=datetime.now(UTC),
            **kw,
        )

    # 1. append + load round-trip.
    t1 = _turn(sid, "user", "hello")
    t2 = _turn(sid, "assistant", "hi there", run_id="run-1")
    await store.append(t1)
    await store.append(t2)
    loaded = await store.load(sid)
    assert len(loaded) == _EXPECTED_CHAT_TURNS_SID, (
        f"load() must return both turns; got {len(loaded)}"
    )
    assert loaded[0].id == t1.id, "load() must return turns in chronological order"
    assert loaded[1].id == t2.id

    # 2. count.
    n = await store.count(sid)
    assert n == _EXPECTED_CHAT_TURNS_SID, f"count() must reflect appended turns; got {n}"

    # 3. session isolation — a different session_id sees nothing.
    other_turn = _turn(sid_b, "user", "different session")
    await store.append(other_turn)
    only_a = await store.load(sid)
    assert all(t.session_id == sid for t in only_a), (
        "load(session_id) must not bleed across sessions"
    )
    assert await store.count(sid_b) == _EXPECTED_CHAT_TURNS_SID_B

    # 4. role filter.
    only_assistant = await store.load(sid, roles=["assistant"])
    # Guard against a vacuous pass: `all()` over an empty result is True,
    # so a store that drops every turn when `roles` is given would
    # otherwise slip through. The session holds exactly one assistant turn.
    assert any(t.id == t2.id for t in only_assistant), (
        "load(roles=...) must still return the turns that match those roles"
    )
    assert all(t.role == "assistant" for t in only_assistant), (
        "load(roles=...) must filter to those roles"
    )

    # 5. limit.
    limited = await store.load(sid, limit=1)
    assert len(limited) == 1, "load(limit=N) must return at most N turns"

    # 6. list_sessions returns info for every active session.
    sessions = await store.list_sessions()
    ids = {s.id for s in sessions}
    assert sid in ids, "list_sessions() must include the first session"
    assert sid_b in ids, "list_sessions() must include the second session"

    # 7. update_session_metadata merges keys.
    # NOTE(review): only the session's continued presence is checked here;
    # the merged metadata values themselves are not inspected.
    await store.update_session_metadata(sid, {"owner": "alice", "tag": "x"})
    after = await store.list_sessions()
    matched = [s for s in after if s.id == sid]
    assert matched, "session must still appear after update_session_metadata()"

    # 8. owner filter on list_sessions.
    # NOTE(review): passes when `owned` is empty; whether the metadata
    # update above sets `owner` is not visible from this function.
    owned = await store.list_sessions(owner="alice")
    assert all(s.owner == "alice" for s in owned), (
        "list_sessions(owner=X) must filter to that owner"
    )

    # 9. delete_session removes turns + returns count.
    removed = await store.delete_session(sid)
    assert removed == _EXPECTED_CHAT_TURNS_SID, (
        f"delete_session must return turns removed; got {removed}"
    )
    assert await store.count(sid) == 0
    after_other = await store.count(sid_b)
    assert after_other == _EXPECTED_CHAT_TURNS_SID_B, "delete_session must not touch other sessions"

    # 10. expire_before — drivers without TTL may return 0.
    far_future = datetime(2099, 1, 1, tzinfo=UTC)
    removed_b = await store.expire_before(far_future)
    assert isinstance(removed_b, int), "expire_before must return an int"

    # 11. capabilities is a set.
    caps = store.capabilities()
    assert isinstance(caps, set)
    assert store.supports("definitely-not-a-real-capability") is False

    # Clean up remaining session.
    await store.delete_session(sid_b)
    await store.expire_before(far_future)
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
async def run_truncation_conformance(strategy: HistoryTruncationStrategy) -> None:
    """Exercise a `HistoryTruncationStrategy` against its locked invariants.

    Checks:

    1. Selecting from an empty history returns ``[]``.
    2. Selecting from a four-turn history returns the original turns in
       their original relative order, with nothing repeated and nothing
       foreign inserted. Turns whose metadata carries
       ``"agentforge_chat.summary"`` set to ``True`` are exempt from
       this check.

    Raises:
        AssertionError: one of the checks above failed.
    """
    from datetime import UTC, datetime  # noqa: PLC0415
    from uuid import uuid4  # noqa: PLC0415

    from agentforge_core.values.chat import ChatTurn  # noqa: PLC0415

    def _turn(role: str, content: str) -> ChatTurn:
        """Build a turn in the fixed ``"conf"`` session with a fresh id."""
        return ChatTurn(
            id=uuid4().hex,
            session_id="conf",
            role=role,  # type: ignore[arg-type]
            content=content,
            timestamp=datetime.now(UTC),
        )

    no_history: list[ChatTurn] = []
    from_nothing = await strategy.select(no_history, "msg", {})
    assert from_nothing == [], "empty input must yield empty output"

    history = [
        _turn(role, content)
        for role, content in (
            ("user", "a"),
            ("assistant", "b"),
            ("user", "c"),
            ("assistant", "d"),
        )
    ]
    selected = await strategy.select(history, "next msg", {})
    source_ids = [turn.id for turn in history]

    # Subsequence check: every non-summary turn must be found at or after
    # the position just past the previous match, so reordered, repeated
    # or unknown ids all fail.
    search_from = 0
    for turn in selected:
        if turn.metadata.get("agentforge_chat.summary") is True:
            continue
        try:
            position = source_ids.index(turn.id, search_from)
        except ValueError:
            position = -1
        assert position >= 0, (
            "truncation output must preserve input order "
            "(no reordered or inserted non-summary turns)"
        )
        search_from = position + 1
|
|
966
|
+
|
|
967
|
+
|
|
968
|
+
async def run_reranker_conformance(reranker: Reranker) -> None:
    """Run the shared `Reranker` conformance suite (feat-021).

    The reranker must be ready to call when this is passed in and is
    not touched after the function returns (callers manage the
    reranker's lifecycle).

    Verifies the locked invariants of `Reranker`:

    1. Empty candidate list returns an empty list.
    2. ``top_k < 1`` (when not None) raises `ValueError`.
    3. ``rerank(query, candidates, top_k=None)`` returns a list of
       the same length as `candidates`.
    4. ``rerank(query, candidates, top_k=K)`` returns at most
       `K` items.
    5. Returned scores are in `[0, 1]`.
    6. Results are sorted descending by score.
    7. Returned `VectorMatch` objects carry the input's `id` /
       `text` / `metadata` values unchanged (only `score` may
       change).
    8. Input list is not mutated.
    9. ``supports("not-a-real-capability")`` returns False.

    Raises:
        AssertionError: a contract was violated.
    """

    # 1. empty input → empty output
    empty = await reranker.rerank("any query", [])
    assert empty == [], f"rerank([]) must return [], got {empty!r}"

    candidates = [
        VectorMatch(id="a", text="alpha", score=0.9, metadata={"k": 1}),
        VectorMatch(id="b", text="beta", score=0.5, metadata={"k": 2}),
        VectorMatch(id="c", text="gamma", score=0.3, metadata={"k": 3}),
    ]
    # Snapshot before the first call that receives `candidates`, so the
    # "input not mutated" check (8) also covers the rejected top_k=0 call.
    original = list(candidates)

    # 2. top_k < 1 must raise
    raised_topk = False
    try:
        await reranker.rerank("q", candidates, top_k=0)
    except ValueError:
        raised_topk = True
    assert raised_topk, "rerank(top_k=0) must raise ValueError"

    # 3-7. happy path
    full = await reranker.rerank("q", candidates)
    assert len(full) == len(candidates), (
        f"rerank(top_k=None) must return all candidates, got {len(full)} vs {len(candidates)}"
    )
    for r in full:
        assert 0.0 <= r.score <= 1.0, f"score out of range: {r.score}"
    for prev, nxt in itertools.pairwise(full):
        assert prev.score >= nxt.score, f"results not sorted desc: {prev.score} before {nxt.score}"
    by_id = {r.id: r for r in full}
    for orig in original:
        # A dropped, renamed or duplicated id must surface as a contract
        # violation (AssertionError), not as a KeyError from the lookup.
        assert orig.id in by_id, (
            f"id {orig.id!r} missing from rerank output; got ids {sorted(by_id)!r}"
        )
        out = by_id[orig.id]
        assert out.text == orig.text, (
            f"text field mutated for id={orig.id}: {orig.text!r} → {out.text!r}"
        )
        assert out.metadata == orig.metadata, (
            f"metadata mutated for id={orig.id}: {orig.metadata!r} → {out.metadata!r}"
        )

    # 4. top_k truncates
    truncated = await reranker.rerank("q", candidates, top_k=2)
    assert len(truncated) == 2, f"top_k=2 must return 2 items, got {len(truncated)}"  # noqa: PLR2004
    for prev, nxt in itertools.pairwise(truncated):
        assert prev.score >= nxt.score

    # 8. input not mutated
    assert candidates == original, "rerank must not mutate its input list"

    # 9. unknown capability check
    assert reranker.supports("not-a-real-capability") is False, (
        "supports() must return False for unknown capability"
    )
|
|
1046
|
+
|
|
1047
|
+
|
|
1048
|
+
# ----------------------------------------------------------------------
|
|
1049
|
+
# Hybrid-search conformance — feat-022 (opt-in).
|
|
1050
|
+
# ----------------------------------------------------------------------
|
|
1051
|
+
|
|
1052
|
+
|
|
1053
|
+
async def run_hybrid_search_conformance(store: VectorStore) -> None:
    """Verify the `lexical_search` contract for hybrid-capable drivers.

    Only call this on stores that declare the ``"hybrid_search"``
    capability — the function asserts the precondition. Empty store
    in, empty store out (every item upserted is deleted).

    Verifies:

    1. ``store.supports("hybrid_search")`` is True.
    2. Empty corpus returns ``[]``.
    3. ``lexical_search`` returns at most ``limit`` matches sorted
       by score desc, with scores in ``[0, 1]`` (max-normalised).
    4. The top hit on an exact-token query is the document that
       contains the rarest matching token.
    5. ``filter_metadata`` AND-matches on the lexical path.
    6. ``limit=0`` raises ``ValueError``.
    7. Re-upserting an existing id with new text invalidates the
       prior lexical hit (the new text wins).

    Raises:
        AssertionError: a contract was violated.
    """
    assert store.supports("hybrid_search"), (
        "run_hybrid_search_conformance called on a store that does not declare 'hybrid_search'"
    )

    # 2. empty corpus
    empty = await store.lexical_search("anything", limit=5)
    assert empty == [], f"empty store must return [], got {empty!r}"

    # Every item shares one constant vector: only the lexical path is
    # under test, so the embedding values are irrelevant here.
    dim = store.dimensions()
    base_vec = tuple([0.1] * dim)
    items = [
        VectorItem(
            id="a", vector=base_vec, text="Paris is the capital of France", metadata={"k": 1}
        ),
        VectorItem(
            id="b", vector=base_vec, text="Berlin is the capital of Germany", metadata={"k": 2}
        ),
        VectorItem(id="c", vector=base_vec, text="The Eiffel Tower is in Paris", metadata={"k": 1}),
    ]
    await store.upsert(items)

    try:
        # 6. limit < 1 must raise
        raised_limit = False
        try:
            await store.lexical_search("Paris", limit=0)
        except ValueError:
            raised_limit = True
        assert raised_limit, "lexical_search(limit=0) must raise ValueError"

        # 3. ordering + score range + limit cap
        eiffel_limit = 5
        eiffel = await store.lexical_search("Eiffel Tower", limit=eiffel_limit)
        assert len(eiffel) >= 1, "Eiffel Tower query must match at least one doc"
        assert len(eiffel) <= eiffel_limit, (
            f"lexical_search(limit={eiffel_limit}) returned {len(eiffel)} matches"
        )
        assert eiffel[0].id == "c", f"top hit on 'Eiffel Tower' must be doc c, got {eiffel[0].id}"
        for prev, nxt in itertools.pairwise(eiffel):
            assert prev.score >= nxt.score, "lexical_search results not sorted desc"
        for m in eiffel:
            assert 0.0 <= m.score <= 1.0, f"lexical score out of [0, 1]: {m.score}"

        # "Paris" occurs in two documents (a and c) at this point, so a
        # limit of 1 actually exercises truncation. Only the upper bound
        # is asserted; how many docs a driver scores as hits is not.
        capped = await store.lexical_search("Paris", limit=1)
        assert len(capped) <= 1, (
            f"lexical_search(limit=1) must return at most 1 match, got {len(capped)}"
        )

        # 5. metadata filter
        filtered = await store.lexical_search("capital", limit=5, filter_metadata={"k": 1})
        ids = {m.id for m in filtered}
        # docs with k=1 are a (Paris/capital) and c (Eiffel/Paris)
        assert "b" not in ids, "filter_metadata={k: 1} must exclude doc b"

        # 7. re-upsert invalidates prior text
        await store.upsert(
            [
                VectorItem(
                    id="a",
                    vector=base_vec,
                    text="Madrid is the capital of Spain",
                    metadata={"k": 1},
                ),
            ]
        )
        paris_after = await store.lexical_search("Paris", limit=5)
        paris_ids = {m.id for m in paris_after}
        assert "a" not in paris_ids, (
            "re-upserting doc a with new text must drop it from a 'Paris' query"
        )
    finally:
        # Runs on assertion failure too, so the store is left empty.
        await store.delete(["a", "b", "c"])
|