@tangle-network/agent-eval 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/dist/index.d.ts +352 -4
- package/dist/index.js +634 -45
- package/dist/index.js.map +1 -1
- package/docs/knowledge-readiness.md +84 -0
- package/docs/multi-shot-optimization.md +7 -0
- package/package.json +12 -10
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Knowledge Readiness
|
|
2
|
+
|
|
3
|
+
`agent-eval` owns the contract for deciding whether an agent had enough
|
|
4
|
+
task-world context to run. It does not own web crawling, connector storage, wiki
|
|
5
|
+
pages, credentials, or product policy. Those live in `agent-knowledge` and
|
|
6
|
+
product repos.
|
|
7
|
+
|
|
8
|
+
The core loop is:
|
|
9
|
+
|
|
10
|
+
```txt
|
|
11
|
+
Know -> Act -> Evaluate -> Learn -> Optimize
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
Use `KnowledgeRequirement` to declare required context, `scoreKnowledgeReadiness`
|
|
15
|
+
to produce a `KnowledgeReadinessReport`, and `blockingKnowledgeEval` to make the
|
|
16
|
+
report a normal control-runtime validator.
|
|
17
|
+
|
|
18
|
+
```ts
|
|
19
|
+
import {
|
|
20
|
+
blockingKnowledgeEval,
|
|
21
|
+
runAgentControlLoop,
|
|
22
|
+
scoreKnowledgeReadiness,
|
|
23
|
+
} from '@tangle-network/agent-eval'
|
|
24
|
+
|
|
25
|
+
await runAgentControlLoop({
|
|
26
|
+
intent: 'Implement the SDK migration',
|
|
27
|
+
async observe() {
|
|
28
|
+
const knowledge = scoreKnowledgeReadiness({
|
|
29
|
+
taskId: 'sdk-migration',
|
|
30
|
+
requirements: [{
|
|
31
|
+
id: 'repo-build-command',
|
|
32
|
+
description: 'Repository build and typecheck command',
|
|
33
|
+
requiredFor: ['coding'],
|
|
34
|
+
category: 'codebase_specific',
|
|
35
|
+
acquisitionMode: 'inspect_repo',
|
|
36
|
+
importance: 'blocking',
|
|
37
|
+
freshness: 'weekly',
|
|
38
|
+
sensitivity: 'public',
|
|
39
|
+
confidenceNeeded: 0.9,
|
|
40
|
+
currentConfidence: 0.2,
|
|
41
|
+
evidenceIds: [],
|
|
42
|
+
fallbackPolicy: 'block',
|
|
43
|
+
}],
|
|
44
|
+
})
|
|
45
|
+
return { knowledge }
|
|
46
|
+
},
|
|
47
|
+
async validate({ state }) {
|
|
48
|
+
return [blockingKnowledgeEval(state.knowledge)]
|
|
49
|
+
},
|
|
50
|
+
async decide({ evals, state }) {
|
|
51
|
+
if (!evals.find((e) => e.id === 'knowledge-ready')?.passed) {
|
|
52
|
+
return {
|
|
53
|
+
type: 'stop',
|
|
54
|
+
pass: false,
|
|
55
|
+
reason: `Collect knowledge first: ${state.knowledge.recommendedAction}`,
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
return { type: 'stop', pass: true, reason: 'ready' }
|
|
59
|
+
},
|
|
60
|
+
act() {
|
|
61
|
+
return null
|
|
62
|
+
},
|
|
63
|
+
})
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Knowledge-related failures use the normal failure taxonomy:
|
|
67
|
+
|
|
68
|
+
- `knowledge_readiness_blocked`
|
|
69
|
+
- `missing_user_data`
|
|
70
|
+
- `missing_domain_data`
|
|
71
|
+
- `missing_codebase_context`
|
|
72
|
+
- `missing_runtime_context`
|
|
73
|
+
- `missing_credentials`
|
|
74
|
+
- `stale_external_data`
|
|
75
|
+
- `bad_retrieval`
|
|
76
|
+
- `insufficient_evidence`
|
|
77
|
+
- `contradictory_evidence`
|
|
78
|
+
- `ambiguous_user_intent`
|
|
79
|
+
|
|
80
|
+
For optimization, scorers should use responsible surfaces such as
|
|
81
|
+
`knowledge-requirements`, `data-acquisition`, `retrieval-policy`, and
|
|
82
|
+
`user-question-policy` in actionable side information. That lets GEPA-style
|
|
83
|
+
loops improve data acquisition and retrieval policy instead of blaming every
|
|
84
|
+
failure on the prompt.
|
|
@@ -52,6 +52,13 @@ The scorer should return `asi` rows for concrete failure modes:
|
|
|
52
52
|
}
|
|
53
53
|
```
|
|
54
54
|
|
|
55
|
+
Standard knowledge-related responsible surfaces are:
|
|
56
|
+
|
|
57
|
+
- `knowledge-requirements`
|
|
58
|
+
- `data-acquisition`
|
|
59
|
+
- `retrieval-policy`
|
|
60
|
+
- `user-question-policy`
|
|
61
|
+
|
|
55
62
|
These rows become:
|
|
56
63
|
|
|
57
64
|
- reflection expectations via `trialTraceFromMultiShotTrial`
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.19.0",
|
|
3
|
+
"version": "0.20.0",
|
|
4
4
|
"description": "Trace-first evaluation framework for Tangle agents. Core (spans, pipelines, sandbox harness, OTLP export), trust (dataset, red-team, calibration, behavior DSL), builder-of-builders (three-layer eval, resumable sessions, meta-runtime correlation), and frontier (meta-eval correlation study, Process Reward Modeling, bisector).",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -45,6 +45,15 @@
|
|
|
45
45
|
"publishConfig": {
|
|
46
46
|
"access": "public"
|
|
47
47
|
},
|
|
48
|
+
"scripts": {
|
|
49
|
+
"build": "tsup",
|
|
50
|
+
"dev": "tsup --watch",
|
|
51
|
+
"prepare": "tsup",
|
|
52
|
+
"test": "vitest run",
|
|
53
|
+
"test:watch": "vitest",
|
|
54
|
+
"typecheck": "tsc --noEmit",
|
|
55
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
56
|
+
},
|
|
48
57
|
"dependencies": {
|
|
49
58
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
50
59
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -64,12 +73,5 @@
|
|
|
64
73
|
"node": ">=20"
|
|
65
74
|
},
|
|
66
75
|
"license": "MIT",
|
|
67
|
-
"scripts": {
|
|
68
|
-
"build": "tsup",
|
|
69
|
-
"dev": "tsup --watch",
|
|
70
|
-
"test": "vitest run",
|
|
71
|
-
"test:watch": "vitest",
|
|
72
|
-
"typecheck": "tsc --noEmit",
|
|
73
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
+
"packageManager": "pnpm@10.22.0"
|
|
77
|
+
}
|