@kat-ai/eval 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -71
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -1,114 +1,96 @@
|
|
|
1
1
|
# @kat-ai/eval
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Layered evaluation toolkit for KAT applications.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
`@kat-ai/eval` gives you programmatic checks for three parts of the system:
|
|
6
|
+
|
|
7
|
+
- manifest quality after introspection
|
|
8
|
+
- retrieval quality against a Pinecone Assistant
|
|
9
|
+
- end-to-end agent behavior through a chat endpoint
|
|
10
|
+
|
|
11
|
+
## Install
|
|
6
12
|
|
|
7
13
|
```bash
|
|
8
|
-
npm install @kat-ai/eval
|
|
14
|
+
npm install @kat-ai/sdk @kat-ai/eval
|
|
9
15
|
```
|
|
10
16
|
|
|
11
|
-
|
|
17
|
+
The package uses the same environment variables as the rest of KAT:
|
|
12
18
|
|
|
13
|
-
```
|
|
14
|
-
|
|
19
|
+
```env
|
|
20
|
+
OPENAI_API_KEY=sk-...
|
|
21
|
+
PINECONE_API_KEY=...
|
|
15
22
|
```
|
|
16
23
|
|
|
17
24
|
## Quick Start
|
|
18
25
|
|
|
19
|
-
```
|
|
20
|
-
import {
|
|
26
|
+
```ts
|
|
27
|
+
import {
|
|
28
|
+
evaluateAgent,
|
|
29
|
+
evaluateIntrospection,
|
|
30
|
+
evaluateRetrieval,
|
|
31
|
+
} from '@kat-ai/eval';
|
|
21
32
|
|
|
22
|
-
// Layer 1: Evaluate manifest quality
|
|
23
33
|
const introspectionResult = await evaluateIntrospection({
|
|
24
|
-
assistantName: 'my-kb',
|
|
25
|
-
manifest
|
|
34
|
+
assistantName: 'my-assistant',
|
|
35
|
+
manifest,
|
|
26
36
|
groundTruth: [
|
|
27
|
-
{
|
|
37
|
+
{
|
|
38
|
+
query: 'What products do you cover?',
|
|
39
|
+
expectedEntities: ['toaster', 'blender'],
|
|
40
|
+
},
|
|
28
41
|
],
|
|
29
42
|
});
|
|
30
|
-
console.log(`Introspection score: ${introspectionResult.overallScore}/100`);
|
|
31
43
|
|
|
32
|
-
// Layer 2: Evaluate retrieval quality
|
|
33
44
|
const retrievalResult = await evaluateRetrieval({
|
|
34
|
-
assistantName: 'my-kb',
|
|
45
|
+
assistantName: 'my-assistant',
|
|
35
46
|
queries: [
|
|
36
|
-
{
|
|
47
|
+
{
|
|
48
|
+
query: 'How do I reset model T100?',
|
|
49
|
+
expectedTopics: ['reset', 'model t100'],
|
|
50
|
+
},
|
|
37
51
|
],
|
|
38
52
|
});
|
|
39
|
-
console.log(`Retrieval score: ${retrievalResult.overallScore}/100`);
|
|
40
53
|
|
|
41
|
-
// Layer 3: Evaluate agent behavior
|
|
42
54
|
const agentResult = await evaluateAgent({
|
|
43
55
|
agentEndpoint: 'http://localhost:3000/api/chat',
|
|
44
56
|
scenarios: [
|
|
45
57
|
{
|
|
46
|
-
name: 'basic-
|
|
47
|
-
initialQuery: 'My toaster
|
|
58
|
+
name: 'basic-troubleshooting',
|
|
59
|
+
initialQuery: 'My toaster will not heat up',
|
|
48
60
|
expectedOutcome: 'answer',
|
|
49
|
-
evaluation: {
|
|
61
|
+
evaluation: {
|
|
62
|
+
mustContain: ['heating element'],
|
|
63
|
+
},
|
|
50
64
|
},
|
|
51
65
|
],
|
|
52
66
|
});
|
|
53
|
-
console.log(`Agent score: ${agentResult.overallScore}/100`);
|
|
54
67
|
```
|
|
55
68
|
|
|
56
|
-
##
|
|
57
|
-
|
|
58
|
-
### Layer 1: Introspection Eval
|
|
59
|
-
|
|
60
|
-
Evaluates whether introspection correctly understands a KB's content:
|
|
61
|
-
|
|
62
|
-
- **Entity Coverage**: Does the manifest capture all entities in the KB?
|
|
63
|
-
- **Slot Accuracy**: Are extracted slots correct for the domain?
|
|
64
|
-
- **Scope Precision**: Are in/out scope boundaries accurate?
|
|
65
|
-
- **Capability Match**: Do capabilities match actual KB content?
|
|
66
|
-
|
|
67
|
-
### Layer 2: Retrieval Eval
|
|
68
|
-
|
|
69
|
-
Evaluates whether RAG retrieves relevant chunks:
|
|
70
|
-
|
|
71
|
-
- **Relevance**: Are retrieved chunks relevant to the query?
|
|
72
|
-
- **Recall**: Are expected topics found in retrieved chunks?
|
|
73
|
-
- **Precision**: What percentage of retrieved content is relevant?
|
|
74
|
-
- **Noise Ratio**: How much irrelevant content is retrieved?
|
|
75
|
-
|
|
76
|
-
### Layer 3: Agent Eval
|
|
69
|
+
## Package Exports
|
|
77
70
|
|
|
78
|
-
|
|
71
|
+
- `@kat-ai/eval`: all layers and shared helpers
|
|
72
|
+
- `@kat-ai/eval/introspection`: manifest-quality evaluation
|
|
73
|
+
- `@kat-ai/eval/retrieval`: retrieval-quality evaluation
|
|
74
|
+
- `@kat-ai/eval/agent`: end-to-end agent evaluation
|
|
79
75
|
|
|
80
|
-
|
|
81
|
-
- **Relevance**: Is the answer relevant to the query?
|
|
82
|
-
- **Completeness**: Does the answer fully address the question?
|
|
83
|
-
- **Helpfulness**: Is the response actionable and helpful?
|
|
76
|
+
## CLI
|
|
84
77
|
|
|
85
|
-
|
|
78
|
+
The same baseline can be run from the CLI:
|
|
86
79
|
|
|
87
80
|
```bash
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
# Run specific layer
|
|
94
|
-
kat eval --layer introspection --assistant my-kb --scenarios ./eval/baseline/introspection-ground-truth.json
|
|
95
|
-
kat eval --layer retrieval --assistant my-kb --scenarios ./eval/baseline/retrieval-queries.json
|
|
96
|
-
kat eval --layer agent --endpoint http://localhost:3000/api/chat --scenarios ./eval/baseline/agent-scenarios.json
|
|
97
|
-
|
|
98
|
-
# Output as JSON
|
|
99
|
-
kat eval --assistant my-kb --output json > results.json
|
|
81
|
+
npx @kat-ai/cli eval \
|
|
82
|
+
--assistant my-assistant \
|
|
83
|
+
--endpoint http://localhost:3000/api/chat \
|
|
84
|
+
--baseline
|
|
100
85
|
```
|
|
101
86
|
|
|
102
|
-
|
|
103
|
-
- `eval/baseline/naive-rag-baseline.json`
|
|
104
|
-
- `eval/baseline/introspection-ground-truth.json`
|
|
105
|
-
- `eval/baseline/retrieval-queries.json`
|
|
106
|
-
- `eval/baseline/agent-scenarios.json`
|
|
87
|
+
## Related Packages
|
|
107
88
|
|
|
108
|
-
|
|
109
|
-
-
|
|
110
|
-
-
|
|
89
|
+
- `@kat-ai/sdk`: core runtime and shared types
|
|
90
|
+
- `@kat-ai/cli`: command-line workflow for running evals and generating apps
|
|
91
|
+
- `@kat-ai/react`: optional UI layer for chat experiences
|
|
111
92
|
|
|
112
|
-
##
|
|
93
|
+
## Docs
|
|
113
94
|
|
|
114
|
-
|
|
95
|
+
- Repository: [github.com/pinecone-io/KAT](https://github.com/pinecone-io/KAT)
|
|
96
|
+
- Eval docs: [docs-site/docs/cli/eval.md](https://github.com/pinecone-io/KAT/blob/main/docs-site/docs/cli/eval.md)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kat-ai/eval",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"description": "Layered evaluation toolkit for KAT introspection, retrieval, and agent quality",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.cjs",
|
|
@@ -41,7 +41,7 @@
|
|
|
41
41
|
"@pinecone-database/pinecone": "^6.1.3",
|
|
42
42
|
"ai": "^4.0.0",
|
|
43
43
|
"zod": "^3.23.0",
|
|
44
|
-
"@kat-ai/sdk": "0.1.0"
|
|
44
|
+
"@kat-ai/sdk": "0.1.1"
|
|
45
45
|
},
|
|
46
46
|
"devDependencies": {
|
|
47
47
|
"@types/node": "^20.19.25",
|