@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +170 -69
- package/bin/__tests__/callback-server.test.js +4 -1
- package/bin/cli.js +41 -164
- package/bin/commands/config.js +251 -0
- package/package.json +2 -1
- package/packages/doctor/__tests__/detect.test.js +2 -6
- package/packages/doctor/src/checks/local-memory.js +164 -196
- package/packages/doctor/src/detect.js +11 -3
- package/packages/memory/src/corpus/adapters.js +104 -0
- package/packages/memory/src/corpus/cli.js +72 -7
- package/packages/memory/src/corpus/index.js +1 -1
- package/packages/memory-engine/.env.example +13 -0
- package/packages/memory-engine/README.md +131 -0
- package/packages/memory-engine/bench/README.md +99 -0
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
- package/packages/memory-engine/compat/Dockerfile +11 -0
- package/packages/memory-engine/compat/server.py +680 -0
- package/packages/memory-engine/docker-compose.yml +243 -0
- package/packages/memory-engine/docs/MIGRATION.md +178 -0
- package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
- package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
- package/packages/memory-engine/engine/README.md +52 -0
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
- package/packages/memory-engine/engine/l6-document-store.py +1018 -0
- package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
- package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
- package/packages/memory-engine/engine/services/l4/server.py +235 -0
- package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
- package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
- package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +129 -0
- package/packages/memory-engine/pme_memory/artifacts.py +95 -0
- package/packages/memory-engine/pme_memory/embed.py +74 -0
- package/packages/memory-engine/pme_memory/health.py +36 -0
- package/packages/memory-engine/pme_memory/hygiene.py +159 -0
- package/packages/memory-engine/pme_memory/indexer.py +200 -0
- package/packages/memory-engine/pme_memory/needs.py +55 -0
- package/packages/memory-engine/pme_memory/provenance.py +80 -0
- package/packages/memory-engine/pme_memory/scoring.py +168 -0
- package/packages/memory-engine/pme_memory/search.py +52 -0
- package/packages/memory-engine/pme_memory/store.py +86 -0
- package/packages/memory-engine/pme_memory/synthesis.py +114 -0
- package/packages/memory-engine/pyproject.toml +65 -0
- package/packages/memory-engine/scripts/kg-extractor.py +557 -0
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
- package/packages/memory-engine/tests/test_api_contract.sh +57 -0
|
@@ -0,0 +1,680 @@
|
|
|
1
|
+
{
|
|
2
|
+
"bench": "customer-support",
|
|
3
|
+
"stack": "pentatonic-memory",
|
|
4
|
+
"n_tasks": 20,
|
|
5
|
+
"n_correct": 5,
|
|
6
|
+
"accuracy": 0.25,
|
|
7
|
+
"mean_score": 0.25,
|
|
8
|
+
"p50_search_ms": 30.443045994616114,
|
|
9
|
+
"p95_search_ms": 37.8251028523664,
|
|
10
|
+
"total_tokens_in": 0,
|
|
11
|
+
"total_tokens_out": 0,
|
|
12
|
+
"total_usd": 0.0,
|
|
13
|
+
"by_tag": {
|
|
14
|
+
"factoid": {
|
|
15
|
+
"n": 10,
|
|
16
|
+
"mean_score": 0.3,
|
|
17
|
+
"accuracy": 0.3
|
|
18
|
+
},
|
|
19
|
+
"customer": {
|
|
20
|
+
"n": 8,
|
|
21
|
+
"mean_score": 0.375,
|
|
22
|
+
"accuracy": 0.375
|
|
23
|
+
},
|
|
24
|
+
"multi-doc": {
|
|
25
|
+
"n": 6,
|
|
26
|
+
"mean_score": 0.16666666666666666,
|
|
27
|
+
"accuracy": 0.16666666666666666
|
|
28
|
+
},
|
|
29
|
+
"rma": {
|
|
30
|
+
"n": 3,
|
|
31
|
+
"mean_score": 0.3333333333333333,
|
|
32
|
+
"accuracy": 0.3333333333333333
|
|
33
|
+
},
|
|
34
|
+
"policy": {
|
|
35
|
+
"n": 5,
|
|
36
|
+
"mean_score": 0.2,
|
|
37
|
+
"accuracy": 0.2
|
|
38
|
+
},
|
|
39
|
+
"escalation": {
|
|
40
|
+
"n": 4,
|
|
41
|
+
"mean_score": 0.25,
|
|
42
|
+
"accuracy": 0.25
|
|
43
|
+
},
|
|
44
|
+
"rubric": {
|
|
45
|
+
"n": 3,
|
|
46
|
+
"mean_score": 0.3333333333333333,
|
|
47
|
+
"accuracy": 0.3333333333333333
|
|
48
|
+
},
|
|
49
|
+
"multi-hop": {
|
|
50
|
+
"n": 1,
|
|
51
|
+
"mean_score": 0.0,
|
|
52
|
+
"accuracy": 0.0
|
|
53
|
+
},
|
|
54
|
+
"entity": {
|
|
55
|
+
"n": 2,
|
|
56
|
+
"mean_score": 0.0,
|
|
57
|
+
"accuracy": 0.0
|
|
58
|
+
}
|
|
59
|
+
},
|
|
60
|
+
"extra": {
|
|
61
|
+
"ingest_ms": 11268.10247899266,
|
|
62
|
+
"grading": "substring",
|
|
63
|
+
"limit": 3,
|
|
64
|
+
"tokens": {
|
|
65
|
+
"corpus_tokens": 1227,
|
|
66
|
+
"query_tokens": 283,
|
|
67
|
+
"context_tokens": 278,
|
|
68
|
+
"retrieval_tokens": 561,
|
|
69
|
+
"naive_tokens": 24823,
|
|
70
|
+
"saved_tokens": 24262,
|
|
71
|
+
"reduction_pct": 0.9773999919429561,
|
|
72
|
+
"mean_retrieval_tokens_per_task": 28.05,
|
|
73
|
+
"tokenizer": "cl100k_base",
|
|
74
|
+
"per_task": {
|
|
75
|
+
"order-mina-count": {
|
|
76
|
+
"query": 11,
|
|
77
|
+
"context": 0,
|
|
78
|
+
"retrieval": 11,
|
|
79
|
+
"judge_in": 0,
|
|
80
|
+
"judge_out": 0,
|
|
81
|
+
"judge_latency_ms": 0.0
|
|
82
|
+
},
|
|
83
|
+
"order-mina-latest": {
|
|
84
|
+
"query": 13,
|
|
85
|
+
"context": 0,
|
|
86
|
+
"retrieval": 13,
|
|
87
|
+
"judge_in": 0,
|
|
88
|
+
"judge_out": 0,
|
|
89
|
+
"judge_latency_ms": 0.0
|
|
90
|
+
},
|
|
91
|
+
"rma-mina-sleeve-reason": {
|
|
92
|
+
"query": 17,
|
|
93
|
+
"context": 0,
|
|
94
|
+
"retrieval": 17,
|
|
95
|
+
"judge_in": 0,
|
|
96
|
+
"judge_out": 0,
|
|
97
|
+
"judge_latency_ms": 0.0
|
|
98
|
+
},
|
|
99
|
+
"rma-mina-lid-resolution": {
|
|
100
|
+
"query": 11,
|
|
101
|
+
"context": 57,
|
|
102
|
+
"retrieval": 68,
|
|
103
|
+
"judge_in": 0,
|
|
104
|
+
"judge_out": 0,
|
|
105
|
+
"judge_latency_ms": 0.0
|
|
106
|
+
},
|
|
107
|
+
"jareth-harness-bar-followup": {
|
|
108
|
+
"query": 15,
|
|
109
|
+
"context": 0,
|
|
110
|
+
"retrieval": 15,
|
|
111
|
+
"judge_in": 0,
|
|
112
|
+
"judge_out": 0,
|
|
113
|
+
"judge_latency_ms": 0.0
|
|
114
|
+
},
|
|
115
|
+
"jareth-second-order": {
|
|
116
|
+
"query": 11,
|
|
117
|
+
"context": 0,
|
|
118
|
+
"retrieval": 11,
|
|
119
|
+
"judge_in": 0,
|
|
120
|
+
"judge_out": 0,
|
|
121
|
+
"judge_latency_ms": 0.0
|
|
122
|
+
},
|
|
123
|
+
"priya-custom-status": {
|
|
124
|
+
"query": 11,
|
|
125
|
+
"context": 0,
|
|
126
|
+
"retrieval": 11,
|
|
127
|
+
"judge_in": 0,
|
|
128
|
+
"judge_out": 0,
|
|
129
|
+
"judge_latency_ms": 0.0
|
|
130
|
+
},
|
|
131
|
+
"priya-goodwill-offered": {
|
|
132
|
+
"query": 9,
|
|
133
|
+
"context": 0,
|
|
134
|
+
"retrieval": 9,
|
|
135
|
+
"judge_in": 0,
|
|
136
|
+
"judge_out": 0,
|
|
137
|
+
"judge_latency_ms": 0.0
|
|
138
|
+
},
|
|
139
|
+
"policy-custom-return": {
|
|
140
|
+
"query": 11,
|
|
141
|
+
"context": 0,
|
|
142
|
+
"retrieval": 11,
|
|
143
|
+
"judge_in": 0,
|
|
144
|
+
"judge_out": 0,
|
|
145
|
+
"judge_latency_ms": 0.0
|
|
146
|
+
},
|
|
147
|
+
"policy-40day-return": {
|
|
148
|
+
"query": 18,
|
|
149
|
+
"context": 46,
|
|
150
|
+
"retrieval": 64,
|
|
151
|
+
"judge_in": 0,
|
|
152
|
+
"judge_out": 0,
|
|
153
|
+
"judge_latency_ms": 0.0
|
|
154
|
+
},
|
|
155
|
+
"escalation-400gbp-full-refund": {
|
|
156
|
+
"query": 20,
|
|
157
|
+
"context": 0,
|
|
158
|
+
"retrieval": 20,
|
|
159
|
+
"judge_in": 0,
|
|
160
|
+
"judge_out": 0,
|
|
161
|
+
"judge_latency_ms": 0.0
|
|
162
|
+
},
|
|
163
|
+
"escalation-goodwill-20pct-tier": {
|
|
164
|
+
"query": 15,
|
|
165
|
+
"context": 49,
|
|
166
|
+
"retrieval": 64,
|
|
167
|
+
"judge_in": 0,
|
|
168
|
+
"judge_out": 0,
|
|
169
|
+
"judge_latency_ms": 0.0
|
|
170
|
+
},
|
|
171
|
+
"mina-vip-status": {
|
|
172
|
+
"query": 10,
|
|
173
|
+
"context": 42,
|
|
174
|
+
"retrieval": 52,
|
|
175
|
+
"judge_in": 0,
|
|
176
|
+
"judge_out": 0,
|
|
177
|
+
"judge_latency_ms": 0.0
|
|
178
|
+
},
|
|
179
|
+
"mina-preferences": {
|
|
180
|
+
"query": 11,
|
|
181
|
+
"context": 42,
|
|
182
|
+
"retrieval": 53,
|
|
183
|
+
"judge_in": 0,
|
|
184
|
+
"judge_out": 0,
|
|
185
|
+
"judge_latency_ms": 0.0
|
|
186
|
+
},
|
|
187
|
+
"priya-agent-guidance": {
|
|
188
|
+
"query": 16,
|
|
189
|
+
"context": 0,
|
|
190
|
+
"retrieval": 16,
|
|
191
|
+
"judge_in": 0,
|
|
192
|
+
"judge_out": 0,
|
|
193
|
+
"judge_latency_ms": 0.0
|
|
194
|
+
},
|
|
195
|
+
"rubric-mina-warranty-recommendation": {
|
|
196
|
+
"query": 18,
|
|
197
|
+
"context": 42,
|
|
198
|
+
"retrieval": 60,
|
|
199
|
+
"judge_in": 374,
|
|
200
|
+
"judge_out": 44,
|
|
201
|
+
"judge_latency_ms": 945.222895026207
|
|
202
|
+
},
|
|
203
|
+
"rubric-jareth-escalation": {
|
|
204
|
+
"query": 30,
|
|
205
|
+
"context": 0,
|
|
206
|
+
"retrieval": 30,
|
|
207
|
+
"judge_in": 357,
|
|
208
|
+
"judge_out": 43,
|
|
209
|
+
"judge_latency_ms": 868.3296079933643
|
|
210
|
+
},
|
|
211
|
+
"rubric-priya-delayed-custom": {
|
|
212
|
+
"query": 16,
|
|
213
|
+
"context": 0,
|
|
214
|
+
"retrieval": 16,
|
|
215
|
+
"judge_in": 310,
|
|
216
|
+
"judge_out": 54,
|
|
217
|
+
"judge_latency_ms": 939.1167230010033
|
|
218
|
+
},
|
|
219
|
+
"entity-mina-orders": {
|
|
220
|
+
"query": 10,
|
|
221
|
+
"context": 0,
|
|
222
|
+
"retrieval": 10,
|
|
223
|
+
"judge_in": 0,
|
|
224
|
+
"judge_out": 0,
|
|
225
|
+
"judge_latency_ms": 0.0
|
|
226
|
+
},
|
|
227
|
+
"entity-all-rmas": {
|
|
228
|
+
"query": 10,
|
|
229
|
+
"context": 0,
|
|
230
|
+
"retrieval": 10,
|
|
231
|
+
"judge_in": 0,
|
|
232
|
+
"judge_out": 0,
|
|
233
|
+
"judge_latency_ms": 0.0
|
|
234
|
+
}
|
|
235
|
+
},
|
|
236
|
+
"judge_tokens_in": 1041,
|
|
237
|
+
"judge_tokens_out": 141,
|
|
238
|
+
"judge_calls": 3,
|
|
239
|
+
"judge_mean_latency_ms": 917.5564086735249
|
|
240
|
+
},
|
|
241
|
+
"cost_usd": {
|
|
242
|
+
"assumed_completion_tokens_per_task": 100,
|
|
243
|
+
"rates": {
|
|
244
|
+
"input_per_1k": 0.0025,
|
|
245
|
+
"output_per_1k": 0.01,
|
|
246
|
+
"model": "gpt-4o"
|
|
247
|
+
},
|
|
248
|
+
"retrieval_usd_in": 0.0014025,
|
|
249
|
+
"retrieval_usd_out": 0.02,
|
|
250
|
+
"retrieval_usd_total": 0.0214025,
|
|
251
|
+
"naive_usd_total": 0.0820575,
|
|
252
|
+
"saved_usd": 0.060655,
|
|
253
|
+
"saved_usd_per_1k_tasks": 3.03275
|
|
254
|
+
}
|
|
255
|
+
},
|
|
256
|
+
"task_results": [
|
|
257
|
+
{
|
|
258
|
+
"task_id": "order-mina-count",
|
|
259
|
+
"query": "How many orders has Mina Okafor placed?",
|
|
260
|
+
"answer": "",
|
|
261
|
+
"hits": [],
|
|
262
|
+
"correct": false,
|
|
263
|
+
"score": 0.0,
|
|
264
|
+
"grading_notes": "missing 1/1: ['2']",
|
|
265
|
+
"search_time_ms": 34.31391599588096,
|
|
266
|
+
"generation_time_ms": 0.0,
|
|
267
|
+
"tokens_in": 0,
|
|
268
|
+
"tokens_out": 0,
|
|
269
|
+
"retrieval_tokens": 11,
|
|
270
|
+
"query_tokens": 11,
|
|
271
|
+
"context_tokens": 0,
|
|
272
|
+
"judge_tokens_in": 0,
|
|
273
|
+
"judge_tokens_out": 0,
|
|
274
|
+
"judge_latency_ms": 0.0
|
|
275
|
+
},
|
|
276
|
+
{
|
|
277
|
+
"task_id": "order-mina-latest",
|
|
278
|
+
"query": "What was in Mina Okafor's most recent order?",
|
|
279
|
+
"answer": "",
|
|
280
|
+
"hits": [],
|
|
281
|
+
"correct": false,
|
|
282
|
+
"score": 0.0,
|
|
283
|
+
"grading_notes": "missing 2/2: ['Luna', 'Loop']",
|
|
284
|
+
"search_time_ms": 29.60854201228358,
|
|
285
|
+
"generation_time_ms": 0.0,
|
|
286
|
+
"tokens_in": 0,
|
|
287
|
+
"tokens_out": 0,
|
|
288
|
+
"retrieval_tokens": 13,
|
|
289
|
+
"query_tokens": 13,
|
|
290
|
+
"context_tokens": 0,
|
|
291
|
+
"judge_tokens_in": 0,
|
|
292
|
+
"judge_tokens_out": 0,
|
|
293
|
+
"judge_latency_ms": 0.0
|
|
294
|
+
},
|
|
295
|
+
{
|
|
296
|
+
"task_id": "rma-mina-sleeve-reason",
|
|
297
|
+
"query": "Why did Mina open an RMA on order 2026-0142?",
|
|
298
|
+
"answer": "",
|
|
299
|
+
"hits": [],
|
|
300
|
+
"correct": false,
|
|
301
|
+
"score": 0.0,
|
|
302
|
+
"grading_notes": "missing 2/2: ['wrong colour', 'Oat']",
|
|
303
|
+
"search_time_ms": 32.31730399420485,
|
|
304
|
+
"generation_time_ms": 0.0,
|
|
305
|
+
"tokens_in": 0,
|
|
306
|
+
"tokens_out": 0,
|
|
307
|
+
"retrieval_tokens": 17,
|
|
308
|
+
"query_tokens": 17,
|
|
309
|
+
"context_tokens": 0,
|
|
310
|
+
"judge_tokens_in": 0,
|
|
311
|
+
"judge_tokens_out": 0,
|
|
312
|
+
"judge_latency_ms": 0.0
|
|
313
|
+
},
|
|
314
|
+
{
|
|
315
|
+
"task_id": "rma-mina-lid-resolution",
|
|
316
|
+
"query": "How was Mina's Luna bottle lid complaint resolved?",
|
|
317
|
+
"answer": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
|
|
318
|
+
"hits": [
|
|
319
|
+
{
|
|
320
|
+
"text": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
|
|
321
|
+
"score": 0.5323119854064589,
|
|
322
|
+
"source": "pentatonic-memory",
|
|
323
|
+
"doc_id": "chat-mina-2026-04-10"
|
|
324
|
+
}
|
|
325
|
+
],
|
|
326
|
+
"correct": true,
|
|
327
|
+
"score": 1.0,
|
|
328
|
+
"grading_notes": "all substrings matched",
|
|
329
|
+
"search_time_ms": 28.685988974757493,
|
|
330
|
+
"generation_time_ms": 0.0,
|
|
331
|
+
"tokens_in": 0,
|
|
332
|
+
"tokens_out": 0,
|
|
333
|
+
"retrieval_tokens": 68,
|
|
334
|
+
"query_tokens": 11,
|
|
335
|
+
"context_tokens": 57,
|
|
336
|
+
"judge_tokens_in": 0,
|
|
337
|
+
"judge_tokens_out": 0,
|
|
338
|
+
"judge_latency_ms": 0.0
|
|
339
|
+
},
|
|
340
|
+
{
|
|
341
|
+
"task_id": "jareth-harness-bar-followup",
|
|
342
|
+
"query": "Did Jareth's kite harness work with his bar out of the box?",
|
|
343
|
+
"answer": "",
|
|
344
|
+
"hits": [],
|
|
345
|
+
"correct": false,
|
|
346
|
+
"score": 0.0,
|
|
347
|
+
"grading_notes": "missing 2/2: ['Duotone', 'adapter']",
|
|
348
|
+
"search_time_ms": 34.421516000293195,
|
|
349
|
+
"generation_time_ms": 0.0,
|
|
350
|
+
"tokens_in": 0,
|
|
351
|
+
"tokens_out": 0,
|
|
352
|
+
"retrieval_tokens": 15,
|
|
353
|
+
"query_tokens": 15,
|
|
354
|
+
"context_tokens": 0,
|
|
355
|
+
"judge_tokens_in": 0,
|
|
356
|
+
"judge_tokens_out": 0,
|
|
357
|
+
"judge_latency_ms": 0.0
|
|
358
|
+
},
|
|
359
|
+
{
|
|
360
|
+
"task_id": "jareth-second-order",
|
|
361
|
+
"query": "What did Jareth order after his initial harness purchase?",
|
|
362
|
+
"answer": "",
|
|
363
|
+
"hits": [],
|
|
364
|
+
"correct": false,
|
|
365
|
+
"score": 0.0,
|
|
366
|
+
"grading_notes": "missing 2/2: ['adapter', 'NMD-ADPT-DC']",
|
|
367
|
+
"search_time_ms": 25.77133697923273,
|
|
368
|
+
"generation_time_ms": 0.0,
|
|
369
|
+
"tokens_in": 0,
|
|
370
|
+
"tokens_out": 0,
|
|
371
|
+
"retrieval_tokens": 11,
|
|
372
|
+
"query_tokens": 11,
|
|
373
|
+
"context_tokens": 0,
|
|
374
|
+
"judge_tokens_in": 0,
|
|
375
|
+
"judge_tokens_out": 0,
|
|
376
|
+
"judge_latency_ms": 0.0
|
|
377
|
+
},
|
|
378
|
+
{
|
|
379
|
+
"task_id": "priya-custom-status",
|
|
380
|
+
"query": "Can Priya change the text on her custom tote?",
|
|
381
|
+
"answer": "",
|
|
382
|
+
"hits": [],
|
|
383
|
+
"correct": false,
|
|
384
|
+
"score": 0.0,
|
|
385
|
+
"grading_notes": "missing 2/2: [\"can't change\", 'production']",
|
|
386
|
+
"search_time_ms": 27.974719007033855,
|
|
387
|
+
"generation_time_ms": 0.0,
|
|
388
|
+
"tokens_in": 0,
|
|
389
|
+
"tokens_out": 0,
|
|
390
|
+
"retrieval_tokens": 11,
|
|
391
|
+
"query_tokens": 11,
|
|
392
|
+
"context_tokens": 0,
|
|
393
|
+
"judge_tokens_in": 0,
|
|
394
|
+
"judge_tokens_out": 0,
|
|
395
|
+
"judge_latency_ms": 0.0
|
|
396
|
+
},
|
|
397
|
+
{
|
|
398
|
+
"task_id": "priya-goodwill-offered",
|
|
399
|
+
"query": "What goodwill credit did Priya get offered?",
|
|
400
|
+
"answer": "",
|
|
401
|
+
"hits": [],
|
|
402
|
+
"correct": false,
|
|
403
|
+
"score": 0.0,
|
|
404
|
+
"grading_notes": "missing 1/1: ['15%']",
|
|
405
|
+
"search_time_ms": 26.681170013034716,
|
|
406
|
+
"generation_time_ms": 0.0,
|
|
407
|
+
"tokens_in": 0,
|
|
408
|
+
"tokens_out": 0,
|
|
409
|
+
"retrieval_tokens": 9,
|
|
410
|
+
"query_tokens": 9,
|
|
411
|
+
"context_tokens": 0,
|
|
412
|
+
"judge_tokens_in": 0,
|
|
413
|
+
"judge_tokens_out": 0,
|
|
414
|
+
"judge_latency_ms": 0.0
|
|
415
|
+
},
|
|
416
|
+
{
|
|
417
|
+
"task_id": "policy-custom-return",
|
|
418
|
+
"query": "Can a custom-printed tote be returned for refund?",
|
|
419
|
+
"answer": "",
|
|
420
|
+
"hits": [],
|
|
421
|
+
"correct": false,
|
|
422
|
+
"score": 0.0,
|
|
423
|
+
"grading_notes": "missing 1/1: ['final sale']",
|
|
424
|
+
"search_time_ms": 29.563406016677618,
|
|
425
|
+
"generation_time_ms": 0.0,
|
|
426
|
+
"tokens_in": 0,
|
|
427
|
+
"tokens_out": 0,
|
|
428
|
+
"retrieval_tokens": 11,
|
|
429
|
+
"query_tokens": 11,
|
|
430
|
+
"context_tokens": 0,
|
|
431
|
+
"judge_tokens_in": 0,
|
|
432
|
+
"judge_tokens_out": 0,
|
|
433
|
+
"judge_latency_ms": 0.0
|
|
434
|
+
},
|
|
435
|
+
{
|
|
436
|
+
"task_id": "policy-40day-return",
|
|
437
|
+
"query": "If an unused item was delivered 40 days ago, can I get a full refund?",
|
|
438
|
+
"answer": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
|
|
439
|
+
"hits": [
|
|
440
|
+
{
|
|
441
|
+
"text": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
|
|
442
|
+
"score": 0.5349228375664262,
|
|
443
|
+
"source": "pentatonic-memory",
|
|
444
|
+
"doc_id": "policy-returns-30day"
|
|
445
|
+
}
|
|
446
|
+
],
|
|
447
|
+
"correct": false,
|
|
448
|
+
"score": 0.0,
|
|
449
|
+
"grading_notes": "all substrings matched; forbidden substring(s) present: ['full refund']",
|
|
450
|
+
"search_time_ms": 34.331914997892454,
|
|
451
|
+
"generation_time_ms": 0.0,
|
|
452
|
+
"tokens_in": 0,
|
|
453
|
+
"tokens_out": 0,
|
|
454
|
+
"retrieval_tokens": 64,
|
|
455
|
+
"query_tokens": 18,
|
|
456
|
+
"context_tokens": 46,
|
|
457
|
+
"judge_tokens_in": 0,
|
|
458
|
+
"judge_tokens_out": 0,
|
|
459
|
+
"judge_latency_ms": 0.0
|
|
460
|
+
},
|
|
461
|
+
{
|
|
462
|
+
"task_id": "escalation-400gbp-full-refund",
|
|
463
|
+
"query": "A Tier 1 agent wants to issue a full refund on a \u00a3400 order \u2014 can they?",
|
|
464
|
+
"answer": "",
|
|
465
|
+
"hits": [],
|
|
466
|
+
"correct": false,
|
|
467
|
+
"score": 0.0,
|
|
468
|
+
"grading_notes": "missing 1/1: ['Tier 3']",
|
|
469
|
+
"search_time_ms": 32.81780600082129,
|
|
470
|
+
"generation_time_ms": 0.0,
|
|
471
|
+
"tokens_in": 0,
|
|
472
|
+
"tokens_out": 0,
|
|
473
|
+
"retrieval_tokens": 20,
|
|
474
|
+
"query_tokens": 20,
|
|
475
|
+
"context_tokens": 0,
|
|
476
|
+
"judge_tokens_in": 0,
|
|
477
|
+
"judge_tokens_out": 0,
|
|
478
|
+
"judge_latency_ms": 0.0
|
|
479
|
+
},
|
|
480
|
+
{
|
|
481
|
+
"task_id": "escalation-goodwill-20pct-tier",
|
|
482
|
+
"query": "What's the minimum agent tier needed to offer 20% goodwill credit?",
|
|
483
|
+
"answer": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
|
|
484
|
+
"hits": [
|
|
485
|
+
{
|
|
486
|
+
"text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
|
|
487
|
+
"score": 0.5517141571721631,
|
|
488
|
+
"source": "pentatonic-memory",
|
|
489
|
+
"doc_id": "policy-goodwill-credit"
|
|
490
|
+
}
|
|
491
|
+
],
|
|
492
|
+
"correct": true,
|
|
493
|
+
"score": 1.0,
|
|
494
|
+
"grading_notes": "all substrings matched",
|
|
495
|
+
"search_time_ms": 33.02935999818146,
|
|
496
|
+
"generation_time_ms": 0.0,
|
|
497
|
+
"tokens_in": 0,
|
|
498
|
+
"tokens_out": 0,
|
|
499
|
+
"retrieval_tokens": 64,
|
|
500
|
+
"query_tokens": 15,
|
|
501
|
+
"context_tokens": 49,
|
|
502
|
+
"judge_tokens_in": 0,
|
|
503
|
+
"judge_tokens_out": 0,
|
|
504
|
+
"judge_latency_ms": 0.0
|
|
505
|
+
},
|
|
506
|
+
{
|
|
507
|
+
"task_id": "mina-vip-status",
|
|
508
|
+
"query": "Is Mina Okafor a VIP customer?",
|
|
509
|
+
"answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
|
|
510
|
+
"hits": [
|
|
511
|
+
{
|
|
512
|
+
"text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
|
|
513
|
+
"score": 0.7720720221593629,
|
|
514
|
+
"source": "pentatonic-memory",
|
|
515
|
+
"doc_id": "customer-note-mina-vip"
|
|
516
|
+
}
|
|
517
|
+
],
|
|
518
|
+
"correct": true,
|
|
519
|
+
"score": 1.0,
|
|
520
|
+
"grading_notes": "all substrings matched",
|
|
521
|
+
"search_time_ms": 27.029798016883433,
|
|
522
|
+
"generation_time_ms": 0.0,
|
|
523
|
+
"tokens_in": 0,
|
|
524
|
+
"tokens_out": 0,
|
|
525
|
+
"retrieval_tokens": 52,
|
|
526
|
+
"query_tokens": 10,
|
|
527
|
+
"context_tokens": 42,
|
|
528
|
+
"judge_tokens_in": 0,
|
|
529
|
+
"judge_tokens_out": 0,
|
|
530
|
+
"judge_latency_ms": 0.0
|
|
531
|
+
},
|
|
532
|
+
{
|
|
533
|
+
"task_id": "mina-preferences",
|
|
534
|
+
"query": "What are Mina Okafor's stated preferences?",
|
|
535
|
+
"answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
|
|
536
|
+
"hits": [
|
|
537
|
+
{
|
|
538
|
+
"text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
|
|
539
|
+
"score": 0.5363281403698282,
|
|
540
|
+
"source": "pentatonic-memory",
|
|
541
|
+
"doc_id": "customer-note-mina-vip"
|
|
542
|
+
}
|
|
543
|
+
],
|
|
544
|
+
"correct": true,
|
|
545
|
+
"score": 1.0,
|
|
546
|
+
"grading_notes": "all substrings matched",
|
|
547
|
+
"search_time_ms": 26.91432400024496,
|
|
548
|
+
"generation_time_ms": 0.0,
|
|
549
|
+
"tokens_in": 0,
|
|
550
|
+
"tokens_out": 0,
|
|
551
|
+
"retrieval_tokens": 53,
|
|
552
|
+
"query_tokens": 11,
|
|
553
|
+
"context_tokens": 42,
|
|
554
|
+
"judge_tokens_in": 0,
|
|
555
|
+
"judge_tokens_out": 0,
|
|
556
|
+
"judge_latency_ms": 0.0
|
|
557
|
+
},
|
|
558
|
+
{
|
|
559
|
+
"task_id": "priya-agent-guidance",
|
|
560
|
+
"query": "When handling Priya's orders, what should the agent do before production starts?",
|
|
561
|
+
"answer": "",
|
|
562
|
+
"hits": [],
|
|
563
|
+
"correct": false,
|
|
564
|
+
"score": 0.0,
|
|
565
|
+
"grading_notes": "missing 2/2: ['confirm', 'print text']",
|
|
566
|
+
"search_time_ms": 31.542113021714613,
|
|
567
|
+
"generation_time_ms": 0.0,
|
|
568
|
+
"tokens_in": 0,
|
|
569
|
+
"tokens_out": 0,
|
|
570
|
+
"retrieval_tokens": 16,
|
|
571
|
+
"query_tokens": 16,
|
|
572
|
+
"context_tokens": 0,
|
|
573
|
+
"judge_tokens_in": 0,
|
|
574
|
+
"judge_tokens_out": 0,
|
|
575
|
+
"judge_latency_ms": 0.0
|
|
576
|
+
},
|
|
577
|
+
{
|
|
578
|
+
"task_id": "rubric-mina-warranty-recommendation",
|
|
579
|
+
"query": "Mina is asking for product recommendations. Based on her history, what should we suggest?",
|
|
580
|
+
"answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
|
|
581
|
+
"hits": [
|
|
582
|
+
{
|
|
583
|
+
"text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
|
|
584
|
+
"score": 0.5268969216421068,
|
|
585
|
+
"source": "pentatonic-memory",
|
|
586
|
+
"doc_id": "customer-note-mina-vip"
|
|
587
|
+
}
|
|
588
|
+
],
|
|
589
|
+
"correct": true,
|
|
590
|
+
"score": 1.0,
|
|
591
|
+
"grading_notes": "judge=1.00 \u2014 The answer references Mina's VIP status and her stated interest in lifetime-warranty items, meeting the criteria for a s",
|
|
592
|
+
"search_time_ms": 31.27754997694865,
|
|
593
|
+
"generation_time_ms": 0.0,
|
|
594
|
+
"tokens_in": 0,
|
|
595
|
+
"tokens_out": 0,
|
|
596
|
+
"retrieval_tokens": 60,
|
|
597
|
+
"query_tokens": 18,
|
|
598
|
+
"context_tokens": 42,
|
|
599
|
+
"judge_tokens_in": 374,
|
|
600
|
+
"judge_tokens_out": 44,
|
|
601
|
+
"judge_latency_ms": 945.222895026207
|
|
602
|
+
},
|
|
603
|
+
{
|
|
604
|
+
"task_id": "rubric-jareth-escalation",
|
|
605
|
+
"query": "Jareth wants to return his kite harness for a full refund, claiming it 'didn't work' with his bar \u2014 how should we handle this?",
|
|
606
|
+
"answer": "",
|
|
607
|
+
"hits": [],
|
|
608
|
+
"correct": false,
|
|
609
|
+
"score": 0.0,
|
|
610
|
+
"grading_notes": "judge=0.00 \u2014 The answer fails to provide any context or resolution regarding Jareth's claim, simply processing a refund without addre",
|
|
611
|
+
"search_time_ms": 38.004239002475515,
|
|
612
|
+
"generation_time_ms": 0.0,
|
|
613
|
+
"tokens_in": 0,
|
|
614
|
+
"tokens_out": 0,
|
|
615
|
+
"retrieval_tokens": 30,
|
|
616
|
+
"query_tokens": 30,
|
|
617
|
+
"context_tokens": 0,
|
|
618
|
+
"judge_tokens_in": 357,
|
|
619
|
+
"judge_tokens_out": 43,
|
|
620
|
+
"judge_latency_ms": 868.3296079933643
|
|
621
|
+
},
|
|
622
|
+
{
|
|
623
|
+
"task_id": "rubric-priya-delayed-custom",
|
|
624
|
+
"query": "Priya is frustrated about her custom tote delay. What's the best response?",
|
|
625
|
+
"answer": "",
|
|
626
|
+
"hits": [],
|
|
627
|
+
"correct": false,
|
|
628
|
+
"score": 0.0,
|
|
629
|
+
"grading_notes": "judge=0.00 \u2014 The response does not acknowledge that custom items have a 10-12 working day lead time and does not mention goodwill. It",
|
|
630
|
+
"search_time_ms": 33.17419299855828,
|
|
631
|
+
"generation_time_ms": 0.0,
|
|
632
|
+
"tokens_in": 0,
|
|
633
|
+
"tokens_out": 0,
|
|
634
|
+
"retrieval_tokens": 16,
|
|
635
|
+
"query_tokens": 16,
|
|
636
|
+
"context_tokens": 0,
|
|
637
|
+
"judge_tokens_in": 310,
|
|
638
|
+
"judge_tokens_out": 54,
|
|
639
|
+
"judge_latency_ms": 939.1167230010033
|
|
640
|
+
},
|
|
641
|
+
{
|
|
642
|
+
"task_id": "entity-mina-orders",
|
|
643
|
+
"query": "List Mina Okafor's order IDs.",
|
|
644
|
+
"answer": "",
|
|
645
|
+
"hits": [],
|
|
646
|
+
"correct": false,
|
|
647
|
+
"score": 0.0,
|
|
648
|
+
"grading_notes": "no expected_substrings set",
|
|
649
|
+
"search_time_ms": 27.45368899195455,
|
|
650
|
+
"generation_time_ms": 0.0,
|
|
651
|
+
"tokens_in": 0,
|
|
652
|
+
"tokens_out": 0,
|
|
653
|
+
"retrieval_tokens": 10,
|
|
654
|
+
"query_tokens": 10,
|
|
655
|
+
"context_tokens": 0,
|
|
656
|
+
"judge_tokens_in": 0,
|
|
657
|
+
"judge_tokens_out": 0,
|
|
658
|
+
"judge_latency_ms": 0.0
|
|
659
|
+
},
|
|
660
|
+
{
|
|
661
|
+
"task_id": "entity-all-rmas",
|
|
662
|
+
"query": "List all open and closed RMA case IDs.",
|
|
663
|
+
"answer": "",
|
|
664
|
+
"hits": [],
|
|
665
|
+
"correct": false,
|
|
666
|
+
"score": 0.0,
|
|
667
|
+
"grading_notes": "no expected_substrings set",
|
|
668
|
+
"search_time_ms": 28.635605005547404,
|
|
669
|
+
"generation_time_ms": 0.0,
|
|
670
|
+
"tokens_in": 0,
|
|
671
|
+
"tokens_out": 0,
|
|
672
|
+
"retrieval_tokens": 10,
|
|
673
|
+
"query_tokens": 10,
|
|
674
|
+
"context_tokens": 0,
|
|
675
|
+
"judge_tokens_in": 0,
|
|
676
|
+
"judge_tokens_out": 0,
|
|
677
|
+
"judge_latency_ms": 0.0
|
|
678
|
+
}
|
|
679
|
+
]
|
|
680
|
+
}
|