@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +170 -69
- package/bin/__tests__/callback-server.test.js +4 -1
- package/bin/cli.js +41 -164
- package/bin/commands/config.js +251 -0
- package/package.json +2 -1
- package/packages/doctor/__tests__/detect.test.js +2 -6
- package/packages/doctor/src/checks/local-memory.js +164 -196
- package/packages/doctor/src/detect.js +11 -3
- package/packages/memory/src/corpus/adapters.js +104 -0
- package/packages/memory/src/corpus/cli.js +72 -7
- package/packages/memory/src/corpus/index.js +1 -1
- package/packages/memory-engine/.env.example +13 -0
- package/packages/memory-engine/README.md +131 -0
- package/packages/memory-engine/bench/README.md +99 -0
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
- package/packages/memory-engine/compat/Dockerfile +11 -0
- package/packages/memory-engine/compat/server.py +680 -0
- package/packages/memory-engine/docker-compose.yml +243 -0
- package/packages/memory-engine/docs/MIGRATION.md +178 -0
- package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
- package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
- package/packages/memory-engine/engine/README.md +52 -0
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
- package/packages/memory-engine/engine/l6-document-store.py +1018 -0
- package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
- package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
- package/packages/memory-engine/engine/services/l4/server.py +235 -0
- package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
- package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
- package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +129 -0
- package/packages/memory-engine/pme_memory/artifacts.py +95 -0
- package/packages/memory-engine/pme_memory/embed.py +74 -0
- package/packages/memory-engine/pme_memory/health.py +36 -0
- package/packages/memory-engine/pme_memory/hygiene.py +159 -0
- package/packages/memory-engine/pme_memory/indexer.py +200 -0
- package/packages/memory-engine/pme_memory/needs.py +55 -0
- package/packages/memory-engine/pme_memory/provenance.py +80 -0
- package/packages/memory-engine/pme_memory/scoring.py +168 -0
- package/packages/memory-engine/pme_memory/search.py +52 -0
- package/packages/memory-engine/pme_memory/store.py +86 -0
- package/packages/memory-engine/pme_memory/synthesis.py +114 -0
- package/packages/memory-engine/pyproject.toml +65 -0
- package/packages/memory-engine/scripts/kg-extractor.py +557 -0
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
- package/packages/memory-engine/tests/test_api_contract.sh +57 -0
|
@@ -0,0 +1,961 @@
|
|
|
1
|
+
{
|
|
2
|
+
"bench": "product-catalogue",
|
|
3
|
+
"stack": "pentatonic-memory-latest",
|
|
4
|
+
"n_tasks": 18,
|
|
5
|
+
"n_correct": 8,
|
|
6
|
+
"accuracy": 0.4444444444444444,
|
|
7
|
+
"mean_score": 0.4861111111111111,
|
|
8
|
+
"p50_search_ms": 29.792192013701424,
|
|
9
|
+
"p95_search_ms": 39.145407994510606,
|
|
10
|
+
"total_tokens_in": 0,
|
|
11
|
+
"total_tokens_out": 0,
|
|
12
|
+
"total_usd": 0.0,
|
|
13
|
+
"by_tag": {
|
|
14
|
+
"factoid": {
|
|
15
|
+
"n": 10,
|
|
16
|
+
"mean_score": 0.725,
|
|
17
|
+
"accuracy": 0.7
|
|
18
|
+
},
|
|
19
|
+
"material": {
|
|
20
|
+
"n": 4,
|
|
21
|
+
"mean_score": 0.5,
|
|
22
|
+
"accuracy": 0.5
|
|
23
|
+
},
|
|
24
|
+
"spec": {
|
|
25
|
+
"n": 1,
|
|
26
|
+
"mean_score": 1.0,
|
|
27
|
+
"accuracy": 1.0
|
|
28
|
+
},
|
|
29
|
+
"warranty": {
|
|
30
|
+
"n": 1,
|
|
31
|
+
"mean_score": 0.25,
|
|
32
|
+
"accuracy": 0.0
|
|
33
|
+
},
|
|
34
|
+
"inventory": {
|
|
35
|
+
"n": 3,
|
|
36
|
+
"mean_score": 1.0,
|
|
37
|
+
"accuracy": 1.0
|
|
38
|
+
},
|
|
39
|
+
"compat": {
|
|
40
|
+
"n": 2,
|
|
41
|
+
"mean_score": 0.25,
|
|
42
|
+
"accuracy": 0.0
|
|
43
|
+
},
|
|
44
|
+
"multi-doc": {
|
|
45
|
+
"n": 3,
|
|
46
|
+
"mean_score": 0.5,
|
|
47
|
+
"accuracy": 0.3333333333333333
|
|
48
|
+
},
|
|
49
|
+
"care": {
|
|
50
|
+
"n": 1,
|
|
51
|
+
"mean_score": 1.0,
|
|
52
|
+
"accuracy": 1.0
|
|
53
|
+
},
|
|
54
|
+
"policy": {
|
|
55
|
+
"n": 2,
|
|
56
|
+
"mean_score": 0.5,
|
|
57
|
+
"accuracy": 0.5
|
|
58
|
+
},
|
|
59
|
+
"pricing": {
|
|
60
|
+
"n": 2,
|
|
61
|
+
"mean_score": 0.0,
|
|
62
|
+
"accuracy": 0.0
|
|
63
|
+
},
|
|
64
|
+
"multi-hop": {
|
|
65
|
+
"n": 2,
|
|
66
|
+
"mean_score": 0.0,
|
|
67
|
+
"accuracy": 0.0
|
|
68
|
+
},
|
|
69
|
+
"math": {
|
|
70
|
+
"n": 1,
|
|
71
|
+
"mean_score": 0.0,
|
|
72
|
+
"accuracy": 0.0
|
|
73
|
+
},
|
|
74
|
+
"rubric": {
|
|
75
|
+
"n": 3,
|
|
76
|
+
"mean_score": 0.0,
|
|
77
|
+
"accuracy": 0.0
|
|
78
|
+
},
|
|
79
|
+
"recommendation": {
|
|
80
|
+
"n": 1,
|
|
81
|
+
"mean_score": 0.0,
|
|
82
|
+
"accuracy": 0.0
|
|
83
|
+
},
|
|
84
|
+
"certification": {
|
|
85
|
+
"n": 1,
|
|
86
|
+
"mean_score": 0.0,
|
|
87
|
+
"accuracy": 0.0
|
|
88
|
+
},
|
|
89
|
+
"entity": {
|
|
90
|
+
"n": 1,
|
|
91
|
+
"mean_score": 0.0,
|
|
92
|
+
"accuracy": 0.0
|
|
93
|
+
}
|
|
94
|
+
},
|
|
95
|
+
"extra": {
|
|
96
|
+
"ingest_ms": 45601.26005701022,
|
|
97
|
+
"grading": "substring",
|
|
98
|
+
"limit": 3,
|
|
99
|
+
"tokens": {
|
|
100
|
+
"corpus_tokens": 1845,
|
|
101
|
+
"query_tokens": 271,
|
|
102
|
+
"context_tokens": 1483,
|
|
103
|
+
"retrieval_tokens": 1754,
|
|
104
|
+
"naive_tokens": 33481,
|
|
105
|
+
"saved_tokens": 31727,
|
|
106
|
+
"reduction_pct": 0.9476120784922792,
|
|
107
|
+
"mean_retrieval_tokens_per_task": 97.44444444444444,
|
|
108
|
+
"tokenizer": "cl100k_base",
|
|
109
|
+
"per_task": {
|
|
110
|
+
"sku-atlas-material": {
|
|
111
|
+
"query": 11,
|
|
112
|
+
"context": 169,
|
|
113
|
+
"retrieval": 180,
|
|
114
|
+
"judge_in": 0,
|
|
115
|
+
"judge_out": 0,
|
|
116
|
+
"judge_latency_ms": 0.0
|
|
117
|
+
},
|
|
118
|
+
"sku-luna-capacity": {
|
|
119
|
+
"query": 8,
|
|
120
|
+
"context": 169,
|
|
121
|
+
"retrieval": 177,
|
|
122
|
+
"judge_in": 0,
|
|
123
|
+
"judge_out": 0,
|
|
124
|
+
"judge_latency_ms": 0.0
|
|
125
|
+
},
|
|
126
|
+
"sku-luna-warranty": {
|
|
127
|
+
"query": 9,
|
|
128
|
+
"context": 45,
|
|
129
|
+
"retrieval": 54,
|
|
130
|
+
"judge_in": 0,
|
|
131
|
+
"judge_out": 0,
|
|
132
|
+
"judge_latency_ms": 0.0
|
|
133
|
+
},
|
|
134
|
+
"sku-kite-harness-out-of-stock": {
|
|
135
|
+
"query": 17,
|
|
136
|
+
"context": 37,
|
|
137
|
+
"retrieval": 54,
|
|
138
|
+
"judge_in": 0,
|
|
139
|
+
"judge_out": 0,
|
|
140
|
+
"judge_latency_ms": 0.0
|
|
141
|
+
},
|
|
142
|
+
"sku-kite-harness-restock": {
|
|
143
|
+
"query": 17,
|
|
144
|
+
"context": 209,
|
|
145
|
+
"retrieval": 226,
|
|
146
|
+
"judge_in": 0,
|
|
147
|
+
"judge_out": 0,
|
|
148
|
+
"judge_latency_ms": 0.0
|
|
149
|
+
},
|
|
150
|
+
"sku-tee-discontinued-sizes": {
|
|
151
|
+
"query": 13,
|
|
152
|
+
"context": 194,
|
|
153
|
+
"retrieval": 207,
|
|
154
|
+
"judge_in": 0,
|
|
155
|
+
"judge_out": 0,
|
|
156
|
+
"judge_latency_ms": 0.0
|
|
157
|
+
},
|
|
158
|
+
"sku-sleeve-compat-14-mbp": {
|
|
159
|
+
"query": 16,
|
|
160
|
+
"context": 50,
|
|
161
|
+
"retrieval": 66,
|
|
162
|
+
"judge_in": 0,
|
|
163
|
+
"judge_out": 0,
|
|
164
|
+
"judge_latency_ms": 0.0
|
|
165
|
+
},
|
|
166
|
+
"sku-coffee-cup-microwave": {
|
|
167
|
+
"query": 8,
|
|
168
|
+
"context": 184,
|
|
169
|
+
"retrieval": 192,
|
|
170
|
+
"judge_in": 0,
|
|
171
|
+
"judge_out": 0,
|
|
172
|
+
"judge_latency_ms": 0.0
|
|
173
|
+
},
|
|
174
|
+
"sku-bin-liners-home-compost": {
|
|
175
|
+
"query": 11,
|
|
176
|
+
"context": 23,
|
|
177
|
+
"retrieval": 34,
|
|
178
|
+
"judge_in": 0,
|
|
179
|
+
"judge_out": 0,
|
|
180
|
+
"judge_latency_ms": 0.0
|
|
181
|
+
},
|
|
182
|
+
"policy-custom-tote-return": {
|
|
183
|
+
"query": 10,
|
|
184
|
+
"context": 115,
|
|
185
|
+
"retrieval": 125,
|
|
186
|
+
"judge_in": 0,
|
|
187
|
+
"judge_out": 0,
|
|
188
|
+
"judge_latency_ms": 0.0
|
|
189
|
+
},
|
|
190
|
+
"price-tee-subscription": {
|
|
191
|
+
"query": 14,
|
|
192
|
+
"context": 30,
|
|
193
|
+
"retrieval": 44,
|
|
194
|
+
"judge_in": 0,
|
|
195
|
+
"judge_out": 0,
|
|
196
|
+
"judge_latency_ms": 0.0
|
|
197
|
+
},
|
|
198
|
+
"price-sleeve-diff": {
|
|
199
|
+
"query": 19,
|
|
200
|
+
"context": 46,
|
|
201
|
+
"retrieval": 65,
|
|
202
|
+
"judge_in": 0,
|
|
203
|
+
"judge_out": 0,
|
|
204
|
+
"judge_latency_ms": 0.0
|
|
205
|
+
},
|
|
206
|
+
"multi-hop-kite-duotone": {
|
|
207
|
+
"query": 25,
|
|
208
|
+
"context": 31,
|
|
209
|
+
"retrieval": 56,
|
|
210
|
+
"judge_in": 0,
|
|
211
|
+
"judge_out": 0,
|
|
212
|
+
"judge_latency_ms": 0.0
|
|
213
|
+
},
|
|
214
|
+
"multi-hop-takeback-credit": {
|
|
215
|
+
"query": 23,
|
|
216
|
+
"context": 37,
|
|
217
|
+
"retrieval": 60,
|
|
218
|
+
"judge_in": 0,
|
|
219
|
+
"judge_out": 0,
|
|
220
|
+
"judge_latency_ms": 0.0
|
|
221
|
+
},
|
|
222
|
+
"rubric-recommend-kitesurf-bag": {
|
|
223
|
+
"query": 31,
|
|
224
|
+
"context": 31,
|
|
225
|
+
"retrieval": 62,
|
|
226
|
+
"judge_in": 373,
|
|
227
|
+
"judge_out": 27,
|
|
228
|
+
"judge_latency_ms": 886.3365219831467
|
|
229
|
+
},
|
|
230
|
+
"rubric-plastic-free-tee": {
|
|
231
|
+
"query": 11,
|
|
232
|
+
"context": 41,
|
|
233
|
+
"retrieval": 52,
|
|
234
|
+
"judge_in": 349,
|
|
235
|
+
"judge_out": 27,
|
|
236
|
+
"judge_latency_ms": 861.2519019842148
|
|
237
|
+
},
|
|
238
|
+
"rubric-cc-certified-items": {
|
|
239
|
+
"query": 14,
|
|
240
|
+
"context": 34,
|
|
241
|
+
"retrieval": 48,
|
|
242
|
+
"judge_in": 323,
|
|
243
|
+
"judge_out": 35,
|
|
244
|
+
"judge_latency_ms": 592.7986909747124
|
|
245
|
+
},
|
|
246
|
+
"entity-recycled-content-high": {
|
|
247
|
+
"query": 14,
|
|
248
|
+
"context": 38,
|
|
249
|
+
"retrieval": 52,
|
|
250
|
+
"judge_in": 0,
|
|
251
|
+
"judge_out": 0,
|
|
252
|
+
"judge_latency_ms": 0.0
|
|
253
|
+
}
|
|
254
|
+
},
|
|
255
|
+
"judge_tokens_in": 1045,
|
|
256
|
+
"judge_tokens_out": 89,
|
|
257
|
+
"judge_calls": 3,
|
|
258
|
+
"judge_mean_latency_ms": 780.1290383140246
|
|
259
|
+
},
|
|
260
|
+
"cost_usd": {
|
|
261
|
+
"assumed_completion_tokens_per_task": 100,
|
|
262
|
+
"rates": {
|
|
263
|
+
"input_per_1k": 0.0025,
|
|
264
|
+
"output_per_1k": 0.01,
|
|
265
|
+
"model": "gpt-4o"
|
|
266
|
+
},
|
|
267
|
+
"retrieval_usd_in": 0.004385,
|
|
268
|
+
"retrieval_usd_out": 0.018,
|
|
269
|
+
"retrieval_usd_total": 0.022385,
|
|
270
|
+
"naive_usd_total": 0.1017025,
|
|
271
|
+
"saved_usd": 0.0793175,
|
|
272
|
+
"saved_usd_per_1k_tasks": 4.406527777777778
|
|
273
|
+
}
|
|
274
|
+
},
|
|
275
|
+
"task_results": [
|
|
276
|
+
{
|
|
277
|
+
"task_id": "sku-atlas-material",
|
|
278
|
+
"query": "What's the material composition of the Atlas Phone Shell?",
|
|
279
|
+
"answer": "# Atlas Phone Shell \u2014 SKU ATL-PHN-001\n\n**Brand:** Atlas Circular\n**Category:** Mobile accessories / Phone cases\n**Compatible with:** iPhone 15, iPhone 15 Pro\n\n**Material composition:**\n- 68% recycled ocean-bound polypropylene\n- 24% recycled polycarbonate (post-consumer)\n- 8% natural calcium carbonate\n\n**Price:** \u00a329.00 GBP (standard) / \u00a324.00 (subscription)\n**Inventory:** 1,240 units in stock at UK fulfilment (Worthing DC)\n**Take-back:** Yes \u2014 returnable for 50% credit after 18 months\n**Certification:** Cradle to Cradle Bronze, MADE-BY Class B\n---\nbilling-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex",
|
|
280
|
+
"hits": [
|
|
281
|
+
{
|
|
282
|
+
"text": "# Atlas Phone Shell \u2014 SKU ATL-PHN-001\n\n**Brand:** Atlas Circular\n**Category:** Mobile accessories / Phone cases\n**Compatible with:** iPhone 15, iPhone 15 Pro\n\n**Material composition:**\n- 68% recycled ocean-bound polypropylene\n- 24% recycled polycarbonate (post-consumer)\n- 8% natural calcium carbonate\n\n**Price:** \u00a329.00 GBP (standard) / \u00a324.00 (subscription)\n**Inventory:** 1,240 units in stock at U",
|
|
283
|
+
"score": 0.7315541935691172,
|
|
284
|
+
"source": "pentatonic-memory",
|
|
285
|
+
"doc_id": "sku-atlas-phone-01"
|
|
286
|
+
},
|
|
287
|
+
{
|
|
288
|
+
"text": "billing-svc API keys \u2014 jordan",
|
|
289
|
+
"score": 0.6994823789082102,
|
|
290
|
+
"source": "pentatonic-memory",
|
|
291
|
+
"doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
|
|
292
|
+
},
|
|
293
|
+
{
|
|
294
|
+
"text": "search-svc API keys \u2014 alex",
|
|
295
|
+
"score": 0.6981416575081894,
|
|
296
|
+
"source": "pentatonic-memory",
|
|
297
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
298
|
+
}
|
|
299
|
+
],
|
|
300
|
+
"correct": true,
|
|
301
|
+
"score": 1.0,
|
|
302
|
+
"grading_notes": "all substrings matched",
|
|
303
|
+
"search_time_ms": 32.12930998415686,
|
|
304
|
+
"generation_time_ms": 0.0,
|
|
305
|
+
"tokens_in": 0,
|
|
306
|
+
"tokens_out": 0,
|
|
307
|
+
"retrieval_tokens": 180,
|
|
308
|
+
"query_tokens": 11,
|
|
309
|
+
"context_tokens": 169,
|
|
310
|
+
"judge_tokens_in": 0,
|
|
311
|
+
"judge_tokens_out": 0,
|
|
312
|
+
"judge_latency_ms": 0.0
|
|
313
|
+
},
|
|
314
|
+
{
|
|
315
|
+
"task_id": "sku-luna-capacity",
|
|
316
|
+
"query": "What capacity is the Luna water bottle?",
|
|
317
|
+
"answer": "# Luna Water Bottle 500ml \u2014 SKU LUN-BTL-042\n\n**Brand:** Luna\n**Category:** Drinkware / Bottles\n**Capacity:** 500ml\n\n**Material composition:**\n- 100% food-grade 304 stainless steel (65% recycled content)\n- Lid: bio-based silicone (sugarcane derivative)\n\n**Price:** \u00a318.50 GBP\n**Inventory:** 87 units in stock (UK), 412 units (EU DC Eindhoven)\n**Colours available:** Matte black, Sage green, Terracotta\n**Replacement parts:** Lid (\u00a33.50), Silicone seal (\u00a30.90)\n**Warranty:** Lifetime on body, 2 years on lid\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nbilling.new_invoice_pdf",
|
|
318
|
+
"hits": [
|
|
319
|
+
{
|
|
320
|
+
"text": "# Luna Water Bottle 500ml \u2014 SKU LUN-BTL-042\n\n**Brand:** Luna\n**Category:** Drinkware / Bottles\n**Capacity:** 500ml\n\n**Material composition:**\n- 100% food-grade 304 stainless steel (65% recycled content)\n- Lid: bio-based silicone (sugarcane derivative)\n\n**Price:** \u00a318.50 GBP\n**Inventory:** 87 units in stock (UK), 412 units (EU DC Eindhoven)\n**Colours available:** Matte black, Sage green, Terracotta",
|
|
321
|
+
"score": 0.7476620620852907,
|
|
322
|
+
"source": "pentatonic-memory",
|
|
323
|
+
"doc_id": "sku-luna-bottle-42"
|
|
324
|
+
},
|
|
325
|
+
{
|
|
326
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
327
|
+
"score": 0.6922887155889775,
|
|
328
|
+
"source": "pentatonic-memory",
|
|
329
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
330
|
+
},
|
|
331
|
+
{
|
|
332
|
+
"text": "billing.new_invoice_pdf",
|
|
333
|
+
"score": 0.6899837133948777,
|
|
334
|
+
"source": "pentatonic-memory",
|
|
335
|
+
"doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
|
|
336
|
+
}
|
|
337
|
+
],
|
|
338
|
+
"correct": true,
|
|
339
|
+
"score": 1.0,
|
|
340
|
+
"grading_notes": "all substrings matched",
|
|
341
|
+
"search_time_ms": 28.959120012586936,
|
|
342
|
+
"generation_time_ms": 0.0,
|
|
343
|
+
"tokens_in": 0,
|
|
344
|
+
"tokens_out": 0,
|
|
345
|
+
"retrieval_tokens": 177,
|
|
346
|
+
"query_tokens": 8,
|
|
347
|
+
"context_tokens": 169,
|
|
348
|
+
"judge_tokens_in": 0,
|
|
349
|
+
"judge_tokens_out": 0,
|
|
350
|
+
"judge_latency_ms": 0.0
|
|
351
|
+
},
|
|
352
|
+
{
|
|
353
|
+
"task_id": "sku-luna-warranty",
|
|
354
|
+
"query": "What's the warranty on the Luna bottle?",
|
|
355
|
+
"answer": "Reason: defective lid on Luna Water Bottle 500ml.\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
356
|
+
"hits": [
|
|
357
|
+
{
|
|
358
|
+
"text": "Reason: defective lid on Luna Water Bottle 500ml.",
|
|
359
|
+
"score": 0.6973111644759994,
|
|
360
|
+
"source": "pentatonic-memory",
|
|
361
|
+
"doc_id": "mem_5c7f6891-f63a-4f9b-bd22-f9cd3494ece2"
|
|
362
|
+
},
|
|
363
|
+
{
|
|
364
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
365
|
+
"score": 0.6928739599750502,
|
|
366
|
+
"source": "pentatonic-memory",
|
|
367
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
368
|
+
},
|
|
369
|
+
{
|
|
370
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
371
|
+
"score": 0.6923489833962405,
|
|
372
|
+
"source": "pentatonic-memory",
|
|
373
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
374
|
+
}
|
|
375
|
+
],
|
|
376
|
+
"correct": false,
|
|
377
|
+
"score": 0.25,
|
|
378
|
+
"grading_notes": "missing 3/4: ['Lifetime', 'body', '2 years']",
|
|
379
|
+
"search_time_ms": 36.634856020100415,
|
|
380
|
+
"generation_time_ms": 0.0,
|
|
381
|
+
"tokens_in": 0,
|
|
382
|
+
"tokens_out": 0,
|
|
383
|
+
"retrieval_tokens": 54,
|
|
384
|
+
"query_tokens": 9,
|
|
385
|
+
"context_tokens": 45,
|
|
386
|
+
"judge_tokens_in": 0,
|
|
387
|
+
"judge_tokens_out": 0,
|
|
388
|
+
"judge_latency_ms": 0.0
|
|
389
|
+
},
|
|
390
|
+
{
|
|
391
|
+
"task_id": "sku-kite-harness-out-of-stock",
|
|
392
|
+
"query": "Which size of the Nomad Kite Harness v3 is currently out of stock?",
|
|
393
|
+
"answer": "Nomad Kite Harness v3 M sold out\n---\nsearch-svc API keys \u2014 alex\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
394
|
+
"hits": [
|
|
395
|
+
{
|
|
396
|
+
"text": "Nomad Kite Harness v3 M sold out",
|
|
397
|
+
"score": 0.7009975219550995,
|
|
398
|
+
"source": "pentatonic-memory",
|
|
399
|
+
"doc_id": "inventory-update-2026-04-15"
|
|
400
|
+
},
|
|
401
|
+
{
|
|
402
|
+
"text": "search-svc API keys \u2014 alex",
|
|
403
|
+
"score": 0.6928082686168076,
|
|
404
|
+
"source": "pentatonic-memory",
|
|
405
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
406
|
+
},
|
|
407
|
+
{
|
|
408
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
409
|
+
"score": 0.6920729476713338,
|
|
410
|
+
"source": "pentatonic-memory",
|
|
411
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
412
|
+
}
|
|
413
|
+
],
|
|
414
|
+
"correct": true,
|
|
415
|
+
"score": 1.0,
|
|
416
|
+
"grading_notes": "all substrings matched",
|
|
417
|
+
"search_time_ms": 39.145407994510606,
|
|
418
|
+
"generation_time_ms": 0.0,
|
|
419
|
+
"tokens_in": 0,
|
|
420
|
+
"tokens_out": 0,
|
|
421
|
+
"retrieval_tokens": 54,
|
|
422
|
+
"query_tokens": 17,
|
|
423
|
+
"context_tokens": 37,
|
|
424
|
+
"judge_tokens_in": 0,
|
|
425
|
+
"judge_tokens_out": 0,
|
|
426
|
+
"judge_latency_ms": 0.0
|
|
427
|
+
},
|
|
428
|
+
{
|
|
429
|
+
"task_id": "sku-kite-harness-restock",
|
|
430
|
+
"query": "When does the Nomad Kite Harness v3 restock for XS and S?",
|
|
431
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\n# Nomad Kite Harness v3 \u2014 SKU NMD-HRN-V3\n\n**Brand:** Nomad Sports\n**Category:** Watersports / Kite surfing\n**Size range:** XS (26-28\"), S (28-30\"), M (30-32\"), L (32-34\"), XL (34-36\")\n\n**Material composition:**\n- Outer shell: 92% recycled nylon 6,6 (ghost-net source)\n- Foam padding: EVA (non-recycled)\n- Hardware: 316 stainless steel spreader bar\n\n**Price:** \u00a3145.00 GBP\n**Inventory:** M=out of stock, L=6 units, XL=12 units, XS/S=restock 2026-05-15\n**Repair service:** \u00a325 flat fee, turnaround 14 days\n**Compatibility:** F-One bar systems, North bar systems (adapter sold separately)\n---\nkubectl rollout restart deployment/billing-svc",
|
|
432
|
+
"hits": [
|
|
433
|
+
{
|
|
434
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
435
|
+
"score": 0.7336013286272791,
|
|
436
|
+
"source": "pentatonic-memory",
|
|
437
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
438
|
+
},
|
|
439
|
+
{
|
|
440
|
+
"text": "# Nomad Kite Harness v3 \u2014 SKU NMD-HRN-V3\n\n**Brand:** Nomad Sports\n**Category:** Watersports / Kite surfing\n**Size range:** XS (26-28\"), S (28-30\"), M (30-32\"), L (32-34\"), XL (34-36\")\n\n**Material composition:**\n- Outer shell: 92% recycled nylon 6,6 (ghost-net source)\n- Foam padding: EVA (non-recycled)\n- Hardware: 316 stainless steel spreader bar\n\n**Price:** \u00a3145.00 GBP\n**Inventory:** M=out of stoc",
|
|
441
|
+
"score": 0.7293507801903829,
|
|
442
|
+
"source": "pentatonic-memory",
|
|
443
|
+
"doc_id": "sku-kite-harness-v3"
|
|
444
|
+
},
|
|
445
|
+
{
|
|
446
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
447
|
+
"score": 0.7200327904165044,
|
|
448
|
+
"source": "pentatonic-memory",
|
|
449
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
450
|
+
}
|
|
451
|
+
],
|
|
452
|
+
"correct": true,
|
|
453
|
+
"score": 1.0,
|
|
454
|
+
"grading_notes": "all substrings matched",
|
|
455
|
+
"search_time_ms": 34.503188013331965,
|
|
456
|
+
"generation_time_ms": 0.0,
|
|
457
|
+
"tokens_in": 0,
|
|
458
|
+
"tokens_out": 0,
|
|
459
|
+
"retrieval_tokens": 226,
|
|
460
|
+
"query_tokens": 17,
|
|
461
|
+
"context_tokens": 209,
|
|
462
|
+
"judge_tokens_in": 0,
|
|
463
|
+
"judge_tokens_out": 0,
|
|
464
|
+
"judge_latency_ms": 0.0
|
|
465
|
+
},
|
|
466
|
+
{
|
|
467
|
+
"task_id": "sku-tee-discontinued-sizes",
|
|
468
|
+
"query": "Which sizes of the Pentatonic Crew Tee Black are discontinued?",
|
|
469
|
+
"answer": "# Pentatonic Crew Tee Black \u2014 SKU PTT-TEE-BLK\n\n**Brand:** Pentatonic\n**Category:** Apparel / T-shirts\n**Fit:** Unisex regular\n**Size range:** XS, S, M, L, XL, XXL\n\n**Material composition:**\n- 100% GOTS-certified organic cotton, 180 gsm\n- Dyeing: low-impact reactive dye, closed-loop water system\n\n**Price:** \u00a328.00 GBP\n**Inventory:** S=48, M=136, L=94, XL=22, XS=0 (discontinued), XXL=0 (discontinued)\n**Country of origin:** Portugal (CITEVE-certified mill)\n**Care:** Wash cold, line dry; no tumble dry\n---\nPentatonic Crew Tee Black XXS/XXL discontinued\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
470
|
+
"hits": [
|
|
471
|
+
{
|
|
472
|
+
"text": "# Pentatonic Crew Tee Black \u2014 SKU PTT-TEE-BLK\n\n**Brand:** Pentatonic\n**Category:** Apparel / T-shirts\n**Fit:** Unisex regular\n**Size range:** XS, S, M, L, XL, XXL\n\n**Material composition:**\n- 100% GOTS-certified organic cotton, 180 gsm\n- Dyeing: low-impact reactive dye, closed-loop water system\n\n**Price:** \u00a328.00 GBP\n**Inventory:** S=48, M=136, L=94, XL=22, XS=0 (discontinued), XXL=0 (discontinued",
|
|
473
|
+
"score": 0.7419536682983361,
|
|
474
|
+
"source": "pentatonic-memory",
|
|
475
|
+
"doc_id": "sku-cotton-tee-black"
|
|
476
|
+
},
|
|
477
|
+
{
|
|
478
|
+
"text": "Pentatonic Crew Tee Black XXS/XXL discontinued",
|
|
479
|
+
"score": 0.721743907904692,
|
|
480
|
+
"source": "pentatonic-memory",
|
|
481
|
+
"doc_id": "inventory-update-2026-04-15"
|
|
482
|
+
},
|
|
483
|
+
{
|
|
484
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
485
|
+
"score": 0.7033698130789997,
|
|
486
|
+
"source": "pentatonic-memory",
|
|
487
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
488
|
+
}
|
|
489
|
+
],
|
|
490
|
+
"correct": true,
|
|
491
|
+
"score": 1.0,
|
|
492
|
+
"grading_notes": "all substrings matched",
|
|
493
|
+
"search_time_ms": 24.991354992380366,
|
|
494
|
+
"generation_time_ms": 0.0,
|
|
495
|
+
"tokens_in": 0,
|
|
496
|
+
"tokens_out": 0,
|
|
497
|
+
"retrieval_tokens": 207,
|
|
498
|
+
"query_tokens": 13,
|
|
499
|
+
"context_tokens": 194,
|
|
500
|
+
"judge_tokens_in": 0,
|
|
501
|
+
"judge_tokens_out": 0,
|
|
502
|
+
"judge_latency_ms": 0.0
|
|
503
|
+
},
|
|
504
|
+
{
|
|
505
|
+
"task_id": "sku-sleeve-compat-14-mbp",
|
|
506
|
+
"query": "Does the Haven Laptop Sleeve 13\" fit a 14\" MacBook Pro?",
|
|
507
|
+
"answer": "The Haven Laptop Sleeve does not fit a 14-inch MacBook Pro due to third-party dimensions exceeding 31.5cm.\n---\nThe user's laptop sleeve size is compatible with a 14-inch MacBook Pro.\n---\nsearch-svc API keys \u2014 alex",
|
|
508
|
+
"hits": [
|
|
509
|
+
{
|
|
510
|
+
"text": "The Haven Laptop Sleeve does not fit a 14-inch MacBook Pro due to third-party dimensions exceeding 31.5cm.",
|
|
511
|
+
"score": 0.7261165299002014,
|
|
512
|
+
"source": "pentatonic-memory",
|
|
513
|
+
"doc_id": "compat-laptop-sleeve-macbook-pro-14"
|
|
514
|
+
},
|
|
515
|
+
{
|
|
516
|
+
"text": "The user's laptop sleeve size is compatible with a 14-inch MacBook Pro.",
|
|
517
|
+
"score": 0.7027898525404869,
|
|
518
|
+
"source": "pentatonic-memory",
|
|
519
|
+
"doc_id": "compat-laptop-sleeve-macbook-pro-14"
|
|
520
|
+
},
|
|
521
|
+
{
|
|
522
|
+
"text": "search-svc API keys \u2014 alex",
|
|
523
|
+
"score": 0.6821494342438669,
|
|
524
|
+
"source": "pentatonic-memory",
|
|
525
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
526
|
+
}
|
|
527
|
+
],
|
|
528
|
+
"correct": false,
|
|
529
|
+
"score": 0.5,
|
|
530
|
+
"grading_notes": "missing 1/2: ['15']",
|
|
531
|
+
"search_time_ms": 30.218572006560862,
|
|
532
|
+
"generation_time_ms": 0.0,
|
|
533
|
+
"tokens_in": 0,
|
|
534
|
+
"tokens_out": 0,
|
|
535
|
+
"retrieval_tokens": 66,
|
|
536
|
+
"query_tokens": 16,
|
|
537
|
+
"context_tokens": 50,
|
|
538
|
+
"judge_tokens_in": 0,
|
|
539
|
+
"judge_tokens_out": 0,
|
|
540
|
+
"judge_latency_ms": 0.0
|
|
541
|
+
},
|
|
542
|
+
{
|
|
543
|
+
"task_id": "sku-coffee-cup-microwave",
|
|
544
|
+
"query": "Can I microwave the Loop coffee cup?",
|
|
545
|
+
"answer": "# Loop Reusable Coffee Cup 8oz \u2014 SKU LP-CUP-008\n\n**Brand:** Loop\n**Category:** Drinkware / Coffee\n**Capacity:** 8oz (240ml)\n\n**Material composition:**\n- Body: 100% plant-based PLA (corn-derived)\n- Silicone sleeve: food-grade silicone (not bio-based)\n- Lid: BPA-free polypropylene (recycled post-industrial)\n\n**Price:** \u00a312.50 GBP\n**Inventory:** 2,104 units UK, 1,876 units EU\n**Dishwasher safe:** Top rack only\n**Lifespan:** ~2 years regular use, then return for closed-loop composting\n**Note:** Not microwave-safe due to PLA degradation above 60\u00b0C.\n---\nUsed in: Loop Reusable Coffee Cup body\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
546
|
+
"hits": [
|
|
547
|
+
{
|
|
548
|
+
"text": "# Loop Reusable Coffee Cup 8oz \u2014 SKU LP-CUP-008\n\n**Brand:** Loop\n**Category:** Drinkware / Coffee\n**Capacity:** 8oz (240ml)\n\n**Material composition:**\n- Body: 100% plant-based PLA (corn-derived)\n- Silicone sleeve: food-grade silicone (not bio-based)\n- Lid: BPA-free polypropylene (recycled post-industrial)\n\n**Price:** \u00a312.50 GBP\n**Inventory:** 2,104 units UK, 1,876 units EU\n**Dishwasher safe:** Top",
|
|
549
|
+
"score": 0.7047915334903309,
|
|
550
|
+
"source": "pentatonic-memory",
|
|
551
|
+
"doc_id": "sku-coffee-cup-reusable"
|
|
552
|
+
},
|
|
553
|
+
{
|
|
554
|
+
"text": "Used in: Loop Reusable Coffee Cup body",
|
|
555
|
+
"score": 0.7036784559543265,
|
|
556
|
+
"source": "pentatonic-memory",
|
|
557
|
+
"doc_id": "mem_a4307cc8-b064-4449-8dc7-033eb22e3eb1"
|
|
558
|
+
},
|
|
559
|
+
{
|
|
560
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
561
|
+
"score": 0.6979665008354474,
|
|
562
|
+
"source": "pentatonic-memory",
|
|
563
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
564
|
+
}
|
|
565
|
+
],
|
|
566
|
+
"correct": true,
|
|
567
|
+
"score": 1.0,
|
|
568
|
+
"grading_notes": "all substrings matched",
|
|
569
|
+
"search_time_ms": 26.823068008525297,
|
|
570
|
+
"generation_time_ms": 0.0,
|
|
571
|
+
"tokens_in": 0,
|
|
572
|
+
"tokens_out": 0,
|
|
573
|
+
"retrieval_tokens": 192,
|
|
574
|
+
"query_tokens": 8,
|
|
575
|
+
"context_tokens": 184,
|
|
576
|
+
"judge_tokens_in": 0,
|
|
577
|
+
"judge_tokens_out": 0,
|
|
578
|
+
"judge_latency_ms": 0.0
|
|
579
|
+
},
|
|
580
|
+
{
|
|
581
|
+
"task_id": "sku-bin-liners-home-compost",
|
|
582
|
+
"query": "Are the Root bio bin liners home-compostable?",
|
|
583
|
+
"answer": "Used in Root Bio Bin Liners\n---\nNOT home-compostable\n---\nHome-compostability requires industrial temperatures",
|
|
584
|
+
"hits": [
|
|
585
|
+
{
|
|
586
|
+
"text": "Used in Root Bio Bin Liners",
|
|
587
|
+
"score": 0.709445264469953,
|
|
588
|
+
"source": "pentatonic-memory",
|
|
589
|
+
"doc_id": "mem_88bd5ef3-ac4c-4d99-850c-8c45aab4f877"
|
|
590
|
+
},
|
|
591
|
+
{
|
|
592
|
+
"text": "NOT home-compostable",
|
|
593
|
+
"score": 0.6873024809704998,
|
|
594
|
+
"source": "pentatonic-memory",
|
|
595
|
+
"doc_id": "mem_5075671c-8fd8-49e7-9587-485abf7b2f82"
|
|
596
|
+
},
|
|
597
|
+
{
|
|
598
|
+
"text": "Home-compostability requires industrial temperatures",
|
|
599
|
+
"score": 0.6824549395669162,
|
|
600
|
+
"source": "pentatonic-memory",
|
|
601
|
+
"doc_id": "mem_f6c7f0fd-1c5d-421f-93ce-c6d1c31e583c"
|
|
602
|
+
}
|
|
603
|
+
],
|
|
604
|
+
"correct": true,
|
|
605
|
+
"score": 1.0,
|
|
606
|
+
"grading_notes": "all substrings matched",
|
|
607
|
+
"search_time_ms": 25.088716007303447,
|
|
608
|
+
"generation_time_ms": 0.0,
|
|
609
|
+
"tokens_in": 0,
|
|
610
|
+
"tokens_out": 0,
|
|
611
|
+
"retrieval_tokens": 34,
|
|
612
|
+
"query_tokens": 11,
|
|
613
|
+
"context_tokens": 23,
|
|
614
|
+
"judge_tokens_in": 0,
|
|
615
|
+
"judge_tokens_out": 0,
|
|
616
|
+
"judge_latency_ms": 0.0
|
|
617
|
+
},
|
|
618
|
+
{
|
|
619
|
+
"task_id": "policy-custom-tote-return",
|
|
620
|
+
"query": "Can I return a custom-printed tote bag?",
|
|
621
|
+
"answer": "# Returns policy\n\nAll unused items returnable within 30 days of delivery for full refund.\nItems returned after 30 days but under 90 days: store credit only.\nTake-back programme is separate \u2014 applies to any lifetime-warrantied item at end of useful life.\nExceptions: custom-printed items (e.g. custom tote bags) and consumables (e.g. bin liners) are final sale.\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
|
|
622
|
+
"hits": [
|
|
623
|
+
{
|
|
624
|
+
"text": "# Returns policy\n\nAll unused items returnable within 30 days of delivery for full refund.\nItems returned after 30 days but under 90 days: store credit only.\nTake-back programme is separate \u2014 applies to any lifetime-warrantied item at end of useful life.\nExceptions: custom-printed items (e.g. custom tote bags) and consumables (e.g. bin liners) are final sale.\n",
|
|
625
|
+
"score": 0.7318615432069925,
|
|
626
|
+
"source": "pentatonic-memory",
|
|
627
|
+
"doc_id": "policy-returns-30day"
|
|
628
|
+
},
|
|
629
|
+
{
|
|
630
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
631
|
+
"score": 0.7025939344209113,
|
|
632
|
+
"source": "pentatonic-memory",
|
|
633
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
634
|
+
},
|
|
635
|
+
{
|
|
636
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
637
|
+
"score": 0.685714168675108,
|
|
638
|
+
"source": "pentatonic-memory",
|
|
639
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
640
|
+
}
|
|
641
|
+
],
|
|
642
|
+
"correct": true,
|
|
643
|
+
"score": 1.0,
|
|
644
|
+
"grading_notes": "all substrings matched",
|
|
645
|
+
"search_time_ms": 27.799732983112335,
|
|
646
|
+
"generation_time_ms": 0.0,
|
|
647
|
+
"tokens_in": 0,
|
|
648
|
+
"tokens_out": 0,
|
|
649
|
+
"retrieval_tokens": 125,
|
|
650
|
+
"query_tokens": 10,
|
|
651
|
+
"context_tokens": 115,
|
|
652
|
+
"judge_tokens_in": 0,
|
|
653
|
+
"judge_tokens_out": 0,
|
|
654
|
+
"judge_latency_ms": 0.0
|
|
655
|
+
},
|
|
656
|
+
{
|
|
657
|
+
"task_id": "price-tee-subscription",
|
|
658
|
+
"query": "What's the subscription price for the Luna water bottle compared to standard?",
|
|
659
|
+
"answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nbilling.new_invoice_pdf\n---\nbilling-svc API keys \u2014 jordan",
|
|
660
|
+
"hits": [
|
|
661
|
+
{
|
|
662
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
663
|
+
"score": 0.7187337758022995,
|
|
664
|
+
"source": "pentatonic-memory",
|
|
665
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
666
|
+
},
|
|
667
|
+
{
|
|
668
|
+
"text": "billing.new_invoice_pdf",
|
|
669
|
+
"score": 0.7017504939666374,
|
|
670
|
+
"source": "pentatonic-memory",
|
|
671
|
+
"doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
|
|
672
|
+
},
|
|
673
|
+
{
|
|
674
|
+
"text": "billing-svc API keys \u2014 jordan",
|
|
675
|
+
"score": 0.6970291528428944,
|
|
676
|
+
"source": "pentatonic-memory",
|
|
677
|
+
"doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
|
|
678
|
+
}
|
|
679
|
+
],
|
|
680
|
+
"correct": false,
|
|
681
|
+
"score": 0.0,
|
|
682
|
+
"grading_notes": "missing 1/1: ['18.50']",
|
|
683
|
+
"search_time_ms": 31.271957996068522,
|
|
684
|
+
"generation_time_ms": 0.0,
|
|
685
|
+
"tokens_in": 0,
|
|
686
|
+
"tokens_out": 0,
|
|
687
|
+
"retrieval_tokens": 44,
|
|
688
|
+
"query_tokens": 14,
|
|
689
|
+
"context_tokens": 30,
|
|
690
|
+
"judge_tokens_in": 0,
|
|
691
|
+
"judge_tokens_out": 0,
|
|
692
|
+
"judge_latency_ms": 0.0
|
|
693
|
+
},
|
|
694
|
+
{
|
|
695
|
+
"task_id": "price-sleeve-diff",
|
|
696
|
+
"query": "How much more does the 15\" Haven Laptop Sleeve cost than the 13\" variant?",
|
|
697
|
+
"answer": "15\" laptop sleeve\n---\nThe Haven Laptop Sleeve does not fit a 14-inch MacBook Pro due to third-party dimensions exceeding 31.5cm.\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
|
|
698
|
+
"hits": [
|
|
699
|
+
{
|
|
700
|
+
"text": "15\" laptop sleeve",
|
|
701
|
+
"score": 0.6990206555450599,
|
|
702
|
+
"source": "pentatonic-memory",
|
|
703
|
+
"doc_id": "mem_8fb1c0b1-9041-4204-ae86-ffedba958404"
|
|
704
|
+
},
|
|
705
|
+
{
|
|
706
|
+
"text": "The Haven Laptop Sleeve does not fit a 14-inch MacBook Pro due to third-party dimensions exceeding 31.5cm.",
|
|
707
|
+
"score": 0.6986175180116678,
|
|
708
|
+
"source": "pentatonic-memory",
|
|
709
|
+
"doc_id": "compat-laptop-sleeve-macbook-pro-14"
|
|
710
|
+
},
|
|
711
|
+
{
|
|
712
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
713
|
+
"score": 0.6858934356046972,
|
|
714
|
+
"source": "pentatonic-memory",
|
|
715
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
716
|
+
}
|
|
717
|
+
],
|
|
718
|
+
"correct": false,
|
|
719
|
+
"score": 0.0,
|
|
720
|
+
"grading_notes": "missing 1/1: ['10']",
|
|
721
|
+
"search_time_ms": 28.63598198746331,
|
|
722
|
+
"generation_time_ms": 0.0,
|
|
723
|
+
"tokens_in": 0,
|
|
724
|
+
"tokens_out": 0,
|
|
725
|
+
"retrieval_tokens": 65,
|
|
726
|
+
"query_tokens": 19,
|
|
727
|
+
"context_tokens": 46,
|
|
728
|
+
"judge_tokens_in": 0,
|
|
729
|
+
"judge_tokens_out": 0,
|
|
730
|
+
"judge_latency_ms": 0.0
|
|
731
|
+
},
|
|
732
|
+
{
|
|
733
|
+
"task_id": "multi-hop-kite-duotone",
|
|
734
|
+
"query": "I have a Duotone bar \u2014 will the Nomad Kite Harness v3 work with it out of the box?",
|
|
735
|
+
"answer": "billing-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc",
|
|
736
|
+
"hits": [
|
|
737
|
+
{
|
|
738
|
+
"text": "billing-svc API keys \u2014 jordan",
|
|
739
|
+
"score": 0.6965538098263404,
|
|
740
|
+
"source": "pentatonic-memory",
|
|
741
|
+
"doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
|
|
742
|
+
},
|
|
743
|
+
{
|
|
744
|
+
"text": "search-svc API keys \u2014 alex",
|
|
745
|
+
"score": 0.6864974270749059,
|
|
746
|
+
"source": "pentatonic-memory",
|
|
747
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
748
|
+
},
|
|
749
|
+
{
|
|
750
|
+
"text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
|
|
751
|
+
"score": 0.6749246946585903,
|
|
752
|
+
"source": "pentatonic-memory",
|
|
753
|
+
"doc_id": "mem_76c60bf6-9327-4cd1-9f56-234d229e8c8a"
|
|
754
|
+
}
|
|
755
|
+
],
|
|
756
|
+
"correct": false,
|
|
757
|
+
"score": 0.0,
|
|
758
|
+
"grading_notes": "missing 2/2: ['adapter', 'Duotone']",
|
|
759
|
+
"search_time_ms": 32.39788900827989,
|
|
760
|
+
"generation_time_ms": 0.0,
|
|
761
|
+
"tokens_in": 0,
|
|
762
|
+
"tokens_out": 0,
|
|
763
|
+
"retrieval_tokens": 56,
|
|
764
|
+
"query_tokens": 25,
|
|
765
|
+
"context_tokens": 31,
|
|
766
|
+
"judge_tokens_in": 0,
|
|
767
|
+
"judge_tokens_out": 0,
|
|
768
|
+
"judge_latency_ms": 0.0
|
|
769
|
+
},
|
|
770
|
+
{
|
|
771
|
+
"task_id": "multi-hop-takeback-credit",
|
|
772
|
+
"query": "If I return an Atlas Phone Shell via take-back after 2 years, how much store credit do I get?",
|
|
773
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc\n---\nemail-svc consumes OrderPaid for the receipt email",
|
|
774
|
+
"hits": [
|
|
775
|
+
{
|
|
776
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
777
|
+
"score": 0.7258603503673935,
|
|
778
|
+
"source": "pentatonic-memory",
|
|
779
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
780
|
+
},
|
|
781
|
+
{
|
|
782
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
783
|
+
"score": 0.713401687368604,
|
|
784
|
+
"source": "pentatonic-memory",
|
|
785
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
786
|
+
},
|
|
787
|
+
{
|
|
788
|
+
"text": "email-svc consumes OrderPaid for the receipt email",
|
|
789
|
+
"score": 0.7101615945965846,
|
|
790
|
+
"source": "pentatonic-memory",
|
|
791
|
+
"doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
|
|
792
|
+
}
|
|
793
|
+
],
|
|
794
|
+
"correct": false,
|
|
795
|
+
"score": 0.0,
|
|
796
|
+
"grading_notes": "missing 2/2: ['14.50', '50%']",
|
|
797
|
+
"search_time_ms": 31.55402498668991,
|
|
798
|
+
"generation_time_ms": 0.0,
|
|
799
|
+
"tokens_in": 0,
|
|
800
|
+
"tokens_out": 0,
|
|
801
|
+
"retrieval_tokens": 60,
|
|
802
|
+
"query_tokens": 23,
|
|
803
|
+
"context_tokens": 37,
|
|
804
|
+
"judge_tokens_in": 0,
|
|
805
|
+
"judge_tokens_out": 0,
|
|
806
|
+
"judge_latency_ms": 0.0
|
|
807
|
+
},
|
|
808
|
+
{
|
|
809
|
+
"task_id": "rubric-recommend-kitesurf-bag",
|
|
810
|
+
"query": "I'm heading to Worthing for a kite session and need to carry my 15\" MacBook and some kit. What would you recommend from the catalogue?",
|
|
811
|
+
"answer": "billing-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc",
|
|
812
|
+
"hits": [
|
|
813
|
+
{
|
|
814
|
+
"text": "billing-svc API keys \u2014 jordan",
|
|
815
|
+
"score": 0.6698083530519248,
|
|
816
|
+
"source": "pentatonic-memory",
|
|
817
|
+
"doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
|
|
818
|
+
},
|
|
819
|
+
{
|
|
820
|
+
"text": "search-svc API keys \u2014 alex",
|
|
821
|
+
"score": 0.6672376864393237,
|
|
822
|
+
"source": "pentatonic-memory",
|
|
823
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
824
|
+
},
|
|
825
|
+
{
|
|
826
|
+
"text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
|
|
827
|
+
"score": 0.6639449282102663,
|
|
828
|
+
"source": "pentatonic-memory",
|
|
829
|
+
"doc_id": "mem_76c60bf6-9327-4cd1-9f56-234d229e8c8a"
|
|
830
|
+
}
|
|
831
|
+
],
|
|
832
|
+
"correct": false,
|
|
833
|
+
"score": 0.0,
|
|
834
|
+
"grading_notes": "judge=0.00 \u2014 The agent's answer does not address the task query at all.",
|
|
835
|
+
"search_time_ms": 33.78428498399444,
|
|
836
|
+
"generation_time_ms": 0.0,
|
|
837
|
+
"tokens_in": 0,
|
|
838
|
+
"tokens_out": 0,
|
|
839
|
+
"retrieval_tokens": 62,
|
|
840
|
+
"query_tokens": 31,
|
|
841
|
+
"context_tokens": 31,
|
|
842
|
+
"judge_tokens_in": 373,
|
|
843
|
+
"judge_tokens_out": 27,
|
|
844
|
+
"judge_latency_ms": 886.3365219831467
|
|
845
|
+
},
|
|
846
|
+
{
|
|
847
|
+
"task_id": "rubric-plastic-free-tee",
|
|
848
|
+
"query": "Which tees in the catalogue are completely plastic-free?",
|
|
849
|
+
"answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nsearch-svc API keys \u2014 alex\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
850
|
+
"hits": [
|
|
851
|
+
{
|
|
852
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
853
|
+
"score": 0.6917861334892272,
|
|
854
|
+
"source": "pentatonic-memory",
|
|
855
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
856
|
+
},
|
|
857
|
+
{
|
|
858
|
+
"text": "search-svc API keys \u2014 alex",
|
|
859
|
+
"score": 0.6903066977993149,
|
|
860
|
+
"source": "pentatonic-memory",
|
|
861
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
862
|
+
},
|
|
863
|
+
{
|
|
864
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
865
|
+
"score": 0.6798537660432268,
|
|
866
|
+
"source": "pentatonic-memory",
|
|
867
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
868
|
+
}
|
|
869
|
+
],
|
|
870
|
+
"correct": false,
|
|
871
|
+
"score": 0.0,
|
|
872
|
+
"grading_notes": "judge=0.00 \u2014 The agent's answer does not address the task query at all.",
|
|
873
|
+
"search_time_ms": 29.365812020841986,
|
|
874
|
+
"generation_time_ms": 0.0,
|
|
875
|
+
"tokens_in": 0,
|
|
876
|
+
"tokens_out": 0,
|
|
877
|
+
"retrieval_tokens": 52,
|
|
878
|
+
"query_tokens": 11,
|
|
879
|
+
"context_tokens": 41,
|
|
880
|
+
"judge_tokens_in": 349,
|
|
881
|
+
"judge_tokens_out": 27,
|
|
882
|
+
"judge_latency_ms": 861.2519019842148
|
|
883
|
+
},
|
|
884
|
+
{
|
|
885
|
+
"task_id": "rubric-cc-certified-items",
|
|
886
|
+
"query": "Do any items in the catalogue have Cradle to Cradle certification?",
|
|
887
|
+
"answer": "Cradle to Cradle Certified\n---\nsearch-svc API keys \u2014 alex\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
888
|
+
"hits": [
|
|
889
|
+
{
|
|
890
|
+
"text": "Cradle to Cradle Certified",
|
|
891
|
+
"score": 0.7258294892094391,
|
|
892
|
+
"source": "pentatonic-memory",
|
|
893
|
+
"doc_id": "mem_0ce4a44a-3050-4463-a05c-9138b1524776"
|
|
894
|
+
},
|
|
895
|
+
{
|
|
896
|
+
"text": "search-svc API keys \u2014 alex",
|
|
897
|
+
"score": 0.7097413775359073,
|
|
898
|
+
"source": "pentatonic-memory",
|
|
899
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
900
|
+
},
|
|
901
|
+
{
|
|
902
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
903
|
+
"score": 0.7083036450592873,
|
|
904
|
+
"source": "pentatonic-memory",
|
|
905
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
906
|
+
}
|
|
907
|
+
],
|
|
908
|
+
"correct": false,
|
|
909
|
+
"score": 0.0,
|
|
910
|
+
"grading_notes": "judge=0.00 \u2014 The answer does not specify the tier of certification, which is a required detail according to the rubric.",
|
|
911
|
+
"search_time_ms": 27.729604014893994,
|
|
912
|
+
"generation_time_ms": 0.0,
|
|
913
|
+
"tokens_in": 0,
|
|
914
|
+
"tokens_out": 0,
|
|
915
|
+
"retrieval_tokens": 48,
|
|
916
|
+
"query_tokens": 14,
|
|
917
|
+
"context_tokens": 34,
|
|
918
|
+
"judge_tokens_in": 323,
|
|
919
|
+
"judge_tokens_out": 35,
|
|
920
|
+
"judge_latency_ms": 592.7986909747124
|
|
921
|
+
},
|
|
922
|
+
{
|
|
923
|
+
"task_id": "entity-recycled-content-high",
|
|
924
|
+
"query": "Which items in the catalogue use at least 50% recycled material?",
|
|
925
|
+
"answer": "email-svc consumes OrderPaid for the receipt email\n---\nreporting-svc consumes OrderPaid for financial reports\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
|
|
926
|
+
"hits": [
|
|
927
|
+
{
|
|
928
|
+
"text": "email-svc consumes OrderPaid for the receipt email",
|
|
929
|
+
"score": 0.7125966165026759,
|
|
930
|
+
"source": "pentatonic-memory",
|
|
931
|
+
"doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
|
|
932
|
+
},
|
|
933
|
+
{
|
|
934
|
+
"text": "reporting-svc consumes OrderPaid for financial reports",
|
|
935
|
+
"score": 0.7119136086269425,
|
|
936
|
+
"source": "pentatonic-memory",
|
|
937
|
+
"doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
|
|
938
|
+
},
|
|
939
|
+
{
|
|
940
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
941
|
+
"score": 0.7070923566256655,
|
|
942
|
+
"source": "pentatonic-memory",
|
|
943
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
944
|
+
}
|
|
945
|
+
],
|
|
946
|
+
"correct": false,
|
|
947
|
+
"score": 0.0,
|
|
948
|
+
"grading_notes": "no expected_substrings set",
|
|
949
|
+
"search_time_ms": 26.608890009811148,
|
|
950
|
+
"generation_time_ms": 0.0,
|
|
951
|
+
"tokens_in": 0,
|
|
952
|
+
"tokens_out": 0,
|
|
953
|
+
"retrieval_tokens": 52,
|
|
954
|
+
"query_tokens": 14,
|
|
955
|
+
"context_tokens": 38,
|
|
956
|
+
"judge_tokens_in": 0,
|
|
957
|
+
"judge_tokens_out": 0,
|
|
958
|
+
"judge_latency_ms": 0.0
|
|
959
|
+
}
|
|
960
|
+
]
|
|
961
|
+
}
|