@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +178 -69
- package/bin/__tests__/callback-server.test.js +4 -1
- package/bin/cli.js +41 -164
- package/bin/commands/config.js +251 -0
- package/bin/commands/login.js +10 -3
- package/package.json +2 -1
- package/packages/doctor/__tests__/detect.test.js +2 -6
- package/packages/doctor/src/checks/local-memory.js +164 -196
- package/packages/doctor/src/detect.js +11 -3
- package/packages/memory/src/corpus/adapters.js +104 -0
- package/packages/memory/src/corpus/cli.js +72 -7
- package/packages/memory/src/corpus/index.js +1 -1
- package/packages/memory-engine/.env.example +13 -0
- package/packages/memory-engine/README.md +131 -0
- package/packages/memory-engine/bench/README.md +99 -0
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
- package/packages/memory-engine/compat/Dockerfile +11 -0
- package/packages/memory-engine/compat/server.py +680 -0
- package/packages/memory-engine/docker-compose.yml +243 -0
- package/packages/memory-engine/engine/README.md +52 -0
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
- package/packages/memory-engine/engine/l6-document-store.py +1018 -0
- package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
- package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
- package/packages/memory-engine/engine/services/l4/server.py +265 -0
- package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +696 -0
- package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +1035 -0
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
- package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +129 -0
- package/packages/memory-engine/pme_memory/artifacts.py +95 -0
- package/packages/memory-engine/pme_memory/embed.py +74 -0
- package/packages/memory-engine/pme_memory/health.py +36 -0
- package/packages/memory-engine/pme_memory/hygiene.py +159 -0
- package/packages/memory-engine/pme_memory/indexer.py +200 -0
- package/packages/memory-engine/pme_memory/needs.py +55 -0
- package/packages/memory-engine/pme_memory/provenance.py +80 -0
- package/packages/memory-engine/pme_memory/scoring.py +168 -0
- package/packages/memory-engine/pme_memory/search.py +52 -0
- package/packages/memory-engine/pme_memory/store.py +86 -0
- package/packages/memory-engine/pme_memory/synthesis.py +114 -0
- package/packages/memory-engine/pyproject.toml +65 -0
- package/packages/memory-engine/scripts/kg-extractor.py +557 -0
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
- package/packages/memory-engine/tests/test_api_contract.sh +57 -0
|
@@ -0,0 +1,1278 @@
|
|
|
1
|
+
{
|
|
2
|
+
"bench": "circular-economy",
|
|
3
|
+
"stack": "pentatonic-memory-latest",
|
|
4
|
+
"n_tasks": 25,
|
|
5
|
+
"n_correct": 8,
|
|
6
|
+
"accuracy": 0.32,
|
|
7
|
+
"mean_score": 0.3466666666666666,
|
|
8
|
+
"p50_search_ms": 32.229704986093566,
|
|
9
|
+
"p95_search_ms": 36.96155320503749,
|
|
10
|
+
"total_tokens_in": 0,
|
|
11
|
+
"total_tokens_out": 0,
|
|
12
|
+
"total_usd": 0.0,
|
|
13
|
+
"by_tag": {
|
|
14
|
+
"factoid": {
|
|
15
|
+
"n": 14,
|
|
16
|
+
"mean_score": 0.35714285714285715,
|
|
17
|
+
"accuracy": 0.35714285714285715
|
|
18
|
+
},
|
|
19
|
+
"material": {
|
|
20
|
+
"n": 8,
|
|
21
|
+
"mean_score": 0.20833333333333331,
|
|
22
|
+
"accuracy": 0.125
|
|
23
|
+
},
|
|
24
|
+
"takeback": {
|
|
25
|
+
"n": 6,
|
|
26
|
+
"mean_score": 0.3333333333333333,
|
|
27
|
+
"accuracy": 0.3333333333333333
|
|
28
|
+
},
|
|
29
|
+
"lifecycle": {
|
|
30
|
+
"n": 2,
|
|
31
|
+
"mean_score": 0.5,
|
|
32
|
+
"accuracy": 0.5
|
|
33
|
+
},
|
|
34
|
+
"multi-fact": {
|
|
35
|
+
"n": 1,
|
|
36
|
+
"mean_score": 0.0,
|
|
37
|
+
"accuracy": 0.0
|
|
38
|
+
},
|
|
39
|
+
"policy": {
|
|
40
|
+
"n": 3,
|
|
41
|
+
"mean_score": 0.3333333333333333,
|
|
42
|
+
"accuracy": 0.3333333333333333
|
|
43
|
+
},
|
|
44
|
+
"certification": {
|
|
45
|
+
"n": 4,
|
|
46
|
+
"mean_score": 0.5,
|
|
47
|
+
"accuracy": 0.5
|
|
48
|
+
},
|
|
49
|
+
"multi-doc": {
|
|
50
|
+
"n": 2,
|
|
51
|
+
"mean_score": 0.0,
|
|
52
|
+
"accuracy": 0.0
|
|
53
|
+
},
|
|
54
|
+
"regulation": {
|
|
55
|
+
"n": 5,
|
|
56
|
+
"mean_score": 0.6,
|
|
57
|
+
"accuracy": 0.6
|
|
58
|
+
},
|
|
59
|
+
"concept": {
|
|
60
|
+
"n": 1,
|
|
61
|
+
"mean_score": 0.0,
|
|
62
|
+
"accuracy": 0.0
|
|
63
|
+
},
|
|
64
|
+
"rubric": {
|
|
65
|
+
"n": 3,
|
|
66
|
+
"mean_score": 0.0,
|
|
67
|
+
"accuracy": 0.0
|
|
68
|
+
},
|
|
69
|
+
"honesty": {
|
|
70
|
+
"n": 1,
|
|
71
|
+
"mean_score": 0.0,
|
|
72
|
+
"accuracy": 0.0
|
|
73
|
+
},
|
|
74
|
+
"multi-hop": {
|
|
75
|
+
"n": 2,
|
|
76
|
+
"mean_score": 0.0,
|
|
77
|
+
"accuracy": 0.0
|
|
78
|
+
},
|
|
79
|
+
"entity": {
|
|
80
|
+
"n": 1,
|
|
81
|
+
"mean_score": 0.0,
|
|
82
|
+
"accuracy": 0.0
|
|
83
|
+
},
|
|
84
|
+
"negative": {
|
|
85
|
+
"n": 1,
|
|
86
|
+
"mean_score": 0.6666666666666666,
|
|
87
|
+
"accuracy": 0.0
|
|
88
|
+
}
|
|
89
|
+
},
|
|
90
|
+
"extra": {
|
|
91
|
+
"ingest_ms": 63385.400133993244,
|
|
92
|
+
"grading": "substring",
|
|
93
|
+
"limit": 3,
|
|
94
|
+
"tokens": {
|
|
95
|
+
"corpus_tokens": 1459,
|
|
96
|
+
"query_tokens": 359,
|
|
97
|
+
"context_tokens": 1305,
|
|
98
|
+
"retrieval_tokens": 1664,
|
|
99
|
+
"naive_tokens": 36834,
|
|
100
|
+
"saved_tokens": 35170,
|
|
101
|
+
"reduction_pct": 0.9548243470706412,
|
|
102
|
+
"mean_retrieval_tokens_per_task": 66.56,
|
|
103
|
+
"tokenizer": "cl100k_base",
|
|
104
|
+
"per_task": {
|
|
105
|
+
"atlas-material-source": {
|
|
106
|
+
"query": 15,
|
|
107
|
+
"context": 37,
|
|
108
|
+
"retrieval": 52,
|
|
109
|
+
"judge_in": 0,
|
|
110
|
+
"judge_out": 0,
|
|
111
|
+
"judge_latency_ms": 0.0
|
|
112
|
+
},
|
|
113
|
+
"atlas-takeback-credit": {
|
|
114
|
+
"query": 15,
|
|
115
|
+
"context": 35,
|
|
116
|
+
"retrieval": 50,
|
|
117
|
+
"judge_in": 0,
|
|
118
|
+
"judge_out": 0,
|
|
119
|
+
"judge_latency_ms": 0.0
|
|
120
|
+
},
|
|
121
|
+
"atlas-closed-loop": {
|
|
122
|
+
"query": 8,
|
|
123
|
+
"context": 112,
|
|
124
|
+
"retrieval": 120,
|
|
125
|
+
"judge_in": 0,
|
|
126
|
+
"judge_out": 0,
|
|
127
|
+
"judge_latency_ms": 0.0
|
|
128
|
+
},
|
|
129
|
+
"luna-takeback-split": {
|
|
130
|
+
"query": 16,
|
|
131
|
+
"context": 43,
|
|
132
|
+
"retrieval": 59,
|
|
133
|
+
"judge_in": 0,
|
|
134
|
+
"judge_out": 0,
|
|
135
|
+
"judge_latency_ms": 0.0
|
|
136
|
+
},
|
|
137
|
+
"luna-silicone-fate": {
|
|
138
|
+
"query": 13,
|
|
139
|
+
"context": 49,
|
|
140
|
+
"retrieval": 62,
|
|
141
|
+
"judge_in": 0,
|
|
142
|
+
"judge_out": 0,
|
|
143
|
+
"judge_latency_ms": 0.0
|
|
144
|
+
},
|
|
145
|
+
"pla-home-compost": {
|
|
146
|
+
"query": 10,
|
|
147
|
+
"context": 34,
|
|
148
|
+
"retrieval": 44,
|
|
149
|
+
"judge_in": 0,
|
|
150
|
+
"judge_out": 0,
|
|
151
|
+
"judge_latency_ms": 0.0
|
|
152
|
+
},
|
|
153
|
+
"pla-hot-drinks": {
|
|
154
|
+
"query": 10,
|
|
155
|
+
"context": 36,
|
|
156
|
+
"retrieval": 46,
|
|
157
|
+
"judge_in": 0,
|
|
158
|
+
"judge_out": 0,
|
|
159
|
+
"judge_latency_ms": 0.0
|
|
160
|
+
},
|
|
161
|
+
"pbat-local-authority": {
|
|
162
|
+
"query": 13,
|
|
163
|
+
"context": 40,
|
|
164
|
+
"retrieval": 53,
|
|
165
|
+
"judge_in": 0,
|
|
166
|
+
"judge_out": 0,
|
|
167
|
+
"judge_latency_ms": 0.0
|
|
168
|
+
},
|
|
169
|
+
"ghost-net-source": {
|
|
170
|
+
"query": 15,
|
|
171
|
+
"context": 31,
|
|
172
|
+
"retrieval": 46,
|
|
173
|
+
"judge_in": 0,
|
|
174
|
+
"judge_out": 0,
|
|
175
|
+
"judge_latency_ms": 0.0
|
|
176
|
+
},
|
|
177
|
+
"kite-harness-foam-recovery": {
|
|
178
|
+
"query": 18,
|
|
179
|
+
"context": 116,
|
|
180
|
+
"retrieval": 134,
|
|
181
|
+
"judge_in": 0,
|
|
182
|
+
"judge_out": 0,
|
|
183
|
+
"judge_latency_ms": 0.0
|
|
184
|
+
},
|
|
185
|
+
"haven-sleeve-bottles-15": {
|
|
186
|
+
"query": 17,
|
|
187
|
+
"context": 44,
|
|
188
|
+
"retrieval": 61,
|
|
189
|
+
"judge_in": 0,
|
|
190
|
+
"judge_out": 0,
|
|
191
|
+
"judge_latency_ms": 0.0
|
|
192
|
+
},
|
|
193
|
+
"cert-c2c-tiers": {
|
|
194
|
+
"query": 12,
|
|
195
|
+
"context": 43,
|
|
196
|
+
"retrieval": 55,
|
|
197
|
+
"judge_in": 0,
|
|
198
|
+
"judge_out": 0,
|
|
199
|
+
"judge_latency_ms": 0.0
|
|
200
|
+
},
|
|
201
|
+
"cert-c2c-our-products": {
|
|
202
|
+
"query": 14,
|
|
203
|
+
"context": 41,
|
|
204
|
+
"retrieval": 55,
|
|
205
|
+
"judge_in": 0,
|
|
206
|
+
"judge_out": 0,
|
|
207
|
+
"judge_latency_ms": 0.0
|
|
208
|
+
},
|
|
209
|
+
"cert-grs-threshold": {
|
|
210
|
+
"query": 12,
|
|
211
|
+
"context": 95,
|
|
212
|
+
"retrieval": 107,
|
|
213
|
+
"judge_in": 0,
|
|
214
|
+
"judge_out": 0,
|
|
215
|
+
"judge_latency_ms": 0.0
|
|
216
|
+
},
|
|
217
|
+
"cert-en13432-temp": {
|
|
218
|
+
"query": 11,
|
|
219
|
+
"context": 48,
|
|
220
|
+
"retrieval": 59,
|
|
221
|
+
"judge_in": 0,
|
|
222
|
+
"judge_out": 0,
|
|
223
|
+
"judge_latency_ms": 0.0
|
|
224
|
+
},
|
|
225
|
+
"reg-uk-epr-scope": {
|
|
226
|
+
"query": 13,
|
|
227
|
+
"context": 105,
|
|
228
|
+
"retrieval": 118,
|
|
229
|
+
"judge_in": 0,
|
|
230
|
+
"judge_out": 0,
|
|
231
|
+
"judge_latency_ms": 0.0
|
|
232
|
+
},
|
|
233
|
+
"reg-espr-dpp": {
|
|
234
|
+
"query": 13,
|
|
235
|
+
"context": 31,
|
|
236
|
+
"retrieval": 44,
|
|
237
|
+
"judge_in": 0,
|
|
238
|
+
"judge_out": 0,
|
|
239
|
+
"judge_latency_ms": 0.0
|
|
240
|
+
},
|
|
241
|
+
"reg-ca-sb54-deadline": {
|
|
242
|
+
"query": 14,
|
|
243
|
+
"context": 39,
|
|
244
|
+
"retrieval": 53,
|
|
245
|
+
"judge_in": 0,
|
|
246
|
+
"judge_out": 0,
|
|
247
|
+
"judge_latency_ms": 0.0
|
|
248
|
+
},
|
|
249
|
+
"reg-lithium-return": {
|
|
250
|
+
"query": 13,
|
|
251
|
+
"context": 106,
|
|
252
|
+
"retrieval": 119,
|
|
253
|
+
"judge_in": 0,
|
|
254
|
+
"judge_out": 0,
|
|
255
|
+
"judge_latency_ms": 0.0
|
|
256
|
+
},
|
|
257
|
+
"closed-loop-threshold": {
|
|
258
|
+
"query": 16,
|
|
259
|
+
"context": 37,
|
|
260
|
+
"retrieval": 53,
|
|
261
|
+
"judge_in": 0,
|
|
262
|
+
"judge_out": 0,
|
|
263
|
+
"judge_latency_ms": 0.0
|
|
264
|
+
},
|
|
265
|
+
"rubric-customer-greenwash-claim": {
|
|
266
|
+
"query": 27,
|
|
267
|
+
"context": 44,
|
|
268
|
+
"retrieval": 71,
|
|
269
|
+
"judge_in": 400,
|
|
270
|
+
"judge_out": 29,
|
|
271
|
+
"judge_latency_ms": 653.5468610227108
|
|
272
|
+
},
|
|
273
|
+
"rubric-full-takeback-story-atlas": {
|
|
274
|
+
"query": 17,
|
|
275
|
+
"context": 35,
|
|
276
|
+
"retrieval": 52,
|
|
277
|
+
"judge_in": 363,
|
|
278
|
+
"judge_out": 46,
|
|
279
|
+
"judge_latency_ms": 960.7863810062408
|
|
280
|
+
},
|
|
281
|
+
"rubric-regulatory-scope-briefing": {
|
|
282
|
+
"query": 22,
|
|
283
|
+
"context": 43,
|
|
284
|
+
"retrieval": 65,
|
|
285
|
+
"judge_in": 395,
|
|
286
|
+
"judge_out": 35,
|
|
287
|
+
"judge_latency_ms": 703.8500519990921
|
|
288
|
+
},
|
|
289
|
+
"entity-closed-loop-skus": {
|
|
290
|
+
"query": 12,
|
|
291
|
+
"context": 35,
|
|
292
|
+
"retrieval": 47,
|
|
293
|
+
"judge_in": 0,
|
|
294
|
+
"judge_out": 0,
|
|
295
|
+
"judge_latency_ms": 0.0
|
|
296
|
+
},
|
|
297
|
+
"entity-home-compostable": {
|
|
298
|
+
"query": 13,
|
|
299
|
+
"context": 26,
|
|
300
|
+
"retrieval": 39,
|
|
301
|
+
"judge_in": 0,
|
|
302
|
+
"judge_out": 0,
|
|
303
|
+
"judge_latency_ms": 0.0
|
|
304
|
+
}
|
|
305
|
+
},
|
|
306
|
+
"judge_tokens_in": 1158,
|
|
307
|
+
"judge_tokens_out": 110,
|
|
308
|
+
"judge_calls": 3,
|
|
309
|
+
"judge_mean_latency_ms": 772.7277646760145
|
|
310
|
+
},
|
|
311
|
+
"cost_usd": {
|
|
312
|
+
"assumed_completion_tokens_per_task": 100,
|
|
313
|
+
"rates": {
|
|
314
|
+
"input_per_1k": 0.0025,
|
|
315
|
+
"output_per_1k": 0.01,
|
|
316
|
+
"model": "gpt-4o"
|
|
317
|
+
},
|
|
318
|
+
"retrieval_usd_in": 0.0041600000000000005,
|
|
319
|
+
"retrieval_usd_out": 0.025,
|
|
320
|
+
"retrieval_usd_total": 0.029160000000000002,
|
|
321
|
+
"naive_usd_total": 0.11708500000000002,
|
|
322
|
+
"saved_usd": 0.08792500000000002,
|
|
323
|
+
"saved_usd_per_1k_tasks": 3.517000000000001
|
|
324
|
+
}
|
|
325
|
+
},
|
|
326
|
+
"task_results": [
|
|
327
|
+
{
|
|
328
|
+
"task_id": "atlas-material-source",
|
|
329
|
+
"query": "Where does the recycled polypropylene in the Atlas Phone Shell come from?",
|
|
330
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nemail-svc consumes OrderPaid for the receipt email\n---\nkubectl rollout restart deployment/billing-svc",
|
|
331
|
+
"hits": [
|
|
332
|
+
{
|
|
333
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
334
|
+
"score": 0.717772584091517,
|
|
335
|
+
"source": "pentatonic-memory",
|
|
336
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
337
|
+
},
|
|
338
|
+
{
|
|
339
|
+
"text": "email-svc consumes OrderPaid for the receipt email",
|
|
340
|
+
"score": 0.7059285047827621,
|
|
341
|
+
"source": "pentatonic-memory",
|
|
342
|
+
"doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
|
|
343
|
+
},
|
|
344
|
+
{
|
|
345
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
346
|
+
"score": 0.7058531600945959,
|
|
347
|
+
"source": "pentatonic-memory",
|
|
348
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
349
|
+
}
|
|
350
|
+
],
|
|
351
|
+
"correct": false,
|
|
352
|
+
"score": 0.0,
|
|
353
|
+
"grading_notes": "missing 3/3: ['ocean-bound', '50km', 'coastlines']",
|
|
354
|
+
"search_time_ms": 33.91047302284278,
|
|
355
|
+
"generation_time_ms": 0.0,
|
|
356
|
+
"tokens_in": 0,
|
|
357
|
+
"tokens_out": 0,
|
|
358
|
+
"retrieval_tokens": 52,
|
|
359
|
+
"query_tokens": 15,
|
|
360
|
+
"context_tokens": 37,
|
|
361
|
+
"judge_tokens_in": 0,
|
|
362
|
+
"judge_tokens_out": 0,
|
|
363
|
+
"judge_latency_ms": 0.0
|
|
364
|
+
},
|
|
365
|
+
{
|
|
366
|
+
"task_id": "atlas-takeback-credit",
|
|
367
|
+
"query": "What credit do I get for returning an Atlas Phone Shell via take-back?",
|
|
368
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling-svc API keys \u2014 jordan\n---\nkubectl rollout restart deployment/billing-svc",
|
|
369
|
+
"hits": [
|
|
370
|
+
{
|
|
371
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
372
|
+
"score": 0.7343930024127078,
|
|
373
|
+
"source": "pentatonic-memory",
|
|
374
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
375
|
+
},
|
|
376
|
+
{
|
|
377
|
+
"text": "billing-svc API keys \u2014 jordan",
|
|
378
|
+
"score": 0.7322204450410176,
|
|
379
|
+
"source": "pentatonic-memory",
|
|
380
|
+
"doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
|
|
381
|
+
},
|
|
382
|
+
{
|
|
383
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
384
|
+
"score": 0.727235895615381,
|
|
385
|
+
"source": "pentatonic-memory",
|
|
386
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
387
|
+
}
|
|
388
|
+
],
|
|
389
|
+
"correct": false,
|
|
390
|
+
"score": 0.0,
|
|
391
|
+
"grading_notes": "missing 1/1: ['50%']",
|
|
392
|
+
"search_time_ms": 36.923238018061966,
|
|
393
|
+
"generation_time_ms": 0.0,
|
|
394
|
+
"tokens_in": 0,
|
|
395
|
+
"tokens_out": 0,
|
|
396
|
+
"retrieval_tokens": 50,
|
|
397
|
+
"query_tokens": 15,
|
|
398
|
+
"context_tokens": 35,
|
|
399
|
+
"judge_tokens_in": 0,
|
|
400
|
+
"judge_tokens_out": 0,
|
|
401
|
+
"judge_latency_ms": 0.0
|
|
402
|
+
},
|
|
403
|
+
{
|
|
404
|
+
"task_id": "atlas-closed-loop",
|
|
405
|
+
"query": "Is the Atlas Phone Shell closed-loop?",
|
|
406
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc\n---\nTake-back programme \u2014 Atlas Phone Shell (ATL-PHN-001). Accepted condition: any, including broken. Credit: 50% of original purchase price as store credit. Process: free returns label at pentatonic.com/takeback/ATL-PHN-001. Recovery route: shell is mechanically shredded, rPP-OB fraction recovered and fed back into the next production run. Closed-loop: YES.",
|
|
407
|
+
"hits": [
|
|
408
|
+
{
|
|
409
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
410
|
+
"score": 0.7310814665451857,
|
|
411
|
+
"source": "pentatonic-memory",
|
|
412
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
413
|
+
},
|
|
414
|
+
{
|
|
415
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
416
|
+
"score": 0.7215003819165545,
|
|
417
|
+
"source": "pentatonic-memory",
|
|
418
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
419
|
+
},
|
|
420
|
+
{
|
|
421
|
+
"text": "Take-back programme \u2014 Atlas Phone Shell (ATL-PHN-001). Accepted condition: any, including broken. Credit: 50% of original purchase price as store credit. Process: free returns label at pentatonic.com/takeback/ATL-PHN-001. Recovery route: shell is mechanically shredded, rPP-OB fraction recovered and fed back into the next production run. Closed-loop: YES.",
|
|
422
|
+
"score": 0.7107398496991579,
|
|
423
|
+
"source": "pentatonic-memory",
|
|
424
|
+
"doc_id": "takeback-atlas-phone"
|
|
425
|
+
}
|
|
426
|
+
],
|
|
427
|
+
"correct": true,
|
|
428
|
+
"score": 1.0,
|
|
429
|
+
"grading_notes": "all substrings matched",
|
|
430
|
+
"search_time_ms": 30.080388009082526,
|
|
431
|
+
"generation_time_ms": 0.0,
|
|
432
|
+
"tokens_in": 0,
|
|
433
|
+
"tokens_out": 0,
|
|
434
|
+
"retrieval_tokens": 120,
|
|
435
|
+
"query_tokens": 8,
|
|
436
|
+
"context_tokens": 112,
|
|
437
|
+
"judge_tokens_in": 0,
|
|
438
|
+
"judge_tokens_out": 0,
|
|
439
|
+
"judge_latency_ms": 0.0
|
|
440
|
+
},
|
|
441
|
+
{
|
|
442
|
+
"task_id": "luna-takeback-split",
|
|
443
|
+
"query": "How is the take-back credit split between body and lid for the Luna bottle?",
|
|
444
|
+
"answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nemail-svc consumes OrderPaid for the receipt email",
|
|
445
|
+
"hits": [
|
|
446
|
+
{
|
|
447
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
448
|
+
"score": 0.7115161283985844,
|
|
449
|
+
"source": "pentatonic-memory",
|
|
450
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
451
|
+
},
|
|
452
|
+
{
|
|
453
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
454
|
+
"score": 0.7109313133232749,
|
|
455
|
+
"source": "pentatonic-memory",
|
|
456
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
457
|
+
},
|
|
458
|
+
{
|
|
459
|
+
"text": "email-svc consumes OrderPaid for the receipt email",
|
|
460
|
+
"score": 0.7087783615859253,
|
|
461
|
+
"source": "pentatonic-memory",
|
|
462
|
+
"doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
|
|
463
|
+
}
|
|
464
|
+
],
|
|
465
|
+
"correct": false,
|
|
466
|
+
"score": 0.0,
|
|
467
|
+
"grading_notes": "missing 2/2: ['40%', '10%']",
|
|
468
|
+
"search_time_ms": 30.672520981170237,
|
|
469
|
+
"generation_time_ms": 0.0,
|
|
470
|
+
"tokens_in": 0,
|
|
471
|
+
"tokens_out": 0,
|
|
472
|
+
"retrieval_tokens": 59,
|
|
473
|
+
"query_tokens": 16,
|
|
474
|
+
"context_tokens": 43,
|
|
475
|
+
"judge_tokens_in": 0,
|
|
476
|
+
"judge_tokens_out": 0,
|
|
477
|
+
"judge_latency_ms": 0.0
|
|
478
|
+
},
|
|
479
|
+
{
|
|
480
|
+
"task_id": "luna-silicone-fate",
|
|
481
|
+
"query": "What happens to the Luna bottle's silicone lid after take-back?",
|
|
482
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
|
|
483
|
+
"hits": [
|
|
484
|
+
{
|
|
485
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
486
|
+
"score": 0.7074845871712615,
|
|
487
|
+
"source": "pentatonic-memory",
|
|
488
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
489
|
+
},
|
|
490
|
+
{
|
|
491
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
492
|
+
"score": 0.6897312378709735,
|
|
493
|
+
"source": "pentatonic-memory",
|
|
494
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
495
|
+
},
|
|
496
|
+
{
|
|
497
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
498
|
+
"score": 0.6884294445687019,
|
|
499
|
+
"source": "pentatonic-memory",
|
|
500
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
501
|
+
}
|
|
502
|
+
],
|
|
503
|
+
"correct": false,
|
|
504
|
+
"score": 0.0,
|
|
505
|
+
"grading_notes": "missing 2/2: ['playground', 'surfacing']",
|
|
506
|
+
"search_time_ms": 34.277692990144715,
|
|
507
|
+
"generation_time_ms": 0.0,
|
|
508
|
+
"tokens_in": 0,
|
|
509
|
+
"tokens_out": 0,
|
|
510
|
+
"retrieval_tokens": 62,
|
|
511
|
+
"query_tokens": 13,
|
|
512
|
+
"context_tokens": 49,
|
|
513
|
+
"judge_tokens_in": 0,
|
|
514
|
+
"judge_tokens_out": 0,
|
|
515
|
+
"judge_latency_ms": 0.0
|
|
516
|
+
},
|
|
517
|
+
{
|
|
518
|
+
"task_id": "pla-home-compost",
|
|
519
|
+
"query": "Can I compost a Loop coffee cup at home?",
|
|
520
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nsearch-svc API keys \u2014 alex\n---\nbilling-svc API keys \u2014 jordan",
|
|
521
|
+
"hits": [
|
|
522
|
+
{
|
|
523
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
524
|
+
"score": 0.6853794275304081,
|
|
525
|
+
"source": "pentatonic-memory",
|
|
526
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
527
|
+
},
|
|
528
|
+
{
|
|
529
|
+
"text": "search-svc API keys \u2014 alex",
|
|
530
|
+
"score": 0.6744533183538658,
|
|
531
|
+
"source": "pentatonic-memory",
|
|
532
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
533
|
+
},
|
|
534
|
+
{
|
|
535
|
+
"text": "billing-svc API keys \u2014 jordan",
|
|
536
|
+
"score": 0.6727133801984494,
|
|
537
|
+
"source": "pentatonic-memory",
|
|
538
|
+
"doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
|
|
539
|
+
}
|
|
540
|
+
],
|
|
541
|
+
"correct": false,
|
|
542
|
+
"score": 0.0,
|
|
543
|
+
"grading_notes": "missing 2/2: ['NOT home-compostable', 'industrial']",
|
|
544
|
+
"search_time_ms": 27.47367398114875,
|
|
545
|
+
"generation_time_ms": 0.0,
|
|
546
|
+
"tokens_in": 0,
|
|
547
|
+
"tokens_out": 0,
|
|
548
|
+
"retrieval_tokens": 44,
|
|
549
|
+
"query_tokens": 10,
|
|
550
|
+
"context_tokens": 34,
|
|
551
|
+
"judge_tokens_in": 0,
|
|
552
|
+
"judge_tokens_out": 0,
|
|
553
|
+
"judge_latency_ms": 0.0
|
|
554
|
+
},
|
|
555
|
+
{
|
|
556
|
+
"task_id": "pla-hot-drinks",
|
|
557
|
+
"query": "Can I put hot coffee in a Loop cup?",
|
|
558
|
+
"answer": "Used in: Loop Reusable Coffee Cup body\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc",
|
|
559
|
+
"hits": [
|
|
560
|
+
{
|
|
561
|
+
"text": "Used in: Loop Reusable Coffee Cup body",
|
|
562
|
+
"score": 0.6982296262258938,
|
|
563
|
+
"source": "pentatonic-memory",
|
|
564
|
+
"doc_id": "material-pla-plant"
|
|
565
|
+
},
|
|
566
|
+
{
|
|
567
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
568
|
+
"score": 0.6892282168065307,
|
|
569
|
+
"source": "pentatonic-memory",
|
|
570
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
571
|
+
},
|
|
572
|
+
{
|
|
573
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
574
|
+
"score": 0.672198393223188,
|
|
575
|
+
"source": "pentatonic-memory",
|
|
576
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
577
|
+
}
|
|
578
|
+
],
|
|
579
|
+
"correct": false,
|
|
580
|
+
"score": 0.0,
|
|
581
|
+
"grading_notes": "missing 2/2: ['60\u00b0C', 'avoid hot']",
|
|
582
|
+
"search_time_ms": 32.57425202173181,
|
|
583
|
+
"generation_time_ms": 0.0,
|
|
584
|
+
"tokens_in": 0,
|
|
585
|
+
"tokens_out": 0,
|
|
586
|
+
"retrieval_tokens": 46,
|
|
587
|
+
"query_tokens": 10,
|
|
588
|
+
"context_tokens": 36,
|
|
589
|
+
"judge_tokens_in": 0,
|
|
590
|
+
"judge_tokens_out": 0,
|
|
591
|
+
"judge_latency_ms": 0.0
|
|
592
|
+
},
|
|
593
|
+
{
|
|
594
|
+
"task_id": "pbat-local-authority",
|
|
595
|
+
"query": "Are Root bin liners accepted in UK council food-waste streams?",
|
|
596
|
+
"answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nreporting-svc consumes OrderPaid for financial reports",
|
|
597
|
+
"hits": [
|
|
598
|
+
{
|
|
599
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
600
|
+
"score": 0.7035865148344783,
|
|
601
|
+
"source": "pentatonic-memory",
|
|
602
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
603
|
+
},
|
|
604
|
+
{
|
|
605
|
+
"text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
|
|
606
|
+
"score": 0.7016776112626507,
|
|
607
|
+
"source": "pentatonic-memory",
|
|
608
|
+
"doc_id": "mem_76c60bf6-9327-4cd1-9f56-234d229e8c8a"
|
|
609
|
+
},
|
|
610
|
+
{
|
|
611
|
+
"text": "reporting-svc consumes OrderPaid for financial reports",
|
|
612
|
+
"score": 0.7014271425936577,
|
|
613
|
+
"source": "pentatonic-memory",
|
|
614
|
+
"doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
|
|
615
|
+
}
|
|
616
|
+
],
|
|
617
|
+
"correct": false,
|
|
618
|
+
"score": 0.0,
|
|
619
|
+
"grading_notes": "missing 2/2: ['most', 'check locally']",
|
|
620
|
+
"search_time_ms": 30.28504498070106,
|
|
621
|
+
"generation_time_ms": 0.0,
|
|
622
|
+
"tokens_in": 0,
|
|
623
|
+
"tokens_out": 0,
|
|
624
|
+
"retrieval_tokens": 53,
|
|
625
|
+
"query_tokens": 13,
|
|
626
|
+
"context_tokens": 40,
|
|
627
|
+
"judge_tokens_in": 0,
|
|
628
|
+
"judge_tokens_out": 0,
|
|
629
|
+
"judge_latency_ms": 0.0
|
|
630
|
+
},
|
|
631
|
+
{
|
|
632
|
+
"task_id": "ghost-net-source",
|
|
633
|
+
"query": "Which NGO supplies the ghost-net nylon for the Nomad Kite Harness?",
|
|
634
|
+
"answer": "billing-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc",
|
|
635
|
+
"hits": [
|
|
636
|
+
{
|
|
637
|
+
"text": "billing-svc API keys \u2014 jordan",
|
|
638
|
+
"score": 0.7028674257796388,
|
|
639
|
+
"source": "pentatonic-memory",
|
|
640
|
+
"doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
|
|
641
|
+
},
|
|
642
|
+
{
|
|
643
|
+
"text": "search-svc API keys \u2014 alex",
|
|
644
|
+
"score": 0.7010953433995578,
|
|
645
|
+
"source": "pentatonic-memory",
|
|
646
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
647
|
+
},
|
|
648
|
+
{
|
|
649
|
+
"text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
|
|
650
|
+
"score": 0.6987444364156061,
|
|
651
|
+
"source": "pentatonic-memory",
|
|
652
|
+
"doc_id": "mem_76c60bf6-9327-4cd1-9f56-234d229e8c8a"
|
|
653
|
+
}
|
|
654
|
+
],
|
|
655
|
+
"correct": false,
|
|
656
|
+
"score": 0.0,
|
|
657
|
+
"grading_notes": "missing 1/1: ['Healthy Seas']",
|
|
658
|
+
"search_time_ms": 36.64102699258365,
|
|
659
|
+
"generation_time_ms": 0.0,
|
|
660
|
+
"tokens_in": 0,
|
|
661
|
+
"tokens_out": 0,
|
|
662
|
+
"retrieval_tokens": 46,
|
|
663
|
+
"query_tokens": 15,
|
|
664
|
+
"context_tokens": 31,
|
|
665
|
+
"judge_tokens_in": 0,
|
|
666
|
+
"judge_tokens_out": 0,
|
|
667
|
+
"judge_latency_ms": 0.0
|
|
668
|
+
},
|
|
669
|
+
{
|
|
670
|
+
"task_id": "kite-harness-foam-recovery",
|
|
671
|
+
"query": "Is the EVA foam padding in the Nomad Kite Harness recovered via take-back?",
|
|
672
|
+
"answer": "Take-back programme \u2014 Nomad Kite Harness v3 (NMD-HRN-V3). Accepted condition: worn but repairable or end-of-life. Credit: 25% of original price. Alternative: repair-not-replace via the Nomad repair service (\u00a325 flat fee). Recovery route: ghost-net nylon is mechanically recycled back into new harness shells \u2014 closed-loop. EVA foam padding is NOT recovered (currently sent to energy-from-waste).\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nsearch-svc API keys \u2014 alex",
|
|
673
|
+
"hits": [
|
|
674
|
+
{
|
|
675
|
+
"text": "Take-back programme \u2014 Nomad Kite Harness v3 (NMD-HRN-V3). Accepted condition: worn but repairable or end-of-life. Credit: 25% of original price. Alternative: repair-not-replace via the Nomad repair service (\u00a325 flat fee). Recovery route: ghost-net nylon is mechanically recycled back into new harness shells \u2014 closed-loop. EVA foam padding is NOT recovered (currently sent to energy-from-waste).",
|
|
676
|
+
"score": 0.7696214428953514,
|
|
677
|
+
"source": "pentatonic-memory",
|
|
678
|
+
"doc_id": "takeback-kite-harness"
|
|
679
|
+
},
|
|
680
|
+
{
|
|
681
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
682
|
+
"score": 0.708591139375945,
|
|
683
|
+
"source": "pentatonic-memory",
|
|
684
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
685
|
+
},
|
|
686
|
+
{
|
|
687
|
+
"text": "search-svc API keys \u2014 alex",
|
|
688
|
+
"score": 0.7027095257990065,
|
|
689
|
+
"source": "pentatonic-memory",
|
|
690
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
691
|
+
}
|
|
692
|
+
],
|
|
693
|
+
"correct": true,
|
|
694
|
+
"score": 1.0,
|
|
695
|
+
"grading_notes": "all substrings matched",
|
|
696
|
+
"search_time_ms": 36.57193901017308,
|
|
697
|
+
"generation_time_ms": 0.0,
|
|
698
|
+
"tokens_in": 0,
|
|
699
|
+
"tokens_out": 0,
|
|
700
|
+
"retrieval_tokens": 134,
|
|
701
|
+
"query_tokens": 18,
|
|
702
|
+
"context_tokens": 116,
|
|
703
|
+
"judge_tokens_in": 0,
|
|
704
|
+
"judge_tokens_out": 0,
|
|
705
|
+
"judge_latency_ms": 0.0
|
|
706
|
+
},
|
|
707
|
+
{
|
|
708
|
+
"task_id": "haven-sleeve-bottles-15",
|
|
709
|
+
"query": "Roughly how many plastic bottles go into a 15\" Haven Laptop Sleeve?",
|
|
710
|
+
"answer": "18 bottles per 15\" laptop sleeve\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
711
|
+
"hits": [
|
|
712
|
+
{
|
|
713
|
+
"text": "18 bottles per 15\" laptop sleeve",
|
|
714
|
+
"score": 0.7187293973607064,
|
|
715
|
+
"source": "pentatonic-memory",
|
|
716
|
+
"doc_id": "material-pet-felt-recycled"
|
|
717
|
+
},
|
|
718
|
+
{
|
|
719
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
720
|
+
"score": 0.6813978441181031,
|
|
721
|
+
"source": "pentatonic-memory",
|
|
722
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
723
|
+
},
|
|
724
|
+
{
|
|
725
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
726
|
+
"score": 0.6790929528783419,
|
|
727
|
+
"source": "pentatonic-memory",
|
|
728
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
729
|
+
}
|
|
730
|
+
],
|
|
731
|
+
"correct": true,
|
|
732
|
+
"score": 1.0,
|
|
733
|
+
"grading_notes": "all substrings matched",
|
|
734
|
+
"search_time_ms": 32.229704986093566,
|
|
735
|
+
"generation_time_ms": 0.0,
|
|
736
|
+
"tokens_in": 0,
|
|
737
|
+
"tokens_out": 0,
|
|
738
|
+
"retrieval_tokens": 61,
|
|
739
|
+
"query_tokens": 17,
|
|
740
|
+
"context_tokens": 44,
|
|
741
|
+
"judge_tokens_in": 0,
|
|
742
|
+
"judge_tokens_out": 0,
|
|
743
|
+
"judge_latency_ms": 0.0
|
|
744
|
+
},
|
|
745
|
+
{
|
|
746
|
+
"task_id": "cert-c2c-tiers",
|
|
747
|
+
"query": "What are the tiers of Cradle to Cradle certification?",
|
|
748
|
+
"answer": "search-svc API keys \u2014 alex\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
749
|
+
"hits": [
|
|
750
|
+
{
|
|
751
|
+
"text": "search-svc API keys \u2014 alex",
|
|
752
|
+
"score": 0.7054974847784411,
|
|
753
|
+
"source": "pentatonic-memory",
|
|
754
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
755
|
+
},
|
|
756
|
+
{
|
|
757
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
758
|
+
"score": 0.7021982039205501,
|
|
759
|
+
"source": "pentatonic-memory",
|
|
760
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
761
|
+
},
|
|
762
|
+
{
|
|
763
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
764
|
+
"score": 0.7007000321209369,
|
|
765
|
+
"source": "pentatonic-memory",
|
|
766
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
767
|
+
}
|
|
768
|
+
],
|
|
769
|
+
"correct": false,
|
|
770
|
+
"score": 0.0,
|
|
771
|
+
"grading_notes": "missing 4/4: ['Bronze', 'Silver', 'Gold']",
|
|
772
|
+
"search_time_ms": 30.337461997987702,
|
|
773
|
+
"generation_time_ms": 0.0,
|
|
774
|
+
"tokens_in": 0,
|
|
775
|
+
"tokens_out": 0,
|
|
776
|
+
"retrieval_tokens": 55,
|
|
777
|
+
"query_tokens": 12,
|
|
778
|
+
"context_tokens": 43,
|
|
779
|
+
"judge_tokens_in": 0,
|
|
780
|
+
"judge_tokens_out": 0,
|
|
781
|
+
"judge_latency_ms": 0.0
|
|
782
|
+
},
|
|
783
|
+
{
|
|
784
|
+
"task_id": "cert-c2c-our-products",
|
|
785
|
+
"query": "Which product in our catalogue holds a Cradle to Cradle certification?",
|
|
786
|
+
"answer": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nCradle to Cradle Certified",
|
|
787
|
+
"hits": [
|
|
788
|
+
{
|
|
789
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
790
|
+
"score": 0.7320723215185255,
|
|
791
|
+
"source": "pentatonic-memory",
|
|
792
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
793
|
+
},
|
|
794
|
+
{
|
|
795
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
796
|
+
"score": 0.7263731271830014,
|
|
797
|
+
"source": "pentatonic-memory",
|
|
798
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
799
|
+
},
|
|
800
|
+
{
|
|
801
|
+
"text": "Cradle to Cradle Certified",
|
|
802
|
+
"score": 0.7251890554635944,
|
|
803
|
+
"source": "pentatonic-memory",
|
|
804
|
+
"doc_id": "cert-cradle-to-cradle"
|
|
805
|
+
}
|
|
806
|
+
],
|
|
807
|
+
"correct": false,
|
|
808
|
+
"score": 0.0,
|
|
809
|
+
"grading_notes": "missing 1/1: ['Atlas Phone Shell']",
|
|
810
|
+
"search_time_ms": 32.06962300464511,
|
|
811
|
+
"generation_time_ms": 0.0,
|
|
812
|
+
"tokens_in": 0,
|
|
813
|
+
"tokens_out": 0,
|
|
814
|
+
"retrieval_tokens": 55,
|
|
815
|
+
"query_tokens": 14,
|
|
816
|
+
"context_tokens": 41,
|
|
817
|
+
"judge_tokens_in": 0,
|
|
818
|
+
"judge_tokens_out": 0,
|
|
819
|
+
"judge_latency_ms": 0.0
|
|
820
|
+
},
|
|
821
|
+
{
|
|
822
|
+
"task_id": "cert-grs-threshold",
|
|
823
|
+
"query": "What's the minimum recycled content for the GRS claim?",
|
|
824
|
+
"answer": "GRS (Global Recycled Standard): third-party verification of recycled content, chain of custody, social and environmental practices, and chemical restrictions. Minimum 20% recycled content for a product to bear the GRS claim. Materials using GRS in our catalogue: rPP-OB (Atlas Phone Shell), rPET-FELT (Haven Sleeves).\n---\nreporting-svc consumes OrderPaid for financial reports\n---\nemail-svc consumes OrderPaid for the receipt email",
|
|
825
|
+
"hits": [
|
|
826
|
+
{
|
|
827
|
+
"text": "GRS (Global Recycled Standard): third-party verification of recycled content, chain of custody, social and environmental practices, and chemical restrictions. Minimum 20% recycled content for a product to bear the GRS claim. Materials using GRS in our catalogue: rPP-OB (Atlas Phone Shell), rPET-FELT (Haven Sleeves).",
|
|
828
|
+
"score": 0.7797542678463447,
|
|
829
|
+
"source": "pentatonic-memory",
|
|
830
|
+
"doc_id": "cert-grs"
|
|
831
|
+
},
|
|
832
|
+
{
|
|
833
|
+
"text": "reporting-svc consumes OrderPaid for financial reports",
|
|
834
|
+
"score": 0.7100348942211113,
|
|
835
|
+
"source": "pentatonic-memory",
|
|
836
|
+
"doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
|
|
837
|
+
},
|
|
838
|
+
{
|
|
839
|
+
"text": "email-svc consumes OrderPaid for the receipt email",
|
|
840
|
+
"score": 0.7099882905289688,
|
|
841
|
+
"source": "pentatonic-memory",
|
|
842
|
+
"doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
|
|
843
|
+
}
|
|
844
|
+
],
|
|
845
|
+
"correct": true,
|
|
846
|
+
"score": 1.0,
|
|
847
|
+
"grading_notes": "all substrings matched",
|
|
848
|
+
"search_time_ms": 31.637842999771237,
|
|
849
|
+
"generation_time_ms": 0.0,
|
|
850
|
+
"tokens_in": 0,
|
|
851
|
+
"tokens_out": 0,
|
|
852
|
+
"retrieval_tokens": 107,
|
|
853
|
+
"query_tokens": 12,
|
|
854
|
+
"context_tokens": 95,
|
|
855
|
+
"judge_tokens_in": 0,
|
|
856
|
+
"judge_tokens_out": 0,
|
|
857
|
+
"judge_latency_ms": 0.0
|
|
858
|
+
},
|
|
859
|
+
{
|
|
860
|
+
"task_id": "cert-en13432-temp",
|
|
861
|
+
"query": "What industrial composting temperature does EN13432 require?",
|
|
862
|
+
"answer": "search-svc API keys \u2014 alex\n---\nIndustrial composting conditions are typically between 58\u00b0C and 60\u00b0C.\n---\nIndustrially compostable under EN13432 (60\u00b0C, 60% humidity, 12 weeks)",
|
|
863
|
+
"hits": [
|
|
864
|
+
{
|
|
865
|
+
"text": "search-svc API keys \u2014 alex",
|
|
866
|
+
"score": 0.7098324676699068,
|
|
867
|
+
"source": "pentatonic-memory",
|
|
868
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
869
|
+
},
|
|
870
|
+
{
|
|
871
|
+
"text": "Industrial composting conditions are typically between 58\u00b0C and 60\u00b0C.",
|
|
872
|
+
"score": 0.7082228724004089,
|
|
873
|
+
"source": "pentatonic-memory",
|
|
874
|
+
"doc_id": "cert-en13432"
|
|
875
|
+
},
|
|
876
|
+
{
|
|
877
|
+
"text": "Industrially compostable under EN13432 (60\u00b0C, 60% humidity, 12 weeks)",
|
|
878
|
+
"score": 0.7044637528947185,
|
|
879
|
+
"source": "pentatonic-memory",
|
|
880
|
+
"doc_id": "material-pla-plant"
|
|
881
|
+
}
|
|
882
|
+
],
|
|
883
|
+
"correct": true,
|
|
884
|
+
"score": 1.0,
|
|
885
|
+
"grading_notes": "all substrings matched",
|
|
886
|
+
"search_time_ms": 28.975945024285465,
|
|
887
|
+
"generation_time_ms": 0.0,
|
|
888
|
+
"tokens_in": 0,
|
|
889
|
+
"tokens_out": 0,
|
|
890
|
+
"retrieval_tokens": 59,
|
|
891
|
+
"query_tokens": 11,
|
|
892
|
+
"context_tokens": 48,
|
|
893
|
+
"judge_tokens_in": 0,
|
|
894
|
+
"judge_tokens_out": 0,
|
|
895
|
+
"judge_latency_ms": 0.0
|
|
896
|
+
},
|
|
897
|
+
{
|
|
898
|
+
"task_id": "reg-uk-epr-scope",
|
|
899
|
+
"query": "Is Pentatonic in scope for UK EPR packaging reporting?",
|
|
900
|
+
"answer": "UK Extended Producer Responsibility (EPR) for packaging: from 2025, packaging producers must report and pay fees based on the weight and recyclability of packaging placed on the UK market. Modulated fees favour recyclable formats. Reporting threshold: \u00a31M turnover AND 25 tonnes of packaging/year. Pentatonic: IN SCOPE. Our filings: due every 6 months.\n---\nreporting-svc consumes OrderPaid for financial reports\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
|
|
901
|
+
"hits": [
|
|
902
|
+
{
|
|
903
|
+
"text": "UK Extended Producer Responsibility (EPR) for packaging: from 2025, packaging producers must report and pay fees based on the weight and recyclability of packaging placed on the UK market. Modulated fees favour recyclable formats. Reporting threshold: \u00a31M turnover AND 25 tonnes of packaging/year. Pentatonic: IN SCOPE. Our filings: due every 6 months.",
|
|
904
|
+
"score": 0.7636955078087055,
|
|
905
|
+
"source": "pentatonic-memory",
|
|
906
|
+
"doc_id": "reg-uk-epr"
|
|
907
|
+
},
|
|
908
|
+
{
|
|
909
|
+
"text": "reporting-svc consumes OrderPaid for financial reports",
|
|
910
|
+
"score": 0.7278477202715807,
|
|
911
|
+
"source": "pentatonic-memory",
|
|
912
|
+
"doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
|
|
913
|
+
},
|
|
914
|
+
{
|
|
915
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
916
|
+
"score": 0.7233511443966896,
|
|
917
|
+
"source": "pentatonic-memory",
|
|
918
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
919
|
+
}
|
|
920
|
+
],
|
|
921
|
+
"correct": true,
|
|
922
|
+
"score": 1.0,
|
|
923
|
+
"grading_notes": "all substrings matched",
|
|
924
|
+
"search_time_ms": 29.966066009365022,
|
|
925
|
+
"generation_time_ms": 0.0,
|
|
926
|
+
"tokens_in": 0,
|
|
927
|
+
"tokens_out": 0,
|
|
928
|
+
"retrieval_tokens": 118,
|
|
929
|
+
"query_tokens": 13,
|
|
930
|
+
"context_tokens": 105,
|
|
931
|
+
"judge_tokens_in": 0,
|
|
932
|
+
"judge_tokens_out": 0,
|
|
933
|
+
"judge_latency_ms": 0.0
|
|
934
|
+
},
|
|
935
|
+
{
|
|
936
|
+
"task_id": "reg-espr-dpp",
|
|
937
|
+
"query": "What are Digital Product Passports and when do they start applying?",
|
|
938
|
+
"answer": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nbilling.new_invoice_pdf\n---\nsearch-svc API keys \u2014 alex",
|
|
939
|
+
"hits": [
|
|
940
|
+
{
|
|
941
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
942
|
+
"score": 0.7360437766023968,
|
|
943
|
+
"source": "pentatonic-memory",
|
|
944
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
945
|
+
},
|
|
946
|
+
{
|
|
947
|
+
"text": "billing.new_invoice_pdf",
|
|
948
|
+
"score": 0.7231612241667263,
|
|
949
|
+
"source": "pentatonic-memory",
|
|
950
|
+
"doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
|
|
951
|
+
},
|
|
952
|
+
{
|
|
953
|
+
"text": "search-svc API keys \u2014 alex",
|
|
954
|
+
"score": 0.7208368232900594,
|
|
955
|
+
"source": "pentatonic-memory",
|
|
956
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
957
|
+
}
|
|
958
|
+
],
|
|
959
|
+
"correct": false,
|
|
960
|
+
"score": 0.0,
|
|
961
|
+
"grading_notes": "missing 2/2: ['ESPR', '2026']",
|
|
962
|
+
"search_time_ms": 29.100234009092674,
|
|
963
|
+
"generation_time_ms": 0.0,
|
|
964
|
+
"tokens_in": 0,
|
|
965
|
+
"tokens_out": 0,
|
|
966
|
+
"retrieval_tokens": 44,
|
|
967
|
+
"query_tokens": 13,
|
|
968
|
+
"context_tokens": 31,
|
|
969
|
+
"judge_tokens_in": 0,
|
|
970
|
+
"judge_tokens_out": 0,
|
|
971
|
+
"judge_latency_ms": 0.0
|
|
972
|
+
},
|
|
973
|
+
{
|
|
974
|
+
"task_id": "reg-ca-sb54-deadline",
|
|
975
|
+
"query": "By when must packaging sold in California be recyclable or compostable?",
|
|
976
|
+
"answer": "California SB 54 requires all packaging sold in California to be recyclable or compostable by 2032\n---\nemail-svc consumes OrderPaid for the receipt email\n---\nbilling.new_invoice_pdf",
|
|
977
|
+
"hits": [
|
|
978
|
+
{
|
|
979
|
+
"text": "California SB 54 requires all packaging sold in California to be recyclable or compostable by 2032",
|
|
980
|
+
"score": 0.7258111675299815,
|
|
981
|
+
"source": "pentatonic-memory",
|
|
982
|
+
"doc_id": "reg-ca-sb54"
|
|
983
|
+
},
|
|
984
|
+
{
|
|
985
|
+
"text": "email-svc consumes OrderPaid for the receipt email",
|
|
986
|
+
"score": 0.7082859621730703,
|
|
987
|
+
"source": "pentatonic-memory",
|
|
988
|
+
"doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
|
|
989
|
+
},
|
|
990
|
+
{
|
|
991
|
+
"text": "billing.new_invoice_pdf",
|
|
992
|
+
"score": 0.7064729824939898,
|
|
993
|
+
"source": "pentatonic-memory",
|
|
994
|
+
"doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
|
|
995
|
+
}
|
|
996
|
+
],
|
|
997
|
+
"correct": true,
|
|
998
|
+
"score": 1.0,
|
|
999
|
+
"grading_notes": "all substrings matched",
|
|
1000
|
+
"search_time_ms": 32.30167299625464,
|
|
1001
|
+
"generation_time_ms": 0.0,
|
|
1002
|
+
"tokens_in": 0,
|
|
1003
|
+
"tokens_out": 0,
|
|
1004
|
+
"retrieval_tokens": 53,
|
|
1005
|
+
"query_tokens": 14,
|
|
1006
|
+
"context_tokens": 39,
|
|
1007
|
+
"judge_tokens_in": 0,
|
|
1008
|
+
"judge_tokens_out": 0,
|
|
1009
|
+
"judge_latency_ms": 0.0
|
|
1010
|
+
},
|
|
1011
|
+
{
|
|
1012
|
+
"task_id": "reg-lithium-return",
|
|
1013
|
+
"query": "Can lithium-containing devices be returned through our standard take-back label?",
|
|
1014
|
+
"answer": "Lithium cell regulation: devices containing lithium cells are subject to UN 3480/3481 transport rules and cannot be returned via standard take-back labels. Pentatonic's policy: take-back for devices with lithium cells requires a dedicated labelled pack and courier pickup (not drop-in). Current catalogue impact: none (we do not sell lithium-containing products).\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
1015
|
+
"hits": [
|
|
1016
|
+
{
|
|
1017
|
+
"text": "Lithium cell regulation: devices containing lithium cells are subject to UN 3480/3481 transport rules and cannot be returned via standard take-back labels. Pentatonic's policy: take-back for devices with lithium cells requires a dedicated labelled pack and courier pickup (not drop-in). Current catalogue impact: none (we do not sell lithium-containing products).",
|
|
1018
|
+
"score": 0.7740864588286744,
|
|
1019
|
+
"source": "pentatonic-memory",
|
|
1020
|
+
"doc_id": "reg-hazardous-lithium"
|
|
1021
|
+
},
|
|
1022
|
+
{
|
|
1023
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
1024
|
+
"score": 0.7148047633975116,
|
|
1025
|
+
"source": "pentatonic-memory",
|
|
1026
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
1027
|
+
},
|
|
1028
|
+
{
|
|
1029
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
1030
|
+
"score": 0.6989524168963428,
|
|
1031
|
+
"source": "pentatonic-memory",
|
|
1032
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
1033
|
+
}
|
|
1034
|
+
],
|
|
1035
|
+
"correct": true,
|
|
1036
|
+
"score": 1.0,
|
|
1037
|
+
"grading_notes": "all substrings matched",
|
|
1038
|
+
"search_time_ms": 32.68404499976896,
|
|
1039
|
+
"generation_time_ms": 0.0,
|
|
1040
|
+
"tokens_in": 0,
|
|
1041
|
+
"tokens_out": 0,
|
|
1042
|
+
"retrieval_tokens": 119,
|
|
1043
|
+
"query_tokens": 13,
|
|
1044
|
+
"context_tokens": 106,
|
|
1045
|
+
"judge_tokens_in": 0,
|
|
1046
|
+
"judge_tokens_out": 0,
|
|
1047
|
+
"judge_latency_ms": 0.0
|
|
1048
|
+
},
|
|
1049
|
+
{
|
|
1050
|
+
"task_id": "closed-loop-threshold",
|
|
1051
|
+
"query": "At what mass-recovery percentage is a SKU considered closed-loop in our system?",
|
|
1052
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc\n---\nreporting-svc consumes OrderPaid for financial reports",
|
|
1053
|
+
"hits": [
|
|
1054
|
+
{
|
|
1055
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
1056
|
+
"score": 0.7480205138026356,
|
|
1057
|
+
"source": "pentatonic-memory",
|
|
1058
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
1059
|
+
},
|
|
1060
|
+
{
|
|
1061
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
1062
|
+
"score": 0.7337613102429211,
|
|
1063
|
+
"source": "pentatonic-memory",
|
|
1064
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
1065
|
+
},
|
|
1066
|
+
{
|
|
1067
|
+
"text": "reporting-svc consumes OrderPaid for financial reports",
|
|
1068
|
+
"score": 0.7232225784347823,
|
|
1069
|
+
"source": "pentatonic-memory",
|
|
1070
|
+
"doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
|
|
1071
|
+
}
|
|
1072
|
+
],
|
|
1073
|
+
"correct": false,
|
|
1074
|
+
"score": 0.0,
|
|
1075
|
+
"grading_notes": "missing 1/1: ['80%']",
|
|
1076
|
+
"search_time_ms": 36.26873600296676,
|
|
1077
|
+
"generation_time_ms": 0.0,
|
|
1078
|
+
"tokens_in": 0,
|
|
1079
|
+
"tokens_out": 0,
|
|
1080
|
+
"retrieval_tokens": 53,
|
|
1081
|
+
"query_tokens": 16,
|
|
1082
|
+
"context_tokens": 37,
|
|
1083
|
+
"judge_tokens_in": 0,
|
|
1084
|
+
"judge_tokens_out": 0,
|
|
1085
|
+
"judge_latency_ms": 0.0
|
|
1086
|
+
},
|
|
1087
|
+
{
|
|
1088
|
+
"task_id": "rubric-customer-greenwash-claim",
|
|
1089
|
+
"query": "A customer asks: 'Are your Loop cups just the same as plastic cups but with a greener label?' Write an honest response.",
|
|
1090
|
+
"answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nemail-svc consumes OrderPaid for the receipt email\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
1091
|
+
"hits": [
|
|
1092
|
+
{
|
|
1093
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
1094
|
+
"score": 0.7258701666241831,
|
|
1095
|
+
"source": "pentatonic-memory",
|
|
1096
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
1097
|
+
},
|
|
1098
|
+
{
|
|
1099
|
+
"text": "email-svc consumes OrderPaid for the receipt email",
|
|
1100
|
+
"score": 0.7257878423171912,
|
|
1101
|
+
"source": "pentatonic-memory",
|
|
1102
|
+
"doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
|
|
1103
|
+
},
|
|
1104
|
+
{
|
|
1105
|
+
"text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
|
|
1106
|
+
"score": 0.7228502526867759,
|
|
1107
|
+
"source": "pentatonic-memory",
|
|
1108
|
+
"doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
|
|
1109
|
+
}
|
|
1110
|
+
],
|
|
1111
|
+
"correct": false,
|
|
1112
|
+
"score": 0.0,
|
|
1113
|
+
"grading_notes": "judge=0.00 \u2014 The answer does not address the customer's question about Loop cups at all.",
|
|
1114
|
+
"search_time_ms": 35.61789699597284,
|
|
1115
|
+
"generation_time_ms": 0.0,
|
|
1116
|
+
"tokens_in": 0,
|
|
1117
|
+
"tokens_out": 0,
|
|
1118
|
+
"retrieval_tokens": 71,
|
|
1119
|
+
"query_tokens": 27,
|
|
1120
|
+
"context_tokens": 44,
|
|
1121
|
+
"judge_tokens_in": 400,
|
|
1122
|
+
"judge_tokens_out": 29,
|
|
1123
|
+
"judge_latency_ms": 653.5468610227108
|
|
1124
|
+
},
|
|
1125
|
+
{
|
|
1126
|
+
"task_id": "rubric-full-takeback-story-atlas",
|
|
1127
|
+
"query": "Walk me through the complete material journey of an Atlas Phone Shell returned via take-back.",
|
|
1128
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc\n---\nThe Atlas project is currently in development.",
|
|
1129
|
+
"hits": [
|
|
1130
|
+
{
|
|
1131
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
1132
|
+
"score": 0.7139806729827953,
|
|
1133
|
+
"source": "pentatonic-memory",
|
|
1134
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
1135
|
+
},
|
|
1136
|
+
{
|
|
1137
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
1138
|
+
"score": 0.7042797692854087,
|
|
1139
|
+
"source": "pentatonic-memory",
|
|
1140
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
1141
|
+
},
|
|
1142
|
+
{
|
|
1143
|
+
"text": "The Atlas project is currently in development.",
|
|
1144
|
+
"score": 0.7032975323621676,
|
|
1145
|
+
"source": "pentatonic-memory",
|
|
1146
|
+
"doc_id": "mem_4597a5ae-4f71-4e63-a5be-7d7b130001ee"
|
|
1147
|
+
}
|
|
1148
|
+
],
|
|
1149
|
+
"correct": false,
|
|
1150
|
+
"score": 0.0,
|
|
1151
|
+
"grading_notes": "judge=0.00 \u2014 The answer does not cover any stages of the material journey of an Atlas Phone Shell returned via take-back, instead pro",
|
|
1152
|
+
"search_time_ms": 36.97797399945557,
|
|
1153
|
+
"generation_time_ms": 0.0,
|
|
1154
|
+
"tokens_in": 0,
|
|
1155
|
+
"tokens_out": 0,
|
|
1156
|
+
"retrieval_tokens": 52,
|
|
1157
|
+
"query_tokens": 17,
|
|
1158
|
+
"context_tokens": 35,
|
|
1159
|
+
"judge_tokens_in": 363,
|
|
1160
|
+
"judge_tokens_out": 46,
|
|
1161
|
+
"judge_latency_ms": 960.7863810062408
|
|
1162
|
+
},
|
|
1163
|
+
{
|
|
1164
|
+
"task_id": "rubric-regulatory-scope-briefing",
|
|
1165
|
+
"query": "Give me a short regulatory briefing on what applies to Pentatonic in the UK, EU, and California.",
|
|
1166
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nreporting-svc consumes OrderPaid for financial reports",
|
|
1167
|
+
"hits": [
|
|
1168
|
+
{
|
|
1169
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
1170
|
+
"score": 0.7073277452568545,
|
|
1171
|
+
"source": "pentatonic-memory",
|
|
1172
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
1173
|
+
},
|
|
1174
|
+
{
|
|
1175
|
+
"text": "order-svc marks order as paid, emits OrderPaid on pubsub",
|
|
1176
|
+
"score": 0.7058599186870722,
|
|
1177
|
+
"source": "pentatonic-memory",
|
|
1178
|
+
"doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
|
|
1179
|
+
},
|
|
1180
|
+
{
|
|
1181
|
+
"text": "reporting-svc consumes OrderPaid for financial reports",
|
|
1182
|
+
"score": 0.7057759064878902,
|
|
1183
|
+
"source": "pentatonic-memory",
|
|
1184
|
+
"doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
|
|
1185
|
+
}
|
|
1186
|
+
],
|
|
1187
|
+
"correct": false,
|
|
1188
|
+
"score": 0.0,
|
|
1189
|
+
"grading_notes": "judge=0.00 \u2014 The answer does not address the regulatory briefing query at all and instead appears to be related to Kubernetes command",
|
|
1190
|
+
"search_time_ms": 34.713215980445966,
|
|
1191
|
+
"generation_time_ms": 0.0,
|
|
1192
|
+
"tokens_in": 0,
|
|
1193
|
+
"tokens_out": 0,
|
|
1194
|
+
"retrieval_tokens": 65,
|
|
1195
|
+
"query_tokens": 22,
|
|
1196
|
+
"context_tokens": 43,
|
|
1197
|
+
"judge_tokens_in": 395,
|
|
1198
|
+
"judge_tokens_out": 35,
|
|
1199
|
+
"judge_latency_ms": 703.8500519990921
|
|
1200
|
+
},
|
|
1201
|
+
{
|
|
1202
|
+
"task_id": "entity-closed-loop-skus",
|
|
1203
|
+
"query": "Which SKUs in our catalogue achieve closed-loop material recovery?",
|
|
1204
|
+
"answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc\n---\nbilling-svc API keys \u2014 jordan",
|
|
1205
|
+
"hits": [
|
|
1206
|
+
{
|
|
1207
|
+
"text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
|
|
1208
|
+
"score": 0.739307648997999,
|
|
1209
|
+
"source": "pentatonic-memory",
|
|
1210
|
+
"doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
|
|
1211
|
+
},
|
|
1212
|
+
{
|
|
1213
|
+
"text": "kubectl rollout restart deployment/billing-svc",
|
|
1214
|
+
"score": 0.7293644391954482,
|
|
1215
|
+
"source": "pentatonic-memory",
|
|
1216
|
+
"doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
|
|
1217
|
+
},
|
|
1218
|
+
{
|
|
1219
|
+
"text": "billing-svc API keys \u2014 jordan",
|
|
1220
|
+
"score": 0.7265604833960109,
|
|
1221
|
+
"source": "pentatonic-memory",
|
|
1222
|
+
"doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
|
|
1223
|
+
}
|
|
1224
|
+
],
|
|
1225
|
+
"correct": false,
|
|
1226
|
+
"score": 0.0,
|
|
1227
|
+
"grading_notes": "no expected_substrings set",
|
|
1228
|
+
"search_time_ms": 31.940229993779212,
|
|
1229
|
+
"generation_time_ms": 0.0,
|
|
1230
|
+
"tokens_in": 0,
|
|
1231
|
+
"tokens_out": 0,
|
|
1232
|
+
"retrieval_tokens": 47,
|
|
1233
|
+
"query_tokens": 12,
|
|
1234
|
+
"context_tokens": 35,
|
|
1235
|
+
"judge_tokens_in": 0,
|
|
1236
|
+
"judge_tokens_out": 0,
|
|
1237
|
+
"judge_latency_ms": 0.0
|
|
1238
|
+
},
|
|
1239
|
+
{
|
|
1240
|
+
"task_id": "entity-home-compostable",
|
|
1241
|
+
"query": "List any materials in our catalogue that are home-compostable.",
|
|
1242
|
+
"answer": "search-svc API keys \u2014 alex\n---\nreporting-svc consumes OrderPaid for financial reports\n---\nNOT home-compostable",
|
|
1243
|
+
"hits": [
|
|
1244
|
+
{
|
|
1245
|
+
"text": "search-svc API keys \u2014 alex",
|
|
1246
|
+
"score": 0.6961037697159256,
|
|
1247
|
+
"source": "pentatonic-memory",
|
|
1248
|
+
"doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
|
|
1249
|
+
},
|
|
1250
|
+
{
|
|
1251
|
+
"text": "reporting-svc consumes OrderPaid for financial reports",
|
|
1252
|
+
"score": 0.6880397333304152,
|
|
1253
|
+
"source": "pentatonic-memory",
|
|
1254
|
+
"doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
|
|
1255
|
+
},
|
|
1256
|
+
{
|
|
1257
|
+
"text": "NOT home-compostable",
|
|
1258
|
+
"score": 0.6861630088900115,
|
|
1259
|
+
"source": "pentatonic-memory",
|
|
1260
|
+
"doc_id": "material-pla-plant"
|
|
1261
|
+
}
|
|
1262
|
+
],
|
|
1263
|
+
"correct": false,
|
|
1264
|
+
"score": 0.6666666666666666,
|
|
1265
|
+
"grading_notes": "missing 1/3: ['none']",
|
|
1266
|
+
"search_time_ms": 29.149259004043415,
|
|
1267
|
+
"generation_time_ms": 0.0,
|
|
1268
|
+
"tokens_in": 0,
|
|
1269
|
+
"tokens_out": 0,
|
|
1270
|
+
"retrieval_tokens": 39,
|
|
1271
|
+
"query_tokens": 13,
|
|
1272
|
+
"context_tokens": 26,
|
|
1273
|
+
"judge_tokens_in": 0,
|
|
1274
|
+
"judge_tokens_out": 0,
|
|
1275
|
+
"judge_latency_ms": 0.0
|
|
1276
|
+
}
|
|
1277
|
+
]
|
|
1278
|
+
}
|