@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,894 @@
1
+ {
2
+ "bench": "circular-economy",
3
+ "stack": "pentatonic-memory",
4
+ "n_tasks": 25,
5
+ "n_correct": 10,
6
+ "accuracy": 0.4,
7
+ "mean_score": 0.42,
8
+ "p50_search_ms": 27.48094199341722,
9
+ "p95_search_ms": 32.77066470764112,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "factoid": {
15
+ "n": 14,
16
+ "mean_score": 0.5,
17
+ "accuracy": 0.5
18
+ },
19
+ "material": {
20
+ "n": 8,
21
+ "mean_score": 0.125,
22
+ "accuracy": 0.125
23
+ },
24
+ "takeback": {
25
+ "n": 6,
26
+ "mean_score": 0.5833333333333334,
27
+ "accuracy": 0.5
28
+ },
29
+ "lifecycle": {
30
+ "n": 2,
31
+ "mean_score": 0.5,
32
+ "accuracy": 0.5
33
+ },
34
+ "multi-fact": {
35
+ "n": 1,
36
+ "mean_score": 0.0,
37
+ "accuracy": 0.0
38
+ },
39
+ "policy": {
40
+ "n": 3,
41
+ "mean_score": 0.3333333333333333,
42
+ "accuracy": 0.3333333333333333
43
+ },
44
+ "certification": {
45
+ "n": 4,
46
+ "mean_score": 0.75,
47
+ "accuracy": 0.75
48
+ },
49
+ "multi-doc": {
50
+ "n": 2,
51
+ "mean_score": 0.5,
52
+ "accuracy": 0.5
53
+ },
54
+ "regulation": {
55
+ "n": 5,
56
+ "mean_score": 0.6,
57
+ "accuracy": 0.6
58
+ },
59
+ "concept": {
60
+ "n": 1,
61
+ "mean_score": 0.0,
62
+ "accuracy": 0.0
63
+ },
64
+ "rubric": {
65
+ "n": 3,
66
+ "mean_score": 0.16666666666666666,
67
+ "accuracy": 0.0
68
+ },
69
+ "honesty": {
70
+ "n": 1,
71
+ "mean_score": 0.0,
72
+ "accuracy": 0.0
73
+ },
74
+ "multi-hop": {
75
+ "n": 2,
76
+ "mean_score": 0.25,
77
+ "accuracy": 0.0
78
+ },
79
+ "entity": {
80
+ "n": 1,
81
+ "mean_score": 0.0,
82
+ "accuracy": 0.0
83
+ },
84
+ "negative": {
85
+ "n": 1,
86
+ "mean_score": 0.0,
87
+ "accuracy": 0.0
88
+ }
89
+ },
90
+ "extra": {
91
+ "ingest_ms": 11691.628797998419,
92
+ "grading": "substring",
93
+ "limit": 3,
94
+ "tokens": {
95
+ "corpus_tokens": 1459,
96
+ "query_tokens": 359,
97
+ "context_tokens": 1025,
98
+ "retrieval_tokens": 1384,
99
+ "naive_tokens": 36834,
100
+ "saved_tokens": 35450,
101
+ "reduction_pct": 0.9624260194385622,
102
+ "mean_retrieval_tokens_per_task": 55.36,
103
+ "tokenizer": "cl100k_base",
104
+ "per_task": {
105
+ "atlas-material-source": {
106
+ "query": 15,
107
+ "context": 0,
108
+ "retrieval": 15,
109
+ "judge_in": 0,
110
+ "judge_out": 0,
111
+ "judge_latency_ms": 0.0
112
+ },
113
+ "atlas-takeback-credit": {
114
+ "query": 15,
115
+ "context": 0,
116
+ "retrieval": 15,
117
+ "judge_in": 0,
118
+ "judge_out": 0,
119
+ "judge_latency_ms": 0.0
120
+ },
121
+ "atlas-closed-loop": {
122
+ "query": 8,
123
+ "context": 85,
124
+ "retrieval": 93,
125
+ "judge_in": 0,
126
+ "judge_out": 0,
127
+ "judge_latency_ms": 0.0
128
+ },
129
+ "luna-takeback-split": {
130
+ "query": 16,
131
+ "context": 0,
132
+ "retrieval": 16,
133
+ "judge_in": 0,
134
+ "judge_out": 0,
135
+ "judge_latency_ms": 0.0
136
+ },
137
+ "luna-silicone-fate": {
138
+ "query": 13,
139
+ "context": 95,
140
+ "retrieval": 108,
141
+ "judge_in": 0,
142
+ "judge_out": 0,
143
+ "judge_latency_ms": 0.0
144
+ },
145
+ "pla-home-compost": {
146
+ "query": 10,
147
+ "context": 88,
148
+ "retrieval": 98,
149
+ "judge_in": 0,
150
+ "judge_out": 0,
151
+ "judge_latency_ms": 0.0
152
+ },
153
+ "pla-hot-drinks": {
154
+ "query": 10,
155
+ "context": 0,
156
+ "retrieval": 10,
157
+ "judge_in": 0,
158
+ "judge_out": 0,
159
+ "judge_latency_ms": 0.0
160
+ },
161
+ "pbat-local-authority": {
162
+ "query": 13,
163
+ "context": 0,
164
+ "retrieval": 13,
165
+ "judge_in": 0,
166
+ "judge_out": 0,
167
+ "judge_latency_ms": 0.0
168
+ },
169
+ "ghost-net-source": {
170
+ "query": 15,
171
+ "context": 0,
172
+ "retrieval": 15,
173
+ "judge_in": 0,
174
+ "judge_out": 0,
175
+ "judge_latency_ms": 0.0
176
+ },
177
+ "kite-harness-foam-recovery": {
178
+ "query": 18,
179
+ "context": 91,
180
+ "retrieval": 109,
181
+ "judge_in": 0,
182
+ "judge_out": 0,
183
+ "judge_latency_ms": 0.0
184
+ },
185
+ "haven-sleeve-bottles-15": {
186
+ "query": 17,
187
+ "context": 0,
188
+ "retrieval": 17,
189
+ "judge_in": 0,
190
+ "judge_out": 0,
191
+ "judge_latency_ms": 0.0
192
+ },
193
+ "cert-c2c-tiers": {
194
+ "query": 12,
195
+ "context": 58,
196
+ "retrieval": 70,
197
+ "judge_in": 0,
198
+ "judge_out": 0,
199
+ "judge_latency_ms": 0.0
200
+ },
201
+ "cert-c2c-our-products": {
202
+ "query": 14,
203
+ "context": 58,
204
+ "retrieval": 72,
205
+ "judge_in": 0,
206
+ "judge_out": 0,
207
+ "judge_latency_ms": 0.0
208
+ },
209
+ "cert-grs-threshold": {
210
+ "query": 12,
211
+ "context": 72,
212
+ "retrieval": 84,
213
+ "judge_in": 0,
214
+ "judge_out": 0,
215
+ "judge_latency_ms": 0.0
216
+ },
217
+ "cert-en13432-temp": {
218
+ "query": 11,
219
+ "context": 85,
220
+ "retrieval": 96,
221
+ "judge_in": 0,
222
+ "judge_out": 0,
223
+ "judge_latency_ms": 0.0
224
+ },
225
+ "reg-uk-epr-scope": {
226
+ "query": 13,
227
+ "context": 78,
228
+ "retrieval": 91,
229
+ "judge_in": 0,
230
+ "judge_out": 0,
231
+ "judge_latency_ms": 0.0
232
+ },
233
+ "reg-espr-dpp": {
234
+ "query": 13,
235
+ "context": 0,
236
+ "retrieval": 13,
237
+ "judge_in": 0,
238
+ "judge_out": 0,
239
+ "judge_latency_ms": 0.0
240
+ },
241
+ "reg-ca-sb54-deadline": {
242
+ "query": 14,
243
+ "context": 73,
244
+ "retrieval": 87,
245
+ "judge_in": 0,
246
+ "judge_out": 0,
247
+ "judge_latency_ms": 0.0
248
+ },
249
+ "reg-lithium-return": {
250
+ "query": 13,
251
+ "context": 72,
252
+ "retrieval": 85,
253
+ "judge_in": 0,
254
+ "judge_out": 0,
255
+ "judge_latency_ms": 0.0
256
+ },
257
+ "closed-loop-threshold": {
258
+ "query": 16,
259
+ "context": 0,
260
+ "retrieval": 16,
261
+ "judge_in": 0,
262
+ "judge_out": 0,
263
+ "judge_latency_ms": 0.0
264
+ },
265
+ "rubric-customer-greenwash-claim": {
266
+ "query": 27,
267
+ "context": 0,
268
+ "retrieval": 27,
269
+ "judge_in": 355,
270
+ "judge_out": 62,
271
+ "judge_latency_ms": 1281.8933610022068
272
+ },
273
+ "rubric-full-takeback-story-atlas": {
274
+ "query": 17,
275
+ "context": 85,
276
+ "retrieval": 102,
277
+ "judge_in": 413,
278
+ "judge_out": 46,
279
+ "judge_latency_ms": 1095.577819019556
280
+ },
281
+ "rubric-regulatory-scope-briefing": {
282
+ "query": 22,
283
+ "context": 0,
284
+ "retrieval": 22,
285
+ "judge_in": 351,
286
+ "judge_out": 38,
287
+ "judge_latency_ms": 745.5519150197506
288
+ },
289
+ "entity-closed-loop-skus": {
290
+ "query": 12,
291
+ "context": 0,
292
+ "retrieval": 12,
293
+ "judge_in": 0,
294
+ "judge_out": 0,
295
+ "judge_latency_ms": 0.0
296
+ },
297
+ "entity-home-compostable": {
298
+ "query": 13,
299
+ "context": 85,
300
+ "retrieval": 98,
301
+ "judge_in": 0,
302
+ "judge_out": 0,
303
+ "judge_latency_ms": 0.0
304
+ }
305
+ },
306
+ "judge_tokens_in": 1119,
307
+ "judge_tokens_out": 146,
308
+ "judge_calls": 3,
309
+ "judge_mean_latency_ms": 1041.007698347171
310
+ },
311
+ "cost_usd": {
312
+ "assumed_completion_tokens_per_task": 100,
313
+ "rates": {
314
+ "input_per_1k": 0.0025,
315
+ "output_per_1k": 0.01,
316
+ "model": "gpt-4o"
317
+ },
318
+ "retrieval_usd_in": 0.00346,
319
+ "retrieval_usd_out": 0.025,
320
+ "retrieval_usd_total": 0.028460000000000003,
321
+ "naive_usd_total": 0.11708500000000002,
322
+ "saved_usd": 0.08862500000000002,
323
+ "saved_usd_per_1k_tasks": 3.545000000000001
324
+ }
325
+ },
326
+ "task_results": [
327
+ {
328
+ "task_id": "atlas-material-source",
329
+ "query": "Where does the recycled polypropylene in the Atlas Phone Shell come from?",
330
+ "answer": "",
331
+ "hits": [],
332
+ "correct": false,
333
+ "score": 0.0,
334
+ "grading_notes": "missing 3/3: ['ocean-bound', '50km', 'coastlines']",
335
+ "search_time_ms": 29.875653999624774,
336
+ "generation_time_ms": 0.0,
337
+ "tokens_in": 0,
338
+ "tokens_out": 0,
339
+ "retrieval_tokens": 15,
340
+ "query_tokens": 15,
341
+ "context_tokens": 0,
342
+ "judge_tokens_in": 0,
343
+ "judge_tokens_out": 0,
344
+ "judge_latency_ms": 0.0
345
+ },
346
+ {
347
+ "task_id": "atlas-takeback-credit",
348
+ "query": "What credit do I get for returning an Atlas Phone Shell via take-back?",
349
+ "answer": "",
350
+ "hits": [],
351
+ "correct": false,
352
+ "score": 0.0,
353
+ "grading_notes": "missing 1/1: ['50%']",
354
+ "search_time_ms": 27.204331010580063,
355
+ "generation_time_ms": 0.0,
356
+ "tokens_in": 0,
357
+ "tokens_out": 0,
358
+ "retrieval_tokens": 15,
359
+ "query_tokens": 15,
360
+ "context_tokens": 0,
361
+ "judge_tokens_in": 0,
362
+ "judge_tokens_out": 0,
363
+ "judge_latency_ms": 0.0
364
+ },
365
+ {
366
+ "task_id": "atlas-closed-loop",
367
+ "query": "Is the Atlas Phone Shell closed-loop?",
368
+ "answer": "Take-back programme \u2014 Atlas Phone Shell (ATL-PHN-001). Accepted condition: any, including broken. Credit: 50% of original purchase price as store credit. Process: free returns label at pentatonic.com/takeback/ATL-PHN-001. Recovery route: shell is mechanically shredded, rPP-OB fraction recovered and fed back into the next production run. Closed-loop: YES.",
369
+ "hits": [
370
+ {
371
+ "text": "Take-back programme \u2014 Atlas Phone Shell (ATL-PHN-001). Accepted condition: any, including broken. Credit: 50% of original purchase price as store credit. Process: free returns label at pentatonic.com/takeback/ATL-PHN-001. Recovery route: shell is mechanically shredded, rPP-OB fraction recovered and fed back into the next production run. Closed-loop: YES.",
372
+ "score": 0.7255719755593892,
373
+ "source": "pentatonic-memory",
374
+ "doc_id": "takeback-atlas-phone"
375
+ }
376
+ ],
377
+ "correct": true,
378
+ "score": 1.0,
379
+ "grading_notes": "all substrings matched",
380
+ "search_time_ms": 26.919080992229283,
381
+ "generation_time_ms": 0.0,
382
+ "tokens_in": 0,
383
+ "tokens_out": 0,
384
+ "retrieval_tokens": 93,
385
+ "query_tokens": 8,
386
+ "context_tokens": 85,
387
+ "judge_tokens_in": 0,
388
+ "judge_tokens_out": 0,
389
+ "judge_latency_ms": 0.0
390
+ },
391
+ {
392
+ "task_id": "luna-takeback-split",
393
+ "query": "How is the take-back credit split between body and lid for the Luna bottle?",
394
+ "answer": "",
395
+ "hits": [],
396
+ "correct": false,
397
+ "score": 0.0,
398
+ "grading_notes": "missing 2/2: ['40%', '10%']",
399
+ "search_time_ms": 31.533285975456238,
400
+ "generation_time_ms": 0.0,
401
+ "tokens_in": 0,
402
+ "tokens_out": 0,
403
+ "retrieval_tokens": 16,
404
+ "query_tokens": 16,
405
+ "context_tokens": 0,
406
+ "judge_tokens_in": 0,
407
+ "judge_tokens_out": 0,
408
+ "judge_latency_ms": 0.0
409
+ },
410
+ {
411
+ "task_id": "luna-silicone-fate",
412
+ "query": "What happens to the Luna bottle's silicone lid after take-back?",
413
+ "answer": "Take-back programme \u2014 Luna Water Bottle 500ml (LUN-BTL-042). Accepted condition: any. Credit: 40% of original price as store credit (body) + 10% (lid, if returned intact). Process: drop-off at any UK Pentatonic pop-up, or mail-in via takeback label. Recovery route: steel body enters industrial recycling (not closed-loop); silicone lid is down-cycled into playground surfacing granulate.",
414
+ "hits": [
415
+ {
416
+ "text": "Take-back programme \u2014 Luna Water Bottle 500ml (LUN-BTL-042). Accepted condition: any. Credit: 40% of original price as store credit (body) + 10% (lid, if returned intact). Process: drop-off at any UK Pentatonic pop-up, or mail-in via takeback label. Recovery route: steel body enters industrial recycling (not closed-loop); silicone lid is down-cycled into playground surfacing granulate.",
417
+ "score": 0.5304672081450654,
418
+ "source": "pentatonic-memory",
419
+ "doc_id": "takeback-luna-bottle"
420
+ }
421
+ ],
422
+ "correct": true,
423
+ "score": 1.0,
424
+ "grading_notes": "all substrings matched",
425
+ "search_time_ms": 32.17602800577879,
426
+ "generation_time_ms": 0.0,
427
+ "tokens_in": 0,
428
+ "tokens_out": 0,
429
+ "retrieval_tokens": 108,
430
+ "query_tokens": 13,
431
+ "context_tokens": 95,
432
+ "judge_tokens_in": 0,
433
+ "judge_tokens_out": 0,
434
+ "judge_latency_ms": 0.0
435
+ },
436
+ {
437
+ "task_id": "pla-home-compost",
438
+ "query": "Can I compost a Loop coffee cup at home?",
439
+ "answer": "Material: PLA (polylactic acid). Bioplastic derived from corn starch (US Midwest feedstock for our supply). Industrially compostable under EN13432 (60\u00b0C, 60% humidity, 12 weeks). NOT home-compostable and NOT recyclable through standard plastic streams. Degradation threshold: 60\u00b0C \u2014 avoid hot liquids for direct-contact applications. Used in: Loop Reusable Coffee Cup body.",
440
+ "hits": [
441
+ {
442
+ "text": "Material: PLA (polylactic acid). Bioplastic derived from corn starch (US Midwest feedstock for our supply). Industrially compostable under EN13432 (60\u00b0C, 60% humidity, 12 weeks). NOT home-compostable and NOT recyclable through standard plastic streams. Degradation threshold: 60\u00b0C \u2014 avoid hot liquids for direct-contact applications. Used in: Loop Reusable Coffee Cup body.",
443
+ "score": 0.7333944698730844,
444
+ "source": "pentatonic-memory",
445
+ "doc_id": "material-pla-plant"
446
+ }
447
+ ],
448
+ "correct": true,
449
+ "score": 1.0,
450
+ "grading_notes": "all substrings matched",
451
+ "search_time_ms": 27.48094199341722,
452
+ "generation_time_ms": 0.0,
453
+ "tokens_in": 0,
454
+ "tokens_out": 0,
455
+ "retrieval_tokens": 98,
456
+ "query_tokens": 10,
457
+ "context_tokens": 88,
458
+ "judge_tokens_in": 0,
459
+ "judge_tokens_out": 0,
460
+ "judge_latency_ms": 0.0
461
+ },
462
+ {
463
+ "task_id": "pla-hot-drinks",
464
+ "query": "Can I put hot coffee in a Loop cup?",
465
+ "answer": "",
466
+ "hits": [],
467
+ "correct": false,
468
+ "score": 0.0,
469
+ "grading_notes": "missing 2/2: ['60\u00b0C', 'avoid hot']",
470
+ "search_time_ms": 25.262823997763917,
471
+ "generation_time_ms": 0.0,
472
+ "tokens_in": 0,
473
+ "tokens_out": 0,
474
+ "retrieval_tokens": 10,
475
+ "query_tokens": 10,
476
+ "context_tokens": 0,
477
+ "judge_tokens_in": 0,
478
+ "judge_tokens_out": 0,
479
+ "judge_latency_ms": 0.0
480
+ },
481
+ {
482
+ "task_id": "pbat-local-authority",
483
+ "query": "Are Root bin liners accepted in UK council food-waste streams?",
484
+ "answer": "",
485
+ "hits": [],
486
+ "correct": false,
487
+ "score": 0.0,
488
+ "grading_notes": "missing 2/2: ['most', 'check locally']",
489
+ "search_time_ms": 24.665570992510766,
490
+ "generation_time_ms": 0.0,
491
+ "tokens_in": 0,
492
+ "tokens_out": 0,
493
+ "retrieval_tokens": 13,
494
+ "query_tokens": 13,
495
+ "context_tokens": 0,
496
+ "judge_tokens_in": 0,
497
+ "judge_tokens_out": 0,
498
+ "judge_latency_ms": 0.0
499
+ },
500
+ {
501
+ "task_id": "ghost-net-source",
502
+ "query": "Which NGO supplies the ghost-net nylon for the Nomad Kite Harness?",
503
+ "answer": "",
504
+ "hits": [],
505
+ "correct": false,
506
+ "score": 0.0,
507
+ "grading_notes": "missing 1/1: ['Healthy Seas']",
508
+ "search_time_ms": 29.78819701820612,
509
+ "generation_time_ms": 0.0,
510
+ "tokens_in": 0,
511
+ "tokens_out": 0,
512
+ "retrieval_tokens": 15,
513
+ "query_tokens": 15,
514
+ "context_tokens": 0,
515
+ "judge_tokens_in": 0,
516
+ "judge_tokens_out": 0,
517
+ "judge_latency_ms": 0.0
518
+ },
519
+ {
520
+ "task_id": "kite-harness-foam-recovery",
521
+ "query": "Is the EVA foam padding in the Nomad Kite Harness recovered via take-back?",
522
+ "answer": "Take-back programme \u2014 Nomad Kite Harness v3 (NMD-HRN-V3). Accepted condition: worn but repairable or end-of-life. Credit: 25% of original price. Alternative: repair-not-replace via the Nomad repair service (\u00a325 flat fee). Recovery route: ghost-net nylon is mechanically recycled back into new harness shells \u2014 closed-loop. EVA foam padding is NOT recovered (currently sent to energy-from-waste).",
523
+ "hits": [
524
+ {
525
+ "text": "Take-back programme \u2014 Nomad Kite Harness v3 (NMD-HRN-V3). Accepted condition: worn but repairable or end-of-life. Credit: 25% of original price. Alternative: repair-not-replace via the Nomad repair service (\u00a325 flat fee). Recovery route: ghost-net nylon is mechanically recycled back into new harness shells \u2014 closed-loop. EVA foam padding is NOT recovered (currently sent to energy-from-waste).",
526
+ "score": 0.7871008236065192,
527
+ "source": "pentatonic-memory",
528
+ "doc_id": "takeback-kite-harness"
529
+ }
530
+ ],
531
+ "correct": true,
532
+ "score": 1.0,
533
+ "grading_notes": "all substrings matched",
534
+ "search_time_ms": 31.17753900005482,
535
+ "generation_time_ms": 0.0,
536
+ "tokens_in": 0,
537
+ "tokens_out": 0,
538
+ "retrieval_tokens": 109,
539
+ "query_tokens": 18,
540
+ "context_tokens": 91,
541
+ "judge_tokens_in": 0,
542
+ "judge_tokens_out": 0,
543
+ "judge_latency_ms": 0.0
544
+ },
545
+ {
546
+ "task_id": "haven-sleeve-bottles-15",
547
+ "query": "Roughly how many plastic bottles go into a 15\" Haven Laptop Sleeve?",
548
+ "answer": "",
549
+ "hits": [],
550
+ "correct": false,
551
+ "score": 0.0,
552
+ "grading_notes": "missing 1/1: ['18']",
553
+ "search_time_ms": 26.507524016778916,
554
+ "generation_time_ms": 0.0,
555
+ "tokens_in": 0,
556
+ "tokens_out": 0,
557
+ "retrieval_tokens": 17,
558
+ "query_tokens": 17,
559
+ "context_tokens": 0,
560
+ "judge_tokens_in": 0,
561
+ "judge_tokens_out": 0,
562
+ "judge_latency_ms": 0.0
563
+ },
564
+ {
565
+ "task_id": "cert-c2c-tiers",
566
+ "query": "What are the tiers of Cradle to Cradle certification?",
567
+ "answer": "Cradle to Cradle Certified: multi-attribute product certification covering material health, material reutilisation, renewable energy, water stewardship, and social fairness. Tiers (lowest to highest): Bronze, Silver, Gold, Platinum. Our holders: Atlas Phone Shell (Bronze).",
568
+ "hits": [
569
+ {
570
+ "text": "Cradle to Cradle Certified: multi-attribute product certification covering material health, material reutilisation, renewable energy, water stewardship, and social fairness. Tiers (lowest to highest): Bronze, Silver, Gold, Platinum. Our holders: Atlas Phone Shell (Bronze).",
571
+ "score": 0.7877394509195181,
572
+ "source": "pentatonic-memory",
573
+ "doc_id": "cert-cradle-to-cradle"
574
+ }
575
+ ],
576
+ "correct": true,
577
+ "score": 1.0,
578
+ "grading_notes": "all substrings matched",
579
+ "search_time_ms": 24.01597998687066,
580
+ "generation_time_ms": 0.0,
581
+ "tokens_in": 0,
582
+ "tokens_out": 0,
583
+ "retrieval_tokens": 70,
584
+ "query_tokens": 12,
585
+ "context_tokens": 58,
586
+ "judge_tokens_in": 0,
587
+ "judge_tokens_out": 0,
588
+ "judge_latency_ms": 0.0
589
+ },
590
+ {
591
+ "task_id": "cert-c2c-our-products",
592
+ "query": "Which product in our catalogue holds a Cradle to Cradle certification?",
593
+ "answer": "Cradle to Cradle Certified: multi-attribute product certification covering material health, material reutilisation, renewable energy, water stewardship, and social fairness. Tiers (lowest to highest): Bronze, Silver, Gold, Platinum. Our holders: Atlas Phone Shell (Bronze).",
594
+ "hits": [
595
+ {
596
+ "text": "Cradle to Cradle Certified: multi-attribute product certification covering material health, material reutilisation, renewable energy, water stewardship, and social fairness. Tiers (lowest to highest): Bronze, Silver, Gold, Platinum. Our holders: Atlas Phone Shell (Bronze).",
597
+ "score": 0.580276930908043,
598
+ "source": "pentatonic-memory",
599
+ "doc_id": "cert-cradle-to-cradle"
600
+ }
601
+ ],
602
+ "correct": true,
603
+ "score": 1.0,
604
+ "grading_notes": "all substrings matched",
605
+ "search_time_ms": 24.92099697701633,
606
+ "generation_time_ms": 0.0,
607
+ "tokens_in": 0,
608
+ "tokens_out": 0,
609
+ "retrieval_tokens": 72,
610
+ "query_tokens": 14,
611
+ "context_tokens": 58,
612
+ "judge_tokens_in": 0,
613
+ "judge_tokens_out": 0,
614
+ "judge_latency_ms": 0.0
615
+ },
616
+ {
617
+ "task_id": "cert-grs-threshold",
618
+ "query": "What's the minimum recycled content for the GRS claim?",
619
+ "answer": "GRS (Global Recycled Standard): third-party verification of recycled content, chain of custody, social and environmental practices, and chemical restrictions. Minimum 20% recycled content for a product to bear the GRS claim. Materials using GRS in our catalogue: rPP-OB (Atlas Phone Shell), rPET-FELT (Haven Sleeves).",
620
+ "hits": [
621
+ {
622
+ "text": "GRS (Global Recycled Standard): third-party verification of recycled content, chain of custody, social and environmental practices, and chemical restrictions. Minimum 20% recycled content for a product to bear the GRS claim. Materials using GRS in our catalogue: rPP-OB (Atlas Phone Shell), rPET-FELT (Haven Sleeves).",
623
+ "score": 0.7916210271663773,
624
+ "source": "pentatonic-memory",
625
+ "doc_id": "cert-grs"
626
+ }
627
+ ],
628
+ "correct": true,
629
+ "score": 1.0,
630
+ "grading_notes": "all substrings matched",
631
+ "search_time_ms": 23.015954007860273,
632
+ "generation_time_ms": 0.0,
633
+ "tokens_in": 0,
634
+ "tokens_out": 0,
635
+ "retrieval_tokens": 84,
636
+ "query_tokens": 12,
637
+ "context_tokens": 72,
638
+ "judge_tokens_in": 0,
639
+ "judge_tokens_out": 0,
640
+ "judge_latency_ms": 0.0
641
+ },
642
+ {
643
+ "task_id": "cert-en13432-temp",
644
+ "query": "What industrial composting temperature does EN13432 require?",
645
+ "answer": "Material: PBAT (polybutylene adipate terephthalate) + corn starch biopolymer. Industrially compostable (EN13432). Home-compostability: NO \u2014 requires industrial temperatures. Used in: Root Bio Bin Liners. Regulatory note: accepted in most UK local-authority food-waste streams that accept certified compostable bags, but some authorities reject all bag types \u2014 check locally.",
646
+ "hits": [
647
+ {
648
+ "text": "Material: PBAT (polybutylene adipate terephthalate) + corn starch biopolymer. Industrially compostable (EN13432). Home-compostability: NO \u2014 requires industrial temperatures. Used in: Root Bio Bin Liners. Regulatory note: accepted in most UK local-authority food-waste streams that accept certified compostable bags, but some authorities reject all bag types \u2014 check locally.",
649
+ "score": 0.7422245635590629,
650
+ "source": "pentatonic-memory",
651
+ "doc_id": "material-pbat-starch"
652
+ }
653
+ ],
654
+ "correct": false,
655
+ "score": 0.0,
656
+ "grading_notes": "missing 2/2: ['58', '60']",
657
+ "search_time_ms": 20.66308300709352,
658
+ "generation_time_ms": 0.0,
659
+ "tokens_in": 0,
660
+ "tokens_out": 0,
661
+ "retrieval_tokens": 96,
662
+ "query_tokens": 11,
663
+ "context_tokens": 85,
664
+ "judge_tokens_in": 0,
665
+ "judge_tokens_out": 0,
666
+ "judge_latency_ms": 0.0
667
+ },
668
+ {
669
+ "task_id": "reg-uk-epr-scope",
670
+ "query": "Is Pentatonic in scope for UK EPR packaging reporting?",
671
+ "answer": "UK Extended Producer Responsibility (EPR) for packaging: from 2025, packaging producers must report and pay fees based on the weight and recyclability of packaging placed on the UK market. Modulated fees favour recyclable formats. Reporting threshold: \u00a31M turnover AND 25 tonnes of packaging/year. Pentatonic: IN SCOPE. Our filings: due every 6 months.",
672
+ "hits": [
673
+ {
674
+ "text": "UK Extended Producer Responsibility (EPR) for packaging: from 2025, packaging producers must report and pay fees based on the weight and recyclability of packaging placed on the UK market. Modulated fees favour recyclable formats. Reporting threshold: \u00a31M turnover AND 25 tonnes of packaging/year. Pentatonic: IN SCOPE. Our filings: due every 6 months.",
675
+ "score": 0.7782274756985779,
676
+ "source": "pentatonic-memory",
677
+ "doc_id": "reg-uk-epr"
678
+ }
679
+ ],
680
+ "correct": true,
681
+ "score": 1.0,
682
+ "grading_notes": "all substrings matched",
683
+ "search_time_ms": 21.614981000311673,
684
+ "generation_time_ms": 0.0,
685
+ "tokens_in": 0,
686
+ "tokens_out": 0,
687
+ "retrieval_tokens": 91,
688
+ "query_tokens": 13,
689
+ "context_tokens": 78,
690
+ "judge_tokens_in": 0,
691
+ "judge_tokens_out": 0,
692
+ "judge_latency_ms": 0.0
693
+ },
694
+ {
695
+ "task_id": "reg-espr-dpp",
696
+ "query": "What are Digital Product Passports and when do they start applying?",
697
+ "answer": "",
698
+ "hits": [],
699
+ "correct": false,
700
+ "score": 0.0,
701
+ "grading_notes": "missing 2/2: ['ESPR', '2026']",
702
+ "search_time_ms": 24.717538995901123,
703
+ "generation_time_ms": 0.0,
704
+ "tokens_in": 0,
705
+ "tokens_out": 0,
706
+ "retrieval_tokens": 13,
707
+ "query_tokens": 13,
708
+ "context_tokens": 0,
709
+ "judge_tokens_in": 0,
710
+ "judge_tokens_out": 0,
711
+ "judge_latency_ms": 0.0
712
+ },
713
+ {
714
+ "task_id": "reg-ca-sb54-deadline",
715
+ "query": "By when must packaging sold in California be recyclable or compostable?",
716
+ "answer": "California SB 54 (Plastic Pollution Prevention and Packaging Producer Responsibility Act, 2022): requires all packaging sold in California to be recyclable or compostable by 2032, with 65% recycling rate and 25% source reduction. PRO (Producer Responsibility Organisation) fees apply. Pentatonic: IN SCOPE for US-bound shipments to California.",
717
+ "hits": [
718
+ {
719
+ "text": "California SB 54 (Plastic Pollution Prevention and Packaging Producer Responsibility Act, 2022): requires all packaging sold in California to be recyclable or compostable by 2032, with 65% recycling rate and 25% source reduction. PRO (Producer Responsibility Organisation) fees apply. Pentatonic: IN SCOPE for US-bound shipments to California.",
720
+ "score": 0.5522683088380467,
721
+ "source": "pentatonic-memory",
722
+ "doc_id": "reg-ca-sb54"
723
+ }
724
+ ],
725
+ "correct": true,
726
+ "score": 1.0,
727
+ "grading_notes": "all substrings matched",
728
+ "search_time_ms": 30.33506599604152,
729
+ "generation_time_ms": 0.0,
730
+ "tokens_in": 0,
731
+ "tokens_out": 0,
732
+ "retrieval_tokens": 87,
733
+ "query_tokens": 14,
734
+ "context_tokens": 73,
735
+ "judge_tokens_in": 0,
736
+ "judge_tokens_out": 0,
737
+ "judge_latency_ms": 0.0
738
+ },
739
+ {
740
+ "task_id": "reg-lithium-return",
741
+ "query": "Can lithium-containing devices be returned through our standard take-back label?",
742
+ "answer": "Lithium cell regulation: devices containing lithium cells are subject to UN 3480/3481 transport rules and cannot be returned via standard take-back labels. Pentatonic's policy: take-back for devices with lithium cells requires a dedicated labelled pack and courier pickup (not drop-in). Current catalogue impact: none (we do not sell lithium-containing products).",
743
+ "hits": [
744
+ {
745
+ "text": "Lithium cell regulation: devices containing lithium cells are subject to UN 3480/3481 transport rules and cannot be returned via standard take-back labels. Pentatonic's policy: take-back for devices with lithium cells requires a dedicated labelled pack and courier pickup (not drop-in). Current catalogue impact: none (we do not sell lithium-containing products).",
746
+ "score": 0.7893983902041972,
747
+ "source": "pentatonic-memory",
748
+ "doc_id": "reg-hazardous-lithium"
749
+ }
750
+ ],
751
+ "correct": true,
752
+ "score": 1.0,
753
+ "grading_notes": "all substrings matched",
754
+ "search_time_ms": 27.972067007794976,
755
+ "generation_time_ms": 0.0,
756
+ "tokens_in": 0,
757
+ "tokens_out": 0,
758
+ "retrieval_tokens": 85,
759
+ "query_tokens": 13,
760
+ "context_tokens": 72,
761
+ "judge_tokens_in": 0,
762
+ "judge_tokens_out": 0,
763
+ "judge_latency_ms": 0.0
764
+ },
765
+ {
766
+ "task_id": "closed-loop-threshold",
767
+ "query": "At what mass-recovery percentage is a SKU considered closed-loop in our system?",
768
+ "answer": "",
769
+ "hits": [],
770
+ "correct": false,
771
+ "score": 0.0,
772
+ "grading_notes": "missing 1/1: ['80%']",
773
+ "search_time_ms": 29.058508982416242,
774
+ "generation_time_ms": 0.0,
775
+ "tokens_in": 0,
776
+ "tokens_out": 0,
777
+ "retrieval_tokens": 16,
778
+ "query_tokens": 16,
779
+ "context_tokens": 0,
780
+ "judge_tokens_in": 0,
781
+ "judge_tokens_out": 0,
782
+ "judge_latency_ms": 0.0
783
+ },
784
+ {
785
+ "task_id": "rubric-customer-greenwash-claim",
786
+ "query": "A customer asks: 'Are your Loop cups just the same as plastic cups but with a greener label?' Write an honest response.",
787
+ "answer": "",
788
+ "hits": [],
789
+ "correct": false,
790
+ "score": 0.0,
791
+ "grading_notes": "judge=0.00 \u2014 The agent's response does not acknowledge that PLA is industrially compostable but NOT home-compostable or recyclable vi",
792
+ "search_time_ms": 29.882213013479486,
793
+ "generation_time_ms": 0.0,
794
+ "tokens_in": 0,
795
+ "tokens_out": 0,
796
+ "retrieval_tokens": 27,
797
+ "query_tokens": 27,
798
+ "context_tokens": 0,
799
+ "judge_tokens_in": 355,
800
+ "judge_tokens_out": 62,
801
+ "judge_latency_ms": 1281.8933610022068
802
+ },
803
+ {
804
+ "task_id": "rubric-full-takeback-story-atlas",
805
+ "query": "Walk me through the complete material journey of an Atlas Phone Shell returned via take-back.",
806
+ "answer": "Take-back programme \u2014 Atlas Phone Shell (ATL-PHN-001). Accepted condition: any, including broken. Credit: 50% of original purchase price as store credit. Process: free returns label at pentatonic.com/takeback/ATL-PHN-001. Recovery route: shell is mechanically shredded, rPP-OB fraction recovered and fed back into the next production run. Closed-loop: YES.",
807
+ "hits": [
808
+ {
809
+ "text": "Take-back programme \u2014 Atlas Phone Shell (ATL-PHN-001). Accepted condition: any, including broken. Credit: 50% of original purchase price as store credit. Process: free returns label at pentatonic.com/takeback/ATL-PHN-001. Recovery route: shell is mechanically shredded, rPP-OB fraction recovered and fed back into the next production run. Closed-loop: YES.",
810
+ "score": 0.558707344294221,
811
+ "source": "pentatonic-memory",
812
+ "doc_id": "takeback-atlas-phone"
813
+ }
814
+ ],
815
+ "correct": false,
816
+ "score": 0.5,
817
+ "grading_notes": "judge=0.50 \u2014 The answer covers stages 1 (take-back programme) and 4 (shell is mechanically shredded), but lacks details on the recove",
818
+ "search_time_ms": 31.778311997186393,
819
+ "generation_time_ms": 0.0,
820
+ "tokens_in": 0,
821
+ "tokens_out": 0,
822
+ "retrieval_tokens": 102,
823
+ "query_tokens": 17,
824
+ "context_tokens": 85,
825
+ "judge_tokens_in": 413,
826
+ "judge_tokens_out": 46,
827
+ "judge_latency_ms": 1095.577819019556
828
+ },
829
+ {
830
+ "task_id": "rubric-regulatory-scope-briefing",
831
+ "query": "Give me a short regulatory briefing on what applies to Pentatonic in the UK, EU, and California.",
832
+ "answer": "",
833
+ "hits": [],
834
+ "correct": false,
835
+ "score": 0.0,
836
+ "grading_notes": "judge=0.00 \u2014 The agent failed to mention any of the three jurisdictions (UK, EU, California) and instead provided an unrelated answer",
837
+ "search_time_ms": 33.02550900843926,
838
+ "generation_time_ms": 0.0,
839
+ "tokens_in": 0,
840
+ "tokens_out": 0,
841
+ "retrieval_tokens": 22,
842
+ "query_tokens": 22,
843
+ "context_tokens": 0,
844
+ "judge_tokens_in": 351,
845
+ "judge_tokens_out": 38,
846
+ "judge_latency_ms": 745.5519150197506
847
+ },
848
+ {
849
+ "task_id": "entity-closed-loop-skus",
850
+ "query": "Which SKUs in our catalogue achieve closed-loop material recovery?",
851
+ "answer": "",
852
+ "hits": [],
853
+ "correct": false,
854
+ "score": 0.0,
855
+ "grading_notes": "no expected_substrings set",
856
+ "search_time_ms": 29.39699200214818,
857
+ "generation_time_ms": 0.0,
858
+ "tokens_in": 0,
859
+ "tokens_out": 0,
860
+ "retrieval_tokens": 12,
861
+ "query_tokens": 12,
862
+ "context_tokens": 0,
863
+ "judge_tokens_in": 0,
864
+ "judge_tokens_out": 0,
865
+ "judge_latency_ms": 0.0
866
+ },
867
+ {
868
+ "task_id": "entity-home-compostable",
869
+ "query": "List any materials in our catalogue that are home-compostable.",
870
+ "answer": "Material: PBAT (polybutylene adipate terephthalate) + corn starch biopolymer. Industrially compostable (EN13432). Home-compostability: NO \u2014 requires industrial temperatures. Used in: Root Bio Bin Liners. Regulatory note: accepted in most UK local-authority food-waste streams that accept certified compostable bags, but some authorities reject all bag types \u2014 check locally.",
871
+ "hits": [
872
+ {
873
+ "text": "Material: PBAT (polybutylene adipate terephthalate) + corn starch biopolymer. Industrially compostable (EN13432). Home-compostability: NO \u2014 requires industrial temperatures. Used in: Root Bio Bin Liners. Regulatory note: accepted in most UK local-authority food-waste streams that accept certified compostable bags, but some authorities reject all bag types \u2014 check locally.",
874
+ "score": 0.5324476747882344,
875
+ "source": "pentatonic-memory",
876
+ "doc_id": "material-pbat-starch"
877
+ }
878
+ ],
879
+ "correct": false,
880
+ "score": 0.0,
881
+ "grading_notes": "missing 2/3: ['NOT home-compostable', 'none']; forbidden substring(s) present: ['PBAT']",
882
+ "search_time_ms": 24.964277021354064,
883
+ "generation_time_ms": 0.0,
884
+ "tokens_in": 0,
885
+ "tokens_out": 0,
886
+ "retrieval_tokens": 98,
887
+ "query_tokens": 13,
888
+ "context_tokens": 85,
889
+ "judge_tokens_in": 0,
890
+ "judge_tokens_out": 0,
891
+ "judge_latency_ms": 0.0
892
+ }
893
+ ]
894
+ }