@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,1018 @@
1
+ {
2
+ "bench": "customer-support",
3
+ "stack": "pentatonic-memory",
4
+ "n_tasks": 20,
5
+ "n_correct": 15,
6
+ "accuracy": 0.75,
7
+ "mean_score": 0.8,
8
+ "p50_search_ms": 128.41969198780134,
9
+ "p95_search_ms": 170.69528470892692,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "factoid": {
15
+ "n": 10,
16
+ "mean_score": 1.0,
17
+ "accuracy": 1.0
18
+ },
19
+ "customer": {
20
+ "n": 8,
21
+ "mean_score": 0.8125,
22
+ "accuracy": 0.75
23
+ },
24
+ "multi-doc": {
25
+ "n": 6,
26
+ "mean_score": 0.9166666666666666,
27
+ "accuracy": 0.8333333333333334
28
+ },
29
+ "rma": {
30
+ "n": 3,
31
+ "mean_score": 0.6666666666666666,
32
+ "accuracy": 0.6666666666666666
33
+ },
34
+ "policy": {
35
+ "n": 5,
36
+ "mean_score": 0.8,
37
+ "accuracy": 0.8
38
+ },
39
+ "escalation": {
40
+ "n": 4,
41
+ "mean_score": 0.875,
42
+ "accuracy": 0.75
43
+ },
44
+ "rubric": {
45
+ "n": 3,
46
+ "mean_score": 0.6666666666666666,
47
+ "accuracy": 0.3333333333333333
48
+ },
49
+ "multi-hop": {
50
+ "n": 1,
51
+ "mean_score": 0.5,
52
+ "accuracy": 0.0
53
+ },
54
+ "entity": {
55
+ "n": 2,
56
+ "mean_score": 0.0,
57
+ "accuracy": 0.0
58
+ }
59
+ },
60
+ "extra": {
61
+ "ingest_ms": 6125.601384002948,
62
+ "grading": "substring",
63
+ "limit": 3,
64
+ "tokens": {
65
+ "corpus_tokens": 1227,
66
+ "query_tokens": 283,
67
+ "context_tokens": 3972,
68
+ "retrieval_tokens": 4255,
69
+ "naive_tokens": 24823,
70
+ "saved_tokens": 20568,
71
+ "reduction_pct": 0.8285863916529026,
72
+ "mean_retrieval_tokens_per_task": 212.75,
73
+ "tokenizer": "cl100k_base",
74
+ "per_task": {
75
+ "order-mina-count": {
76
+ "query": 11,
77
+ "context": 222,
78
+ "retrieval": 233,
79
+ "judge_in": 0,
80
+ "judge_out": 0,
81
+ "judge_latency_ms": 0.0
82
+ },
83
+ "order-mina-latest": {
84
+ "query": 13,
85
+ "context": 222,
86
+ "retrieval": 235,
87
+ "judge_in": 0,
88
+ "judge_out": 0,
89
+ "judge_latency_ms": 0.0
90
+ },
91
+ "rma-mina-sleeve-reason": {
92
+ "query": 17,
93
+ "context": 252,
94
+ "retrieval": 269,
95
+ "judge_in": 0,
96
+ "judge_out": 0,
97
+ "judge_latency_ms": 0.0
98
+ },
99
+ "rma-mina-lid-resolution": {
100
+ "query": 11,
101
+ "context": 199,
102
+ "retrieval": 210,
103
+ "judge_in": 0,
104
+ "judge_out": 0,
105
+ "judge_latency_ms": 0.0
106
+ },
107
+ "jareth-harness-bar-followup": {
108
+ "query": 15,
109
+ "context": 170,
110
+ "retrieval": 185,
111
+ "judge_in": 0,
112
+ "judge_out": 0,
113
+ "judge_latency_ms": 0.0
114
+ },
115
+ "jareth-second-order": {
116
+ "query": 11,
117
+ "context": 170,
118
+ "retrieval": 181,
119
+ "judge_in": 0,
120
+ "judge_out": 0,
121
+ "judge_latency_ms": 0.0
122
+ },
123
+ "priya-custom-status": {
124
+ "query": 11,
125
+ "context": 184,
126
+ "retrieval": 195,
127
+ "judge_in": 0,
128
+ "judge_out": 0,
129
+ "judge_latency_ms": 0.0
130
+ },
131
+ "priya-goodwill-offered": {
132
+ "query": 9,
133
+ "context": 221,
134
+ "retrieval": 230,
135
+ "judge_in": 0,
136
+ "judge_out": 0,
137
+ "judge_latency_ms": 0.0
138
+ },
139
+ "policy-custom-return": {
140
+ "query": 11,
141
+ "context": 180,
142
+ "retrieval": 191,
143
+ "judge_in": 0,
144
+ "judge_out": 0,
145
+ "judge_latency_ms": 0.0
146
+ },
147
+ "policy-40day-return": {
148
+ "query": 18,
149
+ "context": 154,
150
+ "retrieval": 172,
151
+ "judge_in": 0,
152
+ "judge_out": 0,
153
+ "judge_latency_ms": 0.0
154
+ },
155
+ "escalation-400gbp-full-refund": {
156
+ "query": 20,
157
+ "context": 159,
158
+ "retrieval": 179,
159
+ "judge_in": 0,
160
+ "judge_out": 0,
161
+ "judge_latency_ms": 0.0
162
+ },
163
+ "escalation-goodwill-20pct-tier": {
164
+ "query": 15,
165
+ "context": 194,
166
+ "retrieval": 209,
167
+ "judge_in": 0,
168
+ "judge_out": 0,
169
+ "judge_latency_ms": 0.0
170
+ },
171
+ "mina-vip-status": {
172
+ "query": 10,
173
+ "context": 222,
174
+ "retrieval": 232,
175
+ "judge_in": 0,
176
+ "judge_out": 0,
177
+ "judge_latency_ms": 0.0
178
+ },
179
+ "mina-preferences": {
180
+ "query": 11,
181
+ "context": 222,
182
+ "retrieval": 233,
183
+ "judge_in": 0,
184
+ "judge_out": 0,
185
+ "judge_latency_ms": 0.0
186
+ },
187
+ "priya-agent-guidance": {
188
+ "query": 16,
189
+ "context": 184,
190
+ "retrieval": 200,
191
+ "judge_in": 0,
192
+ "judge_out": 0,
193
+ "judge_latency_ms": 0.0
194
+ },
195
+ "rubric-mina-warranty-recommendation": {
196
+ "query": 18,
197
+ "context": 174,
198
+ "retrieval": 192,
199
+ "judge_in": 506,
200
+ "judge_out": 48,
201
+ "judge_latency_ms": 1022.3502889871597
202
+ },
203
+ "rubric-jareth-escalation": {
204
+ "query": 30,
205
+ "context": 170,
206
+ "retrieval": 200,
207
+ "judge_in": 527,
208
+ "judge_out": 38,
209
+ "judge_latency_ms": 890.2632699906826
210
+ },
211
+ "rubric-priya-delayed-custom": {
212
+ "query": 16,
213
+ "context": 242,
214
+ "retrieval": 258,
215
+ "judge_in": 552,
216
+ "judge_out": 45,
217
+ "judge_latency_ms": 794.1419219970703
218
+ },
219
+ "entity-mina-orders": {
220
+ "query": 10,
221
+ "context": 222,
222
+ "retrieval": 232,
223
+ "judge_in": 0,
224
+ "judge_out": 0,
225
+ "judge_latency_ms": 0.0
226
+ },
227
+ "entity-all-rmas": {
228
+ "query": 10,
229
+ "context": 209,
230
+ "retrieval": 219,
231
+ "judge_in": 0,
232
+ "judge_out": 0,
233
+ "judge_latency_ms": 0.0
234
+ }
235
+ },
236
+ "judge_tokens_in": 1585,
237
+ "judge_tokens_out": 131,
238
+ "judge_calls": 3,
239
+ "judge_mean_latency_ms": 902.2518269916376
240
+ },
241
+ "cost_usd": {
242
+ "assumed_completion_tokens_per_task": 100,
243
+ "rates": {
244
+ "input_per_1k": 0.0025,
245
+ "output_per_1k": 0.01,
246
+ "model": "gpt-4o"
247
+ },
248
+ "retrieval_usd_in": 0.010637500000000001,
249
+ "retrieval_usd_out": 0.02,
250
+ "retrieval_usd_total": 0.0306375,
251
+ "naive_usd_total": 0.0820575,
252
+ "saved_usd": 0.05142000000000001,
253
+ "saved_usd_per_1k_tasks": 2.571
254
+ }
255
+ },
256
+ "task_results": [
257
+ {
258
+ "task_id": "order-mina-count",
259
+ "query": "How many orders has Mina Okafor placed?",
260
+ "answer": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.\n---\nCustomer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
261
+ "hits": [
262
+ {
263
+ "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
264
+ "score": 0.4692591726779938,
265
+ "source": "pentatonic-memory",
266
+ "doc_id": "order-2026-0187-mina"
267
+ },
268
+ {
269
+ "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
270
+ "score": 0.4614717364311218,
271
+ "source": "pentatonic-memory",
272
+ "doc_id": "order-2026-0142-mina"
273
+ },
274
+ {
275
+ "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
276
+ "score": 0.5065365433692932,
277
+ "source": "pentatonic-memory",
278
+ "doc_id": "customer-note-mina-vip"
279
+ }
280
+ ],
281
+ "correct": true,
282
+ "score": 1.0,
283
+ "grading_notes": "all substrings matched",
284
+ "search_time_ms": 121.94390798686072,
285
+ "generation_time_ms": 0.0,
286
+ "tokens_in": 0,
287
+ "tokens_out": 0,
288
+ "retrieval_tokens": 233,
289
+ "query_tokens": 11,
290
+ "context_tokens": 222,
291
+ "judge_tokens_in": 0,
292
+ "judge_tokens_out": 0,
293
+ "judge_latency_ms": 0.0
294
+ },
295
+ {
296
+ "task_id": "order-mina-latest",
297
+ "query": "What was in Mina Okafor's most recent order?",
298
+ "answer": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.\n---\nCustomer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
299
+ "hits": [
300
+ {
301
+ "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
302
+ "score": 0.4980609118938446,
303
+ "source": "pentatonic-memory",
304
+ "doc_id": "order-2026-0187-mina"
305
+ },
306
+ {
307
+ "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
308
+ "score": 0.4663473665714264,
309
+ "source": "pentatonic-memory",
310
+ "doc_id": "order-2026-0142-mina"
311
+ },
312
+ {
313
+ "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
314
+ "score": 0.5009183883666992,
315
+ "source": "pentatonic-memory",
316
+ "doc_id": "customer-note-mina-vip"
317
+ }
318
+ ],
319
+ "correct": true,
320
+ "score": 1.0,
321
+ "grading_notes": "all substrings matched",
322
+ "search_time_ms": 121.74513799254782,
323
+ "generation_time_ms": 0.0,
324
+ "tokens_in": 0,
325
+ "tokens_out": 0,
326
+ "retrieval_tokens": 235,
327
+ "query_tokens": 13,
328
+ "context_tokens": 222,
329
+ "judge_tokens_in": 0,
330
+ "judge_tokens_out": 0,
331
+ "judge_latency_ms": 0.0
332
+ },
333
+ {
334
+ "task_id": "rma-mina-sleeve-reason",
335
+ "query": "Why did Mina open an RMA on order 2026-0142?",
336
+ "answer": "RMA case RMA-0142-01 opened 2026-03-20 for order #2026-0142. Reason: wrong colour received (Charcoal, wanted Oat). Resolution: exchange, not refund. Status: Oat sleeve shipped 2026-03-23, tracking RM-9912-UK. Closed 2026-03-25.\n---\nRMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
337
+ "hits": [
338
+ {
339
+ "text": "RMA case RMA-0142-01 opened 2026-03-20 for order #2026-0142. Reason: wrong colour received (Charcoal, wanted Oat). Resolution: exchange, not refund. Status: Oat sleeve shipped 2026-03-23, tracking RM-9912-UK. Closed 2026-03-25.",
340
+ "score": 0.41409578919410706,
341
+ "source": "pentatonic-memory",
342
+ "doc_id": "rma-2026-0142-swap"
343
+ },
344
+ {
345
+ "text": "RMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.",
346
+ "score": 0.4296145439147949,
347
+ "source": "pentatonic-memory",
348
+ "doc_id": "rma-2026-0187-lid"
349
+ },
350
+ {
351
+ "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
352
+ "score": 0.39442405104637146,
353
+ "source": "pentatonic-memory",
354
+ "doc_id": "order-2026-0142-mina"
355
+ }
356
+ ],
357
+ "correct": true,
358
+ "score": 1.0,
359
+ "grading_notes": "all substrings matched",
360
+ "search_time_ms": 122.5400409894064,
361
+ "generation_time_ms": 0.0,
362
+ "tokens_in": 0,
363
+ "tokens_out": 0,
364
+ "retrieval_tokens": 269,
365
+ "query_tokens": 17,
366
+ "context_tokens": 252,
367
+ "judge_tokens_in": 0,
368
+ "judge_tokens_out": 0,
369
+ "judge_latency_ms": 0.0
370
+ },
371
+ {
372
+ "task_id": "rma-mina-lid-resolution",
373
+ "query": "How was Mina's Luna bottle lid complaint resolved?",
374
+ "answer": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!\n---\nOrder #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nRMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.",
375
+ "hits": [
376
+ {
377
+ "text": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
378
+ "score": 0.6011244654655457,
379
+ "source": "pentatonic-memory",
380
+ "doc_id": "chat-mina-2026-04-10"
381
+ },
382
+ {
383
+ "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
384
+ "score": 0.4347590506076813,
385
+ "source": "pentatonic-memory",
386
+ "doc_id": "order-2026-0187-mina"
387
+ },
388
+ {
389
+ "text": "RMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.",
390
+ "score": 0.539427638053894,
391
+ "source": "pentatonic-memory",
392
+ "doc_id": "rma-2026-0187-lid"
393
+ }
394
+ ],
395
+ "correct": true,
396
+ "score": 1.0,
397
+ "grading_notes": "all substrings matched",
398
+ "search_time_ms": 128.34562797797844,
399
+ "generation_time_ms": 0.0,
400
+ "tokens_in": 0,
401
+ "tokens_out": 0,
402
+ "retrieval_tokens": 210,
403
+ "query_tokens": 11,
404
+ "context_tokens": 199,
405
+ "judge_tokens_in": 0,
406
+ "judge_tokens_out": 0,
407
+ "judge_latency_ms": 0.0
408
+ },
409
+ {
410
+ "task_id": "jareth-harness-bar-followup",
411
+ "query": "Did Jareth's kite harness work with his bar out of the box?",
412
+ "answer": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.\n---\nCustomer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.\n---\nOrder #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.",
413
+ "hits": [
414
+ {
415
+ "text": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.",
416
+ "score": 0.5966447591781616,
417
+ "source": "pentatonic-memory",
418
+ "doc_id": "chat-jareth-2026-04-08"
419
+ },
420
+ {
421
+ "text": "Customer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.",
422
+ "score": 0.5066688656806946,
423
+ "source": "pentatonic-memory",
424
+ "doc_id": "customer-note-jareth-kiter"
425
+ },
426
+ {
427
+ "text": "Order #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.",
428
+ "score": 0.4137082099914551,
429
+ "source": "pentatonic-memory",
430
+ "doc_id": "order-2026-0201-jareth"
431
+ }
432
+ ],
433
+ "correct": true,
434
+ "score": 1.0,
435
+ "grading_notes": "all substrings matched",
436
+ "search_time_ms": 135.78065202455036,
437
+ "generation_time_ms": 0.0,
438
+ "tokens_in": 0,
439
+ "tokens_out": 0,
440
+ "retrieval_tokens": 185,
441
+ "query_tokens": 15,
442
+ "context_tokens": 170,
443
+ "judge_tokens_in": 0,
444
+ "judge_tokens_out": 0,
445
+ "judge_latency_ms": 0.0
446
+ },
447
+ {
448
+ "task_id": "jareth-second-order",
449
+ "query": "What did Jareth order after his initial harness purchase?",
450
+ "answer": "Order #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.\n---\nCustomer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.\n---\n[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.",
451
+ "hits": [
452
+ {
453
+ "text": "Order #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.",
454
+ "score": 0.4606146514415741,
455
+ "source": "pentatonic-memory",
456
+ "doc_id": "order-2026-0201-jareth"
457
+ },
458
+ {
459
+ "text": "Customer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.",
460
+ "score": 0.4756895899772644,
461
+ "source": "pentatonic-memory",
462
+ "doc_id": "customer-note-jareth-kiter"
463
+ },
464
+ {
465
+ "text": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.",
466
+ "score": 0.3477482497692108,
467
+ "source": "pentatonic-memory",
468
+ "doc_id": "chat-jareth-2026-04-08"
469
+ }
470
+ ],
471
+ "correct": true,
472
+ "score": 1.0,
473
+ "grading_notes": "all substrings matched",
474
+ "search_time_ms": 128.49375599762425,
475
+ "generation_time_ms": 0.0,
476
+ "tokens_in": 0,
477
+ "tokens_out": 0,
478
+ "retrieval_tokens": 181,
479
+ "query_tokens": 11,
480
+ "context_tokens": 170,
481
+ "judge_tokens_in": 0,
482
+ "judge_tokens_out": 0,
483
+ "judge_latency_ms": 0.0
484
+ },
485
+ {
486
+ "task_id": "priya-custom-status",
487
+ "query": "Can Priya change the text on her custom tote?",
488
+ "answer": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.\n---\n[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.\n---\nCustomer note \u2014 Priya Natarajan: orders frequently include custom-printed items. Agent should proactively confirm print text before production starts, not after.",
489
+ "hits": [
490
+ {
491
+ "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
492
+ "score": 0.5468156933784485,
493
+ "source": "pentatonic-memory",
494
+ "doc_id": "chat-priya-2026-04-20"
495
+ },
496
+ {
497
+ "text": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
498
+ "score": 0.490670770406723,
499
+ "source": "pentatonic-memory",
500
+ "doc_id": "chat-priya-2026-04-14"
501
+ },
502
+ {
503
+ "text": "Customer note \u2014 Priya Natarajan: orders frequently include custom-printed items. Agent should proactively confirm print text before production starts, not after.",
504
+ "score": 0.46206608414649963,
505
+ "source": "pentatonic-memory",
506
+ "doc_id": "customer-note-priya-custom"
507
+ }
508
+ ],
509
+ "correct": true,
510
+ "score": 1.0,
511
+ "grading_notes": "all substrings matched",
512
+ "search_time_ms": 171.14407900953665,
513
+ "generation_time_ms": 0.0,
514
+ "tokens_in": 0,
515
+ "tokens_out": 0,
516
+ "retrieval_tokens": 195,
517
+ "query_tokens": 11,
518
+ "context_tokens": 184,
519
+ "judge_tokens_in": 0,
520
+ "judge_tokens_out": 0,
521
+ "judge_latency_ms": 0.0
522
+ },
523
+ {
524
+ "task_id": "priya-goodwill-offered",
525
+ "query": "What goodwill credit did Priya get offered?",
526
+ "answer": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.\n---\nGoodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.\n---\nOrder #2026-0215 \u2014 placed 2026-04-06 by customer Priya Natarajan (priya.n@example.com).\nItems: 3\u00d7 Bio Bin Liners 30L (roll of 20), 1\u00d7 Canvas Tote Bag Large (custom print: 'Natarajan Household').\nTotal: \u00a340.60. Status: custom print in production, ETA 2026-04-18.",
527
+ "hits": [
528
+ {
529
+ "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
530
+ "score": 0.33671367168426514,
531
+ "source": "pentatonic-memory",
532
+ "doc_id": "chat-priya-2026-04-20"
533
+ },
534
+ {
535
+ "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
536
+ "score": 0.4352283179759979,
537
+ "source": "pentatonic-memory",
538
+ "doc_id": "policy-goodwill-credit"
539
+ },
540
+ {
541
+ "text": "Order #2026-0215 \u2014 placed 2026-04-06 by customer Priya Natarajan (priya.n@example.com).\nItems: 3\u00d7 Bio Bin Liners 30L (roll of 20), 1\u00d7 Canvas Tote Bag Large (custom print: 'Natarajan Household').\nTotal: \u00a340.60. Status: custom print in production, ETA 2026-04-18.",
542
+ "score": 0.25958457589149475,
543
+ "source": "pentatonic-memory",
544
+ "doc_id": "order-2026-0215-priya"
545
+ }
546
+ ],
547
+ "correct": true,
548
+ "score": 1.0,
549
+ "grading_notes": "all substrings matched",
550
+ "search_time_ms": 162.16819299734198,
551
+ "generation_time_ms": 0.0,
552
+ "tokens_in": 0,
553
+ "tokens_out": 0,
554
+ "retrieval_tokens": 230,
555
+ "query_tokens": 9,
556
+ "context_tokens": 221,
557
+ "judge_tokens_in": 0,
558
+ "judge_tokens_out": 0,
559
+ "judge_latency_ms": 0.0
560
+ },
561
+ {
562
+ "task_id": "policy-custom-return",
563
+ "query": "Can a custom-printed tote be returned for refund?",
564
+ "answer": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.\n---\nEscalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.\n---\n[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
565
+ "hits": [
566
+ {
567
+ "text": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
568
+ "score": 0.5682638883590698,
569
+ "source": "pentatonic-memory",
570
+ "doc_id": "policy-returns-30day"
571
+ },
572
+ {
573
+ "text": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
574
+ "score": 0.31737956404685974,
575
+ "source": "pentatonic-memory",
576
+ "doc_id": "policy-escalation-tiers"
577
+ },
578
+ {
579
+ "text": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
580
+ "score": 0.47565758228302,
581
+ "source": "pentatonic-memory",
582
+ "doc_id": "chat-priya-2026-04-14"
583
+ }
584
+ ],
585
+ "correct": true,
586
+ "score": 1.0,
587
+ "grading_notes": "all substrings matched",
588
+ "search_time_ms": 120.61583198374137,
589
+ "generation_time_ms": 0.0,
590
+ "tokens_in": 0,
591
+ "tokens_out": 0,
592
+ "retrieval_tokens": 191,
593
+ "query_tokens": 11,
594
+ "context_tokens": 180,
595
+ "judge_tokens_in": 0,
596
+ "judge_tokens_out": 0,
597
+ "judge_latency_ms": 0.0
598
+ },
599
+ {
600
+ "task_id": "policy-40day-return",
601
+ "query": "If an unused item was delivered 40 days ago, can I get a full refund?",
602
+ "answer": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.\n---\nGoodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.\n---\n[Chat 2026-03-20] Mina: Hi, the sleeve I got is the wrong colour \u2014 I wanted Oat, not Charcoal. Can I swap? Agent: Yes, you're within the 30-day window. I'll send an exchange label.",
603
+ "hits": [
604
+ {
605
+ "text": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
606
+ "score": 0.6555708646774292,
607
+ "source": "pentatonic-memory",
608
+ "doc_id": "policy-returns-30day"
609
+ },
610
+ {
611
+ "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
612
+ "score": 0.2732425928115845,
613
+ "source": "pentatonic-memory",
614
+ "doc_id": "policy-goodwill-credit"
615
+ },
616
+ {
617
+ "text": "[Chat 2026-03-20] Mina: Hi, the sleeve I got is the wrong colour \u2014 I wanted Oat, not Charcoal. Can I swap? Agent: Yes, you're within the 30-day window. I'll send an exchange label.",
618
+ "score": 0.3445591926574707,
619
+ "source": "pentatonic-memory",
620
+ "doc_id": "chat-mina-2026-03-20"
621
+ }
622
+ ],
623
+ "correct": false,
624
+ "score": 0.0,
625
+ "grading_notes": "all substrings matched; forbidden substring(s) present: ['full refund']",
626
+ "search_time_ms": 122.33487097546458,
627
+ "generation_time_ms": 0.0,
628
+ "tokens_in": 0,
629
+ "tokens_out": 0,
630
+ "retrieval_tokens": 172,
631
+ "query_tokens": 18,
632
+ "context_tokens": 154,
633
+ "judge_tokens_in": 0,
634
+ "judge_tokens_out": 0,
635
+ "judge_latency_ms": 0.0
636
+ },
637
+ {
638
+ "task_id": "escalation-400gbp-full-refund",
639
+ "query": "A Tier 1 agent wants to issue a full refund on a \u00a3400 order \u2014 can they?",
640
+ "answer": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.\n---\nGoodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.\n---\nReturns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
641
+ "hits": [
642
+ {
643
+ "text": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
644
+ "score": 0.5317927598953247,
645
+ "source": "pentatonic-memory",
646
+ "doc_id": "policy-escalation-tiers"
647
+ },
648
+ {
649
+ "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
650
+ "score": 0.558272123336792,
651
+ "source": "pentatonic-memory",
652
+ "doc_id": "policy-goodwill-credit"
653
+ },
654
+ {
655
+ "text": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
656
+ "score": 0.3208146095275879,
657
+ "source": "pentatonic-memory",
658
+ "doc_id": "policy-returns-30day"
659
+ }
660
+ ],
661
+ "correct": true,
662
+ "score": 1.0,
663
+ "grading_notes": "all substrings matched",
664
+ "search_time_ms": 120.98205901565962,
665
+ "generation_time_ms": 0.0,
666
+ "tokens_in": 0,
667
+ "tokens_out": 0,
668
+ "retrieval_tokens": 179,
669
+ "query_tokens": 20,
670
+ "context_tokens": 159,
671
+ "judge_tokens_in": 0,
672
+ "judge_tokens_out": 0,
673
+ "judge_latency_ms": 0.0
674
+ },
675
+ {
676
+ "task_id": "escalation-goodwill-20pct-tier",
677
+ "query": "What's the minimum agent tier needed to offer 20% goodwill credit?",
678
+ "answer": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.\n---\n[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.\n---\nEscalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
679
+ "hits": [
680
+ {
681
+ "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
682
+ "score": 0.7269918322563171,
683
+ "source": "pentatonic-memory",
684
+ "doc_id": "policy-goodwill-credit"
685
+ },
686
+ {
687
+ "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
688
+ "score": 0.29580065608024597,
689
+ "source": "pentatonic-memory",
690
+ "doc_id": "chat-priya-2026-04-20"
691
+ },
692
+ {
693
+ "text": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
694
+ "score": 0.4651184380054474,
695
+ "source": "pentatonic-memory",
696
+ "doc_id": "policy-escalation-tiers"
697
+ }
698
+ ],
699
+ "correct": true,
700
+ "score": 1.0,
701
+ "grading_notes": "all substrings matched",
702
+ "search_time_ms": 123.65483402390964,
703
+ "generation_time_ms": 0.0,
704
+ "tokens_in": 0,
705
+ "tokens_out": 0,
706
+ "retrieval_tokens": 209,
707
+ "query_tokens": 15,
708
+ "context_tokens": 194,
709
+ "judge_tokens_in": 0,
710
+ "judge_tokens_out": 0,
711
+ "judge_latency_ms": 0.0
712
+ },
713
+ {
714
+ "task_id": "mina-vip-status",
715
+ "query": "Is Mina Okafor a VIP customer?",
716
+ "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.\n---\nOrder #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
717
+ "hits": [
718
+ {
719
+ "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
720
+ "score": 0.6328713297843933,
721
+ "source": "pentatonic-memory",
722
+ "doc_id": "customer-note-mina-vip"
723
+ },
724
+ {
725
+ "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
726
+ "score": 0.40744826197624207,
727
+ "source": "pentatonic-memory",
728
+ "doc_id": "order-2026-0142-mina"
729
+ },
730
+ {
731
+ "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
732
+ "score": 0.42423495650291443,
733
+ "source": "pentatonic-memory",
734
+ "doc_id": "order-2026-0187-mina"
735
+ }
736
+ ],
737
+ "correct": true,
738
+ "score": 1.0,
739
+ "grading_notes": "all substrings matched",
740
+ "search_time_ms": 118.19845100399107,
741
+ "generation_time_ms": 0.0,
742
+ "tokens_in": 0,
743
+ "tokens_out": 0,
744
+ "retrieval_tokens": 232,
745
+ "query_tokens": 10,
746
+ "context_tokens": 222,
747
+ "judge_tokens_in": 0,
748
+ "judge_tokens_out": 0,
749
+ "judge_latency_ms": 0.0
750
+ },
751
+ {
752
+ "task_id": "mina-preferences",
753
+ "query": "What are Mina Okafor's stated preferences?",
754
+ "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.\n---\nOrder #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
755
+ "hits": [
756
+ {
757
+ "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
758
+ "score": 0.5033327341079712,
759
+ "source": "pentatonic-memory",
760
+ "doc_id": "customer-note-mina-vip"
761
+ },
762
+ {
763
+ "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
764
+ "score": 0.3317295014858246,
765
+ "source": "pentatonic-memory",
766
+ "doc_id": "order-2026-0187-mina"
767
+ },
768
+ {
769
+ "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
770
+ "score": 0.2938464879989624,
771
+ "source": "pentatonic-memory",
772
+ "doc_id": "order-2026-0142-mina"
773
+ }
774
+ ],
775
+ "correct": true,
776
+ "score": 1.0,
777
+ "grading_notes": "all substrings matched",
778
+ "search_time_ms": 128.2598020043224,
779
+ "generation_time_ms": 0.0,
780
+ "tokens_in": 0,
781
+ "tokens_out": 0,
782
+ "retrieval_tokens": 233,
783
+ "query_tokens": 11,
784
+ "context_tokens": 222,
785
+ "judge_tokens_in": 0,
786
+ "judge_tokens_out": 0,
787
+ "judge_latency_ms": 0.0
788
+ },
789
+ {
790
+ "task_id": "priya-agent-guidance",
791
+ "query": "When handling Priya's orders, what should the agent do before production starts?",
792
+ "answer": "Customer note \u2014 Priya Natarajan: orders frequently include custom-printed items. Agent should proactively confirm print text before production starts, not after.\n---\n[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.\n---\n[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
793
+ "hits": [
794
+ {
795
+ "text": "Customer note \u2014 Priya Natarajan: orders frequently include custom-printed items. Agent should proactively confirm print text before production starts, not after.",
796
+ "score": 0.5809648036956787,
797
+ "source": "pentatonic-memory",
798
+ "doc_id": "customer-note-priya-custom"
799
+ },
800
+ {
801
+ "text": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
802
+ "score": 0.3932722210884094,
803
+ "source": "pentatonic-memory",
804
+ "doc_id": "chat-priya-2026-04-14"
805
+ },
806
+ {
807
+ "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
808
+ "score": 0.38053300976753235,
809
+ "source": "pentatonic-memory",
810
+ "doc_id": "chat-priya-2026-04-20"
811
+ }
812
+ ],
813
+ "correct": true,
814
+ "score": 1.0,
815
+ "grading_notes": "all substrings matched",
816
+ "search_time_ms": 137.435241980711,
817
+ "generation_time_ms": 0.0,
818
+ "tokens_in": 0,
819
+ "tokens_out": 0,
820
+ "retrieval_tokens": 200,
821
+ "query_tokens": 16,
822
+ "context_tokens": 184,
823
+ "judge_tokens_in": 0,
824
+ "judge_tokens_out": 0,
825
+ "judge_latency_ms": 0.0
826
+ },
827
+ {
828
+ "task_id": "rubric-mina-warranty-recommendation",
829
+ "query": "Mina is asking for product recommendations. Based on her history, what should we suggest?",
830
+ "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.\n---\nOrder #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\n[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
831
+ "hits": [
832
+ {
833
+ "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
834
+ "score": 0.5650954246520996,
835
+ "source": "pentatonic-memory",
836
+ "doc_id": "customer-note-mina-vip"
837
+ },
838
+ {
839
+ "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
840
+ "score": 0.329296350479126,
841
+ "source": "pentatonic-memory",
842
+ "doc_id": "order-2026-0187-mina"
843
+ },
844
+ {
845
+ "text": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
846
+ "score": 0.35196399688720703,
847
+ "source": "pentatonic-memory",
848
+ "doc_id": "chat-mina-2026-04-10"
849
+ }
850
+ ],
851
+ "correct": true,
852
+ "score": 1.0,
853
+ "grading_notes": "judge=1.00 \u2014 The answer references Mina's VIP status and her stated interest in lifetime-warranty items, recommending the Luna Water ",
854
+ "search_time_ms": 136.84402097715065,
855
+ "generation_time_ms": 0.0,
856
+ "tokens_in": 0,
857
+ "tokens_out": 0,
858
+ "retrieval_tokens": 192,
859
+ "query_tokens": 18,
860
+ "context_tokens": 174,
861
+ "judge_tokens_in": 506,
862
+ "judge_tokens_out": 48,
863
+ "judge_latency_ms": 1022.3502889871597
864
+ },
865
+ {
866
+ "task_id": "rubric-jareth-escalation",
867
+ "query": "Jareth wants to return his kite harness for a full refund, claiming it 'didn't work' with his bar \u2014 how should we handle this?",
868
+ "answer": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.\n---\nOrder #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.\n---\nCustomer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.",
869
+ "hits": [
870
+ {
871
+ "text": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.",
872
+ "score": 0.4707748591899872,
873
+ "source": "pentatonic-memory",
874
+ "doc_id": "chat-jareth-2026-04-08"
875
+ },
876
+ {
877
+ "text": "Order #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.",
878
+ "score": 0.4572660028934479,
879
+ "source": "pentatonic-memory",
880
+ "doc_id": "order-2026-0201-jareth"
881
+ },
882
+ {
883
+ "text": "Customer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.",
884
+ "score": 0.5262770056724548,
885
+ "source": "pentatonic-memory",
886
+ "doc_id": "customer-note-jareth-kiter"
887
+ }
888
+ ],
889
+ "correct": false,
890
+ "score": 0.5,
891
+ "grading_notes": "judge=0.50 \u2014 The answer mentions the adapter history (Order #2026-0201) but does not suggest Tier 2 escalation.",
892
+ "search_time_ms": 143.29434899264015,
893
+ "generation_time_ms": 0.0,
894
+ "tokens_in": 0,
895
+ "tokens_out": 0,
896
+ "retrieval_tokens": 200,
897
+ "query_tokens": 30,
898
+ "context_tokens": 170,
899
+ "judge_tokens_in": 527,
900
+ "judge_tokens_out": 38,
901
+ "judge_latency_ms": 890.2632699906826
902
+ },
903
+ {
904
+ "task_id": "rubric-priya-delayed-custom",
905
+ "query": "Priya is frustrated about her custom tote delay. What's the best response?",
906
+ "answer": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.\n---\n[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.\n---\nOrder #2026-0215 \u2014 placed 2026-04-06 by customer Priya Natarajan (priya.n@example.com).\nItems: 3\u00d7 Bio Bin Liners 30L (roll of 20), 1\u00d7 Canvas Tote Bag Large (custom print: 'Natarajan Household').\nTotal: \u00a340.60. Status: custom print in production, ETA 2026-04-18.",
907
+ "hits": [
908
+ {
909
+ "text": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
910
+ "score": 0.5332370400428772,
911
+ "source": "pentatonic-memory",
912
+ "doc_id": "chat-priya-2026-04-14"
913
+ },
914
+ {
915
+ "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
916
+ "score": 0.41297996044158936,
917
+ "source": "pentatonic-memory",
918
+ "doc_id": "chat-priya-2026-04-20"
919
+ },
920
+ {
921
+ "text": "Order #2026-0215 \u2014 placed 2026-04-06 by customer Priya Natarajan (priya.n@example.com).\nItems: 3\u00d7 Bio Bin Liners 30L (roll of 20), 1\u00d7 Canvas Tote Bag Large (custom print: 'Natarajan Household').\nTotal: \u00a340.60. Status: custom print in production, ETA 2026-04-18.",
922
+ "score": 0.40319278836250305,
923
+ "source": "pentatonic-memory",
924
+ "doc_id": "order-2026-0215-priya"
925
+ }
926
+ ],
927
+ "correct": false,
928
+ "score": 0.5,
929
+ "grading_notes": "judge=0.50 \u2014 The agent acknowledges the 10-12 working day lead time for custom items and provides a goodwill offer of 15% off Priya's",
930
+ "search_time_ms": 129.14211401948705,
931
+ "generation_time_ms": 0.0,
932
+ "tokens_in": 0,
933
+ "tokens_out": 0,
934
+ "retrieval_tokens": 258,
935
+ "query_tokens": 16,
936
+ "context_tokens": 242,
937
+ "judge_tokens_in": 552,
938
+ "judge_tokens_out": 45,
939
+ "judge_latency_ms": 794.1419219970703
940
+ },
941
+ {
942
+ "task_id": "entity-mina-orders",
943
+ "query": "List Mina Okafor's order IDs.",
944
+ "answer": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.\n---\nCustomer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
945
+ "hits": [
946
+ {
947
+ "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
948
+ "score": 0.5401065349578857,
949
+ "source": "pentatonic-memory",
950
+ "doc_id": "order-2026-0187-mina"
951
+ },
952
+ {
953
+ "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
954
+ "score": 0.5784992575645447,
955
+ "source": "pentatonic-memory",
956
+ "doc_id": "order-2026-0142-mina"
957
+ },
958
+ {
959
+ "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
960
+ "score": 0.5376931428909302,
961
+ "source": "pentatonic-memory",
962
+ "doc_id": "customer-note-mina-vip"
963
+ }
964
+ ],
965
+ "correct": false,
966
+ "score": 0.0,
967
+ "grading_notes": "no expected_substrings set",
968
+ "search_time_ms": 141.6770379873924,
969
+ "generation_time_ms": 0.0,
970
+ "tokens_in": 0,
971
+ "tokens_out": 0,
972
+ "retrieval_tokens": 232,
973
+ "query_tokens": 10,
974
+ "context_tokens": 222,
975
+ "judge_tokens_in": 0,
976
+ "judge_tokens_out": 0,
977
+ "judge_latency_ms": 0.0
978
+ },
979
+ {
980
+ "task_id": "entity-all-rmas",
981
+ "query": "List all open and closed RMA case IDs.",
982
+ "answer": "RMA case RMA-0142-01 opened 2026-03-20 for order #2026-0142. Reason: wrong colour received (Charcoal, wanted Oat). Resolution: exchange, not refund. Status: Oat sleeve shipped 2026-03-23, tracking RM-9912-UK. Closed 2026-03-25.\n---\nRMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.\n---\nEscalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
983
+ "hits": [
984
+ {
985
+ "text": "RMA case RMA-0142-01 opened 2026-03-20 for order #2026-0142. Reason: wrong colour received (Charcoal, wanted Oat). Resolution: exchange, not refund. Status: Oat sleeve shipped 2026-03-23, tracking RM-9912-UK. Closed 2026-03-25.",
986
+ "score": 0.39210593700408936,
987
+ "source": "pentatonic-memory",
988
+ "doc_id": "rma-2026-0142-swap"
989
+ },
990
+ {
991
+ "text": "RMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.",
992
+ "score": 0.4559558928012848,
993
+ "source": "pentatonic-memory",
994
+ "doc_id": "rma-2026-0187-lid"
995
+ },
996
+ {
997
+ "text": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
998
+ "score": 0.2331419140100479,
999
+ "source": "pentatonic-memory",
1000
+ "doc_id": "policy-escalation-tiers"
1001
+ }
1002
+ ],
1003
+ "correct": false,
1004
+ "score": 0.0,
1005
+ "grading_notes": "no expected_substrings set",
1006
+ "search_time_ms": 158.67203500238247,
1007
+ "generation_time_ms": 0.0,
1008
+ "tokens_in": 0,
1009
+ "tokens_out": 0,
1010
+ "retrieval_tokens": 219,
1011
+ "query_tokens": 10,
1012
+ "context_tokens": 209,
1013
+ "judge_tokens_in": 0,
1014
+ "judge_tokens_out": 0,
1015
+ "judge_latency_ms": 0.0
1016
+ }
1017
+ ]
1018
+ }