@pentatonic-ai/ai-agent-sdk 0.9.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +3 -3
  2. package/bin/cli.js +1 -1
  3. package/bin/commands/config.js +1 -1
  4. package/dist/index.cjs +39 -72
  5. package/dist/index.js +36 -69
  6. package/package.json +10 -3
  7. package/packages/doctor/src/checks/local-memory.js +2 -2
  8. package/packages/memory/README.md +2 -2
  9. package/packages/memory/openclaw-plugin/README.md +2 -2
  10. package/packages/memory/openclaw-plugin/openclaw.plugin.json +1 -1
  11. package/packages/memory/package-lock.json +49 -33
  12. package/packages/memory/package.json +4 -1
  13. package/packages/memory/src/__tests__/engine.test.js +40 -5
  14. package/packages/memory/src/engine.js +38 -3
  15. package/packages/memory/src/server.js +2 -2
  16. package/packages/memory-engine-v2/.env.example +30 -0
  17. package/packages/memory-engine-v2/README.md +125 -0
  18. package/packages/memory-engine-v2/compat/Dockerfile +11 -0
  19. package/packages/memory-engine-v2/compat/requirements.txt +6 -0
  20. package/packages/memory-engine-v2/compat/server.py +1047 -0
  21. package/packages/memory-engine-v2/docker-compose.aws.yml +78 -0
  22. package/packages/memory-engine-v2/docker-compose.yml +206 -0
  23. package/packages/memory-engine-v2/extractor-async/Dockerfile +14 -0
  24. package/packages/memory-engine-v2/extractor-async/confidence.py +62 -0
  25. package/packages/memory-engine-v2/extractor-async/noise_filter.py +144 -0
  26. package/packages/memory-engine-v2/extractor-async/requirements.txt +2 -0
  27. package/packages/memory-engine-v2/extractor-async/test_confidence.py +76 -0
  28. package/packages/memory-engine-v2/extractor-async/test_noise_filter.py +177 -0
  29. package/packages/memory-engine-v2/extractor-async/worker.py +797 -0
  30. package/packages/memory-engine-v2/extractor-sync/Dockerfile +11 -0
  31. package/packages/memory-engine-v2/extractor-sync/requirements.txt +4 -0
  32. package/packages/memory-engine-v2/extractor-sync/server.py +424 -0
  33. package/packages/memory-engine-v2/org-model/migrations/001_init.sql +390 -0
  34. package/packages/memory-engine-v2/tests/e2e_smoke.py +356 -0
  35. package/packages/memory-engine-v2/tests/fixtures/generate_synthetic_corpus.py +758 -0
  36. package/packages/memory-engine/.env.example +0 -13
  37. package/packages/memory-engine/MIGRATION.md +0 -219
  38. package/packages/memory-engine/README.md +0 -145
  39. package/packages/memory-engine/bench/README.md +0 -99
  40. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +0 -1115
  41. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +0 -819
  42. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +0 -1278
  43. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +0 -1018
  44. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +0 -1038
  45. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +0 -961
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +0 -1115
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +0 -819
  48. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +0 -1278
  49. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +0 -1018
  50. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +0 -1038
  51. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +0 -937
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +0 -1115
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +0 -819
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +0 -1278
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +0 -1018
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +0 -1038
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +0 -961
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +0 -1115
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +0 -819
  60. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +0 -1278
  61. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +0 -1018
  62. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +0 -1038
  63. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +0 -883
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +0 -1115
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +0 -819
  66. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +0 -1278
  67. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +0 -1018
  68. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +0 -1038
  69. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +0 -937
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +0 -1115
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +0 -1115
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +0 -819
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +0 -542
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +0 -1278
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +0 -894
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +0 -1018
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +0 -680
  78. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +0 -1038
  79. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +0 -693
  80. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +0 -961
  81. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +0 -727
  82. package/packages/memory-engine/compat/Dockerfile +0 -22
  83. package/packages/memory-engine/compat/server.py +0 -1255
  84. package/packages/memory-engine/docker-compose.test.yml +0 -59
  85. package/packages/memory-engine/docker-compose.yml +0 -240
  86. package/packages/memory-engine/engine/README.md +0 -52
  87. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +0 -1543
  88. package/packages/memory-engine/engine/l5-comms-layer.py +0 -663
  89. package/packages/memory-engine/engine/l6-document-store.py +0 -1018
  90. package/packages/memory-engine/engine/services/_shared/__init__.py +0 -1
  91. package/packages/memory-engine/engine/services/_shared/embed_provider.py +0 -468
  92. package/packages/memory-engine/engine/services/l2/Dockerfile +0 -50
  93. package/packages/memory-engine/engine/services/l2/init_databases.py +0 -81
  94. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +0 -2721
  95. package/packages/memory-engine/engine/services/l5/Dockerfile +0 -11
  96. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +0 -808
  97. package/packages/memory-engine/engine/services/l6/Dockerfile +0 -30
  98. package/packages/memory-engine/engine/services/l6/l6-document-store.py +0 -1221
  99. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +0 -28
  100. package/packages/memory-engine/engine/services/nv-embed/server.py +0 -152
  101. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  102. package/packages/memory-engine/pme_memory/__main__.py +0 -129
  103. package/packages/memory-engine/pme_memory/artifacts.py +0 -95
  104. package/packages/memory-engine/pme_memory/embed.py +0 -74
  105. package/packages/memory-engine/pme_memory/health.py +0 -36
  106. package/packages/memory-engine/pme_memory/hygiene.py +0 -159
  107. package/packages/memory-engine/pme_memory/indexer.py +0 -200
  108. package/packages/memory-engine/pme_memory/needs.py +0 -55
  109. package/packages/memory-engine/pme_memory/provenance.py +0 -80
  110. package/packages/memory-engine/pme_memory/scoring.py +0 -168
  111. package/packages/memory-engine/pme_memory/search.py +0 -52
  112. package/packages/memory-engine/pme_memory/store.py +0 -86
  113. package/packages/memory-engine/pme_memory/synthesis.py +0 -114
  114. package/packages/memory-engine/pyproject.toml +0 -65
  115. package/packages/memory-engine/scripts/kg-extractor.py +0 -557
  116. package/packages/memory-engine/scripts/kg-preflexor-v2.py +0 -738
  117. package/packages/memory-engine/scripts/wipe-legacy-l3-entities.py +0 -128
  118. package/packages/memory-engine/tests/e2e_arena.sh +0 -259
  119. package/packages/memory-engine/tests/embed_stub/Dockerfile +0 -13
  120. package/packages/memory-engine/tests/embed_stub/server.py +0 -80
  121. package/packages/memory-engine/tests/test_aggregate.py +0 -333
  122. package/packages/memory-engine/tests/test_api_contract.sh +0 -57
  123. package/packages/memory-engine/tests/test_arena_safety.py +0 -232
  124. package/packages/memory-engine/tests/test_channel_stat_reader.py +0 -437
  125. package/packages/memory-engine/tests/test_channel_stat_rollups.py +0 -308
  126. package/packages/memory-engine/tests/test_compat_nv_embed_probe.py +0 -48
  127. package/packages/memory-engine/tests/test_embed_provider.py +0 -492
  128. package/packages/memory-engine/tests/test_l2_qmd_vec_search.py +0 -280
  129. package/packages/memory-engine/tests/test_l3_arena_isolation.py +0 -412
  130. package/packages/memory-engine/tests/test_l6_module_load.py +0 -84
  131. package/packages/memory-engine/tests/test_people_list_reader.py +0 -432
@@ -1,1018 +0,0 @@
1
- {
2
- "bench": "customer-support",
3
- "stack": "pentatonic-memory",
4
- "n_tasks": 20,
5
- "n_correct": 15,
6
- "accuracy": 0.75,
7
- "mean_score": 0.8,
8
- "p50_search_ms": 128.41969198780134,
9
- "p95_search_ms": 170.69528470892692,
10
- "total_tokens_in": 0,
11
- "total_tokens_out": 0,
12
- "total_usd": 0.0,
13
- "by_tag": {
14
- "factoid": {
15
- "n": 10,
16
- "mean_score": 1.0,
17
- "accuracy": 1.0
18
- },
19
- "customer": {
20
- "n": 8,
21
- "mean_score": 0.8125,
22
- "accuracy": 0.75
23
- },
24
- "multi-doc": {
25
- "n": 6,
26
- "mean_score": 0.9166666666666666,
27
- "accuracy": 0.8333333333333334
28
- },
29
- "rma": {
30
- "n": 3,
31
- "mean_score": 0.6666666666666666,
32
- "accuracy": 0.6666666666666666
33
- },
34
- "policy": {
35
- "n": 5,
36
- "mean_score": 0.8,
37
- "accuracy": 0.8
38
- },
39
- "escalation": {
40
- "n": 4,
41
- "mean_score": 0.875,
42
- "accuracy": 0.75
43
- },
44
- "rubric": {
45
- "n": 3,
46
- "mean_score": 0.6666666666666666,
47
- "accuracy": 0.3333333333333333
48
- },
49
- "multi-hop": {
50
- "n": 1,
51
- "mean_score": 0.5,
52
- "accuracy": 0.0
53
- },
54
- "entity": {
55
- "n": 2,
56
- "mean_score": 0.0,
57
- "accuracy": 0.0
58
- }
59
- },
60
- "extra": {
61
- "ingest_ms": 6125.601384002948,
62
- "grading": "substring",
63
- "limit": 3,
64
- "tokens": {
65
- "corpus_tokens": 1227,
66
- "query_tokens": 283,
67
- "context_tokens": 3972,
68
- "retrieval_tokens": 4255,
69
- "naive_tokens": 24823,
70
- "saved_tokens": 20568,
71
- "reduction_pct": 0.8285863916529026,
72
- "mean_retrieval_tokens_per_task": 212.75,
73
- "tokenizer": "cl100k_base",
74
- "per_task": {
75
- "order-mina-count": {
76
- "query": 11,
77
- "context": 222,
78
- "retrieval": 233,
79
- "judge_in": 0,
80
- "judge_out": 0,
81
- "judge_latency_ms": 0.0
82
- },
83
- "order-mina-latest": {
84
- "query": 13,
85
- "context": 222,
86
- "retrieval": 235,
87
- "judge_in": 0,
88
- "judge_out": 0,
89
- "judge_latency_ms": 0.0
90
- },
91
- "rma-mina-sleeve-reason": {
92
- "query": 17,
93
- "context": 252,
94
- "retrieval": 269,
95
- "judge_in": 0,
96
- "judge_out": 0,
97
- "judge_latency_ms": 0.0
98
- },
99
- "rma-mina-lid-resolution": {
100
- "query": 11,
101
- "context": 199,
102
- "retrieval": 210,
103
- "judge_in": 0,
104
- "judge_out": 0,
105
- "judge_latency_ms": 0.0
106
- },
107
- "jareth-harness-bar-followup": {
108
- "query": 15,
109
- "context": 170,
110
- "retrieval": 185,
111
- "judge_in": 0,
112
- "judge_out": 0,
113
- "judge_latency_ms": 0.0
114
- },
115
- "jareth-second-order": {
116
- "query": 11,
117
- "context": 170,
118
- "retrieval": 181,
119
- "judge_in": 0,
120
- "judge_out": 0,
121
- "judge_latency_ms": 0.0
122
- },
123
- "priya-custom-status": {
124
- "query": 11,
125
- "context": 184,
126
- "retrieval": 195,
127
- "judge_in": 0,
128
- "judge_out": 0,
129
- "judge_latency_ms": 0.0
130
- },
131
- "priya-goodwill-offered": {
132
- "query": 9,
133
- "context": 221,
134
- "retrieval": 230,
135
- "judge_in": 0,
136
- "judge_out": 0,
137
- "judge_latency_ms": 0.0
138
- },
139
- "policy-custom-return": {
140
- "query": 11,
141
- "context": 180,
142
- "retrieval": 191,
143
- "judge_in": 0,
144
- "judge_out": 0,
145
- "judge_latency_ms": 0.0
146
- },
147
- "policy-40day-return": {
148
- "query": 18,
149
- "context": 154,
150
- "retrieval": 172,
151
- "judge_in": 0,
152
- "judge_out": 0,
153
- "judge_latency_ms": 0.0
154
- },
155
- "escalation-400gbp-full-refund": {
156
- "query": 20,
157
- "context": 159,
158
- "retrieval": 179,
159
- "judge_in": 0,
160
- "judge_out": 0,
161
- "judge_latency_ms": 0.0
162
- },
163
- "escalation-goodwill-20pct-tier": {
164
- "query": 15,
165
- "context": 194,
166
- "retrieval": 209,
167
- "judge_in": 0,
168
- "judge_out": 0,
169
- "judge_latency_ms": 0.0
170
- },
171
- "mina-vip-status": {
172
- "query": 10,
173
- "context": 222,
174
- "retrieval": 232,
175
- "judge_in": 0,
176
- "judge_out": 0,
177
- "judge_latency_ms": 0.0
178
- },
179
- "mina-preferences": {
180
- "query": 11,
181
- "context": 222,
182
- "retrieval": 233,
183
- "judge_in": 0,
184
- "judge_out": 0,
185
- "judge_latency_ms": 0.0
186
- },
187
- "priya-agent-guidance": {
188
- "query": 16,
189
- "context": 184,
190
- "retrieval": 200,
191
- "judge_in": 0,
192
- "judge_out": 0,
193
- "judge_latency_ms": 0.0
194
- },
195
- "rubric-mina-warranty-recommendation": {
196
- "query": 18,
197
- "context": 174,
198
- "retrieval": 192,
199
- "judge_in": 506,
200
- "judge_out": 48,
201
- "judge_latency_ms": 1022.3502889871597
202
- },
203
- "rubric-jareth-escalation": {
204
- "query": 30,
205
- "context": 170,
206
- "retrieval": 200,
207
- "judge_in": 527,
208
- "judge_out": 38,
209
- "judge_latency_ms": 890.2632699906826
210
- },
211
- "rubric-priya-delayed-custom": {
212
- "query": 16,
213
- "context": 242,
214
- "retrieval": 258,
215
- "judge_in": 552,
216
- "judge_out": 45,
217
- "judge_latency_ms": 794.1419219970703
218
- },
219
- "entity-mina-orders": {
220
- "query": 10,
221
- "context": 222,
222
- "retrieval": 232,
223
- "judge_in": 0,
224
- "judge_out": 0,
225
- "judge_latency_ms": 0.0
226
- },
227
- "entity-all-rmas": {
228
- "query": 10,
229
- "context": 209,
230
- "retrieval": 219,
231
- "judge_in": 0,
232
- "judge_out": 0,
233
- "judge_latency_ms": 0.0
234
- }
235
- },
236
- "judge_tokens_in": 1585,
237
- "judge_tokens_out": 131,
238
- "judge_calls": 3,
239
- "judge_mean_latency_ms": 902.2518269916376
240
- },
241
- "cost_usd": {
242
- "assumed_completion_tokens_per_task": 100,
243
- "rates": {
244
- "input_per_1k": 0.0025,
245
- "output_per_1k": 0.01,
246
- "model": "gpt-4o"
247
- },
248
- "retrieval_usd_in": 0.010637500000000001,
249
- "retrieval_usd_out": 0.02,
250
- "retrieval_usd_total": 0.0306375,
251
- "naive_usd_total": 0.0820575,
252
- "saved_usd": 0.05142000000000001,
253
- "saved_usd_per_1k_tasks": 2.571
254
- }
255
- },
256
- "task_results": [
257
- {
258
- "task_id": "order-mina-count",
259
- "query": "How many orders has Mina Okafor placed?",
260
- "answer": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.\n---\nCustomer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
261
- "hits": [
262
- {
263
- "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
264
- "score": 0.4692591726779938,
265
- "source": "pentatonic-memory",
266
- "doc_id": "order-2026-0187-mina"
267
- },
268
- {
269
- "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
270
- "score": 0.4614717364311218,
271
- "source": "pentatonic-memory",
272
- "doc_id": "order-2026-0142-mina"
273
- },
274
- {
275
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
276
- "score": 0.5065365433692932,
277
- "source": "pentatonic-memory",
278
- "doc_id": "customer-note-mina-vip"
279
- }
280
- ],
281
- "correct": true,
282
- "score": 1.0,
283
- "grading_notes": "all substrings matched",
284
- "search_time_ms": 121.94390798686072,
285
- "generation_time_ms": 0.0,
286
- "tokens_in": 0,
287
- "tokens_out": 0,
288
- "retrieval_tokens": 233,
289
- "query_tokens": 11,
290
- "context_tokens": 222,
291
- "judge_tokens_in": 0,
292
- "judge_tokens_out": 0,
293
- "judge_latency_ms": 0.0
294
- },
295
- {
296
- "task_id": "order-mina-latest",
297
- "query": "What was in Mina Okafor's most recent order?",
298
- "answer": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.\n---\nCustomer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
299
- "hits": [
300
- {
301
- "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
302
- "score": 0.4980609118938446,
303
- "source": "pentatonic-memory",
304
- "doc_id": "order-2026-0187-mina"
305
- },
306
- {
307
- "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
308
- "score": 0.4663473665714264,
309
- "source": "pentatonic-memory",
310
- "doc_id": "order-2026-0142-mina"
311
- },
312
- {
313
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
314
- "score": 0.5009183883666992,
315
- "source": "pentatonic-memory",
316
- "doc_id": "customer-note-mina-vip"
317
- }
318
- ],
319
- "correct": true,
320
- "score": 1.0,
321
- "grading_notes": "all substrings matched",
322
- "search_time_ms": 121.74513799254782,
323
- "generation_time_ms": 0.0,
324
- "tokens_in": 0,
325
- "tokens_out": 0,
326
- "retrieval_tokens": 235,
327
- "query_tokens": 13,
328
- "context_tokens": 222,
329
- "judge_tokens_in": 0,
330
- "judge_tokens_out": 0,
331
- "judge_latency_ms": 0.0
332
- },
333
- {
334
- "task_id": "rma-mina-sleeve-reason",
335
- "query": "Why did Mina open an RMA on order 2026-0142?",
336
- "answer": "RMA case RMA-0142-01 opened 2026-03-20 for order #2026-0142. Reason: wrong colour received (Charcoal, wanted Oat). Resolution: exchange, not refund. Status: Oat sleeve shipped 2026-03-23, tracking RM-9912-UK. Closed 2026-03-25.\n---\nRMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
337
- "hits": [
338
- {
339
- "text": "RMA case RMA-0142-01 opened 2026-03-20 for order #2026-0142. Reason: wrong colour received (Charcoal, wanted Oat). Resolution: exchange, not refund. Status: Oat sleeve shipped 2026-03-23, tracking RM-9912-UK. Closed 2026-03-25.",
340
- "score": 0.41409578919410706,
341
- "source": "pentatonic-memory",
342
- "doc_id": "rma-2026-0142-swap"
343
- },
344
- {
345
- "text": "RMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.",
346
- "score": 0.4296145439147949,
347
- "source": "pentatonic-memory",
348
- "doc_id": "rma-2026-0187-lid"
349
- },
350
- {
351
- "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
352
- "score": 0.39442405104637146,
353
- "source": "pentatonic-memory",
354
- "doc_id": "order-2026-0142-mina"
355
- }
356
- ],
357
- "correct": true,
358
- "score": 1.0,
359
- "grading_notes": "all substrings matched",
360
- "search_time_ms": 122.5400409894064,
361
- "generation_time_ms": 0.0,
362
- "tokens_in": 0,
363
- "tokens_out": 0,
364
- "retrieval_tokens": 269,
365
- "query_tokens": 17,
366
- "context_tokens": 252,
367
- "judge_tokens_in": 0,
368
- "judge_tokens_out": 0,
369
- "judge_latency_ms": 0.0
370
- },
371
- {
372
- "task_id": "rma-mina-lid-resolution",
373
- "query": "How was Mina's Luna bottle lid complaint resolved?",
374
- "answer": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!\n---\nOrder #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nRMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.",
375
- "hits": [
376
- {
377
- "text": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
378
- "score": 0.6011244654655457,
379
- "source": "pentatonic-memory",
380
- "doc_id": "chat-mina-2026-04-10"
381
- },
382
- {
383
- "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
384
- "score": 0.4347590506076813,
385
- "source": "pentatonic-memory",
386
- "doc_id": "order-2026-0187-mina"
387
- },
388
- {
389
- "text": "RMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.",
390
- "score": 0.539427638053894,
391
- "source": "pentatonic-memory",
392
- "doc_id": "rma-2026-0187-lid"
393
- }
394
- ],
395
- "correct": true,
396
- "score": 1.0,
397
- "grading_notes": "all substrings matched",
398
- "search_time_ms": 128.34562797797844,
399
- "generation_time_ms": 0.0,
400
- "tokens_in": 0,
401
- "tokens_out": 0,
402
- "retrieval_tokens": 210,
403
- "query_tokens": 11,
404
- "context_tokens": 199,
405
- "judge_tokens_in": 0,
406
- "judge_tokens_out": 0,
407
- "judge_latency_ms": 0.0
408
- },
409
- {
410
- "task_id": "jareth-harness-bar-followup",
411
- "query": "Did Jareth's kite harness work with his bar out of the box?",
412
- "answer": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.\n---\nCustomer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.\n---\nOrder #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.",
413
- "hits": [
414
- {
415
- "text": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.",
416
- "score": 0.5966447591781616,
417
- "source": "pentatonic-memory",
418
- "doc_id": "chat-jareth-2026-04-08"
419
- },
420
- {
421
- "text": "Customer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.",
422
- "score": 0.5066688656806946,
423
- "source": "pentatonic-memory",
424
- "doc_id": "customer-note-jareth-kiter"
425
- },
426
- {
427
- "text": "Order #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.",
428
- "score": 0.4137082099914551,
429
- "source": "pentatonic-memory",
430
- "doc_id": "order-2026-0201-jareth"
431
- }
432
- ],
433
- "correct": true,
434
- "score": 1.0,
435
- "grading_notes": "all substrings matched",
436
- "search_time_ms": 135.78065202455036,
437
- "generation_time_ms": 0.0,
438
- "tokens_in": 0,
439
- "tokens_out": 0,
440
- "retrieval_tokens": 185,
441
- "query_tokens": 15,
442
- "context_tokens": 170,
443
- "judge_tokens_in": 0,
444
- "judge_tokens_out": 0,
445
- "judge_latency_ms": 0.0
446
- },
447
- {
448
- "task_id": "jareth-second-order",
449
- "query": "What did Jareth order after his initial harness purchase?",
450
- "answer": "Order #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.\n---\nCustomer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.\n---\n[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.",
451
- "hits": [
452
- {
453
- "text": "Order #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.",
454
- "score": 0.4606146514415741,
455
- "source": "pentatonic-memory",
456
- "doc_id": "order-2026-0201-jareth"
457
- },
458
- {
459
- "text": "Customer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.",
460
- "score": 0.4756895899772644,
461
- "source": "pentatonic-memory",
462
- "doc_id": "customer-note-jareth-kiter"
463
- },
464
- {
465
- "text": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.",
466
- "score": 0.3477482497692108,
467
- "source": "pentatonic-memory",
468
- "doc_id": "chat-jareth-2026-04-08"
469
- }
470
- ],
471
- "correct": true,
472
- "score": 1.0,
473
- "grading_notes": "all substrings matched",
474
- "search_time_ms": 128.49375599762425,
475
- "generation_time_ms": 0.0,
476
- "tokens_in": 0,
477
- "tokens_out": 0,
478
- "retrieval_tokens": 181,
479
- "query_tokens": 11,
480
- "context_tokens": 170,
481
- "judge_tokens_in": 0,
482
- "judge_tokens_out": 0,
483
- "judge_latency_ms": 0.0
484
- },
485
- {
486
- "task_id": "priya-custom-status",
487
- "query": "Can Priya change the text on her custom tote?",
488
- "answer": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.\n---\n[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.\n---\nCustomer note \u2014 Priya Natarajan: orders frequently include custom-printed items. Agent should proactively confirm print text before production starts, not after.",
489
- "hits": [
490
- {
491
- "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
492
- "score": 0.5468156933784485,
493
- "source": "pentatonic-memory",
494
- "doc_id": "chat-priya-2026-04-20"
495
- },
496
- {
497
- "text": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
498
- "score": 0.490670770406723,
499
- "source": "pentatonic-memory",
500
- "doc_id": "chat-priya-2026-04-14"
501
- },
502
- {
503
- "text": "Customer note \u2014 Priya Natarajan: orders frequently include custom-printed items. Agent should proactively confirm print text before production starts, not after.",
504
- "score": 0.46206608414649963,
505
- "source": "pentatonic-memory",
506
- "doc_id": "customer-note-priya-custom"
507
- }
508
- ],
509
- "correct": true,
510
- "score": 1.0,
511
- "grading_notes": "all substrings matched",
512
- "search_time_ms": 171.14407900953665,
513
- "generation_time_ms": 0.0,
514
- "tokens_in": 0,
515
- "tokens_out": 0,
516
- "retrieval_tokens": 195,
517
- "query_tokens": 11,
518
- "context_tokens": 184,
519
- "judge_tokens_in": 0,
520
- "judge_tokens_out": 0,
521
- "judge_latency_ms": 0.0
522
- },
523
- {
524
- "task_id": "priya-goodwill-offered",
525
- "query": "What goodwill credit did Priya get offered?",
526
- "answer": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.\n---\nGoodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.\n---\nOrder #2026-0215 \u2014 placed 2026-04-06 by customer Priya Natarajan (priya.n@example.com).\nItems: 3\u00d7 Bio Bin Liners 30L (roll of 20), 1\u00d7 Canvas Tote Bag Large (custom print: 'Natarajan Household').\nTotal: \u00a340.60. Status: custom print in production, ETA 2026-04-18.",
527
- "hits": [
528
- {
529
- "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
530
- "score": 0.33671367168426514,
531
- "source": "pentatonic-memory",
532
- "doc_id": "chat-priya-2026-04-20"
533
- },
534
- {
535
- "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
536
- "score": 0.4352283179759979,
537
- "source": "pentatonic-memory",
538
- "doc_id": "policy-goodwill-credit"
539
- },
540
- {
541
- "text": "Order #2026-0215 \u2014 placed 2026-04-06 by customer Priya Natarajan (priya.n@example.com).\nItems: 3\u00d7 Bio Bin Liners 30L (roll of 20), 1\u00d7 Canvas Tote Bag Large (custom print: 'Natarajan Household').\nTotal: \u00a340.60. Status: custom print in production, ETA 2026-04-18.",
542
- "score": 0.25958457589149475,
543
- "source": "pentatonic-memory",
544
- "doc_id": "order-2026-0215-priya"
545
- }
546
- ],
547
- "correct": true,
548
- "score": 1.0,
549
- "grading_notes": "all substrings matched",
550
- "search_time_ms": 162.16819299734198,
551
- "generation_time_ms": 0.0,
552
- "tokens_in": 0,
553
- "tokens_out": 0,
554
- "retrieval_tokens": 230,
555
- "query_tokens": 9,
556
- "context_tokens": 221,
557
- "judge_tokens_in": 0,
558
- "judge_tokens_out": 0,
559
- "judge_latency_ms": 0.0
560
- },
561
- {
562
- "task_id": "policy-custom-return",
563
- "query": "Can a custom-printed tote be returned for refund?",
564
- "answer": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.\n---\nEscalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.\n---\n[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
565
- "hits": [
566
- {
567
- "text": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
568
- "score": 0.5682638883590698,
569
- "source": "pentatonic-memory",
570
- "doc_id": "policy-returns-30day"
571
- },
572
- {
573
- "text": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
574
- "score": 0.31737956404685974,
575
- "source": "pentatonic-memory",
576
- "doc_id": "policy-escalation-tiers"
577
- },
578
- {
579
- "text": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
580
- "score": 0.47565758228302,
581
- "source": "pentatonic-memory",
582
- "doc_id": "chat-priya-2026-04-14"
583
- }
584
- ],
585
- "correct": true,
586
- "score": 1.0,
587
- "grading_notes": "all substrings matched",
588
- "search_time_ms": 120.61583198374137,
589
- "generation_time_ms": 0.0,
590
- "tokens_in": 0,
591
- "tokens_out": 0,
592
- "retrieval_tokens": 191,
593
- "query_tokens": 11,
594
- "context_tokens": 180,
595
- "judge_tokens_in": 0,
596
- "judge_tokens_out": 0,
597
- "judge_latency_ms": 0.0
598
- },
599
- {
600
- "task_id": "policy-40day-return",
601
- "query": "If an unused item was delivered 40 days ago, can I get a full refund?",
602
- "answer": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.\n---\nGoodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.\n---\n[Chat 2026-03-20] Mina: Hi, the sleeve I got is the wrong colour \u2014 I wanted Oat, not Charcoal. Can I swap? Agent: Yes, you're within the 30-day window. I'll send an exchange label.",
603
- "hits": [
604
- {
605
- "text": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
606
- "score": 0.6555708646774292,
607
- "source": "pentatonic-memory",
608
- "doc_id": "policy-returns-30day"
609
- },
610
- {
611
- "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
612
- "score": 0.2732425928115845,
613
- "source": "pentatonic-memory",
614
- "doc_id": "policy-goodwill-credit"
615
- },
616
- {
617
- "text": "[Chat 2026-03-20] Mina: Hi, the sleeve I got is the wrong colour \u2014 I wanted Oat, not Charcoal. Can I swap? Agent: Yes, you're within the 30-day window. I'll send an exchange label.",
618
- "score": 0.3445591926574707,
619
- "source": "pentatonic-memory",
620
- "doc_id": "chat-mina-2026-03-20"
621
- }
622
- ],
623
- "correct": false,
624
- "score": 0.0,
625
- "grading_notes": "all substrings matched; forbidden substring(s) present: ['full refund']",
626
- "search_time_ms": 122.33487097546458,
627
- "generation_time_ms": 0.0,
628
- "tokens_in": 0,
629
- "tokens_out": 0,
630
- "retrieval_tokens": 172,
631
- "query_tokens": 18,
632
- "context_tokens": 154,
633
- "judge_tokens_in": 0,
634
- "judge_tokens_out": 0,
635
- "judge_latency_ms": 0.0
636
- },
637
- {
638
- "task_id": "escalation-400gbp-full-refund",
639
- "query": "A Tier 1 agent wants to issue a full refund on a \u00a3400 order \u2014 can they?",
640
- "answer": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.\n---\nGoodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.\n---\nReturns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
641
- "hits": [
642
- {
643
- "text": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
644
- "score": 0.5317927598953247,
645
- "source": "pentatonic-memory",
646
- "doc_id": "policy-escalation-tiers"
647
- },
648
- {
649
- "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
650
- "score": 0.558272123336792,
651
- "source": "pentatonic-memory",
652
- "doc_id": "policy-goodwill-credit"
653
- },
654
- {
655
- "text": "Returns policy: unused items returnable within 30 days of delivery for full refund. After 30 days but under 90 days: store credit only. Custom-printed and consumable items are final sale and non-returnable.",
656
- "score": 0.3208146095275879,
657
- "source": "pentatonic-memory",
658
- "doc_id": "policy-returns-30day"
659
- }
660
- ],
661
- "correct": true,
662
- "score": 1.0,
663
- "grading_notes": "all substrings matched",
664
- "search_time_ms": 120.98205901565962,
665
- "generation_time_ms": 0.0,
666
- "tokens_in": 0,
667
- "tokens_out": 0,
668
- "retrieval_tokens": 179,
669
- "query_tokens": 20,
670
- "context_tokens": 159,
671
- "judge_tokens_in": 0,
672
- "judge_tokens_out": 0,
673
- "judge_latency_ms": 0.0
674
- },
675
- {
676
- "task_id": "escalation-goodwill-20pct-tier",
677
- "query": "What's the minimum agent tier needed to offer 20% goodwill credit?",
678
- "answer": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.\n---\n[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.\n---\nEscalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
679
- "hits": [
680
- {
681
- "text": "Goodwill credit: agents at Tier 1 may offer up to 10% off next order as goodwill. Tier 2 may offer up to 25%. Anything above 25% or any full refund outside policy requires Tier 3 approval.",
682
- "score": 0.7269918322563171,
683
- "source": "pentatonic-memory",
684
- "doc_id": "policy-goodwill-credit"
685
- },
686
- {
687
- "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
688
- "score": 0.29580065608024597,
689
- "source": "pentatonic-memory",
690
- "doc_id": "chat-priya-2026-04-20"
691
- },
692
- {
693
- "text": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
694
- "score": 0.4651184380054474,
695
- "source": "pentatonic-memory",
696
- "doc_id": "policy-escalation-tiers"
697
- }
698
- ],
699
- "correct": true,
700
- "score": 1.0,
701
- "grading_notes": "all substrings matched",
702
- "search_time_ms": 123.65483402390964,
703
- "generation_time_ms": 0.0,
704
- "tokens_in": 0,
705
- "tokens_out": 0,
706
- "retrieval_tokens": 209,
707
- "query_tokens": 15,
708
- "context_tokens": 194,
709
- "judge_tokens_in": 0,
710
- "judge_tokens_out": 0,
711
- "judge_latency_ms": 0.0
712
- },
713
- {
714
- "task_id": "mina-vip-status",
715
- "query": "Is Mina Okafor a VIP customer?",
716
- "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.\n---\nOrder #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
717
- "hits": [
718
- {
719
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
720
- "score": 0.6328713297843933,
721
- "source": "pentatonic-memory",
722
- "doc_id": "customer-note-mina-vip"
723
- },
724
- {
725
- "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
726
- "score": 0.40744826197624207,
727
- "source": "pentatonic-memory",
728
- "doc_id": "order-2026-0142-mina"
729
- },
730
- {
731
- "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
732
- "score": 0.42423495650291443,
733
- "source": "pentatonic-memory",
734
- "doc_id": "order-2026-0187-mina"
735
- }
736
- ],
737
- "correct": true,
738
- "score": 1.0,
739
- "grading_notes": "all substrings matched",
740
- "search_time_ms": 118.19845100399107,
741
- "generation_time_ms": 0.0,
742
- "tokens_in": 0,
743
- "tokens_out": 0,
744
- "retrieval_tokens": 232,
745
- "query_tokens": 10,
746
- "context_tokens": 222,
747
- "judge_tokens_in": 0,
748
- "judge_tokens_out": 0,
749
- "judge_latency_ms": 0.0
750
- },
751
- {
752
- "task_id": "mina-preferences",
753
- "query": "What are Mina Okafor's stated preferences?",
754
- "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.\n---\nOrder #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
755
- "hits": [
756
- {
757
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
758
- "score": 0.5033327341079712,
759
- "source": "pentatonic-memory",
760
- "doc_id": "customer-note-mina-vip"
761
- },
762
- {
763
- "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
764
- "score": 0.3317295014858246,
765
- "source": "pentatonic-memory",
766
- "doc_id": "order-2026-0187-mina"
767
- },
768
- {
769
- "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
770
- "score": 0.2938464879989624,
771
- "source": "pentatonic-memory",
772
- "doc_id": "order-2026-0142-mina"
773
- }
774
- ],
775
- "correct": true,
776
- "score": 1.0,
777
- "grading_notes": "all substrings matched",
778
- "search_time_ms": 128.2598020043224,
779
- "generation_time_ms": 0.0,
780
- "tokens_in": 0,
781
- "tokens_out": 0,
782
- "retrieval_tokens": 233,
783
- "query_tokens": 11,
784
- "context_tokens": 222,
785
- "judge_tokens_in": 0,
786
- "judge_tokens_out": 0,
787
- "judge_latency_ms": 0.0
788
- },
789
- {
790
- "task_id": "priya-agent-guidance",
791
- "query": "When handling Priya's orders, what should the agent do before production starts?",
792
- "answer": "Customer note \u2014 Priya Natarajan: orders frequently include custom-printed items. Agent should proactively confirm print text before production starts, not after.\n---\n[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.\n---\n[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
793
- "hits": [
794
- {
795
- "text": "Customer note \u2014 Priya Natarajan: orders frequently include custom-printed items. Agent should proactively confirm print text before production starts, not after.",
796
- "score": 0.5809648036956787,
797
- "source": "pentatonic-memory",
798
- "doc_id": "customer-note-priya-custom"
799
- },
800
- {
801
- "text": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
802
- "score": 0.3932722210884094,
803
- "source": "pentatonic-memory",
804
- "doc_id": "chat-priya-2026-04-14"
805
- },
806
- {
807
- "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
808
- "score": 0.38053300976753235,
809
- "source": "pentatonic-memory",
810
- "doc_id": "chat-priya-2026-04-20"
811
- }
812
- ],
813
- "correct": true,
814
- "score": 1.0,
815
- "grading_notes": "all substrings matched",
816
- "search_time_ms": 137.435241980711,
817
- "generation_time_ms": 0.0,
818
- "tokens_in": 0,
819
- "tokens_out": 0,
820
- "retrieval_tokens": 200,
821
- "query_tokens": 16,
822
- "context_tokens": 184,
823
- "judge_tokens_in": 0,
824
- "judge_tokens_out": 0,
825
- "judge_latency_ms": 0.0
826
- },
827
- {
828
- "task_id": "rubric-mina-warranty-recommendation",
829
- "query": "Mina is asking for product recommendations. Based on her history, what should we suggest?",
830
- "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.\n---\nOrder #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\n[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
831
- "hits": [
832
- {
833
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
834
- "score": 0.5650954246520996,
835
- "source": "pentatonic-memory",
836
- "doc_id": "customer-note-mina-vip"
837
- },
838
- {
839
- "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
840
- "score": 0.329296350479126,
841
- "source": "pentatonic-memory",
842
- "doc_id": "order-2026-0187-mina"
843
- },
844
- {
845
- "text": "[Chat 2026-04-10] Mina: Hey, the Luna bottle lid is leaking \u2014 is that covered? Agent: Yes, the lid has a 2-year warranty. I'll ship you a replacement lid free of charge. Mina: Great, thanks!",
846
- "score": 0.35196399688720703,
847
- "source": "pentatonic-memory",
848
- "doc_id": "chat-mina-2026-04-10"
849
- }
850
- ],
851
- "correct": true,
852
- "score": 1.0,
853
- "grading_notes": "judge=1.00 \u2014 The answer references Mina's VIP status and her stated interest in lifetime-warranty items, recommending the Luna Water ",
854
- "search_time_ms": 136.84402097715065,
855
- "generation_time_ms": 0.0,
856
- "tokens_in": 0,
857
- "tokens_out": 0,
858
- "retrieval_tokens": 192,
859
- "query_tokens": 18,
860
- "context_tokens": 174,
861
- "judge_tokens_in": 506,
862
- "judge_tokens_out": 48,
863
- "judge_latency_ms": 1022.3502889871597
864
- },
865
- {
866
- "task_id": "rubric-jareth-escalation",
867
- "query": "Jareth wants to return his kite harness for a full refund, claiming it 'didn't work' with his bar \u2014 how should we handle this?",
868
- "answer": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.\n---\nOrder #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.\n---\nCustomer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.",
869
- "hits": [
870
- {
871
- "text": "[Chat 2026-04-08] Jareth: My kite harness arrived but I actually have a Duotone bar \u2014 will this work? Agent: You'll need the Duotone/Core adapter kit (NMD-ADPT-DC, \u00a318). Want me to add one to a new order? Jareth: Yes please.",
872
- "score": 0.4707748591899872,
873
- "source": "pentatonic-memory",
874
- "doc_id": "chat-jareth-2026-04-08"
875
- },
876
- {
877
- "text": "Order #2026-0201 \u2014 placed 2026-04-02 by customer Jareth Kim (jareth@example.com).\nItems: 1\u00d7 Nomad Kite Harness v3 (L).\nTotal: \u00a3145.00. Shipped 2026-04-03, delivered 2026-04-05.",
878
- "score": 0.4572660028934479,
879
- "source": "pentatonic-memory",
880
- "doc_id": "order-2026-0201-jareth"
881
- },
882
- {
883
- "text": "Customer note \u2014 Jareth Kim: kite surfer, Worthing-based. Typically orders kite-related gear. Previously flagged harness size L as correct.",
884
- "score": 0.5262770056724548,
885
- "source": "pentatonic-memory",
886
- "doc_id": "customer-note-jareth-kiter"
887
- }
888
- ],
889
- "correct": false,
890
- "score": 0.5,
891
- "grading_notes": "judge=0.50 \u2014 The answer mentions the adapter history (Order #2026-0201) but does not suggest Tier 2 escalation.",
892
- "search_time_ms": 143.29434899264015,
893
- "generation_time_ms": 0.0,
894
- "tokens_in": 0,
895
- "tokens_out": 0,
896
- "retrieval_tokens": 200,
897
- "query_tokens": 30,
898
- "context_tokens": 170,
899
- "judge_tokens_in": 527,
900
- "judge_tokens_out": 38,
901
- "judge_latency_ms": 890.2632699906826
902
- },
903
- {
904
- "task_id": "rubric-priya-delayed-custom",
905
- "query": "Priya is frustrated about her custom tote delay. What's the best response?",
906
- "answer": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.\n---\n[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.\n---\nOrder #2026-0215 \u2014 placed 2026-04-06 by customer Priya Natarajan (priya.n@example.com).\nItems: 3\u00d7 Bio Bin Liners 30L (roll of 20), 1\u00d7 Canvas Tote Bag Large (custom print: 'Natarajan Household').\nTotal: \u00a340.60. Status: custom print in production, ETA 2026-04-18.",
907
- "hits": [
908
- {
909
- "text": "[Chat 2026-04-14] Priya: When is my tote shipping? I ordered on the 6th. Agent: Your custom print is still in production, ETA 2026-04-18 for dispatch. Custom items have a 10-12 working day lead time. Priya: OK, thanks for confirming.",
910
- "score": 0.5332370400428772,
911
- "source": "pentatonic-memory",
912
- "doc_id": "chat-priya-2026-04-14"
913
- },
914
- {
915
- "text": "[Chat 2026-04-20] Priya: Can I change the print text? I'd prefer 'The Natarajans'. Agent: I'm sorry \u2014 the custom print started production on 2026-04-08 so we can't change the text now. I can offer 15% off your next order as goodwill. Priya: Ugh, fine \u2014 thanks.",
916
- "score": 0.41297996044158936,
917
- "source": "pentatonic-memory",
918
- "doc_id": "chat-priya-2026-04-20"
919
- },
920
- {
921
- "text": "Order #2026-0215 \u2014 placed 2026-04-06 by customer Priya Natarajan (priya.n@example.com).\nItems: 3\u00d7 Bio Bin Liners 30L (roll of 20), 1\u00d7 Canvas Tote Bag Large (custom print: 'Natarajan Household').\nTotal: \u00a340.60. Status: custom print in production, ETA 2026-04-18.",
922
- "score": 0.40319278836250305,
923
- "source": "pentatonic-memory",
924
- "doc_id": "order-2026-0215-priya"
925
- }
926
- ],
927
- "correct": false,
928
- "score": 0.5,
929
- "grading_notes": "judge=0.50 \u2014 The agent acknowledges the 10-12 working day lead time for custom items and provides a goodwill offer of 15% off Priya's",
930
- "search_time_ms": 129.14211401948705,
931
- "generation_time_ms": 0.0,
932
- "tokens_in": 0,
933
- "tokens_out": 0,
934
- "retrieval_tokens": 258,
935
- "query_tokens": 16,
936
- "context_tokens": 242,
937
- "judge_tokens_in": 552,
938
- "judge_tokens_out": 45,
939
- "judge_latency_ms": 794.1419219970703
940
- },
941
- {
942
- "task_id": "entity-mina-orders",
943
- "query": "List Mina Okafor's order IDs.",
944
- "answer": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.\n---\nOrder #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.\n---\nCustomer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
945
- "hits": [
946
- {
947
- "text": "Order #2026-0187 \u2014 placed 2026-03-28 by customer Mina Okafor.\nItems: 1\u00d7 Luna Water Bottle 500ml (Sage green), 1\u00d7 Loop Reusable Coffee Cup.\nTotal: \u00a331.00. Delivered 2026-04-01 via Royal Mail Tracked 48.",
948
- "score": 0.5401065349578857,
949
- "source": "pentatonic-memory",
950
- "doc_id": "order-2026-0187-mina"
951
- },
952
- {
953
- "text": "Order #2026-0142 \u2014 placed 2026-03-14 by customer Mina Okafor (mina.okafor@example.com).\nItems: 2\u00d7 Pentatonic Crew Tee Black (M), 1\u00d7 Haven Laptop Sleeve 13\" (Charcoal).\nShipping address: 12 Heene Road, Worthing BN11 3RG, UK.\nTotal: \u00a3104.00. Payment: Visa ending 4412. Delivered 2026-03-18 via DPD.",
954
- "score": 0.5784992575645447,
955
- "source": "pentatonic-memory",
956
- "doc_id": "order-2026-0142-mina"
957
- },
958
- {
959
- "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
960
- "score": 0.5376931428909302,
961
- "source": "pentatonic-memory",
962
- "doc_id": "customer-note-mina-vip"
963
- }
964
- ],
965
- "correct": false,
966
- "score": 0.0,
967
- "grading_notes": "no expected_substrings set",
968
- "search_time_ms": 141.6770379873924,
969
- "generation_time_ms": 0.0,
970
- "tokens_in": 0,
971
- "tokens_out": 0,
972
- "retrieval_tokens": 232,
973
- "query_tokens": 10,
974
- "context_tokens": 222,
975
- "judge_tokens_in": 0,
976
- "judge_tokens_out": 0,
977
- "judge_latency_ms": 0.0
978
- },
979
- {
980
- "task_id": "entity-all-rmas",
981
- "query": "List all open and closed RMA case IDs.",
982
- "answer": "RMA case RMA-0142-01 opened 2026-03-20 for order #2026-0142. Reason: wrong colour received (Charcoal, wanted Oat). Resolution: exchange, not refund. Status: Oat sleeve shipped 2026-03-23, tracking RM-9912-UK. Closed 2026-03-25.\n---\nRMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.\n---\nEscalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
983
- "hits": [
984
- {
985
- "text": "RMA case RMA-0142-01 opened 2026-03-20 for order #2026-0142. Reason: wrong colour received (Charcoal, wanted Oat). Resolution: exchange, not refund. Status: Oat sleeve shipped 2026-03-23, tracking RM-9912-UK. Closed 2026-03-25.",
986
- "score": 0.39210593700408936,
987
- "source": "pentatonic-memory",
988
- "doc_id": "rma-2026-0142-swap"
989
- },
990
- {
991
- "text": "RMA case RMA-0187-01 opened 2026-04-10 for order #2026-0187. Reason: defective lid on Luna Water Bottle 500ml. Resolution: free replacement lid under warranty. Status: shipped 2026-04-11. Closed 2026-04-14.",
992
- "score": 0.4559558928012848,
993
- "source": "pentatonic-memory",
994
- "doc_id": "rma-2026-0187-lid"
995
- },
996
- {
997
- "text": "Escalation tiers: Tier 1 (agent) resolves simple status/return requests. Tier 2 (senior agent) handles disputes, partial refunds, custom-print grievances. Tier 3 (manager) handles orders over \u00a3500, legal queries, and any request involving the take-back programme credit reconciliation.",
998
- "score": 0.2331419140100479,
999
- "source": "pentatonic-memory",
1000
- "doc_id": "policy-escalation-tiers"
1001
- }
1002
- ],
1003
- "correct": false,
1004
- "score": 0.0,
1005
- "grading_notes": "no expected_substrings set",
1006
- "search_time_ms": 158.67203500238247,
1007
- "generation_time_ms": 0.0,
1008
- "tokens_in": 0,
1009
- "tokens_out": 0,
1010
- "retrieval_tokens": 219,
1011
- "query_tokens": 10,
1012
- "context_tokens": 209,
1013
- "judge_tokens_in": 0,
1014
- "judge_tokens_out": 0,
1015
- "judge_latency_ms": 0.0
1016
- }
1017
- ]
1018
- }