@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,1018 @@
1
+ {
2
+ "bench": "customer-support",
3
+ "stack": "pentatonic-memory-latest",
4
+ "n_tasks": 20,
5
+ "n_correct": 1,
6
+ "accuracy": 0.05,
7
+ "mean_score": 0.05,
8
+ "p50_search_ms": 31.97749049286358,
9
+ "p95_search_ms": 48.48551471950486,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "factoid": {
15
+ "n": 10,
16
+ "mean_score": 0.1,
17
+ "accuracy": 0.1
18
+ },
19
+ "customer": {
20
+ "n": 8,
21
+ "mean_score": 0.125,
22
+ "accuracy": 0.125
23
+ },
24
+ "multi-doc": {
25
+ "n": 6,
26
+ "mean_score": 0.0,
27
+ "accuracy": 0.0
28
+ },
29
+ "rma": {
30
+ "n": 3,
31
+ "mean_score": 0.0,
32
+ "accuracy": 0.0
33
+ },
34
+ "policy": {
35
+ "n": 5,
36
+ "mean_score": 0.0,
37
+ "accuracy": 0.0
38
+ },
39
+ "escalation": {
40
+ "n": 4,
41
+ "mean_score": 0.0,
42
+ "accuracy": 0.0
43
+ },
44
+ "rubric": {
45
+ "n": 3,
46
+ "mean_score": 0.0,
47
+ "accuracy": 0.0
48
+ },
49
+ "multi-hop": {
50
+ "n": 1,
51
+ "mean_score": 0.0,
52
+ "accuracy": 0.0
53
+ },
54
+ "entity": {
55
+ "n": 2,
56
+ "mean_score": 0.0,
57
+ "accuracy": 0.0
58
+ }
59
+ },
60
+ "extra": {
61
+ "ingest_ms": 42532.69734699279,
62
+ "grading": "substring",
63
+ "limit": 3,
64
+ "tokens": {
65
+ "corpus_tokens": 1227,
66
+ "query_tokens": 283,
67
+ "context_tokens": 871,
68
+ "retrieval_tokens": 1154,
69
+ "naive_tokens": 24823,
70
+ "saved_tokens": 23669,
71
+ "reduction_pct": 0.9535108568666156,
72
+ "mean_retrieval_tokens_per_task": 57.7,
73
+ "tokenizer": "cl100k_base",
74
+ "per_task": {
75
+ "order-mina-count": {
76
+ "query": 11,
77
+ "context": 44,
78
+ "retrieval": 55,
79
+ "judge_in": 0,
80
+ "judge_out": 0,
81
+ "judge_latency_ms": 0.0
82
+ },
83
+ "order-mina-latest": {
84
+ "query": 13,
85
+ "context": 44,
86
+ "retrieval": 57,
87
+ "judge_in": 0,
88
+ "judge_out": 0,
89
+ "judge_latency_ms": 0.0
90
+ },
91
+ "rma-mina-sleeve-reason": {
92
+ "query": 17,
93
+ "context": 43,
94
+ "retrieval": 60,
95
+ "judge_in": 0,
96
+ "judge_out": 0,
97
+ "judge_latency_ms": 0.0
98
+ },
99
+ "rma-mina-lid-resolution": {
100
+ "query": 11,
101
+ "context": 49,
102
+ "retrieval": 60,
103
+ "judge_in": 0,
104
+ "judge_out": 0,
105
+ "judge_latency_ms": 0.0
106
+ },
107
+ "jareth-harness-bar-followup": {
108
+ "query": 15,
109
+ "context": 35,
110
+ "retrieval": 50,
111
+ "judge_in": 0,
112
+ "judge_out": 0,
113
+ "judge_latency_ms": 0.0
114
+ },
115
+ "jareth-second-order": {
116
+ "query": 11,
117
+ "context": 44,
118
+ "retrieval": 55,
119
+ "judge_in": 0,
120
+ "judge_out": 0,
121
+ "judge_latency_ms": 0.0
122
+ },
123
+ "priya-custom-status": {
124
+ "query": 11,
125
+ "context": 37,
126
+ "retrieval": 48,
127
+ "judge_in": 0,
128
+ "judge_out": 0,
129
+ "judge_latency_ms": 0.0
130
+ },
131
+ "priya-goodwill-offered": {
132
+ "query": 9,
133
+ "context": 36,
134
+ "retrieval": 45,
135
+ "judge_in": 0,
136
+ "judge_out": 0,
137
+ "judge_latency_ms": 0.0
138
+ },
139
+ "policy-custom-return": {
140
+ "query": 11,
141
+ "context": 43,
142
+ "retrieval": 54,
143
+ "judge_in": 0,
144
+ "judge_out": 0,
145
+ "judge_latency_ms": 0.0
146
+ },
147
+ "policy-40day-return": {
148
+ "query": 18,
149
+ "context": 44,
150
+ "retrieval": 62,
151
+ "judge_in": 0,
152
+ "judge_out": 0,
153
+ "judge_latency_ms": 0.0
154
+ },
155
+ "escalation-400gbp-full-refund": {
156
+ "query": 20,
157
+ "context": 38,
158
+ "retrieval": 58,
159
+ "judge_in": 0,
160
+ "judge_out": 0,
161
+ "judge_latency_ms": 0.0
162
+ },
163
+ "escalation-goodwill-20pct-tier": {
164
+ "query": 15,
165
+ "context": 29,
166
+ "retrieval": 44,
167
+ "judge_in": 0,
168
+ "judge_out": 0,
169
+ "judge_latency_ms": 0.0
170
+ },
171
+ "mina-vip-status": {
172
+ "query": 10,
173
+ "context": 75,
174
+ "retrieval": 85,
175
+ "judge_in": 0,
176
+ "judge_out": 0,
177
+ "judge_latency_ms": 0.0
178
+ },
179
+ "mina-preferences": {
180
+ "query": 11,
181
+ "context": 33,
182
+ "retrieval": 44,
183
+ "judge_in": 0,
184
+ "judge_out": 0,
185
+ "judge_latency_ms": 0.0
186
+ },
187
+ "priya-agent-guidance": {
188
+ "query": 16,
189
+ "context": 44,
190
+ "retrieval": 60,
191
+ "judge_in": 0,
192
+ "judge_out": 0,
193
+ "judge_latency_ms": 0.0
194
+ },
195
+ "rubric-mina-warranty-recommendation": {
196
+ "query": 18,
197
+ "context": 45,
198
+ "retrieval": 63,
199
+ "judge_in": 378,
200
+ "judge_out": 41,
201
+ "judge_latency_ms": 1245.5484809875488
202
+ },
203
+ "rubric-jareth-escalation": {
204
+ "query": 30,
205
+ "context": 40,
206
+ "retrieval": 70,
207
+ "judge_in": 398,
208
+ "judge_out": 37,
209
+ "judge_latency_ms": 1219.425511032343
210
+ },
211
+ "rubric-priya-delayed-custom": {
212
+ "query": 16,
213
+ "context": 49,
214
+ "retrieval": 65,
215
+ "judge_in": 360,
216
+ "judge_out": 31,
217
+ "judge_latency_ms": 577.4644939899445
218
+ },
219
+ "entity-mina-orders": {
220
+ "query": 10,
221
+ "context": 56,
222
+ "retrieval": 66,
223
+ "judge_in": 0,
224
+ "judge_out": 0,
225
+ "judge_latency_ms": 0.0
226
+ },
227
+ "entity-all-rmas": {
228
+ "query": 10,
229
+ "context": 43,
230
+ "retrieval": 53,
231
+ "judge_in": 0,
232
+ "judge_out": 0,
233
+ "judge_latency_ms": 0.0
234
+ }
235
+ },
236
+ "judge_tokens_in": 1136,
237
+ "judge_tokens_out": 109,
238
+ "judge_calls": 3,
239
+ "judge_mean_latency_ms": 1014.1461620032787
240
+ },
241
+ "cost_usd": {
242
+ "assumed_completion_tokens_per_task": 100,
243
+ "rates": {
244
+ "input_per_1k": 0.0025,
245
+ "output_per_1k": 0.01,
246
+ "model": "gpt-4o"
247
+ },
248
+ "retrieval_usd_in": 0.0028850000000000004,
249
+ "retrieval_usd_out": 0.02,
250
+ "retrieval_usd_total": 0.022885000000000003,
251
+ "naive_usd_total": 0.0820575,
252
+ "saved_usd": 0.0591725,
253
+ "saved_usd_per_1k_tasks": 2.9586250000000005
254
+ }
255
+ },
256
+ "task_results": [
257
+ {
258
+ "task_id": "order-mina-count",
259
+ "query": "How many orders has Mina Okafor placed?",
260
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nreporting-svc consumes OrderPaid for financial reports",
261
+ "hits": [
262
+ {
263
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
264
+ "score": 0.7348100455668961,
265
+ "source": "pentatonic-memory",
266
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
267
+ },
268
+ {
269
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
270
+ "score": 0.714597754640188,
271
+ "source": "pentatonic-memory",
272
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
273
+ },
274
+ {
275
+ "text": "reporting-svc consumes OrderPaid for financial reports",
276
+ "score": 0.7145555501636411,
277
+ "source": "pentatonic-memory",
278
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
279
+ }
280
+ ],
281
+ "correct": false,
282
+ "score": 0.0,
283
+ "grading_notes": "missing 1/1: ['2']",
284
+ "search_time_ms": 48.87774802045897,
285
+ "generation_time_ms": 0.0,
286
+ "tokens_in": 0,
287
+ "tokens_out": 0,
288
+ "retrieval_tokens": 55,
289
+ "query_tokens": 11,
290
+ "context_tokens": 44,
291
+ "judge_tokens_in": 0,
292
+ "judge_tokens_out": 0,
293
+ "judge_latency_ms": 0.0
294
+ },
295
+ {
296
+ "task_id": "order-mina-latest",
297
+ "query": "What was in Mina Okafor's most recent order?",
298
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nemail-svc consumes OrderPaid for the receipt email",
299
+ "hits": [
300
+ {
301
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
302
+ "score": 0.7523848879261325,
303
+ "source": "pentatonic-memory",
304
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
305
+ },
306
+ {
307
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
308
+ "score": 0.7290429542492324,
309
+ "source": "pentatonic-memory",
310
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
311
+ },
312
+ {
313
+ "text": "email-svc consumes OrderPaid for the receipt email",
314
+ "score": 0.726473237442234,
315
+ "source": "pentatonic-memory",
316
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
317
+ }
318
+ ],
319
+ "correct": false,
320
+ "score": 0.0,
321
+ "grading_notes": "missing 2/2: ['Luna', 'Loop']",
322
+ "search_time_ms": 31.01924099610187,
323
+ "generation_time_ms": 0.0,
324
+ "tokens_in": 0,
325
+ "tokens_out": 0,
326
+ "retrieval_tokens": 57,
327
+ "query_tokens": 13,
328
+ "context_tokens": 44,
329
+ "judge_tokens_in": 0,
330
+ "judge_tokens_out": 0,
331
+ "judge_latency_ms": 0.0
332
+ },
333
+ {
334
+ "task_id": "rma-mina-sleeve-reason",
335
+ "query": "Why did Mina open an RMA on order 2026-0142?",
336
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nemail-svc consumes OrderPaid for the receipt email",
337
+ "hits": [
338
+ {
339
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
340
+ "score": 0.7412810721806334,
341
+ "source": "pentatonic-memory",
342
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
343
+ },
344
+ {
345
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
346
+ "score": 0.738627617800018,
347
+ "source": "pentatonic-memory",
348
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
349
+ },
350
+ {
351
+ "text": "email-svc consumes OrderPaid for the receipt email",
352
+ "score": 0.7263630342430126,
353
+ "source": "pentatonic-memory",
354
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
355
+ }
356
+ ],
357
+ "correct": false,
358
+ "score": 0.0,
359
+ "grading_notes": "missing 2/2: ['wrong colour', 'Oat']",
360
+ "search_time_ms": 34.496553998906165,
361
+ "generation_time_ms": 0.0,
362
+ "tokens_in": 0,
363
+ "tokens_out": 0,
364
+ "retrieval_tokens": 60,
365
+ "query_tokens": 17,
366
+ "context_tokens": 43,
367
+ "judge_tokens_in": 0,
368
+ "judge_tokens_out": 0,
369
+ "judge_latency_ms": 0.0
370
+ },
371
+ {
372
+ "task_id": "rma-mina-lid-resolution",
373
+ "query": "How was Mina's Luna bottle lid complaint resolved?",
374
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
375
+ "hits": [
376
+ {
377
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
378
+ "score": 0.6924995919896444,
379
+ "source": "pentatonic-memory",
380
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
381
+ },
382
+ {
383
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
384
+ "score": 0.6881876638919722,
385
+ "source": "pentatonic-memory",
386
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
387
+ },
388
+ {
389
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
390
+ "score": 0.6774820029527524,
391
+ "source": "pentatonic-memory",
392
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
393
+ }
394
+ ],
395
+ "correct": false,
396
+ "score": 0.0,
397
+ "grading_notes": "missing 3/3: ['free', 'replacement', 'warranty']",
398
+ "search_time_ms": 30.18009799416177,
399
+ "generation_time_ms": 0.0,
400
+ "tokens_in": 0,
401
+ "tokens_out": 0,
402
+ "retrieval_tokens": 60,
403
+ "query_tokens": 11,
404
+ "context_tokens": 49,
405
+ "judge_tokens_in": 0,
406
+ "judge_tokens_out": 0,
407
+ "judge_latency_ms": 0.0
408
+ },
409
+ {
410
+ "task_id": "jareth-harness-bar-followup",
411
+ "query": "Did Jareth's kite harness work with his bar out of the box?",
412
+ "answer": "billing-svc API keys \u2014 jordan\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nsearch-svc API keys \u2014 alex",
413
+ "hits": [
414
+ {
415
+ "text": "billing-svc API keys \u2014 jordan",
416
+ "score": 0.712132003684325,
417
+ "source": "pentatonic-memory",
418
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
419
+ },
420
+ {
421
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
422
+ "score": 0.6997367511670954,
423
+ "source": "pentatonic-memory",
424
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
425
+ },
426
+ {
427
+ "text": "search-svc API keys \u2014 alex",
428
+ "score": 0.6991961951123548,
429
+ "source": "pentatonic-memory",
430
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
431
+ }
432
+ ],
433
+ "correct": false,
434
+ "score": 0.0,
435
+ "grading_notes": "missing 2/2: ['Duotone', 'adapter']",
436
+ "search_time_ms": 35.19590600626543,
437
+ "generation_time_ms": 0.0,
438
+ "tokens_in": 0,
439
+ "tokens_out": 0,
440
+ "retrieval_tokens": 50,
441
+ "query_tokens": 15,
442
+ "context_tokens": 35,
443
+ "judge_tokens_in": 0,
444
+ "judge_tokens_out": 0,
445
+ "judge_latency_ms": 0.0
446
+ },
447
+ {
448
+ "task_id": "jareth-second-order",
449
+ "query": "What did Jareth order after his initial harness purchase?",
450
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nemail-svc consumes OrderPaid for the receipt email",
451
+ "hits": [
452
+ {
453
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
454
+ "score": 0.7333830509419805,
455
+ "source": "pentatonic-memory",
456
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
457
+ },
458
+ {
459
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
460
+ "score": 0.732984482141125,
461
+ "source": "pentatonic-memory",
462
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
463
+ },
464
+ {
465
+ "text": "email-svc consumes OrderPaid for the receipt email",
466
+ "score": 0.719114942674962,
467
+ "source": "pentatonic-memory",
468
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
469
+ }
470
+ ],
471
+ "correct": false,
472
+ "score": 0.0,
473
+ "grading_notes": "missing 2/2: ['adapter', 'NMD-ADPT-DC']",
474
+ "search_time_ms": 27.49728801427409,
475
+ "generation_time_ms": 0.0,
476
+ "tokens_in": 0,
477
+ "tokens_out": 0,
478
+ "retrieval_tokens": 55,
479
+ "query_tokens": 11,
480
+ "context_tokens": 44,
481
+ "judge_tokens_in": 0,
482
+ "judge_tokens_out": 0,
483
+ "judge_latency_ms": 0.0
484
+ },
485
+ {
486
+ "task_id": "priya-custom-status",
487
+ "query": "Can Priya change the text on her custom tote?",
488
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling.new_invoice_pdf",
489
+ "hits": [
490
+ {
491
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
492
+ "score": 0.7055909766324263,
493
+ "source": "pentatonic-memory",
494
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
495
+ },
496
+ {
497
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
498
+ "score": 0.7040487646883715,
499
+ "source": "pentatonic-memory",
500
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
501
+ },
502
+ {
503
+ "text": "billing.new_invoice_pdf",
504
+ "score": 0.7011764727752321,
505
+ "source": "pentatonic-memory",
506
+ "doc_id": "mem_ca8bdac3-12ca-4bae-b5a2-fb9c2d069173"
507
+ }
508
+ ],
509
+ "correct": false,
510
+ "score": 0.0,
511
+ "grading_notes": "missing 2/2: [\"can't change\", 'production']",
512
+ "search_time_ms": 30.94983199844137,
513
+ "generation_time_ms": 0.0,
514
+ "tokens_in": 0,
515
+ "tokens_out": 0,
516
+ "retrieval_tokens": 48,
517
+ "query_tokens": 11,
518
+ "context_tokens": 37,
519
+ "judge_tokens_in": 0,
520
+ "judge_tokens_out": 0,
521
+ "judge_latency_ms": 0.0
522
+ },
523
+ {
524
+ "task_id": "priya-goodwill-offered",
525
+ "query": "What goodwill credit did Priya get offered?",
526
+ "answer": "billing-svc API keys \u2014 jordan\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nreporting-svc consumes OrderPaid for financial reports",
527
+ "hits": [
528
+ {
529
+ "text": "billing-svc API keys \u2014 jordan",
530
+ "score": 0.732138105000754,
531
+ "source": "pentatonic-memory",
532
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
533
+ },
534
+ {
535
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
536
+ "score": 0.7257386363316584,
537
+ "source": "pentatonic-memory",
538
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
539
+ },
540
+ {
541
+ "text": "reporting-svc consumes OrderPaid for financial reports",
542
+ "score": 0.7223475822522414,
543
+ "source": "pentatonic-memory",
544
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
545
+ }
546
+ ],
547
+ "correct": false,
548
+ "score": 0.0,
549
+ "grading_notes": "missing 1/1: ['15%']",
550
+ "search_time_ms": 33.91869302140549,
551
+ "generation_time_ms": 0.0,
552
+ "tokens_in": 0,
553
+ "tokens_out": 0,
554
+ "retrieval_tokens": 45,
555
+ "query_tokens": 9,
556
+ "context_tokens": 36,
557
+ "judge_tokens_in": 0,
558
+ "judge_tokens_out": 0,
559
+ "judge_latency_ms": 0.0
560
+ },
561
+ {
562
+ "task_id": "policy-custom-return",
563
+ "query": "Can a custom-printed tote be returned for refund?",
564
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nemail-svc consumes OrderPaid for the receipt email",
565
+ "hits": [
566
+ {
567
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
568
+ "score": 0.7081009500798201,
569
+ "source": "pentatonic-memory",
570
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
571
+ },
572
+ {
573
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
574
+ "score": 0.7004294040289488,
575
+ "source": "pentatonic-memory",
576
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
577
+ },
578
+ {
579
+ "text": "email-svc consumes OrderPaid for the receipt email",
580
+ "score": 0.698938157952978,
581
+ "source": "pentatonic-memory",
582
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
583
+ }
584
+ ],
585
+ "correct": false,
586
+ "score": 0.0,
587
+ "grading_notes": "missing 1/1: ['final sale']",
588
+ "search_time_ms": 27.65293698757887,
589
+ "generation_time_ms": 0.0,
590
+ "tokens_in": 0,
591
+ "tokens_out": 0,
592
+ "retrieval_tokens": 54,
593
+ "query_tokens": 11,
594
+ "context_tokens": 43,
595
+ "judge_tokens_in": 0,
596
+ "judge_tokens_out": 0,
597
+ "judge_latency_ms": 0.0
598
+ },
599
+ {
600
+ "task_id": "policy-40day-return",
601
+ "query": "If an unused item was delivered 40 days ago, can I get a full refund?",
602
+ "answer": "email-svc consumes OrderPaid for the receipt email\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
603
+ "hits": [
604
+ {
605
+ "text": "email-svc consumes OrderPaid for the receipt email",
606
+ "score": 0.7123921402669716,
607
+ "source": "pentatonic-memory",
608
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
609
+ },
610
+ {
611
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
612
+ "score": 0.7006719406640058,
613
+ "source": "pentatonic-memory",
614
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
615
+ },
616
+ {
617
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
618
+ "score": 0.6996661986676126,
619
+ "source": "pentatonic-memory",
620
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
621
+ }
622
+ ],
623
+ "correct": false,
624
+ "score": 0.0,
625
+ "grading_notes": "missing 1/1: ['store credit']",
626
+ "search_time_ms": 32.93573998962529,
627
+ "generation_time_ms": 0.0,
628
+ "tokens_in": 0,
629
+ "tokens_out": 0,
630
+ "retrieval_tokens": 62,
631
+ "query_tokens": 18,
632
+ "context_tokens": 44,
633
+ "judge_tokens_in": 0,
634
+ "judge_tokens_out": 0,
635
+ "judge_latency_ms": 0.0
636
+ },
637
+ {
638
+ "task_id": "escalation-400gbp-full-refund",
639
+ "query": "A Tier 1 agent wants to issue a full refund on a \u00a3400 order \u2014 can they?",
640
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nemail-svc consumes OrderPaid for the receipt email\n---\nreporting-svc consumes OrderPaid for financial reports",
641
+ "hits": [
642
+ {
643
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
644
+ "score": 0.7415371376891134,
645
+ "source": "pentatonic-memory",
646
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
647
+ },
648
+ {
649
+ "text": "email-svc consumes OrderPaid for the receipt email",
650
+ "score": 0.7331214412862811,
651
+ "source": "pentatonic-memory",
652
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
653
+ },
654
+ {
655
+ "text": "reporting-svc consumes OrderPaid for financial reports",
656
+ "score": 0.7286868615136332,
657
+ "source": "pentatonic-memory",
658
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
659
+ }
660
+ ],
661
+ "correct": false,
662
+ "score": 0.0,
663
+ "grading_notes": "missing 1/1: ['Tier 3']",
664
+ "search_time_ms": 30.9724890103098,
665
+ "generation_time_ms": 0.0,
666
+ "tokens_in": 0,
667
+ "tokens_out": 0,
668
+ "retrieval_tokens": 58,
669
+ "query_tokens": 20,
670
+ "context_tokens": 38,
671
+ "judge_tokens_in": 0,
672
+ "judge_tokens_out": 0,
673
+ "judge_latency_ms": 0.0
674
+ },
675
+ {
676
+ "task_id": "escalation-goodwill-20pct-tier",
677
+ "query": "What's the minimum agent tier needed to offer 20% goodwill credit?",
678
+ "answer": "billing-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex\n---\nreporting-svc consumes OrderPaid for financial reports",
679
+ "hits": [
680
+ {
681
+ "text": "billing-svc API keys \u2014 jordan",
682
+ "score": 0.7133219905624273,
683
+ "source": "pentatonic-memory",
684
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
685
+ },
686
+ {
687
+ "text": "search-svc API keys \u2014 alex",
688
+ "score": 0.7099292551016788,
689
+ "source": "pentatonic-memory",
690
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
691
+ },
692
+ {
693
+ "text": "reporting-svc consumes OrderPaid for financial reports",
694
+ "score": 0.7080727802931086,
695
+ "source": "pentatonic-memory",
696
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
697
+ }
698
+ ],
699
+ "correct": false,
700
+ "score": 0.0,
701
+ "grading_notes": "missing 1/1: ['Tier 2']",
702
+ "search_time_ms": 28.71934699942358,
703
+ "generation_time_ms": 0.0,
704
+ "tokens_in": 0,
705
+ "tokens_out": 0,
706
+ "retrieval_tokens": 44,
707
+ "query_tokens": 15,
708
+ "context_tokens": 29,
709
+ "judge_tokens_in": 0,
710
+ "judge_tokens_out": 0,
711
+ "judge_latency_ms": 0.0
712
+ },
713
+ {
714
+ "task_id": "mina-vip-status",
715
+ "query": "Is Mina Okafor a VIP customer?",
716
+ "answer": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
717
+ "hits": [
718
+ {
719
+ "text": "Customer note \u2014 Mina Okafor: VIP tier (5+ orders, \u00a3350+ lifetime spend). Preferences: matte finishes, neutrals over brights. Has flagged interest in lifetime-warranty items.",
720
+ "score": 0.7720706800424769,
721
+ "source": "pentatonic-memory",
722
+ "doc_id": "customer-note-mina-vip"
723
+ },
724
+ {
725
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
726
+ "score": 0.7348438938902003,
727
+ "source": "pentatonic-memory",
728
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
729
+ },
730
+ {
731
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
732
+ "score": 0.7235270502207439,
733
+ "source": "pentatonic-memory",
734
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
735
+ }
736
+ ],
737
+ "correct": true,
738
+ "score": 1.0,
739
+ "grading_notes": "all substrings matched",
740
+ "search_time_ms": 27.80705102486536,
741
+ "generation_time_ms": 0.0,
742
+ "tokens_in": 0,
743
+ "tokens_out": 0,
744
+ "retrieval_tokens": 85,
745
+ "query_tokens": 10,
746
+ "context_tokens": 75,
747
+ "judge_tokens_in": 0,
748
+ "judge_tokens_out": 0,
749
+ "judge_latency_ms": 0.0
750
+ },
751
+ {
752
+ "task_id": "mina-preferences",
753
+ "query": "What are Mina Okafor's stated preferences?",
754
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nbilling-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex",
755
+ "hits": [
756
+ {
757
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
758
+ "score": 0.7055044745838214,
759
+ "source": "pentatonic-memory",
760
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
761
+ },
762
+ {
763
+ "text": "billing-svc API keys \u2014 jordan",
764
+ "score": 0.7001017821797055,
765
+ "source": "pentatonic-memory",
766
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
767
+ },
768
+ {
769
+ "text": "search-svc API keys \u2014 alex",
770
+ "score": 0.6988370568106516,
771
+ "source": "pentatonic-memory",
772
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
773
+ }
774
+ ],
775
+ "correct": false,
776
+ "score": 0.0,
777
+ "grading_notes": "missing 2/2: ['matte', 'neutrals']",
778
+ "search_time_ms": 25.142928992863744,
779
+ "generation_time_ms": 0.0,
780
+ "tokens_in": 0,
781
+ "tokens_out": 0,
782
+ "retrieval_tokens": 44,
783
+ "query_tokens": 11,
784
+ "context_tokens": 33,
785
+ "judge_tokens_in": 0,
786
+ "judge_tokens_out": 0,
787
+ "judge_latency_ms": 0.0
788
+ },
789
+ {
790
+ "task_id": "priya-agent-guidance",
791
+ "query": "When handling Priya's orders, what should the agent do before production starts?",
792
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nreporting-svc consumes OrderPaid for financial reports",
793
+ "hits": [
794
+ {
795
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
796
+ "score": 0.7256843616193557,
797
+ "source": "pentatonic-memory",
798
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
799
+ },
800
+ {
801
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
802
+ "score": 0.7167362886072972,
803
+ "source": "pentatonic-memory",
804
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
805
+ },
806
+ {
807
+ "text": "reporting-svc consumes OrderPaid for financial reports",
808
+ "score": 0.7163327580212099,
809
+ "source": "pentatonic-memory",
810
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
811
+ }
812
+ ],
813
+ "correct": false,
814
+ "score": 0.0,
815
+ "grading_notes": "missing 2/2: ['confirm', 'print text']",
816
+ "search_time_ms": 30.428772006416693,
817
+ "generation_time_ms": 0.0,
818
+ "tokens_in": 0,
819
+ "tokens_out": 0,
820
+ "retrieval_tokens": 60,
821
+ "query_tokens": 16,
822
+ "context_tokens": 44,
823
+ "judge_tokens_in": 0,
824
+ "judge_tokens_out": 0,
825
+ "judge_latency_ms": 0.0
826
+ },
827
+ {
828
+ "task_id": "rubric-mina-warranty-recommendation",
829
+ "query": "Mina is asking for product recommendations. Based on her history, what should we suggest?",
830
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc",
831
+ "hits": [
832
+ {
833
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
834
+ "score": 0.6870628699890775,
835
+ "source": "pentatonic-memory",
836
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
837
+ },
838
+ {
839
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
840
+ "score": 0.6841676093807408,
841
+ "source": "pentatonic-memory",
842
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
843
+ },
844
+ {
845
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
846
+ "score": 0.6829611935772381,
847
+ "source": "pentatonic-memory",
848
+ "doc_id": "mem_76c60bf6-9327-4cd1-9f56-234d229e8c8a"
849
+ }
850
+ ],
851
+ "correct": false,
852
+ "score": 0.0,
853
+ "grading_notes": "judge=0.00 \u2014 The answer does not reference Mina's VIP status or her stated interest in lifetime-warranty items, instead discussing or",
854
+ "search_time_ms": 38.33310300251469,
855
+ "generation_time_ms": 0.0,
856
+ "tokens_in": 0,
857
+ "tokens_out": 0,
858
+ "retrieval_tokens": 63,
859
+ "query_tokens": 18,
860
+ "context_tokens": 45,
861
+ "judge_tokens_in": 378,
862
+ "judge_tokens_out": 41,
863
+ "judge_latency_ms": 1245.5484809875488
864
+ },
865
+ {
866
+ "task_id": "rubric-jareth-escalation",
867
+ "query": "Jareth wants to return his kite harness for a full refund, claiming it 'didn't work' with his bar \u2014 how should we handle this?",
868
+ "answer": "billing-svc API keys \u2014 jordan\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
869
+ "hits": [
870
+ {
871
+ "text": "billing-svc API keys \u2014 jordan",
872
+ "score": 0.7109743445335438,
873
+ "source": "pentatonic-memory",
874
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
875
+ },
876
+ {
877
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
878
+ "score": 0.707515245905571,
879
+ "source": "pentatonic-memory",
880
+ "doc_id": "mem_76c60bf6-9327-4cd1-9f56-234d229e8c8a"
881
+ },
882
+ {
883
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
884
+ "score": 0.7024184805852562,
885
+ "source": "pentatonic-memory",
886
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
887
+ }
888
+ ],
889
+ "correct": false,
890
+ "score": 0.0,
891
+ "grading_notes": "judge=0.00 \u2014 The answer does not address the customer's claim or provide any context about Jareth's purchase history with the adapter",
892
+ "search_time_ms": 41.0330820013769,
893
+ "generation_time_ms": 0.0,
894
+ "tokens_in": 0,
895
+ "tokens_out": 0,
896
+ "retrieval_tokens": 70,
897
+ "query_tokens": 30,
898
+ "context_tokens": 40,
899
+ "judge_tokens_in": 398,
900
+ "judge_tokens_out": 37,
901
+ "judge_latency_ms": 1219.425511032343
902
+ },
903
+ {
904
+ "task_id": "rubric-priya-delayed-custom",
905
+ "query": "Priya is frustrated about her custom tote delay. What's the best response?",
906
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
907
+ "hits": [
908
+ {
909
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
910
+ "score": 0.7042906586266301,
911
+ "source": "pentatonic-memory",
912
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
913
+ },
914
+ {
915
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
916
+ "score": 0.6989621203762365,
917
+ "source": "pentatonic-memory",
918
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
919
+ },
920
+ {
921
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
922
+ "score": 0.6970906609262577,
923
+ "source": "pentatonic-memory",
924
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
925
+ }
926
+ ],
927
+ "correct": false,
928
+ "score": 0.0,
929
+ "grading_notes": "judge=0.00 \u2014 The answer does not acknowledge the custom item delay or any goodwill received by Priya.",
930
+ "search_time_ms": 39.566906983964145,
931
+ "generation_time_ms": 0.0,
932
+ "tokens_in": 0,
933
+ "tokens_out": 0,
934
+ "retrieval_tokens": 65,
935
+ "query_tokens": 16,
936
+ "context_tokens": 49,
937
+ "judge_tokens_in": 360,
938
+ "judge_tokens_out": 31,
939
+ "judge_latency_ms": 577.4644939899445
940
+ },
941
+ {
942
+ "task_id": "entity-mina-orders",
943
+ "query": "List Mina Okafor's order IDs.",
944
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nOrder #2026-0187 placed 2026-03-28 by customer Mina Okafor.\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
945
+ "hits": [
946
+ {
947
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
948
+ "score": 0.7574458786135378,
949
+ "source": "pentatonic-memory",
950
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
951
+ },
952
+ {
953
+ "text": "Order #2026-0187 placed 2026-03-28 by customer Mina Okafor.",
954
+ "score": 0.7464184718259437,
955
+ "source": "pentatonic-memory",
956
+ "doc_id": "order-2026-0187-mina"
957
+ },
958
+ {
959
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
960
+ "score": 0.7412049112998839,
961
+ "source": "pentatonic-memory",
962
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
963
+ }
964
+ ],
965
+ "correct": false,
966
+ "score": 0.0,
967
+ "grading_notes": "no expected_substrings set",
968
+ "search_time_ms": 36.482718016486615,
969
+ "generation_time_ms": 0.0,
970
+ "tokens_in": 0,
971
+ "tokens_out": 0,
972
+ "retrieval_tokens": 66,
973
+ "query_tokens": 10,
974
+ "context_tokens": 56,
975
+ "judge_tokens_in": 0,
976
+ "judge_tokens_out": 0,
977
+ "judge_latency_ms": 0.0
978
+ },
979
+ {
980
+ "task_id": "entity-all-rmas",
981
+ "query": "List all open and closed RMA case IDs.",
982
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
983
+ "hits": [
984
+ {
985
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
986
+ "score": 0.7195884243776058,
987
+ "source": "pentatonic-memory",
988
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
989
+ },
990
+ {
991
+ "text": "kubectl rollout restart deployment/billing-svc",
992
+ "score": 0.715077341559819,
993
+ "source": "pentatonic-memory",
994
+ "doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
995
+ },
996
+ {
997
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
998
+ "score": 0.7073335588902188,
999
+ "source": "pentatonic-memory",
1000
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
1001
+ }
1002
+ ],
1003
+ "correct": false,
1004
+ "score": 0.0,
1005
+ "grading_notes": "no expected_substrings set",
1006
+ "search_time_ms": 34.67812400776893,
1007
+ "generation_time_ms": 0.0,
1008
+ "tokens_in": 0,
1009
+ "tokens_out": 0,
1010
+ "retrieval_tokens": 53,
1011
+ "query_tokens": 10,
1012
+ "context_tokens": 43,
1013
+ "judge_tokens_in": 0,
1014
+ "judge_tokens_out": 0,
1015
+ "judge_latency_ms": 0.0
1016
+ }
1017
+ ]
1018
+ }