@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,1038 @@
1
+ {
2
+ "bench": "marketplace-ops",
3
+ "stack": "pentatonic-memory-latest",
4
+ "n_tasks": 20,
5
+ "n_correct": 3,
6
+ "accuracy": 0.15,
7
+ "mean_score": 0.225,
8
+ "p50_search_ms": 35.84986749046948,
9
+ "p95_search_ms": 40.03294614230981,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "factoid": {
15
+ "n": 12,
16
+ "mean_score": 0.16666666666666666,
17
+ "accuracy": 0.16666666666666666
18
+ },
19
+ "event-log": {
20
+ "n": 7,
21
+ "mean_score": 0.21428571428571427,
22
+ "accuracy": 0.14285714285714285
23
+ },
24
+ "multi-fact": {
25
+ "n": 1,
26
+ "mean_score": 0.0,
27
+ "accuracy": 0.0
28
+ },
29
+ "agent-commerce": {
30
+ "n": 6,
31
+ "mean_score": 0.16666666666666666,
32
+ "accuracy": 0.0
33
+ },
34
+ "math": {
35
+ "n": 1,
36
+ "mean_score": 0.0,
37
+ "accuracy": 0.0
38
+ },
39
+ "seller": {
40
+ "n": 5,
41
+ "mean_score": 0.5,
42
+ "accuracy": 0.4
43
+ },
44
+ "buyer": {
45
+ "n": 3,
46
+ "mean_score": 0.16666666666666666,
47
+ "accuracy": 0.0
48
+ },
49
+ "multi-doc": {
50
+ "n": 1,
51
+ "mean_score": 0.0,
52
+ "accuracy": 0.0
53
+ },
54
+ "policy": {
55
+ "n": 3,
56
+ "mean_score": 0.0,
57
+ "accuracy": 0.0
58
+ },
59
+ "rubric": {
60
+ "n": 3,
61
+ "mean_score": 0.5,
62
+ "accuracy": 0.0
63
+ },
64
+ "multi-hop": {
65
+ "n": 2,
66
+ "mean_score": 0.5,
67
+ "accuracy": 0.0
68
+ },
69
+ "entity": {
70
+ "n": 2,
71
+ "mean_score": 0.5,
72
+ "accuracy": 0.5
73
+ },
74
+ "negative": {
75
+ "n": 1,
76
+ "mean_score": 1.0,
77
+ "accuracy": 1.0
78
+ }
79
+ },
80
+ "extra": {
81
+ "ingest_ms": 52153.452629980166,
82
+ "grading": "substring",
83
+ "limit": 3,
84
+ "tokens": {
85
+ "corpus_tokens": 1388,
86
+ "query_tokens": 240,
87
+ "context_tokens": 892,
88
+ "retrieval_tokens": 1132,
89
+ "naive_tokens": 28000,
90
+ "saved_tokens": 26868,
91
+ "reduction_pct": 0.9595714285714285,
92
+ "mean_retrieval_tokens_per_task": 56.6,
93
+ "tokenizer": "cl100k_base",
94
+ "per_task": {
95
+ "thing-lst-9001-sold-price": {
96
+ "query": 13,
97
+ "context": 43,
98
+ "retrieval": 56,
99
+ "judge_in": 0,
100
+ "judge_out": 0,
101
+ "judge_latency_ms": 0.0
102
+ },
103
+ "thing-lst-9001-buyer": {
104
+ "query": 8,
105
+ "context": 34,
106
+ "retrieval": 42,
107
+ "judge_in": 0,
108
+ "judge_out": 0,
109
+ "judge_latency_ms": 0.0
110
+ },
111
+ "thing-lst-9001-first-offer": {
112
+ "query": 17,
113
+ "context": 44,
114
+ "retrieval": 61,
115
+ "judge_in": 0,
116
+ "judge_out": 0,
117
+ "judge_latency_ms": 0.0
118
+ },
119
+ "thing-lst-9014-flagged-reason": {
120
+ "query": 11,
121
+ "context": 69,
122
+ "retrieval": 80,
123
+ "judge_in": 0,
124
+ "judge_out": 0,
125
+ "judge_latency_ms": 0.0
126
+ },
127
+ "thing-lst-9014-return-reason": {
128
+ "query": 11,
129
+ "context": 55,
130
+ "retrieval": 66,
131
+ "judge_in": 0,
132
+ "judge_out": 0,
133
+ "judge_latency_ms": 0.0
134
+ },
135
+ "thing-lst-9030-agent-offer": {
136
+ "query": 15,
137
+ "context": 33,
138
+ "retrieval": 48,
139
+ "judge_in": 0,
140
+ "judge_out": 0,
141
+ "judge_latency_ms": 0.0
142
+ },
143
+ "thing-lst-9030-agent-discount": {
144
+ "query": 15,
145
+ "context": 44,
146
+ "retrieval": 59,
147
+ "judge_in": 0,
148
+ "judge_out": 0,
149
+ "judge_latency_ms": 0.0
150
+ },
151
+ "seller-mariposa-rating": {
152
+ "query": 11,
153
+ "context": 82,
154
+ "retrieval": 93,
155
+ "judge_in": 0,
156
+ "judge_out": 0,
157
+ "judge_latency_ms": 0.0
158
+ },
159
+ "seller-rix-review-status": {
160
+ "query": 11,
161
+ "context": 41,
162
+ "retrieval": 52,
163
+ "judge_in": 0,
164
+ "judge_out": 0,
165
+ "judge_latency_ms": 0.0
166
+ },
167
+ "seller-velocipede-agent-friendly": {
168
+ "query": 14,
169
+ "context": 44,
170
+ "retrieval": 58,
171
+ "judge_in": 0,
172
+ "judge_out": 0,
173
+ "judge_latency_ms": 0.0
174
+ },
175
+ "buyer-sera-specialism": {
176
+ "query": 10,
177
+ "context": 41,
178
+ "retrieval": 51,
179
+ "judge_in": 0,
180
+ "judge_out": 0,
181
+ "judge_latency_ms": 0.0
182
+ },
183
+ "buyer-ariadne-disputes": {
184
+ "query": 11,
185
+ "context": 35,
186
+ "retrieval": 46,
187
+ "judge_in": 0,
188
+ "judge_out": 0,
189
+ "judge_latency_ms": 0.0
190
+ },
191
+ "policy-duplicate-trigger": {
192
+ "query": 12,
193
+ "context": 45,
194
+ "retrieval": 57,
195
+ "judge_in": 0,
196
+ "judge_out": 0,
197
+ "judge_latency_ms": 0.0
198
+ },
199
+ "policy-agent-opt-out": {
200
+ "query": 9,
201
+ "context": 38,
202
+ "retrieval": 47,
203
+ "judge_in": 0,
204
+ "judge_out": 0,
205
+ "judge_latency_ms": 0.0
206
+ },
207
+ "policy-enhanced-review-lifted": {
208
+ "query": 12,
209
+ "context": 45,
210
+ "retrieval": 57,
211
+ "judge_in": 0,
212
+ "judge_out": 0,
213
+ "judge_latency_ms": 0.0
214
+ },
215
+ "rubric-rix-buy-decision": {
216
+ "query": 17,
217
+ "context": 38,
218
+ "retrieval": 55,
219
+ "judge_in": 365,
220
+ "judge_out": 46,
221
+ "judge_latency_ms": 900.4068410098553
222
+ },
223
+ "rubric-lst-9014-full-story": {
224
+ "query": 16,
225
+ "context": 35,
226
+ "retrieval": 51,
227
+ "judge_in": 371,
228
+ "judge_out": 64,
229
+ "judge_latency_ms": 1144.362673997879
230
+ },
231
+ "rubric-agent-commerce-thora": {
232
+ "query": 8,
233
+ "context": 36,
234
+ "retrieval": 44,
235
+ "judge_in": 333,
236
+ "judge_out": 35,
237
+ "judge_latency_ms": 668.5855540037155
238
+ },
239
+ "entity-all-sold-things": {
240
+ "query": 9,
241
+ "context": 46,
242
+ "retrieval": 55,
243
+ "judge_in": 0,
244
+ "judge_out": 0,
245
+ "judge_latency_ms": 0.0
246
+ },
247
+ "entity-sellers-with-disputes": {
248
+ "query": 10,
249
+ "context": 44,
250
+ "retrieval": 54,
251
+ "judge_in": 0,
252
+ "judge_out": 0,
253
+ "judge_latency_ms": 0.0
254
+ }
255
+ },
256
+ "judge_tokens_in": 1069,
257
+ "judge_tokens_out": 145,
258
+ "judge_calls": 3,
259
+ "judge_mean_latency_ms": 904.4516896704832
260
+ },
261
+ "cost_usd": {
262
+ "assumed_completion_tokens_per_task": 100,
263
+ "rates": {
264
+ "input_per_1k": 0.0025,
265
+ "output_per_1k": 0.01,
266
+ "model": "gpt-4o"
267
+ },
268
+ "retrieval_usd_in": 0.00283,
269
+ "retrieval_usd_out": 0.02,
270
+ "retrieval_usd_total": 0.02283,
271
+ "naive_usd_total": 0.09000000000000001,
272
+ "saved_usd": 0.06717000000000001,
273
+ "saved_usd_per_1k_tasks": 3.3585000000000003
274
+ }
275
+ },
276
+ "task_results": [
277
+ {
278
+ "task_id": "thing-lst-9001-sold-price",
279
+ "query": "What was the final sale price of LST-9001?",
280
+ "answer": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling-svc API keys \u2014 jordan",
281
+ "hits": [
282
+ {
283
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
284
+ "score": 0.7006427157763535,
285
+ "source": "pentatonic-memory",
286
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
287
+ },
288
+ {
289
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
290
+ "score": 0.7003682434332478,
291
+ "source": "pentatonic-memory",
292
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
293
+ },
294
+ {
295
+ "text": "billing-svc API keys \u2014 jordan",
296
+ "score": 0.6949448571487986,
297
+ "source": "pentatonic-memory",
298
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
299
+ }
300
+ ],
301
+ "correct": false,
302
+ "score": 0.0,
303
+ "grading_notes": "missing 1/1: ['2,250']",
304
+ "search_time_ms": 40.05719599081203,
305
+ "generation_time_ms": 0.0,
306
+ "tokens_in": 0,
307
+ "tokens_out": 0,
308
+ "retrieval_tokens": 56,
309
+ "query_tokens": 13,
310
+ "context_tokens": 43,
311
+ "judge_tokens_in": 0,
312
+ "judge_tokens_out": 0,
313
+ "judge_latency_ms": 0.0
314
+ },
315
+ {
316
+ "task_id": "thing-lst-9001-buyer",
317
+ "query": "Who bought LST-9001?",
318
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nbilling-svc API keys \u2014 jordan\n---\nsearch-svc API keys \u2014 alex",
319
+ "hits": [
320
+ {
321
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
322
+ "score": 0.7156553520106418,
323
+ "source": "pentatonic-memory",
324
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
325
+ },
326
+ {
327
+ "text": "billing-svc API keys \u2014 jordan",
328
+ "score": 0.7074979698500966,
329
+ "source": "pentatonic-memory",
330
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
331
+ },
332
+ {
333
+ "text": "search-svc API keys \u2014 alex",
334
+ "score": 0.7062652156890311,
335
+ "source": "pentatonic-memory",
336
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
337
+ }
338
+ ],
339
+ "correct": false,
340
+ "score": 0.0,
341
+ "grading_notes": "missing 1/1: ['sera-interiors']",
342
+ "search_time_ms": 33.638543012784794,
343
+ "generation_time_ms": 0.0,
344
+ "tokens_in": 0,
345
+ "tokens_out": 0,
346
+ "retrieval_tokens": 42,
347
+ "query_tokens": 8,
348
+ "context_tokens": 34,
349
+ "judge_tokens_in": 0,
350
+ "judge_tokens_out": 0,
351
+ "judge_latency_ms": 0.0
352
+ },
353
+ {
354
+ "task_id": "thing-lst-9001-first-offer",
355
+ "query": "What was the first offer received on LST-9001 and was it accepted?",
356
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nreporting-svc consumes OrderPaid for financial reports",
357
+ "hits": [
358
+ {
359
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
360
+ "score": 0.723592167411367,
361
+ "source": "pentatonic-memory",
362
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
363
+ },
364
+ {
365
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
366
+ "score": 0.7212647702122161,
367
+ "source": "pentatonic-memory",
368
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
369
+ },
370
+ {
371
+ "text": "reporting-svc consumes OrderPaid for financial reports",
372
+ "score": 0.7102858699669232,
373
+ "source": "pentatonic-memory",
374
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
375
+ }
376
+ ],
377
+ "correct": false,
378
+ "score": 0.0,
379
+ "grading_notes": "missing 3/3: ['2,000', 'hendrik', 'declined']",
380
+ "search_time_ms": 38.42421999434009,
381
+ "generation_time_ms": 0.0,
382
+ "tokens_in": 0,
383
+ "tokens_out": 0,
384
+ "retrieval_tokens": 61,
385
+ "query_tokens": 17,
386
+ "context_tokens": 44,
387
+ "judge_tokens_in": 0,
388
+ "judge_tokens_out": 0,
389
+ "judge_latency_ms": 0.0
390
+ },
391
+ {
392
+ "task_id": "thing-lst-9014-flagged-reason",
393
+ "query": "Why was LST-9014 flagged as duplicate?",
394
+ "answer": "2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
395
+ "hits": [
396
+ {
397
+ "text": "2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)",
398
+ "score": 0.9016563500429841,
399
+ "source": "pentatonic-memory",
400
+ "doc_id": "tes-events-lst-9014"
401
+ },
402
+ {
403
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
404
+ "score": 0.7403346776960328,
405
+ "source": "pentatonic-memory",
406
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
407
+ },
408
+ {
409
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
410
+ "score": 0.7197999858934553,
411
+ "source": "pentatonic-memory",
412
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
413
+ }
414
+ ],
415
+ "correct": true,
416
+ "score": 1.0,
417
+ "grading_notes": "all substrings matched",
418
+ "search_time_ms": 35.562208999181166,
419
+ "generation_time_ms": 0.0,
420
+ "tokens_in": 0,
421
+ "tokens_out": 0,
422
+ "retrieval_tokens": 80,
423
+ "query_tokens": 11,
424
+ "context_tokens": 69,
425
+ "judge_tokens_in": 0,
426
+ "judge_tokens_out": 0,
427
+ "judge_latency_ms": 0.0
428
+ },
429
+ {
430
+ "task_id": "thing-lst-9014-return-reason",
431
+ "query": "Why did the buyer return LST-9014?",
432
+ "answer": "One item is currently open for return (LST-9014 battery-health dispute, closed in buyer's favour).\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
433
+ "hits": [
434
+ {
435
+ "text": "One item is currently open for return (LST-9014 battery-health dispute, closed in buyer's favour).",
436
+ "score": 0.8220346607727136,
437
+ "source": "pentatonic-memory",
438
+ "doc_id": "buyer-ariadne-profile"
439
+ },
440
+ {
441
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
442
+ "score": 0.7207021213487643,
443
+ "source": "pentatonic-memory",
444
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
445
+ },
446
+ {
447
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
448
+ "score": 0.7164366965811265,
449
+ "source": "pentatonic-memory",
450
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
451
+ }
452
+ ],
453
+ "correct": false,
454
+ "score": 0.0,
455
+ "grading_notes": "missing 1/1: ['battery health']",
456
+ "search_time_ms": 31.3100729836151,
457
+ "generation_time_ms": 0.0,
458
+ "tokens_in": 0,
459
+ "tokens_out": 0,
460
+ "retrieval_tokens": 66,
461
+ "query_tokens": 11,
462
+ "context_tokens": 55,
463
+ "judge_tokens_in": 0,
464
+ "judge_tokens_out": 0,
465
+ "judge_latency_ms": 0.0
466
+ },
467
+ {
468
+ "task_id": "thing-lst-9030-agent-offer",
469
+ "query": "Which agent made the offer on LST-9030 and for whom?",
470
+ "answer": "search-svc API keys \u2014 alex\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nbilling-svc API keys \u2014 jordan",
471
+ "hits": [
472
+ {
473
+ "text": "search-svc API keys \u2014 alex",
474
+ "score": 0.7186173953170144,
475
+ "source": "pentatonic-memory",
476
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
477
+ },
478
+ {
479
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
480
+ "score": 0.7175099174878017,
481
+ "source": "pentatonic-memory",
482
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
483
+ },
484
+ {
485
+ "text": "billing-svc API keys \u2014 jordan",
486
+ "score": 0.7107812081575241,
487
+ "source": "pentatonic-memory",
488
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
489
+ }
490
+ ],
491
+ "correct": false,
492
+ "score": 0.0,
493
+ "grading_notes": "missing 2/2: ['buyer-agent-7', 'thora']",
494
+ "search_time_ms": 38.240394002059475,
495
+ "generation_time_ms": 0.0,
496
+ "tokens_in": 0,
497
+ "tokens_out": 0,
498
+ "retrieval_tokens": 48,
499
+ "query_tokens": 15,
500
+ "context_tokens": 33,
501
+ "judge_tokens_in": 0,
502
+ "judge_tokens_out": 0,
503
+ "judge_latency_ms": 0.0
504
+ },
505
+ {
506
+ "task_id": "thing-lst-9030-agent-discount",
507
+ "query": "What percentage discount did the shopping agent offer on LST-9030?",
508
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nemail-svc consumes OrderPaid for the receipt email\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
509
+ "hits": [
510
+ {
511
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
512
+ "score": 0.7148984022641006,
513
+ "source": "pentatonic-memory",
514
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
515
+ },
516
+ {
517
+ "text": "email-svc consumes OrderPaid for the receipt email",
518
+ "score": 0.7113749863793137,
519
+ "source": "pentatonic-memory",
520
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
521
+ },
522
+ {
523
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
524
+ "score": 0.7055279575981932,
525
+ "source": "pentatonic-memory",
526
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
527
+ }
528
+ ],
529
+ "correct": false,
530
+ "score": 0.0,
531
+ "grading_notes": "missing 2/2: ['380', '420']",
532
+ "search_time_ms": 39.22571599832736,
533
+ "generation_time_ms": 0.0,
534
+ "tokens_in": 0,
535
+ "tokens_out": 0,
536
+ "retrieval_tokens": 59,
537
+ "query_tokens": 15,
538
+ "context_tokens": 44,
539
+ "judge_tokens_in": 0,
540
+ "judge_tokens_out": 0,
541
+ "judge_latency_ms": 0.0
542
+ },
543
+ {
544
+ "task_id": "seller-mariposa-rating",
545
+ "query": "What's @mariposa's average star rating?",
546
+ "answer": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.\n---\nsearch-svc API keys \u2014 alex\n---\nbilling-svc API keys \u2014 jordan",
547
+ "hits": [
548
+ {
549
+ "text": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.",
550
+ "score": 0.7209147715310881,
551
+ "source": "pentatonic-memory",
552
+ "doc_id": "seller-mariposa-profile"
553
+ },
554
+ {
555
+ "text": "search-svc API keys \u2014 alex",
556
+ "score": 0.7181170622157985,
557
+ "source": "pentatonic-memory",
558
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
559
+ },
560
+ {
561
+ "text": "billing-svc API keys \u2014 jordan",
562
+ "score": 0.7163751178936137,
563
+ "source": "pentatonic-memory",
564
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
565
+ }
566
+ ],
567
+ "correct": true,
568
+ "score": 1.0,
569
+ "grading_notes": "all substrings matched",
570
+ "search_time_ms": 35.9132039884571,
571
+ "generation_time_ms": 0.0,
572
+ "tokens_in": 0,
573
+ "tokens_out": 0,
574
+ "retrieval_tokens": 93,
575
+ "query_tokens": 11,
576
+ "context_tokens": 82,
577
+ "judge_tokens_in": 0,
578
+ "judge_tokens_out": 0,
579
+ "judge_latency_ms": 0.0
580
+ },
581
+ {
582
+ "task_id": "seller-rix-review-status",
583
+ "query": "Is @rix-digital under any special review status?",
584
+ "answer": "billing-svc API keys \u2014 jordan\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\norder-svc marks order as paid, emits OrderPaid on pubsub",
585
+ "hits": [
586
+ {
587
+ "text": "billing-svc API keys \u2014 jordan",
588
+ "score": 0.7199540151546203,
589
+ "source": "pentatonic-memory",
590
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
591
+ },
592
+ {
593
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
594
+ "score": 0.7189223522433535,
595
+ "source": "pentatonic-memory",
596
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
597
+ },
598
+ {
599
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
600
+ "score": 0.7186887112080456,
601
+ "source": "pentatonic-memory",
602
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
603
+ }
604
+ ],
605
+ "correct": false,
606
+ "score": 0.0,
607
+ "grading_notes": "missing 1/1: ['enhanced listing review']",
608
+ "search_time_ms": 33.03063299972564,
609
+ "generation_time_ms": 0.0,
610
+ "tokens_in": 0,
611
+ "tokens_out": 0,
612
+ "retrieval_tokens": 52,
613
+ "query_tokens": 11,
614
+ "context_tokens": 41,
615
+ "judge_tokens_in": 0,
616
+ "judge_tokens_out": 0,
617
+ "judge_latency_ms": 0.0
618
+ },
619
+ {
620
+ "task_id": "seller-velocipede-agent-friendly",
621
+ "query": "Does @velocipede-jo respond well to shopping agents?",
622
+ "answer": "email-svc consumes OrderPaid for the receipt email\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
623
+ "hits": [
624
+ {
625
+ "text": "email-svc consumes OrderPaid for the receipt email",
626
+ "score": 0.7377549677445517,
627
+ "source": "pentatonic-memory",
628
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
629
+ },
630
+ {
631
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
632
+ "score": 0.7375860425187651,
633
+ "source": "pentatonic-memory",
634
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
635
+ },
636
+ {
637
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
638
+ "score": 0.7322622679879591,
639
+ "source": "pentatonic-memory",
640
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
641
+ }
642
+ ],
643
+ "correct": false,
644
+ "score": 0.0,
645
+ "grading_notes": "missing 2/2: ['yes', '5 minutes']",
646
+ "search_time_ms": 37.258273019688204,
647
+ "generation_time_ms": 0.0,
648
+ "tokens_in": 0,
649
+ "tokens_out": 0,
650
+ "retrieval_tokens": 58,
651
+ "query_tokens": 14,
652
+ "context_tokens": 44,
653
+ "judge_tokens_in": 0,
654
+ "judge_tokens_out": 0,
655
+ "judge_latency_ms": 0.0
656
+ },
657
+ {
658
+ "task_id": "buyer-sera-specialism",
659
+ "query": "What does @sera-interiors typically buy?",
660
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nsearch-svc API keys \u2014 alex",
661
+ "hits": [
662
+ {
663
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
664
+ "score": 0.7394461379296969,
665
+ "source": "pentatonic-memory",
666
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
667
+ },
668
+ {
669
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
670
+ "score": 0.7331208769984241,
671
+ "source": "pentatonic-memory",
672
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
673
+ },
674
+ {
675
+ "text": "search-svc API keys \u2014 alex",
676
+ "score": 0.7322189523727279,
677
+ "source": "pentatonic-memory",
678
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
679
+ }
680
+ ],
681
+ "correct": false,
682
+ "score": 0.0,
683
+ "grading_notes": "missing 2/2: ['mid-century', 'furniture']",
684
+ "search_time_ms": 35.78653099248186,
685
+ "generation_time_ms": 0.0,
686
+ "tokens_in": 0,
687
+ "tokens_out": 0,
688
+ "retrieval_tokens": 51,
689
+ "query_tokens": 10,
690
+ "context_tokens": 41,
691
+ "judge_tokens_in": 0,
692
+ "judge_tokens_out": 0,
693
+ "judge_latency_ms": 0.0
694
+ },
695
+ {
696
+ "task_id": "buyer-ariadne-disputes",
697
+ "query": "Does @ariadne have any disputes on record?",
698
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nreporting-svc consumes OrderPaid for financial reports\n---\nsearch-svc API keys \u2014 alex",
699
+ "hits": [
700
+ {
701
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
702
+ "score": 0.7134620771960158,
703
+ "source": "pentatonic-memory",
704
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
705
+ },
706
+ {
707
+ "text": "reporting-svc consumes OrderPaid for financial reports",
708
+ "score": 0.7114378942204062,
709
+ "source": "pentatonic-memory",
710
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
711
+ },
712
+ {
713
+ "text": "search-svc API keys \u2014 alex",
714
+ "score": 0.7088609924561785,
715
+ "source": "pentatonic-memory",
716
+ "doc_id": "mem_730b4496-40b1-499f-952a-739ae3912826"
717
+ }
718
+ ],
719
+ "correct": false,
720
+ "score": 0.0,
721
+ "grading_notes": "missing 2/2: ['LST-9014', 'battery']",
722
+ "search_time_ms": 35.92176499660127,
723
+ "generation_time_ms": 0.0,
724
+ "tokens_in": 0,
725
+ "tokens_out": 0,
726
+ "retrieval_tokens": 46,
727
+ "query_tokens": 11,
728
+ "context_tokens": 35,
729
+ "judge_tokens_in": 0,
730
+ "judge_tokens_out": 0,
731
+ "judge_latency_ms": 0.0
732
+ },
733
+ {
734
+ "task_id": "policy-duplicate-trigger",
735
+ "query": "What triggers a FLAGGED_DUPLICATE event in TES?",
736
+ "answer": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\nIf secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nemail-svc consumes OrderPaid for the receipt email",
737
+ "hits": [
738
+ {
739
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
740
+ "score": 0.7384890905248356,
741
+ "source": "pentatonic-memory",
742
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
743
+ },
744
+ {
745
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
746
+ "score": 0.7311341919099343,
747
+ "source": "pentatonic-memory",
748
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
749
+ },
750
+ {
751
+ "text": "email-svc consumes OrderPaid for the receipt email",
752
+ "score": 0.7213569923858261,
753
+ "source": "pentatonic-memory",
754
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
755
+ }
756
+ ],
757
+ "correct": false,
758
+ "score": 0.0,
759
+ "grading_notes": "missing 2/2: ['device fingerprint', 'serial']",
760
+ "search_time_ms": 36.552042001858354,
761
+ "generation_time_ms": 0.0,
762
+ "tokens_in": 0,
763
+ "tokens_out": 0,
764
+ "retrieval_tokens": 57,
765
+ "query_tokens": 12,
766
+ "context_tokens": 45,
767
+ "judge_tokens_in": 0,
768
+ "judge_tokens_out": 0,
769
+ "judge_latency_ms": 0.0
770
+ },
771
+ {
772
+ "task_id": "policy-agent-opt-out",
773
+ "query": "Can a seller refuse offers from shopping agents?",
774
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nemail-svc consumes OrderPaid for the receipt email\n---\nreporting-svc consumes OrderPaid for financial reports",
775
+ "hits": [
776
+ {
777
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
778
+ "score": 0.7109830638978126,
779
+ "source": "pentatonic-memory",
780
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
781
+ },
782
+ {
783
+ "text": "email-svc consumes OrderPaid for the receipt email",
784
+ "score": 0.7033728802624761,
785
+ "source": "pentatonic-memory",
786
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
787
+ },
788
+ {
789
+ "text": "reporting-svc consumes OrderPaid for financial reports",
790
+ "score": 0.7027066808611204,
791
+ "source": "pentatonic-memory",
792
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
793
+ }
794
+ ],
795
+ "correct": false,
796
+ "score": 0.0,
797
+ "grading_notes": "missing 1/1: ['AGENT_OPT_OUT']",
798
+ "search_time_ms": 34.710568987065926,
799
+ "generation_time_ms": 0.0,
800
+ "tokens_in": 0,
801
+ "tokens_out": 0,
802
+ "retrieval_tokens": 47,
803
+ "query_tokens": 9,
804
+ "context_tokens": 38,
805
+ "judge_tokens_in": 0,
806
+ "judge_tokens_out": 0,
807
+ "judge_latency_ms": 0.0
808
+ },
809
+ {
810
+ "task_id": "policy-enhanced-review-lifted",
811
+ "query": "After how long of clean activity is enhanced listing review lifted?",
812
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nreporting-svc consumes OrderPaid for financial reports\n---\nhandle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
813
+ "hits": [
814
+ {
815
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
816
+ "score": 0.7081794044582482,
817
+ "source": "pentatonic-memory",
818
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
819
+ },
820
+ {
821
+ "text": "reporting-svc consumes OrderPaid for financial reports",
822
+ "score": 0.7075553845787061,
823
+ "source": "pentatonic-memory",
824
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
825
+ },
826
+ {
827
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
828
+ "score": 0.7065336520496776,
829
+ "source": "pentatonic-memory",
830
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
831
+ }
832
+ ],
833
+ "correct": false,
834
+ "score": 0.0,
835
+ "grading_notes": "missing 1/1: ['30 days']",
836
+ "search_time_ms": 27.74471099837683,
837
+ "generation_time_ms": 0.0,
838
+ "tokens_in": 0,
839
+ "tokens_out": 0,
840
+ "retrieval_tokens": 57,
841
+ "query_tokens": 12,
842
+ "context_tokens": 45,
843
+ "judge_tokens_in": 0,
844
+ "judge_tokens_out": 0,
845
+ "judge_latency_ms": 0.0
846
+ },
847
+ {
848
+ "task_id": "rubric-rix-buy-decision",
849
+ "query": "A shopping agent is considering buying from @rix-digital. What's your recommendation?",
850
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nemail-svc consumes OrderPaid for the receipt email\n---\nreporting-svc consumes OrderPaid for financial reports",
851
+ "hits": [
852
+ {
853
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
854
+ "score": 0.7129850854424628,
855
+ "source": "pentatonic-memory",
856
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
857
+ },
858
+ {
859
+ "text": "email-svc consumes OrderPaid for the receipt email",
860
+ "score": 0.7109710326325284,
861
+ "source": "pentatonic-memory",
862
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
863
+ },
864
+ {
865
+ "text": "reporting-svc consumes OrderPaid for financial reports",
866
+ "score": 0.7041281023998354,
867
+ "source": "pentatonic-memory",
868
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
869
+ }
870
+ ],
871
+ "correct": false,
872
+ "score": 0.5,
873
+ "grading_notes": "judge=0.50 \u2014 The answer mentions @rix-digital is under enhanced listing review due to spec discrepancies, but does not explicitly men",
874
+ "search_time_ms": 27.758039010223,
875
+ "generation_time_ms": 0.0,
876
+ "tokens_in": 0,
877
+ "tokens_out": 0,
878
+ "retrieval_tokens": 55,
879
+ "query_tokens": 17,
880
+ "context_tokens": 38,
881
+ "judge_tokens_in": 365,
882
+ "judge_tokens_out": 46,
883
+ "judge_latency_ms": 900.4068410098553
884
+ },
885
+ {
886
+ "task_id": "rubric-lst-9014-full-story",
887
+ "query": "Tell me the full story of LST-9014 from creation to resolution.",
888
+ "answer": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`\n---\nkubectl rollout restart deployment/billing-svc\n---\nbilling-svc API keys \u2014 jordan",
889
+ "hits": [
890
+ {
891
+ "text": "If secret was rotated recently, `kubectl rollout restart deployment/billing-svc`",
892
+ "score": 0.7260789657724627,
893
+ "source": "pentatonic-memory",
894
+ "doc_id": "mem_bb06d70a-c2f5-4168-b0df-a03b1b585b29"
895
+ },
896
+ {
897
+ "text": "kubectl rollout restart deployment/billing-svc",
898
+ "score": 0.7089971403550961,
899
+ "source": "pentatonic-memory",
900
+ "doc_id": "mem_049e0812-46ed-4db7-84d8-c4a121a1d19c"
901
+ },
902
+ {
903
+ "text": "billing-svc API keys \u2014 jordan",
904
+ "score": 0.7037271368306142,
905
+ "source": "pentatonic-memory",
906
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
907
+ }
908
+ ],
909
+ "correct": false,
910
+ "score": 0.5,
911
+ "grading_notes": "judge=0.50 \u2014 The answer partially covers stages 1 and 4, but lacks details about creation date, listing price, flagging as duplicate,",
912
+ "search_time_ms": 39.572199020767584,
913
+ "generation_time_ms": 0.0,
914
+ "tokens_in": 0,
915
+ "tokens_out": 0,
916
+ "retrieval_tokens": 51,
917
+ "query_tokens": 16,
918
+ "context_tokens": 35,
919
+ "judge_tokens_in": 371,
920
+ "judge_tokens_out": 64,
921
+ "judge_latency_ms": 1144.362673997879
922
+ },
923
+ {
924
+ "task_id": "rubric-agent-commerce-thora",
925
+ "query": "Describe how @thora uses the marketplace.",
926
+ "answer": "order-svc marks order as paid, emits OrderPaid on pubsub\n---\nemail-svc consumes OrderPaid for the receipt email\n---\nbilling-svc API keys \u2014 jordan",
927
+ "hits": [
928
+ {
929
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
930
+ "score": 0.7404235195303052,
931
+ "source": "pentatonic-memory",
932
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
933
+ },
934
+ {
935
+ "text": "email-svc consumes OrderPaid for the receipt email",
936
+ "score": 0.7365608038487811,
937
+ "source": "pentatonic-memory",
938
+ "doc_id": "mem_8decb311-57b6-4b78-9dd6-3683a441963f"
939
+ },
940
+ {
941
+ "text": "billing-svc API keys \u2014 jordan",
942
+ "score": 0.7300710594859035,
943
+ "source": "pentatonic-memory",
944
+ "doc_id": "mem_80182b2e-cbba-4642-b6fd-b8d3fbd6e617"
945
+ }
946
+ ],
947
+ "correct": false,
948
+ "score": 0.5,
949
+ "grading_notes": "judge=0.50 \u2014 The answer mentions @thora's human profile but does not clearly state that she uses an autonomous shopping agent.",
950
+ "search_time_ms": 29.80231400579214,
951
+ "generation_time_ms": 0.0,
952
+ "tokens_in": 0,
953
+ "tokens_out": 0,
954
+ "retrieval_tokens": 44,
955
+ "query_tokens": 8,
956
+ "context_tokens": 36,
957
+ "judge_tokens_in": 333,
958
+ "judge_tokens_out": 35,
959
+ "judge_latency_ms": 668.5855540037155
960
+ },
961
+ {
962
+ "task_id": "entity-all-sold-things",
963
+ "query": "List the thing IDs of all completed sales.",
964
+ "answer": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nStripe posts a invoice.payment_succeeded webhook to billing-svc",
965
+ "hits": [
966
+ {
967
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
968
+ "score": 0.7449149360826441,
969
+ "source": "pentatonic-memory",
970
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
971
+ },
972
+ {
973
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
974
+ "score": 0.7244922743753247,
975
+ "source": "pentatonic-memory",
976
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
977
+ },
978
+ {
979
+ "text": "Stripe posts a invoice.payment_succeeded webhook to billing-svc",
980
+ "score": 0.7224140355346768,
981
+ "source": "pentatonic-memory",
982
+ "doc_id": "mem_76c60bf6-9327-4cd1-9f56-234d229e8c8a"
983
+ }
984
+ ],
985
+ "correct": false,
986
+ "score": 0.0,
987
+ "grading_notes": "no expected_substrings set",
988
+ "search_time_ms": 36.64004398160614,
989
+ "generation_time_ms": 0.0,
990
+ "tokens_in": 0,
991
+ "tokens_out": 0,
992
+ "retrieval_tokens": 55,
993
+ "query_tokens": 9,
994
+ "context_tokens": 46,
995
+ "judge_tokens_in": 0,
996
+ "judge_tokens_out": 0,
997
+ "judge_latency_ms": 0.0
998
+ },
999
+ {
1000
+ "task_id": "entity-sellers-with-disputes",
1001
+ "query": "Which sellers in this corpus have disputes on record?",
1002
+ "answer": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc\n---\norder-svc marks order as paid, emits OrderPaid on pubsub\n---\nreporting-svc consumes OrderPaid for financial reports",
1003
+ "hits": [
1004
+ {
1005
+ "text": "handle_webhook verifies signature, enqueues internal InvoicePaid event on order-svc",
1006
+ "score": 0.7333061220485395,
1007
+ "source": "pentatonic-memory",
1008
+ "doc_id": "mem_b6596977-9a91-4469-a8c5-3872edbbbf1c"
1009
+ },
1010
+ {
1011
+ "text": "order-svc marks order as paid, emits OrderPaid on pubsub",
1012
+ "score": 0.7317836516925226,
1013
+ "source": "pentatonic-memory",
1014
+ "doc_id": "mem_f1083a75-3da4-4a35-bce4-db4a29b46034"
1015
+ },
1016
+ {
1017
+ "text": "reporting-svc consumes OrderPaid for financial reports",
1018
+ "score": 0.7299593574086183,
1019
+ "source": "pentatonic-memory",
1020
+ "doc_id": "mem_778aa40d-6c20-4495-96d9-286a57b53f7a"
1021
+ }
1022
+ ],
1023
+ "correct": true,
1024
+ "score": 1.0,
1025
+ "grading_notes": "no positive criteria (negative-only task)",
1026
+ "search_time_ms": 34.57520800293423,
1027
+ "generation_time_ms": 0.0,
1028
+ "tokens_in": 0,
1029
+ "tokens_out": 0,
1030
+ "retrieval_tokens": 54,
1031
+ "query_tokens": 10,
1032
+ "context_tokens": 44,
1033
+ "judge_tokens_in": 0,
1034
+ "judge_tokens_out": 0,
1035
+ "judge_latency_ms": 0.0
1036
+ }
1037
+ ]
1038
+ }