@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,693 @@
1
+ {
2
+ "bench": "marketplace-ops",
3
+ "stack": "pentatonic-memory",
4
+ "n_tasks": 20,
5
+ "n_correct": 5,
6
+ "accuracy": 0.25,
7
+ "mean_score": 0.25,
8
+ "p50_search_ms": 22.744177986169234,
9
+ "p95_search_ms": 29.40429764421424,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "factoid": {
15
+ "n": 12,
16
+ "mean_score": 0.16666666666666666,
17
+ "accuracy": 0.16666666666666666
18
+ },
19
+ "event-log": {
20
+ "n": 7,
21
+ "mean_score": 0.14285714285714285,
22
+ "accuracy": 0.14285714285714285
23
+ },
24
+ "multi-fact": {
25
+ "n": 1,
26
+ "mean_score": 0.0,
27
+ "accuracy": 0.0
28
+ },
29
+ "agent-commerce": {
30
+ "n": 6,
31
+ "mean_score": 0.16666666666666666,
32
+ "accuracy": 0.16666666666666666
33
+ },
34
+ "math": {
35
+ "n": 1,
36
+ "mean_score": 0.0,
37
+ "accuracy": 0.0
38
+ },
39
+ "seller": {
40
+ "n": 5,
41
+ "mean_score": 0.4,
42
+ "accuracy": 0.4
43
+ },
44
+ "buyer": {
45
+ "n": 3,
46
+ "mean_score": 0.0,
47
+ "accuracy": 0.0
48
+ },
49
+ "multi-doc": {
50
+ "n": 1,
51
+ "mean_score": 0.0,
52
+ "accuracy": 0.0
53
+ },
54
+ "policy": {
55
+ "n": 3,
56
+ "mean_score": 0.6666666666666666,
57
+ "accuracy": 0.6666666666666666
58
+ },
59
+ "rubric": {
60
+ "n": 3,
61
+ "mean_score": 0.0,
62
+ "accuracy": 0.0
63
+ },
64
+ "multi-hop": {
65
+ "n": 2,
66
+ "mean_score": 0.0,
67
+ "accuracy": 0.0
68
+ },
69
+ "entity": {
70
+ "n": 2,
71
+ "mean_score": 0.5,
72
+ "accuracy": 0.5
73
+ },
74
+ "negative": {
75
+ "n": 1,
76
+ "mean_score": 1.0,
77
+ "accuracy": 1.0
78
+ }
79
+ },
80
+ "extra": {
81
+ "ingest_ms": 7761.137416004203,
82
+ "grading": "substring",
83
+ "limit": 3,
84
+ "tokens": {
85
+ "corpus_tokens": 1388,
86
+ "query_tokens": 240,
87
+ "context_tokens": 438,
88
+ "retrieval_tokens": 678,
89
+ "naive_tokens": 28000,
90
+ "saved_tokens": 27322,
91
+ "reduction_pct": 0.9757857142857143,
92
+ "mean_retrieval_tokens_per_task": 33.9,
93
+ "tokenizer": "cl100k_base",
94
+ "per_task": {
95
+ "thing-lst-9001-sold-price": {
96
+ "query": 13,
97
+ "context": 0,
98
+ "retrieval": 13,
99
+ "judge_in": 0,
100
+ "judge_out": 0,
101
+ "judge_latency_ms": 0.0
102
+ },
103
+ "thing-lst-9001-buyer": {
104
+ "query": 8,
105
+ "context": 0,
106
+ "retrieval": 8,
107
+ "judge_in": 0,
108
+ "judge_out": 0,
109
+ "judge_latency_ms": 0.0
110
+ },
111
+ "thing-lst-9001-first-offer": {
112
+ "query": 17,
113
+ "context": 0,
114
+ "retrieval": 17,
115
+ "judge_in": 0,
116
+ "judge_out": 0,
117
+ "judge_latency_ms": 0.0
118
+ },
119
+ "thing-lst-9014-flagged-reason": {
120
+ "query": 11,
121
+ "context": 241,
122
+ "retrieval": 252,
123
+ "judge_in": 0,
124
+ "judge_out": 0,
125
+ "judge_latency_ms": 0.0
126
+ },
127
+ "thing-lst-9014-return-reason": {
128
+ "query": 11,
129
+ "context": 43,
130
+ "retrieval": 54,
131
+ "judge_in": 0,
132
+ "judge_out": 0,
133
+ "judge_latency_ms": 0.0
134
+ },
135
+ "thing-lst-9030-agent-offer": {
136
+ "query": 15,
137
+ "context": 0,
138
+ "retrieval": 15,
139
+ "judge_in": 0,
140
+ "judge_out": 0,
141
+ "judge_latency_ms": 0.0
142
+ },
143
+ "thing-lst-9030-agent-discount": {
144
+ "query": 15,
145
+ "context": 0,
146
+ "retrieval": 15,
147
+ "judge_in": 0,
148
+ "judge_out": 0,
149
+ "judge_latency_ms": 0.0
150
+ },
151
+ "seller-mariposa-rating": {
152
+ "query": 11,
153
+ "context": 64,
154
+ "retrieval": 75,
155
+ "judge_in": 0,
156
+ "judge_out": 0,
157
+ "judge_latency_ms": 0.0
158
+ },
159
+ "seller-rix-review-status": {
160
+ "query": 11,
161
+ "context": 0,
162
+ "retrieval": 11,
163
+ "judge_in": 0,
164
+ "judge_out": 0,
165
+ "judge_latency_ms": 0.0
166
+ },
167
+ "seller-velocipede-agent-friendly": {
168
+ "query": 14,
169
+ "context": 0,
170
+ "retrieval": 14,
171
+ "judge_in": 0,
172
+ "judge_out": 0,
173
+ "judge_latency_ms": 0.0
174
+ },
175
+ "buyer-sera-specialism": {
176
+ "query": 10,
177
+ "context": 0,
178
+ "retrieval": 10,
179
+ "judge_in": 0,
180
+ "judge_out": 0,
181
+ "judge_latency_ms": 0.0
182
+ },
183
+ "buyer-ariadne-disputes": {
184
+ "query": 11,
185
+ "context": 0,
186
+ "retrieval": 11,
187
+ "judge_in": 0,
188
+ "judge_out": 0,
189
+ "judge_latency_ms": 0.0
190
+ },
191
+ "policy-duplicate-trigger": {
192
+ "query": 12,
193
+ "context": 0,
194
+ "retrieval": 12,
195
+ "judge_in": 0,
196
+ "judge_out": 0,
197
+ "judge_latency_ms": 0.0
198
+ },
199
+ "policy-agent-opt-out": {
200
+ "query": 9,
201
+ "context": 45,
202
+ "retrieval": 54,
203
+ "judge_in": 0,
204
+ "judge_out": 0,
205
+ "judge_latency_ms": 0.0
206
+ },
207
+ "policy-enhanced-review-lifted": {
208
+ "query": 12,
209
+ "context": 45,
210
+ "retrieval": 57,
211
+ "judge_in": 0,
212
+ "judge_out": 0,
213
+ "judge_latency_ms": 0.0
214
+ },
215
+ "rubric-rix-buy-decision": {
216
+ "query": 17,
217
+ "context": 0,
218
+ "retrieval": 17,
219
+ "judge_in": 326,
220
+ "judge_out": 39,
221
+ "judge_latency_ms": 852.8406230211258
222
+ },
223
+ "rubric-lst-9014-full-story": {
224
+ "query": 16,
225
+ "context": 0,
226
+ "retrieval": 16,
227
+ "judge_in": 335,
228
+ "judge_out": 42,
229
+ "judge_latency_ms": 864.7506729960442
230
+ },
231
+ "rubric-agent-commerce-thora": {
232
+ "query": 8,
233
+ "context": 0,
234
+ "retrieval": 8,
235
+ "judge_in": 296,
236
+ "judge_out": 35,
237
+ "judge_latency_ms": 592.765478014946
238
+ },
239
+ "entity-all-sold-things": {
240
+ "query": 9,
241
+ "context": 0,
242
+ "retrieval": 9,
243
+ "judge_in": 0,
244
+ "judge_out": 0,
245
+ "judge_latency_ms": 0.0
246
+ },
247
+ "entity-sellers-with-disputes": {
248
+ "query": 10,
249
+ "context": 0,
250
+ "retrieval": 10,
251
+ "judge_in": 0,
252
+ "judge_out": 0,
253
+ "judge_latency_ms": 0.0
254
+ }
255
+ },
256
+ "judge_tokens_in": 957,
257
+ "judge_tokens_out": 116,
258
+ "judge_calls": 3,
259
+ "judge_mean_latency_ms": 770.118924677372
260
+ },
261
+ "cost_usd": {
262
+ "assumed_completion_tokens_per_task": 100,
263
+ "rates": {
264
+ "input_per_1k": 0.0025,
265
+ "output_per_1k": 0.01,
266
+ "model": "gpt-4o"
267
+ },
268
+ "retrieval_usd_in": 0.0016950000000000001,
269
+ "retrieval_usd_out": 0.02,
270
+ "retrieval_usd_total": 0.021695,
271
+ "naive_usd_total": 0.09000000000000001,
272
+ "saved_usd": 0.068305,
273
+ "saved_usd_per_1k_tasks": 3.4152500000000003
274
+ }
275
+ },
276
+ "task_results": [
277
+ {
278
+ "task_id": "thing-lst-9001-sold-price",
279
+ "query": "What was the final sale price of LST-9001?",
280
+ "answer": "",
281
+ "hits": [],
282
+ "correct": false,
283
+ "score": 0.0,
284
+ "grading_notes": "missing 1/1: ['2,250']",
285
+ "search_time_ms": 28.186109993839636,
286
+ "generation_time_ms": 0.0,
287
+ "tokens_in": 0,
288
+ "tokens_out": 0,
289
+ "retrieval_tokens": 13,
290
+ "query_tokens": 13,
291
+ "context_tokens": 0,
292
+ "judge_tokens_in": 0,
293
+ "judge_tokens_out": 0,
294
+ "judge_latency_ms": 0.0
295
+ },
296
+ {
297
+ "task_id": "thing-lst-9001-buyer",
298
+ "query": "Who bought LST-9001?",
299
+ "answer": "",
300
+ "hits": [],
301
+ "correct": false,
302
+ "score": 0.0,
303
+ "grading_notes": "missing 1/1: ['sera-interiors']",
304
+ "search_time_ms": 20.723878988064826,
305
+ "generation_time_ms": 0.0,
306
+ "tokens_in": 0,
307
+ "tokens_out": 0,
308
+ "retrieval_tokens": 8,
309
+ "query_tokens": 8,
310
+ "context_tokens": 0,
311
+ "judge_tokens_in": 0,
312
+ "judge_tokens_out": 0,
313
+ "judge_latency_ms": 0.0
314
+ },
315
+ {
316
+ "task_id": "thing-lst-9001-first-offer",
317
+ "query": "What was the first offer received on LST-9001 and was it accepted?",
318
+ "answer": "",
319
+ "hits": [],
320
+ "correct": false,
321
+ "score": 0.0,
322
+ "grading_notes": "missing 3/3: ['2,000', 'hendrik', 'declined']",
323
+ "search_time_ms": 22.80937001341954,
324
+ "generation_time_ms": 0.0,
325
+ "tokens_in": 0,
326
+ "tokens_out": 0,
327
+ "retrieval_tokens": 17,
328
+ "query_tokens": 17,
329
+ "context_tokens": 0,
330
+ "judge_tokens_in": 0,
331
+ "judge_tokens_out": 0,
332
+ "judge_latency_ms": 0.0
333
+ },
334
+ {
335
+ "task_id": "thing-lst-9014-flagged-reason",
336
+ "query": "Why was LST-9014 flagged as duplicate?",
337
+ "answer": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refund \u00a3780 issued\n2026-02-26T10:32 THING_RELISTED (accurate battery spec)",
338
+ "hits": [
339
+ {
340
+ "text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
341
+ "score": 0.735049280295262,
342
+ "source": "pentatonic-memory",
343
+ "doc_id": "tes-events-lst-9014"
344
+ }
345
+ ],
346
+ "correct": true,
347
+ "score": 1.0,
348
+ "grading_notes": "all substrings matched",
349
+ "search_time_ms": 19.363785977475345,
350
+ "generation_time_ms": 0.0,
351
+ "tokens_in": 0,
352
+ "tokens_out": 0,
353
+ "retrieval_tokens": 252,
354
+ "query_tokens": 11,
355
+ "context_tokens": 241,
356
+ "judge_tokens_in": 0,
357
+ "judge_tokens_out": 0,
358
+ "judge_latency_ms": 0.0
359
+ },
360
+ {
361
+ "task_id": "thing-lst-9014-return-reason",
362
+ "query": "Why did the buyer return LST-9014?",
363
+ "answer": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
364
+ "hits": [
365
+ {
366
+ "text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
367
+ "score": 0.708891987825674,
368
+ "source": "pentatonic-memory",
369
+ "doc_id": "buyer-ariadne-profile"
370
+ }
371
+ ],
372
+ "correct": false,
373
+ "score": 0.0,
374
+ "grading_notes": "missing 1/1: ['battery health']",
375
+ "search_time_ms": 19.234279985539615,
376
+ "generation_time_ms": 0.0,
377
+ "tokens_in": 0,
378
+ "tokens_out": 0,
379
+ "retrieval_tokens": 54,
380
+ "query_tokens": 11,
381
+ "context_tokens": 43,
382
+ "judge_tokens_in": 0,
383
+ "judge_tokens_out": 0,
384
+ "judge_latency_ms": 0.0
385
+ },
386
+ {
387
+ "task_id": "thing-lst-9030-agent-offer",
388
+ "query": "Which agent made the offer on LST-9030 and for whom?",
389
+ "answer": "",
390
+ "hits": [],
391
+ "correct": false,
392
+ "score": 0.0,
393
+ "grading_notes": "missing 2/2: ['buyer-agent-7', 'thora']",
394
+ "search_time_ms": 25.778518989682198,
395
+ "generation_time_ms": 0.0,
396
+ "tokens_in": 0,
397
+ "tokens_out": 0,
398
+ "retrieval_tokens": 15,
399
+ "query_tokens": 15,
400
+ "context_tokens": 0,
401
+ "judge_tokens_in": 0,
402
+ "judge_tokens_out": 0,
403
+ "judge_latency_ms": 0.0
404
+ },
405
+ {
406
+ "task_id": "thing-lst-9030-agent-discount",
407
+ "query": "What percentage discount did the shopping agent offer on LST-9030?",
408
+ "answer": "",
409
+ "hits": [],
410
+ "correct": false,
411
+ "score": 0.0,
412
+ "grading_notes": "missing 2/2: ['380', '420']",
413
+ "search_time_ms": 24.16933499625884,
414
+ "generation_time_ms": 0.0,
415
+ "tokens_in": 0,
416
+ "tokens_out": 0,
417
+ "retrieval_tokens": 15,
418
+ "query_tokens": 15,
419
+ "context_tokens": 0,
420
+ "judge_tokens_in": 0,
421
+ "judge_tokens_out": 0,
422
+ "judge_latency_ms": 0.0
423
+ },
424
+ {
425
+ "task_id": "seller-mariposa-rating",
426
+ "query": "What's @mariposa's average star rating?",
427
+ "answer": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.",
428
+ "hits": [
429
+ {
430
+ "text": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.",
431
+ "score": 0.7218132427756615,
432
+ "source": "pentatonic-memory",
433
+ "doc_id": "seller-mariposa-profile"
434
+ }
435
+ ],
436
+ "correct": true,
437
+ "score": 1.0,
438
+ "grading_notes": "all substrings matched",
439
+ "search_time_ms": 20.875624002655968,
440
+ "generation_time_ms": 0.0,
441
+ "tokens_in": 0,
442
+ "tokens_out": 0,
443
+ "retrieval_tokens": 75,
444
+ "query_tokens": 11,
445
+ "context_tokens": 64,
446
+ "judge_tokens_in": 0,
447
+ "judge_tokens_out": 0,
448
+ "judge_latency_ms": 0.0
449
+ },
450
+ {
451
+ "task_id": "seller-rix-review-status",
452
+ "query": "Is @rix-digital under any special review status?",
453
+ "answer": "",
454
+ "hits": [],
455
+ "correct": false,
456
+ "score": 0.0,
457
+ "grading_notes": "missing 1/1: ['enhanced listing review']",
458
+ "search_time_ms": 22.08369900472462,
459
+ "generation_time_ms": 0.0,
460
+ "tokens_in": 0,
461
+ "tokens_out": 0,
462
+ "retrieval_tokens": 11,
463
+ "query_tokens": 11,
464
+ "context_tokens": 0,
465
+ "judge_tokens_in": 0,
466
+ "judge_tokens_out": 0,
467
+ "judge_latency_ms": 0.0
468
+ },
469
+ {
470
+ "task_id": "seller-velocipede-agent-friendly",
471
+ "query": "Does @velocipede-jo respond well to shopping agents?",
472
+ "answer": "",
473
+ "hits": [],
474
+ "correct": false,
475
+ "score": 0.0,
476
+ "grading_notes": "missing 2/2: ['yes', '5 minutes']",
477
+ "search_time_ms": 25.911798991728574,
478
+ "generation_time_ms": 0.0,
479
+ "tokens_in": 0,
480
+ "tokens_out": 0,
481
+ "retrieval_tokens": 14,
482
+ "query_tokens": 14,
483
+ "context_tokens": 0,
484
+ "judge_tokens_in": 0,
485
+ "judge_tokens_out": 0,
486
+ "judge_latency_ms": 0.0
487
+ },
488
+ {
489
+ "task_id": "buyer-sera-specialism",
490
+ "query": "What does @sera-interiors typically buy?",
491
+ "answer": "",
492
+ "hits": [],
493
+ "correct": false,
494
+ "score": 0.0,
495
+ "grading_notes": "missing 2/2: ['mid-century', 'furniture']",
496
+ "search_time_ms": 18.640466005308554,
497
+ "generation_time_ms": 0.0,
498
+ "tokens_in": 0,
499
+ "tokens_out": 0,
500
+ "retrieval_tokens": 10,
501
+ "query_tokens": 10,
502
+ "context_tokens": 0,
503
+ "judge_tokens_in": 0,
504
+ "judge_tokens_out": 0,
505
+ "judge_latency_ms": 0.0
506
+ },
507
+ {
508
+ "task_id": "buyer-ariadne-disputes",
509
+ "query": "Does @ariadne have any disputes on record?",
510
+ "answer": "",
511
+ "hits": [],
512
+ "correct": false,
513
+ "score": 0.0,
514
+ "grading_notes": "missing 2/2: ['LST-9014', 'battery']",
515
+ "search_time_ms": 20.008318999316543,
516
+ "generation_time_ms": 0.0,
517
+ "tokens_in": 0,
518
+ "tokens_out": 0,
519
+ "retrieval_tokens": 11,
520
+ "query_tokens": 11,
521
+ "context_tokens": 0,
522
+ "judge_tokens_in": 0,
523
+ "judge_tokens_out": 0,
524
+ "judge_latency_ms": 0.0
525
+ },
526
+ {
527
+ "task_id": "policy-duplicate-trigger",
528
+ "query": "What triggers a FLAGGED_DUPLICATE event in TES?",
529
+ "answer": "",
530
+ "hits": [],
531
+ "correct": false,
532
+ "score": 0.0,
533
+ "grading_notes": "missing 2/2: ['device fingerprint', 'serial']",
534
+ "search_time_ms": 20.936009008437395,
535
+ "generation_time_ms": 0.0,
536
+ "tokens_in": 0,
537
+ "tokens_out": 0,
538
+ "retrieval_tokens": 12,
539
+ "query_tokens": 12,
540
+ "context_tokens": 0,
541
+ "judge_tokens_in": 0,
542
+ "judge_tokens_out": 0,
543
+ "judge_latency_ms": 0.0
544
+ },
545
+ {
546
+ "task_id": "policy-agent-opt-out",
547
+ "query": "Can a seller refuse offers from shopping agents?",
548
+ "answer": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
549
+ "hits": [
550
+ {
551
+ "text": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
552
+ "score": 0.5153800318362944,
553
+ "source": "pentatonic-memory",
554
+ "doc_id": "policy-agent-commerce"
555
+ }
556
+ ],
557
+ "correct": true,
558
+ "score": 1.0,
559
+ "grading_notes": "all substrings matched",
560
+ "search_time_ms": 22.686360985971987,
561
+ "generation_time_ms": 0.0,
562
+ "tokens_in": 0,
563
+ "tokens_out": 0,
564
+ "retrieval_tokens": 54,
565
+ "query_tokens": 9,
566
+ "context_tokens": 45,
567
+ "judge_tokens_in": 0,
568
+ "judge_tokens_out": 0,
569
+ "judge_latency_ms": 0.0
570
+ },
571
+ {
572
+ "task_id": "policy-enhanced-review-lifted",
573
+ "query": "After how long of clean activity is enhanced listing review lifted?",
574
+ "answer": "Enhanced listing review: triggered when a seller accumulates 3+ spec-discrepancy disputes in a 90-day window. Listings are manually reviewed before going live. Lifted after 30 days of clean activity.",
575
+ "hits": [
576
+ {
577
+ "text": "Enhanced listing review: triggered when a seller accumulates 3+ spec-discrepancy disputes in a 90-day window. Listings are manually reviewed before going live. Lifted after 30 days of clean activity.",
578
+ "score": 0.5413613700787364,
579
+ "source": "pentatonic-memory",
580
+ "doc_id": "policy-seller-enhanced-review"
581
+ }
582
+ ],
583
+ "correct": true,
584
+ "score": 1.0,
585
+ "grading_notes": "all substrings matched",
586
+ "search_time_ms": 22.642649011686444,
587
+ "generation_time_ms": 0.0,
588
+ "tokens_in": 0,
589
+ "tokens_out": 0,
590
+ "retrieval_tokens": 57,
591
+ "query_tokens": 12,
592
+ "context_tokens": 45,
593
+ "judge_tokens_in": 0,
594
+ "judge_tokens_out": 0,
595
+ "judge_latency_ms": 0.0
596
+ },
597
+ {
598
+ "task_id": "rubric-rix-buy-decision",
599
+ "query": "A shopping agent is considering buying from @rix-digital. What's your recommendation?",
600
+ "answer": "",
601
+ "hits": [],
602
+ "correct": false,
603
+ "score": 0.0,
604
+ "grading_notes": "judge=0.00 \u2014 The agent's answer does not mention @rix-digital being under enhanced listing review due to spec discrepancies or its di",
605
+ "search_time_ms": 29.031301994109526,
606
+ "generation_time_ms": 0.0,
607
+ "tokens_in": 0,
608
+ "tokens_out": 0,
609
+ "retrieval_tokens": 17,
610
+ "query_tokens": 17,
611
+ "context_tokens": 0,
612
+ "judge_tokens_in": 326,
613
+ "judge_tokens_out": 39,
614
+ "judge_latency_ms": 852.8406230211258
615
+ },
616
+ {
617
+ "task_id": "rubric-lst-9014-full-story",
618
+ "query": "Tell me the full story of LST-9014 from creation to resolution.",
619
+ "answer": "",
620
+ "hits": [],
621
+ "correct": false,
622
+ "score": 0.0,
623
+ "grading_notes": "judge=0.00 \u2014 The answer does not cover stage (1) created 2026-02-14 by @rix-digital, listed at \u00a3780.",
624
+ "search_time_ms": 27.950554998824373,
625
+ "generation_time_ms": 0.0,
626
+ "tokens_in": 0,
627
+ "tokens_out": 0,
628
+ "retrieval_tokens": 16,
629
+ "query_tokens": 16,
630
+ "context_tokens": 0,
631
+ "judge_tokens_in": 335,
632
+ "judge_tokens_out": 42,
633
+ "judge_latency_ms": 864.7506729960442
634
+ },
635
+ {
636
+ "task_id": "rubric-agent-commerce-thora",
637
+ "query": "Describe how @thora uses the marketplace.",
638
+ "answer": "",
639
+ "hits": [],
640
+ "correct": false,
641
+ "score": 0.0,
642
+ "grading_notes": "judge=0.00 \u2014 The answer does not mention @thora using an autonomous shopping agent (@buyer-agent-7) at all.",
643
+ "search_time_ms": 29.42392899421975,
644
+ "generation_time_ms": 0.0,
645
+ "tokens_in": 0,
646
+ "tokens_out": 0,
647
+ "retrieval_tokens": 8,
648
+ "query_tokens": 8,
649
+ "context_tokens": 0,
650
+ "judge_tokens_in": 296,
651
+ "judge_tokens_out": 35,
652
+ "judge_latency_ms": 592.765478014946
653
+ },
654
+ {
655
+ "task_id": "entity-all-sold-things",
656
+ "query": "List the thing IDs of all completed sales.",
657
+ "answer": "",
658
+ "hits": [],
659
+ "correct": false,
660
+ "score": 0.0,
661
+ "grading_notes": "no expected_substrings set",
662
+ "search_time_ms": 22.80199498636648,
663
+ "generation_time_ms": 0.0,
664
+ "tokens_in": 0,
665
+ "tokens_out": 0,
666
+ "retrieval_tokens": 9,
667
+ "query_tokens": 9,
668
+ "context_tokens": 0,
669
+ "judge_tokens_in": 0,
670
+ "judge_tokens_out": 0,
671
+ "judge_latency_ms": 0.0
672
+ },
673
+ {
674
+ "task_id": "entity-sellers-with-disputes",
675
+ "query": "Which sellers in this corpus have disputes on record?",
676
+ "answer": "",
677
+ "hits": [],
678
+ "correct": true,
679
+ "score": 1.0,
680
+ "grading_notes": "no positive criteria (negative-only task)",
681
+ "search_time_ms": 27.79455398558639,
682
+ "generation_time_ms": 0.0,
683
+ "tokens_in": 0,
684
+ "tokens_out": 0,
685
+ "retrieval_tokens": 10,
686
+ "query_tokens": 10,
687
+ "context_tokens": 0,
688
+ "judge_tokens_in": 0,
689
+ "judge_tokens_out": 0,
690
+ "judge_latency_ms": 0.0
691
+ }
692
+ ]
693
+ }