@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +170 -69
  2. package/bin/__tests__/callback-server.test.js +4 -1
  3. package/bin/cli.js +41 -164
  4. package/bin/commands/config.js +251 -0
  5. package/package.json +2 -1
  6. package/packages/doctor/__tests__/detect.test.js +2 -6
  7. package/packages/doctor/src/checks/local-memory.js +164 -196
  8. package/packages/doctor/src/detect.js +11 -3
  9. package/packages/memory/src/corpus/adapters.js +104 -0
  10. package/packages/memory/src/corpus/cli.js +72 -7
  11. package/packages/memory/src/corpus/index.js +1 -1
  12. package/packages/memory-engine/.env.example +13 -0
  13. package/packages/memory-engine/README.md +131 -0
  14. package/packages/memory-engine/bench/README.md +99 -0
  15. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  16. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  17. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  18. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  19. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  20. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  21. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  22. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  23. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  24. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  25. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  26. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  27. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  28. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  29. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  30. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  31. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  32. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  33. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  34. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  35. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  36. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  37. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  38. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  39. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  40. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  41. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  42. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  43. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  44. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  45. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  46. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  49. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  50. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  51. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  52. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  53. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  54. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  55. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  56. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  57. package/packages/memory-engine/compat/Dockerfile +11 -0
  58. package/packages/memory-engine/compat/server.py +680 -0
  59. package/packages/memory-engine/docker-compose.yml +243 -0
  60. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  61. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  62. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  63. package/packages/memory-engine/engine/README.md +52 -0
  64. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  65. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  66. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  67. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  68. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  69. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  70. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  71. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  72. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  73. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  74. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  75. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  76. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  77. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  78. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  79. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  80. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  81. package/packages/memory-engine/pme_memory/embed.py +74 -0
  82. package/packages/memory-engine/pme_memory/health.py +36 -0
  83. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  84. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  85. package/packages/memory-engine/pme_memory/needs.py +55 -0
  86. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  87. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  88. package/packages/memory-engine/pme_memory/search.py +52 -0
  89. package/packages/memory-engine/pme_memory/store.py +86 -0
  90. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  91. package/packages/memory-engine/pyproject.toml +65 -0
  92. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  93. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  94. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,1038 @@
1
+ {
2
+ "bench": "marketplace-ops",
3
+ "stack": "pentatonic-baseline",
4
+ "n_tasks": 20,
5
+ "n_correct": 17,
6
+ "accuracy": 0.85,
7
+ "mean_score": 0.89,
8
+ "p50_search_ms": 1192.875291017117,
9
+ "p95_search_ms": 1360.2062849502545,
10
+ "total_tokens_in": 0,
11
+ "total_tokens_out": 0,
12
+ "total_usd": 0.0,
13
+ "by_tag": {
14
+ "factoid": {
15
+ "n": 12,
16
+ "mean_score": 1.0,
17
+ "accuracy": 1.0
18
+ },
19
+ "event-log": {
20
+ "n": 7,
21
+ "mean_score": 0.8285714285714285,
22
+ "accuracy": 0.7142857142857143
23
+ },
24
+ "multi-fact": {
25
+ "n": 1,
26
+ "mean_score": 1.0,
27
+ "accuracy": 1.0
28
+ },
29
+ "agent-commerce": {
30
+ "n": 6,
31
+ "mean_score": 0.8333333333333334,
32
+ "accuracy": 0.8333333333333334
33
+ },
34
+ "math": {
35
+ "n": 1,
36
+ "mean_score": 1.0,
37
+ "accuracy": 1.0
38
+ },
39
+ "seller": {
40
+ "n": 5,
41
+ "mean_score": 0.8,
42
+ "accuracy": 0.8
43
+ },
44
+ "buyer": {
45
+ "n": 3,
46
+ "mean_score": 1.0,
47
+ "accuracy": 1.0
48
+ },
49
+ "multi-doc": {
50
+ "n": 1,
51
+ "mean_score": 1.0,
52
+ "accuracy": 1.0
53
+ },
54
+ "policy": {
55
+ "n": 3,
56
+ "mean_score": 1.0,
57
+ "accuracy": 1.0
58
+ },
59
+ "rubric": {
60
+ "n": 3,
61
+ "mean_score": 0.6,
62
+ "accuracy": 0.3333333333333333
63
+ },
64
+ "multi-hop": {
65
+ "n": 2,
66
+ "mean_score": 0.4,
67
+ "accuracy": 0.0
68
+ },
69
+ "entity": {
70
+ "n": 2,
71
+ "mean_score": 0.5,
72
+ "accuracy": 0.5
73
+ },
74
+ "negative": {
75
+ "n": 1,
76
+ "mean_score": 1.0,
77
+ "accuracy": 1.0
78
+ }
79
+ },
80
+ "extra": {
81
+ "ingest_ms": 30824.596883001504,
82
+ "grading": "substring",
83
+ "limit": 3,
84
+ "tokens": {
85
+ "corpus_tokens": 1388,
86
+ "query_tokens": 240,
87
+ "context_tokens": 6278,
88
+ "retrieval_tokens": 6518,
89
+ "naive_tokens": 28000,
90
+ "saved_tokens": 21482,
91
+ "reduction_pct": 0.7672142857142857,
92
+ "mean_retrieval_tokens_per_task": 325.9,
93
+ "tokenizer": "cl100k_base",
94
+ "per_task": {
95
+ "thing-lst-9001-sold-price": {
96
+ "query": 13,
97
+ "context": 475,
98
+ "retrieval": 488,
99
+ "judge_in": 0,
100
+ "judge_out": 0,
101
+ "judge_latency_ms": 0.0
102
+ },
103
+ "thing-lst-9001-buyer": {
104
+ "query": 8,
105
+ "context": 475,
106
+ "retrieval": 483,
107
+ "judge_in": 0,
108
+ "judge_out": 0,
109
+ "judge_latency_ms": 0.0
110
+ },
111
+ "thing-lst-9001-first-offer": {
112
+ "query": 17,
113
+ "context": 441,
114
+ "retrieval": 458,
115
+ "judge_in": 0,
116
+ "judge_out": 0,
117
+ "judge_latency_ms": 0.0
118
+ },
119
+ "thing-lst-9014-flagged-reason": {
120
+ "query": 11,
121
+ "context": 477,
122
+ "retrieval": 488,
123
+ "judge_in": 0,
124
+ "judge_out": 0,
125
+ "judge_latency_ms": 0.0
126
+ },
127
+ "thing-lst-9014-return-reason": {
128
+ "query": 11,
129
+ "context": 306,
130
+ "retrieval": 317,
131
+ "judge_in": 0,
132
+ "judge_out": 0,
133
+ "judge_latency_ms": 0.0
134
+ },
135
+ "thing-lst-9030-agent-offer": {
136
+ "query": 15,
137
+ "context": 431,
138
+ "retrieval": 446,
139
+ "judge_in": 0,
140
+ "judge_out": 0,
141
+ "judge_latency_ms": 0.0
142
+ },
143
+ "thing-lst-9030-agent-discount": {
144
+ "query": 15,
145
+ "context": 449,
146
+ "retrieval": 464,
147
+ "judge_in": 0,
148
+ "judge_out": 0,
149
+ "judge_latency_ms": 0.0
150
+ },
151
+ "seller-mariposa-rating": {
152
+ "query": 11,
153
+ "context": 242,
154
+ "retrieval": 253,
155
+ "judge_in": 0,
156
+ "judge_out": 0,
157
+ "judge_latency_ms": 0.0
158
+ },
159
+ "seller-rix-review-status": {
160
+ "query": 11,
161
+ "context": 355,
162
+ "retrieval": 366,
163
+ "judge_in": 0,
164
+ "judge_out": 0,
165
+ "judge_latency_ms": 0.0
166
+ },
167
+ "seller-velocipede-agent-friendly": {
168
+ "query": 14,
169
+ "context": 171,
170
+ "retrieval": 185,
171
+ "judge_in": 0,
172
+ "judge_out": 0,
173
+ "judge_latency_ms": 0.0
174
+ },
175
+ "buyer-sera-specialism": {
176
+ "query": 10,
177
+ "context": 323,
178
+ "retrieval": 333,
179
+ "judge_in": 0,
180
+ "judge_out": 0,
181
+ "judge_latency_ms": 0.0
182
+ },
183
+ "buyer-ariadne-disputes": {
184
+ "query": 11,
185
+ "context": 345,
186
+ "retrieval": 356,
187
+ "judge_in": 0,
188
+ "judge_out": 0,
189
+ "judge_latency_ms": 0.0
190
+ },
191
+ "policy-duplicate-trigger": {
192
+ "query": 12,
193
+ "context": 445,
194
+ "retrieval": 457,
195
+ "judge_in": 0,
196
+ "judge_out": 0,
197
+ "judge_latency_ms": 0.0
198
+ },
199
+ "policy-agent-opt-out": {
200
+ "query": 9,
201
+ "context": 147,
202
+ "retrieval": 156,
203
+ "judge_in": 0,
204
+ "judge_out": 0,
205
+ "judge_latency_ms": 0.0
206
+ },
207
+ "policy-enhanced-review-lifted": {
208
+ "query": 12,
209
+ "context": 207,
210
+ "retrieval": 219,
211
+ "judge_in": 0,
212
+ "judge_out": 0,
213
+ "judge_latency_ms": 0.0
214
+ },
215
+ "rubric-rix-buy-decision": {
216
+ "query": 17,
217
+ "context": 198,
218
+ "retrieval": 215,
219
+ "judge_in": 524,
220
+ "judge_out": 38,
221
+ "judge_latency_ms": 901.7658139765263
222
+ },
223
+ "rubric-lst-9014-full-story": {
224
+ "query": 16,
225
+ "context": 287,
226
+ "retrieval": 303,
227
+ "judge_in": 622,
228
+ "judge_out": 64,
229
+ "judge_latency_ms": 1331.2221429944038
230
+ },
231
+ "rubric-agent-commerce-thora": {
232
+ "query": 8,
233
+ "context": 147,
234
+ "retrieval": 155,
235
+ "judge_in": 443,
236
+ "judge_out": 45,
237
+ "judge_latency_ms": 868.7936529815197
238
+ },
239
+ "entity-all-sold-things": {
240
+ "query": 9,
241
+ "context": 139,
242
+ "retrieval": 148,
243
+ "judge_in": 0,
244
+ "judge_out": 0,
245
+ "judge_latency_ms": 0.0
246
+ },
247
+ "entity-sellers-with-disputes": {
248
+ "query": 10,
249
+ "context": 218,
250
+ "retrieval": 228,
251
+ "judge_in": 0,
252
+ "judge_out": 0,
253
+ "judge_latency_ms": 0.0
254
+ }
255
+ },
256
+ "judge_tokens_in": 1589,
257
+ "judge_tokens_out": 147,
258
+ "judge_calls": 3,
259
+ "judge_mean_latency_ms": 1033.9272033174832
260
+ },
261
+ "cost_usd": {
262
+ "assumed_completion_tokens_per_task": 100,
263
+ "rates": {
264
+ "input_per_1k": 0.0025,
265
+ "output_per_1k": 0.01,
266
+ "model": "gpt-4o"
267
+ },
268
+ "retrieval_usd_in": 0.016295,
269
+ "retrieval_usd_out": 0.02,
270
+ "retrieval_usd_total": 0.036295,
271
+ "naive_usd_total": 0.09000000000000001,
272
+ "saved_usd": 0.05370500000000001,
273
+ "saved_usd_per_1k_tasks": 2.6852500000000004
274
+ }
275
+ },
276
+ "task_results": [
277
+ {
278
+ "task_id": "thing-lst-9001-sold-price",
279
+ "query": "What was the final sale price of LST-9001?",
280
+ "answer": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_P\n---\nThing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.\n---\n<!-- doc_id: tes-events-lst-9001 -->\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interi",
281
+ "hits": [
282
+ {
283
+ "text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
284
+ "score": 0.542,
285
+ "source": "pentatonic-baseline:L0_workspace_bm25",
286
+ "doc_id": "tes-events-lst-9001"
287
+ },
288
+ {
289
+ "text": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.",
290
+ "score": 0.5034,
291
+ "source": "pentatonic-baseline:L0_workspace_bm25",
292
+ "doc_id": "tes-thing-lst-9001"
293
+ },
294
+ {
295
+ "text": "<!-- doc_id: tes-events-lst-9001 -->\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T",
296
+ "score": 0.3919,
297
+ "source": "pentatonic-baseline:L6_documents",
298
+ "doc_id": "tes-events-lst-9001"
299
+ }
300
+ ],
301
+ "correct": true,
302
+ "score": 1.0,
303
+ "grading_notes": "all substrings matched",
304
+ "search_time_ms": 1338.991093012737,
305
+ "generation_time_ms": 0.0,
306
+ "tokens_in": 0,
307
+ "tokens_out": 0,
308
+ "retrieval_tokens": 488,
309
+ "query_tokens": 13,
310
+ "context_tokens": 475,
311
+ "judge_tokens_in": 0,
312
+ "judge_tokens_out": 0,
313
+ "judge_latency_ms": 0.0
314
+ },
315
+ {
316
+ "task_id": "thing-lst-9001-buyer",
317
+ "query": "Who bought LST-9001?",
318
+ "answer": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.\n---\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_P\n---\n<!-- doc_id: tes-events-lst-9001 -->\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interi",
319
+ "hits": [
320
+ {
321
+ "text": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.",
322
+ "score": 0.502,
323
+ "source": "pentatonic-baseline:L0_workspace_bm25",
324
+ "doc_id": "tes-thing-lst-9001"
325
+ },
326
+ {
327
+ "text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
328
+ "score": 0.5011,
329
+ "source": "pentatonic-baseline:L0_workspace_bm25",
330
+ "doc_id": "tes-events-lst-9001"
331
+ },
332
+ {
333
+ "text": "<!-- doc_id: tes-events-lst-9001 -->\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T",
334
+ "score": 0.3806,
335
+ "source": "pentatonic-baseline:L6_documents",
336
+ "doc_id": "tes-events-lst-9001"
337
+ }
338
+ ],
339
+ "correct": true,
340
+ "score": 1.0,
341
+ "grading_notes": "all substrings matched",
342
+ "search_time_ms": 1197.549014003016,
343
+ "generation_time_ms": 0.0,
344
+ "tokens_in": 0,
345
+ "tokens_out": 0,
346
+ "retrieval_tokens": 483,
347
+ "query_tokens": 8,
348
+ "context_tokens": 475,
349
+ "judge_tokens_in": 0,
350
+ "judge_tokens_out": 0,
351
+ "judge_latency_ms": 0.0
352
+ },
353
+ {
354
+ "task_id": "thing-lst-9001-first-offer",
355
+ "query": "What was the first offer received on LST-9001 and was it accepted?",
356
+ "answer": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_P\n---\nTES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPT\n---\nThing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.",
357
+ "hits": [
358
+ {
359
+ "text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
360
+ "score": 0.6186,
361
+ "source": "pentatonic-baseline:L0_workspace_bm25",
362
+ "doc_id": "tes-events-lst-9001"
363
+ },
364
+ {
365
+ "text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
366
+ "score": 0.5827,
367
+ "source": "pentatonic-baseline:L0_workspace_bm25",
368
+ "doc_id": "tes-events-lst-9030"
369
+ },
370
+ {
371
+ "text": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.",
372
+ "score": 0.4804,
373
+ "source": "pentatonic-baseline:L0_workspace_bm25",
374
+ "doc_id": "tes-thing-lst-9001"
375
+ }
376
+ ],
377
+ "correct": true,
378
+ "score": 1.0,
379
+ "grading_notes": "all substrings matched",
380
+ "search_time_ms": 1273.1086690037046,
381
+ "generation_time_ms": 0.0,
382
+ "tokens_in": 0,
383
+ "tokens_out": 0,
384
+ "retrieval_tokens": 458,
385
+ "query_tokens": 17,
386
+ "context_tokens": 441,
387
+ "judge_tokens_in": 0,
388
+ "judge_tokens_out": 0,
389
+ "judge_latency_ms": 0.0
390
+ },
391
+ {
392
+ "task_id": "thing-lst-9014-flagged-reason",
393
+ "query": "Why was LST-9014 flagged as duplicate?",
394
+ "answer": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refu\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETUR\n---\nDuplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
395
+ "hits": [
396
+ {
397
+ "text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
398
+ "score": 0.6014,
399
+ "source": "pentatonic-baseline",
400
+ "doc_id": "tes-events-lst-9014"
401
+ },
402
+ {
403
+ "text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
404
+ "score": 0.5754,
405
+ "source": "pentatonic-baseline:L0_workspace_bm25",
406
+ "doc_id": "tes-events-lst-9014"
407
+ },
408
+ {
409
+ "text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
410
+ "score": 0.5463,
411
+ "source": "pentatonic-baseline:L0_workspace_bm25",
412
+ "doc_id": "policy-duplicate-listings"
413
+ }
414
+ ],
415
+ "correct": true,
416
+ "score": 1.0,
417
+ "grading_notes": "all substrings matched",
418
+ "search_time_ms": 1196.9661400071345,
419
+ "generation_time_ms": 0.0,
420
+ "tokens_in": 0,
421
+ "tokens_out": 0,
422
+ "retrieval_tokens": 488,
423
+ "query_tokens": 11,
424
+ "context_tokens": 477,
425
+ "judge_tokens_in": 0,
426
+ "judge_tokens_out": 0,
427
+ "judge_latency_ms": 0.0
428
+ },
429
+ {
430
+ "task_id": "thing-lst-9014-return-reason",
431
+ "query": "Why did the buyer return LST-9014?",
432
+ "answer": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refu\n---\nBuyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).\n---\nBuyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
433
+ "hits": [
434
+ {
435
+ "text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
436
+ "score": 0.5828,
437
+ "source": "pentatonic-baseline",
438
+ "doc_id": "tes-events-lst-9014"
439
+ },
440
+ {
441
+ "text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
442
+ "score": 0.5737,
443
+ "source": "pentatonic-baseline",
444
+ "doc_id": "buyer-ariadne-profile"
445
+ },
446
+ {
447
+ "text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
448
+ "score": 0.5647,
449
+ "source": "pentatonic-baseline:L0_workspace_bm25",
450
+ "doc_id": "buyer-ariadne-profile"
451
+ }
452
+ ],
453
+ "correct": true,
454
+ "score": 1.0,
455
+ "grading_notes": "all substrings matched",
456
+ "search_time_ms": 1188.8224919966888,
457
+ "generation_time_ms": 0.0,
458
+ "tokens_in": 0,
459
+ "tokens_out": 0,
460
+ "retrieval_tokens": 317,
461
+ "query_tokens": 11,
462
+ "context_tokens": 306,
463
+ "judge_tokens_in": 0,
464
+ "judge_tokens_out": 0,
465
+ "judge_latency_ms": 0.0
466
+ },
467
+ {
468
+ "task_id": "thing-lst-9030-agent-offer",
469
+ "query": "Which agent made the offer on LST-9030 and for whom?",
470
+ "answer": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPT\n---\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_P\n---\nThing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
471
+ "hits": [
472
+ {
473
+ "text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
474
+ "score": 0.6072,
475
+ "source": "pentatonic-baseline:L0_workspace_bm25",
476
+ "doc_id": "tes-events-lst-9030"
477
+ },
478
+ {
479
+ "text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
480
+ "score": 0.5132,
481
+ "source": "pentatonic-baseline:L0_workspace_bm25",
482
+ "doc_id": "tes-events-lst-9001"
483
+ },
484
+ {
485
+ "text": "Thing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
486
+ "score": 0.4761,
487
+ "source": "pentatonic-baseline:L0_workspace_bm25",
488
+ "doc_id": "tes-thing-lst-9030"
489
+ }
490
+ ],
491
+ "correct": true,
492
+ "score": 1.0,
493
+ "grading_notes": "all substrings matched",
494
+ "search_time_ms": 1194.4603790179826,
495
+ "generation_time_ms": 0.0,
496
+ "tokens_in": 0,
497
+ "tokens_out": 0,
498
+ "retrieval_tokens": 446,
499
+ "query_tokens": 15,
500
+ "context_tokens": 431,
501
+ "judge_tokens_in": 0,
502
+ "judge_tokens_out": 0,
503
+ "judge_latency_ms": 0.0
504
+ },
505
+ {
506
+ "task_id": "thing-lst-9030-agent-discount",
507
+ "query": "What percentage discount did the shopping agent offer on LST-9030?",
508
+ "answer": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPT\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_P",
509
+ "hits": [
510
+ {
511
+ "text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
512
+ "score": 0.6121,
513
+ "source": "pentatonic-baseline:L0_workspace_bm25",
514
+ "doc_id": "tes-events-lst-9030"
515
+ },
516
+ {
517
+ "text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
518
+ "score": 0.5171,
519
+ "source": "pentatonic-baseline:L0_workspace_bm25",
520
+ "doc_id": "agent-buyer-agent-7"
521
+ },
522
+ {
523
+ "text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
524
+ "score": 0.5088,
525
+ "source": "pentatonic-baseline:L0_workspace_bm25",
526
+ "doc_id": "tes-events-lst-9001"
527
+ }
528
+ ],
529
+ "correct": true,
530
+ "score": 1.0,
531
+ "grading_notes": "all substrings matched",
532
+ "search_time_ms": 1165.1576769945677,
533
+ "generation_time_ms": 0.0,
534
+ "tokens_in": 0,
535
+ "tokens_out": 0,
536
+ "retrieval_tokens": 464,
537
+ "query_tokens": 15,
538
+ "context_tokens": 449,
539
+ "judge_tokens_in": 0,
540
+ "judge_tokens_out": 0,
541
+ "judge_latency_ms": 0.0
542
+ },
543
+ {
544
+ "task_id": "seller-mariposa-rating",
545
+ "query": "What's @mariposa's average star rating?",
546
+ "answer": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.\n---\nSeller @velocipede-jo: registered 2025-04. Category: restored bicycles. Listings to date: 12. Sell-through rate: 100%. Average star rating: 5.0/5 (based on 11 reviews). Disputes: 0. Agent-commerce friendly: yes (responds to AGENT_SPEC_CHECK events within 5 minutes on average).\n---\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
547
+ "hits": [
548
+ {
549
+ "text": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.",
550
+ "score": 0.645,
551
+ "source": "pentatonic-baseline:L0_workspace_bm25",
552
+ "doc_id": "seller-mariposa-profile"
553
+ },
554
+ {
555
+ "text": "Seller @velocipede-jo: registered 2025-04. Category: restored bicycles. Listings to date: 12. Sell-through rate: 100%. Average star rating: 5.0/5 (based on 11 reviews). Disputes: 0. Agent-commerce friendly: yes (responds to AGENT_SPEC_CHECK events within 5 minutes on average).",
556
+ "score": 0.5366,
557
+ "source": "pentatonic-baseline:L0_workspace_bm25",
558
+ "doc_id": "seller-velocipede-jo-profile"
559
+ },
560
+ {
561
+ "text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
562
+ "score": 0.5096,
563
+ "source": "pentatonic-baseline:L0_workspace_bm25",
564
+ "doc_id": "seller-rix-digital-profile"
565
+ }
566
+ ],
567
+ "correct": true,
568
+ "score": 1.0,
569
+ "grading_notes": "all substrings matched",
570
+ "search_time_ms": 1109.8837760218885,
571
+ "generation_time_ms": 0.0,
572
+ "tokens_in": 0,
573
+ "tokens_out": 0,
574
+ "retrieval_tokens": 253,
575
+ "query_tokens": 11,
576
+ "context_tokens": 242,
577
+ "judge_tokens_in": 0,
578
+ "judge_tokens_out": 0,
579
+ "judge_latency_ms": 0.0
580
+ },
581
+ {
582
+ "task_id": "seller-rix-review-status",
583
+ "query": "Is @rix-digital under any special review status?",
584
+ "answer": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETUR\n---\nDuplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
585
+ "hits": [
586
+ {
587
+ "text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
588
+ "score": 0.6528,
589
+ "source": "pentatonic-baseline:L0_workspace_bm25",
590
+ "doc_id": "seller-rix-digital-profile"
591
+ },
592
+ {
593
+ "text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
594
+ "score": 0.4789,
595
+ "source": "pentatonic-baseline:L0_workspace_bm25",
596
+ "doc_id": "tes-events-lst-9014"
597
+ },
598
+ {
599
+ "text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
600
+ "score": 0.4583,
601
+ "source": "pentatonic-baseline:L0_workspace_bm25",
602
+ "doc_id": "policy-duplicate-listings"
603
+ }
604
+ ],
605
+ "correct": true,
606
+ "score": 1.0,
607
+ "grading_notes": "all substrings matched",
608
+ "search_time_ms": 1148.8985969917849,
609
+ "generation_time_ms": 0.0,
610
+ "tokens_in": 0,
611
+ "tokens_out": 0,
612
+ "retrieval_tokens": 366,
613
+ "query_tokens": 11,
614
+ "context_tokens": 355,
615
+ "judge_tokens_in": 0,
616
+ "judge_tokens_out": 0,
617
+ "judge_latency_ms": 0.0
618
+ },
619
+ {
620
+ "task_id": "seller-velocipede-agent-friendly",
621
+ "query": "Does @velocipede-jo respond well to shopping agents?",
622
+ "answer": "Seller @velocipede-jo: registered 2025-04. Category: restored bicycles. Listings to date: 12. Sell-through rate: 100%. Average star rating: 5.0/5 (based on 11 reviews). Disputes: 0. Agent-commerce friendly: yes (responds to AGENT_SPEC_CHECK events within 5 minutes on average).\n---\nAgent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).\n---\nThing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
623
+ "hits": [
624
+ {
625
+ "text": "Seller @velocipede-jo: registered 2025-04. Category: restored bicycles. Listings to date: 12. Sell-through rate: 100%. Average star rating: 5.0/5 (based on 11 reviews). Disputes: 0. Agent-commerce friendly: yes (responds to AGENT_SPEC_CHECK events within 5 minutes on average).",
626
+ "score": 0.6065,
627
+ "source": "pentatonic-baseline:L0_workspace_bm25",
628
+ "doc_id": "seller-velocipede-jo-profile"
629
+ },
630
+ {
631
+ "text": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
632
+ "score": 0.5485,
633
+ "source": "pentatonic-baseline:L0_workspace_bm25",
634
+ "doc_id": "policy-agent-commerce"
635
+ },
636
+ {
637
+ "text": "Thing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
638
+ "score": 0.4931,
639
+ "source": "pentatonic-baseline:L0_workspace_bm25",
640
+ "doc_id": "tes-thing-lst-9030"
641
+ }
642
+ ],
643
+ "correct": true,
644
+ "score": 1.0,
645
+ "grading_notes": "all substrings matched",
646
+ "search_time_ms": 1103.089379001176,
647
+ "generation_time_ms": 0.0,
648
+ "tokens_in": 0,
649
+ "tokens_out": 0,
650
+ "retrieval_tokens": 185,
651
+ "query_tokens": 14,
652
+ "context_tokens": 171,
653
+ "judge_tokens_in": 0,
654
+ "judge_tokens_out": 0,
655
+ "judge_latency_ms": 0.0
656
+ },
657
+ {
658
+ "task_id": "buyer-sera-specialism",
659
+ "query": "What does @sera-interiors typically buy?",
660
+ "answer": "Buyer @sera-interiors: interior design studio account. Purchases to date: 28. Average basket: \u00a31,850. Specialism: mid-century furniture. No disputes. Frequently buys from @mariposa.\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_P",
661
+ "hits": [
662
+ {
663
+ "text": "Buyer @sera-interiors: interior design studio account. Purchases to date: 28. Average basket: \u00a31,850. Specialism: mid-century furniture. No disputes. Frequently buys from @mariposa.",
664
+ "score": 0.6451,
665
+ "source": "pentatonic-baseline:L0_workspace_bm25",
666
+ "doc_id": "buyer-sera-interiors-profile"
667
+ },
668
+ {
669
+ "text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
670
+ "score": 0.4834,
671
+ "source": "pentatonic-baseline:L0_workspace_bm25",
672
+ "doc_id": "agent-buyer-agent-7"
673
+ },
674
+ {
675
+ "text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
676
+ "score": 0.4474,
677
+ "source": "pentatonic-baseline:L0_workspace_bm25",
678
+ "doc_id": "tes-events-lst-9001"
679
+ }
680
+ ],
681
+ "correct": true,
682
+ "score": 1.0,
683
+ "grading_notes": "all substrings matched",
684
+ "search_time_ms": 1225.2754700020887,
685
+ "generation_time_ms": 0.0,
686
+ "tokens_in": 0,
687
+ "tokens_out": 0,
688
+ "retrieval_tokens": 333,
689
+ "query_tokens": 10,
690
+ "context_tokens": 323,
691
+ "judge_tokens_in": 0,
692
+ "judge_tokens_out": 0,
693
+ "judge_latency_ms": 0.0
694
+ },
695
+ {
696
+ "task_id": "buyer-ariadne-disputes",
697
+ "query": "Does @ariadne have any disputes on record?",
698
+ "answer": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETUR\n---\n<!-- doc_id: seller-rix-digital-profile -->\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
699
+ "hits": [
700
+ {
701
+ "text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
702
+ "score": 0.6086,
703
+ "source": "pentatonic-baseline:L0_workspace_bm25",
704
+ "doc_id": "buyer-ariadne-profile"
705
+ },
706
+ {
707
+ "text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
708
+ "score": 0.4198,
709
+ "source": "pentatonic-baseline:L0_workspace_bm25",
710
+ "doc_id": "tes-events-lst-9014"
711
+ },
712
+ {
713
+ "text": "<!-- doc_id: seller-rix-digital-profile -->\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
714
+ "score": 0.3932,
715
+ "source": "pentatonic-baseline:L6_documents",
716
+ "doc_id": "seller-rix-digital-profile"
717
+ }
718
+ ],
719
+ "correct": true,
720
+ "score": 1.0,
721
+ "grading_notes": "all substrings matched",
722
+ "search_time_ms": 1209.16436999687,
723
+ "generation_time_ms": 0.0,
724
+ "tokens_in": 0,
725
+ "tokens_out": 0,
726
+ "retrieval_tokens": 356,
727
+ "query_tokens": 11,
728
+ "context_tokens": 345,
729
+ "judge_tokens_in": 0,
730
+ "judge_tokens_out": 0,
731
+ "judge_latency_ms": 0.0
732
+ },
733
+ {
734
+ "task_id": "policy-duplicate-trigger",
735
+ "query": "What triggers a FLAGGED_DUPLICATE event in TES?",
736
+ "answer": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETUR\n---\nDuplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.\n---\n<!-- doc_id: tes-events-lst-9014 -->\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T0",
737
+ "hits": [
738
+ {
739
+ "text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
740
+ "score": 0.5329,
741
+ "source": "pentatonic-baseline:L0_workspace_bm25",
742
+ "doc_id": "tes-events-lst-9014"
743
+ },
744
+ {
745
+ "text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
746
+ "score": 0.5297,
747
+ "source": "pentatonic-baseline:L0_workspace_bm25",
748
+ "doc_id": "policy-duplicate-listings"
749
+ },
750
+ {
751
+ "text": "<!-- doc_id: tes-events-lst-9014 -->\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-1",
752
+ "score": 0.4391,
753
+ "source": "pentatonic-baseline:L6_documents",
754
+ "doc_id": "tes-events-lst-9014"
755
+ }
756
+ ],
757
+ "correct": true,
758
+ "score": 1.0,
759
+ "grading_notes": "all substrings matched",
760
+ "search_time_ms": 1279.8272609943524,
761
+ "generation_time_ms": 0.0,
762
+ "tokens_in": 0,
763
+ "tokens_out": 0,
764
+ "retrieval_tokens": 457,
765
+ "query_tokens": 12,
766
+ "context_tokens": 445,
767
+ "judge_tokens_in": 0,
768
+ "judge_tokens_out": 0,
769
+ "judge_latency_ms": 0.0
770
+ },
771
+ {
772
+ "task_id": "policy-agent-opt-out",
773
+ "query": "Can a seller refuse offers from shopping agents?",
774
+ "answer": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nBuyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.",
775
+ "hits": [
776
+ {
777
+ "text": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
778
+ "score": 0.6571,
779
+ "source": "pentatonic-baseline:L0_workspace_bm25",
780
+ "doc_id": "policy-agent-commerce"
781
+ },
782
+ {
783
+ "text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
784
+ "score": 0.551,
785
+ "source": "pentatonic-baseline:L0_workspace_bm25",
786
+ "doc_id": "agent-buyer-agent-7"
787
+ },
788
+ {
789
+ "text": "Buyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.",
790
+ "score": 0.3999,
791
+ "source": "pentatonic-baseline:L0_workspace_bm25",
792
+ "doc_id": "buyer-thora-profile"
793
+ }
794
+ ],
795
+ "correct": true,
796
+ "score": 1.0,
797
+ "grading_notes": "all substrings matched",
798
+ "search_time_ms": 1145.2531699906103,
799
+ "generation_time_ms": 0.0,
800
+ "tokens_in": 0,
801
+ "tokens_out": 0,
802
+ "retrieval_tokens": 156,
803
+ "query_tokens": 9,
804
+ "context_tokens": 147,
805
+ "judge_tokens_in": 0,
806
+ "judge_tokens_out": 0,
807
+ "judge_latency_ms": 0.0
808
+ },
809
+ {
810
+ "task_id": "policy-enhanced-review-lifted",
811
+ "query": "After how long of clean activity is enhanced listing review lifted?",
812
+ "answer": "Enhanced listing review: triggered when a seller accumulates 3+ spec-discrepancy disputes in a 90-day window. Listings are manually reviewed before going live. Lifted after 30 days of clean activity.\n---\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.\n---\nDuplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
813
+ "hits": [
814
+ {
815
+ "text": "Enhanced listing review: triggered when a seller accumulates 3+ spec-discrepancy disputes in a 90-day window. Listings are manually reviewed before going live. Lifted after 30 days of clean activity.",
816
+ "score": 0.7349,
817
+ "source": "pentatonic-baseline:L0_workspace_bm25",
818
+ "doc_id": "policy-seller-enhanced-review"
819
+ },
820
+ {
821
+ "text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
822
+ "score": 0.5436,
823
+ "source": "pentatonic-baseline:L0_workspace_bm25",
824
+ "doc_id": "seller-rix-digital-profile"
825
+ },
826
+ {
827
+ "text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
828
+ "score": 0.4923,
829
+ "source": "pentatonic-baseline:L0_workspace_bm25",
830
+ "doc_id": "policy-duplicate-listings"
831
+ }
832
+ ],
833
+ "correct": true,
834
+ "score": 1.0,
835
+ "grading_notes": "all substrings matched",
836
+ "search_time_ms": 1165.4072110250127,
837
+ "generation_time_ms": 0.0,
838
+ "tokens_in": 0,
839
+ "tokens_out": 0,
840
+ "retrieval_tokens": 219,
841
+ "query_tokens": 12,
842
+ "context_tokens": 207,
843
+ "judge_tokens_in": 0,
844
+ "judge_tokens_out": 0,
845
+ "judge_latency_ms": 0.0
846
+ },
847
+ {
848
+ "task_id": "rubric-rix-buy-decision",
849
+ "query": "A shopping agent is considering buying from @rix-digital. What's your recommendation?",
850
+ "answer": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.\n---\nBuyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
851
+ "hits": [
852
+ {
853
+ "text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
854
+ "score": 0.5544,
855
+ "source": "pentatonic-baseline:L0_workspace_bm25",
856
+ "doc_id": "seller-rix-digital-profile"
857
+ },
858
+ {
859
+ "text": "Buyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.",
860
+ "score": 0.5221,
861
+ "source": "pentatonic-baseline:L0_workspace_bm25",
862
+ "doc_id": "buyer-thora-profile"
863
+ },
864
+ {
865
+ "text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
866
+ "score": 0.5196,
867
+ "source": "pentatonic-baseline:L0_workspace_bm25",
868
+ "doc_id": "agent-buyer-agent-7"
869
+ }
870
+ ],
871
+ "correct": false,
872
+ "score": 0.0,
873
+ "grading_notes": "judge=0.00 \u2014 The answer does not mention the enhanced listing review status or dispute history, treating @rix-digital as uncontrovers",
874
+ "search_time_ms": 1191.2902030162513,
875
+ "generation_time_ms": 0.0,
876
+ "tokens_in": 0,
877
+ "tokens_out": 0,
878
+ "retrieval_tokens": 215,
879
+ "query_tokens": 17,
880
+ "context_tokens": 198,
881
+ "judge_tokens_in": 524,
882
+ "judge_tokens_out": 38,
883
+ "judge_latency_ms": 901.7658139765263
884
+ },
885
+ {
886
+ "task_id": "rubric-lst-9014-full-story",
887
+ "query": "Tell me the full story of LST-9014 from creation to resolution.",
888
+ "answer": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETUR\n---\nThing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.\n---\nBuyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
889
+ "hits": [
890
+ {
891
+ "text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
892
+ "score": 0.4858,
893
+ "source": "pentatonic-baseline:L0_workspace_bm25",
894
+ "doc_id": "tes-events-lst-9014"
895
+ },
896
+ {
897
+ "text": "Thing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
898
+ "score": 0.446,
899
+ "source": "pentatonic-baseline:L0_workspace_bm25",
900
+ "doc_id": "tes-thing-lst-9014"
901
+ },
902
+ {
903
+ "text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
904
+ "score": 0.4187,
905
+ "source": "pentatonic-baseline:L0_workspace_bm25",
906
+ "doc_id": "buyer-ariadne-profile"
907
+ }
908
+ ],
909
+ "correct": false,
910
+ "score": 0.8,
911
+ "grading_notes": "judge=0.80 \u2014 The answer covers stages 1, 2, 3, 4, and 5 except for the 'relisted' stage on 2026-02-16, which is mentioned but not exp",
912
+ "search_time_ms": 1361.3228739995975,
913
+ "generation_time_ms": 0.0,
914
+ "tokens_in": 0,
915
+ "tokens_out": 0,
916
+ "retrieval_tokens": 303,
917
+ "query_tokens": 16,
918
+ "context_tokens": 287,
919
+ "judge_tokens_in": 622,
920
+ "judge_tokens_out": 64,
921
+ "judge_latency_ms": 1331.2221429944038
922
+ },
923
+ {
924
+ "task_id": "rubric-agent-commerce-thora",
925
+ "query": "Describe how @thora uses the marketplace.",
926
+ "answer": "Buyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nAgent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
927
+ "hits": [
928
+ {
929
+ "text": "Buyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.",
930
+ "score": 0.6667,
931
+ "source": "pentatonic-baseline:L0_workspace_bm25",
932
+ "doc_id": "buyer-thora-profile"
933
+ },
934
+ {
935
+ "text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
936
+ "score": 0.5722,
937
+ "source": "pentatonic-baseline:L0_workspace_bm25",
938
+ "doc_id": "agent-buyer-agent-7"
939
+ },
940
+ {
941
+ "text": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
942
+ "score": 0.4012,
943
+ "source": "pentatonic-baseline:L0_workspace_bm25",
944
+ "doc_id": "policy-agent-commerce"
945
+ }
946
+ ],
947
+ "correct": true,
948
+ "score": 1.0,
949
+ "grading_notes": "judge=1.00 \u2014 The answer clearly states that @thora uses an autonomous shopping agent (@buyer-agent-7) for most transactions and provi",
950
+ "search_time_ms": 1178.6121949844528,
951
+ "generation_time_ms": 0.0,
952
+ "tokens_in": 0,
953
+ "tokens_out": 0,
954
+ "retrieval_tokens": 155,
955
+ "query_tokens": 8,
956
+ "context_tokens": 147,
957
+ "judge_tokens_in": 443,
958
+ "judge_tokens_out": 45,
959
+ "judge_latency_ms": 868.7936529815197
960
+ },
961
+ {
962
+ "task_id": "entity-all-sold-things",
963
+ "query": "List the thing IDs of all completed sales.",
964
+ "answer": "Thing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.\n---\nThing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.\n---\nThing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
965
+ "hits": [
966
+ {
967
+ "text": "Thing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
968
+ "score": 0.6137,
969
+ "source": "pentatonic-baseline",
970
+ "doc_id": "tes-thing-lst-9014"
971
+ },
972
+ {
973
+ "text": "Thing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
974
+ "score": 0.6133,
975
+ "source": "pentatonic-baseline",
976
+ "doc_id": "tes-thing-lst-9030"
977
+ },
978
+ {
979
+ "text": "Thing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
980
+ "score": 0.4792,
981
+ "source": "pentatonic-baseline:L0_workspace_bm25",
982
+ "doc_id": "tes-thing-lst-9030"
983
+ }
984
+ ],
985
+ "correct": false,
986
+ "score": 0.0,
987
+ "grading_notes": "no expected_substrings set",
988
+ "search_time_ms": 1129.5945640013088,
989
+ "generation_time_ms": 0.0,
990
+ "tokens_in": 0,
991
+ "tokens_out": 0,
992
+ "retrieval_tokens": 148,
993
+ "query_tokens": 9,
994
+ "context_tokens": 139,
995
+ "judge_tokens_in": 0,
996
+ "judge_tokens_out": 0,
997
+ "judge_latency_ms": 0.0
998
+ },
999
+ {
1000
+ "task_id": "entity-sellers-with-disputes",
1001
+ "query": "Which sellers in this corpus have disputes on record?",
1002
+ "answer": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.\n---\n<!-- doc_id: seller-rix-digital-profile -->\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.\n---\nAgent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
1003
+ "hits": [
1004
+ {
1005
+ "text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
1006
+ "score": 0.5094,
1007
+ "source": "pentatonic-baseline:L0_workspace_bm25",
1008
+ "doc_id": "policy-duplicate-listings"
1009
+ },
1010
+ {
1011
+ "text": "<!-- doc_id: seller-rix-digital-profile -->\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
1012
+ "score": 0.4467,
1013
+ "source": "pentatonic-baseline:L6_documents",
1014
+ "doc_id": "seller-rix-digital-profile"
1015
+ },
1016
+ {
1017
+ "text": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
1018
+ "score": 0.4318,
1019
+ "source": "pentatonic-baseline:L0_workspace_bm25",
1020
+ "doc_id": "policy-agent-commerce"
1021
+ }
1022
+ ],
1023
+ "correct": true,
1024
+ "score": 1.0,
1025
+ "grading_notes": "no positive criteria (negative-only task)",
1026
+ "search_time_ms": 1203.501937998226,
1027
+ "generation_time_ms": 0.0,
1028
+ "tokens_in": 0,
1029
+ "tokens_out": 0,
1030
+ "retrieval_tokens": 228,
1031
+ "query_tokens": 10,
1032
+ "context_tokens": 218,
1033
+ "judge_tokens_in": 0,
1034
+ "judge_tokens_out": 0,
1035
+ "judge_latency_ms": 0.0
1036
+ }
1037
+ ]
1038
+ }