@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +170 -69
- package/bin/__tests__/callback-server.test.js +4 -1
- package/bin/cli.js +41 -164
- package/bin/commands/config.js +251 -0
- package/package.json +2 -1
- package/packages/doctor/__tests__/detect.test.js +2 -6
- package/packages/doctor/src/checks/local-memory.js +164 -196
- package/packages/doctor/src/detect.js +11 -3
- package/packages/memory/src/corpus/adapters.js +104 -0
- package/packages/memory/src/corpus/cli.js +72 -7
- package/packages/memory/src/corpus/index.js +1 -1
- package/packages/memory-engine/.env.example +13 -0
- package/packages/memory-engine/README.md +131 -0
- package/packages/memory-engine/bench/README.md +99 -0
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
- package/packages/memory-engine/compat/Dockerfile +11 -0
- package/packages/memory-engine/compat/server.py +680 -0
- package/packages/memory-engine/docker-compose.yml +243 -0
- package/packages/memory-engine/docs/MIGRATION.md +178 -0
- package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
- package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
- package/packages/memory-engine/engine/README.md +52 -0
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
- package/packages/memory-engine/engine/l6-document-store.py +1018 -0
- package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
- package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
- package/packages/memory-engine/engine/services/l4/server.py +235 -0
- package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
- package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
- package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +129 -0
- package/packages/memory-engine/pme_memory/artifacts.py +95 -0
- package/packages/memory-engine/pme_memory/embed.py +74 -0
- package/packages/memory-engine/pme_memory/health.py +36 -0
- package/packages/memory-engine/pme_memory/hygiene.py +159 -0
- package/packages/memory-engine/pme_memory/indexer.py +200 -0
- package/packages/memory-engine/pme_memory/needs.py +55 -0
- package/packages/memory-engine/pme_memory/provenance.py +80 -0
- package/packages/memory-engine/pme_memory/scoring.py +168 -0
- package/packages/memory-engine/pme_memory/search.py +52 -0
- package/packages/memory-engine/pme_memory/store.py +86 -0
- package/packages/memory-engine/pme_memory/synthesis.py +114 -0
- package/packages/memory-engine/pyproject.toml +65 -0
- package/packages/memory-engine/scripts/kg-extractor.py +557 -0
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
- package/packages/memory-engine/tests/test_api_contract.sh +57 -0
|
@@ -0,0 +1,1038 @@
|
|
|
1
|
+
{
|
|
2
|
+
"bench": "marketplace-ops",
|
|
3
|
+
"stack": "pentatonic-memory",
|
|
4
|
+
"n_tasks": 20,
|
|
5
|
+
"n_correct": 16,
|
|
6
|
+
"accuracy": 0.8,
|
|
7
|
+
"mean_score": 0.865,
|
|
8
|
+
"p50_search_ms": 132.63059900782537,
|
|
9
|
+
"p95_search_ms": 172.13241270947037,
|
|
10
|
+
"total_tokens_in": 0,
|
|
11
|
+
"total_tokens_out": 0,
|
|
12
|
+
"total_usd": 0.0,
|
|
13
|
+
"by_tag": {
|
|
14
|
+
"factoid": {
|
|
15
|
+
"n": 12,
|
|
16
|
+
"mean_score": 1.0,
|
|
17
|
+
"accuracy": 1.0
|
|
18
|
+
},
|
|
19
|
+
"event-log": {
|
|
20
|
+
"n": 7,
|
|
21
|
+
"mean_score": 0.8285714285714285,
|
|
22
|
+
"accuracy": 0.7142857142857143
|
|
23
|
+
},
|
|
24
|
+
"multi-fact": {
|
|
25
|
+
"n": 1,
|
|
26
|
+
"mean_score": 1.0,
|
|
27
|
+
"accuracy": 1.0
|
|
28
|
+
},
|
|
29
|
+
"agent-commerce": {
|
|
30
|
+
"n": 6,
|
|
31
|
+
"mean_score": 0.9166666666666666,
|
|
32
|
+
"accuracy": 0.8333333333333334
|
|
33
|
+
},
|
|
34
|
+
"math": {
|
|
35
|
+
"n": 1,
|
|
36
|
+
"mean_score": 1.0,
|
|
37
|
+
"accuracy": 1.0
|
|
38
|
+
},
|
|
39
|
+
"seller": {
|
|
40
|
+
"n": 5,
|
|
41
|
+
"mean_score": 0.7,
|
|
42
|
+
"accuracy": 0.6
|
|
43
|
+
},
|
|
44
|
+
"buyer": {
|
|
45
|
+
"n": 3,
|
|
46
|
+
"mean_score": 1.0,
|
|
47
|
+
"accuracy": 1.0
|
|
48
|
+
},
|
|
49
|
+
"multi-doc": {
|
|
50
|
+
"n": 1,
|
|
51
|
+
"mean_score": 1.0,
|
|
52
|
+
"accuracy": 1.0
|
|
53
|
+
},
|
|
54
|
+
"policy": {
|
|
55
|
+
"n": 3,
|
|
56
|
+
"mean_score": 1.0,
|
|
57
|
+
"accuracy": 1.0
|
|
58
|
+
},
|
|
59
|
+
"rubric": {
|
|
60
|
+
"n": 3,
|
|
61
|
+
"mean_score": 0.7666666666666666,
|
|
62
|
+
"accuracy": 0.3333333333333333
|
|
63
|
+
},
|
|
64
|
+
"multi-hop": {
|
|
65
|
+
"n": 2,
|
|
66
|
+
"mean_score": 0.65,
|
|
67
|
+
"accuracy": 0.0
|
|
68
|
+
},
|
|
69
|
+
"entity": {
|
|
70
|
+
"n": 2,
|
|
71
|
+
"mean_score": 0.0,
|
|
72
|
+
"accuracy": 0.0
|
|
73
|
+
},
|
|
74
|
+
"negative": {
|
|
75
|
+
"n": 1,
|
|
76
|
+
"mean_score": 0.0,
|
|
77
|
+
"accuracy": 0.0
|
|
78
|
+
}
|
|
79
|
+
},
|
|
80
|
+
"extra": {
|
|
81
|
+
"ingest_ms": 5274.455192004098,
|
|
82
|
+
"grading": "substring",
|
|
83
|
+
"limit": 3,
|
|
84
|
+
"tokens": {
|
|
85
|
+
"corpus_tokens": 1388,
|
|
86
|
+
"query_tokens": 240,
|
|
87
|
+
"context_tokens": 6438,
|
|
88
|
+
"retrieval_tokens": 6678,
|
|
89
|
+
"naive_tokens": 28000,
|
|
90
|
+
"saved_tokens": 21322,
|
|
91
|
+
"reduction_pct": 0.7615,
|
|
92
|
+
"mean_retrieval_tokens_per_task": 333.9,
|
|
93
|
+
"tokenizer": "cl100k_base",
|
|
94
|
+
"per_task": {
|
|
95
|
+
"thing-lst-9001-sold-price": {
|
|
96
|
+
"query": 13,
|
|
97
|
+
"context": 503,
|
|
98
|
+
"retrieval": 516,
|
|
99
|
+
"judge_in": 0,
|
|
100
|
+
"judge_out": 0,
|
|
101
|
+
"judge_latency_ms": 0.0
|
|
102
|
+
},
|
|
103
|
+
"thing-lst-9001-buyer": {
|
|
104
|
+
"query": 8,
|
|
105
|
+
"context": 316,
|
|
106
|
+
"retrieval": 324,
|
|
107
|
+
"judge_in": 0,
|
|
108
|
+
"judge_out": 0,
|
|
109
|
+
"judge_latency_ms": 0.0
|
|
110
|
+
},
|
|
111
|
+
"thing-lst-9001-first-offer": {
|
|
112
|
+
"query": 17,
|
|
113
|
+
"context": 474,
|
|
114
|
+
"retrieval": 491,
|
|
115
|
+
"judge_in": 0,
|
|
116
|
+
"judge_out": 0,
|
|
117
|
+
"judge_latency_ms": 0.0
|
|
118
|
+
},
|
|
119
|
+
"thing-lst-9014-flagged-reason": {
|
|
120
|
+
"query": 11,
|
|
121
|
+
"context": 356,
|
|
122
|
+
"retrieval": 367,
|
|
123
|
+
"judge_in": 0,
|
|
124
|
+
"judge_out": 0,
|
|
125
|
+
"judge_latency_ms": 0.0
|
|
126
|
+
},
|
|
127
|
+
"thing-lst-9014-return-reason": {
|
|
128
|
+
"query": 11,
|
|
129
|
+
"context": 335,
|
|
130
|
+
"retrieval": 346,
|
|
131
|
+
"judge_in": 0,
|
|
132
|
+
"judge_out": 0,
|
|
133
|
+
"judge_latency_ms": 0.0
|
|
134
|
+
},
|
|
135
|
+
"thing-lst-9030-agent-offer": {
|
|
136
|
+
"query": 15,
|
|
137
|
+
"context": 310,
|
|
138
|
+
"retrieval": 325,
|
|
139
|
+
"judge_in": 0,
|
|
140
|
+
"judge_out": 0,
|
|
141
|
+
"judge_latency_ms": 0.0
|
|
142
|
+
},
|
|
143
|
+
"thing-lst-9030-agent-discount": {
|
|
144
|
+
"query": 15,
|
|
145
|
+
"context": 309,
|
|
146
|
+
"retrieval": 324,
|
|
147
|
+
"judge_in": 0,
|
|
148
|
+
"judge_out": 0,
|
|
149
|
+
"judge_latency_ms": 0.0
|
|
150
|
+
},
|
|
151
|
+
"seller-mariposa-rating": {
|
|
152
|
+
"query": 11,
|
|
153
|
+
"context": 209,
|
|
154
|
+
"retrieval": 220,
|
|
155
|
+
"judge_in": 0,
|
|
156
|
+
"judge_out": 0,
|
|
157
|
+
"judge_latency_ms": 0.0
|
|
158
|
+
},
|
|
159
|
+
"seller-rix-review-status": {
|
|
160
|
+
"query": 11,
|
|
161
|
+
"context": 388,
|
|
162
|
+
"retrieval": 399,
|
|
163
|
+
"judge_in": 0,
|
|
164
|
+
"judge_out": 0,
|
|
165
|
+
"judge_latency_ms": 0.0
|
|
166
|
+
},
|
|
167
|
+
"seller-velocipede-agent-friendly": {
|
|
168
|
+
"query": 14,
|
|
169
|
+
"context": 345,
|
|
170
|
+
"retrieval": 359,
|
|
171
|
+
"judge_in": 0,
|
|
172
|
+
"judge_out": 0,
|
|
173
|
+
"judge_latency_ms": 0.0
|
|
174
|
+
},
|
|
175
|
+
"buyer-sera-specialism": {
|
|
176
|
+
"query": 10,
|
|
177
|
+
"context": 329,
|
|
178
|
+
"retrieval": 339,
|
|
179
|
+
"judge_in": 0,
|
|
180
|
+
"judge_out": 0,
|
|
181
|
+
"judge_latency_ms": 0.0
|
|
182
|
+
},
|
|
183
|
+
"buyer-ariadne-disputes": {
|
|
184
|
+
"query": 11,
|
|
185
|
+
"context": 333,
|
|
186
|
+
"retrieval": 344,
|
|
187
|
+
"judge_in": 0,
|
|
188
|
+
"judge_out": 0,
|
|
189
|
+
"judge_latency_ms": 0.0
|
|
190
|
+
},
|
|
191
|
+
"policy-duplicate-trigger": {
|
|
192
|
+
"query": 12,
|
|
193
|
+
"context": 507,
|
|
194
|
+
"retrieval": 519,
|
|
195
|
+
"judge_in": 0,
|
|
196
|
+
"judge_out": 0,
|
|
197
|
+
"judge_latency_ms": 0.0
|
|
198
|
+
},
|
|
199
|
+
"policy-agent-opt-out": {
|
|
200
|
+
"query": 9,
|
|
201
|
+
"context": 310,
|
|
202
|
+
"retrieval": 319,
|
|
203
|
+
"judge_in": 0,
|
|
204
|
+
"judge_out": 0,
|
|
205
|
+
"judge_latency_ms": 0.0
|
|
206
|
+
},
|
|
207
|
+
"policy-enhanced-review-lifted": {
|
|
208
|
+
"query": 12,
|
|
209
|
+
"context": 207,
|
|
210
|
+
"retrieval": 219,
|
|
211
|
+
"judge_in": 0,
|
|
212
|
+
"judge_out": 0,
|
|
213
|
+
"judge_latency_ms": 0.0
|
|
214
|
+
},
|
|
215
|
+
"rubric-rix-buy-decision": {
|
|
216
|
+
"query": 17,
|
|
217
|
+
"context": 198,
|
|
218
|
+
"retrieval": 215,
|
|
219
|
+
"judge_in": 524,
|
|
220
|
+
"judge_out": 36,
|
|
221
|
+
"judge_latency_ms": 894.0554240047932
|
|
222
|
+
},
|
|
223
|
+
"rubric-lst-9014-full-story": {
|
|
224
|
+
"query": 16,
|
|
225
|
+
"context": 336,
|
|
226
|
+
"retrieval": 352,
|
|
227
|
+
"judge_in": 671,
|
|
228
|
+
"judge_out": 45,
|
|
229
|
+
"judge_latency_ms": 1039.1030340194702
|
|
230
|
+
},
|
|
231
|
+
"rubric-agent-commerce-thora": {
|
|
232
|
+
"query": 8,
|
|
233
|
+
"context": 302,
|
|
234
|
+
"retrieval": 310,
|
|
235
|
+
"judge_in": 599,
|
|
236
|
+
"judge_out": 39,
|
|
237
|
+
"judge_latency_ms": 791.2452349960804
|
|
238
|
+
},
|
|
239
|
+
"entity-all-sold-things": {
|
|
240
|
+
"query": 9,
|
|
241
|
+
"context": 164,
|
|
242
|
+
"retrieval": 173,
|
|
243
|
+
"judge_in": 0,
|
|
244
|
+
"judge_out": 0,
|
|
245
|
+
"judge_latency_ms": 0.0
|
|
246
|
+
},
|
|
247
|
+
"entity-sellers-with-disputes": {
|
|
248
|
+
"query": 10,
|
|
249
|
+
"context": 207,
|
|
250
|
+
"retrieval": 217,
|
|
251
|
+
"judge_in": 0,
|
|
252
|
+
"judge_out": 0,
|
|
253
|
+
"judge_latency_ms": 0.0
|
|
254
|
+
}
|
|
255
|
+
},
|
|
256
|
+
"judge_tokens_in": 1794,
|
|
257
|
+
"judge_tokens_out": 120,
|
|
258
|
+
"judge_calls": 3,
|
|
259
|
+
"judge_mean_latency_ms": 908.1345643401146
|
|
260
|
+
},
|
|
261
|
+
"cost_usd": {
|
|
262
|
+
"assumed_completion_tokens_per_task": 100,
|
|
263
|
+
"rates": {
|
|
264
|
+
"input_per_1k": 0.0025,
|
|
265
|
+
"output_per_1k": 0.01,
|
|
266
|
+
"model": "gpt-4o"
|
|
267
|
+
},
|
|
268
|
+
"retrieval_usd_in": 0.016695,
|
|
269
|
+
"retrieval_usd_out": 0.02,
|
|
270
|
+
"retrieval_usd_total": 0.036695000000000005,
|
|
271
|
+
"naive_usd_total": 0.09000000000000001,
|
|
272
|
+
"saved_usd": 0.053305000000000005,
|
|
273
|
+
"saved_usd_per_1k_tasks": 2.66525
|
|
274
|
+
}
|
|
275
|
+
},
|
|
276
|
+
"task_results": [
|
|
277
|
+
{
|
|
278
|
+
"task_id": "thing-lst-9001-sold-price",
|
|
279
|
+
"query": "What was the final sale price of LST-9001?",
|
|
280
|
+
"answer": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_POSITIVE 5/5\n---\nBuyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refund \u00a3780 issued\n2026-02-26T10:32 THING_RELISTED (accurate battery spec)",
|
|
281
|
+
"hits": [
|
|
282
|
+
{
|
|
283
|
+
"text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
|
|
284
|
+
"score": 0.42022624611854553,
|
|
285
|
+
"source": "pentatonic-memory",
|
|
286
|
+
"doc_id": "tes-events-lst-9001"
|
|
287
|
+
},
|
|
288
|
+
{
|
|
289
|
+
"text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
|
|
290
|
+
"score": 0.25040844082832336,
|
|
291
|
+
"source": "pentatonic-memory",
|
|
292
|
+
"doc_id": "buyer-ariadne-profile"
|
|
293
|
+
},
|
|
294
|
+
{
|
|
295
|
+
"text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
|
|
296
|
+
"score": 0.33980050683021545,
|
|
297
|
+
"source": "pentatonic-memory",
|
|
298
|
+
"doc_id": "tes-events-lst-9014"
|
|
299
|
+
}
|
|
300
|
+
],
|
|
301
|
+
"correct": true,
|
|
302
|
+
"score": 1.0,
|
|
303
|
+
"grading_notes": "all substrings matched",
|
|
304
|
+
"search_time_ms": 133.8921549904626,
|
|
305
|
+
"generation_time_ms": 0.0,
|
|
306
|
+
"tokens_in": 0,
|
|
307
|
+
"tokens_out": 0,
|
|
308
|
+
"retrieval_tokens": 516,
|
|
309
|
+
"query_tokens": 13,
|
|
310
|
+
"context_tokens": 503,
|
|
311
|
+
"judge_tokens_in": 0,
|
|
312
|
+
"judge_tokens_out": 0,
|
|
313
|
+
"judge_latency_ms": 0.0
|
|
314
|
+
},
|
|
315
|
+
{
|
|
316
|
+
"task_id": "thing-lst-9001-buyer",
|
|
317
|
+
"query": "Who bought LST-9001?",
|
|
318
|
+
"answer": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.\n---\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_POSITIVE 5/5\n---\nBuyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
|
|
319
|
+
"hits": [
|
|
320
|
+
{
|
|
321
|
+
"text": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.",
|
|
322
|
+
"score": 0.3285704255104065,
|
|
323
|
+
"source": "pentatonic-memory",
|
|
324
|
+
"doc_id": "tes-thing-lst-9001"
|
|
325
|
+
},
|
|
326
|
+
{
|
|
327
|
+
"text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
|
|
328
|
+
"score": 0.40953725576400757,
|
|
329
|
+
"source": "pentatonic-memory",
|
|
330
|
+
"doc_id": "tes-events-lst-9001"
|
|
331
|
+
},
|
|
332
|
+
{
|
|
333
|
+
"text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
|
|
334
|
+
"score": 0.25917935371398926,
|
|
335
|
+
"source": "pentatonic-memory",
|
|
336
|
+
"doc_id": "buyer-ariadne-profile"
|
|
337
|
+
}
|
|
338
|
+
],
|
|
339
|
+
"correct": true,
|
|
340
|
+
"score": 1.0,
|
|
341
|
+
"grading_notes": "all substrings matched",
|
|
342
|
+
"search_time_ms": 130.75137499254197,
|
|
343
|
+
"generation_time_ms": 0.0,
|
|
344
|
+
"tokens_in": 0,
|
|
345
|
+
"tokens_out": 0,
|
|
346
|
+
"retrieval_tokens": 324,
|
|
347
|
+
"query_tokens": 8,
|
|
348
|
+
"context_tokens": 316,
|
|
349
|
+
"judge_tokens_in": 0,
|
|
350
|
+
"judge_tokens_out": 0,
|
|
351
|
+
"judge_latency_ms": 0.0
|
|
352
|
+
},
|
|
353
|
+
{
|
|
354
|
+
"task_id": "thing-lst-9001-first-offer",
|
|
355
|
+
"query": "What was the first offer received on LST-9001 and was it accepted?",
|
|
356
|
+
"answer": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_POSITIVE 5/5\n---\nTES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPTED\n2026-03-06T10:00 PAYMENT_RECEIVED\n2026-03-07T14:20 DISPATCHED\n---\nThing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.",
|
|
357
|
+
"hits": [
|
|
358
|
+
{
|
|
359
|
+
"text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
|
|
360
|
+
"score": 0.41836875677108765,
|
|
361
|
+
"source": "pentatonic-memory",
|
|
362
|
+
"doc_id": "tes-events-lst-9001"
|
|
363
|
+
},
|
|
364
|
+
{
|
|
365
|
+
"text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
|
|
366
|
+
"score": 0.36205410957336426,
|
|
367
|
+
"source": "pentatonic-memory",
|
|
368
|
+
"doc_id": "tes-events-lst-9030"
|
|
369
|
+
},
|
|
370
|
+
{
|
|
371
|
+
"text": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.",
|
|
372
|
+
"score": 0.27437692880630493,
|
|
373
|
+
"source": "pentatonic-memory",
|
|
374
|
+
"doc_id": "tes-thing-lst-9001"
|
|
375
|
+
}
|
|
376
|
+
],
|
|
377
|
+
"correct": true,
|
|
378
|
+
"score": 1.0,
|
|
379
|
+
"grading_notes": "all substrings matched",
|
|
380
|
+
"search_time_ms": 131.4999090100173,
|
|
381
|
+
"generation_time_ms": 0.0,
|
|
382
|
+
"tokens_in": 0,
|
|
383
|
+
"tokens_out": 0,
|
|
384
|
+
"retrieval_tokens": 491,
|
|
385
|
+
"query_tokens": 17,
|
|
386
|
+
"context_tokens": 474,
|
|
387
|
+
"judge_tokens_in": 0,
|
|
388
|
+
"judge_tokens_out": 0,
|
|
389
|
+
"judge_latency_ms": 0.0
|
|
390
|
+
},
|
|
391
|
+
{
|
|
392
|
+
"task_id": "thing-lst-9014-flagged-reason",
|
|
393
|
+
"query": "Why was LST-9014 flagged as duplicate?",
|
|
394
|
+
"answer": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refund \u00a3780 issued\n2026-02-26T10:32 THING_RELISTED (accurate battery spec)\n---\nDuplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.\n---\nThing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
|
|
395
|
+
"hits": [
|
|
396
|
+
{
|
|
397
|
+
"text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
|
|
398
|
+
"score": 0.40653809905052185,
|
|
399
|
+
"source": "pentatonic-memory",
|
|
400
|
+
"doc_id": "tes-events-lst-9014"
|
|
401
|
+
},
|
|
402
|
+
{
|
|
403
|
+
"text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
|
|
404
|
+
"score": 0.3389016389846802,
|
|
405
|
+
"source": "pentatonic-memory",
|
|
406
|
+
"doc_id": "policy-duplicate-listings"
|
|
407
|
+
},
|
|
408
|
+
{
|
|
409
|
+
"text": "Thing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
|
|
410
|
+
"score": 0.2385099232196808,
|
|
411
|
+
"source": "pentatonic-memory",
|
|
412
|
+
"doc_id": "tes-thing-lst-9014"
|
|
413
|
+
}
|
|
414
|
+
],
|
|
415
|
+
"correct": true,
|
|
416
|
+
"score": 1.0,
|
|
417
|
+
"grading_notes": "all substrings matched",
|
|
418
|
+
"search_time_ms": 126.69649202143773,
|
|
419
|
+
"generation_time_ms": 0.0,
|
|
420
|
+
"tokens_in": 0,
|
|
421
|
+
"tokens_out": 0,
|
|
422
|
+
"retrieval_tokens": 367,
|
|
423
|
+
"query_tokens": 11,
|
|
424
|
+
"context_tokens": 356,
|
|
425
|
+
"judge_tokens_in": 0,
|
|
426
|
+
"judge_tokens_out": 0,
|
|
427
|
+
"judge_latency_ms": 0.0
|
|
428
|
+
},
|
|
429
|
+
{
|
|
430
|
+
"task_id": "thing-lst-9014-return-reason",
|
|
431
|
+
"query": "Why did the buyer return LST-9014?",
|
|
432
|
+
"answer": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refund \u00a3780 issued\n2026-02-26T10:32 THING_RELISTED (accurate battery spec)\n---\nThing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
|
|
433
|
+
"hits": [
|
|
434
|
+
{
|
|
435
|
+
"text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
|
|
436
|
+
"score": 0.384602814912796,
|
|
437
|
+
"source": "pentatonic-memory",
|
|
438
|
+
"doc_id": "buyer-ariadne-profile"
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
"text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
|
|
442
|
+
"score": 0.42786866426467896,
|
|
443
|
+
"source": "pentatonic-memory",
|
|
444
|
+
"doc_id": "tes-events-lst-9014"
|
|
445
|
+
},
|
|
446
|
+
{
|
|
447
|
+
"text": "Thing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
|
|
448
|
+
"score": 0.28044596314430237,
|
|
449
|
+
"source": "pentatonic-memory",
|
|
450
|
+
"doc_id": "tes-thing-lst-9014"
|
|
451
|
+
}
|
|
452
|
+
],
|
|
453
|
+
"correct": true,
|
|
454
|
+
"score": 1.0,
|
|
455
|
+
"grading_notes": "all substrings matched",
|
|
456
|
+
"search_time_ms": 130.40500399074517,
|
|
457
|
+
"generation_time_ms": 0.0,
|
|
458
|
+
"tokens_in": 0,
|
|
459
|
+
"tokens_out": 0,
|
|
460
|
+
"retrieval_tokens": 346,
|
|
461
|
+
"query_tokens": 11,
|
|
462
|
+
"context_tokens": 335,
|
|
463
|
+
"judge_tokens_in": 0,
|
|
464
|
+
"judge_tokens_out": 0,
|
|
465
|
+
"judge_latency_ms": 0.0
|
|
466
|
+
},
|
|
467
|
+
{
|
|
468
|
+
"task_id": "thing-lst-9030-agent-offer",
|
|
469
|
+
"query": "Which agent made the offer on LST-9030 and for whom?",
|
|
470
|
+
"answer": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPTED\n2026-03-06T10:00 PAYMENT_RECEIVED\n2026-03-07T14:20 DISPATCHED\n---\nThing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
471
|
+
"hits": [
|
|
472
|
+
{
|
|
473
|
+
"text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
|
|
474
|
+
"score": 0.42710399627685547,
|
|
475
|
+
"source": "pentatonic-memory",
|
|
476
|
+
"doc_id": "tes-events-lst-9030"
|
|
477
|
+
},
|
|
478
|
+
{
|
|
479
|
+
"text": "Thing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
|
|
480
|
+
"score": 0.2539333701133728,
|
|
481
|
+
"source": "pentatonic-memory",
|
|
482
|
+
"doc_id": "tes-thing-lst-9030"
|
|
483
|
+
},
|
|
484
|
+
{
|
|
485
|
+
"text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
486
|
+
"score": 0.30077192187309265,
|
|
487
|
+
"source": "pentatonic-memory",
|
|
488
|
+
"doc_id": "agent-buyer-agent-7"
|
|
489
|
+
}
|
|
490
|
+
],
|
|
491
|
+
"correct": true,
|
|
492
|
+
"score": 1.0,
|
|
493
|
+
"grading_notes": "all substrings matched",
|
|
494
|
+
"search_time_ms": 142.9224090243224,
|
|
495
|
+
"generation_time_ms": 0.0,
|
|
496
|
+
"tokens_in": 0,
|
|
497
|
+
"tokens_out": 0,
|
|
498
|
+
"retrieval_tokens": 325,
|
|
499
|
+
"query_tokens": 15,
|
|
500
|
+
"context_tokens": 310,
|
|
501
|
+
"judge_tokens_in": 0,
|
|
502
|
+
"judge_tokens_out": 0,
|
|
503
|
+
"judge_latency_ms": 0.0
|
|
504
|
+
},
|
|
505
|
+
{
|
|
506
|
+
"task_id": "thing-lst-9030-agent-discount",
|
|
507
|
+
"query": "What percentage discount did the shopping agent offer on LST-9030?",
|
|
508
|
+
"answer": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPTED\n2026-03-06T10:00 PAYMENT_RECEIVED\n2026-03-07T14:20 DISPATCHED\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nBuyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
|
|
509
|
+
"hits": [
|
|
510
|
+
{
|
|
511
|
+
"text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
|
|
512
|
+
"score": 0.44162556529045105,
|
|
513
|
+
"source": "pentatonic-memory",
|
|
514
|
+
"doc_id": "tes-events-lst-9030"
|
|
515
|
+
},
|
|
516
|
+
{
|
|
517
|
+
"text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
518
|
+
"score": 0.38962602615356445,
|
|
519
|
+
"source": "pentatonic-memory",
|
|
520
|
+
"doc_id": "agent-buyer-agent-7"
|
|
521
|
+
},
|
|
522
|
+
{
|
|
523
|
+
"text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
|
|
524
|
+
"score": 0.22289475798606873,
|
|
525
|
+
"source": "pentatonic-memory",
|
|
526
|
+
"doc_id": "buyer-ariadne-profile"
|
|
527
|
+
}
|
|
528
|
+
],
|
|
529
|
+
"correct": true,
|
|
530
|
+
"score": 1.0,
|
|
531
|
+
"grading_notes": "all substrings matched",
|
|
532
|
+
"search_time_ms": 140.9766800061334,
|
|
533
|
+
"generation_time_ms": 0.0,
|
|
534
|
+
"tokens_in": 0,
|
|
535
|
+
"tokens_out": 0,
|
|
536
|
+
"retrieval_tokens": 324,
|
|
537
|
+
"query_tokens": 15,
|
|
538
|
+
"context_tokens": 309,
|
|
539
|
+
"judge_tokens_in": 0,
|
|
540
|
+
"judge_tokens_out": 0,
|
|
541
|
+
"judge_latency_ms": 0.0
|
|
542
|
+
},
|
|
543
|
+
{
|
|
544
|
+
"task_id": "seller-mariposa-rating",
|
|
545
|
+
"query": "What's @mariposa's average star rating?",
|
|
546
|
+
"answer": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.\n---\nBuyer @sera-interiors: interior design studio account. Purchases to date: 28. Average basket: \u00a31,850. Specialism: mid-century furniture. No disputes. Frequently buys from @mariposa.\n---\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
|
|
547
|
+
"hits": [
|
|
548
|
+
{
|
|
549
|
+
"text": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.",
|
|
550
|
+
"score": 0.6198980212211609,
|
|
551
|
+
"source": "pentatonic-memory",
|
|
552
|
+
"doc_id": "seller-mariposa-profile"
|
|
553
|
+
},
|
|
554
|
+
{
|
|
555
|
+
"text": "Buyer @sera-interiors: interior design studio account. Purchases to date: 28. Average basket: \u00a31,850. Specialism: mid-century furniture. No disputes. Frequently buys from @mariposa.",
|
|
556
|
+
"score": 0.46923014521598816,
|
|
557
|
+
"source": "pentatonic-memory",
|
|
558
|
+
"doc_id": "buyer-sera-interiors-profile"
|
|
559
|
+
},
|
|
560
|
+
{
|
|
561
|
+
"text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
|
|
562
|
+
"score": 0.31547778844833374,
|
|
563
|
+
"source": "pentatonic-memory",
|
|
564
|
+
"doc_id": "seller-rix-digital-profile"
|
|
565
|
+
}
|
|
566
|
+
],
|
|
567
|
+
"correct": true,
|
|
568
|
+
"score": 1.0,
|
|
569
|
+
"grading_notes": "all substrings matched",
|
|
570
|
+
"search_time_ms": 162.25307201966643,
|
|
571
|
+
"generation_time_ms": 0.0,
|
|
572
|
+
"tokens_in": 0,
|
|
573
|
+
"tokens_out": 0,
|
|
574
|
+
"retrieval_tokens": 220,
|
|
575
|
+
"query_tokens": 11,
|
|
576
|
+
"context_tokens": 209,
|
|
577
|
+
"judge_tokens_in": 0,
|
|
578
|
+
"judge_tokens_out": 0,
|
|
579
|
+
"judge_latency_ms": 0.0
|
|
580
|
+
},
|
|
581
|
+
{
|
|
582
|
+
"task_id": "seller-rix-review-status",
|
|
583
|
+
"query": "Is @rix-digital under any special review status?",
|
|
584
|
+
"answer": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refund \u00a3780 issued\n2026-02-26T10:32 THING_RELISTED (accurate battery spec)\n---\nThing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
|
|
585
|
+
"hits": [
|
|
586
|
+
{
|
|
587
|
+
"text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
|
|
588
|
+
"score": 0.591421365737915,
|
|
589
|
+
"source": "pentatonic-memory",
|
|
590
|
+
"doc_id": "seller-rix-digital-profile"
|
|
591
|
+
},
|
|
592
|
+
{
|
|
593
|
+
"text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
|
|
594
|
+
"score": 0.3365422785282135,
|
|
595
|
+
"source": "pentatonic-memory",
|
|
596
|
+
"doc_id": "tes-events-lst-9014"
|
|
597
|
+
},
|
|
598
|
+
{
|
|
599
|
+
"text": "Thing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
|
|
600
|
+
"score": 0.30144694447517395,
|
|
601
|
+
"source": "pentatonic-memory",
|
|
602
|
+
"doc_id": "tes-thing-lst-9014"
|
|
603
|
+
}
|
|
604
|
+
],
|
|
605
|
+
"correct": true,
|
|
606
|
+
"score": 1.0,
|
|
607
|
+
"grading_notes": "all substrings matched",
|
|
608
|
+
"search_time_ms": 172.65237800893374,
|
|
609
|
+
"generation_time_ms": 0.0,
|
|
610
|
+
"tokens_in": 0,
|
|
611
|
+
"tokens_out": 0,
|
|
612
|
+
"retrieval_tokens": 399,
|
|
613
|
+
"query_tokens": 11,
|
|
614
|
+
"context_tokens": 388,
|
|
615
|
+
"judge_tokens_in": 0,
|
|
616
|
+
"judge_tokens_out": 0,
|
|
617
|
+
"judge_latency_ms": 0.0
|
|
618
|
+
},
|
|
619
|
+
{
|
|
620
|
+
"task_id": "seller-velocipede-agent-friendly",
|
|
621
|
+
"query": "Does @velocipede-jo respond well to shopping agents?",
|
|
622
|
+
"answer": "Seller @velocipede-jo: registered 2025-04. Category: restored bicycles. Listings to date: 12. Sell-through rate: 100%. Average star rating: 5.0/5 (based on 11 reviews). Disputes: 0. Agent-commerce friendly: yes (responds to AGENT_SPEC_CHECK events within 5 minutes on average).\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nTES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPTED\n2026-03-06T10:00 PAYMENT_RECEIVED\n2026-03-07T14:20 DISPATCHED",
|
|
623
|
+
"hits": [
|
|
624
|
+
{
|
|
625
|
+
"text": "Seller @velocipede-jo: registered 2025-04. Category: restored bicycles. Listings to date: 12. Sell-through rate: 100%. Average star rating: 5.0/5 (based on 11 reviews). Disputes: 0. Agent-commerce friendly: yes (responds to AGENT_SPEC_CHECK events within 5 minutes on average).",
|
|
626
|
+
"score": 0.6114906072616577,
|
|
627
|
+
"source": "pentatonic-memory",
|
|
628
|
+
"doc_id": "seller-velocipede-jo-profile"
|
|
629
|
+
},
|
|
630
|
+
{
|
|
631
|
+
"text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
632
|
+
"score": 0.44196510314941406,
|
|
633
|
+
"source": "pentatonic-memory",
|
|
634
|
+
"doc_id": "agent-buyer-agent-7"
|
|
635
|
+
},
|
|
636
|
+
{
|
|
637
|
+
"text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
|
|
638
|
+
"score": 0.37640511989593506,
|
|
639
|
+
"source": "pentatonic-memory",
|
|
640
|
+
"doc_id": "tes-events-lst-9030"
|
|
641
|
+
}
|
|
642
|
+
],
|
|
643
|
+
"correct": true,
|
|
644
|
+
"score": 1.0,
|
|
645
|
+
"grading_notes": "all substrings matched",
|
|
646
|
+
"search_time_ms": 131.25643599778414,
|
|
647
|
+
"generation_time_ms": 0.0,
|
|
648
|
+
"tokens_in": 0,
|
|
649
|
+
"tokens_out": 0,
|
|
650
|
+
"retrieval_tokens": 359,
|
|
651
|
+
"query_tokens": 14,
|
|
652
|
+
"context_tokens": 345,
|
|
653
|
+
"judge_tokens_in": 0,
|
|
654
|
+
"judge_tokens_out": 0,
|
|
655
|
+
"judge_latency_ms": 0.0
|
|
656
|
+
},
|
|
657
|
+
{
|
|
658
|
+
"task_id": "buyer-sera-specialism",
|
|
659
|
+
"query": "What does @sera-interiors typically buy?",
|
|
660
|
+
"answer": "Buyer @sera-interiors: interior design studio account. Purchases to date: 28. Average basket: \u00a31,850. Specialism: mid-century furniture. No disputes. Frequently buys from @mariposa.\n---\nTES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords, tracking PKF-88291)\n2026-02-16T15:20 DELIVERED to @sera-interiors\n2026-02-18T10:00 BUYER_FEEDBACK_POSITIVE 5/5\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
661
|
+
"hits": [
|
|
662
|
+
{
|
|
663
|
+
"text": "Buyer @sera-interiors: interior design studio account. Purchases to date: 28. Average basket: \u00a31,850. Specialism: mid-century furniture. No disputes. Frequently buys from @mariposa.",
|
|
664
|
+
"score": 0.6819412112236023,
|
|
665
|
+
"source": "pentatonic-memory",
|
|
666
|
+
"doc_id": "buyer-sera-interiors-profile"
|
|
667
|
+
},
|
|
668
|
+
{
|
|
669
|
+
"text": "TES event log for LST-9001:\n2026-02-08T10:00 CREATED by @mariposa\n2026-02-08T10:02 LISTED (price \u00a32,400, marketplace UK)\n2026-02-10T14:30 OFFER_RECEIVED \u00a32,000 from @hendrik\n2026-02-10T16:12 OFFER_DECLINED by @mariposa\n2026-02-12T09:45 OFFER_RECEIVED \u00a32,250 from @sera-interiors\n2026-02-12T11:00 OFFER_ACCEPTED\n2026-02-13T08:30 PAYMENT_RECEIVED \u00a32,250\n2026-02-14T12:00 DISPATCHED (courier: Pickfords,",
|
|
670
|
+
"score": 0.2755875587463379,
|
|
671
|
+
"source": "pentatonic-memory",
|
|
672
|
+
"doc_id": "tes-events-lst-9001"
|
|
673
|
+
},
|
|
674
|
+
{
|
|
675
|
+
"text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
676
|
+
"score": 0.30571281909942627,
|
|
677
|
+
"source": "pentatonic-memory",
|
|
678
|
+
"doc_id": "agent-buyer-agent-7"
|
|
679
|
+
}
|
|
680
|
+
],
|
|
681
|
+
"correct": true,
|
|
682
|
+
"score": 1.0,
|
|
683
|
+
"grading_notes": "all substrings matched",
|
|
684
|
+
"search_time_ms": 129.76375001017004,
|
|
685
|
+
"generation_time_ms": 0.0,
|
|
686
|
+
"tokens_in": 0,
|
|
687
|
+
"tokens_out": 0,
|
|
688
|
+
"retrieval_tokens": 339,
|
|
689
|
+
"query_tokens": 10,
|
|
690
|
+
"context_tokens": 329,
|
|
691
|
+
"judge_tokens_in": 0,
|
|
692
|
+
"judge_tokens_out": 0,
|
|
693
|
+
"judge_latency_ms": 0.0
|
|
694
|
+
},
|
|
695
|
+
{
|
|
696
|
+
"task_id": "buyer-ariadne-disputes",
|
|
697
|
+
"query": "Does @ariadne have any disputes on record?",
|
|
698
|
+
"answer": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refund \u00a3780 issued\n2026-02-26T10:32 THING_RELISTED (accurate battery spec)\n---\nBuyer @sera-interiors: interior design studio account. Purchases to date: 28. Average basket: \u00a31,850. Specialism: mid-century furniture. No disputes. Frequently buys from @mariposa.",
|
|
699
|
+
"hits": [
|
|
700
|
+
{
|
|
701
|
+
"text": "Buyer @ariadne: consumer account. Purchases to date: 14. Average basket: \u00a3410. One open return (LST-9014 battery-health dispute, closed in buyer's favour).",
|
|
702
|
+
"score": 0.5878862738609314,
|
|
703
|
+
"source": "pentatonic-memory",
|
|
704
|
+
"doc_id": "buyer-ariadne-profile"
|
|
705
|
+
},
|
|
706
|
+
{
|
|
707
|
+
"text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
|
|
708
|
+
"score": 0.35119953751564026,
|
|
709
|
+
"source": "pentatonic-memory",
|
|
710
|
+
"doc_id": "tes-events-lst-9014"
|
|
711
|
+
},
|
|
712
|
+
{
|
|
713
|
+
"text": "Buyer @sera-interiors: interior design studio account. Purchases to date: 28. Average basket: \u00a31,850. Specialism: mid-century furniture. No disputes. Frequently buys from @mariposa.",
|
|
714
|
+
"score": 0.3721349239349365,
|
|
715
|
+
"source": "pentatonic-memory",
|
|
716
|
+
"doc_id": "buyer-sera-interiors-profile"
|
|
717
|
+
}
|
|
718
|
+
],
|
|
719
|
+
"correct": true,
|
|
720
|
+
"score": 1.0,
|
|
721
|
+
"grading_notes": "all substrings matched",
|
|
722
|
+
"search_time_ms": 128.8983509875834,
|
|
723
|
+
"generation_time_ms": 0.0,
|
|
724
|
+
"tokens_in": 0,
|
|
725
|
+
"tokens_out": 0,
|
|
726
|
+
"retrieval_tokens": 344,
|
|
727
|
+
"query_tokens": 11,
|
|
728
|
+
"context_tokens": 333,
|
|
729
|
+
"judge_tokens_in": 0,
|
|
730
|
+
"judge_tokens_out": 0,
|
|
731
|
+
"judge_latency_ms": 0.0
|
|
732
|
+
},
|
|
733
|
+
{
|
|
734
|
+
"task_id": "policy-duplicate-trigger",
|
|
735
|
+
"query": "What triggers a FLAGGED_DUPLICATE event in TES?",
|
|
736
|
+
"answer": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refund \u00a3780 issued\n2026-02-26T10:32 THING_RELISTED (accurate battery spec)\n---\nTES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPTED\n2026-03-06T10:00 PAYMENT_RECEIVED\n2026-03-07T14:20 DISPATCHED",
|
|
737
|
+
"hits": [
|
|
738
|
+
{
|
|
739
|
+
"text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
|
|
740
|
+
"score": 0.41180965304374695,
|
|
741
|
+
"source": "pentatonic-memory",
|
|
742
|
+
"doc_id": "policy-duplicate-listings"
|
|
743
|
+
},
|
|
744
|
+
{
|
|
745
|
+
"text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
|
|
746
|
+
"score": 0.4230816662311554,
|
|
747
|
+
"source": "pentatonic-memory",
|
|
748
|
+
"doc_id": "tes-events-lst-9014"
|
|
749
|
+
},
|
|
750
|
+
{
|
|
751
|
+
"text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
|
|
752
|
+
"score": 0.27056431770324707,
|
|
753
|
+
"source": "pentatonic-memory",
|
|
754
|
+
"doc_id": "tes-events-lst-9030"
|
|
755
|
+
}
|
|
756
|
+
],
|
|
757
|
+
"correct": true,
|
|
758
|
+
"score": 1.0,
|
|
759
|
+
"grading_notes": "all substrings matched",
|
|
760
|
+
"search_time_ms": 133.76128900563344,
|
|
761
|
+
"generation_time_ms": 0.0,
|
|
762
|
+
"tokens_in": 0,
|
|
763
|
+
"tokens_out": 0,
|
|
764
|
+
"retrieval_tokens": 519,
|
|
765
|
+
"query_tokens": 12,
|
|
766
|
+
"context_tokens": 507,
|
|
767
|
+
"judge_tokens_in": 0,
|
|
768
|
+
"judge_tokens_out": 0,
|
|
769
|
+
"judge_latency_ms": 0.0
|
|
770
|
+
},
|
|
771
|
+
{
|
|
772
|
+
"task_id": "policy-agent-opt-out",
|
|
773
|
+
"query": "Can a seller refuse offers from shopping agents?",
|
|
774
|
+
"answer": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nTES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPTED\n2026-03-06T10:00 PAYMENT_RECEIVED\n2026-03-07T14:20 DISPATCHED",
|
|
775
|
+
"hits": [
|
|
776
|
+
{
|
|
777
|
+
"text": "Agent-commerce policy: autonomous shopping agents must register with the marketplace and declare principal human(s). Agent offers follow the same event flow as human offers. Sellers may opt out of agent-commerce at listing creation (AGENT_OPT_OUT flag).",
|
|
778
|
+
"score": 0.5487691164016724,
|
|
779
|
+
"source": "pentatonic-memory",
|
|
780
|
+
"doc_id": "policy-agent-commerce"
|
|
781
|
+
},
|
|
782
|
+
{
|
|
783
|
+
"text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
784
|
+
"score": 0.43056946992874146,
|
|
785
|
+
"source": "pentatonic-memory",
|
|
786
|
+
"doc_id": "agent-buyer-agent-7"
|
|
787
|
+
},
|
|
788
|
+
{
|
|
789
|
+
"text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
|
|
790
|
+
"score": 0.29238182306289673,
|
|
791
|
+
"source": "pentatonic-memory",
|
|
792
|
+
"doc_id": "tes-events-lst-9030"
|
|
793
|
+
}
|
|
794
|
+
],
|
|
795
|
+
"correct": true,
|
|
796
|
+
"score": 1.0,
|
|
797
|
+
"grading_notes": "all substrings matched",
|
|
798
|
+
"search_time_ms": 129.24622499849647,
|
|
799
|
+
"generation_time_ms": 0.0,
|
|
800
|
+
"tokens_in": 0,
|
|
801
|
+
"tokens_out": 0,
|
|
802
|
+
"retrieval_tokens": 319,
|
|
803
|
+
"query_tokens": 9,
|
|
804
|
+
"context_tokens": 310,
|
|
805
|
+
"judge_tokens_in": 0,
|
|
806
|
+
"judge_tokens_out": 0,
|
|
807
|
+
"judge_latency_ms": 0.0
|
|
808
|
+
},
|
|
809
|
+
{
|
|
810
|
+
"task_id": "policy-enhanced-review-lifted",
|
|
811
|
+
"query": "After how long of clean activity is enhanced listing review lifted?",
|
|
812
|
+
"answer": "Enhanced listing review: triggered when a seller accumulates 3+ spec-discrepancy disputes in a 90-day window. Listings are manually reviewed before going live. Lifted after 30 days of clean activity.\n---\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.\n---\nDuplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
|
|
813
|
+
"hits": [
|
|
814
|
+
{
|
|
815
|
+
"text": "Enhanced listing review: triggered when a seller accumulates 3+ spec-discrepancy disputes in a 90-day window. Listings are manually reviewed before going live. Lifted after 30 days of clean activity.",
|
|
816
|
+
"score": 0.7121304869651794,
|
|
817
|
+
"source": "pentatonic-memory",
|
|
818
|
+
"doc_id": "policy-seller-enhanced-review"
|
|
819
|
+
},
|
|
820
|
+
{
|
|
821
|
+
"text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
|
|
822
|
+
"score": 0.377176433801651,
|
|
823
|
+
"source": "pentatonic-memory",
|
|
824
|
+
"doc_id": "seller-rix-digital-profile"
|
|
825
|
+
},
|
|
826
|
+
{
|
|
827
|
+
"text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
|
|
828
|
+
"score": 0.35480085015296936,
|
|
829
|
+
"source": "pentatonic-memory",
|
|
830
|
+
"doc_id": "policy-duplicate-listings"
|
|
831
|
+
}
|
|
832
|
+
],
|
|
833
|
+
"correct": true,
|
|
834
|
+
"score": 1.0,
|
|
835
|
+
"grading_notes": "all substrings matched",
|
|
836
|
+
"search_time_ms": 141.89460698980838,
|
|
837
|
+
"generation_time_ms": 0.0,
|
|
838
|
+
"tokens_in": 0,
|
|
839
|
+
"tokens_out": 0,
|
|
840
|
+
"retrieval_tokens": 219,
|
|
841
|
+
"query_tokens": 12,
|
|
842
|
+
"context_tokens": 207,
|
|
843
|
+
"judge_tokens_in": 0,
|
|
844
|
+
"judge_tokens_out": 0,
|
|
845
|
+
"judge_latency_ms": 0.0
|
|
846
|
+
},
|
|
847
|
+
{
|
|
848
|
+
"task_id": "rubric-rix-buy-decision",
|
|
849
|
+
"query": "A shopping agent is considering buying from @rix-digital. What's your recommendation?",
|
|
850
|
+
"answer": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.\n---\nBuyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.",
|
|
851
|
+
"hits": [
|
|
852
|
+
{
|
|
853
|
+
"text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
854
|
+
"score": 0.3959510922431946,
|
|
855
|
+
"source": "pentatonic-memory",
|
|
856
|
+
"doc_id": "agent-buyer-agent-7"
|
|
857
|
+
},
|
|
858
|
+
{
|
|
859
|
+
"text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
|
|
860
|
+
"score": 0.5673623085021973,
|
|
861
|
+
"source": "pentatonic-memory",
|
|
862
|
+
"doc_id": "seller-rix-digital-profile"
|
|
863
|
+
},
|
|
864
|
+
{
|
|
865
|
+
"text": "Buyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.",
|
|
866
|
+
"score": 0.4040350317955017,
|
|
867
|
+
"source": "pentatonic-memory",
|
|
868
|
+
"doc_id": "buyer-thora-profile"
|
|
869
|
+
}
|
|
870
|
+
],
|
|
871
|
+
"correct": false,
|
|
872
|
+
"score": 0.5,
|
|
873
|
+
"grading_notes": "judge=0.50 \u2014 The answer mentions the enhanced listing review status of @rix-digital but does not explicitly mention its dispute histo",
|
|
874
|
+
"search_time_ms": 143.1204739783425,
|
|
875
|
+
"generation_time_ms": 0.0,
|
|
876
|
+
"tokens_in": 0,
|
|
877
|
+
"tokens_out": 0,
|
|
878
|
+
"retrieval_tokens": 215,
|
|
879
|
+
"query_tokens": 17,
|
|
880
|
+
"context_tokens": 198,
|
|
881
|
+
"judge_tokens_in": 524,
|
|
882
|
+
"judge_tokens_out": 36,
|
|
883
|
+
"judge_latency_ms": 894.0554240047932
|
|
884
|
+
},
|
|
885
|
+
{
|
|
886
|
+
"task_id": "rubric-lst-9014-full-story",
|
|
887
|
+
"query": "Tell me the full story of LST-9014 from creation to resolution.",
|
|
888
|
+
"answer": "Thing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.\n---\nTES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOLD for \u00a3780 to @ariadne\n2026-02-20T14:00 DISPATCHED\n2026-02-22T09:45 DELIVERED\n2026-02-24T16:00 RETURN_REQUEST (buyer claims battery health lower than advertised)\n2026-02-26T10:30 RETURN_APPROVED, refund \u00a3780 issued\n2026-02-26T10:32 THING_RELISTED (accurate battery spec)\n---\nThing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
|
|
889
|
+
"hits": [
|
|
890
|
+
{
|
|
891
|
+
"text": "Thing LST-9014: second-hand MacBook Air M2 13\" 256GB (condition: very good, battery cycle count 112). Created in TES 2026-02-14 by seller @rix-digital.",
|
|
892
|
+
"score": 0.25764524936676025,
|
|
893
|
+
"source": "pentatonic-memory",
|
|
894
|
+
"doc_id": "tes-thing-lst-9014"
|
|
895
|
+
},
|
|
896
|
+
{
|
|
897
|
+
"text": "TES event log for LST-9014:\n2026-02-14T09:20 CREATED by @rix-digital\n2026-02-14T09:25 LISTED (price \u00a3780)\n2026-02-15T12:00 FLAGGED_DUPLICATE \u2014 matches LST-9014-DUP (different seller, same serial number detected via device fingerprint)\n2026-02-15T13:30 LISTING_HELD pending review\n2026-02-16T10:00 REVIEW_CLEARED (original seller confirmed; dup delisted)\n2026-02-16T10:05 RELISTED\n2026-02-19T11:18 SOL",
|
|
898
|
+
"score": 0.40446287393569946,
|
|
899
|
+
"source": "pentatonic-memory",
|
|
900
|
+
"doc_id": "tes-events-lst-9014"
|
|
901
|
+
},
|
|
902
|
+
{
|
|
903
|
+
"text": "Thing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
|
|
904
|
+
"score": 0.30556970834732056,
|
|
905
|
+
"source": "pentatonic-memory",
|
|
906
|
+
"doc_id": "tes-thing-lst-9030"
|
|
907
|
+
}
|
|
908
|
+
],
|
|
909
|
+
"correct": false,
|
|
910
|
+
"score": 0.8,
|
|
911
|
+
"grading_notes": "judge=0.80 \u2014 The answer covers stages 1, 2, 3, 4, and 5 except for the 'SOLD' stage which is missing.",
|
|
912
|
+
"search_time_ms": 140.61630898504518,
|
|
913
|
+
"generation_time_ms": 0.0,
|
|
914
|
+
"tokens_in": 0,
|
|
915
|
+
"tokens_out": 0,
|
|
916
|
+
"retrieval_tokens": 352,
|
|
917
|
+
"query_tokens": 16,
|
|
918
|
+
"context_tokens": 336,
|
|
919
|
+
"judge_tokens_in": 671,
|
|
920
|
+
"judge_tokens_out": 45,
|
|
921
|
+
"judge_latency_ms": 1039.1030340194702
|
|
922
|
+
},
|
|
923
|
+
{
|
|
924
|
+
"task_id": "rubric-agent-commerce-thora",
|
|
925
|
+
"query": "Describe how @thora uses the marketplace.",
|
|
926
|
+
"answer": "Buyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.\n---\nShopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.\n---\nTES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-05T11:30 OFFER_RECEIVED \u00a3380 from @buyer-agent-7 (on behalf of @thora)\n2026-03-05T11:45 OFFER_ACCEPTED\n2026-03-06T10:00 PAYMENT_RECEIVED\n2026-03-07T14:20 DISPATCHED",
|
|
927
|
+
"hits": [
|
|
928
|
+
{
|
|
929
|
+
"text": "Buyer @thora: consumer account. Purchases to date: 6. Uses an autonomous shopping agent (@buyer-agent-7) for most transactions. Average basket: \u00a3510.",
|
|
930
|
+
"score": 0.6050072312355042,
|
|
931
|
+
"source": "pentatonic-memory",
|
|
932
|
+
"doc_id": "buyer-thora-profile"
|
|
933
|
+
},
|
|
934
|
+
{
|
|
935
|
+
"text": "Shopping agent @buyer-agent-7: autonomous agent operating on behalf of @thora (and 4 other principals). Runs AGENT_SPEC_CHECK events before offering, typically bids 90-92% of asking price. Responds to counter-offers within 30s. Registered with marketplace 2025-11.",
|
|
936
|
+
"score": 0.4905954897403717,
|
|
937
|
+
"source": "pentatonic-memory",
|
|
938
|
+
"doc_id": "agent-buyer-agent-7"
|
|
939
|
+
},
|
|
940
|
+
{
|
|
941
|
+
"text": "TES event log for LST-9030:\n2026-03-03T08:00 CREATED\n2026-03-03T08:05 LISTED (price \u00a3420)\n2026-03-05T11:20 AGENT_INQUIRY from @buyer-agent-7 (autonomous shopping agent)\n2026-03-05T11:22 AGENT_SPEC_CHECK (agent requested frame size, gear count, original vs restored)\n2026-03-05T11:23 AGENT_SPEC_RESPONSE (frame 23\", 10-speed, restored \u2014 new cables, tyres, bar tape; original frame and wheels)\n2026-03-",
|
|
942
|
+
"score": 0.3642008602619171,
|
|
943
|
+
"source": "pentatonic-memory",
|
|
944
|
+
"doc_id": "tes-events-lst-9030"
|
|
945
|
+
}
|
|
946
|
+
],
|
|
947
|
+
"correct": true,
|
|
948
|
+
"score": 1.0,
|
|
949
|
+
"grading_notes": "judge=1.00 \u2014 The agent's usage is clearly stated in the shopping agent's description and supported by TES event log entries showing a",
|
|
950
|
+
"search_time_ms": 142.02617600676604,
|
|
951
|
+
"generation_time_ms": 0.0,
|
|
952
|
+
"tokens_in": 0,
|
|
953
|
+
"tokens_out": 0,
|
|
954
|
+
"retrieval_tokens": 310,
|
|
955
|
+
"query_tokens": 8,
|
|
956
|
+
"context_tokens": 302,
|
|
957
|
+
"judge_tokens_in": 599,
|
|
958
|
+
"judge_tokens_out": 39,
|
|
959
|
+
"judge_latency_ms": 791.2452349960804
|
|
960
|
+
},
|
|
961
|
+
{
|
|
962
|
+
"task_id": "entity-all-sold-things",
|
|
963
|
+
"query": "List the thing IDs of all completed sales.",
|
|
964
|
+
"answer": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.\n---\nDuplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.\n---\nThing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
|
|
965
|
+
"hits": [
|
|
966
|
+
{
|
|
967
|
+
"text": "Thing LST-9001: vintage Eames lounge chair (1970s, walnut, rosewood stain, minor wear on armrests). Created in TES 2026-02-08 by seller @mariposa. Origin: private estate clearance.",
|
|
968
|
+
"score": 0.4182210862636566,
|
|
969
|
+
"source": "pentatonic-memory",
|
|
970
|
+
"doc_id": "tes-thing-lst-9001"
|
|
971
|
+
},
|
|
972
|
+
{
|
|
973
|
+
"text": "Duplicate-listing policy: two listings with matching device fingerprints (serial numbers, IMEI, chassis VIN) trigger an automatic FLAGGED_DUPLICATE event and LISTING_HELD status. Both sellers are contacted; the listing with earliest verifiable ownership proof is cleared. No penalty if sellers cooperate with review within 48h.",
|
|
974
|
+
"score": 0.2768407464027405,
|
|
975
|
+
"source": "pentatonic-memory",
|
|
976
|
+
"doc_id": "policy-duplicate-listings"
|
|
977
|
+
},
|
|
978
|
+
{
|
|
979
|
+
"text": "Thing LST-9030: Restored Schwinn Varsity bicycle (1982, red, original components). Created in TES 2026-03-03 by seller @velocipede-jo.",
|
|
980
|
+
"score": 0.41206637024879456,
|
|
981
|
+
"source": "pentatonic-memory",
|
|
982
|
+
"doc_id": "tes-thing-lst-9030"
|
|
983
|
+
}
|
|
984
|
+
],
|
|
985
|
+
"correct": false,
|
|
986
|
+
"score": 0.0,
|
|
987
|
+
"grading_notes": "no expected_substrings set",
|
|
988
|
+
"search_time_ms": 125.81155600491911,
|
|
989
|
+
"generation_time_ms": 0.0,
|
|
990
|
+
"tokens_in": 0,
|
|
991
|
+
"tokens_out": 0,
|
|
992
|
+
"retrieval_tokens": 173,
|
|
993
|
+
"query_tokens": 9,
|
|
994
|
+
"context_tokens": 164,
|
|
995
|
+
"judge_tokens_in": 0,
|
|
996
|
+
"judge_tokens_out": 0,
|
|
997
|
+
"judge_latency_ms": 0.0
|
|
998
|
+
},
|
|
999
|
+
{
|
|
1000
|
+
"task_id": "entity-sellers-with-disputes",
|
|
1001
|
+
"query": "Which sellers in this corpus have disputes on record?",
|
|
1002
|
+
"answer": "Enhanced listing review: triggered when a seller accumulates 3+ spec-discrepancy disputes in a 90-day window. Listings are manually reviewed before going live. Lifted after 30 days of clean activity.\n---\nSeller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.\n---\nSeller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
|
|
1003
|
+
"hits": [
|
|
1004
|
+
{
|
|
1005
|
+
"text": "Enhanced listing review: triggered when a seller accumulates 3+ spec-discrepancy disputes in a 90-day window. Listings are manually reviewed before going live. Lifted after 30 days of clean activity.",
|
|
1006
|
+
"score": 0.42692145705223083,
|
|
1007
|
+
"source": "pentatonic-memory",
|
|
1008
|
+
"doc_id": "policy-seller-enhanced-review"
|
|
1009
|
+
},
|
|
1010
|
+
{
|
|
1011
|
+
"text": "Seller @mariposa: registered 2024-08. Category: vintage furniture and homeware. Listings to date: 47. Sell-through rate: 89%. Average star rating: 4.8/5 (based on 38 reviews). Disputes in last 12 months: 0.",
|
|
1012
|
+
"score": 0.4587053656578064,
|
|
1013
|
+
"source": "pentatonic-memory",
|
|
1014
|
+
"doc_id": "seller-mariposa-profile"
|
|
1015
|
+
},
|
|
1016
|
+
{
|
|
1017
|
+
"text": "Seller @rix-digital: registered 2023-11. Category: refurbished electronics. Listings to date: 312. Sell-through rate: 72%. Average star rating: 4.3/5 (based on 201 reviews). Disputes in last 12 months: 4 (2 refunded, 1 partial, 1 dismissed). Flagged for repeated spec discrepancies 2026-02-24 \u2014 currently on 'enhanced listing review' status.",
|
|
1018
|
+
"score": 0.48887112736701965,
|
|
1019
|
+
"source": "pentatonic-memory",
|
|
1020
|
+
"doc_id": "seller-rix-digital-profile"
|
|
1021
|
+
}
|
|
1022
|
+
],
|
|
1023
|
+
"correct": false,
|
|
1024
|
+
"score": 0.0,
|
|
1025
|
+
"grading_notes": "no positive criteria (negative-only task); forbidden substring(s) present: ['@mariposa']",
|
|
1026
|
+
"search_time_ms": 127.18707200838253,
|
|
1027
|
+
"generation_time_ms": 0.0,
|
|
1028
|
+
"tokens_in": 0,
|
|
1029
|
+
"tokens_out": 0,
|
|
1030
|
+
"retrieval_tokens": 217,
|
|
1031
|
+
"query_tokens": 10,
|
|
1032
|
+
"context_tokens": 207,
|
|
1033
|
+
"judge_tokens_in": 0,
|
|
1034
|
+
"judge_tokens_out": 0,
|
|
1035
|
+
"judge_latency_ms": 0.0
|
|
1036
|
+
}
|
|
1037
|
+
]
|
|
1038
|
+
}
|