@pentatonic-ai/ai-agent-sdk 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +170 -69
- package/bin/__tests__/callback-server.test.js +4 -1
- package/bin/cli.js +41 -164
- package/bin/commands/config.js +251 -0
- package/package.json +2 -1
- package/packages/doctor/__tests__/detect.test.js +2 -6
- package/packages/doctor/src/checks/local-memory.js +164 -196
- package/packages/doctor/src/detect.js +11 -3
- package/packages/memory/src/corpus/adapters.js +104 -0
- package/packages/memory/src/corpus/cli.js +72 -7
- package/packages/memory/src/corpus/index.js +1 -1
- package/packages/memory-engine/.env.example +13 -0
- package/packages/memory-engine/README.md +131 -0
- package/packages/memory-engine/bench/README.md +99 -0
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
- package/packages/memory-engine/compat/Dockerfile +11 -0
- package/packages/memory-engine/compat/server.py +680 -0
- package/packages/memory-engine/docker-compose.yml +243 -0
- package/packages/memory-engine/docs/MIGRATION.md +178 -0
- package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
- package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
- package/packages/memory-engine/engine/README.md +52 -0
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
- package/packages/memory-engine/engine/l6-document-store.py +1018 -0
- package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
- package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
- package/packages/memory-engine/engine/services/l4/server.py +235 -0
- package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
- package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
- package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +129 -0
- package/packages/memory-engine/pme_memory/artifacts.py +95 -0
- package/packages/memory-engine/pme_memory/embed.py +74 -0
- package/packages/memory-engine/pme_memory/health.py +36 -0
- package/packages/memory-engine/pme_memory/hygiene.py +159 -0
- package/packages/memory-engine/pme_memory/indexer.py +200 -0
- package/packages/memory-engine/pme_memory/needs.py +55 -0
- package/packages/memory-engine/pme_memory/provenance.py +80 -0
- package/packages/memory-engine/pme_memory/scoring.py +168 -0
- package/packages/memory-engine/pme_memory/search.py +52 -0
- package/packages/memory-engine/pme_memory/store.py +86 -0
- package/packages/memory-engine/pme_memory/synthesis.py +114 -0
- package/packages/memory-engine/pyproject.toml +65 -0
- package/packages/memory-engine/scripts/kg-extractor.py +557 -0
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
- package/packages/memory-engine/tests/test_api_contract.sh +57 -0
|
@@ -0,0 +1,819 @@
|
|
|
1
|
+
{
|
|
2
|
+
"bench": "chat-recall",
|
|
3
|
+
"stack": "pentatonic-baseline",
|
|
4
|
+
"n_tasks": 16,
|
|
5
|
+
"n_correct": 16,
|
|
6
|
+
"accuracy": 1.0,
|
|
7
|
+
"mean_score": 1.0,
|
|
8
|
+
"p50_search_ms": 1125.783211493399,
|
|
9
|
+
"p95_search_ms": 2792.650751012843,
|
|
10
|
+
"total_tokens_in": 0,
|
|
11
|
+
"total_tokens_out": 0,
|
|
12
|
+
"total_usd": 0.0,
|
|
13
|
+
"by_tag": {
|
|
14
|
+
"factoid": {
|
|
15
|
+
"n": 6,
|
|
16
|
+
"mean_score": 1.0,
|
|
17
|
+
"accuracy": 1.0
|
|
18
|
+
},
|
|
19
|
+
"owner": {
|
|
20
|
+
"n": 4,
|
|
21
|
+
"mean_score": 1.0,
|
|
22
|
+
"accuracy": 1.0
|
|
23
|
+
},
|
|
24
|
+
"temporal": {
|
|
25
|
+
"n": 4,
|
|
26
|
+
"mean_score": 1.0,
|
|
27
|
+
"accuracy": 1.0
|
|
28
|
+
},
|
|
29
|
+
"contradiction": {
|
|
30
|
+
"n": 2,
|
|
31
|
+
"mean_score": 1.0,
|
|
32
|
+
"accuracy": 1.0
|
|
33
|
+
},
|
|
34
|
+
"status": {
|
|
35
|
+
"n": 4,
|
|
36
|
+
"mean_score": 1.0,
|
|
37
|
+
"accuracy": 1.0
|
|
38
|
+
},
|
|
39
|
+
"multi-hop": {
|
|
40
|
+
"n": 4,
|
|
41
|
+
"mean_score": 1.0,
|
|
42
|
+
"accuracy": 1.0
|
|
43
|
+
}
|
|
44
|
+
},
|
|
45
|
+
"extra": {
|
|
46
|
+
"ingest_ms": 64355.045720003545,
|
|
47
|
+
"grading": "substring",
|
|
48
|
+
"limit": 3,
|
|
49
|
+
"tokens": {
|
|
50
|
+
"corpus_tokens": 513,
|
|
51
|
+
"query_tokens": 140,
|
|
52
|
+
"context_tokens": 803,
|
|
53
|
+
"retrieval_tokens": 943,
|
|
54
|
+
"naive_tokens": 8348,
|
|
55
|
+
"saved_tokens": 7405,
|
|
56
|
+
"reduction_pct": 0.8870388116914231,
|
|
57
|
+
"mean_retrieval_tokens_per_task": 58.9375,
|
|
58
|
+
"tokenizer": "cl100k_base",
|
|
59
|
+
"per_task": {
|
|
60
|
+
"who-owns-atlas": {
|
|
61
|
+
"query": 5,
|
|
62
|
+
"context": 36,
|
|
63
|
+
"retrieval": 41,
|
|
64
|
+
"judge_in": 0,
|
|
65
|
+
"judge_out": 0,
|
|
66
|
+
"judge_latency_ms": 0.0
|
|
67
|
+
},
|
|
68
|
+
"who-owns-borealis": {
|
|
69
|
+
"query": 7,
|
|
70
|
+
"context": 46,
|
|
71
|
+
"retrieval": 53,
|
|
72
|
+
"judge_in": 0,
|
|
73
|
+
"judge_out": 0,
|
|
74
|
+
"judge_latency_ms": 0.0
|
|
75
|
+
},
|
|
76
|
+
"who-owns-cirrus": {
|
|
77
|
+
"query": 6,
|
|
78
|
+
"context": 42,
|
|
79
|
+
"retrieval": 48,
|
|
80
|
+
"judge_in": 0,
|
|
81
|
+
"judge_out": 0,
|
|
82
|
+
"judge_latency_ms": 0.0
|
|
83
|
+
},
|
|
84
|
+
"who-owns-dune": {
|
|
85
|
+
"query": 6,
|
|
86
|
+
"context": 41,
|
|
87
|
+
"retrieval": 47,
|
|
88
|
+
"judge_in": 0,
|
|
89
|
+
"judge_out": 0,
|
|
90
|
+
"judge_latency_ms": 0.0
|
|
91
|
+
},
|
|
92
|
+
"current-deadline-atlas": {
|
|
93
|
+
"query": 8,
|
|
94
|
+
"context": 74,
|
|
95
|
+
"retrieval": 82,
|
|
96
|
+
"judge_in": 0,
|
|
97
|
+
"judge_out": 0,
|
|
98
|
+
"judge_latency_ms": 0.0
|
|
99
|
+
},
|
|
100
|
+
"current-deadline-borealis": {
|
|
101
|
+
"query": 10,
|
|
102
|
+
"context": 53,
|
|
103
|
+
"retrieval": 63,
|
|
104
|
+
"judge_in": 0,
|
|
105
|
+
"judge_out": 0,
|
|
106
|
+
"judge_latency_ms": 0.0
|
|
107
|
+
},
|
|
108
|
+
"current-deadline-cirrus": {
|
|
109
|
+
"query": 9,
|
|
110
|
+
"context": 62,
|
|
111
|
+
"retrieval": 71,
|
|
112
|
+
"judge_in": 0,
|
|
113
|
+
"judge_out": 0,
|
|
114
|
+
"judge_latency_ms": 0.0
|
|
115
|
+
},
|
|
116
|
+
"current-deadline-dune": {
|
|
117
|
+
"query": 9,
|
|
118
|
+
"context": 48,
|
|
119
|
+
"retrieval": 57,
|
|
120
|
+
"judge_in": 0,
|
|
121
|
+
"judge_out": 0,
|
|
122
|
+
"judge_latency_ms": 0.0
|
|
123
|
+
},
|
|
124
|
+
"status-atlas": {
|
|
125
|
+
"query": 8,
|
|
126
|
+
"context": 44,
|
|
127
|
+
"retrieval": 52,
|
|
128
|
+
"judge_in": 0,
|
|
129
|
+
"judge_out": 0,
|
|
130
|
+
"judge_latency_ms": 0.0
|
|
131
|
+
},
|
|
132
|
+
"status-borealis": {
|
|
133
|
+
"query": 10,
|
|
134
|
+
"context": 42,
|
|
135
|
+
"retrieval": 52,
|
|
136
|
+
"judge_in": 0,
|
|
137
|
+
"judge_out": 0,
|
|
138
|
+
"judge_latency_ms": 0.0
|
|
139
|
+
},
|
|
140
|
+
"status-cirrus": {
|
|
141
|
+
"query": 9,
|
|
142
|
+
"context": 49,
|
|
143
|
+
"retrieval": 58,
|
|
144
|
+
"judge_in": 0,
|
|
145
|
+
"judge_out": 0,
|
|
146
|
+
"judge_latency_ms": 0.0
|
|
147
|
+
},
|
|
148
|
+
"status-dune": {
|
|
149
|
+
"query": 9,
|
|
150
|
+
"context": 42,
|
|
151
|
+
"retrieval": 51,
|
|
152
|
+
"judge_in": 0,
|
|
153
|
+
"judge_out": 0,
|
|
154
|
+
"judge_latency_ms": 0.0
|
|
155
|
+
},
|
|
156
|
+
"multihop-atlas": {
|
|
157
|
+
"query": 10,
|
|
158
|
+
"context": 56,
|
|
159
|
+
"retrieval": 66,
|
|
160
|
+
"judge_in": 0,
|
|
161
|
+
"judge_out": 0,
|
|
162
|
+
"judge_latency_ms": 0.0
|
|
163
|
+
},
|
|
164
|
+
"multihop-borealis": {
|
|
165
|
+
"query": 12,
|
|
166
|
+
"context": 53,
|
|
167
|
+
"retrieval": 65,
|
|
168
|
+
"judge_in": 0,
|
|
169
|
+
"judge_out": 0,
|
|
170
|
+
"judge_latency_ms": 0.0
|
|
171
|
+
},
|
|
172
|
+
"multihop-cirrus": {
|
|
173
|
+
"query": 11,
|
|
174
|
+
"context": 67,
|
|
175
|
+
"retrieval": 78,
|
|
176
|
+
"judge_in": 0,
|
|
177
|
+
"judge_out": 0,
|
|
178
|
+
"judge_latency_ms": 0.0
|
|
179
|
+
},
|
|
180
|
+
"multihop-dune": {
|
|
181
|
+
"query": 11,
|
|
182
|
+
"context": 48,
|
|
183
|
+
"retrieval": 59,
|
|
184
|
+
"judge_in": 0,
|
|
185
|
+
"judge_out": 0,
|
|
186
|
+
"judge_latency_ms": 0.0
|
|
187
|
+
}
|
|
188
|
+
},
|
|
189
|
+
"judge_tokens_in": 0,
|
|
190
|
+
"judge_tokens_out": 0,
|
|
191
|
+
"judge_calls": 0,
|
|
192
|
+
"judge_mean_latency_ms": 0.0
|
|
193
|
+
},
|
|
194
|
+
"cost_usd": {
|
|
195
|
+
"assumed_completion_tokens_per_task": 100,
|
|
196
|
+
"rates": {
|
|
197
|
+
"input_per_1k": 0.0025,
|
|
198
|
+
"output_per_1k": 0.01,
|
|
199
|
+
"model": "gpt-4o"
|
|
200
|
+
},
|
|
201
|
+
"retrieval_usd_in": 0.0023575,
|
|
202
|
+
"retrieval_usd_out": 0.016,
|
|
203
|
+
"retrieval_usd_total": 0.0183575,
|
|
204
|
+
"naive_usd_total": 0.03687,
|
|
205
|
+
"saved_usd": 0.0185125,
|
|
206
|
+
"saved_usd_per_1k_tasks": 1.1570312500000002
|
|
207
|
+
}
|
|
208
|
+
},
|
|
209
|
+
"task_results": [
|
|
210
|
+
{
|
|
211
|
+
"task_id": "who-owns-atlas",
|
|
212
|
+
"query": "Who owns project Atlas?",
|
|
213
|
+
"answer": "Alice: I'll own project Atlas. Kickoff this week.\n---\nAlice: I'll own project Atlas. Kickoff this week.\n---\nAlice: Atlas status \u2014 on track.",
|
|
214
|
+
"hits": [
|
|
215
|
+
{
|
|
216
|
+
"text": "Alice: I'll own project Atlas. Kickoff this week.",
|
|
217
|
+
"score": 0.7395,
|
|
218
|
+
"source": "pentatonic-baseline",
|
|
219
|
+
"doc_id": "chat-assign-atlas"
|
|
220
|
+
},
|
|
221
|
+
{
|
|
222
|
+
"text": "Alice: I'll own project Atlas. Kickoff this week.",
|
|
223
|
+
"score": 0.6881,
|
|
224
|
+
"source": "pentatonic-baseline:L0_workspace_bm25",
|
|
225
|
+
"doc_id": "chat-assign-atlas"
|
|
226
|
+
},
|
|
227
|
+
{
|
|
228
|
+
"text": "Alice: Atlas status \u2014 on track.",
|
|
229
|
+
"score": 0.62,
|
|
230
|
+
"source": "pentatonic-baseline",
|
|
231
|
+
"doc_id": "status-atlas-m1"
|
|
232
|
+
}
|
|
233
|
+
],
|
|
234
|
+
"correct": true,
|
|
235
|
+
"score": 1.0,
|
|
236
|
+
"grading_notes": "all substrings matched",
|
|
237
|
+
"search_time_ms": 2792.650751012843,
|
|
238
|
+
"generation_time_ms": 0.0,
|
|
239
|
+
"tokens_in": 0,
|
|
240
|
+
"tokens_out": 0,
|
|
241
|
+
"retrieval_tokens": 41,
|
|
242
|
+
"query_tokens": 5,
|
|
243
|
+
"context_tokens": 36,
|
|
244
|
+
"judge_tokens_in": 0,
|
|
245
|
+
"judge_tokens_out": 0,
|
|
246
|
+
"judge_latency_ms": 0.0
|
|
247
|
+
},
|
|
248
|
+
{
|
|
249
|
+
"task_id": "who-owns-borealis",
|
|
250
|
+
"query": "Who owns project Borealis?",
|
|
251
|
+
"answer": "Clara: I'll own project Borealis. Kickoff this week.\n---\nClara: I'll own project Borealis. Kickoff this week.\n---\nClara: Borealis status \u2014 blocked on vendor.",
|
|
252
|
+
"hits": [
|
|
253
|
+
{
|
|
254
|
+
"text": "Clara: I'll own project Borealis. Kickoff this week.",
|
|
255
|
+
"score": 0.7648,
|
|
256
|
+
"source": "pentatonic-baseline",
|
|
257
|
+
"doc_id": "chat-assign-borealis"
|
|
258
|
+
},
|
|
259
|
+
{
|
|
260
|
+
"text": "Clara: I'll own project Borealis. Kickoff this week.",
|
|
261
|
+
"score": 0.6883,
|
|
262
|
+
"source": "pentatonic-baseline:L0_workspace_bm25",
|
|
263
|
+
"doc_id": "chat-assign-borealis"
|
|
264
|
+
},
|
|
265
|
+
{
|
|
266
|
+
"text": "Clara: Borealis status \u2014 blocked on vendor.",
|
|
267
|
+
"score": 0.5686,
|
|
268
|
+
"source": "pentatonic-baseline",
|
|
269
|
+
"doc_id": "status-borealis-m2"
|
|
270
|
+
}
|
|
271
|
+
],
|
|
272
|
+
"correct": true,
|
|
273
|
+
"score": 1.0,
|
|
274
|
+
"grading_notes": "all substrings matched",
|
|
275
|
+
"search_time_ms": 1167.0277139928658,
|
|
276
|
+
"generation_time_ms": 0.0,
|
|
277
|
+
"tokens_in": 0,
|
|
278
|
+
"tokens_out": 0,
|
|
279
|
+
"retrieval_tokens": 53,
|
|
280
|
+
"query_tokens": 7,
|
|
281
|
+
"context_tokens": 46,
|
|
282
|
+
"judge_tokens_in": 0,
|
|
283
|
+
"judge_tokens_out": 0,
|
|
284
|
+
"judge_latency_ms": 0.0
|
|
285
|
+
},
|
|
286
|
+
{
|
|
287
|
+
"task_id": "who-owns-cirrus",
|
|
288
|
+
"query": "Who owns project Cirrus?",
|
|
289
|
+
"answer": "Diego: I'll own project Cirrus. Kickoff this week.\n---\nDiego: I'll own project Cirrus. Kickoff this week.\n---\nDiego: Cirrus status \u2014 scoping.",
|
|
290
|
+
"hits": [
|
|
291
|
+
{
|
|
292
|
+
"text": "Diego: I'll own project Cirrus. Kickoff this week.",
|
|
293
|
+
"score": 0.7758,
|
|
294
|
+
"source": "pentatonic-baseline",
|
|
295
|
+
"doc_id": "chat-assign-cirrus"
|
|
296
|
+
},
|
|
297
|
+
{
|
|
298
|
+
"text": "Diego: I'll own project Cirrus. Kickoff this week.",
|
|
299
|
+
"score": 0.6908,
|
|
300
|
+
"source": "pentatonic-baseline:L0_workspace_bm25",
|
|
301
|
+
"doc_id": "chat-assign-cirrus"
|
|
302
|
+
},
|
|
303
|
+
{
|
|
304
|
+
"text": "Diego: Cirrus status \u2014 scoping.",
|
|
305
|
+
"score": 0.6237,
|
|
306
|
+
"source": "pentatonic-baseline",
|
|
307
|
+
"doc_id": "status-cirrus-m3"
|
|
308
|
+
}
|
|
309
|
+
],
|
|
310
|
+
"correct": true,
|
|
311
|
+
"score": 1.0,
|
|
312
|
+
"grading_notes": "all substrings matched",
|
|
313
|
+
"search_time_ms": 1094.0878769906703,
|
|
314
|
+
"generation_time_ms": 0.0,
|
|
315
|
+
"tokens_in": 0,
|
|
316
|
+
"tokens_out": 0,
|
|
317
|
+
"retrieval_tokens": 48,
|
|
318
|
+
"query_tokens": 6,
|
|
319
|
+
"context_tokens": 42,
|
|
320
|
+
"judge_tokens_in": 0,
|
|
321
|
+
"judge_tokens_out": 0,
|
|
322
|
+
"judge_latency_ms": 0.0
|
|
323
|
+
},
|
|
324
|
+
{
|
|
325
|
+
"task_id": "who-owns-dune",
|
|
326
|
+
"query": "Who owns project Dune?",
|
|
327
|
+
"answer": "Farid: I'll own project Dune. Kickoff this week.\n---\nFarid: I'll own project Dune. Kickoff this week.\n---\nFarid: Dune status \u2014 launched.",
|
|
328
|
+
"hits": [
|
|
329
|
+
{
|
|
330
|
+
"text": "Farid: I'll own project Dune. Kickoff this week.",
|
|
331
|
+
"score": 0.7892,
|
|
332
|
+
"source": "pentatonic-baseline",
|
|
333
|
+
"doc_id": "chat-assign-dune"
|
|
334
|
+
},
|
|
335
|
+
{
|
|
336
|
+
"text": "Farid: I'll own project Dune. Kickoff this week.",
|
|
337
|
+
"score": 0.7043,
|
|
338
|
+
"source": "pentatonic-baseline:L0_workspace_bm25",
|
|
339
|
+
"doc_id": "chat-assign-dune"
|
|
340
|
+
},
|
|
341
|
+
{
|
|
342
|
+
"text": "Farid: Dune status \u2014 launched.",
|
|
343
|
+
"score": 0.5987,
|
|
344
|
+
"source": "pentatonic-baseline",
|
|
345
|
+
"doc_id": "status-dune-m4"
|
|
346
|
+
}
|
|
347
|
+
],
|
|
348
|
+
"correct": true,
|
|
349
|
+
"score": 1.0,
|
|
350
|
+
"grading_notes": "all substrings matched",
|
|
351
|
+
"search_time_ms": 1064.0079930017237,
|
|
352
|
+
"generation_time_ms": 0.0,
|
|
353
|
+
"tokens_in": 0,
|
|
354
|
+
"tokens_out": 0,
|
|
355
|
+
"retrieval_tokens": 47,
|
|
356
|
+
"query_tokens": 6,
|
|
357
|
+
"context_tokens": 41,
|
|
358
|
+
"judge_tokens_in": 0,
|
|
359
|
+
"judge_tokens_out": 0,
|
|
360
|
+
"judge_latency_ms": 0.0
|
|
361
|
+
},
|
|
362
|
+
{
|
|
363
|
+
"task_id": "current-deadline-atlas",
|
|
364
|
+
"query": "What is the current deadline for Atlas?",
|
|
365
|
+
"answer": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.\n---\nAlice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.\n---\nAlice: Target delivery for Atlas is 2026-03-14. Please pencil it in.",
|
|
366
|
+
"hits": [
|
|
367
|
+
{
|
|
368
|
+
"text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
|
|
369
|
+
"score": 0.7384,
|
|
370
|
+
"source": "pentatonic-baseline",
|
|
371
|
+
"doc_id": "chat-deadline2-atlas"
|
|
372
|
+
},
|
|
373
|
+
{
|
|
374
|
+
"text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
|
|
375
|
+
"score": 0.6794,
|
|
376
|
+
"source": "pentatonic-baseline:L0_workspace_bm25",
|
|
377
|
+
"doc_id": "chat-deadline2-atlas"
|
|
378
|
+
},
|
|
379
|
+
{
|
|
380
|
+
"text": "Alice: Target delivery for Atlas is 2026-03-14. Please pencil it in.",
|
|
381
|
+
"score": 0.6703,
|
|
382
|
+
"source": "pentatonic-baseline",
|
|
383
|
+
"doc_id": "chat-deadline1-atlas"
|
|
384
|
+
}
|
|
385
|
+
],
|
|
386
|
+
"correct": true,
|
|
387
|
+
"score": 1.0,
|
|
388
|
+
"grading_notes": "all substrings matched",
|
|
389
|
+
"search_time_ms": 1233.0802179931197,
|
|
390
|
+
"generation_time_ms": 0.0,
|
|
391
|
+
"tokens_in": 0,
|
|
392
|
+
"tokens_out": 0,
|
|
393
|
+
"retrieval_tokens": 82,
|
|
394
|
+
"query_tokens": 8,
|
|
395
|
+
"context_tokens": 74,
|
|
396
|
+
"judge_tokens_in": 0,
|
|
397
|
+
"judge_tokens_out": 0,
|
|
398
|
+
"judge_latency_ms": 0.0
|
|
399
|
+
},
|
|
400
|
+
{
|
|
401
|
+
"task_id": "current-deadline-borealis",
|
|
402
|
+
"query": "What is the current deadline for Borealis?",
|
|
403
|
+
"answer": "Clara: Target delivery for Borealis is 2026-02-28. Please pencil it in.\n---\nClara: Borealis status \u2014 blocked on vendor.\n---\nClara: I'll own project Borealis. Kickoff this week.",
|
|
404
|
+
"hits": [
|
|
405
|
+
{
|
|
406
|
+
"text": "Clara: Target delivery for Borealis is 2026-02-28. Please pencil it in.",
|
|
407
|
+
"score": 0.6554,
|
|
408
|
+
"source": "pentatonic-baseline",
|
|
409
|
+
"doc_id": "chat-deadline1-borealis"
|
|
410
|
+
},
|
|
411
|
+
{
|
|
412
|
+
"text": "Clara: Borealis status \u2014 blocked on vendor.",
|
|
413
|
+
"score": 0.5875,
|
|
414
|
+
"source": "pentatonic-baseline",
|
|
415
|
+
"doc_id": "status-borealis-m2"
|
|
416
|
+
},
|
|
417
|
+
{
|
|
418
|
+
"text": "Clara: I'll own project Borealis. Kickoff this week.",
|
|
419
|
+
"score": 0.5874,
|
|
420
|
+
"source": "pentatonic-baseline",
|
|
421
|
+
"doc_id": "chat-assign-borealis"
|
|
422
|
+
}
|
|
423
|
+
],
|
|
424
|
+
"correct": true,
|
|
425
|
+
"score": 1.0,
|
|
426
|
+
"grading_notes": "all substrings matched",
|
|
427
|
+
"search_time_ms": 1080.975874006981,
|
|
428
|
+
"generation_time_ms": 0.0,
|
|
429
|
+
"tokens_in": 0,
|
|
430
|
+
"tokens_out": 0,
|
|
431
|
+
"retrieval_tokens": 63,
|
|
432
|
+
"query_tokens": 10,
|
|
433
|
+
"context_tokens": 53,
|
|
434
|
+
"judge_tokens_in": 0,
|
|
435
|
+
"judge_tokens_out": 0,
|
|
436
|
+
"judge_latency_ms": 0.0
|
|
437
|
+
},
|
|
438
|
+
{
|
|
439
|
+
"task_id": "current-deadline-cirrus",
|
|
440
|
+
"query": "What is the current deadline for Cirrus?",
|
|
441
|
+
"answer": "Diego: Update \u2014 Cirrus deadline has been moved to 2026-04-08. Latest one supersedes earlier guidance.\n---\nDiego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.\n---\nDiego: Cirrus status \u2014 scoping.",
|
|
442
|
+
"hits": [
|
|
443
|
+
{
|
|
444
|
+
"text": "Diego: Update \u2014 Cirrus deadline has been moved to 2026-04-08. Latest one supersedes earlier guidance.",
|
|
445
|
+
"score": 0.7547,
|
|
446
|
+
"source": "pentatonic-baseline",
|
|
447
|
+
"doc_id": "chat-deadline2-cirrus"
|
|
448
|
+
},
|
|
449
|
+
{
|
|
450
|
+
"text": "Diego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.",
|
|
451
|
+
"score": 0.727,
|
|
452
|
+
"source": "pentatonic-baseline",
|
|
453
|
+
"doc_id": "chat-deadline1-cirrus"
|
|
454
|
+
},
|
|
455
|
+
{
|
|
456
|
+
"text": "Diego: Cirrus status \u2014 scoping.",
|
|
457
|
+
"score": 0.6648,
|
|
458
|
+
"source": "pentatonic-baseline",
|
|
459
|
+
"doc_id": "status-cirrus-m3"
|
|
460
|
+
}
|
|
461
|
+
],
|
|
462
|
+
"correct": true,
|
|
463
|
+
"score": 1.0,
|
|
464
|
+
"grading_notes": "all substrings matched",
|
|
465
|
+
"search_time_ms": 1158.1688209844287,
|
|
466
|
+
"generation_time_ms": 0.0,
|
|
467
|
+
"tokens_in": 0,
|
|
468
|
+
"tokens_out": 0,
|
|
469
|
+
"retrieval_tokens": 71,
|
|
470
|
+
"query_tokens": 9,
|
|
471
|
+
"context_tokens": 62,
|
|
472
|
+
"judge_tokens_in": 0,
|
|
473
|
+
"judge_tokens_out": 0,
|
|
474
|
+
"judge_latency_ms": 0.0
|
|
475
|
+
},
|
|
476
|
+
{
|
|
477
|
+
"task_id": "current-deadline-dune",
|
|
478
|
+
"query": "What is the current deadline for Dune?",
|
|
479
|
+
"answer": "Farid: Target delivery for Dune is 2026-05-20. Please pencil it in.\n---\nFarid: I'll own project Dune. Kickoff this week.\n---\nFarid: Dune status \u2014 launched.",
|
|
480
|
+
"hits": [
|
|
481
|
+
{
|
|
482
|
+
"text": "Farid: Target delivery for Dune is 2026-05-20. Please pencil it in.",
|
|
483
|
+
"score": 0.7046,
|
|
484
|
+
"source": "pentatonic-baseline",
|
|
485
|
+
"doc_id": "chat-deadline1-dune"
|
|
486
|
+
},
|
|
487
|
+
{
|
|
488
|
+
"text": "Farid: I'll own project Dune. Kickoff this week.",
|
|
489
|
+
"score": 0.5789,
|
|
490
|
+
"source": "pentatonic-baseline",
|
|
491
|
+
"doc_id": "chat-assign-dune"
|
|
492
|
+
},
|
|
493
|
+
{
|
|
494
|
+
"text": "Farid: Dune status \u2014 launched.",
|
|
495
|
+
"score": 0.5647,
|
|
496
|
+
"source": "pentatonic-baseline",
|
|
497
|
+
"doc_id": "status-dune-m4"
|
|
498
|
+
}
|
|
499
|
+
],
|
|
500
|
+
"correct": true,
|
|
501
|
+
"score": 1.0,
|
|
502
|
+
"grading_notes": "all substrings matched",
|
|
503
|
+
"search_time_ms": 1098.5507189761847,
|
|
504
|
+
"generation_time_ms": 0.0,
|
|
505
|
+
"tokens_in": 0,
|
|
506
|
+
"tokens_out": 0,
|
|
507
|
+
"retrieval_tokens": 57,
|
|
508
|
+
"query_tokens": 9,
|
|
509
|
+
"context_tokens": 48,
|
|
510
|
+
"judge_tokens_in": 0,
|
|
511
|
+
"judge_tokens_out": 0,
|
|
512
|
+
"judge_latency_ms": 0.0
|
|
513
|
+
},
|
|
514
|
+
{
|
|
515
|
+
"task_id": "status-atlas",
|
|
516
|
+
"query": "What's the latest status of Atlas?",
|
|
517
|
+
"answer": "Alice: Atlas status \u2014 on track.\n---\nAlice: Atlas status \u2014 on track.\n---\nAlice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
|
|
518
|
+
"hits": [
|
|
519
|
+
{
|
|
520
|
+
"text": "Alice: Atlas status \u2014 on track.",
|
|
521
|
+
"score": 0.7563,
|
|
522
|
+
"source": "pentatonic-baseline",
|
|
523
|
+
"doc_id": "status-atlas-m1"
|
|
524
|
+
},
|
|
525
|
+
{
|
|
526
|
+
"text": "Alice: Atlas status \u2014 on track.",
|
|
527
|
+
"score": 0.7149,
|
|
528
|
+
"source": "pentatonic-baseline:L0_workspace_bm25",
|
|
529
|
+
"doc_id": "status-atlas-m1"
|
|
530
|
+
},
|
|
531
|
+
{
|
|
532
|
+
"text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
|
|
533
|
+
"score": 0.6697,
|
|
534
|
+
"source": "pentatonic-baseline",
|
|
535
|
+
"doc_id": "chat-deadline2-atlas"
|
|
536
|
+
}
|
|
537
|
+
],
|
|
538
|
+
"correct": true,
|
|
539
|
+
"score": 1.0,
|
|
540
|
+
"grading_notes": "all substrings matched",
|
|
541
|
+
"search_time_ms": 1126.3183399860281,
|
|
542
|
+
"generation_time_ms": 0.0,
|
|
543
|
+
"tokens_in": 0,
|
|
544
|
+
"tokens_out": 0,
|
|
545
|
+
"retrieval_tokens": 52,
|
|
546
|
+
"query_tokens": 8,
|
|
547
|
+
"context_tokens": 44,
|
|
548
|
+
"judge_tokens_in": 0,
|
|
549
|
+
"judge_tokens_out": 0,
|
|
550
|
+
"judge_latency_ms": 0.0
|
|
551
|
+
},
|
|
552
|
+
{
|
|
553
|
+
"task_id": "status-borealis",
|
|
554
|
+
"query": "What's the latest status of Borealis?",
|
|
555
|
+
"answer": "Clara: Borealis status \u2014 blocked on vendor.\n---\nClara: Borealis status \u2014 blocked on vendor.\n---\nClara: I'll own project Borealis. Kickoff this week.",
|
|
556
|
+
"hits": [
|
|
557
|
+
{
|
|
558
|
+
"text": "Clara: Borealis status \u2014 blocked on vendor.",
|
|
559
|
+
"score": 0.6973,
|
|
560
|
+
"source": "pentatonic-baseline",
|
|
561
|
+
"doc_id": "status-borealis-m2"
|
|
562
|
+
},
|
|
563
|
+
{
|
|
564
|
+
"text": "Clara: Borealis status \u2014 blocked on vendor.",
|
|
565
|
+
"score": 0.6638,
|
|
566
|
+
"source": "pentatonic-baseline:L0_workspace_bm25",
|
|
567
|
+
"doc_id": "status-borealis-m2"
|
|
568
|
+
},
|
|
569
|
+
{
|
|
570
|
+
"text": "Clara: I'll own project Borealis. Kickoff this week.",
|
|
571
|
+
"score": 0.6623,
|
|
572
|
+
"source": "pentatonic-baseline",
|
|
573
|
+
"doc_id": "chat-assign-borealis"
|
|
574
|
+
}
|
|
575
|
+
],
|
|
576
|
+
"correct": true,
|
|
577
|
+
"score": 1.0,
|
|
578
|
+
"grading_notes": "all substrings matched",
|
|
579
|
+
"search_time_ms": 1205.3974389855284,
|
|
580
|
+
"generation_time_ms": 0.0,
|
|
581
|
+
"tokens_in": 0,
|
|
582
|
+
"tokens_out": 0,
|
|
583
|
+
"retrieval_tokens": 52,
|
|
584
|
+
"query_tokens": 10,
|
|
585
|
+
"context_tokens": 42,
|
|
586
|
+
"judge_tokens_in": 0,
|
|
587
|
+
"judge_tokens_out": 0,
|
|
588
|
+
"judge_latency_ms": 0.0
|
|
589
|
+
},
|
|
590
|
+
{
|
|
591
|
+
"task_id": "status-cirrus",
|
|
592
|
+
"query": "What's the latest status of Cirrus?",
|
|
593
|
+
"answer": "Diego: Cirrus status \u2014 scoping.\n---\nDiego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.\n---\nDiego: I'll own project Cirrus. Kickoff this week.",
|
|
594
|
+
"hits": [
|
|
595
|
+
{
|
|
596
|
+
"text": "Diego: Cirrus status \u2014 scoping.",
|
|
597
|
+
"score": 0.7422,
|
|
598
|
+
"source": "pentatonic-baseline",
|
|
599
|
+
"doc_id": "status-cirrus-m3"
|
|
600
|
+
},
|
|
601
|
+
{
|
|
602
|
+
"text": "Diego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.",
|
|
603
|
+
"score": 0.7086,
|
|
604
|
+
"source": "pentatonic-baseline",
|
|
605
|
+
"doc_id": "chat-deadline1-cirrus"
|
|
606
|
+
},
|
|
607
|
+
{
|
|
608
|
+
"text": "Diego: I'll own project Cirrus. Kickoff this week.",
|
|
609
|
+
"score": 0.6911,
|
|
610
|
+
"source": "pentatonic-baseline",
|
|
611
|
+
"doc_id": "chat-assign-cirrus"
|
|
612
|
+
}
|
|
613
|
+
],
|
|
614
|
+
"correct": true,
|
|
615
|
+
"score": 1.0,
|
|
616
|
+
"grading_notes": "all substrings matched",
|
|
617
|
+
"search_time_ms": 1125.2480830007698,
|
|
618
|
+
"generation_time_ms": 0.0,
|
|
619
|
+
"tokens_in": 0,
|
|
620
|
+
"tokens_out": 0,
|
|
621
|
+
"retrieval_tokens": 58,
|
|
622
|
+
"query_tokens": 9,
|
|
623
|
+
"context_tokens": 49,
|
|
624
|
+
"judge_tokens_in": 0,
|
|
625
|
+
"judge_tokens_out": 0,
|
|
626
|
+
"judge_latency_ms": 0.0
|
|
627
|
+
},
|
|
628
|
+
{
|
|
629
|
+
"task_id": "status-dune",
|
|
630
|
+
"query": "What's the latest status of Dune?",
|
|
631
|
+
"answer": "Farid: Dune status \u2014 launched.\n---\nFarid: Dune status \u2014 launched.\n---\nFarid: Target delivery for Dune is 2026-05-20. Please pencil it in.",
|
|
632
|
+
"hits": [
|
|
633
|
+
{
|
|
634
|
+
"text": "Farid: Dune status \u2014 launched.",
|
|
635
|
+
"score": 0.6785,
|
|
636
|
+
"source": "pentatonic-baseline",
|
|
637
|
+
"doc_id": "status-dune-m4"
|
|
638
|
+
},
|
|
639
|
+
{
|
|
640
|
+
"text": "Farid: Dune status \u2014 launched.",
|
|
641
|
+
"score": 0.6539,
|
|
642
|
+
"source": "pentatonic-baseline:L0_workspace_bm25",
|
|
643
|
+
"doc_id": "status-dune-m4"
|
|
644
|
+
},
|
|
645
|
+
{
|
|
646
|
+
"text": "Farid: Target delivery for Dune is 2026-05-20. Please pencil it in.",
|
|
647
|
+
"score": 0.6538,
|
|
648
|
+
"source": "pentatonic-baseline",
|
|
649
|
+
"doc_id": "chat-deadline1-dune"
|
|
650
|
+
}
|
|
651
|
+
],
|
|
652
|
+
"correct": true,
|
|
653
|
+
"score": 1.0,
|
|
654
|
+
"grading_notes": "all substrings matched",
|
|
655
|
+
"search_time_ms": 1171.0565309913363,
|
|
656
|
+
"generation_time_ms": 0.0,
|
|
657
|
+
"tokens_in": 0,
|
|
658
|
+
"tokens_out": 0,
|
|
659
|
+
"retrieval_tokens": 51,
|
|
660
|
+
"query_tokens": 9,
|
|
661
|
+
"context_tokens": 42,
|
|
662
|
+
"judge_tokens_in": 0,
|
|
663
|
+
"judge_tokens_out": 0,
|
|
664
|
+
"judge_latency_ms": 0.0
|
|
665
|
+
},
|
|
666
|
+
{
|
|
667
|
+
"task_id": "multihop-atlas",
|
|
668
|
+
"query": "Who owns Atlas and what is its current deadline?",
|
|
669
|
+
"answer": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.\n---\nAlice: Atlas status \u2014 on track.\n---\nAlice: Target delivery for Atlas is 2026-03-14. Please pencil it in.",
|
|
670
|
+
"hits": [
|
|
671
|
+
{
|
|
672
|
+
"text": "Alice: Update \u2014 Atlas deadline has been moved to 2026-03-17. Latest one supersedes earlier guidance.",
|
|
673
|
+
"score": 0.6769,
|
|
674
|
+
"source": "pentatonic-baseline",
|
|
675
|
+
"doc_id": "chat-deadline2-atlas"
|
|
676
|
+
},
|
|
677
|
+
{
|
|
678
|
+
"text": "Alice: Atlas status \u2014 on track.",
|
|
679
|
+
"score": 0.6421,
|
|
680
|
+
"source": "pentatonic-baseline",
|
|
681
|
+
"doc_id": "status-atlas-m1"
|
|
682
|
+
},
|
|
683
|
+
{
|
|
684
|
+
"text": "Alice: Target delivery for Atlas is 2026-03-14. Please pencil it in.",
|
|
685
|
+
"score": 0.6357,
|
|
686
|
+
"source": "pentatonic-baseline",
|
|
687
|
+
"doc_id": "chat-deadline1-atlas"
|
|
688
|
+
}
|
|
689
|
+
],
|
|
690
|
+
"correct": true,
|
|
691
|
+
"score": 1.0,
|
|
692
|
+
"grading_notes": "all substrings matched",
|
|
693
|
+
"search_time_ms": 1122.7682839962654,
|
|
694
|
+
"generation_time_ms": 0.0,
|
|
695
|
+
"tokens_in": 0,
|
|
696
|
+
"tokens_out": 0,
|
|
697
|
+
"retrieval_tokens": 66,
|
|
698
|
+
"query_tokens": 10,
|
|
699
|
+
"context_tokens": 56,
|
|
700
|
+
"judge_tokens_in": 0,
|
|
701
|
+
"judge_tokens_out": 0,
|
|
702
|
+
"judge_latency_ms": 0.0
|
|
703
|
+
},
|
|
704
|
+
{
|
|
705
|
+
"task_id": "multihop-borealis",
|
|
706
|
+
"query": "Who owns Borealis and what is its current deadline?",
|
|
707
|
+
"answer": "Clara: I'll own project Borealis. Kickoff this week.\n---\nClara: Target delivery for Borealis is 2026-02-28. Please pencil it in.\n---\nClara: Borealis status \u2014 blocked on vendor.",
|
|
708
|
+
"hits": [
|
|
709
|
+
{
|
|
710
|
+
"text": "Clara: I'll own project Borealis. Kickoff this week.",
|
|
711
|
+
"score": 0.6402,
|
|
712
|
+
"source": "pentatonic-baseline",
|
|
713
|
+
"doc_id": "chat-assign-borealis"
|
|
714
|
+
},
|
|
715
|
+
{
|
|
716
|
+
"text": "Clara: Target delivery for Borealis is 2026-02-28. Please pencil it in.",
|
|
717
|
+
"score": 0.6212,
|
|
718
|
+
"source": "pentatonic-baseline",
|
|
719
|
+
"doc_id": "chat-deadline1-borealis"
|
|
720
|
+
},
|
|
721
|
+
{
|
|
722
|
+
"text": "Clara: Borealis status \u2014 blocked on vendor.",
|
|
723
|
+
"score": 0.5911,
|
|
724
|
+
"source": "pentatonic-baseline",
|
|
725
|
+
"doc_id": "status-borealis-m2"
|
|
726
|
+
}
|
|
727
|
+
],
|
|
728
|
+
"correct": true,
|
|
729
|
+
"score": 1.0,
|
|
730
|
+
"grading_notes": "all substrings matched",
|
|
731
|
+
"search_time_ms": 1122.615259984741,
|
|
732
|
+
"generation_time_ms": 0.0,
|
|
733
|
+
"tokens_in": 0,
|
|
734
|
+
"tokens_out": 0,
|
|
735
|
+
"retrieval_tokens": 65,
|
|
736
|
+
"query_tokens": 12,
|
|
737
|
+
"context_tokens": 53,
|
|
738
|
+
"judge_tokens_in": 0,
|
|
739
|
+
"judge_tokens_out": 0,
|
|
740
|
+
"judge_latency_ms": 0.0
|
|
741
|
+
},
|
|
742
|
+
{
|
|
743
|
+
"task_id": "multihop-cirrus",
|
|
744
|
+
"query": "Who owns Cirrus and what is its current deadline?",
|
|
745
|
+
"answer": "Diego: Update \u2014 Cirrus deadline has been moved to 2026-04-08. Latest one supersedes earlier guidance.\n---\nDiego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.\n---\nDiego: I'll own project Cirrus. Kickoff this week.",
|
|
746
|
+
"hits": [
|
|
747
|
+
{
|
|
748
|
+
"text": "Diego: Update \u2014 Cirrus deadline has been moved to 2026-04-08. Latest one supersedes earlier guidance.",
|
|
749
|
+
"score": 0.6848,
|
|
750
|
+
"source": "pentatonic-baseline",
|
|
751
|
+
"doc_id": "chat-deadline2-cirrus"
|
|
752
|
+
},
|
|
753
|
+
{
|
|
754
|
+
"text": "Diego: Target delivery for Cirrus is 2026-04-01. Please pencil it in.",
|
|
755
|
+
"score": 0.6752,
|
|
756
|
+
"source": "pentatonic-baseline",
|
|
757
|
+
"doc_id": "chat-deadline1-cirrus"
|
|
758
|
+
},
|
|
759
|
+
{
|
|
760
|
+
"text": "Diego: I'll own project Cirrus. Kickoff this week.",
|
|
761
|
+
"score": 0.6549,
|
|
762
|
+
"source": "pentatonic-baseline",
|
|
763
|
+
"doc_id": "chat-assign-cirrus"
|
|
764
|
+
}
|
|
765
|
+
],
|
|
766
|
+
"correct": true,
|
|
767
|
+
"score": 1.0,
|
|
768
|
+
"grading_notes": "all substrings matched",
|
|
769
|
+
"search_time_ms": 1120.0645309872925,
|
|
770
|
+
"generation_time_ms": 0.0,
|
|
771
|
+
"tokens_in": 0,
|
|
772
|
+
"tokens_out": 0,
|
|
773
|
+
"retrieval_tokens": 78,
|
|
774
|
+
"query_tokens": 11,
|
|
775
|
+
"context_tokens": 67,
|
|
776
|
+
"judge_tokens_in": 0,
|
|
777
|
+
"judge_tokens_out": 0,
|
|
778
|
+
"judge_latency_ms": 0.0
|
|
779
|
+
},
|
|
780
|
+
{
|
|
781
|
+
"task_id": "multihop-dune",
|
|
782
|
+
"query": "Who owns Dune and what is its current deadline?",
|
|
783
|
+
"answer": "Farid: I'll own project Dune. Kickoff this week.\n---\nFarid: Target delivery for Dune is 2026-05-20. Please pencil it in.\n---\nFarid: Dune status \u2014 launched.",
|
|
784
|
+
"hits": [
|
|
785
|
+
{
|
|
786
|
+
"text": "Farid: I'll own project Dune. Kickoff this week.",
|
|
787
|
+
"score": 0.6696,
|
|
788
|
+
"source": "pentatonic-baseline",
|
|
789
|
+
"doc_id": "chat-assign-dune"
|
|
790
|
+
},
|
|
791
|
+
{
|
|
792
|
+
"text": "Farid: Target delivery for Dune is 2026-05-20. Please pencil it in.",
|
|
793
|
+
"score": 0.6538,
|
|
794
|
+
"source": "pentatonic-baseline",
|
|
795
|
+
"doc_id": "chat-deadline1-dune"
|
|
796
|
+
},
|
|
797
|
+
{
|
|
798
|
+
"text": "Farid: Dune status \u2014 launched.",
|
|
799
|
+
"score": 0.6101,
|
|
800
|
+
"source": "pentatonic-baseline",
|
|
801
|
+
"doc_id": "status-dune-m4"
|
|
802
|
+
}
|
|
803
|
+
],
|
|
804
|
+
"correct": true,
|
|
805
|
+
"score": 1.0,
|
|
806
|
+
"grading_notes": "all substrings matched",
|
|
807
|
+
"search_time_ms": 1201.8343300151173,
|
|
808
|
+
"generation_time_ms": 0.0,
|
|
809
|
+
"tokens_in": 0,
|
|
810
|
+
"tokens_out": 0,
|
|
811
|
+
"retrieval_tokens": 59,
|
|
812
|
+
"query_tokens": 11,
|
|
813
|
+
"context_tokens": 48,
|
|
814
|
+
"judge_tokens_in": 0,
|
|
815
|
+
"judge_tokens_out": 0,
|
|
816
|
+
"judge_latency_ms": 0.0
|
|
817
|
+
}
|
|
818
|
+
]
|
|
819
|
+
}
|