@lov3kaizen/agentsea-evaluate 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/dist/annotation/index.d.mts +3 -0
- package/dist/annotation/index.d.ts +3 -0
- package/dist/annotation/index.js +630 -0
- package/dist/annotation/index.mjs +22 -0
- package/dist/chunk-5JRYKRSE.mjs +2791 -0
- package/dist/chunk-EUXXIZK3.mjs +676 -0
- package/dist/chunk-NBMUSATK.mjs +596 -0
- package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
- package/dist/chunk-TUMNJN2S.mjs +416 -0
- package/dist/continuous/index.d.mts +2 -0
- package/dist/continuous/index.d.ts +2 -0
- package/dist/continuous/index.js +707 -0
- package/dist/continuous/index.mjs +16 -0
- package/dist/datasets/index.d.mts +1 -0
- package/dist/datasets/index.d.ts +1 -0
- package/dist/datasets/index.js +456 -0
- package/dist/datasets/index.mjs +14 -0
- package/dist/evaluation/index.d.mts +1 -0
- package/dist/evaluation/index.d.ts +1 -0
- package/dist/evaluation/index.js +2853 -0
- package/dist/evaluation/index.mjs +78 -0
- package/dist/feedback/index.d.mts +2 -0
- package/dist/feedback/index.d.ts +2 -0
- package/dist/feedback/index.js +1158 -0
- package/dist/feedback/index.mjs +40 -0
- package/dist/index-6Pbiq7ny.d.mts +234 -0
- package/dist/index-6Pbiq7ny.d.ts +234 -0
- package/dist/index-BNTycFEA.d.mts +479 -0
- package/dist/index-BNTycFEA.d.ts +479 -0
- package/dist/index-CTYCfWfH.d.mts +543 -0
- package/dist/index-CTYCfWfH.d.ts +543 -0
- package/dist/index-Cq5LwG_3.d.mts +322 -0
- package/dist/index-Cq5LwG_3.d.ts +322 -0
- package/dist/index-bPghFsfP.d.mts +315 -0
- package/dist/index-bPghFsfP.d.ts +315 -0
- package/dist/index.d.mts +81 -0
- package/dist/index.d.ts +81 -0
- package/dist/index.js +5962 -0
- package/dist/index.mjs +429 -0
- package/package.json +102 -0
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
// src/datasets/PreferenceDatasetBuilder.ts
|
|
2
|
+
import { nanoid } from "nanoid";
|
|
3
|
+
/**
 * In-memory collection of preference pairs (prompt / chosen / rejected).
 *
 * Every transforming method (`filter`, `sample`, `split`, `shuffle`) returns a
 * new dataset and leaves the receiver's `pairs` array untouched. Pair objects
 * themselves are shared between datasets, not cloned.
 */
var PreferenceDataset = class _PreferenceDataset {
  type = "preference";
  pairs;
  _stats;

  /**
   * @param {Array<object>} pairs - Preference pairs. Each pair is read as
   *   `{ prompt, chosen, rejected, chosenModel?, rejectedModel? }` by
   *   `calculateStats`.
   */
  constructor(pairs) {
    this.pairs = pairs;
  }

  /** @returns {number} Number of pairs in the dataset. */
  get size() {
    return this.pairs.length;
  }

  /**
   * Aggregate statistics, computed on first access and cached afterwards.
   * @returns {object} See `calculateStats`.
   */
  get stats() {
    if (!this._stats) {
      this._stats = this.calculateStats();
    }
    return this._stats;
  }

  /** @returns {Array<object>} A shallow copy of the pairs array. */
  getPairs() {
    return [...this.pairs];
  }

  /**
   * @param {(pair: object, index: number, all: object[]) => boolean} predicate
   * @returns {PreferenceDataset} New dataset holding only the matching pairs.
   */
  filter(predicate) {
    return new _PreferenceDataset(this.pairs.filter(predicate));
  }

  /**
   * Draw a uniformly random subset without replacement.
   *
   * @param {number} [count] - Number of pairs to draw. Values at or above the
   *   dataset size return every pair in original order; negative or NaN values
   *   return an empty dataset; fractional values are truncated. When omitted,
   *   every pair is returned in shuffled order.
   * @returns {PreferenceDataset}
   */
  sample(count) {
    if (count >= this.pairs.length) {
      return new _PreferenceDataset([...this.pairs]);
    }
    // A negative end index would make slice() count from the end of the array
    // and return "all but the last k" pairs, so clamp to zero. `|| 0` maps NaN
    // to zero as well.
    const wanted = count === void 0 ? this.pairs.length : Math.max(0, Math.trunc(count)) || 0;
    const shuffled = this.#shuffledCopy(Math.random);
    return new _PreferenceDataset(shuffled.slice(0, wanted));
  }

  /**
   * Shuffle the pairs and cut them into two datasets.
   *
   * @param {number} ratio - Fraction of pairs for the first dataset. Clamped
   *   to the range [0, 1].
   * @param {number} [seed] - Optional seed; the same seed yields the same
   *   split. Without it `Math.random` is used.
   * @returns {[PreferenceDataset, PreferenceDataset]}
   */
  split(ratio, seed) {
    const rng = seed !== void 0 ? this.seededRandom(seed) : Math.random;
    const shuffled = this.#shuffledCopy(rng);
    // Clamp so a negative ratio cannot turn into a negative slice index.
    const clampedRatio = Math.min(1, Math.max(0, ratio));
    const splitIndex = Math.floor(shuffled.length * clampedRatio);
    return [
      new _PreferenceDataset(shuffled.slice(0, splitIndex)),
      new _PreferenceDataset(shuffled.slice(splitIndex))
    ];
  }

  /**
   * @param {number} [seed] - Optional seed for a reproducible order.
   * @returns {PreferenceDataset} New dataset with the pairs in shuffled order.
   */
  shuffle(seed) {
    const rng = seed !== void 0 ? this.seededRandom(seed) : Math.random;
    return new _PreferenceDataset(this.#shuffledCopy(rng));
  }

  /**
   * Compute size, average lengths, unique prompt count and per-model counts.
   *
   * The response length of a pair is the mean of its chosen and rejected
   * lengths. A model is counted once per appearance, whether as the chosen or
   * the rejected side.
   *
   * @returns {{size: number, type: string, avgPromptLength: number,
   *   avgResponseLength: number, uniquePrompts: number,
   *   modelDistribution: Object<string, number>}}
   */
  calculateStats() {
    const models = /* @__PURE__ */ new Map();
    let totalPromptLen = 0;
    let totalResponseLen = 0;
    const uniquePrompts = /* @__PURE__ */ new Set();
    for (const pair of this.pairs) {
      uniquePrompts.add(pair.prompt);
      totalPromptLen += pair.prompt.length;
      totalResponseLen += (pair.chosen.length + pair.rejected.length) / 2;
      if (pair.chosenModel) {
        models.set(pair.chosenModel, (models.get(pair.chosenModel) ?? 0) + 1);
      }
      if (pair.rejectedModel) {
        models.set(
          pair.rejectedModel,
          (models.get(pair.rejectedModel) ?? 0) + 1
        );
      }
    }
    const total = this.pairs.length;
    return {
      size: total,
      type: "preference",
      avgPromptLength: total > 0 ? totalPromptLen / total : 0,
      avgResponseLength: total > 0 ? totalResponseLen / total : 0,
      uniquePrompts: uniquePrompts.size,
      modelDistribution: Object.fromEntries(models)
    };
  }

  /**
   * Create a deterministic pseudo-random generator (linear congruential,
   * modulus 233280) that returns values in [0, 1).
   *
   * Not suitable for anything security-sensitive.
   *
   * @param {number} seed - Any finite number. Negative seeds are wrapped into
   *   the generator's range so the output can never be negative.
   * @returns {() => number}
   * @throws {RangeError} If the seed is NaN or infinite.
   */
  seededRandom(seed) {
    const MODULUS = 233280;
    const numericSeed = Number(seed);
    if (!Number.isFinite(numericSeed)) {
      throw new RangeError(`Seed must be a finite number, received: ${seed}`);
    }
    // JavaScript's % keeps the sign of the dividend; a negative state would
    // produce negative "random" values and out-of-bounds swap indices.
    const remainder = numericSeed % MODULUS;
    let state = remainder < 0 ? remainder + MODULUS : remainder;
    return () => {
      state = (state * 9301 + 49297) % MODULUS;
      return state / MODULUS;
    };
  }

  /**
   * Fisher–Yates shuffle of a copy of `this.pairs`.
   * @param {() => number} rng - Generator returning values in [0, 1).
   * @returns {Array<object>}
   */
  #shuffledCopy(rng) {
    const shuffled = [...this.pairs];
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(rng() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    return shuffled;
  }
};
|
|
92
|
+
/**
 * Builds a `PreferenceDataset` from preference feedback held in a feedback
 * store.
 */
var PreferenceDatasetBuilder = class {
  feedbackStore;
  sampling;

  /**
   * @param {object} config
   * @param {object} config.feedbackStore - Must expose an async
   *   `query(filter)` resolving to `{ items }`.
   * @param {object} [config.sampling] - Optional sampling settings; the
   *   fields read here are `type`, `seed`, `minConfidence`, `stratifyBy` and
   *   `stratifyRatios`.
   */
  constructor(config) {
    this.feedbackStore = config.feedbackStore;
    this.sampling = config.sampling;
  }

  /**
   * Build preference dataset from feedback.
   *
   * Steps, in order: query the store, convert items to pairs (ties and
   * malformed items are skipped), apply `filterFn`, deduplicate, apply the
   * configured sampling, then truncate to `maxPairs`.
   *
   * @param {object} [options]
   * @param {number} [options.minPairs=0] - A warning is logged when fewer
   *   pairs are produced; nothing is thrown.
   * @param {number} [options.maxPairs=Infinity] - Upper bound on the result.
   * @param {(pair: object) => boolean} [options.filterFn]
   * @param {string} [options.deduplication] - `"none"`, `"prompt"`, or any
   *   other value for full prompt+chosen+rejected matching.
   * @returns {Promise<PreferenceDataset>}
   */
  async build(options = {}) {
    const minPairs = options.minPairs ?? 0;
    const maxPairs = options.maxPairs ?? Infinity;
    const result = await this.feedbackStore.query({
      type: "preference",
      minConfidence: this.sampling?.minConfidence,
      // Fetch extra to account for filtering.
      // NOTE(review): this is Infinity when maxPairs is not set — confirm the
      // store accepts a non-finite limit.
      limit: maxPairs * 2
    });
    let pairs = [];
    let malformed = 0;
    for (const feedback of result.items) {
      if (feedback?.preference === "tie") {
        continue;
      }
      const pair = this.#toPreferencePair(feedback);
      if (pair === null) {
        malformed++;
        continue;
      }
      pairs.push(pair);
    }
    if (malformed > 0) {
      console.warn(`Skipped ${malformed} malformed preference feedback item(s)`);
    }
    if (options.filterFn) {
      pairs = pairs.filter(options.filterFn);
    }
    if (options.deduplication && options.deduplication !== "none") {
      pairs = this.deduplicate(pairs, options.deduplication);
    }
    if (this.sampling) {
      pairs = this.applySampling(pairs);
    }
    if (pairs.length > maxPairs) {
      pairs = pairs.slice(0, maxPairs);
    }
    if (pairs.length < minPairs) {
      console.warn(
        `Only ${pairs.length} pairs available, requested minimum ${minPairs}`
      );
    }
    return new PreferenceDataset(pairs);
  }

  /**
   * Deduplicate pairs, keeping the first occurrence of each key.
   *
   * @param {Array<object>} pairs
   * @param {string} mode - `"prompt"` keys on the prompt alone; any other
   *   value keys on prompt, chosen and rejected together.
   * @returns {Array<object>}
   */
  deduplicate(pairs, mode) {
    const seen = /* @__PURE__ */ new Set();
    const result = [];
    for (const pair of pairs) {
      // JSON-encode the tuple rather than joining with a separator: a plain
      // join would merge distinct pairs whose text contains the separator.
      const key = mode === "prompt" ? pair.prompt : JSON.stringify([pair.prompt, pair.chosen, pair.rejected]);
      if (!seen.has(key)) {
        seen.add(key);
        result.push(pair);
      }
    }
    return result;
  }

  /**
   * Apply sampling strategy selected by `sampling.type`. Unknown types return
   * the input unchanged.
   *
   * @param {Array<object>} pairs
   * @returns {Array<object>}
   */
  applySampling(pairs) {
    if (!this.sampling) return pairs;
    switch (this.sampling.type) {
      case "random":
        return this.randomSample(pairs);
      case "balanced":
        return this.balancedSample(pairs);
      case "stratified":
        return this.stratifiedSample(pairs);
      default:
        return pairs;
    }
  }

  /**
   * Return a shuffled copy of `pairs` (Fisher–Yates). Uses `sampling.seed`
   * when set, otherwise `Math.random`. No pairs are dropped.
   *
   * @param {Array<object>} pairs
   * @returns {Array<object>}
   */
  randomSample(pairs) {
    const shuffled = [...pairs];
    const seed = this.sampling?.seed;
    const rng = seed !== void 0 ? this.seededRandom(seed) : Math.random;
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(rng() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    return shuffled;
  }

  /**
   * Keep pairs whose confidence is at least `sampling.minConfidence`
   * (default 0). Pairs without a confidence value are treated as 1.
   *
   * NOTE(review): despite the name, no class balancing happens here — confirm
   * whether that is intended.
   *
   * @param {Array<object>} pairs
   * @returns {Array<object>}
   */
  balancedSample(pairs) {
    const minConf = this.sampling?.minConfidence ?? 0;
    return pairs.filter((p) => (p.confidence ?? 1) >= minConf);
  }

  /**
   * Group pairs by `sampling.stratifyBy` (default `"chosenModel"`) and take
   * up to `ceil(total * ratio)` pairs from the head of each group. The ratio
   * comes from `sampling.stratifyRatios[group]`, falling back to an equal
   * share across groups.
   *
   * @param {Array<object>} pairs
   * @returns {Array<object>}
   */
  stratifiedSample(pairs) {
    const field = this.sampling?.stratifyBy ?? "chosenModel";
    const groups = /* @__PURE__ */ new Map();
    for (const pair of pairs) {
      const key = String(pair[field] ?? "unknown");
      if (!groups.has(key)) {
        groups.set(key, []);
      }
      groups.get(key).push(pair);
    }
    const result = [];
    const ratios = this.sampling?.stratifyRatios ?? {};
    for (const [key, group] of groups) {
      // Own-property check: a group named "constructor" or "toString" must
      // not pick up a value inherited from Object.prototype.
      const configured = Object.prototype.hasOwnProperty.call(ratios, key) ? ratios[key] : void 0;
      const ratio = configured ?? 1 / groups.size;
      const count = Math.ceil(pairs.length * ratio);
      result.push(...group.slice(0, count));
    }
    return result;
  }

  /**
   * Create a deterministic pseudo-random generator (linear congruential,
   * modulus 233280) that returns values in [0, 1).
   *
   * @param {number} seed - Any finite number. Negative seeds are wrapped into
   *   the generator's range so the output can never be negative.
   * @returns {() => number}
   * @throws {RangeError} If the seed is NaN or infinite.
   */
  seededRandom(seed) {
    const MODULUS = 233280;
    const numericSeed = Number(seed);
    if (!Number.isFinite(numericSeed)) {
      throw new RangeError(`Seed must be a finite number, received: ${seed}`);
    }
    // JavaScript's % keeps the sign of the dividend; a negative state would
    // produce negative values and out-of-bounds swap indices in the shuffle.
    const remainder = numericSeed % MODULUS;
    let state = remainder < 0 ? remainder + MODULUS : remainder;
    return () => {
      state = (state * 9301 + 49297) % MODULUS;
      return state / MODULUS;
    };
  }

  /**
   * Convert one feedback item into a preference pair.
   *
   * @param {object} feedback - Read as `{ preference, input, responseA,
   *   responseB, reason, confidence, metadata }`.
   * @returns {object|null} The pair, or null when the preference is not
   *   `"A"`/`"B"` or one of the responses is missing.
   */
  #toPreferencePair(feedback) {
    if (!feedback) return null;
    const { preference, responseA, responseB } = feedback;
    // Only an explicit "A" or "B" selects a winner; anything else used to be
    // silently treated as "B".
    if (preference !== "A" && preference !== "B") return null;
    if (!responseA || !responseB) return null;
    const [chosen, rejected] = preference === "A" ? [responseA, responseB] : [responseB, responseA];
    return {
      id: nanoid(),
      prompt: feedback.input,
      chosen: chosen.content,
      rejected: rejected.content,
      chosenModel: chosen.model,
      rejectedModel: rejected.model,
      reason: feedback.reason,
      confidence: feedback.confidence,
      metadata: feedback.metadata
    };
  }
};
|
|
224
|
+
/**
 * Factory wrapper around the `PreferenceDatasetBuilder` constructor.
 *
 * @param {object} config - Passed to the constructor unchanged.
 * @returns {PreferenceDatasetBuilder} A newly constructed builder.
 */
function createPreferenceDatasetBuilder(config) {
  const builder = new PreferenceDatasetBuilder(config);
  return builder;
}
|
|
227
|
+
|
|
228
|
+
// src/datasets/DatasetExporter.ts
|
|
229
|
+
import * as fs from "fs/promises";
|
|
230
|
+
/**
 * Serializes preference pairs into several text formats and optionally writes
 * the result to disk.
 */
var DatasetExporter = class {
  /**
   * Export preference dataset to file.
   *
   * @param {object} dataset - Must expose `getPairs()`.
   * @param {object} options
   * @param {string} options.format - One of `"jsonl"`, `"json"`, `"csv"`,
   *   `"huggingface"`, `"anthropic"`, `"openai"`.
   * @param {string} [options.path] - Destination file. When omitted nothing
   *   is written (except for `"huggingface"`, which always writes a file).
   * @param {object} [options.formatOptions] - Format-specific settings.
   * @returns {Promise<object>} `{ format, path, itemCount, bytesWritten,
   *   warnings }`, where `bytesWritten` is the UTF-8 size of the serialized
   *   content.
   * @throws {Error} If the format is not supported.
   */
  async exportPreferences(dataset, options) {
    const pairs = dataset.getPairs();
    let content;
    const warnings = [];
    switch (options.format) {
      case "jsonl":
        content = this.toJSONL(pairs, options);
        break;
      case "json":
        content = JSON.stringify(pairs, null, 2);
        break;
      case "csv":
        content = this.toCSV(pairs);
        break;
      case "huggingface":
        return this.exportToHuggingFace(pairs, options);
      case "anthropic":
        content = this.toAnthropicFormat(pairs);
        break;
      case "openai":
        content = this.toOpenAIFormat(pairs);
        break;
      default:
        throw new Error(`Unsupported export format: ${options.format}`);
    }
    if (options.path) {
      await fs.writeFile(options.path, content, "utf-8");
    }
    return {
      format: options.format,
      path: options.path,
      itemCount: pairs.length,
      bytesWritten: Buffer.byteLength(content, "utf-8"),
      warnings: warnings.length > 0 ? warnings : void 0
    };
  }

  /**
   * Convert to JSONL format, one JSON object per line.
   *
   * @param {Array<object>} pairs
   * @param {object} [options] - `options.formatOptions.format` selects the
   *   record shape: `"dpo"` (default) emits prompt/chosen/rejected, `"sft"`
   *   emits instruction/output, anything else emits the whole pair.
   * @returns {string}
   */
  toJSONL(pairs, options) {
    const format = options?.formatOptions?.format ?? "dpo";
    return pairs.map((pair) => {
      switch (format) {
        case "dpo":
          return JSON.stringify({
            prompt: pair.prompt,
            chosen: pair.chosen,
            rejected: pair.rejected
          });
        case "sft":
          return JSON.stringify({
            instruction: pair.prompt,
            output: pair.chosen
          });
        default:
          return JSON.stringify(pair);
      }
    }).join("\n");
  }

  /**
   * Convert to CSV format with a header row. Every field is passed through
   * `escapeCSV`, including the model names.
   *
   * @param {Array<object>} pairs
   * @returns {string}
   */
  toCSV(pairs) {
    const headers = [
      "prompt",
      "chosen",
      "rejected",
      "chosen_model",
      "rejected_model",
      "confidence"
    ];
    const rows = pairs.map(
      (pair) => [
        pair.prompt,
        pair.chosen,
        pair.rejected,
        pair.chosenModel ?? "",
        pair.rejectedModel ?? "",
        pair.confidence?.toString() ?? ""
      ].map((field) => this.escapeCSV(field)).join(",")
    );
    return [headers.join(","), ...rows].join("\n");
  }

  /**
   * Convert to Anthropic format: JSONL records with a Human/Assistant prompt
   * and the chosen response as completion. Rejected responses are not
   * emitted.
   *
   * @param {Array<object>} pairs
   * @returns {string}
   */
  toAnthropicFormat(pairs) {
    return pairs.map(
      (pair) => JSON.stringify({
        prompt: `\n\nHuman: ${pair.prompt}\n\nAssistant:`,
        completion: ` ${pair.chosen}`
      })
    ).join("\n");
  }

  /**
   * Convert to OpenAI format: JSONL records holding a user/assistant message
   * pair built from the prompt and the chosen response. Rejected responses
   * are not emitted.
   *
   * @param {Array<object>} pairs
   * @returns {string}
   */
  toOpenAIFormat(pairs) {
    return pairs.map(
      (pair) => JSON.stringify({
        messages: [
          { role: "user", content: pair.prompt },
          { role: "assistant", content: pair.chosen }
        ]
      })
    ).join("\n");
  }

  /**
   * Export to HuggingFace Hub (stub).
   *
   * Nothing is uploaded: the pairs are written locally as DPO-style JSONL to
   * `options.path`, or to `./<name>.jsonl` when no path is given.
   *
   * @param {Array<object>} pairs
   * @param {object} options - `options.formatOptions.token` is required;
   *   `options.formatOptions.name` is used for the default file name.
   * @returns {Promise<object>}
   * @throws {Error} If no token is supplied.
   */
  async exportToHuggingFace(pairs, options) {
    const hfOptions = options.formatOptions;
    if (!hfOptions?.token) {
      throw new Error("HuggingFace token is required for Hub export");
    }
    console.warn(
      "HuggingFace Hub export not fully implemented. Saving locally instead."
    );
    const localPath = options.path ?? `./${hfOptions.name ?? "dataset"}.jsonl`;
    const content = this.toJSONL(pairs, { formatOptions: { format: "dpo" } });
    await fs.writeFile(localPath, content, "utf-8");
    return {
      format: "huggingface",
      path: localPath,
      itemCount: pairs.length,
      bytesWritten: Buffer.byteLength(content, "utf-8"),
      warnings: ["Exported locally. Use @huggingface/hub to push to Hub."]
    };
  }

  /**
   * Export to multiple formats, one file per format.
   *
   * Files are named `<basePath>.<ext>`. Several formats share the `jsonl`
   * extension; when a name is already taken by an earlier format in the same
   * call, the later one is written to `<basePath>.<format>.<ext>` instead of
   * overwriting it. A format listed more than once is exported once.
   *
   * @param {object} dataset - Must expose `getPairs()`.
   * @param {Iterable<string>} formats
   * @param {string} basePath - Path without extension.
   * @returns {Promise<Map<string, object>>} Export result per format.
   */
  async exportMultiple(dataset, formats, basePath) {
    const results = /* @__PURE__ */ new Map();
    const usedPaths = /* @__PURE__ */ new Set();
    for (const format of formats) {
      if (results.has(format)) {
        continue;
      }
      const ext = this.getExtension(format);
      let targetPath = `${basePath}.${ext}`;
      if (usedPaths.has(targetPath)) {
        targetPath = `${basePath}.${format}.${ext}`;
      }
      usedPaths.add(targetPath);
      const result = await this.exportPreferences(dataset, { format, path: targetPath });
      results.set(format, result);
    }
    return results;
  }

  /**
   * Get file extension for format. Unrecognized formats map to `"jsonl"`.
   *
   * @param {string} format
   * @returns {string}
   */
  getExtension(format) {
    switch (format) {
      case "jsonl":
        return "jsonl";
      case "json":
        return "json";
      case "csv":
        return "csv";
      case "parquet":
        return "parquet";
      default:
        return "jsonl";
    }
  }

  /**
   * Escape CSV value: fields containing a comma, double quote, CR or LF are
   * wrapped in double quotes with embedded quotes doubled.
   *
   * @param {*} value - Non-strings are converted with `String`; null and
   *   undefined become the empty string.
   * @returns {string}
   */
  escapeCSV(value) {
    const text = value == null ? "" : String(value);
    if (/[",\r\n]/.test(text)) {
      return `"${text.replace(/"/g, '""')}"`;
    }
    return text;
  }
};
|
|
406
|
+
/**
 * Factory wrapper around the `DatasetExporter` constructor.
 *
 * @returns {DatasetExporter} A newly constructed exporter.
 */
function createDatasetExporter() {
  const exporter = new DatasetExporter();
  return exporter;
}
|
|
409
|
+
|
|
410
|
+
export {
|
|
411
|
+
PreferenceDataset,
|
|
412
|
+
PreferenceDatasetBuilder,
|
|
413
|
+
createPreferenceDatasetBuilder,
|
|
414
|
+
DatasetExporter,
|
|
415
|
+
createDatasetExporter
|
|
416
|
+
};
|