@lov3kaizen/agentsea-evaluate 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/dist/annotation/index.d.mts +3 -0
  4. package/dist/annotation/index.d.ts +3 -0
  5. package/dist/annotation/index.js +630 -0
  6. package/dist/annotation/index.mjs +22 -0
  7. package/dist/chunk-5JRYKRSE.mjs +2791 -0
  8. package/dist/chunk-EUXXIZK3.mjs +676 -0
  9. package/dist/chunk-NBMUSATK.mjs +596 -0
  10. package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
  11. package/dist/chunk-TUMNJN2S.mjs +416 -0
  12. package/dist/continuous/index.d.mts +2 -0
  13. package/dist/continuous/index.d.ts +2 -0
  14. package/dist/continuous/index.js +707 -0
  15. package/dist/continuous/index.mjs +16 -0
  16. package/dist/datasets/index.d.mts +1 -0
  17. package/dist/datasets/index.d.ts +1 -0
  18. package/dist/datasets/index.js +456 -0
  19. package/dist/datasets/index.mjs +14 -0
  20. package/dist/evaluation/index.d.mts +1 -0
  21. package/dist/evaluation/index.d.ts +1 -0
  22. package/dist/evaluation/index.js +2853 -0
  23. package/dist/evaluation/index.mjs +78 -0
  24. package/dist/feedback/index.d.mts +2 -0
  25. package/dist/feedback/index.d.ts +2 -0
  26. package/dist/feedback/index.js +1158 -0
  27. package/dist/feedback/index.mjs +40 -0
  28. package/dist/index-6Pbiq7ny.d.mts +234 -0
  29. package/dist/index-6Pbiq7ny.d.ts +234 -0
  30. package/dist/index-BNTycFEA.d.mts +479 -0
  31. package/dist/index-BNTycFEA.d.ts +479 -0
  32. package/dist/index-CTYCfWfH.d.mts +543 -0
  33. package/dist/index-CTYCfWfH.d.ts +543 -0
  34. package/dist/index-Cq5LwG_3.d.mts +322 -0
  35. package/dist/index-Cq5LwG_3.d.ts +322 -0
  36. package/dist/index-bPghFsfP.d.mts +315 -0
  37. package/dist/index-bPghFsfP.d.ts +315 -0
  38. package/dist/index.d.mts +81 -0
  39. package/dist/index.d.ts +81 -0
  40. package/dist/index.js +5962 -0
  41. package/dist/index.mjs +429 -0
  42. package/package.json +102 -0
@@ -0,0 +1 @@
1
+ export { v as DatasetExporter, s as PreferenceDataset, t as PreferenceDatasetBuilder, w as createDatasetExporter, u as createPreferenceDatasetBuilder } from '../index-6Pbiq7ny.js';
@@ -0,0 +1,456 @@
1
+ "use strict";
2
// Bundler-emitted CommonJS/ESM interop helpers. Short aliases for the Object
// reflection primitives used below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Define every enumerable key of `all` on `target` as an enumerable getter.
 * Each value in `all` is used directly as the getter function.
 */
var __export = (target, all) => {
  for (var name in all) {
    var descriptor = { get: all[name], enumerable: true };
    __defProp(target, name, descriptor);
  }
};

/**
 * Copy the own properties of `from` onto `to` as live getters, skipping the
 * key named by `except` and any key `to` already owns. Returns `to`.
 * The fourth parameter is only scratch storage for the source descriptor.
 */
var __copyProps = (to, from, except, desc) => {
  var isCopyable = typeof from === "function" || (from && typeof from === "object");
  if (!isCopyable) {
    return to;
  }
  __getOwnPropNames(from).forEach((key) => {
    if (key === except || __hasOwnProp.call(to, key)) {
      return;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      // A missing descriptor is treated as enumerable.
      enumerable: desc ? desc.enumerable : true
    });
  });
  return to;
};

/**
 * Wrap a required module in an ESM-style namespace object.
 * If the importer is in node compatibility mode, or the module is not an ESM
 * file converted to CommonJS by a Babel-compatible transform (i.e.
 * "__esModule" has not been set), "default" is set to the CommonJS
 * "module.exports" value for node compatibility.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  if (isNodeMode || !mod || !mod.__esModule) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

/**
 * Produce a CommonJS exports object: a fresh object flagged with
 * `__esModule: true` that mirrors the own properties of `mod`.
 */
var __toCommonJS = (mod) => {
  var flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
29
+
30
+ // src/datasets/index.ts
31
+ var datasets_exports = {};
32
+ __export(datasets_exports, {
33
+ DatasetExporter: () => DatasetExporter,
34
+ PreferenceDataset: () => PreferenceDataset,
35
+ PreferenceDatasetBuilder: () => PreferenceDatasetBuilder,
36
+ createDatasetExporter: () => createDatasetExporter,
37
+ createPreferenceDatasetBuilder: () => createPreferenceDatasetBuilder
38
+ });
39
+ module.exports = __toCommonJS(datasets_exports);
40
+
41
+ // src/datasets/PreferenceDatasetBuilder.ts
42
+ var import_nanoid = require("nanoid");
43
/**
 * A collection of preference pairs (`prompt` / `chosen` / `rejected`) with
 * helpers for filtering, sampling, splitting and shuffling. Every transforming
 * method returns a new dataset and leaves this one's pair array untouched.
 */
var PreferenceDataset = class _PreferenceDataset {
  type = "preference";
  pairs;
  _stats;

  /**
   * @param {Array<object>} pairs - Preference pairs. The array is stored by
   *   reference, not copied.
   */
  constructor(pairs) {
    this.pairs = pairs;
  }

  /** Number of pairs in the dataset. */
  get size() {
    return this.pairs.length;
  }

  /** Aggregate statistics, computed on first access and cached afterwards. */
  get stats() {
    if (!this._stats) {
      this._stats = this.calculateStats();
    }
    return this._stats;
  }

  /** @returns {Array<object>} A shallow copy of the pair array. */
  getPairs() {
    return [...this.pairs];
  }

  /**
   * @param {(pair: object, index: number, all: object[]) => boolean} predicate
   * @returns {PreferenceDataset} New dataset with the pairs that pass `predicate`.
   */
  filter(predicate) {
    return new _PreferenceDataset(this.pairs.filter(predicate));
  }

  /**
   * Draw up to `count` pairs uniformly at random, without replacement.
   *
   * @param {number} count - Desired sample size. Values at or above the
   *   dataset size return a copy of everything in original order; negative
   *   values yield an empty dataset.
   * @returns {PreferenceDataset}
   */
  sample(count) {
    if (count >= this.pairs.length) {
      return new _PreferenceDataset([...this.pairs]);
    }
    const shuffled = this.shuffledCopy(Math.random);
    // Clamp so a negative count cannot be read by slice() as "all but the last k".
    return new _PreferenceDataset(shuffled.slice(0, Math.max(0, count)));
  }

  /**
   * Randomly partition the dataset in two.
   *
   * @param {number} ratio - Fraction of pairs that goes into the first part.
   * @returns {[PreferenceDataset, PreferenceDataset]}
   */
  split(ratio) {
    const shuffled = this.shuffledCopy(Math.random);
    const splitIndex = Math.floor(shuffled.length * ratio);
    return [
      new _PreferenceDataset(shuffled.slice(0, splitIndex)),
      new _PreferenceDataset(shuffled.slice(splitIndex))
    ];
  }

  /**
   * Return a shuffled copy of the dataset.
   *
   * @param {number} [seed] - When given, the order is reproducible for that
   *   seed; otherwise `Math.random` drives the shuffle.
   * @returns {PreferenceDataset}
   */
  shuffle(seed) {
    const rng = seed !== void 0 ? this.seededRandom(seed) : Math.random;
    return new _PreferenceDataset(this.shuffledCopy(rng));
  }

  /**
   * Fisher–Yates shuffle of a copy of the pairs.
   *
   * @param {() => number} rng - Source of numbers in [0, 1).
   * @returns {Array<object>}
   */
  shuffledCopy(rng) {
    const shuffled = [...this.pairs];
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(rng() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    return shuffled;
  }

  /**
   * Compute size, average prompt/response lengths, number of distinct prompts
   * and how often each model name appears as chosen or rejected.
   */
  calculateStats() {
    const models = /* @__PURE__ */ new Map();
    const uniquePrompts = /* @__PURE__ */ new Set();
    let totalPromptLen = 0;
    let totalResponseLen = 0;
    for (const pair of this.pairs) {
      uniquePrompts.add(pair.prompt);
      totalPromptLen += pair.prompt.length;
      // Response length of a pair is the mean of its two responses.
      totalResponseLen += (pair.chosen.length + pair.rejected.length) / 2;
      if (pair.chosenModel) {
        models.set(pair.chosenModel, (models.get(pair.chosenModel) ?? 0) + 1);
      }
      if (pair.rejectedModel) {
        models.set(pair.rejectedModel, (models.get(pair.rejectedModel) ?? 0) + 1);
      }
    }
    const count = this.pairs.length;
    return {
      size: count,
      type: "preference",
      avgPromptLength: count > 0 ? totalPromptLen / count : 0,
      avgResponseLength: count > 0 ? totalResponseLen / count : 0,
      uniquePrompts: uniquePrompts.size,
      modelDistribution: Object.fromEntries(models)
    };
  }

  /**
   * Build a deterministic linear-congruential generator.
   *
   * @param {number} seed - Initial state; negative seeds are supported.
   * @returns {() => number} Generator yielding values in [0, 1).
   */
  seededRandom(seed) {
    const modulus = 233280;
    let state = seed;
    return () => {
      state = (state * 9301 + 49297) % modulus;
      // `%` keeps the sign of the dividend; a negative state would produce a
      // negative index in the shuffle and write `undefined` into the result.
      if (state < 0) {
        state += modulus;
      }
      return state / modulus;
    };
  }
};
132
/**
 * Builds a PreferenceDataset out of "preference" feedback held in a feedback
 * store, with optional filtering, de-duplication and sampling.
 */
var PreferenceDatasetBuilder = class {
  feedbackStore;
  sampling;

  /**
   * @param {object} config
   * @param {object} config.feedbackStore - Store exposing an async `query()`.
   * @param {object} [config.sampling] - Sampling settings; the fields read
   *   here are `type`, `seed`, `minConfidence`, `stratifyBy` and `stratifyRatios`.
   */
  constructor(config) {
    this.feedbackStore = config.feedbackStore;
    this.sampling = config.sampling;
  }

  /**
   * Build preference dataset from feedback.
   *
   * Steps, in order: query the store, turn every non-tie item into a pair,
   * apply `filterFn`, de-duplicate, apply the sampling strategy, truncate to
   * `maxPairs`, and warn (not throw) when fewer than `minPairs` remain.
   *
   * @param {object} [options]
   * @param {number} [options.minPairs=0]
   * @param {number} [options.maxPairs=Infinity]
   * @param {(pair: object) => boolean} [options.filterFn]
   * @param {string} [options.deduplication] - "none", "prompt", or any other
   *   value for exact (prompt + chosen + rejected) matching.
   * @returns {Promise<PreferenceDataset>}
   */
  async build(options = {}) {
    const minPairs = options.minPairs ?? 0;
    const maxPairs = options.maxPairs ?? Infinity;
    const result = await this.feedbackStore.query({
      type: "preference",
      minConfidence: this.sampling?.minConfidence,
      // Fetch extra to account for filtering
      limit: maxPairs * 2
    });
    let pairs = [];
    for (const feedback of result.items) {
      if (feedback.preference === "tie") {
        // Ties carry no chosen/rejected signal.
        continue;
      }
      const preferA = feedback.preference === "A";
      const chosen = preferA ? feedback.responseA : feedback.responseB;
      const rejected = preferA ? feedback.responseB : feedback.responseA;
      pairs.push({
        id: (0, import_nanoid.nanoid)(),
        prompt: feedback.input,
        chosen: chosen.content,
        rejected: rejected.content,
        chosenModel: chosen.model,
        rejectedModel: rejected.model,
        reason: feedback.reason,
        confidence: feedback.confidence,
        metadata: feedback.metadata
      });
    }
    if (options.filterFn) {
      pairs = pairs.filter(options.filterFn);
    }
    if (options.deduplication && options.deduplication !== "none") {
      pairs = this.deduplicate(pairs, options.deduplication);
    }
    if (this.sampling) {
      pairs = this.applySampling(pairs);
    }
    if (pairs.length > maxPairs) {
      pairs = pairs.slice(0, maxPairs);
    }
    if (pairs.length < minPairs) {
      console.warn(
        `Only ${pairs.length} pairs available, requested minimum ${minPairs}`
      );
    }
    return new PreferenceDataset(pairs);
  }

  /**
   * Deduplicate pairs, keeping the first occurrence of each key.
   *
   * @param {Array<object>} pairs
   * @param {string} mode - "prompt" keys on the prompt alone; any other value
   *   keys on the (prompt, chosen, rejected) triple.
   * @returns {Array<object>}
   */
  deduplicate(pairs, mode) {
    const seen = /* @__PURE__ */ new Set();
    const result = [];
    for (const pair of pairs) {
      // JSON-encode the triple so field boundaries stay unambiguous; a plain
      // "|" join would merge e.g. ("a|b", "c") with ("a", "b|c").
      const key = mode === "prompt" ? pair.prompt : JSON.stringify([pair.prompt, pair.chosen, pair.rejected]);
      if (!seen.has(key)) {
        seen.add(key);
        result.push(pair);
      }
    }
    return result;
  }

  /**
   * Apply sampling strategy selected by `sampling.type`; unknown types and a
   * missing sampling config return the input unchanged.
   */
  applySampling(pairs) {
    if (!this.sampling) return pairs;
    switch (this.sampling.type) {
      case "random":
        return this.randomSample(pairs);
      case "balanced":
        return this.balancedSample(pairs);
      case "stratified":
        return this.stratifiedSample(pairs);
      default:
        return pairs;
    }
  }

  /**
   * Return a Fisher–Yates-shuffled copy of `pairs`, reproducible when
   * `sampling.seed` is set.
   */
  randomSample(pairs) {
    const shuffled = [...pairs];
    const seed = this.sampling?.seed;
    const rng = seed !== void 0 ? this.seededRandom(seed) : Math.random;
    for (let i = shuffled.length - 1; i > 0; i--) {
      const j = Math.floor(rng() * (i + 1));
      [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]];
    }
    return shuffled;
  }

  /**
   * Keep pairs whose confidence is at least `sampling.minConfidence`
   * (default 0); a pair without a confidence counts as 1.
   */
  balancedSample(pairs) {
    const minConf = this.sampling?.minConfidence ?? 0;
    return pairs.filter((p) => (p.confidence ?? 1) >= minConf);
  }

  /**
   * Group pairs by `sampling.stratifyBy` (default "chosenModel") and take the
   * leading `ceil(total * ratio)` pairs of each group. A group's ratio comes
   * from `sampling.stratifyRatios`, falling back to an equal share.
   */
  stratifiedSample(pairs) {
    const field = this.sampling?.stratifyBy ?? "chosenModel";
    const groups = /* @__PURE__ */ new Map();
    for (const pair of pairs) {
      const key = String(pair[field] ?? "unknown");
      if (!groups.has(key)) {
        groups.set(key, []);
      }
      groups.get(key).push(pair);
    }
    const result = [];
    const ratios = this.sampling?.stratifyRatios ?? {};
    for (const [key, group] of groups) {
      const ratio = ratios[key] ?? 1 / groups.size;
      const count = Math.ceil(pairs.length * ratio);
      result.push(...group.slice(0, count));
    }
    return result;
  }

  /**
   * Build a deterministic linear-congruential generator.
   *
   * @param {number} seed - Initial state; negative seeds are supported.
   * @returns {() => number} Generator yielding values in [0, 1).
   */
  seededRandom(seed) {
    const modulus = 233280;
    let state = seed;
    return () => {
      state = (state * 9301 + 49297) % modulus;
      // `%` keeps the sign of the dividend; a negative state would produce a
      // negative index in the shuffle and write `undefined` into the result.
      if (state < 0) {
        state += modulus;
      }
      return state / modulus;
    };
  }
};
264
/**
 * Factory for PreferenceDatasetBuilder.
 *
 * @param {object} config - Passed unchanged to the PreferenceDatasetBuilder constructor.
 * @returns {PreferenceDatasetBuilder}
 */
function createPreferenceDatasetBuilder(config) {
  const builder = new PreferenceDatasetBuilder(config);
  return builder;
}
267
+
268
+ // src/datasets/DatasetExporter.ts
269
+ var fs = __toESM(require("fs/promises"));
270
/**
 * Serialises preference datasets to JSONL, JSON, CSV, Anthropic-style and
 * OpenAI-style text, optionally writing the result to disk.
 */
var DatasetExporter = class {
  /**
   * Export preference dataset to file.
   *
   * @param {object} dataset - Object exposing `getPairs()`.
   * @param {object} options
   * @param {string} options.format - "jsonl", "json", "csv", "huggingface",
   *   "anthropic" or "openai".
   * @param {string} [options.path] - Destination file; nothing is written
   *   when omitted (except for "huggingface", which picks its own default).
   * @param {object} [options.formatOptions]
   * @returns {Promise<object>} Summary with `format`, `path`, `itemCount`,
   *   `bytesWritten` (size of the serialised content) and `warnings`.
   * @throws {Error} When `options.format` is not supported.
   */
  async exportPreferences(dataset, options) {
    const pairs = dataset.getPairs();
    let content;
    const warnings = [];
    switch (options.format) {
      case "jsonl":
        content = this.toJSONL(pairs, options);
        break;
      case "json":
        content = JSON.stringify(pairs, null, 2);
        break;
      case "csv":
        content = this.toCSV(pairs);
        break;
      case "huggingface":
        return this.exportToHuggingFace(pairs, options);
      case "anthropic":
        content = this.toAnthropicFormat(pairs);
        break;
      case "openai":
        content = this.toOpenAIFormat(pairs);
        break;
      default:
        throw new Error(`Unsupported export format: ${options.format}`);
    }
    if (options.path) {
      await fs.writeFile(options.path, content, "utf-8");
    }
    return {
      format: options.format,
      path: options.path,
      itemCount: pairs.length,
      bytesWritten: Buffer.byteLength(content, "utf-8"),
      warnings: warnings.length > 0 ? warnings : void 0
    };
  }

  /**
   * Convert to JSONL format, one JSON object per line.
   *
   * `options.formatOptions.format` selects the record shape: "dpo" (default)
   * emits prompt/chosen/rejected, "sft" emits instruction/output, anything
   * else emits the full pair.
   */
  toJSONL(pairs, options) {
    const format = options?.formatOptions?.format ?? "dpo";
    return pairs.map((pair) => {
      switch (format) {
        case "dpo":
          return JSON.stringify({
            prompt: pair.prompt,
            chosen: pair.chosen,
            rejected: pair.rejected
          });
        case "sft":
          return JSON.stringify({
            instruction: pair.prompt,
            output: pair.chosen
          });
        default:
          return JSON.stringify(pair);
      }
    }).join("\n");
  }

  /**
   * Convert to CSV format with a header row. Every column is passed through
   * `escapeCSV`, so model names containing commas or quotes stay in one cell.
   */
  toCSV(pairs) {
    const headers = [
      "prompt",
      "chosen",
      "rejected",
      "chosen_model",
      "rejected_model",
      "confidence"
    ];
    const rows = pairs.map(
      (pair) => [
        pair.prompt,
        pair.chosen,
        pair.rejected,
        pair.chosenModel,
        pair.rejectedModel,
        pair.confidence
      ].map((cell) => this.escapeCSV(cell)).join(",")
    );
    return [headers.join(","), ...rows].join("\n");
  }

  /**
   * Convert to Anthropic format: one JSON line per pair holding a
   * Human/Assistant `prompt` and the chosen response as `completion`.
   */
  toAnthropicFormat(pairs) {
    return pairs.map(
      (pair) => JSON.stringify({
        prompt: `\n\nHuman: ${pair.prompt}\n\nAssistant:`,
        completion: ` ${pair.chosen}`
      })
    ).join("\n");
  }

  /**
   * Convert to OpenAI format: one JSON line per pair holding a user/assistant
   * `messages` array built from the prompt and the chosen response.
   */
  toOpenAIFormat(pairs) {
    return pairs.map(
      (pair) => JSON.stringify({
        messages: [
          { role: "user", content: pair.prompt },
          { role: "assistant", content: pair.chosen }
        ]
      })
    ).join("\n");
  }

  /**
   * Export to HuggingFace Hub (stub).
   *
   * Nothing is uploaded: the pairs are written locally as DPO-style JSONL
   * and the result carries a warning saying so.
   *
   * @throws {Error} When `options.formatOptions.token` is missing.
   */
  async exportToHuggingFace(pairs, options) {
    const hfOptions = options.formatOptions;
    if (!hfOptions?.token) {
      throw new Error("HuggingFace token is required for Hub export");
    }
    console.warn(
      "HuggingFace Hub export not fully implemented. Saving locally instead."
    );
    const localPath = options.path ?? `./${hfOptions.name ?? "dataset"}.jsonl`;
    const content = this.toJSONL(pairs, { formatOptions: { format: "dpo" } });
    await fs.writeFile(localPath, content, "utf-8");
    return {
      format: "huggingface",
      path: localPath,
      itemCount: pairs.length,
      warnings: ["Exported locally. Use @huggingface/hub to push to Hub."]
    };
  }

  /**
   * Export to multiple formats, sequentially.
   *
   * Each format is written to `<basePath>.<ext>`. Several formats share the
   * "jsonl" extension, so when a path has already been used in this call the
   * format name is inserted (`<basePath>.<format>.<ext>`) to stop a later
   * export from overwriting an earlier one.
   *
   * @param {object} dataset
   * @param {string[]} formats
   * @param {string} basePath - Output path without extension.
   * @returns {Promise<Map<string, object>>} Export result per format.
   */
  async exportMultiple(dataset, formats, basePath) {
    const results = /* @__PURE__ */ new Map();
    const usedPaths = /* @__PURE__ */ new Set();
    for (const format of formats) {
      const ext = this.getExtension(format);
      let path = `${basePath}.${ext}`;
      if (usedPaths.has(path)) {
        path = `${basePath}.${format}.${ext}`;
      }
      usedPaths.add(path);
      const result = await this.exportPreferences(dataset, { format, path });
      results.set(format, result);
    }
    return results;
  }

  /**
   * Get file extension for format; formats without a dedicated extension
   * fall back to "jsonl".
   */
  getExtension(format) {
    switch (format) {
      case "jsonl":
        return "jsonl";
      case "json":
        return "json";
      case "csv":
        return "csv";
      case "parquet":
        return "parquet";
      default:
        return "jsonl";
    }
  }

  /**
   * Escape CSV value: fields containing a comma, double quote, CR or LF are
   * wrapped in double quotes with embedded quotes doubled.
   *
   * @param {*} value - Coerced to a string; null/undefined become "".
   * @returns {string}
   */
  escapeCSV(value) {
    const text = String(value ?? "");
    if (/[",\r\n]/.test(text)) {
      return `"${text.replace(/"/g, '""')}"`;
    }
    return text;
  }
};
446
/**
 * Factory for DatasetExporter.
 *
 * @returns {DatasetExporter} A new exporter instance.
 */
function createDatasetExporter() {
  const exporter = new DatasetExporter();
  return exporter;
}
449
+ // Annotate the CommonJS export names for ESM import in node:
450
+ 0 && (module.exports = {
451
+ DatasetExporter,
452
+ PreferenceDataset,
453
+ PreferenceDatasetBuilder,
454
+ createDatasetExporter,
455
+ createPreferenceDatasetBuilder
456
+ });
@@ -0,0 +1,14 @@
1
+ import {
2
+ DatasetExporter,
3
+ PreferenceDataset,
4
+ PreferenceDatasetBuilder,
5
+ createDatasetExporter,
6
+ createPreferenceDatasetBuilder
7
+ } from "../chunk-TUMNJN2S.mjs";
8
+ export {
9
+ DatasetExporter,
10
+ PreferenceDataset,
11
+ PreferenceDatasetBuilder,
12
+ createDatasetExporter,
13
+ createPreferenceDatasetBuilder
14
+ };
@@ -0,0 +1 @@
1
+ export { X as Accuracy, W as BaseMetric, aj as CodeQualityRubric, $ as Coherence, al as ComparativeJudge, an as ConsensusJudge, a5 as ContextRelevance, a7 as CustomMetric, a as EvalDataset, Q as EvalRunner, E as EvaluationPipeline, a3 as Faithfulness, ak as HelpfulnessRubric, ae as LLMJudge, ai as QualityRubric, Z as Relevance, ag as RubricJudge, a1 as Toxicity, Y as createAccuracyMetric, a0 as createCoherenceMetric, am as createComparativeJudge, ao as createConsensusJudge, ad as createContainsMetric, a6 as createContextRelevanceMetric, a8 as createCustomMetric, O as createEvalDataset, U as createEvalRunner, V as createEvaluationPipeline, a4 as createFaithfulnessMetric, ac as createJSONMetric, af as createLLMJudge, aa as createLengthMetric, ab as createRegexMetric, _ as createRelevanceMetric, ah as createRubricJudge, a9 as createSimpleMetric, a2 as createToxicityMetric } from '../index-CTYCfWfH.mjs';
@@ -0,0 +1 @@
1
+ export { X as Accuracy, W as BaseMetric, aj as CodeQualityRubric, $ as Coherence, al as ComparativeJudge, an as ConsensusJudge, a5 as ContextRelevance, a7 as CustomMetric, a as EvalDataset, Q as EvalRunner, E as EvaluationPipeline, a3 as Faithfulness, ak as HelpfulnessRubric, ae as LLMJudge, ai as QualityRubric, Z as Relevance, ag as RubricJudge, a1 as Toxicity, Y as createAccuracyMetric, a0 as createCoherenceMetric, am as createComparativeJudge, ao as createConsensusJudge, ad as createContainsMetric, a6 as createContextRelevanceMetric, a8 as createCustomMetric, O as createEvalDataset, U as createEvalRunner, V as createEvaluationPipeline, a4 as createFaithfulnessMetric, ac as createJSONMetric, af as createLLMJudge, aa as createLengthMetric, ab as createRegexMetric, _ as createRelevanceMetric, ah as createRubricJudge, a9 as createSimpleMetric, a2 as createToxicityMetric } from '../index-CTYCfWfH.js';