@lov3kaizen/agentsea-evaluate 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/dist/annotation/index.d.mts +3 -0
  4. package/dist/annotation/index.d.ts +3 -0
  5. package/dist/annotation/index.js +630 -0
  6. package/dist/annotation/index.mjs +22 -0
  7. package/dist/chunk-5JRYKRSE.mjs +2791 -0
  8. package/dist/chunk-EUXXIZK3.mjs +676 -0
  9. package/dist/chunk-NBMUSATK.mjs +596 -0
  10. package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
  11. package/dist/chunk-TUMNJN2S.mjs +416 -0
  12. package/dist/continuous/index.d.mts +2 -0
  13. package/dist/continuous/index.d.ts +2 -0
  14. package/dist/continuous/index.js +707 -0
  15. package/dist/continuous/index.mjs +16 -0
  16. package/dist/datasets/index.d.mts +1 -0
  17. package/dist/datasets/index.d.ts +1 -0
  18. package/dist/datasets/index.js +456 -0
  19. package/dist/datasets/index.mjs +14 -0
  20. package/dist/evaluation/index.d.mts +1 -0
  21. package/dist/evaluation/index.d.ts +1 -0
  22. package/dist/evaluation/index.js +2853 -0
  23. package/dist/evaluation/index.mjs +78 -0
  24. package/dist/feedback/index.d.mts +2 -0
  25. package/dist/feedback/index.d.ts +2 -0
  26. package/dist/feedback/index.js +1158 -0
  27. package/dist/feedback/index.mjs +40 -0
  28. package/dist/index-6Pbiq7ny.d.mts +234 -0
  29. package/dist/index-6Pbiq7ny.d.ts +234 -0
  30. package/dist/index-BNTycFEA.d.mts +479 -0
  31. package/dist/index-BNTycFEA.d.ts +479 -0
  32. package/dist/index-CTYCfWfH.d.mts +543 -0
  33. package/dist/index-CTYCfWfH.d.ts +543 -0
  34. package/dist/index-Cq5LwG_3.d.mts +322 -0
  35. package/dist/index-Cq5LwG_3.d.ts +322 -0
  36. package/dist/index-bPghFsfP.d.mts +315 -0
  37. package/dist/index-bPghFsfP.d.ts +315 -0
  38. package/dist/index.d.mts +81 -0
  39. package/dist/index.d.ts +81 -0
  40. package/dist/index.js +5962 -0
  41. package/dist/index.mjs +429 -0
  42. package/package.json +102 -0
package/dist/index.mjs ADDED
@@ -0,0 +1,429 @@
1
+ import {
2
+ AnnotationQueue,
3
+ AnnotationTask,
4
+ BinaryClassificationSchema,
5
+ ConsensusManager,
6
+ QualityRatingSchema,
7
+ TextSpanSchema,
8
+ createAnnotationQueue,
9
+ createAnnotationTask,
10
+ createConsensusManager
11
+ } from "./chunk-NBMUSATK.mjs";
12
+ import {
13
+ ABTestRunner,
14
+ AlertManager,
15
+ ContinuousEval,
16
+ createABTestRunner,
17
+ createAlertManager,
18
+ createContinuousEval
19
+ } from "./chunk-EUXXIZK3.mjs";
20
+ import {
21
+ DatasetExporter,
22
+ PreferenceDataset,
23
+ PreferenceDatasetBuilder,
24
+ createDatasetExporter,
25
+ createPreferenceDatasetBuilder
26
+ } from "./chunk-TUMNJN2S.mjs";
27
+ import {
28
+ Accuracy,
29
+ BaseMetric,
30
+ CodeQualityRubric,
31
+ Coherence,
32
+ ComparativeJudge,
33
+ ConsensusJudge,
34
+ ContextRelevance,
35
+ CustomMetric,
36
+ EvalDataset,
37
+ EvalRunner,
38
+ EvaluationPipeline,
39
+ Faithfulness,
40
+ HelpfulnessRubric,
41
+ LLMJudge,
42
+ QualityRubric,
43
+ Relevance,
44
+ RubricJudge,
45
+ Toxicity,
46
+ createAccuracyMetric,
47
+ createCoherenceMetric,
48
+ createComparativeJudge,
49
+ createConsensusJudge,
50
+ createContainsMetric,
51
+ createContextRelevanceMetric,
52
+ createCustomMetric,
53
+ createEvalDataset,
54
+ createEvalRunner,
55
+ createEvaluationPipeline,
56
+ createFaithfulnessMetric,
57
+ createJSONMetric,
58
+ createLLMJudge,
59
+ createLengthMetric,
60
+ createRegexMetric,
61
+ createRelevanceMetric,
62
+ createRubricJudge,
63
+ createSimpleMetric,
64
+ createToxicityMetric
65
+ } from "./chunk-5JRYKRSE.mjs";
66
+ import {
67
+ BaseCollector,
68
+ CorrectionCollector,
69
+ FeedbackAggregator,
70
+ FeedbackExporter,
71
+ MemoryFeedbackStore,
72
+ MultiCriteriaCollector,
73
+ PreferenceCollector,
74
+ RatingCollector,
75
+ SQLiteFeedbackStore,
76
+ ThumbsCollector,
77
+ createCorrectionCollector,
78
+ createFeedbackAggregator,
79
+ createFeedbackExporter,
80
+ createFeedbackStore,
81
+ createMultiCriteriaCollector,
82
+ createPreferenceCollector,
83
+ createRatingCollector,
84
+ createThumbsCollector
85
+ } from "./chunk-PAQ2TTJJ.mjs";
86
+
87
+ // src/types/feedback.types.ts
88
+ import { z } from "zod";
89
/** Thumbs rating: either "up" or "down". */
var ThumbsRatingSchema = z.enum(["up", "down"]);

/** Star rating: one of the integer literals 1 through 5. */
var StarRatingSchema = z.union([1, 2, 3, 4, 5].map((stars) => z.literal(stars)));

/** Outcome of a pairwise comparison: response "A", response "B", or a "tie". */
var PreferenceChoiceSchema = z.enum(["A", "B", "tie"]);

/** Input accepted when collecting thumbs feedback for a single response. */
var CollectThumbsInputSchema = z.object({
  responseId: z.string(),
  conversationId: z.string().optional(),
  input: z.string(),
  output: z.string(),
  feedback: z.object({
    rating: ThumbsRatingSchema,
    comment: z.string().optional()
  }),
  userId: z.string().optional(),
  metadata: z.record(z.unknown()).optional()
});

/** Input accepted when collecting a preference between two candidate responses. */
var CollectPreferenceInputSchema = (() => {
  // Both candidates share the same shape; build a fresh schema for each side.
  const candidateResponse = () =>
    z.object({
      id: z.string(),
      content: z.string(),
      model: z.string().optional()
    });
  return z.object({
    input: z.string(),
    responseA: candidateResponse(),
    responseB: candidateResponse(),
    preference: PreferenceChoiceSchema,
    reason: z.string().optional(),
    // Confidence, when given, must lie in the closed interval [0, 1].
    confidence: z.number().min(0).max(1).optional(),
    userId: z.string().optional(),
    metadata: z.record(z.unknown()).optional()
  });
})();
128
+
129
+ // src/integrations/agentsea/FeedbackMiddleware.ts
130
+ var FeedbackMiddleware = class {
131
+ collector;
132
+ autoCapture;
133
+ captureFields;
134
+ pendingFeedback = /* @__PURE__ */ new Map();
135
+ constructor(options) {
136
+ this.collector = options.collector ?? new ThumbsCollector({ store: options.store });
137
+ this.autoCapture = options.autoCapture ?? true;
138
+ this.captureFields = options.captureFields ?? [
139
+ "input",
140
+ "output",
141
+ "toolCalls",
142
+ "latency"
143
+ ];
144
+ }
145
+ /**
146
+ * Process agent message and capture for potential feedback
147
+ */
148
+ capture(context) {
149
+ if (!this.autoCapture) return;
150
+ const messages = context.messages;
151
+ if (messages.length < 2) return;
152
+ let userMessage;
153
+ let assistantMessage;
154
+ for (let i = messages.length - 1; i >= 0; i--) {
155
+ if (!assistantMessage && messages[i].role === "assistant") {
156
+ assistantMessage = messages[i];
157
+ }
158
+ if (!userMessage && messages[i].role === "user") {
159
+ userMessage = messages[i];
160
+ }
161
+ if (userMessage && assistantMessage) break;
162
+ }
163
+ if (!userMessage || !assistantMessage) return;
164
+ const metadata = {};
165
+ if (this.captureFields.includes("toolCalls") && assistantMessage.metadata?.toolCalls) {
166
+ metadata.toolCalls = assistantMessage.metadata.toolCalls;
167
+ }
168
+ if (this.captureFields.includes("latency") && assistantMessage.metadata?.latencyMs) {
169
+ metadata.latencyMs = assistantMessage.metadata.latencyMs;
170
+ }
171
+ if (context.metadata?.model) {
172
+ metadata.model = context.metadata.model;
173
+ }
174
+ this.pendingFeedback.set(assistantMessage.id, {
175
+ input: userMessage.content,
176
+ output: assistantMessage.content,
177
+ conversationId: context.conversationId,
178
+ metadata,
179
+ timestamp: Date.now()
180
+ });
181
+ this.cleanupPending();
182
+ }
183
+ /**
184
+ * Record feedback for a response
185
+ */
186
+ async recordFeedback(responseId, rating, comment, userId) {
187
+ const pending = this.pendingFeedback.get(responseId);
188
+ if (!pending) {
189
+ console.warn(`No pending feedback found for response ${responseId}`);
190
+ return null;
191
+ }
192
+ const feedback = await this.collector.collect({
193
+ responseId,
194
+ conversationId: pending.conversationId,
195
+ input: pending.input,
196
+ output: pending.output,
197
+ feedback: { rating, comment },
198
+ userId,
199
+ metadata: pending.metadata
200
+ });
201
+ this.pendingFeedback.delete(responseId);
202
+ return feedback;
203
+ }
204
+ /**
205
+ * Get pending feedback IDs
206
+ */
207
+ getPendingIds() {
208
+ return Array.from(this.pendingFeedback.keys());
209
+ }
210
+ /**
211
+ * Clear pending feedback
212
+ */
213
+ clearPending() {
214
+ this.pendingFeedback.clear();
215
+ }
216
+ /**
217
+ * Clean up old pending feedback
218
+ */
219
+ cleanupPending() {
220
+ const oneHourAgo = Date.now() - 36e5;
221
+ for (const [id, data] of this.pendingFeedback.entries()) {
222
+ if (data.timestamp < oneHourAgo) {
223
+ this.pendingFeedback.delete(id);
224
+ }
225
+ }
226
+ }
227
+ /**
228
+ * Get collector
229
+ */
230
+ getCollector() {
231
+ return this.collector;
232
+ }
233
+ };
234
/**
 * Factory for `FeedbackMiddleware`.
 *
 * @param options Options forwarded unchanged to the `FeedbackMiddleware` constructor.
 * @returns A new `FeedbackMiddleware` instance.
 */
function createFeedbackMiddleware(options) {
  const middleware = new FeedbackMiddleware(options);
  return middleware;
}
237
+
238
+ // src/integrations/agentsea/AgentEvaluator.ts
239
+ var AgentEvaluator = class {
240
+ pipeline;
241
+ scenarios;
242
+ constructor(options) {
243
+ this.pipeline = options.pipeline;
244
+ this.scenarios = options.scenarios;
245
+ }
246
+ /**
247
+ * Evaluate an agent
248
+ */
249
+ async evaluate(agent) {
250
+ const categoryScores = {};
251
+ const categoryResults = {};
252
+ const recommendations = [];
253
+ let totalTests = 0;
254
+ let totalPassed = 0;
255
+ let weightedScoreSum = 0;
256
+ let totalWeight = 0;
257
+ for (const scenario of this.scenarios) {
258
+ const result = await this.pipeline.evaluate({
259
+ dataset: scenario.dataset,
260
+ generateFn: async (input, context) => {
261
+ return agent.execute(input, context);
262
+ }
263
+ });
264
+ const avgScore = result.summary.avgScore;
265
+ const weight = scenario.weight ?? 1;
266
+ categoryScores[scenario.category] = avgScore;
267
+ categoryResults[scenario.category] = result;
268
+ totalTests += result.summary.totalItems;
269
+ totalPassed += result.summary.passedItems;
270
+ weightedScoreSum += avgScore * weight;
271
+ totalWeight += weight;
272
+ if (avgScore < 0.7) {
273
+ recommendations.push(
274
+ `Improve ${scenario.category}: current score ${(avgScore * 100).toFixed(1)}%`
275
+ );
276
+ }
277
+ if (result.summary.passRate < 0.8) {
278
+ const topFailures = result.failures.slice(0, 3).map((f) => f.failedMetrics.join(", "));
279
+ if (topFailures.length > 0) {
280
+ recommendations.push(
281
+ `${scenario.category} failures often in: ${[...new Set(topFailures)].join(", ")}`
282
+ );
283
+ }
284
+ }
285
+ }
286
+ const overallScore = totalWeight > 0 ? weightedScoreSum / totalWeight : 0;
287
+ return {
288
+ overallScore,
289
+ categoryScores,
290
+ categoryResults,
291
+ recommendations,
292
+ summary: {
293
+ totalTests,
294
+ passed: totalPassed,
295
+ failed: totalTests - totalPassed,
296
+ passRate: totalTests > 0 ? totalPassed / totalTests : 0
297
+ }
298
+ };
299
+ }
300
+ /**
301
+ * Run quick benchmark
302
+ */
303
+ async benchmark(agent, sampleSize = 10) {
304
+ const allItems = [];
305
+ for (const scenario of this.scenarios) {
306
+ allItems.push(...scenario.dataset.sample(sampleSize).getItems());
307
+ }
308
+ const startTime = performance.now();
309
+ let totalScore = 0;
310
+ let count = 0;
311
+ for (const item of allItems.slice(0, sampleSize)) {
312
+ try {
313
+ const output = await agent.execute(item.input);
314
+ if (output && output.length > 0) {
315
+ totalScore += 1;
316
+ }
317
+ count++;
318
+ } catch {
319
+ count++;
320
+ }
321
+ }
322
+ const latencyMs = (performance.now() - startTime) / count;
323
+ return {
324
+ score: count > 0 ? totalScore / count : 0,
325
+ latencyMs
326
+ };
327
+ }
328
+ /**
329
+ * Add a scenario
330
+ */
331
+ addScenario(scenario) {
332
+ this.scenarios.push(scenario);
333
+ }
334
+ /**
335
+ * Get scenarios
336
+ */
337
+ getScenarios() {
338
+ return [...this.scenarios];
339
+ }
340
+ };
341
/**
 * Factory for `AgentEvaluator`.
 *
 * @param options Options forwarded unchanged to the `AgentEvaluator` constructor.
 * @returns A new `AgentEvaluator` instance.
 */
function createAgentEvaluator(options) {
  const evaluator = new AgentEvaluator(options);
  return evaluator;
}
344
+ export {
345
+ ABTestRunner,
346
+ Accuracy,
347
+ AgentEvaluator,
348
+ AlertManager,
349
+ AnnotationQueue,
350
+ AnnotationTask,
351
+ BaseCollector,
352
+ BaseMetric,
353
+ BinaryClassificationSchema,
354
+ CodeQualityRubric,
355
+ Coherence,
356
+ CollectPreferenceInputSchema,
357
+ CollectThumbsInputSchema,
358
+ ComparativeJudge,
359
+ ConsensusJudge,
360
+ ConsensusManager,
361
+ ContextRelevance,
362
+ ContinuousEval,
363
+ CorrectionCollector,
364
+ CustomMetric,
365
+ DatasetExporter,
366
+ EvalDataset,
367
+ EvalRunner,
368
+ EvaluationPipeline,
369
+ Faithfulness,
370
+ FeedbackAggregator,
371
+ FeedbackExporter,
372
+ FeedbackMiddleware,
373
+ HelpfulnessRubric,
374
+ LLMJudge,
375
+ MemoryFeedbackStore,
376
+ MultiCriteriaCollector,
377
+ PreferenceChoiceSchema,
378
+ PreferenceCollector,
379
+ PreferenceDataset,
380
+ PreferenceDatasetBuilder,
381
+ QualityRatingSchema,
382
+ QualityRubric,
383
+ RatingCollector,
384
+ Relevance,
385
+ RubricJudge,
386
+ SQLiteFeedbackStore,
387
+ StarRatingSchema,
388
+ TextSpanSchema,
389
+ ThumbsCollector,
390
+ ThumbsRatingSchema,
391
+ Toxicity,
392
+ createABTestRunner,
393
+ createAccuracyMetric,
394
+ createAgentEvaluator,
395
+ createAlertManager,
396
+ createAnnotationQueue,
397
+ createAnnotationTask,
398
+ createCoherenceMetric,
399
+ createComparativeJudge,
400
+ createConsensusJudge,
401
+ createConsensusManager,
402
+ createContainsMetric,
403
+ createContextRelevanceMetric,
404
+ createContinuousEval,
405
+ createCorrectionCollector,
406
+ createCustomMetric,
407
+ createDatasetExporter,
408
+ createEvalDataset,
409
+ createEvalRunner,
410
+ createEvaluationPipeline,
411
+ createFaithfulnessMetric,
412
+ createFeedbackAggregator,
413
+ createFeedbackExporter,
414
+ createFeedbackMiddleware,
415
+ createFeedbackStore,
416
+ createJSONMetric,
417
+ createLLMJudge,
418
+ createLengthMetric,
419
+ createMultiCriteriaCollector,
420
+ createPreferenceCollector,
421
+ createPreferenceDatasetBuilder,
422
+ createRatingCollector,
423
+ createRegexMetric,
424
+ createRelevanceMetric,
425
+ createRubricJudge,
426
+ createSimpleMetric,
427
+ createThumbsCollector,
428
+ createToxicityMetric
429
+ };
package/package.json ADDED
@@ -0,0 +1,102 @@
1
+ {
2
+ "name": "@lov3kaizen/agentsea-evaluate",
3
+ "version": "0.5.1",
4
+ "description": "Comprehensive feedback collection and LLM evaluation platform for Node.js - human-in-the-loop annotation, automated evaluation pipelines, preference dataset generation",
5
+ "main": "dist/index.js",
6
+ "module": "dist/index.mjs",
7
+ "types": "dist/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "types": "./dist/index.d.ts",
11
+ "import": "./dist/index.mjs",
12
+ "require": "./dist/index.js"
13
+ },
14
+ "./feedback": {
15
+ "types": "./dist/feedback/index.d.ts",
16
+ "import": "./dist/feedback/index.mjs",
17
+ "require": "./dist/feedback/index.js"
18
+ },
19
+ "./evaluation": {
20
+ "types": "./dist/evaluation/index.d.ts",
21
+ "import": "./dist/evaluation/index.mjs",
22
+ "require": "./dist/evaluation/index.js"
23
+ },
24
+ "./datasets": {
25
+ "types": "./dist/datasets/index.d.ts",
26
+ "import": "./dist/datasets/index.mjs",
27
+ "require": "./dist/datasets/index.js"
28
+ },
29
+ "./annotation": {
30
+ "types": "./dist/annotation/index.d.ts",
31
+ "import": "./dist/annotation/index.mjs",
32
+ "require": "./dist/annotation/index.js"
33
+ },
34
+ "./continuous": {
35
+ "types": "./dist/continuous/index.d.ts",
36
+ "import": "./dist/continuous/index.mjs",
37
+ "require": "./dist/continuous/index.js"
38
+ }
39
+ },
40
+ "files": [
41
+ "dist",
42
+ "README.md"
43
+ ],
44
+ "keywords": [
45
+ "llm",
46
+ "evaluation",
47
+ "feedback",
48
+ "annotation",
49
+ "rlhf",
50
+ "dpo",
51
+ "preference-learning",
52
+ "llm-as-judge",
53
+ "ai",
54
+ "machine-learning",
55
+ "nlp",
56
+ "rag",
57
+ "quality-assurance"
58
+ ],
59
+ "author": "lov3kaizen",
60
+ "license": "MIT",
61
+ "repository": {
62
+ "type": "git",
63
+ "url": "https://github.com/lov3kaizen/agentsea.git",
64
+ "directory": "packages/evaluate"
65
+ },
66
+ "dependencies": {
67
+ "eventemitter3": "^5.0.0",
68
+ "nanoid": "^5.0.0",
69
+ "zod": "^3.22.0"
70
+ },
71
+ "devDependencies": {
72
+ "@types/better-sqlite3": "^7.6.0",
73
+ "@types/node": "^20.0.0",
74
+ "tsup": "^8.0.0",
75
+ "typescript": "^5.3.0",
76
+ "vitest": "^1.0.0"
77
+ },
78
+ "peerDependencies": {
79
+ "@lov3kaizen/agentsea-core": ">=0.5.0"
80
+ },
81
+ "peerDependenciesMeta": {
82
+ "@lov3kaizen/agentsea-core": {
83
+ "optional": true
84
+ }
85
+ },
86
+ "optionalDependencies": {
87
+ "better-sqlite3": "^9.2.0",
88
+ "@huggingface/hub": "^0.14.0"
89
+ },
90
+ "engines": {
91
+ "node": ">=18.0.0"
92
+ },
93
+ "scripts": {
94
+ "build": "tsup src/index.ts src/feedback/index.ts src/evaluation/index.ts src/datasets/index.ts src/annotation/index.ts src/continuous/index.ts --format cjs,esm --dts --clean --external better-sqlite3 --external @huggingface/hub",
95
+ "dev": "tsup src/index.ts --format cjs,esm --dts --watch",
96
+ "test": "vitest run",
97
+ "test:watch": "vitest",
98
+ "test:coverage": "vitest run --coverage",
99
+ "lint": "eslint src --ext .ts",
100
+ "typecheck": "tsc --noEmit"
101
+ }
102
+ }