@lov3kaizen/agentsea-evaluate 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/dist/annotation/index.d.mts +3 -0
- package/dist/annotation/index.d.ts +3 -0
- package/dist/annotation/index.js +630 -0
- package/dist/annotation/index.mjs +22 -0
- package/dist/chunk-5JRYKRSE.mjs +2791 -0
- package/dist/chunk-EUXXIZK3.mjs +676 -0
- package/dist/chunk-NBMUSATK.mjs +596 -0
- package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
- package/dist/chunk-TUMNJN2S.mjs +416 -0
- package/dist/continuous/index.d.mts +2 -0
- package/dist/continuous/index.d.ts +2 -0
- package/dist/continuous/index.js +707 -0
- package/dist/continuous/index.mjs +16 -0
- package/dist/datasets/index.d.mts +1 -0
- package/dist/datasets/index.d.ts +1 -0
- package/dist/datasets/index.js +456 -0
- package/dist/datasets/index.mjs +14 -0
- package/dist/evaluation/index.d.mts +1 -0
- package/dist/evaluation/index.d.ts +1 -0
- package/dist/evaluation/index.js +2853 -0
- package/dist/evaluation/index.mjs +78 -0
- package/dist/feedback/index.d.mts +2 -0
- package/dist/feedback/index.d.ts +2 -0
- package/dist/feedback/index.js +1158 -0
- package/dist/feedback/index.mjs +40 -0
- package/dist/index-6Pbiq7ny.d.mts +234 -0
- package/dist/index-6Pbiq7ny.d.ts +234 -0
- package/dist/index-BNTycFEA.d.mts +479 -0
- package/dist/index-BNTycFEA.d.ts +479 -0
- package/dist/index-CTYCfWfH.d.mts +543 -0
- package/dist/index-CTYCfWfH.d.ts +543 -0
- package/dist/index-Cq5LwG_3.d.mts +322 -0
- package/dist/index-Cq5LwG_3.d.ts +322 -0
- package/dist/index-bPghFsfP.d.mts +315 -0
- package/dist/index-bPghFsfP.d.ts +315 -0
- package/dist/index.d.mts +81 -0
- package/dist/index.d.ts +81 -0
- package/dist/index.js +5962 -0
- package/dist/index.mjs +429 -0
- package/package.json +102 -0
package/dist/index.mjs
ADDED
|
@@ -0,0 +1,429 @@
|
|
|
1
|
+
import {
|
|
2
|
+
AnnotationQueue,
|
|
3
|
+
AnnotationTask,
|
|
4
|
+
BinaryClassificationSchema,
|
|
5
|
+
ConsensusManager,
|
|
6
|
+
QualityRatingSchema,
|
|
7
|
+
TextSpanSchema,
|
|
8
|
+
createAnnotationQueue,
|
|
9
|
+
createAnnotationTask,
|
|
10
|
+
createConsensusManager
|
|
11
|
+
} from "./chunk-NBMUSATK.mjs";
|
|
12
|
+
import {
|
|
13
|
+
ABTestRunner,
|
|
14
|
+
AlertManager,
|
|
15
|
+
ContinuousEval,
|
|
16
|
+
createABTestRunner,
|
|
17
|
+
createAlertManager,
|
|
18
|
+
createContinuousEval
|
|
19
|
+
} from "./chunk-EUXXIZK3.mjs";
|
|
20
|
+
import {
|
|
21
|
+
DatasetExporter,
|
|
22
|
+
PreferenceDataset,
|
|
23
|
+
PreferenceDatasetBuilder,
|
|
24
|
+
createDatasetExporter,
|
|
25
|
+
createPreferenceDatasetBuilder
|
|
26
|
+
} from "./chunk-TUMNJN2S.mjs";
|
|
27
|
+
import {
|
|
28
|
+
Accuracy,
|
|
29
|
+
BaseMetric,
|
|
30
|
+
CodeQualityRubric,
|
|
31
|
+
Coherence,
|
|
32
|
+
ComparativeJudge,
|
|
33
|
+
ConsensusJudge,
|
|
34
|
+
ContextRelevance,
|
|
35
|
+
CustomMetric,
|
|
36
|
+
EvalDataset,
|
|
37
|
+
EvalRunner,
|
|
38
|
+
EvaluationPipeline,
|
|
39
|
+
Faithfulness,
|
|
40
|
+
HelpfulnessRubric,
|
|
41
|
+
LLMJudge,
|
|
42
|
+
QualityRubric,
|
|
43
|
+
Relevance,
|
|
44
|
+
RubricJudge,
|
|
45
|
+
Toxicity,
|
|
46
|
+
createAccuracyMetric,
|
|
47
|
+
createCoherenceMetric,
|
|
48
|
+
createComparativeJudge,
|
|
49
|
+
createConsensusJudge,
|
|
50
|
+
createContainsMetric,
|
|
51
|
+
createContextRelevanceMetric,
|
|
52
|
+
createCustomMetric,
|
|
53
|
+
createEvalDataset,
|
|
54
|
+
createEvalRunner,
|
|
55
|
+
createEvaluationPipeline,
|
|
56
|
+
createFaithfulnessMetric,
|
|
57
|
+
createJSONMetric,
|
|
58
|
+
createLLMJudge,
|
|
59
|
+
createLengthMetric,
|
|
60
|
+
createRegexMetric,
|
|
61
|
+
createRelevanceMetric,
|
|
62
|
+
createRubricJudge,
|
|
63
|
+
createSimpleMetric,
|
|
64
|
+
createToxicityMetric
|
|
65
|
+
} from "./chunk-5JRYKRSE.mjs";
|
|
66
|
+
import {
|
|
67
|
+
BaseCollector,
|
|
68
|
+
CorrectionCollector,
|
|
69
|
+
FeedbackAggregator,
|
|
70
|
+
FeedbackExporter,
|
|
71
|
+
MemoryFeedbackStore,
|
|
72
|
+
MultiCriteriaCollector,
|
|
73
|
+
PreferenceCollector,
|
|
74
|
+
RatingCollector,
|
|
75
|
+
SQLiteFeedbackStore,
|
|
76
|
+
ThumbsCollector,
|
|
77
|
+
createCorrectionCollector,
|
|
78
|
+
createFeedbackAggregator,
|
|
79
|
+
createFeedbackExporter,
|
|
80
|
+
createFeedbackStore,
|
|
81
|
+
createMultiCriteriaCollector,
|
|
82
|
+
createPreferenceCollector,
|
|
83
|
+
createRatingCollector,
|
|
84
|
+
createThumbsCollector
|
|
85
|
+
} from "./chunk-PAQ2TTJJ.mjs";
|
|
86
|
+
|
|
87
|
+
// src/types/feedback.types.ts
|
|
88
|
+
import { z } from "zod";
|
|
89
|
+
/** Binary thumbs rating: either "up" or "down". */
const ThumbsRatingSchema = z.enum(["up", "down"]);

/** Star rating restricted to the integer literals 1 through 5. */
const StarRatingSchema = z.union([z.literal(1), z.literal(2), z.literal(3), z.literal(4), z.literal(5)]);

/** Outcome of a pairwise comparison: response "A", response "B", or a "tie". */
const PreferenceChoiceSchema = z.enum(["A", "B", "tie"]);

/** Input accepted when collecting thumbs feedback for a single response. */
const CollectThumbsInputSchema = z.object({
  responseId: z.string(),
  conversationId: z.string().optional(),
  input: z.string(),
  output: z.string(),
  feedback: z.object({
    rating: ThumbsRatingSchema,
    comment: z.string().optional()
  }),
  userId: z.string().optional(),
  metadata: z.record(z.unknown()).optional()
});

/**
 * Builds the schema for one side of a pairwise comparison.
 * A fresh schema object is created per call so `responseA` and `responseB`
 * keep independent schema instances, as in the hand-written form.
 */
function buildComparedResponseSchema() {
  return z.object({
    id: z.string(),
    content: z.string(),
    model: z.string().optional()
  });
}

/** Input accepted when collecting a preference between two candidate responses. */
const CollectPreferenceInputSchema = z.object({
  input: z.string(),
  responseA: buildComparedResponseSchema(),
  responseB: buildComparedResponseSchema(),
  preference: PreferenceChoiceSchema,
  reason: z.string().optional(),
  // Confidence is a number constrained to the closed interval [0, 1].
  confidence: z.number().min(0).max(1).optional(),
  userId: z.string().optional(),
  metadata: z.record(z.unknown()).optional()
});
|
|
128
|
+
|
|
129
|
+
// src/integrations/agentsea/FeedbackMiddleware.ts
|
|
130
|
+
/**
 * Captures the latest user/assistant exchange of a conversation so that a
 * rating submitted later can be recorded together with the original input,
 * output and selected metadata.
 */
var FeedbackMiddleware = class {
  /** Object whose `collect(input)` method persists a feedback record. */
  collector;
  /** When false, `capture()` does nothing. */
  autoCapture;
  /**
   * Names of fields to capture. Only "toolCalls" and "latency" are consulted
   * by `capture()`; input and output are always stored.
   */
  captureFields;
  /** Captured exchanges awaiting feedback, keyed by assistant message id. */
  pendingFeedback = /* @__PURE__ */ new Map();
  /**
   * @param {object} options
   * @param {object} [options.collector] - Collector to use; defaults to a
   *   `ThumbsCollector` built with `options.store`.
   * @param {object} [options.store] - Store handed to the default collector.
   * @param {boolean} [options.autoCapture=true] - Enables `capture()`.
   * @param {string[]} [options.captureFields] - Defaults to
   *   `["input", "output", "toolCalls", "latency"]`.
   */
  constructor(options) {
    this.collector = options.collector ?? new ThumbsCollector({ store: options.store });
    this.autoCapture = options.autoCapture ?? true;
    this.captureFields = options.captureFields ?? [
      "input",
      "output",
      "toolCalls",
      "latency"
    ];
  }
  /**
   * Process agent message and capture for potential feedback.
   *
   * Stores the most recent assistant message together with the user message
   * that precedes it, keyed by the assistant message id. Nothing is stored
   * when auto-capture is off, when there are fewer than two messages, or when
   * no such user/assistant pair exists.
   *
   * @param {object} context - Reads `messages` (entries with `id`, `role`,
   *   `content`, optional `metadata`), `conversationId` and `metadata.model`.
   *   Messages are assumed to be in chronological order.
   * @returns {void}
   */
  capture(context) {
    if (!this.autoCapture) return;
    const messages = context.messages;
    if (messages.length < 2) return;
    let assistantIndex = -1;
    for (let i = messages.length - 1; i >= 0; i--) {
      if (messages[i].role === "assistant") {
        assistantIndex = i;
        break;
      }
    }
    if (assistantIndex < 0) return;
    const assistantMessage = messages[assistantIndex];
    // Only look at messages *before* the reply: a user message that arrives
    // after the assistant message is not the prompt that produced it, and
    // pairing them would record feedback against the wrong input.
    let userMessage;
    for (let i = assistantIndex - 1; i >= 0; i--) {
      if (messages[i].role === "user") {
        userMessage = messages[i];
        break;
      }
    }
    if (!userMessage) return;
    const metadata = {};
    if (this.captureFields.includes("toolCalls") && assistantMessage.metadata?.toolCalls) {
      metadata.toolCalls = assistantMessage.metadata.toolCalls;
    }
    // Null check rather than truthiness so a measured latency of 0 is kept.
    if (this.captureFields.includes("latency") && assistantMessage.metadata?.latencyMs != null) {
      metadata.latencyMs = assistantMessage.metadata.latencyMs;
    }
    if (context.metadata?.model) {
      metadata.model = context.metadata.model;
    }
    this.pendingFeedback.set(assistantMessage.id, {
      input: userMessage.content,
      output: assistantMessage.content,
      conversationId: context.conversationId,
      metadata,
      timestamp: Date.now()
    });
    this.cleanupPending();
  }
  /**
   * Record feedback for a response.
   *
   * Looks up the captured exchange for `responseId`, forwards it with the
   * rating to the collector, and removes the pending entry once the collector
   * has resolved. If the collector rejects, the entry stays pending.
   *
   * @param {string} responseId - Assistant message id used at capture time.
   * @param {*} rating - Forwarded as `feedback.rating` (presumably a thumbs
   *   rating when the default collector is used — confirm against collector).
   * @param {string} [comment] - Forwarded as `feedback.comment`.
   * @param {string} [userId] - Forwarded as `userId`.
   * @returns {Promise<*>} The collector's result, or `null` (after a console
   *   warning) when nothing is pending for `responseId`.
   */
  async recordFeedback(responseId, rating, comment, userId) {
    const pending = this.pendingFeedback.get(responseId);
    if (!pending) {
      console.warn(`No pending feedback found for response ${responseId}`);
      return null;
    }
    const feedback = await this.collector.collect({
      responseId,
      conversationId: pending.conversationId,
      input: pending.input,
      output: pending.output,
      feedback: { rating, comment },
      userId,
      metadata: pending.metadata
    });
    this.pendingFeedback.delete(responseId);
    return feedback;
  }
  /**
   * Get pending feedback IDs.
   * @returns {string[]} Keys of all captured exchanges still awaiting feedback.
   */
  getPendingIds() {
    return Array.from(this.pendingFeedback.keys());
  }
  /**
   * Clear pending feedback (drops every captured exchange).
   */
  clearPending() {
    this.pendingFeedback.clear();
  }
  /**
   * Clean up old pending feedback: removes entries captured more than one
   * hour (36e5 ms) ago. Called at the end of every successful `capture()`.
   */
  cleanupPending() {
    const oneHourAgo = Date.now() - 36e5;
    for (const [id, data] of this.pendingFeedback.entries()) {
      if (data.timestamp < oneHourAgo) {
        this.pendingFeedback.delete(id);
      }
    }
  }
  /**
   * Get collector.
   * @returns {object} The collector this middleware records feedback through.
   */
  getCollector() {
    return this.collector;
  }
};
|
|
234
|
+
/**
 * Factory wrapper around the `FeedbackMiddleware` constructor.
 *
 * @param {object} options - Passed unchanged to `new FeedbackMiddleware(...)`.
 * @returns {FeedbackMiddleware} The newly constructed middleware.
 */
function createFeedbackMiddleware(options) {
  const middleware = new FeedbackMiddleware(options);
  return middleware;
}
|
|
237
|
+
|
|
238
|
+
// src/integrations/agentsea/AgentEvaluator.ts
|
|
239
|
+
/**
 * Runs an agent against a list of evaluation scenarios and aggregates the
 * per-scenario results into an overall report.
 */
var AgentEvaluator = class {
  /** Pipeline whose `evaluate({ dataset, generateFn })` scores one scenario. */
  pipeline;
  /** Scenarios to run; each provides `category`, `dataset` and optional `weight`. */
  scenarios;
  /**
   * @param {object} options
   * @param {object} options.pipeline - Evaluation pipeline used by `evaluate()`.
   * @param {object[]} [options.scenarios] - Initial scenarios. The array is
   *   stored by reference; defaults to an empty list when omitted so that
   *   `addScenario`, `evaluate` and `benchmark` remain usable.
   */
  constructor(options) {
    this.pipeline = options.pipeline;
    this.scenarios = options.scenarios ?? [];
  }
  /**
   * Evaluate an agent.
   *
   * Runs every scenario sequentially through the pipeline, using
   * `agent.execute(input, context)` as the generation function.
   *
   * @param {object} agent - Object exposing `execute(input, context)`.
   * @returns {Promise<object>} Report with:
   *   - `overallScore`: weight-averaged `summary.avgScore` (weight defaults
   *     to 1; 0 when the total weight is not positive),
   *   - `categoryScores` / `categoryResults`: keyed by `scenario.category`
   *     (a later scenario with the same category overwrites an earlier one),
   *   - `recommendations`: one entry per scenario scoring below 0.7, plus one
   *     listing failed metrics of up to three failures when the pass rate is
   *     below 0.8,
   *   - `summary`: total/passed/failed item counts and overall pass rate.
   */
  async evaluate(agent) {
    const categoryScores = {};
    const categoryResults = {};
    const recommendations = [];
    let totalTests = 0;
    let totalPassed = 0;
    let weightedScoreSum = 0;
    let totalWeight = 0;
    for (const scenario of this.scenarios) {
      const result = await this.pipeline.evaluate({
        dataset: scenario.dataset,
        generateFn: async (input, context) => {
          return agent.execute(input, context);
        }
      });
      const avgScore = result.summary.avgScore;
      const weight = scenario.weight ?? 1;
      categoryScores[scenario.category] = avgScore;
      categoryResults[scenario.category] = result;
      totalTests += result.summary.totalItems;
      totalPassed += result.summary.passedItems;
      weightedScoreSum += avgScore * weight;
      totalWeight += weight;
      if (avgScore < 0.7) {
        recommendations.push(
          `Improve ${scenario.category}: current score ${(avgScore * 100).toFixed(1)}%`
        );
      }
      if (result.summary.passRate < 0.8) {
        // Each of the first three failures contributes its joined metric
        // names; identical strings are de-duplicated before reporting.
        const topFailures = result.failures.slice(0, 3).map((f) => f.failedMetrics.join(", "));
        if (topFailures.length > 0) {
          recommendations.push(
            `${scenario.category} failures often in: ${[...new Set(topFailures)].join(", ")}`
          );
        }
      }
    }
    const overallScore = totalWeight > 0 ? weightedScoreSum / totalWeight : 0;
    return {
      overallScore,
      categoryScores,
      categoryResults,
      recommendations,
      summary: {
        totalTests,
        passed: totalPassed,
        failed: totalTests - totalPassed,
        passRate: totalTests > 0 ? totalPassed / totalTests : 0
      }
    };
  }
  /**
   * Run quick benchmark.
   *
   * Samples up to `sampleSize` items from each scenario's dataset, keeps the
   * first `sampleSize` of the concatenated list (so earlier scenarios take
   * precedence), and calls `agent.execute(item.input)` on each sequentially.
   * An item scores 1 when the output is truthy with `length > 0`; an
   * execution that throws still counts as an attempt with score 0.
   *
   * @param {object} agent - Object exposing `execute(input)`.
   * @param {number} [sampleSize=10] - Maximum number of items to run.
   * @returns {Promise<{score: number, latencyMs: number}>} Fraction of
   *   attempts that produced non-empty output and mean wall-clock time per
   *   attempt in milliseconds. Both are 0 when no item was run.
   */
  async benchmark(agent, sampleSize = 10) {
    const allItems = [];
    for (const scenario of this.scenarios) {
      allItems.push(...scenario.dataset.sample(sampleSize).getItems());
    }
    const startTime = performance.now();
    let totalScore = 0;
    let count = 0;
    for (const item of allItems.slice(0, sampleSize)) {
      try {
        const output = await agent.execute(item.input);
        if (output && output.length > 0) {
          totalScore += 1;
        }
      } catch {
        // Deliberate best-effort: a failing execution scores 0 but is still
        // counted below so it lowers the benchmark score.
      }
      count++;
    }
    const elapsedMs = performance.now() - startTime;
    return {
      score: count > 0 ? totalScore / count : 0,
      // Guard the division: with no items this used to yield NaN/Infinity.
      latencyMs: count > 0 ? elapsedMs / count : 0
    };
  }
  /**
   * Add a scenario.
   * @param {object} scenario - Appended to the scenario list.
   */
  addScenario(scenario) {
    this.scenarios.push(scenario);
  }
  /**
   * Get scenarios.
   * @returns {object[]} Shallow copy of the scenario list.
   */
  getScenarios() {
    return [...this.scenarios];
  }
};
|
|
341
|
+
/**
 * Factory wrapper around the `AgentEvaluator` constructor.
 *
 * @param {object} options - Passed unchanged to `new AgentEvaluator(...)`.
 * @returns {AgentEvaluator} The newly constructed evaluator.
 */
function createAgentEvaluator(options) {
  const evaluator = new AgentEvaluator(options);
  return evaluator;
}
|
|
344
|
+
export {
|
|
345
|
+
ABTestRunner,
|
|
346
|
+
Accuracy,
|
|
347
|
+
AgentEvaluator,
|
|
348
|
+
AlertManager,
|
|
349
|
+
AnnotationQueue,
|
|
350
|
+
AnnotationTask,
|
|
351
|
+
BaseCollector,
|
|
352
|
+
BaseMetric,
|
|
353
|
+
BinaryClassificationSchema,
|
|
354
|
+
CodeQualityRubric,
|
|
355
|
+
Coherence,
|
|
356
|
+
CollectPreferenceInputSchema,
|
|
357
|
+
CollectThumbsInputSchema,
|
|
358
|
+
ComparativeJudge,
|
|
359
|
+
ConsensusJudge,
|
|
360
|
+
ConsensusManager,
|
|
361
|
+
ContextRelevance,
|
|
362
|
+
ContinuousEval,
|
|
363
|
+
CorrectionCollector,
|
|
364
|
+
CustomMetric,
|
|
365
|
+
DatasetExporter,
|
|
366
|
+
EvalDataset,
|
|
367
|
+
EvalRunner,
|
|
368
|
+
EvaluationPipeline,
|
|
369
|
+
Faithfulness,
|
|
370
|
+
FeedbackAggregator,
|
|
371
|
+
FeedbackExporter,
|
|
372
|
+
FeedbackMiddleware,
|
|
373
|
+
HelpfulnessRubric,
|
|
374
|
+
LLMJudge,
|
|
375
|
+
MemoryFeedbackStore,
|
|
376
|
+
MultiCriteriaCollector,
|
|
377
|
+
PreferenceChoiceSchema,
|
|
378
|
+
PreferenceCollector,
|
|
379
|
+
PreferenceDataset,
|
|
380
|
+
PreferenceDatasetBuilder,
|
|
381
|
+
QualityRatingSchema,
|
|
382
|
+
QualityRubric,
|
|
383
|
+
RatingCollector,
|
|
384
|
+
Relevance,
|
|
385
|
+
RubricJudge,
|
|
386
|
+
SQLiteFeedbackStore,
|
|
387
|
+
StarRatingSchema,
|
|
388
|
+
TextSpanSchema,
|
|
389
|
+
ThumbsCollector,
|
|
390
|
+
ThumbsRatingSchema,
|
|
391
|
+
Toxicity,
|
|
392
|
+
createABTestRunner,
|
|
393
|
+
createAccuracyMetric,
|
|
394
|
+
createAgentEvaluator,
|
|
395
|
+
createAlertManager,
|
|
396
|
+
createAnnotationQueue,
|
|
397
|
+
createAnnotationTask,
|
|
398
|
+
createCoherenceMetric,
|
|
399
|
+
createComparativeJudge,
|
|
400
|
+
createConsensusJudge,
|
|
401
|
+
createConsensusManager,
|
|
402
|
+
createContainsMetric,
|
|
403
|
+
createContextRelevanceMetric,
|
|
404
|
+
createContinuousEval,
|
|
405
|
+
createCorrectionCollector,
|
|
406
|
+
createCustomMetric,
|
|
407
|
+
createDatasetExporter,
|
|
408
|
+
createEvalDataset,
|
|
409
|
+
createEvalRunner,
|
|
410
|
+
createEvaluationPipeline,
|
|
411
|
+
createFaithfulnessMetric,
|
|
412
|
+
createFeedbackAggregator,
|
|
413
|
+
createFeedbackExporter,
|
|
414
|
+
createFeedbackMiddleware,
|
|
415
|
+
createFeedbackStore,
|
|
416
|
+
createJSONMetric,
|
|
417
|
+
createLLMJudge,
|
|
418
|
+
createLengthMetric,
|
|
419
|
+
createMultiCriteriaCollector,
|
|
420
|
+
createPreferenceCollector,
|
|
421
|
+
createPreferenceDatasetBuilder,
|
|
422
|
+
createRatingCollector,
|
|
423
|
+
createRegexMetric,
|
|
424
|
+
createRelevanceMetric,
|
|
425
|
+
createRubricJudge,
|
|
426
|
+
createSimpleMetric,
|
|
427
|
+
createThumbsCollector,
|
|
428
|
+
createToxicityMetric
|
|
429
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@lov3kaizen/agentsea-evaluate",
|
|
3
|
+
"version": "0.5.1",
|
|
4
|
+
"description": "Comprehensive feedback collection and LLM evaluation platform for Node.js - human-in-the-loop annotation, automated evaluation pipelines, preference dataset generation",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"module": "dist/index.mjs",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"types": "./dist/index.d.ts",
|
|
11
|
+
"import": "./dist/index.mjs",
|
|
12
|
+
"require": "./dist/index.js"
|
|
13
|
+
},
|
|
14
|
+
"./feedback": {
|
|
15
|
+
"types": "./dist/feedback/index.d.ts",
|
|
16
|
+
"import": "./dist/feedback/index.mjs",
|
|
17
|
+
"require": "./dist/feedback/index.js"
|
|
18
|
+
},
|
|
19
|
+
"./evaluation": {
|
|
20
|
+
"types": "./dist/evaluation/index.d.ts",
|
|
21
|
+
"import": "./dist/evaluation/index.mjs",
|
|
22
|
+
"require": "./dist/evaluation/index.js"
|
|
23
|
+
},
|
|
24
|
+
"./datasets": {
|
|
25
|
+
"types": "./dist/datasets/index.d.ts",
|
|
26
|
+
"import": "./dist/datasets/index.mjs",
|
|
27
|
+
"require": "./dist/datasets/index.js"
|
|
28
|
+
},
|
|
29
|
+
"./annotation": {
|
|
30
|
+
"types": "./dist/annotation/index.d.ts",
|
|
31
|
+
"import": "./dist/annotation/index.mjs",
|
|
32
|
+
"require": "./dist/annotation/index.js"
|
|
33
|
+
},
|
|
34
|
+
"./continuous": {
|
|
35
|
+
"types": "./dist/continuous/index.d.ts",
|
|
36
|
+
"import": "./dist/continuous/index.mjs",
|
|
37
|
+
"require": "./dist/continuous/index.js"
|
|
38
|
+
}
|
|
39
|
+
},
|
|
40
|
+
"files": [
|
|
41
|
+
"dist",
|
|
42
|
+
"README.md"
|
|
43
|
+
],
|
|
44
|
+
"keywords": [
|
|
45
|
+
"llm",
|
|
46
|
+
"evaluation",
|
|
47
|
+
"feedback",
|
|
48
|
+
"annotation",
|
|
49
|
+
"rlhf",
|
|
50
|
+
"dpo",
|
|
51
|
+
"preference-learning",
|
|
52
|
+
"llm-as-judge",
|
|
53
|
+
"ai",
|
|
54
|
+
"machine-learning",
|
|
55
|
+
"nlp",
|
|
56
|
+
"rag",
|
|
57
|
+
"quality-assurance"
|
|
58
|
+
],
|
|
59
|
+
"author": "lov3kaizen",
|
|
60
|
+
"license": "MIT",
|
|
61
|
+
"repository": {
|
|
62
|
+
"type": "git",
|
|
63
|
+
"url": "https://github.com/lov3kaizen/agentsea.git",
|
|
64
|
+
"directory": "packages/evaluate"
|
|
65
|
+
},
|
|
66
|
+
"dependencies": {
|
|
67
|
+
"eventemitter3": "^5.0.0",
|
|
68
|
+
"nanoid": "^5.0.0",
|
|
69
|
+
"zod": "^3.22.0"
|
|
70
|
+
},
|
|
71
|
+
"devDependencies": {
|
|
72
|
+
"@types/better-sqlite3": "^7.6.0",
|
|
73
|
+
"@types/node": "^20.0.0",
|
|
74
|
+
"tsup": "^8.0.0",
|
|
75
|
+
"typescript": "^5.3.0",
|
|
76
|
+
"vitest": "^1.0.0"
|
|
77
|
+
},
|
|
78
|
+
"peerDependencies": {
|
|
79
|
+
"@lov3kaizen/agentsea-core": ">=0.5.0"
|
|
80
|
+
},
|
|
81
|
+
"peerDependenciesMeta": {
|
|
82
|
+
"@lov3kaizen/agentsea-core": {
|
|
83
|
+
"optional": true
|
|
84
|
+
}
|
|
85
|
+
},
|
|
86
|
+
"optionalDependencies": {
|
|
87
|
+
"better-sqlite3": "^9.2.0",
|
|
88
|
+
"@huggingface/hub": "^0.14.0"
|
|
89
|
+
},
|
|
90
|
+
"engines": {
|
|
91
|
+
"node": ">=18.0.0"
|
|
92
|
+
},
|
|
93
|
+
"scripts": {
|
|
94
|
+
"build": "tsup src/index.ts src/feedback/index.ts src/evaluation/index.ts src/datasets/index.ts src/annotation/index.ts src/continuous/index.ts --format cjs,esm --dts --clean --external better-sqlite3 --external @huggingface/hub",
|
|
95
|
+
"dev": "tsup src/index.ts --format cjs,esm --dts --watch",
|
|
96
|
+
"test": "vitest run",
|
|
97
|
+
"test:watch": "vitest",
|
|
98
|
+
"test:coverage": "vitest run --coverage",
|
|
99
|
+
"lint": "eslint src --ext .ts",
|
|
100
|
+
"typecheck": "tsc --noEmit"
|
|
101
|
+
}
|
|
102
|
+
}
|