@lov3kaizen/agentsea-evaluate 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of @lov3kaizen/agentsea-evaluate might be problematic.
- package/LICENSE +21 -0
- package/README.md +339 -0
- package/dist/annotation/index.d.mts +3 -0
- package/dist/annotation/index.d.ts +3 -0
- package/dist/annotation/index.js +630 -0
- package/dist/annotation/index.mjs +22 -0
- package/dist/chunk-5JRYKRSE.mjs +2791 -0
- package/dist/chunk-EUXXIZK3.mjs +676 -0
- package/dist/chunk-NBMUSATK.mjs +596 -0
- package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
- package/dist/chunk-TUMNJN2S.mjs +416 -0
- package/dist/continuous/index.d.mts +2 -0
- package/dist/continuous/index.d.ts +2 -0
- package/dist/continuous/index.js +707 -0
- package/dist/continuous/index.mjs +16 -0
- package/dist/datasets/index.d.mts +1 -0
- package/dist/datasets/index.d.ts +1 -0
- package/dist/datasets/index.js +456 -0
- package/dist/datasets/index.mjs +14 -0
- package/dist/evaluation/index.d.mts +1 -0
- package/dist/evaluation/index.d.ts +1 -0
- package/dist/evaluation/index.js +2853 -0
- package/dist/evaluation/index.mjs +78 -0
- package/dist/feedback/index.d.mts +2 -0
- package/dist/feedback/index.d.ts +2 -0
- package/dist/feedback/index.js +1158 -0
- package/dist/feedback/index.mjs +40 -0
- package/dist/index-6Pbiq7ny.d.mts +234 -0
- package/dist/index-6Pbiq7ny.d.ts +234 -0
- package/dist/index-BNTycFEA.d.mts +479 -0
- package/dist/index-BNTycFEA.d.ts +479 -0
- package/dist/index-CTYCfWfH.d.mts +543 -0
- package/dist/index-CTYCfWfH.d.ts +543 -0
- package/dist/index-Cq5LwG_3.d.mts +322 -0
- package/dist/index-Cq5LwG_3.d.ts +322 -0
- package/dist/index-bPghFsfP.d.mts +315 -0
- package/dist/index-bPghFsfP.d.ts +315 -0
- package/dist/index.d.mts +81 -0
- package/dist/index.d.ts +81 -0
- package/dist/index.js +5962 -0
- package/dist/index.mjs +429 -0
- package/package.json +102 -0

package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 lovekaizen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

package/README.md
ADDED
@@ -0,0 +1,339 @@
# @lov3kaizen/agentsea-evaluate

Comprehensive feedback collection and LLM evaluation platform for Node.js. Build production-ready evaluation pipelines with human-in-the-loop annotation, automated metrics, LLM-as-Judge, and preference dataset generation.

## Features

- **Evaluation Metrics** - Built-in metrics for accuracy, relevance, coherence, toxicity, faithfulness, and more
- **LLM-as-Judge** - Use LLMs to evaluate responses with rubric-based and comparative scoring
- **Human Feedback** - Collect ratings, rankings, and corrections from annotators
- **Dataset Management** - Create, import, and manage evaluation datasets with HuggingFace integration
- **Continuous Evaluation** - Monitor production quality with automated evaluation pipelines
- **Preference Learning** - Generate datasets for RLHF, DPO, and preference optimization

## Installation

```bash
pnpm add @lov3kaizen/agentsea-evaluate
```

## Quick Start

```typescript
import {
  EvaluationPipeline,
  AccuracyMetric,
  RelevanceMetric,
  LLMJudge,
  EvalDataset,
} from '@lov3kaizen/agentsea-evaluate';

// Create metrics
const accuracy = new AccuracyMetric({ type: 'fuzzy' });
const relevance = new RelevanceMetric();

// Create evaluation pipeline
const pipeline = new EvaluationPipeline({
  metrics: [accuracy, relevance],
  parallelism: 5,
});

// Create dataset
const dataset = new EvalDataset({
  items: [
    {
      id: '1',
      input: 'What is the capital of France?',
      expectedOutput: 'Paris',
    },
    {
      id: '2',
      input: 'What is 2 + 2?',
      expectedOutput: '4',
    },
  ],
});

// Run evaluation
const results = await pipeline.evaluate({
  dataset,
  generateFn: async (input) => {
    // Your LLM generation function
    return await myAgent.run(input);
  },
});

console.log(results.summary);
// { passRate: 0.95, avgScore: 0.87, ... }
```

## Metrics

### Built-in Metrics

| Metric                   | Description                                             |
| ------------------------ | ------------------------------------------------------- |
| `AccuracyMetric`         | Exact, fuzzy, or semantic match against expected output |
| `RelevanceMetric`        | How relevant the response is to the input               |
| `CoherenceMetric`        | Logical flow and consistency of the response            |
| `ToxicityMetric`         | Detection of harmful or inappropriate content           |
| `FaithfulnessMetric`     | Factual accuracy relative to provided context (RAG)     |
| `ContextRelevanceMetric` | Relevance of retrieved context (RAG)                    |
| `FluencyMetric`          | Grammar, spelling, and readability                      |
| `ConcisenessMetric`      | Brevity without losing important information            |
| `HelpfulnessMetric`      | How helpful the response is to the user                 |
| `SafetyMetric`           | Detection of unsafe or harmful outputs                  |

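Metrics can also be run on their own, outside a pipeline. The sketch below assumes the built-in metrics expose the same `evaluate(input): Promise<MetricResult>` shape as the `BaseMetric` class shown in the next section; that is an assumption, so check the package typings to confirm:

```typescript
import { AccuracyMetric } from '@lov3kaizen/agentsea-evaluate';

// Fuzzy matching tolerates minor formatting differences in the output
const accuracy = new AccuracyMetric({ type: 'fuzzy' });

// Score a single model output against its expected answer
const result = await accuracy.evaluate({
  input: 'What is the capital of France?',
  output: 'The capital of France is Paris.',
  expectedOutput: 'Paris',
});

console.log(result.score, result.explanation);
```
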
### Custom Metrics

```typescript
import {
  BaseMetric,
  MetricResult,
  EvaluationInput,
} from '@lov3kaizen/agentsea-evaluate';

class CustomMetric extends BaseMetric {
  readonly type = 'custom';
  readonly name = 'my-metric';

  async evaluate(input: EvaluationInput): Promise<MetricResult> {
    // Your evaluation logic
    const score = calculateScore(input.output, input.expectedOutput);

    return {
      metric: this.name,
      score,
      explanation: `Score: ${score}`,
    };
  }
}
```

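A custom metric should be usable anywhere a built-in is, since the pipeline's `metrics` array accepts any `MetricInterface` (see `EvaluationPipelineConfig` under API Reference). A brief sketch reusing the `CustomMetric` class defined above:

```typescript
import { EvaluationPipeline, AccuracyMetric } from '@lov3kaizen/agentsea-evaluate';

// Built-in and custom metrics are scored side by side for each dataset item
const pipeline = new EvaluationPipeline({
  metrics: [new AccuracyMetric({ type: 'fuzzy' }), new CustomMetric()],
});
```
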
## LLM-as-Judge

### Rubric-Based Evaluation

```typescript
import { RubricJudge } from '@lov3kaizen/agentsea-evaluate';

const judge = new RubricJudge({
  provider: anthropicProvider,
  rubric: {
    criteria: 'Response Quality',
    levels: [
      { score: 1, description: 'Poor - Incorrect or irrelevant' },
      { score: 2, description: 'Fair - Partially correct' },
      { score: 3, description: 'Good - Correct but incomplete' },
      { score: 4, description: 'Very Good - Correct and complete' },
      {
        score: 5,
        description: 'Excellent - Correct, complete, and well-explained',
      },
    ],
  },
});

const result = await judge.evaluate({
  input: 'Explain quantum entanglement',
  output: response,
});
```

### Comparative Evaluation

```typescript
import { ComparativeJudge } from '@lov3kaizen/agentsea-evaluate';

const judge = new ComparativeJudge({
  provider: openaiProvider,
  criteria: ['accuracy', 'helpfulness', 'clarity'],
});

const result = await judge.compare({
  input: 'Summarize this article',
  responseA: modelAOutput,
  responseB: modelBOutput,
});
// { winner: 'A', reasoning: '...', criteriaScores: {...} }
```

## Human Feedback

### Rating Collector

```typescript
import { RatingCollector } from '@lov3kaizen/agentsea-evaluate/feedback';

const collector = new RatingCollector({
  scale: 5,
  criteria: ['accuracy', 'helpfulness', 'clarity'],
});

// Collect feedback
await collector.collect({
  itemId: 'response-123',
  input: 'What is ML?',
  output: 'Machine Learning is...',
  annotatorId: 'user-1',
  ratings: {
    accuracy: 4,
    helpfulness: 5,
    clarity: 4,
  },
  comment: 'Good explanation',
});

// Get aggregated scores
const stats = collector.getStatistics('response-123');
```

### Preference Collection

```typescript
import { PreferenceCollector } from '@lov3kaizen/agentsea-evaluate/feedback';

const collector = new PreferenceCollector();

// Collect A/B preferences
await collector.collect({
  input: 'Explain recursion',
  responseA: '...',
  responseB: '...',
  preference: 'A',
  annotatorId: 'user-1',
  reason: 'More concise explanation',
});

// Export for RLHF/DPO training
const dataset = collector.exportForDPO();
```

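The shape `exportForDPO()` returns is not documented here; a common convention for DPO training data is one `{ prompt, chosen, rejected }` record per comparison. Assuming that shape (an assumption worth verifying against the typings), the export can be written to JSONL for a trainer to consume:

```typescript
import { writeFileSync } from 'node:fs';

// Hypothetical record shape, for illustration only
interface DPORecord {
  prompt: string;
  chosen: string;
  rejected: string;
}

const records = collector.exportForDPO() as DPORecord[];

// JSONL: one JSON object per line, the usual format for preference training data
writeFileSync(
  'preferences.jsonl',
  records.map((r) => JSON.stringify(r)).join('\n'),
);
```
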
## Datasets

### Create Dataset

```typescript
import { EvalDataset } from '@lov3kaizen/agentsea-evaluate/datasets';

const dataset = new EvalDataset({
  name: 'qa-benchmark',
  items: [
    {
      id: '1',
      input: 'Question 1',
      expectedOutput: 'Answer 1',
      context: ['Relevant context...'],
      tags: ['factual', 'science'],
    },
  ],
});

// Filter and sample
const subset = dataset
  .filter((item) => item.tags?.includes('science'))
  .sample(100);

// Split for train/test
const [train, test] = dataset.split(0.8);
```

### HuggingFace Integration

```typescript
import { loadHuggingFaceDataset } from '@lov3kaizen/agentsea-evaluate/datasets';

const dataset = await loadHuggingFaceDataset('squad', {
  split: 'validation',
  inputField: 'question',
  outputField: 'answers.text[0]',
  contextField: 'context',
  limit: 1000,
});
```

## Continuous Evaluation

### Production Monitoring

```typescript
import { ContinuousEvaluator } from '@lov3kaizen/agentsea-evaluate/continuous';

const evaluator = new ContinuousEvaluator({
  metrics: [accuracy, relevance, toxicity],
  sampleRate: 0.1, // Evaluate 10% of requests
  alertThresholds: {
    accuracy: 0.8,
    toxicity: 0.1,
  },
});

// In your production code
evaluator.on('alert', (alert) => {
  console.error(`Quality alert: ${alert.metric} below threshold`);
  notifyOncall(alert);
});

// Log production interactions
await evaluator.log({
  input: userQuery,
  output: agentResponse,
  expectedOutput: groundTruth, // Optional
});
```

## API Reference

### EvaluationPipeline

```typescript
interface EvaluationPipelineConfig {
  metrics: MetricInterface[];
  llmJudge?: JudgeInterface;
  parallelism?: number;
  timeout?: number;
  retries?: number;
}

// Methods
pipeline.evaluate(options: PipelineEvaluationOptions): Promise<PipelineEvaluationResult>
```

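Beyond `parallelism` (shown in the Quick Start), the config accepts `timeout` and `retries`. Their units and retry semantics are not documented here, so the values below are illustrative guesses only:

```typescript
const pipeline = new EvaluationPipeline({
  metrics: [accuracy, relevance],
  parallelism: 5,
  timeout: 30_000, // assumed to be milliseconds per evaluation
  retries: 2, // assumed re-attempts on generation or metric failure
});
```
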
### EvalDataset

```typescript
interface EvalDatasetItem {
  id: string;
  input: string;
  expectedOutput?: string;
  context?: string[];
  reference?: string;
  metadata?: Record<string, unknown>;
  tags?: string[];
}

// Methods
dataset.getItems(): EvalDatasetItem[]
dataset.filter(predicate): EvalDataset
dataset.sample(count): EvalDataset
dataset.split(ratio): [EvalDataset, EvalDataset]
```

### PipelineEvaluationResult

```typescript
interface PipelineEvaluationResult {
  results: SingleEvaluationResult[];
  metrics: MetricsSummary;
  failures: FailureAnalysis[];
  summary: EvaluationSummary;
  exportJSON(): string;
  exportCSV(): string;
}
```

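Since `exportJSON()` and `exportCSV()` are declared to return strings, persisting a run's results is a one-liner each. A minimal sketch using Node's built-in `fs`, where `results` comes from `pipeline.evaluate(...)` as in the Quick Start:

```typescript
import { writeFileSync } from 'node:fs';

// Serialize the full result set for later analysis or dashboards
writeFileSync('eval-results.json', results.exportJSON());
writeFileSync('eval-results.csv', results.exportCSV());
```
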
## Links

- [Documentation](https://github.com/lov3kaizen/agentsea)
- [Examples](../../examples)
- [API Reference](../../docs/API.md)

package/dist/annotation/index.d.mts
ADDED
@@ -0,0 +1,3 @@
export { t as AnnotationQueue, p as AnnotationTask, r as BinaryClassificationSchema, v as ConsensusManager, s as QualityRatingSchema, T as TextSpanSchema, u as createAnnotationQueue, q as createAnnotationTask, w as createConsensusManager } from '../index-bPghFsfP.mjs';
import 'zod';
import 'eventemitter3';

package/dist/annotation/index.d.ts
ADDED
@@ -0,0 +1,3 @@
export { t as AnnotationQueue, p as AnnotationTask, r as BinaryClassificationSchema, v as ConsensusManager, s as QualityRatingSchema, T as TextSpanSchema, u as createAnnotationQueue, q as createAnnotationTask, w as createConsensusManager } from '../index-bPghFsfP.js';
import 'zod';
import 'eventemitter3';