@lov3kaizen/agentsea-evaluate 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of @lov3kaizen/agentsea-evaluate might be problematic.

Files changed (42)
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/dist/annotation/index.d.mts +3 -0
  4. package/dist/annotation/index.d.ts +3 -0
  5. package/dist/annotation/index.js +630 -0
  6. package/dist/annotation/index.mjs +22 -0
  7. package/dist/chunk-5JRYKRSE.mjs +2791 -0
  8. package/dist/chunk-EUXXIZK3.mjs +676 -0
  9. package/dist/chunk-NBMUSATK.mjs +596 -0
  10. package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
  11. package/dist/chunk-TUMNJN2S.mjs +416 -0
  12. package/dist/continuous/index.d.mts +2 -0
  13. package/dist/continuous/index.d.ts +2 -0
  14. package/dist/continuous/index.js +707 -0
  15. package/dist/continuous/index.mjs +16 -0
  16. package/dist/datasets/index.d.mts +1 -0
  17. package/dist/datasets/index.d.ts +1 -0
  18. package/dist/datasets/index.js +456 -0
  19. package/dist/datasets/index.mjs +14 -0
  20. package/dist/evaluation/index.d.mts +1 -0
  21. package/dist/evaluation/index.d.ts +1 -0
  22. package/dist/evaluation/index.js +2853 -0
  23. package/dist/evaluation/index.mjs +78 -0
  24. package/dist/feedback/index.d.mts +2 -0
  25. package/dist/feedback/index.d.ts +2 -0
  26. package/dist/feedback/index.js +1158 -0
  27. package/dist/feedback/index.mjs +40 -0
  28. package/dist/index-6Pbiq7ny.d.mts +234 -0
  29. package/dist/index-6Pbiq7ny.d.ts +234 -0
  30. package/dist/index-BNTycFEA.d.mts +479 -0
  31. package/dist/index-BNTycFEA.d.ts +479 -0
  32. package/dist/index-CTYCfWfH.d.mts +543 -0
  33. package/dist/index-CTYCfWfH.d.ts +543 -0
  34. package/dist/index-Cq5LwG_3.d.mts +322 -0
  35. package/dist/index-Cq5LwG_3.d.ts +322 -0
  36. package/dist/index-bPghFsfP.d.mts +315 -0
  37. package/dist/index-bPghFsfP.d.ts +315 -0
  38. package/dist/index.d.mts +81 -0
  39. package/dist/index.d.ts +81 -0
  40. package/dist/index.js +5962 -0
  41. package/dist/index.mjs +429 -0
  42. package/package.json +102 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 lovekaizen
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,339 @@
+ # @lov3kaizen/agentsea-evaluate
+
+ Comprehensive feedback collection and LLM evaluation platform for Node.js. Build production-ready evaluation pipelines with human-in-the-loop annotation, automated metrics, LLM-as-Judge, and preference dataset generation.
+
+ ## Features
+
+ - **Evaluation Metrics** - Built-in metrics for accuracy, relevance, coherence, toxicity, faithfulness, and more
+ - **LLM-as-Judge** - Use LLMs to evaluate responses with rubric-based and comparative scoring
+ - **Human Feedback** - Collect ratings, rankings, and corrections from annotators
+ - **Dataset Management** - Create, import, and manage evaluation datasets with HuggingFace integration
+ - **Continuous Evaluation** - Monitor production quality with automated evaluation pipelines
+ - **Preference Learning** - Generate datasets for RLHF, DPO, and preference optimization
+
+ ## Installation
+
+ ```bash
+ pnpm add @lov3kaizen/agentsea-evaluate
+ ```
+
+ ## Quick Start
+
+ ```typescript
+ import {
+   EvaluationPipeline,
+   AccuracyMetric,
+   RelevanceMetric,
+   LLMJudge,
+   EvalDataset,
+ } from '@lov3kaizen/agentsea-evaluate';
+
+ // Create metrics
+ const accuracy = new AccuracyMetric({ type: 'fuzzy' });
+ const relevance = new RelevanceMetric();
+
+ // Create evaluation pipeline
+ const pipeline = new EvaluationPipeline({
+   metrics: [accuracy, relevance],
+   parallelism: 5,
+ });
+
+ // Create dataset
+ const dataset = new EvalDataset({
+   items: [
+     {
+       id: '1',
+       input: 'What is the capital of France?',
+       expectedOutput: 'Paris',
+     },
+     {
+       id: '2',
+       input: 'What is 2 + 2?',
+       expectedOutput: '4',
+     },
+   ],
+ });
+
+ // Run evaluation
+ const results = await pipeline.evaluate({
+   dataset,
+   generateFn: async (input) => {
+     // Your LLM generation function
+     return await myAgent.run(input);
+   },
+ });
+
+ console.log(results.summary);
+ // { passRate: 0.95, avgScore: 0.87, ... }
+ ```
+
+ ## Metrics
+
+ ### Built-in Metrics
+
+ | Metric                   | Description                                             |
+ | ------------------------ | ------------------------------------------------------- |
+ | `AccuracyMetric`         | Exact, fuzzy, or semantic match against expected output |
+ | `RelevanceMetric`        | How relevant the response is to the input               |
+ | `CoherenceMetric`        | Logical flow and consistency of the response            |
+ | `ToxicityMetric`         | Detection of harmful or inappropriate content           |
+ | `FaithfulnessMetric`     | Factual accuracy relative to provided context (RAG)     |
+ | `ContextRelevanceMetric` | Relevance of retrieved context (RAG)                    |
+ | `FluencyMetric`          | Grammar, spelling, and readability                      |
+ | `ConcisenessMetric`      | Brevity without losing important information            |
+ | `HelpfulnessMetric`      | How helpful the response is to the user                 |
+ | `SafetyMetric`           | Detection of unsafe or harmful outputs                  |
+
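+ Most of these can be combined in a single pipeline. A minimal sketch (assuming the zero-argument constructors follow the same pattern as `RelevanceMetric` in Quick Start; check each metric's typings for its options):
+
+ ```typescript
+ import {
+   EvaluationPipeline,
+   ToxicityMetric,
+   FluencyMetric,
+ } from '@lov3kaizen/agentsea-evaluate';
+
+ // Pair a safety metric with a quality metric in one pipeline
+ const pipeline = new EvaluationPipeline({
+   metrics: [new ToxicityMetric(), new FluencyMetric()],
+ });
+ ```
+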
+ ### Custom Metrics
+
+ ```typescript
+ import {
+   BaseMetric,
+   MetricResult,
+   EvaluationInput,
+ } from '@lov3kaizen/agentsea-evaluate';
+
+ class CustomMetric extends BaseMetric {
+   readonly type = 'custom';
+   readonly name = 'my-metric';
+
+   async evaluate(input: EvaluationInput): Promise<MetricResult> {
+     // Your evaluation logic; calculateScore is your own scoring helper
+     const score = calculateScore(input.output, input.expectedOutput);
+
+     return {
+       metric: this.name,
+       score,
+       explanation: `Score: ${score}`,
+     };
+   }
+ }
+ ```
+
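+ A custom metric plugs into the pipeline exactly like the built-ins (reusing the `EvaluationPipeline` config from Quick Start):
+
+ ```typescript
+ // Mix the custom metric with a built-in one
+ const pipeline = new EvaluationPipeline({
+   metrics: [new CustomMetric(), new AccuracyMetric({ type: 'fuzzy' })],
+ });
+ ```
+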
+ ## LLM-as-Judge
+
+ ### Rubric-Based Evaluation
+
+ ```typescript
+ import { RubricJudge } from '@lov3kaizen/agentsea-evaluate';
+
+ const judge = new RubricJudge({
+   provider: anthropicProvider, // your configured LLM provider
+   rubric: {
+     criteria: 'Response Quality',
+     levels: [
+       { score: 1, description: 'Poor - Incorrect or irrelevant' },
+       { score: 2, description: 'Fair - Partially correct' },
+       { score: 3, description: 'Good - Correct but incomplete' },
+       { score: 4, description: 'Very Good - Correct and complete' },
+       {
+         score: 5,
+         description: 'Excellent - Correct, complete, and well-explained',
+       },
+     ],
+   },
+ });
+
+ const result = await judge.evaluate({
+   input: 'Explain quantum entanglement',
+   output: response,
+ });
+ ```
+
+ ### Comparative Evaluation
+
+ ```typescript
+ import { ComparativeJudge } from '@lov3kaizen/agentsea-evaluate';
+
+ const judge = new ComparativeJudge({
+   provider: openaiProvider, // your configured LLM provider
+   criteria: ['accuracy', 'helpfulness', 'clarity'],
+ });
+
+ const result = await judge.compare({
+   input: 'Summarize this article',
+   responseA: modelAOutput,
+   responseB: modelBOutput,
+ });
+ // { winner: 'A', reasoning: '...', criteriaScores: {...} }
+ ```
+
+ ## Human Feedback
+
+ ### Rating Collector
+
+ ```typescript
+ import { RatingCollector } from '@lov3kaizen/agentsea-evaluate/feedback';
+
+ const collector = new RatingCollector({
+   scale: 5,
+   criteria: ['accuracy', 'helpfulness', 'clarity'],
+ });
+
+ // Collect feedback
+ await collector.collect({
+   itemId: 'response-123',
+   input: 'What is ML?',
+   output: 'Machine Learning is...',
+   annotatorId: 'user-1',
+   ratings: {
+     accuracy: 4,
+     helpfulness: 5,
+     clarity: 4,
+   },
+   comment: 'Good explanation',
+ });
+
+ // Get aggregated scores
+ const stats = collector.getStatistics('response-123');
+ ```
+
+ ### Preference Collection
+
+ ```typescript
+ import { PreferenceCollector } from '@lov3kaizen/agentsea-evaluate/feedback';
+
+ const collector = new PreferenceCollector();
+
+ // Collect A/B preferences
+ await collector.collect({
+   input: 'Explain recursion',
+   responseA: '...',
+   responseB: '...',
+   preference: 'A',
+   annotatorId: 'user-1',
+   reason: 'More concise explanation',
+ });
+
+ // Export for RLHF/DPO training
+ const dataset = collector.exportForDPO();
+ ```
+
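+ The exact shape of the export is defined by the package typings; as a minimal sketch, assuming `exportForDPO()` returns a JSON-serializable array of preference records, you could persist it as JSONL:
+
+ ```typescript
+ import { writeFileSync } from 'node:fs';
+
+ // One JSON record per line (JSONL), a common input format for DPO trainers
+ writeFileSync(
+   'preferences.jsonl',
+   dataset.map((record) => JSON.stringify(record)).join('\n'),
+ );
+ ```
+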
+ ## Datasets
+
+ ### Create Dataset
+
+ ```typescript
+ import { EvalDataset } from '@lov3kaizen/agentsea-evaluate/datasets';
+
+ const dataset = new EvalDataset({
+   name: 'qa-benchmark',
+   items: [
+     {
+       id: '1',
+       input: 'Question 1',
+       expectedOutput: 'Answer 1',
+       context: ['Relevant context...'],
+       tags: ['factual', 'science'],
+     },
+   ],
+ });
+
+ // Filter and sample
+ const subset = dataset
+   .filter((item) => item.tags?.includes('science'))
+   .sample(100);
+
+ // Split for train/test
+ const [train, test] = dataset.split(0.8);
+ ```
+
+ ### HuggingFace Integration
+
+ ```typescript
+ import { loadHuggingFaceDataset } from '@lov3kaizen/agentsea-evaluate/datasets';
+
+ const dataset = await loadHuggingFaceDataset('squad', {
+   split: 'validation',
+   inputField: 'question',
+   outputField: 'answers.text[0]',
+   contextField: 'context',
+   limit: 1000,
+ });
+ ```
+
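+ A loaded dataset drops straight into an evaluation run (reusing the `pipeline` and `generateFn` pattern from Quick Start; `myAgent` stands in for your own model call):
+
+ ```typescript
+ // Evaluate against the first 1000 SQuAD validation items
+ const results = await pipeline.evaluate({
+   dataset,
+   generateFn: async (input) => myAgent.run(input),
+ });
+ console.log(results.summary);
+ ```
+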
+ ## Continuous Evaluation
+
+ ### Production Monitoring
+
+ ```typescript
+ import { ContinuousEvaluator } from '@lov3kaizen/agentsea-evaluate/continuous';
+
+ const evaluator = new ContinuousEvaluator({
+   metrics: [accuracy, relevance, toxicity],
+   sampleRate: 0.1, // Evaluate 10% of requests
+   alertThresholds: {
+     accuracy: 0.8,
+     toxicity: 0.1,
+   },
+ });
+
+ // In your production code
+ evaluator.on('alert', (alert) => {
+   console.error(`Quality alert: ${alert.metric} below threshold`);
+   notifyOncall(alert);
+ });
+
+ // Log production interactions
+ await evaluator.log({
+   input: userQuery,
+   output: agentResponse,
+   expectedOutput: groundTruth, // Optional
+ });
+ ```
+
+ ## API Reference
+
+ ### EvaluationPipeline
+
+ ```typescript
+ interface EvaluationPipelineConfig {
+   metrics: MetricInterface[];
+   llmJudge?: JudgeInterface;
+   parallelism?: number;
+   timeout?: number;
+   retries?: number;
+ }
+
+ // Methods
+ pipeline.evaluate(options: PipelineEvaluationOptions): Promise<PipelineEvaluationResult>
+ ```
+
+ ### EvalDataset
+
+ ```typescript
+ interface EvalDatasetItem {
+   id: string;
+   input: string;
+   expectedOutput?: string;
+   context?: string[];
+   reference?: string;
+   metadata?: Record<string, unknown>;
+   tags?: string[];
+ }
+
+ // Methods
+ dataset.getItems(): EvalDatasetItem[]
+ dataset.filter(predicate): EvalDataset
+ dataset.sample(count): EvalDataset
+ dataset.split(ratio): [EvalDataset, EvalDataset]
+ ```
+
+ ### PipelineEvaluationResult
+
+ ```typescript
+ interface PipelineEvaluationResult {
+   results: SingleEvaluationResult[];
+   metrics: MetricsSummary;
+   failures: FailureAnalysis[];
+   summary: EvaluationSummary;
+   exportJSON(): string;
+   exportCSV(): string;
+ }
+ ```
+
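+ The export helpers make it easy to persist a run; a minimal sketch using Node's built-in `fs` module:
+
+ ```typescript
+ import { writeFileSync } from 'node:fs';
+
+ // Save the full evaluation run for later analysis
+ writeFileSync('results.json', results.exportJSON());
+ writeFileSync('results.csv', results.exportCSV());
+ ```
+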
+ ## Links
+
+ - [Documentation](https://github.com/lov3kaizen/agentsea)
+ - [Examples](../../examples)
+ - [API Reference](../../docs/API.md)
package/dist/annotation/index.d.mts ADDED
@@ -0,0 +1,3 @@
+ export { t as AnnotationQueue, p as AnnotationTask, r as BinaryClassificationSchema, v as ConsensusManager, s as QualityRatingSchema, T as TextSpanSchema, u as createAnnotationQueue, q as createAnnotationTask, w as createConsensusManager } from '../index-bPghFsfP.mjs';
+ import 'zod';
+ import 'eventemitter3';
package/dist/annotation/index.d.ts ADDED
@@ -0,0 +1,3 @@
+ export { t as AnnotationQueue, p as AnnotationTask, r as BinaryClassificationSchema, v as ConsensusManager, s as QualityRatingSchema, T as TextSpanSchema, u as createAnnotationQueue, q as createAnnotationTask, w as createConsensusManager } from '../index-bPghFsfP.js';
+ import 'zod';
+ import 'eventemitter3';