@lov3kaizen/agentsea-evaluate 0.5.1

Files changed (42)
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/dist/annotation/index.d.mts +3 -0
  4. package/dist/annotation/index.d.ts +3 -0
  5. package/dist/annotation/index.js +630 -0
  6. package/dist/annotation/index.mjs +22 -0
  7. package/dist/chunk-5JRYKRSE.mjs +2791 -0
  8. package/dist/chunk-EUXXIZK3.mjs +676 -0
  9. package/dist/chunk-NBMUSATK.mjs +596 -0
  10. package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
  11. package/dist/chunk-TUMNJN2S.mjs +416 -0
  12. package/dist/continuous/index.d.mts +2 -0
  13. package/dist/continuous/index.d.ts +2 -0
  14. package/dist/continuous/index.js +707 -0
  15. package/dist/continuous/index.mjs +16 -0
  16. package/dist/datasets/index.d.mts +1 -0
  17. package/dist/datasets/index.d.ts +1 -0
  18. package/dist/datasets/index.js +456 -0
  19. package/dist/datasets/index.mjs +14 -0
  20. package/dist/evaluation/index.d.mts +1 -0
  21. package/dist/evaluation/index.d.ts +1 -0
  22. package/dist/evaluation/index.js +2853 -0
  23. package/dist/evaluation/index.mjs +78 -0
  24. package/dist/feedback/index.d.mts +2 -0
  25. package/dist/feedback/index.d.ts +2 -0
  26. package/dist/feedback/index.js +1158 -0
  27. package/dist/feedback/index.mjs +40 -0
  28. package/dist/index-6Pbiq7ny.d.mts +234 -0
  29. package/dist/index-6Pbiq7ny.d.ts +234 -0
  30. package/dist/index-BNTycFEA.d.mts +479 -0
  31. package/dist/index-BNTycFEA.d.ts +479 -0
  32. package/dist/index-CTYCfWfH.d.mts +543 -0
  33. package/dist/index-CTYCfWfH.d.ts +543 -0
  34. package/dist/index-Cq5LwG_3.d.mts +322 -0
  35. package/dist/index-Cq5LwG_3.d.ts +322 -0
  36. package/dist/index-bPghFsfP.d.mts +315 -0
  37. package/dist/index-bPghFsfP.d.ts +315 -0
  38. package/dist/index.d.mts +81 -0
  39. package/dist/index.d.ts +81 -0
  40. package/dist/index.js +5962 -0
  41. package/dist/index.mjs +429 -0
  42. package/package.json +102 -0
package/dist/index.d.mts
@@ -0,0 +1,81 @@
+ import { T as ThumbsCollector, F as FeedbackStoreInterface, a as ThumbsFeedback } from './index-BNTycFEA.mjs';
+ export { s as AggregationMetric, A as AggregationOptions, t as AggregationResult, O as BaseCollector, B as BaseFeedbackEntry, o as CollectCorrectionInput, p as CollectMultiCriteriaInput, n as CollectPreferenceInput, D as CollectPreferenceInputSchema, m as CollectRatingInput, l as CollectThumbsInput, z as CollectThumbsInputSchema, Y as CorrectionCollector, C as CorrectionFeedback, d as CriterionDefinition, e as CriterionRating, E as ExportFormat, u as ExportOptions, J as FeedbackAggregator, g as FeedbackCollectorOptions, f as FeedbackEntry, L as FeedbackExporter, q as FeedbackQueryOptions, r as FeedbackQueryResult, v as FeedbackStoreConfig, G as MemoryFeedbackStore, _ as MultiCriteriaCollector, k as MultiCriteriaCollectorOptions, M as MultiCriteriaFeedback, P as PreferenceChoice, y as PreferenceChoiceSchema, W as PreferenceCollector, j as PreferenceCollectorOptions, c as PreferenceFeedback, U as RatingCollector, i as RatingCollectorOptions, R as RatingFeedback, H as SQLiteFeedbackStore, S as StarRating, x as StarRatingSchema, h as ThumbsCollectorOptions, b as ThumbsRating, w as ThumbsRatingSchema, Z as createCorrectionCollector, K as createFeedbackAggregator, N as createFeedbackExporter, I as createFeedbackStore, $ as createMultiCriteriaCollector, X as createPreferenceCollector, V as createRatingCollector, Q as createThumbsCollector } from './index-BNTycFEA.mjs';
+ import { E as EvaluationPipeline, a as EvalDataset, P as PipelineEvaluationResult } from './index-CTYCfWfH.mjs';
+ export { X as Accuracy, A as AccuracyMetricConfig, W as BaseMetric, B as BaseMetricConfig, aj as CodeQualityRubric, $ as Coherence, C as CoherenceMetricConfig, al as ComparativeJudge, m as ComparativeJudgeConfig, n as ComparisonInput, o as ComparisonResult, an as ConsensusJudge, p as ConsensusJudgeConfig, a5 as ContextRelevance, d as ContextRelevanceMetricConfig, a7 as CustomMetric, e as CustomMetricConfig, t as EvalDatasetConfig, K as EvalDatasetInterface, s as EvalDatasetItem, Q as EvalRunner, N as EvalRunnerConfig, x as EvaluationError, g as EvaluationInput, u as EvaluationPipelineConfig, w as EvaluationProgress, I as EvaluationSummary, D as FailureAnalysis, G as FailureFilterOptions, a3 as Faithfulness, F as FaithfulnessMetricConfig, H as HFDatasetConfig, ak as HelpfulnessRubric, h as JudgeCriterion, q as JudgeInterface, r as JudgeResult, J as JudgeType, ae as LLMJudge, L as LLMJudgeConfig, i as LLMProviderInterface, f as MetricInterface, b as MetricResult, M as MetricType, z as MetricsSummary, v as PipelineEvaluationOptions, ai as QualityRubric, Z as Relevance, R as RelevanceMetricConfig, k as RubricConfig, ag as RubricJudge, l as RubricJudgeConfig, j as RubricLevel, S as ScoreRange, y as SingleEvaluationResult, a1 as Toxicity, c as ToxicityCategory, T as ToxicityMetricConfig, Y as createAccuracyMetric, a0 as createCoherenceMetric, am as createComparativeJudge, ao as createConsensusJudge, ad as createContainsMetric, a6 as createContextRelevanceMetric, a8 as createCustomMetric, O as createEvalDataset, U as createEvalRunner, V as createEvaluationPipeline, a4 as createFaithfulnessMetric, ac as createJSONMetric, af as createLLMJudge, aa as createLengthMetric, ab as createRegexMetric, _ as createRelevanceMetric, ah as createRubricJudge, a9 as createSimpleMetric, a2 as createToxicityMetric } from './index-CTYCfWfH.mjs';
+ export { d as Annotation, o as AnnotationEvent, n as AnnotationEventType, c as AnnotationItem, a as AnnotationItemStatus, e as AnnotationQuality, t as AnnotationQueue, h as AnnotationQueueConfig, m as AnnotationResults, p as AnnotationTask, b as AnnotationTaskConfig, A as AnnotationTaskStatus, f as Annotator, g as AnnotatorStats, i as AssignmentStrategy, B as BatchAssignment, r as BinaryClassificationSchema, j as ConsensusConfig, v as ConsensusManager, C as ConsensusMethod, k as ConsensusResult, D as Disagreement, G as GoldStandardItem, I as IAnnotationTask, P as PrioritizationType, Q as QualityControlConfig, s as QualityRatingSchema, l as QueueStats, T as TextSpanSchema, u as createAnnotationQueue, q as createAnnotationTask, w as createConsensusManager } from './index-bPghFsfP.mjs';
+ export { A as AnthropicFormatItem, b as ConversationExample, C as ConversationTurn, n as DPOFormatItem, f as DatasetBuilderConfig, a as DatasetExportFormat, m as DatasetExportOptions, v as DatasetExporter, i as DatasetFilterConfig, c as DatasetItem, g as DatasetQueryOptions, h as DatasetQueryResult, d as DatasetStats, D as DatasetType, q as DatasetValidationError, p as DatasetValidationResult, r as DatasetValidationWarning, E as ExportResult, F as FeedbackStoreRef, H as HFExportOptions, k as InstructionBuildOptions, I as InstructionExample, O as OpenAIFormatItem, j as PreferenceBuildOptions, s as PreferenceDataset, t as PreferenceDatasetBuilder, l as PreferenceDatasetInterface, P as PreferencePair, Q as QAExample, R as RLHFFormatItem, o as SFTFormatItem, e as SamplingConfig, S as SamplingStrategyType, w as createDatasetExporter, u as createPreferenceDatasetBuilder } from './index-6Pbiq7ny.mjs';
+ export { r as ABMetricResult, p as ABTest, m as ABTestConfig, q as ABTestResults, G as ABTestRunner, o as ABTestStatus, n as ABTestVariants, g as Alert, d as AlertChannelConfig, A as AlertChannelType, z as AlertManager, f as AlertManagerConfig, h as AlertNotification, e as AlertRule, B as BaselineMetrics, x as ContinuousEval, C as ContinuousEvalConfig, w as ContinuousEvalEvent, v as ContinuousEvalEventType, c as ContinuousEvalStats, D as DashboardUpdate, a as EvalInput, b as EvalOutput, E as EvaluationPipelineRef, H as HistoricalQueryOptions, i as MetricBaseline, l as MetricImprovement, k as MetricRegression, s as MetricSummary, M as MonitoringStatus, R as RegressionDetectorConfig, j as RegressionResult, S as SampleAssignment, u as ScheduleConfig, t as TimeSeries, T as TimeSeriesPoint, V as VariantConfig, I as createABTestRunner, F as createAlertManager, y as createContinuousEval } from './index-Cq5LwG_3.mjs';
+ import 'zod';
+ import 'eventemitter3';
+
+ interface FeedbackMiddlewareOptions {
+     collector?: ThumbsCollector;
+     store?: FeedbackStoreInterface;
+     autoCapture?: boolean;
+     captureFields?: string[];
+ }
+ interface AgentMessage {
+     id: string;
+     role: 'user' | 'assistant';
+     content: string;
+     metadata?: Record<string, unknown>;
+ }
+ interface AgentContext {
+     conversationId: string;
+     messages: AgentMessage[];
+     metadata?: Record<string, unknown>;
+ }
+ declare class FeedbackMiddleware {
+     private collector;
+     private autoCapture;
+     private captureFields;
+     private pendingFeedback;
+     constructor(options: FeedbackMiddlewareOptions);
+     capture(context: AgentContext): void;
+     recordFeedback(responseId: string, rating: 'up' | 'down', comment?: string, userId?: string): Promise<ThumbsFeedback | null>;
+     getPendingIds(): string[];
+     clearPending(): void;
+     private cleanupPending;
+     getCollector(): ThumbsCollector;
+ }
+ declare function createFeedbackMiddleware(options: FeedbackMiddlewareOptions): FeedbackMiddleware;
+
+ interface EvaluationScenario {
+     category: string;
+     dataset: EvalDataset;
+     weight?: number;
+ }
+ interface AgentEvaluatorOptions {
+     pipeline: EvaluationPipeline;
+     scenarios: EvaluationScenario[];
+ }
+ interface AgentInterface {
+     execute(input: string, context?: unknown): Promise<string>;
+ }
+ interface AgentEvaluationResult {
+     overallScore: number;
+     categoryScores: Record<string, number>;
+     categoryResults: Record<string, PipelineEvaluationResult>;
+     recommendations: string[];
+     summary: {
+         totalTests: number;
+         passed: number;
+         failed: number;
+         passRate: number;
+     };
+ }
+ declare class AgentEvaluator {
+     private pipeline;
+     private scenarios;
+     constructor(options: AgentEvaluatorOptions);
+     evaluate(agent: AgentInterface): Promise<AgentEvaluationResult>;
+     benchmark(agent: AgentInterface, sampleSize?: number): Promise<{
+         score: number;
+         latencyMs: number;
+     }>;
+     addScenario(scenario: EvaluationScenario): void;
+     getScenarios(): EvaluationScenario[];
+ }
+ declare function createAgentEvaluator(options: AgentEvaluatorOptions): AgentEvaluator;
+
+ export { type AgentContext, type AgentEvaluationResult, AgentEvaluator, type AgentEvaluatorOptions, type AgentInterface, type AgentMessage, EvalDataset, EvaluationPipeline, type EvaluationScenario, FeedbackMiddleware, type FeedbackMiddlewareOptions, FeedbackStoreInterface, PipelineEvaluationResult, ThumbsCollector, ThumbsFeedback, createAgentEvaluator, createFeedbackMiddleware };
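The declarations above imply a capture-then-rate flow: `capture()` registers the assistant messages in a context as pending feedback targets, and `recordFeedback()` later attaches a thumbs rating to one of them by id. A minimal sketch under two assumptions: the package root re-exports these names (as the `export` line above indicates), and omitting `collector`/`store` falls back to internal defaults (only the option names are confirmed by `FeedbackMiddlewareOptions`):

```ts
import {
  createFeedbackMiddleware,
  type AgentContext,
} from '@lov3kaizen/agentsea-evaluate';

// Assumption: leaving out `collector`/`store` uses built-in defaults;
// only the option names are confirmed by FeedbackMiddlewareOptions.
const middleware = createFeedbackMiddleware({ autoCapture: true });

const context: AgentContext = {
  conversationId: 'conv-1',
  messages: [
    { id: 'msg-1', role: 'user', content: 'Summarize this ticket.' },
    { id: 'msg-2', role: 'assistant', content: 'The ticket reports a login timeout.' },
  ],
};

// Register the assistant response(s) in this context as pending feedback targets.
middleware.capture(context);
console.log(middleware.getPendingIds()); // expected to include 'msg-2'

// Attach a thumbs rating to a captured response; per the declared signature,
// this resolves to null when the response id is unknown.
const entry = await middleware.recordFeedback('msg-2', 'up', 'Accurate summary', 'user-42');
```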
package/dist/index.d.ts
@@ -0,0 +1,81 @@
+ import { T as ThumbsCollector, F as FeedbackStoreInterface, a as ThumbsFeedback } from './index-BNTycFEA.js';
+ export { s as AggregationMetric, A as AggregationOptions, t as AggregationResult, O as BaseCollector, B as BaseFeedbackEntry, o as CollectCorrectionInput, p as CollectMultiCriteriaInput, n as CollectPreferenceInput, D as CollectPreferenceInputSchema, m as CollectRatingInput, l as CollectThumbsInput, z as CollectThumbsInputSchema, Y as CorrectionCollector, C as CorrectionFeedback, d as CriterionDefinition, e as CriterionRating, E as ExportFormat, u as ExportOptions, J as FeedbackAggregator, g as FeedbackCollectorOptions, f as FeedbackEntry, L as FeedbackExporter, q as FeedbackQueryOptions, r as FeedbackQueryResult, v as FeedbackStoreConfig, G as MemoryFeedbackStore, _ as MultiCriteriaCollector, k as MultiCriteriaCollectorOptions, M as MultiCriteriaFeedback, P as PreferenceChoice, y as PreferenceChoiceSchema, W as PreferenceCollector, j as PreferenceCollectorOptions, c as PreferenceFeedback, U as RatingCollector, i as RatingCollectorOptions, R as RatingFeedback, H as SQLiteFeedbackStore, S as StarRating, x as StarRatingSchema, h as ThumbsCollectorOptions, b as ThumbsRating, w as ThumbsRatingSchema, Z as createCorrectionCollector, K as createFeedbackAggregator, N as createFeedbackExporter, I as createFeedbackStore, $ as createMultiCriteriaCollector, X as createPreferenceCollector, V as createRatingCollector, Q as createThumbsCollector } from './index-BNTycFEA.js';
+ import { E as EvaluationPipeline, a as EvalDataset, P as PipelineEvaluationResult } from './index-CTYCfWfH.js';
+ export { X as Accuracy, A as AccuracyMetricConfig, W as BaseMetric, B as BaseMetricConfig, aj as CodeQualityRubric, $ as Coherence, C as CoherenceMetricConfig, al as ComparativeJudge, m as ComparativeJudgeConfig, n as ComparisonInput, o as ComparisonResult, an as ConsensusJudge, p as ConsensusJudgeConfig, a5 as ContextRelevance, d as ContextRelevanceMetricConfig, a7 as CustomMetric, e as CustomMetricConfig, t as EvalDatasetConfig, K as EvalDatasetInterface, s as EvalDatasetItem, Q as EvalRunner, N as EvalRunnerConfig, x as EvaluationError, g as EvaluationInput, u as EvaluationPipelineConfig, w as EvaluationProgress, I as EvaluationSummary, D as FailureAnalysis, G as FailureFilterOptions, a3 as Faithfulness, F as FaithfulnessMetricConfig, H as HFDatasetConfig, ak as HelpfulnessRubric, h as JudgeCriterion, q as JudgeInterface, r as JudgeResult, J as JudgeType, ae as LLMJudge, L as LLMJudgeConfig, i as LLMProviderInterface, f as MetricInterface, b as MetricResult, M as MetricType, z as MetricsSummary, v as PipelineEvaluationOptions, ai as QualityRubric, Z as Relevance, R as RelevanceMetricConfig, k as RubricConfig, ag as RubricJudge, l as RubricJudgeConfig, j as RubricLevel, S as ScoreRange, y as SingleEvaluationResult, a1 as Toxicity, c as ToxicityCategory, T as ToxicityMetricConfig, Y as createAccuracyMetric, a0 as createCoherenceMetric, am as createComparativeJudge, ao as createConsensusJudge, ad as createContainsMetric, a6 as createContextRelevanceMetric, a8 as createCustomMetric, O as createEvalDataset, U as createEvalRunner, V as createEvaluationPipeline, a4 as createFaithfulnessMetric, ac as createJSONMetric, af as createLLMJudge, aa as createLengthMetric, ab as createRegexMetric, _ as createRelevanceMetric, ah as createRubricJudge, a9 as createSimpleMetric, a2 as createToxicityMetric } from './index-CTYCfWfH.js';
+ export { d as Annotation, o as AnnotationEvent, n as AnnotationEventType, c as AnnotationItem, a as AnnotationItemStatus, e as AnnotationQuality, t as AnnotationQueue, h as AnnotationQueueConfig, m as AnnotationResults, p as AnnotationTask, b as AnnotationTaskConfig, A as AnnotationTaskStatus, f as Annotator, g as AnnotatorStats, i as AssignmentStrategy, B as BatchAssignment, r as BinaryClassificationSchema, j as ConsensusConfig, v as ConsensusManager, C as ConsensusMethod, k as ConsensusResult, D as Disagreement, G as GoldStandardItem, I as IAnnotationTask, P as PrioritizationType, Q as QualityControlConfig, s as QualityRatingSchema, l as QueueStats, T as TextSpanSchema, u as createAnnotationQueue, q as createAnnotationTask, w as createConsensusManager } from './index-bPghFsfP.js';
+ export { A as AnthropicFormatItem, b as ConversationExample, C as ConversationTurn, n as DPOFormatItem, f as DatasetBuilderConfig, a as DatasetExportFormat, m as DatasetExportOptions, v as DatasetExporter, i as DatasetFilterConfig, c as DatasetItem, g as DatasetQueryOptions, h as DatasetQueryResult, d as DatasetStats, D as DatasetType, q as DatasetValidationError, p as DatasetValidationResult, r as DatasetValidationWarning, E as ExportResult, F as FeedbackStoreRef, H as HFExportOptions, k as InstructionBuildOptions, I as InstructionExample, O as OpenAIFormatItem, j as PreferenceBuildOptions, s as PreferenceDataset, t as PreferenceDatasetBuilder, l as PreferenceDatasetInterface, P as PreferencePair, Q as QAExample, R as RLHFFormatItem, o as SFTFormatItem, e as SamplingConfig, S as SamplingStrategyType, w as createDatasetExporter, u as createPreferenceDatasetBuilder } from './index-6Pbiq7ny.js';
+ export { r as ABMetricResult, p as ABTest, m as ABTestConfig, q as ABTestResults, G as ABTestRunner, o as ABTestStatus, n as ABTestVariants, g as Alert, d as AlertChannelConfig, A as AlertChannelType, z as AlertManager, f as AlertManagerConfig, h as AlertNotification, e as AlertRule, B as BaselineMetrics, x as ContinuousEval, C as ContinuousEvalConfig, w as ContinuousEvalEvent, v as ContinuousEvalEventType, c as ContinuousEvalStats, D as DashboardUpdate, a as EvalInput, b as EvalOutput, E as EvaluationPipelineRef, H as HistoricalQueryOptions, i as MetricBaseline, l as MetricImprovement, k as MetricRegression, s as MetricSummary, M as MonitoringStatus, R as RegressionDetectorConfig, j as RegressionResult, S as SampleAssignment, u as ScheduleConfig, t as TimeSeries, T as TimeSeriesPoint, V as VariantConfig, I as createABTestRunner, F as createAlertManager, y as createContinuousEval } from './index-Cq5LwG_3.js';
+ import 'zod';
+ import 'eventemitter3';
+
+ interface FeedbackMiddlewareOptions {
+     collector?: ThumbsCollector;
+     store?: FeedbackStoreInterface;
+     autoCapture?: boolean;
+     captureFields?: string[];
+ }
+ interface AgentMessage {
+     id: string;
+     role: 'user' | 'assistant';
+     content: string;
+     metadata?: Record<string, unknown>;
+ }
+ interface AgentContext {
+     conversationId: string;
+     messages: AgentMessage[];
+     metadata?: Record<string, unknown>;
+ }
+ declare class FeedbackMiddleware {
+     private collector;
+     private autoCapture;
+     private captureFields;
+     private pendingFeedback;
+     constructor(options: FeedbackMiddlewareOptions);
+     capture(context: AgentContext): void;
+     recordFeedback(responseId: string, rating: 'up' | 'down', comment?: string, userId?: string): Promise<ThumbsFeedback | null>;
+     getPendingIds(): string[];
+     clearPending(): void;
+     private cleanupPending;
+     getCollector(): ThumbsCollector;
+ }
+ declare function createFeedbackMiddleware(options: FeedbackMiddlewareOptions): FeedbackMiddleware;
+
+ interface EvaluationScenario {
+     category: string;
+     dataset: EvalDataset;
+     weight?: number;
+ }
+ interface AgentEvaluatorOptions {
+     pipeline: EvaluationPipeline;
+     scenarios: EvaluationScenario[];
+ }
+ interface AgentInterface {
+     execute(input: string, context?: unknown): Promise<string>;
+ }
+ interface AgentEvaluationResult {
+     overallScore: number;
+     categoryScores: Record<string, number>;
+     categoryResults: Record<string, PipelineEvaluationResult>;
+     recommendations: string[];
+     summary: {
+         totalTests: number;
+         passed: number;
+         failed: number;
+         passRate: number;
+     };
+ }
+ declare class AgentEvaluator {
+     private pipeline;
+     private scenarios;
+     constructor(options: AgentEvaluatorOptions);
+     evaluate(agent: AgentInterface): Promise<AgentEvaluationResult>;
+     benchmark(agent: AgentInterface, sampleSize?: number): Promise<{
+         score: number;
+         latencyMs: number;
+     }>;
+     addScenario(scenario: EvaluationScenario): void;
+     getScenarios(): EvaluationScenario[];
+ }
+ declare function createAgentEvaluator(options: AgentEvaluatorOptions): AgentEvaluator;
+
+ export { type AgentContext, type AgentEvaluationResult, AgentEvaluator, type AgentEvaluatorOptions, type AgentInterface, type AgentMessage, EvalDataset, EvaluationPipeline, type EvaluationScenario, FeedbackMiddleware, type FeedbackMiddlewareOptions, FeedbackStoreInterface, PipelineEvaluationResult, ThumbsCollector, ThumbsFeedback, createAgentEvaluator, createFeedbackMiddleware };
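For the evaluator half of this entry point, a sketch of scoring a trivial agent end to end. `AgentInterface`, `EvaluationScenario`, and the result fields are taken from the declarations above; the argument shapes for `createEvaluationPipeline` and `createEvalDataset` are not visible in this diff, so those config objects are illustrative guesses:

```ts
import {
  createAgentEvaluator,
  createEvaluationPipeline,
  createEvalDataset,
  type AgentInterface,
} from '@lov3kaizen/agentsea-evaluate';

// Hypothetical configs: only the factory names are confirmed by the exports
// above; the EvaluationPipelineConfig / EvalDatasetConfig fields are assumed.
const pipeline = createEvaluationPipeline({ name: 'qa-checks' });
const dataset = createEvalDataset({
  name: 'arithmetic-smoke',
  items: [{ input: 'What is 2 + 2?', expected: '4' }],
});

// Any object with execute(input, context?) => Promise<string> qualifies.
const echoAgent: AgentInterface = {
  async execute(input) {
    return input.includes('2 + 2') ? '4' : 'unknown';
  },
};

const evaluator = createAgentEvaluator({
  pipeline,
  scenarios: [{ category: 'math', dataset, weight: 1 }],
});

const result = await evaluator.evaluate(echoAgent);
console.log(result.overallScore, result.categoryScores['math'], result.summary.passRate);

// benchmark() samples items and reports an aggregate score plus latency.
const { score, latencyMs } = await evaluator.benchmark(echoAgent, 10);
```

Given the `AgentEvaluationResult` shape, `evaluate()` presumably rolls per-category scores into `overallScore`, weighting each scenario by its optional `weight`.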