@lov3kaizen/agentsea-evaluate 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/LICENSE +21 -0
  2. package/README.md +339 -0
  3. package/dist/annotation/index.d.mts +3 -0
  4. package/dist/annotation/index.d.ts +3 -0
  5. package/dist/annotation/index.js +630 -0
  6. package/dist/annotation/index.mjs +22 -0
  7. package/dist/chunk-5JRYKRSE.mjs +2791 -0
  8. package/dist/chunk-EUXXIZK3.mjs +676 -0
  9. package/dist/chunk-NBMUSATK.mjs +596 -0
  10. package/dist/chunk-PAQ2TTJJ.mjs +1105 -0
  11. package/dist/chunk-TUMNJN2S.mjs +416 -0
  12. package/dist/continuous/index.d.mts +2 -0
  13. package/dist/continuous/index.d.ts +2 -0
  14. package/dist/continuous/index.js +707 -0
  15. package/dist/continuous/index.mjs +16 -0
  16. package/dist/datasets/index.d.mts +1 -0
  17. package/dist/datasets/index.d.ts +1 -0
  18. package/dist/datasets/index.js +456 -0
  19. package/dist/datasets/index.mjs +14 -0
  20. package/dist/evaluation/index.d.mts +1 -0
  21. package/dist/evaluation/index.d.ts +1 -0
  22. package/dist/evaluation/index.js +2853 -0
  23. package/dist/evaluation/index.mjs +78 -0
  24. package/dist/feedback/index.d.mts +2 -0
  25. package/dist/feedback/index.d.ts +2 -0
  26. package/dist/feedback/index.js +1158 -0
  27. package/dist/feedback/index.mjs +40 -0
  28. package/dist/index-6Pbiq7ny.d.mts +234 -0
  29. package/dist/index-6Pbiq7ny.d.ts +234 -0
  30. package/dist/index-BNTycFEA.d.mts +479 -0
  31. package/dist/index-BNTycFEA.d.ts +479 -0
  32. package/dist/index-CTYCfWfH.d.mts +543 -0
  33. package/dist/index-CTYCfWfH.d.ts +543 -0
  34. package/dist/index-Cq5LwG_3.d.mts +322 -0
  35. package/dist/index-Cq5LwG_3.d.ts +322 -0
  36. package/dist/index-bPghFsfP.d.mts +315 -0
  37. package/dist/index-bPghFsfP.d.ts +315 -0
  38. package/dist/index.d.mts +81 -0
  39. package/dist/index.d.ts +81 -0
  40. package/dist/index.js +5962 -0
  41. package/dist/index.mjs +429 -0
  42. package/package.json +102 -0
@@ -0,0 +1,543 @@
+ type MetricType = 'accuracy' | 'relevance' | 'coherence' | 'toxicity' | 'faithfulness' | 'answer_correctness' | 'context_relevance' | 'fluency' | 'conciseness' | 'helpfulness' | 'safety' | 'custom';
+ interface ScoreRange {
+   min: number;
+   max: number;
+ }
+ interface MetricResult {
+   metric: string;
+   score: number;
+   explanation?: string;
+   details?: Record<string, unknown>;
+   confidence?: number;
+ }
+ interface BaseMetricConfig {
+   name?: string;
+   threshold?: number;
+   weight?: number;
+   scoreRange?: ScoreRange;
+ }
+ interface AccuracyMetricConfig extends BaseMetricConfig {
+   type: 'exact' | 'fuzzy' | 'semantic';
+   caseSensitive?: boolean;
+   ignoreWhitespace?: boolean;
+   similarityThreshold?: number;
+ }
+ interface RelevanceMetricConfig extends BaseMetricConfig {
+   model?: string;
+   prompt?: string;
+ }
+ interface CoherenceMetricConfig extends BaseMetricConfig {
+   checkLogicalFlow?: boolean;
+   checkConsistency?: boolean;
+ }
+ interface ToxicityMetricConfig extends BaseMetricConfig {
+   categories?: ToxicityCategory[];
+   strictMode?: boolean;
+ }
+ type ToxicityCategory = 'hate' | 'harassment' | 'violence' | 'sexual' | 'self_harm' | 'dangerous';
+ interface FaithfulnessMetricConfig extends BaseMetricConfig {
+   model?: string;
+   checkFactualAccuracy?: boolean;
+   checkSourceAttribution?: boolean;
+ }
+ interface ContextRelevanceMetricConfig extends BaseMetricConfig {
+   model?: string;
+   minRelevantChunks?: number;
+ }
+ interface CustomMetricConfig extends BaseMetricConfig {
+   evaluateFn: (input: EvaluationInput) => Promise<MetricResult>;
+ }
+ interface MetricInterface {
+   readonly type: string;
+   readonly name: string;
+   evaluate(input: EvaluationInput): Promise<MetricResult>;
+ }
+ interface EvaluationInput {
+   input: string;
+   output: string;
+   expectedOutput?: string;
+   context?: string[];
+   reference?: string;
+   metadata?: Record<string, unknown>;
+ }
+ type JudgeType = 'llm' | 'rubric' | 'comparative' | 'consensus';
+ interface JudgeCriterion {
+   name: string;
+   prompt: string;
+   scoreRange?: ScoreRange;
+   weight?: number;
+ }
+ interface LLMJudgeConfig {
+   provider: LLMProviderInterface;
+   model: string;
+   criteria: JudgeCriterion[];
+   systemPrompt?: string;
+   temperature?: number;
+   maxRetries?: number;
+ }
+ interface LLMProviderInterface {
+   complete(params: {
+     model: string;
+     messages: Array<{
+       role: string;
+       content: string;
+     }>;
+     temperature?: number;
+     maxTokens?: number;
+   }): Promise<{
+     content: string;
+   }>;
+ }
+ interface RubricLevel {
+   score: number;
+   description: string;
+   examples?: string[];
+ }
+ interface RubricConfig {
+   criteria: string;
+   levels: RubricLevel[];
+ }
+ interface RubricJudgeConfig {
+   provider: LLMProviderInterface;
+   model?: string;
+   rubric: RubricConfig;
+   temperature?: number;
+ }
+ interface ComparativeJudgeConfig {
+   provider: LLMProviderInterface;
+   model?: string;
+   criteria: string[];
+   tieBreaker?: string;
+   temperature?: number;
+ }
+ interface ComparisonInput {
+   input: string;
+   responseA: string;
+   responseB: string;
+   context?: string[];
+ }
+ interface ComparisonResult {
+   winner: 'A' | 'B' | 'tie';
+   reasoning: string;
+   criteriaScores?: Record<string, {
+     A: number;
+     B: number;
+   }>;
+   confidence?: number;
+ }
+ interface ConsensusJudgeConfig {
+   judges: JudgeInterface[];
+   aggregation: 'majority' | 'average' | 'weighted';
+   weights?: number[];
+   minAgreement?: number;
+ }
+ interface JudgeInterface {
+   readonly type: JudgeType;
+   evaluate(input: EvaluationInput): Promise<JudgeResult>;
+ }
+ interface JudgeResult {
+   scores: Record<string, number>;
+   explanations: Record<string, string>;
+   overallScore?: number;
+   confidence?: number;
+   metadata?: Record<string, unknown>;
+ }
+ interface EvalDatasetItem {
+   id: string;
+   input: string;
+   expectedOutput?: string;
+   context?: string[];
+   reference?: string;
+   metadata?: Record<string, unknown>;
+   tags?: string[];
+ }
+ interface EvalDatasetConfig {
+   name?: string;
+   description?: string;
+   items: EvalDatasetItem[];
+   metadata?: Record<string, unknown>;
+ }
+ interface HFDatasetConfig {
+   split?: string;
+   subset?: string;
+   inputField?: string;
+   outputField?: string;
+   contextField?: string;
+   limit?: number;
+ }
+ interface EvaluationPipelineConfig {
+   metrics: MetricInterface[];
+   llmJudge?: JudgeInterface;
+   parallelism?: number;
+   timeout?: number;
+   retries?: number;
+   batchSize?: number;
+ }
+ interface PipelineEvaluationOptions {
+   dataset: EvalDatasetInterface;
+   generateFn: (input: string, context?: string[]) => Promise<string>;
+   onProgress?: (progress: EvaluationProgress) => void;
+   onError?: (error: EvaluationError) => void;
+   stopOnError?: boolean;
+ }
+ interface EvaluationProgress {
+   completed: number;
+   total: number;
+   currentItem?: string;
+   elapsedMs: number;
+   estimatedRemainingMs?: number;
+ }
+ interface EvaluationError {
+   itemId: string;
+   input: string;
+   error: Error;
+   phase: 'generation' | 'evaluation';
+ }
+ interface SingleEvaluationResult {
+   itemId: string;
+   input: string;
+   output: string;
+   expectedOutput?: string;
+   context?: string[];
+   scores: Record<string, number>;
+   explanations?: Record<string, string>;
+   judgeResult?: JudgeResult;
+   passed: boolean;
+   durationMs: number;
+ }
+ interface PipelineEvaluationResult {
+   results: SingleEvaluationResult[];
+   metrics: MetricsSummary;
+   failures: FailureAnalysis[];
+   summary: EvaluationSummary;
+   exportJSON(): string;
+   exportCSV(): string;
+   getFailures(options?: FailureFilterOptions): FailureAnalysis[];
+ }
+ interface MetricsSummary {
+   [metric: string]: {
+     mean: number;
+     std: number;
+     min: number;
+     max: number;
+     median: number;
+     p90: number;
+     p95: number;
+     passRate: number;
+   };
+ }
+ interface FailureAnalysis {
+   itemId: string;
+   input: string;
+   output: string;
+   expectedOutput?: string;
+   scores: Record<string, number>;
+   failedMetrics: string[];
+   explanation?: string;
+ }
+ interface FailureFilterOptions {
+   threshold?: number;
+   metric?: string;
+   limit?: number;
+ }
+ interface EvaluationSummary {
+   totalItems: number;
+   passedItems: number;
+   failedItems: number;
+   passRate: number;
+   avgScore: number;
+   totalDurationMs: number;
+   avgDurationMs: number;
+   timestamp: number;
+ }
+ interface EvalDatasetInterface {
+   readonly name: string;
+   readonly size: number;
+   getItems(): EvalDatasetItem[];
+   getItem(id: string): EvalDatasetItem | undefined;
+   filter(predicate: (item: EvalDatasetItem) => boolean): EvalDatasetInterface;
+   sample(count: number): EvalDatasetInterface;
+   split(ratio: number): [EvalDatasetInterface, EvalDatasetInterface];
+ }
+ interface EvalRunnerConfig {
+   parallelism?: number;
+   timeout?: number;
+   retries?: number;
+   onItemComplete?: (result: SingleEvaluationResult) => void;
+   onError?: (error: EvaluationError) => void;
+ }
+
+ declare abstract class BaseMetric implements MetricInterface {
+   abstract readonly type: string;
+   readonly name: string;
+   protected threshold: number;
+   protected weight: number;
+   protected scoreRange: ScoreRange;
+   constructor(config?: BaseMetricConfig);
+   protected initName(config: BaseMetricConfig): void;
+   abstract evaluate(input: EvaluationInput): Promise<MetricResult>;
+   passes(score: number): boolean;
+   protected normalizeScore(score: number): number;
+   protected createResult(score: number, explanation?: string, details?: Record<string, unknown>): MetricResult;
+ }
+
+ declare class Accuracy extends BaseMetric {
+   readonly type: "accuracy";
+   private matchType;
+   private caseSensitive;
+   private ignoreWhitespace;
+   constructor(config?: AccuracyMetricConfig);
+   evaluate(input: EvaluationInput): Promise<MetricResult>;
+   private preprocess;
+   private calculateFuzzySimilarity;
+ }
+ declare function createAccuracyMetric(config?: AccuracyMetricConfig): Accuracy;
+
+ declare class Relevance extends BaseMetric {
+   readonly type: "relevance";
+   private provider?;
+   private model;
+   private prompt?;
+   constructor(config?: RelevanceMetricConfig);
+   setProvider(provider: LLMProviderInterface): void;
+   evaluate(input: EvaluationInput): Promise<MetricResult>;
+   private evaluateHeuristic;
+   private evaluateWithLLM;
+   private getDefaultPrompt;
+   private extractKeywords;
+   private detectQuestionType;
+   private checkAnswerType;
+ }
+ declare function createRelevanceMetric(config?: RelevanceMetricConfig): Relevance;
+
+ declare class Coherence extends BaseMetric {
+   readonly type: "coherence";
+   private checkLogicalFlow;
+   private checkConsistency;
+   constructor(config?: CoherenceMetricConfig);
+   evaluate(input: EvaluationInput): Promise<MetricResult>;
+   private checkStructure;
+   private checkFlow;
+   private checkInternalConsistency;
+   private checkCompleteness;
+   private generateExplanation;
+ }
+ declare function createCoherenceMetric(config?: CoherenceMetricConfig): Coherence;
+
+ declare class Toxicity extends BaseMetric {
+   readonly type: "toxicity";
+   private categories;
+   private strictMode;
+   private static readonly TOXIC_PATTERNS;
+   constructor(config?: ToxicityMetricConfig);
+   evaluate(input: EvaluationInput): Promise<MetricResult>;
+   private checkCategory;
+   private generateExplanation;
+   passes(score: number): boolean;
+ }
+ declare function createToxicityMetric(config?: ToxicityMetricConfig): Toxicity;
+
+ declare class Faithfulness extends BaseMetric {
+   readonly type: "faithfulness";
+   private provider?;
+   private model;
+   constructor(config?: FaithfulnessMetricConfig);
+   setProvider(provider: LLMProviderInterface): void;
+   evaluate(input: EvaluationInput): Promise<MetricResult>;
+   private evaluateHeuristic;
+   private evaluateWithLLM;
+   private extractClaims;
+   private checkClaimSupport;
+ }
+ declare function createFaithfulnessMetric(config?: FaithfulnessMetricConfig): Faithfulness;
+
+ declare class ContextRelevance extends BaseMetric {
+   readonly type: "context_relevance";
+   private provider?;
+   private model;
+   private minRelevantChunks;
+   constructor(config?: ContextRelevanceMetricConfig);
+   setProvider(provider: LLMProviderInterface): void;
+   evaluate(input: EvaluationInput): Promise<MetricResult>;
+   private evaluateHeuristic;
+   private evaluateWithLLM;
+   private extractKeywords;
+ }
+ declare function createContextRelevanceMetric(config?: ContextRelevanceMetricConfig): ContextRelevance;
+
+ declare class CustomMetric extends BaseMetric {
+   readonly type: "custom";
+   private evaluateFn;
+   constructor(config: CustomMetricConfig);
+   evaluate(input: EvaluationInput): Promise<MetricResult>;
+ }
+ declare function createCustomMetric(config: CustomMetricConfig): CustomMetric;
+ declare function createSimpleMetric(name: string, scoreFn: (input: string, output: string, expected?: string) => number | Promise<number>, options?: {
+   threshold?: number;
+   weight?: number;
+ }): CustomMetric;
+ declare function createLengthMetric(options: {
+   minLength?: number;
+   maxLength?: number;
+   targetLength?: number;
+   tolerance?: number;
+ }): CustomMetric;
+ declare function createRegexMetric(options: {
+   pattern: RegExp;
+   name?: string;
+   shouldMatch?: boolean;
+ }): CustomMetric;
+ declare function createJSONMetric(options?: {
+   schema?: Record<string, unknown>;
+ }): CustomMetric;
+ declare function createContainsMetric(options: {
+   required?: string[];
+   forbidden?: string[];
+   caseSensitive?: boolean;
+ }): CustomMetric;
+
+ declare class LLMJudge implements JudgeInterface {
+   readonly type: "llm";
+   private provider;
+   private model;
+   private criteria;
+   private systemPrompt;
+   private temperature;
+   private maxRetries;
+   constructor(config: LLMJudgeConfig);
+   evaluate(input: EvaluationInput): Promise<JudgeResult>;
+   private evaluateCriterion;
+   private buildPrompt;
+   private parseResponse;
+   private calculateConfidence;
+   private getDefaultSystemPrompt;
+   addCriterion(criterion: JudgeCriterion): void;
+   removeCriterion(name: string): boolean;
+   getCriteria(): JudgeCriterion[];
+ }
+ declare function createLLMJudge(config: LLMJudgeConfig): LLMJudge;
+
+ declare class RubricJudge implements JudgeInterface {
+   readonly type: "rubric";
+   private provider;
+   private model;
+   private rubric;
+   private temperature;
+   constructor(config: RubricJudgeConfig);
+   evaluate(input: EvaluationInput): Promise<JudgeResult>;
+   private buildPrompt;
+   private getSystemPrompt;
+   private parseResponse;
+   getRubric(): RubricConfig;
+   setRubric(rubric: RubricConfig): void;
+ }
+ declare function createRubricJudge(config: RubricJudgeConfig): RubricJudge;
+ declare const QualityRubric: RubricConfig;
+ declare const CodeQualityRubric: RubricConfig;
+ declare const HelpfulnessRubric: RubricConfig;
+
+ declare class ComparativeJudge implements JudgeInterface {
+   readonly type: "comparative";
+   private provider;
+   private model;
+   private criteria;
+   private tieBreaker?;
+   private temperature;
+   constructor(config: ComparativeJudgeConfig);
+   evaluate(input: EvaluationInput): Promise<JudgeResult>;
+   compare(input: ComparisonInput): Promise<ComparisonResult>;
+   private buildComparisonPrompt;
+   private getSystemPrompt;
+   private parseComparisonResponse;
+   getCriteria(): string[];
+   setCriteria(criteria: string[]): void;
+ }
+ declare function createComparativeJudge(config: ComparativeJudgeConfig): ComparativeJudge;
+
+ declare class ConsensusJudge implements JudgeInterface {
+   readonly type: "consensus";
+   private judges;
+   private aggregation;
+   private weights?;
+   private minAgreement;
+   constructor(config: ConsensusJudgeConfig);
+   evaluate(input: EvaluationInput): Promise<JudgeResult>;
+   private aggregateMajority;
+   private aggregateAverage;
+   private aggregateWeighted;
+   addJudge(judge: JudgeInterface, weight?: number): void;
+   removeJudge(index: number): boolean;
+   getJudgeCount(): number;
+ }
+ declare function createConsensusJudge(config: ConsensusJudgeConfig): ConsensusJudge;
+
+ declare class EvalDataset implements EvalDatasetInterface {
+   readonly name: string;
+   private items;
+   private metadata?;
+   constructor(config: EvalDatasetConfig);
+   get size(): number;
+   getItems(): EvalDatasetItem[];
+   getItem(id: string): EvalDatasetItem | undefined;
+   filter(predicate: (item: EvalDatasetItem) => boolean): EvalDataset;
+   sample(count: number, seed?: number): EvalDataset;
+   split(ratio: number): [EvalDataset, EvalDataset];
+   filterByTags(tags: string[], mode?: 'any' | 'all'): EvalDataset;
+   getTags(): string[];
+   addItems(items: EvalDatasetItem[]): void;
+   removeItem(id: string): boolean;
+   private seededRandom;
+   toJSON(): string;
+   toJSONL(): string;
+   static fromJSON(data: Array<{
+     input: string;
+     expectedOutput?: string;
+     context?: string[];
+     reference?: string;
+     metadata?: Record<string, unknown>;
+     tags?: string[];
+   }>, name?: string): EvalDataset;
+   static fromJSONL(jsonl: string, name?: string): EvalDataset;
+   static fromHuggingFace(datasetName: string, config?: HFDatasetConfig): Promise<EvalDataset>;
+   static fromCSV(csv: string, options?: {
+     inputColumn?: string;
+     outputColumn?: string;
+     contextColumn?: string;
+     delimiter?: string;
+   }): EvalDataset;
+ }
+ declare function createEvalDataset(config: EvalDatasetConfig): EvalDataset;
+
+ declare class EvalRunner {
+   private parallelism;
+   private timeout;
+   private retries;
+   private onItemComplete?;
+   private onError?;
+   constructor(config?: EvalRunnerConfig);
+   run(dataset: EvalDatasetInterface, generateFn: (input: string, context?: string[]) => Promise<string>, metrics: MetricInterface[], judge?: JudgeInterface): Promise<SingleEvaluationResult[]>;
+   runStream(dataset: EvalDatasetInterface, generateFn: (input: string, context?: string[]) => Promise<string>, metrics: MetricInterface[], judge?: JudgeInterface): AsyncGenerator<SingleEvaluationResult, void, unknown>;
+   private evaluateItem;
+   private withTimeout;
+ }
+ declare function createEvalRunner(config?: EvalRunnerConfig): EvalRunner;
+
+ declare class EvaluationPipeline {
+   private metrics;
+   private llmJudge?;
+   private runner;
+   constructor(config: EvaluationPipelineConfig);
+   evaluate(options: PipelineEvaluationOptions): Promise<PipelineEvaluationResult>;
+   evaluateStream(options: PipelineEvaluationOptions): AsyncGenerator<SingleEvaluationResult, PipelineEvaluationResult, unknown>;
+   private calculateMetricsSummary;
+   private analyzeFailures;
+   private createSummary;
+   private createResult;
+   addMetric(metric: MetricInterface): void;
+   removeMetric(name: string): boolean;
+   setJudge(judge: JudgeInterface): void;
+   getMetrics(): MetricInterface[];
+ }
+ declare function createEvaluationPipeline(config: EvaluationPipelineConfig): EvaluationPipeline;
+
+ export { Coherence as $, type AccuracyMetricConfig as A, type BaseMetricConfig as B, type CoherenceMetricConfig as C, type FailureAnalysis as D, EvaluationPipeline as E, type FaithfulnessMetricConfig as F, type FailureFilterOptions as G, type HFDatasetConfig as H, type EvaluationSummary as I, type JudgeType as J, type EvalDatasetInterface as K, type LLMJudgeConfig as L, type MetricType as M, type EvalRunnerConfig as N, createEvalDataset as O, type PipelineEvaluationResult as P, EvalRunner as Q, type RelevanceMetricConfig as R, type ScoreRange as S, type ToxicityMetricConfig as T, createEvalRunner as U, createEvaluationPipeline as V, BaseMetric as W, Accuracy as X, createAccuracyMetric as Y, Relevance as Z, createRelevanceMetric as _, EvalDataset as a, createCoherenceMetric as a0, Toxicity as a1, createToxicityMetric as a2, Faithfulness as a3, createFaithfulnessMetric as a4, ContextRelevance as a5, createContextRelevanceMetric as a6, CustomMetric as a7, createCustomMetric as a8, createSimpleMetric as a9, createLengthMetric as aa, createRegexMetric as ab, createJSONMetric as ac, createContainsMetric as ad, LLMJudge as ae, createLLMJudge as af, RubricJudge as ag, createRubricJudge as ah, QualityRubric as ai, CodeQualityRubric as aj, HelpfulnessRubric as ak, ComparativeJudge as al, createComparativeJudge as am, ConsensusJudge as an, createConsensusJudge as ao, type MetricResult as b, type ToxicityCategory as c, type ContextRelevanceMetricConfig as d, type CustomMetricConfig as e, type MetricInterface as f, type EvaluationInput as g, type JudgeCriterion as h, type LLMProviderInterface as i, type RubricLevel as j, type RubricConfig as k, type RubricJudgeConfig as l, type ComparativeJudgeConfig as m, type ComparisonInput as n, type ComparisonResult as o, type ConsensusJudgeConfig as p, type JudgeInterface as q, type JudgeResult as r, type EvalDatasetItem as s, type EvalDatasetConfig as t, type EvaluationPipelineConfig as u, type PipelineEvaluationOptions as v, type EvaluationProgress as w, type EvaluationError as x, type SingleEvaluationResult as y, type MetricsSummary as z };
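Taken together, the declarations above describe the package's core flow: an `EvalDataset` of `EvalDatasetItem`s is fed to an `EvaluationPipeline`, which calls a `generateFn` for each item, scores the output with `MetricInterface` implementations, and returns a `PipelineEvaluationResult`. The sketch below is a minimal illustration of that flow, assuming the factory functions shown here are re-exported under these names from the package root (the root `dist/index.d.ts` is not part of this hunk); the dataset items, thresholds, and the `generateFn` stub are placeholders.

```ts
// Sketch only: factory names assumed re-exported from the package root.
import {
  createEvalDataset,
  createAccuracyMetric,
  createLengthMetric,
  createEvaluationPipeline,
} from '@lov3kaizen/agentsea-evaluate';

// A tiny in-memory dataset following the EvalDatasetItem shape.
const dataset = createEvalDataset({
  name: 'smoke-test',
  items: [
    { id: '1', input: 'What is 2 + 2?', expectedOutput: '4' },
    { id: '2', input: 'Name the capital of France.', expectedOutput: 'Paris' },
  ],
});

// Metrics implement MetricInterface; the pipeline aggregates their scores.
const pipeline = createEvaluationPipeline({
  metrics: [
    createAccuracyMetric({ type: 'fuzzy', threshold: 0.8 }),
    createLengthMetric({ maxLength: 200 }),
  ],
  parallelism: 2,
});

// generateFn would normally call the system under test (an agent or LLM chain);
// here it is a stub so the sketch stays self-contained.
const result = await pipeline.evaluate({
  dataset,
  generateFn: async (input) => `stubbed answer for: ${input}`,
  onProgress: (p) => console.log(`${p.completed}/${p.total} items`),
});

console.log(result.summary.passRate);        // EvaluationSummary
console.log(result.getFailures({ limit: 5 })); // FailureAnalysis[]
```

For incremental results, the same options can be passed to `evaluateStream`, which yields one `SingleEvaluationResult` per item, and `EvalRunner` exposes the equivalent lower-level `run`/`runStream` calls without the summary and failure analysis.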
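The judge declarations compose the same way: any `JudgeInterface` can be evaluated directly or aggregated by a `ConsensusJudge`, and `LLMJudge`/`RubricJudge` only need an object satisfying `LLMProviderInterface`. The sketch below uses a hand-rolled stub provider and a placeholder model name; how each judge prompts the provider and parses its reply is internal to the package and not visible in these declarations.

```ts
// Sketch only: stub provider and placeholder model name.
import {
  createLLMJudge,
  createRubricJudge,
  createConsensusJudge,
  QualityRubric,
  type LLMProviderInterface,
} from '@lov3kaizen/agentsea-evaluate';

const provider: LLMProviderInterface = {
  // Returns a fixed response; a real implementation would call an LLM API
  // with the given model and messages.
  async complete() {
    return { content: '{"score": 4, "explanation": "stub"}' };
  },
};

const llmJudge = createLLMJudge({
  provider,
  model: 'gpt-4o-mini', // placeholder
  criteria: [
    { name: 'helpfulness', prompt: 'Does the answer address the question?' },
    { name: 'groundedness', prompt: 'Is the answer supported by the context?', weight: 2 },
  ],
});

const rubricJudge = createRubricJudge({ provider, rubric: QualityRubric });

// ConsensusJudge aggregates several judges; weights pair with the judges array.
const consensus = createConsensusJudge({
  judges: [llmJudge, rubricJudge],
  aggregation: 'weighted',
  weights: [0.6, 0.4],
});

const verdict = await consensus.evaluate({
  input: 'Summarise the report.',
  output: 'The report highlights 12% revenue growth in Q3.',
  context: ['Q3 revenue grew 12% year over year.'],
});
console.log(verdict.scores, verdict.overallScore);
```

A judge built this way can also be supplied as `llmJudge` in `EvaluationPipelineConfig`, so the pipeline records its `JudgeResult` alongside the metric scores for each item.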