npm - @assay-ai/core - Versions diffs - 0.1.0-beta - Mend

@assay-ai/core 0.1.0-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,460 @@
+import { ZodType, z } from 'zod';
+interface ToolCall { // Record describing one tool call; only `name` is required.
+    name: string; // name of the tool (required)
+    description?: string; // optional free-text description
+    reasoning?: string; // optional; presumably the rationale for making the call — confirm against the producer
+    output?: unknown; // optional and untyped; consumers must narrow before use
+    inputParameters?: Record<string, unknown>; // optional map of parameter name -> untyped value
+}
+interface LLMTestCase { // Test case shape consumed by BaseMetric.measure(); only `input` is required.
+    input: string; // required
+    actualOutput?: string; // optional at the type level; individual metrics declare what they need via `requiredFields`
+    expectedOutput?: string; // optional
+    context?: string[]; // optional list of strings
+    retrievalContext?: string[]; // optional list of strings, declared separately from `context`
+    toolsCalled?: ToolCall[]; // optional list of ToolCall records
+    expectedTools?: ToolCall[]; // optional list of ToolCall records
+    tokenCost?: number; // optional; unit is not specified in these declarations
+    completionTime?: number; // optional; unit is not specified in these declarations
+    name?: string; // optional label
+    tags?: string[]; // optional list of tag strings
+}
+interface ConversationalTestCase { // Multi-turn test case shape; exported as a type, but no function visible in this file takes it as a parameter.
+    turns: Array<{ // required list of turns
+        role: "user" | "assistant"; // only these two literal roles are allowed
+        content: string; // text of the turn
+    }>;
+    scenario?: string; // optional
+    expectedOutcome?: string; // optional
+    chatbotRole?: string; // optional
+}
+interface Golden { // Entry of an EvaluationDataset; its fields are a subset of LLMTestCase with the same names and types.
+    input: string; // required
+    actualOutput?: string; // optional
+    expectedOutput?: string; // optional
+    context?: string[]; // optional list of strings
+    retrievalContext?: string[]; // optional list of strings
+}
+interface EvaluationDataset { // Named collection of Golden entries; accepted by evaluate() as an alternative to LLMTestCase[].
+    name?: string; // optional dataset label
+    goldens: Golden[]; // required list of entries
+}
+interface ProviderConfig { // Options accepted by the provider constructors; every field is optional.
+    apiKey?: string; // optional
+    model?: string; // optional; BaseLLMProvider's constructor also takes a `defaultModel` string
+    baseUrl?: string; // optional
+    temperature?: number; // optional; the fallback value is not visible in these declarations
+    maxTokens?: number; // optional; the fallback value is not visible in these declarations
+}
+declare abstract class BaseLLMProvider { // Abstract base extended by OpenAIProvider, AnthropicProvider and OllamaProvider in this file.
+    protected readonly config: ProviderConfig; // typed as ProviderConfig; visible to subclasses only
+    readonly modelName: string; // non-optional here, whereas ProviderConfig.model is optional
+    protected readonly temperature: number; // non-optional here, whereas ProviderConfig.temperature is optional
+    protected readonly maxTokens: number; // non-optional here, whereas ProviderConfig.maxTokens is optional
+    constructor(config: ProviderConfig, defaultModel: string); // presumably `defaultModel` backs `modelName` when config.model is unset — confirm in the implementation
+    /**
+     * Generate a raw text completion from the LLM.
+     *
+     * Abstract: each concrete provider supplies its own implementation.
+     *
+     * @param prompt - The prompt text
+     * @returns Promise resolving to the completion as a string
+     */
+    abstract generate(prompt: string): Promise<string>;
+    /**
+     * Generate a typed JSON response from the LLM, validated against a Zod schema.
+     * Instructs the model to return JSON conforming to the schema, then parses
+     * and validates the response.
+     *
+     * @param prompt - The user prompt
+     * @param schema - A Zod schema to validate the response
+     * @param retries - Number of retries on parse/validation failure (default 2)
+     * @returns Promise resolving to a value of the schema's type `T`
+     *
+     * NOTE(review): what happens once the retries are exhausted is not visible
+     * in this declaration — presumably the promise rejects; confirm.
+     */
+    generateJSON<T>(prompt: string, schema: ZodType<T>, retries?: number): Promise<T>;
+    /**
+     * Returns the provider name for logging/identification.
+     *
+     * Abstract getter: each concrete provider declares its own.
+     */
+    abstract get providerName(): string;
+}
+interface MetricResult { // Value resolved by BaseMetric.measure(); also collected in EvaluateResult and AssertEvalResult.
+    /** Score from 0 to 1 */
+    score: number;
+    /** LLM-generated explanation of the score */
+    reason?: string;
+    /**
+     * Whether the score meets the threshold
+     *
+     * NOTE(review): how this is derived for metrics flagged `lowerIsBetter`
+     * is not visible in these declarations — confirm in the implementation.
+     */
+    pass: boolean;
+    /** Name of the metric */
+    metricName: string;
+    /** Threshold used */
+    threshold: number;
+    /** Time taken in ms */
+    evaluationTimeMs: number;
+    /** Details for debugging (extracted statements, verdicts, etc.) */
+    details?: Record<string, unknown>;
+}
+interface MetricConfig { // Options accepted by BaseMetric's constructor; every field is optional. Extended by GEvalConfig, ExactMatchConfig and JsonCorrectnessConfig.
+    /** Score threshold for pass/fail (default: 0.5) */
+    threshold?: number;
+    /**
+     * LLM provider to use for evaluation
+     *
+     * Either a provider instance or a string; this is the same union that
+     * resolveProvider() accepts as its parameter.
+     */
+    provider?: BaseLLMProvider | string;
+    /** Include reasoning in results (default: true) */
+    includeReason?: boolean;
+    /** Binary scoring mode — 0 or 1 only (default: false) */
+    strictMode?: boolean;
+    /** Enable verbose logging (default: false) */
+    verbose?: boolean;
+}
+declare abstract class BaseMetric { // Abstract base extended by the metric classes declared below.
+    abstract readonly name: string; // supplied by each subclass
+    abstract readonly requiredFields: (keyof LLMTestCase)[]; // LLMTestCase keys the subclass needs; see validate()
+    readonly threshold: number; // MetricConfig documents a default of 0.5
+    readonly includeReason: boolean; // MetricConfig documents a default of true
+    readonly strictMode: boolean; // MetricConfig documents a default of false
+    readonly verbose: boolean; // MetricConfig documents a default of false
+    /** Whether a lower score is better (e.g., Hallucination, Bias, Toxicity) */
+    readonly lowerIsBetter: boolean;
+    protected provider: BaseLLMProvider; // always a provider instance here, even though MetricConfig.provider may be a string or omitted
+    constructor(config?: MetricConfig); // the whole config object is optional
+    /** Run the metric evaluation. Must be implemented by each metric. */
+    abstract measure(testCase: LLMTestCase): Promise<MetricResult>;
+    /**
+     * Validate that required fields exist on the test case
+     *
+     * Returns void; presumably throws when a field is missing — confirm in
+     * the implementation.
+     */
+    protected validate(testCase: LLMTestCase): void;
+    /** Apply strict mode (binary 0/1) if enabled */
+    protected applyStrictMode(score: number): number;
+    /**
+     * Build a MetricResult from score, reason, and timing
+     *
+     * `startTime` is a number; presumably a timestamp from which
+     * `evaluationTimeMs` is computed — confirm in the implementation.
+     */
+    protected buildResult(score: number, reason: string | undefined, startTime: number, details?: Record<string, unknown>): MetricResult;
+}
+declare class AnswerRelevancyMetric extends BaseMetric { // Concrete metric; inherits BaseMetric's constructor (optional MetricConfig).
+    readonly name = "Answer Relevancy"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class FaithfulnessMetric extends BaseMetric { // Concrete metric; inherits BaseMetric's constructor (optional MetricConfig).
+    readonly name = "Faithfulness"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class HallucinationMetric extends BaseMetric { // Concrete metric flagged as lower-is-better.
+    readonly name = "Hallucination"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    readonly lowerIsBetter = true; // narrows BaseMetric.lowerIsBetter to the literal `true`
+    constructor(config?: MetricConfig); // declares its own constructor with the same optional config as the base
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class ContextualPrecisionMetric extends BaseMetric { // Concrete metric; inherits BaseMetric's constructor (optional MetricConfig).
+    readonly name = "Contextual Precision"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class ContextualRecallMetric extends BaseMetric { // Concrete metric; inherits BaseMetric's constructor (optional MetricConfig).
+    readonly name = "Contextual Recall"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class ContextualRelevancyMetric extends BaseMetric { // Concrete metric; inherits BaseMetric's constructor (optional MetricConfig).
+    readonly name = "Contextual Relevancy"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class BiasMetric extends BaseMetric { // Concrete metric flagged as lower-is-better.
+    readonly name = "Bias"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    readonly lowerIsBetter = true; // narrows BaseMetric.lowerIsBetter to the literal `true`
+    constructor(config?: MetricConfig); // declares its own constructor with the same optional config as the base
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class ToxicityMetric extends BaseMetric { // Concrete metric flagged as lower-is-better.
+    readonly name = "Toxicity"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    readonly lowerIsBetter = true; // narrows BaseMetric.lowerIsBetter to the literal `true`
+    constructor(config?: MetricConfig); // declares its own constructor with the same optional config as the base
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+interface GEvalConfig extends MetricConfig { // Constructor options for GEval; adds to MetricConfig, and `criteria` is the only required field.
+    /** Human-readable name for this evaluation */
+    name?: string;
+    /** The evaluation criteria in natural language */
+    criteria: string;
+    /** Which test case fields to include in evaluation */
+    evaluationParams?: (keyof LLMTestCase)[];
+    /** Optional pre-defined evaluation steps (auto-generated if omitted) */
+    evaluationSteps?: string[];
+}
+declare class GEval extends BaseMetric { // Metric configured by natural-language criteria supplied through GEvalConfig.
+    readonly name: string; // plain string rather than a literal; GEvalConfig.name is the optional human-readable name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    private readonly criteria; // private member: its type is omitted from the declaration output
+    private readonly evaluationParams; // private member: its type is omitted from the declaration output
+    private evaluationSteps?; // private, optional and not readonly; GEvalConfig says steps are auto-generated if omitted
+    constructor(config: GEvalConfig); // config is required here, unlike BaseMetric's optional config
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class SummarizationMetric extends BaseMetric { // Concrete metric; inherits BaseMetric's constructor (optional MetricConfig).
+    readonly name = "Summarization"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+    private evaluateAlignment; // private member: its signature is omitted from the declaration output
+    private evaluateCoverage; // private member: its signature is omitted from the declaration output
+}
+interface ExactMatchConfig extends MetricConfig { // Constructor options for ExactMatchMetric; adds two optional comparison flags to MetricConfig.
+    /** Whether to ignore case when comparing (default: false) */
+    ignoreCase?: boolean;
+    /** Whether to trim whitespace when comparing (default: true) */
+    trimWhitespace?: boolean;
+}
+declare class ExactMatchMetric extends BaseMetric { // Concrete metric configured by ExactMatchConfig.
+    readonly name = "Exact Match"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    readonly requiresProvider = false; // literal `false`; this property is not declared on BaseMetric
+    private readonly ignoreCase; // private member: its type is omitted from the declaration output
+    private readonly trimWhitespace; // private member: its type is omitted from the declaration output
+    constructor(config?: ExactMatchConfig); // the whole config object is optional
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+interface JsonCorrectnessConfig extends MetricConfig { // Constructor options for JsonCorrectnessMetric; adds two optional fields to MetricConfig.
+    /** Optional Zod schema to validate the JSON structure against */
+    schema?: z.ZodSchema;
+    /** If true, also compares against expectedOutput JSON (default: false) */
+    compareWithExpected?: boolean;
+}
+declare class JsonCorrectnessMetric extends BaseMetric { // Concrete metric configured by JsonCorrectnessConfig.
+    readonly name = "JSON Correctness"; // literal metric name
+    readonly requiredFields: (keyof LLMTestCase)[]; // concrete list is not visible in the declaration
+    readonly requiresProvider = false; // literal `false`; this property is not declared on BaseMetric
+    private readonly schema?; // private, optional member: its type is omitted from the declaration output
+    private readonly compareWithExpected; // private member: its type is omitted from the declaration output
+    constructor(config?: JsonCorrectnessConfig); // the whole config object is optional
+    measure(testCase: LLMTestCase): Promise<MetricResult>; // implements the abstract BaseMetric.measure
+}
+declare class OpenAIProvider extends BaseLLMProvider { // Concrete provider; implements both abstract members of BaseLLMProvider.
+    private client; // private member: its type is omitted from the declaration output
+    constructor(config?: ProviderConfig); // config is optional here, whereas the base constructor requires it plus a default model
+    get providerName(): string; // implements the abstract getter
+    generate(prompt: string): Promise<string>; // implements the abstract BaseLLMProvider.generate
+}
+declare class AnthropicProvider extends BaseLLMProvider { // Concrete provider; implements both abstract members of BaseLLMProvider.
+    private client; // private member: its type is omitted from the declaration output
+    constructor(config?: ProviderConfig); // config is optional here, whereas the base constructor requires it plus a default model
+    get providerName(): string; // implements the abstract getter
+    generate(prompt: string): Promise<string>; // implements the abstract BaseLLMProvider.generate
+}
+declare class OllamaProvider extends BaseLLMProvider { // Concrete provider; implements both abstract members of BaseLLMProvider.
+    private readonly baseUrl; // private member: its type is omitted from the declaration output; ProviderConfig has an optional `baseUrl`
+    constructor(config?: ProviderConfig); // config is optional here, whereas the base constructor requires it plus a default model
+    get providerName(): string; // implements the abstract getter
+    generate(prompt: string): Promise<string>; // implements the abstract BaseLLMProvider.generate
+}
+/**
+ * Resolve a provider from a string name, provider instance, or auto-detect from env vars.
+ * Returns a noop provider if undefined (for non-LLM metrics).
+ *
+ * NOTE(review): the two sentences above describe different outcomes for an
+ * omitted argument (auto-detection from env vars vs. a noop provider). Which
+ * one takes precedence is not visible from this declaration — confirm in the
+ * implementation.
+ *
+ * @param provider - Optional provider instance or provider name string
+ * @returns A provider instance; the declared return type is never undefined
+ */
+declare function resolveProvider(provider?: BaseLLMProvider | string): BaseLLMProvider;
+interface AssayConfig { // Project-level configuration accepted and resolved by resolveConfig(); every field is optional.
+    /** Default LLM provider for metrics that need one */
+    provider?: BaseLLMProvider;
+    /** Provider name shorthand (resolved via resolveProvider) */
+    providerName?: "openai" | "anthropic" | "ollama";
+    /** Model override */
+    model?: string;
+    /** API key override */
+    apiKey?: string;
+    /** Base URL override */
+    baseUrl?: string;
+    /** Default metrics to run if not specified per-evaluate call */
+    metrics?: BaseMetric[];
+    /**
+     * Max concurrent test case evaluations. Default 5.
+     *
+     * NOTE(review): EvaluateConfig.maxConcurrency is a separate setting
+     * (documented as concurrent metric evaluations, default 10); how the two
+     * interact is not visible in these declarations.
+     */
+    concurrency?: number;
+    /** Global threshold override */
+    threshold?: number;
+    /** Whether to print results to console. Default true. */
+    verbose?: boolean;
+}
+/**
+ * Attempt to auto-discover and load assay.config.ts from the current
+ * working directory. Returns an empty config if not found.
+ *
+ * Looks for:
+ * - assay.config.ts
+ * - assay.config.js
+ * - assay.config.mjs
+ *
+ * @param overrides - Optional AssayConfig values; presumably merged over the
+ *   discovered config — the merge order is not visible in this declaration,
+ *   confirm in the implementation
+ * @returns Promise resolving to the resulting AssayConfig
+ */
+declare function resolveConfig(overrides?: AssayConfig): Promise<AssayConfig>;
+/**
+ * Reset the cached config (useful for testing).
+ *
+ * Takes no arguments and returns nothing.
+ */
+declare function resetConfigCache(): void;
+interface EvaluateConfig { // Per-call options for evaluate(); every field is optional.
+    /** Maximum concurrent metric evaluations (default: 10) */
+    maxConcurrency?: number;
+    /** Delay between batches in ms (default: 0) */
+    throttleMs?: number;
+    /** Continue even if some metrics error (default: false) */
+    ignoreErrors?: boolean;
+    /** Show verbose output (default: true) */
+    verbose?: boolean;
+    /** Display mode: "all" | "failing" | "passing" (default: "all") */
+    display?: "all" | "failing" | "passing";
+}
+interface EvaluateResult { // Value resolved by evaluate(): per-test-case results plus an aggregate summary.
+    testCases: Array<{ // one entry per evaluated test case
+        testCase: LLMTestCase; // the test case this entry refers to
+        results: MetricResult[]; // metric results for that test case
+        passed: boolean; // presumably true only when every metric result passed — confirm in the implementation
+    }>;
+    summary: { // aggregate numbers for the whole run
+        total: number; // count fields below are plain numbers
+        passed: number;
+        failed: number;
+        passRate: number; // scale (fraction vs. percentage) is not specified in these declarations
+        averageScores: Record<string, number>; // string-keyed averages; presumably keyed by metric name — confirm
+        totalTimeMs: number; // duration in milliseconds, per the field name
+    };
+}
+/**
+ * Run a batch evaluation: execute all metrics on all test cases,
+ * collect results, compute summary statistics, and print a report.
+ *
+ * @param testCases - Either an array of LLMTestCase or an EvaluationDataset
+ * @param metrics - Metric instances to run
+ * @param config - Optional EvaluateConfig (concurrency, throttling, error
+ *   handling, output options)
+ * @returns Promise resolving to an EvaluateResult
+ */
+declare function evaluate(testCases: LLMTestCase[] | EvaluationDataset, metrics: BaseMetric[], config?: EvaluateConfig): Promise<EvaluateResult>;
+interface AssertEvalOptions { // Argument object for assertEval(); both fields are required.
+    /** The test case to evaluate */
+    testCase: LLMTestCase;
+    /** Metrics to check */
+    metrics: BaseMetric[];
+}
+interface AssertEvalResult { // Value resolved by assertEval(); all three fields are always present.
+    /** Whether all metrics passed */
+    passed: boolean;
+    /** Individual metric results */
+    results: MetricResult[];
+    /** Failure messages for metrics that did not pass */
+    failures: string[];
+}
+/**
+ * Evaluate a single test case against one or more metrics.
+ * Designed for use inside unit tests (vitest, jest, etc).
+ *
+ * The outcome is reported through the resolved AssertEvalResult; the example
+ * below checks `result.passed` with the test framework's own assertion.
+ *
+ * @param options - The test case and the metrics to check it against
+ * @returns Promise resolving to an AssertEvalResult
+ *
+ * @example
+ * ```ts
+ * const result = await assertEval({
+ *   testCase: { input: "What is 2+2?", actualOutput: "4" },
+ *   metrics: [new AnswerRelevancyMetric({ threshold: 0.7 })],
+ * });
+ * expect(result.passed).toBe(true);
+ * ```
+ */
+declare function assertEval(options: AssertEvalOptions): Promise<AssertEvalResult>;
+interface EvaluationSummary { // Input to ConsoleReporter.report(); a separate shape from EvaluateResult.summary, with different field names.
+    results: TestCaseResult[]; // one entry per test case
+    totalTests: number; // count fields are plain numbers
+    totalPassed: number;
+    totalFailed: number;
+    averageScores: Record<string, number>; // string-keyed averages; presumably keyed by metric name — confirm
+    duration: number; // unit is not specified in these declarations
+}
+interface TestCaseResult { // Element type of EvaluationSummary.results; all fields are required.
+    testCaseName: string; // label for the test case
+    input: string; // the test case input text
+    metricResults: MetricResult[]; // metric results for this test case
+    passed: boolean; // overall pass flag for this test case
+}
+declare class ConsoleReporter { // Reporter with one public method; no constructor is declared, so it takes no arguments.
+    /**
+     * Print a full evaluation summary to the console.
+     *
+     * @param summary - The EvaluationSummary to print
+     */
+    report(summary: EvaluationSummary): void;
+    private printHeader; // private member: its signature is omitted from the declaration output
+    private printResultsTable; // private member: its signature is omitted from the declaration output
+    private printSummaryFooter; // private member: its signature is omitted from the declaration output
+}
+/**
+ * Safely extracts and parses JSON from LLM responses that may contain
+ * markdown fences, surrounding text, or minor formatting issues.
+ *
+ * NOTE(review): the sentence above reads as a description of the JSON-parsing
+ * helpers rather than of this interface. The interface itself only declares
+ * the options object accepted by parseJson().
+ */
+interface ParseJsonOptions {
+    /** If true, returns null instead of throwing on failure */
+    silent?: boolean;
+}
+/**
+ * Parse JSON from an LLM response string. Applies multiple strategies:
+ * 1. Direct JSON.parse
+ * 2. Strip markdown code fences, then parse
+ * 3. Remove trailing commas, then parse
+ * 4. Extract JSON substring by balanced braces
+ * 5. Regex fallback extraction
+ *
+ * `T` is chosen by the caller and defaults to `unknown`; nothing in this
+ * signature takes a schema, so the result is presumably not validated against
+ * `T` at runtime — narrow or validate before relying on its shape.
+ *
+ * @param text - The raw response text to parse
+ * @param options - Optional ParseJsonOptions; `silent` selects null over throwing
+ * @returns The parsed value typed as `T`, or null (see `silent`)
+ * @throws {Error} if all strategies fail and `silent` is not set
+ */
+declare function parseJson<T = unknown>(text: string, options?: ParseJsonOptions): T | null;
+/**
+ * Convenience wrapper that always returns null on failure.
+ *
+ * @param text - The raw text to parse
+ * @returns The parsed value typed as `T`, or null on failure
+ */
+declare function tryParseJson<T = unknown>(text: string): T | null;
+/**
+ * A minimal p-limit style concurrency limiter.
+ * Controls the maximum number of concurrent async operations.
+ *
+ * The type itself is a generic function: it takes a zero-argument function
+ * returning `Promise<T>` and returns a `Promise<T>` of the same `T`.
+ * createLimiter() returns a value of this type.
+ */
+type Limiter = <T>(fn: () => Promise<T>) => Promise<T>;
+/**
+ * Creates a concurrency limiter that ensures no more than `concurrency`
+ * async operations run simultaneously.
+ *
+ * NOTE(review): what happens when `concurrency` is below 1 or not an integer
+ * is not visible in this declaration — confirm in the implementation.
+ *
+ * @param concurrency - Maximum number of concurrent operations. Must be >= 1.
+ * @returns A function that wraps async operations with concurrency control.
+ *
+ * @example
+ * ```ts
+ * const limit = createLimiter(5);
+ * const results = await Promise.all(
+ *   urls.map(url => limit(() => fetch(url)))
+ * );
+ * ```
+ */
+declare function createLimiter(concurrency: number): Limiter;
+/**
+ * Common scoring utilities for evaluation metrics.
+ */
+/**
+ * Safe division returning 0 when total is 0.
+ * Result is clamped to [0, 1].
+ *
+ * @param count - Numerator
+ * @param total - Denominator; a value of 0 yields 0 rather than a division error
+ * @returns `count / total` clamped to [0, 1], or 0 when `total` is 0
+ */
+declare function ratio(count: number, total: number): number;
+/**
+ * Compute a weighted average of values.
+ * If weights sum to 0, returns 0.
+ *
+ * @param values - The numbers to average
+ * @param weights - One weight per value; must have the same length as `values`
+ * @returns The weighted average, or 0 when the weights sum to 0
+ * @throws {Error} if values and weights have different lengths.
+ */
+declare function weightedAverage(values: number[], weights: number[]): number;
+/**
+ * Compute Mean Average Precision (MAP) from a list of relevance judgments.
+ * Used for evaluating contextual precision - how well relevant items
+ * are ranked before irrelevant ones.
+ *
+ * The array order presumably represents rank order (first element = top
+ * rank), since the score rewards relevant items appearing earlier — confirm
+ * in the implementation. The result for an empty array or an array with no
+ * relevant items is not stated here.
+ *
+ * @param relevances - Array of boolean values indicating whether each item is relevant.
+ * @returns MAP score between 0 and 1.
+ */
+declare function meanAveragePrecision(relevances: boolean[]): number;
+export { AnswerRelevancyMetric, AnthropicProvider, type AssayConfig, type AssertEvalOptions, type AssertEvalResult, BaseLLMProvider, BaseMetric, BiasMetric, ConsoleReporter, ContextualPrecisionMetric, ContextualRecallMetric, ContextualRelevancyMetric, type ConversationalTestCase, type EvaluateConfig, type EvaluateResult, type EvaluationDataset, type EvaluationSummary, type ExactMatchConfig, ExactMatchMetric, FaithfulnessMetric, GEval, type GEvalConfig, type Golden, HallucinationMetric, type JsonCorrectnessConfig, JsonCorrectnessMetric, type LLMTestCase, type Limiter, type MetricConfig, type MetricResult, OllamaProvider, OpenAIProvider, type ParseJsonOptions, type ProviderConfig, SummarizationMetric, type TestCaseResult, type ToolCall, ToxicityMetric, assertEval, createLimiter, evaluate, meanAveragePrecision, parseJson, ratio, resetConfigCache, resolveConfig, resolveProvider, tryParseJson, weightedAverage };