npm - @orq-ai/evaluatorq - Versions diffs - 1.2.2 → 1.2.3-rc.1 - Mend

@orq-ai/evaluatorq 1.2.2 → 1.2.3-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

package/dist/lib/integrations/simulation/agents/base.js ADDED Viewed

@@ -0,0 +1,227 @@
+/**
+ * Base agent class for simulation agents.
+ *
+ * Provides common functionality for all agents in the simulation system,
+ * including LLM interaction with retry logic.
+ */
+import OpenAI from "openai";
+// Retry configuration
+const MAX_RETRY_ATTEMPTS = 5;
+const RETRY_MIN_WAIT_MS = 2_000;
+const RETRY_MAX_WAIT_MS = 60_000;
+const DEFAULT_TIMEOUT_S = 60;
+/**
+ * Determines whether an HTTP status code is retryable.
+ *
+ * @param status - HTTP status code, or `undefined` when the error carried none
+ * @returns `true` for 429 and for any status >= 500; `false` otherwise,
+ *   including when `status` is `undefined`
+ */
+function isRetryableStatus(status) {
+    if (status === undefined)
+        return false;
+    return status === 429 || status >= 500;
+}
+/**
+ * Abstract base class for simulation agents.
+ *
+ * Provides common LLM interaction functionality with exponential-backoff
+ * retry logic and cumulative token-usage tracking.
+ *
+ * **Client injection**: pass an existing `OpenAI` client via `config.client`
+ * to share a single HTTP connection across multiple agents. The agent will
+ * NOT close an injected client -- the caller is responsible for its lifecycle.
+ *
+ * **Subclass contract**: this class reads `this.name` (in error messages) and
+ * `this.systemPrompt` (first message of every request) but defines neither;
+ * concrete agents are expected to supply both.
+ *
+ * **Configuration** (all optional): `config.model` defaults to
+ * `"azure/gpt-4o-mini"`. Without `config.client`, a new client is built from
+ * `config.apiKey` or the `ORQ_API_KEY` environment variable (the constructor
+ * throws when neither is set), with the base URL taken from the
+ * `ROUTER_BASE_URL` environment variable or a built-in default.
+ */
+export class BaseAgent {
+    model;
+    client;
+    clientOwned; // true only when the constructor created `client` itself
+    usage; // running { prompt_tokens, completion_tokens, total_tokens } totals
+    constructor(config) {
+        this.model = config?.model ?? "azure/gpt-4o-mini";
+        if (config?.client) {
+            this.client = config.client;
+            this.clientOwned = false;
+        }
+        else {
+            const resolvedApiKey = config?.apiKey ?? process.env.ORQ_API_KEY;
+            if (!resolvedApiKey) {
+                throw new Error("ORQ_API_KEY environment variable is not set. Set it or pass apiKey in AgentConfig.");
+            }
+            this.client = new OpenAI({
+                baseURL: process.env.ROUTER_BASE_URL ?? "https://api.orq.ai/v2/router",
+                apiKey: resolvedApiKey,
+            });
+            this.clientOwned = true;
+        }
+        this.usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
+    }
+    // ---------------------------------------------------------------------------
+    // Public API
+    // ---------------------------------------------------------------------------
+    /**
+     * Generate a text response for a conversation.
+     *
+     * @param messages - Conversation history
+     * @param options  - Temperature, maxTokens, and timeout overrides, plus an
+     *   optional abort `signal`; all four are forwarded to `callLLM`
+     * @returns The agent's response text
+     * @throws {Error} If the LLM call returns no content (an empty string
+     *   counts as no content)
+     */
+    async respondAsync(messages, options) {
+        const result = await this.callLLM(messages, {
+            temperature: options?.temperature,
+            maxTokens: options?.maxTokens,
+            timeout: options?.timeout,
+            signal: options?.signal,
+        });
+        if (!result.content) {
+            throw new Error(`${this.name}: LLM call failed -- no content in response`);
+        }
+        return result.content;
+    }
+    /**
+     * Get cumulative token usage for this agent.
+     *
+     * @returns A shallow copy of the counters, so mutating the result does not
+     *   change the agent's own totals
+     */
+    getUsage() {
+        return { ...this.usage };
+    }
+    /**
+     * Reset token usage counters to zero.
+     */
+    resetUsage() {
+        this.usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
+    }
+    /**
+     * Close the underlying HTTP client.
+     *
+     * Only closes clients that the agent created itself (not injected ones).
+     * Does nothing when the client has no `close` function.
+     */
+    async close() {
+        // The OpenAI Node SDK does not currently expose a public close() method,
+        // but we guard against future changes and respect ownership semantics.
+        if (this.clientOwned &&
+            typeof this.client
+                .close === "function") {
+            await this.client.close();
+        }
+    }
+    // ---------------------------------------------------------------------------
+    // Protected helpers
+    // ---------------------------------------------------------------------------
+    /**
+     * Call the LLM with retry logic (exponential backoff).
+     *
+     * Retries on rate-limit (429) and server errors (500+). All other errors
+     * are raised immediately.
+     *
+     * Non-API errors whose string `code` matches the network-error pattern in
+     * the catch block are retried as well. At most `MAX_RETRY_ATTEMPTS`
+     * attempts are made; between attempts the wait doubles from
+     * `RETRY_MIN_WAIT_MS`, is capped at `RETRY_MAX_WAIT_MS`, and has up to 25%
+     * random jitter added.
+     *
+     * @param messages - Conversation history; `this.systemPrompt` is prepended
+     *   as a system message and only `role`/`content` of each entry are sent
+     * @param options  - Optional `temperature` (default 0.7), `maxTokens`
+     *   (default 2048), `timeout` in seconds (default `DEFAULT_TIMEOUT_S`),
+     *   `tools`, and an abort `signal`
+     * @returns `{ content }` (empty string when the message has no content),
+     *   plus `tool_calls` when the response includes any
+     * @throws The last error once retries are exhausted, any non-retryable
+     *   error immediately, or `Error("Cancelled")` when `signal` is already
+     *   aborted at the start of an attempt or aborts during a backoff wait
+     */
+    async callLLM(messages, options) {
+        const temperature = options?.temperature ?? 0.7;
+        const maxTokens = options?.maxTokens ?? 2048;
+        const timeoutS = options?.timeout ?? DEFAULT_TIMEOUT_S;
+        const fullMessages = [
+            { role: "system", content: this.systemPrompt },
+            ...messages.map((m) => ({
+                role: m.role,
+                content: m.content,
+            })),
+        ];
+        let lastError;
+        for (let attempt = 1; attempt <= MAX_RETRY_ATTEMPTS; attempt++) {
+            try {
+                // Bail immediately if already cancelled
+                if (options?.signal?.aborted) {
+                    throw new Error("Cancelled");
+                }
+                const controller = new AbortController(); // fresh controller per attempt
+                const timer = setTimeout(() => controller.abort(), timeoutS * 1000); // seconds -> milliseconds
+                // Link external signal to this request's controller
+                const onAbort = () => controller.abort();
+                options?.signal?.addEventListener("abort", onAbort, { once: true });
+                try {
+                    const params = {
+                        model: this.model,
+                        messages: fullMessages,
+                        temperature,
+                        max_tokens: maxTokens,
+                    };
+                    if (options?.tools && options.tools.length > 0) {
+                        params.tools = options.tools;
+                        params.tool_choice = "auto";
+                    }
+                    const response = await this.client.chat.completions.create(params, {
+                        signal: controller.signal,
+                    });
+                    clearTimeout(timer); // also cleared in the `finally` below
+                    const choice = response.choices[0]; // only the first choice is used
+                    if (!choice) {
+                        throw new Error(`${this.name}: No choices in response`);
+                    }
+                    const message = choice.message;
+                    // Accumulate token usage
+                    if (response.usage) {
+                        this.usage.prompt_tokens += response.usage.prompt_tokens;
+                        this.usage.completion_tokens += response.usage.completion_tokens;
+                        this.usage.total_tokens += response.usage.total_tokens;
+                    }
+                    const result = {
+                        content: message.content ?? "",
+                    };
+                    if (message.tool_calls && message.tool_calls.length > 0) {
+                        result.tool_calls = message.tool_calls;
+                    }
+                    return result;
+                }
+                finally {
+                    clearTimeout(timer);
+                    options?.signal?.removeEventListener("abort", onAbort);
+                }
+            }
+            catch (err) {
+                lastError = err;
+                // Abort errors (from timeout cancellation) should never be retried
+                if (err instanceof Error && err.name === "AbortError") { // NOTE(review): depends on the thrown error's `name`; confirm what the SDK throws on abort
+                    throw err;
+                }
+                // Determine if retryable
+                const isApiError = err instanceof OpenAI.APIError;
+                const status = isApiError ? err.status : undefined;
+                const isNetworkError = !isApiError &&
+                    err instanceof Error &&
+                    "code" in err &&
+                    typeof err.code === "string" &&
+                    /^E(CONN|TIMEOUT|NOTFOUND|RESET)/.test(err.code ?? ""); // NOTE(review): matches "ETIMEOUT" but not "ETIMEDOUT" -- confirm this is intended
+                // Re-throw immediately for external cancellation
+                if (options?.signal?.aborted)
+                    throw err;
+                if (!isRetryableStatus(status) && !isNetworkError) {
+                    throw err;
+                }
+                if (attempt < MAX_RETRY_ATTEMPTS) { // no wait after the final attempt
+                    const baseWait = RETRY_MIN_WAIT_MS * 2 ** (attempt - 1);
+                    const waitMs = Math.min(baseWait, RETRY_MAX_WAIT_MS);
+                    // Add jitter (0-25% of wait time)
+                    const jitter = Math.random() * waitMs * 0.25;
+                    await sleepCancellable(waitMs + jitter, options?.signal);
+                }
+            }
+        }
+        throw (lastError ??
+            new Error(`${this.name}: Max retries (${MAX_RETRY_ATTEMPTS}) exceeded`)); // fallback used only if no error was recorded
+    }
+}
+// ---------------------------------------------------------------------------
+// Utility
+// ---------------------------------------------------------------------------
+function sleepCancellable(ms, signal) { // Resolves after `ms` milliseconds; rejects with Error("Cancelled") if `signal` aborts first.
+    return new Promise((resolve, reject) => {
+        if (signal?.aborted) { // already aborted: reject without starting a timer
+            reject(new Error("Cancelled"));
+            return;
+        }
+        const onAbort = () => {
+            clearTimeout(timer); // `timer` is declared below; this listener is attached only after it is assigned
+            reject(new Error("Cancelled"));
+        };
+        const timer = setTimeout(() => {
+            signal?.removeEventListener("abort", onAbort); // detach the listener once the sleep completes
+            resolve();
+        }, ms);
+        signal?.addEventListener("abort", onAbort, { once: true });
+    });
+}

package/dist/lib/integrations/simulation/agents/index.d.ts ADDED Viewed

@@ -0,0 +1,10 @@
+/**
+ * Agent exports for the simulation framework.
+ */
+export type { AgentConfig, LLMResult } from "./base.js";
+export { BaseAgent } from "./base.js";
+export type { JudgeAgentConfig } from "./judge.js";
+export { DEFAULT_JUDGE_PROMPT, JUDGE_TOOLS, JudgeAgent } from "./judge.js";
+export type { UserSimulatorAgentConfig } from "./user-simulator.js";
+export { DEFAULT_USER_SIMULATOR_PROMPT, UserSimulatorAgent, } from "./user-simulator.js";
+//# sourceMappingURL=index.d.ts.map

package/dist/lib/integrations/simulation/agents/index.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/agents/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,YAAY,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACxD,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AACtC,YAAY,EAAE,gBAAgB,EAAE,MAAM,YAAY,CAAC;AACnD,OAAO,EAAE,oBAAoB,EAAE,WAAW,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAC3E,YAAY,EAAE,wBAAwB,EAAE,MAAM,qBAAqB,CAAC;AACpE,OAAO,EACL,6BAA6B,EAC7B,kBAAkB,GACnB,MAAM,qBAAqB,CAAC"}

package/dist/lib/integrations/simulation/agents/index.js ADDED Viewed

@@ -0,0 +1,6 @@
+/**
+ * Agent exports for the simulation framework.
+ */
+export { BaseAgent } from "./base.js";
+export { DEFAULT_JUDGE_PROMPT, JUDGE_TOOLS, JudgeAgent } from "./judge.js";
+export { DEFAULT_USER_SIMULATOR_PROMPT, UserSimulatorAgent, } from "./user-simulator.js";

package/dist/lib/integrations/simulation/agents/judge.d.ts ADDED Viewed

@@ -0,0 +1,50 @@
+/**
+ * Judge agent for conversation evaluation.
+ *
+ * Evaluates conversations and decides when to terminate based on
+ * goal achievement or rule violations.
+ */
+import type OpenAI from "openai";
+import type { ChatMessage, Criterion, Judgment } from "../types.js";
+import type { AgentConfig } from "./base.js";
+import { BaseAgent } from "./base.js";
+export declare const JUDGE_TOOLS: OpenAI.Chat.Completions.ChatCompletionTool[];
+export declare const DEFAULT_JUDGE_PROMPT = "You are a conversation judge. Your role is to evaluate conversations between a user and an AI agent.\n\nYou will be given:\n1. The conversation history\n2. The user's goal\n3. Criteria that should or should not be satisfied\n\nYour task:\n- Evaluate whether the conversation should continue or end\n- Determine if the user's goal has been achieved\n- Check if any rules/criteria have been violated\n\nDecision rules:\n1. FINISH if the user's goal is clearly achieved\n2. FINISH if any \"must_not_happen\" criteria are violated\n3. CONTINUE if the goal is not yet achieved and no rules are broken\n4. CONTINUE if progress is being made toward the goal\n\nFor EVERY evaluation (continue or finish), also assess the agent's LAST response:\n- response_quality: How helpful, accurate, and complete was the response? (0.0=poor, 1.0=excellent)\n- hallucination_risk: Did the agent make up information not grounded in the conversation? (0.0=none, 1.0=high risk)\n- tone_appropriateness: Was the agent's tone appropriate for the situation? (0.0=inappropriate, 1.0=perfect)\n- factual_accuracy: If GROUND TRUTH is provided below, score how accurate the agent's response is against it (0.0=wrong, 1.0=correct). Skip if no ground truth.\n\nYou MUST call one of the provided tools to make your decision.";
+export interface JudgeAgentConfig extends AgentConfig {
+    goal?: string; // user's goal, inserted into the judge's system prompt; treated as "" when omitted
+    criteria?: Criterion[]; // evaluation criteria listed in the system prompt; treated as [] when omitted
+    groundTruth?: string; // when non-empty, appended to the system prompt for scoring factual_accuracy
+}
+/**
+ * Agent that evaluates conversations and decides termination.
+ *
+ * Uses tool calling to make structured decisions about whether a conversation
+ * should continue or end.
+ *
+ * All constructor configuration is optional; see `JudgeAgentConfig`.
+ */
+export declare class JudgeAgent extends BaseAgent {
+    private goal;
+    private criteria;
+    private groundTruth;
+    constructor(config?: JudgeAgentConfig);
+    get name(): string;
+    get systemPrompt(): string;
+    /**
+     * Evaluate a conversation and decide next action.
+     *
+     * @param messages - Conversation history to evaluate
+     * @param options  - Optional settings; `signal` is an `AbortSignal` that is
+     *   passed through to the underlying LLM call
+     * @returns Judgment with termination decision and reasoning
+     */
+    evaluate(messages: ChatMessage[], options?: {
+        signal?: AbortSignal;
+    }): Promise<Judgment>;
+    private parseJudgment;
+    /**
+     * Extract and clamp quality scores from tool call arguments.
+     */
+    private static extractQualityScores;
+    /**
+     * Format criteria for the system prompt.
+     */
+    private formatCriteria;
+}
+//# sourceMappingURL=judge.d.ts.map

package/dist/lib/integrations/simulation/agents/judge.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"judge.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/agents/judge.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,MAAM,MAAM,QAAQ,CAAC;AAEjC,OAAO,KAAK,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,MAAM,aAAa,CAAC;AAEpE,OAAO,KAAK,EAAE,WAAW,EAAa,MAAM,WAAW,CAAC;AACxD,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAiCtC,eAAO,MAAM,WAAW,EAAE,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,kBAAkB,EA2DnE,CAAC;AAMF,eAAO,MAAM,oBAAoB,+wCAwB8B,CAAC;AAMhE,MAAM,WAAW,gBAAiB,SAAQ,WAAW;IACnD,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,QAAQ,CAAC,EAAE,SAAS,EAAE,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAmBD;;;;;GAKG;AACH,qBAAa,UAAW,SAAQ,SAAS;IACvC,OAAO,CAAC,IAAI,CAAS;IACrB,OAAO,CAAC,QAAQ,CAAc;IAC9B,OAAO,CAAC,WAAW,CAAS;gBAEhB,MAAM,CAAC,EAAE,gBAAgB;IAOrC,IAAI,IAAI,IAAI,MAAM,CAEjB;IAED,IAAI,YAAY,IAAI,MAAM,CASzB;IAED;;;;;OAKG;IACG,QAAQ,CACZ,QAAQ,EAAE,WAAW,EAAE,EACvB,OAAO,CAAC,EAAE;QAAE,MAAM,CAAC,EAAE,WAAW,CAAA;KAAE,GACjC,OAAO,CAAC,QAAQ,CAAC;IAuBpB,OAAO,CAAC,aAAa;IAiGrB;;OAEG;IACH,OAAO,CAAC,MAAM,CAAC,oBAAoB;IAkBnC;;OAEG;IACH,OAAO,CAAC,cAAc;CAuBvB"}

package/dist/lib/integrations/simulation/agents/judge.js ADDED Viewed

@@ -0,0 +1,313 @@
+/**
+ * Judge agent for conversation evaluation.
+ *
+ * Evaluates conversations and decides when to terminate based on
+ * goal achievement or rule violations.
+ */
+import { delimit } from "../utils/sanitize.js";
+import { BaseAgent } from "./base.js";
+// ---------------------------------------------------------------------------
+// Quality score property definitions (shared by both judge tools)
+// ---------------------------------------------------------------------------
+const QUALITY_SCORE_PROPERTIES = { // JSON-schema property definitions spread into the parameters of both JUDGE_TOOLS entries
+    response_quality: {
+        type: "number",
+        description: "Quality of the agent's last response: helpful, accurate, complete (0.0=poor, 1.0=excellent)",
+    },
+    hallucination_risk: {
+        type: "number",
+        description: "Risk that the agent fabricated information not grounded in the conversation (0.0=none, 1.0=high risk)",
+    },
+    tone_appropriateness: {
+        type: "number",
+        description: "How appropriate the agent's tone was for the situation (0.0=inappropriate, 1.0=perfect)",
+    },
+    factual_accuracy: {
+        type: "number",
+        description: "Accuracy of the agent's response against the provided ground truth (0.0=completely wrong, 1.0=fully correct). Only score this if ground truth is provided.",
+    },
+};
+// ---------------------------------------------------------------------------
+// Judge tools for structured decision making
+// ---------------------------------------------------------------------------
+export const JUDGE_TOOLS = [ // the two tools offered to the judge; JudgeAgent.parseJudgment dispatches on their names
+    {
+        type: "function",
+        function: {
+            name: "continue_conversation",
+            description: "Allow the conversation to continue. Use when the goal is not yet achieved and no rules are broken.",
+            parameters: {
+                type: "object",
+                properties: {
+                    reason: {
+                        type: "string",
+                        description: "Brief explanation of why the conversation should continue",
+                    },
+                    ...QUALITY_SCORE_PROPERTIES,
+                },
+                required: ["reason"], // the quality scores are not required
+            },
+        },
+    },
+    {
+        type: "function",
+        function: {
+            name: "finish_conversation",
+            description: "Terminate the conversation. Use when the goal is achieved OR a rule is broken.",
+            parameters: {
+                type: "object",
+                properties: {
+                    reason: {
+                        type: "string",
+                        description: "Explanation of why the conversation should end",
+                    },
+                    goal_achieved: {
+                        type: "boolean",
+                        description: "Whether the user's goal was successfully achieved",
+                    },
+                    rules_broken: {
+                        type: "array",
+                        items: { type: "string" },
+                        description: "List of criteria that were violated (empty if none)",
+                    },
+                    goal_completion_score: {
+                        type: "number",
+                        description: "How much of the goal was achieved, from 0.0 (none) to 1.0 (fully achieved). Use intermediate values for partial completion.",
+                    },
+                    ...QUALITY_SCORE_PROPERTIES,
+                },
+                required: [ // the quality scores are not required here either
+                    "reason",
+                    "goal_achieved",
+                    "rules_broken",
+                    "goal_completion_score",
+                ],
+            },
+        },
+    },
+];
+// ---------------------------------------------------------------------------
+// Default judge system prompt
+// ---------------------------------------------------------------------------
+export const DEFAULT_JUDGE_PROMPT = `You are a conversation judge. Your role is to evaluate conversations between a user and an AI agent.
+You will be given:
+1. The conversation history
+2. The user's goal
+3. Criteria that should or should not be satisfied
+Your task:
+- Evaluate whether the conversation should continue or end
+- Determine if the user's goal has been achieved
+- Check if any rules/criteria have been violated
+Decision rules:
+1. FINISH if the user's goal is clearly achieved
+2. FINISH if any "must_not_happen" criteria are violated
+3. CONTINUE if the goal is not yet achieved and no rules are broken
+4. CONTINUE if progress is being made toward the goal
+For EVERY evaluation (continue or finish), also assess the agent's LAST response:
+- response_quality: How helpful, accurate, and complete was the response? (0.0=poor, 1.0=excellent)
+- hallucination_risk: Did the agent make up information not grounded in the conversation? (0.0=none, 1.0=high risk)
+- tone_appropriateness: Was the agent's tone appropriate for the situation? (0.0=inappropriate, 1.0=perfect)
+- factual_accuracy: If GROUND TRUTH is provided below, score how accurate the agent's response is against it (0.0=wrong, 1.0=correct). Skip if no ground truth.
+You MUST call one of the provided tools to make your decision.`; // fixed prefix of JudgeAgent's systemPrompt; goal, criteria and ground truth are appended after it
+// ---------------------------------------------------------------------------
+// Quality score field names
+// ---------------------------------------------------------------------------
+const QUALITY_SCORE_FIELDS = [ // argument names read by JudgeAgent.extractQualityScores; same names as the keys of QUALITY_SCORE_PROPERTIES
+    "response_quality",
+    "hallucination_risk",
+    "tone_appropriateness",
+    "factual_accuracy",
+];
+// ---------------------------------------------------------------------------
+// JudgeAgent
+// ---------------------------------------------------------------------------
+/**
+ * Agent that evaluates conversations and decides termination.
+ *
+ * Uses tool calling to make structured decisions about whether a conversation
+ * should continue or end.
+ *
+ * Whenever no usable decision can be extracted (no tool call, arguments that
+ * are not a JSON object, or an unknown tool name) the resulting judgment
+ * terminates the conversation with `goal_achieved: false` and a completion
+ * score of 0.0.
+ */
+export class JudgeAgent extends BaseAgent {
+    goal;
+    criteria;
+    groundTruth;
+    constructor(config) {
+        super(config);
+        this.goal = config?.goal ?? "";
+        this.criteria = config?.criteria ?? [];
+        this.groundTruth = config?.groundTruth ?? "";
+    }
+    get name() {
+        return "JudgeAgent";
+    }
+    get systemPrompt() { // DEFAULT_JUDGE_PROMPT followed by the goal, the formatted criteria and, when set, the ground truth
+        const criteriaText = this.formatCriteria();
+        let groundTruthText = "";
+        if (this.groundTruth) {
+            groundTruthText = `\n\nGROUND TRUTH (use this to score factual_accuracy):\n${delimit(this.groundTruth)}`;
+        }
+        return `${DEFAULT_JUDGE_PROMPT}\n\n---\n\nUSER'S GOAL: ${delimit(this.goal)}\n\nEVALUATION CRITERIA:\n${criteriaText}${groundTruthText}`;
+    }
+    /**
+     * Evaluate a conversation and decide next action.
+     *
+     * Appends a final user message asking for a decision, then calls the LLM
+     * at temperature 0.0 with `JUDGE_TOOLS` available.
+     *
+     * @param messages - Conversation history to evaluate
+     * @param options  - Optional settings; `signal` is forwarded to `callLLM`
+     * @returns Judgment with termination decision and reasoning
+     */
+    async evaluate(messages, options) {
+        const evalMessages = [
+            ...messages,
+            {
+                role: "user",
+                content: "Evaluate the conversation above. Should it continue or end? Use the appropriate tool.",
+            },
+        ];
+        const result = await this.callLLM(evalMessages, {
+            temperature: 0.0,
+            tools: JUDGE_TOOLS,
+            signal: options?.signal,
+        });
+        return this.parseJudgment(result);
+    }
+    // ---------------------------------------------------------------------------
+    // Private helpers
+    // ---------------------------------------------------------------------------
+    parseJudgment(result) { // Converts an LLM result into a judgment object; only the first tool call is inspected.
+        const toolCalls = result.tool_calls;
+        if (!toolCalls || toolCalls.length === 0) {
+            const content = (result.content ?? "").slice(0, 200); // log at most 200 characters of the reply
+            console.warn(`JudgeAgent: No tool call in response (LLM may have failed). ` +
+                `Content: ${JSON.stringify(content)}. Defaulting to TERMINATE to prevent runaway conversations.`);
+            return {
+                should_terminate: true,
+                reason: "Judge failed to make explicit decision - terminating for safety",
+                goal_achieved: false,
+                rules_broken: [],
+                goal_completion_score: 0.0,
+            };
+        }
+        const toolCall = toolCalls[0];
+        const functionName = toolCall.function.name;
+        const argumentsStr = toolCall.function.arguments;
+        let args;
+        try {
+            const parsed = JSON.parse(argumentsStr);
+            if (typeof parsed !== "object" ||
+                parsed === null ||
+                Array.isArray(parsed)) { // valid JSON that is not a plain object is rejected too
+                throw new TypeError(`Expected object, got ${typeof parsed}`);
+            }
+            args = parsed;
+        }
+        catch (err) {
+            console.error(`JudgeAgent: Failed to parse tool arguments: ${String(err)} (raw: ${JSON.stringify(argumentsStr)})`);
+            return {
+                should_terminate: true,
+                reason: "Failed to parse judgment decision - terminating for safety",
+                goal_achieved: false,
+                rules_broken: [],
+                goal_completion_score: 0.0,
+            };
+        }
+        // Extract quality scores (shared by both tools)
+        const qualityScores = JudgeAgent.extractQualityScores(args);
+        if (functionName === "continue_conversation") {
+            return {
+                should_terminate: false,
+                reason: typeof args.reason === "string" ? args.reason : "",
+                goal_achieved: false,
+                rules_broken: [],
+                goal_completion_score: 0.0,
+                ...qualityScores,
+            };
+        }
+        if (functionName === "finish_conversation") {
+            const goalAchieved = typeof args.goal_achieved === "boolean" ? args.goal_achieved : false;
+            // Clamp goal_completion_score to [0.0, 1.0]
+            const rawScore = args.goal_completion_score;
+            const defaultScore = goalAchieved ? 1.0 : 0.0; // used when the score is missing or not numeric
+            const goalCompletionScore = clamp(toNumber(rawScore, defaultScore));
+            const rulesBroken = Array.isArray(args.rules_broken)
+                ? args.rules_broken.map(String)
+                : [];
+            return {
+                should_terminate: true,
+                reason: typeof args.reason === "string" ? args.reason : "",
+                goal_achieved: goalAchieved,
+                rules_broken: rulesBroken,
+                goal_completion_score: goalCompletionScore,
+                ...qualityScores,
+            };
+        }
+        // Unknown function -- terminate for safety
+        console.warn(`JudgeAgent: Unknown function ${functionName} - terminating for safety`);
+        return {
+            should_terminate: true,
+            reason: `Unknown function '${functionName}' - terminating for safety`,
+            goal_achieved: false,
+            rules_broken: [],
+            goal_completion_score: 0.0,
+        };
+    }
+    /**
+     * Extract and clamp quality scores from tool call arguments.
+     *
+     * @param args - Parsed tool-call arguments object
+     * @returns Object holding only those `QUALITY_SCORE_FIELDS` entries that are
+     *   present (not `undefined`/`null`) and convert to a non-NaN number via
+     *   `Number()`, each clamped to [0.0, 1.0]
+     */
+    static extractQualityScores(args) {
+        const scores = {};
+        for (const field of QUALITY_SCORE_FIELDS) {
+            const raw = args[field];
+            if (raw !== undefined && raw !== null) {
+                const num = Number(raw);
+                if (!Number.isNaN(num)) {
+                    scores[field] = clamp(num);
+                }
+            }
+        }
+        return scores;
+    }
+    /**
+     * Format criteria for the system prompt.
+     *
+     * Criteria are grouped under "MUST HAPPEN" and "MUST NOT HAPPEN" headings
+     * by their `type`; criteria with any other `type` are left out.
+     *
+     * @returns The formatted text, or "No specific criteria defined." when
+     *   there are no criteria or none of them produced any text
+     */
+    formatCriteria() {
+        if (this.criteria.length === 0) {
+            return "No specific criteria defined.";
+        }
+        const mustHappen = this.criteria
+            .filter((c) => c.type === "must_happen")
+            .map((c) => delimit(c.description));
+        const mustNot = this.criteria
+            .filter((c) => c.type === "must_not_happen")
+            .map((c) => delimit(c.description));
+        let text = "";
+        if (mustHappen.length > 0) {
+            text += `MUST HAPPEN:\n${mustHappen.map((c) => `- ${c}`).join("\n")}\n\n`;
+        }
+        if (mustNot.length > 0) {
+            text += `MUST NOT HAPPEN:\n${mustNot.map((c) => `- ${c}`).join("\n")}`;
+        }
+        return text.trim() || "No specific criteria defined.";
+    }
+}
+// ---------------------------------------------------------------------------
+// Utility helpers
+// ---------------------------------------------------------------------------
+/**
+ * Clamp a number to [0.0, 1.0].
+ *
+ * @param value - Number to clamp
+ * @returns `value` limited to the range [0.0, 1.0]; a NaN input yields NaN,
+ *   so the callers in this file filter NaN out before calling
+ */
+function clamp(value) {
+    return Math.max(0.0, Math.min(1.0, value));
+}
+/**
+ * Safely convert an unknown value to a number, falling back to a default.
+ *
+ * @param value    - Value to convert; only numbers and strings are considered
+ * @param fallback - Returned when `value` is NaN, a string that `Number()`
+ *   turns into NaN, or any other type
+ * @returns The numeric value or `fallback`. Note that `Number("")` is 0, so
+ *   an empty string yields 0 rather than `fallback`.
+ */
+function toNumber(value, fallback) {
+    if (typeof value === "number" && !Number.isNaN(value))
+        return value;
+    if (typeof value === "string") {
+        const n = Number(value);
+        if (!Number.isNaN(n))
+            return n;
+    }
+    return fallback;
+}

package/dist/lib/integrations/simulation/agents/user-simulator.d.ts ADDED Viewed

@@ -0,0 +1,41 @@
+/**
+ * User simulator agent.
+ *
+ * Simulates user behavior based on a persona and scenario,
+ * generating realistic user messages in conversations.
+ */
+import type { ChatMessage } from "../types.js";
+import type { AgentConfig } from "./base.js";
+import { BaseAgent } from "./base.js";
+export declare const DEFAULT_USER_SIMULATOR_PROMPT = "You are a user simulator. Your role is to simulate realistic user behavior in a conversation with an AI agent.\n\nYou will be given:\n1. A persona describing who you are and how you behave\n2. A scenario describing your goal and context\n\nYour task:\n- Generate realistic user messages based on your persona and scenario\n- Stay in character throughout the conversation\n- Work towards achieving your goal naturally\n- React authentically to the agent's responses\n- Do not break character or acknowledge that you are a simulation\n\nResponse format:\n- Respond only with the user's message\n- Do not include any meta-commentary or explanations\n- Keep responses natural and conversational";
+export interface UserSimulatorAgentConfig extends AgentConfig { // adds one optional field on top of AgentConfig
+    /** Custom system prompt to append to the default prompt. */
+    systemPrompt?: string;
+}
+/**
+ * Agent that simulates user behavior.
+ *
+ * Uses a persona and scenario to generate realistic user messages
+ * in a conversation with the agent being tested.
+ *
+ * The constructor's `config` argument is optional.
+ */
+export declare class UserSimulatorAgent extends BaseAgent {
+    private customSystemPrompt;
+    constructor(config?: UserSimulatorAgentConfig);
+    get name(): string;
+    get systemPrompt(): string;
+    /**
+     * Generate the first message to start a conversation.
+     *
+     * @param messages - Optional context messages
+     * @returns First user message to start the conversation
+     */
+    generateFirstMessage(messages?: ChatMessage[]): Promise<string>;
+    /**
+     * Update the persona and scenario context.
+     *
+     * Both arguments are optional. NOTE(review): how an omitted argument is
+     * treated is not visible in this declaration file -- confirm against the
+     * implementation.
+     *
+     * @param personaContext  - Persona-specific context
+     * @param scenarioContext - Scenario-specific context
+     */
+    updateContext(personaContext?: string, scenarioContext?: string): void;
+}
+//# sourceMappingURL=user-simulator.d.ts.map

package/dist/lib/integrations/simulation/agents/user-simulator.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"user-simulator.d.ts","sourceRoot":"","sources":["../../../../../src/lib/integrations/simulation/agents/user-simulator.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAC/C,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,WAAW,CAAC;AAC7C,OAAO,EAAE,SAAS,EAAE,MAAM,WAAW,CAAC;AAMtC,eAAO,MAAM,6BAA6B,urBAgBE,CAAC;AAM7C,MAAM,WAAW,wBAAyB,SAAQ,WAAW;IAC3D,4DAA4D;IAC5D,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAMD;;;;;GAKG;AACH,qBAAa,kBAAmB,SAAQ,SAAS;IAC/C,OAAO,CAAC,kBAAkB,CAAgB;gBAE9B,MAAM,CAAC,EAAE,wBAAwB;IAK7C,IAAI,IAAI,IAAI,MAAM,CAEjB;IAED,IAAI,YAAY,IAAI,MAAM,CAKzB;IAED;;;;;OAKG;IACG,oBAAoB,CAAC,QAAQ,CAAC,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAWrE;;;;;OAKG;IACH,aAAa,CAAC,cAAc,CAAC,EAAE,MAAM,EAAE,eAAe,CAAC,EAAE,MAAM,GAAG,IAAI;CAWvE"}