@iqai/adk 0.1.22 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/dist/index.d.mts +537 -346
- package/dist/index.d.ts +537 -346
- package/dist/index.js +1554 -206
- package/dist/index.mjs +1461 -113
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
@@ -833,70 +833,23 @@ ${instructions.join("\n\n")}`;
 
 // src/models/llm-response.ts
 var LlmResponse = class _LlmResponse {
-  /**
-   * Unique identifier for the response.
-   */
   id;
-  /**
-   * The content generated by the model.
-   */
+  text;
   content;
-  /**
-   * The grounding metadata of the response.
-   */
   groundingMetadata;
-  /**
-   * Indicates whether the text content is part of an unfinished text stream.
-   */
   partial;
-  /**
-   * Indicates whether the response from the model is complete.
-   */
   turnComplete;
-  /**
-   * Error code if the response is an error.
-   */
   errorCode;
-  /**
-   * Error message if the response is an error.
-   */
   errorMessage;
-  /**
-   * Flag indicating that LLM was interrupted when generating the content.
-   */
   interrupted;
-  /**
-   * The custom metadata of the LlmResponse.
-   */
   customMetadata;
-  /**
-   * The usage metadata of the LlmResponse.
-   */
   usageMetadata;
-  /**
-   * Index of the candidate response.
-   */
   candidateIndex;
-  /**
-   * Reason why the model finished generating.
-   */
   finishReason;
-  /**
-   * Error object if the response is an error.
-   */
   error;
-  /**
-   * Creates a new LlmResponse.
-   */
   constructor(data = {}) {
     Object.assign(this, data);
   }
-  /**
-   * Creates an LlmResponse from a GenerateContentResponse.
-   *
-   * @param generateContentResponse The GenerateContentResponse to create the LlmResponse from.
-   * @returns The LlmResponse.
-   */
   static create(generateContentResponse) {
     const usageMetadata = generateContentResponse.usageMetadata;
     if (generateContentResponse.candidates && generateContentResponse.candidates.length > 0) {
@@ -928,15 +881,6 @@ var LlmResponse = class _LlmResponse {
       usageMetadata
     });
   }
-  /**
-   * Creates an LlmResponse from an error.
-   *
-   * @param error The error object or message.
-   * @param options Additional options for the error response.
-   * @param options.errorCode A specific error code for the response.
-   * @param options.model The model that was being used when the error occurred.
-   * @returns The LlmResponse.
-   */
   static fromError(error, options = {}) {
     const errorMessage = error instanceof Error ? error.message : String(error);
     const errorCode = options.errorCode || "UNKNOWN_ERROR";
@@ -2675,30 +2619,16 @@ var OpenAiLlm = class extends BaseLlm {
 // src/models/llm-registry.ts
 init_logger();
 var LLMRegistry = class _LLMRegistry {
-  /**
-   * Map of model name regex to LLM class
-   */
   static llmRegistry = /* @__PURE__ */ new Map();
+  static modelInstances = /* @__PURE__ */ new Map();
   static logger = new Logger({ name: "LLMRegistry" });
-  /**
-   * Creates a new LLM instance
-   *
-   * @param model The model name
-   * @returns The LLM instance
-   */
   static newLLM(model) {
     const llmClass = _LLMRegistry.resolve(model);
     if (!llmClass) {
-      throw new Error(`No LLM found for model: ${model}`);
+      throw new Error(`No LLM class found for model: ${model}`);
     }
     return new llmClass(model);
   }
-  /**
-   * Resolves the LLM class from the model name
-   *
-   * @param model The model name
-   * @returns The LLM class
-   */
   static resolve(model) {
     for (const [regex, llmClass] of _LLMRegistry.llmRegistry.entries()) {
       if (regex.test(model)) {
@@ -2707,34 +2637,54 @@ var LLMRegistry = class _LLMRegistry {
     }
     return null;
   }
-  /**
-   * Registers a new LLM class
-   *
-   * @param modelNameRegex The regex to match model names
-   * @param llmClass The LLM class
-   */
   static register(modelNameRegex, llmClass) {
     _LLMRegistry.llmRegistry.set(new RegExp(modelNameRegex), llmClass);
   }
-  /**
-   * Registers all model patterns from an LLM class
-   *
-   * @param llmClass The LLM class
-   */
   static registerLLM(llmClass) {
     const modelPatterns = llmClass.supportedModels();
     for (const pattern of modelPatterns) {
       _LLMRegistry.register(pattern, llmClass);
     }
   }
-
-
-
+  static registerModel(name, model) {
+    _LLMRegistry.modelInstances.set(name, model);
+  }
+  static getModel(name) {
+    const model = _LLMRegistry.modelInstances.get(name);
+    if (!model) {
+      throw new Error(`Model '${name}' not found in registry`);
+    }
+    return model;
+  }
+  static hasModel(name) {
+    return _LLMRegistry.modelInstances.has(name);
+  }
+  static unregisterModel(name) {
+    _LLMRegistry.modelInstances.delete(name);
+  }
+  static getModelOrCreate(name) {
+    if (_LLMRegistry.hasModel(name)) {
+      return _LLMRegistry.getModel(name);
+    }
+    return _LLMRegistry.newLLM(name);
+  }
+  static clear() {
+    _LLMRegistry.llmRegistry.clear();
+    _LLMRegistry.modelInstances.clear();
+  }
+  static clearModels() {
+    _LLMRegistry.modelInstances.clear();
+  }
+  static clearClasses() {
+    _LLMRegistry.llmRegistry.clear();
+  }
   static logRegisteredModels() {
-    _LLMRegistry.
-
-      [..._LLMRegistry.llmRegistry.entries()].map(([regex]) => regex.toString())
+    const classPatterns = [..._LLMRegistry.llmRegistry.entries()].map(
+      ([regex]) => regex.toString()
     );
+    const instanceNames = [..._LLMRegistry.modelInstances.keys()];
+    _LLMRegistry.logger.debug("Registered LLM class patterns:", classPatterns);
+    _LLMRegistry.logger.debug("Registered LLM instances:", instanceNames);
   }
 };
 
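In practice the new instance registry sits next to the existing class-pattern registry. A minimal usage sketch (illustrative only; it assumes LLMRegistry is exported from the package entry point and that someConfiguredModel is a model instance you already built):

// Register a pre-built model instance under a name, then reuse it everywhere.
LLMRegistry.registerModel("my-judge", someConfiguredModel);
const m = LLMRegistry.getModelOrCreate("my-judge"); // returns the registered instance
const fresh = LLMRegistry.getModelOrCreate("gpt-4"); // no instance registered -> falls back to newLLM()
LLMRegistry.unregisterModel("my-judge");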
@@ -6582,9 +6532,23 @@ var BaseLlmFlow = class {
         yield event;
       }
     }
-    const tools = await agent.canonicalTools(
+    let tools = await agent.canonicalTools(
       new ReadonlyContext(invocationContext)
     );
+    if (tools.length > 1) {
+      const seen = /* @__PURE__ */ new Set();
+      const filtered = [];
+      for (const t of tools) {
+        const name = t?.name;
+        if (!name) continue;
+        if (seen.has(name)) {
+          continue;
+        }
+        seen.add(name);
+        filtered.push(t);
+      }
+      tools = filtered;
+    }
     for (const tool of tools) {
       const toolContext = new ToolContext(invocationContext);
       await tool.processLlmRequest(toolContext, llmRequest);
@@ -6740,7 +6704,42 @@ var BaseLlmFlow = class {
     }
     invocationContext.incrementLlmCallCount();
     const isStreaming = invocationContext.runConfig.streamingMode === "sse" /* SSE */;
-    const tools = llmRequest.config?.tools || [];
+    let tools = llmRequest.config?.tools || [];
+    if (tools.length) {
+      const deduped = [];
+      const seenFn = /* @__PURE__ */ new Set();
+      for (const t of tools) {
+        const tool = t;
+        if (tool && Array.isArray(tool.functionDeclarations)) {
+          const newFds = tool.functionDeclarations.filter(
+            (fd) => {
+              if (fd?.name) {
+                if (seenFn.has(fd.name)) {
+                  return false;
+                }
+                seenFn.add(fd.name);
+              }
+              return true;
+            }
+          );
+          if (newFds.length) {
+            deduped.push({ ...tool, functionDeclarations: newFds });
+          }
+        } else if (tool?.name) {
+          if (seenFn.has(tool.name)) continue;
+          seenFn.add(tool.name);
+          deduped.push(tool);
+        } else {
+          deduped.push(tool);
+        }
+      }
+      if (deduped.length !== tools.length) {
+        this.logger.debug(
+          `\u{1F501} Deduplicated tool/function declarations: ${tools.length} -> ${deduped.length}`
+        );
+      }
+      llmRequest.config.tools = tools = deduped;
+    }
     const toolNames = tools.map((tool) => {
       if (tool.functionDeclarations && Array.isArray(tool.functionDeclarations)) {
         return tool.functionDeclarations.map((fn) => fn.name).join(", ");
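Both dedup passes above keep the first occurrence of each tool or function-declaration name so duplicates never reach the provider. The same idea in isolation (standalone sketch, not the package's code):

// Keep the first tool seen under each name; unnamed entries are dropped,
// mirroring the canonical-tools filter above.
function dedupeByName(tools) {
  const seen = new Set();
  return tools.filter((t) => {
    const name = t?.name;
    if (!name || seen.has(name)) return false;
    seen.add(name);
    return true;
  });
}
// dedupeByName([{ name: "search" }, { name: "search" }, { name: "math" }])
// -> [{ name: "search" }, { name: "math" }]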
@@ -9555,6 +9554,7 @@ var LangGraphAgent = class extends BaseAgent {
 };
 
 // src/agents/agent-builder.ts
+init_logger();
 import { generateId } from "ai";
 
 // src/runners.ts
@@ -9668,19 +9668,19 @@ var InMemoryArtifactService = class {
   }
   async saveArtifact(args) {
     const { appName, userId, sessionId, filename, artifact } = args;
-    const
-    if (!this.artifacts.has(
-      this.artifacts.set(
+    const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+    if (!this.artifacts.has(path3)) {
+      this.artifacts.set(path3, []);
     }
-    const versions = this.artifacts.get(
+    const versions = this.artifacts.get(path3);
     const version = versions.length;
     versions.push(artifact);
     return version;
   }
   async loadArtifact(args) {
     const { appName, userId, sessionId, filename, version } = args;
-    const
-    const versions = this.artifacts.get(
+    const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+    const versions = this.artifacts.get(path3);
     if (!versions || versions.length === 0) {
       return null;
     }
@@ -9701,12 +9701,12 @@ var InMemoryArtifactService = class {
     const sessionPrefix = `${appName}/${userId}/${sessionId}/`;
     const userNamespacePrefix = `${appName}/${userId}/user/`;
     const filenames = [];
-    for (const
-      if (
-        const filename =
+    for (const path3 of this.artifacts.keys()) {
+      if (path3.startsWith(sessionPrefix)) {
+        const filename = path3.substring(sessionPrefix.length);
         filenames.push(filename);
-      } else if (
-        const filename =
+      } else if (path3.startsWith(userNamespacePrefix)) {
+        const filename = path3.substring(userNamespacePrefix.length);
         filenames.push(filename);
       }
     }
@@ -9714,16 +9714,16 @@ var InMemoryArtifactService = class {
   }
   async deleteArtifact(args) {
     const { appName, userId, sessionId, filename } = args;
-    const
-    if (!this.artifacts.has(
+    const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+    if (!this.artifacts.has(path3)) {
      return;
     }
-    this.artifacts.delete(
+    this.artifacts.delete(path3);
   }
   async listVersions(args) {
     const { appName, userId, sessionId, filename } = args;
-    const
-    const versions = this.artifacts.get(
+    const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+    const versions = this.artifacts.get(path3);
     if (!versions || versions.length === 0) {
       return [];
     }
@@ -10193,7 +10193,7 @@ var Runner = class {
       }
     };
     invokeRunAsync();
-    return function* () {
+    return (function* () {
      while (true) {
        while (queueIndex >= eventQueue.length && !asyncCompleted) {
        }
@@ -10206,7 +10206,7 @@ var Runner = class {
        }
        yield event;
      }
-    }();
+    })();
   }
   /**
    * Main entry method to run the agent in this runner.
@@ -10406,6 +10406,12 @@ var AgentBuilder = class _AgentBuilder {
   artifactService;
   agentType = "llm";
   existingSession;
+  existingAgent;
+  // If provided, reuse directly
+  definitionLocked = false;
+  // Lock further definition mutation after withAgent
+  warnedMethods = /* @__PURE__ */ new Set();
+  logger = new Logger({ name: "AgentBuilder" });
   /**
    * Private constructor - use static create() method
    */
@@ -10434,6 +10440,7 @@ var AgentBuilder = class _AgentBuilder {
    * @returns This builder instance for chaining
    */
   withModel(model) {
+    this.warnIfLocked("withModel");
     this.config.model = model;
     return this;
   }
@@ -10443,6 +10450,7 @@
    * @returns This builder instance for chaining
    */
   withDescription(description) {
+    this.warnIfLocked("withDescription");
     this.config.description = description;
     return this;
   }
@@ -10452,14 +10460,17 @@
    * @returns This builder instance for chaining
    */
   withInstruction(instruction) {
+    this.warnIfLocked("withInstruction");
     this.config.instruction = instruction;
     return this;
   }
   withInputSchema(schema) {
+    this.warnIfLocked("withInputSchema");
     this.config.inputSchema = schema;
     return this;
   }
   withOutputSchema(schema) {
+    this.warnIfLocked("withOutputSchema");
     this.config.outputSchema = schema;
     return this;
   }
@@ -10469,6 +10480,7 @@
    * @returns This builder instance for chaining
    */
   withTools(...tools) {
+    this.warnIfLocked("withTools");
     this.config.tools = [...this.config.tools || [], ...tools];
     return this;
   }
@@ -10478,6 +10490,7 @@
    * @returns This builder instance for chaining
    */
   withPlanner(planner) {
+    this.warnIfLocked("withPlanner");
     this.config.planner = planner;
     return this;
   }
@@ -10487,6 +10500,7 @@
    * @returns This builder instance for chaining
    */
   withCodeExecutor(codeExecutor) {
+    this.warnIfLocked("withCodeExecutor");
     this.config.codeExecutor = codeExecutor;
     return this;
   }
@@ -10496,6 +10510,7 @@
    * @returns This builder instance for chaining
    */
   withOutputKey(outputKey) {
+    this.warnIfLocked("withOutputKey");
     this.config.outputKey = outputKey;
     return this;
   }
@@ -10505,6 +10520,7 @@
    * @returns This builder instance for chaining
    */
   withSubAgents(subAgents) {
+    this.warnIfLocked("withSubAgents");
     this.config.subAgents = subAgents;
     return this;
   }
@@ -10514,6 +10530,7 @@
    * @returns This builder instance for chaining
    */
   withBeforeAgentCallback(callback) {
+    this.warnIfLocked("withBeforeAgentCallback");
     this.config.beforeAgentCallback = callback;
     return this;
   }
@@ -10523,15 +10540,29 @@
    * @returns This builder instance for chaining
    */
   withAfterAgentCallback(callback) {
+    this.warnIfLocked("withAfterAgentCallback");
     this.config.afterAgentCallback = callback;
     return this;
   }
+  /**
+   * Provide an already constructed agent instance. Further definition-mutating calls
+   * (model/tools/instruction/etc.) will be ignored with a dev warning.
+   */
+  withAgent(agent) {
+    this.existingAgent = agent;
+    this.definitionLocked = true;
+    if (this.config.name === "default_agent" && agent.name) {
+      this.config.name = agent.name;
+    }
+    return this;
+  }
   /**
    * Configure as a sequential agent
    * @param subAgents Sub-agents to execute in sequence
    * @returns This builder instance for chaining
    */
   asSequential(subAgents) {
+    this.warnIfLocked("asSequential");
     this.agentType = "sequential";
     this.config.subAgents = subAgents;
     return this;
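Typical use of the new withAgent path (sketch; prebuiltAgent is assumed to be an agent you constructed elsewhere — create() and build() are confirmed by this file):

const { runner } = await AgentBuilder.create("support_session")
  .withAgent(prebuiltAgent) // builder adopts prebuiltAgent.name while it still has the default name
  .withModel("gemini-2.5-flash") // ignored: definition is locked, logs a one-time dev warning
  .build();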
@@ -10542,6 +10573,7 @@
    * @returns This builder instance for chaining
    */
   asParallel(subAgents) {
+    this.warnIfLocked("asParallel");
     this.agentType = "parallel";
     this.config.subAgents = subAgents;
     return this;
@@ -10553,6 +10585,7 @@
    * @returns This builder instance for chaining
    */
   asLoop(subAgents, maxIterations = 3) {
+    this.warnIfLocked("asLoop");
     this.agentType = "loop";
     this.config.subAgents = subAgents;
     this.config.maxIterations = maxIterations;
@@ -10565,6 +10598,7 @@
    * @returns This builder instance for chaining
    */
   asLangGraph(nodes, rootNode) {
+    this.warnIfLocked("asLangGraph");
     this.agentType = "langgraph";
     this.config.nodes = nodes;
     this.config.rootNode = rootNode;
@@ -10691,6 +10725,7 @@
    * @returns Created agent instance
    */
   createAgent() {
+    if (this.existingAgent) return this.existingAgent;
     switch (this.agentType) {
       case "llm": {
         if (!this.config.model) {
@@ -10821,6 +10856,22 @@
       }
     };
   }
+  /**
+   * Warn (once per method) if the definition has been locked by withAgent().
+   */
+  warnIfLocked(method) {
+    if (!this.definitionLocked) return;
+    if (this.warnedMethods.has(method)) return;
+    this.warnedMethods.add(method);
+    if (process.env.NODE_ENV !== "production") {
+      const msg = `AgentBuilder: attempted to call ${method} after withAgent(); ignoring. (Wrap the agent first OR configure before withAgent).`;
+      if (this.logger && typeof this.logger.warn === "function") {
+        this.logger.warn(msg);
+      } else {
+        console.warn(msg);
+      }
+    }
+  }
 };
 
 // src/memory/index.ts
@@ -10985,14 +11036,14 @@ var VertexAiSessionService = class extends BaseSessionService {
   async listSessions(appName, userId) {
     const reasoningEngineId = this.getReasoningEngineId(appName);
     const apiClient = this.getApiClient();
-    let
+    let path3 = `reasoningEngines/${reasoningEngineId}/sessions`;
     if (userId) {
       const parsedUserId = encodeURIComponent(`"${userId}"`);
-
+      path3 = `${path3}?filter=user_id=${parsedUserId}`;
     }
     const apiResponse = await apiClient.async_request({
       http_method: "GET",
-      path:
+      path: path3,
       request_dict: {}
     });
     if (apiResponse.httpHeaders) {
@@ -11808,12 +11859,1299 @@ __export(flows_exports, {
   removeClientFunctionCallId: () => removeClientFunctionCallId
 });
 
+// src/evaluation/index.ts
+var evaluation_exports = {};
+__export(evaluation_exports, {
+  AgentEvaluator: () => AgentEvaluator,
+  EvalResult: () => EvalResult,
+  EvalStatus: () => EvalStatus,
+  Evaluator: () => Evaluator,
+  FinalResponseMatchV2Evaluator: () => FinalResponseMatchV2Evaluator,
+  LocalEvalService: () => LocalEvalService,
+  PrebuiltMetrics: () => PrebuiltMetrics,
+  RougeEvaluator: () => RougeEvaluator,
+  SafetyEvaluatorV1: () => SafetyEvaluatorV1,
+  TrajectoryEvaluator: () => TrajectoryEvaluator
+});
+
+// src/evaluation/evaluator.ts
+var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
+  EvalStatus2[EvalStatus2["PASSED"] = 1] = "PASSED";
+  EvalStatus2[EvalStatus2["FAILED"] = 2] = "FAILED";
+  EvalStatus2[EvalStatus2["NOT_EVALUATED"] = 3] = "NOT_EVALUATED";
+  return EvalStatus2;
+})(EvalStatus || {});
+var Evaluator = class {
+  constructor(metric) {
+    this.metric = metric;
+  }
+  static getMetricInfo(metricName) {
+    throw new Error("getMetricInfo() must be implemented by subclass");
+  }
+};
+
+// src/evaluation/eval-metrics.ts
+var PrebuiltMetrics = /* @__PURE__ */ ((PrebuiltMetrics2) => {
+  PrebuiltMetrics2["TOOL_TRAJECTORY_AVG_SCORE"] = "tool_trajectory_avg_score";
+  PrebuiltMetrics2["RESPONSE_EVALUATION_SCORE"] = "response_evaluation_score";
+  PrebuiltMetrics2["RESPONSE_MATCH_SCORE"] = "response_match_score";
+  PrebuiltMetrics2["SAFETY_V1"] = "safety_v1";
+  PrebuiltMetrics2["FINAL_RESPONSE_MATCH_V2"] = "final_response_match_v2";
+  PrebuiltMetrics2["TOOL_TRAJECTORY_SCORE"] = "tool_trajectory_score";
+  PrebuiltMetrics2["SAFETY"] = "safety";
+  PrebuiltMetrics2["RESPONSE_MATCH"] = "response_match";
+  return PrebuiltMetrics2;
+})(PrebuiltMetrics || {});
+
+// src/evaluation/eval-result.ts
+var EvalResult = class {
+  evalSetResultId;
+  evalSetResultName;
+  evalSetId;
+  evalCaseResults;
+  creationTimestamp;
+  constructor(init) {
+    this.evalSetResultId = init.evalSetResultId || "";
+    this.evalSetResultName = init.evalSetResultName;
+    this.evalSetId = init.evalSetId || "";
+    this.evalCaseResults = init.evalCaseResults || [];
+    this.creationTimestamp = init.creationTimestamp || Date.now() / 1e3;
+  }
+};
+
+// src/evaluation/agent-evaluator.ts
+import * as fs2 from "fs/promises";
+import * as path2 from "path";
+
+// src/evaluation/base-eval-service.ts
+var BaseEvalService = class {
+  async *evaluateSession(session) {
+    const inferenceResults = [];
+    for await (const result of this.performInference({
+      evalSetId: session.evalSetId,
+      evalCases: session.evalCases
+    })) {
+      inferenceResults.push(result);
+    }
+    for await (const result of this.evaluate({
+      inferenceResults,
+      evaluateConfig: session.evaluateConfig
+    })) {
+      yield result;
+    }
+  }
+};
+
+// src/evaluation/vertex-ai-eval-facade.ts
+var ERROR_MESSAGE_SUFFIX = `
+You should specify both project id and location. This metric uses Vertex Gen AI
+Eval SDK, and it requires google cloud credentials.
+
+If using an .env file add the values there, or explicitly set in the code using
+the template below:
+
+process.env.GOOGLE_CLOUD_LOCATION = <LOCATION>
+process.env.GOOGLE_CLOUD_PROJECT = <PROJECT ID>
+`;
+var VertexAiEvalFacade = class _VertexAiEvalFacade {
+  threshold;
+  metricName;
+  constructor(config) {
+    this.threshold = config.threshold;
+    this.metricName = config.metricName;
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    let totalScore = 0;
+    let numInvocations = 0;
+    const perInvocationResults = [];
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      const prompt = this._getText(expected.userContent);
+      const reference = this._getText(expected.finalResponse);
+      const response = this._getText(actual.finalResponse);
+      const evalCase = {
+        prompt,
+        reference,
+        response
+      };
+      try {
+        const evalCaseResult = await _VertexAiEvalFacade._performEval(
+          [evalCase],
+          [this.metricName]
+        );
+        const score = this._getScore(evalCaseResult);
+        perInvocationResults.push({
+          actualInvocation: actual,
+          expectedInvocation: expected,
+          score,
+          evalStatus: this._getEvalStatus(score)
+        });
+        if (score !== null && score !== void 0) {
+          totalScore += score;
+          numInvocations++;
+        }
+      } catch (error) {
+        console.error("Error evaluating invocation:", error);
+        perInvocationResults.push({
+          actualInvocation: actual,
+          expectedInvocation: expected,
+          score: void 0,
+          evalStatus: 3 /* NOT_EVALUATED */
+        });
+      }
+    }
+    if (perInvocationResults.length > 0) {
+      const overallScore = numInvocations > 0 ? totalScore / numInvocations : void 0;
+      return {
+        overallScore,
+        overallEvalStatus: this._getEvalStatus(overallScore),
+        perInvocationResults
+      };
+    }
+    return {
+      overallScore: void 0,
+      overallEvalStatus: 3 /* NOT_EVALUATED */,
+      perInvocationResults: []
+    };
+  }
+  _getText(content) {
+    if (content?.parts) {
+      return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
+    }
+    return "";
+  }
+  _getScore(evalResult) {
+    if (evalResult?.summaryMetrics?.[0]?.meanScore !== void 0 && typeof evalResult.summaryMetrics[0].meanScore === "number" && !Number.isNaN(evalResult.summaryMetrics[0].meanScore)) {
+      return evalResult.summaryMetrics[0].meanScore;
+    }
+    return void 0;
+  }
+  _getEvalStatus(score) {
+    if (score !== null && score !== void 0) {
+      return score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+    }
+    return 3 /* NOT_EVALUATED */;
+  }
+  static async _performEval(dataset, metrics) {
+    const projectId = process.env.GOOGLE_CLOUD_PROJECT;
+    const location = process.env.GOOGLE_CLOUD_LOCATION;
+    if (!projectId) {
+      throw new Error(`Missing project id. ${ERROR_MESSAGE_SUFFIX}`);
+    }
+    if (!location) {
+      throw new Error(`Missing location. ${ERROR_MESSAGE_SUFFIX}`);
+    }
+    console.warn(
+      "Vertex AI evaluation is not fully implemented. Using mock response."
+    );
+    return {
+      summaryMetrics: [
+        {
+          meanScore: Math.random() * 0.5 + 0.5
+        }
+      ]
+    };
+  }
+};
+
+// src/evaluation/response-evaluator.ts
+var ResponseEvaluator = class extends Evaluator {
+  metricName;
+  threshold;
+  constructor(evalMetric) {
+    super(evalMetric);
+    if (evalMetric.metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
+      this.metricName = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
+    } else if (evalMetric.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
+      this.metricName = "response_match_score" /* RESPONSE_MATCH_SCORE */;
+    } else {
+      throw new Error(`Metric ${evalMetric.metricName} is not supported.`);
+    }
+    this.threshold = evalMetric.threshold;
+  }
+  static getMetricInfo(metricName) {
+    if (metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
+      return {
+        metricName: "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */,
+        description: "This metric evaluates how coherent agent's response was. Value range of this metric is [1,5], with values closer to 5 more desirable.",
+        metricValueInfo: {
+          interval: {
+            minValue: 1,
+            maxValue: 5,
+            openAtMin: false,
+            openAtMax: false
+          }
+        }
+      };
+    }
+    if (metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
+      return {
+        metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
+        description: "This metric evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
+        metricValueInfo: {
+          interval: {
+            minValue: 0,
+            maxValue: 1,
+            openAtMin: false,
+            openAtMax: false
+          }
+        }
+      };
+    }
+    throw new Error(`Metric ${metricName} is not supported.`);
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    if (this.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
+      return this.evaluateRougeScore(actualInvocations, expectedInvocations);
+    }
+    const vertexAiFacade = new VertexAiEvalFacade({
+      threshold: this.threshold,
+      metricName: this.metricName
+    });
+    return vertexAiFacade.evaluateInvocations(
+      actualInvocations,
+      expectedInvocations
+    );
+  }
+  async evaluateRougeScore(actualInvocations, expectedInvocations) {
+    if (actualInvocations.length !== expectedInvocations.length) {
+      throw new Error("Number of actual and expected invocations must match");
+    }
+    const results = [];
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      const result = await this.evaluateInvocation(actual, expected);
+      results.push(result);
+    }
+    const scores = results.map((r) => r.score).filter((s) => s !== void 0);
+    const overallScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
+    const overallStatus = overallScore !== void 0 && overallScore >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+    return {
+      overallScore,
+      overallEvalStatus: overallStatus,
+      perInvocationResults: results
+    };
+  }
+  async evaluateInvocation(actual, expected) {
+    if (!actual.finalResponse || !expected.finalResponse) {
+      return {
+        actualInvocation: actual,
+        expectedInvocation: expected,
+        evalStatus: 3 /* NOT_EVALUATED */
+      };
+    }
+    const score = await this.computeRougeScore(
+      actual.finalResponse,
+      expected.finalResponse
+    );
+    return {
+      actualInvocation: actual,
+      expectedInvocation: expected,
+      score,
+      evalStatus: score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */
+    };
+  }
+  async computeRougeScore(actual, expected) {
+    const actualText = this.extractText(actual);
+    const expectedText = this.extractText(expected);
+    if (!actualText.trim() || !expectedText.trim()) {
+      return 0;
+    }
+    const actualTokens = this.tokenizeText(actualText);
+    const expectedTokens = this.tokenizeText(expectedText);
+    const actualUnigrams = new Set(actualTokens);
+    const expectedUnigrams = new Set(expectedTokens);
+    const commonUnigrams = new Set(
+      [...actualUnigrams].filter((token) => expectedUnigrams.has(token))
+    );
+    const precision = actualUnigrams.size > 0 ? commonUnigrams.size / actualUnigrams.size : 0;
+    const recall = expectedUnigrams.size > 0 ? commonUnigrams.size / expectedUnigrams.size : 0;
+    const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
+    return fmeasure;
+  }
+  extractText(content) {
+    if (content?.parts) {
+      return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join(" ");
+    }
+    return "";
+  }
+  tokenizeText(text) {
+    return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
+  }
+};
+
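computeRougeScore above is a ROUGE-1-style F-measure over unigram sets: precision = |common| / |actual unigrams|, recall = |common| / |expected unigrams|, F = 2PR / (P + R). A hand-computed example (not package code):

// actual "the cat sat" -> {the, cat, sat}; expected "the cat slept" -> {the, cat, slept}
// common = {the, cat}, so precision = 2/3 and recall = 2/3
const precision = 2 / 3;
const recall = 2 / 3;
const f = (2 * precision * recall) / (precision + recall); // = 2/3 ≈ 0.667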
+// src/evaluation/trajectory-evaluator.ts
+var TrajectoryEvaluator = class extends Evaluator {
+  static getMetricInfo() {
+    return {
+      metricName: "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */,
+      description: "This metric compares two tool call trajectories (expected vs. actual) for the same user interaction. It performs an exact match on the tool name and arguments for each step in the trajectory. A score of 1.0 indicates a perfect match, while 0.0 indicates a mismatch. Higher values are better.",
+      metricValueInfo: {
+        interval: {
+          minValue: 0,
+          maxValue: 1,
+          openAtMin: false,
+          openAtMax: false
+        }
+      }
+    };
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    let totalToolUseAccuracy = 0;
+    let numInvocations = 0;
+    const perInvocationResults = [];
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      if (!actual.intermediateData?.toolUses || !expected.intermediateData?.toolUses) {
+        perInvocationResults.push({
+          actualInvocation: actual,
+          expectedInvocation: expected,
+          evalStatus: 3 /* NOT_EVALUATED */
+        });
+        continue;
+      }
+      const toolUseAccuracy = this.areToolCallsEqual(
+        actual.intermediateData.toolUses,
+        expected.intermediateData.toolUses
+      ) ? 1 : 0;
+      perInvocationResults.push({
+        actualInvocation: actual,
+        expectedInvocation: expected,
+        score: toolUseAccuracy,
+        evalStatus: toolUseAccuracy >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */
+      });
+      totalToolUseAccuracy += toolUseAccuracy;
+      numInvocations++;
+    }
+    const overallScore = numInvocations > 0 ? totalToolUseAccuracy / numInvocations : 0;
+    return {
+      overallScore,
+      overallEvalStatus: overallScore >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */,
+      perInvocationResults
+    };
+  }
+  areToolCallsEqual(actual, expected) {
+    if (actual.length !== expected.length) {
+      return false;
+    }
+    return actual.every((actualCall, index) => {
+      const expectedCall = expected[index];
+      return this.isToolCallEqual(actualCall, expectedCall);
+    });
+  }
+  isToolCallEqual(actual, expected) {
+    if (actual.name !== expected.name) {
+      return false;
+    }
+    const actualArgs = actual.args || {};
+    const expectedArgs = expected.args || {};
+    const actualKeys = Object.keys(actualArgs).sort();
+    const expectedKeys = Object.keys(expectedArgs).sort();
+    if (actualKeys.length !== expectedKeys.length) {
+      return false;
+    }
+    return actualKeys.every((key, index) => {
+      const expectedKey = expectedKeys[index];
+      if (key !== expectedKey) {
+        return false;
+      }
+      return JSON.stringify(actualArgs[key]) === JSON.stringify(expectedArgs[key]);
+    });
+  }
+};
+
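TrajectoryEvaluator above scores each invocation 1 or 0: an order-sensitive exact match on tool name plus per-key JSON-stringified args. Illustrative cases (values invented):

// Scores 1: identical names, args, and order.
//   actual:   [{ name: "get_weather", args: { city: "Paris" } }]
//   expected: [{ name: "get_weather", args: { city: "Paris" } }]
// Scores 0: any difference — e.g. args: { city: "paris" } (case differs),
//   an extra or missing call, or the same calls in a different order.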
+// src/evaluation/safety-evaluator.ts
+var SafetyEvaluatorV1 = class extends Evaluator {
+  static getMetricInfo() {
+    return {
+      metricName: "safety_v1" /* SAFETY_V1 */,
+      description: "This metric evaluates the safety (harmlessness) of an Agent's Response. Value range of the metric is [0, 1], with values closer to 1 to be more desirable (safe).",
+      metricValueInfo: {
+        interval: {
+          minValue: 0,
+          maxValue: 1,
+          openAtMin: false,
+          openAtMax: false
+        }
+      }
+    };
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    const facade = new VertexAiEvalFacade({
+      threshold: this.metric.threshold,
+      metricName: "safety_v1" /* SAFETY_V1 */
+    });
+    return await facade.evaluateInvocations(
+      actualInvocations,
+      expectedInvocations
+    );
+  }
+};
+
+// src/evaluation/llm-as-judge-utils.ts
+function getTextFromContent(content) {
+  if (content?.parts) {
+    return content.parts.map((part) => part.text).filter(Boolean).join("\n");
+  }
+  return "";
+}
+function getEvalStatus(score, threshold) {
+  return score >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+}
+
+// src/evaluation/llm-as-judge.ts
+var LlmAsJudge = class {
+  async sampleJudge(prompt, numSamples, critiqueParser, judgeModelOptions) {
+    const modelName = judgeModelOptions?.judgeModel || "gemini-2.5-flash";
+    const model = LLMRegistry.getModelOrCreate(modelName);
+    const config = judgeModelOptions?.judgeModelConfig || {};
+    const samples = [];
+    for (let i = 0; i < numSamples; i++) {
+      try {
+        const response = await model.generateContent({
+          prompt,
+          ...config
+        });
+        const label = critiqueParser(response.text);
+        if (label !== "not_found" /* NOT_FOUND */) {
+          samples.push(label);
+        }
+      } catch (error) {
+        console.error("Error sampling judge model:", error);
+      }
+    }
+    return samples;
+  }
+};
+
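sampleJudge resolves the judge model through LLMRegistry.getModelOrCreate, queries it numSamples times, and keeps only labels the parser recognizes. Calling it directly might look like this (sketch only; the inline parser is a stand-in, not the package's parseCritique):

const judge = new LlmAsJudge();
const labels = await judge.sampleJudge(
  "Rate the agent response...",
  3, // numSamples
  (text) => (/\binvalid\b/.test(text) ? "invalid" : "valid"), // stand-in parser
  { judgeModel: "gemini-2.5-flash" }
);
// e.g. ["valid", "valid"] if one of the three samples threw or failed to parse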
+// src/evaluation/final-response-match-v2.ts
+var FINAL_RESPONSE_MATCH_V2_PROMPT = `You are an expert rater for an AI agent. The AI agent is going to call an API to answer the user query and generate API tool use code based for the choice of the API and API arguments. The ideal model response should be a function call that fulfills user query, or a natural language response hedges or asks users for further clarification if a function call does not apply.
+The primary focus of this rating task is to check correctness of the model responses.
+
+The data consists of:
+- A user query.
+- A model generated response for the prompt. The responses can consist of:
+- Natural language, when the model is asking for clarification, or tells the user it does not possess the requested functionality / option.
+- Code, in the form of one or multiple python function calls, and additional code as needed, for when the model is fulfilling the user request.
+You can use the help from a reference response annotated by a human rater. This reference response is of high quality. You can compare the agent's response with the reference response and decide if the agent's response is valid.
+Note sometimes the reference response only contains the key entities of the correct answer and you need to be flexible to allow the agent response to contain more information than the reference response, or to present the key entities in a different format or structure or in shorter or longer format.
+When the agent response is provided in the form of tables/dataframes or should be best provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response. Likewise, if you have the reference response, then find out the key entities and main components in them and check whether you can retrieve those from the agent response. If the prompt does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
+
+You should follow the constitutions below very carefully to rate the model response:
+- Allow flexibility of format even when reference code only uses one of the possible format, unless API spec or user prompt has explicit format requirement
+- e.g. For state name, allow both abbreviation and full name unless API spec has explicit requirement. e.g. both 'tx' and 'Texas' should be allowed in the agent response even when reference code only uses one of them.
+- e.g. If a reference response list outputs in a list format, the agent response is allowed to use sentence format and vice versa unless user prompt explicitly asks for a specific format.
+- e.g. For numbers, allow flexibility of formatting, e.g. 1000000 vs 1,000,000.
+- The model shouldn't assume that it doesn't have access to according data or incapable of answering the question if reference response is able to find a legit answer.
+- If the model response contains the correct final answer, rate it as valid even when the model response contains more information than the reference response.
+- If the user prompt has csv or other table format data, don't read it yourself. Trust the reference response final answer instead.
+- When the validation needs maths, date calculations, do not use your own calculator. Trust the reference response final answer instead.
+- Be mindful about unit of numbers. For example, if the reference response says 100 miles, but the model response says 100 km, it is invalid.
+- When the agent response or the reference response is provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response and whether those match the reference response. If the user query does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
+- When the answer is in numeric format, check whether there are any format requirements in the numeric format, rounding, precision, number of decimals, etc. specified in the user query and the prompt. If there are no such instructions, then tolerate different numerical formats.
+- When the answer is in numeric format and there are rounding or precision differences between the agent response and the reference response, if no further instructions are provided evaluate if the rounding strategy or precision in the agent response follows the standards for that entity. For instance, model accuracy scores must be reported with at least two decimal places (e.g., 0.798 \u2192 0.80 is acceptable, but 0.7 is not).
+
+Below are the inputs:
+{{
+"User prompt": {prompt},
+"Agent response": {response},
+"Reference response": {golden_response},
+}}
+
+The answer should be a json alone which follows the json structure below:
+{{
+"reasoning": [reasoning],
+"is_the_agent_response_valid": [valid or invalid],
+}}
+Answer with assertiveness:
+`;
+var DEFAULT_NUM_SAMPLES = 5;
+function parseCritique(response) {
+  const labelMatchIsResponseValid = response.match(
+    /"is_the_agent_response_valid":\s*\[*[\n\s]*"*([^"^\]^\s]*)"*[\n\s]*\]*\s*[,\n\}]/
+  );
+  if (labelMatchIsResponseValid?.[1]) {
+    const label = labelMatchIsResponseValid[1].toLowerCase();
+    return label === "valid" ? "valid" /* VALID */ : "invalid" /* INVALID */;
+  }
+  return "not_found" /* NOT_FOUND */;
+}
+var FinalResponseMatchV2Evaluator = class extends Evaluator {
+  constructor(evalMetric, llmAsJudge = new LlmAsJudge()) {
+    super(evalMetric);
+    this.llmAsJudge = llmAsJudge;
+  }
+  static getMetricInfo() {
+    return {
+      metricName: "final_response_match_v2" /* FINAL_RESPONSE_MATCH_V2 */,
+      description: "This metric evaluates if the agent's final response matches a golden/expected final response using an LLM judge. Value range for this metric is [0,1], with values closer to 1 more desirable.",
+      metricValueInfo: {
+        interval: {
+          minValue: 0,
+          maxValue: 1,
+          openAtMin: false,
+          openAtMax: false
+        }
+      }
+    };
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    const perInvocationResults = [];
+    let totalScore = 0;
+    let numInvocations = 0;
+    if (!actualInvocations.length) {
+      return {
+        overallEvalStatus: 3 /* NOT_EVALUATED */,
+        perInvocationResults: []
+      };
+    }
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      const prompt = getTextFromContent(expected.userContent);
+      const response = getTextFromContent(actual.finalResponse);
+      const goldenResponse = getTextFromContent(expected.finalResponse);
+      const formattedPrompt = FINAL_RESPONSE_MATCH_V2_PROMPT.replace(
+        "{prompt}",
+        prompt
+      ).replace("{response}", response).replace("{golden_response}", goldenResponse);
+      const numSamples = this.metric.judgeModelOptions?.numSamples ?? DEFAULT_NUM_SAMPLES;
+      const labels = await this.llmAsJudge.sampleJudge(
+        formattedPrompt,
+        numSamples,
+        parseCritique,
+        this.metric.judgeModelOptions
+      );
+      const score = labels.filter((l) => l === "valid" /* VALID */).length / labels.length;
+      perInvocationResults.push({
+        actualInvocation: actual,
+        expectedInvocation: expected,
+        score,
+        evalStatus: getEvalStatus(score, this.metric.threshold)
+      });
+      totalScore += score;
+      numInvocations++;
+    }
+    const overallScore = totalScore / numInvocations;
+    return {
+      overallScore,
+      overallEvalStatus: getEvalStatus(overallScore, this.metric.threshold),
+      perInvocationResults
+    };
+  }
+};
+
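FinalResponseMatchV2's score is simply the fraction of judge samples labeled valid, thresholded per the metric. The registry that follows maps metric names to evaluator classes; registering a custom evaluator would look roughly like this (sketch; MyEvaluator is hypothetical, and it assumes the registry instance is reachable from your code):

class MyEvaluator extends Evaluator {
  static getMetricInfo() {
    return {
      metricName: "my_metric", // hypothetical metric name
      description: "Example custom metric.",
      metricValueInfo: {
        interval: { minValue: 0, maxValue: 1, openAtMin: false, openAtMax: false }
      }
    };
  }
  async evaluateInvocations(actual, expected) {
    return { overallScore: 1, overallEvalStatus: 1 /* PASSED */, perInvocationResults: [] };
  }
}
DEFAULT_METRIC_EVALUATOR_REGISTRY.registerEvaluator(MyEvaluator.getMetricInfo(), MyEvaluator);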
+// src/evaluation/metric-evaluator-registry.ts
+var MetricEvaluatorRegistry = class {
+  registry = /* @__PURE__ */ new Map();
+  getEvaluator(evalMetric) {
+    const entry = this.registry.get(evalMetric.metricName);
+    if (!entry) {
+      throw new Error(`${evalMetric.metricName} not found in registry.`);
+    }
+    return new entry.evaluator(evalMetric);
+  }
+  registerEvaluator(metricInfo, evaluator) {
+    const metricName = metricInfo.metricName;
+    if (this.registry.has(metricName)) {
+      console.info(
+        `Updating Evaluator class for ${metricName} from ${this.registry.get(metricName)?.evaluator.name} to ${evaluator.name}`
+      );
+    }
+    this.registry.set(metricName, {
+      evaluator,
+      metricInfo: { ...metricInfo }
+    });
+  }
+  getRegisteredMetrics() {
+    return Array.from(this.registry.values()).map((entry) => ({
+      ...entry.metricInfo
+    }));
+  }
+};
+function getDefaultMetricEvaluatorRegistry() {
+  const registry = new MetricEvaluatorRegistry();
+  registry.registerEvaluator(
+    TrajectoryEvaluator.getMetricInfo(),
+    TrajectoryEvaluator
+  );
+  registry.registerEvaluator(
+    ResponseEvaluator.getMetricInfo("response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */),
+    ResponseEvaluator
+  );
+  registry.registerEvaluator(
+    ResponseEvaluator.getMetricInfo("response_match_score" /* RESPONSE_MATCH_SCORE */),
+    ResponseEvaluator
+  );
+  registry.registerEvaluator(
+    SafetyEvaluatorV1.getMetricInfo(),
+    SafetyEvaluatorV1
+  );
+  registry.registerEvaluator(
+    FinalResponseMatchV2Evaluator.getMetricInfo(),
+    FinalResponseMatchV2Evaluator
+  );
+  return registry;
+}
+var DEFAULT_METRIC_EVALUATOR_REGISTRY = getDefaultMetricEvaluatorRegistry();
+
+// src/evaluation/local-eval-service.ts
+var LocalEvalService = class extends BaseEvalService {
+  constructor(agent, parallelism = 4) {
+    super();
+    this.agent = agent;
+    this.parallelism = parallelism;
+    this.initializeRunner();
+  }
+  runner;
+  async initializeRunner() {
+    if ("ask" in this.agent) {
+      this.runner = this.agent;
+    } else {
+      try {
+        const { runner } = await AgentBuilder.create("eval_agent").withModel("gemini-2.5-flash").withDescription("Agent for evaluation purposes").build();
+        this.runner = {
+          ask: async (message) => {
+            return await runner.ask(message);
+          }
+        };
+      } catch (error) {
+        console.warn(
+          "Failed to create AgentBuilder runner, falling back to mock:",
+          error
+        );
+        this.runner = {
+          ask: async (message) => {
+            return `Mock response to: ${message}`;
+          }
+        };
+      }
+    }
+  }
+  async *performInference(request) {
+    for (const evalSet of request.evalCases) {
+      for (const evalCase of evalSet.evalCases) {
+        const expected = [];
+        for (const convo of evalCase.conversation) {
+          if (convo.finalResponse) {
+            expected.push({
+              invocationId: `${evalCase.evalId}-expected-${expected.length}`,
+              userContent: convo.userContent,
+              finalResponse: convo.finalResponse,
+              intermediateData: convo.intermediateData,
+              creationTimestamp: convo.creationTimestamp
+            });
+          }
+        }
+        const actual = await this.runInference(evalCase);
+        yield [...expected, ...actual];
+      }
+    }
+  }
+  async *evaluate(request) {
+    const { inferenceResults, evaluateConfig } = request;
+    const resultsByCase = /* @__PURE__ */ new Map();
+    for (const result of inferenceResults) {
+      const invocationId = result[0].invocationId;
+      if (!invocationId) continue;
+      const lastHyphenIndex = invocationId.lastIndexOf("-");
+      const evalId = lastHyphenIndex !== -1 ? invocationId.substring(0, lastHyphenIndex) : invocationId;
+      const existing = resultsByCase.get(evalId) || [];
+      resultsByCase.set(evalId, [...existing, ...result]);
+    }
+    for (const [evalId, results] of resultsByCase) {
+      const evalResult = {
+        evalSetResultId: `${evalId}-result-${Date.now()}`,
+        evalSetId: evalId,
+        evalCaseResults: [],
+        creationTimestamp: Date.now()
+      };
+      for (const evalMetric of evaluateConfig.evalMetrics) {
+        const evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.getEvaluator(evalMetric);
+        const actual = results.filter(
+          (r) => !r.invocationId?.includes("expected")
+        );
+        const expected = results.filter(
+          (r) => r.invocationId?.includes("expected")
+        );
+        const result = await evaluator.evaluateInvocations(actual, expected);
+        evalResult.evalCaseResults.push({
+          evalSetId: evalId,
+          evalId,
+          finalEvalStatus: result.perInvocationResults.length > 0 ? result.perInvocationResults[0].evalStatus : 3 /* NOT_EVALUATED */,
+          overallEvalMetricResults: [],
+          sessionId: evalId,
+          evalMetricResultPerInvocation: result.perInvocationResults.map(
+            (r) => ({
+              actualInvocation: r.actualInvocation,
+              expectedInvocation: r.expectedInvocation,
+              evalMetricResults: [
+                {
+                  metricName: evalMetric.metricName,
+                  threshold: evalMetric.threshold,
+                  score: r.score,
+                  evalStatus: r.evalStatus
+                }
+              ]
+            })
+          )
+        });
+      }
+      yield evalResult;
+    }
+  }
+  async runInference(evalCase) {
+    const results = [];
+    if (!this.runner) {
+      await this.initializeRunner();
+    }
+    if (evalCase.sessionInput) {
+      try {
+        if (this.runner.initializeSession) {
+          await this.runner.initializeSession(evalCase.sessionInput);
+        } else if (this.runner.setSessionState) {
+          await this.runner.setSessionState(evalCase.sessionInput);
+        } else {
+          console.log(
+            `Session input provided for ${evalCase.evalId}:`,
+            evalCase.sessionInput
|
12620
|
+
evalCase.sessionInput
|
|
12621
|
+
);
|
|
12622
|
+
}
|
|
12623
|
+
} catch (error) {
|
|
12624
|
+
console.warn(
|
|
12625
|
+
`Failed to initialize session for ${evalCase.evalId}:`,
|
|
12626
|
+
error
|
|
12627
|
+
);
|
|
12628
|
+
}
|
|
12629
|
+
}
|
|
12630
|
+
for (const invocation of evalCase.conversation) {
|
|
12631
|
+
try {
|
|
12632
|
+
const response = await this.runner.ask(invocation.userContent);
|
|
12633
|
+
results.push({
|
|
12634
|
+
invocationId: `${evalCase.evalId}-${results.length}`,
|
|
12635
|
+
userContent: invocation.userContent,
|
|
12636
|
+
finalResponse: {
|
|
12637
|
+
role: "model",
|
|
12638
|
+
parts: [{ text: response || "" }]
|
|
12639
|
+
},
|
|
12640
|
+
intermediateData: {
|
|
12641
|
+
toolUses: [],
|
|
12642
|
+
intermediateResponses: []
|
|
12643
|
+
},
|
|
12644
|
+
creationTimestamp: Date.now()
|
|
12645
|
+
});
|
|
12646
|
+
} catch (error) {
|
|
12647
|
+
console.error(`Error running inference for ${evalCase.evalId}:`, error);
|
|
12648
|
+
results.push({
|
|
12649
|
+
invocationId: `${evalCase.evalId}-${results.length}`,
|
|
12650
|
+
userContent: invocation.userContent,
|
|
12651
|
+
finalResponse: {
|
|
12652
|
+
role: "model",
|
|
12653
|
+
parts: [
|
|
12654
|
+
{
|
|
12655
|
+
text: `Error: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
12656
|
+
}
|
|
12657
|
+
]
|
|
12658
|
+
},
|
|
12659
|
+
intermediateData: {
|
|
12660
|
+
toolUses: [],
|
|
12661
|
+
intermediateResponses: []
|
|
12662
|
+
},
|
|
12663
|
+
creationTimestamp: Date.now()
|
|
12664
|
+
});
|
|
12665
|
+
}
|
|
12666
|
+
}
|
|
12667
|
+
return results;
|
|
12668
|
+
}
|
|
12669
|
+
};
|
|
12670
|
+
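LocalEvalService runs evaluation in two phases: performInference replays each eval case against the agent (emitting golden turns tagged with an "-expected-" invocation id alongside the actual runs), and evaluate regroups those results by eval id and scores them with evaluators from the default registry. A minimal driver sketch, assuming demoEvalSet is an EvalSet object of the shape consumed above (hypothetical data, not from this diff); passing an object with an ask method skips the AgentBuilder fallback in initializeRunner entirely:

    const service = new LocalEvalService({ ask: async (q) => `echo: ${q}` });
    const inferenceResults = [];
    for await (const batch of service.performInference({
      evalSetId: "demo",
      evalCases: [demoEvalSet]
    })) {
      inferenceResults.push(batch);
    }
    for await (const evalResult of service.evaluate({
      inferenceResults,
      evaluateConfig: {
        evalMetrics: [{ metricName: "response_match_score", threshold: 0.8 }]
      }
    })) {
      console.log(evalResult.evalCaseResults.map((c) => c.finalEvalStatus));
    }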
+// src/evaluation/agent-evaluator.ts
+var NUM_RUNS = 2;
+var TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */;
+var RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
+var RESPONSE_MATCH_SCORE_KEY = "response_match_score" /* RESPONSE_MATCH_SCORE */;
+var SAFETY_V1_KEY = "safety_v1" /* SAFETY_V1 */;
+var ALLOWED_CRITERIA = [
+  TOOL_TRAJECTORY_SCORE_KEY,
+  RESPONSE_EVALUATION_SCORE_KEY,
+  RESPONSE_MATCH_SCORE_KEY,
+  SAFETY_V1_KEY
+];
+var QUERY_COLUMN = "query";
+var REFERENCE_COLUMN = "reference";
+var EXPECTED_TOOL_USE_COLUMN = "expected_tool_use";
+var DEFAULT_CRITERIA = {
+  [TOOL_TRAJECTORY_SCORE_KEY]: 1,
+  [RESPONSE_MATCH_SCORE_KEY]: 0.8
+};
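These criteria constants feed AgentEvaluator below: findConfigForTestFile looks for a test_config.json beside each test file and falls back to DEFAULT_CRITERIA when none is found. A hypothetical config matching the shape that lookup expects (thresholds illustrative):

    // test_config.json, next to the *.test.json file:
    // {
    //   "criteria": {
    //     "tool_trajectory_avg_score": 1,
    //     "response_match_score": 0.8
    //   }
    // }

Keys outside ALLOWED_CRITERIA are rejected by _validateInput when old-format data is loaded.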
+var loadJson = async (filePath) => {
+  try {
+    const fileContent = await fs2.readFile(filePath, "utf-8");
+    return JSON.parse(fileContent);
+  } catch (error) {
+    throw new Error(`Failed to load JSON from ${filePath}: ${error}`);
+  }
+};
+var AgentEvaluator = class _AgentEvaluator {
+  static async findConfigForTestFile(testFile) {
+    const testFolder = path2.dirname(testFile);
+    const configPath = path2.join(testFolder, "test_config.json");
+    try {
+      await fs2.access(configPath);
+      const configData = await loadJson(configPath);
+      if ("criteria" in configData && typeof configData.criteria === "object") {
+        return configData.criteria;
+      }
+      throw new Error(
+        `Invalid format for test_config.json at ${configPath}. Expected a 'criteria' dictionary.`
+      );
+    } catch (error) {
+      return DEFAULT_CRITERIA;
+    }
+  }
+  static async evaluateEvalSet(agent, evalSet, criteria, numRuns = NUM_RUNS, printDetailedResults = false) {
+    const evalMetrics = Object.entries(criteria).map(
+      ([metricName, threshold]) => ({
+        metricName,
+        threshold
+      })
+    );
+    const evalResultsByEvalId = await _AgentEvaluator._getEvalResultsByEvalId(
+      agent,
+      evalSet,
+      evalMetrics,
+      numRuns
+    );
+    const failures = [];
+    for (const [_, evalResultsPerEvalId] of evalResultsByEvalId) {
+      const evalMetricResults = _AgentEvaluator._getEvalMetricResultsWithInvocation(
+        evalResultsPerEvalId
+      );
+      const failuresPerEvalCase = _AgentEvaluator._processMetricsAndGetFailures(
+        evalMetricResults,
+        printDetailedResults,
+        agent.name || "Unknown Agent"
+      );
+      failures.push(...failuresPerEvalCase);
+    }
+    if (failures.length > 0) {
+      throw new Error(
+        `Following are all the test failures. If you are looking to get more details on the failures, then please re-run this test with \`printDetailedResults\` set to \`true\`.
+${failures.join(
+          "\n"
+        )}`
+      );
+    }
+  }
+  static async evaluate(agent, evalDatasetFilePathOrDir, numRuns = NUM_RUNS, initialSessionFile) {
+    const testFiles = [];
+    try {
+      const stat2 = await fs2.stat(evalDatasetFilePathOrDir);
+      if (stat2.isDirectory()) {
+        const files = await this._findTestFilesRecursively(
+          evalDatasetFilePathOrDir
+        );
+        testFiles.push(...files);
+      } else {
+        testFiles.push(evalDatasetFilePathOrDir);
+      }
+    } catch (error) {
+      throw new Error(`Invalid path: ${evalDatasetFilePathOrDir}`);
+    }
+    const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
+    for (const testFile of testFiles) {
+      const criteria = await _AgentEvaluator.findConfigForTestFile(testFile);
+      const evalSet = await _AgentEvaluator._loadEvalSetFromFile(
+        testFile,
+        criteria,
+        initialSession
+      );
+      await _AgentEvaluator.evaluateEvalSet(agent, evalSet, criteria, numRuns);
+    }
+  }
+  static async migrateEvalDataToNewSchema(oldEvalDataFile, newEvalDataFile, initialSessionFile) {
+    if (!oldEvalDataFile || !newEvalDataFile) {
+      throw new Error("One of oldEvalDataFile or newEvalDataFile is empty.");
+    }
+    const criteria = await _AgentEvaluator.findConfigForTestFile(oldEvalDataFile);
+    const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
+    const evalSet = await _AgentEvaluator._getEvalSetFromOldFormat(
+      oldEvalDataFile,
+      criteria,
+      initialSession
+    );
+    await fs2.writeFile(newEvalDataFile, JSON.stringify(evalSet, null, 2));
+  }
+  static async _findTestFilesRecursively(dir) {
+    const testFiles = [];
+    async function walk(currentDir) {
+      const entries = await fs2.readdir(currentDir, { withFileTypes: true });
+      for (const entry of entries) {
+        const fullPath = path2.join(currentDir, entry.name);
+        if (entry.isDirectory()) {
+          await walk(fullPath);
+        } else if (entry.name.endsWith(".test.json")) {
+          testFiles.push(fullPath);
+        }
+      }
+    }
+    await walk(dir);
+    return testFiles;
+  }
+  static async _loadEvalSetFromFile(evalSetFile, criteria, initialSession) {
+    try {
+      const content = await fs2.readFile(evalSetFile, "utf-8");
+      try {
+        const evalSet = JSON.parse(content);
+        if (evalSet.evalSetId && evalSet.evalCases) {
+          if (Object.keys(initialSession).length > 0) {
+            throw new Error(
+              "Initial session should be specified as a part of the EvalSet file. An explicit initial session is only needed when specifying data in the older schema."
+            );
+          }
+          return evalSet;
+        }
+      } catch (parseError) {
+        throw new Error(`Failed to parse eval set data: ${parseError}`);
+      }
+    } catch (error) {
+      throw new Error(`Failed to process eval set file: ${error}`);
+    }
+    console.warn(
+      `Contents of ${evalSetFile} appear to be in an older format. To avoid this warning, please update your test files to contain data in the EvalSet schema. You can use 'migrateEvalDataToNewSchema' for migrating your old test files.`
+    );
+    return _AgentEvaluator._getEvalSetFromOldFormat(
+      evalSetFile,
+      criteria,
+      initialSession
+    );
+  }
+  static async _getEvalSetFromOldFormat(evalSetFile, criteria, initialSession) {
+    const data = await _AgentEvaluator._loadDataset(evalSetFile);
+    _AgentEvaluator._validateInput(data, criteria);
+    return {
+      evalSetId: `eval-set-${Date.now()}`,
+      name: evalSetFile,
+      evalCases: data[0].map(
+        (item, index) => ({
+          evalId: `eval-${index}`,
+          conversation: [
+            {
+              invocationId: `invocation-${index}`,
+              userContent: {
+                role: "user",
+                parts: [{ text: item[QUERY_COLUMN] || "" }]
+              },
+              finalResponse: item[REFERENCE_COLUMN] ? {
+                role: "model",
+                parts: [{ text: item[REFERENCE_COLUMN] }]
+              } : void 0,
+              intermediateData: item[EXPECTED_TOOL_USE_COLUMN] ? {
+                toolUses: item[EXPECTED_TOOL_USE_COLUMN],
+                intermediateResponses: []
+              } : void 0,
+              creationTimestamp: Date.now()
+            }
+          ],
+          sessionInput: Object.keys(initialSession).length > 0 ? {
+            appName: "test-app",
+            userId: "test-user",
+            state: initialSession
+          } : void 0
+        })
+      ),
+      creationTimestamp: Date.now()
+    };
+  }
+  static async _getInitialSession(initialSessionFile) {
+    if (!initialSessionFile) {
+      return {};
+    }
+    try {
+      const content = await fs2.readFile(initialSessionFile, "utf-8");
+      return JSON.parse(content);
+    } catch (error) {
+      throw new Error(
+        `Failed to load initial session from ${initialSessionFile}: ${error}`
+      );
+    }
+  }
+  static async _loadDataset(inputData) {
+    const stat2 = await fs2.stat(inputData);
+    if (stat2.isDirectory()) {
+      const testFiles = await this._findTestFilesRecursively(inputData);
+      const results = await Promise.all(testFiles.map((f) => loadJson(f)));
+      return results.map((r) => Array.isArray(r) ? r : [r]);
+    }
+    if (stat2.isFile()) {
+      const data = await loadJson(inputData);
+      return [Array.isArray(data) ? data : [data]];
+    }
+    throw new Error(`Invalid input path: ${inputData}`);
+  }
+  static _validateInput(evalDataset, criteria) {
+    if (!evalDataset || evalDataset.length === 0) {
+      throw new Error("The evaluation dataset is null or empty.");
+    }
+    for (const key of Object.keys(criteria)) {
+      if (!ALLOWED_CRITERIA.includes(key)) {
+        throw new Error(
+          `Invalid criteria key: ${key}. Expected one of ${ALLOWED_CRITERIA.join(
+            ", "
+          )}.`
+        );
+      }
+    }
+    const sample = evalDataset[0];
+    if (!Array.isArray(sample) || sample.length === 0) {
+      throw new Error("The evaluation dataset is empty.");
+    }
+    const firstQuery = sample[0];
+    if (typeof firstQuery !== "object") {
+      throw new Error(
+        `Each evaluation dataset sample must be a list of objects. But it's ${JSON.stringify(
+          evalDataset
+        )}`
+      );
+    }
+    if (TOOL_TRAJECTORY_SCORE_KEY in criteria) {
+      if (!(QUERY_COLUMN in firstQuery) || !(EXPECTED_TOOL_USE_COLUMN in firstQuery)) {
+        throw new Error(
+          `Samples for ${TOOL_TRAJECTORY_SCORE_KEY} must include '${QUERY_COLUMN}' and '${EXPECTED_TOOL_USE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
+        );
+      }
+    }
+    if (RESPONSE_EVALUATION_SCORE_KEY in criteria) {
+      if (!(QUERY_COLUMN in firstQuery)) {
+        throw new Error(
+          `Samples for ${RESPONSE_EVALUATION_SCORE_KEY} must include '${QUERY_COLUMN}' key. The sample is ${JSON.stringify(sample)}.`
+        );
+      }
+    }
+    if (RESPONSE_MATCH_SCORE_KEY in criteria) {
+      if (!(QUERY_COLUMN in firstQuery) || !(REFERENCE_COLUMN in firstQuery)) {
+        throw new Error(
+          `Samples for ${RESPONSE_MATCH_SCORE_KEY} must include '${QUERY_COLUMN}' and '${REFERENCE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
+        );
+      }
+    }
+  }
+  static _printDetails(evalMetricResultWithInvocations, overallEvalStatus, overallScore, metricName = "", threshold = 0) {
+    console.log(
+      `Summary: \`${overallEvalStatus}\` for Metric: \`${metricName}\`. Expected threshold: \`${threshold}\`, actual value: \`${overallScore}\`.`
+    );
+    const data = evalMetricResultWithInvocations.map((per) => ({
+      evalStatus: per.evalMetricResult.evalStatus,
+      score: per.evalMetricResult.score,
+      threshold,
+      prompt: _AgentEvaluator._convertContentToText(
+        per.expectedInvocation.userContent
+      ),
+      expectedResponse: _AgentEvaluator._convertContentToText(
+        per.expectedInvocation.finalResponse
+      ),
+      actualResponse: _AgentEvaluator._convertContentToText(
+        per.actualInvocation.finalResponse
+      ),
+      expectedToolCalls: _AgentEvaluator._convertToolCallsToText(
+        per.expectedInvocation.intermediateData
+      ),
+      actualToolCalls: _AgentEvaluator._convertToolCallsToText(
+        per.actualInvocation.intermediateData
+      )
+    }));
+    console.table(data);
+    console.log("\n\n");
+  }
+  static _convertContentToText(content) {
+    if (content?.parts) {
+      return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
+    }
+    return "";
+  }
+  static _convertToolCallsToText(intermediateData) {
+    if (intermediateData?.toolUses) {
+      return intermediateData.toolUses.map((t) => JSON.stringify(t)).join("\n");
+    }
+    return "";
+  }
+  static async _getEvalResultsByEvalId(agent, evalSet, evalMetrics, numRuns) {
+    const evalService = new LocalEvalService(agent);
+    const inferenceResults = [];
+    for (let run = 0; run < numRuns; run++) {
+      for await (const result of evalService.performInference({
+        evalSetId: evalSet.evalSetId,
+        evalCases: [evalSet]
+      })) {
+        inferenceResults.push(result);
+      }
+    }
+    const evalResultsByEvalId = /* @__PURE__ */ new Map();
+    for await (const evalResult of evalService.evaluate({
+      inferenceResults,
+      evaluateConfig: { evalMetrics }
+    })) {
+      for (const caseResult of evalResult.evalCaseResults) {
+        const evalId = caseResult.evalId;
+        if (!evalResultsByEvalId.has(evalId)) {
+          evalResultsByEvalId.set(evalId, []);
+        }
+        evalResultsByEvalId.get(evalId).push(caseResult);
+      }
+    }
+    return evalResultsByEvalId;
+  }
+  static _getEvalMetricResultsWithInvocation(evalResultsPerEvalId) {
+    const evalMetricResults = {};
+    for (const evalCaseResult of evalResultsPerEvalId) {
+      for (const evalMetricsPerInvocation of evalCaseResult.evalMetricResultPerInvocation) {
+        for (const evalMetricResult of evalMetricsPerInvocation.evalMetricResults) {
+          const metricName = evalMetricResult.metricName;
+          if (!(metricName in evalMetricResults)) {
+            evalMetricResults[metricName] = [];
+          }
+          evalMetricResults[metricName].push({
+            actualInvocation: evalMetricsPerInvocation.actualInvocation,
+            expectedInvocation: evalMetricsPerInvocation.expectedInvocation,
+            evalMetricResult
+          });
+        }
+      }
+    }
+    return evalMetricResults;
+  }
+  static _processMetricsAndGetFailures(evalMetricResults, printDetailedResults, agentModule) {
+    const failures = [];
+    for (const [metricName, evalMetricResultsWithInvocations] of Object.entries(
+      evalMetricResults
+    )) {
+      const threshold = evalMetricResultsWithInvocations[0]?.evalMetricResult.threshold || 0;
+      const scores = evalMetricResultsWithInvocations.map((m) => m.evalMetricResult.score).filter((s) => s !== void 0);
+      let overallScore;
+      let overallEvalStatus;
+      if (scores.length > 0) {
+        overallScore = scores.reduce((a, b) => a + b, 0) / scores.length;
+        overallEvalStatus = overallScore >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+      } else {
+        overallScore = void 0;
+        overallEvalStatus = 3 /* NOT_EVALUATED */;
+      }
+      if (overallEvalStatus !== 1 /* PASSED */) {
+        if (printDetailedResults) {
+          _AgentEvaluator._printDetails(
+            evalMetricResultsWithInvocations,
+            overallEvalStatus,
+            overallScore,
+            metricName,
+            threshold
+          );
+        }
+        failures.push(
+          `${metricName} for ${agentModule} failed. Expected ${threshold}, but got ${overallScore}.`
+        );
+      }
+    }
+    return failures;
+  }
+};
+
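AgentEvaluator is the high-level entry point: evaluate accepts a test file or a directory of *.test.json files, resolves criteria per file, runs every case numRuns times through a LocalEvalService, averages per-metric scores, and throws an aggregate error if any metric misses its threshold. A usage sketch, where myAgent and the "./evals" directory are placeholders:

    try {
      await AgentEvaluator.evaluate(myAgent, "./evals", 2);
      console.log("All eval criteria passed.");
    } catch (failure) {
      // One line per failed metric; re-run evaluateEvalSet with
      // printDetailedResults = true for a per-invocation table.
      console.error(failure.message);
    }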
+// src/evaluation/final-response-match-v1.ts
+var RougeEvaluator = class extends Evaluator {
+  evalMetric;
+  constructor(evalMetric) {
+    super(evalMetric);
+    this.evalMetric = evalMetric;
+  }
+  static getMetricInfo() {
+    return {
+      metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
+      description: "This metric evaluates if the agent's final response matches a golden/expected final response using the Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
+      metricValueInfo: {
+        interval: {
+          minValue: 0,
+          maxValue: 1,
+          openAtMin: false,
+          openAtMax: false
+        }
+      }
+    };
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    let totalScore = 0;
+    let numInvocations = 0;
+    const perInvocationResults = [];
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      const reference = getTextFromContent2(expected.finalResponse);
+      const response = getTextFromContent2(actual.finalResponse);
+      const rouge1Scores = await calculateRouge1Scores(response, reference);
+      const score = rouge1Scores.fmeasure;
+      perInvocationResults.push({
+        actualInvocation: actual,
+        expectedInvocation: expected,
+        score,
+        evalStatus: getEvalStatus2(score, this.evalMetric.threshold)
+      });
+      totalScore += score;
+      numInvocations++;
+    }
+    if (perInvocationResults.length > 0) {
+      const overallScore = totalScore / numInvocations;
+      return {
+        overallScore,
+        overallEvalStatus: getEvalStatus2(
+          overallScore,
+          this.evalMetric.threshold
+        ),
+        perInvocationResults
+      };
+    }
+    return {
+      overallEvalStatus: 3 /* NOT_EVALUATED */,
+      perInvocationResults: []
+    };
+  }
+};
+function getTextFromContent2(content) {
+  if (content?.parts) {
+    return content.parts.map((part) => part.text).filter(Boolean).join("\n");
+  }
+  return "";
+}
+function getEvalStatus2(score, threshold) {
+  return score >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+}
+function calculateRouge1Scores(response, reference) {
+  if (!response.trim() || !reference.trim()) {
+    return { precision: 0, recall: 0, fmeasure: 0 };
+  }
+  const responseTokens = tokenizeText(response);
+  const referenceTokens = tokenizeText(reference);
+  const responseUnigrams = new Set(responseTokens);
+  const referenceUnigrams = new Set(referenceTokens);
+  const commonUnigrams = new Set(
+    [...responseUnigrams].filter((token) => referenceUnigrams.has(token))
+  );
+  const precision = responseUnigrams.size > 0 ? commonUnigrams.size / responseUnigrams.size : 0;
+  const recall = referenceUnigrams.size > 0 ? commonUnigrams.size / referenceUnigrams.size : 0;
+  const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
+  return { precision, recall, fmeasure };
+}
+function tokenizeText(text) {
+  return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
+}
+
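calculateRouge1Scores computes set-based unigram overlap: precision over distinct response tokens, recall over distinct reference tokens, and their harmonic mean as the F-measure that RougeEvaluator uses as the score. A worked example against the functions above (values computed by hand):

    // response unigrams:  {the, cat, sat, on, mat}
    // reference unigrams: {the, cat, lay, on, mat}
    // overlap: {the, cat, on, mat} -> precision = recall = 4/5
    const scores = calculateRouge1Scores(
      "The cat sat on the mat.",
      "The cat lay on the mat."
    );
    console.log(scores); // { precision: 0.8, recall: 0.8, fmeasure: 0.8 }

Because unigrams are deduplicated into sets, repeated words count once and word order is ignored entirely.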
 // src/version.ts
 var VERSION = "0.1.0";
 export {
   AF_FUNCTION_CALL_ID_PREFIX,
   LlmAgent as Agent,
   AgentBuilder,
+  AgentEvaluator,
   AgentTool,
   agents_exports as Agents,
   AiSdkLlm,
@@ -11847,11 +13185,16 @@ export {
   CodeExecutorContext,
   DatabaseSessionService,
   EnhancedAuthConfig,
+  EvalResult,
+  EvalStatus,
+  evaluation_exports as Evaluation,
+  Evaluator,
   Event,
   EventActions,
   events_exports as Events,
   ExitLoopTool,
   FileOperationsTool,
+  FinalResponseMatchV2Evaluator,
   flows_exports as Flows,
   FunctionTool,
   GcsArtifactService,
@@ -11873,6 +13216,7 @@ export {
   LlmResponse,
   LoadArtifactsTool,
   LoadMemoryTool,
+  LocalEvalService,
   LoopAgent,
   McpAbi,
   McpAtp,
@@ -11900,10 +13244,13 @@ export {
   OpenIdConnectScheme,
   ParallelAgent,
   PlanReActPlanner,
+  PrebuiltMetrics,
   REQUEST_EUC_FUNCTION_CALL_NAME,
   ReadonlyContext,
+  RougeEvaluator,
   RunConfig,
   Runner,
+  SafetyEvaluatorV1,
   SequentialAgent,
   sessions_exports as Sessions,
   SingleFlow,
@@ -11912,6 +13259,7 @@ export {
   TelemetryService,
   ToolContext,
   tools_exports as Tools,
+  TrajectoryEvaluator,
   TransferToAgentTool,
   UserInteractionTool,
   VERSION,