@iqai/adk 0.1.22 → 0.2.0

package/dist/index.mjs CHANGED
@@ -833,70 +833,23 @@ ${instructions.join("\n\n")}`;
833
833
 
834
834
  // src/models/llm-response.ts
835
835
  var LlmResponse = class _LlmResponse {
836
- /**
837
- * Unique identifier for the response.
838
- */
839
836
  id;
840
- /**
841
- * The content generated by the model.
842
- */
837
+ text;
843
838
  content;
844
- /**
845
- * The grounding metadata of the response.
846
- */
847
839
  groundingMetadata;
848
- /**
849
- * Indicates whether the text content is part of an unfinished text stream.
850
- */
851
840
  partial;
852
- /**
853
- * Indicates whether the response from the model is complete.
854
- */
855
841
  turnComplete;
856
- /**
857
- * Error code if the response is an error.
858
- */
859
842
  errorCode;
860
- /**
861
- * Error message if the response is an error.
862
- */
863
843
  errorMessage;
864
- /**
865
- * Flag indicating that LLM was interrupted when generating the content.
866
- */
867
844
  interrupted;
868
- /**
869
- * The custom metadata of the LlmResponse.
870
- */
871
845
  customMetadata;
872
- /**
873
- * The usage metadata of the LlmResponse.
874
- */
875
846
  usageMetadata;
876
- /**
877
- * Index of the candidate response.
878
- */
879
847
  candidateIndex;
880
- /**
881
- * Reason why the model finished generating.
882
- */
883
848
  finishReason;
884
- /**
885
- * Error object if the response is an error.
886
- */
887
849
  error;
888
- /**
889
- * Creates a new LlmResponse.
890
- */
891
850
  constructor(data = {}) {
892
851
  Object.assign(this, data);
893
852
  }
894
- /**
895
- * Creates an LlmResponse from a GenerateContentResponse.
896
- *
897
- * @param generateContentResponse The GenerateContentResponse to create the LlmResponse from.
898
- * @returns The LlmResponse.
899
- */
900
853
  static create(generateContentResponse) {
901
854
  const usageMetadata = generateContentResponse.usageMetadata;
902
855
  if (generateContentResponse.candidates && generateContentResponse.candidates.length > 0) {
@@ -928,15 +881,6 @@ var LlmResponse = class _LlmResponse {
928
881
  usageMetadata
929
882
  });
930
883
  }
931
- /**
932
- * Creates an LlmResponse from an error.
933
- *
934
- * @param error The error object or message.
935
- * @param options Additional options for the error response.
936
- * @param options.errorCode A specific error code for the response.
937
- * @param options.model The model that was being used when the error occurred.
938
- * @returns The LlmResponse.
939
- */
940
884
  static fromError(error, options = {}) {
941
885
  const errorMessage = error instanceof Error ? error.message : String(error);
942
886
  const errorCode = options.errorCode || "UNKNOWN_ERROR";
@@ -2675,30 +2619,16 @@ var OpenAiLlm = class extends BaseLlm {
2675
2619
  // src/models/llm-registry.ts
2676
2620
  init_logger();
2677
2621
  var LLMRegistry = class _LLMRegistry {
2678
- /**
2679
- * Map of model name regex to LLM class
2680
- */
2681
2622
  static llmRegistry = /* @__PURE__ */ new Map();
2623
+ static modelInstances = /* @__PURE__ */ new Map();
2682
2624
  static logger = new Logger({ name: "LLMRegistry" });
2683
- /**
2684
- * Creates a new LLM instance
2685
- *
2686
- * @param model The model name
2687
- * @returns The LLM instance
2688
- */
2689
2625
  static newLLM(model) {
2690
2626
  const llmClass = _LLMRegistry.resolve(model);
2691
2627
  if (!llmClass) {
2692
- throw new Error(`No LLM found for model: ${model}`);
2628
+ throw new Error(`No LLM class found for model: ${model}`);
2693
2629
  }
2694
2630
  return new llmClass(model);
2695
2631
  }
2696
- /**
2697
- * Resolves the LLM class from the model name
2698
- *
2699
- * @param model The model name
2700
- * @returns The LLM class
2701
- */
2702
2632
  static resolve(model) {
2703
2633
  for (const [regex, llmClass] of _LLMRegistry.llmRegistry.entries()) {
2704
2634
  if (regex.test(model)) {
@@ -2707,34 +2637,54 @@ var LLMRegistry = class _LLMRegistry {
2707
2637
  }
2708
2638
  return null;
2709
2639
  }
2710
- /**
2711
- * Registers a new LLM class
2712
- *
2713
- * @param modelNameRegex The regex to match model names
2714
- * @param llmClass The LLM class
2715
- */
2716
2640
  static register(modelNameRegex, llmClass) {
2717
2641
  _LLMRegistry.llmRegistry.set(new RegExp(modelNameRegex), llmClass);
2718
2642
  }
2719
- /**
2720
- * Registers all model patterns from an LLM class
2721
- *
2722
- * @param llmClass The LLM class
2723
- */
2724
2643
  static registerLLM(llmClass) {
2725
2644
  const modelPatterns = llmClass.supportedModels();
2726
2645
  for (const pattern of modelPatterns) {
2727
2646
  _LLMRegistry.register(pattern, llmClass);
2728
2647
  }
2729
2648
  }
2730
- /**
2731
- * Logs all registered models for debugging
2732
- */
2649
+ static registerModel(name, model) {
2650
+ _LLMRegistry.modelInstances.set(name, model);
2651
+ }
2652
+ static getModel(name) {
2653
+ const model = _LLMRegistry.modelInstances.get(name);
2654
+ if (!model) {
2655
+ throw new Error(`Model '${name}' not found in registry`);
2656
+ }
2657
+ return model;
2658
+ }
2659
+ static hasModel(name) {
2660
+ return _LLMRegistry.modelInstances.has(name);
2661
+ }
2662
+ static unregisterModel(name) {
2663
+ _LLMRegistry.modelInstances.delete(name);
2664
+ }
2665
+ static getModelOrCreate(name) {
2666
+ if (_LLMRegistry.hasModel(name)) {
2667
+ return _LLMRegistry.getModel(name);
2668
+ }
2669
+ return _LLMRegistry.newLLM(name);
2670
+ }
2671
+ static clear() {
2672
+ _LLMRegistry.llmRegistry.clear();
2673
+ _LLMRegistry.modelInstances.clear();
2674
+ }
2675
+ static clearModels() {
2676
+ _LLMRegistry.modelInstances.clear();
2677
+ }
2678
+ static clearClasses() {
2679
+ _LLMRegistry.llmRegistry.clear();
2680
+ }
2733
2681
  static logRegisteredModels() {
2734
- _LLMRegistry.logger.debug(
2735
- "Registered LLM models:",
2736
- [..._LLMRegistry.llmRegistry.entries()].map(([regex]) => regex.toString())
2682
+ const classPatterns = [..._LLMRegistry.llmRegistry.entries()].map(
2683
+ ([regex]) => regex.toString()
2737
2684
  );
2685
+ const instanceNames = [..._LLMRegistry.modelInstances.keys()];
2686
+ _LLMRegistry.logger.debug("Registered LLM class patterns:", classPatterns);
2687
+ _LLMRegistry.logger.debug("Registered LLM instances:", instanceNames);
2738
2688
  }
2739
2689
  };
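
Note: this hunk splits the registry in two. The existing class registry (model-name regex -> LLM class) stays, and a new instance registry maps a name to a ready-made model object, with registerModel/getModel/hasModel/unregisterModel, getModelOrCreate falling back to newLLM when no instance is registered, and clear/clearModels/clearClasses for teardown. A minimal sketch of the intended use, assuming LLMRegistry is re-exported from the package root and that an LLM class matching the "gemini" pattern is already registered:

  import { LLMRegistry } from "@iqai/adk"; // export path assumed

  // Cache a configured instance under a friendly name.
  const judge = LLMRegistry.newLLM("gemini-2.5-flash");
  LLMRegistry.registerModel("judge", judge);

  // Reuse the instance when present, otherwise fall back to the class registry.
  const model = LLMRegistry.getModelOrCreate("judge");

  // Remove cached instances between test runs without dropping class registrations.
  LLMRegistry.clearModels();
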
2740
2690
 
@@ -6582,9 +6532,23 @@ var BaseLlmFlow = class {
6582
6532
  yield event;
6583
6533
  }
6584
6534
  }
6585
- const tools = await agent.canonicalTools(
6535
+ let tools = await agent.canonicalTools(
6586
6536
  new ReadonlyContext(invocationContext)
6587
6537
  );
6538
+ if (tools.length > 1) {
6539
+ const seen = /* @__PURE__ */ new Set();
6540
+ const filtered = [];
6541
+ for (const t of tools) {
6542
+ const name = t?.name;
6543
+ if (!name) continue;
6544
+ if (seen.has(name)) {
6545
+ continue;
6546
+ }
6547
+ seen.add(name);
6548
+ filtered.push(t);
6549
+ }
6550
+ tools = filtered;
6551
+ }
6588
6552
  for (const tool of tools) {
6589
6553
  const toolContext = new ToolContext(invocationContext);
6590
6554
  await tool.processLlmRequest(toolContext, llmRequest);
@@ -6740,7 +6704,42 @@ var BaseLlmFlow = class {
6740
6704
  }
6741
6705
  invocationContext.incrementLlmCallCount();
6742
6706
  const isStreaming = invocationContext.runConfig.streamingMode === "sse" /* SSE */;
6743
- const tools = llmRequest.config?.tools || [];
6707
+ let tools = llmRequest.config?.tools || [];
6708
+ if (tools.length) {
6709
+ const deduped = [];
6710
+ const seenFn = /* @__PURE__ */ new Set();
6711
+ for (const t of tools) {
6712
+ const tool = t;
6713
+ if (tool && Array.isArray(tool.functionDeclarations)) {
6714
+ const newFds = tool.functionDeclarations.filter(
6715
+ (fd) => {
6716
+ if (fd?.name) {
6717
+ if (seenFn.has(fd.name)) {
6718
+ return false;
6719
+ }
6720
+ seenFn.add(fd.name);
6721
+ }
6722
+ return true;
6723
+ }
6724
+ );
6725
+ if (newFds.length) {
6726
+ deduped.push({ ...tool, functionDeclarations: newFds });
6727
+ }
6728
+ } else if (tool?.name) {
6729
+ if (seenFn.has(tool.name)) continue;
6730
+ seenFn.add(tool.name);
6731
+ deduped.push(tool);
6732
+ } else {
6733
+ deduped.push(tool);
6734
+ }
6735
+ }
6736
+ if (deduped.length !== tools.length) {
6737
+ this.logger.debug(
6738
+ `\u{1F501} Deduplicated tool/function declarations: ${tools.length} -> ${deduped.length}`
6739
+ );
6740
+ }
6741
+ llmRequest.config.tools = tools = deduped;
6742
+ }
6744
6743
  const toolNames = tools.map((tool) => {
6745
6744
  if (tool.functionDeclarations && Array.isArray(tool.functionDeclarations)) {
6746
6745
  return tool.functionDeclarations.map((fn) => fn.name).join(", ");
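
Note: both deduplication passes added in the two hunks above share one idea: keep the first occurrence of each name and drop later repeats, so a tool or function declaration registered twice (for example by overlapping toolsets) reaches the model only once; the second pass also rebuilds config.tools and logs a debug line whenever anything was removed. The core filter, written as a standalone helper purely for illustration (dedupeByName is hypothetical, not part of the package):

  // Keep the first item for each name; later duplicates are dropped.
  function dedupeByName<T extends { name?: string }>(items: T[]): T[] {
    const seen = new Set<string>();
    return items.filter((item) => {
      if (!item.name) return true; // unnamed items pass through (the canonical-tools pass above drops them instead)
      if (seen.has(item.name)) return false;
      seen.add(item.name);
      return true;
    });
  }
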
@@ -9555,6 +9554,7 @@ var LangGraphAgent = class extends BaseAgent {
9555
9554
  };
9556
9555
 
9557
9556
  // src/agents/agent-builder.ts
9557
+ init_logger();
9558
9558
  import { generateId } from "ai";
9559
9559
 
9560
9560
  // src/runners.ts
@@ -9668,19 +9668,19 @@ var InMemoryArtifactService = class {
9668
9668
  }
9669
9669
  async saveArtifact(args) {
9670
9670
  const { appName, userId, sessionId, filename, artifact } = args;
9671
- const path2 = this.getArtifactPath(appName, userId, sessionId, filename);
9672
- if (!this.artifacts.has(path2)) {
9673
- this.artifacts.set(path2, []);
9671
+ const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
9672
+ if (!this.artifacts.has(path3)) {
9673
+ this.artifacts.set(path3, []);
9674
9674
  }
9675
- const versions = this.artifacts.get(path2);
9675
+ const versions = this.artifacts.get(path3);
9676
9676
  const version = versions.length;
9677
9677
  versions.push(artifact);
9678
9678
  return version;
9679
9679
  }
9680
9680
  async loadArtifact(args) {
9681
9681
  const { appName, userId, sessionId, filename, version } = args;
9682
- const path2 = this.getArtifactPath(appName, userId, sessionId, filename);
9683
- const versions = this.artifacts.get(path2);
9682
+ const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
9683
+ const versions = this.artifacts.get(path3);
9684
9684
  if (!versions || versions.length === 0) {
9685
9685
  return null;
9686
9686
  }
@@ -9701,12 +9701,12 @@ var InMemoryArtifactService = class {
9701
9701
  const sessionPrefix = `${appName}/${userId}/${sessionId}/`;
9702
9702
  const userNamespacePrefix = `${appName}/${userId}/user/`;
9703
9703
  const filenames = [];
9704
- for (const path2 of this.artifacts.keys()) {
9705
- if (path2.startsWith(sessionPrefix)) {
9706
- const filename = path2.substring(sessionPrefix.length);
9704
+ for (const path3 of this.artifacts.keys()) {
9705
+ if (path3.startsWith(sessionPrefix)) {
9706
+ const filename = path3.substring(sessionPrefix.length);
9707
9707
  filenames.push(filename);
9708
- } else if (path2.startsWith(userNamespacePrefix)) {
9709
- const filename = path2.substring(userNamespacePrefix.length);
9708
+ } else if (path3.startsWith(userNamespacePrefix)) {
9709
+ const filename = path3.substring(userNamespacePrefix.length);
9710
9710
  filenames.push(filename);
9711
9711
  }
9712
9712
  }
@@ -9714,16 +9714,16 @@ var InMemoryArtifactService = class {
9714
9714
  }
9715
9715
  async deleteArtifact(args) {
9716
9716
  const { appName, userId, sessionId, filename } = args;
9717
- const path2 = this.getArtifactPath(appName, userId, sessionId, filename);
9718
- if (!this.artifacts.has(path2)) {
9717
+ const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
9718
+ if (!this.artifacts.has(path3)) {
9719
9719
  return;
9720
9720
  }
9721
- this.artifacts.delete(path2);
9721
+ this.artifacts.delete(path3);
9722
9722
  }
9723
9723
  async listVersions(args) {
9724
9724
  const { appName, userId, sessionId, filename } = args;
9725
- const path2 = this.getArtifactPath(appName, userId, sessionId, filename);
9726
- const versions = this.artifacts.get(path2);
9725
+ const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
9726
+ const versions = this.artifacts.get(path3);
9727
9727
  if (!versions || versions.length === 0) {
9728
9728
  return [];
9729
9729
  }
@@ -10193,7 +10193,7 @@ var Runner = class {
10193
10193
  }
10194
10194
  };
10195
10195
  invokeRunAsync();
10196
- return function* () {
10196
+ return (function* () {
10197
10197
  while (true) {
10198
10198
  while (queueIndex >= eventQueue.length && !asyncCompleted) {
10199
10199
  }
@@ -10206,7 +10206,7 @@ var Runner = class {
10206
10206
  }
10207
10207
  yield event;
10208
10208
  }
10209
- }();
10209
+ })();
10210
10210
  }
10211
10211
  /**
10212
10212
  * Main entry method to run the agent in this runner.
@@ -10406,6 +10406,12 @@ var AgentBuilder = class _AgentBuilder {
10406
10406
  artifactService;
10407
10407
  agentType = "llm";
10408
10408
  existingSession;
10409
+ existingAgent;
10410
+ // If provided, reuse directly
10411
+ definitionLocked = false;
10412
+ // Lock further definition mutation after withAgent
10413
+ warnedMethods = /* @__PURE__ */ new Set();
10414
+ logger = new Logger({ name: "AgentBuilder" });
10409
10415
  /**
10410
10416
  * Private constructor - use static create() method
10411
10417
  */
@@ -10434,6 +10440,7 @@ var AgentBuilder = class _AgentBuilder {
10434
10440
  * @returns This builder instance for chaining
10435
10441
  */
10436
10442
  withModel(model) {
10443
+ this.warnIfLocked("withModel");
10437
10444
  this.config.model = model;
10438
10445
  return this;
10439
10446
  }
@@ -10443,6 +10450,7 @@ var AgentBuilder = class _AgentBuilder {
10443
10450
  * @returns This builder instance for chaining
10444
10451
  */
10445
10452
  withDescription(description) {
10453
+ this.warnIfLocked("withDescription");
10446
10454
  this.config.description = description;
10447
10455
  return this;
10448
10456
  }
@@ -10452,14 +10460,17 @@ var AgentBuilder = class _AgentBuilder {
10452
10460
  * @returns This builder instance for chaining
10453
10461
  */
10454
10462
  withInstruction(instruction) {
10463
+ this.warnIfLocked("withInstruction");
10455
10464
  this.config.instruction = instruction;
10456
10465
  return this;
10457
10466
  }
10458
10467
  withInputSchema(schema) {
10468
+ this.warnIfLocked("withInputSchema");
10459
10469
  this.config.inputSchema = schema;
10460
10470
  return this;
10461
10471
  }
10462
10472
  withOutputSchema(schema) {
10473
+ this.warnIfLocked("withOutputSchema");
10463
10474
  this.config.outputSchema = schema;
10464
10475
  return this;
10465
10476
  }
@@ -10469,6 +10480,7 @@ var AgentBuilder = class _AgentBuilder {
10469
10480
  * @returns This builder instance for chaining
10470
10481
  */
10471
10482
  withTools(...tools) {
10483
+ this.warnIfLocked("withTools");
10472
10484
  this.config.tools = [...this.config.tools || [], ...tools];
10473
10485
  return this;
10474
10486
  }
@@ -10478,6 +10490,7 @@ var AgentBuilder = class _AgentBuilder {
10478
10490
  * @returns This builder instance for chaining
10479
10491
  */
10480
10492
  withPlanner(planner) {
10493
+ this.warnIfLocked("withPlanner");
10481
10494
  this.config.planner = planner;
10482
10495
  return this;
10483
10496
  }
@@ -10487,6 +10500,7 @@ var AgentBuilder = class _AgentBuilder {
10487
10500
  * @returns This builder instance for chaining
10488
10501
  */
10489
10502
  withCodeExecutor(codeExecutor) {
10503
+ this.warnIfLocked("withCodeExecutor");
10490
10504
  this.config.codeExecutor = codeExecutor;
10491
10505
  return this;
10492
10506
  }
@@ -10496,6 +10510,7 @@ var AgentBuilder = class _AgentBuilder {
10496
10510
  * @returns This builder instance for chaining
10497
10511
  */
10498
10512
  withOutputKey(outputKey) {
10513
+ this.warnIfLocked("withOutputKey");
10499
10514
  this.config.outputKey = outputKey;
10500
10515
  return this;
10501
10516
  }
@@ -10505,6 +10520,7 @@ var AgentBuilder = class _AgentBuilder {
10505
10520
  * @returns This builder instance for chaining
10506
10521
  */
10507
10522
  withSubAgents(subAgents) {
10523
+ this.warnIfLocked("withSubAgents");
10508
10524
  this.config.subAgents = subAgents;
10509
10525
  return this;
10510
10526
  }
@@ -10514,6 +10530,7 @@ var AgentBuilder = class _AgentBuilder {
10514
10530
  * @returns This builder instance for chaining
10515
10531
  */
10516
10532
  withBeforeAgentCallback(callback) {
10533
+ this.warnIfLocked("withBeforeAgentCallback");
10517
10534
  this.config.beforeAgentCallback = callback;
10518
10535
  return this;
10519
10536
  }
@@ -10523,15 +10540,29 @@ var AgentBuilder = class _AgentBuilder {
10523
10540
  * @returns This builder instance for chaining
10524
10541
  */
10525
10542
  withAfterAgentCallback(callback) {
10543
+ this.warnIfLocked("withAfterAgentCallback");
10526
10544
  this.config.afterAgentCallback = callback;
10527
10545
  return this;
10528
10546
  }
10547
+ /**
10548
+ * Provide an already constructed agent instance. Further definition-mutating calls
10549
+ * (model/tools/instruction/etc.) will be ignored with a dev warning.
10550
+ */
10551
+ withAgent(agent) {
10552
+ this.existingAgent = agent;
10553
+ this.definitionLocked = true;
10554
+ if (this.config.name === "default_agent" && agent.name) {
10555
+ this.config.name = agent.name;
10556
+ }
10557
+ return this;
10558
+ }
10529
10559
  /**
10530
10560
  * Configure as a sequential agent
10531
10561
  * @param subAgents Sub-agents to execute in sequence
10532
10562
  * @returns This builder instance for chaining
10533
10563
  */
10534
10564
  asSequential(subAgents) {
10565
+ this.warnIfLocked("asSequential");
10535
10566
  this.agentType = "sequential";
10536
10567
  this.config.subAgents = subAgents;
10537
10568
  return this;
@@ -10542,6 +10573,7 @@ var AgentBuilder = class _AgentBuilder {
10542
10573
  * @returns This builder instance for chaining
10543
10574
  */
10544
10575
  asParallel(subAgents) {
10576
+ this.warnIfLocked("asParallel");
10545
10577
  this.agentType = "parallel";
10546
10578
  this.config.subAgents = subAgents;
10547
10579
  return this;
@@ -10553,6 +10585,7 @@ var AgentBuilder = class _AgentBuilder {
10553
10585
  * @returns This builder instance for chaining
10554
10586
  */
10555
10587
  asLoop(subAgents, maxIterations = 3) {
10588
+ this.warnIfLocked("asLoop");
10556
10589
  this.agentType = "loop";
10557
10590
  this.config.subAgents = subAgents;
10558
10591
  this.config.maxIterations = maxIterations;
@@ -10565,6 +10598,7 @@ var AgentBuilder = class _AgentBuilder {
10565
10598
  * @returns This builder instance for chaining
10566
10599
  */
10567
10600
  asLangGraph(nodes, rootNode) {
10601
+ this.warnIfLocked("asLangGraph");
10568
10602
  this.agentType = "langgraph";
10569
10603
  this.config.nodes = nodes;
10570
10604
  this.config.rootNode = rootNode;
@@ -10691,6 +10725,7 @@ var AgentBuilder = class _AgentBuilder {
10691
10725
  * @returns Created agent instance
10692
10726
  */
10693
10727
  createAgent() {
10728
+ if (this.existingAgent) return this.existingAgent;
10694
10729
  switch (this.agentType) {
10695
10730
  case "llm": {
10696
10731
  if (!this.config.model) {
@@ -10821,6 +10856,22 @@ var AgentBuilder = class _AgentBuilder {
10821
10856
  }
10822
10857
  };
10823
10858
  }
10859
+ /**
10860
+ * Warn (once per method) if the definition has been locked by withAgent().
10861
+ */
10862
+ warnIfLocked(method) {
10863
+ if (!this.definitionLocked) return;
10864
+ if (this.warnedMethods.has(method)) return;
10865
+ this.warnedMethods.add(method);
10866
+ if (process.env.NODE_ENV !== "production") {
10867
+ const msg = `AgentBuilder: attempted to call ${method} after withAgent(); ignoring. (Wrap the agent first OR configure before withAgent).`;
10868
+ if (this.logger && typeof this.logger.warn === "function") {
10869
+ this.logger.warn(msg);
10870
+ } else {
10871
+ console.warn(msg);
10872
+ }
10873
+ }
10874
+ }
10824
10875
  };
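
Note: withAgent hands the builder an already constructed agent and locks the definition: createAgent simply returns it, the builder adopts agent.name if it still carries the default name, and every later definition-mutating call (withModel, withTools, asSequential, and so on) becomes a no-op that logs a one-time warning outside production via warnIfLocked. A usage sketch, with the export path assumed and prebuiltAgent standing in for any agent instance built elsewhere:

  import { AgentBuilder } from "@iqai/adk"; // export path assumed

  // prebuiltAgent: an agent instance constructed elsewhere (hypothetical placeholder)
  const { runner } = await AgentBuilder.create("support")
    .withAgent(prebuiltAgent)
    .withModel("gpt-4o") // ignored after withAgent(): one dev-mode warning, config unchanged
    .build();

  const answer = await runner.ask("How do I reset my password?");
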
10825
10876
 
10826
10877
  // src/memory/index.ts
@@ -10985,14 +11036,14 @@ var VertexAiSessionService = class extends BaseSessionService {
10985
11036
  async listSessions(appName, userId) {
10986
11037
  const reasoningEngineId = this.getReasoningEngineId(appName);
10987
11038
  const apiClient = this.getApiClient();
10988
- let path2 = `reasoningEngines/${reasoningEngineId}/sessions`;
11039
+ let path3 = `reasoningEngines/${reasoningEngineId}/sessions`;
10989
11040
  if (userId) {
10990
11041
  const parsedUserId = encodeURIComponent(`"${userId}"`);
10991
- path2 = `${path2}?filter=user_id=${parsedUserId}`;
11042
+ path3 = `${path3}?filter=user_id=${parsedUserId}`;
10992
11043
  }
10993
11044
  const apiResponse = await apiClient.async_request({
10994
11045
  http_method: "GET",
10995
- path: path2,
11046
+ path: path3,
10996
11047
  request_dict: {}
10997
11048
  });
10998
11049
  if (apiResponse.httpHeaders) {
@@ -11808,12 +11859,1299 @@ __export(flows_exports, {
11808
11859
  removeClientFunctionCallId: () => removeClientFunctionCallId
11809
11860
  });
11810
11861
 
11862
+ // src/evaluation/index.ts
11863
+ var evaluation_exports = {};
11864
+ __export(evaluation_exports, {
11865
+ AgentEvaluator: () => AgentEvaluator,
11866
+ EvalResult: () => EvalResult,
11867
+ EvalStatus: () => EvalStatus,
11868
+ Evaluator: () => Evaluator,
11869
+ FinalResponseMatchV2Evaluator: () => FinalResponseMatchV2Evaluator,
11870
+ LocalEvalService: () => LocalEvalService,
11871
+ PrebuiltMetrics: () => PrebuiltMetrics,
11872
+ RougeEvaluator: () => RougeEvaluator,
11873
+ SafetyEvaluatorV1: () => SafetyEvaluatorV1,
11874
+ TrajectoryEvaluator: () => TrajectoryEvaluator
11875
+ });
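
Note: the remainder of the diff adds a new src/evaluation module and wraps it in this __export map, so 0.2.0 ships an evaluation surface alongside the flows. Assuming these names are re-exported from the package root (not verifiable from this hunk alone), the consumer-facing imports would be roughly:

  import {
    AgentEvaluator,
    EvalResult,
    EvalStatus,
    Evaluator,
    FinalResponseMatchV2Evaluator,
    LocalEvalService,
    PrebuiltMetrics,
    RougeEvaluator,
    SafetyEvaluatorV1,
    TrajectoryEvaluator,
  } from "@iqai/adk"; // export path assumed
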
11876
+
11877
+ // src/evaluation/evaluator.ts
11878
+ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
11879
+ EvalStatus2[EvalStatus2["PASSED"] = 1] = "PASSED";
11880
+ EvalStatus2[EvalStatus2["FAILED"] = 2] = "FAILED";
11881
+ EvalStatus2[EvalStatus2["NOT_EVALUATED"] = 3] = "NOT_EVALUATED";
11882
+ return EvalStatus2;
11883
+ })(EvalStatus || {});
11884
+ var Evaluator = class {
11885
+ constructor(metric) {
11886
+ this.metric = metric;
11887
+ }
11888
+ static getMetricInfo(metricName) {
11889
+ throw new Error("getMetricInfo() must be implemented by subclass");
11890
+ }
11891
+ };
11892
+
11893
+ // src/evaluation/eval-metrics.ts
11894
+ var PrebuiltMetrics = /* @__PURE__ */ ((PrebuiltMetrics2) => {
11895
+ PrebuiltMetrics2["TOOL_TRAJECTORY_AVG_SCORE"] = "tool_trajectory_avg_score";
11896
+ PrebuiltMetrics2["RESPONSE_EVALUATION_SCORE"] = "response_evaluation_score";
11897
+ PrebuiltMetrics2["RESPONSE_MATCH_SCORE"] = "response_match_score";
11898
+ PrebuiltMetrics2["SAFETY_V1"] = "safety_v1";
11899
+ PrebuiltMetrics2["FINAL_RESPONSE_MATCH_V2"] = "final_response_match_v2";
11900
+ PrebuiltMetrics2["TOOL_TRAJECTORY_SCORE"] = "tool_trajectory_score";
11901
+ PrebuiltMetrics2["SAFETY"] = "safety";
11902
+ PrebuiltMetrics2["RESPONSE_MATCH"] = "response_match";
11903
+ return PrebuiltMetrics2;
11904
+ })(PrebuiltMetrics || {});
11905
+
11906
+ // src/evaluation/eval-result.ts
11907
+ var EvalResult = class {
11908
+ evalSetResultId;
11909
+ evalSetResultName;
11910
+ evalSetId;
11911
+ evalCaseResults;
11912
+ creationTimestamp;
11913
+ constructor(init) {
11914
+ this.evalSetResultId = init.evalSetResultId || "";
11915
+ this.evalSetResultName = init.evalSetResultName;
11916
+ this.evalSetId = init.evalSetId || "";
11917
+ this.evalCaseResults = init.evalCaseResults || [];
11918
+ this.creationTimestamp = init.creationTimestamp || Date.now() / 1e3;
11919
+ }
11920
+ };
11921
+
11922
+ // src/evaluation/agent-evaluator.ts
11923
+ import * as fs2 from "fs/promises";
11924
+ import * as path2 from "path";
11925
+
11926
+ // src/evaluation/base-eval-service.ts
11927
+ var BaseEvalService = class {
11928
+ async *evaluateSession(session) {
11929
+ const inferenceResults = [];
11930
+ for await (const result of this.performInference({
11931
+ evalSetId: session.evalSetId,
11932
+ evalCases: session.evalCases
11933
+ })) {
11934
+ inferenceResults.push(result);
11935
+ }
11936
+ for await (const result of this.evaluate({
11937
+ inferenceResults,
11938
+ evaluateConfig: session.evaluateConfig
11939
+ })) {
11940
+ yield result;
11941
+ }
11942
+ }
11943
+ };
11944
+
11945
+ // src/evaluation/vertex-ai-eval-facade.ts
11946
+ var ERROR_MESSAGE_SUFFIX = `
11947
+ You should specify both project id and location. This metric uses Vertex Gen AI
11948
+ Eval SDK, and it requires google cloud credentials.
11949
+
11950
+ If using an .env file add the values there, or explicitly set in the code using
11951
+ the template below:
11952
+
11953
+ process.env.GOOGLE_CLOUD_LOCATION = <LOCATION>
11954
+ process.env.GOOGLE_CLOUD_PROJECT = <PROJECT ID>
11955
+ `;
11956
+ var VertexAiEvalFacade = class _VertexAiEvalFacade {
11957
+ threshold;
11958
+ metricName;
11959
+ constructor(config) {
11960
+ this.threshold = config.threshold;
11961
+ this.metricName = config.metricName;
11962
+ }
11963
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
11964
+ let totalScore = 0;
11965
+ let numInvocations = 0;
11966
+ const perInvocationResults = [];
11967
+ for (let i = 0; i < actualInvocations.length; i++) {
11968
+ const actual = actualInvocations[i];
11969
+ const expected = expectedInvocations[i];
11970
+ const prompt = this._getText(expected.userContent);
11971
+ const reference = this._getText(expected.finalResponse);
11972
+ const response = this._getText(actual.finalResponse);
11973
+ const evalCase = {
11974
+ prompt,
11975
+ reference,
11976
+ response
11977
+ };
11978
+ try {
11979
+ const evalCaseResult = await _VertexAiEvalFacade._performEval(
11980
+ [evalCase],
11981
+ [this.metricName]
11982
+ );
11983
+ const score = this._getScore(evalCaseResult);
11984
+ perInvocationResults.push({
11985
+ actualInvocation: actual,
11986
+ expectedInvocation: expected,
11987
+ score,
11988
+ evalStatus: this._getEvalStatus(score)
11989
+ });
11990
+ if (score !== null && score !== void 0) {
11991
+ totalScore += score;
11992
+ numInvocations++;
11993
+ }
11994
+ } catch (error) {
11995
+ console.error("Error evaluating invocation:", error);
11996
+ perInvocationResults.push({
11997
+ actualInvocation: actual,
11998
+ expectedInvocation: expected,
11999
+ score: void 0,
12000
+ evalStatus: 3 /* NOT_EVALUATED */
12001
+ });
12002
+ }
12003
+ }
12004
+ if (perInvocationResults.length > 0) {
12005
+ const overallScore = numInvocations > 0 ? totalScore / numInvocations : void 0;
12006
+ return {
12007
+ overallScore,
12008
+ overallEvalStatus: this._getEvalStatus(overallScore),
12009
+ perInvocationResults
12010
+ };
12011
+ }
12012
+ return {
12013
+ overallScore: void 0,
12014
+ overallEvalStatus: 3 /* NOT_EVALUATED */,
12015
+ perInvocationResults: []
12016
+ };
12017
+ }
12018
+ _getText(content) {
12019
+ if (content?.parts) {
12020
+ return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
12021
+ }
12022
+ return "";
12023
+ }
12024
+ _getScore(evalResult) {
12025
+ if (evalResult?.summaryMetrics?.[0]?.meanScore !== void 0 && typeof evalResult.summaryMetrics[0].meanScore === "number" && !Number.isNaN(evalResult.summaryMetrics[0].meanScore)) {
12026
+ return evalResult.summaryMetrics[0].meanScore;
12027
+ }
12028
+ return void 0;
12029
+ }
12030
+ _getEvalStatus(score) {
12031
+ if (score !== null && score !== void 0) {
12032
+ return score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
12033
+ }
12034
+ return 3 /* NOT_EVALUATED */;
12035
+ }
12036
+ static async _performEval(dataset, metrics) {
12037
+ const projectId = process.env.GOOGLE_CLOUD_PROJECT;
12038
+ const location = process.env.GOOGLE_CLOUD_LOCATION;
12039
+ if (!projectId) {
12040
+ throw new Error(`Missing project id. ${ERROR_MESSAGE_SUFFIX}`);
12041
+ }
12042
+ if (!location) {
12043
+ throw new Error(`Missing location. ${ERROR_MESSAGE_SUFFIX}`);
12044
+ }
12045
+ console.warn(
12046
+ "Vertex AI evaluation is not fully implemented. Using mock response."
12047
+ );
12048
+ return {
12049
+ summaryMetrics: [
12050
+ {
12051
+ meanScore: Math.random() * 0.5 + 0.5
12052
+ }
12053
+ ]
12054
+ };
12055
+ }
12056
+ };
12057
+
12058
+ // src/evaluation/response-evaluator.ts
12059
+ var ResponseEvaluator = class extends Evaluator {
12060
+ metricName;
12061
+ threshold;
12062
+ constructor(evalMetric) {
12063
+ super(evalMetric);
12064
+ if (evalMetric.metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
12065
+ this.metricName = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
12066
+ } else if (evalMetric.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
12067
+ this.metricName = "response_match_score" /* RESPONSE_MATCH_SCORE */;
12068
+ } else {
12069
+ throw new Error(`Metric ${evalMetric.metricName} is not supported.`);
12070
+ }
12071
+ this.threshold = evalMetric.threshold;
12072
+ }
12073
+ static getMetricInfo(metricName) {
12074
+ if (metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
12075
+ return {
12076
+ metricName: "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */,
12077
+ description: "This metric evaluates how coherent agent's response was. Value range of this metric is [1,5], with values closer to 5 more desirable.",
12078
+ metricValueInfo: {
12079
+ interval: {
12080
+ minValue: 1,
12081
+ maxValue: 5,
12082
+ openAtMin: false,
12083
+ openAtMax: false
12084
+ }
12085
+ }
12086
+ };
12087
+ }
12088
+ if (metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
12089
+ return {
12090
+ metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
12091
+ description: "This metric evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
12092
+ metricValueInfo: {
12093
+ interval: {
12094
+ minValue: 0,
12095
+ maxValue: 1,
12096
+ openAtMin: false,
12097
+ openAtMax: false
12098
+ }
12099
+ }
12100
+ };
12101
+ }
12102
+ throw new Error(`Metric ${metricName} is not supported.`);
12103
+ }
12104
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
12105
+ if (this.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
12106
+ return this.evaluateRougeScore(actualInvocations, expectedInvocations);
12107
+ }
12108
+ const vertexAiFacade = new VertexAiEvalFacade({
12109
+ threshold: this.threshold,
12110
+ metricName: this.metricName
12111
+ });
12112
+ return vertexAiFacade.evaluateInvocations(
12113
+ actualInvocations,
12114
+ expectedInvocations
12115
+ );
12116
+ }
12117
+ async evaluateRougeScore(actualInvocations, expectedInvocations) {
12118
+ if (actualInvocations.length !== expectedInvocations.length) {
12119
+ throw new Error("Number of actual and expected invocations must match");
12120
+ }
12121
+ const results = [];
12122
+ for (let i = 0; i < actualInvocations.length; i++) {
12123
+ const actual = actualInvocations[i];
12124
+ const expected = expectedInvocations[i];
12125
+ const result = await this.evaluateInvocation(actual, expected);
12126
+ results.push(result);
12127
+ }
12128
+ const scores = results.map((r) => r.score).filter((s) => s !== void 0);
12129
+ const overallScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
12130
+ const overallStatus = overallScore !== void 0 && overallScore >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
12131
+ return {
12132
+ overallScore,
12133
+ overallEvalStatus: overallStatus,
12134
+ perInvocationResults: results
12135
+ };
12136
+ }
12137
+ async evaluateInvocation(actual, expected) {
12138
+ if (!actual.finalResponse || !expected.finalResponse) {
12139
+ return {
12140
+ actualInvocation: actual,
12141
+ expectedInvocation: expected,
12142
+ evalStatus: 3 /* NOT_EVALUATED */
12143
+ };
12144
+ }
12145
+ const score = await this.computeRougeScore(
12146
+ actual.finalResponse,
12147
+ expected.finalResponse
12148
+ );
12149
+ return {
12150
+ actualInvocation: actual,
12151
+ expectedInvocation: expected,
12152
+ score,
12153
+ evalStatus: score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */
12154
+ };
12155
+ }
12156
+ async computeRougeScore(actual, expected) {
12157
+ const actualText = this.extractText(actual);
12158
+ const expectedText = this.extractText(expected);
12159
+ if (!actualText.trim() || !expectedText.trim()) {
12160
+ return 0;
12161
+ }
12162
+ const actualTokens = this.tokenizeText(actualText);
12163
+ const expectedTokens = this.tokenizeText(expectedText);
12164
+ const actualUnigrams = new Set(actualTokens);
12165
+ const expectedUnigrams = new Set(expectedTokens);
12166
+ const commonUnigrams = new Set(
12167
+ [...actualUnigrams].filter((token) => expectedUnigrams.has(token))
12168
+ );
12169
+ const precision = actualUnigrams.size > 0 ? commonUnigrams.size / actualUnigrams.size : 0;
12170
+ const recall = expectedUnigrams.size > 0 ? commonUnigrams.size / expectedUnigrams.size : 0;
12171
+ const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
12172
+ return fmeasure;
12173
+ }
12174
+ extractText(content) {
12175
+ if (content?.parts) {
12176
+ return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join(" ");
12177
+ }
12178
+ return "";
12179
+ }
12180
+ tokenizeText(text) {
12181
+ return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
12182
+ }
12183
+ };
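
Note: computeRougeScore above is a ROUGE-1 style F-measure over sets of lowercased word tokens, so repeated words count once: precision = |overlap| / |actual unigrams|, recall = |overlap| / |expected unigrams|, F = 2PR / (P + R). A worked example under those definitions:

  actual:   "The cat sat on the mat."  ->  unigram set {the, cat, sat, on, mat}  (5)
  expected: "A cat sat on a mat."      ->  unigram set {a, cat, sat, on, mat}    (5)
  overlap = {cat, sat, on, mat}  (4)
  precision = 4/5 = 0.8,  recall = 4/5 = 0.8
  F = 2 * 0.8 * 0.8 / (0.8 + 0.8) = 0.8  ->  PASSED at the default 0.8 response_match_score threshold
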
12184
+
12185
+ // src/evaluation/trajectory-evaluator.ts
12186
+ var TrajectoryEvaluator = class extends Evaluator {
12187
+ static getMetricInfo() {
12188
+ return {
12189
+ metricName: "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */,
12190
+ description: "This metric compares two tool call trajectories (expected vs. actual) for the same user interaction. It performs an exact match on the tool name and arguments for each step in the trajectory. A score of 1.0 indicates a perfect match, while 0.0 indicates a mismatch. Higher values are better.",
12191
+ metricValueInfo: {
12192
+ interval: {
12193
+ minValue: 0,
12194
+ maxValue: 1,
12195
+ openAtMin: false,
12196
+ openAtMax: false
12197
+ }
12198
+ }
12199
+ };
12200
+ }
12201
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
12202
+ let totalToolUseAccuracy = 0;
12203
+ let numInvocations = 0;
12204
+ const perInvocationResults = [];
12205
+ for (let i = 0; i < actualInvocations.length; i++) {
12206
+ const actual = actualInvocations[i];
12207
+ const expected = expectedInvocations[i];
12208
+ if (!actual.intermediateData?.toolUses || !expected.intermediateData?.toolUses) {
12209
+ perInvocationResults.push({
12210
+ actualInvocation: actual,
12211
+ expectedInvocation: expected,
12212
+ evalStatus: 3 /* NOT_EVALUATED */
12213
+ });
12214
+ continue;
12215
+ }
12216
+ const toolUseAccuracy = this.areToolCallsEqual(
12217
+ actual.intermediateData.toolUses,
12218
+ expected.intermediateData.toolUses
12219
+ ) ? 1 : 0;
12220
+ perInvocationResults.push({
12221
+ actualInvocation: actual,
12222
+ expectedInvocation: expected,
12223
+ score: toolUseAccuracy,
12224
+ evalStatus: toolUseAccuracy >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */
12225
+ });
12226
+ totalToolUseAccuracy += toolUseAccuracy;
12227
+ numInvocations++;
12228
+ }
12229
+ const overallScore = numInvocations > 0 ? totalToolUseAccuracy / numInvocations : 0;
12230
+ return {
12231
+ overallScore,
12232
+ overallEvalStatus: overallScore >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */,
12233
+ perInvocationResults
12234
+ };
12235
+ }
12236
+ areToolCallsEqual(actual, expected) {
12237
+ if (actual.length !== expected.length) {
12238
+ return false;
12239
+ }
12240
+ return actual.every((actualCall, index) => {
12241
+ const expectedCall = expected[index];
12242
+ return this.isToolCallEqual(actualCall, expectedCall);
12243
+ });
12244
+ }
12245
+ isToolCallEqual(actual, expected) {
12246
+ if (actual.name !== expected.name) {
12247
+ return false;
12248
+ }
12249
+ const actualArgs = actual.args || {};
12250
+ const expectedArgs = expected.args || {};
12251
+ const actualKeys = Object.keys(actualArgs).sort();
12252
+ const expectedKeys = Object.keys(expectedArgs).sort();
12253
+ if (actualKeys.length !== expectedKeys.length) {
12254
+ return false;
12255
+ }
12256
+ return actualKeys.every((key, index) => {
12257
+ const expectedKey = expectedKeys[index];
12258
+ if (key !== expectedKey) {
12259
+ return false;
12260
+ }
12261
+ return JSON.stringify(actualArgs[key]) === JSON.stringify(expectedArgs[key]);
12262
+ });
12263
+ }
12264
+ };
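
Note: TrajectoryEvaluator is intentionally strict: an invocation scores 1 only when the actual tool-call list matches the expected one in length, order, tool names, and per-key JSON-stringified arguments, and 0 otherwise; the overall score is the mean across invocations. With hypothetical tool calls:

  expected: [{ name: "get_weather", args: { city: "Berlin" } }]
  actual:   [{ name: "get_weather", args: { city: "berlin" } }]
  -> names match, but JSON.stringify("Berlin") !== JSON.stringify("berlin"),
     so the invocation scores 0 and fails the default threshold of 1.
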
12265
+
12266
+ // src/evaluation/safety-evaluator.ts
12267
+ var SafetyEvaluatorV1 = class extends Evaluator {
12268
+ static getMetricInfo() {
12269
+ return {
12270
+ metricName: "safety_v1" /* SAFETY_V1 */,
12271
+ description: "This metric evaluates the safety (harmlessness) of an Agent's Response. Value range of the metric is [0, 1], with values closer to 1 to be more desirable (safe).",
12272
+ metricValueInfo: {
12273
+ interval: {
12274
+ minValue: 0,
12275
+ maxValue: 1,
12276
+ openAtMin: false,
12277
+ openAtMax: false
12278
+ }
12279
+ }
12280
+ };
12281
+ }
12282
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
12283
+ const facade = new VertexAiEvalFacade({
12284
+ threshold: this.metric.threshold,
12285
+ metricName: "safety_v1" /* SAFETY_V1 */
12286
+ });
12287
+ return await facade.evaluateInvocations(
12288
+ actualInvocations,
12289
+ expectedInvocations
12290
+ );
12291
+ }
12292
+ };
12293
+
12294
+ // src/evaluation/llm-as-judge-utils.ts
12295
+ function getTextFromContent(content) {
12296
+ if (content?.parts) {
12297
+ return content.parts.map((part) => part.text).filter(Boolean).join("\n");
12298
+ }
12299
+ return "";
12300
+ }
12301
+ function getEvalStatus(score, threshold) {
12302
+ return score >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
12303
+ }
12304
+
12305
+ // src/evaluation/llm-as-judge.ts
12306
+ var LlmAsJudge = class {
12307
+ async sampleJudge(prompt, numSamples, critiqueParser, judgeModelOptions) {
12308
+ const modelName = judgeModelOptions?.judgeModel || "gemini-2.5-flash";
12309
+ const model = LLMRegistry.getModelOrCreate(modelName);
12310
+ const config = judgeModelOptions?.judgeModelConfig || {};
12311
+ const samples = [];
12312
+ for (let i = 0; i < numSamples; i++) {
12313
+ try {
12314
+ const response = await model.generateContent({
12315
+ prompt,
12316
+ ...config
12317
+ });
12318
+ const label = critiqueParser(response.text);
12319
+ if (label !== "not_found" /* NOT_FOUND */) {
12320
+ samples.push(label);
12321
+ }
12322
+ } catch (error) {
12323
+ console.error("Error sampling judge model:", error);
12324
+ }
12325
+ }
12326
+ return samples;
12327
+ }
12328
+ };
12329
+
12330
+ // src/evaluation/final-response-match-v2.ts
12331
+ var FINAL_RESPONSE_MATCH_V2_PROMPT = `You are an expert rater for an AI agent. The AI agent is going to call an API to answer the user query and generate API tool use code based for the choice of the API and API arguments. The ideal model response should be a function call that fulfills user query, or a natural language response hedges or asks users for further clarification if a function call does not apply.
12332
+ The primary focus of this rating task is to check correctness of the model responses.
12333
+
12334
+ The data consists of:
12335
+ - A user query.
12336
+ - A model generated response for the prompt. The responses can consist of:
12337
+ - Natural language, when the model is asking for clarification, or tells the user it does not possess the requested functionality / option.
12338
+ - Code, in the form of one or multiple python function calls, and additional code as needed, for when the model is fulfilling the user request.
12339
+ You can use the help from a reference response annotated by a human rater. This reference response is of high quality. You can compare the agent's response with the reference response and decide if the agent's response is valid.
12340
+ Note sometimes the reference response only contains the key entities of the correct answer and you need to be flexible to allow the agent response to contain more information than the reference response, or to present the key entities in a different format or structure or in shorter or longer format.
12341
+ When the agent response is provided in the form of tables/dataframes or should be best provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response. Likewise, if you have the reference response, then find out the key entities and main components in them and check whether you can retrieve those from the agent response. If the prompt does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
12342
+
12343
+ You should follow the constitutions below very carefully to rate the model response:
12344
+ - Allow flexibility of format even when reference code only uses one of the possible format, unless API spec or user prompt has explicit format requirement
12345
+ - e.g. For state name, allow both abbreviation and full name unless API spec has explicit requirement. e.g. both 'tx' and 'Texas' should be allowed in the agent response even when reference code only uses one of them.
12346
+ - e.g. If a reference response list outputs in a list format, the agent response is allowed to use sentence format and vice versa unless user prompt explicitly asks for a specific format.
12347
+ - e.g. For numbers, allow flexibility of formatting, e.g. 1000000 vs 1,000,000.
12348
+ - The model shouldn't assume that it doesn't have access to according data or incapable of answering the question if reference response is able to find a legit answer.
12349
+ - If the model response contains the correct final answer, rate it as valid even when the model response contains more information than the reference response.
12350
+ - If the user prompt has csv or other table format data, don't read it yourself. Trust the reference response final answer instead.
12351
+ - When the validation needs maths, date calculations, do not use your own calculator. Trust the reference response final answer instead.
12352
+ - Be mindful about unit of numbers. For example, if the reference response says 100 miles, but the model response says 100 km, it is invalid.
12353
+ - When the agent response or the reference response is provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response and whether those match the reference response. If the user query does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
12354
+ - When the answer is in numeric format, check whether there are any format requirements in the numeric format, rounding, precision, number of decimals, etc. specified in the user query and the prompt. If there are no such instructions, then tolerate different numerical formats.
12355
+ - When the answer is in numeric format and there are rounding or precision differences between the agent response and the reference response, if no further instructions are provided evaluate if the rounding strategy or precision in the agent response follows the standards for that entity. For instance, model accuracy scores must be reported with at least two decimal places (e.g., 0.798 \u2192 0.80 is acceptable, but 0.7 is not).
12356
+
12357
+ Below are the inputs:
12358
+ {{
12359
+ "User prompt": {prompt},
12360
+ "Agent response": {response},
12361
+ "Reference response": {golden_response},
12362
+ }}
12363
+
12364
+ The answer should be a json alone which follows the json structure below:
12365
+ {{
12366
+ "reasoning": [reasoning],
12367
+ "is_the_agent_response_valid": [valid or invalid],
12368
+ }}
12369
+ Answer with assertiveness:
12370
+ `;
12371
+ var DEFAULT_NUM_SAMPLES = 5;
12372
+ function parseCritique(response) {
12373
+ const labelMatchIsResponseValid = response.match(
12374
+ /"is_the_agent_response_valid":\s*\[*[\n\s]*"*([^"^\]^\s]*)"*[\n\s]*\]*\s*[,\n\}]/
12375
+ );
12376
+ if (labelMatchIsResponseValid?.[1]) {
12377
+ const label = labelMatchIsResponseValid[1].toLowerCase();
12378
+ return label === "valid" ? "valid" /* VALID */ : "invalid" /* INVALID */;
12379
+ }
12380
+ return "not_found" /* NOT_FOUND */;
12381
+ }
12382
+ var FinalResponseMatchV2Evaluator = class extends Evaluator {
12383
+ constructor(evalMetric, llmAsJudge = new LlmAsJudge()) {
12384
+ super(evalMetric);
12385
+ this.llmAsJudge = llmAsJudge;
12386
+ }
12387
+ static getMetricInfo() {
12388
+ return {
12389
+ metricName: "final_response_match_v2" /* FINAL_RESPONSE_MATCH_V2 */,
12390
+ description: "This metric evaluates if the agent's final response matches a golden/expected final response using an LLM judge. Value range for this metric is [0,1], with values closer to 1 more desirable.",
12391
+ metricValueInfo: {
12392
+ interval: {
12393
+ minValue: 0,
12394
+ maxValue: 1,
12395
+ openAtMin: false,
12396
+ openAtMax: false
12397
+ }
12398
+ }
12399
+ };
12400
+ }
12401
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
12402
+ const perInvocationResults = [];
12403
+ let totalScore = 0;
12404
+ let numInvocations = 0;
12405
+ if (!actualInvocations.length) {
12406
+ return {
12407
+ overallEvalStatus: 3 /* NOT_EVALUATED */,
12408
+ perInvocationResults: []
12409
+ };
12410
+ }
12411
+ for (let i = 0; i < actualInvocations.length; i++) {
12412
+ const actual = actualInvocations[i];
12413
+ const expected = expectedInvocations[i];
12414
+ const prompt = getTextFromContent(expected.userContent);
12415
+ const response = getTextFromContent(actual.finalResponse);
12416
+ const goldenResponse = getTextFromContent(expected.finalResponse);
12417
+ const formattedPrompt = FINAL_RESPONSE_MATCH_V2_PROMPT.replace(
12418
+ "{prompt}",
12419
+ prompt
12420
+ ).replace("{response}", response).replace("{golden_response}", goldenResponse);
12421
+ const numSamples = this.metric.judgeModelOptions?.numSamples ?? DEFAULT_NUM_SAMPLES;
12422
+ const labels = await this.llmAsJudge.sampleJudge(
12423
+ formattedPrompt,
12424
+ numSamples,
12425
+ parseCritique,
12426
+ this.metric.judgeModelOptions
12427
+ );
12428
+ const score = labels.filter((l) => l === "valid" /* VALID */).length / labels.length;
12429
+ perInvocationResults.push({
12430
+ actualInvocation: actual,
12431
+ expectedInvocation: expected,
12432
+ score,
12433
+ evalStatus: getEvalStatus(score, this.metric.threshold)
12434
+ });
12435
+ totalScore += score;
12436
+ numInvocations++;
12437
+ }
12438
+ const overallScore = totalScore / numInvocations;
12439
+ return {
12440
+ overallScore,
12441
+ overallEvalStatus: getEvalStatus(overallScore, this.metric.threshold),
12442
+ perInvocationResults
12443
+ };
12444
+ }
12445
+ };
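
Note: FinalResponseMatchV2Evaluator fills the judge prompt with the user prompt, the agent response, and the golden response, asks the judge model (resolved through LLMRegistry.getModelOrCreate, default "gemini-2.5-flash") for numSamples critiques (default 5), parses each critique's is_the_agent_response_valid field with the regex above, and scores the invocation as the fraction of parsed labels that are valid. The judge path reads response.text, which is presumably why LlmResponse gains a text field in the first hunk of this diff. The scoring arithmetic, with hypothetical sample labels:

  5 samples parsed as: valid, valid, invalid, valid, not_found
  not_found samples are discarded by sampleJudge, leaving 4 usable labels
  score = 3 valid / 4 labels = 0.75  ->  PASSED whenever threshold <= 0.75
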
12446
+
12447
+ // src/evaluation/metric-evaluator-registry.ts
12448
+ var MetricEvaluatorRegistry = class {
12449
+ registry = /* @__PURE__ */ new Map();
12450
+ getEvaluator(evalMetric) {
12451
+ const entry = this.registry.get(evalMetric.metricName);
12452
+ if (!entry) {
12453
+ throw new Error(`${evalMetric.metricName} not found in registry.`);
12454
+ }
12455
+ return new entry.evaluator(evalMetric);
12456
+ }
12457
+ registerEvaluator(metricInfo, evaluator) {
12458
+ const metricName = metricInfo.metricName;
12459
+ if (this.registry.has(metricName)) {
12460
+ console.info(
12461
+ `Updating Evaluator class for ${metricName} from ${this.registry.get(metricName)?.evaluator.name} to ${evaluator.name}`
12462
+ );
12463
+ }
12464
+ this.registry.set(metricName, {
12465
+ evaluator,
12466
+ metricInfo: { ...metricInfo }
12467
+ });
12468
+ }
12469
+ getRegisteredMetrics() {
12470
+ return Array.from(this.registry.values()).map((entry) => ({
12471
+ ...entry.metricInfo
12472
+ }));
12473
+ }
12474
+ };
12475
+ function getDefaultMetricEvaluatorRegistry() {
12476
+ const registry = new MetricEvaluatorRegistry();
12477
+ registry.registerEvaluator(
12478
+ TrajectoryEvaluator.getMetricInfo(),
12479
+ TrajectoryEvaluator
12480
+ );
12481
+ registry.registerEvaluator(
12482
+ ResponseEvaluator.getMetricInfo("response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */),
12483
+ ResponseEvaluator
12484
+ );
12485
+ registry.registerEvaluator(
12486
+ ResponseEvaluator.getMetricInfo("response_match_score" /* RESPONSE_MATCH_SCORE */),
12487
+ ResponseEvaluator
12488
+ );
12489
+ registry.registerEvaluator(
12490
+ SafetyEvaluatorV1.getMetricInfo(),
12491
+ SafetyEvaluatorV1
12492
+ );
12493
+ registry.registerEvaluator(
12494
+ FinalResponseMatchV2Evaluator.getMetricInfo(),
12495
+ FinalResponseMatchV2Evaluator
12496
+ );
12497
+ return registry;
12498
+ }
12499
+ var DEFAULT_METRIC_EVALUATOR_REGISTRY = getDefaultMetricEvaluatorRegistry();
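
Note: getDefaultMetricEvaluatorRegistry maps each prebuilt metric name to its evaluator class; registerEvaluator on the same registry replaces an existing entry (logging an info line) or adds a new one, and getEvaluator instantiates the registered class with the requested evalMetric. A hedged sketch of a custom registration, noting that neither MetricEvaluatorRegistry nor DEFAULT_METRIC_EVALUATOR_REGISTRY appears in the __export map above, so this may only be reachable from inside the package; AlwaysPassEvaluator is hypothetical:

  // Toy metric: every invocation passes.
  class AlwaysPassEvaluator extends Evaluator {
    static getMetricInfo() {
      return {
        metricName: "always_pass",
        description: "Toy metric that always passes.",
        metricValueInfo: {
          interval: { minValue: 0, maxValue: 1, openAtMin: false, openAtMax: false },
        },
      };
    }
    async evaluateInvocations() {
      return { overallScore: 1, overallEvalStatus: EvalStatus.PASSED, perInvocationResults: [] };
    }
  }

  DEFAULT_METRIC_EVALUATOR_REGISTRY.registerEvaluator(
    AlwaysPassEvaluator.getMetricInfo(),
    AlwaysPassEvaluator
  );
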
12500
+
12501
+ // src/evaluation/local-eval-service.ts
12502
+ var LocalEvalService = class extends BaseEvalService {
12503
+ constructor(agent, parallelism = 4) {
12504
+ super();
12505
+ this.agent = agent;
12506
+ this.parallelism = parallelism;
12507
+ this.initializeRunner();
12508
+ }
12509
+ runner;
12510
+ async initializeRunner() {
12511
+ if ("ask" in this.agent) {
12512
+ this.runner = this.agent;
12513
+ } else {
12514
+ try {
12515
+ const { runner } = await AgentBuilder.create("eval_agent").withModel("gemini-2.5-flash").withDescription("Agent for evaluation purposes").build();
12516
+ this.runner = {
12517
+ ask: async (message) => {
12518
+ return await runner.ask(message);
12519
+ }
12520
+ };
12521
+ } catch (error) {
12522
+ console.warn(
12523
+ "Failed to create AgentBuilder runner, falling back to mock:",
12524
+ error
12525
+ );
12526
+ this.runner = {
12527
+ ask: async (message) => {
12528
+ return `Mock response to: ${message}`;
12529
+ }
12530
+ };
12531
+ }
12532
+ }
12533
+ }
12534
+ async *performInference(request) {
12535
+ for (const evalSet of request.evalCases) {
12536
+ for (const evalCase of evalSet.evalCases) {
12537
+ const expected = [];
12538
+ for (const convo of evalCase.conversation) {
12539
+ if (convo.finalResponse) {
12540
+ expected.push({
12541
+ invocationId: `${evalCase.evalId}-expected-${expected.length}`,
12542
+ userContent: convo.userContent,
12543
+ finalResponse: convo.finalResponse,
12544
+ intermediateData: convo.intermediateData,
12545
+ creationTimestamp: convo.creationTimestamp
12546
+ });
12547
+ }
12548
+ }
12549
+ const actual = await this.runInference(evalCase);
12550
+ yield [...expected, ...actual];
12551
+ }
12552
+ }
12553
+ }
12554
+ async *evaluate(request) {
12555
+ const { inferenceResults, evaluateConfig } = request;
12556
+ const resultsByCase = /* @__PURE__ */ new Map();
12557
+ for (const result of inferenceResults) {
12558
+ const invocationId = result[0].invocationId;
12559
+ if (!invocationId) continue;
12560
+ const lastHyphenIndex = invocationId.lastIndexOf("-");
12561
+ const evalId = lastHyphenIndex !== -1 ? invocationId.substring(0, lastHyphenIndex) : invocationId;
12562
+ const existing = resultsByCase.get(evalId) || [];
12563
+ resultsByCase.set(evalId, [...existing, ...result]);
12564
+ }
12565
+ for (const [evalId, results] of resultsByCase) {
12566
+ const evalResult = {
12567
+ evalSetResultId: `${evalId}-result-${Date.now()}`,
12568
+ evalSetId: evalId,
12569
+ evalCaseResults: [],
12570
+ creationTimestamp: Date.now()
12571
+ };
12572
+ for (const evalMetric of evaluateConfig.evalMetrics) {
12573
+ const evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.getEvaluator(evalMetric);
12574
+ const actual = results.filter(
12575
+ (r) => !r.invocationId?.includes("expected")
12576
+ );
12577
+ const expected = results.filter(
12578
+ (r) => r.invocationId?.includes("expected")
12579
+ );
12580
+ const result = await evaluator.evaluateInvocations(actual, expected);
12581
+ evalResult.evalCaseResults.push({
12582
+ evalSetId: evalId,
12583
+ evalId,
12584
+ finalEvalStatus: result.perInvocationResults.length > 0 ? result.perInvocationResults[0].evalStatus : 3 /* NOT_EVALUATED */,
12585
+ overallEvalMetricResults: [],
12586
+ sessionId: evalId,
12587
+ evalMetricResultPerInvocation: result.perInvocationResults.map(
12588
+ (r) => ({
12589
+ actualInvocation: r.actualInvocation,
12590
+ expectedInvocation: r.expectedInvocation,
12591
+ evalMetricResults: [
12592
+ {
12593
+ metricName: evalMetric.metricName,
12594
+ threshold: evalMetric.threshold,
12595
+ score: r.score,
12596
+ evalStatus: r.evalStatus
12597
+ }
12598
+ ]
12599
+ })
12600
+ )
12601
+ });
12602
+ }
12603
+ yield evalResult;
12604
+ }
12605
+ }
12606
+ async runInference(evalCase) {
12607
+ const results = [];
12608
+ if (!this.runner) {
12609
+ await this.initializeRunner();
12610
+ }
12611
+ if (evalCase.sessionInput) {
12612
+ try {
12613
+ if (this.runner.initializeSession) {
12614
+ await this.runner.initializeSession(evalCase.sessionInput);
12615
+ } else if (this.runner.setSessionState) {
12616
+ await this.runner.setSessionState(evalCase.sessionInput);
12617
+ } else {
12618
+ console.log(
12619
+ `Session input provided for ${evalCase.evalId}:`,
12620
+ evalCase.sessionInput
12621
+ );
12622
+ }
12623
+ } catch (error) {
12624
+ console.warn(
12625
+ `Failed to initialize session for ${evalCase.evalId}:`,
12626
+ error
12627
+ );
12628
+ }
12629
+ }
12630
+ for (const invocation of evalCase.conversation) {
12631
+ try {
12632
+ const response = await this.runner.ask(invocation.userContent);
12633
+ results.push({
12634
+ invocationId: `${evalCase.evalId}-${results.length}`,
12635
+ userContent: invocation.userContent,
12636
+ finalResponse: {
12637
+ role: "model",
12638
+ parts: [{ text: response || "" }]
12639
+ },
12640
+ intermediateData: {
12641
+ toolUses: [],
12642
+ intermediateResponses: []
12643
+ },
12644
+ creationTimestamp: Date.now()
12645
+ });
12646
+ } catch (error) {
12647
+ console.error(`Error running inference for ${evalCase.evalId}:`, error);
12648
+ results.push({
12649
+ invocationId: `${evalCase.evalId}-${results.length}`,
12650
+ userContent: invocation.userContent,
12651
+ finalResponse: {
12652
+ role: "model",
12653
+ parts: [
12654
+ {
12655
+ text: `Error: ${error instanceof Error ? error.message : "Unknown error"}`
12656
+ }
12657
+ ]
12658
+ },
12659
+ intermediateData: {
12660
+ toolUses: [],
12661
+ intermediateResponses: []
12662
+ },
12663
+ creationTimestamp: Date.now()
12664
+ });
12665
+ }
12666
+ }
12667
+ return results;
12668
+ }
12669
+ };
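
Note: LocalEvalService runs inference through whatever exposes ask() (the passed agent, or a fallback AgentBuilder runner), tags expected invocations with an -expected- id suffix, then groups results by eval id and applies every configured metric via the default registry. A minimal usage sketch, assuming the request shapes implied by performInference and evaluate above, with myAgent and mySmokeEvalSet as hypothetical placeholders:

  const service = new LocalEvalService(myAgent); // myAgent exposes ask(message)

  for await (const result of service.evaluateSession({
    evalSetId: "smoke",
    evalCases: [mySmokeEvalSet], // eval sets, each with its own .evalCases array
    evaluateConfig: {
      evalMetrics: [{ metricName: "response_match_score", threshold: 0.8 }],
    },
  })) {
    console.log(result.evalCaseResults);
  }
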
12670
+
12671
+ // src/evaluation/agent-evaluator.ts
12672
+ var NUM_RUNS = 2;
12673
+ var TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */;
12674
+ var RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
12675
+ var RESPONSE_MATCH_SCORE_KEY = "response_match_score" /* RESPONSE_MATCH_SCORE */;
12676
+ var SAFETY_V1_KEY = "safety_v1" /* SAFETY_V1 */;
12677
+ var ALLOWED_CRITERIA = [
12678
+ TOOL_TRAJECTORY_SCORE_KEY,
12679
+ RESPONSE_EVALUATION_SCORE_KEY,
12680
+ RESPONSE_MATCH_SCORE_KEY,
12681
+ SAFETY_V1_KEY
12682
+ ];
12683
+ var QUERY_COLUMN = "query";
12684
+ var REFERENCE_COLUMN = "reference";
12685
+ var EXPECTED_TOOL_USE_COLUMN = "expected_tool_use";
12686
+ var DEFAULT_CRITERIA = {
12687
+ [TOOL_TRAJECTORY_SCORE_KEY]: 1,
12688
+ [RESPONSE_MATCH_SCORE_KEY]: 0.8
12689
+ };
12690
+ var loadJson = async (filePath) => {
12691
+ try {
12692
+ const fileContent = await fs2.readFile(filePath, "utf-8");
12693
+ return JSON.parse(fileContent);
12694
+ } catch (error) {
12695
+ throw new Error(`Failed to load JSON from ${filePath}: ${error}`);
12696
+ }
12697
+ };
12698
+ var AgentEvaluator = class _AgentEvaluator {
12699
+ static async findConfigForTestFile(testFile) {
12700
+ const testFolder = path2.dirname(testFile);
12701
+ const configPath = path2.join(testFolder, "test_config.json");
12702
+ try {
12703
+ await fs2.access(configPath);
12704
+ const configData = await loadJson(configPath);
12705
+ if ("criteria" in configData && typeof configData.criteria === "object") {
12706
+ return configData.criteria;
12707
+ }
12708
+ throw new Error(
12709
+ `Invalid format for test_config.json at ${configPath}. Expected a 'criteria' dictionary.`
12710
+ );
12711
+ } catch (error) {
12712
+ return DEFAULT_CRITERIA;
12713
+ }
12714
+ }
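
Note: findConfigForTestFile looks for a test_config.json next to each *.test.json file and expects a top-level criteria object mapping metric names to thresholds; if the file is missing or malformed it silently falls back to DEFAULT_CRITERIA. A config equivalent to those defaults would look like:

  {
    "criteria": {
      "tool_trajectory_avg_score": 1.0,
      "response_match_score": 0.8
    }
  }
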
12715
+ static async evaluateEvalSet(agent, evalSet, criteria, numRuns = NUM_RUNS, printDetailedResults = false) {
12716
+ const evalMetrics = Object.entries(criteria).map(
12717
+ ([metricName, threshold]) => ({
12718
+ metricName,
12719
+ threshold
12720
+ })
12721
+ );
12722
+ const evalResultsByEvalId = await _AgentEvaluator._getEvalResultsByEvalId(
12723
+ agent,
12724
+ evalSet,
12725
+ evalMetrics,
12726
+ numRuns
12727
+ );
12728
+ const failures = [];
12729
+ for (const [_, evalResultsPerEvalId] of evalResultsByEvalId) {
12730
+ const evalMetricResults = _AgentEvaluator._getEvalMetricResultsWithInvocation(
12731
+ evalResultsPerEvalId
12732
+ );
12733
+ const failuresPerEvalCase = _AgentEvaluator._processMetricsAndGetFailures(
12734
+ evalMetricResults,
12735
+ printDetailedResults,
12736
+ agent.name || "Unknown Agent"
12737
+ );
12738
+ failures.push(...failuresPerEvalCase);
12739
+ }
12740
+ if (failures.length > 0) {
12741
+ throw new Error(
12742
+ `The following are all the test failures. If you are looking to get more details on the failures, please re-run this test with \`printDetailedResults\` set to \`true\`.
12743
+ ${failures.join(
12744
+ "\n"
12745
+ )}`
12746
+ );
12747
+ }
12748
+ }
12749
+ static async evaluate(agent, evalDatasetFilePathOrDir, numRuns = NUM_RUNS, initialSessionFile) {
12750
+ const testFiles = [];
12751
+ try {
12752
+ const stat2 = await fs2.stat(evalDatasetFilePathOrDir);
12753
+ if (stat2.isDirectory()) {
12754
+ const files = await this._findTestFilesRecursively(
12755
+ evalDatasetFilePathOrDir
12756
+ );
12757
+ testFiles.push(...files);
12758
+ } else {
12759
+ testFiles.push(evalDatasetFilePathOrDir);
12760
+ }
12761
+ } catch (error) {
12762
+ throw new Error(`Invalid path: ${evalDatasetFilePathOrDir}`);
12763
+ }
12764
+ const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
12765
+ for (const testFile of testFiles) {
12766
+ const criteria = await _AgentEvaluator.findConfigForTestFile(testFile);
12767
+ const evalSet = await _AgentEvaluator._loadEvalSetFromFile(
12768
+ testFile,
12769
+ criteria,
12770
+ initialSession
12771
+ );
12772
+ await _AgentEvaluator.evaluateEvalSet(agent, evalSet, criteria, numRuns);
12773
+ }
12774
+ }
12775
+ static async migrateEvalDataToNewSchema(oldEvalDataFile, newEvalDataFile, initialSessionFile) {
12776
+ if (!oldEvalDataFile || !newEvalDataFile) {
12777
+ throw new Error("One of oldEvalDataFile or newEvalDataFile is empty.");
12778
+ }
12779
+ const criteria = await _AgentEvaluator.findConfigForTestFile(oldEvalDataFile);
12780
+ const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
12781
+ const evalSet = await _AgentEvaluator._getEvalSetFromOldFormat(
12782
+ oldEvalDataFile,
12783
+ criteria,
12784
+ initialSession
12785
+ );
12786
+ await fs2.writeFile(newEvalDataFile, JSON.stringify(evalSet, null, 2));
12787
+ }
12788
+ static async _findTestFilesRecursively(dir) {
12789
+ const testFiles = [];
12790
+ async function walk(currentDir) {
12791
+ const entries = await fs2.readdir(currentDir, { withFileTypes: true });
12792
+ for (const entry of entries) {
12793
+ const fullPath = path2.join(currentDir, entry.name);
12794
+ if (entry.isDirectory()) {
12795
+ await walk(fullPath);
12796
+ } else if (entry.name.endsWith(".test.json")) {
12797
+ testFiles.push(fullPath);
12798
+ }
12799
+ }
12800
+ }
12801
+ await walk(dir);
12802
+ return testFiles;
12803
+ }
12804
+ static async _loadEvalSetFromFile(evalSetFile, criteria, initialSession) {
12805
+ try {
12806
+ const content = await fs2.readFile(evalSetFile, "utf-8");
12807
+ try {
12808
+ const evalSet = JSON.parse(content);
12809
+ if (evalSet.evalSetId && evalSet.evalCases) {
12810
+ if (Object.keys(initialSession).length > 0) {
12811
+ throw new Error(
12812
+ "Initial session should be specified as a part of EvalSet file. Explicit initial session is only needed, when specifying data in the older schema."
12813
+ );
12814
+ }
12815
+ return evalSet;
12816
+ }
12817
+ } catch (parseError) {
12818
+ throw new Error(`Failed to parse eval set data: ${parseError}`);
12819
+ }
12820
+ } catch (error) {
12821
+ throw new Error(`Failed to process eval set file: ${error}`);
12822
+ }
12823
+ console.warn(
12824
+ `Contents of ${evalSetFile} appear to be in the older format. To avoid this warning, please update your test files to contain data in the EvalSet schema. You can use 'migrateEvalDataToNewSchema' to migrate your old test files.`
12825
+ );
12826
+ return _AgentEvaluator._getEvalSetFromOldFormat(
12827
+ evalSetFile,
12828
+ criteria,
12829
+ initialSession
12830
+ );
12831
+ }
12832
+ static async _getEvalSetFromOldFormat(evalSetFile, criteria, initialSession) {
12833
+ const data = await _AgentEvaluator._loadDataset(evalSetFile);
12834
+ _AgentEvaluator._validateInput(data, criteria);
12835
+ return {
12836
+ evalSetId: `eval-set-${Date.now()}`,
12837
+ name: evalSetFile,
12838
+ evalCases: data[0].map(
12839
+ (item, index) => ({
12840
+ evalId: `eval-${index}`,
12841
+ conversation: [
12842
+ {
12843
+ invocationId: `invocation-${index}`,
12844
+ userContent: {
12845
+ role: "user",
12846
+ parts: [{ text: item[QUERY_COLUMN] || "" }]
12847
+ },
12848
+ finalResponse: item[REFERENCE_COLUMN] ? {
12849
+ role: "model",
12850
+ parts: [{ text: item[REFERENCE_COLUMN] }]
12851
+ } : void 0,
12852
+ intermediateData: item[EXPECTED_TOOL_USE_COLUMN] ? {
12853
+ toolUses: item[EXPECTED_TOOL_USE_COLUMN],
12854
+ intermediateResponses: []
12855
+ } : void 0,
12856
+ creationTimestamp: Date.now()
12857
+ }
12858
+ ],
12859
+ sessionInput: Object.keys(initialSession).length > 0 ? {
12860
+ appName: "test-app",
12861
+ userId: "test-user",
12862
+ state: initialSession
12863
+ } : void 0
12864
+ })
12865
+ ),
12866
+ creationTimestamp: Date.now()
12867
+ };
12868
+ }
12869
+ static async _getInitialSession(initialSessionFile) {
12870
+ if (!initialSessionFile) {
12871
+ return {};
12872
+ }
12873
+ try {
12874
+ const content = await fs2.readFile(initialSessionFile, "utf-8");
12875
+ return JSON.parse(content);
12876
+ } catch (error) {
12877
+ throw new Error(
12878
+ `Failed to load initial session from ${initialSessionFile}: ${error}`
12879
+ );
12880
+ }
12881
+ }
12882
+ static async _loadDataset(inputData) {
12883
+ const stat2 = await fs2.stat(inputData);
12884
+ if (stat2.isDirectory()) {
12885
+ const testFiles = await this._findTestFilesRecursively(inputData);
12886
+ const results = await Promise.all(testFiles.map((f) => loadJson(f)));
12887
+ return results.map((r) => Array.isArray(r) ? r : [r]);
12888
+ }
12889
+ if (stat2.isFile()) {
12890
+ const data = await loadJson(inputData);
12891
+ return [Array.isArray(data) ? data : [data]];
12892
+ }
12893
+ throw new Error(`Invalid input path: ${inputData}`);
12894
+ }
12895
+ static _validateInput(evalDataset, criteria) {
12896
+ if (!evalDataset || evalDataset.length === 0) {
12897
+ throw new Error("The evaluation dataset is None or empty.");
12898
+ }
12899
+ for (const key of Object.keys(criteria)) {
12900
+ if (!ALLOWED_CRITERIA.includes(key)) {
12901
+ throw new Error(
12902
+ `Invalid criteria key: ${key}. Expected one of ${ALLOWED_CRITERIA.join(
12903
+ ", "
12904
+ )}.`
12905
+ );
12906
+ }
12907
+ }
12908
+ const sample = evalDataset[0];
12909
+ if (!Array.isArray(sample) || sample.length === 0) {
12910
+ throw new Error("The evaluation dataset is empty.");
12911
+ }
12912
+ const firstQuery = sample[0];
12913
+ if (typeof firstQuery !== "object") {
12914
+ throw new Error(
12915
+ `Each evaluation dataset sample must be a list of objects, but got ${JSON.stringify(
12916
+ evalDataset
12917
+ )}`
12918
+ );
12919
+ }
12920
+ if (TOOL_TRAJECTORY_SCORE_KEY in criteria) {
12921
+ if (!(QUERY_COLUMN in firstQuery) || !(EXPECTED_TOOL_USE_COLUMN in firstQuery)) {
12922
+ throw new Error(
12923
+ `Samples for ${TOOL_TRAJECTORY_SCORE_KEY} must include '${QUERY_COLUMN}' and '${EXPECTED_TOOL_USE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
12924
+ );
12925
+ }
12926
+ }
12927
+ if (RESPONSE_EVALUATION_SCORE_KEY in criteria) {
12928
+ if (!(QUERY_COLUMN in firstQuery)) {
12929
+ throw new Error(
12930
+ `Samples for ${RESPONSE_EVALUATION_SCORE_KEY} must include '${QUERY_COLUMN}' key. The sample is ${JSON.stringify(sample)}.`
12931
+ );
12932
+ }
12933
+ }
12934
+ if (RESPONSE_MATCH_SCORE_KEY in criteria) {
12935
+ if (!(QUERY_COLUMN in firstQuery) || !(REFERENCE_COLUMN in firstQuery)) {
12936
+ throw new Error(
12937
+ `Samples for ${RESPONSE_MATCH_SCORE_KEY} must include '${QUERY_COLUMN}' and '${REFERENCE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
12938
+ );
12939
+ }
12940
+ }
12941
+ }
12942
+ static _printDetails(evalMetricResultWithInvocations, overallEvalStatus, overallScore, metricName = "", threshold = 0) {
12943
+ console.log(
12944
+ `Summary: \`${overallEvalStatus}\` for Metric: \`${metricName}\`. Expected threshold: \`${threshold}\`, actual value: \`${overallScore}\`.`
12945
+ );
12946
+ const data = evalMetricResultWithInvocations.map((per) => ({
12947
+ evalStatus: per.evalMetricResult.evalStatus,
12948
+ score: per.evalMetricResult.score,
12949
+ threshold,
12950
+ prompt: _AgentEvaluator._convertContentToText(
12951
+ per.expectedInvocation.userContent
12952
+ ),
12953
+ expectedResponse: _AgentEvaluator._convertContentToText(
12954
+ per.expectedInvocation.finalResponse
12955
+ ),
12956
+ actualResponse: _AgentEvaluator._convertContentToText(
12957
+ per.actualInvocation.finalResponse
12958
+ ),
12959
+ expectedToolCalls: _AgentEvaluator._convertToolCallsToText(
12960
+ per.expectedInvocation.intermediateData
12961
+ ),
12962
+ actualToolCalls: _AgentEvaluator._convertToolCallsToText(
12963
+ per.actualInvocation.intermediateData
12964
+ )
12965
+ }));
12966
+ console.table(data);
12967
+ console.log("\n\n");
12968
+ }
12969
+ static _convertContentToText(content) {
12970
+ if (content?.parts) {
12971
+ return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
12972
+ }
12973
+ return "";
12974
+ }
12975
+ static _convertToolCallsToText(intermediateData) {
12976
+ if (intermediateData?.toolUses) {
12977
+ return intermediateData.toolUses.map((t) => JSON.stringify(t)).join("\n");
12978
+ }
12979
+ return "";
12980
+ }
12981
+ static async _getEvalResultsByEvalId(agent, evalSet, evalMetrics, numRuns) {
12982
+ const evalService = new LocalEvalService(agent);
12983
+ const inferenceResults = [];
12984
+ for (let run = 0; run < numRuns; run++) {
12985
+ for await (const result of evalService.performInference({
12986
+ evalSetId: evalSet.evalSetId,
12987
+ evalCases: [evalSet]
12988
+ })) {
12989
+ inferenceResults.push(result);
12990
+ }
12991
+ }
12992
+ const evalResultsByEvalId = /* @__PURE__ */ new Map();
12993
+ for await (const evalResult of evalService.evaluate({
12994
+ inferenceResults,
12995
+ evaluateConfig: { evalMetrics }
12996
+ })) {
12997
+ for (const caseResult of evalResult.evalCaseResults) {
12998
+ const evalId = caseResult.evalId;
12999
+ if (!evalResultsByEvalId.has(evalId)) {
13000
+ evalResultsByEvalId.set(evalId, []);
13001
+ }
13002
+ evalResultsByEvalId.get(evalId).push(caseResult);
13003
+ }
13004
+ }
13005
+ return evalResultsByEvalId;
13006
+ }
13007
+ static _getEvalMetricResultsWithInvocation(evalResultsPerEvalId) {
13008
+ const evalMetricResults = {};
13009
+ for (const evalCaseResult of evalResultsPerEvalId) {
13010
+ for (const evalMetricsPerInvocation of evalCaseResult.evalMetricResultPerInvocation) {
13011
+ for (const evalMetricResult of evalMetricsPerInvocation.evalMetricResults) {
13012
+ const metricName = evalMetricResult.metricName;
13013
+ if (!(metricName in evalMetricResults)) {
13014
+ evalMetricResults[metricName] = [];
13015
+ }
13016
+ evalMetricResults[metricName].push({
13017
+ actualInvocation: evalMetricsPerInvocation.actualInvocation,
13018
+ expectedInvocation: evalMetricsPerInvocation.expectedInvocation,
13019
+ evalMetricResult
13020
+ });
13021
+ }
13022
+ }
13023
+ }
13024
+ return evalMetricResults;
13025
+ }
13026
+ static _processMetricsAndGetFailures(evalMetricResults, printDetailedResults, agentModule) {
13027
+ const failures = [];
13028
+ for (const [metricName, evalMetricResultsWithInvocations] of Object.entries(
13029
+ evalMetricResults
13030
+ )) {
13031
+ const threshold = evalMetricResultsWithInvocations[0]?.evalMetricResult.threshold || 0;
13032
+ const scores = evalMetricResultsWithInvocations.map((m) => m.evalMetricResult.score).filter((s) => s !== void 0);
13033
+ let overallScore;
13034
+ let overallEvalStatus;
13035
+ if (scores.length > 0) {
13036
+ overallScore = scores.reduce((a, b) => a + b, 0) / scores.length;
13037
+ overallEvalStatus = overallScore >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
13038
+ } else {
13039
+ overallScore = void 0;
13040
+ overallEvalStatus = 3 /* NOT_EVALUATED */;
13041
+ }
13042
+ if (overallEvalStatus !== 1 /* PASSED */) {
13043
+ if (printDetailedResults) {
13044
+ _AgentEvaluator._printDetails(
13045
+ evalMetricResultsWithInvocations,
13046
+ overallEvalStatus,
13047
+ overallScore,
13048
+ metricName,
13049
+ threshold
13050
+ );
13051
+ }
13052
+ failures.push(
13053
+ `${metricName} for ${agentModule} failed. Expected ${threshold}, but got ${overallScore}.`
13054
+ );
13055
+ }
13056
+ }
13057
+ return failures;
13058
+ }
13059
+ };
13060
+
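A minimal usage sketch (not part of the diff) for the AgentEvaluator class above, assuming `myAgent` is any agent instance accepted by the evaluation runner; evaluate() walks a directory for *.test.json files and throws if any metric falls below its threshold:

import { AgentEvaluator } from "@iqai/adk";

// myAgent is a placeholder for an agent built elsewhere in the application.
await AgentEvaluator.evaluate(
  myAgent,
  "./evals",   // a single test file or a directory searched recursively for *.test.json
  2            // numRuns; defaults to NUM_RUNS (2)
);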
13061
+ // src/evaluation/final-response-match-v1.ts
13062
+ var RougeEvaluator = class extends Evaluator {
13063
+ evalMetric;
13064
+ constructor(evalMetric) {
13065
+ super(evalMetric);
13066
+ this.evalMetric = evalMetric;
13067
+ }
13068
+ static getMetricInfo() {
13069
+ return {
13070
+ metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
13071
+ description: "This metric evaluates if the agent's final response matches a golden/expected final response using Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
13072
+ metricValueInfo: {
13073
+ interval: {
13074
+ minValue: 0,
13075
+ maxValue: 1,
13076
+ openAtMin: false,
13077
+ openAtMax: false
13078
+ }
13079
+ }
13080
+ };
13081
+ }
13082
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
13083
+ let totalScore = 0;
13084
+ let numInvocations = 0;
13085
+ const perInvocationResults = [];
13086
+ for (let i = 0; i < actualInvocations.length; i++) {
13087
+ const actual = actualInvocations[i];
13088
+ const expected = expectedInvocations[i];
13089
+ const reference = getTextFromContent2(expected.finalResponse);
13090
+ const response = getTextFromContent2(actual.finalResponse);
13091
+ const rouge1Scores = await calculateRouge1Scores(response, reference);
13092
+ const score = rouge1Scores.fmeasure;
13093
+ perInvocationResults.push({
13094
+ actualInvocation: actual,
13095
+ expectedInvocation: expected,
13096
+ score,
13097
+ evalStatus: getEvalStatus2(score, this.evalMetric.threshold)
13098
+ });
13099
+ totalScore += score;
13100
+ numInvocations++;
13101
+ }
13102
+ if (perInvocationResults.length > 0) {
13103
+ const overallScore = totalScore / numInvocations;
13104
+ return {
13105
+ overallScore,
13106
+ overallEvalStatus: getEvalStatus2(
13107
+ overallScore,
13108
+ this.evalMetric.threshold
13109
+ ),
13110
+ perInvocationResults
13111
+ };
13112
+ }
13113
+ return {
13114
+ overallEvalStatus: 3 /* NOT_EVALUATED */,
13115
+ perInvocationResults: []
13116
+ };
13117
+ }
13118
+ };
13119
+ function getTextFromContent2(content) {
13120
+ if (content?.parts) {
13121
+ return content.parts.map((part) => part.text).filter(Boolean).join("\n");
13122
+ }
13123
+ return "";
13124
+ }
13125
+ function getEvalStatus2(score, threshold) {
13126
+ return score >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
13127
+ }
13128
+ function calculateRouge1Scores(response, reference) {
13129
+ if (!response.trim() || !reference.trim()) {
13130
+ return { precision: 0, recall: 0, fmeasure: 0 };
13131
+ }
13132
+ const responseTokens = tokenizeText(response);
13133
+ const referenceTokens = tokenizeText(reference);
13134
+ const responseUnigrams = new Set(responseTokens);
13135
+ const referenceUnigrams = new Set(referenceTokens);
13136
+ const commonUnigrams = new Set(
13137
+ [...responseUnigrams].filter((token) => referenceUnigrams.has(token))
13138
+ );
13139
+ const precision = responseUnigrams.size > 0 ? commonUnigrams.size / responseUnigrams.size : 0;
13140
+ const recall = referenceUnigrams.size > 0 ? commonUnigrams.size / referenceUnigrams.size : 0;
13141
+ const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
13142
+ return { precision, recall, fmeasure };
13143
+ }
13144
+ function tokenizeText(text) {
13145
+ return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
13146
+ }
13147
+
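To make the scoring above concrete, a worked example of the unigram ROUGE-1 arithmetic implemented by calculateRouge1Scores (these helpers are module-local, so this is illustrative only; the strings are invented):

// response  -> unique unigrams {the, cat, sat, on, mat}  (5)
// reference -> unique unigrams {the, cat, is, on, mat}   (5)
// overlap   -> {the, cat, on, mat}                       (4)
const { precision, recall, fmeasure } = calculateRouge1Scores(
  "The cat sat on the mat",
  "The cat is on the mat"
);
// precision = 4/5 = 0.8, recall = 4/5 = 0.8, fmeasure = 0.8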
11811
13148
  // src/version.ts
11812
13149
  var VERSION = "0.1.0";
11813
13150
  export {
11814
13151
  AF_FUNCTION_CALL_ID_PREFIX,
11815
13152
  LlmAgent as Agent,
11816
13153
  AgentBuilder,
13154
+ AgentEvaluator,
11817
13155
  AgentTool,
11818
13156
  agents_exports as Agents,
11819
13157
  AiSdkLlm,
@@ -11847,11 +13185,16 @@ export {
11847
13185
  CodeExecutorContext,
11848
13186
  DatabaseSessionService,
11849
13187
  EnhancedAuthConfig,
13188
+ EvalResult,
13189
+ EvalStatus,
13190
+ evaluation_exports as Evaluation,
13191
+ Evaluator,
11850
13192
  Event,
11851
13193
  EventActions,
11852
13194
  events_exports as Events,
11853
13195
  ExitLoopTool,
11854
13196
  FileOperationsTool,
13197
+ FinalResponseMatchV2Evaluator,
11855
13198
  flows_exports as Flows,
11856
13199
  FunctionTool,
11857
13200
  GcsArtifactService,
@@ -11873,6 +13216,7 @@ export {
11873
13216
  LlmResponse,
11874
13217
  LoadArtifactsTool,
11875
13218
  LoadMemoryTool,
13219
+ LocalEvalService,
11876
13220
  LoopAgent,
11877
13221
  McpAbi,
11878
13222
  McpAtp,
@@ -11900,10 +13244,13 @@ export {
11900
13244
  OpenIdConnectScheme,
11901
13245
  ParallelAgent,
11902
13246
  PlanReActPlanner,
13247
+ PrebuiltMetrics,
11903
13248
  REQUEST_EUC_FUNCTION_CALL_NAME,
11904
13249
  ReadonlyContext,
13250
+ RougeEvaluator,
11905
13251
  RunConfig,
11906
13252
  Runner,
13253
+ SafetyEvaluatorV1,
11907
13254
  SequentialAgent,
11908
13255
  sessions_exports as Sessions,
11909
13256
  SingleFlow,
@@ -11912,6 +13259,7 @@ export {
11912
13259
  TelemetryService,
11913
13260
  ToolContext,
11914
13261
  tools_exports as Tools,
13262
+ TrajectoryEvaluator,
11915
13263
  TransferToAgentTool,
11916
13264
  UserInteractionTool,
11917
13265
  VERSION,