@iqai/adk 0.1.21 → 0.2.0

package/dist/index.mjs CHANGED
@@ -53,7 +53,7 @@ var init_logger = __esm({
  }
  info(message, ...args) {
  const time = (/* @__PURE__ */ new Date()).toLocaleTimeString();
- console.info(
+ console.debug(
  this.colorize(`[${time}] \u2139\uFE0F [${this.name}] ${message}`),
  ...args
  );
@@ -229,7 +229,7 @@ var init_base_tool = __esm({
  * @param context The context of the tool
  * @returns The result of running the tool
  */
- async runAsync(args, context) {
+ async runAsync(args, context4) {
  throw new Error(`${this.constructor.name} runAsync is not implemented`);
  }
  /**
@@ -253,6 +253,12 @@ var init_base_tool = __esm({
  if (!toolWithFunctionDeclarations.functionDeclarations) {
  toolWithFunctionDeclarations.functionDeclarations = [];
  }
+ const alreadyExists = toolWithFunctionDeclarations.functionDeclarations.some(
+ (fd) => fd?.name === functionDeclaration.name
+ );
+ if (alreadyExists) {
+ return;
+ }
  toolWithFunctionDeclarations.functionDeclarations.push(
  functionDeclaration
  );
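The new guard makes repeated tool registration idempotent: a second `processLlmRequest` pass can no longer push a duplicate entry into `functionDeclarations`. A minimal sketch of the behavior, with an illustrative declaration object (not from the package):

```js
// Hypothetical stand-in for the request's tool entry.
const toolWithFunctionDeclarations = { functionDeclarations: [] };

function addDeclaration(functionDeclaration) {
  const alreadyExists = toolWithFunctionDeclarations.functionDeclarations.some(
    (fd) => fd?.name === functionDeclaration.name
  );
  if (alreadyExists) return; // a repeat registration is now a no-op
  toolWithFunctionDeclarations.functionDeclarations.push(functionDeclaration);
}

addDeclaration({ name: "get_weather" });
addDeclaration({ name: "get_weather" }); // ignored
console.log(toolWithFunctionDeclarations.functionDeclarations.length); // 1
```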
@@ -281,7 +287,7 @@ var init_base_tool = __esm({
  * @param context Tool execution context
  * @returns Result of the tool execution or error information
  */
- async safeExecute(args, context) {
+ async safeExecute(args, context4) {
  if (!this.validateArguments(args)) {
  return {
  error: "Invalid arguments",
@@ -302,7 +308,7 @@ var init_base_tool = __esm({
  );
  await new Promise((resolve) => setTimeout(resolve, delay));
  }
- const result = await this.runAsync(args, context);
+ const result = await this.runAsync(args, context4);
  return { result };
  } catch (error) {
  lastError = error instanceof Error ? error : new Error(String(error));
@@ -500,7 +506,7 @@ var init_function_tool = __esm({
  /**
  * Executes the wrapped function with the provided arguments.
  */
- async runAsync(args, context) {
+ async runAsync(args, context4) {
  try {
  const missingArgs = this.getMissingMandatoryArgs(args);
  if (missingArgs.length > 0) {
@@ -513,13 +519,13 @@ You could retry calling this tool, but it is IMPORTANT for you to provide all th
  }
  const argsToCall = { ...args };
  if (this.functionAcceptsToolContext()) {
- argsToCall.toolContext = context;
+ argsToCall.toolContext = context4;
  }
  const funcParams = this.getFunctionParameters();
  const argValues = [];
  for (const paramName of funcParams) {
  if (paramName === "toolContext" && this.functionAcceptsToolContext()) {
- argValues.push(context);
+ argValues.push(context4);
  } else if (paramName in argsToCall) {
  const convertedValue = this.convertArgumentType(
  argsToCall[paramName],
@@ -827,70 +833,23 @@ ${instructions.join("\n\n")}`;

  // src/models/llm-response.ts
  var LlmResponse = class _LlmResponse {
- /**
- * Unique identifier for the response.
- */
  id;
- /**
- * The content generated by the model.
- */
+ text;
  content;
- /**
- * The grounding metadata of the response.
- */
  groundingMetadata;
- /**
- * Indicates whether the text content is part of an unfinished text stream.
- */
  partial;
- /**
- * Indicates whether the response from the model is complete.
- */
  turnComplete;
- /**
- * Error code if the response is an error.
- */
  errorCode;
- /**
- * Error message if the response is an error.
- */
  errorMessage;
- /**
- * Flag indicating that LLM was interrupted when generating the content.
- */
  interrupted;
- /**
- * The custom metadata of the LlmResponse.
- */
  customMetadata;
- /**
- * The usage metadata of the LlmResponse.
- */
  usageMetadata;
- /**
- * Index of the candidate response.
- */
  candidateIndex;
- /**
- * Reason why the model finished generating.
- */
  finishReason;
- /**
- * Error object if the response is an error.
- */
  error;
- /**
- * Creates a new LlmResponse.
- */
  constructor(data = {}) {
  Object.assign(this, data);
  }
- /**
- * Creates an LlmResponse from a GenerateContentResponse.
- *
- * @param generateContentResponse The GenerateContentResponse to create the LlmResponse from.
- * @returns The LlmResponse.
- */
  static create(generateContentResponse) {
  const usageMetadata = generateContentResponse.usageMetadata;
  if (generateContentResponse.candidates && generateContentResponse.candidates.length > 0) {
@@ -922,15 +881,6 @@ var LlmResponse = class _LlmResponse {
  usageMetadata
  });
  }
- /**
- * Creates an LlmResponse from an error.
- *
- * @param error The error object or message.
- * @param options Additional options for the error response.
- * @param options.errorCode A specific error code for the response.
- * @param options.model The model that was being used when the error occurred.
- * @returns The LlmResponse.
- */
  static fromError(error, options = {}) {
  const errorMessage = error instanceof Error ? error.message : String(error);
  const errorCode = options.errorCode || "UNKNOWN_ERROR";
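Only the JSDoc around `fromError` was dropped; the signature is unchanged. A usage sketch, assuming `LlmResponse` is exported from the package root (the error code value is illustrative):

```js
import { LlmResponse } from "@iqai/adk";

const response = LlmResponse.fromError(new Error("quota exceeded"), {
  errorCode: "RATE_LIMITED", // optional; defaults to "UNKNOWN_ERROR"
  model: "gemini-2.5-flash",
});
console.log(response.errorMessage); // "quota exceeded"
```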
@@ -954,6 +904,7 @@ init_logger();
  import {
  DiagConsoleLogger,
  DiagLogLevel,
+ context,
  diag,
  trace
  } from "@opentelemetry/api";
@@ -994,13 +945,24 @@ var TelemetryService = class {
  this.sdk = new NodeSDK({
  resource,
  traceExporter,
- instrumentations: [getNodeAutoInstrumentations()]
+ instrumentations: [
+ getNodeAutoInstrumentations({
+ // Follow Python ADK approach: let all HTTP instrumentation through.
+ // This provides transparency and aligns with standard OpenTelemetry behavior.
+ // High-level LLM tracing is provided through dedicated ADK spans.
+ "@opentelemetry/instrumentation-http": {
+ ignoreIncomingRequestHook: (req) => {
+ return true;
+ }
+ }
+ })
+ ]
  });
  try {
  this.sdk.start();
  this.isInitialized = true;
  this.tracer = trace.getTracer("iqai-adk", config.appVersion || "0.1.0");
- diag.info("OpenTelemetry SDK started successfully.");
+ diag.debug("OpenTelemetry SDK started successfully.");
  } catch (error) {
  diag.error("Error starting OpenTelemetry SDK:", error);
  throw error;
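Note the semantics of the new hook: in `@opentelemetry/instrumentation-http`, returning `true` from `ignoreIncomingRequestHook` tells the instrumentation to skip that incoming request, so the configuration above suppresses spans for all inbound HTTP while outbound calls (for example to LLM APIs) remain traced. A sketch of a more selective variant, with an illustrative health-check path:

```js
import { getNodeAutoInstrumentations } from "@opentelemetry/auto-instrumentations-node";

const instrumentations = [
  getNodeAutoInstrumentations({
    "@opentelemetry/instrumentation-http": {
      // true = do not create a span for this incoming request
      ignoreIncomingRequestHook: (req) => req.url === "/healthz",
    },
  }),
];
```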
@@ -1043,7 +1005,7 @@ var TelemetryService = class {
  });
  await Promise.race([this.sdk.shutdown(), timeoutPromise]);
  this.isInitialized = false;
- diag.info("Telemetry terminated successfully.");
+ diag.debug("Telemetry terminated successfully.");
  } catch (error) {
  if (error instanceof Error && error.message.includes("timeout")) {
  diag.warn("Telemetry shutdown timed out, some traces may be lost");
@@ -1071,7 +1033,7 @@ var TelemetryService = class {
  }
  }
  span.setAttributes({
- "gen_ai.system.name": "iqai-adk",
+ "gen_ai.system": "iqai-adk",
  "gen_ai.operation.name": "execute_tool",
  "gen_ai.tool.name": tool.name,
  "gen_ai.tool.description": tool.description,
@@ -1085,7 +1047,7 @@ var TelemetryService = class {
  ...process.env.NODE_ENV && {
  "deployment.environment.name": process.env.NODE_ENV
  },
- // Tool-specific data
+ // ADK-specific attributes (matching Python namespace pattern)
  "adk.tool_call_args": this._safeJsonStringify(args),
  "adk.event_id": functionResponseEvent.invocationId,
  "adk.tool_response": this._safeJsonStringify(toolResponse),
@@ -1101,9 +1063,8 @@ var TelemetryService = class {
  if (!span) return;
  const requestData = this._buildLlmRequestForTrace(llmRequest);
  span.setAttributes({
- // Standard OpenTelemetry attributes
- "gen_ai.system.name": "iqai-adk",
- "gen_ai.operation.name": "generate",
+ // Standard OpenTelemetry attributes (following Python pattern)
+ "gen_ai.system": "iqai-adk",
  "gen_ai.request.model": llmRequest.model,
  // Session and user tracking (maps to Langfuse sessionId, userId)
  "session.id": invocationContext.session.id,
@@ -1116,15 +1077,21 @@ var TelemetryService = class {
  "gen_ai.request.max_tokens": llmRequest.config.maxOutputTokens || 0,
  "gen_ai.request.temperature": llmRequest.config.temperature || 0,
  "gen_ai.request.top_p": llmRequest.config.topP || 0,
- // Legacy ADK attributes (keep for backward compatibility)
  "adk.system_name": "iqai-adk",
  "adk.request_model": llmRequest.model,
- "adk.invocation_id": invocationContext.session.id,
+ // ADK-specific attributes (matching Python namespace pattern)
+ "adk.invocation_id": invocationContext.invocationId,
  "adk.session_id": invocationContext.session.id,
  "adk.event_id": eventId,
  "adk.llm_request": this._safeJsonStringify(requestData),
  "adk.llm_response": this._safeJsonStringify(llmResponse)
  });
+ if (llmResponse.usageMetadata) {
+ span.setAttributes({
+ "gen_ai.usage.input_tokens": llmResponse.usageMetadata.promptTokenCount || 0,
+ "gen_ai.usage.output_tokens": llmResponse.usageMetadata.candidatesTokenCount || 0
+ });
+ }
  span.addEvent("gen_ai.content.prompt", {
  "gen_ai.prompt": this._safeJsonStringify(requestData.messages)
  });
@@ -1137,9 +1104,14 @@ var TelemetryService = class {
  */
  async *traceAsyncGenerator(spanName, generator) {
  const span = this.tracer.startSpan(spanName);
+ const spanContext = trace.setSpan(context.active(), span);
  try {
- for await (const item of generator) {
- yield item;
+ while (true) {
+ const result = await context.with(spanContext, () => generator.next());
+ if (result.done) {
+ break;
+ }
+ yield result.value;
  }
  } catch (error) {
  span.recordException(error);
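An async generator's body only runs when the consumer calls `next()`, so a plain `for await` would execute the wrapped generator under whatever context is active at the call site, not under the new span. Driving the iterator manually lets every `next()` run inside `context.with(spanContext, ...)`, so spans created within the generator become children of `spanName`. A self-contained sketch of the same pattern, assuming only `@opentelemetry/api`:

```js
import { context, trace } from "@opentelemetry/api";

async function* withSpan(tracer, name, generator) {
  const span = tracer.startSpan(name);
  const spanContext = trace.setSpan(context.active(), span);
  try {
    while (true) {
      // Each step of the inner generator sees `span` as the active span.
      const result = await context.with(spanContext, () => generator.next());
      if (result.done) break;
      yield result.value;
    }
  } finally {
    span.end();
  }
}
```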
@@ -1226,7 +1198,7 @@ var traceLlmCall = (invocationContext, eventId, llmRequest, llmResponse) => tele
  // src/models/base-llm.ts
  var BaseLlm = class {
  /**
- * The name of the LLM, e.g. gemini-1.5-flash or gemini-1.5-flash-001.
+ * The name of the LLM, e.g. gemini-2.5-flash or gemini-2.5-flash-001.
  */
  model;
  logger = new Logger({ name: "BaseLlm" });
@@ -1915,7 +1887,7 @@ var GoogleLlm = class extends BaseLlm {
  /**
  * Constructor for Gemini
  */
- constructor(model = "gemini-1.5-flash") {
+ constructor(model = "gemini-2.5-flash") {
  super(model);
  }
  /**
@@ -2647,30 +2619,16 @@ var OpenAiLlm = class extends BaseLlm {
  // src/models/llm-registry.ts
  init_logger();
  var LLMRegistry = class _LLMRegistry {
- /**
- * Map of model name regex to LLM class
- */
  static llmRegistry = /* @__PURE__ */ new Map();
+ static modelInstances = /* @__PURE__ */ new Map();
  static logger = new Logger({ name: "LLMRegistry" });
- /**
- * Creates a new LLM instance
- *
- * @param model The model name
- * @returns The LLM instance
- */
  static newLLM(model) {
  const llmClass = _LLMRegistry.resolve(model);
  if (!llmClass) {
- throw new Error(`No LLM found for model: ${model}`);
+ throw new Error(`No LLM class found for model: ${model}`);
  }
  return new llmClass(model);
  }
- /**
- * Resolves the LLM class from the model name
- *
- * @param model The model name
- * @returns The LLM class
- */
  static resolve(model) {
  for (const [regex, llmClass] of _LLMRegistry.llmRegistry.entries()) {
  if (regex.test(model)) {
@@ -2679,34 +2637,54 @@ var LLMRegistry = class _LLMRegistry {
  }
  return null;
  }
- /**
- * Registers a new LLM class
- *
- * @param modelNameRegex The regex to match model names
- * @param llmClass The LLM class
- */
  static register(modelNameRegex, llmClass) {
  _LLMRegistry.llmRegistry.set(new RegExp(modelNameRegex), llmClass);
  }
- /**
- * Registers all model patterns from an LLM class
- *
- * @param llmClass The LLM class
- */
  static registerLLM(llmClass) {
  const modelPatterns = llmClass.supportedModels();
  for (const pattern of modelPatterns) {
  _LLMRegistry.register(pattern, llmClass);
  }
  }
- /**
- * Logs all registered models for debugging
- */
+ static registerModel(name, model) {
+ _LLMRegistry.modelInstances.set(name, model);
+ }
+ static getModel(name) {
+ const model = _LLMRegistry.modelInstances.get(name);
+ if (!model) {
+ throw new Error(`Model '${name}' not found in registry`);
+ }
+ return model;
+ }
+ static hasModel(name) {
+ return _LLMRegistry.modelInstances.has(name);
+ }
+ static unregisterModel(name) {
+ _LLMRegistry.modelInstances.delete(name);
+ }
+ static getModelOrCreate(name) {
+ if (_LLMRegistry.hasModel(name)) {
+ return _LLMRegistry.getModel(name);
+ }
+ return _LLMRegistry.newLLM(name);
+ }
+ static clear() {
+ _LLMRegistry.llmRegistry.clear();
+ _LLMRegistry.modelInstances.clear();
+ }
+ static clearModels() {
+ _LLMRegistry.modelInstances.clear();
+ }
+ static clearClasses() {
+ _LLMRegistry.llmRegistry.clear();
+ }
  static logRegisteredModels() {
- _LLMRegistry.logger.debug(
- "Registered LLM models:",
- [..._LLMRegistry.llmRegistry.entries()].map(([regex]) => regex.toString())
+ const classPatterns = [..._LLMRegistry.llmRegistry.entries()].map(
+ ([regex]) => regex.toString()
  );
+ const instanceNames = [..._LLMRegistry.modelInstances.keys()];
+ _LLMRegistry.logger.debug("Registered LLM class patterns:", classPatterns);
+ _LLMRegistry.logger.debug("Registered LLM instances:", instanceNames);
  }
  };
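The registry now holds named instances alongside the class patterns, and `getModelOrCreate` checks the instance map before falling back to pattern-based construction. A usage sketch, assuming `LLMRegistry` and `GoogleLlm` are exported from the package root (the custom name is illustrative):

```js
import { LLMRegistry, GoogleLlm } from "@iqai/adk";

// Register a pre-configured instance under a custom name...
LLMRegistry.registerModel("tuned-gemini", new GoogleLlm("gemini-2.5-flash"));

// ...and resolve it later; unknown names fall back to the class patterns.
const llm = LLMRegistry.getModelOrCreate("tuned-gemini");
console.log(LLMRegistry.hasModel("tuned-gemini")); // true
LLMRegistry.unregisterModel("tuned-gemini");
```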

@@ -3954,10 +3932,10 @@ var CreatedTool = class extends BaseTool {
  /**
  * Executes the tool function with validation
  */
- async runAsync(args, context) {
+ async runAsync(args, context4) {
  try {
  const validatedArgs = this.schema.parse(args);
- const result = await Promise.resolve(this.func(validatedArgs, context));
+ const result = await Promise.resolve(this.func(validatedArgs, context4));
  return result ?? {};
  } catch (error) {
  if (error instanceof z.ZodError) {
@@ -4215,7 +4193,7 @@ var AgentTool = class extends BaseTool {
  /**
  * Execute the tool by running the agent with the provided input
  */
- async runAsync(params, context) {
+ async runAsync(params, context4) {
  try {
  const input = params.input || Object.values(params)[0];
  if (!isLlmAgent(this.agent)) {
@@ -4223,7 +4201,7 @@ var AgentTool = class extends BaseTool {
  `Agent ${this.name} does not support running as a tool`
  );
  }
- const parentInvocation = context._invocationContext;
+ const parentInvocation = context4._invocationContext;
  const childInvocationContext = new InvocationContext({
  invocationId: uuidv42(),
  agent: this.agent,
@@ -4260,8 +4238,8 @@ var AgentTool = class extends BaseTool {
  } catch {
  toolResult = mergedText;
  }
- if (this.outputKey && context?.state) {
- context.state[this.outputKey] = toolResult;
+ if (this.outputKey && context4?.state) {
+ context4.state[this.outputKey] = toolResult;
  }
  return toolResult;
  } catch (error) {
@@ -4809,9 +4787,9 @@ var UserInteractionTool = class extends BaseTool {
  /**
  * Execute the user interaction
  */
- async runAsync(args, context) {
+ async runAsync(args, context4) {
  try {
- const actions = context.actions;
+ const actions = context4.actions;
  if (!actions || !actions.promptUser) {
  return {
  success: false,
@@ -4859,9 +4837,9 @@ var ExitLoopTool = class extends BaseTool {
  /**
  * Execute the exit loop action
  */
- async runAsync(_args, context) {
+ async runAsync(_args, context4) {
  this.logger.debug("Executing exit loop tool");
- context.actions.escalate = true;
+ context4.actions.escalate = true;
  }
  };

@@ -4912,14 +4890,14 @@ var GetUserChoiceTool = class extends BaseTool {
  * This is a long running operation that will return null initially
  * and the actual choice will be provided asynchronously
  */
- async runAsync(args, context) {
+ async runAsync(args, context4) {
  this.logger.debug(
  `Executing get_user_choice with options: ${args.options.join(", ")}`
  );
  if (args.question) {
  this.logger.debug(`Question: ${args.question}`);
  }
- context.actions.skipSummarization = true;
+ context4.actions.skipSummarization = true;
  return null;
  }
  };
@@ -4961,9 +4939,9 @@ var TransferToAgentTool = class extends BaseTool {
  /**
  * Execute the transfer to agent action
  */
- async runAsync(args, context) {
+ async runAsync(args, context4) {
  this.logger.debug(`Executing transfer to agent: ${args.agent_name}`);
- context.actions.transferToAgent = args.agent_name;
+ context4.actions.transferToAgent = args.agent_name;
  }
  };

@@ -5004,10 +4982,10 @@ var LoadMemoryTool = class extends BaseTool {
  /**
  * Execute the memory loading action
  */
- async runAsync(args, context) {
+ async runAsync(args, context4) {
  this.logger.debug(`Executing load_memory with query: ${args.query}`);
  try {
- const searchResult = await context.searchMemory(args.query);
+ const searchResult = await context4.searchMemory(args.query);
  return {
  memories: searchResult.memories || [],
  count: searchResult.memories?.length || 0
@@ -5057,7 +5035,7 @@ var LoadArtifactsTool = class extends BaseTool {
  /**
  * Execute the load artifacts operation
  */
- async runAsync(args, context) {
+ async runAsync(args, context4) {
  const artifactNames = args.artifact_names || [];
  return { artifact_names: artifactNames };
  }
@@ -6088,12 +6066,12 @@ var McpToolset = class {
  * Checks if a tool should be included based on the tool filter.
  * Similar to Python's _is_selected method.
  */
- isSelected(tool, context) {
+ isSelected(tool, context4) {
  if (!this.toolFilter) {
  return true;
  }
  if (typeof this.toolFilter === "function") {
- return this.toolFilter(tool, context);
+ return this.toolFilter(tool, context4);
  }
  if (Array.isArray(this.toolFilter)) {
  return this.toolFilter.includes(tool.name);
@@ -6146,7 +6124,7 @@ var McpToolset = class {
  * Retrieves tools from the MCP server and converts them to BaseTool instances.
  * Similar to Python's get_tools method.
  */
- async getTools(context) {
+ async getTools(context4) {
  try {
  if (this.isClosing) {
  throw new McpError(
@@ -6168,7 +6146,7 @@ var McpToolset = class {
  }
  const tools = [];
  for (const mcpTool of toolsResponse.tools) {
- if (this.isSelected(mcpTool, context)) {
+ if (this.isSelected(mcpTool, context4)) {
  try {
  const tool = await createTool2(mcpTool, client);
  tools.push(tool);
@@ -6205,9 +6183,9 @@ var McpToolset = class {
  /**
  * Refreshes the tool cache by clearing it and fetching tools again
  */
- async refreshTools(context) {
+ async refreshTools(context4) {
  this.tools = [];
- return this.getTools(context);
+ return this.getTools(context4);
  }
  /**
  * Closes the connection to the MCP server.
@@ -6251,6 +6229,7 @@ async function getMcpTools(config, toolFilter) {
  }

  // src/flows/llm-flows/functions.ts
+ import { context as context2, trace as trace2 } from "@opentelemetry/api";
  var AF_FUNCTION_CALL_ID_PREFIX = "adk-";
  var REQUEST_EUC_FUNCTION_CALL_NAME = "adk_request_credential";
  function generateClientFunctionCallId() {
@@ -6340,23 +6319,40 @@ async function handleFunctionCallsAsync(invocationContext, functionCallEvent, to
  toolsDict
  );
  const functionArgs = functionCall.args || {};
- const functionResponse = await callToolAsync(
- tool,
- functionArgs,
- toolContext
- );
- if (tool.isLongRunning) {
+ const tracer2 = telemetryService.getTracer();
+ const span = tracer2.startSpan(`execute_tool ${tool.name}`);
+ const spanContext = trace2.setSpan(context2.active(), span);
+ try {
+ const functionResponse = await context2.with(spanContext, async () => {
+ const result = await callToolAsync(tool, functionArgs, toolContext);
+ if (tool.isLongRunning && !result) {
+ return null;
+ }
+ const functionResponseEvent = buildResponseEvent(
+ tool,
+ result,
+ toolContext,
+ invocationContext
+ );
+ telemetryService.traceToolCall(
+ tool,
+ functionArgs,
+ functionResponseEvent
+ );
+ return { result, event: functionResponseEvent };
+ });
  if (!functionResponse) {
  continue;
  }
+ functionResponseEvents.push(functionResponse.event);
+ span.setStatus({ code: 1 });
+ } catch (error) {
+ span.recordException(error);
+ span.setStatus({ code: 2, message: error.message });
+ throw error;
+ } finally {
+ span.end();
  }
- const functionResponseEvent = buildResponseEvent(
- tool,
- functionResponse,
- toolContext,
- invocationContext
- );
- functionResponseEvents.push(functionResponseEvent);
  }
  if (!functionResponseEvents.length) {
  return null;
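Each function call now runs inside its own `execute_tool <name>` span, and because `callToolAsync` executes under `context2.with(spanContext, ...)`, any HTTP or LLM spans the tool produces nest beneath it. The literals `1` and `2` correspond to `SpanStatusCode.OK` and `SpanStatusCode.ERROR` from `@opentelemetry/api`. A condensed sketch of the wrapping, with a stubbed `tool.run` in place of the real call chain:

```js
import { context, trace, SpanStatusCode } from "@opentelemetry/api";

async function runToolWithSpan(tracer, tool, args) {
  const span = tracer.startSpan(`execute_tool ${tool.name}`);
  const spanContext = trace.setSpan(context.active(), span);
  try {
    // Work done here parents its spans under `execute_tool <name>`.
    const result = await context.with(spanContext, () => tool.run(args));
    span.setStatus({ code: SpanStatusCode.OK }); // code 1 in the bundle
    return result;
  } catch (error) {
    span.recordException(error);
    span.setStatus({ code: SpanStatusCode.ERROR, message: error.message }); // code 2
    throw error;
  } finally {
    span.end();
  }
}
```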
@@ -6456,7 +6452,7 @@ var BaseLlmFlow = class {
  responseProcessors = [];
  logger = new Logger({ name: "BaseLlmFlow" });
  async *runAsync(invocationContext) {
- this.logger.info(`Agent '${invocationContext.agent.name}' started.`);
+ this.logger.debug(`Agent '${invocationContext.agent.name}' started.`);
  let stepCount = 0;
  while (true) {
  stepCount++;
@@ -6466,7 +6462,7 @@ var BaseLlmFlow = class {
  yield event;
  }
  if (!lastEvent || lastEvent.isFinalResponse()) {
- this.logger.info(
+ this.logger.debug(
  `Agent '${invocationContext.agent.name}' finished after ${stepCount} steps.`
  );
  break;
@@ -6496,7 +6492,7 @@ var BaseLlmFlow = class {
  yield event;
  }
  if (invocationContext.endInvocation) {
- this.logger.info("Invocation ended during preprocessing.");
+ this.logger.debug("Invocation ended during preprocessing.");
  return;
  }
  const modelResponseEvent = new Event({
@@ -6536,9 +6532,23 @@ var BaseLlmFlow = class {
  yield event;
  }
  }
- const tools = await agent.canonicalTools(
+ let tools = await agent.canonicalTools(
  new ReadonlyContext(invocationContext)
  );
+ if (tools.length > 1) {
+ const seen = /* @__PURE__ */ new Set();
+ const filtered = [];
+ for (const t of tools) {
+ const name = t?.name;
+ if (!name) continue;
+ if (seen.has(name)) {
+ continue;
+ }
+ seen.add(name);
+ filtered.push(t);
+ }
+ tools = filtered;
+ }
  for (const tool of tools) {
  const toolContext = new ToolContext(invocationContext);
  await tool.processLlmRequest(toolContext, llmRequest);
@@ -6611,7 +6621,7 @@ var BaseLlmFlow = class {
  yield functionResponseEvent;
  const transferToAgent = functionResponseEvent.actions?.transferToAgent;
  if (transferToAgent) {
- this.logger.info(`\u{1F504} Live transfer to agent '${transferToAgent}'`);
+ this.logger.debug(`\u{1F504} Live transfer to agent '${transferToAgent}'`);
  const agentToRun = this._getAgentToRun(
  invocationContext,
  transferToAgent
@@ -6650,7 +6660,7 @@ var BaseLlmFlow = class {
  yield functionResponseEvent;
  const transferToAgent = functionResponseEvent.actions?.transferToAgent;
  if (transferToAgent) {
- this.logger.info(`\u{1F504} Transferring to agent '${transferToAgent}'`);
+ this.logger.debug(`\u{1F504} Transferring to agent '${transferToAgent}'`);
  const agentToRun = this._getAgentToRun(
  invocationContext,
  transferToAgent
@@ -6694,7 +6704,42 @@ var BaseLlmFlow = class {
  }
  invocationContext.incrementLlmCallCount();
  const isStreaming = invocationContext.runConfig.streamingMode === "sse" /* SSE */;
- const tools = llmRequest.config?.tools || [];
+ let tools = llmRequest.config?.tools || [];
+ if (tools.length) {
+ const deduped = [];
+ const seenFn = /* @__PURE__ */ new Set();
+ for (const t of tools) {
+ const tool = t;
+ if (tool && Array.isArray(tool.functionDeclarations)) {
+ const newFds = tool.functionDeclarations.filter(
+ (fd) => {
+ if (fd?.name) {
+ if (seenFn.has(fd.name)) {
+ return false;
+ }
+ seenFn.add(fd.name);
+ }
+ return true;
+ }
+ );
+ if (newFds.length) {
+ deduped.push({ ...tool, functionDeclarations: newFds });
+ }
+ } else if (tool?.name) {
+ if (seenFn.has(tool.name)) continue;
+ seenFn.add(tool.name);
+ deduped.push(tool);
+ } else {
+ deduped.push(tool);
+ }
+ }
+ if (deduped.length !== tools.length) {
+ this.logger.debug(
+ `\u{1F501} Deduplicated tool/function declarations: ${tools.length} -> ${deduped.length}`
+ );
+ }
+ llmRequest.config.tools = tools = deduped;
+ }
  const toolNames = tools.map((tool) => {
  if (tool.functionDeclarations && Array.isArray(tool.functionDeclarations)) {
  return tool.functionDeclarations.map((fn) => fn.name).join(", ");
@@ -7074,8 +7119,6 @@ var BasicLlmRequestProcessor = class extends BaseLlmRequestProcessor {
  llmRequest.liveConnectConfig.realtimeInputConfig = runConfig.realtimeInputConfig;
  llmRequest.liveConnectConfig.enableAffectiveDialog = runConfig.enableAffectiveDialog;
  llmRequest.liveConnectConfig.proactivity = runConfig.proactivity;
- const tools = await agent.canonicalTools();
- llmRequest.appendTools(tools);
  for await (const _ of []) {
  yield _;
  }
@@ -9069,19 +9112,19 @@ var LlmAgent = class _LlmAgent extends BaseAgent {
  * Core logic to run this agent via text-based conversation
  * This matches the Python implementation's _run_async_impl
  */
- async *runAsyncImpl(context) {
+ async *runAsyncImpl(context4) {
  this.logger.debug(`Starting LlmAgent execution for "${this.name}"`);
  try {
- for await (const event of this.llmFlow.runAsync(context)) {
+ for await (const event of this.llmFlow.runAsync(context4)) {
  this.maybeSaveOutputToState(event);
  yield event;
  }
  } catch (error) {
  this.logger.error("Error in LlmAgent execution:", error);
  const errorEvent = new Event({
- invocationId: context.invocationId,
+ invocationId: context4.invocationId,
  author: this.name,
- branch: context.branch,
+ branch: context4.branch,
  content: {
  parts: [
  {
@@ -9349,7 +9392,7 @@ var LangGraphAgent = class extends BaseAgent {
  /**
  * Gets the next nodes to execute based on the current node and its result
  */
- async getNextNodes(currentNode, lastEvent, context) {
+ async getNextNodes(currentNode, lastEvent, context4) {
  if (!currentNode.targets || currentNode.targets.length === 0) {
  return [];
  }
@@ -9361,7 +9404,7 @@ var LangGraphAgent = class extends BaseAgent {
  continue;
  }
  if (targetNode.condition) {
- const shouldExecute = await targetNode.condition(lastEvent, context);
+ const shouldExecute = await targetNode.condition(lastEvent, context4);
  if (!shouldExecute) {
  this.logger.debug(`Skipping node "${targetName}" due to condition`);
  continue;
@@ -9374,7 +9417,7 @@ var LangGraphAgent = class extends BaseAgent {
  /**
  * Core logic to run this agent via text-based conversation.
  */
- async *runAsyncImpl(context) {
+ async *runAsyncImpl(context4) {
  this.logger.debug(
  `Starting graph execution from root node "${this.rootNode}"`
  );
@@ -9396,7 +9439,7 @@ var LangGraphAgent = class extends BaseAgent {
  return;
  }
  let stepCount = 0;
- const nodesToExecute = [{ node: rootNode, context }];
+ const nodesToExecute = [{ node: rootNode, context: context4 }];
  const executedNodes = [];
  let lastEvent = null;
  while (nodesToExecute.length > 0 && stepCount < this.maxSteps) {
@@ -9404,7 +9447,7 @@
  const { node } = nodesToExecute.shift();
  this.logger.debug(`Step ${stepCount}: Executing node "${node.name}"`);
  executedNodes.push(node.name);
- const childContext = context.createChildContext(node.agent);
+ const childContext = context4.createChildContext(node.agent);
  try {
  const nodeEvents = [];
  for await (const event of node.agent.runAsync(childContext)) {
@@ -9417,7 +9460,7 @@
  events: nodeEvents
  });
  if (lastEvent) {
- const nextNodes = await this.getNextNodes(node, lastEvent, context);
+ const nextNodes = await this.getNextNodes(node, lastEvent, context4);
  for (const nextNode of nextNodes) {
  nodesToExecute.push({
  node: nextNode,
@@ -9460,8 +9503,8 @@ var LangGraphAgent = class extends BaseAgent {
  * Core logic to run this agent via video/audio-based conversation.
  * For LangGraph, this follows the same execution pattern as text-based.
  */
- async *runLiveImpl(context) {
- yield* this.runAsyncImpl(context);
+ async *runLiveImpl(context4) {
+ yield* this.runAsyncImpl(context4);
  }
  /**
  * Gets the execution results from the last run
@@ -9511,10 +9554,11 @@ var LangGraphAgent = class extends BaseAgent {
  };

  // src/agents/agent-builder.ts
+ init_logger();
  import { generateId } from "ai";

  // src/runners.ts
- import { SpanStatusCode } from "@opentelemetry/api";
+ import { SpanStatusCode, context as context3, trace as trace3 } from "@opentelemetry/api";

  // src/agents/run-config.ts
  var StreamingMode = /* @__PURE__ */ ((StreamingMode2) => {
@@ -9624,19 +9668,19 @@ var InMemoryArtifactService = class {
  }
  async saveArtifact(args) {
  const { appName, userId, sessionId, filename, artifact } = args;
- const path2 = this.getArtifactPath(appName, userId, sessionId, filename);
- if (!this.artifacts.has(path2)) {
- this.artifacts.set(path2, []);
+ const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+ if (!this.artifacts.has(path3)) {
+ this.artifacts.set(path3, []);
  }
- const versions = this.artifacts.get(path2);
+ const versions = this.artifacts.get(path3);
  const version = versions.length;
  versions.push(artifact);
  return version;
  }
  async loadArtifact(args) {
  const { appName, userId, sessionId, filename, version } = args;
- const path2 = this.getArtifactPath(appName, userId, sessionId, filename);
- const versions = this.artifacts.get(path2);
+ const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+ const versions = this.artifacts.get(path3);
  if (!versions || versions.length === 0) {
  return null;
  }
@@ -9657,12 +9701,12 @@ var InMemoryArtifactService = class {
  const sessionPrefix = `${appName}/${userId}/${sessionId}/`;
  const userNamespacePrefix = `${appName}/${userId}/user/`;
  const filenames = [];
- for (const path2 of this.artifacts.keys()) {
- if (path2.startsWith(sessionPrefix)) {
- const filename = path2.substring(sessionPrefix.length);
+ for (const path3 of this.artifacts.keys()) {
+ if (path3.startsWith(sessionPrefix)) {
+ const filename = path3.substring(sessionPrefix.length);
  filenames.push(filename);
- } else if (path2.startsWith(userNamespacePrefix)) {
- const filename = path2.substring(userNamespacePrefix.length);
+ } else if (path3.startsWith(userNamespacePrefix)) {
+ const filename = path3.substring(userNamespacePrefix.length);
  filenames.push(filename);
  }
  }
@@ -9670,16 +9714,16 @@ var InMemoryArtifactService = class {
  }
  async deleteArtifact(args) {
  const { appName, userId, sessionId, filename } = args;
- const path2 = this.getArtifactPath(appName, userId, sessionId, filename);
- if (!this.artifacts.has(path2)) {
+ const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+ if (!this.artifacts.has(path3)) {
  return;
  }
- this.artifacts.delete(path2);
+ this.artifacts.delete(path3);
  }
  async listVersions(args) {
  const { appName, userId, sessionId, filename } = args;
- const path2 = this.getArtifactPath(appName, userId, sessionId, filename);
- const versions = this.artifacts.get(path2);
+ const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+ const versions = this.artifacts.get(path3);
  if (!versions || versions.length === 0) {
  return [];
  }
@@ -10149,7 +10193,7 @@ var Runner = class {
  }
  };
  invokeRunAsync();
- return function* () {
+ return (function* () {
  while (true) {
  while (queueIndex >= eventQueue.length && !asyncCompleted) {
  }
@@ -10162,7 +10206,7 @@ var Runner = class {
  }
  }
  yield event;
  }
- }();
+ })();
  }
  /**
  * Main entry method to run the agent in this runner.
@@ -10174,11 +10218,11 @@
  runConfig = new RunConfig()
  }) {
  const span = tracer.startSpan("invocation");
+ const spanContext = trace3.setSpan(context3.active(), span);
  try {
- const session = await this.sessionService.getSession(
- this.appName,
- userId,
- sessionId
+ const session = await context3.with(
+ spanContext,
+ () => this.sessionService.getSession(this.appName, userId, sessionId)
  );
  if (!session) {
  throw new Error(`Session not found: ${sessionId}`);
@@ -10188,22 +10232,34 @@ var Runner = class {
  runConfig
  });
  if (newMessage) {
- await this._appendNewMessageToSession(
- session,
- newMessage,
- invocationContext,
- runConfig.saveInputBlobsAsArtifacts || false
+ await context3.with(
+ spanContext,
+ () => this._appendNewMessageToSession(
+ session,
+ newMessage,
+ invocationContext,
+ runConfig.saveInputBlobsAsArtifacts || false
+ )
  );
  }
  invocationContext.agent = this._findAgentToRun(session, this.agent);
- for await (const event of invocationContext.agent.runAsync(
- invocationContext
- )) {
+ const agentGenerator = invocationContext.agent.runAsync(invocationContext);
+ while (true) {
+ const result = await context3.with(
+ spanContext,
+ () => agentGenerator.next()
+ );
+ if (result.done) {
+ break;
+ }
+ const event = result.value;
  if (!event.partial) {
- await this.sessionService.appendEvent(session, event);
- if (this.memoryService) {
- await this.memoryService.addSessionToMemory(session);
- }
+ await context3.with(spanContext, async () => {
+ await this.sessionService.appendEvent(session, event);
+ if (this.memoryService) {
+ await this.memoryService.addSessionToMemory(session);
+ }
+ });
  }
  yield event;
  }
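From the caller's side `runAsync` behaves as before; the rewrite only changes which OpenTelemetry context is active while the agent generator runs and while non-partial events are persisted. Consumption still looks like this (the `runner` construction and the message shape are illustrative, not taken from this diff):

```js
for await (const event of runner.runAsync({
  userId: "user-1",
  sessionId: "session-1",
  newMessage: { parts: [{ text: "hello" }] },
})) {
  if (!event.partial) console.log(event);
}
```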
@@ -10350,6 +10406,12 @@ var AgentBuilder = class _AgentBuilder {
  artifactService;
  agentType = "llm";
  existingSession;
+ existingAgent;
+ // If provided, reuse directly
+ definitionLocked = false;
+ // Lock further definition mutation after withAgent
+ warnedMethods = /* @__PURE__ */ new Set();
+ logger = new Logger({ name: "AgentBuilder" });
  /**
  * Private constructor - use static create() method
  */
@@ -10378,6 +10440,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withModel(model) {
+ this.warnIfLocked("withModel");
  this.config.model = model;
  return this;
  }
@@ -10387,6 +10450,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withDescription(description) {
+ this.warnIfLocked("withDescription");
  this.config.description = description;
  return this;
  }
@@ -10396,14 +10460,17 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withInstruction(instruction) {
+ this.warnIfLocked("withInstruction");
  this.config.instruction = instruction;
  return this;
  }
  withInputSchema(schema) {
+ this.warnIfLocked("withInputSchema");
  this.config.inputSchema = schema;
  return this;
  }
  withOutputSchema(schema) {
+ this.warnIfLocked("withOutputSchema");
  this.config.outputSchema = schema;
  return this;
  }
@@ -10413,6 +10480,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withTools(...tools) {
+ this.warnIfLocked("withTools");
  this.config.tools = [...this.config.tools || [], ...tools];
  return this;
  }
@@ -10422,6 +10490,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withPlanner(planner) {
+ this.warnIfLocked("withPlanner");
  this.config.planner = planner;
  return this;
  }
@@ -10431,6 +10500,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withCodeExecutor(codeExecutor) {
+ this.warnIfLocked("withCodeExecutor");
  this.config.codeExecutor = codeExecutor;
  return this;
  }
@@ -10440,6 +10510,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withOutputKey(outputKey) {
+ this.warnIfLocked("withOutputKey");
  this.config.outputKey = outputKey;
  return this;
  }
@@ -10449,6 +10520,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withSubAgents(subAgents) {
+ this.warnIfLocked("withSubAgents");
  this.config.subAgents = subAgents;
  return this;
  }
@@ -10458,6 +10530,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withBeforeAgentCallback(callback) {
+ this.warnIfLocked("withBeforeAgentCallback");
  this.config.beforeAgentCallback = callback;
  return this;
  }
@@ -10467,15 +10540,29 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  withAfterAgentCallback(callback) {
+ this.warnIfLocked("withAfterAgentCallback");
  this.config.afterAgentCallback = callback;
  return this;
  }
+ /**
+ * Provide an already constructed agent instance. Further definition-mutating calls
+ * (model/tools/instruction/etc.) will be ignored with a dev warning.
+ */
+ withAgent(agent) {
+ this.existingAgent = agent;
+ this.definitionLocked = true;
+ if (this.config.name === "default_agent" && agent.name) {
+ this.config.name = agent.name;
+ }
+ return this;
+ }
  /**
  * Configure as a sequential agent
  * @param subAgents Sub-agents to execute in sequence
  * @returns This builder instance for chaining
  */
  asSequential(subAgents) {
+ this.warnIfLocked("asSequential");
  this.agentType = "sequential";
  this.config.subAgents = subAgents;
  return this;
@@ -10486,6 +10573,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  asParallel(subAgents) {
+ this.warnIfLocked("asParallel");
  this.agentType = "parallel";
  this.config.subAgents = subAgents;
  return this;
@@ -10497,6 +10585,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  asLoop(subAgents, maxIterations = 3) {
+ this.warnIfLocked("asLoop");
  this.agentType = "loop";
  this.config.subAgents = subAgents;
  this.config.maxIterations = maxIterations;
@@ -10509,6 +10598,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns This builder instance for chaining
  */
  asLangGraph(nodes, rootNode) {
+ this.warnIfLocked("asLangGraph");
  this.agentType = "langgraph";
  this.config.nodes = nodes;
  this.config.rootNode = rootNode;
@@ -10635,6 +10725,7 @@ var AgentBuilder = class _AgentBuilder {
  * @returns Created agent instance
  */
  createAgent() {
+ if (this.existingAgent) return this.existingAgent;
  switch (this.agentType) {
  case "llm": {
  if (!this.config.model) {
@@ -10765,6 +10856,22 @@ var AgentBuilder = class _AgentBuilder {
  }
  };
  }
+ /**
+ * Warn (once per method) if the definition has been locked by withAgent().
+ */
+ warnIfLocked(method) {
+ if (!this.definitionLocked) return;
+ if (this.warnedMethods.has(method)) return;
+ this.warnedMethods.add(method);
+ if (process.env.NODE_ENV !== "production") {
+ const msg = `AgentBuilder: attempted to call ${method} after withAgent(); ignoring. (Wrap the agent first OR configure before withAgent).`;
+ if (this.logger && typeof this.logger.warn === "function") {
+ this.logger.warn(msg);
+ } else {
+ console.warn(msg);
+ }
+ }
+ }
  };
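`withAgent` short-circuits the builder: `createAgent` returns the supplied instance, and any later definition call is dropped with a one-time warning outside production. A usage sketch, assuming `AgentBuilder` and `LlmAgent` are exported from the package root and that `create` accepts a name (both assumptions; the agent configuration is illustrative):

```js
import { AgentBuilder, LlmAgent } from "@iqai/adk";

const agent = new LlmAgent({ name: "helper", model: "gemini-2.5-flash" });

const builder = AgentBuilder.create("helper")
  .withAgent(agent) // locks the definition; the builder adopts the agent's name
  .withModel("gemini-2.5-flash-001"); // ignored, warns once in dev
```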
10769
10876
 
10770
10877
  // src/memory/index.ts
@@ -10818,7 +10925,7 @@ var VertexAiSessionService = class extends BaseSessionService {
10818
10925
  path: `reasoningEngines/${reasoningEngineId}/sessions`,
10819
10926
  request_dict: sessionJsonDict
10820
10927
  });
10821
- console.info("Create Session response", apiResponse);
10928
+ console.debug("Create Session response", apiResponse);
10822
10929
  const createdSessionId = apiResponse.name.split("/").slice(-3, -2)[0];
10823
10930
  const operationId = apiResponse.name.split("/").pop();
10824
10931
  let maxRetryAttempt = 5;
@@ -10929,14 +11036,14 @@ var VertexAiSessionService = class extends BaseSessionService {
10929
11036
  async listSessions(appName, userId) {
10930
11037
  const reasoningEngineId = this.getReasoningEngineId(appName);
10931
11038
  const apiClient = this.getApiClient();
10932
- let path2 = `reasoningEngines/${reasoningEngineId}/sessions`;
11039
+ let path3 = `reasoningEngines/${reasoningEngineId}/sessions`;
10933
11040
  if (userId) {
10934
11041
  const parsedUserId = encodeURIComponent(`"${userId}"`);
10935
- path2 = `${path2}?filter=user_id=${parsedUserId}`;
11042
+ path3 = `${path3}?filter=user_id=${parsedUserId}`;
10936
11043
  }
10937
11044
  const apiResponse = await apiClient.async_request({
10938
11045
  http_method: "GET",
10939
- path: path2,
11046
+ path: path3,
10940
11047
  request_dict: {}
10941
11048
  });
10942
11049
  if (apiResponse.httpHeaders) {
@@ -11752,12 +11859,1299 @@ __export(flows_exports, {
11752
11859
  removeClientFunctionCallId: () => removeClientFunctionCallId
11753
11860
  });
11754
11861
 
11862
+ // src/evaluation/index.ts
11863
+ var evaluation_exports = {};
11864
+ __export(evaluation_exports, {
11865
+ AgentEvaluator: () => AgentEvaluator,
11866
+ EvalResult: () => EvalResult,
11867
+ EvalStatus: () => EvalStatus,
11868
+ Evaluator: () => Evaluator,
11869
+ FinalResponseMatchV2Evaluator: () => FinalResponseMatchV2Evaluator,
11870
+ LocalEvalService: () => LocalEvalService,
11871
+ PrebuiltMetrics: () => PrebuiltMetrics,
11872
+ RougeEvaluator: () => RougeEvaluator,
11873
+ SafetyEvaluatorV1: () => SafetyEvaluatorV1,
11874
+ TrajectoryEvaluator: () => TrajectoryEvaluator
11875
+ });
11876
+
11877
+ // src/evaluation/evaluator.ts
11878
+ var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
11879
+ EvalStatus2[EvalStatus2["PASSED"] = 1] = "PASSED";
11880
+ EvalStatus2[EvalStatus2["FAILED"] = 2] = "FAILED";
11881
+ EvalStatus2[EvalStatus2["NOT_EVALUATED"] = 3] = "NOT_EVALUATED";
11882
+ return EvalStatus2;
11883
+ })(EvalStatus || {});
11884
+ var Evaluator = class {
11885
+ constructor(metric) {
11886
+ this.metric = metric;
11887
+ }
11888
+ static getMetricInfo(metricName) {
11889
+ throw new Error("getMetricInfo() must be implemented by subclass");
11890
+ }
11891
+ };
11892
+
11893
+ // src/evaluation/eval-metrics.ts
11894
+ var PrebuiltMetrics = /* @__PURE__ */ ((PrebuiltMetrics2) => {
11895
+ PrebuiltMetrics2["TOOL_TRAJECTORY_AVG_SCORE"] = "tool_trajectory_avg_score";
11896
+ PrebuiltMetrics2["RESPONSE_EVALUATION_SCORE"] = "response_evaluation_score";
11897
+ PrebuiltMetrics2["RESPONSE_MATCH_SCORE"] = "response_match_score";
11898
+ PrebuiltMetrics2["SAFETY_V1"] = "safety_v1";
11899
+ PrebuiltMetrics2["FINAL_RESPONSE_MATCH_V2"] = "final_response_match_v2";
11900
+ PrebuiltMetrics2["TOOL_TRAJECTORY_SCORE"] = "tool_trajectory_score";
11901
+ PrebuiltMetrics2["SAFETY"] = "safety";
11902
+ PrebuiltMetrics2["RESPONSE_MATCH"] = "response_match";
11903
+ return PrebuiltMetrics2;
11904
+ })(PrebuiltMetrics || {});
11905
+
11906
+ // src/evaluation/eval-result.ts
11907
+ var EvalResult = class {
11908
+ evalSetResultId;
11909
+ evalSetResultName;
11910
+ evalSetId;
11911
+ evalCaseResults;
11912
+ creationTimestamp;
11913
+ constructor(init) {
11914
+ this.evalSetResultId = init.evalSetResultId || "";
11915
+ this.evalSetResultName = init.evalSetResultName;
11916
+ this.evalSetId = init.evalSetId || "";
11917
+ this.evalCaseResults = init.evalCaseResults || [];
11918
+ this.creationTimestamp = init.creationTimestamp || Date.now() / 1e3;
11919
+ }
11920
+ };
11921
+
11922
+ // src/evaluation/agent-evaluator.ts
11923
+ import * as fs2 from "fs/promises";
11924
+ import * as path2 from "path";
11925
+
11926
+ // src/evaluation/base-eval-service.ts
11927
+ var BaseEvalService = class {
11928
+ async *evaluateSession(session) {
11929
+ const inferenceResults = [];
11930
+ for await (const result of this.performInference({
11931
+ evalSetId: session.evalSetId,
11932
+ evalCases: session.evalCases
11933
+ })) {
11934
+ inferenceResults.push(result);
11935
+ }
11936
+ for await (const result of this.evaluate({
11937
+ inferenceResults,
11938
+ evaluateConfig: session.evaluateConfig
11939
+ })) {
11940
+ yield result;
11941
+ }
11942
+ }
11943
+ };
11944
+
11945
+ // src/evaluation/vertex-ai-eval-facade.ts
11946
+ var ERROR_MESSAGE_SUFFIX = `
11947
+ You should specify both project id and location. This metric uses Vertex Gen AI
11948
+ Eval SDK, and it requires google cloud credentials.
11949
+
11950
+ If using an .env file add the values there, or explicitly set in the code using
11951
+ the template below:
11952
+
11953
+ process.env.GOOGLE_CLOUD_LOCATION = <LOCATION>
11954
+ process.env.GOOGLE_CLOUD_PROJECT = <PROJECT ID>
11955
+ `;
11956
+ var VertexAiEvalFacade = class _VertexAiEvalFacade {
11957
+ threshold;
11958
+ metricName;
11959
+ constructor(config) {
11960
+ this.threshold = config.threshold;
11961
+ this.metricName = config.metricName;
11962
+ }
11963
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
11964
+ let totalScore = 0;
11965
+ let numInvocations = 0;
11966
+ const perInvocationResults = [];
11967
+ for (let i = 0; i < actualInvocations.length; i++) {
11968
+ const actual = actualInvocations[i];
11969
+ const expected = expectedInvocations[i];
11970
+ const prompt = this._getText(expected.userContent);
11971
+ const reference = this._getText(expected.finalResponse);
11972
+ const response = this._getText(actual.finalResponse);
11973
+ const evalCase = {
11974
+ prompt,
11975
+ reference,
11976
+ response
11977
+ };
11978
+ try {
11979
+ const evalCaseResult = await _VertexAiEvalFacade._performEval(
11980
+ [evalCase],
11981
+ [this.metricName]
11982
+ );
11983
+ const score = this._getScore(evalCaseResult);
11984
+ perInvocationResults.push({
11985
+ actualInvocation: actual,
11986
+ expectedInvocation: expected,
11987
+ score,
11988
+ evalStatus: this._getEvalStatus(score)
11989
+ });
11990
+ if (score !== null && score !== void 0) {
11991
+ totalScore += score;
11992
+ numInvocations++;
11993
+ }
11994
+ } catch (error) {
11995
+ console.error("Error evaluating invocation:", error);
11996
+ perInvocationResults.push({
11997
+ actualInvocation: actual,
11998
+ expectedInvocation: expected,
11999
+ score: void 0,
12000
+ evalStatus: 3 /* NOT_EVALUATED */
12001
+ });
12002
+ }
12003
+ }
12004
+ if (perInvocationResults.length > 0) {
12005
+ const overallScore = numInvocations > 0 ? totalScore / numInvocations : void 0;
12006
+ return {
12007
+ overallScore,
12008
+ overallEvalStatus: this._getEvalStatus(overallScore),
12009
+ perInvocationResults
12010
+ };
12011
+ }
12012
+ return {
12013
+ overallScore: void 0,
12014
+ overallEvalStatus: 3 /* NOT_EVALUATED */,
12015
+ perInvocationResults: []
12016
+ };
12017
+ }
12018
+ _getText(content) {
12019
+ if (content?.parts) {
12020
+ return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
12021
+ }
12022
+ return "";
12023
+ }
12024
+ _getScore(evalResult) {
12025
+ if (evalResult?.summaryMetrics?.[0]?.meanScore !== void 0 && typeof evalResult.summaryMetrics[0].meanScore === "number" && !Number.isNaN(evalResult.summaryMetrics[0].meanScore)) {
12026
+ return evalResult.summaryMetrics[0].meanScore;
12027
+ }
12028
+ return void 0;
12029
+ }
12030
+ _getEvalStatus(score) {
12031
+ if (score !== null && score !== void 0) {
12032
+ return score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
12033
+ }
12034
+ return 3 /* NOT_EVALUATED */;
12035
+ }
12036
+ static async _performEval(dataset, metrics) {
12037
+ const projectId = process.env.GOOGLE_CLOUD_PROJECT;
12038
+ const location = process.env.GOOGLE_CLOUD_LOCATION;
12039
+ if (!projectId) {
12040
+ throw new Error(`Missing project id. ${ERROR_MESSAGE_SUFFIX}`);
12041
+ }
12042
+ if (!location) {
12043
+ throw new Error(`Missing location. ${ERROR_MESSAGE_SUFFIX}`);
12044
+ }
12045
+ console.warn(
12046
+ "Vertex AI evaluation is not fully implemented. Using mock response."
12047
+ );
12048
+ return {
12049
+ summaryMetrics: [
12050
+ {
12051
+ meanScore: Math.random() * 0.5 + 0.5
12052
+ }
12053
+ ]
12054
+ };
12055
+ }
12056
+ };
12057
+
12058
+ // src/evaluation/response-evaluator.ts
12059
+ var ResponseEvaluator = class extends Evaluator {
12060
+ metricName;
12061
+ threshold;
12062
+ constructor(evalMetric) {
12063
+ super(evalMetric);
12064
+ if (evalMetric.metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
12065
+ this.metricName = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
12066
+ } else if (evalMetric.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
12067
+ this.metricName = "response_match_score" /* RESPONSE_MATCH_SCORE */;
12068
+ } else {
12069
+ throw new Error(`Metric ${evalMetric.metricName} is not supported.`);
12070
+ }
12071
+ this.threshold = evalMetric.threshold;
12072
+ }
12073
+ static getMetricInfo(metricName) {
12074
+ if (metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
12075
+ return {
12076
+ metricName: "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */,
12077
+ description: "This metric evaluates how coherent agent's response was. Value range of this metric is [1,5], with values closer to 5 more desirable.",
12078
+ metricValueInfo: {
12079
+ interval: {
12080
+ minValue: 1,
12081
+ maxValue: 5,
12082
+ openAtMin: false,
12083
+ openAtMax: false
12084
+ }
12085
+ }
12086
+ };
12087
+ }
12088
+ if (metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
12089
+ return {
12090
+ metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
12091
+ description: "This metric evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
12092
+ metricValueInfo: {
12093
+ interval: {
12094
+ minValue: 0,
12095
+ maxValue: 1,
12096
+ openAtMin: false,
12097
+ openAtMax: false
12098
+ }
12099
+ }
12100
+ };
12101
+ }
12102
+ throw new Error(`Metric ${metricName} is not supported.`);
12103
+ }
12104
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
12105
+ if (this.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
12106
+ return this.evaluateRougeScore(actualInvocations, expectedInvocations);
12107
+ }
12108
+ const vertexAiFacade = new VertexAiEvalFacade({
12109
+ threshold: this.threshold,
12110
+ metricName: this.metricName
12111
+ });
12112
+ return vertexAiFacade.evaluateInvocations(
12113
+ actualInvocations,
12114
+ expectedInvocations
12115
+ );
12116
+ }
12117
+ async evaluateRougeScore(actualInvocations, expectedInvocations) {
12118
+ if (actualInvocations.length !== expectedInvocations.length) {
12119
+ throw new Error("Number of actual and expected invocations must match");
12120
+ }
12121
+ const results = [];
12122
+ for (let i = 0; i < actualInvocations.length; i++) {
12123
+ const actual = actualInvocations[i];
12124
+ const expected = expectedInvocations[i];
12125
+ const result = await this.evaluateInvocation(actual, expected);
12126
+ results.push(result);
12127
+ }
12128
+ const scores = results.map((r) => r.score).filter((s) => s !== void 0);
12129
+ const overallScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
12130
+ const overallStatus = overallScore !== void 0 && overallScore >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
12131
+ return {
12132
+ overallScore,
12133
+ overallEvalStatus: overallStatus,
12134
+ perInvocationResults: results
12135
+ };
12136
+ }
12137
+ async evaluateInvocation(actual, expected) {
12138
+ if (!actual.finalResponse || !expected.finalResponse) {
12139
+ return {
12140
+ actualInvocation: actual,
12141
+ expectedInvocation: expected,
12142
+ evalStatus: 3 /* NOT_EVALUATED */
12143
+ };
12144
+ }
12145
+ const score = await this.computeRougeScore(
12146
+ actual.finalResponse,
12147
+ expected.finalResponse
12148
+ );
12149
+ return {
12150
+ actualInvocation: actual,
12151
+ expectedInvocation: expected,
12152
+ score,
12153
+ evalStatus: score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */
12154
+ };
12155
+ }
12156
+ async computeRougeScore(actual, expected) {
12157
+ const actualText = this.extractText(actual);
12158
+ const expectedText = this.extractText(expected);
12159
+ if (!actualText.trim() || !expectedText.trim()) {
12160
+ return 0;
12161
+ }
12162
+ const actualTokens = this.tokenizeText(actualText);
12163
+ const expectedTokens = this.tokenizeText(expectedText);
12164
+ const actualUnigrams = new Set(actualTokens);
12165
+ const expectedUnigrams = new Set(expectedTokens);
12166
+ const commonUnigrams = new Set(
12167
+ [...actualUnigrams].filter((token) => expectedUnigrams.has(token))
12168
+ );
12169
+ const precision = actualUnigrams.size > 0 ? commonUnigrams.size / actualUnigrams.size : 0;
12170
+ const recall = expectedUnigrams.size > 0 ? commonUnigrams.size / expectedUnigrams.size : 0;
12171
+ const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
12172
+ return fmeasure;
12173
+ }
12174
+ extractText(content) {
12175
+ if (content?.parts) {
12176
+ return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join(" ");
12177
+ }
12178
+ return "";
12179
+ }
12180
+ tokenizeText(text) {
12181
+ return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
12182
+ }
12183
+ };
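The ROUGE-1 implementation above is set-based: duplicate tokens collapse, precision is measured against the agent response's unigrams, recall against the reference's, and the two combine as an F-measure. A worked example whose numbers follow directly from the code (the threshold is illustrative):

const evaluator = new ResponseEvaluator({
  metricName: "response_match_score",
  threshold: 0.5, // illustrative
});
const score = await evaluator.computeRougeScore(
  { parts: [{ text: "The cat sat on the mat" }] }, // unigram set {the, cat, sat, on, mat}
  { parts: [{ text: "The cat ran" }] } // unigram set {the, cat, ran}
);
// overlap {the, cat}: precision 2/5, recall 2/3, F1 = 2pr/(p+r) = 0.5
console.log(score); // 0.5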
12184
+
12185
+ // src/evaluation/trajectory-evaluator.ts
12186
+ var TrajectoryEvaluator = class extends Evaluator {
12187
+ static getMetricInfo() {
12188
+ return {
12189
+ metricName: "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */,
12190
+ description: "This metric compares two tool call trajectories (expected vs. actual) for the same user interaction. It performs an exact match on the tool name and arguments for each step in the trajectory. A score of 1.0 indicates a perfect match, while 0.0 indicates a mismatch. Higher values are better.",
12191
+ metricValueInfo: {
12192
+ interval: {
12193
+ minValue: 0,
12194
+ maxValue: 1,
12195
+ openAtMin: false,
12196
+ openAtMax: false
12197
+ }
12198
+ }
12199
+ };
12200
+ }
12201
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
12202
+ let totalToolUseAccuracy = 0;
12203
+ let numInvocations = 0;
12204
+ const perInvocationResults = [];
12205
+ for (let i = 0; i < actualInvocations.length; i++) {
12206
+ const actual = actualInvocations[i];
12207
+ const expected = expectedInvocations[i];
12208
+ if (!actual.intermediateData?.toolUses || !expected.intermediateData?.toolUses) {
12209
+ perInvocationResults.push({
12210
+ actualInvocation: actual,
12211
+ expectedInvocation: expected,
12212
+ evalStatus: 3 /* NOT_EVALUATED */
12213
+ });
12214
+ continue;
12215
+ }
12216
+ const toolUseAccuracy = this.areToolCallsEqual(
12217
+ actual.intermediateData.toolUses,
12218
+ expected.intermediateData.toolUses
12219
+ ) ? 1 : 0;
12220
+ perInvocationResults.push({
12221
+ actualInvocation: actual,
12222
+ expectedInvocation: expected,
12223
+ score: toolUseAccuracy,
12224
+ evalStatus: toolUseAccuracy >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */
12225
+ });
12226
+ totalToolUseAccuracy += toolUseAccuracy;
12227
+ numInvocations++;
12228
+ }
12229
+ const overallScore = numInvocations > 0 ? totalToolUseAccuracy / numInvocations : 0;
12230
+ return {
12231
+ overallScore,
12232
+ overallEvalStatus: overallScore >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */,
12233
+ perInvocationResults
12234
+ };
12235
+ }
12236
+ areToolCallsEqual(actual, expected) {
12237
+ if (actual.length !== expected.length) {
12238
+ return false;
12239
+ }
12240
+ return actual.every((actualCall, index) => {
12241
+ const expectedCall = expected[index];
12242
+ return this.isToolCallEqual(actualCall, expectedCall);
12243
+ });
12244
+ }
12245
+ isToolCallEqual(actual, expected) {
12246
+ if (actual.name !== expected.name) {
12247
+ return false;
12248
+ }
12249
+ const actualArgs = actual.args || {};
12250
+ const expectedArgs = expected.args || {};
12251
+ const actualKeys = Object.keys(actualArgs).sort();
12252
+ const expectedKeys = Object.keys(expectedArgs).sort();
12253
+ if (actualKeys.length !== expectedKeys.length) {
12254
+ return false;
12255
+ }
12256
+ return actualKeys.every((key, index) => {
12257
+ const expectedKey = expectedKeys[index];
12258
+ if (key !== expectedKey) {
12259
+ return false;
12260
+ }
12261
+ return JSON.stringify(actualArgs[key]) === JSON.stringify(expectedArgs[key]);
12262
+ });
12263
+ }
12264
+ };
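Trajectory matching is exact and order-sensitive: both lists must have the same length, and each step must agree on the tool name and on JSON.stringify-equal arguments, so an extra, missing, reordered, or re-cased call scores 0 for that invocation. A small sketch (tool names, arguments, and threshold are illustrative):

const trajectoryEvaluator = new TrajectoryEvaluator({
  metricName: "tool_trajectory_avg_score",
  threshold: 1, // illustrative
});
trajectoryEvaluator.areToolCallsEqual(
  [{ name: "get_weather", args: { city: "Paris" } }],
  [{ name: "get_weather", args: { city: "Paris" } }]
); // true
trajectoryEvaluator.areToolCallsEqual(
  [{ name: "get_weather", args: { city: "Paris" } }],
  [{ name: "get_weather", args: { city: "paris" } }]
); // false -- the JSON.stringify comparison is case- and type-sensitive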
12265
+
12266
+ // src/evaluation/safety-evaluator.ts
12267
+ var SafetyEvaluatorV1 = class extends Evaluator {
12268
+ static getMetricInfo() {
12269
+ return {
12270
+ metricName: "safety_v1" /* SAFETY_V1 */,
12271
+ description: "This metric evaluates the safety (harmlessness) of an Agent's Response. Value range of the metric is [0, 1], with values closer to 1 to be more desirable (safe).",
12272
+ metricValueInfo: {
12273
+ interval: {
12274
+ minValue: 0,
12275
+ maxValue: 1,
12276
+ openAtMin: false,
12277
+ openAtMax: false
12278
+ }
12279
+ }
12280
+ };
12281
+ }
12282
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
12283
+ const facade = new VertexAiEvalFacade({
12284
+ threshold: this.metric.threshold,
12285
+ metricName: "safety_v1" /* SAFETY_V1 */
12286
+ });
12287
+ return await facade.evaluateInvocations(
12288
+ actualInvocations,
12289
+ expectedInvocations
12290
+ );
12291
+ }
12292
+ };
12293
+
12294
+ // src/evaluation/llm-as-judge-utils.ts
12295
+ function getTextFromContent(content) {
12296
+ if (content?.parts) {
12297
+ return content.parts.map((part) => part.text).filter(Boolean).join("\n");
12298
+ }
12299
+ return "";
12300
+ }
12301
+ function getEvalStatus(score, threshold) {
12302
+ return score >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
12303
+ }
12304
+
12305
+ // src/evaluation/llm-as-judge.ts
12306
+ var LlmAsJudge = class {
12307
+ async sampleJudge(prompt, numSamples, critiqueParser, judgeModelOptions) {
12308
+ const modelName = judgeModelOptions?.judgeModel || "gemini-2.5-flash";
12309
+ const model = LLMRegistry.getModelOrCreate(modelName);
12310
+ const config = judgeModelOptions?.judgeModelConfig || {};
12311
+ const samples = [];
12312
+ for (let i = 0; i < numSamples; i++) {
12313
+ try {
12314
+ const response = await model.generateContent({
12315
+ prompt,
12316
+ ...config
12317
+ });
12318
+ const label = critiqueParser(response.text);
12319
+ if (label !== "not_found" /* NOT_FOUND */) {
12320
+ samples.push(label);
12321
+ }
12322
+ } catch (error) {
12323
+ console.error("Error sampling judge model:", error);
12324
+ }
12325
+ }
12326
+ return samples;
12327
+ }
12328
+ };
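sampleJudge calls the judge model once per sample and keeps only the labels the parser could extract, so callers receive between 0 and numSamples votes. A hedged sketch with an inline parser (the prompt text and parser are illustrative; gemini-2.5-flash is the default used above):

const judge = new LlmAsJudge();
const labels = await judge.sampleJudge(
  "Is this response valid? Answer with exactly 'valid' or 'invalid'.", // illustrative prompt
  3, // numSamples
  (text) => {
    const t = text.trim().toLowerCase();
    return t === "valid" || t === "invalid" ? t : "not_found"; // NOT_FOUND samples are dropped
  },
  { judgeModel: "gemini-2.5-flash" }
);
// labels might be, e.g., ["valid", "invalid"] if one of the three samples failed to parse.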
12329
+
12330
+ // src/evaluation/final-response-match-v2.ts
12331
+ var FINAL_RESPONSE_MATCH_V2_PROMPT = `You are an expert rater for an AI agent. The AI agent is going to call an API to answer the user query and generate API tool-use code based on the choice of the API and API arguments. The ideal model response should be a function call that fulfills the user query, or a natural language response that hedges or asks the user for further clarification if a function call does not apply.
12332
+ The primary focus of this rating task is to check correctness of the model responses.
12333
+
12334
+ The data consists of:
12335
+ - A user query.
12336
+ - A model generated response for the prompt. The responses can consist of:
12337
+ - Natural language, when the model is asking for clarification, or tells the user it does not possess the requested functionality / option.
12338
+ - Code, in the form of one or multiple python function calls, and additional code as needed, for when the model is fulfilling the user request.
12339
+ You can use the help from a reference response annotated by a human rater. This reference response is of high quality. You can compare the agent's response with the reference response and decide if the agent's response is valid.
12340
+ Note sometimes the reference response only contains the key entities of the correct answer and you need to be flexible to allow the agent response to contain more information than the reference response, or to present the key entities in a different format or structure or in shorter or longer format.
12341
+ When the agent response is provided in the form of tables/dataframes or should be best provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response. Likewise, if you have the reference response, then find out the key entities and main components in them and check whether you can retrieve those from the agent response. If the prompt does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
12342
+
12343
+ You should follow the constitutions below very carefully to rate the model response:
12344
+ - Allow flexibility of format even when reference code only uses one of the possible format, unless API spec or user prompt has explicit format requirement
12345
+ - e.g. For state name, allow both abbreviation and full name unless API spec has explicit requirement. e.g. both 'tx' and 'Texas' should be allowed in the agent response even when reference code only uses one of them.
12346
+ - e.g. If a reference response list outputs in a list format, the agent response is allowed to use sentence format and vice versa unless user prompt explicitly asks for a specific format.
12347
+ - e.g. For numbers, allow flexibility of formatting, e.g. 1000000 vs 1,000,000.
12348
+ - The model shouldn't assume that it doesn't have access to the relevant data or is incapable of answering the question if the reference response is able to find a legitimate answer.
12349
+ - If the model response contains the correct final answer, rate it as valid even when the model response contains more information than the reference response.
12350
+ - If the user prompt has csv or other table format data, don't read it yourself. Trust the reference response final answer instead.
12351
+ - When the validation needs maths, date calculations, do not use your own calculator. Trust the reference response final answer instead.
12352
+ - Be mindful about unit of numbers. For example, if the reference response says 100 miles, but the model response says 100 km, it is invalid.
12353
+ - When the agent response or the reference response is provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response and whether those match the reference response. If the user query does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
12354
+ - When the answer is in numeric format, check whether there are any format requirements in the numeric format, rounding, precision, number of decimals, etc. specified in the user query and the prompt. If there are no such instructions, then tolerate different numerical formats.
12355
+ - When the answer is in numeric format and there are rounding or precision differences between the agent response and the reference response, if no further instructions are provided evaluate if the rounding strategy or precision in the agent response follows the standards for that entity. For instance, model accuracy scores must be reported with at least two decimal places (e.g., 0.798 \u2192 0.80 is acceptable, but 0.7 is not).
12356
+
12357
+ Below are the inputs:
12358
+ {{
12359
+ "User prompt": {prompt},
12360
+ "Agent response": {response},
12361
+ "Reference response": {golden_response},
12362
+ }}
12363
+
12364
+ The answer should be a json alone which follows the json structure below:
12365
+ {{
12366
+ "reasoning": [reasoning],
12367
+ "is_the_agent_response_valid": [valid or invalid],
12368
+ }}
12369
+ Answer with assertiveness:
12370
+ `;
12371
+ var DEFAULT_NUM_SAMPLES = 5;
12372
+ function parseCritique(response) {
12373
+ const labelMatchIsResponseValid = response.match(
12374
+ /"is_the_agent_response_valid":\s*\[*[\n\s]*"*([^"^\]^\s]*)"*[\n\s]*\]*\s*[,\n\}]/
12375
+ );
12376
+ if (labelMatchIsResponseValid?.[1]) {
12377
+ const label = labelMatchIsResponseValid[1].toLowerCase();
12378
+ return label === "valid" ? "valid" /* VALID */ : "invalid" /* INVALID */;
12379
+ }
12380
+ return "not_found" /* NOT_FOUND */;
12381
+ }
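parseCritique only needs the verdict token: the regex tolerates optional brackets, quotes, and whitespace around it, lowercases it, and treats anything other than "valid" as invalid, while a missing key yields the not-found sentinel. Concrete behavior (inputs are illustrative):

parseCritique('{"reasoning": ["ok"], "is_the_agent_response_valid": ["valid"],}'); // "valid"
parseCritique('{"is_the_agent_response_valid": "Invalid"}'); // "invalid"
parseCritique("no structured verdict here"); // "not_found"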
12382
+ var FinalResponseMatchV2Evaluator = class extends Evaluator {
12383
+ constructor(evalMetric, llmAsJudge = new LlmAsJudge()) {
12384
+ super(evalMetric);
12385
+ this.llmAsJudge = llmAsJudge;
12386
+ }
12387
+ static getMetricInfo() {
12388
+ return {
12389
+ metricName: "final_response_match_v2" /* FINAL_RESPONSE_MATCH_V2 */,
12390
+ description: "This metric evaluates if the agent's final response matches a golden/expected final response using an LLM judge. Value range for this metric is [0,1], with values closer to 1 more desirable.",
12391
+ metricValueInfo: {
12392
+ interval: {
12393
+ minValue: 0,
12394
+ maxValue: 1,
12395
+ openAtMin: false,
12396
+ openAtMax: false
12397
+ }
12398
+ }
12399
+ };
12400
+ }
12401
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
12402
+ const perInvocationResults = [];
12403
+ let totalScore = 0;
12404
+ let numInvocations = 0;
12405
+ if (!actualInvocations.length) {
12406
+ return {
12407
+ overallEvalStatus: 3 /* NOT_EVALUATED */,
12408
+ perInvocationResults: []
12409
+ };
12410
+ }
12411
+ for (let i = 0; i < actualInvocations.length; i++) {
12412
+ const actual = actualInvocations[i];
12413
+ const expected = expectedInvocations[i];
12414
+ const prompt = getTextFromContent(expected.userContent);
12415
+ const response = getTextFromContent(actual.finalResponse);
12416
+ const goldenResponse = getTextFromContent(expected.finalResponse);
12417
+ const formattedPrompt = FINAL_RESPONSE_MATCH_V2_PROMPT.replace(
12418
+ "{prompt}",
12419
+ prompt
12420
+ ).replace("{response}", response).replace("{golden_response}", goldenResponse);
12421
+ const numSamples = this.metric.judgeModelOptions?.numSamples ?? DEFAULT_NUM_SAMPLES;
12422
+ const labels = await this.llmAsJudge.sampleJudge(
12423
+ formattedPrompt,
12424
+ numSamples,
12425
+ parseCritique,
12426
+ this.metric.judgeModelOptions
12427
+ );
12428
+ const score = labels.length > 0 ? labels.filter((l) => l === "valid" /* VALID */).length / labels.length : 0;
12429
+ perInvocationResults.push({
12430
+ actualInvocation: actual,
12431
+ expectedInvocation: expected,
12432
+ score,
12433
+ evalStatus: getEvalStatus(score, this.metric.threshold)
12434
+ });
12435
+ totalScore += score;
12436
+ numInvocations++;
12437
+ }
12438
+ const overallScore = totalScore / numInvocations;
12439
+ return {
12440
+ overallScore,
12441
+ overallEvalStatus: getEvalStatus(overallScore, this.metric.threshold),
12442
+ perInvocationResults
12443
+ };
12444
+ }
12445
+ };
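Each invocation's score is the fraction of parsed judge samples that voted "valid"; with the default of five samples, three "valid" votes give 0.6, and the overall score is the mean across invocations. A hedged usage sketch (threshold and options are illustrative):

const v2Evaluator = new FinalResponseMatchV2Evaluator({
  metricName: "final_response_match_v2",
  threshold: 0.5, // illustrative
  judgeModelOptions: { judgeModel: "gemini-2.5-flash", numSamples: 5 },
});
const { overallScore, overallEvalStatus } = await v2Evaluator.evaluateInvocations(
  actualInvocations,
  expectedInvocations
);
// e.g. 3 of 5 "valid" votes per invocation => overallScore 0.6 => PASSED at threshold 0.5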
12446
+
12447
+ // src/evaluation/metric-evaluator-registry.ts
12448
+ var MetricEvaluatorRegistry = class {
12449
+ registry = /* @__PURE__ */ new Map();
12450
+ getEvaluator(evalMetric) {
12451
+ const entry = this.registry.get(evalMetric.metricName);
12452
+ if (!entry) {
12453
+ throw new Error(`${evalMetric.metricName} not found in registry.`);
12454
+ }
12455
+ return new entry.evaluator(evalMetric);
12456
+ }
12457
+ registerEvaluator(metricInfo, evaluator) {
12458
+ const metricName = metricInfo.metricName;
12459
+ if (this.registry.has(metricName)) {
12460
+ console.info(
12461
+ `Updating Evaluator class for ${metricName} from ${this.registry.get(metricName)?.evaluator.name} to ${evaluator.name}`
12462
+ );
12463
+ }
12464
+ this.registry.set(metricName, {
12465
+ evaluator,
12466
+ metricInfo: { ...metricInfo }
12467
+ });
12468
+ }
12469
+ getRegisteredMetrics() {
12470
+ return Array.from(this.registry.values()).map((entry) => ({
12471
+ ...entry.metricInfo
12472
+ }));
12473
+ }
12474
+ };
12475
+ function getDefaultMetricEvaluatorRegistry() {
12476
+ const registry = new MetricEvaluatorRegistry();
12477
+ registry.registerEvaluator(
12478
+ TrajectoryEvaluator.getMetricInfo(),
12479
+ TrajectoryEvaluator
12480
+ );
12481
+ registry.registerEvaluator(
12482
+ ResponseEvaluator.getMetricInfo("response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */),
12483
+ ResponseEvaluator
12484
+ );
12485
+ registry.registerEvaluator(
12486
+ ResponseEvaluator.getMetricInfo("response_match_score" /* RESPONSE_MATCH_SCORE */),
12487
+ ResponseEvaluator
12488
+ );
12489
+ registry.registerEvaluator(
12490
+ SafetyEvaluatorV1.getMetricInfo(),
12491
+ SafetyEvaluatorV1
12492
+ );
12493
+ registry.registerEvaluator(
12494
+ FinalResponseMatchV2Evaluator.getMetricInfo(),
12495
+ FinalResponseMatchV2Evaluator
12496
+ );
12497
+ return registry;
12498
+ }
12499
+ var DEFAULT_METRIC_EVALUATOR_REGISTRY = getDefaultMetricEvaluatorRegistry();
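The registry maps a metric name to an evaluator constructor; registering a name twice replaces the entry (with an informational log), and getEvaluator instantiates the registered class with the eval metric. A sketch of wiring in a hypothetical custom metric (the name and class below are illustrative, not part of the package):

class AlwaysPassEvaluator extends Evaluator {
  async evaluateInvocations(actual, expected) {
    return { overallScore: 1, overallEvalStatus: 1 /* PASSED */, perInvocationResults: [] };
  }
}
DEFAULT_METRIC_EVALUATOR_REGISTRY.registerEvaluator(
  {
    metricName: "always_pass", // illustrative
    description: "Always passes (illustrative).",
    metricValueInfo: {
      interval: { minValue: 0, maxValue: 1, openAtMin: false, openAtMax: false }
    }
  },
  AlwaysPassEvaluator
);
const custom = DEFAULT_METRIC_EVALUATOR_REGISTRY.getEvaluator({
  metricName: "always_pass",
  threshold: 0.5
});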
12500
+
12501
+ // src/evaluation/local-eval-service.ts
12502
+ var LocalEvalService = class extends BaseEvalService {
12503
+ constructor(agent, parallelism = 4) {
12504
+ super();
12505
+ this.agent = agent;
12506
+ this.parallelism = parallelism;
12507
+ this.initializeRunner();
12508
+ }
12509
+ runner;
12510
+ async initializeRunner() {
12511
+ if ("ask" in this.agent) {
12512
+ this.runner = this.agent;
12513
+ } else {
12514
+ try {
12515
+ const { runner } = await AgentBuilder.create("eval_agent").withModel("gemini-2.5-flash").withDescription("Agent for evaluation purposes").build();
12516
+ this.runner = {
12517
+ ask: async (message) => {
12518
+ return await runner.ask(message);
12519
+ }
12520
+ };
12521
+ } catch (error) {
12522
+ console.warn(
12523
+ "Failed to create AgentBuilder runner, falling back to mock:",
12524
+ error
12525
+ );
12526
+ this.runner = {
12527
+ ask: async (message) => {
12528
+ return `Mock response to: ${message}`;
12529
+ }
12530
+ };
12531
+ }
12532
+ }
12533
+ }
12534
+ async *performInference(request) {
12535
+ for (const evalSet of request.evalCases) {
12536
+ for (const evalCase of evalSet.evalCases) {
12537
+ const expected = [];
12538
+ for (const convo of evalCase.conversation) {
12539
+ if (convo.finalResponse) {
12540
+ expected.push({
12541
+ invocationId: `${evalCase.evalId}-expected-${expected.length}`,
12542
+ userContent: convo.userContent,
12543
+ finalResponse: convo.finalResponse,
12544
+ intermediateData: convo.intermediateData,
12545
+ creationTimestamp: convo.creationTimestamp
12546
+ });
12547
+ }
12548
+ }
12549
+ const actual = await this.runInference(evalCase);
12550
+ yield [...expected, ...actual];
12551
+ }
12552
+ }
12553
+ }
12554
+ async *evaluate(request) {
12555
+ const { inferenceResults, evaluateConfig } = request;
12556
+ const resultsByCase = /* @__PURE__ */ new Map();
12557
+ for (const result of inferenceResults) {
12558
+ const invocationId = result[0].invocationId;
12559
+ if (!invocationId) continue;
12560
+ const lastHyphenIndex = invocationId.lastIndexOf("-");
12561
+ const evalId = lastHyphenIndex !== -1 ? invocationId.substring(0, lastHyphenIndex) : invocationId;
12562
+ const existing = resultsByCase.get(evalId) || [];
12563
+ resultsByCase.set(evalId, [...existing, ...result]);
12564
+ }
12565
+ for (const [evalId, results] of resultsByCase) {
12566
+ const evalResult = {
12567
+ evalSetResultId: `${evalId}-result-${Date.now()}`,
12568
+ evalSetId: evalId,
12569
+ evalCaseResults: [],
12570
+ creationTimestamp: Date.now()
12571
+ };
12572
+ for (const evalMetric of evaluateConfig.evalMetrics) {
12573
+ const evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.getEvaluator(evalMetric);
12574
+ const actual = results.filter(
12575
+ (r) => !r.invocationId?.includes("expected")
12576
+ );
12577
+ const expected = results.filter(
12578
+ (r) => r.invocationId?.includes("expected")
12579
+ );
12580
+ const result = await evaluator.evaluateInvocations(actual, expected);
12581
+ evalResult.evalCaseResults.push({
12582
+ evalSetId: evalId,
12583
+ evalId,
12584
+ finalEvalStatus: result.perInvocationResults.length > 0 ? result.perInvocationResults[0].evalStatus : 3 /* NOT_EVALUATED */,
12585
+ overallEvalMetricResults: [],
12586
+ sessionId: evalId,
12587
+ evalMetricResultPerInvocation: result.perInvocationResults.map(
12588
+ (r) => ({
12589
+ actualInvocation: r.actualInvocation,
12590
+ expectedInvocation: r.expectedInvocation,
12591
+ evalMetricResults: [
12592
+ {
12593
+ metricName: evalMetric.metricName,
12594
+ threshold: evalMetric.threshold,
12595
+ score: r.score,
12596
+ evalStatus: r.evalStatus
12597
+ }
12598
+ ]
12599
+ })
12600
+ )
12601
+ });
12602
+ }
12603
+ yield evalResult;
12604
+ }
12605
+ }
12606
+ async runInference(evalCase) {
12607
+ const results = [];
12608
+ if (!this.runner) {
12609
+ await this.initializeRunner();
12610
+ }
12611
+ if (evalCase.sessionInput) {
12612
+ try {
12613
+ if (this.runner.initializeSession) {
12614
+ await this.runner.initializeSession(evalCase.sessionInput);
12615
+ } else if (this.runner.setSessionState) {
12616
+ await this.runner.setSessionState(evalCase.sessionInput);
12617
+ } else {
12618
+ console.log(
12619
+ `Session input provided for ${evalCase.evalId}:`,
12620
+ evalCase.sessionInput
12621
+ );
12622
+ }
12623
+ } catch (error) {
12624
+ console.warn(
12625
+ `Failed to initialize session for ${evalCase.evalId}:`,
12626
+ error
12627
+ );
12628
+ }
12629
+ }
12630
+ for (const invocation of evalCase.conversation) {
12631
+ try {
12632
+ const response = await this.runner.ask(invocation.userContent);
12633
+ results.push({
12634
+ invocationId: `${evalCase.evalId}-${results.length}`,
12635
+ userContent: invocation.userContent,
12636
+ finalResponse: {
12637
+ role: "model",
12638
+ parts: [{ text: response || "" }]
12639
+ },
12640
+ intermediateData: {
12641
+ toolUses: [],
12642
+ intermediateResponses: []
12643
+ },
12644
+ creationTimestamp: Date.now()
12645
+ });
12646
+ } catch (error) {
12647
+ console.error(`Error running inference for ${evalCase.evalId}:`, error);
12648
+ results.push({
12649
+ invocationId: `${evalCase.evalId}-${results.length}`,
12650
+ userContent: invocation.userContent,
12651
+ finalResponse: {
12652
+ role: "model",
12653
+ parts: [
12654
+ {
12655
+ text: `Error: ${error instanceof Error ? error.message : "Unknown error"}`
12656
+ }
12657
+ ]
12658
+ },
12659
+ intermediateData: {
12660
+ toolUses: [],
12661
+ intermediateResponses: []
12662
+ },
12663
+ creationTimestamp: Date.now()
12664
+ });
12665
+ }
12666
+ }
12667
+ return results;
12668
+ }
12669
+ };
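Both performInference and evaluate are async generators, so a typical run collects inference results first and then streams per-case eval results. A minimal end-to-end sketch (myAgent and the metric/threshold are illustrative; anything exposing ask() is accepted as a runner):

const service = new LocalEvalService(myAgent);
const inferenceResults = [];
for await (const result of service.performInference({
  evalSetId: evalSet.evalSetId,
  evalCases: [evalSet]
})) {
  inferenceResults.push(result);
}
for await (const evalResult of service.evaluate({
  inferenceResults,
  evaluateConfig: {
    evalMetrics: [{ metricName: "response_match_score", threshold: 0.8 }]
  }
})) {
  console.log(evalResult.evalCaseResults.map((r) => r.finalEvalStatus));
}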
12670
+
12671
+ // src/evaluation/agent-evaluator.ts
12672
+ var NUM_RUNS = 2;
12673
+ var TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */;
12674
+ var RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
12675
+ var RESPONSE_MATCH_SCORE_KEY = "response_match_score" /* RESPONSE_MATCH_SCORE */;
12676
+ var SAFETY_V1_KEY = "safety_v1" /* SAFETY_V1 */;
12677
+ var ALLOWED_CRITERIA = [
12678
+ TOOL_TRAJECTORY_SCORE_KEY,
12679
+ RESPONSE_EVALUATION_SCORE_KEY,
12680
+ RESPONSE_MATCH_SCORE_KEY,
12681
+ SAFETY_V1_KEY
12682
+ ];
12683
+ var QUERY_COLUMN = "query";
12684
+ var REFERENCE_COLUMN = "reference";
12685
+ var EXPECTED_TOOL_USE_COLUMN = "expected_tool_use";
12686
+ var DEFAULT_CRITERIA = {
12687
+ [TOOL_TRAJECTORY_SCORE_KEY]: 1,
12688
+ [RESPONSE_MATCH_SCORE_KEY]: 0.8
12689
+ };
12690
+ var loadJson = async (filePath) => {
12691
+ try {
12692
+ const fileContent = await fs2.readFile(filePath, "utf-8");
12693
+ return JSON.parse(fileContent);
12694
+ } catch (error) {
12695
+ throw new Error(`Failed to load JSON from ${filePath}: ${error}`);
12696
+ }
12697
+ };
12698
+ var AgentEvaluator = class _AgentEvaluator {
12699
+ static async findConfigForTestFile(testFile) {
12700
+ const testFolder = path2.dirname(testFile);
12701
+ const configPath = path2.join(testFolder, "test_config.json");
12702
+ try {
12703
+ await fs2.access(configPath);
12704
+ const configData = await loadJson(configPath);
12705
+ if ("criteria" in configData && typeof configData.criteria === "object") {
12706
+ return configData.criteria;
12707
+ }
12708
+ throw new Error(
12709
+ `Invalid format for test_config.json at ${configPath}. Expected a 'criteria' dictionary.`
12710
+ );
12711
+ } catch (error) {
12712
+ return DEFAULT_CRITERIA;
12713
+ }
12714
+ }
12715
+ static async evaluateEvalSet(agent, evalSet, criteria, numRuns = NUM_RUNS, printDetailedResults = false) {
12716
+ const evalMetrics = Object.entries(criteria).map(
12717
+ ([metricName, threshold]) => ({
12718
+ metricName,
12719
+ threshold
12720
+ })
12721
+ );
12722
+ const evalResultsByEvalId = await _AgentEvaluator._getEvalResultsByEvalId(
12723
+ agent,
12724
+ evalSet,
12725
+ evalMetrics,
12726
+ numRuns
12727
+ );
12728
+ const failures = [];
12729
+ for (const [_, evalResultsPerEvalId] of evalResultsByEvalId) {
12730
+ const evalMetricResults = _AgentEvaluator._getEvalMetricResultsWithInvocation(
12731
+ evalResultsPerEvalId
12732
+ );
12733
+ const failuresPerEvalCase = _AgentEvaluator._processMetricsAndGetFailures(
12734
+ evalMetricResults,
12735
+ printDetailedResults,
12736
+ agent.name || "Unknown Agent"
12737
+ );
12738
+ failures.push(...failuresPerEvalCase);
12739
+ }
12740
+ if (failures.length > 0) {
12741
+ throw new Error(
12742
+ `Following are all the test failures. If you are looking to get more details on the failures, then please re-run this test with \`printDetailedResults\` set to \`true\`.
12743
+ ${failures.join(
12744
+ "\n"
12745
+ )}`
12746
+ );
12747
+ }
12748
+ }
12749
+ static async evaluate(agent, evalDatasetFilePathOrDir, numRuns = NUM_RUNS, initialSessionFile) {
12750
+ const testFiles = [];
12751
+ try {
12752
+ const stat2 = await fs2.stat(evalDatasetFilePathOrDir);
12753
+ if (stat2.isDirectory()) {
12754
+ const files = await this._findTestFilesRecursively(
12755
+ evalDatasetFilePathOrDir
12756
+ );
12757
+ testFiles.push(...files);
12758
+ } else {
12759
+ testFiles.push(evalDatasetFilePathOrDir);
12760
+ }
12761
+ } catch (error) {
12762
+ throw new Error(`Invalid path: ${evalDatasetFilePathOrDir}`);
12763
+ }
12764
+ const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
12765
+ for (const testFile of testFiles) {
12766
+ const criteria = await _AgentEvaluator.findConfigForTestFile(testFile);
12767
+ const evalSet = await _AgentEvaluator._loadEvalSetFromFile(
12768
+ testFile,
12769
+ criteria,
12770
+ initialSession
12771
+ );
12772
+ await _AgentEvaluator.evaluateEvalSet(agent, evalSet, criteria, numRuns);
12773
+ }
12774
+ }
12775
+ static async migrateEvalDataToNewSchema(oldEvalDataFile, newEvalDataFile, initialSessionFile) {
12776
+ if (!oldEvalDataFile || !newEvalDataFile) {
12777
+ throw new Error("One of oldEvalDataFile or newEvalDataFile is empty.");
12778
+ }
12779
+ const criteria = await _AgentEvaluator.findConfigForTestFile(oldEvalDataFile);
12780
+ const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
12781
+ const evalSet = await _AgentEvaluator._getEvalSetFromOldFormat(
12782
+ oldEvalDataFile,
12783
+ criteria,
12784
+ initialSession
12785
+ );
12786
+ await fs2.writeFile(newEvalDataFile, JSON.stringify(evalSet, null, 2));
12787
+ }
12788
+ static async _findTestFilesRecursively(dir) {
12789
+ const testFiles = [];
12790
+ async function walk(currentDir) {
12791
+ const entries = await fs2.readdir(currentDir, { withFileTypes: true });
12792
+ for (const entry of entries) {
12793
+ const fullPath = path2.join(currentDir, entry.name);
12794
+ if (entry.isDirectory()) {
12795
+ await walk(fullPath);
12796
+ } else if (entry.name.endsWith(".test.json")) {
12797
+ testFiles.push(fullPath);
12798
+ }
12799
+ }
12800
+ }
12801
+ await walk(dir);
12802
+ return testFiles;
12803
+ }
12804
+ static async _loadEvalSetFromFile(evalSetFile, criteria, initialSession) {
12805
+ try {
12806
+ const content = await fs2.readFile(evalSetFile, "utf-8");
12807
+ try {
12808
+ const evalSet = JSON.parse(content);
12809
+ if (evalSet.evalSetId && evalSet.evalCases) {
12810
+ if (Object.keys(initialSession).length > 0) {
12811
+ throw new Error(
12812
+ "Initial session should be specified as a part of EvalSet file. Explicit initial session is only needed, when specifying data in the older schema."
12813
+ );
12814
+ }
12815
+ return evalSet;
12816
+ }
12817
+ } catch (parseError) {
12818
+ throw new Error(`Failed to parse eval set data: ${parseError}`);
12819
+ }
12820
+ } catch (error) {
12821
+ throw new Error(`Failed to process eval set file: ${error}`);
12822
+ }
12823
+ console.warn(
12824
+ `Contents of ${evalSetFile} appear to be in the older format. To avoid this warning, please update your test files to contain data in the EvalSet schema. You can use 'migrateEvalDataToNewSchema' for migrating your old test files.`
12825
+ );
12826
+ return _AgentEvaluator._getEvalSetFromOldFormat(
12827
+ evalSetFile,
12828
+ criteria,
12829
+ initialSession
12830
+ );
12831
+ }
12832
+ static async _getEvalSetFromOldFormat(evalSetFile, criteria, initialSession) {
12833
+ const data = await _AgentEvaluator._loadDataset(evalSetFile);
12834
+ _AgentEvaluator._validateInput(data, criteria);
12835
+ return {
12836
+ evalSetId: `eval-set-${Date.now()}`,
12837
+ name: evalSetFile,
12838
+ evalCases: data[0].map(
12839
+ (item, index) => ({
12840
+ evalId: `eval-${index}`,
12841
+ conversation: [
12842
+ {
12843
+ invocationId: `invocation-${index}`,
12844
+ userContent: {
12845
+ role: "user",
12846
+ parts: [{ text: item[QUERY_COLUMN] || "" }]
12847
+ },
12848
+ finalResponse: item[REFERENCE_COLUMN] ? {
12849
+ role: "model",
12850
+ parts: [{ text: item[REFERENCE_COLUMN] }]
12851
+ } : void 0,
12852
+ intermediateData: item[EXPECTED_TOOL_USE_COLUMN] ? {
12853
+ toolUses: item[EXPECTED_TOOL_USE_COLUMN],
12854
+ intermediateResponses: []
12855
+ } : void 0,
12856
+ creationTimestamp: Date.now()
12857
+ }
12858
+ ],
12859
+ sessionInput: Object.keys(initialSession).length > 0 ? {
12860
+ appName: "test-app",
12861
+ userId: "test-user",
12862
+ state: initialSession
12863
+ } : void 0
12864
+ })
12865
+ ),
12866
+ creationTimestamp: Date.now()
12867
+ };
12868
+ }
12869
+ static async _getInitialSession(initialSessionFile) {
12870
+ if (!initialSessionFile) {
12871
+ return {};
12872
+ }
12873
+ try {
12874
+ const content = await fs2.readFile(initialSessionFile, "utf-8");
12875
+ return JSON.parse(content);
12876
+ } catch (error) {
12877
+ throw new Error(
12878
+ `Failed to load initial session from ${initialSessionFile}: ${error}`
12879
+ );
12880
+ }
12881
+ }
12882
+ static async _loadDataset(inputData) {
12883
+ const stat2 = await fs2.stat(inputData);
12884
+ if (stat2.isDirectory()) {
12885
+ const testFiles = await this._findTestFilesRecursively(inputData);
12886
+ const results = await Promise.all(testFiles.map((f) => loadJson(f)));
12887
+ return results.map((r) => Array.isArray(r) ? r : [r]);
12888
+ }
12889
+ if (stat2.isFile()) {
12890
+ const data = await loadJson(inputData);
12891
+ return [Array.isArray(data) ? data : [data]];
12892
+ }
12893
+ throw new Error(`Invalid input path: ${inputData}`);
12894
+ }
12895
+ static _validateInput(evalDataset, criteria) {
12896
+ if (!evalDataset || evalDataset.length === 0) {
12897
+ throw new Error("The evaluation dataset is None or empty.");
12898
+ }
12899
+ for (const key of Object.keys(criteria)) {
12900
+ if (!ALLOWED_CRITERIA.includes(key)) {
12901
+ throw new Error(
12902
+ `Invalid criteria key: ${key}. Expected one of ${ALLOWED_CRITERIA.join(
12903
+ ", "
12904
+ )}.`
12905
+ );
12906
+ }
12907
+ }
12908
+ const sample = evalDataset[0];
12909
+ if (!Array.isArray(sample) || sample.length === 0) {
12910
+ throw new Error("The evaluation dataset is empty.");
12911
+ }
12912
+ const firstQuery = sample[0];
12913
+ if (typeof firstQuery !== "object") {
12914
+ throw new Error(
12915
+ `Each evaluation dataset sample must be a list of dictionaries. But it's ${JSON.stringify(
12916
+ evalDataset
12917
+ )}`
12918
+ );
12919
+ }
12920
+ if (TOOL_TRAJECTORY_SCORE_KEY in criteria) {
12921
+ if (!(QUERY_COLUMN in firstQuery) || !(EXPECTED_TOOL_USE_COLUMN in firstQuery)) {
12922
+ throw new Error(
12923
+ `Samples for ${TOOL_TRAJECTORY_SCORE_KEY} must include '${QUERY_COLUMN}' and '${EXPECTED_TOOL_USE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
12924
+ );
12925
+ }
12926
+ }
12927
+ if (RESPONSE_EVALUATION_SCORE_KEY in criteria) {
12928
+ if (!(QUERY_COLUMN in firstQuery)) {
12929
+ throw new Error(
12930
+ `Samples for ${RESPONSE_EVALUATION_SCORE_KEY} must include '${QUERY_COLUMN}' key. The sample is ${JSON.stringify(sample)}.`
12931
+ );
12932
+ }
12933
+ }
12934
+ if (RESPONSE_MATCH_SCORE_KEY in criteria) {
12935
+ if (!(QUERY_COLUMN in firstQuery) || !(REFERENCE_COLUMN in firstQuery)) {
12936
+ throw new Error(
12937
+ `Samples for ${RESPONSE_MATCH_SCORE_KEY} must include '${QUERY_COLUMN}' and '${REFERENCE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
12938
+ );
12939
+ }
12940
+ }
12941
+ }
12942
+ static _printDetails(evalMetricResultWithInvocations, overallEvalStatus, overallScore, metricName = "", threshold = 0) {
12943
+ console.log(
12944
+ `Summary: \`${overallEvalStatus}\` for Metric: \`${metricName}\`. Expected threshold: \`${threshold}\`, actual value: \`${overallScore}\`.`
12945
+ );
12946
+ const data = evalMetricResultWithInvocations.map((per) => ({
12947
+ evalStatus: per.evalMetricResult.evalStatus,
12948
+ score: per.evalMetricResult.score,
12949
+ threshold,
12950
+ prompt: _AgentEvaluator._convertContentToText(
12951
+ per.expectedInvocation.userContent
12952
+ ),
12953
+ expectedResponse: _AgentEvaluator._convertContentToText(
12954
+ per.expectedInvocation.finalResponse
12955
+ ),
12956
+ actualResponse: _AgentEvaluator._convertContentToText(
12957
+ per.actualInvocation.finalResponse
12958
+ ),
12959
+ expectedToolCalls: _AgentEvaluator._convertToolCallsToText(
12960
+ per.expectedInvocation.intermediateData
12961
+ ),
12962
+ actualToolCalls: _AgentEvaluator._convertToolCallsToText(
12963
+ per.actualInvocation.intermediateData
12964
+ )
12965
+ }));
12966
+ console.table(data);
12967
+ console.log("\n\n");
12968
+ }
12969
+ static _convertContentToText(content) {
12970
+ if (content?.parts) {
12971
+ return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
12972
+ }
12973
+ return "";
12974
+ }
12975
+ static _convertToolCallsToText(intermediateData) {
12976
+ if (intermediateData?.toolUses) {
12977
+ return intermediateData.toolUses.map((t) => JSON.stringify(t)).join("\n");
12978
+ }
12979
+ return "";
12980
+ }
12981
+ static async _getEvalResultsByEvalId(agent, evalSet, evalMetrics, numRuns) {
12982
+ const evalService = new LocalEvalService(agent);
12983
+ const inferenceResults = [];
12984
+ for (let run = 0; run < numRuns; run++) {
12985
+ for await (const result of evalService.performInference({
12986
+ evalSetId: evalSet.evalSetId,
12987
+ evalCases: [evalSet]
12988
+ })) {
12989
+ inferenceResults.push(result);
12990
+ }
12991
+ }
12992
+ const evalResultsByEvalId = /* @__PURE__ */ new Map();
12993
+ for await (const evalResult of evalService.evaluate({
12994
+ inferenceResults,
12995
+ evaluateConfig: { evalMetrics }
12996
+ })) {
12997
+ for (const caseResult of evalResult.evalCaseResults) {
12998
+ const evalId = caseResult.evalId;
12999
+ if (!evalResultsByEvalId.has(evalId)) {
13000
+ evalResultsByEvalId.set(evalId, []);
13001
+ }
13002
+ evalResultsByEvalId.get(evalId).push(caseResult);
13003
+ }
13004
+ }
13005
+ return evalResultsByEvalId;
13006
+ }
13007
+ static _getEvalMetricResultsWithInvocation(evalResultsPerEvalId) {
13008
+ const evalMetricResults = {};
13009
+ for (const evalCaseResult of evalResultsPerEvalId) {
13010
+ for (const evalMetricsPerInvocation of evalCaseResult.evalMetricResultPerInvocation) {
13011
+ for (const evalMetricResult of evalMetricsPerInvocation.evalMetricResults) {
13012
+ const metricName = evalMetricResult.metricName;
13013
+ if (!(metricName in evalMetricResults)) {
13014
+ evalMetricResults[metricName] = [];
13015
+ }
13016
+ evalMetricResults[metricName].push({
13017
+ actualInvocation: evalMetricsPerInvocation.actualInvocation,
13018
+ expectedInvocation: evalMetricsPerInvocation.expectedInvocation,
13019
+ evalMetricResult
13020
+ });
13021
+ }
13022
+ }
13023
+ }
13024
+ return evalMetricResults;
13025
+ }
13026
+ static _processMetricsAndGetFailures(evalMetricResults, printDetailedResults, agentModule) {
13027
+ const failures = [];
13028
+ for (const [metricName, evalMetricResultsWithInvocations] of Object.entries(
13029
+ evalMetricResults
13030
+ )) {
13031
+ const threshold = evalMetricResultsWithInvocations[0]?.evalMetricResult.threshold || 0;
13032
+ const scores = evalMetricResultsWithInvocations.map((m) => m.evalMetricResult.score).filter((s) => s !== void 0);
13033
+ let overallScore;
13034
+ let overallEvalStatus;
13035
+ if (scores.length > 0) {
13036
+ overallScore = scores.reduce((a, b) => a + b, 0) / scores.length;
13037
+ overallEvalStatus = overallScore >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
13038
+ } else {
13039
+ overallScore = void 0;
13040
+ overallEvalStatus = 3 /* NOT_EVALUATED */;
13041
+ }
13042
+ if (overallEvalStatus !== 1 /* PASSED */) {
13043
+ if (printDetailedResults) {
13044
+ _AgentEvaluator._printDetails(
13045
+ evalMetricResultsWithInvocations,
13046
+ overallEvalStatus,
13047
+ overallScore,
13048
+ metricName,
13049
+ threshold
13050
+ );
13051
+ }
13052
+ failures.push(
13053
+ `${metricName} for ${agentModule} Failed. Expected ${threshold}, but got ${overallScore}.`
13054
+ );
13055
+ }
13056
+ }
13057
+ return failures;
13058
+ }
13059
+ };
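AgentEvaluator.evaluate accepts a single file or a directory it walks for *.test.json files, looks for a sibling test_config.json with a criteria map (falling back to tool_trajectory_avg_score: 1 and response_match_score: 0.8), runs every case numRuns times, and throws one aggregated error on failure. A hedged sketch (the path and agent are illustrative):

// ./evals/test_config.json (optional, illustrative):
// { "criteria": { "tool_trajectory_avg_score": 1, "response_match_score": 0.8 } }
try {
  await AgentEvaluator.evaluate(myAgent, "./evals", 2 /* numRuns */);
  console.log("all eval cases passed");
} catch (err) {
  console.error(err.message); // one aggregated line per failing metric
}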
13060
+
13061
+ // src/evaluation/final-response-match-v1.ts
13062
+ var RougeEvaluator = class extends Evaluator {
13063
+ evalMetric;
13064
+ constructor(evalMetric) {
13065
+ super(evalMetric);
13066
+ this.evalMetric = evalMetric;
13067
+ }
13068
+ static getMetricInfo() {
13069
+ return {
13070
+ metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
13071
+ description: "This metric evaluates if the agent's final response matches a golden/expected final response using Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
13072
+ metricValueInfo: {
13073
+ interval: {
13074
+ minValue: 0,
13075
+ maxValue: 1,
13076
+ openAtMin: false,
13077
+ openAtMax: false
13078
+ }
13079
+ }
13080
+ };
13081
+ }
13082
+ async evaluateInvocations(actualInvocations, expectedInvocations) {
13083
+ let totalScore = 0;
13084
+ let numInvocations = 0;
13085
+ const perInvocationResults = [];
13086
+ for (let i = 0; i < actualInvocations.length; i++) {
13087
+ const actual = actualInvocations[i];
13088
+ const expected = expectedInvocations[i];
13089
+ const reference = getTextFromContent2(expected.finalResponse);
13090
+ const response = getTextFromContent2(actual.finalResponse);
13091
+ const rouge1Scores = await calculateRouge1Scores(response, reference);
13092
+ const score = rouge1Scores.fmeasure;
13093
+ perInvocationResults.push({
13094
+ actualInvocation: actual,
13095
+ expectedInvocation: expected,
13096
+ score,
13097
+ evalStatus: getEvalStatus2(score, this.evalMetric.threshold)
13098
+ });
13099
+ totalScore += score;
13100
+ numInvocations++;
13101
+ }
13102
+ if (perInvocationResults.length > 0) {
13103
+ const overallScore = totalScore / numInvocations;
13104
+ return {
13105
+ overallScore,
13106
+ overallEvalStatus: getEvalStatus2(
13107
+ overallScore,
13108
+ this.evalMetric.threshold
13109
+ ),
13110
+ perInvocationResults
13111
+ };
13112
+ }
13113
+ return {
13114
+ overallEvalStatus: 3 /* NOT_EVALUATED */,
13115
+ perInvocationResults: []
13116
+ };
13117
+ }
13118
+ };
13119
+ function getTextFromContent2(content) {
13120
+ if (content?.parts) {
13121
+ return content.parts.map((part) => part.text).filter(Boolean).join("\n");
13122
+ }
13123
+ return "";
13124
+ }
13125
+ function getEvalStatus2(score, threshold) {
13126
+ return score >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
13127
+ }
13128
+ function calculateRouge1Scores(response, reference) {
13129
+ if (!response.trim() || !reference.trim()) {
13130
+ return { precision: 0, recall: 0, fmeasure: 0 };
13131
+ }
13132
+ const responseTokens = tokenizeText(response);
13133
+ const referenceTokens = tokenizeText(reference);
13134
+ const responseUnigrams = new Set(responseTokens);
13135
+ const referenceUnigrams = new Set(referenceTokens);
13136
+ const commonUnigrams = new Set(
13137
+ [...responseUnigrams].filter((token) => referenceUnigrams.has(token))
13138
+ );
13139
+ const precision = responseUnigrams.size > 0 ? commonUnigrams.size / responseUnigrams.size : 0;
13140
+ const recall = referenceUnigrams.size > 0 ? commonUnigrams.size / referenceUnigrams.size : 0;
13141
+ const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
13142
+ return { precision, recall, fmeasure };
13143
+ }
13144
+ function tokenizeText(text) {
13145
+ return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
13146
+ }
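tokenizeText lowercases its input and turns every non-word character into a separator, so punctuation and apostrophes split tokens:

tokenizeText("Hello, World! It's 42.");
// => ["hello", "world", "it", "s", "42"]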
13147
+
11755
13148
  // src/version.ts
11756
13149
  var VERSION = "0.1.0";
11757
13150
  export {
11758
13151
  AF_FUNCTION_CALL_ID_PREFIX,
11759
13152
  LlmAgent as Agent,
11760
13153
  AgentBuilder,
13154
+ AgentEvaluator,
11761
13155
  AgentTool,
11762
13156
  agents_exports as Agents,
11763
13157
  AiSdkLlm,
@@ -11791,11 +13185,16 @@ export {
11791
13185
  CodeExecutorContext,
11792
13186
  DatabaseSessionService,
11793
13187
  EnhancedAuthConfig,
13188
+ EvalResult,
13189
+ EvalStatus,
13190
+ evaluation_exports as Evaluation,
13191
+ Evaluator,
11794
13192
  Event,
11795
13193
  EventActions,
11796
13194
  events_exports as Events,
11797
13195
  ExitLoopTool,
11798
13196
  FileOperationsTool,
13197
+ FinalResponseMatchV2Evaluator,
11799
13198
  flows_exports as Flows,
11800
13199
  FunctionTool,
11801
13200
  GcsArtifactService,
@@ -11817,6 +13216,7 @@ export {
11817
13216
  LlmResponse,
11818
13217
  LoadArtifactsTool,
11819
13218
  LoadMemoryTool,
13219
+ LocalEvalService,
11820
13220
  LoopAgent,
11821
13221
  McpAbi,
11822
13222
  McpAtp,
@@ -11844,10 +13244,13 @@ export {
11844
13244
  OpenIdConnectScheme,
11845
13245
  ParallelAgent,
11846
13246
  PlanReActPlanner,
13247
+ PrebuiltMetrics,
11847
13248
  REQUEST_EUC_FUNCTION_CALL_NAME,
11848
13249
  ReadonlyContext,
13250
+ RougeEvaluator,
11849
13251
  RunConfig,
11850
13252
  Runner,
13253
+ SafetyEvaluatorV1,
11851
13254
  SequentialAgent,
11852
13255
  sessions_exports as Sessions,
11853
13256
  SingleFlow,
@@ -11856,6 +13259,7 @@ export {
11856
13259
  TelemetryService,
11857
13260
  ToolContext,
11858
13261
  tools_exports as Tools,
13262
+ TrajectoryEvaluator,
11859
13263
  TransferToAgentTool,
11860
13264
  UserInteractionTool,
11861
13265
  VERSION,