@iqai/adk 0.1.21 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +52 -0
- package/dist/index.d.mts +539 -348
- package/dist/index.d.ts +539 -348
- package/dist/index.js +1780 -376
- package/dist/index.mjs +1611 -207
- package/package.json +11 -1
package/dist/index.mjs
CHANGED
|
@@ -53,7 +53,7 @@ var init_logger = __esm({
|
|
|
53
53
|
}
|
|
54
54
|
info(message, ...args) {
|
|
55
55
|
const time = (/* @__PURE__ */ new Date()).toLocaleTimeString();
|
|
56
|
-
console.
|
|
56
|
+
console.debug(
|
|
57
57
|
this.colorize(`[${time}] \u2139\uFE0F [${this.name}] ${message}`),
|
|
58
58
|
...args
|
|
59
59
|
);
|
|
@@ -229,7 +229,7 @@ var init_base_tool = __esm({
|
|
|
229
229
|
* @param context The context of the tool
|
|
230
230
|
* @returns The result of running the tool
|
|
231
231
|
*/
|
|
232
|
-
async runAsync(args,
|
|
232
|
+
async runAsync(args, context4) {
|
|
233
233
|
throw new Error(`${this.constructor.name} runAsync is not implemented`);
|
|
234
234
|
}
|
|
235
235
|
/**
|
|
@@ -253,6 +253,12 @@ var init_base_tool = __esm({
|
|
|
253
253
|
if (!toolWithFunctionDeclarations.functionDeclarations) {
|
|
254
254
|
toolWithFunctionDeclarations.functionDeclarations = [];
|
|
255
255
|
}
|
|
256
|
+
const alreadyExists = toolWithFunctionDeclarations.functionDeclarations.some(
|
|
257
|
+
(fd) => fd?.name === functionDeclaration.name
|
|
258
|
+
);
|
|
259
|
+
if (alreadyExists) {
|
|
260
|
+
return;
|
|
261
|
+
}
|
|
256
262
|
toolWithFunctionDeclarations.functionDeclarations.push(
|
|
257
263
|
functionDeclaration
|
|
258
264
|
);
|
|
@@ -281,7 +287,7 @@ var init_base_tool = __esm({
|
|
|
281
287
|
* @param context Tool execution context
|
|
282
288
|
* @returns Result of the tool execution or error information
|
|
283
289
|
*/
|
|
284
|
-
async safeExecute(args,
|
|
290
|
+
async safeExecute(args, context4) {
|
|
285
291
|
if (!this.validateArguments(args)) {
|
|
286
292
|
return {
|
|
287
293
|
error: "Invalid arguments",
|
|
@@ -302,7 +308,7 @@ var init_base_tool = __esm({
|
|
|
302
308
|
);
|
|
303
309
|
await new Promise((resolve) => setTimeout(resolve, delay));
|
|
304
310
|
}
|
|
305
|
-
const result = await this.runAsync(args,
|
|
311
|
+
const result = await this.runAsync(args, context4);
|
|
306
312
|
return { result };
|
|
307
313
|
} catch (error) {
|
|
308
314
|
lastError = error instanceof Error ? error : new Error(String(error));
|
|
@@ -500,7 +506,7 @@ var init_function_tool = __esm({
|
|
|
500
506
|
/**
|
|
501
507
|
* Executes the wrapped function with the provided arguments.
|
|
502
508
|
*/
|
|
503
|
-
async runAsync(args,
|
|
509
|
+
async runAsync(args, context4) {
|
|
504
510
|
try {
|
|
505
511
|
const missingArgs = this.getMissingMandatoryArgs(args);
|
|
506
512
|
if (missingArgs.length > 0) {
|
|
@@ -513,13 +519,13 @@ You could retry calling this tool, but it is IMPORTANT for you to provide all th
|
|
|
513
519
|
}
|
|
514
520
|
const argsToCall = { ...args };
|
|
515
521
|
if (this.functionAcceptsToolContext()) {
|
|
516
|
-
argsToCall.toolContext =
|
|
522
|
+
argsToCall.toolContext = context4;
|
|
517
523
|
}
|
|
518
524
|
const funcParams = this.getFunctionParameters();
|
|
519
525
|
const argValues = [];
|
|
520
526
|
for (const paramName of funcParams) {
|
|
521
527
|
if (paramName === "toolContext" && this.functionAcceptsToolContext()) {
|
|
522
|
-
argValues.push(
|
|
528
|
+
argValues.push(context4);
|
|
523
529
|
} else if (paramName in argsToCall) {
|
|
524
530
|
const convertedValue = this.convertArgumentType(
|
|
525
531
|
argsToCall[paramName],
|
|
@@ -827,70 +833,23 @@ ${instructions.join("\n\n")}`;
|
|
|
827
833
|
|
|
828
834
|
// src/models/llm-response.ts
|
|
829
835
|
var LlmResponse = class _LlmResponse {
|
|
830
|
-
/**
|
|
831
|
-
* Unique identifier for the response.
|
|
832
|
-
*/
|
|
833
836
|
id;
|
|
834
|
-
|
|
835
|
-
* The content generated by the model.
|
|
836
|
-
*/
|
|
837
|
+
text;
|
|
837
838
|
content;
|
|
838
|
-
/**
|
|
839
|
-
* The grounding metadata of the response.
|
|
840
|
-
*/
|
|
841
839
|
groundingMetadata;
|
|
842
|
-
/**
|
|
843
|
-
* Indicates whether the text content is part of an unfinished text stream.
|
|
844
|
-
*/
|
|
845
840
|
partial;
|
|
846
|
-
/**
|
|
847
|
-
* Indicates whether the response from the model is complete.
|
|
848
|
-
*/
|
|
849
841
|
turnComplete;
|
|
850
|
-
/**
|
|
851
|
-
* Error code if the response is an error.
|
|
852
|
-
*/
|
|
853
842
|
errorCode;
|
|
854
|
-
/**
|
|
855
|
-
* Error message if the response is an error.
|
|
856
|
-
*/
|
|
857
843
|
errorMessage;
|
|
858
|
-
/**
|
|
859
|
-
* Flag indicating that LLM was interrupted when generating the content.
|
|
860
|
-
*/
|
|
861
844
|
interrupted;
|
|
862
|
-
/**
|
|
863
|
-
* The custom metadata of the LlmResponse.
|
|
864
|
-
*/
|
|
865
845
|
customMetadata;
|
|
866
|
-
/**
|
|
867
|
-
* The usage metadata of the LlmResponse.
|
|
868
|
-
*/
|
|
869
846
|
usageMetadata;
|
|
870
|
-
/**
|
|
871
|
-
* Index of the candidate response.
|
|
872
|
-
*/
|
|
873
847
|
candidateIndex;
|
|
874
|
-
/**
|
|
875
|
-
* Reason why the model finished generating.
|
|
876
|
-
*/
|
|
877
848
|
finishReason;
|
|
878
|
-
/**
|
|
879
|
-
* Error object if the response is an error.
|
|
880
|
-
*/
|
|
881
849
|
error;
|
|
882
|
-
/**
|
|
883
|
-
* Creates a new LlmResponse.
|
|
884
|
-
*/
|
|
885
850
|
constructor(data = {}) {
|
|
886
851
|
Object.assign(this, data);
|
|
887
852
|
}
|
|
888
|
-
/**
|
|
889
|
-
* Creates an LlmResponse from a GenerateContentResponse.
|
|
890
|
-
*
|
|
891
|
-
* @param generateContentResponse The GenerateContentResponse to create the LlmResponse from.
|
|
892
|
-
* @returns The LlmResponse.
|
|
893
|
-
*/
|
|
894
853
|
static create(generateContentResponse) {
|
|
895
854
|
const usageMetadata = generateContentResponse.usageMetadata;
|
|
896
855
|
if (generateContentResponse.candidates && generateContentResponse.candidates.length > 0) {
|
|
@@ -922,15 +881,6 @@ var LlmResponse = class _LlmResponse {
|
|
|
922
881
|
usageMetadata
|
|
923
882
|
});
|
|
924
883
|
}
|
|
925
|
-
/**
|
|
926
|
-
* Creates an LlmResponse from an error.
|
|
927
|
-
*
|
|
928
|
-
* @param error The error object or message.
|
|
929
|
-
* @param options Additional options for the error response.
|
|
930
|
-
* @param options.errorCode A specific error code for the response.
|
|
931
|
-
* @param options.model The model that was being used when the error occurred.
|
|
932
|
-
* @returns The LlmResponse.
|
|
933
|
-
*/
|
|
934
884
|
static fromError(error, options = {}) {
|
|
935
885
|
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
936
886
|
const errorCode = options.errorCode || "UNKNOWN_ERROR";
|
|
@@ -954,6 +904,7 @@ init_logger();
|
|
|
954
904
|
import {
|
|
955
905
|
DiagConsoleLogger,
|
|
956
906
|
DiagLogLevel,
|
|
907
|
+
context,
|
|
957
908
|
diag,
|
|
958
909
|
trace
|
|
959
910
|
} from "@opentelemetry/api";
|
|
@@ -994,13 +945,24 @@ var TelemetryService = class {
|
|
|
994
945
|
this.sdk = new NodeSDK({
|
|
995
946
|
resource,
|
|
996
947
|
traceExporter,
|
|
997
|
-
instrumentations: [
|
|
948
|
+
instrumentations: [
|
|
949
|
+
getNodeAutoInstrumentations({
|
|
950
|
+
// Follow Python ADK approach: let all HTTP instrumentation through.
|
|
951
|
+
// This provides transparency and aligns with standard OpenTelemetry behavior.
|
|
952
|
+
// High-level LLM tracing is provided through dedicated ADK spans.
|
|
953
|
+
"@opentelemetry/instrumentation-http": {
|
|
954
|
+
ignoreIncomingRequestHook: (req) => {
|
|
955
|
+
return true;
|
|
956
|
+
}
|
|
957
|
+
}
|
|
958
|
+
})
|
|
959
|
+
]
|
|
998
960
|
});
|
|
999
961
|
try {
|
|
1000
962
|
this.sdk.start();
|
|
1001
963
|
this.isInitialized = true;
|
|
1002
964
|
this.tracer = trace.getTracer("iqai-adk", config.appVersion || "0.1.0");
|
|
1003
|
-
diag.
|
|
965
|
+
diag.debug("OpenTelemetry SDK started successfully.");
|
|
1004
966
|
} catch (error) {
|
|
1005
967
|
diag.error("Error starting OpenTelemetry SDK:", error);
|
|
1006
968
|
throw error;
|
|
@@ -1043,7 +1005,7 @@ var TelemetryService = class {
|
|
|
1043
1005
|
});
|
|
1044
1006
|
await Promise.race([this.sdk.shutdown(), timeoutPromise]);
|
|
1045
1007
|
this.isInitialized = false;
|
|
1046
|
-
diag.
|
|
1008
|
+
diag.debug("Telemetry terminated successfully.");
|
|
1047
1009
|
} catch (error) {
|
|
1048
1010
|
if (error instanceof Error && error.message.includes("timeout")) {
|
|
1049
1011
|
diag.warn("Telemetry shutdown timed out, some traces may be lost");
|
|
@@ -1071,7 +1033,7 @@ var TelemetryService = class {
|
|
|
1071
1033
|
}
|
|
1072
1034
|
}
|
|
1073
1035
|
span.setAttributes({
|
|
1074
|
-
"gen_ai.system
|
|
1036
|
+
"gen_ai.system": "iqai-adk",
|
|
1075
1037
|
"gen_ai.operation.name": "execute_tool",
|
|
1076
1038
|
"gen_ai.tool.name": tool.name,
|
|
1077
1039
|
"gen_ai.tool.description": tool.description,
|
|
@@ -1085,7 +1047,7 @@ var TelemetryService = class {
|
|
|
1085
1047
|
...process.env.NODE_ENV && {
|
|
1086
1048
|
"deployment.environment.name": process.env.NODE_ENV
|
|
1087
1049
|
},
|
|
1088
|
-
//
|
|
1050
|
+
// ADK-specific attributes (matching Python namespace pattern)
|
|
1089
1051
|
"adk.tool_call_args": this._safeJsonStringify(args),
|
|
1090
1052
|
"adk.event_id": functionResponseEvent.invocationId,
|
|
1091
1053
|
"adk.tool_response": this._safeJsonStringify(toolResponse),
|
|
@@ -1101,9 +1063,8 @@ var TelemetryService = class {
|
|
|
1101
1063
|
if (!span) return;
|
|
1102
1064
|
const requestData = this._buildLlmRequestForTrace(llmRequest);
|
|
1103
1065
|
span.setAttributes({
|
|
1104
|
-
// Standard OpenTelemetry attributes
|
|
1105
|
-
"gen_ai.system
|
|
1106
|
-
"gen_ai.operation.name": "generate",
|
|
1066
|
+
// Standard OpenTelemetry attributes (following Python pattern)
|
|
1067
|
+
"gen_ai.system": "iqai-adk",
|
|
1107
1068
|
"gen_ai.request.model": llmRequest.model,
|
|
1108
1069
|
// Session and user tracking (maps to Langfuse sessionId, userId)
|
|
1109
1070
|
"session.id": invocationContext.session.id,
|
|
@@ -1116,15 +1077,21 @@ var TelemetryService = class {
|
|
|
1116
1077
|
"gen_ai.request.max_tokens": llmRequest.config.maxOutputTokens || 0,
|
|
1117
1078
|
"gen_ai.request.temperature": llmRequest.config.temperature || 0,
|
|
1118
1079
|
"gen_ai.request.top_p": llmRequest.config.topP || 0,
|
|
1119
|
-
// Legacy ADK attributes (keep for backward compatibility)
|
|
1120
1080
|
"adk.system_name": "iqai-adk",
|
|
1121
1081
|
"adk.request_model": llmRequest.model,
|
|
1122
|
-
|
|
1082
|
+
// ADK-specific attributes (matching Python namespace pattern)
|
|
1083
|
+
"adk.invocation_id": invocationContext.invocationId,
|
|
1123
1084
|
"adk.session_id": invocationContext.session.id,
|
|
1124
1085
|
"adk.event_id": eventId,
|
|
1125
1086
|
"adk.llm_request": this._safeJsonStringify(requestData),
|
|
1126
1087
|
"adk.llm_response": this._safeJsonStringify(llmResponse)
|
|
1127
1088
|
});
|
|
1089
|
+
if (llmResponse.usageMetadata) {
|
|
1090
|
+
span.setAttributes({
|
|
1091
|
+
"gen_ai.usage.input_tokens": llmResponse.usageMetadata.promptTokenCount || 0,
|
|
1092
|
+
"gen_ai.usage.output_tokens": llmResponse.usageMetadata.candidatesTokenCount || 0
|
|
1093
|
+
});
|
|
1094
|
+
}
|
|
1128
1095
|
span.addEvent("gen_ai.content.prompt", {
|
|
1129
1096
|
"gen_ai.prompt": this._safeJsonStringify(requestData.messages)
|
|
1130
1097
|
});
|
|
@@ -1137,9 +1104,14 @@ var TelemetryService = class {
|
|
|
1137
1104
|
*/
|
|
1138
1105
|
async *traceAsyncGenerator(spanName, generator) {
|
|
1139
1106
|
const span = this.tracer.startSpan(spanName);
|
|
1107
|
+
const spanContext = trace.setSpan(context.active(), span);
|
|
1140
1108
|
try {
|
|
1141
|
-
|
|
1142
|
-
|
|
1109
|
+
while (true) {
|
|
1110
|
+
const result = await context.with(spanContext, () => generator.next());
|
|
1111
|
+
if (result.done) {
|
|
1112
|
+
break;
|
|
1113
|
+
}
|
|
1114
|
+
yield result.value;
|
|
1143
1115
|
}
|
|
1144
1116
|
} catch (error) {
|
|
1145
1117
|
span.recordException(error);
|
|
@@ -1226,7 +1198,7 @@ var traceLlmCall = (invocationContext, eventId, llmRequest, llmResponse) => tele
|
|
|
1226
1198
|
// src/models/base-llm.ts
|
|
1227
1199
|
var BaseLlm = class {
|
|
1228
1200
|
/**
|
|
1229
|
-
* The name of the LLM, e.g. gemini-
|
|
1201
|
+
* The name of the LLM, e.g. gemini-2.5-flash or gemini-2.5-flash-001.
|
|
1230
1202
|
*/
|
|
1231
1203
|
model;
|
|
1232
1204
|
logger = new Logger({ name: "BaseLlm" });
|
|
@@ -1915,7 +1887,7 @@ var GoogleLlm = class extends BaseLlm {
|
|
|
1915
1887
|
/**
|
|
1916
1888
|
* Constructor for Gemini
|
|
1917
1889
|
*/
|
|
1918
|
-
constructor(model = "gemini-
|
|
1890
|
+
constructor(model = "gemini-2.5-flash") {
|
|
1919
1891
|
super(model);
|
|
1920
1892
|
}
|
|
1921
1893
|
/**
|
|
@@ -2647,30 +2619,16 @@ var OpenAiLlm = class extends BaseLlm {
|
|
|
2647
2619
|
// src/models/llm-registry.ts
|
|
2648
2620
|
init_logger();
|
|
2649
2621
|
var LLMRegistry = class _LLMRegistry {
|
|
2650
|
-
/**
|
|
2651
|
-
* Map of model name regex to LLM class
|
|
2652
|
-
*/
|
|
2653
2622
|
static llmRegistry = /* @__PURE__ */ new Map();
|
|
2623
|
+
static modelInstances = /* @__PURE__ */ new Map();
|
|
2654
2624
|
static logger = new Logger({ name: "LLMRegistry" });
|
|
2655
|
-
/**
|
|
2656
|
-
* Creates a new LLM instance
|
|
2657
|
-
*
|
|
2658
|
-
* @param model The model name
|
|
2659
|
-
* @returns The LLM instance
|
|
2660
|
-
*/
|
|
2661
2625
|
static newLLM(model) {
|
|
2662
2626
|
const llmClass = _LLMRegistry.resolve(model);
|
|
2663
2627
|
if (!llmClass) {
|
|
2664
|
-
throw new Error(`No LLM found for model: ${model}`);
|
|
2628
|
+
throw new Error(`No LLM class found for model: ${model}`);
|
|
2665
2629
|
}
|
|
2666
2630
|
return new llmClass(model);
|
|
2667
2631
|
}
|
|
2668
|
-
/**
|
|
2669
|
-
* Resolves the LLM class from the model name
|
|
2670
|
-
*
|
|
2671
|
-
* @param model The model name
|
|
2672
|
-
* @returns The LLM class
|
|
2673
|
-
*/
|
|
2674
2632
|
static resolve(model) {
|
|
2675
2633
|
for (const [regex, llmClass] of _LLMRegistry.llmRegistry.entries()) {
|
|
2676
2634
|
if (regex.test(model)) {
|
|
@@ -2679,34 +2637,54 @@ var LLMRegistry = class _LLMRegistry {
|
|
|
2679
2637
|
}
|
|
2680
2638
|
return null;
|
|
2681
2639
|
}
|
|
2682
|
-
/**
|
|
2683
|
-
* Registers a new LLM class
|
|
2684
|
-
*
|
|
2685
|
-
* @param modelNameRegex The regex to match model names
|
|
2686
|
-
* @param llmClass The LLM class
|
|
2687
|
-
*/
|
|
2688
2640
|
static register(modelNameRegex, llmClass) {
|
|
2689
2641
|
_LLMRegistry.llmRegistry.set(new RegExp(modelNameRegex), llmClass);
|
|
2690
2642
|
}
|
|
2691
|
-
/**
|
|
2692
|
-
* Registers all model patterns from an LLM class
|
|
2693
|
-
*
|
|
2694
|
-
* @param llmClass The LLM class
|
|
2695
|
-
*/
|
|
2696
2643
|
static registerLLM(llmClass) {
|
|
2697
2644
|
const modelPatterns = llmClass.supportedModels();
|
|
2698
2645
|
for (const pattern of modelPatterns) {
|
|
2699
2646
|
_LLMRegistry.register(pattern, llmClass);
|
|
2700
2647
|
}
|
|
2701
2648
|
}
|
|
2702
|
-
|
|
2703
|
-
|
|
2704
|
-
|
|
2649
|
+
static registerModel(name, model) {
|
|
2650
|
+
_LLMRegistry.modelInstances.set(name, model);
|
|
2651
|
+
}
|
|
2652
|
+
static getModel(name) {
|
|
2653
|
+
const model = _LLMRegistry.modelInstances.get(name);
|
|
2654
|
+
if (!model) {
|
|
2655
|
+
throw new Error(`Model '${name}' not found in registry`);
|
|
2656
|
+
}
|
|
2657
|
+
return model;
|
|
2658
|
+
}
|
|
2659
|
+
static hasModel(name) {
|
|
2660
|
+
return _LLMRegistry.modelInstances.has(name);
|
|
2661
|
+
}
|
|
2662
|
+
static unregisterModel(name) {
|
|
2663
|
+
_LLMRegistry.modelInstances.delete(name);
|
|
2664
|
+
}
|
|
2665
|
+
static getModelOrCreate(name) {
|
|
2666
|
+
if (_LLMRegistry.hasModel(name)) {
|
|
2667
|
+
return _LLMRegistry.getModel(name);
|
|
2668
|
+
}
|
|
2669
|
+
return _LLMRegistry.newLLM(name);
|
|
2670
|
+
}
|
|
2671
|
+
static clear() {
|
|
2672
|
+
_LLMRegistry.llmRegistry.clear();
|
|
2673
|
+
_LLMRegistry.modelInstances.clear();
|
|
2674
|
+
}
|
|
2675
|
+
static clearModels() {
|
|
2676
|
+
_LLMRegistry.modelInstances.clear();
|
|
2677
|
+
}
|
|
2678
|
+
static clearClasses() {
|
|
2679
|
+
_LLMRegistry.llmRegistry.clear();
|
|
2680
|
+
}
|
|
2705
2681
|
static logRegisteredModels() {
|
|
2706
|
-
_LLMRegistry.
|
|
2707
|
-
|
|
2708
|
-
[..._LLMRegistry.llmRegistry.entries()].map(([regex]) => regex.toString())
|
|
2682
|
+
const classPatterns = [..._LLMRegistry.llmRegistry.entries()].map(
|
|
2683
|
+
([regex]) => regex.toString()
|
|
2709
2684
|
);
|
|
2685
|
+
const instanceNames = [..._LLMRegistry.modelInstances.keys()];
|
|
2686
|
+
_LLMRegistry.logger.debug("Registered LLM class patterns:", classPatterns);
|
|
2687
|
+
_LLMRegistry.logger.debug("Registered LLM instances:", instanceNames);
|
|
2710
2688
|
}
|
|
2711
2689
|
};
|
|
2712
2690
|
|
|
@@ -3954,10 +3932,10 @@ var CreatedTool = class extends BaseTool {
|
|
|
3954
3932
|
/**
|
|
3955
3933
|
* Executes the tool function with validation
|
|
3956
3934
|
*/
|
|
3957
|
-
async runAsync(args,
|
|
3935
|
+
async runAsync(args, context4) {
|
|
3958
3936
|
try {
|
|
3959
3937
|
const validatedArgs = this.schema.parse(args);
|
|
3960
|
-
const result = await Promise.resolve(this.func(validatedArgs,
|
|
3938
|
+
const result = await Promise.resolve(this.func(validatedArgs, context4));
|
|
3961
3939
|
return result ?? {};
|
|
3962
3940
|
} catch (error) {
|
|
3963
3941
|
if (error instanceof z.ZodError) {
|
|
@@ -4215,7 +4193,7 @@ var AgentTool = class extends BaseTool {
|
|
|
4215
4193
|
/**
|
|
4216
4194
|
* Execute the tool by running the agent with the provided input
|
|
4217
4195
|
*/
|
|
4218
|
-
async runAsync(params,
|
|
4196
|
+
async runAsync(params, context4) {
|
|
4219
4197
|
try {
|
|
4220
4198
|
const input = params.input || Object.values(params)[0];
|
|
4221
4199
|
if (!isLlmAgent(this.agent)) {
|
|
@@ -4223,7 +4201,7 @@ var AgentTool = class extends BaseTool {
|
|
|
4223
4201
|
`Agent ${this.name} does not support running as a tool`
|
|
4224
4202
|
);
|
|
4225
4203
|
}
|
|
4226
|
-
const parentInvocation =
|
|
4204
|
+
const parentInvocation = context4._invocationContext;
|
|
4227
4205
|
const childInvocationContext = new InvocationContext({
|
|
4228
4206
|
invocationId: uuidv42(),
|
|
4229
4207
|
agent: this.agent,
|
|
@@ -4260,8 +4238,8 @@ var AgentTool = class extends BaseTool {
|
|
|
4260
4238
|
} catch {
|
|
4261
4239
|
toolResult = mergedText;
|
|
4262
4240
|
}
|
|
4263
|
-
if (this.outputKey &&
|
|
4264
|
-
|
|
4241
|
+
if (this.outputKey && context4?.state) {
|
|
4242
|
+
context4.state[this.outputKey] = toolResult;
|
|
4265
4243
|
}
|
|
4266
4244
|
return toolResult;
|
|
4267
4245
|
} catch (error) {
|
|
@@ -4809,9 +4787,9 @@ var UserInteractionTool = class extends BaseTool {
|
|
|
4809
4787
|
/**
|
|
4810
4788
|
* Execute the user interaction
|
|
4811
4789
|
*/
|
|
4812
|
-
async runAsync(args,
|
|
4790
|
+
async runAsync(args, context4) {
|
|
4813
4791
|
try {
|
|
4814
|
-
const actions =
|
|
4792
|
+
const actions = context4.actions;
|
|
4815
4793
|
if (!actions || !actions.promptUser) {
|
|
4816
4794
|
return {
|
|
4817
4795
|
success: false,
|
|
@@ -4859,9 +4837,9 @@ var ExitLoopTool = class extends BaseTool {
|
|
|
4859
4837
|
/**
|
|
4860
4838
|
* Execute the exit loop action
|
|
4861
4839
|
*/
|
|
4862
|
-
async runAsync(_args,
|
|
4840
|
+
async runAsync(_args, context4) {
|
|
4863
4841
|
this.logger.debug("Executing exit loop tool");
|
|
4864
|
-
|
|
4842
|
+
context4.actions.escalate = true;
|
|
4865
4843
|
}
|
|
4866
4844
|
};
|
|
4867
4845
|
|
|
@@ -4912,14 +4890,14 @@ var GetUserChoiceTool = class extends BaseTool {
|
|
|
4912
4890
|
* This is a long running operation that will return null initially
|
|
4913
4891
|
* and the actual choice will be provided asynchronously
|
|
4914
4892
|
*/
|
|
4915
|
-
async runAsync(args,
|
|
4893
|
+
async runAsync(args, context4) {
|
|
4916
4894
|
this.logger.debug(
|
|
4917
4895
|
`Executing get_user_choice with options: ${args.options.join(", ")}`
|
|
4918
4896
|
);
|
|
4919
4897
|
if (args.question) {
|
|
4920
4898
|
this.logger.debug(`Question: ${args.question}`);
|
|
4921
4899
|
}
|
|
4922
|
-
|
|
4900
|
+
context4.actions.skipSummarization = true;
|
|
4923
4901
|
return null;
|
|
4924
4902
|
}
|
|
4925
4903
|
};
|
|
@@ -4961,9 +4939,9 @@ var TransferToAgentTool = class extends BaseTool {
|
|
|
4961
4939
|
/**
|
|
4962
4940
|
* Execute the transfer to agent action
|
|
4963
4941
|
*/
|
|
4964
|
-
async runAsync(args,
|
|
4942
|
+
async runAsync(args, context4) {
|
|
4965
4943
|
this.logger.debug(`Executing transfer to agent: ${args.agent_name}`);
|
|
4966
|
-
|
|
4944
|
+
context4.actions.transferToAgent = args.agent_name;
|
|
4967
4945
|
}
|
|
4968
4946
|
};
|
|
4969
4947
|
|
|
@@ -5004,10 +4982,10 @@ var LoadMemoryTool = class extends BaseTool {
|
|
|
5004
4982
|
/**
|
|
5005
4983
|
* Execute the memory loading action
|
|
5006
4984
|
*/
|
|
5007
|
-
async runAsync(args,
|
|
4985
|
+
async runAsync(args, context4) {
|
|
5008
4986
|
this.logger.debug(`Executing load_memory with query: ${args.query}`);
|
|
5009
4987
|
try {
|
|
5010
|
-
const searchResult = await
|
|
4988
|
+
const searchResult = await context4.searchMemory(args.query);
|
|
5011
4989
|
return {
|
|
5012
4990
|
memories: searchResult.memories || [],
|
|
5013
4991
|
count: searchResult.memories?.length || 0
|
|
@@ -5057,7 +5035,7 @@ var LoadArtifactsTool = class extends BaseTool {
|
|
|
5057
5035
|
/**
|
|
5058
5036
|
* Execute the load artifacts operation
|
|
5059
5037
|
*/
|
|
5060
|
-
async runAsync(args,
|
|
5038
|
+
async runAsync(args, context4) {
|
|
5061
5039
|
const artifactNames = args.artifact_names || [];
|
|
5062
5040
|
return { artifact_names: artifactNames };
|
|
5063
5041
|
}
|
|
@@ -6088,12 +6066,12 @@ var McpToolset = class {
|
|
|
6088
6066
|
* Checks if a tool should be included based on the tool filter.
|
|
6089
6067
|
* Similar to Python's _is_selected method.
|
|
6090
6068
|
*/
|
|
6091
|
-
isSelected(tool,
|
|
6069
|
+
isSelected(tool, context4) {
|
|
6092
6070
|
if (!this.toolFilter) {
|
|
6093
6071
|
return true;
|
|
6094
6072
|
}
|
|
6095
6073
|
if (typeof this.toolFilter === "function") {
|
|
6096
|
-
return this.toolFilter(tool,
|
|
6074
|
+
return this.toolFilter(tool, context4);
|
|
6097
6075
|
}
|
|
6098
6076
|
if (Array.isArray(this.toolFilter)) {
|
|
6099
6077
|
return this.toolFilter.includes(tool.name);
|
|
@@ -6146,7 +6124,7 @@ var McpToolset = class {
|
|
|
6146
6124
|
* Retrieves tools from the MCP server and converts them to BaseTool instances.
|
|
6147
6125
|
* Similar to Python's get_tools method.
|
|
6148
6126
|
*/
|
|
6149
|
-
async getTools(
|
|
6127
|
+
async getTools(context4) {
|
|
6150
6128
|
try {
|
|
6151
6129
|
if (this.isClosing) {
|
|
6152
6130
|
throw new McpError(
|
|
@@ -6168,7 +6146,7 @@ var McpToolset = class {
|
|
|
6168
6146
|
}
|
|
6169
6147
|
const tools = [];
|
|
6170
6148
|
for (const mcpTool of toolsResponse.tools) {
|
|
6171
|
-
if (this.isSelected(mcpTool,
|
|
6149
|
+
if (this.isSelected(mcpTool, context4)) {
|
|
6172
6150
|
try {
|
|
6173
6151
|
const tool = await createTool2(mcpTool, client);
|
|
6174
6152
|
tools.push(tool);
|
|
@@ -6205,9 +6183,9 @@ var McpToolset = class {
|
|
|
6205
6183
|
/**
|
|
6206
6184
|
* Refreshes the tool cache by clearing it and fetching tools again
|
|
6207
6185
|
*/
|
|
6208
|
-
async refreshTools(
|
|
6186
|
+
async refreshTools(context4) {
|
|
6209
6187
|
this.tools = [];
|
|
6210
|
-
return this.getTools(
|
|
6188
|
+
return this.getTools(context4);
|
|
6211
6189
|
}
|
|
6212
6190
|
/**
|
|
6213
6191
|
* Closes the connection to the MCP server.
|
|
@@ -6251,6 +6229,7 @@ async function getMcpTools(config, toolFilter) {
|
|
|
6251
6229
|
}
|
|
6252
6230
|
|
|
6253
6231
|
// src/flows/llm-flows/functions.ts
|
|
6232
|
+
import { context as context2, trace as trace2 } from "@opentelemetry/api";
|
|
6254
6233
|
var AF_FUNCTION_CALL_ID_PREFIX = "adk-";
|
|
6255
6234
|
var REQUEST_EUC_FUNCTION_CALL_NAME = "adk_request_credential";
|
|
6256
6235
|
function generateClientFunctionCallId() {
|
|
@@ -6340,23 +6319,40 @@ async function handleFunctionCallsAsync(invocationContext, functionCallEvent, to
|
|
|
6340
6319
|
toolsDict
|
|
6341
6320
|
);
|
|
6342
6321
|
const functionArgs = functionCall.args || {};
|
|
6343
|
-
const
|
|
6344
|
-
|
|
6345
|
-
|
|
6346
|
-
|
|
6347
|
-
|
|
6348
|
-
|
|
6322
|
+
const tracer2 = telemetryService.getTracer();
|
|
6323
|
+
const span = tracer2.startSpan(`execute_tool ${tool.name}`);
|
|
6324
|
+
const spanContext = trace2.setSpan(context2.active(), span);
|
|
6325
|
+
try {
|
|
6326
|
+
const functionResponse = await context2.with(spanContext, async () => {
|
|
6327
|
+
const result = await callToolAsync(tool, functionArgs, toolContext);
|
|
6328
|
+
if (tool.isLongRunning && !result) {
|
|
6329
|
+
return null;
|
|
6330
|
+
}
|
|
6331
|
+
const functionResponseEvent = buildResponseEvent(
|
|
6332
|
+
tool,
|
|
6333
|
+
result,
|
|
6334
|
+
toolContext,
|
|
6335
|
+
invocationContext
|
|
6336
|
+
);
|
|
6337
|
+
telemetryService.traceToolCall(
|
|
6338
|
+
tool,
|
|
6339
|
+
functionArgs,
|
|
6340
|
+
functionResponseEvent
|
|
6341
|
+
);
|
|
6342
|
+
return { result, event: functionResponseEvent };
|
|
6343
|
+
});
|
|
6349
6344
|
if (!functionResponse) {
|
|
6350
6345
|
continue;
|
|
6351
6346
|
}
|
|
6347
|
+
functionResponseEvents.push(functionResponse.event);
|
|
6348
|
+
span.setStatus({ code: 1 });
|
|
6349
|
+
} catch (error) {
|
|
6350
|
+
span.recordException(error);
|
|
6351
|
+
span.setStatus({ code: 2, message: error.message });
|
|
6352
|
+
throw error;
|
|
6353
|
+
} finally {
|
|
6354
|
+
span.end();
|
|
6352
6355
|
}
|
|
6353
|
-
const functionResponseEvent = buildResponseEvent(
|
|
6354
|
-
tool,
|
|
6355
|
-
functionResponse,
|
|
6356
|
-
toolContext,
|
|
6357
|
-
invocationContext
|
|
6358
|
-
);
|
|
6359
|
-
functionResponseEvents.push(functionResponseEvent);
|
|
6360
6356
|
}
|
|
6361
6357
|
if (!functionResponseEvents.length) {
|
|
6362
6358
|
return null;
|
|
@@ -6456,7 +6452,7 @@ var BaseLlmFlow = class {
|
|
|
6456
6452
|
responseProcessors = [];
|
|
6457
6453
|
logger = new Logger({ name: "BaseLlmFlow" });
|
|
6458
6454
|
async *runAsync(invocationContext) {
|
|
6459
|
-
this.logger.
|
|
6455
|
+
this.logger.debug(`Agent '${invocationContext.agent.name}' started.`);
|
|
6460
6456
|
let stepCount = 0;
|
|
6461
6457
|
while (true) {
|
|
6462
6458
|
stepCount++;
|
|
@@ -6466,7 +6462,7 @@ var BaseLlmFlow = class {
|
|
|
6466
6462
|
yield event;
|
|
6467
6463
|
}
|
|
6468
6464
|
if (!lastEvent || lastEvent.isFinalResponse()) {
|
|
6469
|
-
this.logger.
|
|
6465
|
+
this.logger.debug(
|
|
6470
6466
|
`Agent '${invocationContext.agent.name}' finished after ${stepCount} steps.`
|
|
6471
6467
|
);
|
|
6472
6468
|
break;
|
|
@@ -6496,7 +6492,7 @@ var BaseLlmFlow = class {
|
|
|
6496
6492
|
yield event;
|
|
6497
6493
|
}
|
|
6498
6494
|
if (invocationContext.endInvocation) {
|
|
6499
|
-
this.logger.
|
|
6495
|
+
this.logger.debug("Invocation ended during preprocessing.");
|
|
6500
6496
|
return;
|
|
6501
6497
|
}
|
|
6502
6498
|
const modelResponseEvent = new Event({
|
|
@@ -6536,9 +6532,23 @@ var BaseLlmFlow = class {
|
|
|
6536
6532
|
yield event;
|
|
6537
6533
|
}
|
|
6538
6534
|
}
|
|
6539
|
-
|
|
6535
|
+
let tools = await agent.canonicalTools(
|
|
6540
6536
|
new ReadonlyContext(invocationContext)
|
|
6541
6537
|
);
|
|
6538
|
+
if (tools.length > 1) {
|
|
6539
|
+
const seen = /* @__PURE__ */ new Set();
|
|
6540
|
+
const filtered = [];
|
|
6541
|
+
for (const t of tools) {
|
|
6542
|
+
const name = t?.name;
|
|
6543
|
+
if (!name) continue;
|
|
6544
|
+
if (seen.has(name)) {
|
|
6545
|
+
continue;
|
|
6546
|
+
}
|
|
6547
|
+
seen.add(name);
|
|
6548
|
+
filtered.push(t);
|
|
6549
|
+
}
|
|
6550
|
+
tools = filtered;
|
|
6551
|
+
}
|
|
6542
6552
|
for (const tool of tools) {
|
|
6543
6553
|
const toolContext = new ToolContext(invocationContext);
|
|
6544
6554
|
await tool.processLlmRequest(toolContext, llmRequest);
|
|
@@ -6611,7 +6621,7 @@ var BaseLlmFlow = class {
|
|
|
6611
6621
|
yield functionResponseEvent;
|
|
6612
6622
|
const transferToAgent = functionResponseEvent.actions?.transferToAgent;
|
|
6613
6623
|
if (transferToAgent) {
|
|
6614
|
-
this.logger.
|
|
6624
|
+
this.logger.debug(`\u{1F504} Live transfer to agent '${transferToAgent}'`);
|
|
6615
6625
|
const agentToRun = this._getAgentToRun(
|
|
6616
6626
|
invocationContext,
|
|
6617
6627
|
transferToAgent
|
|
@@ -6650,7 +6660,7 @@ var BaseLlmFlow = class {
|
|
|
6650
6660
|
yield functionResponseEvent;
|
|
6651
6661
|
const transferToAgent = functionResponseEvent.actions?.transferToAgent;
|
|
6652
6662
|
if (transferToAgent) {
|
|
6653
|
-
this.logger.
|
|
6663
|
+
this.logger.debug(`\u{1F504} Transferring to agent '${transferToAgent}'`);
|
|
6654
6664
|
const agentToRun = this._getAgentToRun(
|
|
6655
6665
|
invocationContext,
|
|
6656
6666
|
transferToAgent
|
|
@@ -6694,7 +6704,42 @@ var BaseLlmFlow = class {
|
|
|
6694
6704
|
}
|
|
6695
6705
|
invocationContext.incrementLlmCallCount();
|
|
6696
6706
|
const isStreaming = invocationContext.runConfig.streamingMode === "sse" /* SSE */;
|
|
6697
|
-
|
|
6707
|
+
let tools = llmRequest.config?.tools || [];
|
|
6708
|
+
if (tools.length) {
|
|
6709
|
+
const deduped = [];
|
|
6710
|
+
const seenFn = /* @__PURE__ */ new Set();
|
|
6711
|
+
for (const t of tools) {
|
|
6712
|
+
const tool = t;
|
|
6713
|
+
if (tool && Array.isArray(tool.functionDeclarations)) {
|
|
6714
|
+
const newFds = tool.functionDeclarations.filter(
|
|
6715
|
+
(fd) => {
|
|
6716
|
+
if (fd?.name) {
|
|
6717
|
+
if (seenFn.has(fd.name)) {
|
|
6718
|
+
return false;
|
|
6719
|
+
}
|
|
6720
|
+
seenFn.add(fd.name);
|
|
6721
|
+
}
|
|
6722
|
+
return true;
|
|
6723
|
+
}
|
|
6724
|
+
);
|
|
6725
|
+
if (newFds.length) {
|
|
6726
|
+
deduped.push({ ...tool, functionDeclarations: newFds });
|
|
6727
|
+
}
|
|
6728
|
+
} else if (tool?.name) {
|
|
6729
|
+
if (seenFn.has(tool.name)) continue;
|
|
6730
|
+
seenFn.add(tool.name);
|
|
6731
|
+
deduped.push(tool);
|
|
6732
|
+
} else {
|
|
6733
|
+
deduped.push(tool);
|
|
6734
|
+
}
|
|
6735
|
+
}
|
|
6736
|
+
if (deduped.length !== tools.length) {
|
|
6737
|
+
this.logger.debug(
|
|
6738
|
+
`\u{1F501} Deduplicated tool/function declarations: ${tools.length} -> ${deduped.length}`
|
|
6739
|
+
);
|
|
6740
|
+
}
|
|
6741
|
+
llmRequest.config.tools = tools = deduped;
|
|
6742
|
+
}
|
|
6698
6743
|
const toolNames = tools.map((tool) => {
|
|
6699
6744
|
if (tool.functionDeclarations && Array.isArray(tool.functionDeclarations)) {
|
|
6700
6745
|
return tool.functionDeclarations.map((fn) => fn.name).join(", ");
|
|
@@ -7074,8 +7119,6 @@ var BasicLlmRequestProcessor = class extends BaseLlmRequestProcessor {
|
|
|
7074
7119
|
llmRequest.liveConnectConfig.realtimeInputConfig = runConfig.realtimeInputConfig;
|
|
7075
7120
|
llmRequest.liveConnectConfig.enableAffectiveDialog = runConfig.enableAffectiveDialog;
|
|
7076
7121
|
llmRequest.liveConnectConfig.proactivity = runConfig.proactivity;
|
|
7077
|
-
const tools = await agent.canonicalTools();
|
|
7078
|
-
llmRequest.appendTools(tools);
|
|
7079
7122
|
for await (const _ of []) {
|
|
7080
7123
|
yield _;
|
|
7081
7124
|
}
|
|
@@ -9069,19 +9112,19 @@ var LlmAgent = class _LlmAgent extends BaseAgent {
|
|
|
9069
9112
|
* Core logic to run this agent via text-based conversation
|
|
9070
9113
|
* This matches the Python implementation's _run_async_impl
|
|
9071
9114
|
*/
|
|
9072
|
-
async *runAsyncImpl(
|
|
9115
|
+
async *runAsyncImpl(context4) {
|
|
9073
9116
|
this.logger.debug(`Starting LlmAgent execution for "${this.name}"`);
|
|
9074
9117
|
try {
|
|
9075
|
-
for await (const event of this.llmFlow.runAsync(
|
|
9118
|
+
for await (const event of this.llmFlow.runAsync(context4)) {
|
|
9076
9119
|
this.maybeSaveOutputToState(event);
|
|
9077
9120
|
yield event;
|
|
9078
9121
|
}
|
|
9079
9122
|
} catch (error) {
|
|
9080
9123
|
this.logger.error("Error in LlmAgent execution:", error);
|
|
9081
9124
|
const errorEvent = new Event({
|
|
9082
|
-
invocationId:
|
|
9125
|
+
invocationId: context4.invocationId,
|
|
9083
9126
|
author: this.name,
|
|
9084
|
-
branch:
|
|
9127
|
+
branch: context4.branch,
|
|
9085
9128
|
content: {
|
|
9086
9129
|
parts: [
|
|
9087
9130
|
{
|
|
@@ -9349,7 +9392,7 @@ var LangGraphAgent = class extends BaseAgent {
|
|
|
9349
9392
|
/**
|
|
9350
9393
|
* Gets the next nodes to execute based on the current node and its result
|
|
9351
9394
|
*/
|
|
9352
|
-
async getNextNodes(currentNode, lastEvent,
|
|
9395
|
+
async getNextNodes(currentNode, lastEvent, context4) {
|
|
9353
9396
|
if (!currentNode.targets || currentNode.targets.length === 0) {
|
|
9354
9397
|
return [];
|
|
9355
9398
|
}
|
|
@@ -9361,7 +9404,7 @@ var LangGraphAgent = class extends BaseAgent {
|
|
|
9361
9404
|
continue;
|
|
9362
9405
|
}
|
|
9363
9406
|
if (targetNode.condition) {
|
|
9364
|
-
const shouldExecute = await targetNode.condition(lastEvent,
|
|
9407
|
+
const shouldExecute = await targetNode.condition(lastEvent, context4);
|
|
9365
9408
|
if (!shouldExecute) {
|
|
9366
9409
|
this.logger.debug(`Skipping node "${targetName}" due to condition`);
|
|
9367
9410
|
continue;
|
|
@@ -9374,7 +9417,7 @@ var LangGraphAgent = class extends BaseAgent {
|
|
|
9374
9417
|
/**
|
|
9375
9418
|
* Core logic to run this agent via text-based conversation.
|
|
9376
9419
|
*/
|
|
9377
|
-
async *runAsyncImpl(
|
|
9420
|
+
async *runAsyncImpl(context4) {
|
|
9378
9421
|
this.logger.debug(
|
|
9379
9422
|
`Starting graph execution from root node "${this.rootNode}"`
|
|
9380
9423
|
);
|
|
@@ -9396,7 +9439,7 @@ var LangGraphAgent = class extends BaseAgent {
|
|
|
9396
9439
|
return;
|
|
9397
9440
|
}
|
|
9398
9441
|
let stepCount = 0;
|
|
9399
|
-
const nodesToExecute = [{ node: rootNode, context }];
|
|
9442
|
+
const nodesToExecute = [{ node: rootNode, context: context4 }];
|
|
9400
9443
|
const executedNodes = [];
|
|
9401
9444
|
let lastEvent = null;
|
|
9402
9445
|
while (nodesToExecute.length > 0 && stepCount < this.maxSteps) {
|
|
@@ -9404,7 +9447,7 @@ var LangGraphAgent = class extends BaseAgent {
|
|
|
9404
9447
|
const { node } = nodesToExecute.shift();
|
|
9405
9448
|
this.logger.debug(`Step ${stepCount}: Executing node "${node.name}"`);
|
|
9406
9449
|
executedNodes.push(node.name);
|
|
9407
|
-
const childContext =
|
|
9450
|
+
const childContext = context4.createChildContext(node.agent);
|
|
9408
9451
|
try {
|
|
9409
9452
|
const nodeEvents = [];
|
|
9410
9453
|
for await (const event of node.agent.runAsync(childContext)) {
|
|
@@ -9417,7 +9460,7 @@ var LangGraphAgent = class extends BaseAgent {
|
|
|
9417
9460
|
events: nodeEvents
|
|
9418
9461
|
});
|
|
9419
9462
|
if (lastEvent) {
|
|
9420
|
-
const nextNodes = await this.getNextNodes(node, lastEvent,
|
|
9463
|
+
const nextNodes = await this.getNextNodes(node, lastEvent, context4);
|
|
9421
9464
|
for (const nextNode of nextNodes) {
|
|
9422
9465
|
nodesToExecute.push({
|
|
9423
9466
|
node: nextNode,
|
|
@@ -9460,8 +9503,8 @@ var LangGraphAgent = class extends BaseAgent {
|
|
|
9460
9503
|
* Core logic to run this agent via video/audio-based conversation.
|
|
9461
9504
|
* For LangGraph, this follows the same execution pattern as text-based.
|
|
9462
9505
|
*/
|
|
9463
|
-
async *runLiveImpl(
|
|
9464
|
-
yield* this.runAsyncImpl(
|
|
9506
|
+
async *runLiveImpl(context4) {
|
|
9507
|
+
yield* this.runAsyncImpl(context4);
|
|
9465
9508
|
}
|
|
9466
9509
|
/**
|
|
9467
9510
|
* Gets the execution results from the last run
|
|
@@ -9511,10 +9554,11 @@ var LangGraphAgent = class extends BaseAgent {
|
|
|
9511
9554
|
};
|
|
9512
9555
|
|
|
9513
9556
|
// src/agents/agent-builder.ts
|
|
9557
|
+
init_logger();
|
|
9514
9558
|
import { generateId } from "ai";
|
|
9515
9559
|
|
|
9516
9560
|
// src/runners.ts
|
|
9517
|
-
import { SpanStatusCode } from "@opentelemetry/api";
|
|
9561
|
+
import { SpanStatusCode, context as context3, trace as trace3 } from "@opentelemetry/api";
|
|
9518
9562
|
|
|
9519
9563
|
// src/agents/run-config.ts
|
|
9520
9564
|
var StreamingMode = /* @__PURE__ */ ((StreamingMode2) => {
|
|
@@ -9624,19 +9668,19 @@ var InMemoryArtifactService = class {
|
|
|
9624
9668
|
}
|
|
9625
9669
|
async saveArtifact(args) {
|
|
9626
9670
|
const { appName, userId, sessionId, filename, artifact } = args;
|
|
9627
|
-
const
|
|
9628
|
-
if (!this.artifacts.has(
|
|
9629
|
-
this.artifacts.set(
|
|
9671
|
+
const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
|
|
9672
|
+
if (!this.artifacts.has(path3)) {
|
|
9673
|
+
this.artifacts.set(path3, []);
|
|
9630
9674
|
}
|
|
9631
|
-
const versions = this.artifacts.get(
|
|
9675
|
+
const versions = this.artifacts.get(path3);
|
|
9632
9676
|
const version = versions.length;
|
|
9633
9677
|
versions.push(artifact);
|
|
9634
9678
|
return version;
|
|
9635
9679
|
}
|
|
9636
9680
|
async loadArtifact(args) {
|
|
9637
9681
|
const { appName, userId, sessionId, filename, version } = args;
|
|
9638
|
-
const
|
|
9639
|
-
const versions = this.artifacts.get(
|
|
9682
|
+
const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
|
|
9683
|
+
const versions = this.artifacts.get(path3);
|
|
9640
9684
|
if (!versions || versions.length === 0) {
|
|
9641
9685
|
return null;
|
|
9642
9686
|
}
|
|
@@ -9657,12 +9701,12 @@ var InMemoryArtifactService = class {
|
|
|
9657
9701
|
const sessionPrefix = `${appName}/${userId}/${sessionId}/`;
|
|
9658
9702
|
const userNamespacePrefix = `${appName}/${userId}/user/`;
|
|
9659
9703
|
const filenames = [];
|
|
9660
|
-
for (const
|
|
9661
|
-
if (
|
|
9662
|
-
const filename =
|
|
9704
|
+
for (const path3 of this.artifacts.keys()) {
|
|
9705
|
+
if (path3.startsWith(sessionPrefix)) {
|
|
9706
|
+
const filename = path3.substring(sessionPrefix.length);
|
|
9663
9707
|
filenames.push(filename);
|
|
9664
|
-
} else if (
|
|
9665
|
-
const filename =
|
|
9708
|
+
} else if (path3.startsWith(userNamespacePrefix)) {
|
|
9709
|
+
const filename = path3.substring(userNamespacePrefix.length);
|
|
9666
9710
|
filenames.push(filename);
|
|
9667
9711
|
}
|
|
9668
9712
|
}
|
|
@@ -9670,16 +9714,16 @@ var InMemoryArtifactService = class {
|
|
|
9670
9714
|
}
|
|
9671
9715
|
async deleteArtifact(args) {
|
|
9672
9716
|
const { appName, userId, sessionId, filename } = args;
|
|
9673
|
-
const
|
|
9674
|
-
if (!this.artifacts.has(
|
|
9717
|
+
const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
|
|
9718
|
+
if (!this.artifacts.has(path3)) {
|
|
9675
9719
|
return;
|
|
9676
9720
|
}
|
|
9677
|
-
this.artifacts.delete(
|
|
9721
|
+
this.artifacts.delete(path3);
|
|
9678
9722
|
}
|
|
9679
9723
|
async listVersions(args) {
|
|
9680
9724
|
const { appName, userId, sessionId, filename } = args;
|
|
9681
|
-
const
|
|
9682
|
-
const versions = this.artifacts.get(
|
|
9725
|
+
const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
|
|
9726
|
+
const versions = this.artifacts.get(path3);
|
|
9683
9727
|
if (!versions || versions.length === 0) {
|
|
9684
9728
|
return [];
|
|
9685
9729
|
}
|
|
@@ -10149,7 +10193,7 @@ var Runner = class {
|
|
|
10149
10193
|
}
|
|
10150
10194
|
};
|
|
10151
10195
|
invokeRunAsync();
|
|
10152
|
-
return function* () {
|
|
10196
|
+
return (function* () {
|
|
10153
10197
|
while (true) {
|
|
10154
10198
|
while (queueIndex >= eventQueue.length && !asyncCompleted) {
|
|
10155
10199
|
}
|
|
@@ -10162,7 +10206,7 @@ var Runner = class {
|
|
|
10162
10206
|
}
|
|
10163
10207
|
yield event;
|
|
10164
10208
|
}
|
|
10165
|
-
}();
|
|
10209
|
+
})();
|
|
10166
10210
|
}
|
|
10167
10211
|
/**
|
|
10168
10212
|
* Main entry method to run the agent in this runner.
|
|
@@ -10174,11 +10218,11 @@ var Runner = class {
|
|
|
10174
10218
|
runConfig = new RunConfig()
|
|
10175
10219
|
}) {
|
|
10176
10220
|
const span = tracer.startSpan("invocation");
|
|
10221
|
+
const spanContext = trace3.setSpan(context3.active(), span);
|
|
10177
10222
|
try {
|
|
10178
|
-
const session = await
|
|
10179
|
-
|
|
10180
|
-
userId,
|
|
10181
|
-
sessionId
|
|
10223
|
+
const session = await context3.with(
|
|
10224
|
+
spanContext,
|
|
10225
|
+
() => this.sessionService.getSession(this.appName, userId, sessionId)
|
|
10182
10226
|
);
|
|
10183
10227
|
if (!session) {
|
|
10184
10228
|
throw new Error(`Session not found: ${sessionId}`);
|
|
@@ -10188,22 +10232,34 @@ var Runner = class {
|
|
|
10188
10232
|
runConfig
|
|
10189
10233
|
});
|
|
10190
10234
|
if (newMessage) {
|
|
10191
|
-
await
|
|
10192
|
-
|
|
10193
|
-
|
|
10194
|
-
|
|
10195
|
-
|
|
10235
|
+
await context3.with(
|
|
10236
|
+
spanContext,
|
|
10237
|
+
() => this._appendNewMessageToSession(
|
|
10238
|
+
session,
|
|
10239
|
+
newMessage,
|
|
10240
|
+
invocationContext,
|
|
10241
|
+
runConfig.saveInputBlobsAsArtifacts || false
|
|
10242
|
+
)
|
|
10196
10243
|
);
|
|
10197
10244
|
}
|
|
10198
10245
|
invocationContext.agent = this._findAgentToRun(session, this.agent);
|
|
10199
|
-
|
|
10200
|
-
|
|
10201
|
-
|
|
10246
|
+
const agentGenerator = invocationContext.agent.runAsync(invocationContext);
|
|
10247
|
+
while (true) {
|
|
10248
|
+
const result = await context3.with(
|
|
10249
|
+
spanContext,
|
|
10250
|
+
() => agentGenerator.next()
|
|
10251
|
+
);
|
|
10252
|
+
if (result.done) {
|
|
10253
|
+
break;
|
|
10254
|
+
}
|
|
10255
|
+
const event = result.value;
|
|
10202
10256
|
if (!event.partial) {
|
|
10203
|
-
await
|
|
10204
|
-
|
|
10205
|
-
|
|
10206
|
-
|
|
10257
|
+
await context3.with(spanContext, async () => {
|
|
10258
|
+
await this.sessionService.appendEvent(session, event);
|
|
10259
|
+
if (this.memoryService) {
|
|
10260
|
+
await this.memoryService.addSessionToMemory(session);
|
|
10261
|
+
}
|
|
10262
|
+
});
|
|
10207
10263
|
}
|
|
10208
10264
|
yield event;
|
|
10209
10265
|
}
|
|
@@ -10350,6 +10406,12 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10350
10406
|
artifactService;
|
|
10351
10407
|
agentType = "llm";
|
|
10352
10408
|
existingSession;
|
|
10409
|
+
existingAgent;
|
|
10410
|
+
// If provided, reuse directly
|
|
10411
|
+
definitionLocked = false;
|
|
10412
|
+
// Lock further definition mutation after withAgent
|
|
10413
|
+
warnedMethods = /* @__PURE__ */ new Set();
|
|
10414
|
+
logger = new Logger({ name: "AgentBuilder" });
|
|
10353
10415
|
/**
|
|
10354
10416
|
* Private constructor - use static create() method
|
|
10355
10417
|
*/
|
|
@@ -10378,6 +10440,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10378
10440
|
* @returns This builder instance for chaining
|
|
10379
10441
|
*/
|
|
10380
10442
|
withModel(model) {
|
|
10443
|
+
this.warnIfLocked("withModel");
|
|
10381
10444
|
this.config.model = model;
|
|
10382
10445
|
return this;
|
|
10383
10446
|
}
|
|
@@ -10387,6 +10450,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10387
10450
|
* @returns This builder instance for chaining
|
|
10388
10451
|
*/
|
|
10389
10452
|
withDescription(description) {
|
|
10453
|
+
this.warnIfLocked("withDescription");
|
|
10390
10454
|
this.config.description = description;
|
|
10391
10455
|
return this;
|
|
10392
10456
|
}
|
|
@@ -10396,14 +10460,17 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10396
10460
|
* @returns This builder instance for chaining
|
|
10397
10461
|
*/
|
|
10398
10462
|
withInstruction(instruction) {
|
|
10463
|
+
this.warnIfLocked("withInstruction");
|
|
10399
10464
|
this.config.instruction = instruction;
|
|
10400
10465
|
return this;
|
|
10401
10466
|
}
|
|
10402
10467
|
withInputSchema(schema) {
|
|
10468
|
+
this.warnIfLocked("withInputSchema");
|
|
10403
10469
|
this.config.inputSchema = schema;
|
|
10404
10470
|
return this;
|
|
10405
10471
|
}
|
|
10406
10472
|
withOutputSchema(schema) {
|
|
10473
|
+
this.warnIfLocked("withOutputSchema");
|
|
10407
10474
|
this.config.outputSchema = schema;
|
|
10408
10475
|
return this;
|
|
10409
10476
|
}
|
|
@@ -10413,6 +10480,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10413
10480
|
* @returns This builder instance for chaining
|
|
10414
10481
|
*/
|
|
10415
10482
|
withTools(...tools) {
|
|
10483
|
+
this.warnIfLocked("withTools");
|
|
10416
10484
|
this.config.tools = [...this.config.tools || [], ...tools];
|
|
10417
10485
|
return this;
|
|
10418
10486
|
}
|
|
@@ -10422,6 +10490,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10422
10490
|
* @returns This builder instance for chaining
|
|
10423
10491
|
*/
|
|
10424
10492
|
withPlanner(planner) {
|
|
10493
|
+
this.warnIfLocked("withPlanner");
|
|
10425
10494
|
this.config.planner = planner;
|
|
10426
10495
|
return this;
|
|
10427
10496
|
}
|
|
@@ -10431,6 +10500,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10431
10500
|
* @returns This builder instance for chaining
|
|
10432
10501
|
*/
|
|
10433
10502
|
withCodeExecutor(codeExecutor) {
|
|
10503
|
+
this.warnIfLocked("withCodeExecutor");
|
|
10434
10504
|
this.config.codeExecutor = codeExecutor;
|
|
10435
10505
|
return this;
|
|
10436
10506
|
}
|
|
@@ -10440,6 +10510,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10440
10510
|
* @returns This builder instance for chaining
|
|
10441
10511
|
*/
|
|
10442
10512
|
withOutputKey(outputKey) {
|
|
10513
|
+
this.warnIfLocked("withOutputKey");
|
|
10443
10514
|
this.config.outputKey = outputKey;
|
|
10444
10515
|
return this;
|
|
10445
10516
|
}
|
|
@@ -10449,6 +10520,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10449
10520
|
* @returns This builder instance for chaining
|
|
10450
10521
|
*/
|
|
10451
10522
|
withSubAgents(subAgents) {
|
|
10523
|
+
this.warnIfLocked("withSubAgents");
|
|
10452
10524
|
this.config.subAgents = subAgents;
|
|
10453
10525
|
return this;
|
|
10454
10526
|
}
|
|
@@ -10458,6 +10530,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10458
10530
|
* @returns This builder instance for chaining
|
|
10459
10531
|
*/
|
|
10460
10532
|
withBeforeAgentCallback(callback) {
|
|
10533
|
+
this.warnIfLocked("withBeforeAgentCallback");
|
|
10461
10534
|
this.config.beforeAgentCallback = callback;
|
|
10462
10535
|
return this;
|
|
10463
10536
|
}
|
|
@@ -10467,15 +10540,29 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10467
10540
|
* @returns This builder instance for chaining
|
|
10468
10541
|
*/
|
|
10469
10542
|
withAfterAgentCallback(callback) {
|
|
10543
|
+
this.warnIfLocked("withAfterAgentCallback");
|
|
10470
10544
|
this.config.afterAgentCallback = callback;
|
|
10471
10545
|
return this;
|
|
10472
10546
|
}
|
|
10547
|
+
/**
|
|
10548
|
+
* Provide an already constructed agent instance. Further definition-mutating calls
|
|
10549
|
+
* (model/tools/instruction/etc.) will be ignored with a dev warning.
|
|
10550
|
+
*/
|
|
10551
|
+
withAgent(agent) {
|
|
10552
|
+
this.existingAgent = agent;
|
|
10553
|
+
this.definitionLocked = true;
|
|
10554
|
+
if (this.config.name === "default_agent" && agent.name) {
|
|
10555
|
+
this.config.name = agent.name;
|
|
10556
|
+
}
|
|
10557
|
+
return this;
|
|
10558
|
+
}
|
|
10473
10559
|
/**
|
|
10474
10560
|
* Configure as a sequential agent
|
|
10475
10561
|
* @param subAgents Sub-agents to execute in sequence
|
|
10476
10562
|
* @returns This builder instance for chaining
|
|
10477
10563
|
*/
|
|
10478
10564
|
asSequential(subAgents) {
|
|
10565
|
+
this.warnIfLocked("asSequential");
|
|
10479
10566
|
this.agentType = "sequential";
|
|
10480
10567
|
this.config.subAgents = subAgents;
|
|
10481
10568
|
return this;
|
|
@@ -10486,6 +10573,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10486
10573
|
* @returns This builder instance for chaining
|
|
10487
10574
|
*/
|
|
10488
10575
|
asParallel(subAgents) {
|
|
10576
|
+
this.warnIfLocked("asParallel");
|
|
10489
10577
|
this.agentType = "parallel";
|
|
10490
10578
|
this.config.subAgents = subAgents;
|
|
10491
10579
|
return this;
|
|
@@ -10497,6 +10585,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10497
10585
|
* @returns This builder instance for chaining
|
|
10498
10586
|
*/
|
|
10499
10587
|
asLoop(subAgents, maxIterations = 3) {
|
|
10588
|
+
this.warnIfLocked("asLoop");
|
|
10500
10589
|
this.agentType = "loop";
|
|
10501
10590
|
this.config.subAgents = subAgents;
|
|
10502
10591
|
this.config.maxIterations = maxIterations;
|
|
@@ -10509,6 +10598,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10509
10598
|
* @returns This builder instance for chaining
|
|
10510
10599
|
*/
|
|
10511
10600
|
asLangGraph(nodes, rootNode) {
|
|
10601
|
+
this.warnIfLocked("asLangGraph");
|
|
10512
10602
|
this.agentType = "langgraph";
|
|
10513
10603
|
this.config.nodes = nodes;
|
|
10514
10604
|
this.config.rootNode = rootNode;
|
|
@@ -10635,6 +10725,7 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10635
10725
|
* @returns Created agent instance
|
|
10636
10726
|
*/
|
|
10637
10727
|
createAgent() {
|
|
10728
|
+
if (this.existingAgent) return this.existingAgent;
|
|
10638
10729
|
switch (this.agentType) {
|
|
10639
10730
|
case "llm": {
|
|
10640
10731
|
if (!this.config.model) {
|
|
@@ -10765,6 +10856,22 @@ var AgentBuilder = class _AgentBuilder {
|
|
|
10765
10856
|
}
|
|
10766
10857
|
};
|
|
10767
10858
|
}
|
|
10859
|
+
/**
|
|
10860
|
+
* Warn (once per method) if the definition has been locked by withAgent().
|
|
10861
|
+
*/
|
|
10862
|
+
warnIfLocked(method) {
|
|
10863
|
+
if (!this.definitionLocked) return;
|
|
10864
|
+
if (this.warnedMethods.has(method)) return;
|
|
10865
|
+
this.warnedMethods.add(method);
|
|
10866
|
+
if (process.env.NODE_ENV !== "production") {
|
|
10867
|
+
const msg = `AgentBuilder: attempted to call ${method} after withAgent(); ignoring. (Wrap the agent first OR configure before withAgent).`;
|
|
10868
|
+
if (this.logger && typeof this.logger.warn === "function") {
|
|
10869
|
+
this.logger.warn(msg);
|
|
10870
|
+
} else {
|
|
10871
|
+
console.warn(msg);
|
|
10872
|
+
}
|
|
10873
|
+
}
|
|
10874
|
+
}
|
|
10768
10875
|
};
|
|
10769
10876
|
|
|
10770
10877
|
// src/memory/index.ts
|
|
@@ -10818,7 +10925,7 @@ var VertexAiSessionService = class extends BaseSessionService {
|
|
|
10818
10925
|
path: `reasoningEngines/${reasoningEngineId}/sessions`,
|
|
10819
10926
|
request_dict: sessionJsonDict
|
|
10820
10927
|
});
|
|
10821
|
-
console.
|
|
10928
|
+
console.debug("Create Session response", apiResponse);
|
|
10822
10929
|
const createdSessionId = apiResponse.name.split("/").slice(-3, -2)[0];
|
|
10823
10930
|
const operationId = apiResponse.name.split("/").pop();
|
|
10824
10931
|
let maxRetryAttempt = 5;
|
|
@@ -10929,14 +11036,14 @@ var VertexAiSessionService = class extends BaseSessionService {
|
|
|
10929
11036
|
async listSessions(appName, userId) {
|
|
10930
11037
|
const reasoningEngineId = this.getReasoningEngineId(appName);
|
|
10931
11038
|
const apiClient = this.getApiClient();
|
|
10932
|
-
let
|
|
11039
|
+
let path3 = `reasoningEngines/${reasoningEngineId}/sessions`;
|
|
10933
11040
|
if (userId) {
|
|
10934
11041
|
const parsedUserId = encodeURIComponent(`"${userId}"`);
|
|
10935
|
-
|
|
11042
|
+
path3 = `${path3}?filter=user_id=${parsedUserId}`;
|
|
10936
11043
|
}
|
|
10937
11044
|
const apiResponse = await apiClient.async_request({
|
|
10938
11045
|
http_method: "GET",
|
|
10939
|
-
path:
|
|
11046
|
+
path: path3,
|
|
10940
11047
|
request_dict: {}
|
|
10941
11048
|
});
|
|
10942
11049
|
if (apiResponse.httpHeaders) {
|
|
@@ -11752,12 +11859,1299 @@ __export(flows_exports, {
|
|
|
11752
11859
|
removeClientFunctionCallId: () => removeClientFunctionCallId
|
|
11753
11860
|
});
|
|
11754
11861
|
|
|
11862
|
+
// src/evaluation/index.ts
|
|
11863
|
+
var evaluation_exports = {};
|
|
11864
|
+
__export(evaluation_exports, {
|
|
11865
|
+
AgentEvaluator: () => AgentEvaluator,
|
|
11866
|
+
EvalResult: () => EvalResult,
|
|
11867
|
+
EvalStatus: () => EvalStatus,
|
|
11868
|
+
Evaluator: () => Evaluator,
|
|
11869
|
+
FinalResponseMatchV2Evaluator: () => FinalResponseMatchV2Evaluator,
|
|
11870
|
+
LocalEvalService: () => LocalEvalService,
|
|
11871
|
+
PrebuiltMetrics: () => PrebuiltMetrics,
|
|
11872
|
+
RougeEvaluator: () => RougeEvaluator,
|
|
11873
|
+
SafetyEvaluatorV1: () => SafetyEvaluatorV1,
|
|
11874
|
+
TrajectoryEvaluator: () => TrajectoryEvaluator
|
|
11875
|
+
});
|
|
11876
|
+
|
|
11877
|
+
// src/evaluation/evaluator.ts
|
|
11878
|
+
var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
|
|
11879
|
+
EvalStatus2[EvalStatus2["PASSED"] = 1] = "PASSED";
|
|
11880
|
+
EvalStatus2[EvalStatus2["FAILED"] = 2] = "FAILED";
|
|
11881
|
+
EvalStatus2[EvalStatus2["NOT_EVALUATED"] = 3] = "NOT_EVALUATED";
|
|
11882
|
+
return EvalStatus2;
|
|
11883
|
+
})(EvalStatus || {});
|
|
11884
|
+
var Evaluator = class {
|
|
11885
|
+
constructor(metric) {
|
|
11886
|
+
this.metric = metric;
|
|
11887
|
+
}
|
|
11888
|
+
static getMetricInfo(metricName) {
|
|
11889
|
+
throw new Error("getMetricInfo() must be implemented by subclass");
|
|
11890
|
+
}
|
|
11891
|
+
};
|
|
11892
|
+
|
|
11893
|
+
// src/evaluation/eval-metrics.ts
|
|
11894
|
+
var PrebuiltMetrics = /* @__PURE__ */ ((PrebuiltMetrics2) => {
|
|
11895
|
+
PrebuiltMetrics2["TOOL_TRAJECTORY_AVG_SCORE"] = "tool_trajectory_avg_score";
|
|
11896
|
+
PrebuiltMetrics2["RESPONSE_EVALUATION_SCORE"] = "response_evaluation_score";
|
|
11897
|
+
PrebuiltMetrics2["RESPONSE_MATCH_SCORE"] = "response_match_score";
|
|
11898
|
+
PrebuiltMetrics2["SAFETY_V1"] = "safety_v1";
|
|
11899
|
+
PrebuiltMetrics2["FINAL_RESPONSE_MATCH_V2"] = "final_response_match_v2";
|
|
11900
|
+
PrebuiltMetrics2["TOOL_TRAJECTORY_SCORE"] = "tool_trajectory_score";
|
|
11901
|
+
PrebuiltMetrics2["SAFETY"] = "safety";
|
|
11902
|
+
PrebuiltMetrics2["RESPONSE_MATCH"] = "response_match";
|
|
11903
|
+
return PrebuiltMetrics2;
|
|
11904
|
+
})(PrebuiltMetrics || {});
|
|
11905
|
+
|
|
11906
|
+
// src/evaluation/eval-result.ts
|
|
11907
|
+
var EvalResult = class {
|
|
11908
|
+
evalSetResultId;
|
|
11909
|
+
evalSetResultName;
|
|
11910
|
+
evalSetId;
|
|
11911
|
+
evalCaseResults;
|
|
11912
|
+
creationTimestamp;
|
|
11913
|
+
constructor(init) {
|
|
11914
|
+
this.evalSetResultId = init.evalSetResultId || "";
|
|
11915
|
+
this.evalSetResultName = init.evalSetResultName;
|
|
11916
|
+
this.evalSetId = init.evalSetId || "";
|
|
11917
|
+
this.evalCaseResults = init.evalCaseResults || [];
|
|
11918
|
+
this.creationTimestamp = init.creationTimestamp || Date.now() / 1e3;
|
|
11919
|
+
}
|
|
11920
|
+
};
|
|
11921
|
+
|
|
11922
|
+
// src/evaluation/agent-evaluator.ts
|
|
11923
|
+
import * as fs2 from "fs/promises";
|
|
11924
|
+
import * as path2 from "path";
|
|
11925
|
+
|
|
11926
|
+
// src/evaluation/base-eval-service.ts
|
|
11927
|
+
var BaseEvalService = class {
|
|
11928
|
+
async *evaluateSession(session) {
|
|
11929
|
+
const inferenceResults = [];
|
|
11930
|
+
for await (const result of this.performInference({
|
|
11931
|
+
evalSetId: session.evalSetId,
|
|
11932
|
+
evalCases: session.evalCases
|
|
11933
|
+
})) {
|
|
11934
|
+
inferenceResults.push(result);
|
|
11935
|
+
}
|
|
11936
|
+
for await (const result of this.evaluate({
|
|
11937
|
+
inferenceResults,
|
|
11938
|
+
evaluateConfig: session.evaluateConfig
|
|
11939
|
+
})) {
|
|
11940
|
+
yield result;
|
|
11941
|
+
}
|
|
11942
|
+
}
|
|
11943
|
+
};
|
|
11944
|
+
|
|
11945
|
+
// src/evaluation/vertex-ai-eval-facade.ts
|
|
11946
|
+
var ERROR_MESSAGE_SUFFIX = `
|
|
11947
|
+
You should specify both project id and location. This metric uses Vertex Gen AI
|
|
11948
|
+
Eval SDK, and it requires google cloud credentials.
|
|
11949
|
+
|
|
11950
|
+
If using an .env file add the values there, or explicitly set in the code using
|
|
11951
|
+
the template below:
|
|
11952
|
+
|
|
11953
|
+
process.env.GOOGLE_CLOUD_LOCATION = <LOCATION>
|
|
11954
|
+
process.env.GOOGLE_CLOUD_PROJECT = <PROJECT ID>
|
|
11955
|
+
`;
|
|
11956
|
+
var VertexAiEvalFacade = class _VertexAiEvalFacade {
|
|
11957
|
+
threshold;
|
|
11958
|
+
metricName;
|
|
11959
|
+
constructor(config) {
|
|
11960
|
+
this.threshold = config.threshold;
|
|
11961
|
+
this.metricName = config.metricName;
|
|
11962
|
+
}
|
|
11963
|
+
async evaluateInvocations(actualInvocations, expectedInvocations) {
|
|
11964
|
+
let totalScore = 0;
|
|
11965
|
+
let numInvocations = 0;
|
|
11966
|
+
const perInvocationResults = [];
|
|
11967
|
+
for (let i = 0; i < actualInvocations.length; i++) {
|
|
11968
|
+
const actual = actualInvocations[i];
|
|
11969
|
+
const expected = expectedInvocations[i];
|
|
11970
|
+
const prompt = this._getText(expected.userContent);
|
|
11971
|
+
const reference = this._getText(expected.finalResponse);
|
|
11972
|
+
const response = this._getText(actual.finalResponse);
|
|
11973
|
+
const evalCase = {
|
|
11974
|
+
prompt,
|
|
11975
|
+
reference,
|
|
11976
|
+
response
|
|
11977
|
+
};
|
|
11978
|
+
try {
|
|
11979
|
+
const evalCaseResult = await _VertexAiEvalFacade._performEval(
|
|
11980
|
+
[evalCase],
|
|
11981
|
+
[this.metricName]
|
|
11982
|
+
);
|
|
11983
|
+
const score = this._getScore(evalCaseResult);
|
|
11984
|
+
perInvocationResults.push({
|
|
11985
|
+
actualInvocation: actual,
|
|
11986
|
+
expectedInvocation: expected,
|
|
11987
|
+
score,
|
|
11988
|
+
evalStatus: this._getEvalStatus(score)
|
|
11989
|
+
});
|
|
11990
|
+
if (score !== null && score !== void 0) {
|
|
11991
|
+
totalScore += score;
|
|
11992
|
+
numInvocations++;
|
|
11993
|
+
}
|
|
11994
|
+
} catch (error) {
|
|
11995
|
+
console.error("Error evaluating invocation:", error);
|
|
11996
|
+
perInvocationResults.push({
|
|
11997
|
+
actualInvocation: actual,
|
|
11998
|
+
expectedInvocation: expected,
|
|
11999
|
+
score: void 0,
|
|
12000
|
+
evalStatus: 3 /* NOT_EVALUATED */
|
|
12001
|
+
});
|
|
12002
|
+
}
|
|
12003
|
+
}
|
|
12004
|
+
if (perInvocationResults.length > 0) {
|
|
12005
|
+
const overallScore = numInvocations > 0 ? totalScore / numInvocations : void 0;
|
|
12006
|
+
return {
|
|
12007
|
+
overallScore,
|
|
12008
|
+
overallEvalStatus: this._getEvalStatus(overallScore),
|
|
12009
|
+
perInvocationResults
|
|
12010
|
+
};
|
|
12011
|
+
}
|
|
12012
|
+
return {
|
|
12013
|
+
overallScore: void 0,
|
|
12014
|
+
overallEvalStatus: 3 /* NOT_EVALUATED */,
|
|
12015
|
+
perInvocationResults: []
|
|
12016
|
+
};
|
|
12017
|
+
}
|
|
12018
|
+
_getText(content) {
|
|
12019
|
+
if (content?.parts) {
|
|
12020
|
+
return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
|
|
12021
|
+
}
|
|
12022
|
+
return "";
|
|
12023
|
+
}
|
|
12024
|
+
_getScore(evalResult) {
|
|
12025
|
+
if (evalResult?.summaryMetrics?.[0]?.meanScore !== void 0 && typeof evalResult.summaryMetrics[0].meanScore === "number" && !Number.isNaN(evalResult.summaryMetrics[0].meanScore)) {
|
|
12026
|
+
return evalResult.summaryMetrics[0].meanScore;
|
|
12027
|
+
}
|
|
12028
|
+
return void 0;
|
|
12029
|
+
}
|
|
12030
|
+
_getEvalStatus(score) {
|
|
12031
|
+
if (score !== null && score !== void 0) {
|
|
12032
|
+
return score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
|
|
12033
|
+
}
|
|
12034
|
+
return 3 /* NOT_EVALUATED */;
|
|
12035
|
+
}
|
|
12036
|
+
static async _performEval(dataset, metrics) {
|
|
12037
|
+
const projectId = process.env.GOOGLE_CLOUD_PROJECT;
|
|
12038
|
+
const location = process.env.GOOGLE_CLOUD_LOCATION;
|
|
12039
|
+
if (!projectId) {
|
|
12040
|
+
throw new Error(`Missing project id. ${ERROR_MESSAGE_SUFFIX}`);
|
|
12041
|
+
}
|
|
12042
|
+
if (!location) {
|
|
12043
|
+
throw new Error(`Missing location. ${ERROR_MESSAGE_SUFFIX}`);
|
|
12044
|
+
}
|
|
12045
|
+
console.warn(
|
|
12046
|
+
"Vertex AI evaluation is not fully implemented. Using mock response."
|
|
12047
|
+
);
|
|
12048
|
+
return {
|
|
12049
|
+
summaryMetrics: [
|
|
12050
|
+
{
|
|
12051
|
+
meanScore: Math.random() * 0.5 + 0.5
|
|
12052
|
+
}
|
|
12053
|
+
]
|
|
12054
|
+
};
|
|
12055
|
+
}
|
|
12056
|
+
};
|
|
12057
|
+
|
|
12058
|
+
// src/evaluation/response-evaluator.ts
|
|
12059
|
+
var ResponseEvaluator = class extends Evaluator {
|
|
12060
|
+
metricName;
|
|
12061
|
+
threshold;
|
|
12062
|
+
constructor(evalMetric) {
|
|
12063
|
+
super(evalMetric);
|
|
12064
|
+
if (evalMetric.metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
|
|
12065
|
+
this.metricName = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
|
|
12066
|
+
} else if (evalMetric.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
|
|
12067
|
+
this.metricName = "response_match_score" /* RESPONSE_MATCH_SCORE */;
|
|
12068
|
+
} else {
|
|
12069
|
+
throw new Error(`Metric ${evalMetric.metricName} is not supported.`);
|
|
12070
|
+
}
|
|
12071
|
+
this.threshold = evalMetric.threshold;
|
|
12072
|
+
}
|
|
12073
|
+
static getMetricInfo(metricName) {
|
|
12074
|
+
if (metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
|
|
12075
|
+
return {
|
|
12076
|
+
metricName: "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */,
|
|
12077
|
+
description: "This metric evaluates how coherent agent's response was. Value range of this metric is [1,5], with values closer to 5 more desirable.",
|
|
12078
|
+
metricValueInfo: {
|
|
12079
|
+
interval: {
|
|
12080
|
+
minValue: 1,
|
|
12081
|
+
maxValue: 5,
|
|
12082
|
+
openAtMin: false,
|
|
12083
|
+
openAtMax: false
|
|
12084
|
+
}
|
|
12085
|
+
}
|
|
12086
|
+
};
|
|
12087
|
+
}
|
|
12088
|
+
if (metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
|
|
12089
|
+
return {
|
|
12090
|
+
metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
|
|
12091
|
+
description: "This metric evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
|
|
12092
|
+
metricValueInfo: {
|
|
12093
|
+
interval: {
|
|
12094
|
+
minValue: 0,
|
|
12095
|
+
maxValue: 1,
|
|
12096
|
+
openAtMin: false,
|
|
12097
|
+
openAtMax: false
|
|
12098
|
+
}
|
|
12099
|
+
}
|
|
12100
|
+
};
|
|
12101
|
+
}
|
|
12102
|
+
throw new Error(`Metric ${metricName} is not supported.`);
|
|
12103
|
+
}
|
|
12104
|
+
async evaluateInvocations(actualInvocations, expectedInvocations) {
|
|
12105
|
+
if (this.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
|
|
12106
|
+
return this.evaluateRougeScore(actualInvocations, expectedInvocations);
|
|
12107
|
+
}
|
|
12108
|
+
const vertexAiFacade = new VertexAiEvalFacade({
|
|
12109
|
+
threshold: this.threshold,
|
|
12110
|
+
metricName: this.metricName
|
|
12111
|
+
});
|
|
12112
|
+
return vertexAiFacade.evaluateInvocations(
|
|
12113
|
+
actualInvocations,
|
|
12114
|
+
expectedInvocations
|
|
12115
|
+
);
|
|
12116
|
+
}
|
|
12117
|
+
async evaluateRougeScore(actualInvocations, expectedInvocations) {
|
|
12118
|
+
if (actualInvocations.length !== expectedInvocations.length) {
|
|
12119
|
+
throw new Error("Number of actual and expected invocations must match");
|
|
12120
|
+
}
|
|
12121
|
+
const results = [];
|
|
12122
|
+
for (let i = 0; i < actualInvocations.length; i++) {
|
|
12123
|
+
const actual = actualInvocations[i];
|
|
12124
|
+
const expected = expectedInvocations[i];
|
|
12125
|
+
const result = await this.evaluateInvocation(actual, expected);
|
|
12126
|
+
results.push(result);
|
|
12127
|
+
}
|
|
12128
|
+
const scores = results.map((r) => r.score).filter((s) => s !== void 0);
|
|
12129
|
+
const overallScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
|
|
12130
|
+
const overallStatus = overallScore !== void 0 && overallScore >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
|
|
12131
|
+
return {
|
|
12132
|
+
overallScore,
|
|
12133
|
+
overallEvalStatus: overallStatus,
|
|
12134
|
+
perInvocationResults: results
|
|
12135
|
+
};
|
|
12136
|
+
}
|
|
12137
|
+
async evaluateInvocation(actual, expected) {
|
|
12138
|
+
if (!actual.finalResponse || !expected.finalResponse) {
|
|
12139
|
+
return {
|
|
12140
|
+
actualInvocation: actual,
|
|
12141
|
+
expectedInvocation: expected,
|
|
12142
|
+
evalStatus: 3 /* NOT_EVALUATED */
|
|
12143
|
+
};
|
|
12144
|
+
}
|
|
12145
|
+
const score = await this.computeRougeScore(
|
|
12146
|
+
actual.finalResponse,
|
|
12147
|
+
expected.finalResponse
|
|
12148
|
+
);
|
|
12149
|
+
return {
|
|
12150
|
+
actualInvocation: actual,
|
|
12151
|
+
expectedInvocation: expected,
|
|
12152
|
+
score,
|
|
12153
|
+
evalStatus: score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */
|
|
12154
|
+
};
|
|
12155
|
+
}
|
|
12156
|
+
async computeRougeScore(actual, expected) {
|
|
12157
|
+
const actualText = this.extractText(actual);
|
|
12158
|
+
const expectedText = this.extractText(expected);
|
|
12159
|
+
if (!actualText.trim() || !expectedText.trim()) {
|
|
12160
|
+
return 0;
|
|
12161
|
+
}
|
|
12162
|
+
const actualTokens = this.tokenizeText(actualText);
|
|
12163
|
+
const expectedTokens = this.tokenizeText(expectedText);
|
|
12164
|
+
const actualUnigrams = new Set(actualTokens);
|
|
12165
|
+
const expectedUnigrams = new Set(expectedTokens);
|
|
12166
|
+
const commonUnigrams = new Set(
|
|
12167
|
+
[...actualUnigrams].filter((token) => expectedUnigrams.has(token))
|
|
12168
|
+
);
|
|
12169
|
+
const precision = actualUnigrams.size > 0 ? commonUnigrams.size / actualUnigrams.size : 0;
|
|
12170
|
+
const recall = expectedUnigrams.size > 0 ? commonUnigrams.size / expectedUnigrams.size : 0;
|
|
12171
|
+
const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
|
|
12172
|
+
return fmeasure;
|
|
12173
|
+
}
|
|
12174
|
+
extractText(content) {
|
|
12175
|
+
if (content?.parts) {
|
|
12176
|
+
return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join(" ");
|
|
12177
|
+
}
|
|
12178
|
+
return "";
|
|
12179
|
+
}
|
|
12180
|
+
tokenizeText(text) {
|
|
12181
|
+
return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
|
|
12182
|
+
}
|
|
12183
|
+
};
|
|
12184
|
+
|
|
12185
|
+
// src/evaluation/trajectory-evaluator.ts
|
|
12186
|
+
// src/evaluation/trajectory-evaluator.ts
/**
 * Evaluates tool-call trajectories (TOOL_TRAJECTORY_AVG_SCORE) by exact
 * match on tool name and arguments for each step. Each invocation scores
 * 1 (perfect match) or 0 (any mismatch); the overall score is the average.
 */
var TrajectoryEvaluator = class extends Evaluator {
  static getMetricInfo() {
    return {
      metricName: "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */,
      description: "This metric compares two tool call trajectories (expected vs. actual) for the same user interaction. It performs an exact match on the tool name and arguments for each step in the trajectory. A score of 1.0 indicates a perfect match, while 0.0 indicates a mismatch. Higher values are better.",
      metricValueInfo: {
        interval: {
          minValue: 0,
          maxValue: 1,
          openAtMin: false,
          openAtMax: false
        }
      }
    };
  }
  /**
   * Averages per-invocation exact-match scores. Invocations without tool-use
   * data on either side are recorded as NOT_EVALUATED and excluded from the
   * average.
   */
  async evaluateInvocations(actualInvocations, expectedInvocations) {
    let totalToolUseAccuracy = 0;
    let numInvocations = 0;
    const perInvocationResults = [];
    for (let i = 0; i < actualInvocations.length; i++) {
      const actual = actualInvocations[i];
      const expected = expectedInvocations[i];
      // FIX: `expected?.` — when the expected list is shorter than the
      // actual list, `expected` is undefined and the previous code threw a
      // TypeError here; now such pairs are marked NOT_EVALUATED instead.
      if (!actual.intermediateData?.toolUses || !expected?.intermediateData?.toolUses) {
        perInvocationResults.push({
          actualInvocation: actual,
          expectedInvocation: expected,
          evalStatus: 3 /* NOT_EVALUATED */
        });
        continue;
      }
      const toolUseAccuracy = this.areToolCallsEqual(
        actual.intermediateData.toolUses,
        expected.intermediateData.toolUses
      ) ? 1 : 0;
      perInvocationResults.push({
        actualInvocation: actual,
        expectedInvocation: expected,
        score: toolUseAccuracy,
        evalStatus: toolUseAccuracy >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */
      });
      totalToolUseAccuracy += toolUseAccuracy;
      numInvocations++;
    }
    // Average over evaluated invocations only; 0 when none were evaluated.
    const overallScore = numInvocations > 0 ? totalToolUseAccuracy / numInvocations : 0;
    return {
      overallScore,
      overallEvalStatus: overallScore >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */,
      perInvocationResults
    };
  }
  // True when both trajectories have the same length and every step matches.
  areToolCallsEqual(actual, expected) {
    if (actual.length !== expected.length) {
      return false;
    }
    return actual.every((actualCall, index) => {
      const expectedCall = expected[index];
      return this.isToolCallEqual(actualCall, expectedCall);
    });
  }
  // Exact match on tool name plus structural equality of each argument
  // (compared via JSON serialization of the values).
  isToolCallEqual(actual, expected) {
    if (actual.name !== expected.name) {
      return false;
    }
    const actualArgs = actual.args || {};
    const expectedArgs = expected.args || {};
    const actualKeys = Object.keys(actualArgs).sort();
    const expectedKeys = Object.keys(expectedArgs).sort();
    if (actualKeys.length !== expectedKeys.length) {
      return false;
    }
    return actualKeys.every((key, index) => {
      const expectedKey = expectedKeys[index];
      if (key !== expectedKey) {
        return false;
      }
      return JSON.stringify(actualArgs[key]) === JSON.stringify(expectedArgs[key]);
    });
  }
};
|
|
12265
|
+
|
|
12266
|
+
// src/evaluation/safety-evaluator.ts
|
|
12267
|
+
// src/evaluation/safety-evaluator.ts
/**
 * Safety (harmlessness) evaluator backed entirely by the Vertex AI
 * evaluation service (SAFETY_V1 metric). Scores are in [0, 1]; closer to 1
 * means safer.
 */
var SafetyEvaluatorV1 = class extends Evaluator {
  static getMetricInfo() {
    const interval = {
      minValue: 0,
      maxValue: 1,
      openAtMin: false,
      openAtMax: false
    };
    return {
      metricName: "safety_v1" /* SAFETY_V1 */,
      description: "This metric evaluates the safety (harmlessness) of an Agent's Response. Value range of the metric is [0, 1], with values closer to 1 to be more desirable (safe).",
      metricValueInfo: { interval }
    };
  }
  /** Delegates the whole evaluation to the Vertex AI facade. */
  async evaluateInvocations(actualInvocations, expectedInvocations) {
    const facade = new VertexAiEvalFacade({
      threshold: this.metric.threshold,
      metricName: "safety_v1" /* SAFETY_V1 */
    });
    return await facade.evaluateInvocations(
      actualInvocations,
      expectedInvocations
    );
  }
};
|
|
12293
|
+
|
|
12294
|
+
// src/evaluation/llm-as-judge-utils.ts
|
|
12295
|
+
// src/evaluation/llm-as-judge-utils.ts
/**
 * Joins all truthy text parts of a Content object with newlines.
 * Returns "" for null/undefined content or content without parts.
 */
function getTextFromContent(content) {
  const parts = content?.parts;
  if (!parts) {
    return "";
  }
  const texts = [];
  for (const part of parts) {
    if (part.text) {
      texts.push(part.text);
    }
  }
  return texts.join("\n");
}
|
|
12301
|
+
/**
 * Maps a numeric score to an eval status: PASSED (1) when the score meets
 * the threshold, FAILED (2) otherwise.
 */
function getEvalStatus(score, threshold) {
  if (score >= threshold) {
    return 1 /* PASSED */;
  }
  return 2 /* FAILED */;
}
|
|
12304
|
+
|
|
12305
|
+
// src/evaluation/llm-as-judge.ts
|
|
12306
|
+
// src/evaluation/llm-as-judge.ts
/**
 * Thin wrapper around a judge LLM: samples the model several times with the
 * same prompt and collects the parseable verdict labels.
 */
var LlmAsJudge = class {
  /**
   * Generates up to `numSamples` responses and returns the labels produced
   * by `critiqueParser`, dropping NOT_FOUND results. Individual sampling
   * errors are logged and skipped, so the result may hold fewer labels than
   * requested (possibly zero).
   */
  async sampleJudge(prompt, numSamples, critiqueParser, judgeModelOptions) {
    const modelName = judgeModelOptions?.judgeModel || "gemini-2.5-flash";
    const model = LLMRegistry.getModelOrCreate(modelName);
    const generationConfig = judgeModelOptions?.judgeModelConfig || {};
    const labels = [];
    let remaining = numSamples;
    while (remaining > 0) {
      remaining--;
      try {
        const response = await model.generateContent({
          prompt,
          ...generationConfig
        });
        const parsed = critiqueParser(response.text);
        if (parsed !== "not_found" /* NOT_FOUND */) {
          labels.push(parsed);
        }
      } catch (error) {
        console.error("Error sampling judge model:", error);
      }
    }
    return labels;
  }
};
|
|
12329
|
+
|
|
12330
|
+
// src/evaluation/final-response-match-v2.ts
|
|
12331
|
+
var FINAL_RESPONSE_MATCH_V2_PROMPT = `You are an expert rater for an AI agent. The AI agent is going to call an API to answer the user query and generate API tool use code based for the choice of the API and API arguments. The ideal model response should be a function call that fulfills user query, or a natural language response hedges or asks users for further clarification if a function call does not apply.
|
|
12332
|
+
The primary focus of this rating task is to check correctness of the model responses.
|
|
12333
|
+
|
|
12334
|
+
The data consists of:
|
|
12335
|
+
- A user query.
|
|
12336
|
+
- A model generated response for the prompt. The responses can consist of:
|
|
12337
|
+
- Natural language, when the model is asking for clarification, or tells the user it does not possess the requested functionality / option.
|
|
12338
|
+
- Code, in the form of one or multiple python function calls, and additional code as needed, for when the model is fulfilling the user request.
|
|
12339
|
+
You can use the help from a reference response annotated by a human rater. This reference response is of high quality. You can compare the agent's response with the reference response and decide if the agent's response is valid.
|
|
12340
|
+
Note sometimes the reference response only contains the key entities of the correct answer and you need to be flexible to allow the agent response to contain more information than the reference response, or to present the key entities in a different format or structure or in shorter or longer format.
|
|
12341
|
+
When the agent response is provided in the form of tables/dataframes or should be best provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response. Likewise, if you have the reference response, then find out the key entities and main components in them and check whether you can retrieve those from the agent response. If the prompt does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
|
|
12342
|
+
|
|
12343
|
+
You should follow the constitutions below very carefully to rate the model response:
|
|
12344
|
+
- Allow flexibility of format even when reference code only uses one of the possible format, unless API spec or user prompt has explicit format requirement
|
|
12345
|
+
- e.g. For state name, allow both abbreviation and full name unless API spec has explicit requirement. e.g. both 'tx' and 'Texas' should be allowed in the agent response even when reference code only uses one of them.
|
|
12346
|
+
- e.g. If a reference response list outputs in a list format, the agent response is allowed to use sentence format and vice versa unless user prompt explicitly asks for a specific format.
|
|
12347
|
+
- e.g. For numbers, allow flexibility of formatting, e.g. 1000000 vs 1,000,000.
|
|
12348
|
+
- The model shouldn't assume that it doesn't have access to according data or incapable of answering the question if reference response is able to find a legit answer.
|
|
12349
|
+
- If the model response contains the correct final answer, rate it as valid even when the model response contains more information than the reference response.
|
|
12350
|
+
- If the user prompt has csv or other table format data, don't read it yourself. Trust the reference response final answer instead.
|
|
12351
|
+
- When the validation needs maths, date calculations, do not use your own calculator. Trust the reference response final answer instead.
|
|
12352
|
+
- Be mindful about unit of numbers. For example, if the reference response says 100 miles, but the model response says 100 km, it is invalid.
|
|
12353
|
+
- When the agent response or the reference response is provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response and whether those match the reference response. If the user query does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
|
|
12354
|
+
- When the answer is in numeric format, check whether there are any format requirements in the numeric format, rounding, precision, number of decimals, etc. specified in the user query and the prompt. If there are no such instructions, then tolerate different numerical formats.
|
|
12355
|
+
- When the answer is in numeric format and there are rounding or precision differences between the agent response and the reference response, if no further instructions are provided evaluate if the rounding strategy or precision in the agent response follows the standards for that entity. For instance, model accuracy scores must be reported with at least two decimal places (e.g., 0.798 \u2192 0.80 is acceptable, but 0.7 is not).
|
|
12356
|
+
|
|
12357
|
+
Below are the inputs:
|
|
12358
|
+
{{
|
|
12359
|
+
"User prompt": {prompt},
|
|
12360
|
+
"Agent response": {response},
|
|
12361
|
+
"Reference response": {golden_response},
|
|
12362
|
+
}}
|
|
12363
|
+
|
|
12364
|
+
The answer should be a json alone which follows the json structure below:
|
|
12365
|
+
{{
|
|
12366
|
+
"reasoning": [reasoning],
|
|
12367
|
+
"is_the_agent_response_valid": [valid or invalid],
|
|
12368
|
+
}}
|
|
12369
|
+
Answer with assertiveness:
|
|
12370
|
+
`;
|
|
12371
|
+
// Default number of judge-model samples drawn per invocation.
var DEFAULT_NUM_SAMPLES = 5;
/**
 * Extracts the "is_the_agent_response_valid" verdict from a raw judge
 * response. Returns "valid" or "invalid" when a verdict is found,
 * "not_found" otherwise.
 */
function parseCritique(response) {
  const verdictMatch = response.match(
    /"is_the_agent_response_valid":\s*\[*[\n\s]*"*([^"^\]^\s]*)"*[\n\s]*\]*\s*[,\n\}]/
  );
  const rawLabel = verdictMatch?.[1];
  if (!rawLabel) {
    return "not_found" /* NOT_FOUND */;
  }
  // Anything other than an explicit "valid" verdict counts as invalid.
  return rawLabel.toLowerCase() === "valid" ? "valid" /* VALID */ : "invalid" /* INVALID */;
}
|
|
12382
|
+
/**
 * Evaluates the agent's final response against a golden response with an
 * LLM judge (FINAL_RESPONSE_MATCH_V2). Each invocation's score is the
 * fraction of judge samples that voted "valid"; the overall score is the
 * average across evaluated invocations.
 */
var FinalResponseMatchV2Evaluator = class extends Evaluator {
  constructor(evalMetric, llmAsJudge = new LlmAsJudge()) {
    super(evalMetric);
    this.llmAsJudge = llmAsJudge;
  }
  static getMetricInfo() {
    return {
      metricName: "final_response_match_v2" /* FINAL_RESPONSE_MATCH_V2 */,
      description: "This metric evaluates if the agent's final response matches a golden/expected final response using an LLM judge. Value range for this metric is [0,1], with values closer to 1 more desirable.",
      metricValueInfo: {
        interval: {
          minValue: 0,
          maxValue: 1,
          openAtMin: false,
          openAtMax: false
        }
      }
    };
  }
  async evaluateInvocations(actualInvocations, expectedInvocations) {
    const perInvocationResults = [];
    let totalScore = 0;
    let numInvocations = 0;
    if (!actualInvocations.length) {
      return {
        overallEvalStatus: 3 /* NOT_EVALUATED */,
        perInvocationResults: []
      };
    }
    for (let i = 0; i < actualInvocations.length; i++) {
      const actual = actualInvocations[i];
      const expected = expectedInvocations[i];
      const prompt = getTextFromContent(expected.userContent);
      const response = getTextFromContent(actual.finalResponse);
      const goldenResponse = getTextFromContent(expected.finalResponse);
      const formattedPrompt = FINAL_RESPONSE_MATCH_V2_PROMPT.replace(
        "{prompt}",
        prompt
      ).replace("{response}", response).replace("{golden_response}", goldenResponse);
      const numSamples = this.metric.judgeModelOptions?.numSamples ?? DEFAULT_NUM_SAMPLES;
      const labels = await this.llmAsJudge.sampleJudge(
        formattedPrompt,
        numSamples,
        parseCritique,
        this.metric.judgeModelOptions
      );
      // FIX: when every judge sample errored or was unparseable, labels is
      // empty and the old `x / labels.length` produced NaN (0/0) that
      // silently poisoned totalScore and overallScore. Mark such
      // invocations NOT_EVALUATED and exclude them from the average.
      if (labels.length === 0) {
        perInvocationResults.push({
          actualInvocation: actual,
          expectedInvocation: expected,
          evalStatus: 3 /* NOT_EVALUATED */
        });
        continue;
      }
      const score = labels.filter((l) => l === "valid" /* VALID */).length / labels.length;
      perInvocationResults.push({
        actualInvocation: actual,
        expectedInvocation: expected,
        score,
        evalStatus: getEvalStatus(score, this.metric.threshold)
      });
      totalScore += score;
      numInvocations++;
    }
    // FIX: avoid 0/0 when no invocation could be evaluated at all.
    if (numInvocations === 0) {
      return {
        overallEvalStatus: 3 /* NOT_EVALUATED */,
        perInvocationResults
      };
    }
    const overallScore = totalScore / numInvocations;
    return {
      overallScore,
      overallEvalStatus: getEvalStatus(overallScore, this.metric.threshold),
      perInvocationResults
    };
  }
};
|
|
12446
|
+
|
|
12447
|
+
// src/evaluation/metric-evaluator-registry.ts
|
|
12448
|
+
// src/evaluation/metric-evaluator-registry.ts
/**
 * Registry mapping metric names to their Evaluator classes plus metric
 * metadata. Lookups instantiate a fresh evaluator per request.
 */
var MetricEvaluatorRegistry = class {
  registry = /* @__PURE__ */ new Map();
  /**
   * Instantiates the registered evaluator for the given metric.
   * @throws when no evaluator is registered under the metric's name.
   */
  getEvaluator(evalMetric) {
    const entry = this.registry.get(evalMetric.metricName);
    if (entry === void 0) {
      throw new Error(`${evalMetric.metricName} not found in registry.`);
    }
    return new entry.evaluator(evalMetric);
  }
  /**
   * Registers (or replaces) the evaluator for a metric. Replacement is
   * logged. The metric info is stored as a defensive copy.
   */
  registerEvaluator(metricInfo, evaluator) {
    const { metricName } = metricInfo;
    const previous = this.registry.get(metricName);
    if (previous) {
      console.info(
        `Updating Evaluator class for ${metricName} from ${previous.evaluator.name} to ${evaluator.name}`
      );
    }
    this.registry.set(metricName, {
      evaluator,
      metricInfo: { ...metricInfo }
    });
  }
  /** Returns shallow copies of all registered metric infos. */
  getRegisteredMetrics() {
    const metrics = [];
    for (const { metricInfo } of this.registry.values()) {
      metrics.push({ ...metricInfo });
    }
    return metrics;
  }
};
|
|
12475
|
+
/**
 * Builds the default registry wiring every built-in metric to its
 * evaluator class. ResponseEvaluator serves two metrics, so it is
 * registered once per metric name.
 */
function getDefaultMetricEvaluatorRegistry() {
  const registry = new MetricEvaluatorRegistry();
  const registrations = [
    [TrajectoryEvaluator.getMetricInfo(), TrajectoryEvaluator],
    [
      ResponseEvaluator.getMetricInfo("response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */),
      ResponseEvaluator
    ],
    [
      ResponseEvaluator.getMetricInfo("response_match_score" /* RESPONSE_MATCH_SCORE */),
      ResponseEvaluator
    ],
    [SafetyEvaluatorV1.getMetricInfo(), SafetyEvaluatorV1],
    [FinalResponseMatchV2Evaluator.getMetricInfo(), FinalResponseMatchV2Evaluator]
  ];
  for (const [metricInfo, evaluator] of registrations) {
    registry.registerEvaluator(metricInfo, evaluator);
  }
  return registry;
}
// Shared module-level registry used by the local eval service.
var DEFAULT_METRIC_EVALUATOR_REGISTRY = getDefaultMetricEvaluatorRegistry();
|
|
12500
|
+
|
|
12501
|
+
// src/evaluation/local-eval-service.ts
|
|
12502
|
+
// src/evaluation/local-eval-service.ts
// Runs evaluations locally: performs inference against the configured agent
// (or a runner built on the fly) and scores the results with the evaluators
// in DEFAULT_METRIC_EVALUATOR_REGISTRY.
var LocalEvalService = class extends BaseEvalService {
  // NOTE(review): the constructor fires initializeRunner() without awaiting
  // it; runInference() re-checks this.runner before use as a safety net.
  constructor(agent, parallelism = 4) {
    super();
    this.agent = agent;
    // `parallelism` is stored but not used in any code visible here.
    this.parallelism = parallelism;
    this.initializeRunner();
  }
  // Runner with an `ask(message)` method; set asynchronously.
  runner;
  // Picks a runner: the agent itself when it already exposes `ask`,
  // otherwise an AgentBuilder-backed runner, falling back to a mock echo
  // runner if the builder fails.
  async initializeRunner() {
    if ("ask" in this.agent) {
      this.runner = this.agent;
    } else {
      try {
        const { runner } = await AgentBuilder.create("eval_agent").withModel("gemini-2.5-flash").withDescription("Agent for evaluation purposes").build();
        this.runner = {
          ask: async (message) => {
            return await runner.ask(message);
          }
        };
      } catch (error) {
        console.warn(
          "Failed to create AgentBuilder runner, falling back to mock:",
          error
        );
        // Mock fallback keeps evaluation runnable without a real model.
        this.runner = {
          ask: async (message) => {
            return `Mock response to: ${message}`;
          }
        };
      }
    }
  }
  // For each eval case, yields one array holding the expected invocations
  // (built from conversation turns that carry a finalResponse, tagged with
  // "-expected-" ids) followed by the actual invocations from inference.
  async *performInference(request) {
    for (const evalSet of request.evalCases) {
      for (const evalCase of evalSet.evalCases) {
        const expected = [];
        for (const convo of evalCase.conversation) {
          if (convo.finalResponse) {
            expected.push({
              invocationId: `${evalCase.evalId}-expected-${expected.length}`,
              userContent: convo.userContent,
              finalResponse: convo.finalResponse,
              intermediateData: convo.intermediateData,
              creationTimestamp: convo.creationTimestamp
            });
          }
        }
        const actual = await this.runInference(evalCase);
        yield [...expected, ...actual];
      }
    }
  }
  // Groups inference results by eval id (derived from the invocation id by
  // stripping the trailing "-<n>" segment), then runs every configured
  // metric's evaluator over each group and yields one eval-set result per
  // group.
  async *evaluate(request) {
    const { inferenceResults, evaluateConfig } = request;
    const resultsByCase = /* @__PURE__ */ new Map();
    for (const result of inferenceResults) {
      const invocationId = result[0].invocationId;
      if (!invocationId) continue;
      // Everything before the last hyphen is the eval id.
      const lastHyphenIndex = invocationId.lastIndexOf("-");
      const evalId = lastHyphenIndex !== -1 ? invocationId.substring(0, lastHyphenIndex) : invocationId;
      const existing = resultsByCase.get(evalId) || [];
      resultsByCase.set(evalId, [...existing, ...result]);
    }
    for (const [evalId, results] of resultsByCase) {
      const evalResult = {
        evalSetResultId: `${evalId}-result-${Date.now()}`,
        evalSetId: evalId,
        evalCaseResults: [],
        creationTimestamp: Date.now()
      };
      for (const evalMetric of evaluateConfig.evalMetrics) {
        const evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.getEvaluator(evalMetric);
        // Split the group back into actual vs. expected invocations using
        // the "expected" marker embedded in the invocation id.
        const actual = results.filter(
          (r) => !r.invocationId?.includes("expected")
        );
        const expected = results.filter(
          (r) => r.invocationId?.includes("expected")
        );
        const result = await evaluator.evaluateInvocations(actual, expected);
        evalResult.evalCaseResults.push({
          evalSetId: evalId,
          evalId,
          // NOTE(review): final status is taken from the FIRST
          // per-invocation result only — confirm this is intended rather
          // than the overall status.
          finalEvalStatus: result.perInvocationResults.length > 0 ? result.perInvocationResults[0].evalStatus : 3 /* NOT_EVALUATED */,
          overallEvalMetricResults: [],
          sessionId: evalId,
          evalMetricResultPerInvocation: result.perInvocationResults.map(
            (r) => ({
              actualInvocation: r.actualInvocation,
              expectedInvocation: r.expectedInvocation,
              evalMetricResults: [
                {
                  metricName: evalMetric.metricName,
                  threshold: evalMetric.threshold,
                  score: r.score,
                  evalStatus: r.evalStatus
                }
              ]
            })
          )
        });
      }
      yield evalResult;
    }
  }
  // Runs the agent over every conversation turn of one eval case, producing
  // one invocation per turn. Errors are captured as "Error: ..." responses
  // rather than aborting the whole case.
  async runInference(evalCase) {
    const results = [];
    if (!this.runner) {
      await this.initializeRunner();
    }
    if (evalCase.sessionInput) {
      try {
        // Best-effort session setup: use whichever hook the runner offers,
        // else just log the session input.
        if (this.runner.initializeSession) {
          await this.runner.initializeSession(evalCase.sessionInput);
        } else if (this.runner.setSessionState) {
          await this.runner.setSessionState(evalCase.sessionInput);
        } else {
          console.log(
            `Session input provided for ${evalCase.evalId}:`,
            evalCase.sessionInput
          );
        }
      } catch (error) {
        console.warn(
          `Failed to initialize session for ${evalCase.evalId}:`,
          error
        );
      }
    }
    for (const invocation of evalCase.conversation) {
      try {
        const response = await this.runner.ask(invocation.userContent);
        results.push({
          invocationId: `${evalCase.evalId}-${results.length}`,
          userContent: invocation.userContent,
          finalResponse: {
            role: "model",
            parts: [{ text: response || "" }]
          },
          // Tool uses are not captured by this runner; trajectory metrics
          // will see empty toolUses for actual invocations.
          intermediateData: {
            toolUses: [],
            intermediateResponses: []
          },
          creationTimestamp: Date.now()
        });
      } catch (error) {
        console.error(`Error running inference for ${evalCase.evalId}:`, error);
        // Record the failure as a model response so downstream evaluators
        // still receive an invocation for this turn.
        results.push({
          invocationId: `${evalCase.evalId}-${results.length}`,
          userContent: invocation.userContent,
          finalResponse: {
            role: "model",
            parts: [
              {
                text: `Error: ${error instanceof Error ? error.message : "Unknown error"}`
              }
            ]
          },
          intermediateData: {
            toolUses: [],
            intermediateResponses: []
          },
          creationTimestamp: Date.now()
        });
      }
    }
    return results;
  }
};
|
|
12670
|
+
|
|
12671
|
+
// src/evaluation/agent-evaluator.ts
|
|
12672
|
+
// src/evaluation/agent-evaluator.ts
// Default number of evaluation runs per eval set.
var NUM_RUNS = 2;
// Metric-name keys accepted in test_config.json criteria.
var TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */;
var RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
var RESPONSE_MATCH_SCORE_KEY = "response_match_score" /* RESPONSE_MATCH_SCORE */;
var SAFETY_V1_KEY = "safety_v1" /* SAFETY_V1 */;
var ALLOWED_CRITERIA = [
  TOOL_TRAJECTORY_SCORE_KEY,
  RESPONSE_EVALUATION_SCORE_KEY,
  RESPONSE_MATCH_SCORE_KEY,
  SAFETY_V1_KEY
];
// Column names used by the legacy (pre-EvalSet) test-file schema.
var QUERY_COLUMN = "query";
var REFERENCE_COLUMN = "reference";
var EXPECTED_TOOL_USE_COLUMN = "expected_tool_use";
// Criteria applied when a test folder has no test_config.json.
var DEFAULT_CRITERIA = {
  [TOOL_TRAJECTORY_SCORE_KEY]: 1,
  [RESPONSE_MATCH_SCORE_KEY]: 0.8
};
/**
 * Reads and parses a JSON file, wrapping any read/parse failure in a
 * single descriptive error.
 */
var loadJson = async (filePath) => {
  try {
    const raw = await fs2.readFile(filePath, "utf-8");
    return JSON.parse(raw);
  } catch (error) {
    throw new Error(`Failed to load JSON from ${filePath}: ${error}`);
  }
};
|
|
12698
|
+
var AgentEvaluator = class _AgentEvaluator {
|
|
12699
|
+
static async findConfigForTestFile(testFile) {
|
|
12700
|
+
const testFolder = path2.dirname(testFile);
|
|
12701
|
+
const configPath = path2.join(testFolder, "test_config.json");
|
|
12702
|
+
try {
|
|
12703
|
+
await fs2.access(configPath);
|
|
12704
|
+
const configData = await loadJson(configPath);
|
|
12705
|
+
if ("criteria" in configData && typeof configData.criteria === "object") {
|
|
12706
|
+
return configData.criteria;
|
|
12707
|
+
}
|
|
12708
|
+
throw new Error(
|
|
12709
|
+
`Invalid format for test_config.json at ${configPath}. Expected a 'criteria' dictionary.`
|
|
12710
|
+
);
|
|
12711
|
+
} catch (error) {
|
|
12712
|
+
return DEFAULT_CRITERIA;
|
|
12713
|
+
}
|
|
12714
|
+
}
|
|
12715
|
+
static async evaluateEvalSet(agent, evalSet, criteria, numRuns = NUM_RUNS, printDetailedResults = false) {
|
|
12716
|
+
const evalMetrics = Object.entries(criteria).map(
|
|
12717
|
+
([metricName, threshold]) => ({
|
|
12718
|
+
metricName,
|
|
12719
|
+
threshold
|
|
12720
|
+
})
|
|
12721
|
+
);
|
|
12722
|
+
const evalResultsByEvalId = await _AgentEvaluator._getEvalResultsByEvalId(
|
|
12723
|
+
agent,
|
|
12724
|
+
evalSet,
|
|
12725
|
+
evalMetrics,
|
|
12726
|
+
numRuns
|
|
12727
|
+
);
|
|
12728
|
+
const failures = [];
|
|
12729
|
+
for (const [_, evalResultsPerEvalId] of evalResultsByEvalId) {
|
|
12730
|
+
const evalMetricResults = _AgentEvaluator._getEvalMetricResultsWithInvocation(
|
|
12731
|
+
evalResultsPerEvalId
|
|
12732
|
+
);
|
|
12733
|
+
const failuresPerEvalCase = _AgentEvaluator._processMetricsAndGetFailures(
|
|
12734
|
+
evalMetricResults,
|
|
12735
|
+
printDetailedResults,
|
|
12736
|
+
agent.name || "Unknown Agent"
|
|
12737
|
+
);
|
|
12738
|
+
failures.push(...failuresPerEvalCase);
|
|
12739
|
+
}
|
|
12740
|
+
if (failures.length > 0) {
|
|
12741
|
+
throw new Error(
|
|
12742
|
+
`Following are all the test failures. If you looking to get more details on the failures, then please re-run this test with \`printDetailedResults\` set to \`true\`.
|
|
12743
|
+
${failures.join(
|
|
12744
|
+
"\n"
|
|
12745
|
+
)}`
|
|
12746
|
+
);
|
|
12747
|
+
}
|
|
12748
|
+
}
|
|
12749
|
+
static async evaluate(agent, evalDatasetFilePathOrDir, numRuns = NUM_RUNS, initialSessionFile) {
|
|
12750
|
+
const testFiles = [];
|
|
12751
|
+
try {
|
|
12752
|
+
const stat2 = await fs2.stat(evalDatasetFilePathOrDir);
|
|
12753
|
+
if (stat2.isDirectory()) {
|
|
12754
|
+
const files = await this._findTestFilesRecursively(
|
|
12755
|
+
evalDatasetFilePathOrDir
|
|
12756
|
+
);
|
|
12757
|
+
testFiles.push(...files);
|
|
12758
|
+
} else {
|
|
12759
|
+
testFiles.push(evalDatasetFilePathOrDir);
|
|
12760
|
+
}
|
|
12761
|
+
} catch (error) {
|
|
12762
|
+
throw new Error(`Invalid path: ${evalDatasetFilePathOrDir}`);
|
|
12763
|
+
}
|
|
12764
|
+
const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
|
|
12765
|
+
for (const testFile of testFiles) {
|
|
12766
|
+
const criteria = await _AgentEvaluator.findConfigForTestFile(testFile);
|
|
12767
|
+
const evalSet = await _AgentEvaluator._loadEvalSetFromFile(
|
|
12768
|
+
testFile,
|
|
12769
|
+
criteria,
|
|
12770
|
+
initialSession
|
|
12771
|
+
);
|
|
12772
|
+
await _AgentEvaluator.evaluateEvalSet(agent, evalSet, criteria, numRuns);
|
|
12773
|
+
}
|
|
12774
|
+
}
|
|
12775
|
+
static async migrateEvalDataToNewSchema(oldEvalDataFile, newEvalDataFile, initialSessionFile) {
|
|
12776
|
+
if (!oldEvalDataFile || !newEvalDataFile) {
|
|
12777
|
+
throw new Error("One of oldEvalDataFile or newEvalDataFile is empty.");
|
|
12778
|
+
}
|
|
12779
|
+
const criteria = await _AgentEvaluator.findConfigForTestFile(oldEvalDataFile);
|
|
12780
|
+
const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
|
|
12781
|
+
const evalSet = await _AgentEvaluator._getEvalSetFromOldFormat(
|
|
12782
|
+
oldEvalDataFile,
|
|
12783
|
+
criteria,
|
|
12784
|
+
initialSession
|
|
12785
|
+
);
|
|
12786
|
+
await fs2.writeFile(newEvalDataFile, JSON.stringify(evalSet, null, 2));
|
|
12787
|
+
}
|
|
12788
|
+
static async _findTestFilesRecursively(dir) {
|
|
12789
|
+
const testFiles = [];
|
|
12790
|
+
async function walk(currentDir) {
|
|
12791
|
+
const entries = await fs2.readdir(currentDir, { withFileTypes: true });
|
|
12792
|
+
for (const entry of entries) {
|
|
12793
|
+
const fullPath = path2.join(currentDir, entry.name);
|
|
12794
|
+
if (entry.isDirectory()) {
|
|
12795
|
+
await walk(fullPath);
|
|
12796
|
+
} else if (entry.name.endsWith(".test.json")) {
|
|
12797
|
+
testFiles.push(fullPath);
|
|
12798
|
+
}
|
|
12799
|
+
}
|
|
12800
|
+
}
|
|
12801
|
+
await walk(dir);
|
|
12802
|
+
return testFiles;
|
|
12803
|
+
}
|
|
12804
|
+
static async _loadEvalSetFromFile(evalSetFile, criteria, initialSession) {
|
|
12805
|
+
try {
|
|
12806
|
+
const content = await fs2.readFile(evalSetFile, "utf-8");
|
|
12807
|
+
try {
|
|
12808
|
+
const evalSet = JSON.parse(content);
|
|
12809
|
+
if (evalSet.evalSetId && evalSet.evalCases) {
|
|
12810
|
+
if (Object.keys(initialSession).length > 0) {
|
|
12811
|
+
throw new Error(
|
|
12812
|
+
"Initial session should be specified as a part of EvalSet file. Explicit initial session is only needed, when specifying data in the older schema."
|
|
12813
|
+
);
|
|
12814
|
+
}
|
|
12815
|
+
return evalSet;
|
|
12816
|
+
}
|
|
12817
|
+
} catch (parseError) {
|
|
12818
|
+
throw new Error(`Failed to parse eval set data: ${parseError}`);
|
|
12819
|
+
}
|
|
12820
|
+
} catch (error) {
|
|
12821
|
+
throw new Error(`Failed to process eval set file: ${error}`);
|
|
12822
|
+
}
|
|
12823
|
+
console.warn(
|
|
12824
|
+
`Contents of ${evalSetFile} appear to be in older format. To avoid this warning, please update your test files to contain data in EvalSet schema. You can use 'migrateEvalDataToNewSchema' for migrating your old test files.`
|
|
12825
|
+
);
|
|
12826
|
+
return _AgentEvaluator._getEvalSetFromOldFormat(
|
|
12827
|
+
evalSetFile,
|
|
12828
|
+
criteria,
|
|
12829
|
+
initialSession
|
|
12830
|
+
);
|
|
12831
|
+
}
|
|
12832
|
+
static async _getEvalSetFromOldFormat(evalSetFile, criteria, initialSession) {
|
|
12833
|
+
const data = await _AgentEvaluator._loadDataset(evalSetFile);
|
|
12834
|
+
_AgentEvaluator._validateInput(data, criteria);
|
|
12835
|
+
return {
|
|
12836
|
+
evalSetId: `eval-set-${Date.now()}`,
|
|
12837
|
+
name: evalSetFile,
|
|
12838
|
+
evalCases: data[0].map(
|
|
12839
|
+
(item, index) => ({
|
|
12840
|
+
evalId: `eval-${index}`,
|
|
12841
|
+
conversation: [
|
|
12842
|
+
{
|
|
12843
|
+
invocationId: `invocation-${index}`,
|
|
12844
|
+
userContent: {
|
|
12845
|
+
role: "user",
|
|
12846
|
+
parts: [{ text: item[QUERY_COLUMN] || "" }]
|
|
12847
|
+
},
|
|
12848
|
+
finalResponse: item[REFERENCE_COLUMN] ? {
|
|
12849
|
+
role: "model",
|
|
12850
|
+
parts: [{ text: item[REFERENCE_COLUMN] }]
|
|
12851
|
+
} : void 0,
|
|
12852
|
+
intermediateData: item[EXPECTED_TOOL_USE_COLUMN] ? {
|
|
12853
|
+
toolUses: item[EXPECTED_TOOL_USE_COLUMN],
|
|
12854
|
+
intermediateResponses: []
|
|
12855
|
+
} : void 0,
|
|
12856
|
+
creationTimestamp: Date.now()
|
|
12857
|
+
}
|
|
12858
|
+
],
|
|
12859
|
+
sessionInput: Object.keys(initialSession).length > 0 ? {
|
|
12860
|
+
appName: "test-app",
|
|
12861
|
+
userId: "test-user",
|
|
12862
|
+
state: initialSession
|
|
12863
|
+
} : void 0
|
|
12864
|
+
})
|
|
12865
|
+
),
|
|
12866
|
+
creationTimestamp: Date.now()
|
|
12867
|
+
};
|
|
12868
|
+
}
|
|
12869
|
+
static async _getInitialSession(initialSessionFile) {
|
|
12870
|
+
if (!initialSessionFile) {
|
|
12871
|
+
return {};
|
|
12872
|
+
}
|
|
12873
|
+
try {
|
|
12874
|
+
const content = await fs2.readFile(initialSessionFile, "utf-8");
|
|
12875
|
+
return JSON.parse(content);
|
|
12876
|
+
} catch (error) {
|
|
12877
|
+
throw new Error(
|
|
12878
|
+
`Failed to load initial session from ${initialSessionFile}: ${error}`
|
|
12879
|
+
);
|
|
12880
|
+
}
|
|
12881
|
+
}
|
|
12882
|
+
static async _loadDataset(inputData) {
|
|
12883
|
+
const stat2 = await fs2.stat(inputData);
|
|
12884
|
+
if (stat2.isDirectory()) {
|
|
12885
|
+
const testFiles = await this._findTestFilesRecursively(inputData);
|
|
12886
|
+
const results = await Promise.all(testFiles.map((f) => loadJson(f)));
|
|
12887
|
+
return results.map((r) => Array.isArray(r) ? r : [r]);
|
|
12888
|
+
}
|
|
12889
|
+
if (stat2.isFile()) {
|
|
12890
|
+
const data = await loadJson(inputData);
|
|
12891
|
+
return [Array.isArray(data) ? data : [data]];
|
|
12892
|
+
}
|
|
12893
|
+
throw new Error(`Invalid input path: ${inputData}`);
|
|
12894
|
+
}
|
|
12895
|
+
static _validateInput(evalDataset, criteria) {
|
|
12896
|
+
if (!evalDataset || evalDataset.length === 0) {
|
|
12897
|
+
throw new Error("The evaluation dataset is None or empty.");
|
|
12898
|
+
}
|
|
12899
|
+
for (const key of Object.keys(criteria)) {
|
|
12900
|
+
if (!ALLOWED_CRITERIA.includes(key)) {
|
|
12901
|
+
throw new Error(
|
|
12902
|
+
`Invalid criteria key: ${key}. Expected one of ${ALLOWED_CRITERIA.join(
|
|
12903
|
+
", "
|
|
12904
|
+
)}.`
|
|
12905
|
+
);
|
|
12906
|
+
}
|
|
12907
|
+
}
|
|
12908
|
+
const sample = evalDataset[0];
|
|
12909
|
+
if (!Array.isArray(sample) || sample.length === 0) {
|
|
12910
|
+
throw new Error("The evaluation dataset is empty.");
|
|
12911
|
+
}
|
|
12912
|
+
const firstQuery = sample[0];
|
|
12913
|
+
if (typeof firstQuery !== "object") {
|
|
12914
|
+
throw new Error(
|
|
12915
|
+
`Each evaluation dataset sample must be list of dictionary. But it's ${JSON.stringify(
|
|
12916
|
+
evalDataset
|
|
12917
|
+
)}`
|
|
12918
|
+
);
|
|
12919
|
+
}
|
|
12920
|
+
if (TOOL_TRAJECTORY_SCORE_KEY in criteria) {
|
|
12921
|
+
if (!(QUERY_COLUMN in firstQuery) || !(EXPECTED_TOOL_USE_COLUMN in firstQuery)) {
|
|
12922
|
+
throw new Error(
|
|
12923
|
+
`Samples for ${TOOL_TRAJECTORY_SCORE_KEY} must include '${QUERY_COLUMN}' and '${EXPECTED_TOOL_USE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
|
|
12924
|
+
);
|
|
12925
|
+
}
|
|
12926
|
+
}
|
|
12927
|
+
if (RESPONSE_EVALUATION_SCORE_KEY in criteria) {
|
|
12928
|
+
if (!(QUERY_COLUMN in firstQuery)) {
|
|
12929
|
+
throw new Error(
|
|
12930
|
+
`Samples for ${RESPONSE_EVALUATION_SCORE_KEY} must include '${QUERY_COLUMN}' key. The sample is ${JSON.stringify(sample)}.`
|
|
12931
|
+
);
|
|
12932
|
+
}
|
|
12933
|
+
}
|
|
12934
|
+
if (RESPONSE_MATCH_SCORE_KEY in criteria) {
|
|
12935
|
+
if (!(QUERY_COLUMN in firstQuery) || !(REFERENCE_COLUMN in firstQuery)) {
|
|
12936
|
+
throw new Error(
|
|
12937
|
+
`Samples for ${RESPONSE_MATCH_SCORE_KEY} must include '${QUERY_COLUMN}' and '${REFERENCE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
|
|
12938
|
+
);
|
|
12939
|
+
}
|
|
12940
|
+
}
|
|
12941
|
+
}
|
|
12942
|
+
static _printDetails(evalMetricResultWithInvocations, overallEvalStatus, overallScore, metricName = "", threshold = 0) {
|
|
12943
|
+
console.log(
|
|
12944
|
+
`Summary: \`${overallEvalStatus}\` for Metric: \`${metricName}\`. Expected threshold: \`${threshold}\`, actual value: \`${overallScore}\`.`
|
|
12945
|
+
);
|
|
12946
|
+
const data = evalMetricResultWithInvocations.map((per) => ({
|
|
12947
|
+
evalStatus: per.evalMetricResult.evalStatus,
|
|
12948
|
+
score: per.evalMetricResult.score,
|
|
12949
|
+
threshold,
|
|
12950
|
+
prompt: _AgentEvaluator._convertContentToText(
|
|
12951
|
+
per.expectedInvocation.userContent
|
|
12952
|
+
),
|
|
12953
|
+
expectedResponse: _AgentEvaluator._convertContentToText(
|
|
12954
|
+
per.expectedInvocation.finalResponse
|
|
12955
|
+
),
|
|
12956
|
+
actualResponse: _AgentEvaluator._convertContentToText(
|
|
12957
|
+
per.actualInvocation.finalResponse
|
|
12958
|
+
),
|
|
12959
|
+
expectedToolCalls: _AgentEvaluator._convertToolCallsToText(
|
|
12960
|
+
per.expectedInvocation.intermediateData
|
|
12961
|
+
),
|
|
12962
|
+
actualToolCalls: _AgentEvaluator._convertToolCallsToText(
|
|
12963
|
+
per.actualInvocation.intermediateData
|
|
12964
|
+
)
|
|
12965
|
+
}));
|
|
12966
|
+
console.table(data);
|
|
12967
|
+
console.log("\n\n");
|
|
12968
|
+
}
|
|
12969
|
+
static _convertContentToText(content) {
|
|
12970
|
+
if (content?.parts) {
|
|
12971
|
+
return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
|
|
12972
|
+
}
|
|
12973
|
+
return "";
|
|
12974
|
+
}
|
|
12975
|
+
static _convertToolCallsToText(intermediateData) {
|
|
12976
|
+
if (intermediateData?.toolUses) {
|
|
12977
|
+
return intermediateData.toolUses.map((t) => JSON.stringify(t)).join("\n");
|
|
12978
|
+
}
|
|
12979
|
+
return "";
|
|
12980
|
+
}
|
|
12981
|
+
static async _getEvalResultsByEvalId(agent, evalSet, evalMetrics, numRuns) {
|
|
12982
|
+
const evalService = new LocalEvalService(agent);
|
|
12983
|
+
const inferenceResults = [];
|
|
12984
|
+
for (let run = 0; run < numRuns; run++) {
|
|
12985
|
+
for await (const result of evalService.performInference({
|
|
12986
|
+
evalSetId: evalSet.evalSetId,
|
|
12987
|
+
evalCases: [evalSet]
|
|
12988
|
+
})) {
|
|
12989
|
+
inferenceResults.push(result);
|
|
12990
|
+
}
|
|
12991
|
+
}
|
|
12992
|
+
const evalResultsByEvalId = /* @__PURE__ */ new Map();
|
|
12993
|
+
for await (const evalResult of evalService.evaluate({
|
|
12994
|
+
inferenceResults,
|
|
12995
|
+
evaluateConfig: { evalMetrics }
|
|
12996
|
+
})) {
|
|
12997
|
+
for (const caseResult of evalResult.evalCaseResults) {
|
|
12998
|
+
const evalId = caseResult.evalId;
|
|
12999
|
+
if (!evalResultsByEvalId.has(evalId)) {
|
|
13000
|
+
evalResultsByEvalId.set(evalId, []);
|
|
13001
|
+
}
|
|
13002
|
+
evalResultsByEvalId.get(evalId).push(caseResult);
|
|
13003
|
+
}
|
|
13004
|
+
}
|
|
13005
|
+
return evalResultsByEvalId;
|
|
13006
|
+
}
|
|
13007
|
+
static _getEvalMetricResultsWithInvocation(evalResultsPerEvalId) {
|
|
13008
|
+
const evalMetricResults = {};
|
|
13009
|
+
for (const evalCaseResult of evalResultsPerEvalId) {
|
|
13010
|
+
for (const evalMetricsPerInvocation of evalCaseResult.evalMetricResultPerInvocation) {
|
|
13011
|
+
for (const evalMetricResult of evalMetricsPerInvocation.evalMetricResults) {
|
|
13012
|
+
const metricName = evalMetricResult.metricName;
|
|
13013
|
+
if (!(metricName in evalMetricResults)) {
|
|
13014
|
+
evalMetricResults[metricName] = [];
|
|
13015
|
+
}
|
|
13016
|
+
evalMetricResults[metricName].push({
|
|
13017
|
+
actualInvocation: evalMetricsPerInvocation.actualInvocation,
|
|
13018
|
+
expectedInvocation: evalMetricsPerInvocation.expectedInvocation,
|
|
13019
|
+
evalMetricResult
|
|
13020
|
+
});
|
|
13021
|
+
}
|
|
13022
|
+
}
|
|
13023
|
+
}
|
|
13024
|
+
return evalMetricResults;
|
|
13025
|
+
}
|
|
13026
|
+
static _processMetricsAndGetFailures(evalMetricResults, printDetailedResults, agentModule) {
|
|
13027
|
+
const failures = [];
|
|
13028
|
+
for (const [metricName, evalMetricResultsWithInvocations] of Object.entries(
|
|
13029
|
+
evalMetricResults
|
|
13030
|
+
)) {
|
|
13031
|
+
const threshold = evalMetricResultsWithInvocations[0]?.evalMetricResult.threshold || 0;
|
|
13032
|
+
const scores = evalMetricResultsWithInvocations.map((m) => m.evalMetricResult.score).filter((s) => s !== void 0);
|
|
13033
|
+
let overallScore;
|
|
13034
|
+
let overallEvalStatus;
|
|
13035
|
+
if (scores.length > 0) {
|
|
13036
|
+
overallScore = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
13037
|
+
overallEvalStatus = overallScore >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
|
|
13038
|
+
} else {
|
|
13039
|
+
overallScore = void 0;
|
|
13040
|
+
overallEvalStatus = 3 /* NOT_EVALUATED */;
|
|
13041
|
+
}
|
|
13042
|
+
if (overallEvalStatus !== 1 /* PASSED */) {
|
|
13043
|
+
if (printDetailedResults) {
|
|
13044
|
+
_AgentEvaluator._printDetails(
|
|
13045
|
+
evalMetricResultsWithInvocations,
|
|
13046
|
+
overallEvalStatus,
|
|
13047
|
+
overallScore,
|
|
13048
|
+
metricName,
|
|
13049
|
+
threshold
|
|
13050
|
+
);
|
|
13051
|
+
}
|
|
13052
|
+
failures.push(
|
|
13053
|
+
`${metricName} for ${agentModule} Failed. Expected ${threshold}, but got ${overallScore}.`
|
|
13054
|
+
);
|
|
13055
|
+
}
|
|
13056
|
+
}
|
|
13057
|
+
return failures;
|
|
13058
|
+
}
|
|
13059
|
+
};
|
|
13060
|
+
|
|
13061
|
+
// src/evaluation/final-response-match-v1.ts
|
|
13062
|
+
var RougeEvaluator = class extends Evaluator {
  // Metric configuration (threshold etc.) used when scoring invocations.
  evalMetric;
  /**
   * Evaluates the agent's final response against a golden final response
   * using the ROUGE-1 (unigram overlap) metric.
   * @param evalMetric Metric configuration for this evaluator.
   */
  constructor(evalMetric) {
    super(evalMetric);
    this.evalMetric = evalMetric;
  }
  /** Describes this metric: ROUGE-1 response match, scored in the closed interval [0, 1]. */
  static getMetricInfo() {
    return {
      metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
      description: "This metric evaluates if the agent's final response matches a golden/expected final response using Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
      metricValueInfo: {
        interval: {
          minValue: 0,
          maxValue: 1,
          openAtMin: false,
          openAtMax: false
        }
      }
    };
  }
  /**
   * Scores each actual invocation against its positionally-paired expected
   * invocation and averages the per-invocation ROUGE-1 f-measure.
   *
   * Fix: pairs are now bounded by the shorter of the two lists. Previously
   * the loop indexed `expectedInvocations[i]` for every actual invocation,
   * throwing a TypeError whenever `actualInvocations` was longer.
   *
   * @param actualInvocations Invocations produced by the agent.
   * @param expectedInvocations Golden invocations to compare against.
   * @returns Overall and per-invocation evaluation results; NOT_EVALUATED
   *   when there are no pairs to score.
   */
  async evaluateInvocations(actualInvocations, expectedInvocations) {
    let totalScore = 0;
    let numInvocations = 0;
    const perInvocationResults = [];
    const pairCount = Math.min(
      actualInvocations.length,
      expectedInvocations.length
    );
    for (let i = 0; i < pairCount; i++) {
      const actual = actualInvocations[i];
      const expected = expectedInvocations[i];
      const reference = getTextFromContent2(expected.finalResponse);
      const response = getTextFromContent2(actual.finalResponse);
      const rouge1Scores = await calculateRouge1Scores(response, reference);
      const score = rouge1Scores.fmeasure;
      perInvocationResults.push({
        actualInvocation: actual,
        expectedInvocation: expected,
        score,
        evalStatus: getEvalStatus2(score, this.evalMetric.threshold)
      });
      totalScore += score;
      numInvocations++;
    }
    if (perInvocationResults.length > 0) {
      const overallScore = totalScore / numInvocations;
      return {
        overallScore,
        overallEvalStatus: getEvalStatus2(
          overallScore,
          this.evalMetric.threshold
        ),
        perInvocationResults
      };
    }
    return {
      overallEvalStatus: 3 /* NOT_EVALUATED */,
      perInvocationResults: []
    };
  }
};
|
|
13119
|
+
/**
 * Joins the truthy `text` fields of a content object's parts with newlines.
 * Missing content or parts yields an empty string.
 *
 * @param content Optional content object with a `parts` array.
 * @returns Newline-joined text.
 */
function getTextFromContent2(content) {
  const parts = content?.parts;
  if (!parts) {
    return "";
  }
  const texts = [];
  for (const part of parts) {
    // Skip undefined, null, and empty-string texts (matches filter(Boolean)).
    if (part.text) {
      texts.push(part.text);
    }
  }
  return texts.join("\n");
}
|
|
13125
|
+
/**
 * Maps a score to an eval status: PASSED (1) when the score meets the
 * threshold, FAILED (2) otherwise.
 *
 * @param score Numeric score to check.
 * @param threshold Minimum passing score (inclusive).
 * @returns 1 (PASSED) or 2 (FAILED).
 */
function getEvalStatus2(score, threshold) {
  if (score >= threshold) {
    return 1 /* PASSED */;
  }
  return 2 /* FAILED */;
}
|
|
13128
|
+
/**
 * Computes ROUGE-1 scores over the sets of unique unigrams in the response
 * and reference: precision and recall of the token-set overlap plus the
 * harmonic-mean f-measure. Blank inputs score zero across the board.
 *
 * @param response Candidate text.
 * @param reference Golden text.
 * @returns { precision, recall, fmeasure } — each in [0, 1].
 */
function calculateRouge1Scores(response, reference) {
  if (!response.trim() || !reference.trim()) {
    return { precision: 0, recall: 0, fmeasure: 0 };
  }
  const responseUnigrams = new Set(tokenizeText(response));
  const referenceUnigrams = new Set(tokenizeText(reference));
  // Count unigrams shared by both sets.
  let overlap = 0;
  for (const token of responseUnigrams) {
    if (referenceUnigrams.has(token)) {
      overlap += 1;
    }
  }
  const precision = responseUnigrams.size > 0 ? overlap / responseUnigrams.size : 0;
  const recall = referenceUnigrams.size > 0 ? overlap / referenceUnigrams.size : 0;
  const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
  return { precision, recall, fmeasure };
}
|
|
13144
|
+
/**
 * Tokenizes text for ROUGE scoring: lowercases, replaces punctuation with
 * spaces, and splits on whitespace runs, dropping empty tokens.
 *
 * @param text Input string.
 * @returns Array of lowercase word tokens.
 */
function tokenizeText(text) {
  const normalized = text.toLowerCase().replace(/[^\w\s]/g, " ");
  const tokens = [];
  for (const candidate of normalized.split(/\s+/)) {
    if (candidate.length > 0) {
      tokens.push(candidate);
    }
  }
  return tokens;
}
|
|
13147
|
+
|
|
11755
13148
|
// src/version.ts
|
|
11756
13149
|
var VERSION = "0.1.0";
|
|
11757
13150
|
export {
|
|
11758
13151
|
AF_FUNCTION_CALL_ID_PREFIX,
|
|
11759
13152
|
LlmAgent as Agent,
|
|
11760
13153
|
AgentBuilder,
|
|
13154
|
+
AgentEvaluator,
|
|
11761
13155
|
AgentTool,
|
|
11762
13156
|
agents_exports as Agents,
|
|
11763
13157
|
AiSdkLlm,
|
|
@@ -11791,11 +13185,16 @@ export {
|
|
|
11791
13185
|
CodeExecutorContext,
|
|
11792
13186
|
DatabaseSessionService,
|
|
11793
13187
|
EnhancedAuthConfig,
|
|
13188
|
+
EvalResult,
|
|
13189
|
+
EvalStatus,
|
|
13190
|
+
evaluation_exports as Evaluation,
|
|
13191
|
+
Evaluator,
|
|
11794
13192
|
Event,
|
|
11795
13193
|
EventActions,
|
|
11796
13194
|
events_exports as Events,
|
|
11797
13195
|
ExitLoopTool,
|
|
11798
13196
|
FileOperationsTool,
|
|
13197
|
+
FinalResponseMatchV2Evaluator,
|
|
11799
13198
|
flows_exports as Flows,
|
|
11800
13199
|
FunctionTool,
|
|
11801
13200
|
GcsArtifactService,
|
|
@@ -11817,6 +13216,7 @@ export {
|
|
|
11817
13216
|
LlmResponse,
|
|
11818
13217
|
LoadArtifactsTool,
|
|
11819
13218
|
LoadMemoryTool,
|
|
13219
|
+
LocalEvalService,
|
|
11820
13220
|
LoopAgent,
|
|
11821
13221
|
McpAbi,
|
|
11822
13222
|
McpAtp,
|
|
@@ -11844,10 +13244,13 @@ export {
|
|
|
11844
13244
|
OpenIdConnectScheme,
|
|
11845
13245
|
ParallelAgent,
|
|
11846
13246
|
PlanReActPlanner,
|
|
13247
|
+
PrebuiltMetrics,
|
|
11847
13248
|
REQUEST_EUC_FUNCTION_CALL_NAME,
|
|
11848
13249
|
ReadonlyContext,
|
|
13250
|
+
RougeEvaluator,
|
|
11849
13251
|
RunConfig,
|
|
11850
13252
|
Runner,
|
|
13253
|
+
SafetyEvaluatorV1,
|
|
11851
13254
|
SequentialAgent,
|
|
11852
13255
|
sessions_exports as Sessions,
|
|
11853
13256
|
SingleFlow,
|
|
@@ -11856,6 +13259,7 @@ export {
|
|
|
11856
13259
|
TelemetryService,
|
|
11857
13260
|
ToolContext,
|
|
11858
13261
|
tools_exports as Tools,
|
|
13262
|
+
TrajectoryEvaluator,
|
|
11859
13263
|
TransferToAgentTool,
|
|
11860
13264
|
UserInteractionTool,
|
|
11861
13265
|
VERSION,
|