@iqai/adk 0.1.22 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +46 -0
- package/dist/index.d.mts +537 -346
- package/dist/index.d.ts +537 -346
- package/dist/index.js +1554 -206
- package/dist/index.mjs +1461 -113
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
@@ -833,70 +833,23 @@ ${instructions.join("\n\n")}`;
 
 // src/models/llm-response.ts
 var LlmResponse = class _LlmResponse {
-  /**
-   * Unique identifier for the response.
-   */
   id;
-  /**
-   * The content generated by the model.
-   */
+  text;
   content;
-  /**
-   * The grounding metadata of the response.
-   */
   groundingMetadata;
-  /**
-   * Indicates whether the text content is part of an unfinished text stream.
-   */
   partial;
-  /**
-   * Indicates whether the response from the model is complete.
-   */
   turnComplete;
-  /**
-   * Error code if the response is an error.
-   */
   errorCode;
-  /**
-   * Error message if the response is an error.
-   */
   errorMessage;
-  /**
-   * Flag indicating that LLM was interrupted when generating the content.
-   */
   interrupted;
-  /**
-   * The custom metadata of the LlmResponse.
-   */
   customMetadata;
-  /**
-   * The usage metadata of the LlmResponse.
-   */
   usageMetadata;
-  /**
-   * Index of the candidate response.
-   */
   candidateIndex;
-  /**
-   * Reason why the model finished generating.
-   */
   finishReason;
-  /**
-   * Error object if the response is an error.
-   */
   error;
-  /**
-   * Creates a new LlmResponse.
-   */
   constructor(data = {}) {
     Object.assign(this, data);
   }
-  /**
-   * Creates an LlmResponse from a GenerateContentResponse.
-   *
-   * @param generateContentResponse The GenerateContentResponse to create the LlmResponse from.
-   * @returns The LlmResponse.
-   */
   static create(generateContentResponse) {
     const usageMetadata = generateContentResponse.usageMetadata;
     if (generateContentResponse.candidates && generateContentResponse.candidates.length > 0) {
@@ -928,15 +881,6 @@ var LlmResponse = class _LlmResponse {
       usageMetadata
     });
   }
-  /**
-   * Creates an LlmResponse from an error.
-   *
-   * @param error The error object or message.
-   * @param options Additional options for the error response.
-   * @param options.errorCode A specific error code for the response.
-   * @param options.model The model that was being used when the error occurred.
-   * @returns The LlmResponse.
-   */
   static fromError(error, options = {}) {
     const errorMessage = error instanceof Error ? error.message : String(error);
     const errorCode = options.errorCode || "UNKNOWN_ERROR";
@@ -2675,30 +2619,16 @@ var OpenAiLlm = class extends BaseLlm {
 // src/models/llm-registry.ts
 init_logger();
 var LLMRegistry = class _LLMRegistry {
-  /**
-   * Map of model name regex to LLM class
-   */
   static llmRegistry = /* @__PURE__ */ new Map();
+  static modelInstances = /* @__PURE__ */ new Map();
   static logger = new Logger({ name: "LLMRegistry" });
-  /**
-   * Creates a new LLM instance
-   *
-   * @param model The model name
-   * @returns The LLM instance
-   */
   static newLLM(model) {
     const llmClass = _LLMRegistry.resolve(model);
     if (!llmClass) {
-      throw new Error(`No LLM found for model: ${model}`);
+      throw new Error(`No LLM class found for model: ${model}`);
     }
     return new llmClass(model);
   }
-  /**
-   * Resolves the LLM class from the model name
-   *
-   * @param model The model name
-   * @returns The LLM class
-   */
   static resolve(model) {
     for (const [regex, llmClass] of _LLMRegistry.llmRegistry.entries()) {
       if (regex.test(model)) {
@@ -2707,34 +2637,54 @@ var LLMRegistry = class _LLMRegistry {
     }
     return null;
   }
-  /**
-   * Registers a new LLM class
-   *
-   * @param modelNameRegex The regex to match model names
-   * @param llmClass The LLM class
-   */
   static register(modelNameRegex, llmClass) {
     _LLMRegistry.llmRegistry.set(new RegExp(modelNameRegex), llmClass);
   }
-  /**
-   * Registers all model patterns from an LLM class
-   *
-   * @param llmClass The LLM class
-   */
   static registerLLM(llmClass) {
     const modelPatterns = llmClass.supportedModels();
     for (const pattern of modelPatterns) {
       _LLMRegistry.register(pattern, llmClass);
     }
   }
-
-
-
+  static registerModel(name, model) {
+    _LLMRegistry.modelInstances.set(name, model);
+  }
+  static getModel(name) {
+    const model = _LLMRegistry.modelInstances.get(name);
+    if (!model) {
+      throw new Error(`Model '${name}' not found in registry`);
+    }
+    return model;
+  }
+  static hasModel(name) {
+    return _LLMRegistry.modelInstances.has(name);
+  }
+  static unregisterModel(name) {
+    _LLMRegistry.modelInstances.delete(name);
+  }
+  static getModelOrCreate(name) {
+    if (_LLMRegistry.hasModel(name)) {
+      return _LLMRegistry.getModel(name);
+    }
+    return _LLMRegistry.newLLM(name);
+  }
+  static clear() {
+    _LLMRegistry.llmRegistry.clear();
+    _LLMRegistry.modelInstances.clear();
+  }
+  static clearModels() {
+    _LLMRegistry.modelInstances.clear();
+  }
+  static clearClasses() {
+    _LLMRegistry.llmRegistry.clear();
+  }
   static logRegisteredModels() {
-    _LLMRegistry.
-
-      [..._LLMRegistry.llmRegistry.entries()].map(([regex]) => regex.toString())
+    const classPatterns = [..._LLMRegistry.llmRegistry.entries()].map(
+      ([regex]) => regex.toString()
     );
+    const instanceNames = [..._LLMRegistry.modelInstances.keys()];
+    _LLMRegistry.logger.debug("Registered LLM class patterns:", classPatterns);
+    _LLMRegistry.logger.debug("Registered LLM instances:", instanceNames);
   }
 };
 
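In practice the new instance registry sits next to the existing class-pattern registry. A minimal usage sketch (illustrative only; it assumes LLMRegistry is exported from the package entry point and that someConfiguredModel is a model instance you already built):

// Register a pre-built model instance under a name, then reuse it everywhere.
LLMRegistry.registerModel("my-judge", someConfiguredModel);
const m = LLMRegistry.getModelOrCreate("my-judge"); // returns the registered instance
const fresh = LLMRegistry.getModelOrCreate("gpt-4"); // no instance registered -> falls back to newLLM()
LLMRegistry.unregisterModel("my-judge");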
@@ -6582,9 +6532,23 @@ var BaseLlmFlow = class {
         yield event;
       }
     }
-    const tools = await agent.canonicalTools(
+    let tools = await agent.canonicalTools(
       new ReadonlyContext(invocationContext)
     );
+    if (tools.length > 1) {
+      const seen = /* @__PURE__ */ new Set();
+      const filtered = [];
+      for (const t of tools) {
+        const name = t?.name;
+        if (!name) continue;
+        if (seen.has(name)) {
+          continue;
+        }
+        seen.add(name);
+        filtered.push(t);
+      }
+      tools = filtered;
+    }
     for (const tool of tools) {
       const toolContext = new ToolContext(invocationContext);
       await tool.processLlmRequest(toolContext, llmRequest);
@@ -6740,7 +6704,42 @@ var BaseLlmFlow = class {
     }
     invocationContext.incrementLlmCallCount();
     const isStreaming = invocationContext.runConfig.streamingMode === "sse" /* SSE */;
-    const tools = llmRequest.config?.tools || [];
+    let tools = llmRequest.config?.tools || [];
+    if (tools.length) {
+      const deduped = [];
+      const seenFn = /* @__PURE__ */ new Set();
+      for (const t of tools) {
+        const tool = t;
+        if (tool && Array.isArray(tool.functionDeclarations)) {
+          const newFds = tool.functionDeclarations.filter(
+            (fd) => {
+              if (fd?.name) {
+                if (seenFn.has(fd.name)) {
+                  return false;
+                }
+                seenFn.add(fd.name);
+              }
+              return true;
+            }
+          );
+          if (newFds.length) {
+            deduped.push({ ...tool, functionDeclarations: newFds });
+          }
+        } else if (tool?.name) {
+          if (seenFn.has(tool.name)) continue;
+          seenFn.add(tool.name);
+          deduped.push(tool);
+        } else {
+          deduped.push(tool);
+        }
+      }
+      if (deduped.length !== tools.length) {
+        this.logger.debug(
+          `\u{1F501} Deduplicated tool/function declarations: ${tools.length} -> ${deduped.length}`
+        );
+      }
+      llmRequest.config.tools = tools = deduped;
+    }
     const toolNames = tools.map((tool) => {
       if (tool.functionDeclarations && Array.isArray(tool.functionDeclarations)) {
         return tool.functionDeclarations.map((fn) => fn.name).join(", ");
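Both dedup passes above keep the first occurrence of each tool or function-declaration name so duplicates never reach the provider. The same idea in isolation (standalone sketch, not the package's code):

// Keep the first tool seen under each name; unnamed entries are dropped,
// mirroring the canonical-tools filter above.
function dedupeByName(tools) {
  const seen = new Set();
  return tools.filter((t) => {
    const name = t?.name;
    if (!name || seen.has(name)) return false;
    seen.add(name);
    return true;
  });
}
// dedupeByName([{ name: "search" }, { name: "search" }, { name: "math" }])
// -> [{ name: "search" }, { name: "math" }]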
@@ -9555,6 +9554,7 @@ var LangGraphAgent = class extends BaseAgent {
 };
 
 // src/agents/agent-builder.ts
+init_logger();
 import { generateId } from "ai";
 
 // src/runners.ts
@@ -9668,19 +9668,19 @@ var InMemoryArtifactService = class {
   }
   async saveArtifact(args) {
     const { appName, userId, sessionId, filename, artifact } = args;
-    const
-    if (!this.artifacts.has(
-      this.artifacts.set(
+    const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+    if (!this.artifacts.has(path3)) {
+      this.artifacts.set(path3, []);
     }
-    const versions = this.artifacts.get(
+    const versions = this.artifacts.get(path3);
     const version = versions.length;
     versions.push(artifact);
     return version;
   }
   async loadArtifact(args) {
     const { appName, userId, sessionId, filename, version } = args;
-    const
-    const versions = this.artifacts.get(
+    const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+    const versions = this.artifacts.get(path3);
     if (!versions || versions.length === 0) {
       return null;
     }
@@ -9701,12 +9701,12 @@ var InMemoryArtifactService = class {
     const sessionPrefix = `${appName}/${userId}/${sessionId}/`;
     const userNamespacePrefix = `${appName}/${userId}/user/`;
     const filenames = [];
-    for (const
-      if (
-        const filename =
+    for (const path3 of this.artifacts.keys()) {
+      if (path3.startsWith(sessionPrefix)) {
+        const filename = path3.substring(sessionPrefix.length);
         filenames.push(filename);
-      } else if (
-        const filename =
+      } else if (path3.startsWith(userNamespacePrefix)) {
+        const filename = path3.substring(userNamespacePrefix.length);
         filenames.push(filename);
       }
     }
@@ -9714,16 +9714,16 @@ var InMemoryArtifactService = class {
   }
   async deleteArtifact(args) {
     const { appName, userId, sessionId, filename } = args;
-    const
-    if (!this.artifacts.has(
+    const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+    if (!this.artifacts.has(path3)) {
      return;
     }
-    this.artifacts.delete(
+    this.artifacts.delete(path3);
   }
   async listVersions(args) {
     const { appName, userId, sessionId, filename } = args;
-    const
-    const versions = this.artifacts.get(
+    const path3 = this.getArtifactPath(appName, userId, sessionId, filename);
+    const versions = this.artifacts.get(path3);
     if (!versions || versions.length === 0) {
       return [];
     }
@@ -10193,7 +10193,7 @@ var Runner = class {
       }
     };
     invokeRunAsync();
-    return function* () {
+    return (function* () {
      while (true) {
        while (queueIndex >= eventQueue.length && !asyncCompleted) {
        }
@@ -10206,7 +10206,7 @@ var Runner = class {
        }
        yield event;
      }
-    }();
+    })();
   }
   /**
    * Main entry method to run the agent in this runner.
@@ -10406,6 +10406,12 @@ var AgentBuilder = class _AgentBuilder {
   artifactService;
   agentType = "llm";
   existingSession;
+  existingAgent;
+  // If provided, reuse directly
+  definitionLocked = false;
+  // Lock further definition mutation after withAgent
+  warnedMethods = /* @__PURE__ */ new Set();
+  logger = new Logger({ name: "AgentBuilder" });
   /**
    * Private constructor - use static create() method
    */
@@ -10434,6 +10440,7 @@ var AgentBuilder = class _AgentBuilder {
    * @returns This builder instance for chaining
    */
   withModel(model) {
+    this.warnIfLocked("withModel");
     this.config.model = model;
     return this;
   }
@@ -10443,6 +10450,7 @@
    * @returns This builder instance for chaining
    */
   withDescription(description) {
+    this.warnIfLocked("withDescription");
     this.config.description = description;
     return this;
   }
@@ -10452,14 +10460,17 @@
    * @returns This builder instance for chaining
    */
   withInstruction(instruction) {
+    this.warnIfLocked("withInstruction");
     this.config.instruction = instruction;
     return this;
   }
   withInputSchema(schema) {
+    this.warnIfLocked("withInputSchema");
     this.config.inputSchema = schema;
     return this;
   }
   withOutputSchema(schema) {
+    this.warnIfLocked("withOutputSchema");
     this.config.outputSchema = schema;
     return this;
   }
@@ -10469,6 +10480,7 @@
    * @returns This builder instance for chaining
    */
   withTools(...tools) {
+    this.warnIfLocked("withTools");
     this.config.tools = [...this.config.tools || [], ...tools];
     return this;
   }
@@ -10478,6 +10490,7 @@
    * @returns This builder instance for chaining
    */
   withPlanner(planner) {
+    this.warnIfLocked("withPlanner");
     this.config.planner = planner;
     return this;
   }
@@ -10487,6 +10500,7 @@
    * @returns This builder instance for chaining
    */
   withCodeExecutor(codeExecutor) {
+    this.warnIfLocked("withCodeExecutor");
     this.config.codeExecutor = codeExecutor;
     return this;
   }
@@ -10496,6 +10510,7 @@
    * @returns This builder instance for chaining
    */
   withOutputKey(outputKey) {
+    this.warnIfLocked("withOutputKey");
     this.config.outputKey = outputKey;
     return this;
   }
@@ -10505,6 +10520,7 @@
    * @returns This builder instance for chaining
    */
   withSubAgents(subAgents) {
+    this.warnIfLocked("withSubAgents");
     this.config.subAgents = subAgents;
     return this;
   }
@@ -10514,6 +10530,7 @@
    * @returns This builder instance for chaining
    */
   withBeforeAgentCallback(callback) {
+    this.warnIfLocked("withBeforeAgentCallback");
     this.config.beforeAgentCallback = callback;
     return this;
   }
@@ -10523,15 +10540,29 @@
    * @returns This builder instance for chaining
    */
   withAfterAgentCallback(callback) {
+    this.warnIfLocked("withAfterAgentCallback");
     this.config.afterAgentCallback = callback;
     return this;
   }
+  /**
+   * Provide an already constructed agent instance. Further definition-mutating calls
+   * (model/tools/instruction/etc.) will be ignored with a dev warning.
+   */
+  withAgent(agent) {
+    this.existingAgent = agent;
+    this.definitionLocked = true;
+    if (this.config.name === "default_agent" && agent.name) {
+      this.config.name = agent.name;
+    }
+    return this;
+  }
   /**
    * Configure as a sequential agent
    * @param subAgents Sub-agents to execute in sequence
    * @returns This builder instance for chaining
    */
   asSequential(subAgents) {
+    this.warnIfLocked("asSequential");
     this.agentType = "sequential";
     this.config.subAgents = subAgents;
     return this;
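Typical use of the new withAgent path (sketch; prebuiltAgent is assumed to be an agent you constructed elsewhere — create() and build() are confirmed by this file):

const { runner } = await AgentBuilder.create("support_session")
  .withAgent(prebuiltAgent) // builder adopts prebuiltAgent.name while it still has the default name
  .withModel("gemini-2.5-flash") // ignored: definition is locked, logs a one-time dev warning
  .build();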
@@ -10542,6 +10573,7 @@
    * @returns This builder instance for chaining
    */
   asParallel(subAgents) {
+    this.warnIfLocked("asParallel");
     this.agentType = "parallel";
     this.config.subAgents = subAgents;
     return this;
@@ -10553,6 +10585,7 @@
    * @returns This builder instance for chaining
    */
   asLoop(subAgents, maxIterations = 3) {
+    this.warnIfLocked("asLoop");
     this.agentType = "loop";
     this.config.subAgents = subAgents;
     this.config.maxIterations = maxIterations;
@@ -10565,6 +10598,7 @@
    * @returns This builder instance for chaining
    */
   asLangGraph(nodes, rootNode) {
+    this.warnIfLocked("asLangGraph");
     this.agentType = "langgraph";
     this.config.nodes = nodes;
     this.config.rootNode = rootNode;
@@ -10691,6 +10725,7 @@
    * @returns Created agent instance
    */
   createAgent() {
+    if (this.existingAgent) return this.existingAgent;
     switch (this.agentType) {
       case "llm": {
         if (!this.config.model) {
@@ -10821,6 +10856,22 @@
       }
     };
   }
+  /**
+   * Warn (once per method) if the definition has been locked by withAgent().
+   */
+  warnIfLocked(method) {
+    if (!this.definitionLocked) return;
+    if (this.warnedMethods.has(method)) return;
+    this.warnedMethods.add(method);
+    if (process.env.NODE_ENV !== "production") {
+      const msg = `AgentBuilder: attempted to call ${method} after withAgent(); ignoring. (Wrap the agent first OR configure before withAgent).`;
+      if (this.logger && typeof this.logger.warn === "function") {
+        this.logger.warn(msg);
+      } else {
+        console.warn(msg);
+      }
+    }
+  }
 };
 
 // src/memory/index.ts
@@ -10985,14 +11036,14 @@ var VertexAiSessionService = class extends BaseSessionService {
   async listSessions(appName, userId) {
     const reasoningEngineId = this.getReasoningEngineId(appName);
     const apiClient = this.getApiClient();
-    let
+    let path3 = `reasoningEngines/${reasoningEngineId}/sessions`;
     if (userId) {
       const parsedUserId = encodeURIComponent(`"${userId}"`);
-
+      path3 = `${path3}?filter=user_id=${parsedUserId}`;
     }
     const apiResponse = await apiClient.async_request({
       http_method: "GET",
-      path:
+      path: path3,
       request_dict: {}
     });
     if (apiResponse.httpHeaders) {
@@ -11808,12 +11859,1299 @@ __export(flows_exports, {
   removeClientFunctionCallId: () => removeClientFunctionCallId
 });
 
+// src/evaluation/index.ts
+var evaluation_exports = {};
+__export(evaluation_exports, {
+  AgentEvaluator: () => AgentEvaluator,
+  EvalResult: () => EvalResult,
+  EvalStatus: () => EvalStatus,
+  Evaluator: () => Evaluator,
+  FinalResponseMatchV2Evaluator: () => FinalResponseMatchV2Evaluator,
+  LocalEvalService: () => LocalEvalService,
+  PrebuiltMetrics: () => PrebuiltMetrics,
+  RougeEvaluator: () => RougeEvaluator,
+  SafetyEvaluatorV1: () => SafetyEvaluatorV1,
+  TrajectoryEvaluator: () => TrajectoryEvaluator
+});
+
+// src/evaluation/evaluator.ts
+var EvalStatus = /* @__PURE__ */ ((EvalStatus2) => {
+  EvalStatus2[EvalStatus2["PASSED"] = 1] = "PASSED";
+  EvalStatus2[EvalStatus2["FAILED"] = 2] = "FAILED";
+  EvalStatus2[EvalStatus2["NOT_EVALUATED"] = 3] = "NOT_EVALUATED";
+  return EvalStatus2;
+})(EvalStatus || {});
+var Evaluator = class {
+  constructor(metric) {
+    this.metric = metric;
+  }
+  static getMetricInfo(metricName) {
+    throw new Error("getMetricInfo() must be implemented by subclass");
+  }
+};
+
+// src/evaluation/eval-metrics.ts
+var PrebuiltMetrics = /* @__PURE__ */ ((PrebuiltMetrics2) => {
+  PrebuiltMetrics2["TOOL_TRAJECTORY_AVG_SCORE"] = "tool_trajectory_avg_score";
+  PrebuiltMetrics2["RESPONSE_EVALUATION_SCORE"] = "response_evaluation_score";
+  PrebuiltMetrics2["RESPONSE_MATCH_SCORE"] = "response_match_score";
+  PrebuiltMetrics2["SAFETY_V1"] = "safety_v1";
+  PrebuiltMetrics2["FINAL_RESPONSE_MATCH_V2"] = "final_response_match_v2";
+  PrebuiltMetrics2["TOOL_TRAJECTORY_SCORE"] = "tool_trajectory_score";
+  PrebuiltMetrics2["SAFETY"] = "safety";
+  PrebuiltMetrics2["RESPONSE_MATCH"] = "response_match";
+  return PrebuiltMetrics2;
+})(PrebuiltMetrics || {});
+
+// src/evaluation/eval-result.ts
+var EvalResult = class {
+  evalSetResultId;
+  evalSetResultName;
+  evalSetId;
+  evalCaseResults;
+  creationTimestamp;
+  constructor(init) {
+    this.evalSetResultId = init.evalSetResultId || "";
+    this.evalSetResultName = init.evalSetResultName;
+    this.evalSetId = init.evalSetId || "";
+    this.evalCaseResults = init.evalCaseResults || [];
+    this.creationTimestamp = init.creationTimestamp || Date.now() / 1e3;
+  }
+};
+
+// src/evaluation/agent-evaluator.ts
+import * as fs2 from "fs/promises";
+import * as path2 from "path";
+
+// src/evaluation/base-eval-service.ts
+var BaseEvalService = class {
+  async *evaluateSession(session) {
+    const inferenceResults = [];
+    for await (const result of this.performInference({
+      evalSetId: session.evalSetId,
+      evalCases: session.evalCases
+    })) {
+      inferenceResults.push(result);
+    }
+    for await (const result of this.evaluate({
+      inferenceResults,
+      evaluateConfig: session.evaluateConfig
+    })) {
+      yield result;
+    }
+  }
+};
+
+// src/evaluation/vertex-ai-eval-facade.ts
+var ERROR_MESSAGE_SUFFIX = `
+You should specify both project id and location. This metric uses Vertex Gen AI
+Eval SDK, and it requires google cloud credentials.
+
+If using an .env file add the values there, or explicitly set in the code using
+the template below:
+
+process.env.GOOGLE_CLOUD_LOCATION = <LOCATION>
+process.env.GOOGLE_CLOUD_PROJECT = <PROJECT ID>
+`;
+var VertexAiEvalFacade = class _VertexAiEvalFacade {
+  threshold;
+  metricName;
+  constructor(config) {
+    this.threshold = config.threshold;
+    this.metricName = config.metricName;
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    let totalScore = 0;
+    let numInvocations = 0;
+    const perInvocationResults = [];
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      const prompt = this._getText(expected.userContent);
+      const reference = this._getText(expected.finalResponse);
+      const response = this._getText(actual.finalResponse);
+      const evalCase = {
+        prompt,
+        reference,
+        response
+      };
+      try {
+        const evalCaseResult = await _VertexAiEvalFacade._performEval(
+          [evalCase],
+          [this.metricName]
+        );
+        const score = this._getScore(evalCaseResult);
+        perInvocationResults.push({
+          actualInvocation: actual,
+          expectedInvocation: expected,
+          score,
+          evalStatus: this._getEvalStatus(score)
+        });
+        if (score !== null && score !== void 0) {
+          totalScore += score;
+          numInvocations++;
+        }
+      } catch (error) {
+        console.error("Error evaluating invocation:", error);
+        perInvocationResults.push({
+          actualInvocation: actual,
+          expectedInvocation: expected,
+          score: void 0,
+          evalStatus: 3 /* NOT_EVALUATED */
+        });
+      }
+    }
+    if (perInvocationResults.length > 0) {
+      const overallScore = numInvocations > 0 ? totalScore / numInvocations : void 0;
+      return {
+        overallScore,
+        overallEvalStatus: this._getEvalStatus(overallScore),
+        perInvocationResults
+      };
+    }
+    return {
+      overallScore: void 0,
+      overallEvalStatus: 3 /* NOT_EVALUATED */,
+      perInvocationResults: []
+    };
+  }
+  _getText(content) {
+    if (content?.parts) {
+      return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
+    }
+    return "";
+  }
+  _getScore(evalResult) {
+    if (evalResult?.summaryMetrics?.[0]?.meanScore !== void 0 && typeof evalResult.summaryMetrics[0].meanScore === "number" && !Number.isNaN(evalResult.summaryMetrics[0].meanScore)) {
+      return evalResult.summaryMetrics[0].meanScore;
+    }
+    return void 0;
+  }
+  _getEvalStatus(score) {
+    if (score !== null && score !== void 0) {
+      return score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+    }
+    return 3 /* NOT_EVALUATED */;
+  }
+  static async _performEval(dataset, metrics) {
+    const projectId = process.env.GOOGLE_CLOUD_PROJECT;
+    const location = process.env.GOOGLE_CLOUD_LOCATION;
+    if (!projectId) {
+      throw new Error(`Missing project id. ${ERROR_MESSAGE_SUFFIX}`);
+    }
+    if (!location) {
+      throw new Error(`Missing location. ${ERROR_MESSAGE_SUFFIX}`);
+    }
+    console.warn(
+      "Vertex AI evaluation is not fully implemented. Using mock response."
+    );
+    return {
+      summaryMetrics: [
+        {
+          meanScore: Math.random() * 0.5 + 0.5
+        }
+      ]
+    };
+  }
+};
+
+// src/evaluation/response-evaluator.ts
+var ResponseEvaluator = class extends Evaluator {
+  metricName;
+  threshold;
+  constructor(evalMetric) {
+    super(evalMetric);
+    if (evalMetric.metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
+      this.metricName = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
+    } else if (evalMetric.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
+      this.metricName = "response_match_score" /* RESPONSE_MATCH_SCORE */;
+    } else {
+      throw new Error(`Metric ${evalMetric.metricName} is not supported.`);
+    }
+    this.threshold = evalMetric.threshold;
+  }
+  static getMetricInfo(metricName) {
+    if (metricName === "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */) {
+      return {
+        metricName: "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */,
+        description: "This metric evaluates how coherent agent's response was. Value range of this metric is [1,5], with values closer to 5 more desirable.",
+        metricValueInfo: {
+          interval: {
+            minValue: 1,
+            maxValue: 5,
+            openAtMin: false,
+            openAtMax: false
+          }
+        }
+      };
+    }
+    if (metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
+      return {
+        metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
+        description: "This metric evaluates if agent's final response matches a golden/expected final response using Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
+        metricValueInfo: {
+          interval: {
+            minValue: 0,
+            maxValue: 1,
+            openAtMin: false,
+            openAtMax: false
+          }
+        }
+      };
+    }
+    throw new Error(`Metric ${metricName} is not supported.`);
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    if (this.metricName === "response_match_score" /* RESPONSE_MATCH_SCORE */) {
+      return this.evaluateRougeScore(actualInvocations, expectedInvocations);
+    }
+    const vertexAiFacade = new VertexAiEvalFacade({
+      threshold: this.threshold,
+      metricName: this.metricName
+    });
+    return vertexAiFacade.evaluateInvocations(
+      actualInvocations,
+      expectedInvocations
+    );
+  }
+  async evaluateRougeScore(actualInvocations, expectedInvocations) {
+    if (actualInvocations.length !== expectedInvocations.length) {
+      throw new Error("Number of actual and expected invocations must match");
+    }
+    const results = [];
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      const result = await this.evaluateInvocation(actual, expected);
+      results.push(result);
+    }
+    const scores = results.map((r) => r.score).filter((s) => s !== void 0);
+    const overallScore = scores.length > 0 ? scores.reduce((a, b) => a + b, 0) / scores.length : void 0;
+    const overallStatus = overallScore !== void 0 && overallScore >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+    return {
+      overallScore,
+      overallEvalStatus: overallStatus,
+      perInvocationResults: results
+    };
+  }
+  async evaluateInvocation(actual, expected) {
+    if (!actual.finalResponse || !expected.finalResponse) {
+      return {
+        actualInvocation: actual,
+        expectedInvocation: expected,
+        evalStatus: 3 /* NOT_EVALUATED */
+      };
+    }
+    const score = await this.computeRougeScore(
+      actual.finalResponse,
+      expected.finalResponse
+    );
+    return {
+      actualInvocation: actual,
+      expectedInvocation: expected,
+      score,
+      evalStatus: score >= this.threshold ? 1 /* PASSED */ : 2 /* FAILED */
+    };
+  }
+  async computeRougeScore(actual, expected) {
+    const actualText = this.extractText(actual);
+    const expectedText = this.extractText(expected);
+    if (!actualText.trim() || !expectedText.trim()) {
+      return 0;
+    }
+    const actualTokens = this.tokenizeText(actualText);
+    const expectedTokens = this.tokenizeText(expectedText);
+    const actualUnigrams = new Set(actualTokens);
+    const expectedUnigrams = new Set(expectedTokens);
+    const commonUnigrams = new Set(
+      [...actualUnigrams].filter((token) => expectedUnigrams.has(token))
+    );
+    const precision = actualUnigrams.size > 0 ? commonUnigrams.size / actualUnigrams.size : 0;
+    const recall = expectedUnigrams.size > 0 ? commonUnigrams.size / expectedUnigrams.size : 0;
+    const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
+    return fmeasure;
+  }
+  extractText(content) {
+    if (content?.parts) {
+      return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join(" ");
+    }
+    return "";
+  }
+  tokenizeText(text) {
+    return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
+  }
+};
+
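computeRougeScore above is a ROUGE-1-style F-measure over unigram sets: precision = |common| / |actual unigrams|, recall = |common| / |expected unigrams|, F = 2PR / (P + R). A hand-computed example (not package code):

// actual "the cat sat" -> {the, cat, sat}; expected "the cat slept" -> {the, cat, slept}
// common = {the, cat}, so precision = 2/3 and recall = 2/3
const precision = 2 / 3;
const recall = 2 / 3;
const f = (2 * precision * recall) / (precision + recall); // = 2/3 ≈ 0.667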
+// src/evaluation/trajectory-evaluator.ts
+var TrajectoryEvaluator = class extends Evaluator {
+  static getMetricInfo() {
+    return {
+      metricName: "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */,
+      description: "This metric compares two tool call trajectories (expected vs. actual) for the same user interaction. It performs an exact match on the tool name and arguments for each step in the trajectory. A score of 1.0 indicates a perfect match, while 0.0 indicates a mismatch. Higher values are better.",
+      metricValueInfo: {
+        interval: {
+          minValue: 0,
+          maxValue: 1,
+          openAtMin: false,
+          openAtMax: false
+        }
+      }
+    };
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    let totalToolUseAccuracy = 0;
+    let numInvocations = 0;
+    const perInvocationResults = [];
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      if (!actual.intermediateData?.toolUses || !expected.intermediateData?.toolUses) {
+        perInvocationResults.push({
+          actualInvocation: actual,
+          expectedInvocation: expected,
+          evalStatus: 3 /* NOT_EVALUATED */
+        });
+        continue;
+      }
+      const toolUseAccuracy = this.areToolCallsEqual(
+        actual.intermediateData.toolUses,
+        expected.intermediateData.toolUses
+      ) ? 1 : 0;
+      perInvocationResults.push({
+        actualInvocation: actual,
+        expectedInvocation: expected,
+        score: toolUseAccuracy,
+        evalStatus: toolUseAccuracy >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */
+      });
+      totalToolUseAccuracy += toolUseAccuracy;
+      numInvocations++;
+    }
+    const overallScore = numInvocations > 0 ? totalToolUseAccuracy / numInvocations : 0;
+    return {
+      overallScore,
+      overallEvalStatus: overallScore >= this.metric.threshold ? 1 /* PASSED */ : 2 /* FAILED */,
+      perInvocationResults
+    };
+  }
+  areToolCallsEqual(actual, expected) {
+    if (actual.length !== expected.length) {
+      return false;
+    }
+    return actual.every((actualCall, index) => {
+      const expectedCall = expected[index];
+      return this.isToolCallEqual(actualCall, expectedCall);
+    });
+  }
+  isToolCallEqual(actual, expected) {
+    if (actual.name !== expected.name) {
+      return false;
+    }
+    const actualArgs = actual.args || {};
+    const expectedArgs = expected.args || {};
+    const actualKeys = Object.keys(actualArgs).sort();
+    const expectedKeys = Object.keys(expectedArgs).sort();
+    if (actualKeys.length !== expectedKeys.length) {
+      return false;
+    }
+    return actualKeys.every((key, index) => {
+      const expectedKey = expectedKeys[index];
+      if (key !== expectedKey) {
+        return false;
+      }
+      return JSON.stringify(actualArgs[key]) === JSON.stringify(expectedArgs[key]);
+    });
+  }
+};
+
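TrajectoryEvaluator above scores each invocation 1 or 0: an order-sensitive exact match on tool name plus per-key JSON-stringified args. Illustrative cases (values invented):

// Scores 1: identical names, args, and order.
//   actual:   [{ name: "get_weather", args: { city: "Paris" } }]
//   expected: [{ name: "get_weather", args: { city: "Paris" } }]
// Scores 0: any difference — e.g. args: { city: "paris" } (case differs),
//   an extra or missing call, or the same calls in a different order.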
+// src/evaluation/safety-evaluator.ts
+var SafetyEvaluatorV1 = class extends Evaluator {
+  static getMetricInfo() {
+    return {
+      metricName: "safety_v1" /* SAFETY_V1 */,
+      description: "This metric evaluates the safety (harmlessness) of an Agent's Response. Value range of the metric is [0, 1], with values closer to 1 to be more desirable (safe).",
+      metricValueInfo: {
+        interval: {
+          minValue: 0,
+          maxValue: 1,
+          openAtMin: false,
+          openAtMax: false
+        }
+      }
+    };
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    const facade = new VertexAiEvalFacade({
+      threshold: this.metric.threshold,
+      metricName: "safety_v1" /* SAFETY_V1 */
+    });
+    return await facade.evaluateInvocations(
+      actualInvocations,
+      expectedInvocations
+    );
+  }
+};
+
+// src/evaluation/llm-as-judge-utils.ts
+function getTextFromContent(content) {
+  if (content?.parts) {
+    return content.parts.map((part) => part.text).filter(Boolean).join("\n");
+  }
+  return "";
+}
+function getEvalStatus(score, threshold) {
+  return score >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+}
+
+// src/evaluation/llm-as-judge.ts
+var LlmAsJudge = class {
+  async sampleJudge(prompt, numSamples, critiqueParser, judgeModelOptions) {
+    const modelName = judgeModelOptions?.judgeModel || "gemini-2.5-flash";
+    const model = LLMRegistry.getModelOrCreate(modelName);
+    const config = judgeModelOptions?.judgeModelConfig || {};
+    const samples = [];
+    for (let i = 0; i < numSamples; i++) {
+      try {
+        const response = await model.generateContent({
+          prompt,
+          ...config
+        });
+        const label = critiqueParser(response.text);
+        if (label !== "not_found" /* NOT_FOUND */) {
+          samples.push(label);
+        }
+      } catch (error) {
+        console.error("Error sampling judge model:", error);
+      }
+    }
+    return samples;
+  }
+};
+
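sampleJudge resolves the judge model through LLMRegistry.getModelOrCreate, queries it numSamples times, and keeps only labels the parser recognizes. Calling it directly might look like this (sketch only; the inline parser is a stand-in, not the package's parseCritique):

const judge = new LlmAsJudge();
const labels = await judge.sampleJudge(
  "Rate the agent response...",
  3, // numSamples
  (text) => (/\binvalid\b/.test(text) ? "invalid" : "valid"), // stand-in parser
  { judgeModel: "gemini-2.5-flash" }
);
// e.g. ["valid", "valid"] if one of the three samples threw or failed to parse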
+// src/evaluation/final-response-match-v2.ts
+var FINAL_RESPONSE_MATCH_V2_PROMPT = `You are an expert rater for an AI agent. The AI agent is going to call an API to answer the user query and generate API tool use code based for the choice of the API and API arguments. The ideal model response should be a function call that fulfills user query, or a natural language response hedges or asks users for further clarification if a function call does not apply.
+The primary focus of this rating task is to check correctness of the model responses.
+
+The data consists of:
+- A user query.
+- A model generated response for the prompt. The responses can consist of:
+- Natural language, when the model is asking for clarification, or tells the user it does not possess the requested functionality / option.
+- Code, in the form of one or multiple python function calls, and additional code as needed, for when the model is fulfilling the user request.
+You can use the help from a reference response annotated by a human rater. This reference response is of high quality. You can compare the agent's response with the reference response and decide if the agent's response is valid.
+Note sometimes the reference response only contains the key entities of the correct answer and you need to be flexible to allow the agent response to contain more information than the reference response, or to present the key entities in a different format or structure or in shorter or longer format.
+When the agent response is provided in the form of tables/dataframes or should be best provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response. Likewise, if you have the reference response, then find out the key entities and main components in them and check whether you can retrieve those from the agent response. If the prompt does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
+
+You should follow the constitutions below very carefully to rate the model response:
+- Allow flexibility of format even when reference code only uses one of the possible format, unless API spec or user prompt has explicit format requirement
+- e.g. For state name, allow both abbreviation and full name unless API spec has explicit requirement. e.g. both 'tx' and 'Texas' should be allowed in the agent response even when reference code only uses one of them.
+- e.g. If a reference response list outputs in a list format, the agent response is allowed to use sentence format and vice versa unless user prompt explicitly asks for a specific format.
+- e.g. For numbers, allow flexibility of formatting, e.g. 1000000 vs 1,000,000.
+- The model shouldn't assume that it doesn't have access to according data or incapable of answering the question if reference response is able to find a legit answer.
+- If the model response contains the correct final answer, rate it as valid even when the model response contains more information than the reference response.
+- If the user prompt has csv or other table format data, don't read it yourself. Trust the reference response final answer instead.
+- When the validation needs maths, date calculations, do not use your own calculator. Trust the reference response final answer instead.
+- Be mindful about unit of numbers. For example, if the reference response says 100 miles, but the model response says 100 km, it is invalid.
+- When the agent response or the reference response is provided in the form of tables/dataframes: focus on the key entities and main components requested in the user query and check whether you can retrieve those from the agent response and whether those match the reference response. If the user query does not specify any format instructions and the main items/components are included in the response then tolerate the differences in the formatting of those tables/dataframes.
+- When the answer is in numeric format, check whether there are any format requirements in the numeric format, rounding, precision, number of decimals, etc. specified in the user query and the prompt. If there are no such instructions, then tolerate different numerical formats.
+- When the answer is in numeric format and there are rounding or precision differences between the agent response and the reference response, if no further instructions are provided evaluate if the rounding strategy or precision in the agent response follows the standards for that entity. For instance, model accuracy scores must be reported with at least two decimal places (e.g., 0.798 \u2192 0.80 is acceptable, but 0.7 is not).
+
+Below are the inputs:
+{{
+"User prompt": {prompt},
+"Agent response": {response},
+"Reference response": {golden_response},
+}}
+
+The answer should be a json alone which follows the json structure below:
+{{
+"reasoning": [reasoning],
+"is_the_agent_response_valid": [valid or invalid],
+}}
+Answer with assertiveness:
+`;
+var DEFAULT_NUM_SAMPLES = 5;
+function parseCritique(response) {
+  const labelMatchIsResponseValid = response.match(
+    /"is_the_agent_response_valid":\s*\[*[\n\s]*"*([^"^\]^\s]*)"*[\n\s]*\]*\s*[,\n\}]/
+  );
+  if (labelMatchIsResponseValid?.[1]) {
+    const label = labelMatchIsResponseValid[1].toLowerCase();
+    return label === "valid" ? "valid" /* VALID */ : "invalid" /* INVALID */;
+  }
+  return "not_found" /* NOT_FOUND */;
+}
+var FinalResponseMatchV2Evaluator = class extends Evaluator {
+  constructor(evalMetric, llmAsJudge = new LlmAsJudge()) {
+    super(evalMetric);
+    this.llmAsJudge = llmAsJudge;
+  }
+  static getMetricInfo() {
+    return {
+      metricName: "final_response_match_v2" /* FINAL_RESPONSE_MATCH_V2 */,
+      description: "This metric evaluates if the agent's final response matches a golden/expected final response using an LLM judge. Value range for this metric is [0,1], with values closer to 1 more desirable.",
+      metricValueInfo: {
+        interval: {
+          minValue: 0,
+          maxValue: 1,
+          openAtMin: false,
+          openAtMax: false
+        }
+      }
+    };
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    const perInvocationResults = [];
+    let totalScore = 0;
+    let numInvocations = 0;
+    if (!actualInvocations.length) {
+      return {
+        overallEvalStatus: 3 /* NOT_EVALUATED */,
+        perInvocationResults: []
+      };
+    }
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      const prompt = getTextFromContent(expected.userContent);
+      const response = getTextFromContent(actual.finalResponse);
+      const goldenResponse = getTextFromContent(expected.finalResponse);
+      const formattedPrompt = FINAL_RESPONSE_MATCH_V2_PROMPT.replace(
+        "{prompt}",
+        prompt
+      ).replace("{response}", response).replace("{golden_response}", goldenResponse);
+      const numSamples = this.metric.judgeModelOptions?.numSamples ?? DEFAULT_NUM_SAMPLES;
+      const labels = await this.llmAsJudge.sampleJudge(
+        formattedPrompt,
+        numSamples,
+        parseCritique,
+        this.metric.judgeModelOptions
+      );
+      const score = labels.filter((l) => l === "valid" /* VALID */).length / labels.length;
+      perInvocationResults.push({
+        actualInvocation: actual,
+        expectedInvocation: expected,
+        score,
+        evalStatus: getEvalStatus(score, this.metric.threshold)
+      });
+      totalScore += score;
+      numInvocations++;
+    }
+    const overallScore = totalScore / numInvocations;
+    return {
+      overallScore,
+      overallEvalStatus: getEvalStatus(overallScore, this.metric.threshold),
+      perInvocationResults
+    };
+  }
+};
+
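FinalResponseMatchV2's score is simply the fraction of judge samples labeled valid, thresholded per the metric. The registry that follows maps metric names to evaluator classes; registering a custom evaluator would look roughly like this (sketch; MyEvaluator is hypothetical, and it assumes the registry instance is reachable from your code):

class MyEvaluator extends Evaluator {
  static getMetricInfo() {
    return {
      metricName: "my_metric", // hypothetical metric name
      description: "Example custom metric.",
      metricValueInfo: {
        interval: { minValue: 0, maxValue: 1, openAtMin: false, openAtMax: false }
      }
    };
  }
  async evaluateInvocations(actual, expected) {
    return { overallScore: 1, overallEvalStatus: 1 /* PASSED */, perInvocationResults: [] };
  }
}
DEFAULT_METRIC_EVALUATOR_REGISTRY.registerEvaluator(MyEvaluator.getMetricInfo(), MyEvaluator);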
+// src/evaluation/metric-evaluator-registry.ts
+var MetricEvaluatorRegistry = class {
+  registry = /* @__PURE__ */ new Map();
+  getEvaluator(evalMetric) {
+    const entry = this.registry.get(evalMetric.metricName);
+    if (!entry) {
+      throw new Error(`${evalMetric.metricName} not found in registry.`);
+    }
+    return new entry.evaluator(evalMetric);
+  }
+  registerEvaluator(metricInfo, evaluator) {
+    const metricName = metricInfo.metricName;
+    if (this.registry.has(metricName)) {
+      console.info(
+        `Updating Evaluator class for ${metricName} from ${this.registry.get(metricName)?.evaluator.name} to ${evaluator.name}`
+      );
+    }
+    this.registry.set(metricName, {
+      evaluator,
+      metricInfo: { ...metricInfo }
+    });
+  }
+  getRegisteredMetrics() {
+    return Array.from(this.registry.values()).map((entry) => ({
+      ...entry.metricInfo
+    }));
+  }
+};
+function getDefaultMetricEvaluatorRegistry() {
+  const registry = new MetricEvaluatorRegistry();
+  registry.registerEvaluator(
+    TrajectoryEvaluator.getMetricInfo(),
+    TrajectoryEvaluator
+  );
+  registry.registerEvaluator(
+    ResponseEvaluator.getMetricInfo("response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */),
+    ResponseEvaluator
+  );
+  registry.registerEvaluator(
+    ResponseEvaluator.getMetricInfo("response_match_score" /* RESPONSE_MATCH_SCORE */),
+    ResponseEvaluator
+  );
+  registry.registerEvaluator(
+    SafetyEvaluatorV1.getMetricInfo(),
+    SafetyEvaluatorV1
+  );
+  registry.registerEvaluator(
+    FinalResponseMatchV2Evaluator.getMetricInfo(),
+    FinalResponseMatchV2Evaluator
+  );
+  return registry;
+}
+var DEFAULT_METRIC_EVALUATOR_REGISTRY = getDefaultMetricEvaluatorRegistry();
+
+// src/evaluation/local-eval-service.ts
+var LocalEvalService = class extends BaseEvalService {
+  constructor(agent, parallelism = 4) {
+    super();
+    this.agent = agent;
+    this.parallelism = parallelism;
+    this.initializeRunner();
+  }
+  runner;
+  async initializeRunner() {
+    if ("ask" in this.agent) {
+      this.runner = this.agent;
+    } else {
+      try {
+        const { runner } = await AgentBuilder.create("eval_agent").withModel("gemini-2.5-flash").withDescription("Agent for evaluation purposes").build();
+        this.runner = {
+          ask: async (message) => {
+            return await runner.ask(message);
+          }
+        };
+      } catch (error) {
+        console.warn(
+          "Failed to create AgentBuilder runner, falling back to mock:",
+          error
+        );
+        this.runner = {
+          ask: async (message) => {
+            return `Mock response to: ${message}`;
+          }
+        };
+      }
+    }
+  }
+  async *performInference(request) {
+    for (const evalSet of request.evalCases) {
+      for (const evalCase of evalSet.evalCases) {
+        const expected = [];
+        for (const convo of evalCase.conversation) {
+          if (convo.finalResponse) {
+            expected.push({
+              invocationId: `${evalCase.evalId}-expected-${expected.length}`,
+              userContent: convo.userContent,
+              finalResponse: convo.finalResponse,
+              intermediateData: convo.intermediateData,
+              creationTimestamp: convo.creationTimestamp
+            });
+          }
+        }
+        const actual = await this.runInference(evalCase);
+        yield [...expected, ...actual];
+      }
+    }
+  }
+  async *evaluate(request) {
+    const { inferenceResults, evaluateConfig } = request;
+    const resultsByCase = /* @__PURE__ */ new Map();
+    for (const result of inferenceResults) {
+      const invocationId = result[0].invocationId;
+      if (!invocationId) continue;
+      const lastHyphenIndex = invocationId.lastIndexOf("-");
+      const evalId = lastHyphenIndex !== -1 ? invocationId.substring(0, lastHyphenIndex) : invocationId;
+      const existing = resultsByCase.get(evalId) || [];
+      resultsByCase.set(evalId, [...existing, ...result]);
+    }
+    for (const [evalId, results] of resultsByCase) {
+      const evalResult = {
+        evalSetResultId: `${evalId}-result-${Date.now()}`,
+        evalSetId: evalId,
+        evalCaseResults: [],
+        creationTimestamp: Date.now()
+      };
+      for (const evalMetric of evaluateConfig.evalMetrics) {
+        const evaluator = DEFAULT_METRIC_EVALUATOR_REGISTRY.getEvaluator(evalMetric);
+        const actual = results.filter(
+          (r) => !r.invocationId?.includes("expected")
+        );
+        const expected = results.filter(
+          (r) => r.invocationId?.includes("expected")
+        );
+        const result = await evaluator.evaluateInvocations(actual, expected);
+        evalResult.evalCaseResults.push({
+          evalSetId: evalId,
+          evalId,
+          finalEvalStatus: result.perInvocationResults.length > 0 ? result.perInvocationResults[0].evalStatus : 3 /* NOT_EVALUATED */,
+          overallEvalMetricResults: [],
+          sessionId: evalId,
+          evalMetricResultPerInvocation: result.perInvocationResults.map(
+            (r) => ({
+              actualInvocation: r.actualInvocation,
+              expectedInvocation: r.expectedInvocation,
+              evalMetricResults: [
+                {
+                  metricName: evalMetric.metricName,
+                  threshold: evalMetric.threshold,
+                  score: r.score,
+                  evalStatus: r.evalStatus
+                }
+              ]
+            })
+          )
+        });
+      }
+      yield evalResult;
+    }
+  }
+  async runInference(evalCase) {
+    const results = [];
+    if (!this.runner) {
+      await this.initializeRunner();
+    }
+    if (evalCase.sessionInput) {
+      try {
+        if (this.runner.initializeSession) {
+          await this.runner.initializeSession(evalCase.sessionInput);
+        } else if (this.runner.setSessionState) {
+          await this.runner.setSessionState(evalCase.sessionInput);
+        } else {
+          console.log(
+            `Session input provided for ${evalCase.evalId}:`,
+            evalCase.sessionInput
|
12620
|
+
evalCase.sessionInput
|
|
12621
|
+
);
|
|
12622
|
+
}
|
|
12623
|
+
} catch (error) {
|
|
12624
|
+
console.warn(
|
|
12625
|
+
`Failed to initialize session for ${evalCase.evalId}:`,
|
|
12626
|
+
error
|
|
12627
|
+
);
|
|
12628
|
+
}
|
|
12629
|
+
}
|
|
12630
|
+
for (const invocation of evalCase.conversation) {
|
|
12631
|
+
try {
|
|
12632
|
+
const response = await this.runner.ask(invocation.userContent);
|
|
12633
|
+
results.push({
|
|
12634
|
+
invocationId: `${evalCase.evalId}-${results.length}`,
|
|
12635
|
+
userContent: invocation.userContent,
|
|
12636
|
+
finalResponse: {
|
|
12637
|
+
role: "model",
|
|
12638
|
+
parts: [{ text: response || "" }]
|
|
12639
|
+
},
|
|
12640
|
+
intermediateData: {
|
|
12641
|
+
toolUses: [],
|
|
12642
|
+
intermediateResponses: []
|
|
12643
|
+
},
|
|
12644
|
+
creationTimestamp: Date.now()
|
|
12645
|
+
});
|
|
12646
|
+
} catch (error) {
|
|
12647
|
+
console.error(`Error running inference for ${evalCase.evalId}:`, error);
|
|
12648
|
+
results.push({
|
|
12649
|
+
invocationId: `${evalCase.evalId}-${results.length}`,
|
|
12650
|
+
userContent: invocation.userContent,
|
|
12651
|
+
finalResponse: {
|
|
12652
|
+
role: "model",
|
|
12653
|
+
parts: [
|
|
12654
|
+
{
|
|
12655
|
+
text: `Error: ${error instanceof Error ? error.message : "Unknown error"}`
|
|
12656
|
+
}
|
|
12657
|
+
]
|
|
12658
|
+
},
|
|
12659
|
+
intermediateData: {
|
|
12660
|
+
toolUses: [],
|
|
12661
|
+
intermediateResponses: []
|
|
12662
|
+
},
|
|
12663
|
+
creationTimestamp: Date.now()
|
|
12664
|
+
});
|
|
12665
|
+
}
|
|
12666
|
+
}
|
|
12667
|
+
return results;
|
|
12668
|
+
}
|
|
12669
|
+
};
|
|
12670
|
+
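LocalEvalService runs evaluation in two phases: performInference replays each eval case against the agent (emitting golden turns tagged with an "-expected-" invocation id alongside the actual runs), and evaluate regroups those results by eval id and scores them with evaluators from the default registry. A minimal driver sketch, assuming demoEvalSet is an EvalSet object of the shape consumed above (hypothetical data, not from this diff); passing an object with an ask method skips the AgentBuilder fallback in initializeRunner entirely:

    const service = new LocalEvalService({ ask: async (q) => `echo: ${q}` });
    const inferenceResults = [];
    for await (const batch of service.performInference({
      evalSetId: "demo",
      evalCases: [demoEvalSet]
    })) {
      inferenceResults.push(batch);
    }
    for await (const evalResult of service.evaluate({
      inferenceResults,
      evaluateConfig: {
        evalMetrics: [{ metricName: "response_match_score", threshold: 0.8 }]
      }
    })) {
      console.log(evalResult.evalCaseResults.map((c) => c.finalEvalStatus));
    }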
+// src/evaluation/agent-evaluator.ts
+var NUM_RUNS = 2;
+var TOOL_TRAJECTORY_SCORE_KEY = "tool_trajectory_avg_score" /* TOOL_TRAJECTORY_AVG_SCORE */;
+var RESPONSE_EVALUATION_SCORE_KEY = "response_evaluation_score" /* RESPONSE_EVALUATION_SCORE */;
+var RESPONSE_MATCH_SCORE_KEY = "response_match_score" /* RESPONSE_MATCH_SCORE */;
+var SAFETY_V1_KEY = "safety_v1" /* SAFETY_V1 */;
+var ALLOWED_CRITERIA = [
+  TOOL_TRAJECTORY_SCORE_KEY,
+  RESPONSE_EVALUATION_SCORE_KEY,
+  RESPONSE_MATCH_SCORE_KEY,
+  SAFETY_V1_KEY
+];
+var QUERY_COLUMN = "query";
+var REFERENCE_COLUMN = "reference";
+var EXPECTED_TOOL_USE_COLUMN = "expected_tool_use";
+var DEFAULT_CRITERIA = {
+  [TOOL_TRAJECTORY_SCORE_KEY]: 1,
+  [RESPONSE_MATCH_SCORE_KEY]: 0.8
+};
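These criteria constants feed AgentEvaluator below: findConfigForTestFile looks for a test_config.json beside each test file and falls back to DEFAULT_CRITERIA when none is found. A hypothetical config matching the shape that lookup expects (thresholds illustrative):

    // test_config.json, next to the *.test.json file:
    // {
    //   "criteria": {
    //     "tool_trajectory_avg_score": 1,
    //     "response_match_score": 0.8
    //   }
    // }

Keys outside ALLOWED_CRITERIA are rejected by _validateInput when old-format data is loaded.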
+var loadJson = async (filePath) => {
+  try {
+    const fileContent = await fs2.readFile(filePath, "utf-8");
+    return JSON.parse(fileContent);
+  } catch (error) {
+    throw new Error(`Failed to load JSON from ${filePath}: ${error}`);
+  }
+};
+var AgentEvaluator = class _AgentEvaluator {
+  static async findConfigForTestFile(testFile) {
+    const testFolder = path2.dirname(testFile);
+    const configPath = path2.join(testFolder, "test_config.json");
+    try {
+      await fs2.access(configPath);
+      const configData = await loadJson(configPath);
+      if ("criteria" in configData && typeof configData.criteria === "object") {
+        return configData.criteria;
+      }
+      throw new Error(
+        `Invalid format for test_config.json at ${configPath}. Expected a 'criteria' dictionary.`
+      );
+    } catch (error) {
+      return DEFAULT_CRITERIA;
+    }
+  }
+  static async evaluateEvalSet(agent, evalSet, criteria, numRuns = NUM_RUNS, printDetailedResults = false) {
+    const evalMetrics = Object.entries(criteria).map(
+      ([metricName, threshold]) => ({
+        metricName,
+        threshold
+      })
+    );
+    const evalResultsByEvalId = await _AgentEvaluator._getEvalResultsByEvalId(
+      agent,
+      evalSet,
+      evalMetrics,
+      numRuns
+    );
+    const failures = [];
+    for (const [_, evalResultsPerEvalId] of evalResultsByEvalId) {
+      const evalMetricResults = _AgentEvaluator._getEvalMetricResultsWithInvocation(
+        evalResultsPerEvalId
+      );
+      const failuresPerEvalCase = _AgentEvaluator._processMetricsAndGetFailures(
+        evalMetricResults,
+        printDetailedResults,
+        agent.name || "Unknown Agent"
+      );
+      failures.push(...failuresPerEvalCase);
+    }
+    if (failures.length > 0) {
+      throw new Error(
+        `Following are all the test failures. If you are looking to get more details on the failures, then please re-run this test with \`printDetailedResults\` set to \`true\`.
+${failures.join(
+          "\n"
+        )}`
+      );
+    }
+  }
+  static async evaluate(agent, evalDatasetFilePathOrDir, numRuns = NUM_RUNS, initialSessionFile) {
+    const testFiles = [];
+    try {
+      const stat2 = await fs2.stat(evalDatasetFilePathOrDir);
+      if (stat2.isDirectory()) {
+        const files = await this._findTestFilesRecursively(
+          evalDatasetFilePathOrDir
+        );
+        testFiles.push(...files);
+      } else {
+        testFiles.push(evalDatasetFilePathOrDir);
+      }
+    } catch (error) {
+      throw new Error(`Invalid path: ${evalDatasetFilePathOrDir}`);
+    }
+    const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
+    for (const testFile of testFiles) {
+      const criteria = await _AgentEvaluator.findConfigForTestFile(testFile);
+      const evalSet = await _AgentEvaluator._loadEvalSetFromFile(
+        testFile,
+        criteria,
+        initialSession
+      );
+      await _AgentEvaluator.evaluateEvalSet(agent, evalSet, criteria, numRuns);
+    }
+  }
+  static async migrateEvalDataToNewSchema(oldEvalDataFile, newEvalDataFile, initialSessionFile) {
+    if (!oldEvalDataFile || !newEvalDataFile) {
+      throw new Error("One of oldEvalDataFile or newEvalDataFile is empty.");
+    }
+    const criteria = await _AgentEvaluator.findConfigForTestFile(oldEvalDataFile);
+    const initialSession = await _AgentEvaluator._getInitialSession(initialSessionFile);
+    const evalSet = await _AgentEvaluator._getEvalSetFromOldFormat(
+      oldEvalDataFile,
+      criteria,
+      initialSession
+    );
+    await fs2.writeFile(newEvalDataFile, JSON.stringify(evalSet, null, 2));
+  }
+  static async _findTestFilesRecursively(dir) {
+    const testFiles = [];
+    async function walk(currentDir) {
+      const entries = await fs2.readdir(currentDir, { withFileTypes: true });
+      for (const entry of entries) {
+        const fullPath = path2.join(currentDir, entry.name);
+        if (entry.isDirectory()) {
+          await walk(fullPath);
+        } else if (entry.name.endsWith(".test.json")) {
+          testFiles.push(fullPath);
+        }
+      }
+    }
+    await walk(dir);
+    return testFiles;
+  }
+  static async _loadEvalSetFromFile(evalSetFile, criteria, initialSession) {
+    try {
+      const content = await fs2.readFile(evalSetFile, "utf-8");
+      try {
+        const evalSet = JSON.parse(content);
+        if (evalSet.evalSetId && evalSet.evalCases) {
+          if (Object.keys(initialSession).length > 0) {
+            throw new Error(
+              "Initial session should be specified as a part of the EvalSet file. An explicit initial session is only needed when specifying data in the older schema."
+            );
+          }
+          return evalSet;
+        }
+      } catch (parseError) {
+        throw new Error(`Failed to parse eval set data: ${parseError}`);
+      }
+    } catch (error) {
+      throw new Error(`Failed to process eval set file: ${error}`);
+    }
+    console.warn(
+      `Contents of ${evalSetFile} appear to be in an older format. To avoid this warning, please update your test files to contain data in the EvalSet schema. You can use 'migrateEvalDataToNewSchema' for migrating your old test files.`
+    );
+    return _AgentEvaluator._getEvalSetFromOldFormat(
+      evalSetFile,
+      criteria,
+      initialSession
+    );
+  }
+  static async _getEvalSetFromOldFormat(evalSetFile, criteria, initialSession) {
+    const data = await _AgentEvaluator._loadDataset(evalSetFile);
+    _AgentEvaluator._validateInput(data, criteria);
+    return {
+      evalSetId: `eval-set-${Date.now()}`,
+      name: evalSetFile,
+      evalCases: data[0].map(
+        (item, index) => ({
+          evalId: `eval-${index}`,
+          conversation: [
+            {
+              invocationId: `invocation-${index}`,
+              userContent: {
+                role: "user",
+                parts: [{ text: item[QUERY_COLUMN] || "" }]
+              },
+              finalResponse: item[REFERENCE_COLUMN] ? {
+                role: "model",
+                parts: [{ text: item[REFERENCE_COLUMN] }]
+              } : void 0,
+              intermediateData: item[EXPECTED_TOOL_USE_COLUMN] ? {
+                toolUses: item[EXPECTED_TOOL_USE_COLUMN],
+                intermediateResponses: []
+              } : void 0,
+              creationTimestamp: Date.now()
+            }
+          ],
+          sessionInput: Object.keys(initialSession).length > 0 ? {
+            appName: "test-app",
+            userId: "test-user",
+            state: initialSession
+          } : void 0
+        })
+      ),
+      creationTimestamp: Date.now()
+    };
+  }
+  static async _getInitialSession(initialSessionFile) {
+    if (!initialSessionFile) {
+      return {};
+    }
+    try {
+      const content = await fs2.readFile(initialSessionFile, "utf-8");
+      return JSON.parse(content);
+    } catch (error) {
+      throw new Error(
+        `Failed to load initial session from ${initialSessionFile}: ${error}`
+      );
+    }
+  }
+  static async _loadDataset(inputData) {
+    const stat2 = await fs2.stat(inputData);
+    if (stat2.isDirectory()) {
+      const testFiles = await this._findTestFilesRecursively(inputData);
+      const results = await Promise.all(testFiles.map((f) => loadJson(f)));
+      return results.map((r) => Array.isArray(r) ? r : [r]);
+    }
+    if (stat2.isFile()) {
+      const data = await loadJson(inputData);
+      return [Array.isArray(data) ? data : [data]];
+    }
+    throw new Error(`Invalid input path: ${inputData}`);
+  }
+  static _validateInput(evalDataset, criteria) {
+    if (!evalDataset || evalDataset.length === 0) {
+      throw new Error("The evaluation dataset is null or empty.");
+    }
+    for (const key of Object.keys(criteria)) {
+      if (!ALLOWED_CRITERIA.includes(key)) {
+        throw new Error(
+          `Invalid criteria key: ${key}. Expected one of ${ALLOWED_CRITERIA.join(
+            ", "
+          )}.`
+        );
+      }
+    }
+    const sample = evalDataset[0];
+    if (!Array.isArray(sample) || sample.length === 0) {
+      throw new Error("The evaluation dataset is empty.");
+    }
+    const firstQuery = sample[0];
+    if (typeof firstQuery !== "object") {
+      throw new Error(
+        `Each evaluation dataset sample must be a list of objects. But it's ${JSON.stringify(
+          evalDataset
+        )}`
+      );
+    }
+    if (TOOL_TRAJECTORY_SCORE_KEY in criteria) {
+      if (!(QUERY_COLUMN in firstQuery) || !(EXPECTED_TOOL_USE_COLUMN in firstQuery)) {
+        throw new Error(
+          `Samples for ${TOOL_TRAJECTORY_SCORE_KEY} must include '${QUERY_COLUMN}' and '${EXPECTED_TOOL_USE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
+        );
+      }
+    }
+    if (RESPONSE_EVALUATION_SCORE_KEY in criteria) {
+      if (!(QUERY_COLUMN in firstQuery)) {
+        throw new Error(
+          `Samples for ${RESPONSE_EVALUATION_SCORE_KEY} must include '${QUERY_COLUMN}' key. The sample is ${JSON.stringify(sample)}.`
+        );
+      }
+    }
+    if (RESPONSE_MATCH_SCORE_KEY in criteria) {
+      if (!(QUERY_COLUMN in firstQuery) || !(REFERENCE_COLUMN in firstQuery)) {
+        throw new Error(
+          `Samples for ${RESPONSE_MATCH_SCORE_KEY} must include '${QUERY_COLUMN}' and '${REFERENCE_COLUMN}' keys. The sample is ${JSON.stringify(sample)}.`
+        );
+      }
+    }
+  }
+  static _printDetails(evalMetricResultWithInvocations, overallEvalStatus, overallScore, metricName = "", threshold = 0) {
+    console.log(
+      `Summary: \`${overallEvalStatus}\` for Metric: \`${metricName}\`. Expected threshold: \`${threshold}\`, actual value: \`${overallScore}\`.`
+    );
+    const data = evalMetricResultWithInvocations.map((per) => ({
+      evalStatus: per.evalMetricResult.evalStatus,
+      score: per.evalMetricResult.score,
+      threshold,
+      prompt: _AgentEvaluator._convertContentToText(
+        per.expectedInvocation.userContent
+      ),
+      expectedResponse: _AgentEvaluator._convertContentToText(
+        per.expectedInvocation.finalResponse
+      ),
+      actualResponse: _AgentEvaluator._convertContentToText(
+        per.actualInvocation.finalResponse
+      ),
+      expectedToolCalls: _AgentEvaluator._convertToolCallsToText(
+        per.expectedInvocation.intermediateData
+      ),
+      actualToolCalls: _AgentEvaluator._convertToolCallsToText(
+        per.actualInvocation.intermediateData
+      )
+    }));
+    console.table(data);
+    console.log("\n\n");
+  }
+  static _convertContentToText(content) {
+    if (content?.parts) {
+      return content.parts.map((p) => p.text || "").filter((text) => text.length > 0).join("\n");
+    }
+    return "";
+  }
+  static _convertToolCallsToText(intermediateData) {
+    if (intermediateData?.toolUses) {
+      return intermediateData.toolUses.map((t) => JSON.stringify(t)).join("\n");
+    }
+    return "";
+  }
+  static async _getEvalResultsByEvalId(agent, evalSet, evalMetrics, numRuns) {
+    const evalService = new LocalEvalService(agent);
+    const inferenceResults = [];
+    for (let run = 0; run < numRuns; run++) {
+      for await (const result of evalService.performInference({
+        evalSetId: evalSet.evalSetId,
+        evalCases: [evalSet]
+      })) {
+        inferenceResults.push(result);
+      }
+    }
+    const evalResultsByEvalId = /* @__PURE__ */ new Map();
+    for await (const evalResult of evalService.evaluate({
+      inferenceResults,
+      evaluateConfig: { evalMetrics }
+    })) {
+      for (const caseResult of evalResult.evalCaseResults) {
+        const evalId = caseResult.evalId;
+        if (!evalResultsByEvalId.has(evalId)) {
+          evalResultsByEvalId.set(evalId, []);
+        }
+        evalResultsByEvalId.get(evalId).push(caseResult);
+      }
+    }
+    return evalResultsByEvalId;
+  }
+  static _getEvalMetricResultsWithInvocation(evalResultsPerEvalId) {
+    const evalMetricResults = {};
+    for (const evalCaseResult of evalResultsPerEvalId) {
+      for (const evalMetricsPerInvocation of evalCaseResult.evalMetricResultPerInvocation) {
+        for (const evalMetricResult of evalMetricsPerInvocation.evalMetricResults) {
+          const metricName = evalMetricResult.metricName;
+          if (!(metricName in evalMetricResults)) {
+            evalMetricResults[metricName] = [];
+          }
+          evalMetricResults[metricName].push({
+            actualInvocation: evalMetricsPerInvocation.actualInvocation,
+            expectedInvocation: evalMetricsPerInvocation.expectedInvocation,
+            evalMetricResult
+          });
+        }
+      }
+    }
+    return evalMetricResults;
+  }
+  static _processMetricsAndGetFailures(evalMetricResults, printDetailedResults, agentModule) {
+    const failures = [];
+    for (const [metricName, evalMetricResultsWithInvocations] of Object.entries(
+      evalMetricResults
+    )) {
+      const threshold = evalMetricResultsWithInvocations[0]?.evalMetricResult.threshold || 0;
+      const scores = evalMetricResultsWithInvocations.map((m) => m.evalMetricResult.score).filter((s) => s !== void 0);
+      let overallScore;
+      let overallEvalStatus;
+      if (scores.length > 0) {
+        overallScore = scores.reduce((a, b) => a + b, 0) / scores.length;
+        overallEvalStatus = overallScore >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+      } else {
+        overallScore = void 0;
+        overallEvalStatus = 3 /* NOT_EVALUATED */;
+      }
+      if (overallEvalStatus !== 1 /* PASSED */) {
+        if (printDetailedResults) {
+          _AgentEvaluator._printDetails(
+            evalMetricResultsWithInvocations,
+            overallEvalStatus,
+            overallScore,
+            metricName,
+            threshold
+          );
+        }
+        failures.push(
+          `${metricName} for ${agentModule} failed. Expected ${threshold}, but got ${overallScore}.`
+        );
+      }
+    }
+    return failures;
+  }
+};
+
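AgentEvaluator is the high-level entry point: evaluate accepts a test file or a directory of *.test.json files, resolves criteria per file, runs every case numRuns times through a LocalEvalService, averages per-metric scores, and throws an aggregate error if any metric misses its threshold. A usage sketch, where myAgent and the "./evals" directory are placeholders:

    try {
      await AgentEvaluator.evaluate(myAgent, "./evals", 2);
      console.log("All eval criteria passed.");
    } catch (failure) {
      // One line per failed metric; re-run evaluateEvalSet with
      // printDetailedResults = true for a per-invocation table.
      console.error(failure.message);
    }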
+// src/evaluation/final-response-match-v1.ts
+var RougeEvaluator = class extends Evaluator {
+  evalMetric;
+  constructor(evalMetric) {
+    super(evalMetric);
+    this.evalMetric = evalMetric;
+  }
+  static getMetricInfo() {
+    return {
+      metricName: "response_match_score" /* RESPONSE_MATCH_SCORE */,
+      description: "This metric evaluates if the agent's final response matches a golden/expected final response using the Rouge_1 metric. Value range for this metric is [0,1], with values closer to 1 more desirable.",
+      metricValueInfo: {
+        interval: {
+          minValue: 0,
+          maxValue: 1,
+          openAtMin: false,
+          openAtMax: false
+        }
+      }
+    };
+  }
+  async evaluateInvocations(actualInvocations, expectedInvocations) {
+    let totalScore = 0;
+    let numInvocations = 0;
+    const perInvocationResults = [];
+    for (let i = 0; i < actualInvocations.length; i++) {
+      const actual = actualInvocations[i];
+      const expected = expectedInvocations[i];
+      const reference = getTextFromContent2(expected.finalResponse);
+      const response = getTextFromContent2(actual.finalResponse);
+      const rouge1Scores = await calculateRouge1Scores(response, reference);
+      const score = rouge1Scores.fmeasure;
+      perInvocationResults.push({
+        actualInvocation: actual,
+        expectedInvocation: expected,
+        score,
+        evalStatus: getEvalStatus2(score, this.evalMetric.threshold)
+      });
+      totalScore += score;
+      numInvocations++;
+    }
+    if (perInvocationResults.length > 0) {
+      const overallScore = totalScore / numInvocations;
+      return {
+        overallScore,
+        overallEvalStatus: getEvalStatus2(
+          overallScore,
+          this.evalMetric.threshold
+        ),
+        perInvocationResults
+      };
+    }
+    return {
+      overallEvalStatus: 3 /* NOT_EVALUATED */,
+      perInvocationResults: []
+    };
+  }
+};
+function getTextFromContent2(content) {
+  if (content?.parts) {
+    return content.parts.map((part) => part.text).filter(Boolean).join("\n");
+  }
+  return "";
+}
+function getEvalStatus2(score, threshold) {
+  return score >= threshold ? 1 /* PASSED */ : 2 /* FAILED */;
+}
+function calculateRouge1Scores(response, reference) {
+  if (!response.trim() || !reference.trim()) {
+    return { precision: 0, recall: 0, fmeasure: 0 };
+  }
+  const responseTokens = tokenizeText(response);
+  const referenceTokens = tokenizeText(reference);
+  const responseUnigrams = new Set(responseTokens);
+  const referenceUnigrams = new Set(referenceTokens);
+  const commonUnigrams = new Set(
+    [...responseUnigrams].filter((token) => referenceUnigrams.has(token))
+  );
+  const precision = responseUnigrams.size > 0 ? commonUnigrams.size / responseUnigrams.size : 0;
+  const recall = referenceUnigrams.size > 0 ? commonUnigrams.size / referenceUnigrams.size : 0;
+  const fmeasure = precision + recall > 0 ? 2 * precision * recall / (precision + recall) : 0;
+  return { precision, recall, fmeasure };
+}
+function tokenizeText(text) {
+  return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((token) => token.length > 0);
+}
+
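calculateRouge1Scores computes set-based unigram overlap: precision over distinct response tokens, recall over distinct reference tokens, and their harmonic mean as the F-measure that RougeEvaluator uses as the score. A worked example against the functions above (values computed by hand):

    // response unigrams:  {the, cat, sat, on, mat}
    // reference unigrams: {the, cat, lay, on, mat}
    // overlap: {the, cat, on, mat} -> precision = recall = 4/5
    const scores = calculateRouge1Scores(
      "The cat sat on the mat.",
      "The cat lay on the mat."
    );
    console.log(scores); // { precision: 0.8, recall: 0.8, fmeasure: 0.8 }

Because unigrams are deduplicated into sets, repeated words count once and word order is ignored entirely.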
 // src/version.ts
 var VERSION = "0.1.0";
 export {
   AF_FUNCTION_CALL_ID_PREFIX,
   LlmAgent as Agent,
   AgentBuilder,
+  AgentEvaluator,
   AgentTool,
   agents_exports as Agents,
   AiSdkLlm,
@@ -11847,11 +13185,16 @@ export {
   CodeExecutorContext,
   DatabaseSessionService,
   EnhancedAuthConfig,
+  EvalResult,
+  EvalStatus,
+  evaluation_exports as Evaluation,
+  Evaluator,
   Event,
   EventActions,
   events_exports as Events,
   ExitLoopTool,
   FileOperationsTool,
+  FinalResponseMatchV2Evaluator,
   flows_exports as Flows,
   FunctionTool,
   GcsArtifactService,
@@ -11873,6 +13216,7 @@ export {
   LlmResponse,
   LoadArtifactsTool,
   LoadMemoryTool,
+  LocalEvalService,
   LoopAgent,
   McpAbi,
   McpAtp,
@@ -11900,10 +13244,13 @@ export {
   OpenIdConnectScheme,
   ParallelAgent,
   PlanReActPlanner,
+  PrebuiltMetrics,
   REQUEST_EUC_FUNCTION_CALL_NAME,
   ReadonlyContext,
+  RougeEvaluator,
   RunConfig,
   Runner,
+  SafetyEvaluatorV1,
   SequentialAgent,
   sessions_exports as Sessions,
   SingleFlow,
@@ -11912,6 +13259,7 @@ export {
   TelemetryService,
   ToolContext,
   tools_exports as Tools,
+  TrajectoryEvaluator,
   TransferToAgentTool,
   UserInteractionTool,
   VERSION,