npm - @empiricalrun/test-gen - Versions diffs - 0.76.0 → 0.77.0 - Mend

@empiricalrun/test-gen 0.76.0 → 0.77.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (230)

package/CHANGELOG.md +33 -0
package/dist/agent/base/index.d.ts +25 -21
package/dist/agent/base/index.d.ts.map +1 -1
package/dist/agent/base/index.js +48 -37
package/dist/agent/browsing/run.d.ts +1 -2
package/dist/agent/browsing/run.d.ts.map +1 -1
package/dist/agent/browsing/run.js +3 -9
package/dist/agent/browsing/utils.d.ts +2 -9
package/dist/agent/browsing/utils.d.ts.map +1 -1
package/dist/agent/browsing/utils.js +5 -109
package/dist/agent/chat/agent-loop.d.ts +5 -5
package/dist/agent/chat/agent-loop.d.ts.map +1 -1
package/dist/agent/chat/agent-loop.js +3 -8
package/dist/agent/chat/exports.d.ts +5 -4
package/dist/agent/chat/exports.d.ts.map +1 -1
package/dist/agent/chat/exports.js +4 -7
package/dist/agent/chat/index.d.ts +2 -2
package/dist/agent/chat/index.d.ts.map +1 -1
package/dist/agent/chat/index.js +23 -35
package/dist/agent/chat/models.d.ts +0 -2
package/dist/agent/chat/models.d.ts.map +1 -1
package/dist/agent/chat/models.js +12 -26
package/dist/agent/chat/prompt/pw-utils-docs.d.ts +1 -1
package/dist/agent/chat/prompt/pw-utils-docs.d.ts.map +1 -1
package/dist/agent/chat/prompt/pw-utils-docs.js +52 -0
package/dist/agent/chat/prompt/repo.d.ts.map +1 -1
package/dist/agent/chat/prompt/repo.js +11 -22
package/dist/agent/chat/prompt/test-case-def.d.ts +2 -0
package/dist/agent/chat/prompt/test-case-def.d.ts.map +1 -0
package/dist/agent/chat/prompt/test-case-def.js +44 -0
package/dist/agent/chat/state.d.ts +7 -6
package/dist/agent/chat/state.d.ts.map +1 -1
package/dist/agent/chat/state.js +15 -45
package/dist/agent/chat/utils.d.ts +2 -2
package/dist/agent/chat/utils.d.ts.map +1 -1
package/dist/agent/chat/utils.js +14 -7
package/dist/agent/cli.d.ts.map +1 -1
package/dist/agent/cli.js +62 -58
package/dist/agent/code-review/executor/index.d.ts +5 -0
package/dist/agent/code-review/executor/index.d.ts.map +1 -0
package/dist/agent/code-review/executor/index.js +13 -0
package/dist/agent/code-review/index.d.ts +8 -3
package/dist/agent/code-review/index.d.ts.map +1 -1
package/dist/agent/code-review/index.js +115 -21
package/dist/agent/code-review/parser.d.ts +5 -0
package/dist/agent/code-review/parser.d.ts.map +1 -0
package/dist/agent/code-review/parser.js +70 -0
package/dist/agent/code-review/types.d.ts +36 -0
package/dist/agent/code-review/types.d.ts.map +1 -0
package/dist/agent/code-review/types.js +13 -0
package/dist/agent/cua/index.d.ts.map +1 -1
package/dist/agent/cua/index.js +18 -2
package/dist/agent/cua/model.d.ts.map +1 -1
package/dist/agent/cua/model.js +4 -1
package/dist/agent/cua/pw-codegen/pw-pause/index.d.ts.map +1 -1
package/dist/agent/triage/index.d.ts +2 -2
package/dist/agent/triage/index.d.ts.map +1 -1
package/dist/agent/triage/index.js +8 -7
package/dist/agent/video-analysis/executor/index.d.ts +5 -0
package/dist/agent/video-analysis/executor/index.d.ts.map +1 -0
package/dist/agent/video-analysis/executor/index.js +10 -0
package/dist/agent/video-analysis/index.d.ts +2 -2
package/dist/agent/video-analysis/index.d.ts.map +1 -1
package/dist/agent/video-analysis/index.js +38 -13
package/dist/artifacts/index.d.ts +1 -1
package/dist/artifacts/index.d.ts.map +1 -1
package/dist/artifacts/index.js +3 -1
package/dist/artifacts/utils.d.ts.map +1 -1
package/dist/bin/index.js +66 -21
package/dist/constants/index.d.ts +14 -0
package/dist/constants/index.d.ts.map +1 -1
package/dist/constants/index.js +33 -1
package/dist/file/server.d.ts +1 -3
package/dist/file/server.d.ts.map +1 -1
package/dist/file/server.js +0 -13
package/dist/file-info/adapters/file-system/index.d.ts.map +1 -1
package/dist/file-info/adapters/file-system/reader.d.ts.map +1 -1
package/dist/file-info/adapters/file-system/reader.js +8 -1
package/dist/file-info/adapters/github/index.d.ts.map +1 -1
package/dist/file-info/adapters/github/reader.d.ts +1 -1
package/dist/file-info/adapters/github/reader.d.ts.map +1 -1
package/dist/file-info/adapters/github/reader.js +8 -5
package/dist/index.d.ts.map +1 -1
package/dist/tools/analyse-video/index.d.ts +5 -0
package/dist/tools/analyse-video/index.d.ts.map +1 -0
package/dist/tools/analyse-video/index.js +50 -0
package/dist/tools/create-pull-request/index.js +4 -6
package/dist/tools/create-pull-request/utils.d.ts +1 -1
package/dist/tools/definitions/{fetch-video-analysis.d.ts → analyse-video.d.ts} +13 -8
package/dist/tools/definitions/analyse-video.d.ts.map +1 -0
package/dist/tools/definitions/analyse-video.js +60 -0
package/dist/tools/definitions/review-pull-request.d.ts +3 -0
package/dist/tools/definitions/review-pull-request.d.ts.map +1 -0
package/dist/tools/definitions/review-pull-request.js +16 -0
package/dist/tools/definitions/str_replace_editor.d.ts +1 -0
package/dist/tools/definitions/str_replace_editor.d.ts.map +1 -1
package/dist/tools/definitions/str_replace_editor.js +4 -1
package/dist/tools/definitions/test-gen-browser.d.ts +0 -3
package/dist/tools/definitions/test-gen-browser.d.ts.map +1 -1
package/dist/tools/definitions/test-gen-browser.js +33 -8
package/dist/tools/delete-file/index.d.ts.map +1 -1
package/dist/tools/delete-file/index.js +1 -19
package/dist/tools/executor/base.d.ts +32 -0
package/dist/tools/executor/base.d.ts.map +1 -0
package/dist/tools/executor/base.js +114 -0
package/dist/tools/executor/index.d.ts +3 -22
package/dist/tools/executor/index.d.ts.map +1 -1
package/dist/tools/executor/index.js +7 -100
package/dist/tools/executor/utils/checkpoint.d.ts +1 -1
package/dist/tools/executor/utils/checkpoint.d.ts.map +1 -1
package/dist/tools/executor/utils/checkpoint.js +6 -2
package/dist/tools/executor/utils/git.d.ts +2 -2
package/dist/tools/executor/utils/git.d.ts.map +1 -1
package/dist/tools/executor/utils/git.js +7 -3
package/dist/tools/executor/utils/index.d.ts.map +1 -1
package/dist/tools/executor/utils/index.js +1 -1
package/dist/tools/fetch-session-diff/index.js +2 -2
package/dist/tools/file-operations/create.d.ts.map +1 -1
package/dist/tools/file-operations/create.js +1 -4
package/dist/tools/file-operations/index.d.ts +2 -1
package/dist/tools/file-operations/index.d.ts.map +1 -1
package/dist/tools/file-operations/index.js +4 -1
package/dist/tools/file-operations/insert.d.ts +1 -2
package/dist/tools/file-operations/insert.d.ts.map +1 -1
package/dist/tools/file-operations/insert.js +1 -4
package/dist/tools/file-operations/replace.d.ts.map +1 -1
package/dist/tools/file-operations/replace.js +1 -4
package/dist/tools/grep/index.d.ts.map +1 -1
package/dist/tools/grep/index.js +18 -11
package/dist/tools/index.d.ts +5 -5
package/dist/tools/index.d.ts.map +1 -1
package/dist/tools/index.js +17 -16
package/dist/tools/merge-conflicts/index.d.ts.map +1 -1
package/dist/tools/merge-conflicts/index.js +1 -1
package/dist/tools/rename-file/index.js +1 -1
package/dist/tools/review-pull-request/index.d.ts.map +1 -1
package/dist/tools/review-pull-request/index.js +45 -59
package/dist/tools/run-test.d.ts.map +1 -1
package/dist/tools/run-test.js +25 -3
package/dist/tools/test-gen-browser.d.ts.map +1 -1
package/dist/tools/test-gen-browser.js +51 -47
package/dist/utils/artifact-paths.d.ts +20 -0
package/dist/utils/artifact-paths.d.ts.map +1 -0
package/dist/utils/artifact-paths.js +16 -0
package/dist/utils/dedup-image-fs.d.ts +2 -16
package/dist/utils/dedup-image-fs.d.ts.map +1 -1
package/dist/utils/dedup-image-fs.js +12 -16
package/dist/utils/dedup-image.d.ts +1 -14
package/dist/utils/dedup-image.d.ts.map +1 -1
package/dist/utils/dedup-image.js +7 -62
package/dist/utils/{local-ffmpeg-client.d.ts → ffmpeg/index.d.ts} +6 -7
package/dist/utils/ffmpeg/index.d.ts.map +1 -0
package/dist/utils/{local-ffmpeg-client.js → ffmpeg/index.js} +169 -53
package/dist/utils/find-threshold.d.ts +8 -0
package/dist/utils/find-threshold.d.ts.map +1 -0
package/dist/utils/find-threshold.js +55 -0
package/dist/utils/hash.d.ts +2 -0
package/dist/utils/hash.d.ts.map +1 -0
package/dist/utils/hash.js +24 -0
package/dist/utils/model.d.ts +1 -1
package/dist/utils/model.d.ts.map +1 -1
package/dist/utils/model.js +7 -5
package/dist/utils/repo-tree.d.ts +0 -1
package/dist/utils/repo-tree.d.ts.map +1 -1
package/dist/utils/repo-tree.js +2 -14
package/dist/utils/slug.js +1 -1
package/dist/video-core/agent-orchestrator.d.ts +14 -0
package/dist/video-core/agent-orchestrator.d.ts.map +1 -0
package/dist/video-core/agent-orchestrator.js +78 -0
package/dist/video-core/analysis-server.d.ts +24 -0
package/dist/video-core/analysis-server.d.ts.map +1 -0
package/dist/video-core/analysis-server.js +398 -0
package/dist/video-core/analysis-viewer.html +1374 -0
package/dist/video-core/index.d.ts +44 -0
package/dist/video-core/index.d.ts.map +1 -0
package/dist/video-core/index.js +204 -0
package/dist/video-core/model-limits.d.ts +4 -0
package/dist/video-core/model-limits.d.ts.map +1 -0
package/dist/video-core/model-limits.js +67 -0
package/dist/video-core/storage-manager.d.ts +5 -0
package/dist/video-core/storage-manager.d.ts.map +1 -0
package/dist/video-core/storage-manager.js +55 -0
package/dist/video-core/types.d.ts +13 -0
package/dist/video-core/types.d.ts.map +1 -0
package/dist/video-core/types.js +2 -0
package/dist/video-core/utils.d.ts +25 -0
package/dist/video-core/utils.d.ts.map +1 -0
package/dist/video-core/utils.js +211 -0
package/dist/video-core/xml-parser.d.ts +3 -0
package/dist/video-core/xml-parser.d.ts.map +1 -0
package/dist/video-core/xml-parser.js +27 -0
package/package.json +5 -6
package/tsconfig.tsbuildinfo +1 -1
package/dist/agent/chat/prompt/index.d.ts +0 -6
package/dist/agent/chat/prompt/index.d.ts.map +0 -1
package/dist/agent/chat/prompt/index.js +0 -200
package/dist/agent/code-review/prompt.d.ts +0 -2
package/dist/agent/code-review/prompt.d.ts.map +0 -1
package/dist/agent/code-review/prompt.js +0 -55
package/dist/agent/diagnosis-agent/index.d.ts +0 -11
package/dist/agent/diagnosis-agent/index.d.ts.map +0 -1
package/dist/agent/diagnosis-agent/index.js +0 -88
package/dist/agent/diagnosis-agent/strict-mode-violation.d.ts +0 -10
package/dist/agent/diagnosis-agent/strict-mode-violation.d.ts.map +0 -1
package/dist/agent/diagnosis-agent/strict-mode-violation.js +0 -30
package/dist/tools/definitions/extract-frames-from-video.d.ts +0 -39
package/dist/tools/definitions/extract-frames-from-video.d.ts.map +0 -1
package/dist/tools/definitions/extract-frames-from-video.js +0 -60
package/dist/tools/definitions/fetch-video-analysis.d.ts.map +0 -1
package/dist/tools/definitions/fetch-video-analysis.js +0 -61
package/dist/tools/extract-frames-from-video/index.d.ts +0 -7
package/dist/tools/extract-frames-from-video/index.d.ts.map +0 -1
package/dist/tools/extract-frames-from-video/index.js +0 -145
package/dist/tools/fetch-video-analysis/index.d.ts +0 -5
package/dist/tools/fetch-video-analysis/index.d.ts.map +0 -1
package/dist/tools/fetch-video-analysis/index.js +0 -149
package/dist/tools/fetch-video-analysis/open-ai.d.ts +0 -6
package/dist/tools/fetch-video-analysis/open-ai.d.ts.map +0 -1
package/dist/tools/fetch-video-analysis/open-ai.js +0 -37
package/dist/tools/fetch-video-analysis/utils.d.ts +0 -16
package/dist/tools/fetch-video-analysis/utils.d.ts.map +0 -1
package/dist/tools/fetch-video-analysis/utils.js +0 -121
package/dist/tools/fetch-video-analysis/video-analysis.d.ts +0 -7
package/dist/tools/fetch-video-analysis/video-analysis.d.ts.map +0 -1
package/dist/tools/fetch-video-analysis/video-analysis.js +0 -70
package/dist/tools/file-operations/shared/git-helper.d.ts +0 -4
package/dist/tools/file-operations/shared/git-helper.d.ts.map +0 -1
package/dist/tools/file-operations/shared/git-helper.js +0 -29
package/dist/utils/local-ffmpeg-client.d.ts.map +0 -1
package/eslint.config.mjs +0 -43

package/dist/agent/cli.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/agent/cli.ts"],"names":[],"mappings":"~~AAEA~~,OAAO,EACL,aAAa,~~EAMb~~,mBAAmB,EACpB,MAAM,4BAA4B,CAAC;~~AAwCpC~~,wBAAsB,yBAAyB,IAAI,OAAO,CACxD,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CACvB,CAkCA;AAED,wBAAsB,kBAAkB,CAAC,EACvC,mBAAmB,EACnB,aAAa,EACb,oBAAoB,EACpB,SAAS,EACT,SAAS,EACT,UAAU,GACX,EAAE;IACD,aAAa,EAAE,mBAAmB,CAAC;IACnC,mBAAmB,EAAE,OAAO,CAAC;IAC7B,oBAAoB,EAAE,MAAM,GAAG,SAAS,CAAC;IACzC,SAAS,EAAE,aAAa,CAAC;IACzB,UAAU,EAAE,OAAO,CAAC;IACpB,SAAS,EAAE,OAAO,CAAC;CACpB,~~iBAoLA~~"}
1	+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../../src/agent/cli.ts"],"names":[],"mappings":"AACA,OAAO,EACL,aAAa,EAKb,mBAAmB,EACpB,MAAM,4BAA4B,CAAC;AA+BpC,wBAAsB,yBAAyB,IAAI,OAAO,CACxD,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CACvB,CAkCA;AAED,wBAAsB,kBAAkB,CAAC,EACvC,mBAAmB,EACnB,aAAa,EACb,oBAAoB,EACpB,SAAS,EACT,SAAS,EACT,UAAU,GACX,EAAE;IACD,aAAa,EAAE,mBAAmB,CAAC;IACnC,mBAAmB,EAAE,OAAO,CAAC;IAC7B,oBAAoB,EAAE,MAAM,GAAG,SAAS,CAAC;IACzC,SAAS,EAAE,aAAa,CAAC;IACzB,UAAU,EAAE,OAAO,CAAC;IACpB,SAAS,EAAE,OAAO,CAAC;CACpB,iBA2KA"}

package/dist/agent/cli.js CHANGED Viewed

@@ -3,7 +3,6 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.fetchEnvironmentVariables = fetchEnvironmentVariables;
 exports.runChatAgentForCLI = runChatAgentForCLI;
 const llm_1 = require("@empiricalrun/llm");
-const chat_1 = require("@empiricalrun/llm/chat");
 const picocolors_1 = require("picocolors");
 const client_1 = require("../dashboard/client");
 const reader_1 = require("../file-info/adapters/file-system/reader");
@@ -11,17 +10,17 @@ const human_in_the_loop_1 = require("../human-in-the-loop");
 const validation_1 = require("../recorder/validation");
 const executor_1 = require("../tools/executor");
 const git_1 = require("../tools/executor/utils/git");
-const filesystem_cache_1 = require("./chat/filesystem-cache");
 const state_1 = require("./chat/state");
 const utils_1 = require("./chat/utils");
 const index_1 = require("./index");
 function stopCriteria(userPrompt) {
     return userPrompt?.toLowerCase() === "stop";
 }
-function concludeAgent(chatModel, useDiskForChatState, selectedModel, error) {
-    console.log(`\n${(0, picocolors_1.gray)("Usage summary -> " + (0, state_1.getUsageSummary)(chatModel))}`);
+function concludeAgent(agent, useDiskForChatState) {
+    const chatState = agent.chatState;
+    console.log(`\n${(0, picocolors_1.gray)("Usage summary -> " + (0, state_1.getUsageSummary)(chatState))}`);
     if (useDiskForChatState) {
-        (0, state_1.saveToDisk)(chatModel.messages, selectedModel, chatModel.askUserForInput, error);
+        (0, state_1.saveToDisk)(chatState);
     }
 }
 async function fetchEnvironmentVariables() {
@@ -53,41 +52,47 @@ async function fetchEnvironmentVariables() {
     return envVars;
 }
 async function runChatAgentForCLI({ useDiskForChatState, selectedModel, initialPromptContent, agentMode, resetChat, useFSCache, }) {
-    let chatState;
     const enableStreaming = !useFSCache;
-    const cache = useFSCache ? new filesystem_cache_1.FilesystemLLMCache() : undefined;
-    if (resetChat) {
-        (0, state_1.clearChatState)();
-    }
-    if (useDiskForChatState) {
-        chatState = (0, state_1.loadChatState)();
-    }
+    // TODO: Implement cache support in BaseAgent
+    // const cache = useFSCache ? new FilesystemLLMCache() : undefined;
     // TODO: Store branch name in chat state so that we don't recreate it every time
     const randomId = crypto.randomUUID().substring(0, 8);
     const branchName = `branch-${randomId}`;
     await (0, git_1.checkoutBranch)(branchName, process.cwd());
-    let messagesLoadedFromDisk = chatState?.messages || [];
-    let chatModel = (0, chat_1.createChatModel)(messagesLoadedFromDisk, selectedModel, undefined, cache);
-    chatModel.validateEnvVarsForAuth();
-    if (initialPromptContent && chatModel.messages.length === 0) {
-        chatModel.pushUserMessage(initialPromptContent, []);
-    }
-    else if (initialPromptContent && chatModel.messages.length > 0) {
-        console.warn(`Ignoring initial prompt because we have existing messages.`);
-    }
-    if (chatModel.askUserForInput) {
-        // Show last message to the user for context when we loaded from disk
-        const latest = chatModel.getHumanReadableLatestMessage();
-        if (latest) {
-            console.log(`${(0, picocolors_1.blue)(latest.role)}: ${latest.textMessage}`);
-        }
+    let chatState;
+    if (useDiskForChatState) {
+        chatState = (0, state_1.loadChatState)({ resetChat });
     }
     if (chatState && chatState.error) {
         // Reset error state as we are attempting a retry
         chatState.error = null;
     }
+    if (initialPromptContent) {
+        if (!chatState) {
+            const { text, attachments } = (0, utils_1.extractAttachments)(initialPromptContent);
+            chatState = (0, state_1.createChatState)({
+                userPrompt: text,
+                attachments: attachments,
+                existingState: undefined,
+                selectedModel,
+                error: null,
+            });
+        }
+        else {
+            console.warn(`Ignoring initial prompt because we have existing chat state.`);
+        }
+    }
+    if (!chatState) {
+        chatState = (0, state_1.createChatState)({
+            userPrompt: undefined,
+            attachments: [],
+            selectedModel,
+            existingState: undefined,
+            error: null,
+        });
+    }
     const handleSigInt = () => {
-        concludeAgent(chatModel, useDiskForChatState, selectedModel, null);
+        concludeAgent(agent, useDiskForChatState);
         process.exit(0);
     };
     process.once("SIGINT", handleSigInt);
@@ -95,7 +100,7 @@ async function runChatAgentForCLI({ useDiskForChatState, selectedModel, initialP
     let userPrompt;
     let reporterFunc = async (chatState, latest) => {
         if (useDiskForChatState) {
-            (0, state_1.saveToDisk)(chatState.messages, selectedModel, chatState.askUserForInput, chatState.error);
+            (0, state_1.saveToDisk)(chatState);
         }
         if (latest) {
             if (!enableStreaming) {
@@ -122,13 +127,31 @@ async function runChatAgentForCLI({ useDiskForChatState, selectedModel, initialP
     const apiClient = new client_1.DashboardAPIClient({
         authType,
     });
+    const toolExecutor = new executor_1.ToolExecutor({
+        chatSession: { branchName },
+        repoPath: process.cwd(),
+        apiClient,
+        trace,
+        featureFlags: [],
+        environmentOverrides: await fetchEnvironmentVariables(),
+    });
     const fileInfoBuilder = () => (0, reader_1.getFileInfoFromFS)(process.cwd());
     const agentParams = {
         selectedModel,
+        featureFlags: [],
+        chatState,
+        toolExecutor,
     };
-    const agent = index_1.MODE_TO_AGENT_MAP[agentMode](agentParams);
+    const agent = index_1.MODE_TO_AGENT_MAP[agentMode]({ ...agentParams });
+    if (agent.askUserForInput) {
+        // Show last message to the user for context when we loaded from disk
+        const latest = agent.getHumanReadableLatestMessage();
+        if (latest) {
+            console.log(`${(0, picocolors_1.blue)(latest.role)}: ${latest.textMessage}`);
+        }
+    }
     while (!stopCriteria(userPrompt)) {
-        if (chatModel.askUserForInput) {
+        if (agent.askUserForInput) {
             try {
                 userPrompt = await human_in_the_loop_1.humanLoop.getFeedback({
                     message: "User:",
@@ -137,33 +160,19 @@ async function runChatAgentForCLI({ useDiskForChatState, selectedModel, initialP
             catch (e) {
                 // https://github.com/SBoudrias/Inquirer.js/issues/1502#issuecomment-2275991680
                 if (e instanceof Error && e.name === "ExitPromptError") {
-                    concludeAgent(chatModel, useDiskForChatState, selectedModel, null);
+                    concludeAgent(agent, useDiskForChatState);
                     process.exit(0);
                 }
-                concludeAgent(chatModel, useDiskForChatState, selectedModel, {
-                    message: e.message,
-                    stack: e.stack || "Stack trace not available",
-                    timestamp: new Date().toISOString(),
-                });
+                concludeAgent(agent, useDiskForChatState);
                 throw e;
             }
             if (!stopCriteria(userPrompt)) {
                 const { text, attachments } = (0, utils_1.extractAttachments)(userPrompt);
-                chatModel.pushUserMessage(text, attachments);
+                agent.pushUserMessage(text, attachments);
             }
         }
         else {
-            const toolExecutor = new executor_1.ToolExecutor({
-                chatSession: null,
-                branchName,
-                repoPath: process.cwd(),
-                apiClient,
-                trace,
-                featureFlags: [],
-                environmentOverrides: await fetchEnvironmentVariables(),
-            });
             await agent.runLoop({
-                messages: chatModel.messages,
                 reporter: reporterFunc,
                 streamingMessageReporter: (() => {
                     if (!enableStreaming) {
@@ -192,18 +201,13 @@ async function runChatAgentForCLI({ useDiskForChatState, selectedModel, initialP
                 trace,
                 repoInfoBuilder: fileInfoBuilder,
                 onPendingToolCall: async (toolCalls) => {
-                    const toolResults = await toolExecutor.execute(toolCalls);
-                    chatModel.pushToolResultsMessage(toolCalls, toolResults);
+                    const { toolResults, checkpoint } = await toolExecutor.execute(toolCalls);
+                    agent.processToolResults(toolCalls, toolResults, checkpoint);
                 },
             });
-            // Update the chatModel with the agent's final state for next iteration
-            if (agent.messages) {
-                chatModel = (0, chat_1.createChatModel)(agent.messages, selectedModel, undefined, cache);
-            }
         }
     }
-    trace?.update({ output: { messages: chatModel.messages } });
+    trace?.update({ output: { messages: agent.messages } });
     await llm_1.langfuseInstance?.flushAsync();
-    const usageSummary = (0, state_1.getUsageSummary)(chatModel);
-    console.log(`\n${(0, picocolors_1.gray)("Usage summary -> " + usageSummary)}`);
+    console.log(`\n${(0, picocolors_1.gray)("Usage summary -> " + (0, state_1.getUsageSummary)(agent.chatState))}`);
 }

package/dist/agent/code-review/executor/index.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+import { BaseToolExecutor, BaseToolExecutorProps } from "../../../tools/executor/base";
+export declare class CodeReviewToolExecutor extends BaseToolExecutor {
+    constructor(params: Omit<BaseToolExecutorProps, "tools">);
+}
+//# sourceMappingURL=index.d.ts.map

package/dist/agent/code-review/executor/index.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/agent/code-review/executor/index.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,gBAAgB,EAChB,qBAAqB,EACtB,MAAM,8BAA8B,CAAC;AAItC,qBAAa,sBAAuB,SAAQ,gBAAgB;gBAC9C,MAAM,EAAE,IAAI,CAAC,qBAAqB,EAAE,OAAO,CAAC;CAIzD"}

package/dist/agent/code-review/executor/index.js ADDED Viewed

@@ -0,0 +1,13 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.CodeReviewToolExecutor = void 0;
+const base_1 = require("../../../tools/executor/base");
+const fetch_session_diff_1 = require("../../../tools/fetch-session-diff");
+const file_operations_1 = require("../../../tools/file-operations");
+class CodeReviewToolExecutor extends base_1.BaseToolExecutor {
+    constructor(params) {
+        const tools = [fetch_session_diff_1.fetchSessionDiffTool, ...file_operations_1.viewOnlyTools];
+        super({ ...params, tools });
+    }
+}
+exports.CodeReviewToolExecutor = CodeReviewToolExecutor;

package/dist/agent/code-review/index.d.ts CHANGED Viewed

@@ -1,7 +1,12 @@
-import type { ToolDefinition } from "@empiricalrun/shared-types";
+import type { ToolsForLLM } from "@empiricalrun/shared-types";
 import { BaseAgent } from "../base";
+import { type CodeReviewResultV0, type CodeReviewResultV1, type CodeReviewResultV2, CodeReviewSeverity, CodeReviewVerdict } from "./types";
+export type { CodeReviewResultV1, CodeReviewResultV0, CodeReviewResultV2 };
+export { CodeReviewVerdict, CodeReviewSeverity };
+export type CodeReviewVersionedResult = CodeReviewResultV1 | CodeReviewResultV0 | CodeReviewResultV2;
 export declare class CodeReviewAgent extends BaseAgent {
-    protected getTools(): ToolDefinition[];
-    protected buildSystemPrompt(): Promise<string>;
+    protected getTools(): ToolsForLLM;
+    getResult(): CodeReviewVersionedResult | undefined;
+    protected buildSystemPrompt(repoContext: string): Promise<string>;
 }
 //# sourceMappingURL=index.d.ts.map

package/dist/agent/code-review/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/code-review/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,~~EAAE~~,~~cAAc~~,EAAE,MAAM,4BAA4B,CAAC;~~AAGjE~~,OAAO,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAEpC,qBAAa,eAAgB,SAAQ,SAAS;IAC5C,SAAS,CAAC,QAAQ,IAAI,~~cAAc~~,~~EAAE~~;~~cAItB~~,iBAAiB,~~IAAI~~,OAAO,CAAC,MAAM,CAAC;~~CAsDrD~~"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/code-review/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAmB,WAAW,EAAE,MAAM,4BAA4B,CAAC;AAG/E,OAAO,EAAE,SAAS,EAAE,MAAM,SAAS,CAAC;AAEpC,OAAO,EACL,KAAK,kBAAkB,EACvB,KAAK,kBAAkB,EACvB,KAAK,kBAAkB,EACvB,kBAAkB,EAClB,iBAAiB,EAClB,MAAM,SAAS,CAAC;AAEjB,YAAY,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,CAAC;AAC3E,OAAO,EAAE,iBAAiB,EAAE,kBAAkB,EAAE,CAAC;AACjD,MAAM,MAAM,yBAAyB,GACjC,kBAAkB,GAClB,kBAAkB,GAClB,kBAAkB,CAAC;AAEvB,qBAAa,eAAgB,SAAQ,SAAS;IAC5C,SAAS,CAAC,QAAQ,IAAI,WAAW;IAWjC,SAAS,IAAI,yBAAyB,GAAG,SAAS;cAmBlC,iBAAiB,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;CAuHxE"}

package/dist/agent/code-review/index.js CHANGED Viewed

@@ -1,52 +1,137 @@
 "use strict";
 Object.defineProperty(exports, "__esModule", { value: true });
-exports.CodeReviewAgent = void 0;
+exports.CodeReviewAgent = exports.CodeReviewSeverity = exports.CodeReviewVerdict = void 0;
 const tools_1 = require("../../tools");
+const fetch_session_diff_1 = require("../../tools/fetch-session-diff");
 const base_1 = require("../base");
+const parser_1 = require("./parser");
+const types_1 = require("./types");
+Object.defineProperty(exports, "CodeReviewSeverity", { enumerable: true, get: function () { return types_1.CodeReviewSeverity; } });
+Object.defineProperty(exports, "CodeReviewVerdict", { enumerable: true, get: function () { return types_1.CodeReviewVerdict; } });
 class CodeReviewAgent extends base_1.BaseAgent {
     getTools() {
-        return [tools_1.fetchSessionDiffTool];
+        const custom = [
+            fetch_session_diff_1.fetchSessionDiffTool,
+            ...(0, tools_1.textViewToolsForModel)(this.selectedModel),
+        ];
+        return {
+            custom,
+            builtInTextEditor: (0, tools_1.hasBuiltInTextEditor)(this.selectedModel),
+        };
     }
-    async buildSystemPrompt() {
+    getResult() {
+        const messages = this.messages || [];
+        const lastMessage = messages.length
+            ? messages[messages.length - 1]
+            : undefined;
+        const lastMessageTextPart = lastMessage
+            ? lastMessage.parts
+                .filter((p) => "text" in p)
+                .find((p) => "text" in p && !!p.text)
+            : undefined;
+        const textPart = lastMessageTextPart;
+        const text = textPart?.text.trim();
+        if (!text) {
+            return undefined;
+        }
+        return (0, parser_1.convertXmlToV2Format)(text);
+    }
+    async buildSystemPrompt(repoContext) {
         return `
 You are an expert code reviewer that specializes in reviewing Playwright test code. You are
-provided with tools to fetch diff for a code review, where a test has been added, test modified,
+provided with tools to fetch diff and pull-request metadata for a code review, where a test has been added, test modified,
 or some configuration has changed.
 # Your goals
-- Identify code smells in test code - see below
-- Call out test data assumptions or lack of clean up
+- Understand the purpose and scope of the code change. You can use available tools to gather context of the change.
+- Identify critical issues that must be fixed before the code can be safely merged.
+- Detect code smells, anti-patterns, and non-deterministic behaviors that reduce test reliability - see below.
+- Call out test data assumptions or lack of clean up.
+- Suggest improvements and best practices to enhance maintainability and readability.
+- Form a definite conclusion on whether the code can be merged or not.
+- Share your findings and conclusion in the structured format shared below
 # Output format
-- You are expected to return two sections in your response: describe_code_change and code_review_comments
-- describe_code_change: A brief summary of what the code change is doing. This should be 4-6 sentences in a bullet list.
-- code_review_comments: A bulleted list of code review comments that catch for any of the specific bits below or other
-  red flags you might see in the code. Each comment should be 1-2 sentences.
+- You are expected to return the following sections in your response: last_commit, describe_code_change, line_comments, verdict and version
+- The last commit comes from the session diff tool call along with pull request metadata -- reproduce the commit sha as it is, without any additions (ignore the last commit timestamp) or bullet points
+- describe_code_change: A brief summary of what the code change is doing. This should be 4-6 sentences in a bullet list, formatted in markdown where each bullet must begin with a hyphen followed by a space (- ).
+  Do not use any other character for bullets.
+- line_comments: Individual comments for specific issues found in the code, attributed to one or more lines of problematic code. There can be multiple separate issues for each line, share them using separate comments of varying severity.
+  Each comment should correspond to one issue and include the following tags:
+  - file: The relative path to the file from repository root
+  - line-start: Starting line number of the issue
+  - line-end: Ending line number of the issue (same as line-start for if issue is in a single line of code)
+  - severity: Either "merge-blocking" or "warning". It is possible to have multiple issues in the same set of lines of either nature, in that case add multiple line_comments for those lines.
+    Here severity denotes the nature of the issue - any issue that is preventing the code from being safe to merge and should be considered high priority is "merge-blocking". Look for any of the specific
+    bits below or other red flags you might see in the code. Each comment should be 1-2 sentences. If no blocking issues are found, a review will have no "merge-blocking" line comments.
+    Alternatively, the severity should be "warning" for situations where best practices were not followed or contain minor issues or warnings that can be safely ignored ie
+  - message: 1-2 sentences describing the specific issue and suggested fixes or improvements.
+- verdict: "Approved" if code can be merged to production ie there are no "merge-blocking" line-comments or "Rejected" if the issues cannot be safely ignored. "Approved" or
+  "Rejected" are the only two possible values for this field.
 Return these as XML tags with markdown inside them
+<last_commit>
+...
+</last_commit>
 <describe_code_change>
-- ...
+...
 </describe_code_change>
-<code_review_comments>
-- ...
-</code_review_comments>
+<line_comments>
+  <comment>
+    <file>..</file>
+    <line-start>..</line-start>
+    <line-end>..</line-end>
+    <severity>..</severity>
+    <message>..</message>
+  </comment>
+  <comment>
+    <file>..</file>
+    <line-start>..</line-start>
+    <line-end>..</line-end>
+    <severity>..</severity>
+    <message>..</message>
+  </comment>
+</line_comments>
+<verdict>
+...
+</verdict>
-# Specific bits to catch in the code review
+# Severity: Merge blocking
-## Code smells to look for
-- Any form of try-catch or exception handling is a code smell in test code. If there's an
+## Functionality regression
+- If the change is modifying an existing test, we need to ensure the functionality of the original test
+  is maintained in the new version. No hacking our way to get a green test!
+## Exception handling
+  - Any form of try-catch or exception handling is a code smell in test code. If there's an
   exception, the test should fail
-- Any conditionals (if, switch, ternary) in test code is a code smell. Tests are expected to be
+## Conditionals
+  - Any conditionals (if, switch, ternary) in test code is a code smell. Tests are expected to be
   deterministic. If you see conditionals, check if there's a comment explaining why it's needed.
   Critically review the comment -- if it's not convincing, call it out as a code smell.
-## Ensure Playwright best practices
+## Playwright common mistakes
+- Don't use waitForLoadState or networkidle - these are not required since Playwright auto-waits after navigations. networkidle
+  can cause failures because modern web apps often have background network activity, which never settles.
 - Use locators instead of selectors: waitForSelector, $, $$ are bad - use locators instead (e.g. locator.waitFor)
 - If the test relies on some Playwright APIs that do not auto-wait (e.g. isVisible(), count()), we need to ensure
   they are used AFTER some action that ensures the page has loaded. If nothing, at least it should have a waitForTimeout
-- Don't use waitForLoadState or networkidle - these are not required since Playwright auto-waits after navigations
+## Deprecated patterns
+- test.describe.serial(...) is not prefered: use test.describe.configure({ mode: "serial" }) if the tests need to be serial
+## Repo conventions
+- Tests are located in files in the tests/ directory (e.g. in tests/example.spec.ts)
+- Helper methods (that are imported in the tests) should be in pages/ directory (e.g. pages/common.ts)
+- Helper methods should be functional - not classes (conventional class-based page object models are NOT recommended - use functions!)
+# Severity: Warning
 ## Call out test data assumptions
 - If new test data is created (e.g. creating a new entity in the app, doing some actions on it) - it should be cleaned up
@@ -58,7 +143,16 @@ Return these as XML tags with markdown inside them
 - Dependency on static data that can change across environments (e.g. number of rows in a table) should be avoided.
 ## Remove debug artifacts
-- If there are console.logs or page.screenshot usage, call it out. They should be removed before merging.
+- If there are console.logs or page.screenshot usage, call it out.
+## Extra waits
+- Waits on static timeout values are bad, but sometimes needed. Some apps are flaky and need additional waiting.
+## Element locators
+- CSS selectors can be brittle - prefer user facing selectors like getByRole, getByText
+# Repo context
+${repoContext}
 `;
     }
 }

package/dist/agent/code-review/parser.d.ts ADDED Viewed

@@ -0,0 +1,5 @@
+import { type CodeReviewResultV2 } from "./types";
+export type { CodeReviewLineComment, CodeReviewResultV0, CodeReviewResultV1, CodeReviewResultV2, } from "./types";
+export { CodeReviewSeverity, CodeReviewVerdict } from "./types";
+export declare function convertXmlToV2Format(output: string): CodeReviewResultV2;
+//# sourceMappingURL=parser.d.ts.map

package/dist/agent/code-review/parser.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"parser.d.ts","sourceRoot":"","sources":["../../../src/agent/code-review/parser.ts"],"names":[],"mappings":"AAAA,OAAO,EAEL,KAAK,kBAAkB,EAGxB,MAAM,SAAS,CAAC;AAEjB,YAAY,EACV,qBAAqB,EACrB,kBAAkB,EAClB,kBAAkB,EAClB,kBAAkB,GACnB,MAAM,SAAS,CAAC;AAEjB,OAAO,EAAE,kBAAkB,EAAE,iBAAiB,EAAE,MAAM,SAAS,CAAC;AAchE,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,MAAM,GAAG,kBAAkB,CAmFvE"}

package/dist/agent/code-review/parser.js ADDED Viewed

@@ -0,0 +1,70 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.CodeReviewVerdict = exports.CodeReviewSeverity = void 0;
+exports.convertXmlToV2Format = convertXmlToV2Format;
+const types_1 = require("./types");
+var types_2 = require("./types");
+Object.defineProperty(exports, "CodeReviewSeverity", { enumerable: true, get: function () { return types_2.CodeReviewSeverity; } });
+Object.defineProperty(exports, "CodeReviewVerdict", { enumerable: true, get: function () { return types_2.CodeReviewVerdict; } });
+function inferVerdictFromCommentsV2(lineComments) {
+    const hasMergeBlockingIssues = lineComments.some((comment) => comment.severity === types_1.CodeReviewSeverity.MergeBlocking);
+    return hasMergeBlockingIssues
+        ? types_1.CodeReviewVerdict.Rejected
+        : types_1.CodeReviewVerdict.Approved;
+}
+function convertXmlToV2Format(output) {
+    const lastCommitMatch = output.match(/<last_commit>([\s\S]*?)<\/last_commit>/i);
+    const codeChangeMatch = output.match(/<describe_code_change>([\s\S]*?)<\/describe_code_change>/i);
+    const verdictMatch = output.match(/<verdict>([\s\S]*?)<\/verdict>/i);
+    const lineComments = [];
+    const lineCommentsMatch = output.match(/<line_comments>([\s\S]*?)<\/line_comments>/i);
+    if (lineCommentsMatch) {
+        const commentsContent = lineCommentsMatch[1];
+        const commentMatches = commentsContent.match(/<comment>([\s\S]*?)<\/comment>/gi);
+        if (commentMatches) {
+            for (const commentMatch of commentMatches) {
+                const fileMatch = commentMatch.match(/<file>([\s\S]*?)<\/file>/i);
+                const lineStartMatch = commentMatch.match(/<line-start>([\s\S]*?)<\/line-start>/i);
+                const lineEndMatch = commentMatch.match(/<line-end>([\s\S]*?)<\/line-end>/i);
+                const severityMatch = commentMatch.match(/<severity>([\s\S]*?)<\/severity>/i);
+                const messageMatch = commentMatch.match(/<message>([\s\S]*?)<\/message>/i);
+                if (fileMatch &&
+                    lineStartMatch &&
+                    lineEndMatch &&
+                    severityMatch &&
+                    messageMatch) {
+                    const severityText = severityMatch[1].trim();
+                    const severity = severityText === "merge-blocking"
+                        ? types_1.CodeReviewSeverity.MergeBlocking
+                        : severityText === "warning"
+                            ? types_1.CodeReviewSeverity.Warning
+                            : null;
+                    if (severity !== null) {
+                        lineComments.push({
+                            file: fileMatch[1].trim(),
+                            line_start: parseInt(lineStartMatch[1].trim(), 10),
+                            line_end: parseInt(lineEndMatch[1].trim(), 10),
+                            severity: severity,
+                            message: messageMatch[1].trim(),
+                        });
+                    }
+                }
+            }
+        }
+    }
+    const lastCommit = lastCommitMatch[1].trim();
+    const describeCodeChange = codeChangeMatch[1].trim();
+    const verdict = verdictMatch?.[1]?.trim();
+    const finalVerdict = verdict
+        ? verdict === types_1.CodeReviewVerdict.Approved
+            ? types_1.CodeReviewVerdict.Approved
+            : types_1.CodeReviewVerdict.Rejected
+        : inferVerdictFromCommentsV2(lineComments);
+    return {
+        version: "2.0",
+        last_commit: lastCommit,
+        describe_code_change: describeCodeChange,
+        line_comments: lineComments,
+        verdict: finalVerdict,
+    };
+}

package/dist/agent/code-review/types.d.ts ADDED Viewed

@@ -0,0 +1,36 @@
+export declare enum CodeReviewVerdict {
+    Approved = "Approved",
+    Rejected = "Rejected"
+}
+export declare enum CodeReviewSeverity {
+    MergeBlocking = "merge-blocking",
+    Warning = "warning"
+}
+export type CodeReviewResultV1 = {
+    version: "1.0";
+    last_commit: string | null;
+    code_review_comments: string | null;
+    describe_code_change: string | null;
+    merge_blocking_issues: string | null;
+    best_practices_and_warnings: string | null;
+    verdict: "Approved" | "Rejected" | null;
+};
+export type CodeReviewLineComment = {
+    file: string;
+    line_start: number;
+    line_end: number;
+    severity: CodeReviewSeverity;
+    message: string;
+};
+export type CodeReviewResultV2 = {
+    version: "2.0";
+    last_commit: string;
+    describe_code_change: string;
+    line_comments: CodeReviewLineComment[];
+    verdict: CodeReviewVerdict;
+};
+export type CodeReviewResultV0 = {
+    version: "0.1";
+    result: string;
+};
+//# sourceMappingURL=types.d.ts.map

package/dist/agent/code-review/types.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/agent/code-review/types.ts"],"names":[],"mappings":"AAAA,oBAAY,iBAAiB;IAC3B,QAAQ,aAAa;IACrB,QAAQ,aAAa;CACtB;AAED,oBAAY,kBAAkB;IAC5B,aAAa,mBAAmB;IAChC,OAAO,YAAY;CACpB;AAED,MAAM,MAAM,kBAAkB,GAAG;IAC/B,OAAO,EAAE,KAAK,CAAC;IACf,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,oBAAoB,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,oBAAoB,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,qBAAqB,EAAE,MAAM,GAAG,IAAI,CAAC;IACrC,2BAA2B,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3C,OAAO,EAAE,UAAU,GAAG,UAAU,GAAG,IAAI,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,kBAAkB,CAAC;IAC7B,OAAO,EAAE,MAAM,CAAC;CACjB,CAAC;AAEF,MAAM,MAAM,kBAAkB,GAAG;IAC/B,OAAO,EAAE,KAAK,CAAC;IACf,WAAW,EAAE,MAAM,CAAC;IACpB,oBAAoB,EAAE,MAAM,CAAC;IAC7B,aAAa,EAAE,qBAAqB,EAAE,CAAC;IACvC,OAAO,EAAE,iBAAiB,CAAC;CAC5B,CAAC;AAEF,MAAM,MAAM,kBAAkB,GAAG;IAC/B,OAAO,EAAE,KAAK,CAAC;IAEf,MAAM,EAAE,MAAM,CAAC;CAChB,CAAC"}

package/dist/agent/code-review/types.js ADDED Viewed

@@ -0,0 +1,13 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.CodeReviewSeverity = exports.CodeReviewVerdict = void 0;
+var CodeReviewVerdict;
+(function (CodeReviewVerdict) {
+    CodeReviewVerdict["Approved"] = "Approved";
+    CodeReviewVerdict["Rejected"] = "Rejected";
+})(CodeReviewVerdict || (exports.CodeReviewVerdict = CodeReviewVerdict = {}));
+var CodeReviewSeverity;
+(function (CodeReviewSeverity) {
+    CodeReviewSeverity["MergeBlocking"] = "merge-blocking";
+    CodeReviewSeverity["Warning"] = "warning";
+})(CodeReviewSeverity || (exports.CodeReviewSeverity = CodeReviewSeverity = {}));

package/dist/agent/cua/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAiB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAC/D,OAAO,EAAE,iBAAiB,EAAE,KAAK,EAAE,MAAM,4BAA4B,CAAC;AAStE,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;~~AAmClC~~,MAAM,MAAM,kBAAkB,GAAG,KAAK,CAClC;IACE,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd,GACD;IACE,IAAI,EAAE,YAAY,CAAC;IACnB,UAAU,EAAE,iBAAiB,CAAC;CAC/B,CACJ,CAAC;AAEF,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,EACJ,KAAK,EACL,8BAAsC,GACvC,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,8BAA8B,CAAC,EAAE,OAAO,CAAC;CAC1C,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,MAAM,EAAE,kBAAkB,CAAC;IAC3B,KAAK,EAAE,KAAK,CAAC;CACd,CAAC,CAuND"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAiB,WAAW,EAAE,MAAM,mBAAmB,CAAC;AAC/D,OAAO,EAAE,iBAAiB,EAAE,KAAK,EAAE,MAAM,4BAA4B,CAAC;AAStE,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAoDlC,MAAM,MAAM,kBAAkB,GAAG,KAAK,CAClC;IACE,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;CACd,GACD;IACE,IAAI,EAAE,YAAY,CAAC;IACnB,UAAU,EAAE,iBAAiB,CAAC;CAC/B,CACJ,CAAC;AAEF,wBAAsB,+BAA+B,CAAC,EACpD,IAAI,EACJ,IAAI,EACJ,KAAK,EACL,8BAAsC,GACvC,EAAE;IACD,IAAI,EAAE,IAAI,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,8BAA8B,CAAC,EAAE,OAAO,CAAC;CAC1C,GAAG,OAAO,CAAC;IACV,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,EAAE,CAAC;IACtB,MAAM,EAAE,kBAAkB,CAAC;IAC3B,KAAK,EAAE,KAAK,CAAC;CACd,CAAC,CAuND"}

package/dist/agent/cua/index.js CHANGED Viewed

@@ -28,6 +28,22 @@ function artifact(screenshot, name) {
         data: Buffer.from(screenshot, "base64"),
     };
 }
+function stateOfTheBrowser(page) {
+    const browserContext = page.context();
+    const pages = browserContext.pages();
+    return `
+## Browser window
+### Current page (what you are working on)
+Current page URL: ${page.url()}
+Current page title: ${page.title()}
+### All pages
+Number of open pages: ${pages.length}
+URLs and titles:
+${pages.map((p) => `  - ${p.url()} - ${p.title()}`).join("\n")}`;
+}
 async function createTestUsingComputerUseAgent({ page, task, trace, prefersElementFromPointCodegen = false, }) {
     const codegen = await getCodegenInstance(prefersElementFromPointCodegen);
     await codegen.initialize(page);
@@ -53,7 +69,7 @@ async function createTestUsingComputerUseAgent({ page, task, trace, prefersEleme
                 content: [
                     {
                         type: "input_text",
-                        text: `Task to execute: ${task}\n\nCurrent page URL: ${page.url()}`,
+                        text: `Task to execute: ${task}\n\n${stateOfTheBrowser(page)}`,
                     },
                     {
                         type: "input_image",
@@ -177,7 +193,7 @@ async function createTestUsingComputerUseAgent({ page, task, trace, prefersEleme
                     content: [
                         {
                             type: "input_text",
-                            text: `Action executed: ${executedActionSummary || "None"}\nCurrent page URL: ${page.url()}`,
+                            text: `Action executed: ${executedActionSummary || "None"}\n\n${stateOfTheBrowser(page)}`,
                         },
                     ],
                 },

package/dist/agent/cua/model.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,OAAO,EAEL,QAAQ,EACR,iBAAiB,EAClB,MAAM,0CAA0C,CAAC;~~AA8BlD~~,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,EACZ,YAAY,GACb,EAAE;IACD,KAAK,EAAE,iBAAiB,EAAE,CAAC;IAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuBpB;AAED,wBAAgB,YAAY,CAAC,MAAM,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE;;;EAQrE"}
1	+ {"version":3,"file":"model.d.ts","sourceRoot":"","sources":["../../../src/agent/cua/model.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,OAAO,EAEL,QAAQ,EACR,iBAAiB,EAClB,MAAM,0CAA0C,CAAC;AAiClD,wBAAsB,oBAAoB,CAAC,EACzC,KAAK,EACL,kBAAkB,EAClB,WAAW,EACX,YAAY,EACZ,YAAY,GACb,EAAE;IACD,KAAK,EAAE,iBAAiB,EAAE,CAAC;IAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;CACtB,GAAG,OAAO,CAAC,QAAQ,CAAC,CAuBpB;AAED,wBAAgB,YAAY,CAAC,MAAM,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE;;;EAQrE"}

package/dist/agent/cua/model.js CHANGED Viewed

@@ -9,7 +9,10 @@ For example, if the user message says "Click on Submit button", then
 you click on the submit button -- even if it looks like a scary action.
 If you have been asked to retrieve text or verify something on the UI, then communicate
-that in your responses so that the user can see your thinking process in its entirety.`;
+that in your responses so that the user can see your thinking process in its entirety.
+Your work is limited to the current browser page (tab) that you are provided with. You will
+have to conclude your actions before the user can ask you to do actions on different pages (tabs).`;
 const pageGotoTool = {
     type: "function",
     name: "page_goto",