@wix/evalforge-evaluator 0.96.0 → 0.97.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +22 -16
- package/build/index.js.map +2 -2
- package/build/index.mjs +23 -16
- package/build/index.mjs.map +2 -2
- package/build/types/run-scenario/agents/claude-code/types.d.ts +7 -0
- package/package.json +4 -4
package/build/index.mjs
CHANGED
|
@@ -680,6 +680,7 @@ import { AgentRunCommand } from "@wix/evalforge-types";
|
|
|
680
680
|
// src/run-scenario/agents/claude-code/execute.ts
|
|
681
681
|
import {
|
|
682
682
|
ClaudeModel,
|
|
683
|
+
DEFAULT_EVALUATOR_SYSTEM_PROMPT,
|
|
683
684
|
LLMStepType,
|
|
684
685
|
LiveTraceEventType,
|
|
685
686
|
TRACE_EVENT_PREFIX
|
|
@@ -1128,6 +1129,20 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1128
1129
|
permissionMode: "default",
|
|
1129
1130
|
canUseTool
|
|
1130
1131
|
};
|
|
1132
|
+
if (options.systemPrompt === null || options.systemPrompt === "") {
|
|
1133
|
+
} else if (options.systemPrompt != null) {
|
|
1134
|
+
queryOptions.systemPrompt = {
|
|
1135
|
+
type: "preset",
|
|
1136
|
+
preset: "claude_code",
|
|
1137
|
+
append: options.systemPrompt
|
|
1138
|
+
};
|
|
1139
|
+
} else {
|
|
1140
|
+
queryOptions.systemPrompt = {
|
|
1141
|
+
type: "preset",
|
|
1142
|
+
preset: "claude_code",
|
|
1143
|
+
append: DEFAULT_EVALUATOR_SYSTEM_PROMPT
|
|
1144
|
+
};
|
|
1145
|
+
}
|
|
1131
1146
|
if (options.temperature !== void 0) {
|
|
1132
1147
|
queryOptions.temperature = options.temperature;
|
|
1133
1148
|
}
|
|
@@ -1148,6 +1163,7 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1148
1163
|
"[SDK-DEBUG] canUseTool:",
|
|
1149
1164
|
queryOptions.canUseTool ? "custom handler (auto-allow)" : "not set"
|
|
1150
1165
|
);
|
|
1166
|
+
console.log("[SDK-DEBUG] systemPrompt:", queryOptions.systemPrompt);
|
|
1151
1167
|
console.log("[SDK-DEBUG] settingSources:", queryOptions.settingSources);
|
|
1152
1168
|
console.log("[SDK-DEBUG] allowedTools:", queryOptions.allowedTools);
|
|
1153
1169
|
console.log("[SDK-DEBUG] Calling SDK query()...");
|
|
@@ -1245,20 +1261,8 @@ async function executeWithClaudeCode(skills, scenario, options) {
|
|
|
1245
1261
|
}, HEARTBEAT_INTERVAL_MS);
|
|
1246
1262
|
}
|
|
1247
1263
|
const sdkPromise = (async () => {
|
|
1248
|
-
const evaluatorPromptSuffix = `
|
|
1249
|
-
|
|
1250
|
-
IMPORTANT: This is an automated evaluation run. Follow these guidelines:
|
|
1251
|
-
1. Execute the requested changes immediately without asking for confirmation.
|
|
1252
|
-
2. Do NOT ask "would you like me to proceed?" or similar questions.
|
|
1253
|
-
3. Do NOT use the Task tool to delegate simple operations - do them directly yourself.
|
|
1254
|
-
4. Keep your approach simple and direct - avoid excessive planning.
|
|
1255
|
-
5. Make targeted edits using Read and Edit tools rather than exploring the entire codebase.
|
|
1256
|
-
6. If you encounter an error, fix it directly rather than starting over.
|
|
1257
|
-
7. Your project root is the current working directory. Always create and modify source code files relative to the project root, NOT inside .claude/skills/ directories.
|
|
1258
|
-
8. Before finishing, run the project's package manager install command (e.g. \`npm install\`, \`yarn install\`, or \`pnpm install\` depending on the lockfile present) to ensure all dependencies are installed and the project is ready to build.`;
|
|
1259
|
-
const fullPrompt = scenario.triggerPrompt + evaluatorPromptSuffix;
|
|
1260
1264
|
for await (const message of query({
|
|
1261
|
-
prompt: fullPrompt,
|
|
1265
|
+
prompt: scenario.triggerPrompt,
|
|
1262
1266
|
options: queryOptions
|
|
1263
1267
|
})) {
|
|
1264
1268
|
messageCount++;
|
|
@@ -1769,7 +1773,8 @@ var ClaudeCodeAdapter = class {
|
|
|
1769
1773
|
traceContext,
|
|
1770
1774
|
mcps,
|
|
1771
1775
|
subAgents,
|
|
1772
|
-
rules
|
|
1776
|
+
rules,
|
|
1777
|
+
systemPrompt
|
|
1773
1778
|
} = context;
|
|
1774
1779
|
const modelForSdk = modelConfig?.model;
|
|
1775
1780
|
const options = {
|
|
@@ -1782,7 +1787,8 @@ var ClaudeCodeAdapter = class {
|
|
|
1782
1787
|
traceContext,
|
|
1783
1788
|
mcps,
|
|
1784
1789
|
subAgents,
|
|
1785
|
-
rules
|
|
1790
|
+
rules,
|
|
1791
|
+
systemPrompt
|
|
1786
1792
|
};
|
|
1787
1793
|
const { result, llmTrace } = await executeWithClaudeCode(
|
|
1788
1794
|
skills,
|
|
@@ -2565,7 +2571,8 @@ async function runAgentWithContext(config, evalRunId2, scenario, evalData, workD
|
|
|
2565
2571
|
},
|
|
2566
2572
|
mcps: evalData.mcps.length > 0 ? evalData.mcps : void 0,
|
|
2567
2573
|
subAgents: evalData.subAgents.length > 0 ? evalData.subAgents : void 0,
|
|
2568
|
-
rules: evalData.rules?.length > 0 ? evalData.rules : void 0
|
|
2574
|
+
rules: evalData.rules?.length > 0 ? evalData.rules : void 0,
|
|
2575
|
+
systemPrompt: agent?.systemPrompt
|
|
2569
2576
|
};
|
|
2570
2577
|
const { outputText, durationMs, llmTrace } = await adapter.execute(executionContext);
|
|
2571
2578
|
const completedAt = (/* @__PURE__ */ new Date()).toISOString();
|