create-walle 0.9.19 → 0.9.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/package.json +1 -1
- package/template/claude-task-manager/db.js +131 -0
- package/template/claude-task-manager/docs/microsoft-dev-tunnel-phone-access-design.md +58 -50
- package/template/claude-task-manager/docs/phone-access-design.md +23 -7
- package/template/claude-task-manager/docs/walle-session-model-preferences.md +119 -0
- package/template/claude-task-manager/lib/microsoft-dev-tunnel-setup.js +32 -48
- package/template/claude-task-manager/lib/remote-relay-protocol.js +5 -0
- package/template/claude-task-manager/lib/walle-external-actions.js +20 -3
- package/template/claude-task-manager/public/index.html +25 -0
- package/template/claude-task-manager/public/js/setup.js +16 -12
- package/template/claude-task-manager/public/js/walle-session.js +31 -3
- package/template/claude-task-manager/public/js/walle.js +93 -23
- package/template/claude-task-manager/public/m/app.css +417 -21
- package/template/claude-task-manager/public/m/app.js +831 -44
- package/template/claude-task-manager/public/m/claim.html +1 -1
- package/template/claude-task-manager/public/m/index.html +41 -7
- package/template/claude-task-manager/public/m/sw.js +1 -1
- package/template/claude-task-manager/server.js +377 -30
- package/template/claude-task-manager/workers/state-detectors/codex.js +18 -3
- package/template/package.json +1 -1
- package/template/wall-e/chat.js +32 -2
- package/template/wall-e/coding/stream-processor.js +36 -0
- package/template/wall-e/coding-orchestrator.js +45 -0
- package/template/wall-e/deploy.sh +1 -1
- package/template/wall-e/docs/external-action-controller.md +60 -2
- package/template/wall-e/external-action-controller.js +23 -1
- package/template/wall-e/external-action-gateway.js +163 -0
- package/template/wall-e/fly.toml +1 -0
- package/template/wall-e/tools/local-tools.js +122 -4
- package/template/website/index.html +2 -2
package/template/wall-e/chat.js
CHANGED
|
@@ -31,6 +31,7 @@ const {
|
|
|
31
31
|
inputForExternalActionEnvelope,
|
|
32
32
|
reviewExternalAction,
|
|
33
33
|
} = require('./external-action-controller');
|
|
34
|
+
const { reviewExternalActionGateway } = require('./external-action-gateway');
|
|
34
35
|
const { runShadow } = require('./eval/shadow');
|
|
35
36
|
const {
|
|
36
37
|
buildCodeReviewContextBlock,
|
|
@@ -984,7 +985,14 @@ function _externalActionLine(envelope = {}, result = {}) {
|
|
|
984
985
|
const suffix = target ? `: ${target}` : '';
|
|
985
986
|
if (result.alreadyExecuted) return `Already ${verb}${suffix}; I did not run it again.`;
|
|
986
987
|
if (result.error) return `Failed to ${_externalActionVerb(envelope, 'present')}${suffix}: ${result.error}`;
|
|
987
|
-
return `Approved action ${verb}${suffix}.`;
|
|
988
|
+
if (_externalActionResultVerified(result)) return `Approved action ${verb} and verified${suffix}.`;
|
|
989
|
+
const verifyReason = result?.verification?.reason || result?.verification?.error || 'no read-after-write verification evidence was returned';
|
|
990
|
+
return `Approved action ${verb}${suffix}, but verification did not confirm it: ${verifyReason}`;
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
/**
 * Decide whether an external-action tool result carries verification evidence.
 *
 * A result that reports failure (`error` set, `ok === false`, or
 * `success === false`) is never treated as verified. Otherwise the result
 * counts as verified when any of `verified`, `verification.ok`, or
 * `alreadyExecuted` is strictly `true`.
 *
 * @param {object} [result={}] Tool result returned by the external action.
 * @returns {boolean} `true` only when the result is non-failing and verified.
 */
function _externalActionResultVerified(result = {}) {
  const reportsFailure = !result
    || result.error
    || result.ok === false
    || result.success === false;
  if (reportsFailure) return false;

  // Only a strict boolean `true` counts; truthy strings or numbers do not.
  const evidenceFlags = [
    result.verified,
    result.verification?.ok,
    result.alreadyExecuted,
  ];
  return evidenceFlags.some((flag) => flag === true);
}
|
|
989
997
|
|
|
990
998
|
function _progressToolResultPayload(result, resultStr) {
|
|
@@ -1057,6 +1065,7 @@ async function _executeApprovedExternalActions({
|
|
|
1057
1065
|
}
|
|
1058
1066
|
|
|
1059
1067
|
const failed = !!(result && (result.error || result.ok === false || result.success === false));
|
|
1068
|
+
const verified = _externalActionResultVerified(result || {});
|
|
1060
1069
|
try {
|
|
1061
1070
|
const confidence = require('./decision/confidence');
|
|
1062
1071
|
if (envelope.domain && !result?.alreadyExecuted) confidence.recordAction(envelope.domain, !failed);
|
|
@@ -1071,7 +1080,9 @@ async function _executeApprovedExternalActions({
|
|
|
1071
1080
|
? 'Approved action was already executed'
|
|
1072
1081
|
: failed
|
|
1073
1082
|
? 'Approved action failed'
|
|
1074
|
-
:
|
|
1083
|
+
: verified
|
|
1084
|
+
? 'Approved action executed and verified'
|
|
1085
|
+
: 'Approved action executed without verification',
|
|
1075
1086
|
error: failed ? (result.error || 'External action failed') : null,
|
|
1076
1087
|
});
|
|
1077
1088
|
lines.push(_externalActionLine(envelope, result || {}));
|
|
@@ -2191,6 +2202,15 @@ async function chat(message, opts = {}) {
|
|
|
2191
2202
|
name = normalizedCall.name;
|
|
2192
2203
|
input = normalizedCall.input;
|
|
2193
2204
|
|
|
2205
|
+
const gatewayReview = reviewExternalActionGateway({
|
|
2206
|
+
toolName: name,
|
|
2207
|
+
input,
|
|
2208
|
+
userMessage: routingMessage || message,
|
|
2209
|
+
});
|
|
2210
|
+
if (!gatewayReview.admitted) {
|
|
2211
|
+
return gatewayReview.result;
|
|
2212
|
+
}
|
|
2213
|
+
|
|
2194
2214
|
// Eval hook: allow test harness to intercept tool calls with mock results
|
|
2195
2215
|
if (typeof opts.toolInterceptor === 'function') {
|
|
2196
2216
|
const intercepted = await opts.toolInterceptor(name, input);
|
|
@@ -2252,6 +2272,16 @@ async function chat(message, opts = {}) {
|
|
|
2252
2272
|
args: name === 'run_shell' ? input.args : undefined,
|
|
2253
2273
|
reason: perm.reason,
|
|
2254
2274
|
source: perm.source,
|
|
2275
|
+
approval_options: [
|
|
2276
|
+
{ id: 'allow_once', label: 'Yes', scope: 'this_tool_call' },
|
|
2277
|
+
{ id: 'allow_always', label: 'Always yes for this project/pattern', scope: 'project_permission_pattern' },
|
|
2278
|
+
{ id: 'deny', label: 'No', scope: 'this_tool_call' },
|
|
2279
|
+
],
|
|
2280
|
+
approval_policy: {
|
|
2281
|
+
kind: 'local_permission',
|
|
2282
|
+
allow_always: true,
|
|
2283
|
+
approval_scope: 'project_permission_pattern',
|
|
2284
|
+
},
|
|
2255
2285
|
});
|
|
2256
2286
|
// Store resolver so server can call it when user responds
|
|
2257
2287
|
if (!opts._permissionResolvers) opts._permissionResolvers = new Map();
|
|
@@ -12,6 +12,7 @@ const { recoverAllowedTextToolCalls } = require('../llm/text-tool-calls');
|
|
|
12
12
|
const { ArtifactStore } = require('./artifact-store');
|
|
13
13
|
const { normalizeResponse, transformRequest, providerId } = require('./provider-transform');
|
|
14
14
|
const { Confidence, EventName, FailureClass, Provenance, createLaneEvent } = require('./lane-events');
|
|
15
|
+
const { reviewExternalActionGateway } = require('../external-action-gateway');
|
|
15
16
|
|
|
16
17
|
async function* streamFromChat(provider, request) {
|
|
17
18
|
const response = normalizeResponse(recoverAllowedTextToolCalls(await provider.chat(request), request.tools), {
|
|
@@ -345,6 +346,41 @@ class StreamProcessor extends EventEmitter {
|
|
|
345
346
|
name: call.name,
|
|
346
347
|
data: { input: call.input },
|
|
347
348
|
});
|
|
349
|
+
const gatewayReview = reviewExternalActionGateway({
|
|
350
|
+
toolName: call.name,
|
|
351
|
+
input: call.input,
|
|
352
|
+
});
|
|
353
|
+
if (!gatewayReview.admitted) {
|
|
354
|
+
const result = gatewayReview.result;
|
|
355
|
+
state.toolResults.push({ toolCallId: call.id, name: call.name, result });
|
|
356
|
+
await this._runtimeItem(sessionId, cwd, {
|
|
357
|
+
threadId: state.threadId,
|
|
358
|
+
turnId: state.turnId,
|
|
359
|
+
itemId: call.id,
|
|
360
|
+
role: 'tool',
|
|
361
|
+
status: 'completed',
|
|
362
|
+
name: call.name,
|
|
363
|
+
data: { result },
|
|
364
|
+
});
|
|
365
|
+
await this._record(sessionId, cwd, 'tool', {
|
|
366
|
+
state: 'blocked',
|
|
367
|
+
toolCallId: call.id,
|
|
368
|
+
name: call.name,
|
|
369
|
+
input: call.input,
|
|
370
|
+
result,
|
|
371
|
+
});
|
|
372
|
+
await this._laneEvent({
|
|
373
|
+
name: EventName.TOOL_FINISHED,
|
|
374
|
+
sessionId,
|
|
375
|
+
cwd,
|
|
376
|
+
provider: state.provider,
|
|
377
|
+
model: state.model,
|
|
378
|
+
runId: state.messageId,
|
|
379
|
+
confidence: Confidence.HIGH,
|
|
380
|
+
data: { toolCallId: call.id, name: call.name, blocked: true, reason: result.reason },
|
|
381
|
+
});
|
|
382
|
+
return;
|
|
383
|
+
}
|
|
348
384
|
if (this.permissionService?.authorize) {
|
|
349
385
|
await this._record(sessionId, cwd, 'tool', {
|
|
350
386
|
state: 'permission_check',
|
|
@@ -565,6 +565,27 @@ function hasToolCall(toolCallHistory = [], names = new Set()) {
|
|
|
565
565
|
return (toolCallHistory || []).some((call) => names.has(call.name));
|
|
566
566
|
}
|
|
567
567
|
|
|
568
|
+
/**
 * Classify a single tool-call history entry as verification evidence or not.
 *
 * - `browser_screenshot` always counts.
 * - `run_shell` counts only when its serialized input (`inputHash` if set,
 *   otherwise the JSON of `input`) mentions a test/lint/build style command.
 * - Any other tool counts when its name contains a verification keyword.
 *
 * @param {object} [call={}] History entry with `name` and `input`/`inputHash`.
 * @returns {boolean} `true` when the call looks like a verification step.
 */
function isVerificationToolCall(call = {}) {
  const toolName = String(call.name || '');
  // Serialized eagerly, before any branch, exactly as callers have always seen.
  const serializedInput = String(call.inputHash || JSON.stringify(call.input || {}));

  switch (toolName) {
    case 'browser_screenshot':
      return true;
    case 'run_shell': {
      const verificationCommand = /\b(?:test|spec|lint|build|typecheck|tsc|pytest|jest|mocha|vitest|playwright|node\s+--(?:test|check)|npm\s+(?:test|run)|pnpm\s+(?:test|run)|yarn\s+(?:test|run)|git\s+diff\s+--check)\b/i;
      return verificationCommand.test(serializedInput);
    }
    default: {
      const verificationToolName = /(?:test|verify|screenshot|diagnostic|lint|build)/i;
      return verificationToolName.test(toolName);
    }
  }
}
|
|
577
|
+
|
|
578
|
+
/**
 * Report whether any entry in the tool-call history is a verification call.
 *
 * @param {Array<object>} [toolCallHistory=[]] Tool calls made so far; a
 *   falsy value is treated as an empty history.
 * @returns {boolean} `true` when at least one entry satisfies
 *   `isVerificationToolCall`.
 */
function hasVerificationEvidence(toolCallHistory = []) {
  const history = toolCallHistory || [];
  return history.some((entry) => isVerificationToolCall(entry));
}
|
|
581
|
+
|
|
582
|
+
/**
 * Detect a response that explains why verification could not be performed.
 *
 * The content is flattened with `contentToText` and then matched against two
 * phrasings: an inability phrase followed (within 120 characters) by a
 * verification verb, or a verification noun followed (within 120 characters)
 * by an unavailability phrase.
 *
 * @param {*} content Assistant response content accepted by `contentToText`.
 * @returns {boolean} `true` when the text reads as a verification blocker.
 */
function isVerificationBlockerResponse(content) {
  const body = contentToText(content);
  if (body.trim() === '') return false;

  // e.g. "could not ... run", "unable to ... verify"
  const inabilityThenAction = /\b(?:could not|couldn'?t|unable to|not able to|cannot)\b[\s\S]{0,120}\b(?:test|verify|build|run|execute)\b/i;
  if (inabilityThenAction.test(body)) return true;

  // e.g. "tests ... not run", "build ... unavailable"
  const subjectThenUnavailable = /\b(?:tests?|verification|build)\b[\s\S]{0,120}\b(?:not run|not available|blocked|unavailable|missing)\b/i;
  return subjectThenUnavailable.test(body);
}
|
|
588
|
+
|
|
568
589
|
/**
 * Replace path-like tokens (segments joined by `/`, such as `src/lib/a.js`)
 * with a single space.
 *
 * The match consumes the delimiter in front of the path (whitespace, a quote,
 * a backtick or an opening parenthesis) together with the path itself, so the
 * whole span collapses into one space.
 *
 * @param {*} text Input text; falsy values are treated as an empty string.
 * @returns {string} The text with path-like tokens blanked out.
 */
function stripPathLikeTokens(text) {
  const source = String(text || '');
  const pathLikeToken = /(?:^|[\s`'"(])((?:\.?[A-Za-z0-9_.@-]+\/)+[A-Za-z0-9_.@-]+)(?=[\s`'",):;.\]]|$)/g;
  return source.replace(pathLikeToken, ' ');
}
|
|
@@ -610,6 +631,29 @@ function getNoActionContinuation({ prompt, content, toolCallHistory = [], mode,
|
|
|
610
631
|
if (!isActionRequiredPrompt(prompt, { mode })) return null;
|
|
611
632
|
|
|
612
633
|
const madeEdits = hasToolCall(toolCallHistory, EDIT_TOOL_NAMES);
|
|
634
|
+
if (madeEdits && !hasVerificationEvidence(toolCallHistory) && !isVerificationBlockerResponse(content)) {
|
|
635
|
+
const reason = 'The assistant made file changes but ended before running verification.';
|
|
636
|
+
if (!toolsAvailable) {
|
|
637
|
+
return {
|
|
638
|
+
action: 'fail',
|
|
639
|
+
reason: `${reason} No tool turns remain.`,
|
|
640
|
+
};
|
|
641
|
+
}
|
|
642
|
+
if (nudges >= maxNudges) {
|
|
643
|
+
return {
|
|
644
|
+
action: 'fail',
|
|
645
|
+
reason: `${reason} Verification continuation limit reached.`,
|
|
646
|
+
};
|
|
647
|
+
}
|
|
648
|
+
return {
|
|
649
|
+
action: 'continue',
|
|
650
|
+
reason,
|
|
651
|
+
message: `[SYSTEM] ${reason} This is not complete.\n` +
|
|
652
|
+
`Run the relevant verification now: tests, lint, build, typecheck, browser screenshot, or at minimum git diff --check when no project test exists.\n` +
|
|
653
|
+
`Only summarize success after a tool result proves the work. If verification is genuinely impossible, state the blocker with tool-backed evidence.\n` +
|
|
654
|
+
`Working directory: ${cwd}`,
|
|
655
|
+
};
|
|
656
|
+
}
|
|
613
657
|
if (madeEdits) return null;
|
|
614
658
|
if (isLegitimateNoEditResponse(content, toolCallHistory)) return null;
|
|
615
659
|
|
|
@@ -3294,6 +3338,7 @@ module.exports = {
|
|
|
3294
3338
|
isActionRequiredPrompt,
|
|
3295
3339
|
isPrematureActionResponse,
|
|
3296
3340
|
getNoActionContinuation,
|
|
3341
|
+
hasVerificationEvidence,
|
|
3297
3342
|
subtaskRequiresFileChanges,
|
|
3298
3343
|
screenshotTrackerHook,
|
|
3299
3344
|
collectEmptyChangedFiles,
|
|
@@ -14,7 +14,7 @@ fi
|
|
|
14
14
|
if ! fly apps list 2>/dev/null | grep -q my-wall-e; then
|
|
15
15
|
echo "Creating Fly.io app..."
|
|
16
16
|
fly apps create my-wall-e
|
|
17
|
-
fly volumes create wall_e_data --region sjc --size
|
|
17
|
+
fly volumes create wall_e_data --region sjc --size 5
|
|
18
18
|
fi
|
|
19
19
|
|
|
20
20
|
# Set secrets
|
|
@@ -32,6 +32,38 @@ user confirmation, and the exact approved envelope is replayed back to Wall-E.
|
|
|
32
32
|
Wall-E then executes the original payload directly rather than asking the model
|
|
33
33
|
to recreate it.
|
|
34
34
|
|
|
35
|
+
## Approval Tiers
|
|
36
|
+
|
|
37
|
+
Wall-E uses two approval tiers:
|
|
38
|
+
|
|
39
|
+
1. **Local permission approval** is for reversible project-local work such as
|
|
40
|
+
shell test commands, builds, and file edits. The UI may offer `Yes`,
|
|
41
|
+
`Always yes for this project/pattern`, and `No`. The persisted rule is scoped
|
|
42
|
+
to the project plus the permission pattern, never to vague model wording.
|
|
43
|
+
2. **External action approval** is for real-world side effects: email, calendar,
|
|
44
|
+
Slack, SMS, reminders, and notifications. These approvals are exact-payload
|
|
45
|
+
approvals. The default choices are `Approve once` or `Cancel`; broad
|
|
46
|
+
`always yes` is intentionally disabled because the next payload may target a
|
|
47
|
+
different person, account, calendar, or time.
|
|
48
|
+
|
|
49
|
+
This keeps the fast Claude Code-style flow for local coding work without giving
|
|
50
|
+
models an ambient ability to send messages or schedule events.
|
|
51
|
+
|
|
52
|
+
## Side-Effect Gateway
|
|
53
|
+
|
|
54
|
+
All side-effecting routes pass through a host-side gateway before normal
|
|
55
|
+
permission checks. The gateway blocks shell, AppleScript, and generic MCP calls
|
|
56
|
+
that try to dispatch external actions directly, for example:
|
|
57
|
+
|
|
58
|
+
- `osascript`/JXA creating Calendar events.
|
|
59
|
+
- `gws calendar events insert`.
|
|
60
|
+
- Gmail `messages.send` through shell, curl, or MCP.
|
|
61
|
+
- Slack `chat.postMessage` through shell or MCP.
|
|
62
|
+
|
|
63
|
+
The gateway returns a structured `external_action_gateway` tool result that
|
|
64
|
+
tells the model which dedicated tool to use. The action is not executed, and the
|
|
65
|
+
model is instructed not to claim success.
|
|
66
|
+
|
|
35
67
|
Sequence:
|
|
36
68
|
|
|
37
69
|
1. Wall-E blocks `mail_send`, `mail_reply`, `calendar_create`, and other external actions and
|
|
@@ -46,8 +78,9 @@ Sequence:
|
|
|
46
78
|
to `chat()`.
|
|
47
79
|
5. `chat()` validates that each approval id/hash still matches the reconstructed
|
|
48
80
|
tool input, checks validation issues, applies a session-scoped idempotency
|
|
49
|
-
guard, executes the local tool,
|
|
50
|
-
progress
|
|
81
|
+
guard, executes the local tool, runs read-after-write verification where the
|
|
82
|
+
connector supports it, and emits normal `tool_call` / `tool_result` progress
|
|
83
|
+
events.
|
|
51
84
|
|
|
52
85
|
This means provider wording is not part of the safety decision. DeepSeek,
|
|
53
86
|
Anthropic, OpenAI, and other providers all use the same envelope replay path.
|
|
@@ -81,14 +114,39 @@ Anthropic, OpenAI, and other providers all use the same envelope replay path.
|
|
|
81
114
|
approve mail/calendar side effects.
|
|
82
115
|
- Approved envelopes are idempotent per Wall-E session and payload hash to avoid
|
|
83
116
|
accidental duplicate sends from double-submit or retry.
|
|
117
|
+
- Calendar approval envelopes preserve `account`, `source`, `calendarId`,
|
|
118
|
+
`calendar`, `location`, and time fields so replay cannot silently fall back to
|
|
119
|
+
a different provider or calendar.
|
|
120
|
+
- Google Calendar creates verify with `calendar.events.get` against the same
|
|
121
|
+
account, calendar id, and event id before Wall-E can summarize the action as
|
|
122
|
+
verified.
|
|
123
|
+
- Gmail sends and replies verify with `gmail.messages.get`. Replies keep the
|
|
124
|
+
original message id separately from the sent reply id so thread evidence is
|
|
125
|
+
not confused with source evidence.
|
|
126
|
+
- Final summary guards treat `sent` or `created` without `verified` evidence as
|
|
127
|
+
incomplete. Shell stdout, AppleScript UIDs, or model prose do not count as
|
|
128
|
+
external-action completion evidence.
|
|
129
|
+
|
|
130
|
+
## Coding-Agent Completion Contract
|
|
131
|
+
|
|
132
|
+
Wall-E coding sessions use the same evidence rule. If an action-oriented coding
|
|
133
|
+
prompt caused file edits, the agent must run a relevant verification tool before
|
|
134
|
+
ending with a success summary. Accepted evidence includes tests, lint, build,
|
|
135
|
+
typecheck, Playwright/browser screenshots for UI work, or `git diff --check`
|
|
136
|
+
when no project-specific verifier exists. If verification is impossible, the
|
|
137
|
+
agent must say so with tool-backed evidence instead of claiming success.
|
|
84
138
|
|
|
85
139
|
## Tests
|
|
86
140
|
|
|
87
141
|
Focused regressions:
|
|
88
142
|
|
|
89
143
|
- `claude-task-manager/tests/walle-permission-policy.test.js`
|
|
144
|
+
- `claude-task-manager/tests/walle-external-actions.test.js`
|
|
90
145
|
- `wall-e/tests/external-action-controller.test.js`
|
|
146
|
+
- `wall-e/tests/external-action-gateway.test.js`
|
|
91
147
|
- `wall-e/tests/local-tools-gws-live-files.test.js`
|
|
148
|
+
- `wall-e/tests/coding-orchestrator.test.js`
|
|
149
|
+
- `wall-e/tests/coding-stream-processor.test.js`
|
|
92
150
|
- `wall-e/tests/execution-trace.test.js`
|
|
93
151
|
- `wall-e/tests/chat.test.js` with `stages a draft email`
|
|
94
152
|
|
|
@@ -133,7 +133,13 @@ function targetForTool(toolName, input = {}) {
|
|
|
133
133
|
return { channel: input.channel_name || input.channel || null };
|
|
134
134
|
}
|
|
135
135
|
if (toolName === 'calendar_create') {
|
|
136
|
-
return {
|
|
136
|
+
return {
|
|
137
|
+
calendar: input.calendar || input.calendar_name || input.calendarId || input.calendar_id || null,
|
|
138
|
+
calendarId: input.calendarId || input.calendar_id || null,
|
|
139
|
+
account: input.account || null,
|
|
140
|
+
source: input.source || input.provider || null,
|
|
141
|
+
attendees: normalizeAddressList(input.attendees),
|
|
142
|
+
};
|
|
137
143
|
}
|
|
138
144
|
if (toolName === 'reminder_create') {
|
|
139
145
|
return { list: input.list || null };
|
|
@@ -163,6 +169,7 @@ function payloadForTool(toolName, input = {}) {
|
|
|
163
169
|
title: input.title || '',
|
|
164
170
|
start: calendarStart(input),
|
|
165
171
|
end: calendarEnd(input),
|
|
172
|
+
location: input.location || '',
|
|
166
173
|
notes: input.notes || '',
|
|
167
174
|
};
|
|
168
175
|
}
|
|
@@ -208,10 +215,14 @@ function inputForExternalActionEnvelope(envelope = {}) {
|
|
|
208
215
|
if (toolName === 'calendar_create') {
|
|
209
216
|
return {
|
|
210
217
|
calendar: target.calendar || null,
|
|
218
|
+
calendarId: target.calendarId || null,
|
|
219
|
+
account: actor.account || target.account || null,
|
|
220
|
+
source: target.source || null,
|
|
211
221
|
attendees: normalizeAddressList(target.attendees),
|
|
212
222
|
title: payload.title || '',
|
|
213
223
|
start_date: payload.start || null,
|
|
214
224
|
end_date: payload.end || null,
|
|
225
|
+
location: payload.location || '',
|
|
215
226
|
notes: payload.notes || '',
|
|
216
227
|
};
|
|
217
228
|
}
|
|
@@ -315,6 +326,17 @@ function buildBlockedToolResult(envelope, decision) {
|
|
|
315
326
|
? 'External action was staged because the user asked for prepared content, not dispatch.'
|
|
316
327
|
: 'External action requires validation and explicit confirmation before execution.',
|
|
317
328
|
action: envelope,
|
|
329
|
+
approval_options: isPreview
|
|
330
|
+
? []
|
|
331
|
+
: [
|
|
332
|
+
{ id: 'approve_once', label: 'Approve once', scope: 'exact_action_payload' },
|
|
333
|
+
{ id: 'cancel', label: 'Cancel', scope: 'exact_action_payload' },
|
|
334
|
+
],
|
|
335
|
+
approval_policy: {
|
|
336
|
+
kind: 'external_action',
|
|
337
|
+
allow_always: false,
|
|
338
|
+
approval_scope: 'exact_action_payload',
|
|
339
|
+
},
|
|
318
340
|
draft: envelope.domain === 'email'
|
|
319
341
|
? {
|
|
320
342
|
to: envelope.toolName === 'mail_reply' ? 'derived_from_original_message' : envelope.target.to,
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Rules used to spot shell / AppleScript tool calls that try to perform an
// external side effect directly instead of going through a dedicated tool.
// Each rule names the affected domain, the operation, the dedicated tool the
// model should use instead, a human-readable reason, and a list of regexes.
// A rule fires when ANY of its patterns matches the command/script text.
const SHELL_SIDE_EFFECT_RULES = [
  {
    domain: 'calendar',
    operation: 'create_event',
    recommendedTool: 'calendar_create',
    reason: 'Calendar event creation must use calendar_create so account, calendar, approval, and verification are tracked.',
    patterns: [
      // `gws ... calendar ... event(s) ... insert` in that order.
      /\bgws\b[\s\S]*\bcalendar\b[\s\S]*\bevents?\b[\s\S]*\binsert\b/i,
      // Any text naming the calendar.googleapis.com host followed by `/events`.
      // NOTE(review): this does not look at the HTTP method, so read-only
      // event listing through the same URL would presumably match too — confirm
      // that is intended.
      /calendar\.googleapis\.com[\s\S]*\/events/i,
      // osascript invoking the Calendar app (AppleScript `tell application`
      // or JXA `Application('Calendar')`) and creating an event.
      /osascript[\s\S]*(?:tell\s+application\s+"Calendar"|Application\(["']Calendar["']\))[\s\S]*(?:make\s+new\s+event|events\.push|new\s+event\s+with\s+properties)/i,
      // Same as above without requiring the `osascript` prefix, so a bare
      // script body is matched as well.
      /(?:tell\s+application\s+"Calendar"|Application\(["']Calendar["']\))[\s\S]*(?:make\s+new\s+event|events\.push|new\s+event\s+with\s+properties)/i,
    ],
  },
  {
    domain: 'email',
    operation: 'send_email',
    recommendedTool: 'mail_send or mail_reply',
    reason: 'Email dispatch must use mail_send or mail_reply so recipients, threading, approval, and sent-mail verification are tracked.',
    patterns: [
      // `gws ... gmail ...` followed by one of the send spellings.
      /\bgws\b[\s\S]*\bgmail\b[\s\S]*(?:\+send|messages\s+send|users\s+messages\s+send)\b/i,
      // Any text naming the gmail.googleapis.com host followed by `/messages/send`.
      /gmail\.googleapis\.com[\s\S]*\/messages\/send\b/i,
      // osascript invoking the Mail app and composing or sending a message.
      /osascript[\s\S]*(?:tell\s+application\s+"Mail"|Application\(["']Mail["']\))[\s\S]*(?:make\s+new\s+outgoing\s+message|send\b)/i,
      // Same as above without requiring the `osascript` prefix.
      /(?:tell\s+application\s+"Mail"|Application\(["']Mail["']\))[\s\S]*(?:make\s+new\s+outgoing\s+message|send\b)/i,
    ],
  },
  {
    domain: 'slack',
    operation: 'send_message',
    recommendedTool: 'slack_send_message',
    reason: 'Slack message dispatch must use slack_send_message so channel, approval, and execution evidence are tracked.',
    patterns: [
      // Slack Web API URL for chat.postMessage.
      /slack\.com\/api\/chat\.postMessage/i,
      // Any text mentioning `slack` followed later by `chat.postMessage`.
      /\bslack\b[\s\S]*\bchat\.postMessage\b/i,
    ],
  },
  {
    domain: 'reminder',
    operation: 'create_reminder',
    recommendedTool: 'reminder_create',
    reason: 'Reminder creation must use reminder_create so approval and execution evidence are tracked.',
    patterns: [
      // osascript invoking the Reminders app and creating a reminder.
      /osascript[\s\S]*(?:tell\s+application\s+"Reminders"|Application\(["']Reminders["']\))[\s\S]*(?:make\s+new\s+reminder|new\s+reminder)/i,
      // Same as above without requiring the `osascript` prefix.
      /(?:tell\s+application\s+"Reminders"|Application\(["']Reminders["']\))[\s\S]*(?:make\s+new\s+reminder|new\s+reminder)/i,
    ],
  },
];
|
|
49
|
+
|
|
50
|
+
// Rules used to spot generic MCP tool calls that would perform an external
// side effect. Unlike the shell rules, each entry carries a single `pattern`
// and no `reason`. Every pattern is written as two alternatives so the
// service keyword and the action verb can appear in either order.
// NOTE(review): these patterns are broad (e.g. any "add ... event" or
// "send ... channel" text matches); confirm the false-positive rate is
// acceptable for read-only MCP tools.
const MCP_SIDE_EFFECT_RULES = [
  {
    domain: 'calendar',
    operation: 'create_event',
    recommendedTool: 'calendar_create',
    pattern: /(?:calendar|google[-_ ]?calendar).*?(?:create|insert|add|schedule)|(?:create|insert|add|schedule).*?(?:calendar|event)/i,
  },
  {
    domain: 'email',
    operation: 'send_email',
    recommendedTool: 'mail_send or mail_reply',
    pattern: /(?:gmail|mail|email).*?(?:send|reply|respond)|(?:send|reply|respond).*?(?:gmail|mail|email)/i,
  },
  {
    domain: 'slack',
    operation: 'send_message',
    recommendedTool: 'slack_send_message',
    pattern: /(?:slack).*?(?:send|post|message)|(?:send|post).*?(?:slack|channel)/i,
  },
];
|
|
70
|
+
|
|
71
|
+
/**
 * Flatten a shell / AppleScript tool input into the text that the gateway
 * scans for side-effect patterns.
 *
 * Both the command line (`command` plus optional `args`) and the `script`
 * body are included when present. Previously a string `command` — even an
 * empty one — hid `script` from the scan, so an input such as
 * `{ command: '', script: 'tell application "Calendar" ...' }` produced an
 * empty string and was never checked.
 *
 * Inputs carrying only `command` or only `script` return exactly the same
 * text as before.
 *
 * @param {object} [input={}] Tool input; may carry `command`, `args`, `script`.
 * @returns {string} Text to scan, or `''` when there is nothing to inspect.
 */
function commandText(input = {}) {
  if (!input || typeof input !== 'object') return '';

  const parts = [];
  if (typeof input.command === 'string') {
    const args = Array.isArray(input.args) && input.args.length ? ` ${input.args.join(' ')}` : '';
    parts.push(`${input.command}${args}`);
  }
  if (typeof input.script === 'string') parts.push(input.script);

  // Joined with a newline so the command line and script body stay distinct.
  return parts.join('\n');
}
|
|
80
|
+
|
|
81
|
+
/**
 * Inspect a tool call and report whether it tries to perform an external
 * side effect (calendar, email, Slack, reminder) through a generic route
 * instead of the dedicated tool.
 *
 * Shell-like tools (`run_shell`, `shell`, `bash`, `terminal`, `applescript`)
 * are matched against `SHELL_SIDE_EFFECT_RULES` using the text produced by
 * `commandText`. MCP tools (`mcp_call` or any name starting with `mcp__`) are
 * matched against `MCP_SIDE_EFFECT_RULES` using the call name followed by the
 * JSON of its arguments, truncated to 1000 characters.
 *
 * A `null` or non-object `input` is treated as an empty object. The parameter
 * default only covers `undefined`, so a `null` input on an MCP call used to
 * throw a TypeError.
 *
 * @param {string} toolName Name of the tool being invoked.
 * @param {object} [input={}] Tool input as supplied by the model.
 * @returns {?{toolName: string, domain: string, operation: string,
 *   recommendedTool: string, reason: string, evidence: string}} The first
 *   matching finding, or `null` when the call is not a recognised bypass.
 */
function classifyExternalActionBypass(toolName, input = {}) {
  const name = String(toolName || '').trim();
  const safeInput = input && typeof input === 'object' ? input : {};

  const isShellLike = ['run_shell', 'shell', 'bash', 'terminal', 'applescript'].includes(name);
  if (isShellLike) {
    const text = commandText(safeInput);
    if (!text) return null;
    const rule = SHELL_SIDE_EFFECT_RULES.find((candidate) => (
      candidate.patterns.some((pattern) => pattern.test(text))
    ));
    if (rule) {
      return {
        toolName: name,
        domain: rule.domain,
        operation: rule.operation,
        recommendedTool: rule.recommendedTool,
        reason: rule.reason,
        // Evidence is capped so the blocked result stays small.
        evidence: text.slice(0, 500),
      };
    }
  }

  if (name === 'mcp_call' || name.startsWith('mcp__')) {
    // `mcp_call` carries the target in its input; `mcp__*` encodes it in the name.
    const callName = name === 'mcp_call'
      ? `${safeInput.server || ''}.${safeInput.tool || ''}`
      : name;
    const text = `${callName} ${JSON.stringify(safeInput.arguments || safeInput.args || {})}`.slice(0, 1000);
    const rule = MCP_SIDE_EFFECT_RULES.find((candidate) => candidate.pattern.test(text));
    if (rule) {
      return {
        toolName: name,
        domain: rule.domain,
        operation: rule.operation,
        recommendedTool: rule.recommendedTool,
        reason: `MCP side-effect tools must be routed through ${rule.recommendedTool} so approval and verification are tracked.`,
        evidence: text.slice(0, 500),
      };
    }
  }

  return null;
}
|
|
121
|
+
|
|
122
|
+
/**
 * Build the structured tool result returned to the model when the gateway
 * refuses a call.
 *
 * The result marks the action as blocked, not executed and not verified,
 * echoes the finding's details, states the exact-payload approval policy,
 * and carries a `model_instruction` telling the model to use the dedicated
 * tool and not to claim success.
 *
 * @param {{toolName: string, domain: string, operation: string,
 *   recommendedTool: string, reason: string, evidence: string}} finding
 *   Finding produced by the bypass classifier.
 * @returns {object} The `external_action_gateway` blocked-result payload.
 */
function buildGatewayBlockedResult(finding) {
  const {
    domain,
    operation,
    toolName,
    recommendedTool,
    reason,
    evidence,
  } = finding;

  const doNotRetry = `Do not retry this ${domain} side effect through ${toolName}.`;
  const useDedicatedTool = `Use ${recommendedTool} instead so Wall-E can stage an exact approval envelope, execute the approved payload, and verify the result before summarizing success.`;
  const doNotClaim = 'Do not claim this action was executed.';

  const approvalPolicy = {
    kind: 'external_action',
    allow_always: false,
    approval_scope: 'exact_action_payload',
  };

  return {
    external_action: true,
    external_action_gateway: true,
    blocked: true,
    executed: false,
    verified: false,
    decision: 'dedicated_tool_required',
    domain,
    operation,
    original_tool: toolName,
    recommended_tool: recommendedTool,
    reason,
    evidence,
    approval_policy: approvalPolicy,
    model_instruction: `${doNotRetry} ${useDedicatedTool} ${doNotClaim}`,
  };
}
|
|
148
|
+
|
|
149
|
+
/**
 * Gateway entry point: decide whether a tool call may proceed to the normal
 * permission checks.
 *
 * Only `toolName` and `input` are read from the options object; any other
 * properties passed by callers are ignored.
 *
 * @param {object} [options={}] Call description.
 * @param {string} [options.toolName] Name of the tool being invoked.
 * @param {object} [options.input={}] Tool input as supplied by the model.
 * @returns {{admitted: true} | {admitted: false, finding: object, result: object}}
 *   `{ admitted: true }` when nothing was flagged; otherwise the finding and
 *   the blocked tool result to hand back to the model.
 */
function reviewExternalActionGateway({ toolName, input = {} } = {}) {
  const finding = classifyExternalActionBypass(toolName, input);
  return finding
    ? { admitted: false, finding, result: buildGatewayBlockedResult(finding) }
    : { admitted: true };
}
|
|
158
|
+
|
|
159
|
+
module.exports = {
|
|
160
|
+
buildGatewayBlockedResult,
|
|
161
|
+
classifyExternalActionBypass,
|
|
162
|
+
reviewExternalActionGateway,
|
|
163
|
+
};
|
package/template/wall-e/fly.toml
CHANGED