@swarmclawai/swarmclaw 1.9.21 → 1.9.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -0
- package/package.json +2 -2
- package/src/components/chat/activity-moment.tsx +4 -0
- package/src/components/chat/tool-call-bubble.tsx +6 -0
- package/src/lib/server/capability-router.test.ts +4 -4
- package/src/lib/server/capability-router.ts +1 -0
- package/src/lib/server/chat-execution/chat-execution-advanced.test.ts +27 -0
- package/src/lib/server/chat-execution/chat-execution-utils.ts +21 -0
- package/src/lib/server/chat-execution/iteration-event-handler.ts +1 -1
- package/src/lib/server/chat-execution/stream-continuation.ts +6 -2
- package/src/lib/server/plugins-advanced.test.ts +7 -3
- package/src/lib/server/session-tools/web-crawl.test.ts +106 -0
- package/src/lib/server/session-tools/web-inputs.test.ts +5 -0
- package/src/lib/server/session-tools/web-utils.ts +8 -2
- package/src/lib/server/session-tools/web.ts +256 -29
- package/src/lib/server/storage.ts +2 -0
- package/src/lib/server/tool-aliases.ts +1 -1
- package/src/lib/server/tool-capability-policy-advanced.test.ts +3 -3
- package/src/lib/server/tool-capability-policy.ts +4 -1
- package/src/lib/server/tool-planning.test.ts +2 -1
- package/src/lib/server/tool-planning.ts +31 -0
- package/src/lib/server/untrusted-content.ts +2 -2
- package/src/types/session.ts +2 -0
package/README.md
CHANGED
|
@@ -409,6 +409,15 @@ Operational docs: https://swarmclaw.ai/docs/observability
|
|
|
409
409
|
|
|
410
410
|
## Releases
|
|
411
411
|
|
|
412
|
+
### v1.9.22 Highlights
|
|
413
|
+
|
|
414
|
+
Research tools release: agents now get direct `web_extract` and `web_crawl` tools alongside `web_search`, `web_fetch`, and the unified `web` tool.
|
|
415
|
+
|
|
416
|
+
- **Source-grounded extraction.** `web_extract` returns a page title, canonical URL, and readable content for known source URLs.
|
|
417
|
+
- **Bounded crawls.** `web_crawl` walks same-origin links by default with conservative page and depth caps, plus an explicit external-link opt-in.
|
|
418
|
+
- **Better routing.** Tool aliases, capability policy, planning hints, continuation recovery, and the chat UI all recognize the granular research tools.
|
|
419
|
+
- **Regression coverage.** New tests cover action inference, tool-call translation, direct tool registration, extraction cleanup, and same-origin crawl bounds.
|
|
420
|
+
|
|
412
421
|
### v1.9.21 Highlights
|
|
413
422
|
|
|
414
423
|
Provider diagnostics release: connection checks now return a structured step timeline across setup, provider settings, and agent editing.
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@swarmclawai/swarmclaw",
|
|
3
|
-
"version": "1.9.21",
|
|
3
|
+
"version": "1.9.22",
|
|
4
4
|
"description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
|
|
5
5
|
"main": "electron-dist/main.js",
|
|
6
6
|
"license": "MIT",
|
|
@@ -88,7 +88,7 @@
|
|
|
88
88
|
"test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/electron-after-pack.test.mjs scripts/electron-signing-config.test.mjs scripts/ensure-sandbox-browser-image.test.mjs scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
|
|
89
89
|
"test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts src/lib/server/storage-auth.test.ts src/lib/server/storage-auth-docker.test.ts",
|
|
90
90
|
"test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/gateways/gateway-topology.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openai.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/gateways/topology-route.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
|
|
91
|
-
"test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/agent-planning-mode.test.ts src/lib/agent-config-history.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/provider-diagnostics.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/baseline.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/prompt-sections.planning-mode.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/chats/session-context-pack.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/runs/run-handoff.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/schedules/schedule-history.test.ts src/lib/server/schedules/schedule-preview.test.ts src/lib/quality/release-readiness.test.ts src/lib/quality/architecture-health.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts 
src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-execution-policy.test.ts src/lib/server/tasks/task-handoff.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-pack-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/config-versions/config-versions-route.test.ts src/app/api/runs/run-handoff-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/schedules/preview/route.test.ts src/app/api/schedules/schedule-history-route.test.ts src/app/api/tts/route.test.ts",
|
|
91
|
+
"test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/agent-planning-mode.test.ts src/lib/agent-config-history.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/provider-diagnostics.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/baseline.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/prompt-sections.planning-mode.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/chats/session-context-pack.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/runs/run-handoff.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/schedules/schedule-history.test.ts src/lib/server/schedules/schedule-preview.test.ts src/lib/quality/release-readiness.test.ts src/lib/quality/architecture-health.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts 
src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-execution-policy.test.ts src/lib/server/tasks/task-handoff.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/server/session-tools/web-crawl.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-pack-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/config-versions/config-versions-route.test.ts src/app/api/runs/run-handoff-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/schedules/preview/route.test.ts src/app/api/schedules/schedule-history-route.test.ts src/app/api/tts/route.test.ts",
|
|
92
92
|
"test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
|
|
93
93
|
"test:e2e": "node --import tsx scripts/browser-e2e-smoke.ts",
|
|
94
94
|
"test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",
|
|
@@ -19,6 +19,9 @@ const NOTABLE_TOOLS: Record<string, { label: string; color: string; icon: 'brain
|
|
|
19
19
|
delegate_to_agent: { label: 'Delegating task', color: '#6366F1', icon: 'delegate' },
|
|
20
20
|
check_delegation_status: { label: 'Checking delegation', color: '#6366F1', icon: 'delegate' },
|
|
21
21
|
web_search: { label: 'Searched the web', color: '#22C55E', icon: 'search' },
|
|
22
|
+
web_fetch: { label: 'Read a web page', color: '#22C55E', icon: 'search' },
|
|
23
|
+
web_extract: { label: 'Extracted a web page', color: '#22C55E', icon: 'search' },
|
|
24
|
+
web_crawl: { label: 'Crawled a site', color: '#22C55E', icon: 'search' },
|
|
22
25
|
connector_message_tool: { label: 'Sent a message', color: '#F97316', icon: 'message' },
|
|
23
26
|
}
|
|
24
27
|
|
|
@@ -35,6 +38,7 @@ function extractSnippet(toolName: string, toolInput: string): string | null {
|
|
|
35
38
|
if (toolName === 'check_delegation_status' && parsed.agentName) return parsed.agentName
|
|
36
39
|
if (toolName.startsWith('delegate_to_') && parsed.task) return parsed.task
|
|
37
40
|
if (toolName === 'web_search' && parsed.query) return parsed.query
|
|
41
|
+
if ((toolName === 'web_fetch' || toolName === 'web_extract' || toolName === 'web_crawl') && parsed.url) return parsed.url
|
|
38
42
|
if (toolName === 'connector_message_tool' && parsed.to) return parsed.to
|
|
39
43
|
} catch { /* ignore parse errors */ }
|
|
40
44
|
return null
|
|
@@ -20,6 +20,8 @@ const TOOL_COLORS: Record<string, string> = {
|
|
|
20
20
|
create_spreadsheet: '#10B981',
|
|
21
21
|
web_search: '#3B82F6',
|
|
22
22
|
web_fetch: '#3B82F6',
|
|
23
|
+
web_extract: '#3B82F6',
|
|
24
|
+
web_crawl: '#3B82F6',
|
|
23
25
|
spawn_subagent: '#8B5CF6',
|
|
24
26
|
delegate_to_agent: '#6366F1',
|
|
25
27
|
check_delegation_status: '#6366F1',
|
|
@@ -77,6 +79,8 @@ export const TOOL_LABELS: Record<string, string> = {
|
|
|
77
79
|
create_spreadsheet: 'Create Spreadsheet',
|
|
78
80
|
web_search: 'Web Search',
|
|
79
81
|
web_fetch: 'Web Fetch',
|
|
82
|
+
web_extract: 'Web Extract',
|
|
83
|
+
web_crawl: 'Web Crawl',
|
|
80
84
|
claude_code: 'Claude Code',
|
|
81
85
|
codex_cli: 'Codex CLI',
|
|
82
86
|
opencode_cli: 'OpenCode CLI',
|
|
@@ -127,6 +131,8 @@ export const TOOL_DESCRIPTIONS: Record<string, string> = {
|
|
|
127
131
|
create_spreadsheet: 'Create Excel or CSV files from structured data',
|
|
128
132
|
web_search: 'Search the web for information',
|
|
129
133
|
web_fetch: 'Fetch and read web page content',
|
|
134
|
+
web_extract: 'Extract readable content from a source URL',
|
|
135
|
+
web_crawl: 'Crawl a bounded set of pages from one site',
|
|
130
136
|
claude_code: 'Enable delegation to Claude Code CLI',
|
|
131
137
|
codex_cli: 'Enable delegation to OpenAI Codex CLI',
|
|
132
138
|
opencode_cli: 'Enable delegation to OpenCode CLI',
|
|
@@ -26,7 +26,7 @@ test('routeTaskIntent keeps coding prompts prioritized over memory keywords', ()
|
|
|
26
26
|
test('routeTaskIntent keeps hybrid research-plus-media prompts in research intent', () => {
|
|
27
27
|
const decision = routeTaskIntent(
|
|
28
28
|
'Can you tell me more if there is any news related to the US-Iran war, and can you send me some screenshots and give me a summary and maybe send me a voice note about it?',
|
|
29
|
-
['web_search', 'web_fetch', 'browser', 'manage_connectors'],
|
|
29
|
+
['web_search', 'web_fetch', 'web_crawl', 'browser', 'manage_connectors'],
|
|
30
30
|
null,
|
|
31
31
|
makeClassification({
|
|
32
32
|
taskIntent: 'research',
|
|
@@ -39,7 +39,7 @@ test('routeTaskIntent keeps hybrid research-plus-media prompts in research inten
|
|
|
39
39
|
)
|
|
40
40
|
|
|
41
41
|
assert.equal(decision.intent, 'research')
|
|
42
|
-
assert.deepEqual(decision.preferredTools, ['web_search', 'web_fetch', 'browser', 'connector_message_tool'])
|
|
42
|
+
assert.deepEqual(decision.preferredTools, ['web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser', 'connector_message_tool'])
|
|
43
43
|
})
|
|
44
44
|
|
|
45
45
|
test('routeTaskIntent treats direct voice-note delivery as outreach', () => {
|
|
@@ -72,7 +72,7 @@ test('routeTaskIntent treats keep-watching update requests as research even with
|
|
|
72
72
|
)
|
|
73
73
|
|
|
74
74
|
assert.equal(decision.intent, 'research')
|
|
75
|
-
assert.deepEqual(decision.preferredTools, ['web_search', 'web_fetch'])
|
|
75
|
+
assert.deepEqual(decision.preferredTools, ['web_search', 'web_fetch', 'web_extract', 'web_crawl'])
|
|
76
76
|
})
|
|
77
77
|
|
|
78
78
|
test('routeTaskIntent uses structured classification when available', () => {
|
|
@@ -99,7 +99,7 @@ test('routeTaskIntent uses structured classification when available', () => {
|
|
|
99
99
|
)
|
|
100
100
|
|
|
101
101
|
assert.equal(decision.intent, 'browsing')
|
|
102
|
-
assert.deepEqual(decision.preferredTools, ['browser', 'web_fetch'])
|
|
102
|
+
assert.deepEqual(decision.preferredTools, ['browser', 'web_fetch', 'web_extract'])
|
|
103
103
|
})
|
|
104
104
|
|
|
105
105
|
function makeClassification(overrides: Partial<MessageClassification>): MessageClassification {
|
|
@@ -144,6 +144,7 @@ export function routeTaskIntent(
|
|
|
144
144
|
[
|
|
145
145
|
TOOL_CAPABILITY.researchSearch,
|
|
146
146
|
TOOL_CAPABILITY.researchFetch,
|
|
147
|
+
TOOL_CAPABILITY.researchCrawl,
|
|
147
148
|
...(wantsScreenshots ? [TOOL_CAPABILITY.browserCapture] : []),
|
|
148
149
|
...(wantsVoiceDelivery ? [TOOL_CAPABILITY.deliveryVoiceNote] : []),
|
|
149
150
|
...(wantsOutboundDelivery ? [TOOL_CAPABILITY.deliveryMedia, TOOL_CAPABILITY.deliveryMessage] : []),
|
|
@@ -407,6 +407,33 @@ describe('translateRequestedToolInvocation advanced', () => {
|
|
|
407
407
|
assert.equal(args.action, 'search')
|
|
408
408
|
assert.equal(args.query, 'test query')
|
|
409
409
|
})
|
|
410
|
+
|
|
411
|
+
it('maps web_extract to web with action=extract', () => {
|
|
412
|
+
const { toolName, args } = translateRequestedToolInvocation(
|
|
413
|
+
'web_extract',
|
|
414
|
+
{ url: 'https://example.com/source' },
|
|
415
|
+
'',
|
|
416
|
+
['web'],
|
|
417
|
+
)
|
|
418
|
+
assert.equal(toolName, 'web')
|
|
419
|
+
assert.equal(args.action, 'extract')
|
|
420
|
+
assert.equal(args.url, 'https://example.com/source')
|
|
421
|
+
})
|
|
422
|
+
|
|
423
|
+
it('maps web_crawl to web with bounded crawl arguments', () => {
|
|
424
|
+
const { toolName, args } = translateRequestedToolInvocation(
|
|
425
|
+
'web_crawl',
|
|
426
|
+
{ url: 'https://example.com/', maxPages: 4, maxDepth: 1, includeExternal: false },
|
|
427
|
+
'',
|
|
428
|
+
['web'],
|
|
429
|
+
)
|
|
430
|
+
assert.equal(toolName, 'web')
|
|
431
|
+
assert.equal(args.action, 'crawl')
|
|
432
|
+
assert.equal(args.url, 'https://example.com/')
|
|
433
|
+
assert.equal(args.maxPages, 4)
|
|
434
|
+
assert.equal(args.maxDepth, 1)
|
|
435
|
+
assert.equal(args.includeExternal, false)
|
|
436
|
+
})
|
|
410
437
|
})
|
|
411
438
|
|
|
412
439
|
// ---------------------------------------------------------------------------
|
|
@@ -127,6 +127,27 @@ export function translateRequestedToolInvocation(
|
|
|
127
127
|
},
|
|
128
128
|
}
|
|
129
129
|
}
|
|
130
|
+
if (requestedName === 'web_extract') {
|
|
131
|
+
return {
|
|
132
|
+
toolName: 'web',
|
|
133
|
+
args: {
|
|
134
|
+
action: 'extract',
|
|
135
|
+
url: rawArgs.url,
|
|
136
|
+
},
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
if (requestedName === 'web_crawl') {
|
|
140
|
+
return {
|
|
141
|
+
toolName: 'web',
|
|
142
|
+
args: {
|
|
143
|
+
action: 'crawl',
|
|
144
|
+
url: rawArgs.url || rawArgs.query,
|
|
145
|
+
maxPages: rawArgs.maxPages ?? rawArgs.maxResults,
|
|
146
|
+
maxDepth: rawArgs.maxDepth,
|
|
147
|
+
includeExternal: rawArgs.includeExternal,
|
|
148
|
+
},
|
|
149
|
+
}
|
|
150
|
+
}
|
|
130
151
|
if (requestedName === 'delegate_to_claude_code') {
|
|
131
152
|
return { toolName: 'delegate', args: { ...rawArgs, backend: 'claude' } }
|
|
132
153
|
}
|
|
@@ -349,7 +349,7 @@ export async function processIterationEvents(opts: ProcessIterationEventsOpts):
|
|
|
349
349
|
}
|
|
350
350
|
if (
|
|
351
351
|
boundedExternalExecutionTask
|
|
352
|
-
&& ['http_request', 'web', 'web_search', 'web_fetch', 'browser'].includes(toolName)
|
|
352
|
+
&& ['http_request', 'web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser'].includes(toolName)
|
|
353
353
|
&& countExternalExecutionResearchSteps(state.streamedToolEvents) >= 5
|
|
354
354
|
&& countDistinctExternalResearchHosts(state.streamedToolEvents) >= 3
|
|
355
355
|
) {
|
|
@@ -196,7 +196,7 @@ function getRequestedArtifactStatus(params: {
|
|
|
196
196
|
|
|
197
197
|
export function countExternalExecutionResearchSteps(toolEvents: MessageToolEvent[]): number {
|
|
198
198
|
return toolEvents.filter((event) => {
|
|
199
|
-
return ['http_request', 'web', 'web_search', 'web_fetch', 'browser'].includes(event.name)
|
|
199
|
+
return ['http_request', 'web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser'].includes(event.name)
|
|
200
200
|
}).length
|
|
201
201
|
}
|
|
202
202
|
|
|
@@ -300,6 +300,8 @@ const RECOVERABLE_TOOL_ERROR_NAMES = new Set([
|
|
|
300
300
|
'web',
|
|
301
301
|
'web_search',
|
|
302
302
|
'web_fetch',
|
|
303
|
+
'web_extract',
|
|
304
|
+
'web_crawl',
|
|
303
305
|
'http_request',
|
|
304
306
|
])
|
|
305
307
|
|
|
@@ -390,6 +392,8 @@ export function getToolFrequencyHint(toolName: string, sessionExtensions: string
|
|
|
390
392
|
case 'http_request':
|
|
391
393
|
case 'web_search':
|
|
392
394
|
case 'web_fetch':
|
|
395
|
+
case 'web_extract':
|
|
396
|
+
case 'web_crawl':
|
|
393
397
|
return 'Hint: You have done extensive research. Stop gathering more sources and use the information you already have to complete the task.'
|
|
394
398
|
|
|
395
399
|
case 'spawn_subagent':
|
|
@@ -490,7 +494,7 @@ function buildDeliverableFollowthroughPrompt(params: {
|
|
|
490
494
|
}
|
|
491
495
|
|
|
492
496
|
if (
|
|
493
|
-
params.toolEvents.some((event) => ['web', 'web_search', 'web_fetch', 'browser', 'http_request'].includes(event.name))
|
|
497
|
+
params.toolEvents.some((event) => ['web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser', 'http_request'].includes(event.name))
|
|
494
498
|
&& !params.toolEvents.some((event) => ['files', 'write_file', 'edit_file', 'shell', 'execute_command'].includes(event.name))
|
|
495
499
|
) {
|
|
496
500
|
lines.push(
|
|
@@ -135,11 +135,13 @@ describe('expandExtensionIds', () => {
|
|
|
135
135
|
}
|
|
136
136
|
})
|
|
137
137
|
|
|
138
|
-
it('web expands to include
|
|
138
|
+
it('web expands to include granular web tools', () => {
|
|
139
139
|
const result = expandExtensionIds(['web'])
|
|
140
140
|
assert.ok(result.includes('web'))
|
|
141
141
|
assert.ok(result.includes('web_search'))
|
|
142
142
|
assert.ok(result.includes('web_fetch'))
|
|
143
|
+
assert.ok(result.includes('web_extract'))
|
|
144
|
+
assert.ok(result.includes('web_crawl'))
|
|
143
145
|
})
|
|
144
146
|
|
|
145
147
|
it('removes duplicates after expansion', () => {
|
|
@@ -199,12 +201,14 @@ describe('expandExtensionIds', () => {
|
|
|
199
201
|
// getExtensionAliases
|
|
200
202
|
// ---------------------------------------------------------------------------
|
|
201
203
|
describe('getExtensionAliases', () => {
|
|
202
|
-
it('web returns
|
|
204
|
+
it('web returns the full web alias group', () => {
|
|
203
205
|
const result = getExtensionAliases('web')
|
|
204
206
|
assert.ok(result.includes('web'))
|
|
205
207
|
assert.ok(result.includes('web_search'))
|
|
206
208
|
assert.ok(result.includes('web_fetch'))
|
|
207
|
-
assert.equal(result.length, 5)
|
|
209
|
+
assert.ok(result.includes('web_extract'))
|
|
210
|
+
assert.ok(result.includes('web_crawl'))
|
|
211
|
+
assert.equal(result.length, 7) // web, web_search, web_fetch, web_extract, web_crawl, http_request, http
|
|
208
212
|
})
|
|
209
213
|
|
|
210
214
|
it('web_search returns the same group as web', () => {
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import { afterEach, describe, it } from 'node:test'
|
|
3
|
+
import { buildWebTools } from './web'
|
|
4
|
+
import type { ToolBuildContext } from './context'
|
|
5
|
+
|
|
6
|
+
const originalFetch = globalThis.fetch
|
|
7
|
+
|
|
8
|
+
function createContext(): ToolBuildContext {
|
|
9
|
+
return {
|
|
10
|
+
cwd: process.cwd(),
|
|
11
|
+
ctx: undefined,
|
|
12
|
+
hasExtension: (name: string) => name === 'web',
|
|
13
|
+
hasTool: (name: string) => name === 'web',
|
|
14
|
+
cleanupFns: [],
|
|
15
|
+
commandTimeoutMs: 1000,
|
|
16
|
+
claudeTimeoutMs: 1000,
|
|
17
|
+
cliProcessTimeoutMs: 1000,
|
|
18
|
+
persistDelegateResumeId: () => {},
|
|
19
|
+
readStoredDelegateResumeId: () => null,
|
|
20
|
+
resolveCurrentSession: () => null,
|
|
21
|
+
activeExtensions: ['web'],
|
|
22
|
+
} as ToolBuildContext
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
function mockFetch(pages: Record<string, string>, calls: string[] = []): void {
|
|
26
|
+
globalThis.fetch = (async (input: RequestInfo | URL) => {
|
|
27
|
+
const url = input instanceof Request ? input.url : String(input)
|
|
28
|
+
calls.push(url)
|
|
29
|
+
const html = pages[url]
|
|
30
|
+
if (!html) {
|
|
31
|
+
return new Response('missing', { status: 404, statusText: 'Not Found' })
|
|
32
|
+
}
|
|
33
|
+
return new Response(html, {
|
|
34
|
+
status: 200,
|
|
35
|
+
headers: { 'content-type': 'text/html; charset=utf-8' },
|
|
36
|
+
})
|
|
37
|
+
}) as typeof fetch
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
afterEach(() => {
|
|
41
|
+
globalThis.fetch = originalFetch
|
|
42
|
+
})
|
|
43
|
+
|
|
44
|
+
describe('web extract and crawl tools', () => {
|
|
45
|
+
it('registers direct granular web tools when web is enabled', () => {
|
|
46
|
+
const names = buildWebTools(createContext()).map((entry) => entry.name).sort()
|
|
47
|
+
|
|
48
|
+
assert.deepEqual(names.filter((name) => name.startsWith('web')), [
|
|
49
|
+
'web',
|
|
50
|
+
'web_crawl',
|
|
51
|
+
'web_extract',
|
|
52
|
+
'web_fetch',
|
|
53
|
+
'web_search',
|
|
54
|
+
])
|
|
55
|
+
})
|
|
56
|
+
|
|
57
|
+
it('extracts readable page content with title and source URL', async () => {
|
|
58
|
+
mockFetch({
|
|
59
|
+
'https://example.test/article': `
|
|
60
|
+
<!doctype html>
|
|
61
|
+
<title>Feature Page</title>
|
|
62
|
+
<header>Ignore navigation</header>
|
|
63
|
+
<main>
|
|
64
|
+
<h1>Feature Page</h1>
|
|
65
|
+
<p>Readable body text for the agent.</p>
|
|
66
|
+
</main>
|
|
67
|
+
<script>console.log('hidden')</script>
|
|
68
|
+
`,
|
|
69
|
+
})
|
|
70
|
+
const tool = buildWebTools(createContext()).find((entry) => entry.name === 'web_extract')
|
|
71
|
+
assert.ok(tool)
|
|
72
|
+
|
|
73
|
+
const output = String(await tool.invoke({ url: 'https://example.test/article#section' }))
|
|
74
|
+
|
|
75
|
+
assert.match(output, /Title: Feature Page/)
|
|
76
|
+
assert.match(output, /URL: https:\/\/example\.test\/article/)
|
|
77
|
+
assert.match(output, /Readable body text for the agent\./)
|
|
78
|
+
assert.doesNotMatch(output, /Ignore navigation/)
|
|
79
|
+
assert.doesNotMatch(output, /console\.log/)
|
|
80
|
+
})
|
|
81
|
+
|
|
82
|
+
it('crawls same-origin pages within the requested page and depth bounds', async () => {
|
|
83
|
+
const calls: string[] = []
|
|
84
|
+
mockFetch({
|
|
85
|
+
'https://site.test/': `
|
|
86
|
+
<title>Start</title>
|
|
87
|
+
<main>Start page <a href="/a">A</a> <a href="/b">B</a> <a href="https://external.test/x">External</a></main>
|
|
88
|
+
`,
|
|
89
|
+
'https://site.test/a': '<title>A page</title><main>Alpha content</main>',
|
|
90
|
+
'https://site.test/b': '<title>B page</title><main>Beta content</main>',
|
|
91
|
+
'https://external.test/x': '<title>External</title><main>Should not be fetched</main>',
|
|
92
|
+
}, calls)
|
|
93
|
+
const tool = buildWebTools(createContext()).find((entry) => entry.name === 'web_crawl')
|
|
94
|
+
assert.ok(tool)
|
|
95
|
+
|
|
96
|
+
const output = String(await tool.invoke({ url: 'https://site.test/', maxPages: 3, maxDepth: 1 }))
|
|
97
|
+
|
|
98
|
+
assert.match(output, /Crawl results for: https:\/\/site\.test\//)
|
|
99
|
+
assert.match(output, /Pages crawled: 3/)
|
|
100
|
+
assert.match(output, /Start page/)
|
|
101
|
+
assert.match(output, /Alpha content/)
|
|
102
|
+
assert.match(output, /Beta content/)
|
|
103
|
+
assert.doesNotMatch(output, /Should not be fetched/)
|
|
104
|
+
assert.deepEqual(calls, ['https://site.test/', 'https://site.test/a', 'https://site.test/b'])
|
|
105
|
+
})
|
|
106
|
+
})
|
|
@@ -20,6 +20,11 @@ describe('inferWebActionFromArgs', () => {
|
|
|
20
20
|
assert.equal(inferWebActionFromArgs({ action: 'search', url: 'https://example.com/article' }), 'search')
|
|
21
21
|
})
|
|
22
22
|
|
|
23
|
+
it('preserves explicit extract and crawl actions', () => {
|
|
24
|
+
assert.equal(inferWebActionFromArgs({ action: 'extract', url: 'https://example.com/article' }), 'extract')
|
|
25
|
+
assert.equal(inferWebActionFromArgs({ action: 'crawl', url: 'https://example.com/' }), 'crawl')
|
|
26
|
+
})
|
|
27
|
+
|
|
23
28
|
it('normalizes stringified browser form payloads', () => {
|
|
24
29
|
const normalized = normalizeBrowserActionParams({
|
|
25
30
|
input: JSON.stringify({
|
|
@@ -176,8 +176,14 @@ export function inferWebActionFromArgs(params: {
|
|
|
176
176
|
query?: string
|
|
177
177
|
url?: string
|
|
178
178
|
method?: string
|
|
179
|
-
}): 'search' | 'fetch' | 'api' | undefined {
|
|
180
|
-
if (
|
|
179
|
+
}): 'search' | 'fetch' | 'extract' | 'crawl' | 'api' | undefined {
|
|
180
|
+
if (
|
|
181
|
+
params.action === 'search'
|
|
182
|
+
|| params.action === 'fetch'
|
|
183
|
+
|| params.action === 'extract'
|
|
184
|
+
|| params.action === 'crawl'
|
|
185
|
+
|| params.action === 'api'
|
|
186
|
+
) return params.action
|
|
181
187
|
if (typeof params.method === 'string' && params.method.trim()) return 'api'
|
|
182
188
|
if (typeof params.url === 'string' && /^https?:\/\//i.test(params.url.trim())) return 'fetch'
|
|
183
189
|
if (typeof params.query === 'string' && params.query.trim()) return 'search'
|
|
@@ -199,6 +199,149 @@ async function executeWebApiAction(normalized: Record<string, unknown>) {
|
|
|
199
199
|
}, requestArgs)
|
|
200
200
|
}
|
|
201
201
|
|
|
202
|
+
interface ExtractedWebPage {
|
|
203
|
+
url: string
|
|
204
|
+
title: string
|
|
205
|
+
text: string
|
|
206
|
+
links: string[]
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
function normalizeHttpUrl(rawUrl: string): string {
|
|
210
|
+
const trimmed = rawUrl.trim()
|
|
211
|
+
if (!trimmed) throw new Error('URL is required.')
|
|
212
|
+
const parsed = new URL(trimmed)
|
|
213
|
+
if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
|
|
214
|
+
throw new Error('Only http and https URLs are supported.')
|
|
215
|
+
}
|
|
216
|
+
parsed.hash = ''
|
|
217
|
+
return parsed.toString()
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
function clampNumber(value: unknown, fallback: number, min: number, max: number): number {
|
|
221
|
+
const parsed = typeof value === 'number'
|
|
222
|
+
? value
|
|
223
|
+
: typeof value === 'string'
|
|
224
|
+
? Number.parseInt(value, 10)
|
|
225
|
+
: Number.NaN
|
|
226
|
+
if (!Number.isFinite(parsed)) return fallback
|
|
227
|
+
return Math.max(min, Math.min(max, Math.trunc(parsed)))
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
/**
 * Collect the distinct http(s) link targets of every `<a href>` on a page.
 *
 * Each href is resolved against `pageUrl`, its fragment is removed, and
 * duplicates are dropped while keeping first-seen order.
 *
 * @param $ - Cheerio document loaded from the page HTML.
 * @param pageUrl - Base URL used to resolve relative hrefs.
 * @returns Absolute http/https URLs in document order, without duplicates.
 */
function extractLinks($: ReturnType<typeof cheerio.load>, pageUrl: string): string[] {
  // A Set iterates in insertion order, so it dedupes in O(1) per link while
  // preserving the order in which links appear in the document.
  const unique = new Set<string>()
  $('a[href]').each((_index, element) => {
    const rawHref = $(element).attr('href') || ''
    try {
      const resolved = new URL(rawHref, pageUrl)
      // Skip mailto:, javascript:, tel: and any other non-web scheme.
      if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') return
      resolved.hash = ''
      unique.add(resolved.toString())
    } catch {
      // Ignore malformed links from the crawled page.
    }
  })
  return Array.from(unique)
}
|
|
246
|
+
|
|
247
|
+
/**
 * Download a URL and reduce the response to a title, readable text and links.
 *
 * PDF responses are parsed with the dynamically imported `pdf-parse` module;
 * any other response body is loaded as HTML with cheerio.
 *
 * @param fetchUrl - URL to read; it is normalized with `normalizeHttpUrl` first.
 * @returns The normalized URL together with the extracted title, text and links.
 * @throws Error when the response is not ok (`HTTP <status>: <statusText>`).
 *   Failures from URL normalization, the request itself (including the
 *   15 second timeout) and PDF parsing propagate to the caller.
 */
async function extractReadablePage(fetchUrl: string): Promise<ExtractedWebPage> {
  const url = normalizeHttpUrl(fetchUrl)
  const res = await fetch(url, {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; SwarmClaw/1.0)' },
    signal: AbortSignal.timeout(15000),
  })
  if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`)

  // Media types are case-insensitive, so normalize the header before matching.
  const contentType = (res.headers.get('content-type') || '').toLowerCase()
  if (contentType.includes('application/pdf')) {
    const pdfMod = await import(/* webpackIgnore: true */ 'pdf-parse')
    // The module may expose the parser as a default export or as the module itself.
    const pdfParse = ((pdfMod as Record<string, unknown>).default ?? pdfMod) as (buf: Buffer) => Promise<{ text: string }>
    const arrayBuffer = await res.arrayBuffer()
    const result = await pdfParse(Buffer.from(arrayBuffer))
    // No title or links are extracted for PDFs; the URL stands in for the title.
    return { url, title: url, text: result.text, links: [] }
  }

  const html = await res.text()
  const $ = cheerio.load(html)
  const title = $('title').first().text().replace(/\s+/g, ' ').trim() || url
  // Links are gathered before nav/header/footer are removed, so links inside
  // those elements are still reported.
  const links = extractLinks($, url)
  $('script, style, noscript, nav, footer, header').remove()
  // Prefer the first article/main region; fall back to the whole body.
  const main = $('article, main, [role="main"]').first()
  const text = (main.length ? main.text() : $('body').text()).replace(/\s+/g, ' ').trim()
  return { url, title, text, links }
}
|
|
272
|
+
|
|
273
|
+
/**
 * Render one extracted page as a `Title:` / `URL:` header, a blank line and the page text.
 *
 * @param page - Page to render.
 * @returns The rendered report passed through `truncate` with `MAX_OUTPUT`.
 */
function formatExtractedPage(page: ExtractedWebPage): string {
  // An empty extraction still yields a visible placeholder body.
  const body = page.text || '(no readable text found)'
  const report = `Title: ${page.title}\nURL: ${page.url}\n\n${body}`
  return truncate(report, MAX_OUTPUT)
}
|
|
277
|
+
|
|
278
|
+
/**
 * Render crawl output as a header followed by one numbered section per page.
 *
 * @param startUrl - URL the crawl started from; echoed in the header.
 * @param pages - Pages in the order they were crawled.
 * @returns A "no results" message for an empty crawl; otherwise the sections
 *   joined by blank lines and passed through `truncate` with `MAX_OUTPUT`.
 */
function formatCrawlResults(startUrl: string, pages: ExtractedWebPage[]): string {
  if (!pages.length) return `No crawl results found for: ${startUrl}`

  const header = [`Crawl results for: ${startUrl}`, `Pages crawled: ${pages.length}`]
  const entries = pages.map((page, position) => {
    // Each page's text goes through `truncate` with a 1200 limit before the
    // overall output limit is applied.
    const excerpt = truncate(page.text || '(no readable text found)', 1200)
    return `${position + 1}. ${page.title}\nURL: ${page.url}\nText: ${excerpt}`
  })
  return truncate([...header, ...entries].join('\n\n'), MAX_OUTPUT)
}
|
|
288
|
+
|
|
289
|
+
/**
 * Tool entry point for `web_extract`: read one URL and return it with its title and URL attached.
 *
 * @param normalized - Normalized tool arguments; `url` is preferred and `query` is used as a fallback.
 * @returns The formatted page, or an `Error: ...` string when the URL is missing
 *   or the page could not be read. Failures are reported in the return value
 *   rather than thrown.
 */
async function executeWebExtractAction(normalized: Record<string, unknown>) {
  const target = String(normalized.url || normalized.query || '')
  if (target.trim() === '') return 'Error: "url" is required for extract action.'

  try {
    const page = await extractReadablePage(target)
    return formatExtractedPage(page)
  } catch (failure: unknown) {
    return `Error: ${errorMessage(failure)}`
  }
}
|
|
298
|
+
|
|
299
|
+
/**
 * Tool entry point for `web_crawl`: breadth-first crawl from one URL.
 *
 * Recognized arguments: `url` (or `query` as a fallback), `maxPages` (or
 * `maxResults`; default 5, limited to 1..25), `maxDepth` (default 1, limited to
 * 0..3), and `includeExternal: true` or `sameOrigin: false` to follow links
 * away from the starting origin.
 *
 * @param normalized - Normalized tool arguments.
 * @returns The formatted crawl report, or an `Error: ...` string when the
 *   start URL is missing or invalid. A page that fails to load is kept in the
 *   report with its error message as text and still counts toward `maxPages`.
 */
async function executeWebCrawlAction(normalized: Record<string, unknown>) {
  const requested = String(normalized.url || normalized.query || '')
  if (requested.trim() === '') return 'Error: "url" is required for crawl action.'

  let startUrl: string
  try {
    startUrl = normalizeHttpUrl(requested)
  } catch (failure: unknown) {
    return `Error: ${errorMessage(failure)}`
  }

  const pageLimit = clampNumber(normalized.maxPages ?? normalized.maxResults, 5, 1, 25)
  const depthLimit = clampNumber(normalized.maxDepth, 1, 0, 3)
  const stayOnOrigin = normalized.includeExternal !== true && normalized.sameOrigin !== false
  const homeOrigin = new URL(startUrl).origin
  // Queued plus visited URLs are capped at four times the page limit.
  const frontierCap = pageLimit * 4

  const visited = new Set<string>()
  const collected: ExtractedWebPage[] = []
  const pending: Array<{ url: string; depth: number }> = [{ url: startUrl, depth: 0 }]

  while (pending.length > 0 && collected.length < pageLimit) {
    const current = pending.shift()
    if (!current) break
    if (visited.has(current.url)) continue
    visited.add(current.url)

    let page: ExtractedWebPage
    try {
      page = await extractReadablePage(current.url)
    } catch (failure: unknown) {
      // Record the failure as a page so the report shows which URL broke.
      page = { url: current.url, title: current.url, text: `Error: ${errorMessage(failure)}`, links: [] }
    }
    collected.push(page)

    // Pages at the depth limit are reported but their links are not followed.
    if (current.depth < depthLimit) {
      const childDepth = current.depth + 1
      for (const link of page.links) {
        if (visited.has(link)) continue
        if (stayOnOrigin && new URL(link).origin !== homeOrigin) continue
        if (pending.some(({ url }) => url === link)) continue
        pending.push({ url: link, depth: childDepth })
        if (pending.length + visited.size >= frontierCap) break
      }
    }
  }

  return formatCrawlResults(startUrl, collected)
}
|
|
344
|
+
|
|
202
345
|
async function executeWebAction(args: Record<string, unknown>) {
|
|
203
346
|
const normalized = normalizeToolInputArgs(args)
|
|
204
347
|
const { query, url, maxResults } = normalized as { query?: string; url?: string; maxResults?: number }
|
|
@@ -219,32 +362,13 @@ async function executeWebAction(args: Record<string, unknown>) {
|
|
|
219
362
|
const results = await provider.search(searchQuery, limit)
|
|
220
363
|
if (results.length === 0) return 'No results found.'
|
|
221
364
|
return formatWebSearchResults(searchQuery, results)
|
|
222
|
-
} else if (action === 'fetch') {
|
|
365
|
+
} else if (action === 'fetch' || action === 'extract') {
|
|
223
366
|
const fetchUrl = url || query
|
|
224
|
-
if (!fetchUrl) return
|
|
225
|
-
const
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
if (!res.ok) return `HTTP ${res.status}: ${res.statusText}`
|
|
230
|
-
const contentType = res.headers.get('content-type') || ''
|
|
231
|
-
if (contentType.includes('application/pdf')) {
|
|
232
|
-
try {
|
|
233
|
-
const pdfMod = await import(/* webpackIgnore: true */ 'pdf-parse')
|
|
234
|
-
const pdfParse = ((pdfMod as Record<string, unknown>).default ?? pdfMod) as (buf: Buffer) => Promise<{ text: string }>
|
|
235
|
-
const arrayBuffer = await res.arrayBuffer()
|
|
236
|
-
const result = await pdfParse(Buffer.from(arrayBuffer))
|
|
237
|
-
return truncate(result.text, MAX_OUTPUT)
|
|
238
|
-
} catch (err: unknown) {
|
|
239
|
-
return `Error parsing PDF: ${errorMessage(err)}`
|
|
240
|
-
}
|
|
241
|
-
}
|
|
242
|
-
const html = await res.text()
|
|
243
|
-
const $ = cheerio.load(html)
|
|
244
|
-
$('script, style, noscript, nav, footer, header').remove()
|
|
245
|
-
const main = $('article, main, [role="main"]').first()
|
|
246
|
-
const text = (main.length ? main.text() : $('body').text()).replace(/\s+/g, ' ').trim()
|
|
247
|
-
return truncate(text, MAX_OUTPUT)
|
|
367
|
+
if (!fetchUrl) return `Error: "url" is required for ${action} action.`
|
|
368
|
+
const page = await extractReadablePage(fetchUrl)
|
|
369
|
+
return action === 'extract' ? formatExtractedPage(page) : truncate(page.text, MAX_OUTPUT)
|
|
370
|
+
} else if (action === 'crawl') {
|
|
371
|
+
return executeWebCrawlAction(normalized)
|
|
248
372
|
} else if (action === 'api') {
|
|
249
373
|
return executeWebApiAction(normalized)
|
|
250
374
|
}
|
|
@@ -259,21 +383,25 @@ async function executeWebAction(args: Record<string, unknown>) {
|
|
|
259
383
|
*/
|
|
260
384
|
const WebExtension: Extension = {
|
|
261
385
|
name: 'Core Web',
|
|
262
|
-
description: 'Search the web,
|
|
386
|
+
description: 'Search the web, extract pages, crawl sites, and make HTTP API calls.',
|
|
263
387
|
hooks: {
|
|
264
|
-
getCapabilityDescription: () => 'I can use
|
|
388
|
+
getCapabilityDescription: () => 'I can use `web_search` for fresh research, `web_extract` for a specific URL, `web_crawl` for bounded multi-page site reads, and the unified `web` tool for search, fetch, crawl, and raw HTTP API calls.',
|
|
265
389
|
} as ExtensionHooks,
|
|
266
390
|
tools: [
|
|
267
391
|
{
|
|
268
392
|
name: 'web',
|
|
269
|
-
description: 'Unified web access tool. Actions: search (web search), fetch (read URL content), api (raw HTTP request with method/headers/body).',
|
|
393
|
+
description: 'Unified web access tool. Actions: search (web search), fetch/extract (read URL content), crawl (bounded same-origin crawl), api (raw HTTP request with method/headers/body).',
|
|
270
394
|
parameters: {
|
|
271
395
|
type: 'object',
|
|
272
396
|
properties: {
|
|
273
|
-
action: { type: 'string', enum: ['search', 'fetch', 'api'] },
|
|
397
|
+
action: { type: 'string', enum: ['search', 'fetch', 'extract', 'crawl', 'api'] },
|
|
274
398
|
query: { type: 'string' },
|
|
275
399
|
url: { type: 'string' },
|
|
276
400
|
maxResults: { type: 'number' },
|
|
401
|
+
maxPages: { type: 'number', description: 'Maximum pages for crawl action, default 5, max 25' },
|
|
402
|
+
maxDepth: { type: 'number', description: 'Maximum crawl depth, default 1, max 3' },
|
|
403
|
+
includeExternal: { type: 'boolean', description: 'Allow crawl to leave the starting origin, default false' },
|
|
404
|
+
sameOrigin: { type: 'boolean', description: 'Keep crawl on the starting origin when true, default true' },
|
|
277
405
|
method: { type: 'string', enum: ['GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'HEAD', 'OPTIONS'], description: 'HTTP method (for api action)' },
|
|
278
406
|
headers: { type: 'object', additionalProperties: { type: 'string' }, description: 'Request headers (for api action)' },
|
|
279
407
|
body: { type: 'string', description: 'Request body (for api action)' },
|
|
@@ -283,6 +411,71 @@ const WebExtension: Extension = {
|
|
|
283
411
|
required: ['action']
|
|
284
412
|
},
|
|
285
413
|
execute: async (args) => executeWebAction(args)
|
|
414
|
+
},
|
|
415
|
+
{
|
|
416
|
+
name: 'web_search',
|
|
417
|
+
description: 'Search the web and return ranked results with URLs and snippets.',
|
|
418
|
+
parameters: {
|
|
419
|
+
type: 'object',
|
|
420
|
+
properties: {
|
|
421
|
+
query: { type: 'string' },
|
|
422
|
+
maxResults: { type: 'number' },
|
|
423
|
+
},
|
|
424
|
+
required: ['query'],
|
|
425
|
+
},
|
|
426
|
+
planning: {
|
|
427
|
+
capabilities: ['research.search'],
|
|
428
|
+
disciplineGuidance: ['Use `web_search` for fresh information, then fetch or extract only the sources you need.'],
|
|
429
|
+
},
|
|
430
|
+
execute: async (args) => executeWebAction({ ...normalizeToolInputArgs(args), action: 'search' }),
|
|
431
|
+
},
|
|
432
|
+
{
|
|
433
|
+
name: 'web_fetch',
|
|
434
|
+
description: 'Read a specific URL and return readable page text.',
|
|
435
|
+
parameters: {
|
|
436
|
+
type: 'object',
|
|
437
|
+
properties: { url: { type: 'string' } },
|
|
438
|
+
required: ['url'],
|
|
439
|
+
},
|
|
440
|
+
planning: {
|
|
441
|
+
capabilities: ['research.fetch'],
|
|
442
|
+
disciplineGuidance: ['Use `web_fetch` when you already have a URL and only need the readable text.'],
|
|
443
|
+
},
|
|
444
|
+
execute: async (args) => executeWebAction({ ...normalizeToolInputArgs(args), action: 'fetch' }),
|
|
445
|
+
},
|
|
446
|
+
{
|
|
447
|
+
name: 'web_extract',
|
|
448
|
+
description: 'Extract readable content from a URL with title and source URL included.',
|
|
449
|
+
parameters: {
|
|
450
|
+
type: 'object',
|
|
451
|
+
properties: { url: { type: 'string' } },
|
|
452
|
+
required: ['url'],
|
|
453
|
+
},
|
|
454
|
+
planning: {
|
|
455
|
+
capabilities: ['research.fetch'],
|
|
456
|
+
disciplineGuidance: ['Use `web_extract` for source-grounded page reads where the title and URL should stay attached to the extracted text.'],
|
|
457
|
+
},
|
|
458
|
+
execute: async (args) => executeWebExtractAction(normalizeToolInputArgs(args)),
|
|
459
|
+
},
|
|
460
|
+
{
|
|
461
|
+
name: 'web_crawl',
|
|
462
|
+
description: 'Crawl a small set of pages starting from one URL. Same-origin by default, bounded by maxPages and maxDepth.',
|
|
463
|
+
parameters: {
|
|
464
|
+
type: 'object',
|
|
465
|
+
properties: {
|
|
466
|
+
url: { type: 'string' },
|
|
467
|
+
maxPages: { type: 'number' },
|
|
468
|
+
maxDepth: { type: 'number' },
|
|
469
|
+
includeExternal: { type: 'boolean' },
|
|
470
|
+
sameOrigin: { type: 'boolean' },
|
|
471
|
+
},
|
|
472
|
+
required: ['url'],
|
|
473
|
+
},
|
|
474
|
+
planning: {
|
|
475
|
+
capabilities: ['research.crawl'],
|
|
476
|
+
disciplineGuidance: ['Use `web_crawl` only when the task needs multiple pages from the same site. Keep maxPages low and summarize after one crawl.'],
|
|
477
|
+
},
|
|
478
|
+
execute: async (args) => executeWebCrawlAction(normalizeToolInputArgs(args)),
|
|
286
479
|
}
|
|
287
480
|
]
|
|
288
481
|
}
|
|
@@ -307,6 +500,40 @@ export function buildWebTools(bctx: ToolBuildContext): StructuredToolInterface[]
|
|
|
307
500
|
}
|
|
308
501
|
)
|
|
309
502
|
)
|
|
503
|
+
tools.push(
|
|
504
|
+
tool(
|
|
505
|
+
async (args) => executeWebAction({ ...normalizeToolInputArgs((args ?? {}) as Record<string, unknown>), action: 'search' }),
|
|
506
|
+
{
|
|
507
|
+
name: 'web_search',
|
|
508
|
+
description: 'Search the web and return ranked results with URLs and snippets.',
|
|
509
|
+
schema: z.object({}).passthrough()
|
|
510
|
+
}
|
|
511
|
+
),
|
|
512
|
+
tool(
|
|
513
|
+
async (args) => executeWebAction({ ...normalizeToolInputArgs((args ?? {}) as Record<string, unknown>), action: 'fetch' }),
|
|
514
|
+
{
|
|
515
|
+
name: 'web_fetch',
|
|
516
|
+
description: 'Read a specific URL and return readable page text.',
|
|
517
|
+
schema: z.object({}).passthrough()
|
|
518
|
+
}
|
|
519
|
+
),
|
|
520
|
+
tool(
|
|
521
|
+
async (args) => executeWebExtractAction(normalizeToolInputArgs((args ?? {}) as Record<string, unknown>)),
|
|
522
|
+
{
|
|
523
|
+
name: 'web_extract',
|
|
524
|
+
description: 'Extract readable content from a URL with title and source URL included.',
|
|
525
|
+
schema: z.object({}).passthrough()
|
|
526
|
+
}
|
|
527
|
+
),
|
|
528
|
+
tool(
|
|
529
|
+
async (args) => executeWebCrawlAction(normalizeToolInputArgs((args ?? {}) as Record<string, unknown>)),
|
|
530
|
+
{
|
|
531
|
+
name: 'web_crawl',
|
|
532
|
+
description: 'Crawl a small set of pages starting from one URL. Same-origin by default, bounded by maxPages and maxDepth.',
|
|
533
|
+
schema: z.object({}).passthrough()
|
|
534
|
+
}
|
|
535
|
+
)
|
|
536
|
+
)
|
|
310
537
|
}
|
|
311
538
|
|
|
312
539
|
// Browser tool (kept as direct injection for now due to complexity)
|
|
@@ -3,7 +3,7 @@ const EXTENSION_ALIAS_GROUPS: string[][] = [
|
|
|
3
3
|
['execute', 'sandbox'],
|
|
4
4
|
['files', 'read_file', 'write_file', 'list_files', 'copy_file', 'move_file', 'delete_file', 'send_file'],
|
|
5
5
|
['edit_file'],
|
|
6
|
-
['web', 'web_search', 'web_fetch', 'http_request', 'http'],
|
|
6
|
+
['web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'http_request', 'http'],
|
|
7
7
|
['browser', 'openclaw_browser'],
|
|
8
8
|
['delegate', 'claude_code', 'codex_cli', 'opencode_cli', 'gemini_cli', 'copilot_cli', 'droid_cli', 'cursor_cli', 'qwen_code_cli', 'delegate_to_claude_code', 'delegate_to_codex_cli', 'delegate_to_opencode_cli', 'delegate_to_gemini_cli', 'delegate_to_copilot_cli', 'delegate_to_droid_cli', 'delegate_to_cursor_cli', 'delegate_to_qwen_code_cli'],
|
|
9
9
|
['manage_platform'],
|
|
@@ -255,12 +255,12 @@ describe('explicit allows override mode blocks', () => {
|
|
|
255
255
|
// Category blocks
|
|
256
256
|
// ---------------------------------------------------------------------------
|
|
257
257
|
describe('category blocks', () => {
|
|
258
|
-
it('blocking network category blocks web
|
|
259
|
-
const d = resolveSessionToolPolicy(['web', 'web_search', 'web_fetch', 'memory'], {
|
|
258
|
+
it('blocking network category blocks granular web tools', () => {
|
|
259
|
+
const d = resolveSessionToolPolicy(['web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'memory'], {
|
|
260
260
|
capabilityBlockedCategories: ['network'],
|
|
261
261
|
})
|
|
262
262
|
assert.deepStrictEqual(d.enabledExtensions, ['memory'])
|
|
263
|
-
assert.equal(d.blockedExtensions.length,
|
|
263
|
+
assert.equal(d.blockedExtensions.length, 5)
|
|
264
264
|
for (const b of d.blockedExtensions) {
|
|
265
265
|
assert.match(b.reason, /category "network"/)
|
|
266
266
|
}
|
|
@@ -49,9 +49,11 @@ const TOOL_DESCRIPTORS: Record<string, ToolDescriptor> = {
|
|
|
49
49
|
move_file: { categories: ['filesystem'], concreteTools: ['move_file'] },
|
|
50
50
|
edit_file: { categories: ['filesystem'], concreteTools: ['edit_file'] },
|
|
51
51
|
delete_file: { categories: ['filesystem'], concreteTools: ['delete_file'], destructive: true },
|
|
52
|
-
web: { categories: ['network'], concreteTools: ['web', 'web_search', 'web_fetch'] },
|
|
52
|
+
web: { categories: ['network'], concreteTools: ['web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl'] },
|
|
53
53
|
web_search: { categories: ['network'], concreteTools: ['web_search'] },
|
|
54
54
|
web_fetch: { categories: ['network'], concreteTools: ['web_fetch'] },
|
|
55
|
+
web_extract: { categories: ['network'], concreteTools: ['web_extract'] },
|
|
56
|
+
web_crawl: { categories: ['network'], concreteTools: ['web_crawl'] },
|
|
55
57
|
browser: { categories: ['browser', 'network'], concreteTools: ['browser', 'openclaw_browser'] },
|
|
56
58
|
delegate: { categories: ['delegation', 'execution'], concreteTools: ['delegate', 'delegate_to_claude_code', 'delegate_to_codex_cli', 'delegate_to_opencode_cli', 'delegate_to_gemini_cli', 'delegate_to_copilot_cli', 'delegate_to_droid_cli', 'delegate_to_cursor_cli', 'delegate_to_qwen_code_cli'] },
|
|
57
59
|
claude_code: { categories: ['delegation', 'execution'], concreteTools: ['delegate_to_claude_code'] },
|
|
@@ -85,6 +87,7 @@ const TOOL_DESCRIPTORS: Record<string, ToolDescriptor> = {
|
|
|
85
87
|
spawn_subagent: { categories: ['delegation', 'platform'], concreteTools: ['spawn_subagent', 'delegate_to_agent'] },
|
|
86
88
|
context_mgmt: { categories: ['memory'], concreteTools: ['context_mgmt', 'context_status', 'context_summarize'] },
|
|
87
89
|
extension_creator: { categories: ['filesystem', 'execution'], concreteTools: ['extension_creator', 'extension_creator_tool'] },
|
|
90
|
+
wallet: { categories: ['outbound'], concreteTools: ['wallet'] },
|
|
88
91
|
mailbox: { categories: ['network', 'platform', 'outbound'], concreteTools: ['mailbox', 'inbox'] },
|
|
89
92
|
ask_human: { categories: ['platform'], concreteTools: ['ask_human', 'human_loop'] },
|
|
90
93
|
google_workspace: { categories: ['network'], concreteTools: ['google_workspace', 'gws'] },
|
|
@@ -12,10 +12,11 @@ function uniqueExtensionId(prefix: string): string {
|
|
|
12
12
|
|
|
13
13
|
describe('tool-planning', () => {
|
|
14
14
|
it('collects core planning metadata for aliased built-in tools', () => {
|
|
15
|
-
const view = getEnabledToolPlanningView(['web_search', 'web_fetch', 'browser', 'manage_connectors'])
|
|
15
|
+
const view = getEnabledToolPlanningView(['web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser', 'manage_connectors'])
|
|
16
16
|
|
|
17
17
|
assert.deepEqual(view.displayToolIds, ['browser', 'manage_connectors', 'web'])
|
|
18
18
|
assert.deepEqual(getToolsForCapability(['web_search'], TOOL_CAPABILITY.researchSearch), ['web_search'])
|
|
19
|
+
assert.deepEqual(getToolsForCapability(['web_crawl'], TOOL_CAPABILITY.researchCrawl), ['web_crawl'])
|
|
19
20
|
assert.deepEqual(getToolsForCapability(['manage_connectors'], TOOL_CAPABILITY.deliveryVoiceNote), ['connector_message_tool'])
|
|
20
21
|
})
|
|
21
22
|
|
|
@@ -7,6 +7,7 @@ import { canonicalizeExtensionId, expandExtensionIds } from './tool-aliases'
|
|
|
7
7
|
export const TOOL_CAPABILITY = {
|
|
8
8
|
researchSearch: 'research.search',
|
|
9
9
|
researchFetch: 'research.fetch',
|
|
10
|
+
researchCrawl: 'research.crawl',
|
|
10
11
|
browserNavigate: 'browser.navigate',
|
|
11
12
|
browserCapture: 'browser.capture',
|
|
12
13
|
artifactPdf: 'artifact.pdf',
|
|
@@ -98,6 +99,36 @@ const CORE_TOOL_PLANNING: Record<string, LegacyToolPlanningEntry[]> = {
|
|
|
98
99
|
},
|
|
99
100
|
],
|
|
100
101
|
},
|
|
102
|
+
{
|
|
103
|
+
toolName: 'web_extract',
|
|
104
|
+
capabilities: [TOOL_CAPABILITY.researchFetch],
|
|
105
|
+
disciplineGuidance: [
|
|
106
|
+
'For `web_extract`, use `{"url":"https://..."}` when source title and URL should remain attached to extracted page text.',
|
|
107
|
+
'Extract the exact pages you need, then synthesize. Do not extract the same page repeatedly.',
|
|
108
|
+
],
|
|
109
|
+
requestMatchers: [
|
|
110
|
+
{
|
|
111
|
+
capability: TOOL_CAPABILITY.researchFetch,
|
|
112
|
+
patterns: ['extract', 'readable content', 'page text', 'source text'],
|
|
113
|
+
requireLiteralUrl: true,
|
|
114
|
+
},
|
|
115
|
+
],
|
|
116
|
+
},
|
|
117
|
+
{
|
|
118
|
+
toolName: 'web_crawl',
|
|
119
|
+
capabilities: [TOOL_CAPABILITY.researchCrawl],
|
|
120
|
+
disciplineGuidance: [
|
|
121
|
+
'For `web_crawl`, use `{"url":"https://...","maxPages":5,"maxDepth":1}` only when a task needs several pages from the same site.',
|
|
122
|
+
'Keep crawls bounded and summarize after one crawl. Prefer `web_extract` for a single known URL.',
|
|
123
|
+
],
|
|
124
|
+
requestMatchers: [
|
|
125
|
+
{
|
|
126
|
+
capability: TOOL_CAPABILITY.researchCrawl,
|
|
127
|
+
patterns: ['crawl', 'site map', 'sitemap', 'multiple pages', 'whole site', 'scan site'],
|
|
128
|
+
requireLiteralUrl: true,
|
|
129
|
+
},
|
|
130
|
+
],
|
|
131
|
+
},
|
|
101
132
|
],
|
|
102
133
|
browser: [
|
|
103
134
|
{
|
|
@@ -4,11 +4,11 @@ const INJECTION_PATTERNS: Array<{ code: string; re: RegExp; note: string }> = [
|
|
|
4
4
|
{ code: 'ignore_instructions', re: /\bignore (?:all |any |the )?(?:previous|prior|above|system|developer) instructions\b/i, note: 'tries to override existing instructions' },
|
|
5
5
|
{ code: 'reveal_prompt', re: /\b(?:reveal|show|print|dump)\b[\s\S]{0,40}\b(?:system prompt|developer prompt|hidden prompt)\b/i, note: 'asks for hidden prompt data' },
|
|
6
6
|
{ code: 'credential_theft', re: /\b(?:api key|token|password|secret|credential)s?\b[\s\S]{0,40}\b(?:send|share|reveal|print|dump|exfiltrat)/i, note: 'asks for secrets or credentials' },
|
|
7
|
-
{ code: 'tool_override', re: /\b(?:call|use|run)\b[\s\S]{0,40}\b(?:shell|terminal|browser|http_request|web_fetch|connector_message_tool)\b[\s\S]{0,40}\b(?:without|ignore)\b/i, note: 'tries to direct tool use by bypassing policy' },
|
|
7
|
+
{ code: 'tool_override', re: /\b(?:call|use|run)\b[\s\S]{0,40}\b(?:shell|terminal|browser|http_request|web_fetch|web_extract|web_crawl|connector_message_tool)\b[\s\S]{0,40}\b(?:without|ignore)\b/i, note: 'tries to direct tool use by bypassing policy' },
|
|
8
8
|
{ code: 'workflow_override', re: /\b(?:act as|pretend to be)\b[\s\S]{0,40}\b(?:system|developer|administrator|operator)\b/i, note: 'tries to impersonate a higher-priority role' },
|
|
9
9
|
]
|
|
10
10
|
|
|
11
|
-
const WEB_TOOL_NAMES = new Set(['browser', 'web_search', 'web_fetch', 'http_request'])
|
|
11
|
+
const WEB_TOOL_NAMES = new Set(['browser', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'http_request'])
|
|
12
12
|
|
|
13
13
|
function normalizeMode(value: unknown): 'off' | 'warn' | 'block' {
|
|
14
14
|
const normalized = typeof value === 'string' ? value.trim().toLowerCase() : ''
|