@swarmclawai/swarmclaw 1.9.21 → 1.9.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -409,6 +409,15 @@ Operational docs: https://swarmclaw.ai/docs/observability
409
409
 
410
410
  ## Releases
411
411
 
412
+ ### v1.9.22 Highlights
413
+
414
+ Research tools release: agents now get direct `web_extract` and `web_crawl` tools alongside `web_search`, `web_fetch`, and the unified `web` tool.
415
+
416
+ - **Source-grounded extraction.** `web_extract` returns a page title, the normalized source URL (fragment removed), and readable content for known source URLs.
417
+ - **Bounded crawls.** `web_crawl` walks same-origin links by default with conservative page and depth caps, plus an explicit external-link opt-in.
418
+ - **Better routing.** Tool aliases, capability policy, planning hints, continuation recovery, and the chat UI all recognize the granular research tools.
419
+ - **Regression coverage.** New tests cover action inference, tool-call translation, direct tool registration, extraction cleanup, and same-origin crawl bounds.
420
+
412
421
  ### v1.9.21 Highlights
413
422
 
414
423
  Provider diagnostics release: connection checks now return a structured step timeline across setup, provider settings, and agent editing.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@swarmclawai/swarmclaw",
3
- "version": "1.9.21",
3
+ "version": "1.9.22",
4
4
  "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
5
5
  "main": "electron-dist/main.js",
6
6
  "license": "MIT",
@@ -88,7 +88,7 @@
88
88
  "test:cli": "node --test src/cli/*.test.js bin/*.test.js scripts/electron-after-pack.test.mjs scripts/electron-signing-config.test.mjs scripts/ensure-sandbox-browser-image.test.mjs scripts/postinstall.test.mjs scripts/run-next-build.test.mjs scripts/run-next-typegen.test.mjs",
89
89
  "test:setup": "tsx --test src/app/api/setup/check-provider/route.test.ts src/lib/server/provider-model-discovery.test.ts src/components/auth/setup-wizard/utils.test.ts src/components/auth/setup-wizard/types.test.ts src/hooks/setup-done-detection.test.ts src/lib/setup-defaults.test.ts src/lib/server/storage-auth.test.ts src/lib/server/storage-auth-docker.test.ts",
90
90
  "test:openclaw": "tsx --test src/lib/openclaw/openclaw-agent-id.test.ts src/lib/openclaw/openclaw-endpoint.test.ts src/lib/server/agents/agent-runtime-config.test.ts src/lib/server/build-llm.test.ts src/lib/server/connectors/connector-routing.test.ts src/lib/server/connectors/openclaw.test.ts src/lib/server/connectors/swarmdock.test.ts src/lib/server/gateway/protocol.test.ts src/lib/server/gateways/gateway-topology.test.ts src/lib/server/llm-response-cache.test.ts src/lib/server/mcp-conformance.test.ts src/lib/server/openclaw/agent-resolver.test.ts src/lib/server/openclaw/deploy.test.ts src/lib/server/openclaw/skills-normalize.test.ts src/lib/server/session-tools/openclaw-nodes.test.ts src/lib/server/session-tools/swarmdock.test.ts src/lib/server/tasks/task-quality-gate.test.ts src/lib/server/tasks/task-validation.test.ts src/lib/server/tool-capability-policy.test.ts src/lib/providers/openai.test.ts src/lib/providers/openclaw-exports.test.ts src/app/api/gateways/topology-route.test.ts src/app/api/openclaw/dashboard-url/route.test.ts",
91
- "test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/agent-planning-mode.test.ts src/lib/agent-config-history.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/provider-diagnostics.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/baseline.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/prompt-sections.planning-mode.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/chats/session-context-pack.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/runs/run-handoff.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/schedules/schedule-history.test.ts src/lib/server/schedules/schedule-preview.test.ts src/lib/quality/release-readiness.test.ts src/lib/quality/architecture-health.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts 
src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-execution-policy.test.ts src/lib/server/tasks/task-handoff.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-pack-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/config-versions/config-versions-route.test.ts src/app/api/runs/run-handoff-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/schedules/preview/route.test.ts src/app/api/schedules/schedule-history-route.test.ts src/app/api/tts/route.test.ts",
91
+ "test:runtime": "tsx --test src/lib/a2a/agent-card.test.ts src/lib/agent-planning-mode.test.ts src/lib/agent-config-history.test.ts src/lib/strip-internal-metadata.test.ts src/lib/provider-sets.test.ts src/lib/providers/opencode-cli.test.ts src/lib/providers/cli-provider-metadata.test.ts src/lib/providers/cli-utils.test.ts src/lib/providers/generic-cli.test.ts src/lib/server/agents/delegation-advisory.test.ts src/lib/server/cli-provider-readiness.test.ts src/lib/server/provider-health.test.ts src/lib/server/provider-diagnostics.test.ts src/lib/server/mcp-gateway-runtime.test.ts src/lib/server/mcp-connection-pool.test.ts src/lib/server/knowledge-sources.test.ts src/lib/server/extension-managed-resources.test.ts src/lib/server/eval/baseline.test.ts src/lib/server/eval/environment-plan.test.ts src/lib/server/chat-execution/chat-execution-grounding.test.ts src/lib/server/chat-execution/chat-turn-preparation.test.ts src/lib/server/chat-execution/iteration-timers.test.ts src/lib/server/chat-execution/post-stream-finalization.test.ts src/lib/server/chat-execution/prompt-sections.planning-mode.test.ts src/lib/server/chat-execution/reasoning-tag-scrubber.test.ts src/lib/server/chats/clear-undo-snapshots.test.ts src/lib/server/chats/session-context-pack.test.ts src/lib/server/connectors/email.test.ts src/lib/server/protocols/protocol-service.test.ts src/lib/server/runtime/run-ledger.test.ts src/lib/server/runtime/queue-retry-policy.test.ts src/lib/server/runs/run-brief.test.ts src/lib/server/runs/run-handoff.test.ts src/lib/server/operations/operation-pulse.test.ts src/lib/server/schedules/schedule-history.test.ts src/lib/server/schedules/schedule-preview.test.ts src/lib/quality/release-readiness.test.ts src/lib/quality/architecture-health.test.ts src/lib/server/artifacts/artifact-resolver.test.ts src/lib/server/observability/otel-config.test.ts src/lib/server/safe-parse-body.test.ts src/lib/server/missions/mission-templates.test.ts 
src/lib/server/sharing/share-link-repository.test.ts src/lib/server/sharing/share-resolver.test.ts src/lib/server/tasks/task-execution-workspace.test.ts src/lib/server/tasks/task-execution-policy.test.ts src/lib/server/tasks/task-handoff.test.ts src/lib/server/tasks/task-service.test.ts src/lib/server/session-tools/execute.test.ts src/lib/server/session-tools/manage-tasks.test.ts src/lib/server/session-tools/web-crawl.test.ts src/lib/app/view-constants.test.ts src/lib/quality/quality-summary.test.ts src/app/api/approvals/route.test.ts src/app/api/agents/agents-route.test.ts src/app/api/tasks/tasks-route.test.ts src/app/api/tasks/task-workspace-route.test.ts src/app/api/chats/chat-route.test.ts src/app/api/chats/clear-route.test.ts src/app/api/chats/compact-route.test.ts src/app/api/chats/context-pack-route.test.ts src/app/api/chats/context-status-route.test.ts src/app/api/config-versions/config-versions-route.test.ts src/app/api/runs/run-handoff-route.test.ts src/app/api/connectors/connector-doctor-route.test.ts src/app/api/extensions/managed-resources/route.test.ts src/app/api/healthz/route.test.ts src/app/api/logs/route.test.ts src/app/api/portability/export/route.test.ts src/app/api/portability/import/route.test.ts src/app/api/providers/[id]/route.test.ts src/app/api/schedules/preview/route.test.ts src/app/api/schedules/schedule-history-route.test.ts src/app/api/tts/route.test.ts",
92
92
  "test:builder": "tsx --test src/features/protocols/builder/utils/nodes-to-template.test.ts src/features/protocols/builder/utils/template-to-nodes.test.ts src/features/protocols/builder/validators/dag-validator.test.ts",
93
93
  "test:e2e": "node --import tsx scripts/browser-e2e-smoke.ts",
94
94
  "test:mcp:conformance": "node --import tsx ./scripts/mcp-conformance-check.ts",
@@ -19,6 +19,9 @@ const NOTABLE_TOOLS: Record<string, { label: string; color: string; icon: 'brain
19
19
  delegate_to_agent: { label: 'Delegating task', color: '#6366F1', icon: 'delegate' },
20
20
  check_delegation_status: { label: 'Checking delegation', color: '#6366F1', icon: 'delegate' },
21
21
  web_search: { label: 'Searched the web', color: '#22C55E', icon: 'search' },
22
+ web_fetch: { label: 'Read a web page', color: '#22C55E', icon: 'search' },
23
+ web_extract: { label: 'Extracted a web page', color: '#22C55E', icon: 'search' },
24
+ web_crawl: { label: 'Crawled a site', color: '#22C55E', icon: 'search' },
22
25
  connector_message_tool: { label: 'Sent a message', color: '#F97316', icon: 'message' },
23
26
  }
24
27
 
@@ -35,6 +38,7 @@ function extractSnippet(toolName: string, toolInput: string): string | null {
35
38
  if (toolName === 'check_delegation_status' && parsed.agentName) return parsed.agentName
36
39
  if (toolName.startsWith('delegate_to_') && parsed.task) return parsed.task
37
40
  if (toolName === 'web_search' && parsed.query) return parsed.query
41
+ if ((toolName === 'web_fetch' || toolName === 'web_extract' || toolName === 'web_crawl') && parsed.url) return parsed.url
38
42
  if (toolName === 'connector_message_tool' && parsed.to) return parsed.to
39
43
  } catch { /* ignore parse errors */ }
40
44
  return null
@@ -20,6 +20,8 @@ const TOOL_COLORS: Record<string, string> = {
20
20
  create_spreadsheet: '#10B981',
21
21
  web_search: '#3B82F6',
22
22
  web_fetch: '#3B82F6',
23
+ web_extract: '#3B82F6',
24
+ web_crawl: '#3B82F6',
23
25
  spawn_subagent: '#8B5CF6',
24
26
  delegate_to_agent: '#6366F1',
25
27
  check_delegation_status: '#6366F1',
@@ -77,6 +79,8 @@ export const TOOL_LABELS: Record<string, string> = {
77
79
  create_spreadsheet: 'Create Spreadsheet',
78
80
  web_search: 'Web Search',
79
81
  web_fetch: 'Web Fetch',
82
+ web_extract: 'Web Extract',
83
+ web_crawl: 'Web Crawl',
80
84
  claude_code: 'Claude Code',
81
85
  codex_cli: 'Codex CLI',
82
86
  opencode_cli: 'OpenCode CLI',
@@ -127,6 +131,8 @@ export const TOOL_DESCRIPTIONS: Record<string, string> = {
127
131
  create_spreadsheet: 'Create Excel or CSV files from structured data',
128
132
  web_search: 'Search the web for information',
129
133
  web_fetch: 'Fetch and read web page content',
134
+ web_extract: 'Extract readable content from a source URL',
135
+ web_crawl: 'Crawl a bounded set of pages from one site',
130
136
  claude_code: 'Enable delegation to Claude Code CLI',
131
137
  codex_cli: 'Enable delegation to OpenAI Codex CLI',
132
138
  opencode_cli: 'Enable delegation to OpenCode CLI',
@@ -26,7 +26,7 @@ test('routeTaskIntent keeps coding prompts prioritized over memory keywords', ()
26
26
  test('routeTaskIntent keeps hybrid research-plus-media prompts in research intent', () => {
27
27
  const decision = routeTaskIntent(
28
28
  'Can you tell me more if there is any news related to the US-Iran war, and can you send me some screenshots and give me a summary and maybe send me a voice note about it?',
29
- ['web_search', 'web_fetch', 'browser', 'manage_connectors'],
29
+ ['web_search', 'web_fetch', 'web_crawl', 'browser', 'manage_connectors'],
30
30
  null,
31
31
  makeClassification({
32
32
  taskIntent: 'research',
@@ -39,7 +39,7 @@ test('routeTaskIntent keeps hybrid research-plus-media prompts in research inten
39
39
  )
40
40
 
41
41
  assert.equal(decision.intent, 'research')
42
- assert.deepEqual(decision.preferredTools, ['web_search', 'web_fetch', 'browser', 'connector_message_tool'])
42
+ assert.deepEqual(decision.preferredTools, ['web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser', 'connector_message_tool'])
43
43
  })
44
44
 
45
45
  test('routeTaskIntent treats direct voice-note delivery as outreach', () => {
@@ -72,7 +72,7 @@ test('routeTaskIntent treats keep-watching update requests as research even with
72
72
  )
73
73
 
74
74
  assert.equal(decision.intent, 'research')
75
- assert.deepEqual(decision.preferredTools, ['web_search', 'web_fetch'])
75
+ assert.deepEqual(decision.preferredTools, ['web_search', 'web_fetch', 'web_extract', 'web_crawl'])
76
76
  })
77
77
 
78
78
  test('routeTaskIntent uses structured classification when available', () => {
@@ -99,7 +99,7 @@ test('routeTaskIntent uses structured classification when available', () => {
99
99
  )
100
100
 
101
101
  assert.equal(decision.intent, 'browsing')
102
- assert.deepEqual(decision.preferredTools, ['browser', 'web_fetch'])
102
+ assert.deepEqual(decision.preferredTools, ['browser', 'web_fetch', 'web_extract'])
103
103
  })
104
104
 
105
105
  function makeClassification(overrides: Partial<MessageClassification>): MessageClassification {
@@ -144,6 +144,7 @@ export function routeTaskIntent(
144
144
  [
145
145
  TOOL_CAPABILITY.researchSearch,
146
146
  TOOL_CAPABILITY.researchFetch,
147
+ TOOL_CAPABILITY.researchCrawl,
147
148
  ...(wantsScreenshots ? [TOOL_CAPABILITY.browserCapture] : []),
148
149
  ...(wantsVoiceDelivery ? [TOOL_CAPABILITY.deliveryVoiceNote] : []),
149
150
  ...(wantsOutboundDelivery ? [TOOL_CAPABILITY.deliveryMedia, TOOL_CAPABILITY.deliveryMessage] : []),
@@ -407,6 +407,33 @@ describe('translateRequestedToolInvocation advanced', () => {
407
407
  assert.equal(args.action, 'search')
408
408
  assert.equal(args.query, 'test query')
409
409
  })
410
+
411
+ it('maps web_extract to web with action=extract', () => {
412
+ const { toolName, args } = translateRequestedToolInvocation(
413
+ 'web_extract',
414
+ { url: 'https://example.com/source' },
415
+ '',
416
+ ['web'],
417
+ )
418
+ assert.equal(toolName, 'web')
419
+ assert.equal(args.action, 'extract')
420
+ assert.equal(args.url, 'https://example.com/source')
421
+ })
422
+
423
+ it('maps web_crawl to web with bounded crawl arguments', () => {
424
+ const { toolName, args } = translateRequestedToolInvocation(
425
+ 'web_crawl',
426
+ { url: 'https://example.com/', maxPages: 4, maxDepth: 1, includeExternal: false },
427
+ '',
428
+ ['web'],
429
+ )
430
+ assert.equal(toolName, 'web')
431
+ assert.equal(args.action, 'crawl')
432
+ assert.equal(args.url, 'https://example.com/')
433
+ assert.equal(args.maxPages, 4)
434
+ assert.equal(args.maxDepth, 1)
435
+ assert.equal(args.includeExternal, false)
436
+ })
410
437
  })
411
438
 
412
439
  // ---------------------------------------------------------------------------
@@ -127,6 +127,27 @@ export function translateRequestedToolInvocation(
127
127
  },
128
128
  }
129
129
  }
130
+ if (requestedName === 'web_extract') {
131
+ return {
132
+ toolName: 'web',
133
+ args: {
134
+ action: 'extract',
135
+ url: rawArgs.url,
136
+ },
137
+ }
138
+ }
139
+ if (requestedName === 'web_crawl') {
140
+ return {
141
+ toolName: 'web',
142
+ args: {
143
+ action: 'crawl',
144
+ url: rawArgs.url || rawArgs.query,
145
+ maxPages: rawArgs.maxPages ?? rawArgs.maxResults,
146
+ maxDepth: rawArgs.maxDepth,
147
+ includeExternal: rawArgs.includeExternal,
148
+ },
149
+ }
150
+ }
130
151
  if (requestedName === 'delegate_to_claude_code') {
131
152
  return { toolName: 'delegate', args: { ...rawArgs, backend: 'claude' } }
132
153
  }
@@ -349,7 +349,7 @@ export async function processIterationEvents(opts: ProcessIterationEventsOpts):
349
349
  }
350
350
  if (
351
351
  boundedExternalExecutionTask
352
- && ['http_request', 'web', 'web_search', 'web_fetch', 'browser'].includes(toolName)
352
+ && ['http_request', 'web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser'].includes(toolName)
353
353
  && countExternalExecutionResearchSteps(state.streamedToolEvents) >= 5
354
354
  && countDistinctExternalResearchHosts(state.streamedToolEvents) >= 3
355
355
  ) {
@@ -196,7 +196,7 @@ function getRequestedArtifactStatus(params: {
196
196
 
197
197
/**
 * Counts how many tool events in a turn were external research steps.
 *
 * An event counts when its `name` is one of the HTTP, web (search, fetch,
 * extract, crawl, unified) or browser tools listed below.
 *
 * @param toolEvents Tool events recorded for the turn.
 * @returns Number of events whose tool name is a research tool.
 */
export function countExternalExecutionResearchSteps(toolEvents: MessageToolEvent[]): number {
  const researchToolNames = ['http_request', 'web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser']
  let steps = 0
  for (const event of toolEvents) {
    if (researchToolNames.includes(event.name)) steps += 1
  }
  return steps
}
202
202
 
@@ -300,6 +300,8 @@ const RECOVERABLE_TOOL_ERROR_NAMES = new Set([
300
300
  'web',
301
301
  'web_search',
302
302
  'web_fetch',
303
+ 'web_extract',
304
+ 'web_crawl',
303
305
  'http_request',
304
306
  ])
305
307
 
@@ -390,6 +392,8 @@ export function getToolFrequencyHint(toolName: string, sessionExtensions: string
390
392
  case 'http_request':
391
393
  case 'web_search':
392
394
  case 'web_fetch':
395
+ case 'web_extract':
396
+ case 'web_crawl':
393
397
  return 'Hint: You have done extensive research. Stop gathering more sources and use the information you already have to complete the task.'
394
398
 
395
399
  case 'spawn_subagent':
@@ -490,7 +494,7 @@ function buildDeliverableFollowthroughPrompt(params: {
490
494
  }
491
495
 
492
496
  if (
493
- params.toolEvents.some((event) => ['web', 'web_search', 'web_fetch', 'browser', 'http_request'].includes(event.name))
497
+ params.toolEvents.some((event) => ['web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser', 'http_request'].includes(event.name))
494
498
  && !params.toolEvents.some((event) => ['files', 'write_file', 'edit_file', 'shell', 'execute_command'].includes(event.name))
495
499
  ) {
496
500
  lines.push(
@@ -135,11 +135,13 @@ describe('expandExtensionIds', () => {
135
135
  }
136
136
  })
137
137
 
138
- it('web expands to include web_search and web_fetch', () => {
138
+ it('web expands to include granular web tools', () => {
139
139
  const result = expandExtensionIds(['web'])
140
140
  assert.ok(result.includes('web'))
141
141
  assert.ok(result.includes('web_search'))
142
142
  assert.ok(result.includes('web_fetch'))
143
+ assert.ok(result.includes('web_extract'))
144
+ assert.ok(result.includes('web_crawl'))
143
145
  })
144
146
 
145
147
  it('removes duplicates after expansion', () => {
@@ -199,12 +201,14 @@ describe('expandExtensionIds', () => {
199
201
  // getExtensionAliases
200
202
  // ---------------------------------------------------------------------------
201
203
  describe('getExtensionAliases', () => {
202
- it('web returns [web, web_search, web_fetch]', () => {
204
+ it('web returns the full web alias group', () => {
203
205
  const result = getExtensionAliases('web')
204
206
  assert.ok(result.includes('web'))
205
207
  assert.ok(result.includes('web_search'))
206
208
  assert.ok(result.includes('web_fetch'))
207
- assert.equal(result.length, 5) // web, web_search, web_fetch, http_request, http
209
+ assert.ok(result.includes('web_extract'))
210
+ assert.ok(result.includes('web_crawl'))
211
+ assert.equal(result.length, 7) // web, web_search, web_fetch, web_extract, web_crawl, http_request, http
208
212
  })
209
213
 
210
214
  it('web_search returns the same group as web', () => {
@@ -0,0 +1,106 @@
1
+ import assert from 'node:assert/strict'
2
+ import { afterEach, describe, it } from 'node:test'
3
+ import { buildWebTools } from './web'
4
+ import type { ToolBuildContext } from './context'
5
+
6
+ const originalFetch = globalThis.fetch
7
+
8
/**
 * Builds a minimal tool build context for these tests in which only the
 * `web` extension/tool is reported as enabled.
 *
 * All timeouts are set to one second and the delegate/session hooks are
 * inert stubs.
 */
function createContext(): ToolBuildContext {
  const isWeb = (name: string): boolean => name === 'web'
  const shortTimeoutMs = 1000

  return {
    cwd: process.cwd(),
    ctx: undefined,
    hasExtension: isWeb,
    hasTool: isWeb,
    cleanupFns: [],
    commandTimeoutMs: shortTimeoutMs,
    claudeTimeoutMs: shortTimeoutMs,
    cliProcessTimeoutMs: shortTimeoutMs,
    persistDelegateResumeId: () => {},
    readStoredDelegateResumeId: () => null,
    resolveCurrentSession: () => null,
    activeExtensions: ['web'],
  } as ToolBuildContext
}
24
+
25
/**
 * Replaces `globalThis.fetch` with a stub that serves canned HTML.
 *
 * @param pages Map from exact request URL to the HTML body to return with a
 *   200 `text/html; charset=utf-8` response. URLs that are missing (or map
 *   to an empty string) get a 404 response with body `missing`.
 * @param calls Array that receives every requested URL, in request order.
 */
function mockFetch(pages: Record<string, string>, calls: string[] = []): void {
  const stub = async (input: RequestInfo | URL): Promise<Response> => {
    const url = input instanceof Request ? input.url : String(input)
    calls.push(url)

    const html = pages[url]
    if (html) {
      return new Response(html, {
        status: 200,
        headers: { 'content-type': 'text/html; charset=utf-8' },
      })
    }
    return new Response('missing', { status: 404, statusText: 'Not Found' })
  }

  globalThis.fetch = stub as typeof fetch
}
39
+
40
// Put the real fetch (captured in originalFetch at module load) back after every test,
// so a stub installed by mockFetch cannot leak into the next test.
+ afterEach(() => {
41
+ globalThis.fetch = originalFetch
42
+ })
43
+
44
// Suite for the direct web_extract / web_crawl tools returned by buildWebTools.
// Network access is replaced by mockFetch, which answers 404 for any URL not listed.
+ describe('web extract and crawl tools', () => {
45
// Registration: with only `web` enabled, the tool names starting with "web"
// must be exactly the five listed below (compared after sorting).
+ it('registers direct granular web tools when web is enabled', () => {
46
+ const names = buildWebTools(createContext()).map((entry) => entry.name).sort()
47
+
48
+ assert.deepEqual(names.filter((name) => name.startsWith('web')), [
49
+ 'web',
50
+ 'web_crawl',
51
+ 'web_extract',
52
+ 'web_fetch',
53
+ 'web_search',
54
+ ])
55
+ })
56
+
57
// Extraction: the tool is invoked with a "#section" fragment, but the mock only serves
// the fragment-less URL, so the title/body assertions also show the fragment was dropped
// before fetching. Header and script text must not appear in the output.
+ it('extracts readable page content with title and source URL', async () => {
58
+ mockFetch({
59
+ 'https://example.test/article': `
60
+ <!doctype html>
61
+ <title>Feature Page</title>
62
+ <header>Ignore navigation</header>
63
+ <main>
64
+ <h1>Feature Page</h1>
65
+ <p>Readable body text for the agent.</p>
66
+ </main>
67
+ <script>console.log('hidden')</script>
68
+ `,
69
+ })
70
+ const tool = buildWebTools(createContext()).find((entry) => entry.name === 'web_extract')
71
+ assert.ok(tool)
72
+
73
+ const output = String(await tool.invoke({ url: 'https://example.test/article#section' }))
74
+
75
+ assert.match(output, /Title: Feature Page/)
76
+ assert.match(output, /URL: https:\/\/example\.test\/article/)
77
+ assert.match(output, /Readable body text for the agent\./)
78
+ assert.doesNotMatch(output, /Ignore navigation/)
79
+ assert.doesNotMatch(output, /console\.log/)
80
+ })
81
+
82
// Crawl: the start page links to two same-origin pages and one external page.
// With maxPages 3 / maxDepth 1, the recorded fetch calls must be exactly the start page
// followed by /a and /b, and the external page must never be requested.
+ it('crawls same-origin pages within the requested page and depth bounds', async () => {
83
+ const calls: string[] = []
84
+ mockFetch({
85
+ 'https://site.test/': `
86
+ <title>Start</title>
87
+ <main>Start page <a href="/a">A</a> <a href="/b">B</a> <a href="https://external.test/x">External</a></main>
88
+ `,
89
+ 'https://site.test/a': '<title>A page</title><main>Alpha content</main>',
90
+ 'https://site.test/b': '<title>B page</title><main>Beta content</main>',
91
+ 'https://external.test/x': '<title>External</title><main>Should not be fetched</main>',
92
+ }, calls)
93
+ const tool = buildWebTools(createContext()).find((entry) => entry.name === 'web_crawl')
94
+ assert.ok(tool)
95
+
96
+ const output = String(await tool.invoke({ url: 'https://site.test/', maxPages: 3, maxDepth: 1 }))
97
+
98
+ assert.match(output, /Crawl results for: https:\/\/site\.test\//)
99
+ assert.match(output, /Pages crawled: 3/)
100
+ assert.match(output, /Start page/)
101
+ assert.match(output, /Alpha content/)
102
+ assert.match(output, /Beta content/)
103
+ assert.doesNotMatch(output, /Should not be fetched/)
104
+ assert.deepEqual(calls, ['https://site.test/', 'https://site.test/a', 'https://site.test/b'])
105
+ })
106
+ })
@@ -20,6 +20,11 @@ describe('inferWebActionFromArgs', () => {
20
20
  assert.equal(inferWebActionFromArgs({ action: 'search', url: 'https://example.com/article' }), 'search')
21
21
  })
22
22
 
23
+ it('preserves explicit extract and crawl actions', () => {
24
+ assert.equal(inferWebActionFromArgs({ action: 'extract', url: 'https://example.com/article' }), 'extract')
25
+ assert.equal(inferWebActionFromArgs({ action: 'crawl', url: 'https://example.com/' }), 'crawl')
26
+ })
27
+
23
28
  it('normalizes stringified browser form payloads', () => {
24
29
  const normalized = normalizeBrowserActionParams({
25
30
  input: JSON.stringify({
@@ -176,8 +176,14 @@ export function inferWebActionFromArgs(params: {
176
176
  query?: string
177
177
  url?: string
178
178
  method?: string
179
- }): 'search' | 'fetch' | 'api' | undefined {
180
- if (params.action === 'search' || params.action === 'fetch' || params.action === 'api') return params.action
179
+ }): 'search' | 'fetch' | 'extract' | 'crawl' | 'api' | undefined {
180
+ if (
181
+ params.action === 'search'
182
+ || params.action === 'fetch'
183
+ || params.action === 'extract'
184
+ || params.action === 'crawl'
185
+ || params.action === 'api'
186
+ ) return params.action
181
187
  if (typeof params.method === 'string' && params.method.trim()) return 'api'
182
188
  if (typeof params.url === 'string' && /^https?:\/\//i.test(params.url.trim())) return 'fetch'
183
189
  if (typeof params.query === 'string' && params.query.trim()) return 'search'
@@ -199,6 +199,149 @@ async function executeWebApiAction(normalized: Record<string, unknown>) {
199
199
  }, requestArgs)
200
200
  }
201
201
 
202
/**
 * One fetched URL reduced to the pieces the extract and crawl actions report.
 */
interface ExtractedWebPage {
  /** The requested URL after normalization (http/https only, fragment removed). */
  url: string
  /** Whitespace-collapsed `<title>` text; falls back to the URL when empty or for PDFs. */
  title: string
  /** Whitespace-collapsed readable text of the page, or the text extracted from a PDF. */
  text: string
  /** Absolute http(s) links found on the page, without fragments; empty for PDFs. */
  links: Array<string>
}
208
+
209
/**
 * Validates a user-supplied URL and returns it in normalized form.
 *
 * Surrounding whitespace is trimmed, the URL is parsed with the WHATWG `URL`
 * parser, and the fragment is removed so that `page#a` and `page#b` are
 * treated as the same resource.
 *
 * @param rawUrl URL text as supplied by the caller.
 * @returns The serialized URL without its fragment.
 * @throws Error when the input is blank or the scheme is not http/https.
 * @throws TypeError (from `new URL`) when the input cannot be parsed.
 */
function normalizeHttpUrl(rawUrl: string): string {
  const candidate = rawUrl.trim()
  if (candidate.length === 0) {
    throw new Error('URL is required.')
  }

  const target = new URL(candidate)
  const isHttp = ['http:', 'https:'].includes(target.protocol)
  if (!isHttp) {
    throw new Error('Only http and https URLs are supported.')
  }

  target.hash = ''
  return target.href
}
219
+
220
/**
 * Coerces a loosely-typed tool argument to an integer within `[min, max]`.
 *
 * Numbers are used as-is, strings are parsed as base-10 integers, and any
 * other type is treated as missing. Non-finite results yield `fallback`;
 * finite results are truncated toward zero and then clamped.
 *
 * @param value Raw argument value.
 * @param fallback Value returned when `value` is not a finite number.
 * @param min Lower bound (inclusive).
 * @param max Upper bound (inclusive).
 */
function clampNumber(value: unknown, fallback: number, min: number, max: number): number {
  let candidate = Number.NaN
  if (typeof value === 'number') {
    candidate = value
  } else if (typeof value === 'string') {
    candidate = Number.parseInt(value, 10)
  }

  if (!Number.isFinite(candidate)) {
    return fallback
  }

  const whole = Math.trunc(candidate)
  return Math.max(min, Math.min(max, whole))
}
229
+
230
/**
 * Collects the distinct http(s) link targets of a parsed page.
 *
 * Each `a[href]` value is resolved against `pageUrl`, non-http(s) schemes are
 * dropped, and the fragment is removed so in-page anchors collapse onto the
 * same URL. Links are returned in first-seen document order.
 *
 * @param $ Cheerio document loaded from the page HTML.
 * @param pageUrl Absolute URL used as the base for relative hrefs.
 * @returns De-duplicated absolute URLs.
 */
function extractLinks($: ReturnType<typeof cheerio.load>, pageUrl: string): string[] {
  // A Set keeps insertion order and makes de-duplication O(1) per link,
  // instead of rescanning an array for every anchor on the page.
  const unique = new Set<string>()
  $('a[href]').each((_index, element) => {
    const rawHref = $(element).attr('href') || ''
    try {
      const resolved = new URL(rawHref, pageUrl)
      if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') return
      resolved.hash = ''
      unique.add(resolved.toString())
    } catch {
      // Ignore malformed links from the crawled page.
    }
  })
  return [...unique]
}
246
+
247
/**
 * Fetches a URL and reduces the response to title, readable text and links.
 *
 * The URL is normalized first (http/https only, fragment removed). PDF
 * responses are parsed with `pdf-parse` and return their text with no links.
 * For anything else the body is parsed as HTML: links are collected before
 * script/style/nav/header/footer elements are stripped, then the text of the
 * first `article`, `main` or `[role="main"]` element (or the whole body) is
 * whitespace-collapsed.
 *
 * @param fetchUrl URL to fetch.
 * @returns The extracted page.
 * @throws Error when the URL is invalid, the request times out (15 s), the
 *   response status is not OK, or PDF parsing fails.
 */
async function extractReadablePage(fetchUrl: string): Promise<ExtractedWebPage> {
  const url = normalizeHttpUrl(fetchUrl)
  const res = await fetch(url, {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; SwarmClaw/1.0)' },
    signal: AbortSignal.timeout(15000),
  })
  if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`)

  // Media types are case-insensitive, so normalize before matching; otherwise
  // a server answering "Application/PDF" would have its bytes parsed as HTML.
  const contentType = (res.headers.get('content-type') || '').toLowerCase()
  if (contentType.includes('application/pdf')) {
    const pdfMod = await import(/* webpackIgnore: true */ 'pdf-parse')
    // The module may expose the parser as a default export or as the module itself.
    const pdfParse = ((pdfMod as Record<string, unknown>).default ?? pdfMod) as (buf: Buffer) => Promise<{ text: string }>
    const arrayBuffer = await res.arrayBuffer()
    const result = await pdfParse(Buffer.from(arrayBuffer))
    return { url, title: url, text: result.text, links: [] }
  }

  const html = await res.text()
  const $ = cheerio.load(html)
  const title = $('title').first().text().replace(/\s+/g, ' ').trim() || url
  // Collect links before pruning so links inside nav/header/footer are still found.
  const links = extractLinks($, url)
  $('script, style, noscript, nav, footer, header').remove()
  const main = $('article, main, [role="main"]').first()
  const text = (main.length ? main.text() : $('body').text()).replace(/\s+/g, ' ').trim()
  return { url, title, text, links }
}
272
+
273
/**
 * Renders an extracted page as the text returned by the extract action:
 * a `Title:` line, a `URL:` line, a blank line, then the readable text
 * (or a placeholder when there is none), truncated to `MAX_OUTPUT`.
 */
function formatExtractedPage(page: ExtractedWebPage): string {
  const body = page.text || '(no readable text found)'
  const report = `Title: ${page.title}\nURL: ${page.url}\n\n${body}`
  return truncate(report, MAX_OUTPUT)
}
277
+
278
/**
 * Renders crawl output: a two-part header followed by one numbered section
 * per page, separated by blank lines and truncated to `MAX_OUTPUT`.
 *
 * Each page's text is cut to 1200 characters so that a single long page
 * cannot crowd the others out of the report.
 *
 * @param startUrl URL the crawl started from.
 * @param pages Pages in the order they were crawled.
 */
function formatCrawlResults(startUrl: string, pages: ExtractedWebPage[]): string {
  if (pages.length === 0) return `No crawl results found for: ${startUrl}`

  const header = [`Crawl results for: ${startUrl}`, `Pages crawled: ${pages.length}`]
  const entries = pages.map((page, position) => {
    const excerpt = truncate(page.text || '(no readable text found)', 1200)
    return `${position + 1}. ${page.title}\nURL: ${page.url}\nText: ${excerpt}`
  })

  return truncate([...header, ...entries].join('\n\n'), MAX_OUTPUT)
}
288
+
289
/**
 * Runs the `extract` action for already-normalized tool arguments.
 *
 * The target is taken from `url`, falling back to `query`. Failures are
 * reported as an `Error: …` string rather than thrown.
 *
 * @param normalized Normalized tool input.
 * @returns The formatted page, or an error message string.
 */
async function executeWebExtractAction(normalized: Record<string, unknown>) {
  const target = String(normalized.url || normalized.query || '')
  if (target.trim() === '') {
    return 'Error: "url" is required for extract action.'
  }

  try {
    const page = await extractReadablePage(target)
    return formatExtractedPage(page)
  } catch (err: unknown) {
    return `Error: ${errorMessage(err)}`
  }
}
298
+
299
/**
 * Runs the `crawl` action: a breadth-first walk starting at `url` (or `query`).
 *
 * Bounds:
 * - `maxPages` (alias `maxResults`): default 5, clamped to 1..25.
 * - `maxDepth`: default 1, clamped to 0..3 link hops from the start page.
 * - Only same-origin links are followed unless `includeExternal === true`
 *   or `sameOrigin === false`.
 * - Link discovery stops for a page once `maxPages * 4` URLs are known.
 *
 * A page that fails to load is still reported (its text is the error
 * message) and counts toward `maxPages`.
 *
 * @param normalized Normalized tool input.
 * @returns The formatted crawl report, or an `Error: …` string for a bad start URL.
 */
async function executeWebCrawlAction(normalized: Record<string, unknown>) {
  const rawUrl = String(normalized.url || normalized.query || '')
  if (!rawUrl.trim()) return 'Error: "url" is required for crawl action.'

  let startUrl: string
  try {
    startUrl = normalizeHttpUrl(rawUrl)
  } catch (err: unknown) {
    return `Error: ${errorMessage(err)}`
  }

  const maxPages = clampNumber(normalized.maxPages ?? normalized.maxResults, 5, 1, 25)
  const maxDepth = clampNumber(normalized.maxDepth, 1, 0, 3)
  const includeExternal = normalized.includeExternal === true || normalized.sameOrigin === false
  const startOrigin = new URL(startUrl).origin
  const discoveryCap = maxPages * 4

  // `pending` is an append-only FIFO read through `head`, which avoids the O(n)
  // cost of Array.prototype.shift. `discovered` holds every URL ever enqueued
  // (visited or still waiting), so one O(1) lookup replaces the separate
  // "already visited" and "already queued" checks.
  const pending: Array<{ url: string; depth: number }> = [{ url: startUrl, depth: 0 }]
  const discovered = new Set<string>([startUrl])
  const pages: ExtractedWebPage[] = []

  for (let head = 0; head < pending.length && pages.length < maxPages; head++) {
    const next = pending[head]
    if (!next) break

    let page: ExtractedWebPage
    try {
      page = await extractReadablePage(next.url)
    } catch (err: unknown) {
      // Keep the failure in the report instead of aborting the whole crawl.
      page = { url: next.url, title: next.url, text: `Error: ${errorMessage(err)}`, links: [] }
    }
    pages.push(page)

    // Pages at the depth limit are reported but their links are not followed.
    if (next.depth >= maxDepth) continue
    for (const link of page.links) {
      if (discovered.has(link)) continue
      if (!includeExternal && new URL(link).origin !== startOrigin) continue
      discovered.add(link)
      pending.push({ url: link, depth: next.depth + 1 })
      if (discovered.size >= discoveryCap) break
    }
  }

  return formatCrawlResults(startUrl, pages)
}
344
+
202
345
  async function executeWebAction(args: Record<string, unknown>) {
203
346
  const normalized = normalizeToolInputArgs(args)
204
347
  const { query, url, maxResults } = normalized as { query?: string; url?: string; maxResults?: number }
@@ -219,32 +362,13 @@ async function executeWebAction(args: Record<string, unknown>) {
219
362
  const results = await provider.search(searchQuery, limit)
220
363
  if (results.length === 0) return 'No results found.'
221
364
  return formatWebSearchResults(searchQuery, results)
222
- } else if (action === 'fetch') {
365
+ } else if (action === 'fetch' || action === 'extract') {
223
366
  const fetchUrl = url || query
224
- if (!fetchUrl) return 'Error: "url" is required for fetch action.'
225
- const res = await fetch(fetchUrl, {
226
- headers: { 'User-Agent': 'Mozilla/5.0 (compatible; SwarmClaw/1.0)' },
227
- signal: AbortSignal.timeout(15000),
228
- })
229
- if (!res.ok) return `HTTP ${res.status}: ${res.statusText}`
230
- const contentType = res.headers.get('content-type') || ''
231
- if (contentType.includes('application/pdf')) {
232
- try {
233
- const pdfMod = await import(/* webpackIgnore: true */ 'pdf-parse')
234
- const pdfParse = ((pdfMod as Record<string, unknown>).default ?? pdfMod) as (buf: Buffer) => Promise<{ text: string }>
235
- const arrayBuffer = await res.arrayBuffer()
236
- const result = await pdfParse(Buffer.from(arrayBuffer))
237
- return truncate(result.text, MAX_OUTPUT)
238
- } catch (err: unknown) {
239
- return `Error parsing PDF: ${errorMessage(err)}`
240
- }
241
- }
242
- const html = await res.text()
243
- const $ = cheerio.load(html)
244
- $('script, style, noscript, nav, footer, header').remove()
245
- const main = $('article, main, [role="main"]').first()
246
- const text = (main.length ? main.text() : $('body').text()).replace(/\s+/g, ' ').trim()
247
- return truncate(text, MAX_OUTPUT)
367
+ if (!fetchUrl) return `Error: "url" is required for ${action} action.`
368
+ const page = await extractReadablePage(fetchUrl)
369
+ return action === 'extract' ? formatExtractedPage(page) : truncate(page.text, MAX_OUTPUT)
370
+ } else if (action === 'crawl') {
371
+ return executeWebCrawlAction(normalized)
248
372
  } else if (action === 'api') {
249
373
  return executeWebApiAction(normalized)
250
374
  }
@@ -259,21 +383,25 @@ async function executeWebAction(args: Record<string, unknown>) {
259
383
  */
260
384
  const WebExtension: Extension = {
261
385
  name: 'Core Web',
262
- description: 'Search the web, fetch content, and make HTTP API calls.',
386
+ description: 'Search the web, extract pages, crawl sites, and make HTTP API calls.',
263
387
  hooks: {
264
- getCapabilityDescription: () => 'I can use the unified `web` tool with action `search` for research, `fetch` for reading a URL, and `api` for raw HTTP API calls with full control over method/headers/body.',
388
+ getCapabilityDescription: () => 'I can use `web_search` for fresh research, `web_extract` for a specific URL, `web_crawl` for bounded multi-page site reads, and the unified `web` tool for search, fetch, crawl, and raw HTTP API calls.',
265
389
  } as ExtensionHooks,
266
390
  tools: [
267
391
  {
268
392
  name: 'web',
269
- description: 'Unified web access tool. Actions: search (web search), fetch (read URL content), api (raw HTTP request with method/headers/body).',
393
+ description: 'Unified web access tool. Actions: search (web search), fetch/extract (read URL content), crawl (bounded same-origin crawl), api (raw HTTP request with method/headers/body).',
270
394
  parameters: {
271
395
  type: 'object',
272
396
  properties: {
273
- action: { type: 'string', enum: ['search', 'fetch', 'api'] },
397
+ action: { type: 'string', enum: ['search', 'fetch', 'extract', 'crawl', 'api'] },
274
398
  query: { type: 'string' },
275
399
  url: { type: 'string' },
276
400
  maxResults: { type: 'number' },
401
+ maxPages: { type: 'number', description: 'Maximum pages for crawl action, default 5, max 25' },
402
+ maxDepth: { type: 'number', description: 'Maximum crawl depth, default 1, max 3' },
403
+ includeExternal: { type: 'boolean', description: 'Allow crawl to leave the starting origin, default false' },
404
+ sameOrigin: { type: 'boolean', description: 'Keep crawl on the starting origin when true, default true' },
277
405
  method: { type: 'string', enum: ['GET', 'POST', 'PUT', 'PATCH', 'DELETE', 'HEAD', 'OPTIONS'], description: 'HTTP method (for api action)' },
278
406
  headers: { type: 'object', additionalProperties: { type: 'string' }, description: 'Request headers (for api action)' },
279
407
  body: { type: 'string', description: 'Request body (for api action)' },
@@ -283,6 +411,71 @@ const WebExtension: Extension = {
283
411
  required: ['action']
284
412
  },
285
413
  execute: async (args) => executeWebAction(args)
414
+ },
415
+ {
416
+ name: 'web_search',
417
+ description: 'Search the web and return ranked results with URLs and snippets.',
418
+ parameters: {
419
+ type: 'object',
420
+ properties: {
421
+ query: { type: 'string' },
422
+ maxResults: { type: 'number' },
423
+ },
424
+ required: ['query'],
425
+ },
426
+ planning: {
427
+ capabilities: ['research.search'],
428
+ disciplineGuidance: ['Use `web_search` for fresh information, then fetch or extract only the sources you need.'],
429
+ },
430
+ execute: async (args) => executeWebAction({ ...normalizeToolInputArgs(args), action: 'search' }),
431
+ },
432
+ {
433
+ name: 'web_fetch',
434
+ description: 'Read a specific URL and return readable page text.',
435
+ parameters: {
436
+ type: 'object',
437
+ properties: { url: { type: 'string' } },
438
+ required: ['url'],
439
+ },
440
+ planning: {
441
+ capabilities: ['research.fetch'],
442
+ disciplineGuidance: ['Use `web_fetch` when you already have a URL and only need the readable text.'],
443
+ },
444
+ execute: async (args) => executeWebAction({ ...normalizeToolInputArgs(args), action: 'fetch' }),
445
+ },
446
+ {
447
+ name: 'web_extract',
448
+ description: 'Extract readable content from a URL with title and source URL included.',
449
+ parameters: {
450
+ type: 'object',
451
+ properties: { url: { type: 'string' } },
452
+ required: ['url'],
453
+ },
454
+ planning: {
455
+ capabilities: ['research.fetch'],
456
+ disciplineGuidance: ['Use `web_extract` for source-grounded page reads where the title and URL should stay attached to the extracted text.'],
457
+ },
458
+ execute: async (args) => executeWebExtractAction(normalizeToolInputArgs(args)),
459
+ },
460
+ {
461
+ name: 'web_crawl',
462
+ description: 'Crawl a small set of pages starting from one URL. Same-origin by default, bounded by maxPages and maxDepth.',
463
+ parameters: {
464
+ type: 'object',
465
+ properties: {
466
+ url: { type: 'string' },
467
+ maxPages: { type: 'number' },
468
+ maxDepth: { type: 'number' },
469
+ includeExternal: { type: 'boolean' },
470
+ sameOrigin: { type: 'boolean' },
471
+ },
472
+ required: ['url'],
473
+ },
474
+ planning: {
475
+ capabilities: ['research.crawl'],
476
+ disciplineGuidance: ['Use `web_crawl` only when the task needs multiple pages from the same site. Keep maxPages low and summarize after one crawl.'],
477
+ },
478
+ execute: async (args) => executeWebCrawlAction(normalizeToolInputArgs(args)),
286
479
  }
287
480
  ]
288
481
  }
@@ -307,6 +500,40 @@ export function buildWebTools(bctx: ToolBuildContext): StructuredToolInterface[]
307
500
  }
308
501
  )
309
502
  )
503
+ tools.push(
504
+ tool(
505
+ async (args) => executeWebAction({ ...normalizeToolInputArgs((args ?? {}) as Record<string, unknown>), action: 'search' }),
506
+ {
507
+ name: 'web_search',
508
+ description: 'Search the web and return ranked results with URLs and snippets.',
509
+ schema: z.object({}).passthrough()
510
+ }
511
+ ),
512
+ tool(
513
+ async (args) => executeWebAction({ ...normalizeToolInputArgs((args ?? {}) as Record<string, unknown>), action: 'fetch' }),
514
+ {
515
+ name: 'web_fetch',
516
+ description: 'Read a specific URL and return readable page text.',
517
+ schema: z.object({}).passthrough()
518
+ }
519
+ ),
520
+ tool(
521
+ async (args) => executeWebExtractAction(normalizeToolInputArgs((args ?? {}) as Record<string, unknown>)),
522
+ {
523
+ name: 'web_extract',
524
+ description: 'Extract readable content from a URL with title and source URL included.',
525
+ schema: z.object({}).passthrough()
526
+ }
527
+ ),
528
+ tool(
529
+ async (args) => executeWebCrawlAction(normalizeToolInputArgs((args ?? {}) as Record<string, unknown>)),
530
+ {
531
+ name: 'web_crawl',
532
+ description: 'Crawl a small set of pages starting from one URL. Same-origin by default, bounded by maxPages and maxDepth.',
533
+ schema: z.object({}).passthrough()
534
+ }
535
+ )
536
+ )
310
537
  }
311
538
 
312
539
  // Browser tool (kept as direct injection for now due to complexity)
@@ -664,6 +664,8 @@ if (!IS_BUILD_BOOTSTRAP) {
664
664
  'files',
665
665
  'web_search',
666
666
  'web_fetch',
667
+ 'web_extract',
668
+ 'web_crawl',
667
669
  'browser',
668
670
  'manage_agents',
669
671
  'manage_tasks',
@@ -3,7 +3,7 @@ const EXTENSION_ALIAS_GROUPS: string[][] = [
3
3
  ['execute', 'sandbox'],
4
4
  ['files', 'read_file', 'write_file', 'list_files', 'copy_file', 'move_file', 'delete_file', 'send_file'],
5
5
  ['edit_file'],
6
- ['web', 'web_search', 'web_fetch', 'http_request', 'http'],
6
+ ['web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'http_request', 'http'],
7
7
  ['browser', 'openclaw_browser'],
8
8
  ['delegate', 'claude_code', 'codex_cli', 'opencode_cli', 'gemini_cli', 'copilot_cli', 'droid_cli', 'cursor_cli', 'qwen_code_cli', 'delegate_to_claude_code', 'delegate_to_codex_cli', 'delegate_to_opencode_cli', 'delegate_to_gemini_cli', 'delegate_to_copilot_cli', 'delegate_to_droid_cli', 'delegate_to_cursor_cli', 'delegate_to_qwen_code_cli'],
9
9
  ['manage_platform'],
@@ -255,12 +255,12 @@ describe('explicit allows override mode blocks', () => {
255
255
  // Category blocks
256
256
  // ---------------------------------------------------------------------------
257
257
  describe('category blocks', () => {
258
- it('blocking network category blocks web, web_search, web_fetch', () => {
259
- const d = resolveSessionToolPolicy(['web', 'web_search', 'web_fetch', 'memory'], {
258
+ it('blocking network category blocks granular web tools', () => {
259
+ const d = resolveSessionToolPolicy(['web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'memory'], {
260
260
  capabilityBlockedCategories: ['network'],
261
261
  })
262
262
  assert.deepStrictEqual(d.enabledExtensions, ['memory'])
263
- assert.equal(d.blockedExtensions.length, 3)
263
+ assert.equal(d.blockedExtensions.length, 5)
264
264
  for (const b of d.blockedExtensions) {
265
265
  assert.match(b.reason, /category "network"/)
266
266
  }
@@ -49,9 +49,11 @@ const TOOL_DESCRIPTORS: Record<string, ToolDescriptor> = {
49
49
  move_file: { categories: ['filesystem'], concreteTools: ['move_file'] },
50
50
  edit_file: { categories: ['filesystem'], concreteTools: ['edit_file'] },
51
51
  delete_file: { categories: ['filesystem'], concreteTools: ['delete_file'], destructive: true },
52
- web: { categories: ['network'], concreteTools: ['web', 'web_search', 'web_fetch'] },
52
+ web: { categories: ['network'], concreteTools: ['web', 'web_search', 'web_fetch', 'web_extract', 'web_crawl'] },
53
53
  web_search: { categories: ['network'], concreteTools: ['web_search'] },
54
54
  web_fetch: { categories: ['network'], concreteTools: ['web_fetch'] },
55
+ web_extract: { categories: ['network'], concreteTools: ['web_extract'] },
56
+ web_crawl: { categories: ['network'], concreteTools: ['web_crawl'] },
55
57
  browser: { categories: ['browser', 'network'], concreteTools: ['browser', 'openclaw_browser'] },
56
58
  delegate: { categories: ['delegation', 'execution'], concreteTools: ['delegate', 'delegate_to_claude_code', 'delegate_to_codex_cli', 'delegate_to_opencode_cli', 'delegate_to_gemini_cli', 'delegate_to_copilot_cli', 'delegate_to_droid_cli', 'delegate_to_cursor_cli', 'delegate_to_qwen_code_cli'] },
57
59
  claude_code: { categories: ['delegation', 'execution'], concreteTools: ['delegate_to_claude_code'] },
@@ -85,6 +87,7 @@ const TOOL_DESCRIPTORS: Record<string, ToolDescriptor> = {
85
87
  spawn_subagent: { categories: ['delegation', 'platform'], concreteTools: ['spawn_subagent', 'delegate_to_agent'] },
86
88
  context_mgmt: { categories: ['memory'], concreteTools: ['context_mgmt', 'context_status', 'context_summarize'] },
87
89
  extension_creator: { categories: ['filesystem', 'execution'], concreteTools: ['extension_creator', 'extension_creator_tool'] },
90
+ wallet: { categories: ['outbound'], concreteTools: ['wallet'] },
88
91
  mailbox: { categories: ['network', 'platform', 'outbound'], concreteTools: ['mailbox', 'inbox'] },
89
92
  ask_human: { categories: ['platform'], concreteTools: ['ask_human', 'human_loop'] },
90
93
  google_workspace: { categories: ['network'], concreteTools: ['google_workspace', 'gws'] },
@@ -12,10 +12,11 @@ function uniqueExtensionId(prefix: string): string {
12
12
 
13
13
  describe('tool-planning', () => {
14
14
  it('collects core planning metadata for aliased built-in tools', () => {
15
- const view = getEnabledToolPlanningView(['web_search', 'web_fetch', 'browser', 'manage_connectors'])
15
+ const view = getEnabledToolPlanningView(['web_search', 'web_fetch', 'web_extract', 'web_crawl', 'browser', 'manage_connectors'])
16
16
 
17
17
  assert.deepEqual(view.displayToolIds, ['browser', 'manage_connectors', 'web'])
18
18
  assert.deepEqual(getToolsForCapability(['web_search'], TOOL_CAPABILITY.researchSearch), ['web_search'])
19
+ assert.deepEqual(getToolsForCapability(['web_crawl'], TOOL_CAPABILITY.researchCrawl), ['web_crawl'])
19
20
  assert.deepEqual(getToolsForCapability(['manage_connectors'], TOOL_CAPABILITY.deliveryVoiceNote), ['connector_message_tool'])
20
21
  })
21
22
 
@@ -7,6 +7,7 @@ import { canonicalizeExtensionId, expandExtensionIds } from './tool-aliases'
7
7
  export const TOOL_CAPABILITY = {
8
8
  researchSearch: 'research.search',
9
9
  researchFetch: 'research.fetch',
10
+ researchCrawl: 'research.crawl',
10
11
  browserNavigate: 'browser.navigate',
11
12
  browserCapture: 'browser.capture',
12
13
  artifactPdf: 'artifact.pdf',
@@ -98,6 +99,36 @@ const CORE_TOOL_PLANNING: Record<string, LegacyToolPlanningEntry[]> = {
98
99
  },
99
100
  ],
100
101
  },
102
+ {
103
+ toolName: 'web_extract',
104
+ capabilities: [TOOL_CAPABILITY.researchFetch],
105
+ disciplineGuidance: [
106
+ 'For `web_extract`, use `{"url":"https://..."}` when source title and URL should remain attached to extracted page text.',
107
+ 'Extract the exact pages you need, then synthesize. Do not extract the same page repeatedly.',
108
+ ],
109
+ requestMatchers: [
110
+ {
111
+ capability: TOOL_CAPABILITY.researchFetch,
112
+ patterns: ['extract', 'readable content', 'page text', 'source text'],
113
+ requireLiteralUrl: true,
114
+ },
115
+ ],
116
+ },
117
+ {
118
+ toolName: 'web_crawl',
119
+ capabilities: [TOOL_CAPABILITY.researchCrawl],
120
+ disciplineGuidance: [
121
+ 'For `web_crawl`, use `{"url":"https://...","maxPages":5,"maxDepth":1}` only when a task needs several pages from the same site.',
122
+ 'Keep crawls bounded and summarize after one crawl. Prefer `web_extract` for a single known URL.',
123
+ ],
124
+ requestMatchers: [
125
+ {
126
+ capability: TOOL_CAPABILITY.researchCrawl,
127
+ patterns: ['crawl', 'site map', 'sitemap', 'multiple pages', 'whole site', 'scan site'],
128
+ requireLiteralUrl: true,
129
+ },
130
+ ],
131
+ },
101
132
  ],
102
133
  browser: [
103
134
  {
@@ -4,11 +4,11 @@ const INJECTION_PATTERNS: Array<{ code: string; re: RegExp; note: string }> = [
4
4
  { code: 'ignore_instructions', re: /\bignore (?:all |any |the )?(?:previous|prior|above|system|developer) instructions\b/i, note: 'tries to override existing instructions' },
5
5
  { code: 'reveal_prompt', re: /\b(?:reveal|show|print|dump)\b[\s\S]{0,40}\b(?:system prompt|developer prompt|hidden prompt)\b/i, note: 'asks for hidden prompt data' },
6
6
  { code: 'credential_theft', re: /\b(?:api key|token|password|secret|credential)s?\b[\s\S]{0,40}\b(?:send|share|reveal|print|dump|exfiltrat)/i, note: 'asks for secrets or credentials' },
7
- { code: 'tool_override', re: /\b(?:call|use|run)\b[\s\S]{0,40}\b(?:shell|terminal|browser|http_request|web_fetch|connector_message_tool)\b[\s\S]{0,40}\b(?:without|ignore)\b/i, note: 'tries to direct tool use by bypassing policy' },
7
+ { code: 'tool_override', re: /\b(?:call|use|run)\b[\s\S]{0,40}\b(?:shell|terminal|browser|http_request|web_fetch|web_extract|web_crawl|connector_message_tool)\b[\s\S]{0,40}\b(?:without|ignore)\b/i, note: 'tries to direct tool use by bypassing policy' },
8
8
  { code: 'workflow_override', re: /\b(?:act as|pretend to be)\b[\s\S]{0,40}\b(?:system|developer|administrator|operator)\b/i, note: 'tries to impersonate a higher-priority role' },
9
9
  ]
10
10
 
11
- const WEB_TOOL_NAMES = new Set(['browser', 'web_search', 'web_fetch', 'http_request'])
11
+ const WEB_TOOL_NAMES = new Set(['browser', 'web_search', 'web_fetch', 'web_extract', 'web_crawl', 'http_request'])
12
12
 
13
13
  function normalizeMode(value: unknown): 'off' | 'warn' | 'block' {
14
14
  const normalized = typeof value === 'string' ? value.trim().toLowerCase() : ''
@@ -218,6 +218,8 @@ export type SessionTool =
218
218
  | 'qwen_code_cli'
219
219
  | 'web_search'
220
220
  | 'web_fetch'
221
+ | 'web_extract'
222
+ | 'web_crawl'
221
223
  | 'edit_file'
222
224
  | 'process'
223
225
  | 'spawn_subagent'