@hybridaione/hybridclaw 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/CHANGELOG.md +38 -0
  2. package/README.md +49 -15
  3. package/config.example.json +4 -1
  4. package/container/package-lock.json +2 -2
  5. package/container/package.json +1 -1
  6. package/container/src/browser-tools.ts +1 -1
  7. package/container/src/index.ts +243 -14
  8. package/container/src/token-usage.ts +18 -2
  9. package/container/src/tools.ts +339 -1
  10. package/container/src/types.ts +28 -2
  11. package/dist/agent.d.ts +2 -2
  12. package/dist/agent.d.ts.map +1 -1
  13. package/dist/agent.js +2 -2
  14. package/dist/agent.js.map +1 -1
  15. package/dist/channels/discord/attachments.d.ts +9 -0
  16. package/dist/channels/discord/attachments.d.ts.map +1 -0
  17. package/dist/channels/discord/attachments.js +245 -0
  18. package/dist/channels/discord/attachments.js.map +1 -0
  19. package/dist/channels/discord/delivery.d.ts +31 -0
  20. package/dist/channels/discord/delivery.d.ts.map +1 -0
  21. package/dist/channels/discord/delivery.js +60 -0
  22. package/dist/channels/discord/delivery.js.map +1 -0
  23. package/dist/channels/discord/inbound.d.ts +20 -0
  24. package/dist/channels/discord/inbound.d.ts.map +1 -0
  25. package/dist/channels/discord/inbound.js +44 -0
  26. package/dist/channels/discord/inbound.js.map +1 -0
  27. package/dist/channels/discord/mentions.d.ts +14 -0
  28. package/dist/channels/discord/mentions.d.ts.map +1 -0
  29. package/dist/channels/discord/mentions.js +118 -0
  30. package/dist/channels/discord/mentions.js.map +1 -0
  31. package/dist/channels/discord/runtime.d.ts +22 -0
  32. package/dist/channels/discord/runtime.d.ts.map +1 -0
  33. package/dist/channels/discord/runtime.js +972 -0
  34. package/dist/channels/discord/runtime.js.map +1 -0
  35. package/dist/channels/discord/stream.d.ts +32 -0
  36. package/dist/channels/discord/stream.d.ts.map +1 -0
  37. package/dist/channels/discord/stream.js +196 -0
  38. package/dist/channels/discord/stream.js.map +1 -0
  39. package/dist/channels/discord/tool-actions.d.ts +31 -0
  40. package/dist/channels/discord/tool-actions.d.ts.map +1 -0
  41. package/dist/channels/discord/tool-actions.js +268 -0
  42. package/dist/channels/discord/tool-actions.js.map +1 -0
  43. package/dist/container-runner.d.ts +2 -2
  44. package/dist/container-runner.d.ts.map +1 -1
  45. package/dist/container-runner.js +12 -2
  46. package/dist/container-runner.js.map +1 -1
  47. package/dist/discord.basic.test.d.ts +2 -0
  48. package/dist/discord.basic.test.d.ts.map +1 -0
  49. package/dist/discord.basic.test.js +38 -0
  50. package/dist/discord.basic.test.js.map +1 -0
  51. package/dist/discord.d.ts +5 -44
  52. package/dist/discord.d.ts.map +1 -1
  53. package/dist/discord.js +3 -1468
  54. package/dist/discord.js.map +1 -1
  55. package/dist/gateway-service.d.ts +7 -1
  56. package/dist/gateway-service.d.ts.map +1 -1
  57. package/dist/gateway-service.js +111 -2
  58. package/dist/gateway-service.js.map +1 -1
  59. package/dist/gateway-service.media-routing.test.d.ts +2 -0
  60. package/dist/gateway-service.media-routing.test.d.ts.map +1 -0
  61. package/dist/gateway-service.media-routing.test.js +29 -0
  62. package/dist/gateway-service.media-routing.test.js.map +1 -0
  63. package/dist/gateway-types.d.ts +8 -0
  64. package/dist/gateway-types.d.ts.map +1 -1
  65. package/dist/gateway-types.js.map +1 -1
  66. package/dist/gateway.js +5 -2
  67. package/dist/gateway.js.map +1 -1
  68. package/dist/health.d.ts.map +1 -1
  69. package/dist/health.js +1 -1
  70. package/dist/health.js.map +1 -1
  71. package/dist/heartbeat.d.ts.map +1 -1
  72. package/dist/heartbeat.js +2 -0
  73. package/dist/heartbeat.js.map +1 -1
  74. package/dist/token-efficiency.basic.test.d.ts +2 -0
  75. package/dist/token-efficiency.basic.test.d.ts.map +1 -0
  76. package/dist/token-efficiency.basic.test.js +29 -0
  77. package/dist/token-efficiency.basic.test.js.map +1 -0
  78. package/dist/token-efficiency.d.ts.map +1 -1
  79. package/dist/token-efficiency.js +18 -1
  80. package/dist/token-efficiency.js.map +1 -1
  81. package/dist/types.d.ts +23 -1
  82. package/dist/types.d.ts.map +1 -1
  83. package/package.json +10 -2
  84. package/src/agent.ts +11 -1
  85. package/src/channels/discord/attachments.ts +282 -0
  86. package/src/channels/discord/delivery.ts +99 -0
  87. package/src/channels/discord/inbound.ts +78 -0
  88. package/src/channels/discord/mentions.ts +130 -0
  89. package/src/{discord.ts → channels/discord/runtime.ts} +164 -633
  90. package/src/{discord-stream.ts → channels/discord/stream.ts} +2 -2
  91. package/src/channels/discord/tool-actions.ts +332 -0
  92. package/src/config.ts +6 -0
  93. package/src/container-runner.ts +24 -1
  94. package/src/gateway-service.ts +220 -1
  95. package/src/gateway-types.ts +8 -0
  96. package/src/gateway.ts +5 -5
  97. package/src/health.ts +2 -1
  98. package/src/heartbeat.ts +2 -0
  99. package/src/runtime-config.ts +77 -0
  100. package/src/token-efficiency.ts +17 -1
  101. package/src/types.ts +27 -1
  102. package/tests/discord.basic.test.ts +95 -0
  103. package/tests/gateway-service.media-routing.test.ts +33 -0
  104. package/tests/token-efficiency.basic.test.ts +32 -0
  105. package/vitest.e2e.config.ts +15 -0
  106. package/vitest.integration.config.ts +15 -0
  107. package/vitest.live.config.ts +16 -0
  108. package/vitest.unit.config.ts +15 -0
package/CHANGELOG.md CHANGED
@@ -8,6 +8,44 @@
8
8
 
9
9
  ### Fixed
10
10
 
11
+ ## [0.2.3](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.3)
12
+
13
+ ### Added
14
+
15
+ - **Discord channel policy config**: Added typed runtime config support for `discord.groupPolicy` (`open`/`allowlist`/`disabled`), `discord.freeResponseChannels`, and per-guild/per-channel mode overrides at `discord.guilds.<guildId>.channels.<channelId>.mode`.
16
+ - **Discord channel mode slash command**: Added `/channel-mode` with `off`, `mention`, and `free` options to set the active guild channel behavior directly from Discord.
17
+ - **Gateway channel control commands**: Added `channel mode` and `channel policy` command flows for inspecting/updating Discord channel response behavior via `!claw` commands.
18
+
19
+ ### Changed
20
+
21
+ - **Discord trigger enforcement**: Guild message handling now applies channel mode + group policy before normal trigger checks, while still allowing prefixed commands in disabled channels.
22
+ - **Activation/status labeling**: Runtime status output now reflects `disabled`/`allowlist`/mixed free-channel activation modes instead of only legacy mention/all-messages labels.
23
+
24
+ ### Fixed
25
+
26
+ ## [0.2.2](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.2)
27
+
28
+ ### Added
29
+
30
+ - **Discord image attachment ingest/cache**: Added receive-time image ingest with local cache under `data/discord-media-cache`, preserving attachment order and carrying `path`, `mimeType`, `sizeBytes`, and `originalUrl` per media item.
31
+ - **Structured media context pipeline**: Added typed media payload (`MediaPaths`/`MediaUrls`/`MediaTypes` equivalents) from Discord runtime through gateway/container request boundaries.
32
+ - **Attachment vision tools**: Added `vision_analyze` (and `image` alias) for Discord-uploaded image analysis using local cached paths first, with Discord CDN URL fallback.
33
+ - **Native multimodal injection**: Added direct image-part injection for vision-capable models, with automatic retry without image parts if the model rejects multimodal payloads.
34
+ - **Scoped Vitest test configs**: Added dedicated `vitest.{unit,integration,e2e,live}.config.ts` files and matching npm scripts (`test:unit`, `test:integration`, `test:e2e`, `test:live`, `test:watch`) for explicit suite boundaries.
35
+
36
+ ### Changed
37
+
38
+ - **Discord channel module layout**: Completed migration of Discord runtime internals into `src/channels/discord/*`, including `runtime.ts` and `stream.ts`, and removed legacy root-level `src/discord.ts` shim.
39
+ - **Image-question tool routing**: Discord image questions now prioritize attachment vision (`vision_analyze`) and block `browser_vision` unless the user explicitly asks about the active browser tab/page.
40
+ - **Browser vision scope guidance**: Updated `browser_vision` tool description to clarify it is for browser-page tasks only, not Discord-uploaded files.
41
+ - **Test runner strategy**: Switched from compiled test artifacts (`dist-tests` + `tsconfig.tests.json`) to direct TypeScript execution via Vitest.
42
+ - **Test file location and conventions**: Moved basic test files from `src/*.test.ts` to `tests/` and aligned naming/scoping conventions for unit/integration/e2e/live suites.
43
+
44
+ ### Fixed
45
+
46
+ - **Discord image analysis fallback behavior**: Added safer cache/CDN fallback handling and guardrails (Discord CDN allowlist, size/type limits, per-image success/failure logging) to avoid brittle image-analysis failures.
47
+ - **Regression coverage for wrong vision tool selection**: Added basic regression test coverage that Discord image questions should not route to browser screenshot vision.
48
+
11
49
  ## [0.2.1](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.1)
12
50
 
13
51
  ### Added
package/README.md CHANGED
@@ -11,15 +11,15 @@ npm install -g @hybridaione/hybridclaw
11
11
  hybridclaw onboarding
12
12
  ```
13
13
 
14
- Latest release: [v0.2.1](https://github.com/HybridAIOne/hybridclaw/releases/tag/v0.2.1)
14
+ Latest release: [v0.2.3](https://github.com/HybridAIOne/hybridclaw/releases/tag/v0.2.3)
15
15
 
16
- ## What's new in v0.2.1
16
+ ## What's new in v0.2.3
17
17
 
18
- - Added OpenClaw-style Discord `message` tool actions (`read`, `member-info`, `channel-info`) to the container runtime
19
- - Added gateway endpoint `POST /api/discord/action` for Discord context lookups from tools
20
- - Replaced prompt-time Discord presence snapshots with cache-backed `member-info` presence fields (`status`, `activities`)
21
- - Routed Discord context lookups through gateway API from container with host remapping and token propagation
22
- - Enabled `message` tool in heartbeat and base subagent allowlists
18
+ - Added Discord guild channel policy controls with typed config: `discord.groupPolicy`, `discord.freeResponseChannels`, and `discord.guilds.<guildId>.channels.<channelId>.mode`
19
+ - Added `/channel-mode` slash command to switch a channel between `off`, `mention`, and `free`
20
+ - Added `!claw channel mode` and `!claw channel policy` command flows for in-chat policy changes
21
+ - Enforced channel mode/policy in Discord trigger logic while keeping prefixed commands available
22
+ - Updated status/activation labeling to reflect allowlist/disabled/mixed channel policy modes
23
23
 
24
24
  ## HybridAI Advantage
25
25
 
@@ -106,6 +106,10 @@ HybridClaw uses typed runtime config in `config.json` (auto-created on first run
106
106
  - `discord.respondToAllMessages` changes guild trigger behavior: `false` (default) replies only on mention/`!claw`; `true` replies to every user message in the channel
107
107
  - `discord.commandUserId` restricts `!claw <command>` admin commands to a single Discord user ID (all other messages still use normal chat handling)
108
108
  - `discord.commandsOnly` optional hard mode: if `true`, the bot ignores non-`!claw` messages and only accepts prefixed commands (optionally limited by `discord.commandUserId`)
109
+ - `discord.groupPolicy` controls guild channel scope: `open` (default), `allowlist`, or `disabled`
110
+ - `discord.freeResponseChannels` is a Hermes-style channel ID list that gets free-response behavior while other channels remain mention-gated
111
+ - `discord.guilds.<guildId>.channels.<channelId>.mode` sets per-channel behavior to `off`, `mention`, or `free` (works with `allowlist` policy)
112
+ - Discord slash commands: `/status` and `/channel-mode <off|mention|free>` (ephemeral replies)
109
113
  - `skills.extraDirs` adds additional enterprise/shared skill roots (lowest precedence tier)
110
114
  - `proactive.*` controls autonomous behavior (`activeHours`, `delegation`, `autoRetry`, `ralph`)
111
115
  - `proactive.ralph.maxIterations` enables Ralph loop (`0` off, `-1` unlimited, `>0` extra autonomous iterations before forcing completion)
@@ -349,6 +353,34 @@ System prompt assembly is handled by a formal hook pipeline:
349
353
 
350
354
  Hook toggles live in `config.json` under `promptHooks`.
351
355
 
356
+ ## Testing
357
+
358
+ Run checks locally:
359
+
360
+ ```bash
361
+ # Typecheck only (no emit)
362
+ npm run typecheck
363
+
364
+ # Strict TS lint gate (unused locals/params)
365
+ npm run lint
366
+
367
+ # Unit tests (default `npm test`)
368
+ npm run test:unit
369
+
370
+ # Scoped suites (ready for dedicated tests)
371
+ npm run test:integration
372
+ npm run test:e2e
373
+ npm run test:live
374
+ ```
375
+
376
+ Test layout and scopes:
377
+
378
+ - tests live under `tests/` (not `src/`)
379
+ - unit tests: `tests/**/*.test.ts` (excluding `*.integration|*.e2e|*.live`)
380
+ - integration tests: `tests/**/*.integration.test.ts`
381
+ - e2e tests: `tests/**/*.e2e.test.ts`
382
+ - live tests: `tests/**/*.live.test.ts`
383
+
352
384
  ## Commands
353
385
 
354
386
  CLI runtime commands:
@@ -379,12 +411,14 @@ In Discord, use `!claw help` to see all commands. Key ones:
379
411
  ## Project structure
380
412
 
381
413
  ```
382
- src/gateway.ts Core runtime entrypoint (DB, scheduler, heartbeat, HTTP API)
383
- src/tui.ts Terminal adapter (thin client to gateway)
384
- src/discord.ts Discord integration and message transport
385
- src/gateway-service.ts Core shared agent/session logic used by gateway API
386
- src/gateway-client.ts HTTP client used by thin clients (e.g. TUI)
387
- container/src/ Agent code (tools, HybridAI client, IPC)
388
- templates/ Workspace bootstrap files
389
- data/ Runtime data (gitignored): SQLite DB, sessions, agent workspaces
414
+ src/gateway.ts Core runtime entrypoint (DB, scheduler, heartbeat, HTTP API)
415
+ src/tui.ts Terminal adapter (thin client to gateway)
416
+ src/channels/discord/runtime.ts Discord runtime integration and message transport
417
+ src/channels/discord/*.ts Discord responsibility modules (inbound, delivery, mentions, attachments, tools, stream)
418
+ src/gateway-service.ts Core shared agent/session logic used by gateway API
419
+ src/gateway-client.ts HTTP client used by thin clients (e.g. TUI)
420
+ tests/ Vitest suites (unit/integration/e2e/live scopes)
421
+ container/src/ Agent code (tools, HybridAI client, IPC)
422
+ templates/ Workspace bootstrap files
423
+ data/ Runtime data (gitignored): SQLite DB, sessions, agent workspaces
390
424
  ```
package/config.example.json CHANGED
@@ -15,7 +15,10 @@
15
15
  "presenceIntent": false,
16
16
  "respondToAllMessages": false,
17
17
  "commandsOnly": false,
18
- "commandUserId": ""
18
+ "commandUserId": "",
19
+ "groupPolicy": "open",
20
+ "freeResponseChannels": [],
21
+ "guilds": {}
19
22
  },
20
23
  "hybridai": {
21
24
  "baseUrl": "https://hybridai.one",
package/container/package-lock.json CHANGED
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "hybridclaw-agent",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "hybridclaw-agent",
9
- "version": "0.2.1",
9
+ "version": "0.2.3",
10
10
  "dependencies": {
11
11
  "@mozilla/readability": "^0.6.0",
12
12
  "agent-browser": "^0.15.1",
package/container/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "hybridclaw-agent",
3
- "version": "0.2.1",
3
+ "version": "0.2.3",
4
4
  "type": "module",
5
5
  "scripts": {
6
6
  "build": "tsc",
package/container/src/browser-tools.ts CHANGED
@@ -1199,7 +1199,7 @@ export const BROWSER_TOOL_DEFINITIONS: ToolDefinition[] = [
1199
1199
  function: {
1200
1200
  name: 'browser_vision',
1201
1201
  description:
1202
- 'Capture a screenshot and analyze it with a vision model using the provided question.',
1202
+ 'Capture the current browser page screenshot and analyze it with a vision model. Use only for active browser-tab/page tasks, not for Discord-uploaded files.',
1203
1203
  parameters: {
1204
1204
  type: 'object',
1205
1205
  properties: {
package/container/src/index.ts CHANGED
@@ -1,4 +1,6 @@
1
1
  import path from 'path';
2
+ import fs from 'fs';
3
+ import { URL } from 'url';
2
4
 
3
5
  import { emitRuntimeEvent, runAfterToolHooks, runBeforeToolHooks } from './extensions.js';
4
6
  import { callHybridAI, callHybridAIStream, HybridAIRequestError } from './hybridai-client.js';
@@ -15,12 +17,23 @@ import {
15
17
  getPendingSideEffects,
16
18
  resetSideEffects,
17
19
  setGatewayContext,
20
+ setMediaContext,
18
21
  setModelContext,
19
22
  setScheduledTasks,
20
23
  setSessionContext,
21
24
  TOOL_DEFINITIONS,
22
25
  } from './tools.js';
23
- import type { ArtifactMetadata, ChatMessage, ContainerInput, ContainerOutput, ToolDefinition, ToolExecution } from './types.js';
26
+ import type {
27
+ ArtifactMetadata,
28
+ ChatContentPart,
29
+ ChatMessage,
30
+ ChatMessageContent,
31
+ ContainerInput,
32
+ ContainerOutput,
33
+ MediaContextItem,
34
+ ToolDefinition,
35
+ ToolExecution,
36
+ } from './types.js';
24
37
 
25
38
  const MAX_ITERATIONS = 20;
26
39
  const IDLE_TIMEOUT_MS = parseInt(process.env.CONTAINER_IDLE_TIMEOUT || '300000', 10); // 5 min
@@ -45,10 +58,159 @@ const ARTIFACT_MIME_TYPES: Record<string, string> = {
45
58
  '.svg': 'image/svg+xml',
46
59
  '.webp': 'image/webp',
47
60
  };
61
+ const DISCORD_MEDIA_CACHE_ROOT = '/discord-media-cache';
62
+ const NATIVE_VISION_MAX_IMAGES = 8;
63
+ const NATIVE_VISION_MAX_IMAGE_BYTES = 10 * 1024 * 1024;
64
+ const DISCORD_CDN_HOST_PATTERNS: RegExp[] = [
65
+ /^cdn\.discordapp\.com$/i,
66
+ /^media\.discordapp\.net$/i,
67
+ /^cdn\.discordapp\.net$/i,
68
+ /^images-ext-\d+\.discordapp\.net$/i,
69
+ ];
48
70
 
49
71
  /** API key received once via stdin, held in memory for the container lifetime. */
50
72
  let storedApiKey = '';
51
73
 
74
+ function normalizeMessageContentToText(content: ChatMessageContent): string {
75
+ if (typeof content === 'string') return content;
76
+ if (!Array.isArray(content)) return '';
77
+ const chunks: string[] = [];
78
+ for (const part of content) {
79
+ if (!part || typeof part !== 'object') continue;
80
+ if (part.type !== 'text') continue;
81
+ if (typeof part.text !== 'string') continue;
82
+ if (part.text.trim()) chunks.push(part.text.trim());
83
+ }
84
+ return chunks.join('\n').trim();
85
+ }
86
+
87
+ function normalizePathSlashes(raw: string): string {
88
+ return raw.replace(/\\/g, '/');
89
+ }
90
+
91
+ function normalizeAllowedLocalImagePath(rawPath: string): string | null {
92
+ const trimmed = rawPath.trim();
93
+ if (!trimmed) return null;
94
+
95
+ const workspace = path.posix.normalize(WORKSPACE_ROOT);
96
+ const mediaRoot = path.posix.normalize(DISCORD_MEDIA_CACHE_ROOT);
97
+
98
+ const candidate = trimmed.startsWith('/')
99
+ ? path.posix.normalize(normalizePathSlashes(trimmed))
100
+ : path.posix.normalize(path.posix.join(workspace, normalizePathSlashes(trimmed)));
101
+
102
+ const underWorkspace = candidate === workspace || candidate.startsWith(`${workspace}/`);
103
+ const underMediaRoot = candidate === mediaRoot || candidate.startsWith(`${mediaRoot}/`);
104
+ if (!underWorkspace && !underMediaRoot) return null;
105
+ return candidate;
106
+ }
107
+
108
+ function inferImageMimeType(filePath: string, fallbackMime: string | null | undefined): string {
109
+ const normalizedFallback = String(fallbackMime || '').trim().toLowerCase();
110
+ if (normalizedFallback.startsWith('image/')) return normalizedFallback;
111
+ const ext = path.posix.extname(filePath).toLowerCase();
112
+ return ARTIFACT_MIME_TYPES[ext] || 'image/png';
113
+ }
114
+
115
+ function isSafeDiscordCdnUrl(raw: string): boolean {
116
+ let parsed: URL;
117
+ try {
118
+ parsed = new URL(raw);
119
+ } catch {
120
+ return false;
121
+ }
122
+ if (parsed.protocol !== 'https:') return false;
123
+ return DISCORD_CDN_HOST_PATTERNS.some((pattern) => pattern.test(parsed.hostname));
124
+ }
125
+
126
+ function modelSupportsNativeVision(model: string): boolean {
127
+ const normalized = model.toLowerCase();
128
+ if (!normalized) return false;
129
+ if (
130
+ normalized.includes('gpt-5')
131
+ || normalized.includes('gpt-4o')
132
+ || normalized.includes('gpt-4.1')
133
+ || normalized.includes('o1')
134
+ || normalized.includes('o3')
135
+ || normalized.includes('vision')
136
+ || normalized.includes('multimodal')
137
+ || normalized.includes('gemini')
138
+ || normalized.includes('claude-3')
139
+ ) {
140
+ return true;
141
+ }
142
+ return false;
143
+ }
144
+
145
+ async function resolveMediaImagePartUrl(item: MediaContextItem): Promise<string | null> {
146
+ const localPath = item.path ? normalizeAllowedLocalImagePath(item.path) : null;
147
+ if (localPath) {
148
+ try {
149
+ const image = await fs.promises.readFile(localPath);
150
+ if (image.length > NATIVE_VISION_MAX_IMAGE_BYTES) {
151
+ console.error(`[media] skipping ${localPath}: ${image.length}B exceeds native vision max`);
152
+ } else {
153
+ const mimeType = inferImageMimeType(localPath, item.mimeType);
154
+ const base64 = image.toString('base64');
155
+ return `data:${mimeType};base64,${base64}`;
156
+ }
157
+ } catch (err) {
158
+ console.error(`[media] failed to read local media ${localPath}: ${err instanceof Error ? err.message : String(err)}`);
159
+ }
160
+ }
161
+
162
+ const fallbackCandidates = [item.url, item.originalUrl].map((value) => String(value || '').trim()).filter(Boolean);
163
+ for (const candidate of fallbackCandidates) {
164
+ if (!isSafeDiscordCdnUrl(candidate)) continue;
165
+ return candidate;
166
+ }
167
+ return null;
168
+ }
169
+
170
+ async function injectNativeVisionContent(
171
+ messages: ChatMessage[],
172
+ model: string,
173
+ media: MediaContextItem[] | undefined,
174
+ ): Promise<ChatMessage[]> {
175
+ if (!Array.isArray(media) || media.length === 0) return messages;
176
+ if (!modelSupportsNativeVision(model)) return messages;
177
+
178
+ const mediaSlice = media.slice(0, NATIVE_VISION_MAX_IMAGES);
179
+ const imageParts: ChatContentPart[] = [];
180
+ for (const item of mediaSlice) {
181
+ const url = await resolveMediaImagePartUrl(item);
182
+ if (!url) continue;
183
+ imageParts.push({ type: 'image_url', image_url: { url } });
184
+ }
185
+ if (imageParts.length === 0) return messages;
186
+
187
+ const latestUserIndex = (() => {
188
+ for (let i = messages.length - 1; i >= 0; i -= 1) {
189
+ if (messages[i].role === 'user') return i;
190
+ }
191
+ return -1;
192
+ })();
193
+ if (latestUserIndex < 0) return messages;
194
+
195
+ const cloned = messages.map((msg) => ({ ...msg }));
196
+ const existingText = normalizeMessageContentToText(cloned[latestUserIndex].content);
197
+ const contentParts: ChatContentPart[] = [];
198
+ const nativeVisionHint =
199
+ '[NativeVision] Image parts are attached in this message. Analyze them directly and skip extra vision tool pre-analysis unless explicitly required.';
200
+ if (existingText) {
201
+ contentParts.push({ type: 'text', text: `${existingText}\n\n${nativeVisionHint}` });
202
+ } else {
203
+ contentParts.push({ type: 'text', text: nativeVisionHint });
204
+ }
205
+ contentParts.push(...imageParts);
206
+ cloned[latestUserIndex] = {
207
+ ...cloned[latestUserIndex],
208
+ content: contentParts,
209
+ };
210
+ console.error(`[media] injected ${imageParts.length} native vision image part(s) for model ${model}`);
211
+ return cloned;
212
+ }
213
+
52
214
  /**
53
215
  * Read a single line from stdin (the initial request JSON containing secrets).
54
216
  * Resolves on the first newline — does not consume the entire stream, so docker -i
@@ -104,20 +266,21 @@ function latestUserPrompt(messages: ChatMessage[]): string {
104
266
  for (let i = messages.length - 1; i >= 0; i--) {
105
267
  const message = messages[i];
106
268
  if (message.role !== 'user') continue;
107
- const text = String(message.content || '').replace(/\s+/g, ' ').trim();
269
+ const text = normalizeMessageContentToText(message.content).replace(/\s+/g, ' ').trim();
108
270
  if (!text) continue;
109
271
  return text.slice(0, 1_200);
110
272
  }
111
273
  return 'Continue the task';
112
274
  }
113
275
 
114
- function parseRalphChoice(content: string | null): 'CONTINUE' | 'STOP' | null {
115
- if (!content) return null;
276
+ function parseRalphChoice(content: ChatMessageContent): 'CONTINUE' | 'STOP' | null {
277
+ const normalizedContent = normalizeMessageContentToText(content);
278
+ if (!normalizedContent) return null;
116
279
  const re = /<choice>\s*([^<]*)\s*<\/choice>/gi;
117
280
  let match: RegExpExecArray | null = null;
118
281
  let lastChoice: string | null = null;
119
282
  while (true) {
120
- match = re.exec(content);
283
+ match = re.exec(normalizedContent);
121
284
  if (!match) break;
122
285
  lastChoice = (match[1] || '').trim().toUpperCase();
123
286
  }
@@ -125,13 +288,14 @@ function parseRalphChoice(content: string | null): 'CONTINUE' | 'STOP' | null {
125
288
  return null;
126
289
  }
127
290
 
128
- function stripRalphChoiceTags(content: string | null): string | null {
129
- if (content == null) return content;
130
- const stripped = content
291
+ function stripRalphChoiceTags(content: ChatMessageContent): string | null {
292
+ const normalizedContent = normalizeMessageContentToText(content);
293
+ if (!normalizedContent) return null;
294
+ const stripped = normalizedContent
131
295
  .replace(/<choice>\s*[^<]*\s*<\/choice>/gi, '')
132
296
  .replace(/\n{3,}/g, '\n\n')
133
297
  .trim();
134
- return stripped || content;
298
+ return stripped || normalizedContent;
135
299
  }
136
300
 
137
301
  function buildRalphPrompt(taskPrompt: string, missingChoice: boolean): string {
@@ -487,14 +651,35 @@ async function processRequest(
487
651
  * Main loop: read first request from stdin (with secrets), then poll IPC for follow-ups.
488
652
  */
489
653
  function resolveTools(input: ContainerInput): ToolDefinition[] {
490
- const tools = input.allowedTools
654
+ let tools = input.allowedTools
491
655
  ? TOOL_DEFINITIONS.filter((t) => input.allowedTools!.includes(t.function.name))
492
656
  : [...TOOL_DEFINITIONS];
657
+ if (Array.isArray(input.blockedTools) && input.blockedTools.length > 0) {
658
+ const blocked = new Set(
659
+ input.blockedTools
660
+ .map((name) => String(name || '').trim())
661
+ .filter(Boolean),
662
+ );
663
+ tools = tools.filter((tool) => !blocked.has(tool.function.name));
664
+ }
493
665
  // Sort alphabetically for deterministic system-prompt ordering (KV cache stability)
494
666
  tools.sort((a, b) => a.function.name.localeCompare(b.function.name));
495
667
  return tools;
496
668
  }
497
669
 
670
+ function shouldRetryWithoutNativeVision(error: string | undefined): boolean {
671
+ const normalized = String(error || '').toLowerCase();
672
+ if (!normalized) return false;
673
+ return (
674
+ normalized.includes('image_url')
675
+ || normalized.includes('unsupported image')
676
+ || normalized.includes('unsupported content')
677
+ || normalized.includes('vision')
678
+ || normalized.includes('multimodal')
679
+ || normalized.includes('content part')
680
+ );
681
+ }
682
+
498
683
  async function main(): Promise<void> {
499
684
  console.error(`[hybridclaw-agent] started, idle timeout ${IDLE_TIMEOUT_MS}ms`);
500
685
 
@@ -510,9 +695,15 @@ async function main(): Promise<void> {
510
695
  setSessionContext(firstInput.sessionId);
511
696
  setGatewayContext(firstInput.gatewayBaseUrl, firstInput.gatewayApiToken, firstInput.channelId);
512
697
  setModelContext(firstInput.baseUrl, storedApiKey, firstInput.model, firstInput.chatbotId);
513
-
514
- const firstOutput = await processRequest(
698
+ setMediaContext(firstInput.media);
699
+ const firstMessages = await injectNativeVisionContent(
515
700
  firstInput.messages,
701
+ firstInput.model,
702
+ firstInput.media,
703
+ );
704
+
705
+ let firstOutput = await processRequest(
706
+ firstMessages,
516
707
  storedApiKey,
517
708
  firstInput.baseUrl,
518
709
  firstInput.model,
@@ -520,6 +711,22 @@ async function main(): Promise<void> {
520
711
  firstInput.enableRag,
521
712
  resolveTools(firstInput),
522
713
  );
714
+ if (
715
+ firstMessages !== firstInput.messages
716
+ && firstOutput.status === 'error'
717
+ && shouldRetryWithoutNativeVision(firstOutput.error)
718
+ ) {
719
+ console.error('[media] native vision injection rejected by model; retrying without image parts');
720
+ firstOutput = await processRequest(
721
+ firstInput.messages,
722
+ storedApiKey,
723
+ firstInput.baseUrl,
724
+ firstInput.model,
725
+ firstInput.chatbotId,
726
+ firstInput.enableRag,
727
+ resolveTools(firstInput),
728
+ );
729
+ }
523
730
 
524
731
  firstOutput.sideEffects = getPendingSideEffects();
525
732
  writeOutput(firstOutput);
@@ -544,9 +751,15 @@ async function main(): Promise<void> {
544
751
  setSessionContext(input.sessionId);
545
752
  setGatewayContext(input.gatewayBaseUrl, input.gatewayApiToken, input.channelId);
546
753
  setModelContext(input.baseUrl, apiKey, input.model, input.chatbotId);
547
-
548
- const output = await processRequest(
754
+ setMediaContext(input.media);
755
+ const preparedMessages = await injectNativeVisionContent(
549
756
  input.messages,
757
+ input.model,
758
+ input.media,
759
+ );
760
+
761
+ let output = await processRequest(
762
+ preparedMessages,
550
763
  apiKey,
551
764
  input.baseUrl,
552
765
  input.model,
@@ -554,6 +767,22 @@ async function main(): Promise<void> {
554
767
  input.enableRag,
555
768
  resolveTools(input),
556
769
  );
770
+ if (
771
+ preparedMessages !== input.messages
772
+ && output.status === 'error'
773
+ && shouldRetryWithoutNativeVision(output.error)
774
+ ) {
775
+ console.error('[media] native vision injection rejected by model; retrying without image parts');
776
+ output = await processRequest(
777
+ input.messages,
778
+ apiKey,
779
+ input.baseUrl,
780
+ input.model,
781
+ input.chatbotId,
782
+ input.enableRag,
783
+ resolveTools(input),
784
+ );
785
+ }
557
786
 
558
787
  output.sideEffects = getPendingSideEffects();
559
788
  writeOutput(output);
package/container/src/token-usage.ts CHANGED
@@ -26,12 +26,28 @@ export function createTokenUsageStats(): TokenUsageStats {
26
26
  };
27
27
  }
28
28
 
29
- export function estimateTextTokens(text: string | null | undefined): number {
29
+ export function estimateTextTokens(text: unknown): number {
30
30
  const normalized = typeof text === 'string' ? text : '';
31
31
  if (!normalized) return 0;
32
32
  return Math.max(1, Math.ceil(normalized.length / CHARS_PER_TOKEN));
33
33
  }
34
34
 
35
+ function normalizeContentText(content: ChatMessage['content']): string {
36
+ if (typeof content === 'string') return content;
37
+ if (!Array.isArray(content)) return '';
38
+ const chunks: string[] = [];
39
+ for (const part of content) {
40
+ if (part?.type === 'text' && typeof part.text === 'string') {
41
+ chunks.push(part.text);
42
+ continue;
43
+ }
44
+ if (part?.type === 'image_url' && part.image_url?.url) {
45
+ chunks.push('[image]');
46
+ }
47
+ }
48
+ return chunks.join('\n');
49
+ }
50
+
35
51
  export function estimateMessageTokens(messages: ChatMessage[]): number {
36
52
  if (!Array.isArray(messages) || messages.length === 0) return 0;
37
53
 
@@ -39,7 +55,7 @@ export function estimateMessageTokens(messages: ChatMessage[]): number {
39
55
  for (const message of messages) {
40
56
  total += 4;
41
57
  total += estimateTextTokens(message.role);
42
- total += estimateTextTokens(message.content);
58
+ total += estimateTextTokens(normalizeContentText(message.content));
43
59
  if (message.tool_calls) total += estimateTextTokens(JSON.stringify(message.tool_calls));
44
60
  if (message.tool_call_id) total += estimateTextTokens(message.tool_call_id);
45
61
  }