@hybridaione/hybridclaw 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/CHANGELOG.md +23 -0
  2. package/README.md +47 -15
  3. package/container/package-lock.json +2 -2
  4. package/container/package.json +1 -1
  5. package/container/src/browser-tools.ts +1 -1
  6. package/container/src/index.ts +243 -14
  7. package/container/src/token-usage.ts +18 -2
  8. package/container/src/tools.ts +339 -1
  9. package/container/src/types.ts +28 -2
  10. package/dist/agent.d.ts +2 -2
  11. package/dist/agent.d.ts.map +1 -1
  12. package/dist/agent.js +2 -2
  13. package/dist/agent.js.map +1 -1
  14. package/dist/channels/discord/attachments.d.ts +9 -0
  15. package/dist/channels/discord/attachments.d.ts.map +1 -0
  16. package/dist/channels/discord/attachments.js +245 -0
  17. package/dist/channels/discord/attachments.js.map +1 -0
  18. package/dist/channels/discord/delivery.d.ts +31 -0
  19. package/dist/channels/discord/delivery.d.ts.map +1 -0
  20. package/dist/channels/discord/delivery.js +60 -0
  21. package/dist/channels/discord/delivery.js.map +1 -0
  22. package/dist/channels/discord/inbound.d.ts +20 -0
  23. package/dist/channels/discord/inbound.d.ts.map +1 -0
  24. package/dist/channels/discord/inbound.js +44 -0
  25. package/dist/channels/discord/inbound.js.map +1 -0
  26. package/dist/channels/discord/mentions.d.ts +14 -0
  27. package/dist/channels/discord/mentions.d.ts.map +1 -0
  28. package/dist/channels/discord/mentions.js +118 -0
  29. package/dist/channels/discord/mentions.js.map +1 -0
  30. package/dist/channels/discord/runtime.d.ts +22 -0
  31. package/dist/channels/discord/runtime.d.ts.map +1 -0
  32. package/dist/channels/discord/runtime.js +972 -0
  33. package/dist/channels/discord/runtime.js.map +1 -0
  34. package/dist/channels/discord/stream.d.ts +32 -0
  35. package/dist/channels/discord/stream.d.ts.map +1 -0
  36. package/dist/channels/discord/stream.js +196 -0
  37. package/dist/channels/discord/stream.js.map +1 -0
  38. package/dist/channels/discord/tool-actions.d.ts +31 -0
  39. package/dist/channels/discord/tool-actions.d.ts.map +1 -0
  40. package/dist/channels/discord/tool-actions.js +268 -0
  41. package/dist/channels/discord/tool-actions.js.map +1 -0
  42. package/dist/container-runner.d.ts +2 -2
  43. package/dist/container-runner.d.ts.map +1 -1
  44. package/dist/container-runner.js +12 -2
  45. package/dist/container-runner.js.map +1 -1
  46. package/dist/discord.basic.test.d.ts +2 -0
  47. package/dist/discord.basic.test.d.ts.map +1 -0
  48. package/dist/discord.basic.test.js +38 -0
  49. package/dist/discord.basic.test.js.map +1 -0
  50. package/dist/discord.d.ts +5 -44
  51. package/dist/discord.d.ts.map +1 -1
  52. package/dist/discord.js +3 -1468
  53. package/dist/discord.js.map +1 -1
  54. package/dist/gateway-service.d.ts +7 -1
  55. package/dist/gateway-service.d.ts.map +1 -1
  56. package/dist/gateway-service.js +111 -2
  57. package/dist/gateway-service.js.map +1 -1
  58. package/dist/gateway-service.media-routing.test.d.ts +2 -0
  59. package/dist/gateway-service.media-routing.test.d.ts.map +1 -0
  60. package/dist/gateway-service.media-routing.test.js +29 -0
  61. package/dist/gateway-service.media-routing.test.js.map +1 -0
  62. package/dist/gateway-types.d.ts +8 -0
  63. package/dist/gateway-types.d.ts.map +1 -1
  64. package/dist/gateway-types.js.map +1 -1
  65. package/dist/gateway.js +5 -2
  66. package/dist/gateway.js.map +1 -1
  67. package/dist/health.d.ts.map +1 -1
  68. package/dist/health.js +1 -1
  69. package/dist/health.js.map +1 -1
  70. package/dist/heartbeat.d.ts.map +1 -1
  71. package/dist/heartbeat.js +2 -0
  72. package/dist/heartbeat.js.map +1 -1
  73. package/dist/token-efficiency.basic.test.d.ts +2 -0
  74. package/dist/token-efficiency.basic.test.d.ts.map +1 -0
  75. package/dist/token-efficiency.basic.test.js +29 -0
  76. package/dist/token-efficiency.basic.test.js.map +1 -0
  77. package/dist/token-efficiency.d.ts.map +1 -1
  78. package/dist/token-efficiency.js +18 -1
  79. package/dist/token-efficiency.js.map +1 -1
  80. package/dist/types.d.ts +23 -1
  81. package/dist/types.d.ts.map +1 -1
  82. package/package.json +10 -2
  83. package/src/agent.ts +11 -1
  84. package/src/channels/discord/attachments.ts +282 -0
  85. package/src/channels/discord/delivery.ts +99 -0
  86. package/src/channels/discord/inbound.ts +72 -0
  87. package/src/channels/discord/mentions.ts +130 -0
  88. package/src/{discord.ts → channels/discord/runtime.ts} +77 -615
  89. package/src/{discord-stream.ts → channels/discord/stream.ts} +2 -2
  90. package/src/channels/discord/tool-actions.ts +332 -0
  91. package/src/container-runner.ts +24 -1
  92. package/src/gateway-service.ts +125 -1
  93. package/src/gateway-types.ts +8 -0
  94. package/src/gateway.ts +5 -5
  95. package/src/health.ts +2 -1
  96. package/src/heartbeat.ts +2 -0
  97. package/src/token-efficiency.ts +17 -1
  98. package/src/types.ts +27 -1
  99. package/tests/discord.basic.test.ts +43 -0
  100. package/tests/gateway-service.media-routing.test.ts +33 -0
  101. package/tests/token-efficiency.basic.test.ts +32 -0
  102. package/vitest.e2e.config.ts +15 -0
  103. package/vitest.integration.config.ts +15 -0
  104. package/vitest.live.config.ts +16 -0
  105. package/vitest.unit.config.ts +15 -0
package/CHANGELOG.md CHANGED
@@ -8,6 +8,29 @@
8
8
 
9
9
  ### Fixed
10
10
 
11
+ ## [0.2.2](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.2)
12
+
13
+ ### Added
14
+
15
+ - **Discord image attachment ingest/cache**: Added receive-time image ingest with local cache under `data/discord-media-cache`, preserving attachment order and carrying `path`, `mimeType`, `sizeBytes`, and `originalUrl` per media item.
16
+ - **Structured media context pipeline**: Added typed media payload (`MediaPaths`/`MediaUrls`/`MediaTypes` equivalents) from Discord runtime through gateway/container request boundaries.
17
+ - **Attachment vision tools**: Added `vision_analyze` (and `image` alias) for Discord-uploaded image analysis using local cached paths first, with Discord CDN URL fallback.
18
+ - **Native multimodal injection**: Added direct image-part injection for vision-capable models, with automatic retry without image parts if the model rejects multimodal payloads.
19
+ - **Scoped Vitest test configs**: Added dedicated `vitest.{unit,integration,e2e,live}.config.ts` files and matching npm scripts (`test:unit`, `test:integration`, `test:e2e`, `test:live`, `test:watch`) for explicit suite boundaries.
20
+
21
+ ### Changed
22
+
23
+ - **Discord channel module layout**: Completed migration of Discord runtime internals into `src/channels/discord/*`, including `runtime.ts` and `stream.ts`, and removed legacy root-level `src/discord.ts` shim.
24
+ - **Image-question tool routing**: Discord image questions now prioritize attachment vision (`vision_analyze`) and block `browser_vision` unless the user explicitly asks about the active browser tab/page.
25
+ - **Browser vision scope guidance**: Updated `browser_vision` tool description to clarify it is for browser-page tasks only, not Discord-uploaded files.
26
+ - **Test runner strategy**: Switched from compiled test artifacts (`dist-tests` + `tsconfig.tests.json`) to direct TypeScript execution via Vitest.
27
+ - **Test file location and conventions**: Moved basic test files from `src/*.test.ts` to `tests/` and aligned naming/scoping conventions for unit/integration/e2e/live suites.
28
+
29
+ ### Fixed
30
+
31
+ - **Discord image analysis fallback behavior**: Added safer cache/CDN fallback handling and guardrails (Discord CDN allowlist, size/type limits, per-image success/failure logging) to avoid brittle image-analysis failures.
32
+ - **Regression coverage for wrong vision tool selection**: Added basic regression test coverage that Discord image questions should not route to browser screenshot vision.
33
+
11
34
  ## [0.2.1](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.1)
12
35
 
13
36
  ### Added
package/README.md CHANGED
@@ -11,15 +11,17 @@ npm install -g @hybridaione/hybridclaw
11
11
  hybridclaw onboarding
12
12
  ```
13
13
 
14
- Latest release: [v0.2.1](https://github.com/HybridAIOne/hybridclaw/releases/tag/v0.2.1)
14
+ Latest release: [v0.2.2](https://github.com/HybridAIOne/hybridclaw/releases/tag/v0.2.2)
15
15
 
16
- ## What's new in v0.2.1
16
+ ## What's new in v0.2.2
17
17
 
18
- - Added OpenClaw-style Discord `message` tool actions (`read`, `member-info`, `channel-info`) to the container runtime
19
- - Added gateway endpoint `POST /api/discord/action` for Discord context lookups from tools
20
- - Replaced prompt-time Discord presence snapshots with cache-backed `member-info` presence fields (`status`, `activities`)
21
- - Routed Discord context lookups through gateway API from container with host remapping and token propagation
22
- - Enabled `message` tool in heartbeat and base subagent allowlists
18
+ - Added Discord attachment ingest/cache with structured media context (`path`, `mime`, `size`, `original_url`) passed into the agent pipeline
19
+ - Added `vision_analyze`/`image` tools for Discord-uploaded image analysis (local cached path first, Discord CDN fallback)
20
+ - Added native model vision image-part injection for vision-capable models, with safe fallback if multimodal input is rejected
21
+ - Routed Discord image questions away from `browser_vision` (unless explicitly about the active browser tab/page)
22
+ - Completed Discord runtime migration into `src/channels/discord/*` and removed the legacy root-level `src/discord.ts` shim
23
+ - Switched tests from compiled `dist-tests` artifacts to direct TypeScript execution via Vitest
24
+ - Moved basic tests to `tests/` with explicit scope naming conventions
23
25
 
24
26
  ## HybridAI Advantage
25
27
 
@@ -349,6 +351,34 @@ System prompt assembly is handled by a formal hook pipeline:
349
351
 
350
352
  Hook toggles live in `config.json` under `promptHooks`.
351
353
 
354
+ ## Testing
355
+
356
+ Run checks locally:
357
+
358
+ ```bash
359
+ # Typecheck only (no emit)
360
+ npm run typecheck
361
+
362
+ # Strict TS lint gate (unused locals/params)
363
+ npm run lint
364
+
365
+ # Unit tests (default `npm test`)
366
+ npm run test:unit
367
+
368
+ # Scoped suites (ready for dedicated tests)
369
+ npm run test:integration
370
+ npm run test:e2e
371
+ npm run test:live
372
+ ```
373
+
374
+ Test layout and scopes:
375
+
376
+ - tests live under `tests/` (not `src/`)
377
+ - unit tests: `tests/**/*.test.ts` (excluding `*.integration|*.e2e|*.live`)
378
+ - integration tests: `tests/**/*.integration.test.ts`
379
+ - e2e tests: `tests/**/*.e2e.test.ts`
380
+ - live tests: `tests/**/*.live.test.ts`
381
+
352
382
  ## Commands
353
383
 
354
384
  CLI runtime commands:
@@ -379,12 +409,14 @@ In Discord, use `!claw help` to see all commands. Key ones:
379
409
  ## Project structure
380
410
 
381
411
  ```
382
- src/gateway.ts Core runtime entrypoint (DB, scheduler, heartbeat, HTTP API)
383
- src/tui.ts Terminal adapter (thin client to gateway)
384
- src/discord.ts Discord integration and message transport
385
- src/gateway-service.ts Core shared agent/session logic used by gateway API
386
- src/gateway-client.ts HTTP client used by thin clients (e.g. TUI)
387
- container/src/ Agent code (tools, HybridAI client, IPC)
388
- templates/ Workspace bootstrap files
389
- data/ Runtime data (gitignored): SQLite DB, sessions, agent workspaces
412
+ src/gateway.ts Core runtime entrypoint (DB, scheduler, heartbeat, HTTP API)
413
+ src/tui.ts Terminal adapter (thin client to gateway)
414
+ src/channels/discord/runtime.ts Discord runtime integration and message transport
415
+ src/channels/discord/*.ts Discord responsibility modules (inbound, delivery, mentions, attachments, tools, stream)
416
+ src/gateway-service.ts Core shared agent/session logic used by gateway API
417
+ src/gateway-client.ts HTTP client used by thin clients (e.g. TUI)
418
+ tests/ Vitest suites (unit/integration/e2e/live scopes)
419
+ container/src/ Agent code (tools, HybridAI client, IPC)
420
+ templates/ Workspace bootstrap files
421
+ data/ Runtime data (gitignored): SQLite DB, sessions, agent workspaces
390
422
  ```
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "hybridclaw-agent",
3
- "version": "0.2.1",
3
+ "version": "0.2.2",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "hybridclaw-agent",
9
- "version": "0.2.1",
9
+ "version": "0.2.2",
10
10
  "dependencies": {
11
11
  "@mozilla/readability": "^0.6.0",
12
12
  "agent-browser": "^0.15.1",
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "hybridclaw-agent",
3
- "version": "0.2.1",
3
+ "version": "0.2.2",
4
4
  "type": "module",
5
5
  "scripts": {
6
6
  "build": "tsc",
@@ -1199,7 +1199,7 @@ export const BROWSER_TOOL_DEFINITIONS: ToolDefinition[] = [
1199
1199
  function: {
1200
1200
  name: 'browser_vision',
1201
1201
  description:
1202
- 'Capture a screenshot and analyze it with a vision model using the provided question.',
1202
+ 'Capture the current browser page screenshot and analyze it with a vision model. Use only for active browser-tab/page tasks, not for Discord-uploaded files.',
1203
1203
  parameters: {
1204
1204
  type: 'object',
1205
1205
  properties: {
@@ -1,4 +1,6 @@
1
1
  import path from 'path';
2
+ import fs from 'fs';
3
+ import { URL } from 'url';
2
4
 
3
5
  import { emitRuntimeEvent, runAfterToolHooks, runBeforeToolHooks } from './extensions.js';
4
6
  import { callHybridAI, callHybridAIStream, HybridAIRequestError } from './hybridai-client.js';
@@ -15,12 +17,23 @@ import {
15
17
  getPendingSideEffects,
16
18
  resetSideEffects,
17
19
  setGatewayContext,
20
+ setMediaContext,
18
21
  setModelContext,
19
22
  setScheduledTasks,
20
23
  setSessionContext,
21
24
  TOOL_DEFINITIONS,
22
25
  } from './tools.js';
23
- import type { ArtifactMetadata, ChatMessage, ContainerInput, ContainerOutput, ToolDefinition, ToolExecution } from './types.js';
26
+ import type {
27
+ ArtifactMetadata,
28
+ ChatContentPart,
29
+ ChatMessage,
30
+ ChatMessageContent,
31
+ ContainerInput,
32
+ ContainerOutput,
33
+ MediaContextItem,
34
+ ToolDefinition,
35
+ ToolExecution,
36
+ } from './types.js';
24
37
 
25
38
  const MAX_ITERATIONS = 20;
26
39
  const IDLE_TIMEOUT_MS = parseInt(process.env.CONTAINER_IDLE_TIMEOUT || '300000', 10); // 5 min
@@ -45,10 +58,159 @@ const ARTIFACT_MIME_TYPES: Record<string, string> = {
45
58
  '.svg': 'image/svg+xml',
46
59
  '.webp': 'image/webp',
47
60
  };
61
+ const DISCORD_MEDIA_CACHE_ROOT = '/discord-media-cache';
62
+ const NATIVE_VISION_MAX_IMAGES = 8;
63
+ const NATIVE_VISION_MAX_IMAGE_BYTES = 10 * 1024 * 1024;
64
+ const DISCORD_CDN_HOST_PATTERNS: RegExp[] = [
65
+ /^cdn\.discordapp\.com$/i,
66
+ /^media\.discordapp\.net$/i,
67
+ /^cdn\.discordapp\.net$/i,
68
+ /^images-ext-\d+\.discordapp\.net$/i,
69
+ ];
48
70
 
49
71
  /** API key received once via stdin, held in memory for the container lifetime. */
50
72
  let storedApiKey = '';
51
73
 
74
/**
 * Flatten chat message content into plain text.
 *
 * A string is returned unchanged. An array of content parts is reduced to its
 * `text` parts: each is trimmed, blank ones are dropped, and the rest are
 * joined with newlines. Anything else (e.g. `null`) yields an empty string.
 */
function normalizeMessageContentToText(content: ChatMessageContent): string {
  if (typeof content === 'string') return content;
  if (!Array.isArray(content)) return '';

  return content
    .flatMap((part): string[] => {
      // Ignore holes / non-object entries and every non-text part (images etc.).
      if (!part || typeof part !== 'object') return [];
      if (part.type !== 'text' || typeof part.text !== 'string') return [];
      const trimmedText = part.text.trim();
      return trimmedText ? [trimmedText] : [];
    })
    .join('\n')
    .trim();
}
86
+
87
/** Convert every backslash in `raw` to a forward slash. */
function normalizePathSlashes(raw: string): string {
  const segments = raw.split('\\');
  return segments.join('/');
}
90
+
91
/**
 * Resolve `rawPath` to a normalized POSIX path and accept it only when it lies
 * inside the workspace root or the Discord media cache root.
 *
 * Paths that do not start with `/` are resolved against the workspace root.
 * Backslashes are converted to forward slashes before normalization.
 *
 * @returns the normalized path, or `null` when the input is blank or resolves
 *          outside both allowed roots.
 */
function normalizeAllowedLocalImagePath(rawPath: string): string | null {
  const input = rawPath.trim();
  if (input === '') return null;

  const workspaceRoot = path.posix.normalize(WORKSPACE_ROOT);
  const cacheRoot = path.posix.normalize(DISCORD_MEDIA_CACHE_ROOT);
  const slashed = normalizePathSlashes(input);

  // Absoluteness is decided on the trimmed input, before slash conversion.
  const resolved = path.posix.normalize(
    input.startsWith('/') ? slashed : path.posix.join(workspaceRoot, slashed),
  );

  const isInside = (root: string): boolean =>
    resolved === root || resolved.startsWith(`${root}/`);

  return isInside(workspaceRoot) || isInside(cacheRoot) ? resolved : null;
}
107
+
108
/**
 * Pick a MIME type for an image file.
 *
 * A caller-supplied `fallbackMime` wins when it (trimmed, lower-cased) starts
 * with `image/`. Otherwise the file extension is looked up in
 * `ARTIFACT_MIME_TYPES`, defaulting to `image/png` when there is no entry.
 */
function inferImageMimeType(filePath: string, fallbackMime: string | null | undefined): string {
  const declared = String(fallbackMime || '').trim().toLowerCase();
  if (declared.startsWith('image/')) {
    return declared;
  }

  const extension = path.posix.extname(filePath).toLowerCase();
  const mapped = ARTIFACT_MIME_TYPES[extension];
  return mapped ? mapped : 'image/png';
}
114
+
115
/**
 * Return `true` only for a parseable `https:` URL whose hostname matches one
 * of `DISCORD_CDN_HOST_PATTERNS`. Unparseable input yields `false`.
 */
function isSafeDiscordCdnUrl(raw: string): boolean {
  let protocol: string;
  let hostname: string;
  try {
    ({ protocol, hostname } = new URL(raw));
  } catch {
    return false;
  }

  if (protocol !== 'https:') return false;
  for (const allowedHost of DISCORD_CDN_HOST_PATTERNS) {
    if (allowedHost.test(hostname)) return true;
  }
  return false;
}
125
+
126
/**
 * Heuristically decide from the model id whether image parts may be sent
 * directly to the model.
 *
 * Matching is case-insensitive. Distinctive family markers (`gpt-4o`,
 * `gemini`, `claude-3`, ...) are matched as substrings. The two-character
 * o-series ids (`o1`, `o3`) must stand alone as a token so that unrelated ids
 * which merely contain those characters (e.g. `solar-pro1`) are not treated
 * as vision-capable.
 *
 * @param model model identifier as configured for the request
 * @returns `true` when the id looks like a vision-capable model
 */
function modelSupportsNativeVision(model: string): boolean {
  const normalized = model.toLowerCase();
  if (!normalized) return false;

  const familyMarkers = [
    'gpt-5',
    'gpt-4o',
    'gpt-4.1',
    'vision',
    'multimodal',
    'gemini',
    'claude-3',
  ];
  if (familyMarkers.some((marker) => normalized.includes(marker))) return true;

  // `o1` / `o3` only count when delimited by non-alphanumerics or the string
  // edges: "o1", "o3-mini", "openai/o1-preview" match; "pro1", "turbo3" do not.
  return /(?:^|[^a-z0-9])o[13](?![a-z0-9])/.test(normalized);
}
144
+
145
/**
 * Produce the URL to place in an `image_url` content part for one media item.
 *
 * Resolution order:
 *  1. If `item.path` resolves to an allowed local path, the file is read and
 *     returned as a base64 `data:` URL, provided it does not exceed
 *     `NATIVE_VISION_MAX_IMAGE_BYTES`.
 *  2. Otherwise the first of `item.url` / `item.originalUrl` that passes
 *     `isSafeDiscordCdnUrl` is returned unchanged.
 *
 * Local read failures and oversize files are logged to stderr and fall
 * through to step 2.
 *
 * @returns a `data:` URL, an allow-listed CDN URL, or `null` when neither
 *          source is usable.
 */
async function resolveMediaImagePartUrl(item: MediaContextItem): Promise<string | null> {
  const localPath = item.path ? normalizeAllowedLocalImagePath(item.path) : null;
  if (localPath) {
    try {
      // Check the on-disk size first so an oversized file is never buffered
      // into memory just to be rejected.
      const { size } = await fs.promises.stat(localPath);
      if (size > NATIVE_VISION_MAX_IMAGE_BYTES) {
        console.error(`[media] skipping ${localPath}: ${size}B exceeds native vision max`);
      } else {
        const image = await fs.promises.readFile(localPath);
        // Re-check after reading: the file may have grown since the stat call.
        if (image.length > NATIVE_VISION_MAX_IMAGE_BYTES) {
          console.error(`[media] skipping ${localPath}: ${image.length}B exceeds native vision max`);
        } else {
          const mimeType = inferImageMimeType(localPath, item.mimeType);
          const base64 = image.toString('base64');
          return `data:${mimeType};base64,${base64}`;
        }
      }
    } catch (err) {
      console.error(`[media] failed to read local media ${localPath}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }

  const fallbackCandidates = [item.url, item.originalUrl]
    .map((value) => String(value || '').trim())
    .filter(Boolean);
  const safeUrl = fallbackCandidates.find((candidate) => isSafeDiscordCdnUrl(candidate));
  return safeUrl ?? null;
}
169
+
170
/**
 * Attach media items as `image_url` parts to the most recent user message.
 *
 * The input array is returned as-is (same reference) when there is no media,
 * the model is not considered vision-capable, no image URL could be resolved,
 * or there is no user message. Otherwise a shallow copy of `messages` is
 * returned in which the latest user message's content is replaced by a text
 * part (existing text plus a hint line) followed by the image parts.
 *
 * Only the first `NATIVE_VISION_MAX_IMAGES` media items are considered, and
 * they are resolved one at a time in order.
 */
async function injectNativeVisionContent(
  messages: ChatMessage[],
  model: string,
  media: MediaContextItem[] | undefined,
): Promise<ChatMessage[]> {
  if (!Array.isArray(media) || media.length === 0) return messages;
  if (!modelSupportsNativeVision(model)) return messages;

  const imageParts: ChatContentPart[] = [];
  for (const candidate of media.slice(0, NATIVE_VISION_MAX_IMAGES)) {
    const resolvedUrl = await resolveMediaImagePartUrl(candidate);
    if (resolvedUrl) {
      imageParts.push({ type: 'image_url', image_url: { url: resolvedUrl } });
    }
  }
  if (imageParts.length === 0) return messages;

  // Index of the last message authored by the user, or -1 when there is none.
  const targetIndex = messages.map((msg) => msg.role).lastIndexOf('user');
  if (targetIndex < 0) return messages;

  const copies = messages.map((msg) => ({ ...msg }));
  const target = copies[targetIndex];
  const priorText = normalizeMessageContentToText(target.content);
  const nativeVisionHint =
    '[NativeVision] Image parts are attached in this message. Analyze them directly and skip extra vision tool pre-analysis unless explicitly required.';
  const leadingText = priorText ? `${priorText}\n\n${nativeVisionHint}` : nativeVisionHint;

  copies[targetIndex] = {
    ...target,
    content: [{ type: 'text', text: leadingText }, ...imageParts],
  };
  console.error(`[media] injected ${imageParts.length} native vision image part(s) for model ${model}`);
  return copies;
}
213
+
52
214
  /**
53
215
  * Read a single line from stdin (the initial request JSON containing secrets).
54
216
  * Resolves on the first newline — does not consume the entire stream, so docker -i
@@ -104,20 +266,21 @@ function latestUserPrompt(messages: ChatMessage[]): string {
104
266
  for (let i = messages.length - 1; i >= 0; i--) {
105
267
  const message = messages[i];
106
268
  if (message.role !== 'user') continue;
107
- const text = String(message.content || '').replace(/\s+/g, ' ').trim();
269
+ const text = normalizeMessageContentToText(message.content).replace(/\s+/g, ' ').trim();
108
270
  if (!text) continue;
109
271
  return text.slice(0, 1_200);
110
272
  }
111
273
  return 'Continue the task';
112
274
  }
113
275
 
114
- function parseRalphChoice(content: string | null): 'CONTINUE' | 'STOP' | null {
115
- if (!content) return null;
276
+ function parseRalphChoice(content: ChatMessageContent): 'CONTINUE' | 'STOP' | null {
277
+ const normalizedContent = normalizeMessageContentToText(content);
278
+ if (!normalizedContent) return null;
116
279
  const re = /<choice>\s*([^<]*)\s*<\/choice>/gi;
117
280
  let match: RegExpExecArray | null = null;
118
281
  let lastChoice: string | null = null;
119
282
  while (true) {
120
- match = re.exec(content);
283
+ match = re.exec(normalizedContent);
121
284
  if (!match) break;
122
285
  lastChoice = (match[1] || '').trim().toUpperCase();
123
286
  }
@@ -125,13 +288,14 @@ function parseRalphChoice(content: string | null): 'CONTINUE' | 'STOP' | null {
125
288
  return null;
126
289
  }
127
290
 
128
- function stripRalphChoiceTags(content: string | null): string | null {
129
- if (content == null) return content;
130
- const stripped = content
291
+ function stripRalphChoiceTags(content: ChatMessageContent): string | null {
292
+ const normalizedContent = normalizeMessageContentToText(content);
293
+ if (!normalizedContent) return null;
294
+ const stripped = normalizedContent
131
295
  .replace(/<choice>\s*[^<]*\s*<\/choice>/gi, '')
132
296
  .replace(/\n{3,}/g, '\n\n')
133
297
  .trim();
134
- return stripped || content;
298
+ return stripped || normalizedContent;
135
299
  }
136
300
 
137
301
  function buildRalphPrompt(taskPrompt: string, missingChoice: boolean): string {
@@ -487,14 +651,35 @@ async function processRequest(
487
651
  * Main loop: read first request from stdin (with secrets), then poll IPC for follow-ups.
488
652
  */
489
653
  function resolveTools(input: ContainerInput): ToolDefinition[] {
490
- const tools = input.allowedTools
654
+ let tools = input.allowedTools
491
655
  ? TOOL_DEFINITIONS.filter((t) => input.allowedTools!.includes(t.function.name))
492
656
  : [...TOOL_DEFINITIONS];
657
+ if (Array.isArray(input.blockedTools) && input.blockedTools.length > 0) {
658
+ const blocked = new Set(
659
+ input.blockedTools
660
+ .map((name) => String(name || '').trim())
661
+ .filter(Boolean),
662
+ );
663
+ tools = tools.filter((tool) => !blocked.has(tool.function.name));
664
+ }
493
665
  // Sort alphabetically for deterministic system-prompt ordering (KV cache stability)
494
666
  tools.sort((a, b) => a.function.name.localeCompare(b.function.name));
495
667
  return tools;
496
668
  }
497
669
 
670
/**
 * Decide whether an error message looks like the model rejecting image /
 * multimodal input, by case-insensitive search for a fixed set of phrases.
 * A missing or empty error yields `false`.
 */
function shouldRetryWithoutNativeVision(error: string | undefined): boolean {
  const haystack = String(error || '').toLowerCase();
  if (haystack.length === 0) return false;

  const rejectionHints = [
    'image_url',
    'unsupported image',
    'unsupported content',
    'vision',
    'multimodal',
    'content part',
  ];
  return rejectionHints.some((hint) => haystack.includes(hint));
}
682
+
498
683
  async function main(): Promise<void> {
499
684
  console.error(`[hybridclaw-agent] started, idle timeout ${IDLE_TIMEOUT_MS}ms`);
500
685
 
@@ -510,9 +695,15 @@ async function main(): Promise<void> {
510
695
  setSessionContext(firstInput.sessionId);
511
696
  setGatewayContext(firstInput.gatewayBaseUrl, firstInput.gatewayApiToken, firstInput.channelId);
512
697
  setModelContext(firstInput.baseUrl, storedApiKey, firstInput.model, firstInput.chatbotId);
513
-
514
- const firstOutput = await processRequest(
698
+ setMediaContext(firstInput.media);
699
+ const firstMessages = await injectNativeVisionContent(
515
700
  firstInput.messages,
701
+ firstInput.model,
702
+ firstInput.media,
703
+ );
704
+
705
+ let firstOutput = await processRequest(
706
+ firstMessages,
516
707
  storedApiKey,
517
708
  firstInput.baseUrl,
518
709
  firstInput.model,
@@ -520,6 +711,22 @@ async function main(): Promise<void> {
520
711
  firstInput.enableRag,
521
712
  resolveTools(firstInput),
522
713
  );
714
+ if (
715
+ firstMessages !== firstInput.messages
716
+ && firstOutput.status === 'error'
717
+ && shouldRetryWithoutNativeVision(firstOutput.error)
718
+ ) {
719
+ console.error('[media] native vision injection rejected by model; retrying without image parts');
720
+ firstOutput = await processRequest(
721
+ firstInput.messages,
722
+ storedApiKey,
723
+ firstInput.baseUrl,
724
+ firstInput.model,
725
+ firstInput.chatbotId,
726
+ firstInput.enableRag,
727
+ resolveTools(firstInput),
728
+ );
729
+ }
523
730
 
524
731
  firstOutput.sideEffects = getPendingSideEffects();
525
732
  writeOutput(firstOutput);
@@ -544,9 +751,15 @@ async function main(): Promise<void> {
544
751
  setSessionContext(input.sessionId);
545
752
  setGatewayContext(input.gatewayBaseUrl, input.gatewayApiToken, input.channelId);
546
753
  setModelContext(input.baseUrl, apiKey, input.model, input.chatbotId);
547
-
548
- const output = await processRequest(
754
+ setMediaContext(input.media);
755
+ const preparedMessages = await injectNativeVisionContent(
549
756
  input.messages,
757
+ input.model,
758
+ input.media,
759
+ );
760
+
761
+ let output = await processRequest(
762
+ preparedMessages,
550
763
  apiKey,
551
764
  input.baseUrl,
552
765
  input.model,
@@ -554,6 +767,22 @@ async function main(): Promise<void> {
554
767
  input.enableRag,
555
768
  resolveTools(input),
556
769
  );
770
+ if (
771
+ preparedMessages !== input.messages
772
+ && output.status === 'error'
773
+ && shouldRetryWithoutNativeVision(output.error)
774
+ ) {
775
+ console.error('[media] native vision injection rejected by model; retrying without image parts');
776
+ output = await processRequest(
777
+ input.messages,
778
+ apiKey,
779
+ input.baseUrl,
780
+ input.model,
781
+ input.chatbotId,
782
+ input.enableRag,
783
+ resolveTools(input),
784
+ );
785
+ }
557
786
 
558
787
  output.sideEffects = getPendingSideEffects();
559
788
  writeOutput(output);
@@ -26,12 +26,28 @@ export function createTokenUsageStats(): TokenUsageStats {
26
26
  };
27
27
  }
28
28
 
29
- export function estimateTextTokens(text: string | null | undefined): number {
29
+ export function estimateTextTokens(text: unknown): number {
30
30
  const normalized = typeof text === 'string' ? text : '';
31
31
  if (!normalized) return 0;
32
32
  return Math.max(1, Math.ceil(normalized.length / CHARS_PER_TOKEN));
33
33
  }
34
34
 
35
/**
 * Render message content as text for token estimation.
 *
 * Strings pass through unchanged. For an array of parts, each `text` part
 * contributes its text verbatim and each `image_url` part with a URL
 * contributes the placeholder `[image]`; the pieces are joined with newlines.
 * Any other content yields an empty string.
 */
function normalizeContentText(content: ChatMessage['content']): string {
  if (typeof content === 'string') return content;
  if (!Array.isArray(content)) return '';

  return content
    .flatMap((part): string[] => {
      if (part?.type === 'text' && typeof part.text === 'string') return [part.text];
      if (part?.type === 'image_url' && part.image_url?.url) return ['[image]'];
      return [];
    })
    .join('\n');
}
50
+
35
51
  export function estimateMessageTokens(messages: ChatMessage[]): number {
36
52
  if (!Array.isArray(messages) || messages.length === 0) return 0;
37
53
 
@@ -39,7 +55,7 @@ export function estimateMessageTokens(messages: ChatMessage[]): number {
39
55
  for (const message of messages) {
40
56
  total += 4;
41
57
  total += estimateTextTokens(message.role);
42
- total += estimateTextTokens(message.content);
58
+ total += estimateTextTokens(normalizeContentText(message.content));
43
59
  if (message.tool_calls) total += estimateTextTokens(JSON.stringify(message.tool_calls));
44
60
  if (message.tool_call_id) total += estimateTextTokens(message.tool_call_id);
45
61
  }