npm - @hybridaione/hybridclaw - Versions diffs - 0.2.1 → 0.2.3 - Mend

@hybridaione/hybridclaw 0.2.1 → 0.2.3

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.

Files changed (108)

package/CHANGELOG.md +38 -0
package/README.md +49 -15
package/config.example.json +4 -1
package/container/package-lock.json +2 -2
package/container/package.json +1 -1
package/container/src/browser-tools.ts +1 -1
package/container/src/index.ts +243 -14
package/container/src/token-usage.ts +18 -2
package/container/src/tools.ts +339 -1
package/container/src/types.ts +28 -2
package/dist/agent.d.ts +2 -2
package/dist/agent.d.ts.map +1 -1
package/dist/agent.js +2 -2
package/dist/agent.js.map +1 -1
package/dist/channels/discord/attachments.d.ts +9 -0
package/dist/channels/discord/attachments.d.ts.map +1 -0
package/dist/channels/discord/attachments.js +245 -0
package/dist/channels/discord/attachments.js.map +1 -0
package/dist/channels/discord/delivery.d.ts +31 -0
package/dist/channels/discord/delivery.d.ts.map +1 -0
package/dist/channels/discord/delivery.js +60 -0
package/dist/channels/discord/delivery.js.map +1 -0
package/dist/channels/discord/inbound.d.ts +20 -0
package/dist/channels/discord/inbound.d.ts.map +1 -0
package/dist/channels/discord/inbound.js +44 -0
package/dist/channels/discord/inbound.js.map +1 -0
package/dist/channels/discord/mentions.d.ts +14 -0
package/dist/channels/discord/mentions.d.ts.map +1 -0
package/dist/channels/discord/mentions.js +118 -0
package/dist/channels/discord/mentions.js.map +1 -0
package/dist/channels/discord/runtime.d.ts +22 -0
package/dist/channels/discord/runtime.d.ts.map +1 -0
package/dist/channels/discord/runtime.js +972 -0
package/dist/channels/discord/runtime.js.map +1 -0
package/dist/channels/discord/stream.d.ts +32 -0
package/dist/channels/discord/stream.d.ts.map +1 -0
package/dist/channels/discord/stream.js +196 -0
package/dist/channels/discord/stream.js.map +1 -0
package/dist/channels/discord/tool-actions.d.ts +31 -0
package/dist/channels/discord/tool-actions.d.ts.map +1 -0
package/dist/channels/discord/tool-actions.js +268 -0
package/dist/channels/discord/tool-actions.js.map +1 -0
package/dist/container-runner.d.ts +2 -2
package/dist/container-runner.d.ts.map +1 -1
package/dist/container-runner.js +12 -2
package/dist/container-runner.js.map +1 -1
package/dist/discord.basic.test.d.ts +2 -0
package/dist/discord.basic.test.d.ts.map +1 -0
package/dist/discord.basic.test.js +38 -0
package/dist/discord.basic.test.js.map +1 -0
package/dist/discord.d.ts +5 -44
package/dist/discord.d.ts.map +1 -1
package/dist/discord.js +3 -1468
package/dist/discord.js.map +1 -1
package/dist/gateway-service.d.ts +7 -1
package/dist/gateway-service.d.ts.map +1 -1
package/dist/gateway-service.js +111 -2
package/dist/gateway-service.js.map +1 -1
package/dist/gateway-service.media-routing.test.d.ts +2 -0
package/dist/gateway-service.media-routing.test.d.ts.map +1 -0
package/dist/gateway-service.media-routing.test.js +29 -0
package/dist/gateway-service.media-routing.test.js.map +1 -0
package/dist/gateway-types.d.ts +8 -0
package/dist/gateway-types.d.ts.map +1 -1
package/dist/gateway-types.js.map +1 -1
package/dist/gateway.js +5 -2
package/dist/gateway.js.map +1 -1
package/dist/health.d.ts.map +1 -1
package/dist/health.js +1 -1
package/dist/health.js.map +1 -1
package/dist/heartbeat.d.ts.map +1 -1
package/dist/heartbeat.js +2 -0
package/dist/heartbeat.js.map +1 -1
package/dist/token-efficiency.basic.test.d.ts +2 -0
package/dist/token-efficiency.basic.test.d.ts.map +1 -0
package/dist/token-efficiency.basic.test.js +29 -0
package/dist/token-efficiency.basic.test.js.map +1 -0
package/dist/token-efficiency.d.ts.map +1 -1
package/dist/token-efficiency.js +18 -1
package/dist/token-efficiency.js.map +1 -1
package/dist/types.d.ts +23 -1
package/dist/types.d.ts.map +1 -1
package/package.json +10 -2
package/src/agent.ts +11 -1
package/src/channels/discord/attachments.ts +282 -0
package/src/channels/discord/delivery.ts +99 -0
package/src/channels/discord/inbound.ts +78 -0
package/src/channels/discord/mentions.ts +130 -0
package/src/{discord.ts → channels/discord/runtime.ts} +164 -633
package/src/{discord-stream.ts → channels/discord/stream.ts} +2 -2
package/src/channels/discord/tool-actions.ts +332 -0
package/src/config.ts +6 -0
package/src/container-runner.ts +24 -1
package/src/gateway-service.ts +220 -1
package/src/gateway-types.ts +8 -0
package/src/gateway.ts +5 -5
package/src/health.ts +2 -1
package/src/heartbeat.ts +2 -0
package/src/runtime-config.ts +77 -0
package/src/token-efficiency.ts +17 -1
package/src/types.ts +27 -1
package/tests/discord.basic.test.ts +95 -0
package/tests/gateway-service.media-routing.test.ts +33 -0
package/tests/token-efficiency.basic.test.ts +32 -0
package/vitest.e2e.config.ts +15 -0
package/vitest.integration.config.ts +15 -0
package/vitest.live.config.ts +16 -0
package/vitest.unit.config.ts +15 -0

package/CHANGELOG.md CHANGED Viewed

@@ -8,6 +8,44 @@
 ### Fixed
+## [0.2.3](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.3)
+### Added
+- **Discord channel policy config**: Added typed runtime config support for `discord.groupPolicy` (`open`/`allowlist`/`disabled`), `discord.freeResponseChannels`, and per-guild/per-channel mode overrides at `discord.guilds.<guildId>.channels.<channelId>.mode`.
+- **Discord channel mode slash command**: Added `/channel-mode` with `off`, `mention`, and `free` options to set the active guild channel behavior directly from Discord.
+- **Gateway channel control commands**: Added `channel mode` and `channel policy` command flows for inspecting/updating Discord channel response behavior via `!claw` commands.
+### Changed
+- **Discord trigger enforcement**: Guild message handling now applies channel mode + group policy before normal trigger checks, while still allowing prefixed commands in disabled channels.
+- **Activation/status labeling**: Runtime status output now reflects `disabled`/`allowlist`/mixed free-channel activation modes instead of only legacy mention/all-messages labels.
+### Fixed
+## [0.2.2](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.2)
+### Added
+- **Discord image attachment ingest/cache**: Added receive-time image ingest with local cache under `data/discord-media-cache`, preserving attachment order and carrying `path`, `mimeType`, `sizeBytes`, and `originalUrl` per media item.
+- **Structured media context pipeline**: Added typed media payload (`MediaPaths`/`MediaUrls`/`MediaTypes` equivalents) from Discord runtime through gateway/container request boundaries.
+- **Attachment vision tools**: Added `vision_analyze` (and `image` alias) for Discord-uploaded image analysis using local cached paths first, with Discord CDN URL fallback.
+- **Native multimodal injection**: Added direct image-part injection for vision-capable models, with automatic retry without image parts if the model rejects multimodal payloads.
+- **Scoped Vitest test configs**: Added dedicated `vitest.{unit,integration,e2e,live}.config.ts` files and matching npm scripts (`test:unit`, `test:integration`, `test:e2e`, `test:live`, `test:watch`) for explicit suite boundaries.
+### Changed
+- **Discord channel module layout**: Completed migration of Discord runtime internals into `src/channels/discord/*`, including `runtime.ts` and `stream.ts`, and removed legacy root-level `src/discord.ts` shim.
+- **Image-question tool routing**: Discord image questions now prioritize attachment vision (`vision_analyze`) and block `browser_vision` unless the user explicitly asks about the active browser tab/page.
+- **Browser vision scope guidance**: Updated `browser_vision` tool description to clarify it is for browser-page tasks only, not Discord-uploaded files.
+- **Test runner strategy**: Switched from compiled test artifacts (`dist-tests` + `tsconfig.tests.json`) to direct TypeScript execution via Vitest.
+- **Test file location and conventions**: Moved basic test files from `src/*.test.ts` to `tests/` and aligned naming/scoping conventions for unit/integration/e2e/live suites.
+### Fixed
+- **Discord image analysis fallback behavior**: Added safer cache/CDN fallback handling and guardrails (Discord CDN allowlist, size/type limits, per-image success/failure logging) to avoid brittle image-analysis failures.
+- **Regression coverage for wrong vision tool selection**: Added a basic regression test ensuring that Discord image questions are not routed to browser screenshot vision.
 ## [0.2.1](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.1)
 ### Added

package/README.md CHANGED Viewed

@@ -11,15 +11,15 @@ npm install -g @hybridaione/hybridclaw
 hybridclaw onboarding
 ```
-Latest release: [v0.2.1](https://github.com/HybridAIOne/hybridclaw/releases/tag/v0.2.1)
+Latest release: [v0.2.3](https://github.com/HybridAIOne/hybridclaw/releases/tag/v0.2.3)
-## What's new in v0.2.1
+## What's new in v0.2.3
-- Added OpenClaw-style Discord `message` tool actions (`read`, `member-info`, `channel-info`) to the container runtime
-- Added gateway endpoint `POST /api/discord/action` for Discord context lookups from tools
-- Replaced prompt-time Discord presence snapshots with cache-backed `member-info` presence fields (`status`, `activities`)
-- Routed Discord context lookups through gateway API from container with host remapping and token propagation
-- Enabled `message` tool in heartbeat and base subagent allowlists
+- Added Discord guild channel policy controls with typed config: `discord.groupPolicy`, `discord.freeResponseChannels`, and `discord.guilds.<guildId>.channels.<channelId>.mode`
+- Added `/channel-mode` slash command to switch a channel between `off`, `mention`, and `free`
+- Added `!claw channel mode` and `!claw channel policy` command flows for in-chat policy changes
+- Enforced channel mode/policy in Discord trigger logic while keeping prefixed commands available
+- Updated status/activation labeling to reflect allowlist/disabled/mixed channel policy modes
 ## HybridAI Advantage
@@ -106,6 +106,10 @@ HybridClaw uses typed runtime config in `config.json` (auto-created on first run
 - `discord.respondToAllMessages` changes guild trigger behavior: `false` (default) replies only on mention/`!claw`; `true` replies to every user message in the channel
 - `discord.commandUserId` restricts `!claw <command>` admin commands to a single Discord user ID (all other messages still use normal chat handling)
 - `discord.commandsOnly` optional hard mode: if `true`, the bot ignores non-`!claw` messages and only accepts prefixed commands (optionally limited by `discord.commandUserId`)
+- `discord.groupPolicy` controls guild channel scope: `open` (default), `allowlist`, or `disabled`
+- `discord.freeResponseChannels` is a Hermes-style list of channel IDs; the listed channels get free-response behavior while all other channels remain mention-gated
+- `discord.guilds.<guildId>.channels.<channelId>.mode` sets per-channel behavior to `off`, `mention`, or `free` (works with `allowlist` policy)
+- Discord slash commands: `/status` and `/channel-mode <off|mention|free>` (ephemeral replies)
 - `skills.extraDirs` adds additional enterprise/shared skill roots (lowest precedence tier)
 - `proactive.*` controls autonomous behavior (`activeHours`, `delegation`, `autoRetry`, `ralph`)
 - `proactive.ralph.maxIterations` enables Ralph loop (`0` off, `-1` unlimited, `>0` extra autonomous iterations before forcing completion)
@@ -349,6 +353,34 @@ System prompt assembly is handled by a formal hook pipeline:
 Hook toggles live in `config.json` under `promptHooks`.
+## Testing
+Run checks locally:
+```bash
+# Typecheck only (no emit)
+npm run typecheck
+# Strict TS lint gate (unused locals/params)
+npm run lint
+# Unit tests (default `npm test`)
+npm run test:unit
+# Scoped suites (ready for dedicated tests)
+npm run test:integration
+npm run test:e2e
+npm run test:live
+```
+Test layout and scopes:
+- tests live under `tests/` (not `src/`)
+- unit tests: `tests/**/*.test.ts` (excluding `*.integration.test.ts`, `*.e2e.test.ts`, and `*.live.test.ts`)
+- integration tests: `tests/**/*.integration.test.ts`
+- e2e tests: `tests/**/*.e2e.test.ts`
+- live tests: `tests/**/*.live.test.ts`
 ## Commands
 CLI runtime commands:
@@ -379,12 +411,14 @@ In Discord, use `!claw help` to see all commands. Key ones:
 ## Project structure
 ```
-src/gateway.ts          Core runtime entrypoint (DB, scheduler, heartbeat, HTTP API)
-src/tui.ts              Terminal adapter (thin client to gateway)
-src/discord.ts          Discord integration and message transport
-src/gateway-service.ts  Core shared agent/session logic used by gateway API
-src/gateway-client.ts   HTTP client used by thin clients (e.g. TUI)
-container/src/          Agent code (tools, HybridAI client, IPC)
-templates/              Workspace bootstrap files
-data/                   Runtime data (gitignored): SQLite DB, sessions, agent workspaces
+src/gateway.ts                    Core runtime entrypoint (DB, scheduler, heartbeat, HTTP API)
+src/tui.ts                        Terminal adapter (thin client to gateway)
+src/channels/discord/runtime.ts   Discord runtime integration and message transport
+src/channels/discord/*.ts         Discord responsibility modules (inbound, delivery, mentions, attachments, tools, stream)
+src/gateway-service.ts            Core shared agent/session logic used by gateway API
+src/gateway-client.ts             HTTP client used by thin clients (e.g. TUI)
+tests/                            Vitest suites (unit/integration/e2e/live scopes)
+container/src/                    Agent code (tools, HybridAI client, IPC)
+templates/                        Workspace bootstrap files
+data/                             Runtime data (gitignored): SQLite DB, sessions, agent workspaces
 ```

package/config.example.json CHANGED Viewed

@@ -15,7 +15,10 @@
     "presenceIntent": false,
     "respondToAllMessages": false,
     "commandsOnly": false,
-    "commandUserId": ""
+    "commandUserId": "",
+    "groupPolicy": "open",
+    "freeResponseChannels": [],
+    "guilds": {}
   },
   "hybridai": {
     "baseUrl": "https://hybridai.one",

package/container/package-lock.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "hybridclaw-agent",
-  "version": "0.2.1",
+  "version": "0.2.3",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "hybridclaw-agent",
-      "version": "0.2.1",
+      "version": "0.2.3",
       "dependencies": {
         "@mozilla/readability": "^0.6.0",
         "agent-browser": "^0.15.1",

package/container/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "hybridclaw-agent",
-  "version": "0.2.1",
+  "version": "0.2.3",
   "type": "module",
   "scripts": {
     "build": "tsc",

package/container/src/browser-tools.ts CHANGED Viewed

@@ -1199,7 +1199,7 @@ export const BROWSER_TOOL_DEFINITIONS: ToolDefinition[] = [
     function: {
       name: 'browser_vision',
       description:
-        'Capture a screenshot and analyze it with a vision model using the provided question.',
+        'Capture the current browser page screenshot and analyze it with a vision model. Use only for active browser-tab/page tasks, not for Discord-uploaded files.',
       parameters: {
         type: 'object',
         properties: {

package/container/src/index.ts CHANGED Viewed

@@ -1,4 +1,6 @@
 import path from 'path';
+import fs from 'fs';
+import { URL } from 'url';
 import { emitRuntimeEvent, runAfterToolHooks, runBeforeToolHooks } from './extensions.js';
 import { callHybridAI, callHybridAIStream, HybridAIRequestError } from './hybridai-client.js';
@@ -15,12 +17,23 @@ import {
   getPendingSideEffects,
   resetSideEffects,
   setGatewayContext,
+  setMediaContext,
   setModelContext,
   setScheduledTasks,
   setSessionContext,
   TOOL_DEFINITIONS,
 } from './tools.js';
-import type { ArtifactMetadata, ChatMessage, ContainerInput, ContainerOutput, ToolDefinition, ToolExecution } from './types.js';
+import type {
+  ArtifactMetadata,
+  ChatContentPart,
+  ChatMessage,
+  ChatMessageContent,
+  ContainerInput,
+  ContainerOutput,
+  MediaContextItem,
+  ToolDefinition,
+  ToolExecution,
+} from './types.js';
 const MAX_ITERATIONS = 20;
 const IDLE_TIMEOUT_MS = parseInt(process.env.CONTAINER_IDLE_TIMEOUT || '300000', 10); // 5 min
@@ -45,10 +58,159 @@ const ARTIFACT_MIME_TYPES: Record<string, string> = {
   '.svg': 'image/svg+xml',
   '.webp': 'image/webp',
 };
+const DISCORD_MEDIA_CACHE_ROOT = '/discord-media-cache';
+const NATIVE_VISION_MAX_IMAGES = 8;
+const NATIVE_VISION_MAX_IMAGE_BYTES = 10 * 1024 * 1024;
+const DISCORD_CDN_HOST_PATTERNS: RegExp[] = [
+  /^cdn\.discordapp\.com$/i,
+  /^media\.discordapp\.net$/i,
+  /^cdn\.discordapp\.net$/i,
+  /^images-ext-\d+\.discordapp\.net$/i,
+];
 /** API key received once via stdin, held in memory for the container lifetime. */
 let storedApiKey = '';
+function normalizeMessageContentToText(content: ChatMessageContent): string {
+  if (typeof content === 'string') return content;
+  if (!Array.isArray(content)) return '';
+  const chunks: string[] = [];
+  for (const part of content) {
+    if (!part || typeof part !== 'object') continue;
+    if (part.type !== 'text') continue;
+    if (typeof part.text !== 'string') continue;
+    if (part.text.trim()) chunks.push(part.text.trim());
+  }
+  return chunks.join('\n').trim();
+}
+function normalizePathSlashes(raw: string): string {
+  return raw.replace(/\\/g, '/');
+}
+function normalizeAllowedLocalImagePath(rawPath: string): string | null {
+  const trimmed = rawPath.trim();
+  if (!trimmed) return null;
+  const workspace = path.posix.normalize(WORKSPACE_ROOT);
+  const mediaRoot = path.posix.normalize(DISCORD_MEDIA_CACHE_ROOT);
+  const candidate = trimmed.startsWith('/')
+    ? path.posix.normalize(normalizePathSlashes(trimmed))
+    : path.posix.normalize(path.posix.join(workspace, normalizePathSlashes(trimmed)));
+  const underWorkspace = candidate === workspace || candidate.startsWith(`${workspace}/`);
+  const underMediaRoot = candidate === mediaRoot || candidate.startsWith(`${mediaRoot}/`);
+  if (!underWorkspace && !underMediaRoot) return null;
+  return candidate;
+}
+function inferImageMimeType(filePath: string, fallbackMime: string | null | undefined): string {
+  const normalizedFallback = String(fallbackMime || '').trim().toLowerCase();
+  if (normalizedFallback.startsWith('image/')) return normalizedFallback;
+  const ext = path.posix.extname(filePath).toLowerCase();
+  return ARTIFACT_MIME_TYPES[ext] || 'image/png';
+}
+function isSafeDiscordCdnUrl(raw: string): boolean {
+  let parsed: URL;
+  try {
+    parsed = new URL(raw);
+  } catch {
+    return false;
+  }
+  if (parsed.protocol !== 'https:') return false;
+  return DISCORD_CDN_HOST_PATTERNS.some((pattern) => pattern.test(parsed.hostname));
+}
+function modelSupportsNativeVision(model: string): boolean {
+  const normalized = model.toLowerCase();
+  if (!normalized) return false;
+  if (
+    normalized.includes('gpt-5')
+    || normalized.includes('gpt-4o')
+    || normalized.includes('gpt-4.1')
+    || normalized.includes('o1')
+    || normalized.includes('o3')
+    || normalized.includes('vision')
+    || normalized.includes('multimodal')
+    || normalized.includes('gemini')
+    || normalized.includes('claude-3')
+  ) {
+    return true;
+  }
+  return false;
+}
+async function resolveMediaImagePartUrl(item: MediaContextItem): Promise<string | null> {
+  const localPath = item.path ? normalizeAllowedLocalImagePath(item.path) : null;
+  if (localPath) {
+    try {
+      const image = await fs.promises.readFile(localPath);
+      if (image.length > NATIVE_VISION_MAX_IMAGE_BYTES) {
+        console.error(`[media] skipping ${localPath}: ${image.length}B exceeds native vision max`);
+      } else {
+        const mimeType = inferImageMimeType(localPath, item.mimeType);
+        const base64 = image.toString('base64');
+        return `data:${mimeType};base64,${base64}`;
+      }
+    } catch (err) {
+      console.error(`[media] failed to read local media ${localPath}: ${err instanceof Error ? err.message : String(err)}`);
+    }
+  }
+  const fallbackCandidates = [item.url, item.originalUrl].map((value) => String(value || '').trim()).filter(Boolean);
+  for (const candidate of fallbackCandidates) {
+    if (!isSafeDiscordCdnUrl(candidate)) continue;
+    return candidate;
+  }
+  return null;
+}
+async function injectNativeVisionContent(
+  messages: ChatMessage[],
+  model: string,
+  media: MediaContextItem[] | undefined,
+): Promise<ChatMessage[]> {
+  if (!Array.isArray(media) || media.length === 0) return messages;
+  if (!modelSupportsNativeVision(model)) return messages;
+  const mediaSlice = media.slice(0, NATIVE_VISION_MAX_IMAGES);
+  const imageParts: ChatContentPart[] = [];
+  for (const item of mediaSlice) {
+    const url = await resolveMediaImagePartUrl(item);
+    if (!url) continue;
+    imageParts.push({ type: 'image_url', image_url: { url } });
+  }
+  if (imageParts.length === 0) return messages;
+  const latestUserIndex = (() => {
+    for (let i = messages.length - 1; i >= 0; i -= 1) {
+      if (messages[i].role === 'user') return i;
+    }
+    return -1;
+  })();
+  if (latestUserIndex < 0) return messages;
+  const cloned = messages.map((msg) => ({ ...msg }));
+  const existingText = normalizeMessageContentToText(cloned[latestUserIndex].content);
+  const contentParts: ChatContentPart[] = [];
+  const nativeVisionHint =
+    '[NativeVision] Image parts are attached in this message. Analyze them directly and skip extra vision tool pre-analysis unless explicitly required.';
+  if (existingText) {
+    contentParts.push({ type: 'text', text: `${existingText}\n\n${nativeVisionHint}` });
+  } else {
+    contentParts.push({ type: 'text', text: nativeVisionHint });
+  }
+  contentParts.push(...imageParts);
+  cloned[latestUserIndex] = {
+    ...cloned[latestUserIndex],
+    content: contentParts,
+  };
+  console.error(`[media] injected ${imageParts.length} native vision image part(s) for model ${model}`);
+  return cloned;
+}
 /**
  * Read a single line from stdin (the initial request JSON containing secrets).
  * Resolves on the first newline — does not consume the entire stream, so docker -i
@@ -104,20 +266,21 @@ function latestUserPrompt(messages: ChatMessage[]): string {
   for (let i = messages.length - 1; i >= 0; i--) {
     const message = messages[i];
     if (message.role !== 'user') continue;
-    const text = String(message.content || '').replace(/\s+/g, ' ').trim();
+    const text = normalizeMessageContentToText(message.content).replace(/\s+/g, ' ').trim();
     if (!text) continue;
     return text.slice(0, 1_200);
   }
   return 'Continue the task';
 }
-function parseRalphChoice(content: string | null): 'CONTINUE' | 'STOP' | null {
-  if (!content) return null;
+function parseRalphChoice(content: ChatMessageContent): 'CONTINUE' | 'STOP' | null {
+  const normalizedContent = normalizeMessageContentToText(content);
+  if (!normalizedContent) return null;
   const re = /<choice>\s*([^<]*)\s*<\/choice>/gi;
   let match: RegExpExecArray | null = null;
   let lastChoice: string | null = null;
   while (true) {
-    match = re.exec(content);
+    match = re.exec(normalizedContent);
     if (!match) break;
     lastChoice = (match[1] || '').trim().toUpperCase();
   }
@@ -125,13 +288,14 @@ function parseRalphChoice(content: string | null): 'CONTINUE' | 'STOP' | null {
   return null;
 }
-function stripRalphChoiceTags(content: string | null): string | null {
-  if (content == null) return content;
-  const stripped = content
+function stripRalphChoiceTags(content: ChatMessageContent): string | null {
+  const normalizedContent = normalizeMessageContentToText(content);
+  if (!normalizedContent) return null;
+  const stripped = normalizedContent
     .replace(/<choice>\s*[^<]*\s*<\/choice>/gi, '')
     .replace(/\n{3,}/g, '\n\n')
     .trim();
-  return stripped || content;
+  return stripped || normalizedContent;
 }
 function buildRalphPrompt(taskPrompt: string, missingChoice: boolean): string {
@@ -487,14 +651,35 @@ async function processRequest(
  * Main loop: read first request from stdin (with secrets), then poll IPC for follow-ups.
  */
 function resolveTools(input: ContainerInput): ToolDefinition[] {
-  const tools = input.allowedTools
+  let tools = input.allowedTools
     ? TOOL_DEFINITIONS.filter((t) => input.allowedTools!.includes(t.function.name))
     : [...TOOL_DEFINITIONS];
+  if (Array.isArray(input.blockedTools) && input.blockedTools.length > 0) {
+    const blocked = new Set(
+      input.blockedTools
+        .map((name) => String(name || '').trim())
+        .filter(Boolean),
+    );
+    tools = tools.filter((tool) => !blocked.has(tool.function.name));
+  }
   // Sort alphabetically for deterministic system-prompt ordering (KV cache stability)
   tools.sort((a, b) => a.function.name.localeCompare(b.function.name));
   return tools;
 }
+function shouldRetryWithoutNativeVision(error: string | undefined): boolean {
+  const normalized = String(error || '').toLowerCase();
+  if (!normalized) return false;
+  return (
+    normalized.includes('image_url')
+    || normalized.includes('unsupported image')
+    || normalized.includes('unsupported content')
+    || normalized.includes('vision')
+    || normalized.includes('multimodal')
+    || normalized.includes('content part')
+  );
+}
 async function main(): Promise<void> {
   console.error(`[hybridclaw-agent] started, idle timeout ${IDLE_TIMEOUT_MS}ms`);
@@ -510,9 +695,15 @@ async function main(): Promise<void> {
   setSessionContext(firstInput.sessionId);
   setGatewayContext(firstInput.gatewayBaseUrl, firstInput.gatewayApiToken, firstInput.channelId);
   setModelContext(firstInput.baseUrl, storedApiKey, firstInput.model, firstInput.chatbotId);
-  const firstOutput = await processRequest(
+  setMediaContext(firstInput.media);
+  const firstMessages = await injectNativeVisionContent(
     firstInput.messages,
+    firstInput.model,
+    firstInput.media,
+  );
+  let firstOutput = await processRequest(
+    firstMessages,
     storedApiKey,
     firstInput.baseUrl,
     firstInput.model,
@@ -520,6 +711,22 @@ async function main(): Promise<void> {
     firstInput.enableRag,
     resolveTools(firstInput),
   );
+  if (
+    firstMessages !== firstInput.messages
+    && firstOutput.status === 'error'
+    && shouldRetryWithoutNativeVision(firstOutput.error)
+  ) {
+    console.error('[media] native vision injection rejected by model; retrying without image parts');
+    firstOutput = await processRequest(
+      firstInput.messages,
+      storedApiKey,
+      firstInput.baseUrl,
+      firstInput.model,
+      firstInput.chatbotId,
+      firstInput.enableRag,
+      resolveTools(firstInput),
+    );
+  }
   firstOutput.sideEffects = getPendingSideEffects();
   writeOutput(firstOutput);
@@ -544,9 +751,15 @@ async function main(): Promise<void> {
     setSessionContext(input.sessionId);
     setGatewayContext(input.gatewayBaseUrl, input.gatewayApiToken, input.channelId);
     setModelContext(input.baseUrl, apiKey, input.model, input.chatbotId);
-    const output = await processRequest(
+    setMediaContext(input.media);
+    const preparedMessages = await injectNativeVisionContent(
       input.messages,
+      input.model,
+      input.media,
+    );
+    let output = await processRequest(
+      preparedMessages,
       apiKey,
       input.baseUrl,
       input.model,
@@ -554,6 +767,22 @@ async function main(): Promise<void> {
       input.enableRag,
       resolveTools(input),
     );
+    if (
+      preparedMessages !== input.messages
+      && output.status === 'error'
+      && shouldRetryWithoutNativeVision(output.error)
+    ) {
+      console.error('[media] native vision injection rejected by model; retrying without image parts');
+      output = await processRequest(
+        input.messages,
+        apiKey,
+        input.baseUrl,
+        input.model,
+        input.chatbotId,
+        input.enableRag,
+        resolveTools(input),
+      );
+    }
     output.sideEffects = getPendingSideEffects();
     writeOutput(output);

package/container/src/token-usage.ts CHANGED Viewed

@@ -26,12 +26,28 @@ export function createTokenUsageStats(): TokenUsageStats {
   };
 }
-export function estimateTextTokens(text: string | null | undefined): number {
+export function estimateTextTokens(text: unknown): number {
   const normalized = typeof text === 'string' ? text : '';
   if (!normalized) return 0;
   return Math.max(1, Math.ceil(normalized.length / CHARS_PER_TOKEN));
 }
+function normalizeContentText(content: ChatMessage['content']): string {
+  if (typeof content === 'string') return content;
+  if (!Array.isArray(content)) return '';
+  const chunks: string[] = [];
+  for (const part of content) {
+    if (part?.type === 'text' && typeof part.text === 'string') {
+      chunks.push(part.text);
+      continue;
+    }
+    if (part?.type === 'image_url' && part.image_url?.url) {
+      chunks.push('[image]');
+    }
+  }
+  return chunks.join('\n');
+}
 export function estimateMessageTokens(messages: ChatMessage[]): number {
   if (!Array.isArray(messages) || messages.length === 0) return 0;
@@ -39,7 +55,7 @@ export function estimateMessageTokens(messages: ChatMessage[]): number {
   for (const message of messages) {
     total += 4;
     total += estimateTextTokens(message.role);
-    total += estimateTextTokens(message.content);
+    total += estimateTextTokens(normalizeContentText(message.content));
     if (message.tool_calls) total += estimateTextTokens(JSON.stringify(message.tool_calls));
     if (message.tool_call_id) total += estimateTextTokens(message.tool_call_id);
   }