@hybridaione/hybridclaw 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +38 -0
- package/README.md +49 -15
- package/config.example.json +4 -1
- package/container/package-lock.json +2 -2
- package/container/package.json +1 -1
- package/container/src/browser-tools.ts +1 -1
- package/container/src/index.ts +243 -14
- package/container/src/token-usage.ts +18 -2
- package/container/src/tools.ts +339 -1
- package/container/src/types.ts +28 -2
- package/dist/agent.d.ts +2 -2
- package/dist/agent.d.ts.map +1 -1
- package/dist/agent.js +2 -2
- package/dist/agent.js.map +1 -1
- package/dist/channels/discord/attachments.d.ts +9 -0
- package/dist/channels/discord/attachments.d.ts.map +1 -0
- package/dist/channels/discord/attachments.js +245 -0
- package/dist/channels/discord/attachments.js.map +1 -0
- package/dist/channels/discord/delivery.d.ts +31 -0
- package/dist/channels/discord/delivery.d.ts.map +1 -0
- package/dist/channels/discord/delivery.js +60 -0
- package/dist/channels/discord/delivery.js.map +1 -0
- package/dist/channels/discord/inbound.d.ts +20 -0
- package/dist/channels/discord/inbound.d.ts.map +1 -0
- package/dist/channels/discord/inbound.js +44 -0
- package/dist/channels/discord/inbound.js.map +1 -0
- package/dist/channels/discord/mentions.d.ts +14 -0
- package/dist/channels/discord/mentions.d.ts.map +1 -0
- package/dist/channels/discord/mentions.js +118 -0
- package/dist/channels/discord/mentions.js.map +1 -0
- package/dist/channels/discord/runtime.d.ts +22 -0
- package/dist/channels/discord/runtime.d.ts.map +1 -0
- package/dist/channels/discord/runtime.js +972 -0
- package/dist/channels/discord/runtime.js.map +1 -0
- package/dist/channels/discord/stream.d.ts +32 -0
- package/dist/channels/discord/stream.d.ts.map +1 -0
- package/dist/channels/discord/stream.js +196 -0
- package/dist/channels/discord/stream.js.map +1 -0
- package/dist/channels/discord/tool-actions.d.ts +31 -0
- package/dist/channels/discord/tool-actions.d.ts.map +1 -0
- package/dist/channels/discord/tool-actions.js +268 -0
- package/dist/channels/discord/tool-actions.js.map +1 -0
- package/dist/container-runner.d.ts +2 -2
- package/dist/container-runner.d.ts.map +1 -1
- package/dist/container-runner.js +12 -2
- package/dist/container-runner.js.map +1 -1
- package/dist/discord.basic.test.d.ts +2 -0
- package/dist/discord.basic.test.d.ts.map +1 -0
- package/dist/discord.basic.test.js +38 -0
- package/dist/discord.basic.test.js.map +1 -0
- package/dist/discord.d.ts +5 -44
- package/dist/discord.d.ts.map +1 -1
- package/dist/discord.js +3 -1468
- package/dist/discord.js.map +1 -1
- package/dist/gateway-service.d.ts +7 -1
- package/dist/gateway-service.d.ts.map +1 -1
- package/dist/gateway-service.js +111 -2
- package/dist/gateway-service.js.map +1 -1
- package/dist/gateway-service.media-routing.test.d.ts +2 -0
- package/dist/gateway-service.media-routing.test.d.ts.map +1 -0
- package/dist/gateway-service.media-routing.test.js +29 -0
- package/dist/gateway-service.media-routing.test.js.map +1 -0
- package/dist/gateway-types.d.ts +8 -0
- package/dist/gateway-types.d.ts.map +1 -1
- package/dist/gateway-types.js.map +1 -1
- package/dist/gateway.js +5 -2
- package/dist/gateway.js.map +1 -1
- package/dist/health.d.ts.map +1 -1
- package/dist/health.js +1 -1
- package/dist/health.js.map +1 -1
- package/dist/heartbeat.d.ts.map +1 -1
- package/dist/heartbeat.js +2 -0
- package/dist/heartbeat.js.map +1 -1
- package/dist/token-efficiency.basic.test.d.ts +2 -0
- package/dist/token-efficiency.basic.test.d.ts.map +1 -0
- package/dist/token-efficiency.basic.test.js +29 -0
- package/dist/token-efficiency.basic.test.js.map +1 -0
- package/dist/token-efficiency.d.ts.map +1 -1
- package/dist/token-efficiency.js +18 -1
- package/dist/token-efficiency.js.map +1 -1
- package/dist/types.d.ts +23 -1
- package/dist/types.d.ts.map +1 -1
- package/package.json +10 -2
- package/src/agent.ts +11 -1
- package/src/channels/discord/attachments.ts +282 -0
- package/src/channels/discord/delivery.ts +99 -0
- package/src/channels/discord/inbound.ts +78 -0
- package/src/channels/discord/mentions.ts +130 -0
- package/src/{discord.ts → channels/discord/runtime.ts} +164 -633
- package/src/{discord-stream.ts → channels/discord/stream.ts} +2 -2
- package/src/channels/discord/tool-actions.ts +332 -0
- package/src/config.ts +6 -0
- package/src/container-runner.ts +24 -1
- package/src/gateway-service.ts +220 -1
- package/src/gateway-types.ts +8 -0
- package/src/gateway.ts +5 -5
- package/src/health.ts +2 -1
- package/src/heartbeat.ts +2 -0
- package/src/runtime-config.ts +77 -0
- package/src/token-efficiency.ts +17 -1
- package/src/types.ts +27 -1
- package/tests/discord.basic.test.ts +95 -0
- package/tests/gateway-service.media-routing.test.ts +33 -0
- package/tests/token-efficiency.basic.test.ts +32 -0
- package/vitest.e2e.config.ts +15 -0
- package/vitest.integration.config.ts +15 -0
- package/vitest.live.config.ts +16 -0
- package/vitest.unit.config.ts +15 -0
package/CHANGELOG.md
CHANGED
|
@@ -8,6 +8,44 @@
|
|
|
8
8
|
|
|
9
9
|
### Fixed
|
|
10
10
|
|
|
11
|
+
## [0.2.3](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.3)
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
|
|
15
|
+
- **Discord channel policy config**: Added typed runtime config support for `discord.groupPolicy` (`open`/`allowlist`/`disabled`), `discord.freeResponseChannels`, and per-guild/per-channel mode overrides at `discord.guilds.<guildId>.channels.<channelId>.mode`.
|
|
16
|
+
- **Discord channel mode slash command**: Added `/channel-mode` with `off`, `mention`, and `free` options to set the active guild channel behavior directly from Discord.
|
|
17
|
+
- **Gateway channel control commands**: Added `channel mode` and `channel policy` command flows for inspecting/updating Discord channel response behavior via `!claw` commands.
|
|
18
|
+
|
|
19
|
+
### Changed
|
|
20
|
+
|
|
21
|
+
- **Discord trigger enforcement**: Guild message handling now applies channel mode + group policy before normal trigger checks, while still allowing prefixed commands in disabled channels.
|
|
22
|
+
- **Activation/status labeling**: Runtime status output now reflects `disabled`/`allowlist`/mixed free-channel activation modes instead of only legacy mention/all-messages labels.
|
|
23
|
+
|
|
24
|
+
### Fixed
|
|
25
|
+
|
|
26
|
+
## [0.2.2](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.2)
|
|
27
|
+
|
|
28
|
+
### Added
|
|
29
|
+
|
|
30
|
+
- **Discord image attachment ingest/cache**: Added receive-time image ingest with local cache under `data/discord-media-cache`, preserving attachment order and carrying `path`, `mimeType`, `sizeBytes`, and `originalUrl` per media item.
|
|
31
|
+
- **Structured media context pipeline**: Added typed media payload (`MediaPaths`/`MediaUrls`/`MediaTypes` equivalents) from Discord runtime through gateway/container request boundaries.
|
|
32
|
+
- **Attachment vision tools**: Added `vision_analyze` (and `image` alias) for Discord-uploaded image analysis using local cached paths first, with Discord CDN URL fallback.
|
|
33
|
+
- **Native multimodal injection**: Added direct image-part injection for vision-capable models, with automatic retry without image parts if the model rejects multimodal payloads.
|
|
34
|
+
- **Scoped Vitest test configs**: Added dedicated `vitest.{unit,integration,e2e,live}.config.ts` files and matching npm scripts (`test:unit`, `test:integration`, `test:e2e`, `test:live`, `test:watch`) for explicit suite boundaries.
|
|
35
|
+
|
|
36
|
+
### Changed
|
|
37
|
+
|
|
38
|
+
- **Discord channel module layout**: Completed migration of Discord runtime internals into `src/channels/discord/*`, including `runtime.ts` and `stream.ts`, and removed legacy root-level `src/discord.ts` shim.
|
|
39
|
+
- **Image-question tool routing**: Discord image questions now prioritize attachment vision (`vision_analyze`) and block `browser_vision` unless the user explicitly asks about the active browser tab/page.
|
|
40
|
+
- **Browser vision scope guidance**: Updated `browser_vision` tool description to clarify it is for browser-page tasks only, not Discord-uploaded files.
|
|
41
|
+
- **Test runner strategy**: Switched from compiled test artifacts (`dist-tests` + `tsconfig.tests.json`) to direct TypeScript execution via Vitest.
|
|
42
|
+
- **Test file location and conventions**: Moved basic test files from `src/*.test.ts` to `tests/` and aligned naming/scoping conventions for unit/integration/e2e/live suites.
|
|
43
|
+
|
|
44
|
+
### Fixed
|
|
45
|
+
|
|
46
|
+
- **Discord image analysis fallback behavior**: Added safer cache/CDN fallback handling and guardrails (Discord CDN allowlist, size/type limits, per-image success/failure logging) to avoid brittle image-analysis failures.
|
|
47
|
+
- **Regression coverage for wrong vision tool selection**: Added basic regression test coverage that Discord image questions should not route to browser screenshot vision.
|
|
48
|
+
|
|
11
49
|
## [0.2.1](https://github.com/HybridAIOne/hybridclaw/tree/v0.2.1)
|
|
12
50
|
|
|
13
51
|
### Added
|
package/README.md
CHANGED
|
@@ -11,15 +11,15 @@ npm install -g @hybridaione/hybridclaw
|
|
|
11
11
|
hybridclaw onboarding
|
|
12
12
|
```
|
|
13
13
|
|
|
14
|
-
Latest release: [v0.2.1](https://github.com/HybridAIOne/hybridclaw/releases/tag/v0.2.1)
|
|
14
|
+
Latest release: [v0.2.3](https://github.com/HybridAIOne/hybridclaw/releases/tag/v0.2.3)
|
|
15
15
|
|
|
16
|
-
## What's new in v0.2.1
|
|
16
|
+
## What's new in v0.2.3
|
|
17
17
|
|
|
18
|
-
- Added
|
|
19
|
-
- Added
|
|
20
|
-
-
|
|
21
|
-
-
|
|
22
|
-
-
|
|
18
|
+
- Added Discord guild channel policy controls with typed config: `discord.groupPolicy`, `discord.freeResponseChannels`, and `discord.guilds.<guildId>.channels.<channelId>.mode`
|
|
19
|
+
- Added `/channel-mode` slash command to switch a channel between `off`, `mention`, and `free`
|
|
20
|
+
- Added `!claw channel mode` and `!claw channel policy` command flows for in-chat policy changes
|
|
21
|
+
- Enforced channel mode/policy in Discord trigger logic while keeping prefixed commands available
|
|
22
|
+
- Updated status/activation labeling to reflect allowlist/disabled/mixed channel policy modes
|
|
23
23
|
|
|
24
24
|
## HybridAI Advantage
|
|
25
25
|
|
|
@@ -106,6 +106,10 @@ HybridClaw uses typed runtime config in `config.json` (auto-created on first run
|
|
|
106
106
|
- `discord.respondToAllMessages` changes guild trigger behavior: `false` (default) replies only on mention/`!claw`; `true` replies to every user message in the channel
|
|
107
107
|
- `discord.commandUserId` restricts `!claw <command>` admin commands to a single Discord user ID (all other messages still use normal chat handling)
|
|
108
108
|
- `discord.commandsOnly` optional hard mode: if `true`, the bot ignores non-`!claw` messages and only accepts prefixed commands (optionally limited by `discord.commandUserId`)
|
|
109
|
+
- `discord.groupPolicy` controls guild channel scope: `open` (default), `allowlist`, or `disabled`
|
|
110
|
+
- `discord.freeResponseChannels` is a Hermes-style channel ID list that gets free-response behavior while other channels remain mention-gated
|
|
111
|
+
- `discord.guilds.<guildId>.channels.<channelId>.mode` sets per-channel behavior to `off`, `mention`, or `free` (works with `allowlist` policy)
|
|
112
|
+
- Discord slash commands: `/status` and `/channel-mode <off|mention|free>` (ephemeral replies)
|
|
109
113
|
- `skills.extraDirs` adds additional enterprise/shared skill roots (lowest precedence tier)
|
|
110
114
|
- `proactive.*` controls autonomous behavior (`activeHours`, `delegation`, `autoRetry`, `ralph`)
|
|
111
115
|
- `proactive.ralph.maxIterations` enables Ralph loop (`0` off, `-1` unlimited, `>0` extra autonomous iterations before forcing completion)
|
|
@@ -349,6 +353,34 @@ System prompt assembly is handled by a formal hook pipeline:
|
|
|
349
353
|
|
|
350
354
|
Hook toggles live in `config.json` under `promptHooks`.
|
|
351
355
|
|
|
356
|
+
## Testing
|
|
357
|
+
|
|
358
|
+
Run checks locally:
|
|
359
|
+
|
|
360
|
+
```bash
|
|
361
|
+
# Typecheck only (no emit)
|
|
362
|
+
npm run typecheck
|
|
363
|
+
|
|
364
|
+
# Strict TS lint gate (unused locals/params)
|
|
365
|
+
npm run lint
|
|
366
|
+
|
|
367
|
+
# Unit tests (default `npm test`)
|
|
368
|
+
npm run test:unit
|
|
369
|
+
|
|
370
|
+
# Scoped suites (ready for dedicated tests)
|
|
371
|
+
npm run test:integration
|
|
372
|
+
npm run test:e2e
|
|
373
|
+
npm run test:live
|
|
374
|
+
```
|
|
375
|
+
|
|
376
|
+
Test layout and scopes:
|
|
377
|
+
|
|
378
|
+
- tests live under `tests/` (not `src/`)
|
|
379
|
+
- unit tests: `tests/**/*.test.ts` (excluding `*.integration|*.e2e|*.live`)
|
|
380
|
+
- integration tests: `tests/**/*.integration.test.ts`
|
|
381
|
+
- e2e tests: `tests/**/*.e2e.test.ts`
|
|
382
|
+
- live tests: `tests/**/*.live.test.ts`
|
|
383
|
+
|
|
352
384
|
## Commands
|
|
353
385
|
|
|
354
386
|
CLI runtime commands:
|
|
@@ -379,12 +411,14 @@ In Discord, use `!claw help` to see all commands. Key ones:
|
|
|
379
411
|
## Project structure
|
|
380
412
|
|
|
381
413
|
```
|
|
382
|
-
src/gateway.ts
|
|
383
|
-
src/tui.ts
|
|
384
|
-
src/discord.ts
|
|
385
|
-
src/
|
|
386
|
-
src/gateway-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
414
|
+
src/gateway.ts Core runtime entrypoint (DB, scheduler, heartbeat, HTTP API)
|
|
415
|
+
src/tui.ts Terminal adapter (thin client to gateway)
|
|
416
|
+
src/channels/discord/runtime.ts Discord runtime integration and message transport
|
|
417
|
+
src/channels/discord/*.ts Discord responsibility modules (inbound, delivery, mentions, attachments, tools, stream)
|
|
418
|
+
src/gateway-service.ts Core shared agent/session logic used by gateway API
|
|
419
|
+
src/gateway-client.ts HTTP client used by thin clients (e.g. TUI)
|
|
420
|
+
tests/ Vitest suites (unit/integration/e2e/live scopes)
|
|
421
|
+
container/src/ Agent code (tools, HybridAI client, IPC)
|
|
422
|
+
templates/ Workspace bootstrap files
|
|
423
|
+
data/ Runtime data (gitignored): SQLite DB, sessions, agent workspaces
|
|
390
424
|
```
|
package/config.example.json
CHANGED
|
@@ -15,7 +15,10 @@
|
|
|
15
15
|
"presenceIntent": false,
|
|
16
16
|
"respondToAllMessages": false,
|
|
17
17
|
"commandsOnly": false,
|
|
18
|
-
"commandUserId": ""
|
|
18
|
+
"commandUserId": "",
|
|
19
|
+
"groupPolicy": "open",
|
|
20
|
+
"freeResponseChannels": [],
|
|
21
|
+
"guilds": {}
|
|
19
22
|
},
|
|
20
23
|
"hybridai": {
|
|
21
24
|
"baseUrl": "https://hybridai.one",
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "hybridclaw-agent",
|
|
3
|
-
"version": "0.2.1",
|
|
3
|
+
"version": "0.2.3",
|
|
4
4
|
"lockfileVersion": 3,
|
|
5
5
|
"requires": true,
|
|
6
6
|
"packages": {
|
|
7
7
|
"": {
|
|
8
8
|
"name": "hybridclaw-agent",
|
|
9
|
-
"version": "0.2.1",
|
|
9
|
+
"version": "0.2.3",
|
|
10
10
|
"dependencies": {
|
|
11
11
|
"@mozilla/readability": "^0.6.0",
|
|
12
12
|
"agent-browser": "^0.15.1",
|
package/container/package.json
CHANGED
|
@@ -1199,7 +1199,7 @@ export const BROWSER_TOOL_DEFINITIONS: ToolDefinition[] = [
|
|
|
1199
1199
|
function: {
|
|
1200
1200
|
name: 'browser_vision',
|
|
1201
1201
|
description:
|
|
1202
|
-
'Capture
|
|
1202
|
+
'Capture the current browser page screenshot and analyze it with a vision model. Use only for active browser-tab/page tasks, not for Discord-uploaded files.',
|
|
1203
1203
|
parameters: {
|
|
1204
1204
|
type: 'object',
|
|
1205
1205
|
properties: {
|
package/container/src/index.ts
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
import path from 'path';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import { URL } from 'url';
|
|
2
4
|
|
|
3
5
|
import { emitRuntimeEvent, runAfterToolHooks, runBeforeToolHooks } from './extensions.js';
|
|
4
6
|
import { callHybridAI, callHybridAIStream, HybridAIRequestError } from './hybridai-client.js';
|
|
@@ -15,12 +17,23 @@ import {
|
|
|
15
17
|
getPendingSideEffects,
|
|
16
18
|
resetSideEffects,
|
|
17
19
|
setGatewayContext,
|
|
20
|
+
setMediaContext,
|
|
18
21
|
setModelContext,
|
|
19
22
|
setScheduledTasks,
|
|
20
23
|
setSessionContext,
|
|
21
24
|
TOOL_DEFINITIONS,
|
|
22
25
|
} from './tools.js';
|
|
23
|
-
import type {
|
|
26
|
+
import type {
|
|
27
|
+
ArtifactMetadata,
|
|
28
|
+
ChatContentPart,
|
|
29
|
+
ChatMessage,
|
|
30
|
+
ChatMessageContent,
|
|
31
|
+
ContainerInput,
|
|
32
|
+
ContainerOutput,
|
|
33
|
+
MediaContextItem,
|
|
34
|
+
ToolDefinition,
|
|
35
|
+
ToolExecution,
|
|
36
|
+
} from './types.js';
|
|
24
37
|
|
|
25
38
|
const MAX_ITERATIONS = 20;
|
|
26
39
|
const IDLE_TIMEOUT_MS = parseInt(process.env.CONTAINER_IDLE_TIMEOUT || '300000', 10); // 5 min
|
|
@@ -45,10 +58,159 @@ const ARTIFACT_MIME_TYPES: Record<string, string> = {
|
|
|
45
58
|
'.svg': 'image/svg+xml',
|
|
46
59
|
'.webp': 'image/webp',
|
|
47
60
|
};
|
|
61
|
+
const DISCORD_MEDIA_CACHE_ROOT = '/discord-media-cache';
|
|
62
|
+
const NATIVE_VISION_MAX_IMAGES = 8;
|
|
63
|
+
const NATIVE_VISION_MAX_IMAGE_BYTES = 10 * 1024 * 1024;
|
|
64
|
+
const DISCORD_CDN_HOST_PATTERNS: RegExp[] = [
|
|
65
|
+
/^cdn\.discordapp\.com$/i,
|
|
66
|
+
/^media\.discordapp\.net$/i,
|
|
67
|
+
/^cdn\.discordapp\.net$/i,
|
|
68
|
+
/^images-ext-\d+\.discordapp\.net$/i,
|
|
69
|
+
];
|
|
48
70
|
|
|
49
71
|
/** API key received once via stdin, held in memory for the container lifetime. */
|
|
50
72
|
let storedApiKey = '';
|
|
51
73
|
|
|
74
|
+
function normalizeMessageContentToText(content: ChatMessageContent): string {
|
|
75
|
+
if (typeof content === 'string') return content;
|
|
76
|
+
if (!Array.isArray(content)) return '';
|
|
77
|
+
const chunks: string[] = [];
|
|
78
|
+
for (const part of content) {
|
|
79
|
+
if (!part || typeof part !== 'object') continue;
|
|
80
|
+
if (part.type !== 'text') continue;
|
|
81
|
+
if (typeof part.text !== 'string') continue;
|
|
82
|
+
if (part.text.trim()) chunks.push(part.text.trim());
|
|
83
|
+
}
|
|
84
|
+
return chunks.join('\n').trim();
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
function normalizePathSlashes(raw: string): string {
|
|
88
|
+
return raw.replace(/\\/g, '/');
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
function normalizeAllowedLocalImagePath(rawPath: string): string | null {
|
|
92
|
+
const trimmed = rawPath.trim();
|
|
93
|
+
if (!trimmed) return null;
|
|
94
|
+
|
|
95
|
+
const workspace = path.posix.normalize(WORKSPACE_ROOT);
|
|
96
|
+
const mediaRoot = path.posix.normalize(DISCORD_MEDIA_CACHE_ROOT);
|
|
97
|
+
|
|
98
|
+
const candidate = trimmed.startsWith('/')
|
|
99
|
+
? path.posix.normalize(normalizePathSlashes(trimmed))
|
|
100
|
+
: path.posix.normalize(path.posix.join(workspace, normalizePathSlashes(trimmed)));
|
|
101
|
+
|
|
102
|
+
const underWorkspace = candidate === workspace || candidate.startsWith(`${workspace}/`);
|
|
103
|
+
const underMediaRoot = candidate === mediaRoot || candidate.startsWith(`${mediaRoot}/`);
|
|
104
|
+
if (!underWorkspace && !underMediaRoot) return null;
|
|
105
|
+
return candidate;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
function inferImageMimeType(filePath: string, fallbackMime: string | null | undefined): string {
|
|
109
|
+
const normalizedFallback = String(fallbackMime || '').trim().toLowerCase();
|
|
110
|
+
if (normalizedFallback.startsWith('image/')) return normalizedFallback;
|
|
111
|
+
const ext = path.posix.extname(filePath).toLowerCase();
|
|
112
|
+
return ARTIFACT_MIME_TYPES[ext] || 'image/png';
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
function isSafeDiscordCdnUrl(raw: string): boolean {
|
|
116
|
+
let parsed: URL;
|
|
117
|
+
try {
|
|
118
|
+
parsed = new URL(raw);
|
|
119
|
+
} catch {
|
|
120
|
+
return false;
|
|
121
|
+
}
|
|
122
|
+
if (parsed.protocol !== 'https:') return false;
|
|
123
|
+
return DISCORD_CDN_HOST_PATTERNS.some((pattern) => pattern.test(parsed.hostname));
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
function modelSupportsNativeVision(model: string): boolean {
|
|
127
|
+
const normalized = model.toLowerCase();
|
|
128
|
+
if (!normalized) return false;
|
|
129
|
+
if (
|
|
130
|
+
normalized.includes('gpt-5')
|
|
131
|
+
|| normalized.includes('gpt-4o')
|
|
132
|
+
|| normalized.includes('gpt-4.1')
|
|
133
|
+
|| normalized.includes('o1')
|
|
134
|
+
|| normalized.includes('o3')
|
|
135
|
+
|| normalized.includes('vision')
|
|
136
|
+
|| normalized.includes('multimodal')
|
|
137
|
+
|| normalized.includes('gemini')
|
|
138
|
+
|| normalized.includes('claude-3')
|
|
139
|
+
) {
|
|
140
|
+
return true;
|
|
141
|
+
}
|
|
142
|
+
return false;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
async function resolveMediaImagePartUrl(item: MediaContextItem): Promise<string | null> {
|
|
146
|
+
const localPath = item.path ? normalizeAllowedLocalImagePath(item.path) : null;
|
|
147
|
+
if (localPath) {
|
|
148
|
+
try {
|
|
149
|
+
const image = await fs.promises.readFile(localPath);
|
|
150
|
+
if (image.length > NATIVE_VISION_MAX_IMAGE_BYTES) {
|
|
151
|
+
console.error(`[media] skipping ${localPath}: ${image.length}B exceeds native vision max`);
|
|
152
|
+
} else {
|
|
153
|
+
const mimeType = inferImageMimeType(localPath, item.mimeType);
|
|
154
|
+
const base64 = image.toString('base64');
|
|
155
|
+
return `data:${mimeType};base64,${base64}`;
|
|
156
|
+
}
|
|
157
|
+
} catch (err) {
|
|
158
|
+
console.error(`[media] failed to read local media ${localPath}: ${err instanceof Error ? err.message : String(err)}`);
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
const fallbackCandidates = [item.url, item.originalUrl].map((value) => String(value || '').trim()).filter(Boolean);
|
|
163
|
+
for (const candidate of fallbackCandidates) {
|
|
164
|
+
if (!isSafeDiscordCdnUrl(candidate)) continue;
|
|
165
|
+
return candidate;
|
|
166
|
+
}
|
|
167
|
+
return null;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
async function injectNativeVisionContent(
|
|
171
|
+
messages: ChatMessage[],
|
|
172
|
+
model: string,
|
|
173
|
+
media: MediaContextItem[] | undefined,
|
|
174
|
+
): Promise<ChatMessage[]> {
|
|
175
|
+
if (!Array.isArray(media) || media.length === 0) return messages;
|
|
176
|
+
if (!modelSupportsNativeVision(model)) return messages;
|
|
177
|
+
|
|
178
|
+
const mediaSlice = media.slice(0, NATIVE_VISION_MAX_IMAGES);
|
|
179
|
+
const imageParts: ChatContentPart[] = [];
|
|
180
|
+
for (const item of mediaSlice) {
|
|
181
|
+
const url = await resolveMediaImagePartUrl(item);
|
|
182
|
+
if (!url) continue;
|
|
183
|
+
imageParts.push({ type: 'image_url', image_url: { url } });
|
|
184
|
+
}
|
|
185
|
+
if (imageParts.length === 0) return messages;
|
|
186
|
+
|
|
187
|
+
const latestUserIndex = (() => {
|
|
188
|
+
for (let i = messages.length - 1; i >= 0; i -= 1) {
|
|
189
|
+
if (messages[i].role === 'user') return i;
|
|
190
|
+
}
|
|
191
|
+
return -1;
|
|
192
|
+
})();
|
|
193
|
+
if (latestUserIndex < 0) return messages;
|
|
194
|
+
|
|
195
|
+
const cloned = messages.map((msg) => ({ ...msg }));
|
|
196
|
+
const existingText = normalizeMessageContentToText(cloned[latestUserIndex].content);
|
|
197
|
+
const contentParts: ChatContentPart[] = [];
|
|
198
|
+
const nativeVisionHint =
|
|
199
|
+
'[NativeVision] Image parts are attached in this message. Analyze them directly and skip extra vision tool pre-analysis unless explicitly required.';
|
|
200
|
+
if (existingText) {
|
|
201
|
+
contentParts.push({ type: 'text', text: `${existingText}\n\n${nativeVisionHint}` });
|
|
202
|
+
} else {
|
|
203
|
+
contentParts.push({ type: 'text', text: nativeVisionHint });
|
|
204
|
+
}
|
|
205
|
+
contentParts.push(...imageParts);
|
|
206
|
+
cloned[latestUserIndex] = {
|
|
207
|
+
...cloned[latestUserIndex],
|
|
208
|
+
content: contentParts,
|
|
209
|
+
};
|
|
210
|
+
console.error(`[media] injected ${imageParts.length} native vision image part(s) for model ${model}`);
|
|
211
|
+
return cloned;
|
|
212
|
+
}
|
|
213
|
+
|
|
52
214
|
/**
|
|
53
215
|
* Read a single line from stdin (the initial request JSON containing secrets).
|
|
54
216
|
* Resolves on the first newline — does not consume the entire stream, so docker -i
|
|
@@ -104,20 +266,21 @@ function latestUserPrompt(messages: ChatMessage[]): string {
|
|
|
104
266
|
for (let i = messages.length - 1; i >= 0; i--) {
|
|
105
267
|
const message = messages[i];
|
|
106
268
|
if (message.role !== 'user') continue;
|
|
107
|
-
const text =
|
|
269
|
+
const text = normalizeMessageContentToText(message.content).replace(/\s+/g, ' ').trim();
|
|
108
270
|
if (!text) continue;
|
|
109
271
|
return text.slice(0, 1_200);
|
|
110
272
|
}
|
|
111
273
|
return 'Continue the task';
|
|
112
274
|
}
|
|
113
275
|
|
|
114
|
-
function parseRalphChoice(content: string | null): 'CONTINUE' | 'STOP' | null {
|
|
115
|
-
|
|
276
|
+
function parseRalphChoice(content: ChatMessageContent): 'CONTINUE' | 'STOP' | null {
|
|
277
|
+
const normalizedContent = normalizeMessageContentToText(content);
|
|
278
|
+
if (!normalizedContent) return null;
|
|
116
279
|
const re = /<choice>\s*([^<]*)\s*<\/choice>/gi;
|
|
117
280
|
let match: RegExpExecArray | null = null;
|
|
118
281
|
let lastChoice: string | null = null;
|
|
119
282
|
while (true) {
|
|
120
|
-
match = re.exec(
|
|
283
|
+
match = re.exec(normalizedContent);
|
|
121
284
|
if (!match) break;
|
|
122
285
|
lastChoice = (match[1] || '').trim().toUpperCase();
|
|
123
286
|
}
|
|
@@ -125,13 +288,14 @@ function parseRalphChoice(content: string | null): 'CONTINUE' | 'STOP' | null {
|
|
|
125
288
|
return null;
|
|
126
289
|
}
|
|
127
290
|
|
|
128
|
-
function stripRalphChoiceTags(content:
|
|
129
|
-
|
|
130
|
-
|
|
291
|
+
function stripRalphChoiceTags(content: ChatMessageContent): string | null {
|
|
292
|
+
const normalizedContent = normalizeMessageContentToText(content);
|
|
293
|
+
if (!normalizedContent) return null;
|
|
294
|
+
const stripped = normalizedContent
|
|
131
295
|
.replace(/<choice>\s*[^<]*\s*<\/choice>/gi, '')
|
|
132
296
|
.replace(/\n{3,}/g, '\n\n')
|
|
133
297
|
.trim();
|
|
134
|
-
return stripped ||
|
|
298
|
+
return stripped || normalizedContent;
|
|
135
299
|
}
|
|
136
300
|
|
|
137
301
|
function buildRalphPrompt(taskPrompt: string, missingChoice: boolean): string {
|
|
@@ -487,14 +651,35 @@ async function processRequest(
|
|
|
487
651
|
* Main loop: read first request from stdin (with secrets), then poll IPC for follow-ups.
|
|
488
652
|
*/
|
|
489
653
|
function resolveTools(input: ContainerInput): ToolDefinition[] {
|
|
490
|
-
|
|
654
|
+
let tools = input.allowedTools
|
|
491
655
|
? TOOL_DEFINITIONS.filter((t) => input.allowedTools!.includes(t.function.name))
|
|
492
656
|
: [...TOOL_DEFINITIONS];
|
|
657
|
+
if (Array.isArray(input.blockedTools) && input.blockedTools.length > 0) {
|
|
658
|
+
const blocked = new Set(
|
|
659
|
+
input.blockedTools
|
|
660
|
+
.map((name) => String(name || '').trim())
|
|
661
|
+
.filter(Boolean),
|
|
662
|
+
);
|
|
663
|
+
tools = tools.filter((tool) => !blocked.has(tool.function.name));
|
|
664
|
+
}
|
|
493
665
|
// Sort alphabetically for deterministic system-prompt ordering (KV cache stability)
|
|
494
666
|
tools.sort((a, b) => a.function.name.localeCompare(b.function.name));
|
|
495
667
|
return tools;
|
|
496
668
|
}
|
|
497
669
|
|
|
670
|
+
function shouldRetryWithoutNativeVision(error: string | undefined): boolean {
|
|
671
|
+
const normalized = String(error || '').toLowerCase();
|
|
672
|
+
if (!normalized) return false;
|
|
673
|
+
return (
|
|
674
|
+
normalized.includes('image_url')
|
|
675
|
+
|| normalized.includes('unsupported image')
|
|
676
|
+
|| normalized.includes('unsupported content')
|
|
677
|
+
|| normalized.includes('vision')
|
|
678
|
+
|| normalized.includes('multimodal')
|
|
679
|
+
|| normalized.includes('content part')
|
|
680
|
+
);
|
|
681
|
+
}
|
|
682
|
+
|
|
498
683
|
async function main(): Promise<void> {
|
|
499
684
|
console.error(`[hybridclaw-agent] started, idle timeout ${IDLE_TIMEOUT_MS}ms`);
|
|
500
685
|
|
|
@@ -510,9 +695,15 @@ async function main(): Promise<void> {
|
|
|
510
695
|
setSessionContext(firstInput.sessionId);
|
|
511
696
|
setGatewayContext(firstInput.gatewayBaseUrl, firstInput.gatewayApiToken, firstInput.channelId);
|
|
512
697
|
setModelContext(firstInput.baseUrl, storedApiKey, firstInput.model, firstInput.chatbotId);
|
|
513
|
-
|
|
514
|
-
const
|
|
698
|
+
setMediaContext(firstInput.media);
|
|
699
|
+
const firstMessages = await injectNativeVisionContent(
|
|
515
700
|
firstInput.messages,
|
|
701
|
+
firstInput.model,
|
|
702
|
+
firstInput.media,
|
|
703
|
+
);
|
|
704
|
+
|
|
705
|
+
let firstOutput = await processRequest(
|
|
706
|
+
firstMessages,
|
|
516
707
|
storedApiKey,
|
|
517
708
|
firstInput.baseUrl,
|
|
518
709
|
firstInput.model,
|
|
@@ -520,6 +711,22 @@ async function main(): Promise<void> {
|
|
|
520
711
|
firstInput.enableRag,
|
|
521
712
|
resolveTools(firstInput),
|
|
522
713
|
);
|
|
714
|
+
if (
|
|
715
|
+
firstMessages !== firstInput.messages
|
|
716
|
+
&& firstOutput.status === 'error'
|
|
717
|
+
&& shouldRetryWithoutNativeVision(firstOutput.error)
|
|
718
|
+
) {
|
|
719
|
+
console.error('[media] native vision injection rejected by model; retrying without image parts');
|
|
720
|
+
firstOutput = await processRequest(
|
|
721
|
+
firstInput.messages,
|
|
722
|
+
storedApiKey,
|
|
723
|
+
firstInput.baseUrl,
|
|
724
|
+
firstInput.model,
|
|
725
|
+
firstInput.chatbotId,
|
|
726
|
+
firstInput.enableRag,
|
|
727
|
+
resolveTools(firstInput),
|
|
728
|
+
);
|
|
729
|
+
}
|
|
523
730
|
|
|
524
731
|
firstOutput.sideEffects = getPendingSideEffects();
|
|
525
732
|
writeOutput(firstOutput);
|
|
@@ -544,9 +751,15 @@ async function main(): Promise<void> {
|
|
|
544
751
|
setSessionContext(input.sessionId);
|
|
545
752
|
setGatewayContext(input.gatewayBaseUrl, input.gatewayApiToken, input.channelId);
|
|
546
753
|
setModelContext(input.baseUrl, apiKey, input.model, input.chatbotId);
|
|
547
|
-
|
|
548
|
-
const
|
|
754
|
+
setMediaContext(input.media);
|
|
755
|
+
const preparedMessages = await injectNativeVisionContent(
|
|
549
756
|
input.messages,
|
|
757
|
+
input.model,
|
|
758
|
+
input.media,
|
|
759
|
+
);
|
|
760
|
+
|
|
761
|
+
let output = await processRequest(
|
|
762
|
+
preparedMessages,
|
|
550
763
|
apiKey,
|
|
551
764
|
input.baseUrl,
|
|
552
765
|
input.model,
|
|
@@ -554,6 +767,22 @@ async function main(): Promise<void> {
|
|
|
554
767
|
input.enableRag,
|
|
555
768
|
resolveTools(input),
|
|
556
769
|
);
|
|
770
|
+
if (
|
|
771
|
+
preparedMessages !== input.messages
|
|
772
|
+
&& output.status === 'error'
|
|
773
|
+
&& shouldRetryWithoutNativeVision(output.error)
|
|
774
|
+
) {
|
|
775
|
+
console.error('[media] native vision injection rejected by model; retrying without image parts');
|
|
776
|
+
output = await processRequest(
|
|
777
|
+
input.messages,
|
|
778
|
+
apiKey,
|
|
779
|
+
input.baseUrl,
|
|
780
|
+
input.model,
|
|
781
|
+
input.chatbotId,
|
|
782
|
+
input.enableRag,
|
|
783
|
+
resolveTools(input),
|
|
784
|
+
);
|
|
785
|
+
}
|
|
557
786
|
|
|
558
787
|
output.sideEffects = getPendingSideEffects();
|
|
559
788
|
writeOutput(output);
|
|
@@ -26,12 +26,28 @@ export function createTokenUsageStats(): TokenUsageStats {
|
|
|
26
26
|
};
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
-
export function estimateTextTokens(text:
|
|
29
|
+
export function estimateTextTokens(text: unknown): number {
|
|
30
30
|
const normalized = typeof text === 'string' ? text : '';
|
|
31
31
|
if (!normalized) return 0;
|
|
32
32
|
return Math.max(1, Math.ceil(normalized.length / CHARS_PER_TOKEN));
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
+
function normalizeContentText(content: ChatMessage['content']): string {
|
|
36
|
+
if (typeof content === 'string') return content;
|
|
37
|
+
if (!Array.isArray(content)) return '';
|
|
38
|
+
const chunks: string[] = [];
|
|
39
|
+
for (const part of content) {
|
|
40
|
+
if (part?.type === 'text' && typeof part.text === 'string') {
|
|
41
|
+
chunks.push(part.text);
|
|
42
|
+
continue;
|
|
43
|
+
}
|
|
44
|
+
if (part?.type === 'image_url' && part.image_url?.url) {
|
|
45
|
+
chunks.push('[image]');
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
return chunks.join('\n');
|
|
49
|
+
}
|
|
50
|
+
|
|
35
51
|
export function estimateMessageTokens(messages: ChatMessage[]): number {
|
|
36
52
|
if (!Array.isArray(messages) || messages.length === 0) return 0;
|
|
37
53
|
|
|
@@ -39,7 +55,7 @@ export function estimateMessageTokens(messages: ChatMessage[]): number {
|
|
|
39
55
|
for (const message of messages) {
|
|
40
56
|
total += 4;
|
|
41
57
|
total += estimateTextTokens(message.role);
|
|
42
|
-
total += estimateTextTokens(message.content);
|
|
58
|
+
total += estimateTextTokens(normalizeContentText(message.content));
|
|
43
59
|
if (message.tool_calls) total += estimateTextTokens(JSON.stringify(message.tool_calls));
|
|
44
60
|
if (message.tool_call_id) total += estimateTextTokens(message.tool_call_id);
|
|
45
61
|
}
|