@swarmclawai/swarmclaw 0.6.3 → 0.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106):
  1. package/README.md +5 -3
  2. package/package.json +5 -1
  3. package/src/app/api/chatrooms/[id]/chat/route.ts +41 -2
  4. package/src/app/api/chatrooms/[id]/route.ts +15 -1
  5. package/src/app/api/chatrooms/route.ts +15 -2
  6. package/src/app/api/schedules/[id]/run/route.ts +3 -0
  7. package/src/app/api/tasks/route.ts +24 -0
  8. package/src/app/api/wallets/[id]/approve/route.ts +62 -0
  9. package/src/app/api/wallets/[id]/balance-history/route.ts +18 -0
  10. package/src/app/api/wallets/[id]/route.ts +118 -0
  11. package/src/app/api/wallets/[id]/send/route.ts +118 -0
  12. package/src/app/api/wallets/[id]/transactions/route.ts +18 -0
  13. package/src/app/api/wallets/route.ts +74 -0
  14. package/src/app/globals.css +8 -0
  15. package/src/app/page.tsx +7 -3
  16. package/src/cli/index.js +15 -0
  17. package/src/cli/spec.js +14 -0
  18. package/src/components/agents/agent-avatar.tsx +15 -1
  19. package/src/components/agents/agent-card.tsx +1 -0
  20. package/src/components/agents/agent-chat-list.tsx +1 -1
  21. package/src/components/agents/agent-sheet.tsx +112 -26
  22. package/src/components/auth/access-key-gate.tsx +22 -11
  23. package/src/components/chat/chat-area.tsx +2 -2
  24. package/src/components/chat/chat-header.tsx +48 -19
  25. package/src/components/chat/chat-tool-toggles.tsx +1 -1
  26. package/src/components/chat/delegation-banner.test.ts +27 -0
  27. package/src/components/chat/delegation-banner.tsx +109 -23
  28. package/src/components/chat/message-bubble.tsx +14 -3
  29. package/src/components/chat/message-list.tsx +5 -4
  30. package/src/components/chat/streaming-bubble.tsx +3 -2
  31. package/src/components/chat/thinking-indicator.tsx +3 -2
  32. package/src/components/chat/tool-call-bubble.test.ts +28 -0
  33. package/src/components/chat/tool-call-bubble.tsx +13 -1
  34. package/src/components/chat/transfer-agent-picker.tsx +1 -1
  35. package/src/components/chatrooms/agent-hover-card.tsx +1 -1
  36. package/src/components/chatrooms/chatroom-input.tsx +7 -6
  37. package/src/components/chatrooms/chatroom-message.tsx +1 -1
  38. package/src/components/chatrooms/chatroom-sheet.tsx +1 -1
  39. package/src/components/chatrooms/chatroom-typing-bar.tsx +1 -1
  40. package/src/components/chatrooms/chatroom-view.tsx +1 -1
  41. package/src/components/connectors/connector-list.tsx +1 -1
  42. package/src/components/home/home-view.tsx +2 -1
  43. package/src/components/input/chat-input.tsx +5 -4
  44. package/src/components/knowledge/knowledge-list.tsx +1 -1
  45. package/src/components/knowledge/knowledge-sheet.tsx +1 -1
  46. package/src/components/layout/app-layout.tsx +23 -9
  47. package/src/components/logs/log-list.tsx +7 -7
  48. package/src/components/memory/memory-agent-list.tsx +1 -1
  49. package/src/components/memory/memory-browser.tsx +1 -0
  50. package/src/components/memory/memory-card.tsx +3 -2
  51. package/src/components/memory/memory-detail.tsx +3 -3
  52. package/src/components/memory/memory-sheet.tsx +2 -2
  53. package/src/components/projects/project-detail.tsx +4 -4
  54. package/src/components/secrets/secret-sheet.tsx +1 -1
  55. package/src/components/secrets/secrets-list.tsx +1 -1
  56. package/src/components/sessions/new-session-sheet.tsx +4 -3
  57. package/src/components/sessions/session-card.tsx +1 -1
  58. package/src/components/shared/agent-picker-list.tsx +1 -1
  59. package/src/components/shared/agent-switch-dialog.tsx +1 -1
  60. package/src/components/shared/settings/section-user-preferences.tsx +4 -4
  61. package/src/components/skills/skill-list.tsx +1 -1
  62. package/src/components/skills/skill-sheet.tsx +1 -1
  63. package/src/components/tasks/task-board.tsx +3 -3
  64. package/src/components/tasks/task-sheet.tsx +21 -1
  65. package/src/components/wallets/wallet-approval-dialog.tsx +99 -0
  66. package/src/components/wallets/wallet-panel.tsx +616 -0
  67. package/src/components/wallets/wallet-section.tsx +100 -0
  68. package/src/hooks/use-media-query.ts +30 -4
  69. package/src/lib/api-client.ts +6 -18
  70. package/src/lib/fetch-timeout.ts +17 -0
  71. package/src/lib/notification-sounds.ts +4 -4
  72. package/src/lib/safe-storage.ts +42 -0
  73. package/src/lib/server/agent-registry.ts +2 -2
  74. package/src/lib/server/chat-execution.ts +35 -3
  75. package/src/lib/server/chatroom-health.ts +60 -0
  76. package/src/lib/server/chatroom-helpers.test.ts +94 -0
  77. package/src/lib/server/chatroom-helpers.ts +64 -11
  78. package/src/lib/server/connectors/inbound-audio-transcription.test.ts +191 -0
  79. package/src/lib/server/connectors/inbound-audio-transcription.ts +261 -0
  80. package/src/lib/server/connectors/manager.ts +80 -2
  81. package/src/lib/server/connectors/whatsapp-text.test.ts +29 -0
  82. package/src/lib/server/connectors/whatsapp-text.ts +26 -0
  83. package/src/lib/server/connectors/whatsapp.ts +8 -5
  84. package/src/lib/server/orchestrator-lg.ts +12 -2
  85. package/src/lib/server/orchestrator.ts +6 -1
  86. package/src/lib/server/queue-followups.test.ts +224 -0
  87. package/src/lib/server/queue.ts +226 -24
  88. package/src/lib/server/scheduler.ts +3 -0
  89. package/src/lib/server/session-tools/chatroom.ts +11 -2
  90. package/src/lib/server/session-tools/context-mgmt.ts +2 -2
  91. package/src/lib/server/session-tools/index.ts +6 -2
  92. package/src/lib/server/session-tools/memory.ts +1 -1
  93. package/src/lib/server/session-tools/shell.ts +1 -1
  94. package/src/lib/server/session-tools/wallet.ts +124 -0
  95. package/src/lib/server/session-tools/web-output.test.ts +29 -0
  96. package/src/lib/server/session-tools/web-output.ts +16 -0
  97. package/src/lib/server/session-tools/web.ts +7 -3
  98. package/src/lib/server/solana.ts +122 -0
  99. package/src/lib/server/storage.ts +38 -0
  100. package/src/lib/server/stream-agent-chat.ts +126 -63
  101. package/src/lib/server/task-mention.test.ts +41 -0
  102. package/src/lib/server/task-mention.ts +3 -2
  103. package/src/lib/tool-definitions.ts +1 -0
  104. package/src/lib/view-routes.ts +6 -1
  105. package/src/stores/use-app-store.ts +17 -11
  106. package/src/types/index.ts +60 -1
@@ -0,0 +1,191 @@
1
+ import { afterEach, beforeEach, describe, it } from 'node:test'
2
+ import assert from 'node:assert/strict'
3
+ import fs from 'node:fs'
4
+ import path from 'node:path'
5
+ import { enrichInboundMessageWithAudioTranscript } from './inbound-audio-transcription'
6
+ import type { InboundMessage } from './types'
7
+ import { UPLOAD_DIR, loadSettings, saveSettings } from '../storage'
8
+
9
// Environment variables that the hooks below snapshot before each test and
// restore afterwards, so individual tests may freely override them.
const ENV_KEYS = [
  'OPENAI_API_KEY',
  'SWARMCLAW_OPENAI_STT_API_KEY',
  'SWARMCLAW_OPENAI_STT_BASE_URL',
  'SWARMCLAW_OPENAI_STT_MODEL',
  'SWARMCLAW_ELEVENLABS_STT_MODEL',
  'SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE',
  'SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE_TIMEOUT_MS',
  'SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE_MAX_BYTES',
  'ELEVENLABS_API_KEY',
] as const

// One slot per tracked variable; `undefined` means "was not set".
type EnvSnapshot = { [K in (typeof ENV_KEYS)[number]]: string | undefined }

// State captured in beforeEach and put back in afterEach.
let originalFetch: typeof fetch
let originalSettings: Record<string, unknown>
let originalEnv: EnvSnapshot

// Fixture files written during the current test; removed in afterEach.
let tempFiles: string[] = []
27
+
28
/**
 * Assign one of the tracked environment variables.
 * Passing `undefined` removes the variable instead of storing a value.
 */
function setEnv(name: (typeof ENV_KEYS)[number], value: string | undefined): void {
  if (value !== undefined) {
    process.env[name] = value
    return
  }
  delete process.env[name]
}
32
+
33
/**
 * Write a small dummy `.ogg` file into UPLOAD_DIR and remember it for cleanup.
 *
 * @param name Label embedded in the file name (after a timestamp prefix).
 * @returns Path of the file that was written.
 */
function createAudioFixture(name: string): string {
  fs.mkdirSync(UPLOAD_DIR, { recursive: true })
  const fixturePath = path.join(UPLOAD_DIR, `${Date.now()}-${name}.ogg`)
  // The content is never decoded; tests only need a non-empty file on disk.
  fs.writeFileSync(fixturePath, 'voice-note-bytes')
  tempFiles.push(fixturePath)
  return fixturePath
}
40
+
41
/**
 * Build a WhatsApp inbound message carrying a single audio attachment.
 *
 * @param localPath Path stored on the audio attachment.
 * @param text Message text; defaults to the '(media message)' placeholder.
 */
function buildInboundMessage(localPath: string, text = '(media message)'): InboundMessage {
  // Channel and sender share the same id in this fixture.
  const whatsappId = '15550001111@s.whatsapp.net'
  const message: InboundMessage = {
    platform: 'whatsapp',
    channelId: whatsappId,
    senderId: whatsappId,
    senderName: 'Tester',
    text,
    media: [
      {
        type: 'audio',
        localPath,
        mimeType: 'audio/ogg',
        fileName: 'voice.ogg',
      },
    ],
  }
  return message
}
51
+
52
// Capture global state that tests mutate, then start every test from a clean
// environment so results do not depend on what the developer/CI has exported.
beforeEach(() => {
  originalFetch = global.fetch
  originalSettings = loadSettings()
  originalEnv = Object.fromEntries(
    ENV_KEYS.map((key) => [key, process.env[key]]),
  ) as EnvSnapshot
  // Without this, an ambient ELEVENLABS_API_KEY or SWARMCLAW_* override would
  // leak into the code under test (for example adding a second provider
  // attempt). The snapshot above is restored in afterEach.
  for (const key of ENV_KEYS) setEnv(key, undefined)
  tempFiles = []
})
60
+
61
// Undo everything a test may have touched: the fetch mock, persisted settings,
// tracked environment variables, and fixture files written to disk.
afterEach(() => {
  global.fetch = originalFetch
  saveSettings(originalSettings)
  ENV_KEYS.forEach((key) => {
    setEnv(key, originalEnv[key])
  })
  tempFiles.forEach((fixturePath) => {
    // force: true keeps cleanup quiet if the file is already gone.
    fs.rmSync(fixturePath, { force: true })
  })
})
67
+
68
describe('enrichInboundMessageWithAudioTranscript', () => {
  // Build a JSON HTTP response for the fetch mocks below.
  const jsonResponse = (body: unknown, status: number): Response =>
    new Response(JSON.stringify(body), {
      status,
      headers: { 'Content-Type': 'application/json' },
    })

  it('transcribes placeholder audio messages with OpenAI STT', async () => {
    const audioPath = createAudioFixture('openai')
    setEnv('OPENAI_API_KEY', 'openai-test-key')
    setEnv('SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE_TIMEOUT_MS', '5000')
    saveSettings({
      ...originalSettings,
      elevenLabsEnabled: false,
      elevenLabsApiKey: null,
    })

    // Record requests and assert afterwards. An assertion thrown from inside
    // the mock would be caught by the provider try/catch in the code under
    // test and surface only as an unrelated text mismatch.
    const requests: Array<{ url: string; init?: RequestInit }> = []
    global.fetch = (async (input: RequestInfo | URL, init?: RequestInit) => {
      requests.push({ url: String(input), init })
      return jsonResponse({ text: 'Please move this task to tomorrow morning.' }, 200)
    }) as typeof fetch

    const inbound = buildInboundMessage(audioPath)
    const enriched = await enrichInboundMessageWithAudioTranscript({ msg: inbound })

    assert.equal(requests.length, 1)
    const [request] = requests
    assert.ok(request.url.endsWith('/audio/transcriptions'))
    assert.equal(request.init?.method, 'POST')
    assert.equal(
      (request.init?.headers as Record<string, string>)?.Authorization,
      'Bearer openai-test-key',
    )
    assert.equal(enriched.text, 'Please move this task to tomorrow morning.')
  })

  it('tries ElevenLabs first and falls back to OpenAI when ElevenLabs fails', async () => {
    const audioPath = createAudioFixture('fallback')
    setEnv('OPENAI_API_KEY', 'openai-fallback-key')
    saveSettings({
      ...originalSettings,
      elevenLabsEnabled: true,
      elevenLabsApiKey: 'el-test-key',
    })

    const calledUrls: string[] = []
    global.fetch = (async (input: RequestInfo | URL) => {
      const url = String(input)
      calledUrls.push(url)
      if (url.includes('api.elevenlabs.io/v1/speech-to-text')) {
        return jsonResponse({ detail: 'upstream unavailable' }, 503)
      }
      if (url.endsWith('/audio/transcriptions')) {
        return jsonResponse({ text: 'Fallback transcription succeeded.' }, 200)
      }
      return new Response('unexpected url', { status: 404 })
    }) as typeof fetch

    const inbound = buildInboundMessage(audioPath)
    const enriched = await enrichInboundMessageWithAudioTranscript({ msg: inbound })

    assert.equal(enriched.text, 'Fallback transcription succeeded.')
    assert.equal(calledUrls.length, 2)
    assert.ok(calledUrls[0].includes('api.elevenlabs.io/v1/speech-to-text'))
    assert.ok(calledUrls[1].endsWith('/audio/transcriptions'))
  })

  it('skips transcription when the inbound message already has non-placeholder text', async () => {
    const audioPath = createAudioFixture('skip')
    setEnv('OPENAI_API_KEY', 'openai-test-key')

    let called = false
    global.fetch = (async () => {
      called = true
      return jsonResponse({ text: 'should not be used' }, 200)
    }) as typeof fetch

    const inbound = buildInboundMessage(audioPath, 'Already typed this manually')
    const enriched = await enrichInboundMessageWithAudioTranscript({ msg: inbound })

    assert.equal(enriched.text, 'Already typed this manually')
    assert.equal(called, false)
  })

  it('returns a clear failure note when STT providers error out', async () => {
    const audioPath = createAudioFixture('provider-error')
    setEnv('OPENAI_API_KEY', 'openai-error-key')
    saveSettings({
      ...originalSettings,
      elevenLabsEnabled: true,
      elevenLabsApiKey: 'el-error-key',
    })

    // Every provider answers with a server error.
    global.fetch = (async () => jsonResponse({ error: 'upstream down' }, 500)) as typeof fetch

    const inbound = buildInboundMessage(audioPath)
    const enriched = await enrichInboundMessageWithAudioTranscript({ msg: inbound })

    assert.ok(enriched.text.toLowerCase().includes('automatic transcription failed'))
  })

  it('returns a clear note when inbound audio cannot be loaded from disk', async () => {
    // Built by hand because the attachment must point at a missing file.
    const inbound: InboundMessage = {
      platform: 'whatsapp',
      channelId: '15550001111@s.whatsapp.net',
      senderId: '15550001111@s.whatsapp.net',
      senderName: 'Tester',
      text: '(media message)',
      media: [{ type: 'audio', localPath: '/tmp/nonexistent-voice-note.ogg', mimeType: 'audio/ogg', fileName: 'voice.ogg' }],
    }

    const enriched = await enrichInboundMessageWithAudioTranscript({ msg: inbound })
    assert.ok(enriched.text.toLowerCase().includes('audio attachment could not be loaded'))
  })
})
@@ -0,0 +1,261 @@
1
+ import fs from 'node:fs'
2
+ import path from 'node:path'
3
+ import { decryptKey, loadCredentials, loadSettings } from '../storage'
4
+ import { mimeFromPath } from './media'
5
+ import type { InboundMessage, InboundMedia } from './types'
6
+
7
// Message texts treated as "no real text": only these are replaced by a
// transcript. Compared after trimming and lower-casing.
const PLACEHOLDER_TEXT = new Set([
  '',
  '(media message)',
  '(audio message)',
  '(voice message)',
  '<media:attachment>',
])

// Defaults used when the corresponding SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE_*
// environment variables are absent or invalid.
const DEFAULT_MAX_AUDIO_BYTES = 25 * 1024 * 1024 // 25 MiB
const DEFAULT_TIMEOUT_MS = 30_000 // 30 seconds

// Notes substituted for the message text when no transcript can be produced.
const TRANSCRIPTION_UNAVAILABLE_NOTE =
  '[Voice note received — automatic transcription is unavailable (no STT provider key configured).]'
const TRANSCRIPTION_FAILED_NOTE =
  '[Voice note received — automatic transcription failed. Please check STT provider configuration/logs.]'
const AUDIO_DOWNLOAD_FAILED_NOTE =
  '[Voice note received — audio attachment could not be loaded for transcription.]'
20
+
21
/**
 * Read an environment variable as a boolean flag.
 *
 * The value is trimmed and lower-cased. Recognised truthy and falsy spellings
 * map to true/false; an unset, empty, or unrecognised value yields `fallback`.
 */
function boolFromEnv(name: string, fallback: boolean): boolean {
  const value = String(process.env[name] || '').trim().toLowerCase()
  switch (value) {
    case '1':
    case 'true':
    case 'yes':
    case 'on':
    case 'enabled':
      return true
    case '0':
    case 'false':
    case 'no':
    case 'off':
    case 'disabled':
      return false
    default:
      return fallback
  }
}
28
+
29
/**
 * Read an environment variable as a positive integer.
 *
 * Parsing is base-10 via Number.parseInt, so trailing non-digits are ignored.
 * Unset, non-numeric, zero, or negative values yield `fallback`.
 */
function numberFromEnv(name: string, fallback: number): number {
  const text = String(process.env[name] || '').trim()
  const parsed = Number.parseInt(text, 10)
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback
}
34
+
35
/**
 * Reduce a locale-like value (e.g. "en-US", "pt_BR") to its primary subtag.
 *
 * @returns The lower-cased part before the first "-" or "_" when it is 2-3
 *          ASCII letters; otherwise `undefined` (also for non-string input).
 */
function normalizeLanguageCode(raw: unknown): string | undefined {
  if (typeof raw !== 'string') return undefined
  const trimmed = raw.trim()
  if (trimmed === '') return undefined
  const [primary = ''] = trimmed.split(/[-_]/)
  const token = primary.toLowerCase()
  return /^[a-z]{2,3}$/.test(token) ? token : undefined
}
41
+
42
/** True when `text`, trimmed and lower-cased, is one of the PLACEHOLDER_TEXT entries. */
function isAudioPlaceholder(text: string): boolean {
  const normalized = text.trim().toLowerCase()
  return PLACEHOLDER_TEXT.has(normalized)
}
45
+
46
/**
 * Find the first audio attachment whose local file exists on disk.
 *
 * An attachment counts as audio when its `type` is 'audio' or its `mimeType`
 * starts with "audio/" (case-insensitive).
 *
 * @returns The matching media entry, or `null` when there is none.
 */
function pickInboundAudio(msg: InboundMessage): InboundMedia | null {
  if (!Array.isArray(msg.media) || msg.media.length === 0) return null
  const match = msg.media.find((candidate) => {
    if (!candidate) return false
    const looksLikeAudio = candidate.type === 'audio'
      || (typeof candidate.mimeType === 'string' && candidate.mimeType.toLowerCase().startsWith('audio/'))
    if (!looksLikeAudio) return false
    const candidatePath = typeof candidate.localPath === 'string' ? candidate.localPath.trim() : ''
    return candidatePath !== '' && fs.existsSync(candidatePath)
  })
  return match ?? null
}
59
+
60
/**
 * Report whether the message carries any audio attachment, regardless of
 * whether its file is present on disk.
 */
function hasInboundAudio(msg: InboundMessage): boolean {
  const attachments = msg.media
  if (!Array.isArray(attachments) || attachments.length === 0) return false
  for (const item of attachments) {
    if (item?.type === 'audio') return true
    if (typeof item?.mimeType === 'string' && item.mimeType.toLowerCase().startsWith('audio/')) return true
  }
  return false
}
65
+
66
/**
 * Pull transcript text out of a provider's JSON response.
 *
 * Prefers a non-blank top-level `text` string; otherwise joins the trimmed
 * `text` of each object in a `transcripts` array with single spaces.
 *
 * @returns The trimmed transcript, or '' when nothing usable is present.
 */
function extractTranscriptText(payload: unknown): string {
  if (!payload || typeof payload !== 'object') return ''
  const { text, transcripts } = payload as Record<string, unknown>

  if (typeof text === 'string') {
    const direct = text.trim()
    if (direct) return direct
  }

  if (!Array.isArray(transcripts)) return ''
  const pieces: string[] = []
  for (const entry of transcripts) {
    if (!entry || typeof entry !== 'object') continue
    const piece = (entry as Record<string, unknown>).text
    if (typeof piece !== 'string') continue
    const cleaned = piece.trim()
    if (cleaned) pieces.push(cleaned)
  }
  return pieces.join(' ').trim()
}
84
+
85
/**
 * Resolve an OpenAI API key for speech-to-text.
 *
 * Order: SWARMCLAW_OPENAI_STT_API_KEY, then OPENAI_API_KEY, then stored
 * credentials whose provider is "openai". The preferred credential is tried
 * first, followed by the rest in stored order.
 *
 * @param preferredCredentialId Credential id to try before the others.
 * @returns The trimmed key, or `null` when none can be resolved.
 */
function resolveOpenAiApiKey(preferredCredentialId?: string | null): string | null {
  const fromEnv = String(process.env.SWARMCLAW_OPENAI_STT_API_KEY || process.env.OPENAI_API_KEY || '').trim()
  if (fromEnv) return fromEnv

  const creds = loadCredentials() as Record<string, { provider?: string; encryptedKey?: string }>
  const isOpenAi = (id: string): boolean =>
    String(creds[id]?.provider || '').trim().toLowerCase() === 'openai'

  // A Set keeps insertion order and drops duplicates, so the preferred id
  // stays first even when it also appears among the stored credentials.
  const orderedIds = new Set<string>()
  if (preferredCredentialId) orderedIds.add(preferredCredentialId)
  Object.keys(creds).forEach((id) => {
    if (isOpenAi(id)) orderedIds.add(id)
  })

  for (const id of Array.from(orderedIds)) {
    // The preferred id is re-checked here: it may be missing or non-OpenAI.
    if (!id || !isOpenAi(id)) continue
    const encrypted = creds[id]?.encryptedKey
    if (!encrypted) continue
    try {
      const key = decryptKey(encrypted).trim()
      if (key) return key
    } catch { /* ignore invalid credential */ }
  }

  return null
}
112
+
113
/**
 * Resolve the ElevenLabs API key: the `elevenLabsApiKey` setting wins, then
 * the ELEVENLABS_API_KEY environment variable.
 *
 * NOTE(review): the `elevenLabsEnabled` setting is not consulted here —
 * confirm whether that flag is meant to gate speech-to-text as well.
 *
 * @returns The trimmed key, or `null` when neither source has one.
 */
function resolveElevenLabsKey(): string | null {
  const { elevenLabsApiKey } = loadSettings()
  const candidate = String(elevenLabsApiKey || process.env.ELEVENLABS_API_KEY || '').trim()
  return candidate === '' ? null : candidate
}
118
+
119
/**
 * Transcribe a local audio file with the ElevenLabs speech-to-text endpoint.
 *
 * The model id comes from SWARMCLAW_ELEVENLABS_STT_MODEL (default 'scribe_v1').
 *
 * @param params.apiKey    Sent as the `xi-api-key` header.
 * @param params.audioPath Local file to upload.
 * @param params.fileName  File name reported in the multipart form.
 * @param params.mimeType  Content type attached to the uploaded blob.
 * @param params.language  Optional code sent as `language_code`.
 * @param params.timeoutMs Abort the request after this many milliseconds.
 * @returns The extracted transcript; '' when the body has no usable text.
 * @throws Error on a non-2xx response (status plus the first 160 body chars);
 *         file-read, network, and timeout errors propagate as rejections.
 */
async function transcribeWithElevenLabs(params: {
  apiKey: string
  audioPath: string
  fileName: string
  mimeType: string
  language?: string
  timeoutMs: number
}): Promise<string> {
  const modelId = String(process.env.SWARMCLAW_ELEVENLABS_STT_MODEL || 'scribe_v1').trim() || 'scribe_v1'

  // Read asynchronously: uploads can be tens of megabytes and a synchronous
  // read would stall the event loop for every other connector meanwhile.
  const fileBuffer = await fs.promises.readFile(params.audioPath)

  const form = new FormData()
  form.set('model_id', modelId)
  if (params.language) form.set('language_code', params.language)
  const blob = new Blob([fileBuffer], { type: params.mimeType })
  form.set('file', blob, params.fileName)

  const response = await fetch('https://api.elevenlabs.io/v1/speech-to-text', {
    method: 'POST',
    headers: { 'xi-api-key': params.apiKey },
    body: form,
    signal: AbortSignal.timeout(params.timeoutMs),
  })
  if (!response.ok) {
    // Keep the error short; the body is only there to aid log diagnosis.
    const body = await response.text().catch(() => '')
    throw new Error(`elevenlabs stt ${response.status}: ${body.slice(0, 160)}`)
  }
  // An unparseable body is treated as "no transcript", not as an error.
  const json = await response.json().catch(() => null)
  return extractTranscriptText(json)
}
148
+
149
/**
 * Transcribe a local audio file with an OpenAI-compatible
 * `/audio/transcriptions` endpoint.
 *
 * The model comes from SWARMCLAW_OPENAI_STT_MODEL (default
 * 'gpt-4o-mini-transcribe'). The base URL comes from
 * SWARMCLAW_OPENAI_STT_BASE_URL (default 'https://api.openai.com/v1'), with
 * trailing slashes removed.
 *
 * @param params.apiKey    Sent as a Bearer token.
 * @param params.audioPath Local file to upload.
 * @param params.fileName  File name reported in the multipart form.
 * @param params.mimeType  Content type attached to the uploaded blob.
 * @param params.language  Optional code sent as `language`.
 * @param params.timeoutMs Abort the request after this many milliseconds.
 * @returns The extracted transcript; '' when the body has no usable text.
 * @throws Error on a non-2xx response (status plus the first 160 body chars);
 *         file-read, network, and timeout errors propagate as rejections.
 */
async function transcribeWithOpenAI(params: {
  apiKey: string
  audioPath: string
  fileName: string
  mimeType: string
  language?: string
  timeoutMs: number
}): Promise<string> {
  const model = String(process.env.SWARMCLAW_OPENAI_STT_MODEL || 'gpt-4o-mini-transcribe').trim() || 'gpt-4o-mini-transcribe'
  const base = String(process.env.SWARMCLAW_OPENAI_STT_BASE_URL || 'https://api.openai.com/v1').trim().replace(/\/+$/, '')

  // Read asynchronously: uploads can be tens of megabytes and a synchronous
  // read would stall the event loop for every other connector meanwhile.
  const fileBuffer = await fs.promises.readFile(params.audioPath)

  const form = new FormData()
  form.set('model', model)
  if (params.language) form.set('language', params.language)
  const blob = new Blob([fileBuffer], { type: params.mimeType })
  form.set('file', blob, params.fileName)

  const response = await fetch(`${base}/audio/transcriptions`, {
    method: 'POST',
    headers: { Authorization: `Bearer ${params.apiKey}` },
    body: form,
    signal: AbortSignal.timeout(params.timeoutMs),
  })
  if (!response.ok) {
    // Keep the error short; the body is only there to aid log diagnosis.
    const body = await response.text().catch(() => '')
    throw new Error(`openai stt ${response.status}: ${body.slice(0, 160)}`)
  }
  // An unparseable body is treated as "no transcript", not as an error.
  const json = await response.json().catch(() => null)
  return extractTranscriptText(json)
}
179
+
180
/**
 * Convert inbound audio media into text before routing to the agent.
 * This prevents "(media message)" placeholders from reaching the model.
 *
 * The message is returned unchanged when transcription is disabled via
 * SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE, when its text is not a placeholder,
 * or when it carries no audio attachment. Otherwise a copy is returned whose
 * `text` is the transcript or an explanatory note. Providers are tried in
 * order: ElevenLabs (if a key resolves), then OpenAI (if a key resolves).
 *
 * @param params.msg Inbound message to enrich; never mutated.
 * @param params.preferredCredentialId Credential tried first for OpenAI.
 * @returns The original message or a shallow copy with replaced `text`.
 */
export async function enrichInboundMessageWithAudioTranscript(params: {
  msg: InboundMessage
  preferredCredentialId?: string | null
}): Promise<InboundMessage> {
  const { msg, preferredCredentialId } = params
  if (!boolFromEnv('SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE', true)) return msg

  // Real user text always wins; only placeholders are replaced.
  const originalText = String(msg.text || '').trim()
  if (!isAudioPlaceholder(originalText)) return msg

  const inboundAudio = pickInboundAudio(msg)
  if (!inboundAudio) {
    // Audio was attached but no local file is available for it.
    if (hasInboundAudio(msg)) return { ...msg, text: AUDIO_DOWNLOAD_FAILED_NOTE }
    return msg
  }

  const localPath = String(inboundAudio.localPath || '').trim()
  if (!localPath) return { ...msg, text: AUDIO_DOWNLOAD_FAILED_NOTE }

  // The file can disappear or become unreadable between the existence check
  // and this stat. Report that as a load failure instead of throwing, which
  // would abort routing of the whole message.
  let stat: fs.Stats
  try {
    stat = fs.statSync(localPath)
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err)
    console.warn(`[connector] Inbound audio could not be read for transcription: ${reason}`)
    return { ...msg, text: AUDIO_DOWNLOAD_FAILED_NOTE }
  }

  const maxBytes = numberFromEnv('SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE_MAX_BYTES', DEFAULT_MAX_AUDIO_BYTES)
  if (!stat.isFile() || stat.size <= 0 || stat.size > maxBytes) {
    // The note below points the user at the logs, so leave a trace there.
    console.warn(
      `[connector] Inbound audio not transcribed: ${path.basename(localPath)} is not a regular file, is empty, or exceeds ${maxBytes} bytes (size=${stat.size})`,
    )
    return { ...msg, text: TRANSCRIPTION_FAILED_NOTE }
  }

  // Drop MIME parameters such as "; codecs=opus" before uploading.
  const mimeType = (inboundAudio.mimeType || mimeFromPath(localPath) || 'application/octet-stream').split(';')[0].trim()
  const fileName = inboundAudio.fileName || path.basename(localPath)
  const timeoutMs = numberFromEnv('SWARMCLAW_CONNECTOR_AUDIO_TRANSCRIBE_TIMEOUT_MS', DEFAULT_TIMEOUT_MS)
  const language = normalizeLanguageCode(loadSettings().speechRecognitionLang)

  const attempts: Array<{ provider: 'elevenlabs' | 'openai'; run: () => Promise<string> }> = []
  const elevenKey = resolveElevenLabsKey()
  if (elevenKey) {
    attempts.push({
      provider: 'elevenlabs',
      run: () => transcribeWithElevenLabs({
        apiKey: elevenKey,
        audioPath: localPath,
        fileName,
        mimeType,
        language,
        timeoutMs,
      }),
    })
  }

  const openAiKey = resolveOpenAiApiKey(preferredCredentialId)
  if (openAiKey) {
    attempts.push({
      provider: 'openai',
      run: () => transcribeWithOpenAI({
        apiKey: openAiKey,
        audioPath: localPath,
        fileName,
        mimeType,
        language,
        timeoutMs,
      }),
    })
  }

  if (attempts.length === 0) return { ...msg, text: TRANSCRIPTION_UNAVAILABLE_NOTE }

  for (const attempt of attempts) {
    try {
      // Collapse whitespace so the transcript reads as a single-line message.
      const transcript = (await attempt.run()).replace(/\s+/g, ' ').trim()
      // An empty transcript counts as a miss; fall through to the next provider.
      if (!transcript) continue
      console.log(`[connector] Inbound audio transcribed via ${attempt.provider}: ${path.basename(localPath)}`)
      return { ...msg, text: transcript }
    } catch (err: unknown) {
      // Provider errors are logged and the next provider is tried.
      const reason = err instanceof Error ? err.message : String(err)
      console.warn(`[connector] Inbound audio transcription failed via ${attempt.provider}: ${reason}`)
    }
  }

  return { ...msg, text: TRANSCRIPTION_FAILED_NOTE }
}
@@ -16,12 +16,16 @@ import { requestHeartbeatNow } from '../heartbeat-wake'
16
16
  import { buildCurrentDateTimePromptContext } from '../prompt-runtime-context'
17
17
  import {
18
18
  parseMentions,
19
+ compactChatroomMessages,
19
20
  buildChatroomSystemPrompt,
20
21
  buildSyntheticSession,
21
22
  buildAgentSystemPromptForChatroom,
22
23
  buildHistoryForAgent,
23
24
  resolveApiKey as resolveApiKeyHelper,
24
25
  } from '../chatroom-helpers'
26
+ import { filterHealthyChatroomAgents } from '../chatroom-health'
27
+ import { markProviderFailure, markProviderSuccess } from '../provider-health'
28
+ import { getProvider } from '@/lib/providers'
25
29
  import type { Connector, MessageSource, Chatroom, ChatroomMessage } from '@/types'
26
30
  import type { ConnectorInstance, InboundMessage, InboundMedia } from './types'
27
31
  import {
@@ -35,6 +39,7 @@ import {
35
39
  parsePairingPolicy,
36
40
  type PairingPolicy,
37
41
  } from './pairing'
42
+ import { enrichInboundMessageWithAudioTranscript } from './inbound-audio-transcription'
38
43
 
39
44
  function resolveUploadPathFromUrl(rawUrl: string): string | null {
40
45
  if (!rawUrl) return null
@@ -657,10 +662,27 @@ async function routeMessageToChatroom(connector: Connector, msg: InboundMessage)
657
662
  if (!chatroom) return '[Error] Chatroom not found.'
658
663
 
659
664
  const agents = loadAgents()
665
+ const preferredCredentialId = (() => {
666
+ if (connector.agentId && agents[connector.agentId]?.credentialId) {
667
+ return agents[connector.agentId].credentialId as string
668
+ }
669
+ for (const agentId of chatroom.agentIds) {
670
+ const credentialId = agents[agentId]?.credentialId
671
+ if (credentialId) return credentialId as string
672
+ }
673
+ return null
674
+ })()
675
+ msg = await enrichInboundMessageWithAudioTranscript({
676
+ msg,
677
+ preferredCredentialId,
678
+ })
679
+
660
680
  const source: MessageSource = {
661
681
  platform: connector.platform,
662
682
  connectorId: connector.id,
663
683
  connectorName: connector.name,
684
+ channelId: msg.channelId,
685
+ senderId: msg.senderId,
664
686
  senderName: msg.senderName,
665
687
  }
666
688
  const inboundText = formatInboundUserText(msg)
@@ -673,6 +695,8 @@ async function routeMessageToChatroom(connector: Connector, msg: InboundMessage)
673
695
  if (chatroom.autoAddress && mentions.length === 0) {
674
696
  mentions = [...chatroom.agentIds]
675
697
  }
698
+ const mentionHealth = filterHealthyChatroomAgents(mentions, agents)
699
+ mentions = mentionHealth.healthyAgentIds
676
700
 
677
701
  // Create and persist the user message in the chatroom
678
702
  const userMessage: ChatroomMessage = {
@@ -689,12 +713,23 @@ async function routeMessageToChatroom(connector: Connector, msg: InboundMessage)
689
713
  source,
690
714
  }
691
715
  chatroom.messages.push(userMessage)
716
+ compactChatroomMessages(chatroom)
692
717
  chatroom.updatedAt = Date.now()
693
718
  chatrooms[chatroomId] = chatroom
694
719
  saveChatrooms(chatrooms)
695
720
  notify('chatrooms')
696
721
  notify(`chatroom:${chatroomId}`)
697
722
 
723
+ if (mentions.length === 0) {
724
+ if (mentionHealth.skipped.length > 0) {
725
+ const skippedSummary = mentionHealth.skipped
726
+ .map((row) => `${agents[row.agentId]?.name || row.agentId}: ${row.reason}`)
727
+ .join(', ')
728
+ return `[Error] No healthy agents were available for this request. Skipped: ${skippedSummary}`
729
+ }
730
+ return '[Error] No agents were selected for this request.'
731
+ }
732
+
698
733
  // Process mentioned agents sequentially and collect responses
699
734
  const responses: string[] = []
700
735
  for (const agentId of mentions) {
@@ -704,6 +739,23 @@ async function routeMessageToChatroom(connector: Connector, msg: InboundMessage)
704
739
  const apiKey = resolveApiKeyHelper(agent.credentialId)
705
740
  const freshChatrooms = loadChatrooms()
706
741
  const freshChatroom = freshChatrooms[chatroomId] as Chatroom
742
+ if (compactChatroomMessages(freshChatroom)) {
743
+ freshChatrooms[chatroomId] = freshChatroom
744
+ saveChatrooms(freshChatrooms)
745
+ notify(`chatroom:${chatroomId}`)
746
+ }
747
+
748
+ const providerInfo = getProvider(agent.provider)
749
+ if (providerInfo?.requiresApiKey && !apiKey) {
750
+ markProviderFailure(agent.provider, 'missing_api_credentials')
751
+ responses.push(`[${agent.name}] [Error] Missing API credentials.`)
752
+ continue
753
+ }
754
+ if (providerInfo?.requiresEndpoint && !agent.apiEndpoint) {
755
+ markProviderFailure(agent.provider, 'missing_api_endpoint')
756
+ responses.push(`[${agent.name}] [Error] Missing endpoint configuration.`)
757
+ continue
758
+ }
707
759
 
708
760
  const syntheticSession = buildSyntheticSession(agent, chatroomId)
709
761
  const agentSystemPrompt = buildAgentSystemPromptForChatroom(agent)
@@ -730,6 +782,7 @@ async function routeMessageToChatroom(connector: Connector, msg: InboundMessage)
730
782
  platform: connector.platform,
731
783
  connectorId: connector.id,
732
784
  connectorName: connector.name,
785
+ channelId: msg.channelId,
733
786
  }
734
787
  const agentMessage: ChatroomMessage = {
735
788
  id: genId(),
@@ -737,7 +790,10 @@ async function routeMessageToChatroom(connector: Connector, msg: InboundMessage)
737
790
  senderName: agent.name,
738
791
  role: 'assistant',
739
792
  text: responseText,
740
- mentions: parseMentions(responseText, agents, freshChatroom.agentIds),
793
+ mentions: filterHealthyChatroomAgents(
794
+ parseMentions(responseText, agents, freshChatroom.agentIds),
795
+ agents,
796
+ ).healthyAgentIds,
741
797
  reactions: [],
742
798
  time: Date.now(),
743
799
  source: agentSource,
@@ -750,10 +806,14 @@ async function routeMessageToChatroom(connector: Connector, msg: InboundMessage)
750
806
  saveChatrooms(latestChatrooms)
751
807
  notify(`chatroom:${chatroomId}`)
752
808
 
809
+ markProviderSuccess(agent.provider)
753
810
  responses.push(`[${agent.name}] ${responseText}`)
811
+ } else {
812
+ markProviderSuccess(agent.provider)
754
813
  }
755
814
  } catch (err: unknown) {
756
815
  const errMsg = err instanceof Error ? err.message : String(err)
816
+ markProviderFailure(agent.provider, errMsg)
757
817
  console.error(`[connector] Chatroom agent ${agent.name} error:`, errMsg)
758
818
  }
759
819
  }
@@ -798,6 +858,10 @@ async function routeMessage(connector: Connector, msg: InboundMessage): Promise<
798
858
  if (!effectiveAgentId) return '[Error] Connector has no agent configured.'
799
859
  const agent = agents[effectiveAgentId]
800
860
  if (!agent) return '[Error] Connector agent not found.'
861
+ msg = await enrichInboundMessageWithAudioTranscript({
862
+ msg,
863
+ preferredCredentialId: agent.credentialId || null,
864
+ })
801
865
 
802
866
  // Enqueue system event + heartbeat wake for the agent
803
867
  const preview = (msg.text || '').slice(0, 80)
@@ -931,9 +995,14 @@ async function routeMessage(connector: Connector, msg: InboundMessage): Promise<
931
995
  return commandResult
932
996
  }
933
997
 
934
- // Build system prompt: [userPrompt] \n\n [soul] \n\n [systemPrompt]
998
+ // Build system prompt: [identity] \n\n [userPrompt] \n\n [soul] \n\n [systemPrompt]
935
999
  const settings = loadSettings()
936
1000
  const promptParts: string[] = []
1001
+ // Identity block — agent needs to know who it is
1002
+ const identityLines = [`## My Identity`, `My name is ${agent.name}.`]
1003
+ if (agent.description) identityLines.push(agent.description)
1004
+ identityLines.push('I should always refer to myself by this name. I am not "Assistant" — I have my own name and identity.')
1005
+ promptParts.push(identityLines.join(' '))
937
1006
  if (settings.userPrompt) promptParts.push(settings.userPrompt)
938
1007
  promptParts.push(buildCurrentDateTimePromptContext())
939
1008
  if (agent.soul) promptParts.push(agent.soul)
@@ -960,6 +1029,11 @@ Do not end every reply with a question.
960
1029
  Only ask a question when a specific missing detail blocks progress.
961
1030
  When a task is complete, state the result plainly and stop.
962
1031
 
1032
+ ## Async Update Routing
1033
+ When you start work that may finish later (task, schedule, delegated run), tell the user where updates will be sent.
1034
+ Default to this same ${msg.platform} chat unless the user requested another destination.
1035
+ If channel preference is ambiguous and there are multiple reasonable destinations, ask one short routing question.
1036
+
963
1037
  ## Knowing When Not to Reply
964
1038
  Real conversations have natural pauses — not every message needs a response. Reply with exactly "NO_MESSAGE" (nothing else) to stay silent when replying would feel unnatural or forced.
965
1039
  Stay silent for simple acknowledgments ("okay", "alright", "cool", "got it", "sounds good"), conversation closers ("thanks", "bye", "night", "ttyl"), reactions (emoji, "haha", "lol"), and forwarded content with no question attached.
@@ -987,6 +1061,8 @@ If media sending fails, report the exact error and retry with a corrected path/t
987
1061
  platform: connector.platform,
988
1062
  connectorId: connector.id,
989
1063
  connectorName: connector.name,
1064
+ channelId: msg.channelId,
1065
+ senderId: msg.senderId,
990
1066
  senderName: msg.senderName,
991
1067
  }
992
1068
  session.messages.push({
@@ -1002,6 +1078,7 @@ If media sending fails, report the exact error and retry with a corrected path/t
1002
1078
  const s1 = loadSessions()
1003
1079
  s1[session.id] = session
1004
1080
  saveSessions(s1)
1081
+ notify(`messages:${session.id}`)
1005
1082
 
1006
1083
  // Stream the response
1007
1084
  let fullText = ''
@@ -1109,6 +1186,7 @@ If media sending fails, report the exact error and retry with a corrected path/t
1109
1186
  platform: connector.platform,
1110
1187
  connectorId: connector.id,
1111
1188
  connectorName: connector.name,
1189
+ channelId: msg.channelId,
1112
1190
  }
1113
1191
  if (fullText.trim()) {
1114
1192
  session.messages.push({ role: 'assistant', text: fullText.trim(), time: Date.now(), source: assistantSource })
@@ -0,0 +1,29 @@
1
+ import { describe, it } from 'node:test'
2
+ import assert from 'node:assert/strict'
3
+ import { formatTextForWhatsApp } from './whatsapp-text'
4
+
5
describe('formatTextForWhatsApp', () => {
  // Each case becomes its own test: markdown input and the exact expected output.
  const cases: Array<{ title: string; input: string; expected: string }> = [
    {
      title: 'converts markdown links to readable whatsapp text',
      input: 'See [Google](https://google.com) and [https://x.com](https://x.com)',
      expected: 'See Google: https://google.com and https://x.com',
    },
    {
      title: 'converts common markdown emphasis syntax',
      input: '**Bold** __Italic__ ~~Strike~~',
      expected: 'Bold Italic Strike',
    },
    {
      title: 'removes headings and preserves body text',
      input: '# Title\n\n## Subtitle\nBody line',
      expected: 'Title\n\nSubtitle\nBody line',
    },
    {
      title: 'converts code fences to plain text content',
      input: '```ts\nconst x = 1\n```\n\nDone.',
      expected: 'const x = 1\n\nDone.',
    },
  ]

  for (const { title, input, expected } of cases) {
    it(title, () => {
      assert.equal(formatTextForWhatsApp(input), expected)
    })
  }
})