crawd 0.8.1 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/types.d.ts CHANGED
@@ -31,6 +31,8 @@ type ChatMessage = {
31
31
  type TtsProvider = 'openai' | 'elevenlabs' | 'tiktok';
32
32
  /** Turn-based reply: chat message + bot response, each with TTS audio */
33
33
  type ReplyTurnEvent = {
34
+ /** Correlation ID — overlay sends talk:done with this ID when both audios finish */
35
+ id: string;
34
36
  chat: {
35
37
  username: string;
36
38
  message: string;
@@ -38,6 +40,10 @@ type ReplyTurnEvent = {
38
40
  botMessage: string;
39
41
  chatTtsUrl: string;
40
42
  botTtsUrl: string;
43
+ /** TTS provider used for the chat audio */
44
+ chatTtsProvider?: TtsProvider;
45
+ /** TTS provider used for the bot audio */
46
+ botTtsProvider?: TtsProvider;
41
47
  };
42
48
  /** Bot speech bubble with pre-generated TTS (atomic event) */
43
49
  type TalkEvent = {
@@ -47,6 +53,8 @@ type TalkEvent = {
47
53
  message: string;
48
54
  /** Bot TTS audio URL */
49
55
  ttsUrl: string;
56
+ /** TTS provider used for the bot audio */
57
+ ttsProvider?: TtsProvider;
50
58
  /** Optional: chat message being replied to (overlay plays this first) */
51
59
  chat?: {
52
60
  message: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "crawd",
3
- "version": "0.8.1",
3
+ "version": "0.8.3",
4
4
  "description": "CLI for crawd.bot - AI agent livestreaming platform",
5
5
  "type": "module",
6
6
  "types": "./dist/types.d.ts",
@@ -1,6 +1,6 @@
1
1
  ---
2
2
  name: crawd
3
- description: AI agent livestreaming with TTS, chat interaction, and OBS overlay. Handles [CHAT] messages from viewers and [VIBE] autonomous prompts. Always load this skill when you receive [CHAT] or [VIBE] messages.
3
+ description: AI agent livestreaming with TTS, chat interaction, and OBS overlay. Handles [CRAWD:*] coordinator messages. Always load this skill when you receive [CRAWD:CHAT], [CRAWD:VIBE], or [CRAWD:MISALIGNED] messages.
4
4
  metadata: {"emoji": "🟠", "requires.env": ["OPENCLAW_GATEWAY_TOKEN"], "requires.config": ["plugins.entries.crawd.enabled"]}
5
5
  ---
6
6
 
@@ -43,16 +43,24 @@ livestream_talk text="your message"
43
43
 
44
44
  ### Rules
45
45
  - Keep messages short (1-2 sentences). Long messages are bad for stream pacing.
46
- - When you receive `[CHAT]` messages, use `livestream_reply` (not `livestream_talk`) to respond.
46
+ - When you receive `[CRAWD:CHAT]` messages, use `livestream_reply` (not `livestream_talk`) to respond.
47
47
  - **Be vocal.** Silence is dead air. If you're browsing, comment on what you see. If you're scrolling a timeline, react to posts. Quote things, make jokes, share opinions. Viewers came for your commentary, not to watch you silently click through pages.
48
48
  - **React to content on screen.** When you see a tweet, article, or something interesting — talk about it. "look at this, YC president is posting about crypto, someone is about to get rugged" is way better than clicking past it in silence.
49
49
  - **Narrate your thoughts, not your actions.** Viewers can see what you're doing — tell them what you're *thinking*. Don't say "opening twitter", say "lets see what twitter is mad about today".
50
50
 
51
+ ## Response Protocol
52
+
53
+ **After every turn, your text response MUST be one of:**
54
+ - `LIVESTREAM_REPLIED` — You used `livestream_reply` or `livestream_talk` to speak.
55
+ - `NO_REPLY` — You have nothing to say right now.
56
+
57
+ **Do not write anything else as text.** Any other plaintext is a protocol violation. The coordinator monitors your text output — non-compliant responses trigger a `[CRAWD:MISALIGNED]` correction. Repeated violations waste stream time on corrections instead of content.
58
+
51
59
  ## Chat Messages
52
60
 
53
- Chat arrives in batches:
61
+ Chat arrives as `[CRAWD:CHAT]` batches:
54
62
  ```
55
- [CHAT - 3 messages, 12s]
63
+ [CRAWD:CHAT - 3 messages, 12s]
56
64
  [abc123] user1: hey what's up
57
65
  [def456] user2: play some music
58
66
  [ghi789] user3: lmao
@@ -68,7 +76,17 @@ The coordinator manages your activity cycle through three states:
68
76
  - **Idle** — No activity for a while. You still receive vibe prompts but you're winding down.
69
77
  - **Sleep** — Extended inactivity. You stop receiving prompts. Your session context is compacted to free stale history. A new chat message wakes you up — take a screenshot first to reorient yourself.
70
78
 
71
- When you receive a `[VIBE]` prompt, the coordinator is nudging you to do something autonomously. This is your self-directed time — browse, check socials, do something interesting for viewers. You don't have to speak every vibe, but you should be doing *something* visible.
79
+ When you receive a `[CRAWD:VIBE]` prompt, the coordinator is nudging you to do something autonomously. This is your self-directed time — browse, check socials, do something interesting for viewers. You don't have to speak every vibe, but you should be doing *something* visible.
80
+
81
+ ## Coordinator Signals
82
+
83
+ All coordinator messages use the `[CRAWD:*]` prefix:
84
+
85
+ | Signal | Meaning |
86
+ |--------|---------|
87
+ | `[CRAWD:CHAT]` | Batch of viewer chat messages. Reply with `livestream_reply`. |
88
+ | `[CRAWD:VIBE]` | Autonomous activity nudge. Do something visible on stream. |
89
+ | `[CRAWD:MISALIGNED]` | Your previous response violated the protocol. You replied with plaintext instead of using a tool. Fix your behavior — use `livestream_reply` or `livestream_talk`, then respond with `LIVESTREAM_REPLIED`. |
72
90
 
73
91
  ## Safety (non-negotiable)
74
92
 
@@ -24,7 +24,7 @@ export const DEFAULT_CONFIG: CoordinatorConfig = {
24
24
  vibeIntervalMs: 30_000,
25
25
  idleAfterMs: 180_000,
26
26
  sleepAfterIdleMs: 180_000,
27
- vibePrompt: `[VIBE] You are on a livestream. Make sure the crawd skill is loaded. Do one thing on the internet or ask the chat something.`,
27
+ vibePrompt: `[CRAWD:VIBE] You are on a livestream. Make sure the crawd skill is loaded. Do one thing on the internet or ask the chat something. Respond with LIVESTREAM_REPLIED after using a tool, or NO_REPLY if you have nothing to say.`,
28
28
  }
29
29
 
30
30
  export type CoordinatorState = 'sleep' | 'idle' | 'active'
@@ -760,13 +760,18 @@ export class Coordinator {
760
760
  // Chain on the gateway queue to prevent concurrent triggerAgent() calls
761
761
  this._busy = true
762
762
  let noReply = false
763
+ let misaligned: string[] = []
763
764
  const vibeOp = this._gatewayQueue.then(async () => {
764
765
  this._busy = true
765
766
  try {
766
767
  const replies = await this.triggerFn(this.config.vibePrompt)
767
- // Agent sends NO_REPLY when it has nothing to do — go to sleep
768
- if (replies.some(r => r.trim().toUpperCase().includes('NO_REPLY'))) {
768
+ if (replies.some(r => r.trim().toUpperCase() === 'NO_REPLY')) {
769
769
  noReply = true
770
+ } else if (!this.isCompliantReply(replies)) {
771
+ misaligned = replies.filter(r => {
772
+ const t = r.trim().toUpperCase()
773
+ return t !== 'NO_REPLY' && t !== 'LIVESTREAM_REPLIED'
774
+ })
770
775
  }
771
776
  } catch (err) {
772
777
  this.logger.error('[Coordinator] Vibe failed:', err)
@@ -786,6 +791,10 @@ export class Coordinator {
786
791
  return
787
792
  }
788
793
 
794
+ if (misaligned.length > 0) {
795
+ this._gatewayQueue = this._gatewayQueue.then(() => this.sendMisalignment(misaligned)).catch(() => {})
796
+ }
797
+
789
798
  // Schedule next vibe
790
799
  this.scheduleNextVibe()
791
800
  }
@@ -840,6 +849,31 @@ export class Coordinator {
840
849
  /** Whether the coordinator is busy processing a flush or talk */
841
850
  get busy(): boolean { return this._busy }
842
851
 
852
+ /** Check if agent replies are compliant (NO_REPLY or LIVESTREAM_REPLIED) */
853
+ private isCompliantReply(replies: string[]): boolean {
854
+ if (replies.length === 0) return true
855
+ return replies.every(r => {
856
+ const t = r.trim().toUpperCase()
857
+ return t === 'NO_REPLY' || t === 'LIVESTREAM_REPLIED'
858
+ })
859
+ }
860
+
861
+ /** Send misalignment correction when agent responds with plaintext */
862
+ private async sendMisalignment(badReplies: string[]): Promise<void> {
863
+ const leaked = badReplies.map(r => `"${r.slice(0, 80)}"`).join(', ')
864
+ this.logger.warn(`[Coordinator] MISALIGNED — agent sent plaintext: ${leaked}`)
865
+ try {
866
+ await this.triggerFn(
867
+ `[CRAWD:MISALIGNED] Your previous response was plaintext: ${leaked}. ` +
868
+ `Plaintext is NEVER visible to viewers. You MUST use livestream_reply or livestream_talk tool calls to speak. ` +
869
+ `After using a tool, respond with LIVESTREAM_REPLIED. If you have nothing to say, respond with NO_REPLY. ` +
870
+ `Do not respond with any other text.`
871
+ )
872
+ } catch (err) {
873
+ this.logger.error('[Coordinator] Misalignment correction failed:', err)
874
+ }
875
+ }
876
+
843
877
  private flush(): void {
844
878
  if (this.buffer.length === 0) return
845
879
 
@@ -854,7 +888,13 @@ export class Coordinator {
854
888
  this._gatewayQueue = this._gatewayQueue.then(async () => {
855
889
  this._busy = true
856
890
  try {
857
- await this.triggerFn(batchText)
891
+ const replies = await this.triggerFn(batchText)
892
+ if (!this.isCompliantReply(replies)) {
893
+ await this.sendMisalignment(replies.filter(r => {
894
+ const t = r.trim().toUpperCase()
895
+ return t !== 'NO_REPLY' && t !== 'LIVESTREAM_REPLIED'
896
+ }))
897
+ }
858
898
  } catch (err) {
859
899
  this.logger.error('[Coordinator] Failed to trigger agent:', err)
860
900
  } finally {
@@ -868,7 +908,7 @@ export class Coordinator {
868
908
  ? Math.round((this.clock.now() - (messages[0].timestamp ?? this.clock.now())) / 1000)
869
909
  : 0
870
910
 
871
- const header = `[CHAT - ${messages.length} message${messages.length === 1 ? '' : 's'}${duration > 0 ? `, ${duration}s` : ''}]`
911
+ const header = `[CRAWD:CHAT - ${messages.length} message${messages.length === 1 ? '' : 's'}${duration > 0 ? `, ${duration}s` : ''}]`
872
912
  const lines = messages.map(m => {
873
913
  const platform = m.platform && m.platform !== 'pumpfun' ? `[${m.platform.toUpperCase()}] ` : ''
874
914
  return `[${m.shortId}] ${platform}${m.username}: ${m.message}`
@@ -551,6 +551,7 @@ async function main() {
551
551
  ]);
552
552
 
553
553
  const event: ReplyTurnEvent = {
554
+ id: randomUUID(),
554
555
  chat: { username, message },
555
556
  botMessage: response,
556
557
  chatTtsUrl,
@@ -96,6 +96,10 @@ export class CrawdBackend {
96
96
  private latestMcap: number | null = null
97
97
  private mcapInterval: NodeJS.Timeout | null = null
98
98
 
99
+ /** Pending overlay acks — resolves when overlay finishes playing audio for a given event ID */
100
+ private pendingAcks = new Map<string, { resolve: () => void; timer: ReturnType<typeof setTimeout> }>()
101
+ private static readonly ACK_TIMEOUT_MS = 60_000
102
+
99
103
  constructor(config: CrawdConfig, logger?: CrawdLogger) {
100
104
  this.config = config
101
105
  this.logger = logger ?? defaultLogger
@@ -169,7 +173,7 @@ export class CrawdBackend {
169
173
  // Public API (used by plugin tool handlers)
170
174
  // =========================================================================
171
175
 
172
- /** Speak on the livestream — emits overlay event + TTS. */
176
+ /** Speak on the livestream — emits overlay event + TTS. Blocks until overlay finishes playing. */
173
177
  async handleTalk(text: string): Promise<{ spoken: boolean }> {
174
178
  if (!text || typeof text !== 'string') {
175
179
  return { spoken: false }
@@ -179,20 +183,22 @@ export class CrawdBackend {
179
183
 
180
184
  const id = randomUUID()
181
185
  try {
182
- const ttsUrl = await this.generateTTSWithFallback(text, this.config.tts.bot)
183
- this.logger.info(`TTS generated: ${ttsUrl}`)
184
- this.io.emit('crawd:talk', { id, message: text, ttsUrl })
186
+ const tts = await this.generateTTSWithFallback(text, this.config.tts.bot)
187
+ this.logger.info(`TTS generated: ${tts.url}`)
188
+ this.io.emit('crawd:talk', { id, message: text, ttsUrl: tts.url, ttsProvider: tts.provider })
185
189
  } catch (e) {
186
190
  this.logger.error('Failed to generate TTS, emitting without audio', e)
187
191
  this.io.emit('crawd:talk', { id, message: text, ttsUrl: '' })
188
192
  }
189
193
 
194
+ await this.waitForAck(id)
190
195
  return { spoken: true }
191
196
  }
192
197
 
193
198
  /**
194
199
  * Reply to a chat message — reads original aloud (chat voice),
195
200
  * then speaks bot reply (bot voice). Emits `crawd:reply-turn`.
201
+ * Blocks until overlay finishes playing both audios.
196
202
  */
197
203
  async handleReply(
198
204
  text: string,
@@ -204,25 +210,32 @@ export class CrawdBackend {
204
210
 
205
211
  this.coordinator?.notifySpeech()
206
212
 
213
+ const id = randomUUID()
207
214
  try {
208
- const [chatTtsUrl, botTtsUrl] = await Promise.all([
215
+ const [chatTts, botTts] = await Promise.all([
209
216
  this.generateTTSWithFallback(`Chat says: ${chat.message}`, this.config.tts.chat),
210
217
  this.generateTTSWithFallback(text, this.config.tts.bot),
211
218
  ])
212
219
  this.io.emit('crawd:reply-turn', {
220
+ id,
213
221
  chat: { username: chat.username, message: chat.message },
214
222
  botMessage: text,
215
- chatTtsUrl,
216
- botTtsUrl,
223
+ chatTtsUrl: chatTts.url,
224
+ botTtsUrl: botTts.url,
225
+ chatTtsProvider: chatTts.provider,
226
+ botTtsProvider: botTts.provider,
217
227
  })
218
228
  } catch (e) {
219
229
  this.logger.error('Failed to generate reply-turn TTS, falling back to talk', e)
220
- const id = randomUUID()
221
- this.generateTTSWithFallback(text, this.config.tts.bot)
222
- .then((ttsUrl) => this.io.emit('crawd:talk', { id, message: text, ttsUrl }))
223
- .catch(() => this.io.emit('crawd:talk', { id, message: text, ttsUrl: '' }))
230
+ try {
231
+ const tts = await this.generateTTSWithFallback(text, this.config.tts.bot)
232
+ this.io.emit('crawd:talk', { id, message: text, ttsUrl: tts.url, ttsProvider: tts.provider })
233
+ } catch {
234
+ this.io.emit('crawd:talk', { id, message: text, ttsUrl: '' })
235
+ }
224
236
  }
225
237
 
238
+ await this.waitForAck(id)
226
239
  return { spoken: true }
227
240
  }
228
241
 
@@ -230,23 +243,50 @@ export class CrawdBackend {
230
243
  return this.io
231
244
  }
232
245
 
246
+ /** Wait for overlay to ack that audio finished playing. Resolves on timeout as fallback. */
247
+ private waitForAck(id: string): Promise<void> {
248
+ return new Promise((resolve) => {
249
+ const timer = setTimeout(() => {
250
+ this.pendingAcks.delete(id)
251
+ this.logger.warn(`Talk ack timed out (${id}), resolving anyway`)
252
+ resolve()
253
+ }, CrawdBackend.ACK_TIMEOUT_MS)
254
+ this.pendingAcks.set(id, { resolve, timer })
255
+ })
256
+ }
257
+
258
+ /** Resolve a pending ack (called when overlay sends crawd:talk:done) */
259
+ private resolveAck(id: string): void {
260
+ const pending = this.pendingAcks.get(id)
261
+ if (pending) {
262
+ clearTimeout(pending.timer)
263
+ this.pendingAcks.delete(id)
264
+ pending.resolve()
265
+ }
266
+ }
267
+
233
268
  // =========================================================================
234
269
  // TTS (with ordered fallback chain)
235
270
  // =========================================================================
236
271
 
237
- async generateTTSWithFallback(text: string, chain: TtsVoiceEntry[]): Promise<string> {
272
+ async generateTTSWithFallback(text: string, chain: TtsVoiceEntry[]): Promise<{ url: string; provider: TtsVoiceEntry['provider'] }> {
238
273
  let lastError: Error | null = null
239
274
 
240
275
  for (const entry of chain) {
241
276
  try {
277
+ let url: string
242
278
  switch (entry.provider) {
243
279
  case 'elevenlabs':
244
- return await this.generateElevenLabsTTS(text, entry.voice)
280
+ url = await this.generateElevenLabsTTS(text, entry.voice)
281
+ break
245
282
  case 'openai':
246
- return await this.generateOpenAITTS(text, entry.voice)
283
+ url = await this.generateOpenAITTS(text, entry.voice)
284
+ break
247
285
  case 'tiktok':
248
- return await this.generateTikTokTTSFile(text, entry.voice)
286
+ url = await this.generateTikTokTTSFile(text, entry.voice)
287
+ break
249
288
  }
289
+ return { url, provider: entry.provider }
250
290
  } catch (e) {
251
291
  lastError = e instanceof Error ? e : new Error(String(e))
252
292
  this.logger.warn(`TTS ${entry.provider}/${entry.voice} failed: ${lastError.message}, trying next...`)
@@ -392,6 +432,13 @@ export class CrawdBackend {
392
432
  socket.emit('crawd:mcap', { mcap: this.latestMcap })
393
433
  }
394
434
 
435
+ socket.on('crawd:talk:done', (data: { id?: string }) => {
436
+ if (data?.id) {
437
+ this.logger.info(`Talk ack received: ${data.id}`)
438
+ this.resolveAck(data.id)
439
+ }
440
+ })
441
+
395
442
  socket.on('crawd:mock-chat', (data: { username?: string; message?: string }) => {
396
443
  const { username, message } = data
397
444
  if (!username || !message) return
@@ -491,15 +538,18 @@ export class CrawdBackend {
491
538
  }
492
539
 
493
540
  try {
494
- const [chatTtsUrl, botTtsUrl] = await Promise.all([
541
+ const [chatTts, botTts] = await Promise.all([
495
542
  this.generateTTSWithFallback(`Chat says: ${message}`, this.config.tts.chat),
496
543
  this.generateTTSWithFallback(response, this.config.tts.bot),
497
544
  ])
498
545
  this.io.emit('crawd:reply-turn', {
546
+ id: randomUUID(),
499
547
  chat: { username, message },
500
548
  botMessage: response,
501
- chatTtsUrl,
502
- botTtsUrl,
549
+ chatTtsUrl: chatTts.url,
550
+ botTtsUrl: botTts.url,
551
+ chatTtsProvider: chatTts.provider,
552
+ botTtsProvider: botTts.provider,
503
553
  })
504
554
  return { ok: true }
505
555
  } catch (e) {
package/src/types.ts CHANGED
@@ -20,10 +20,16 @@ export type TtsProvider = 'openai' | 'elevenlabs' | 'tiktok'
20
20
 
21
21
  /** Turn-based reply: chat message + bot response, each with TTS audio */
22
22
  export type ReplyTurnEvent = {
23
+ /** Correlation ID — overlay sends talk:done with this ID when both audios finish */
24
+ id: string
23
25
  chat: { username: string; message: string }
24
26
  botMessage: string
25
27
  chatTtsUrl: string
26
28
  botTtsUrl: string
29
+ /** TTS provider used for the chat audio */
30
+ chatTtsProvider?: TtsProvider
31
+ /** TTS provider used for the bot audio */
32
+ botTtsProvider?: TtsProvider
27
33
  }
28
34
 
29
35
  /** Bot speech bubble with pre-generated TTS (atomic event) */
@@ -34,6 +40,8 @@ export type TalkEvent = {
34
40
  message: string
35
41
  /** Bot TTS audio URL */
36
42
  ttsUrl: string
43
+ /** TTS provider used for the bot audio */
44
+ ttsProvider?: TtsProvider
37
45
  /** Optional: chat message being replied to (overlay plays this first) */
38
46
  chat?: {
39
47
  message: string