oomi-ai 0.2.16 → 0.2.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -133,7 +133,7 @@ That bridge:
133
133
  This is the part of the package most likely to matter when debugging voice turn failures.
134
134
 
135
135
  For managed voice replies, the extension also preserves an explicit hidden `metadata.spoken` sidecar when upstream provides one.
136
- If upstream does not provide one, the extension now synthesizes a conservative hidden fallback from the visible assistant text so backend TTS can speak a cleaner version without changing user-visible chat.
136
+ If upstream does not provide one, the extension now synthesizes a bounded hidden fallback from the visible assistant text so backend TTS can speak a cleaner and more varied version without changing user-visible chat.
137
137
 
138
138
  ## Bridge Health States
139
139
 
@@ -164,13 +164,15 @@ Rules:
164
164
  - visible `content` remains the source of truth for Oomi chat rendering
165
165
  - for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
166
166
  - `metadata.spoken.text` is for backend TTS only
167
+ - `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
168
+ - `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, energy, volume, and pause timing
167
169
  - `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
168
170
  - `metadata.spoken.style` is optional metadata for debugging/future mapping
169
171
  - if no hidden speech sidecar exists, Oomi falls back to speaking the visible assistant text
170
172
 
171
173
  Current plugin behavior:
172
174
  - if you provide `metadata.spoken`, the plugin preserves it unchanged
173
- - if you do not provide `metadata.spoken`, the plugin now synthesizes a conservative hidden fallback from visible assistant text for backend TTS
175
+ - if you do not provide `metadata.spoken`, the plugin now synthesizes a bounded hidden fallback from visible assistant text for backend TTS
174
176
  - visible chat text is still never rewritten by the plugin
175
177
 
176
178
  ## Avatar Commands
@@ -178,6 +178,154 @@ function extractCorrelationId(payload) {
178
178
  return '';
179
179
  }
180
180
 
181
// Language labels accepted for `metadata.spoken.language`. The README refers
// to these as the supported Qwen language values.
const BOUNDED_LANGUAGE_TYPES = new Set([
  'Auto', 'Chinese', 'English', 'German', 'Italian', 'Portuguese',
  'Spanish', 'Japanese', 'Korean', 'French', 'Russian',
]);

// Allowed per-segment prosody values. Anything outside these sets is dropped
// when a spoken segment is normalized.
const BOUNDED_PACE_VALUES = new Set([
  'very_slow',
  'slow',
  'medium',
  'medium_fast',
  'fast',
]);
const BOUNDED_PITCH_VALUES = new Set([
  'low',
  'slightly_low',
  'neutral',
  'slightly_high',
  'high',
]);
const BOUNDED_ENERGY_VALUES = new Set([
  'soft',
  'calm',
  'warm',
  'bright',
  'intense',
]);
const BOUNDED_VOLUME_VALUES = new Set([
  'soft',
  'normal',
  'projected',
]);
199
+
200
/**
 * Returns the language label attached to synthesized spoken metadata.
 *
 * No language detection is implemented: every input, including empty or
 * non-string input, yields 'English'. The previous body normalized the text
 * and branched on it, but both branches returned the same value, so that
 * dead code has been removed. The parameter stays so call sites do not need
 * to change if detection is added later.
 *
 * @param {unknown} text - Text a future implementation would inspect; currently ignored.
 * @returns {string} Always 'English'.
 */
function inferSpokenLanguage(text) {
  return 'English';
}
205
+
206
/**
 * Validates one caller-supplied speech segment and returns a cleaned copy.
 *
 * The result always has `text` and `pause_after_ms`. `pace`, `pitch`,
 * `energy`, and `volume` are copied only when their value is a member of the
 * matching BOUNDED_* set; any other value is silently dropped.
 *
 * @param {unknown} segment - Candidate segment object.
 * @returns {object|null} The cleaned segment, or null when `segment` is not a
 *   non-array object or its `text` is empty after `toString`.
 */
function normalizeSpokenSegment(segment) {
  const isRecord = Boolean(segment) && typeof segment === 'object' && !Array.isArray(segment);
  if (!isRecord) return null;

  const spokenText = toString(segment.text);
  if (!spokenText) return null;

  const result = { text: spokenText };

  // Field name paired with the set of values it is allowed to take.
  const boundedFields = [
    ['pace', BOUNDED_PACE_VALUES],
    ['pitch', BOUNDED_PITCH_VALUES],
    ['energy', BOUNDED_ENERGY_VALUES],
    ['volume', BOUNDED_VOLUME_VALUES],
  ];
  const candidates = boundedFields.map(([field, allowed]) => [field, allowed, toString(segment[field])]);

  // NOTE(review): presumably toNumber falls back to 0 and clamps to 0..1200 ms;
  // confirm against the helper's definition.
  const pauseAfterMs = toNumber(segment.pause_after_ms, 0, { min: 0, max: 1200 });

  for (const [field, allowed, value] of candidates) {
    if (allowed.has(value)) {
      result[field] = value;
    }
  }
  result.pause_after_ms = pauseAfterMs;

  return result;
}
227
+
228
/**
 * Breaks speech text into at most five segments.
 *
 * The text is first passed through `normalizeSpeechText`, then split on
 * whitespace that follows `.`, `!`, or `?`. Any sentence longer than 96
 * characters that contains ", " is split again into clauses. Each clause gets
 * its comma back unless it is the last clause or already ends in sentence
 * punctuation. When more than five segments result, the first four are kept
 * and the rest are joined with spaces into a fifth.
 *
 * @param {unknown} text - Raw speech text.
 * @returns {string[]} Segments in original order; empty when the normalized text is empty.
 */
function splitSpeechSegments(text) {
  const cleaned = normalizeSpeechText(text);
  if (!cleaned) return [];

  const sentences = cleaned
    .split(/(?<=[.!?])\s+/)
    .map((sentence) => sentence.trim())
    .filter(Boolean);

  const pieces = sentences.flatMap((sentence) => {
    if (sentence.length <= 96) return [sentence];

    const clauses = sentence
      .split(/,\s+/)
      .map((clause) => clause.trim())
      .filter(Boolean);

    // A long sentence without usable comma breaks is kept whole.
    if (clauses.length <= 1) return [sentence];

    const lastPosition = clauses.length - 1;
    return clauses.map((clause, position) => {
      const restoreComma = position < lastPosition && !/[.!?]$/.test(clause);
      return restoreComma ? `${clause},` : clause;
    });
  });

  if (pieces.length <= 5) return pieces;

  const head = pieces.slice(0, 4);
  const mergedTail = pieces.slice(4).join(' ').trim();
  return [...head, mergedTail];
}
265
+
266
/**
 * Chooses prosody hints for one synthesized speech segment.
 *
 * Categories are checked in priority order:
 *   1. curious     - the text contains `?`
 *   2. exclamatory - the text contains `!` or an enthusiasm keyword
 *   3. reflective  - the text contains a first-person/reflective phrase or is
 *                    longer than 60 characters
 *   4. default
 *
 * Every category except "curious" adds a pause after the segment when another
 * segment follows it; "curious" always uses a pause of 0.
 *
 * @param {unknown} segmentText - Segment text. Non-string values are coerced
 *   to a string and null/undefined are treated as empty, so the function no
 *   longer throws on them.
 * @param {number} index - Zero-based position of this segment.
 * @param {number} totalSegments - Number of segments in the reply.
 * @returns {{pace: string, pitch: string, energy: string, volume: string, pause_after_ms: number}}
 */
function inferSegmentStyle(segmentText, index, totalSegments) {
  // The previous implementation called `.toLowerCase()` directly on the
  // argument and raised a TypeError for null, undefined, or non-string input.
  const text = typeof segmentText === 'string' ? segmentText : String(segmentText ?? '');
  const lowered = text.toLowerCase();

  // Same comparison as before so NaN/undefined positions still yield no pause.
  const hasFollowingSegment = index < totalSegments - 1;
  const pauseBeforeNext = (ms) => (hasFollowingSegment ? ms : 0);

  // Values shared by most categories; branches override only what differs.
  const baseStyle = {
    pace: 'medium',
    pitch: 'neutral',
    energy: 'warm',
    volume: 'normal',
  };

  if (/\?/.test(text)) {
    return { ...baseStyle, pitch: 'slightly_high', pause_after_ms: 0 };
  }

  const exclamatory =
    /!/.test(text) || /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(lowered);
  if (exclamatory) {
    return {
      ...baseStyle,
      pace: 'medium_fast',
      pitch: 'slightly_high',
      energy: 'bright',
      pause_after_ms: pauseBeforeNext(220),
    };
  }

  const reflective =
    /\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(lowered) ||
    text.length > 60;
  if (reflective) {
    return { ...baseStyle, pause_after_ms: pauseBeforeNext(260) };
  }

  return { ...baseStyle, pause_after_ms: pauseBeforeNext(180) };
}
312
+
313
/**
 * Builds fallback spoken metadata (language plus styled segments) from text.
 *
 * The text is split with `splitSpeechSegments`, and each piece is combined
 * with the prosody hints that `inferSegmentStyle` returns for its position.
 *
 * @param {unknown} text - Speech text to segment.
 * @returns {{language: string, segments: object[]}|null} Null when the text
 *   produces no segments.
 */
function synthesizeSpokenSegments(text) {
  const language = inferSpokenLanguage(text);
  const pieces = splitSpeechSegments(text);
  const total = pieces.length;
  if (total === 0) return null;

  const segments = pieces.map((piece, position) =>
    Object.assign({ text: piece }, inferSegmentStyle(piece, position, total)),
  );

  return { language, segments };
}
328
+
181
329
  function normalizeSpokenMetadata(spoken) {
182
330
  if (!spoken || typeof spoken !== 'object' || Array.isArray(spoken)) return null;
183
331
 
@@ -185,12 +333,33 @@ function normalizeSpokenMetadata(spoken) {
185
333
  if (!text) return null;
186
334
 
187
335
  const normalized = { text };
336
+ const language = toString(spoken.language);
337
+ if (BOUNDED_LANGUAGE_TYPES.has(language)) {
338
+ normalized.language = language;
339
+ }
340
+
341
+ const explicitSegments =
342
+ Array.isArray(spoken.segments)
343
+ ? spoken.segments.map((segment) => normalizeSpokenSegment(segment)).filter(Boolean)
344
+ : [];
345
+ if (explicitSegments.length > 0) {
346
+ normalized.segments = explicitSegments;
347
+ }
348
+
188
349
  const instructions = toString(spoken.instructions);
189
350
  if (instructions) normalized.instructions = instructions;
190
351
  if (spoken.style && typeof spoken.style === 'object' && !Array.isArray(spoken.style)) {
191
352
  normalized.style = spoken.style;
192
353
  }
193
354
 
355
+ const fallbackSegments = synthesizeSpokenSegments(text);
356
+ if (!normalized.language && fallbackSegments?.language) {
357
+ normalized.language = fallbackSegments.language;
358
+ }
359
+ if (!normalized.segments && fallbackSegments?.segments?.length) {
360
+ normalized.segments = fallbackSegments.segments;
361
+ }
362
+
194
363
  return normalized;
195
364
  }
196
365
 
@@ -216,6 +385,7 @@ function normalizeSpeechText(text) {
216
385
  function inferSpokenMetadataFromContent(content) {
217
386
  const text = normalizeSpeechText(toString(content));
218
387
  if (!text) return null;
388
+ const synthesized = synthesizeSpokenSegments(text);
219
389
 
220
390
  const normalized = text.toLowerCase();
221
391
  const upbeat =
@@ -228,6 +398,8 @@ function inferSpokenMetadataFromContent(content) {
228
398
  if (upbeat) {
229
399
  return {
230
400
  text,
401
+ language: synthesized?.language || 'English',
402
+ segments: synthesized?.segments,
231
403
  instructions: 'Speak with warm, upbeat conversational energy and natural pacing.',
232
404
  style: { emotion: 'upbeat', energy: 'medium' },
233
405
  };
@@ -236,6 +408,8 @@ function inferSpokenMetadataFromContent(content) {
236
408
  if (gentle) {
237
409
  return {
238
410
  text,
411
+ language: synthesized?.language || 'English',
412
+ segments: synthesized?.segments,
239
413
  instructions: 'Speak gently and reassuringly, with a calm pace and soft emphasis.',
240
414
  style: { emotion: 'gentle', energy: 'low' },
241
415
  };
@@ -244,6 +418,8 @@ function inferSpokenMetadataFromContent(content) {
244
418
  if (curious) {
245
419
  return {
246
420
  text,
421
+ language: synthesized?.language || 'English',
422
+ segments: synthesized?.segments,
247
423
  instructions: 'Speak naturally with curious, engaged intonation and a conversational pace.',
248
424
  style: { emotion: 'curious', energy: 'medium' },
249
425
  };
@@ -251,6 +427,8 @@ function inferSpokenMetadataFromContent(content) {
251
427
 
252
428
  return {
253
429
  text,
430
+ language: synthesized?.language || 'English',
431
+ segments: synthesized?.segments,
254
432
  instructions: 'Speak naturally with light warmth and conversational pacing.',
255
433
  style: { emotion: 'neutral', energy: 'medium' },
256
434
  };
@@ -2,7 +2,7 @@
2
2
  "id": "oomi-ai",
3
3
  "name": "Oomi Channel Plugin",
4
4
  "description": "Managed Oomi channel integration for OpenClaw.",
5
- "version": "0.2.16",
5
+ "version": "0.2.17",
6
6
  "author": "Oomi",
7
7
  "license": "MIT",
8
8
  "openclawVersion": ">=0.5.0",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "oomi-ai",
3
- "version": "0.2.16",
3
+ "version": "0.2.17",
4
4
  "description": "Oomi OpenClaw channel plugin and bridge tooling",
5
5
  "bin": {
6
6
  "oomi": "bin/oomi-ai.js"
@@ -139,6 +139,25 @@ Use this shape when a voice turn needs more natural delivery without changing vi
139
139
  "metadata": {
140
140
  "spoken": {
141
141
  "text": "Speech-optimized text for TTS only.",
142
+ "language": "English",
143
+ "segments": [
144
+ {
145
+ "text": "Hey! It's Nemu, but close enough.",
146
+ "pace": "medium_fast",
147
+ "pitch": "slightly_high",
148
+ "energy": "bright",
149
+ "volume": "normal",
150
+ "pause_after_ms": 220
151
+ },
152
+ {
153
+ "text": "Right now, I'm just waking up into this conversation with you.",
154
+ "pace": "medium",
155
+ "pitch": "neutral",
156
+ "energy": "warm",
157
+ "volume": "normal",
158
+ "pause_after_ms": 280
159
+ }
160
+ ],
142
161
  "instructions": "Speak with upbeat, warm excitement and slightly rising intonation.",
143
162
  "style": {
144
163
  "emotion": "excited",
@@ -154,9 +173,11 @@ Rules:
154
173
  - do not place raw intonation tags in visible chat
155
174
  - for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
156
175
  - `metadata.spoken.text` is backend TTS input only
176
+ - `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
177
+ - `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, energy, volume, and pause timing
157
178
  - `metadata.spoken.instructions` should use natural-language speaking guidance
158
179
  - if the speech sidecar is absent, Oomi speaks the visible assistant text
159
- - if you omit `metadata.spoken`, the plugin synthesizes a conservative hidden fallback from visible assistant text
180
+ - if you omit `metadata.spoken`, the plugin synthesizes a bounded hidden fallback from visible assistant text
160
181
 
161
182
  ## Avatar Control
162
183
 
@@ -71,8 +71,10 @@ Rules:
71
71
  - visible `content` remains the source of truth for Oomi chat rendering
72
72
  - for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
73
73
  - `metadata.spoken.text` is for backend TTS only
74
+ - `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
75
+ - `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, energy, volume, and pause timing
74
76
  - `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
75
77
  - `metadata.spoken.style` is optional metadata for debugging or future mapping
76
78
  - if no hidden speech sidecar exists, Oomi falls back to speaking the visible assistant text
77
- - if you omit `metadata.spoken`, the plugin now synthesizes a conservative hidden fallback from visible assistant text
79
+ - if you omit `metadata.spoken`, the plugin now synthesizes a bounded hidden fallback from visible assistant text
78
80
  - visible chat text is never rewritten by the plugin