oomi-ai 0.2.16 → 0.2.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/agent_instructions.md +3 -1
- package/openclaw.extension.js +178 -0
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/skills/oomi/SKILL.md +22 -1
- package/skills/oomi/agent_instructions.md +3 -1
package/README.md
CHANGED
|
@@ -133,7 +133,7 @@ That bridge:
|
|
|
133
133
|
This is the part of the package most likely to matter when debugging voice turn failures.
|
|
134
134
|
|
|
135
135
|
For managed voice replies, the extension also preserves an explicit hidden `metadata.spoken` sidecar when upstream provides one.
|
|
136
|
-
If upstream does not provide one, the extension now synthesizes a
|
|
136
|
+
If upstream does not provide one, the extension now synthesizes a bounded hidden fallback from the visible assistant text so backend TTS can speak a cleaner and more varied version without changing user-visible chat.
|
|
137
137
|
|
|
138
138
|
## Bridge Health States
|
|
139
139
|
|
package/agent_instructions.md
CHANGED
|
@@ -164,13 +164,15 @@ Rules:
|
|
|
164
164
|
- visible `content` remains the source of truth for Oomi chat rendering
|
|
165
165
|
- for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
|
|
166
166
|
- `metadata.spoken.text` is for backend TTS only
|
|
167
|
+
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
168
|
+
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, energy, volume, and pause timing
|
|
167
169
|
- `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
|
|
168
170
|
- `metadata.spoken.style` is optional metadata for debugging/future mapping
|
|
169
171
|
- if no hidden speech sidecar exists, Oomi falls back to speaking the visible assistant text
|
|
170
172
|
|
|
171
173
|
Current plugin behavior:
|
|
172
174
|
- if you provide `metadata.spoken`, the plugin preserves it unchanged
|
|
173
|
-
- if you do not provide `metadata.spoken`, the plugin now synthesizes a
|
|
175
|
+
- if you do not provide `metadata.spoken`, the plugin now synthesizes a bounded hidden fallback from visible assistant text for backend TTS
|
|
174
176
|
- visible chat text is still never rewritten by the plugin
|
|
175
177
|
|
|
176
178
|
## Avatar Commands
|
package/openclaw.extension.js
CHANGED
|
@@ -178,6 +178,154 @@ function extractCorrelationId(payload) {
|
|
|
178
178
|
return '';
|
|
179
179
|
}
|
|
180
180
|
|
|
181
|
+
/**
 * Language labels accepted for `metadata.spoken.language`.
 * Any value outside this set is dropped during normalization.
 */
const BOUNDED_LANGUAGE_TYPES = new Set([
  'Auto',
  'Chinese', 'English', 'German', 'Italian', 'Portuguese',
  'Spanish', 'Japanese', 'Korean', 'French', 'Russian',
]);

/** Allowed per-segment speaking pace values. */
const BOUNDED_PACE_VALUES = new Set([
  'very_slow',
  'slow',
  'medium',
  'medium_fast',
  'fast',
]);

/** Allowed per-segment pitch values. */
const BOUNDED_PITCH_VALUES = new Set([
  'low',
  'slightly_low',
  'neutral',
  'slightly_high',
  'high',
]);

/** Allowed per-segment energy values. */
const BOUNDED_ENERGY_VALUES = new Set([
  'soft',
  'calm',
  'warm',
  'bright',
  'intense',
]);

/** Allowed per-segment volume values. */
const BOUNDED_VOLUME_VALUES = new Set([
  'soft',
  'normal',
  'projected',
]);
|
|
199
|
+
|
|
200
|
+
/**
 * Picks the language label attached to synthesized speech metadata.
 *
 * No language detection is performed yet: every input, including empty or
 * non-string values, maps to 'English'. The parameter is kept so call sites
 * need no change once real detection is added.
 *
 * @param {unknown} text - Speech text; currently not inspected.
 * @returns {string} Always 'English'.
 */
function inferSpokenLanguage(text) {
  // The input is deliberately ignored; there is a single possible result.
  return 'English';
}
|
|
205
|
+
|
|
206
|
+
/**
 * Validates one caller-supplied speech segment.
 *
 * @param {unknown} segment - Candidate segment; must be a non-array object
 *   whose `text` is non-empty after `toString` coercion.
 * @returns {object|null} A new object with `text`, any of
 *   `pace`/`pitch`/`energy`/`volume` whose value is in its allowed set, and
 *   `pause_after_ms` (always present, taken from `toNumber` with default 0
 *   and bounds 0..1200). Returns null when the segment is unusable.
 */
function normalizeSpokenSegment(segment) {
  const isObjectLike = Boolean(segment) && typeof segment === 'object' && !Array.isArray(segment);
  if (!isObjectLike) return null;

  const text = toString(segment.text);
  if (!text) return null;

  const result = { text };

  // Each prosody field is copied only when its value is in the matching set;
  // unknown values are dropped rather than replaced with a default.
  const boundedFields = [
    ['pace', BOUNDED_PACE_VALUES],
    ['pitch', BOUNDED_PITCH_VALUES],
    ['energy', BOUNDED_ENERGY_VALUES],
    ['volume', BOUNDED_VOLUME_VALUES],
  ];
  for (const [field, allowedValues] of boundedFields) {
    const candidate = toString(segment[field]);
    if (allowedValues.has(candidate)) {
      result[field] = candidate;
    }
  }

  result.pause_after_ms = toNumber(segment.pause_after_ms, 0, { min: 0, max: 1200 });
  return result;
}
|
|
227
|
+
|
|
228
|
+
/**
 * Breaks speech text into at most five segments.
 *
 * The text is first passed through `normalizeSpeechText`, then split after
 * sentence-ending punctuation (`.`, `!`, `?`). Sentences longer than 96
 * characters are split again at commas, with the comma kept on every clause
 * except the last. If more than five pieces result, the first four are kept
 * and the remainder is joined into a single final segment.
 *
 * @param {unknown} text - Raw speech text.
 * @returns {string[]} Ordered segments; empty when the normalized text is empty.
 */
function splitSpeechSegments(text) {
  const speech = normalizeSpeechText(text);
  if (!speech) return [];

  const maxSentenceLength = 96;
  const maxSegments = 5;

  const sentences = speech
    .split(/(?<=[.!?])\s+/)
    .map((sentence) => sentence.trim())
    .filter(Boolean);

  const pieces = sentences.flatMap((sentence) => {
    if (sentence.length <= maxSentenceLength) return [sentence];

    const clauses = sentence
      .split(/,\s+/)
      .map((clause) => clause.trim())
      .filter(Boolean);

    // Without at least two clauses there is nothing to split on; keep the long sentence whole.
    if (clauses.length <= 1) return [sentence];

    const lastClause = clauses.length - 1;
    return clauses.map((clause, position) => {
      // Restore the comma consumed by the split, except on the final clause
      // or when the clause already ends with sentence punctuation.
      const restoreComma = position < lastClause && !/[.!?]$/.test(clause);
      return restoreComma ? `${clause},` : clause;
    });
  });

  if (pieces.length <= maxSegments) return pieces;

  const head = pieces.slice(0, maxSegments - 1);
  const tail = pieces.slice(maxSegments - 1).join(' ').trim();
  return [...head, tail];
}
|
|
265
|
+
|
|
266
|
+
/**
 * Chooses prosody hints for one speech segment from simple textual cues.
 *
 * Cues are checked in priority order:
 *   1. contains `?`  -> curious (never followed by a pause)
 *   2. contains `!` or an upbeat keyword -> bright and slightly faster
 *   3. reflective phrasing or more than 60 characters -> longer pause
 *   4. otherwise -> neutral default
 *
 * @param {string} segmentText - Segment text to inspect.
 * @param {number} index - Zero-based position of the segment.
 * @param {number} totalSegments - Number of segments in the utterance.
 * @returns {{pace: string, pitch: string, energy: string, volume: string, pause_after_ms: number}}
 */
function inferSegmentStyle(segmentText, index, totalSegments) {
  const lowered = segmentText.toLowerCase();

  // Pauses are only inserted between segments, never after the final one.
  const pauseUnlessLast = (ms) => (index < totalSegments - 1 ? ms : 0);

  // Every style uses normal volume; only the other four fields vary.
  const prosody = (pace, pitch, energy, pauseAfterMs) => ({
    pace,
    pitch,
    energy,
    volume: 'normal',
    pause_after_ms: pauseAfterMs,
  });

  if (/\?/.test(segmentText)) {
    return prosody('medium', 'slightly_high', 'warm', 0);
  }

  const soundsExcited =
    /!/.test(segmentText) ||
    /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(lowered);
  if (soundsExcited) {
    return prosody('medium_fast', 'slightly_high', 'bright', pauseUnlessLast(220));
  }

  const soundsReflective =
    /\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(lowered) ||
    segmentText.length > 60;

  // Reflective and default segments share the same voice; only the pause length differs.
  return prosody('medium', 'neutral', 'warm', pauseUnlessLast(soundsReflective ? 260 : 180));
}
|
|
312
|
+
|
|
313
|
+
/**
 * Builds fallback speech metadata (language plus styled segments) from text.
 *
 * @param {unknown} text - Speech text to segment.
 * @returns {{language: string, segments: object[]}|null} The inferred language
 *   and one styled entry per segment, or null when the text yields no segments.
 */
function synthesizeSpokenSegments(text) {
  const language = inferSpokenLanguage(text);
  const pieces = splitSpeechSegments(text);
  const total = pieces.length;
  if (total === 0) return null;

  const segments = [];
  for (let position = 0; position < total; position += 1) {
    const piece = pieces[position];
    // `text` is listed first so it stays the leading key of each segment.
    segments.push({ text: piece, ...inferSegmentStyle(piece, position, total) });
  }

  return { language, segments };
}
|
|
328
|
+
|
|
181
329
|
function normalizeSpokenMetadata(spoken) {
|
|
182
330
|
if (!spoken || typeof spoken !== 'object' || Array.isArray(spoken)) return null;
|
|
183
331
|
|
|
@@ -185,12 +333,33 @@ function normalizeSpokenMetadata(spoken) {
|
|
|
185
333
|
if (!text) return null;
|
|
186
334
|
|
|
187
335
|
const normalized = { text };
|
|
336
|
+
const language = toString(spoken.language);
|
|
337
|
+
if (BOUNDED_LANGUAGE_TYPES.has(language)) {
|
|
338
|
+
normalized.language = language;
|
|
339
|
+
}
|
|
340
|
+
|
|
341
|
+
const explicitSegments =
|
|
342
|
+
Array.isArray(spoken.segments)
|
|
343
|
+
? spoken.segments.map((segment) => normalizeSpokenSegment(segment)).filter(Boolean)
|
|
344
|
+
: [];
|
|
345
|
+
if (explicitSegments.length > 0) {
|
|
346
|
+
normalized.segments = explicitSegments;
|
|
347
|
+
}
|
|
348
|
+
|
|
188
349
|
const instructions = toString(spoken.instructions);
|
|
189
350
|
if (instructions) normalized.instructions = instructions;
|
|
190
351
|
if (spoken.style && typeof spoken.style === 'object' && !Array.isArray(spoken.style)) {
|
|
191
352
|
normalized.style = spoken.style;
|
|
192
353
|
}
|
|
193
354
|
|
|
355
|
+
const fallbackSegments = synthesizeSpokenSegments(text);
|
|
356
|
+
if (!normalized.language && fallbackSegments?.language) {
|
|
357
|
+
normalized.language = fallbackSegments.language;
|
|
358
|
+
}
|
|
359
|
+
if (!normalized.segments && fallbackSegments?.segments?.length) {
|
|
360
|
+
normalized.segments = fallbackSegments.segments;
|
|
361
|
+
}
|
|
362
|
+
|
|
194
363
|
return normalized;
|
|
195
364
|
}
|
|
196
365
|
|
|
@@ -216,6 +385,7 @@ function normalizeSpeechText(text) {
|
|
|
216
385
|
function inferSpokenMetadataFromContent(content) {
|
|
217
386
|
const text = normalizeSpeechText(toString(content));
|
|
218
387
|
if (!text) return null;
|
|
388
|
+
const synthesized = synthesizeSpokenSegments(text);
|
|
219
389
|
|
|
220
390
|
const normalized = text.toLowerCase();
|
|
221
391
|
const upbeat =
|
|
@@ -228,6 +398,8 @@ function inferSpokenMetadataFromContent(content) {
|
|
|
228
398
|
if (upbeat) {
|
|
229
399
|
return {
|
|
230
400
|
text,
|
|
401
|
+
language: synthesized?.language || 'English',
|
|
402
|
+
segments: synthesized?.segments,
|
|
231
403
|
instructions: 'Speak with warm, upbeat conversational energy and natural pacing.',
|
|
232
404
|
style: { emotion: 'upbeat', energy: 'medium' },
|
|
233
405
|
};
|
|
@@ -236,6 +408,8 @@ function inferSpokenMetadataFromContent(content) {
|
|
|
236
408
|
if (gentle) {
|
|
237
409
|
return {
|
|
238
410
|
text,
|
|
411
|
+
language: synthesized?.language || 'English',
|
|
412
|
+
segments: synthesized?.segments,
|
|
239
413
|
instructions: 'Speak gently and reassuringly, with a calm pace and soft emphasis.',
|
|
240
414
|
style: { emotion: 'gentle', energy: 'low' },
|
|
241
415
|
};
|
|
@@ -244,6 +418,8 @@ function inferSpokenMetadataFromContent(content) {
|
|
|
244
418
|
if (curious) {
|
|
245
419
|
return {
|
|
246
420
|
text,
|
|
421
|
+
language: synthesized?.language || 'English',
|
|
422
|
+
segments: synthesized?.segments,
|
|
247
423
|
instructions: 'Speak naturally with curious, engaged intonation and a conversational pace.',
|
|
248
424
|
style: { emotion: 'curious', energy: 'medium' },
|
|
249
425
|
};
|
|
@@ -251,6 +427,8 @@ function inferSpokenMetadataFromContent(content) {
|
|
|
251
427
|
|
|
252
428
|
return {
|
|
253
429
|
text,
|
|
430
|
+
language: synthesized?.language || 'English',
|
|
431
|
+
segments: synthesized?.segments,
|
|
254
432
|
instructions: 'Speak naturally with light warmth and conversational pacing.',
|
|
255
433
|
style: { emotion: 'neutral', energy: 'medium' },
|
|
256
434
|
};
|
package/openclaw.plugin.json
CHANGED
package/package.json
CHANGED
package/skills/oomi/SKILL.md
CHANGED
|
@@ -139,6 +139,25 @@ Use this shape when a voice turn needs more natural delivery without changing vi
|
|
|
139
139
|
"metadata": {
|
|
140
140
|
"spoken": {
|
|
141
141
|
"text": "Speech-optimized text for TTS only.",
|
|
142
|
+
"language": "English",
|
|
143
|
+
"segments": [
|
|
144
|
+
{
|
|
145
|
+
"text": "Hey! It's Nemu, but close enough.",
|
|
146
|
+
"pace": "medium_fast",
|
|
147
|
+
"pitch": "slightly_high",
|
|
148
|
+
"energy": "bright",
|
|
149
|
+
"volume": "normal",
|
|
150
|
+
"pause_after_ms": 220
|
|
151
|
+
},
|
|
152
|
+
{
|
|
153
|
+
"text": "Right now, I'm just waking up into this conversation with you.",
|
|
154
|
+
"pace": "medium",
|
|
155
|
+
"pitch": "neutral",
|
|
156
|
+
"energy": "warm",
|
|
157
|
+
"volume": "normal",
|
|
158
|
+
"pause_after_ms": 280
|
|
159
|
+
}
|
|
160
|
+
],
|
|
142
161
|
"instructions": "Speak with upbeat, warm excitement and slightly rising intonation.",
|
|
143
162
|
"style": {
|
|
144
163
|
"emotion": "excited",
|
|
@@ -154,9 +173,11 @@ Rules:
|
|
|
154
173
|
- do not place raw intonation tags in visible chat
|
|
155
174
|
- for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
|
|
156
175
|
- `metadata.spoken.text` is backend TTS input only
|
|
176
|
+
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
177
|
+
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, energy, volume, and pause timing
|
|
157
178
|
- `metadata.spoken.instructions` should use natural-language speaking guidance
|
|
158
179
|
- if the speech sidecar is absent, Oomi speaks the visible assistant text
|
|
159
|
-
- if you omit `metadata.spoken`, the plugin synthesizes a
|
|
180
|
+
- if you omit `metadata.spoken`, the plugin synthesizes a bounded hidden fallback from visible assistant text
|
|
160
181
|
|
|
161
182
|
## Avatar Control
|
|
162
183
|
|
|
@@ -71,8 +71,10 @@ Rules:
|
|
|
71
71
|
- visible `content` remains the source of truth for Oomi chat rendering
|
|
72
72
|
- for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
|
|
73
73
|
- `metadata.spoken.text` is for backend TTS only
|
|
74
|
+
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
75
|
+
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, energy, volume, and pause timing
|
|
74
76
|
- `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
|
|
75
77
|
- `metadata.spoken.style` is optional metadata for debugging or future mapping
|
|
76
78
|
- if no hidden speech sidecar exists, Oomi falls back to speaking the visible assistant text
|
|
77
|
-
- if you omit `metadata.spoken`, the plugin now synthesizes a
|
|
79
|
+
- if you omit `metadata.spoken`, the plugin now synthesizes a bounded hidden fallback from visible assistant text
|
|
78
80
|
- visible chat text is never rewritten by the plugin
|