oomi-ai 0.2.14 → 0.2.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -0
- package/agent_instructions.md +35 -0
- package/openclaw.extension.js +107 -2
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/skills/oomi/SKILL.md +30 -0
- package/skills/oomi/agent_instructions.md +32 -0
package/README.md
CHANGED
|
@@ -132,6 +132,9 @@ That bridge:
|
|
|
132
132
|
|
|
133
133
|
This is the part of the package most likely to matter when debugging voice turn failures.
|
|
134
134
|
|
|
135
|
+
For managed voice replies, the extension also preserves an explicit hidden `metadata.spoken` sidecar when upstream provides one.
|
|
136
|
+
If upstream does not provide one, the extension now synthesizes a conservative hidden fallback from the visible assistant text so backend TTS can speak a cleaner version without changing user-visible chat.
|
|
137
|
+
|
|
135
138
|
## Bridge Health States
|
|
136
139
|
|
|
137
140
|
The bridge status file is written locally and should roughly be interpreted as:
|
|
@@ -198,6 +201,7 @@ If you are inspecting this package on npm, the main architectural points are:
|
|
|
198
201
|
- `idempotencyKey` handling
|
|
199
202
|
- bridge status that does not report `connected` before managed subscription is ready
|
|
200
203
|
- runtime fault isolation so local session failures are less likely to crash the whole provider
|
|
204
|
+
- hidden managed-voice speech metadata forwarding, with a synthesized fallback when upstream does not provide `metadata.spoken`
|
|
201
205
|
|
|
202
206
|
If you are developing the plugin, test the packaged surface with:
|
|
203
207
|
|
package/agent_instructions.md
CHANGED
|
@@ -138,6 +138,41 @@ That means:
|
|
|
138
138
|
- STT can succeed even when assistant reply delivery is broken later in the run
|
|
139
139
|
- a `voice_session_*` failure should be investigated, but it should not automatically be treated as proof that all normal Oomi chat is down
|
|
140
140
|
|
|
141
|
+
### Hidden Speech Payload
|
|
142
|
+
|
|
143
|
+
For managed voice turns, keep visible assistant chat text natural and user-facing.
|
|
144
|
+
Do not put spoken-style tags like `[happy]`, `[sad]`, or `[excited]` into visible chat text.
|
|
145
|
+
|
|
146
|
+
When the runtime supports it, voice turns may include a hidden speech sidecar on the assistant message:
|
|
147
|
+
|
|
148
|
+
```json
|
|
149
|
+
{
|
|
150
|
+
"metadata": {
|
|
151
|
+
"spoken": {
|
|
152
|
+
"text": "Speech-optimized text for TTS only.",
|
|
153
|
+
"instructions": "Speak with upbeat, warm excitement and slightly rising intonation.",
|
|
154
|
+
"style": {
|
|
155
|
+
"emotion": "excited",
|
|
156
|
+
"energy": "medium_high"
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Rules:
|
|
164
|
+
- visible `content` remains the source of truth for Oomi chat rendering
|
|
165
|
+
- for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
|
|
166
|
+
- `metadata.spoken.text` is for backend TTS only
|
|
167
|
+
- `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
|
|
168
|
+
- `metadata.spoken.style` is optional metadata for debugging/future mapping
|
|
169
|
+
- if no hidden speech sidecar exists, Oomi falls back to speaking the visible assistant text
|
|
170
|
+
|
|
171
|
+
Current plugin behavior:
|
|
172
|
+
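- if you provide `metadata.spoken`, the plugin forwards its `text`, `instructions`, and `style` fields; any other keys are dropped, and a sidecar without usable `text` is removed rather than replaced by a synthesized fallback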
|
|
173
|
+
- if you do not provide `metadata.spoken`, the plugin now synthesizes a conservative hidden fallback from visible assistant text for backend TTS
|
|
174
|
+
- visible chat text is still never rewritten by the plugin
|
|
175
|
+
|
|
141
176
|
## Avatar Commands
|
|
142
177
|
|
|
143
178
|
Before using avatar commands, call `get_avatar_capabilities` and prefer canonical values.
|
package/openclaw.extension.js
CHANGED
|
@@ -178,6 +178,110 @@ function extractCorrelationId(payload) {
|
|
|
178
178
|
return '';
|
|
179
179
|
}
|
|
180
180
|
|
|
181
|
+
/**
 * Reduce an upstream-provided `metadata.spoken` value to the fields this
 * extension forwards: `text`, plus optional `instructions` and `style`.
 *
 * Both string fields are passed through the file-level `toString` helper
 * (defined outside this block; its exact coercion rules are not visible here).
 *
 * @param {unknown} spoken - Candidate speech sidecar taken from message metadata.
 * @returns {{ text: string, instructions?: string, style?: object } | null}
 *   The reduced sidecar, or `null` when `spoken` is not a non-array object or
 *   its `text` comes back empty.
 */
function normalizeSpokenMetadata(spoken) {
  const isPlainObject = Boolean(spoken) && typeof spoken === 'object' && !Array.isArray(spoken);
  if (!isPlainObject) {
    return null;
  }

  // Without text there is nothing for TTS to say, so the whole sidecar is rejected.
  const speechText = toString(spoken.text);
  if (!speechText) {
    return null;
  }

  const guidance = toString(spoken.instructions);
  const { style } = spoken;
  const hasStyleObject = Boolean(style) && typeof style === 'object' && !Array.isArray(style);

  // Keys are added in the order text -> instructions -> style; the style object
  // is forwarded by reference, not copied.
  return {
    text: speechText,
    ...(guidance ? { instructions: guidance } : {}),
    ...(hasStyleObject ? { style } : {}),
  };
}
|
|
196
|
+
|
|
197
|
+
/**
 * Remove emoji from `text` so that speech synthesis never receives them.
 *
 * Besides the pictographic code points themselves, this also removes the
 * invisible characters that only exist to build emoji sequences, so nothing
 * is left behind once the emoji is gone:
 *   - zero-width joiners that glue ZWJ sequences together (family, profession emoji),
 *   - the combining keycap mark U+20E3 (the base digit / `#` / `*` is kept),
 *   - tag characters used by subdivision flags,
 *   - variation selectors U+FE0E / U+FE0F anywhere in the string.
 *
 * A zero-width joiner that does not directly follow an emoji is left alone,
 * because it is meaningful in several non-emoji scripts.
 *
 * @param {string} text - Text to clean; must be a string.
 * @returns {string} `text` without emoji. Surrounding whitespace is not collapsed.
 */
function stripEmoji(text) {
  return (
    text
      // Keycap sequences such as "1\uFE0F\u20E3": keep the readable base character.
      .replace(/([0-9#*])\uFE0F?\u20E3/g, '$1')
      // An emoji together with every joiner/selector/tag character attached to it.
      .replace(
        /(?:\p{Extended_Pictographic}|\p{Emoji_Presentation})[\uFE0E\uFE0F\u200D\u20E3\u{E0020}-\u{E007F}]*/gu,
        '',
      )
      // Any variation selector that was not attached to a removed emoji.
      .replace(/[\uFE0E\uFE0F]/g, '')
  );
}
|
|
200
|
+
|
|
201
|
+
/**
 * Turn visible chat text into a cleaner string for text-to-speech.
 *
 * Steps, in order: remove emoji (via `stripEmoji`), unwrap `**bold**`,
 * `__bold__` and `` `code` `` markers, turn en/em dashes into comma pauses,
 * spell the ellipsis character as three dots, collapse whitespace, remove
 * whitespace before punctuation, and insert a missing space after
 * punctuation that is glued to the next word.
 *
 * The space-insertion step is deliberately narrow so it does not damage text
 * that is already correct:
 *   - `, ; ! ?` get a space only when a letter follows directly, so `1,000`
 *     and `?!` are left intact;
 *   - `.` gets a space only between a lowercase letter and an uppercase
 *     letter (a missed sentence break), so `...`, `3.14`, `example.com` and
 *     version numbers are left intact.
 *
 * @param {string} text - Visible assistant text; must be a string.
 * @returns {string} Trimmed, single-spaced text for TTS (may be empty).
 */
function normalizeSpeechText(text) {
  return (
    stripEmoji(text)
      .replace(/\*\*(.*?)\*\*/g, '$1')
      .replace(/__(.*?)__/g, '$1')
      .replace(/`([^`]+)`/g, '$1')
      // En dash / em dash become a spoken pause.
      .replace(/[\u2013\u2014]/g, ', ')
      // Horizontal ellipsis character becomes three ASCII dots.
      .replace(/\u2026/g, '...')
      .replace(/\s+/g, ' ')
      .replace(/\s+([,.;!?])/g, '$1')
      // Missing space after a clause/sentence mark that is glued to a word.
      .replace(/([,;!?])(?=\p{L})/gu, '$1 ')
      // Missing space at a sentence break such as "done.Next".
      .replace(/(\p{Ll})\.(?=\p{Lu})/gu, '$1. ')
      // Dash replacement can leave doubled commas behind; merge them.
      .replace(/,\s*,+/g, ', ')
      .replace(/\s+/g, ' ')
      .trim()
  );
}
|
|
215
|
+
|
|
216
|
+
/**
 * Build a fallback speech sidecar from the visible assistant text.
 *
 * The content is passed through the file-level `toString` helper (not visible
 * in this block) and `normalizeSpeechText`; the tone is then chosen by simple
 * keyword/punctuation checks, tried in this order:
 *   1. upbeat  - an exclamation mark or an enthusiastic keyword
 *   2. gentle  - an apologetic / reassuring keyword
 *   3. curious - a question mark
 *   4. neutral - none of the above
 *
 * @param {unknown} content - Visible assistant message content.
 * @returns {{ text: string, instructions: string, style: { emotion: string, energy: string } } | null}
 *   A freshly created sidecar, or `null` when the cleaned text is empty.
 */
function inferSpokenMetadataFromContent(content) {
  const text = normalizeSpeechText(toString(content));
  if (!text) {
    return null;
  }

  // Keyword checks run on a lower-cased copy; punctuation checks use the text as-is.
  const lowered = text.toLowerCase();
  const buildSidecar = (instructions, emotion, energy) => ({
    text,
    instructions,
    style: { emotion, energy },
  });

  const soundsUpbeat =
    text.includes('!') ||
    /\b(hell yeah|awesome|amazing|great|stoked|love|glad|perfect|nice|cool)\b/.test(lowered);
  if (soundsUpbeat) {
    return buildSidecar(
      'Speak with warm, upbeat conversational energy and natural pacing.',
      'upbeat',
      'medium',
    );
  }

  const soundsGentle =
    /\b(sorry|gentle|softly|careful|reassuring|calm|okay|it'?s okay|i know)\b/.test(lowered);
  if (soundsGentle) {
    return buildSidecar(
      'Speak gently and reassuringly, with a calm pace and soft emphasis.',
      'gentle',
      'low',
    );
  }

  const asksQuestion = text.includes('?');
  if (asksQuestion) {
    return buildSidecar(
      'Speak naturally with curious, engaged intonation and a conversational pace.',
      'curious',
      'medium',
    );
  }

  return buildSidecar(
    'Speak naturally with light warmth and conversational pacing.',
    'neutral',
    'medium',
  );
}
|
|
258
|
+
|
|
259
|
+
/**
 * Assemble the metadata object attached to an outgoing message.
 *
 * Works on a shallow copy of `payloadMetadata` (or an empty object when it is
 * not a non-array object), so the caller's object is never modified.
 *
 * Speech sidecar rules:
 *   - a usable `spoken` value from the payload is kept in its normalized form;
 *   - when the payload has no `spoken` key at all, a fallback is inferred from `content`;
 *   - when the payload has a `spoken` key that is unusable, the key is removed
 *     and no fallback is inferred.
 *
 * `accountId` is always overwritten; `correlationId` is set when truthy and
 * removed otherwise.
 *
 * @param {unknown} payloadMetadata - Metadata supplied with the outgoing payload.
 * @param {{ accountId: *, correlationId: *, content: * }} context - Values resolved by the caller.
 * @returns {object} The new metadata object.
 */
function normalizeOutgoingMetadata(payloadMetadata, { accountId, correlationId, content }) {
  const isPlainObject =
    Boolean(payloadMetadata) &&
    typeof payloadMetadata === 'object' &&
    !Array.isArray(payloadMetadata);
  const outgoing = isPlainObject ? { ...payloadMetadata } : {};

  // Key presence, not value validity, decides whether a fallback may be inferred.
  const callerSuppliedSpoken = Object.prototype.hasOwnProperty.call(outgoing, 'spoken');
  let sidecar = normalizeSpokenMetadata(outgoing.spoken);
  if (!sidecar && !callerSuppliedSpoken) {
    sidecar = inferSpokenMetadataFromContent(content);
  }

  if (sidecar) {
    outgoing.spoken = sidecar;
  } else {
    delete outgoing.spoken;
  }

  outgoing.accountId = accountId;

  if (correlationId) {
    outgoing.correlationId = correlationId;
  } else {
    delete outgoing.correlationId;
  }

  return outgoing;
}
|
|
284
|
+
|
|
181
285
|
async function postJson({ url, token, body, timeoutMs }) {
|
|
182
286
|
const controller = new AbortController();
|
|
183
287
|
const timeout = setTimeout(() => controller.abort(), timeoutMs);
|
|
@@ -289,10 +393,11 @@ const oomiChannelPlugin = {
|
|
|
289
393
|
sessionKey,
|
|
290
394
|
content,
|
|
291
395
|
source: 'openclaw.channel',
|
|
292
|
-
metadata: {
|
|
396
|
+
metadata: normalizeOutgoingMetadata(payload?.metadata, {
|
|
293
397
|
accountId: resolvedAccountId,
|
|
294
398
|
correlationId,
|
|
295
|
-
|
|
399
|
+
content,
|
|
400
|
+
}),
|
|
296
401
|
},
|
|
297
402
|
});
|
|
298
403
|
|
package/openclaw.plugin.json
CHANGED
package/package.json
CHANGED
package/skills/oomi/SKILL.md
CHANGED
|
@@ -128,6 +128,36 @@ Install packaged Oomi operator instructions into an OpenClaw `AGENTS.md` file.
|
|
|
128
128
|
python3 skills/oomi/scripts/install_agent_instructions.py
|
|
129
129
|
```
|
|
130
130
|
|
|
131
|
+
## Hidden Speech Payload
|
|
132
|
+
|
|
133
|
+
Managed voice can carry a hidden TTS-only speech sidecar alongside the normal assistant message.
|
|
134
|
+
|
|
135
|
+
Use this shape when a voice turn needs more natural delivery without changing visible chat text:
|
|
136
|
+
|
|
137
|
+
```json
|
|
138
|
+
{
|
|
139
|
+
"metadata": {
|
|
140
|
+
"spoken": {
|
|
141
|
+
"text": "Speech-optimized text for TTS only.",
|
|
142
|
+
"instructions": "Speak with upbeat, warm excitement and slightly rising intonation.",
|
|
143
|
+
"style": {
|
|
144
|
+
"emotion": "excited",
|
|
145
|
+
"energy": "medium_high"
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Rules:
|
|
153
|
+
- keep visible assistant `content` clean and user-facing
|
|
154
|
+
- do not place raw intonation tags in visible chat
|
|
155
|
+
- for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
|
|
156
|
+
- `metadata.spoken.text` is backend TTS input only
|
|
157
|
+
- `metadata.spoken.instructions` should use natural-language speaking guidance
|
|
158
|
+
- if the speech sidecar is absent, Oomi speaks the visible assistant text
|
|
159
|
+
- if you omit `metadata.spoken`, the plugin synthesizes a conservative hidden fallback from visible assistant text
|
|
160
|
+
|
|
131
161
|
## Avatar Control
|
|
132
162
|
|
|
133
163
|
Before emitting avatar commands, call `get_avatar_capabilities` and prefer canonical values.
|
|
@@ -44,3 +44,35 @@ Primary UX requirement:
|
|
|
44
44
|
- Return `/connect/<invite-token>` auth links only.
|
|
45
45
|
|
|
46
46
|
Do not ask users to paste gateway IP/token/password when managed connect is available.
|
|
47
|
+
|
|
48
|
+
## Hidden Speech Payload
|
|
49
|
+
|
|
50
|
+
For managed voice turns, keep visible assistant chat text natural and user-facing.
|
|
51
|
+
Do not put spoken-style tags like `[happy]`, `[sad]`, or `[excited]` into visible chat text.
|
|
52
|
+
|
|
53
|
+
When the runtime supports it, voice turns may include a hidden speech sidecar on the assistant message:
|
|
54
|
+
|
|
55
|
+
```json
|
|
56
|
+
{
|
|
57
|
+
"metadata": {
|
|
58
|
+
"spoken": {
|
|
59
|
+
"text": "Speech-optimized text for TTS only.",
|
|
60
|
+
"instructions": "Speak with upbeat, warm excitement and slightly rising intonation.",
|
|
61
|
+
"style": {
|
|
62
|
+
"emotion": "excited",
|
|
63
|
+
"energy": "medium_high"
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Rules:
|
|
71
|
+
- visible `content` remains the source of truth for Oomi chat rendering
|
|
72
|
+
- for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
|
|
73
|
+
- `metadata.spoken.text` is for backend TTS only
|
|
74
|
+
- `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
|
|
75
|
+
- `metadata.spoken.style` is optional metadata for debugging or future mapping
|
|
76
|
+
- if no hidden speech sidecar exists, Oomi falls back to speaking the visible assistant text
|
|
77
|
+
- if you omit `metadata.spoken`, the plugin now synthesizes a conservative hidden fallback from visible assistant text
|
|
78
|
+
- visible chat text is never rewritten by the plugin
|