oomi-ai 0.2.14 → 0.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -138,6 +138,36 @@ That means:
138
138
  - STT can succeed even when assistant reply delivery is broken later in the run
139
139
  - a `voice_session_*` failure should be investigated, but it should not automatically be treated as proof that all normal Oomi chat is down
140
140
 
141
+ ### Hidden Speech Payload
142
+
143
+ For managed voice turns, keep visible assistant chat text natural and user-facing.
144
+ Do not put spoken-style tags like `[happy]`, `[sad]`, or `[excited]` into visible chat text.
145
+
146
+ When the runtime supports it, voice turns may include a hidden speech sidecar on the assistant message:
147
+
148
+ ```json
149
+ {
150
+ "metadata": {
151
+ "spoken": {
152
+ "text": "Speech-optimized text for TTS only.",
153
+ "instructions": "Speak with upbeat, warm excitement and slightly rising intonation.",
154
+ "style": {
155
+ "emotion": "excited",
156
+ "energy": "medium_high"
157
+ }
158
+ }
159
+ }
160
+ }
161
+ ```
162
+
163
+ Rules:
164
+ - visible `content` remains the source of truth for Oomi chat rendering
165
+ - for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
166
+ - `metadata.spoken.text` is for backend TTS only
167
+ - `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
168
+ - `metadata.spoken.style` is optional metadata for debugging or future mapping
169
+ - if no hidden speech sidecar exists, Oomi falls back to speaking the visible assistant text
170
+
141
171
  ## Avatar Commands
142
172
 
143
173
  Before using avatar commands, call `get_avatar_capabilities` and prefer canonical values.
@@ -178,6 +178,45 @@ function extractCorrelationId(payload) {
178
178
  return '';
179
179
  }
180
180
 
181
/**
 * Validate a hidden speech sidecar (`metadata.spoken`) and reduce it to its
 * known fields.
 *
 * @param {unknown} spoken - Candidate sidecar value taken from message metadata.
 * @returns {{ text: string, instructions?: string, style?: object } | null}
 *   A new object holding `text` and, when present, `instructions` and `style`.
 *   Returns `null` when `spoken` is not a non-array object, or when the file's
 *   `toString` helper yields a falsy value for `spoken.text`.
 */
function normalizeSpokenMetadata(spoken) {
  if (!spoken || typeof spoken !== 'object' || Array.isArray(spoken)) return null;

  // `text` is the only required field; without it the sidecar is dropped.
  const text = toString(spoken.text);
  if (!text) return null;

  const normalized = { text };

  const instructions = toString(spoken.instructions);
  if (instructions) normalized.instructions = instructions;

  const { style } = spoken;
  if (style && typeof style === 'object' && !Array.isArray(style)) {
    // Shallow-copy so the normalized result does not alias the caller-owned
    // object; later changes on either side stay isolated.
    normalized.style = { ...style };
  }

  return normalized;
}
196
+
197
/**
 * Build the metadata object for an outgoing message.
 *
 * Starts from a shallow copy of `payloadMetadata` when it is a non-array
 * object (otherwise from an empty object), then:
 *   - replaces `spoken` with its normalized form, or removes it when
 *     `normalizeSpokenMetadata` rejects it;
 *   - always overwrites `accountId` with the supplied value;
 *   - sets `correlationId` when truthy, otherwise removes any existing one.
 *
 * @param {unknown} payloadMetadata - Metadata supplied with the payload, if any.
 * @param {{ accountId: *, correlationId: * }} identifiers - Values stamped onto the result.
 * @returns {object} A new metadata object; `payloadMetadata` itself is not modified.
 */
function normalizeOutgoingMetadata(payloadMetadata, { accountId, correlationId }) {
  const isObjectInput =
    Boolean(payloadMetadata) &&
    typeof payloadMetadata === 'object' &&
    !Array.isArray(payloadMetadata);
  const result = isObjectInput ? { ...payloadMetadata } : {};

  // Keep only a validated speech sidecar; anything else is stripped.
  const sidecar = normalizeSpokenMetadata(result.spoken);
  if (!sidecar) {
    delete result.spoken;
  } else {
    result.spoken = sidecar;
  }

  // The supplied identifiers always win over whatever the payload carried.
  result.accountId = accountId;
  if (!correlationId) {
    delete result.correlationId;
  } else {
    result.correlationId = correlationId;
  }

  return result;
}
219
+
181
220
  async function postJson({ url, token, body, timeoutMs }) {
182
221
  const controller = new AbortController();
183
222
  const timeout = setTimeout(() => controller.abort(), timeoutMs);
@@ -289,10 +328,10 @@ const oomiChannelPlugin = {
289
328
  sessionKey,
290
329
  content,
291
330
  source: 'openclaw.channel',
292
- metadata: {
331
+ metadata: normalizeOutgoingMetadata(payload?.metadata, {
293
332
  accountId: resolvedAccountId,
294
333
  correlationId,
295
- },
334
+ }),
296
335
  },
297
336
  });
298
337
 
@@ -2,7 +2,7 @@
2
2
  "id": "oomi-ai",
3
3
  "name": "Oomi Channel Plugin",
4
4
  "description": "Managed Oomi channel integration for OpenClaw.",
5
- "version": "0.2.14",
5
+ "version": "0.2.15",
6
6
  "author": "Oomi",
7
7
  "license": "MIT",
8
8
  "openclawVersion": ">=0.5.0",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "oomi-ai",
3
- "version": "0.2.14",
3
+ "version": "0.2.15",
4
4
  "description": "Oomi OpenClaw channel plugin and bridge tooling",
5
5
  "bin": {
6
6
  "oomi": "bin/oomi-ai.js"
@@ -128,6 +128,35 @@ Install packaged Oomi operator instructions into an OpenClaw `AGENTS.md` file.
128
128
  python3 skills/oomi/scripts/install_agent_instructions.py
129
129
  ```
130
130
 
131
+ ## Hidden Speech Payload
132
+
133
+ Managed voice can carry a hidden TTS-only speech sidecar alongside the normal assistant message.
134
+
135
+ Use this shape when a voice turn needs more natural delivery without changing visible chat text:
136
+
137
+ ```json
138
+ {
139
+ "metadata": {
140
+ "spoken": {
141
+ "text": "Speech-optimized text for TTS only.",
142
+ "instructions": "Speak with upbeat, warm excitement and slightly rising intonation.",
143
+ "style": {
144
+ "emotion": "excited",
145
+ "energy": "medium_high"
146
+ }
147
+ }
148
+ }
149
+ }
150
+ ```
151
+
152
+ Rules:
153
+ - keep visible assistant `content` clean and user-facing
154
+ - do not place raw intonation tags in visible chat
155
+ - for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
156
+ - `metadata.spoken.text` is backend TTS input only
157
+ - `metadata.spoken.instructions` should use natural-language speaking guidance
158
+ - if the speech sidecar is absent, Oomi speaks the visible assistant text
159
+
131
160
  ## Avatar Control
132
161
 
133
162
  Before emitting avatar commands, call `get_avatar_capabilities` and prefer canonical values.
@@ -44,3 +44,33 @@ Primary UX requirement:
44
44
  - Return `/connect/<invite-token>` auth links only.
45
45
 
46
46
  Do not ask users to paste gateway IP/token/password when managed connect is available.
47
+
48
+ ## Hidden Speech Payload
49
+
50
+ For managed voice turns, keep visible assistant chat text natural and user-facing.
51
+ Do not put spoken-style tags like `[happy]`, `[sad]`, or `[excited]` into visible chat text.
52
+
53
+ When the runtime supports it, voice turns may include a hidden speech sidecar on the assistant message:
54
+
55
+ ```json
56
+ {
57
+ "metadata": {
58
+ "spoken": {
59
+ "text": "Speech-optimized text for TTS only.",
60
+ "instructions": "Speak with upbeat, warm excitement and slightly rising intonation.",
61
+ "style": {
62
+ "emotion": "excited",
63
+ "energy": "medium_high"
64
+ }
65
+ }
66
+ }
67
+ }
68
+ ```
69
+
70
+ Rules:
71
+ - visible `content` remains the source of truth for Oomi chat rendering
72
+ - for managed voice replies, include `metadata.spoken` when delivery benefits from cleaner phrasing or explicit speaking guidance
73
+ - `metadata.spoken.text` is for backend TTS only
74
+ - `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
75
+ - `metadata.spoken.style` is optional metadata for debugging or future mapping
76
+ - if no hidden speech sidecar exists, Oomi falls back to speaking the visible assistant text