oomi-ai 0.2.19 → 0.2.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/oomi-ai.js CHANGED
@@ -12,6 +12,7 @@ import { scaffoldPersonaApp } from '../lib/scaffold.js';
12
12
  import { createPersonaApiClient } from '../lib/personaApiClient.js';
13
13
  import { startPersonaJobPoller } from '../lib/personaJobPoller.js';
14
14
  import { executePersonaJob } from '../lib/personaJobExecutor.js';
15
+ import { inferSpokenMetadataFromContent, normalizeSpokenMetadata } from '../lib/spokenMetadata.js';
15
16
  import {
16
17
  buildLocalPersonaRuntime,
17
18
  defaultPersonaWorkspaceRoot,
@@ -1648,13 +1649,80 @@ function prepareGatewayFrameForLocalGateway(frameText, gatewayAuth, options = {}
1648
1649
  }
1649
1650
  }
1650
1651
 
1651
- function parseJsonPayload(raw) {
1652
- try {
1653
- return JSON.parse(raw);
1654
- } catch {
1655
- return null;
1656
- }
1657
- }
1652
/**
 * Parse JSON text without throwing.
 *
 * @param {string} raw - Text expected to hold a JSON document.
 * @returns {*} The parsed value, or `null` when `JSON.parse` rejects `raw`.
 */
function parseJsonPayload(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Malformed input is an expected case here; callers test for null.
    parsed = null;
  }
  return parsed;
}
1659
+
1660
/**
 * Pull the plain text out of a gateway chat message.
 *
 * `message.content` may be a string or an array of content blocks. For the
 * array form only blocks shaped like `{ type: 'text', text: string }` are
 * used; their trimmed, non-empty texts are joined with single spaces.
 *
 * @param {*} message - Candidate message object.
 * @returns {string} The trimmed text, or `''` when none can be extracted.
 */
function extractTextFromGatewayMessage(message) {
  if (message === null || typeof message !== 'object') return '';

  const { content } = message;

  if (typeof content === 'string') {
    // A whitespace-only string yields '' just like a missing content field.
    return content.trim();
  }

  if (!Array.isArray(content)) return '';

  const pieces = [];
  for (const block of content) {
    if (!block || typeof block !== 'object') continue;
    if (block.type !== 'text' || typeof block.text !== 'string') continue;

    const piece = block.text.trim();
    if (piece) pieces.push(piece);
  }
  return pieces.join(' ');
}
1675
+
1676
/**
 * Make sure a final assistant chat frame carries `message.metadata.spoken`.
 *
 * Only frames shaped like
 * `{ type: 'event', event: 'chat', payload: { state: 'final', message: { role: 'assistant' } } }`
 * are touched. An existing `metadata.spoken` is passed through
 * `normalizeSpokenMetadata`; when the `spoken` key is absent the metadata is
 * inferred from the message text instead. A `spoken` key that is present but
 * does not normalize leaves the frame untouched.
 *
 * @param {string} frameText - Raw JSON text of the gateway frame.
 * @returns {{ frameText: string, changed: boolean, reason: string }}
 *   The (possibly re-serialized) frame text, whether it differs from the
 *   input, and `'normalized'` / `'synthesized'` (or `''` when untouched).
 */
function ensureVoiceAssistantSpokenMetadata(frameText) {
  const untouched = { frameText, changed: false, reason: '' };
  const isObjectLike = (value) => Boolean(value) && typeof value === 'object';

  const frame = parseJsonPayload(frameText);
  if (!isObjectLike(frame) || frame.type !== 'event' || frame.event !== 'chat') {
    return untouched;
  }

  const payload = isObjectLike(frame.payload) ? frame.payload : null;
  if (payload === null || payload.state !== 'final') return untouched;

  const message = isObjectLike(payload.message) ? payload.message : null;
  if (message === null || message.role !== 'assistant') return untouched;

  const hasPlainMetadata = isObjectLike(message.metadata) && !Array.isArray(message.metadata);
  const originalMetadata = hasPlainMetadata ? message.metadata : {};
  const explicitSpokenPresent = Object.prototype.hasOwnProperty.call(originalMetadata, 'spoken');

  let spoken = normalizeSpokenMetadata(originalMetadata.spoken);
  if (!spoken && !explicitSpokenPresent) {
    // Nothing was supplied upstream, so derive speech hints from the text.
    spoken = inferSpokenMetadataFromContent(extractTextFromGatewayMessage(message));
  }
  if (!spoken) return untouched;

  // Spread-then-override keeps every existing key in its original position,
  // so the serialized frame differs only where the metadata differs.
  const nextMessage = { ...message, metadata: { ...originalMetadata, spoken } };
  const nextPayload = { ...payload, message: nextMessage };
  const nextFrame = JSON.stringify({ ...frame, payload: nextPayload });

  return {
    frameText: nextFrame,
    changed: nextFrame !== frameText,
    reason: explicitSpokenPresent ? 'normalized' : 'synthesized',
  };
}
1658
1726
 
1659
1727
  function extractCorrelationId(params) {
1660
1728
  if (!params || typeof params !== 'object') return '';
@@ -2882,11 +2950,18 @@ async function startOpenclawBridge(flags) {
2882
2950
  flushSessionQueue(sessionBridge);
2883
2951
  });
2884
2952
 
2885
- gatewaySocket.on('message', runBridgeCallbackSafely((gatewayRaw) => {
2886
- const frame = typeof gatewayRaw === 'string' ? gatewayRaw : gatewayRaw.toString();
2887
- const gatewayPayload = parseJsonPayload(frame);
2888
- if (gatewayPayload?.event === 'connect.challenge') {
2889
- console.log(`[bridge] gateway.connect.challenge ${sessionId}`);
2953
+ gatewaySocket.on('message', runBridgeCallbackSafely((gatewayRaw) => {
2954
+ let frame = typeof gatewayRaw === 'string' ? gatewayRaw : gatewayRaw.toString();
2955
+ if (classifyBridgeSessionScope(sessionId) === 'voice') {
2956
+ const spokenNormalized = ensureVoiceAssistantSpokenMetadata(frame);
2957
+ if (spokenNormalized.changed) {
2958
+ frame = spokenNormalized.frameText;
2959
+ console.log(`[bridge] voice.spoken_metadata.${spokenNormalized.reason} ${sessionId}`);
2960
+ }
2961
+ }
2962
+ const gatewayPayload = parseJsonPayload(frame);
2963
+ if (gatewayPayload?.event === 'connect.challenge') {
2964
+ console.log(`[bridge] gateway.connect.challenge ${sessionId}`);
2890
2965
  const nonce =
2891
2966
  gatewayPayload.payload && typeof gatewayPayload.payload.nonce === 'string'
2892
2967
  ? gatewayPayload.payload.nonce.trim()
@@ -4119,11 +4194,12 @@ if (__isDirectExecution) {
4119
4194
  });
4120
4195
  }
4121
4196
 
4122
- export {
4123
- prepareGatewayFrameForLocalGateway,
4124
- classifyBridgeFailure,
4125
- classifyBridgeSessionScope,
4126
- createBridgeProcessFaultHandler,
4197
+ export {
4198
+ prepareGatewayFrameForLocalGateway,
4199
+ ensureVoiceAssistantSpokenMetadata,
4200
+ classifyBridgeFailure,
4201
+ classifyBridgeSessionScope,
4202
+ createBridgeProcessFaultHandler,
4127
4203
  computeReconnectDelayMs,
4128
4204
  resolveBridgeStatusForBrokerOpen,
4129
4205
  resolveBridgeStatusForRuntimeFault,
@@ -0,0 +1,273 @@
1
/**
 * Return `value` trimmed when it is a string with visible content.
 *
 * @param {*} value - Candidate value.
 * @param {string} [fallback=''] - Returned for non-strings and blank strings.
 * @returns {string} The trimmed string or `fallback`.
 */
function trimString(value, fallback = '') {
  if (typeof value !== 'string') return fallback;

  const trimmed = value.trim();
  return trimmed === '' ? fallback : trimmed;
}
4
+
5
/**
 * Floor a finite number and bound it.
 *
 * Values below `min` are rejected (the fallback is returned) while values
 * above `max` are capped at `max`.
 *
 * @param {*} value - Candidate number.
 * @param {*} fallback - Returned when `value` is not a finite number or
 *   floors to less than `min`.
 * @param {{ min?: number, max?: number }} [bounds] - Inclusive bounds;
 *   defaults to `min = 1`, `max = Number.MAX_SAFE_INTEGER`.
 * @returns {*} The bounded integer or `fallback`.
 */
function clampInteger(value, fallback, { min = 1, max = Number.MAX_SAFE_INTEGER } = {}) {
  const isFiniteNumber = typeof value === 'number' && Number.isFinite(value);
  if (!isFiniteNumber) return fallback;

  const floored = Math.floor(value);
  if (floored < min) return fallback;
  return floored > max ? max : floored;
}
12
+
13
// Allow-lists for spoken metadata. A value that is not a member of the
// relevant set is left out of the normalized output by the code that
// consults these sets via `.has(...)`.

// Accepted `language` labels.
const BOUNDED_LANGUAGE_TYPES = new Set(
  ['Auto', 'Chinese', 'English', 'German', 'Italian', 'Portuguese', 'Spanish', 'Japanese', 'Korean', 'French', 'Russian'],
);

// Accepted per-segment `pace` values.
const BOUNDED_PACE_VALUES = new Set([
  'very_slow',
  'slow',
  'medium',
  'medium_fast',
  'fast',
]);

// Accepted per-segment `pitch` values.
const BOUNDED_PITCH_VALUES = new Set([
  'low',
  'slightly_low',
  'neutral',
  'slightly_high',
  'high',
]);

// Accepted per-segment `energy` values.
const BOUNDED_ENERGY_VALUES = new Set([
  'soft',
  'calm',
  'warm',
  'bright',
  'intense',
]);

// Accepted per-segment `volume` values.
const BOUNDED_VOLUME_VALUES = new Set([
  'soft',
  'normal',
  'projected',
]);
31
+
32
/**
 * Pick the language label for synthesized spoken metadata.
 *
 * No detection is performed: every input maps to `'English'`. The parameter
 * is kept so a real detector can be added without changing the signature.
 *
 * @param {*} text - Text that would be inspected; currently unused.
 * @returns {string} Always `'English'`.
 */
function inferSpokenLanguage(text) { // eslint-disable-line no-unused-vars
  // The previous body trimmed `text` and branched on emptiness, but both
  // branches returned the same constant, so the work was dead code.
  return 'English';
}
37
+
38
/**
 * Validate one spoken segment and keep only recognised fields.
 *
 * @param {*} segment - Candidate segment object.
 * @returns {?object} `null` unless `segment` is a plain (non-array) object
 *   with non-blank `text`. Otherwise an object holding the trimmed `text`,
 *   whichever of `pace` / `pitch` / `energy` / `volume` are in their
 *   allow-lists, and `pause_after_ms` floored and bounded to 0..1200
 *   (0 when missing or invalid).
 */
function normalizeSpokenSegment(segment) {
  const isPlainObject = Boolean(segment) && typeof segment === 'object' && !Array.isArray(segment);
  if (!isPlainObject) return null;

  const text = trimString(segment.text);
  if (!text) return null;

  // Listed in output-key order; each field is copied only when allow-listed.
  const boundedFields = [
    ['pace', BOUNDED_PACE_VALUES],
    ['pitch', BOUNDED_PITCH_VALUES],
    ['energy', BOUNDED_ENERGY_VALUES],
    ['volume', BOUNDED_VOLUME_VALUES],
  ];

  const result = { text };
  for (const [field, allowed] of boundedFields) {
    const candidate = trimString(segment[field]);
    if (allowed.has(candidate)) result[field] = candidate;
  }
  result.pause_after_ms = clampInteger(segment.pause_after_ms, 0, { min: 0, max: 1200 });

  return result;
}
59
+
60
/**
 * Remove emoji from `text`.
 *
 * Strips variation selectors and the keycap combining mark, then every
 * pictographic / emoji-presentation code point together with a zero-width
 * joiner that glues it to a preceding emoji, so joined sequences (family,
 * profession, flag variants) leave no invisible characters behind.
 *
 * @param {string} text - Input text.
 * @returns {string} Text without emoji. Surrounding whitespace is untouched.
 */
function stripEmoji(text) {
  return text
    .replace(/[\uFE0E\uFE0F\u20E3]/g, '')
    .replace(/\u200D?(?:\p{Extended_Pictographic}|\p{Emoji_Presentation})/gu, '');
}

/**
 * Clean assistant text for speech synthesis.
 *
 * Removes emoji and simple Markdown emphasis/code markers, turns en/em
 * dashes into commas and `…` into `...`, and normalises whitespace and the
 * spacing around `, . ; ! ?`.
 *
 * A space is inserted after a punctuation mark only when the next character
 * starts a new word. Runs of punctuation (`...`, `?!`), closing quotes and
 * brackets, and separators inside numbers (`3.14`, `1,000`) are left intact.
 *
 * @param {string} text - Raw text.
 * @returns {string} Normalised, trimmed text (possibly empty).
 */
function normalizeSpeechText(text) {
  return stripEmoji(text)
    .replace(/\*\*(.*?)\*\*/g, '$1')
    .replace(/__(.*?)__/g, '$1')
    .replace(/`([^`]+)`/g, '$1')
    .replace(/[–—]/g, ', ')
    .replace(/…/g, '...')
    .replace(/\s+/g, ' ')
    .replace(/\s+([,.;!?])/g, '$1')
    // The lookahead excludes further punctuation and closing quotes/brackets
    // so "..." is not exploded into ". . ." (which sentence splitting would
    // turn into segments containing only a period).
    .replace(/([,.;!?])(?=[^\s,.;!?"'\u201D\u2019)\]}])/g, (match, mark, offset, whole) => {
      const isNumericSeparator =
        (mark === '.' || mark === ',') &&
        /\d/.test(whole[offset - 1] ?? '') &&
        /\d/.test(whole[offset + 1] ?? '');
      return isNumericSeparator ? mark : `${mark} `;
    })
    .replace(/,\s*,+/g, ', ')
    .replace(/\s+/g, ' ')
    .trim();
}
78
+
79
/**
 * Break text into at most five speakable segments.
 *
 * The text is normalised, split after sentence-ending punctuation, and any
 * sentence longer than 96 characters is further split at commas (each
 * non-final clause keeps a trailing comma unless it already ends a
 * sentence). When more than five pieces result, the fifth and later pieces
 * are merged into one.
 *
 * @param {string} text - Raw text.
 * @returns {string[]} Segment texts; empty when nothing speakable remains.
 */
function splitSpeechSegments(text) {
  const cleaned = normalizeSpeechText(text);
  if (!cleaned) return [];

  const tidy = (parts) => parts.map((part) => part.trim()).filter((part) => part.length > 0);

  const sentences = tidy(cleaned.split(/(?<=[.!?])\s+/));

  const pieces = sentences.flatMap((sentence) => {
    if (sentence.length <= 96) return [sentence];

    const clauses = tidy(sentence.split(/,\s+/));
    // No usable comma break: keep the long sentence whole.
    if (clauses.length <= 1) return [sentence];

    const lastPosition = clauses.length - 1;
    return clauses.map((clause, position) => {
      const restoreComma = position < lastPosition && !/[.!?]$/.test(clause);
      return restoreComma ? `${clause},` : clause;
    });
  });

  if (pieces.length <= 5) return pieces;

  const head = pieces.slice(0, 4);
  const mergedTail = pieces.slice(4).join(' ').trim();
  return [...head, mergedTail];
}
116
+
117
/**
 * Choose delivery hints for one segment from its wording.
 *
 * Checked in order: contains `?` (curious), contains `!` or an excited
 * keyword, reflective phrasing or longer than 60 characters, otherwise the
 * default. Except for questions, which never pause, a pause is added only
 * when another segment follows.
 *
 * @param {string} segmentText - Segment text.
 * @param {number} index - Zero-based position of the segment.
 * @param {number} totalSegments - Number of segments in the utterance.
 * @returns {{ pace: string, pitch: string, energy: string, volume: string, pause_after_ms: number }}
 */
function inferSegmentStyle(segmentText, index, totalSegments) {
  const lowered = segmentText.toLowerCase();
  const isFollowed = index < totalSegments - 1;
  const pauseWhenFollowed = (milliseconds) => (isFollowed ? milliseconds : 0);

  // Single constructor keeps the key order identical for every style.
  const buildStyle = (pace, pitch, energy, pauseAfterMs) => ({
    pace,
    pitch,
    energy,
    volume: 'normal',
    pause_after_ms: pauseAfterMs,
  });

  if (/\?/.test(segmentText)) {
    return buildStyle('medium', 'slightly_high', 'warm', 0);
  }

  const soundsExcited =
    /!/.test(segmentText) || /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(lowered);
  if (soundsExcited) {
    return buildStyle('medium_fast', 'slightly_high', 'bright', pauseWhenFollowed(220));
  }

  const soundsReflective =
    /\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(lowered) ||
    segmentText.length > 60;

  // Reflective and default delivery differ only in the pause length.
  return buildStyle('medium', 'neutral', 'warm', pauseWhenFollowed(soundsReflective ? 260 : 180));
}
163
+
164
/**
 * Build `{ language, segments }` for a piece of text.
 *
 * @param {string} text - Text to be spoken.
 * @returns {?{ language: string, segments: object[] }} `null` when the text
 *   yields no segments; otherwise the inferred language plus one styled
 *   entry (`text` followed by the style hints) per segment.
 */
function synthesizeSpokenSegments(text) {
  const pieces = splitSpeechSegments(text);
  const total = pieces.length;
  if (total === 0) return null;

  const segments = [];
  pieces.forEach((piece, position) => {
    const style = inferSegmentStyle(piece, position, total);
    segments.push({ text: piece, ...style });
  });

  return { language: inferSpokenLanguage(text), segments };
}
179
+
180
/**
 * Validate caller-supplied spoken metadata and fill in what is missing.
 *
 * @param {*} spoken - Candidate `spoken` object.
 * @returns {?object} `null` unless `spoken` is a plain (non-array) object
 *   with non-blank `text`. Otherwise an object with the trimmed `text`, an
 *   allow-listed `language`, the valid explicit `segments`, trimmed
 *   `instructions`, and `style` (passed through by reference when it is a
 *   plain object). A missing `language` or `segments` is filled from
 *   metadata synthesized out of `text`.
 */
function normalizeSpokenMetadata(spoken) {
  const isPlainObject = (value) => Boolean(value) && typeof value === 'object' && !Array.isArray(value);
  if (!isPlainObject(spoken)) return null;

  const text = trimString(spoken.text);
  if (!text) return null;

  // Keys are assigned in a fixed sequence so serialized output is stable.
  const result = { text };

  const declaredLanguage = trimString(spoken.language);
  if (BOUNDED_LANGUAGE_TYPES.has(declaredLanguage)) {
    result.language = declaredLanguage;
  }

  if (Array.isArray(spoken.segments)) {
    const usableSegments = [];
    for (const candidate of spoken.segments) {
      const segment = normalizeSpokenSegment(candidate);
      if (segment) usableSegments.push(segment);
    }
    if (usableSegments.length > 0) {
      result.segments = usableSegments;
    }
  }

  const instructions = trimString(spoken.instructions);
  if (instructions) {
    result.instructions = instructions;
  }

  if (isPlainObject(spoken.style)) {
    result.style = spoken.style;
  }

  const synthesized = synthesizeSpokenSegments(text);
  if (!result.language && synthesized?.language) {
    result.language = synthesized.language;
  }
  if (!result.segments && synthesized?.segments?.length) {
    result.segments = synthesized.segments;
  }

  return result;
}
216
+
217
/**
 * Derive spoken metadata from plain message text.
 *
 * The text is normalised for speech, segmented, and matched against tone
 * heuristics in priority order: upbeat, gentle, curious, then neutral.
 *
 * @param {*} content - Message text; non-strings are treated as empty.
 * @returns {?{ text: string, language: string, segments: (object[]|undefined), instructions: string, style: { emotion: string, energy: string } }}
 *   `null` when no speakable text remains.
 */
function inferSpokenMetadataFromContent(content) {
  const text = normalizeSpeechText(trimString(content));
  if (!text) return null;

  const synthesized = synthesizeSpokenSegments(text);
  const lowered = text.toLowerCase();

  // Ordered by priority; the first matching tone wins.
  const tones = [
    {
      matches: () =>
        /!/.test(text) ||
        /\b(hell yeah|awesome|amazing|great|stoked|love|glad|perfect|nice|cool)\b/.test(lowered),
      instructions: 'Speak with warm, upbeat conversational energy and natural pacing.',
      style: { emotion: 'upbeat', energy: 'medium' },
    },
    {
      matches: () => /\b(sorry|gentle|softly|careful|reassuring|calm|okay|it'?s okay|i know)\b/.test(lowered),
      instructions: 'Speak gently and reassuringly, with a calm pace and soft emphasis.',
      style: { emotion: 'gentle', energy: 'low' },
    },
    {
      matches: () => /\?/.test(text),
      instructions: 'Speak naturally with curious, engaged intonation and a conversational pace.',
      style: { emotion: 'curious', energy: 'medium' },
    },
  ];
  const neutralTone = {
    instructions: 'Speak naturally with light warmth and conversational pacing.',
    style: { emotion: 'neutral', energy: 'medium' },
  };

  const tone = tones.find((candidate) => candidate.matches()) ?? neutralTone;

  return {
    text,
    language: synthesized?.language || 'English',
    segments: synthesized?.segments,
    instructions: tone.instructions,
    style: tone.style,
  };
}
268
+
269
+ export {
270
+ inferSpokenMetadataFromContent,
271
+ normalizeSpokenMetadata,
272
+ normalizeSpeechText,
273
+ };
@@ -2,7 +2,7 @@
2
2
  "id": "oomi-ai",
3
3
  "name": "Oomi Channel Plugin",
4
4
  "description": "Managed Oomi channel integration for OpenClaw.",
5
- "version": "0.2.19",
5
+ "version": "0.2.20",
6
6
  "author": "Oomi",
7
7
  "license": "MIT",
8
8
  "openclawVersion": ">=0.5.0",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "oomi-ai",
3
- "version": "0.2.19",
3
+ "version": "0.2.20",
4
4
  "description": "Oomi OpenClaw channel plugin and bridge tooling",
5
5
  "bin": {
6
6
  "oomi": "bin/oomi-ai.js"