oomi-ai 0.2.19 → 0.2.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/oomi-ai.js +93 -17
- package/lib/spokenMetadata.js +273 -0
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
package/bin/oomi-ai.js
CHANGED
|
@@ -12,6 +12,7 @@ import { scaffoldPersonaApp } from '../lib/scaffold.js';
|
|
|
12
12
|
import { createPersonaApiClient } from '../lib/personaApiClient.js';
|
|
13
13
|
import { startPersonaJobPoller } from '../lib/personaJobPoller.js';
|
|
14
14
|
import { executePersonaJob } from '../lib/personaJobExecutor.js';
|
|
15
|
+
import { inferSpokenMetadataFromContent, normalizeSpokenMetadata } from '../lib/spokenMetadata.js';
|
|
15
16
|
import {
|
|
16
17
|
buildLocalPersonaRuntime,
|
|
17
18
|
defaultPersonaWorkspaceRoot,
|
|
@@ -1648,13 +1649,80 @@ function prepareGatewayFrameForLocalGateway(frameText, gatewayAuth, options = {}
|
|
|
1648
1649
|
}
|
|
1649
1650
|
}
|
|
1650
1651
|
|
|
1651
|
-
/**
 * Parses a JSON string without throwing.
 *
 * @param {string} raw - Text handed to `JSON.parse`.
 * @returns {*} The parsed value, or `null` when `JSON.parse` throws.
 */
function parseJsonPayload(raw) {
  let parsed = null;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Parse failures are deliberately reported as null rather than thrown.
    parsed = null;
  }
  return parsed;
}
|
|
1652
|
+
/**
 * Parses a JSON string without throwing.
 *
 * @param {string} raw - Text handed to `JSON.parse`.
 * @returns {*} The parsed value, or `null` when `JSON.parse` throws.
 */
function parseJsonPayload(raw) {
  let parsed = null;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Parse failures are deliberately reported as null rather than thrown.
    parsed = null;
  }
  return parsed;
}
|
|
1659
|
+
|
|
1660
|
+
/**
 * Pulls the plain text out of a gateway chat message.
 *
 * `message.content` may be a string or an array of content blocks. For a
 * string, the trimmed value is returned when it is non-empty. For an array,
 * the trimmed `text` of every `{ type: 'text', text: string }` block is
 * collected, empty pieces are dropped, and the rest are joined with a space.
 *
 * @param {*} message - Candidate message object.
 * @returns {string} Extracted text, or `''` when nothing usable is found.
 */
function extractTextFromGatewayMessage(message) {
  const isObject = Boolean(message) && typeof message === 'object';
  if (!isObject) return '';

  const { content } = message;
  if (typeof content === 'string') {
    const trimmed = content.trim();
    if (trimmed) return trimmed;
  }

  if (!Array.isArray(content)) return '';

  const pieces = [];
  for (const block of content) {
    if (!block || typeof block !== 'object') continue;
    if (block.type !== 'text' || typeof block.text !== 'string') continue;
    const piece = block.text.trim();
    if (piece) pieces.push(piece);
  }
  return pieces.join(' ');
}
|
|
1675
|
+
|
|
1676
|
+
/**
 * Ensures a final assistant chat frame carries `message.metadata.spoken`.
 *
 * Only frames shaped like
 * `{ type: 'event', event: 'chat', payload: { state: 'final', message: { role: 'assistant' } } }`
 * are considered; anything else is returned untouched. When the message
 * already has its own `spoken` key, that value is normalized; if it cannot be
 * normalized the frame is left as-is (nothing is synthesized over an explicit
 * value). When no `spoken` key exists, metadata is inferred from the message text.
 *
 * @param {string} frameText - Raw JSON text of the gateway frame.
 * @returns {{ frameText: string, changed: boolean, reason: string }}
 *   The (possibly re-serialized) frame text, whether it differs from the
 *   input, and `'normalized'`, `'synthesized'`, or `''` when untouched.
 */
function ensureVoiceAssistantSpokenMetadata(frameText) {
  const untouched = () => ({ frameText, changed: false, reason: '' });
  const asObject = (value) => (value && typeof value === 'object' ? value : null);

  const frame = asObject(parseJsonPayload(frameText));
  if (!frame) return untouched();
  if (frame.type !== 'event' || frame.event !== 'chat') return untouched();

  const payload = asObject(frame.payload);
  if (!payload || payload.state !== 'final') return untouched();

  const message = asObject(payload.message);
  if (!message || message.role !== 'assistant') return untouched();

  const hasUsableMetadata =
    message.metadata && typeof message.metadata === 'object' && !Array.isArray(message.metadata);
  const priorMetadata = hasUsableMetadata ? message.metadata : {};
  const explicitSpokenPresent = Object.prototype.hasOwnProperty.call(priorMetadata, 'spoken');

  let spoken = normalizeSpokenMetadata(priorMetadata.spoken);
  if (!spoken && !explicitSpokenPresent) {
    spoken = inferSpokenMetadataFromContent(extractTextFromGatewayMessage(message));
  }
  if (!spoken) return untouched();

  // Spread order keeps every pre-existing key in its original position so the
  // re-serialized frame differs from the input only where `spoken` changed.
  const serialized = JSON.stringify({
    ...frame,
    payload: {
      ...payload,
      message: {
        ...message,
        metadata: { ...priorMetadata, spoken },
      },
    },
  });

  return {
    frameText: serialized,
    changed: serialized !== frameText,
    reason: explicitSpokenPresent ? 'normalized' : 'synthesized',
  };
}
|
|
1658
1726
|
|
|
1659
1727
|
function extractCorrelationId(params) {
|
|
1660
1728
|
if (!params || typeof params !== 'object') return '';
|
|
@@ -2882,11 +2950,18 @@ async function startOpenclawBridge(flags) {
|
|
|
2882
2950
|
flushSessionQueue(sessionBridge);
|
|
2883
2951
|
});
|
|
2884
2952
|
|
|
2885
|
-
gatewaySocket.on('message', runBridgeCallbackSafely((gatewayRaw) => {
|
|
2886
|
-
|
|
2887
|
-
|
|
2888
|
-
|
|
2889
|
-
|
|
2953
|
+
gatewaySocket.on('message', runBridgeCallbackSafely((gatewayRaw) => {
|
|
2954
|
+
let frame = typeof gatewayRaw === 'string' ? gatewayRaw : gatewayRaw.toString();
|
|
2955
|
+
if (classifyBridgeSessionScope(sessionId) === 'voice') {
|
|
2956
|
+
const spokenNormalized = ensureVoiceAssistantSpokenMetadata(frame);
|
|
2957
|
+
if (spokenNormalized.changed) {
|
|
2958
|
+
frame = spokenNormalized.frameText;
|
|
2959
|
+
console.log(`[bridge] voice.spoken_metadata.${spokenNormalized.reason} ${sessionId}`);
|
|
2960
|
+
}
|
|
2961
|
+
}
|
|
2962
|
+
const gatewayPayload = parseJsonPayload(frame);
|
|
2963
|
+
if (gatewayPayload?.event === 'connect.challenge') {
|
|
2964
|
+
console.log(`[bridge] gateway.connect.challenge ${sessionId}`);
|
|
2890
2965
|
const nonce =
|
|
2891
2966
|
gatewayPayload.payload && typeof gatewayPayload.payload.nonce === 'string'
|
|
2892
2967
|
? gatewayPayload.payload.nonce.trim()
|
|
@@ -4119,11 +4194,12 @@ if (__isDirectExecution) {
|
|
|
4119
4194
|
});
|
|
4120
4195
|
}
|
|
4121
4196
|
|
|
4122
|
-
export {
|
|
4123
|
-
prepareGatewayFrameForLocalGateway,
|
|
4124
|
-
|
|
4125
|
-
|
|
4126
|
-
|
|
4197
|
+
export {
|
|
4198
|
+
prepareGatewayFrameForLocalGateway,
|
|
4199
|
+
ensureVoiceAssistantSpokenMetadata,
|
|
4200
|
+
classifyBridgeFailure,
|
|
4201
|
+
classifyBridgeSessionScope,
|
|
4202
|
+
createBridgeProcessFaultHandler,
|
|
4127
4203
|
computeReconnectDelayMs,
|
|
4128
4204
|
resolveBridgeStatusForBrokerOpen,
|
|
4129
4205
|
resolveBridgeStatusForRuntimeFault,
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
/**
 * Returns the trimmed form of a string, or a fallback when the value is not
 * a string or is empty after trimming.
 *
 * @param {*} value - Candidate value.
 * @param {string} [fallback=''] - Returned when `value` is unusable.
 * @returns {string} Trimmed string or `fallback`.
 */
function trimString(value, fallback = '') {
  if (typeof value !== 'string') return fallback;
  const trimmed = value.trim();
  return trimmed === '' ? fallback : trimmed;
}
|
|
4
|
+
|
|
5
|
+
/**
 * Floors a finite number and bounds it from above.
 *
 * Non-numbers, non-finite numbers, and values whose floor is below `min`
 * yield `fallback` (they are not raised to `min`). Values above `max` are
 * capped at `max`.
 *
 * @param {*} value - Candidate number.
 * @param {*} fallback - Returned for unusable or too-small values.
 * @param {{ min?: number, max?: number }} [bounds] - Inclusive bounds;
 *   defaults are `min = 1`, `max = Number.MAX_SAFE_INTEGER`.
 * @returns {*} The floored, capped integer, or `fallback`.
 */
function clampInteger(value, fallback, { min = 1, max = Number.MAX_SAFE_INTEGER } = {}) {
  const usable = typeof value === 'number' && Number.isFinite(value);
  if (!usable) return fallback;

  const floored = Math.floor(value);
  if (floored < min) return fallback;
  return floored > max ? max : floored;
}
|
|
12
|
+
|
|
13
|
+
// Language labels accepted on `spoken.language`; any other value is ignored
// by the metadata normalizer.
const BOUNDED_LANGUAGE_TYPES = new Set([
  'Auto', 'Chinese', 'English', 'German', 'Italian', 'Portuguese',
  'Spanish', 'Japanese', 'Korean', 'French', 'Russian',
]);

// Allowed values for the per-segment delivery hints. A segment field whose
// value is not in the matching set is left off the normalized segment.
const BOUNDED_PACE_VALUES = new Set([
  'very_slow',
  'slow',
  'medium',
  'medium_fast',
  'fast',
]);
const BOUNDED_PITCH_VALUES = new Set([
  'low',
  'slightly_low',
  'neutral',
  'slightly_high',
  'high',
]);
const BOUNDED_ENERGY_VALUES = new Set([
  'soft',
  'calm',
  'warm',
  'bright',
  'intense',
]);
const BOUNDED_VOLUME_VALUES = new Set([
  'soft',
  'normal',
  'projected',
]);
|
|
31
|
+
|
|
32
|
+
/**
 * Chooses the language label attached to synthesized spoken metadata.
 *
 * This is a placeholder: no detection is performed and the result is always
 * `'English'`, whatever the input. The previous body trimmed the text and
 * branched on emptiness, but both branches returned the same constant, so the
 * dead work has been removed without changing the result.
 *
 * @param {*} text - Text to be spoken (currently not inspected).
 * @returns {string} Always `'English'`.
 */
function inferSpokenLanguage(text) {
  // TODO: inspect `text` once real language detection is wanted; the
  // parameter is kept so existing call sites stay valid.
  return 'English';
}
|
|
37
|
+
|
|
38
|
+
/**
 * Validates one caller-supplied spoken segment.
 *
 * The segment must be a non-array object with non-empty `text` after
 * trimming. `pace`, `pitch`, `energy`, and `volume` are copied only when the
 * trimmed value belongs to the matching `BOUNDED_*` set. `pause_after_ms` is
 * always present on the result, passed through `clampInteger` with fallback
 * 0 and bounds 0..1200.
 *
 * @param {*} segment - Candidate segment.
 * @returns {object|null} The normalized segment, or `null` when unusable.
 */
function normalizeSpokenSegment(segment) {
  const isPlainObject = Boolean(segment) && typeof segment === 'object' && !Array.isArray(segment);
  if (!isPlainObject) return null;

  const text = trimString(segment.text);
  if (!text) return null;

  // Listed in the order the keys should appear on the result.
  const boundedFields = [
    ['pace', BOUNDED_PACE_VALUES],
    ['pitch', BOUNDED_PITCH_VALUES],
    ['energy', BOUNDED_ENERGY_VALUES],
    ['volume', BOUNDED_VOLUME_VALUES],
  ];

  const result = { text };
  for (const [field, allowed] of boundedFields) {
    const candidate = trimString(segment[field]);
    if (allowed.has(candidate)) result[field] = candidate;
  }
  result.pause_after_ms = clampInteger(segment.pause_after_ms, 0, { min: 0, max: 1200 });

  return result;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Removes emoji from text.
 *
 * An emoji is removed together with any joiners, variation selectors, and
 * further emoji directly attached to it, so a ZWJ sequence (for example a
 * family emoji) disappears as a unit instead of leaving invisible U+200D
 * characters behind. Stray variation selectors and the keycap combining mark
 * (U+20E3) are removed afterwards, which leaves the base character of a
 * keycap sequence such as `1` in place.
 *
 * @param {string} text - Input text.
 * @returns {string} Text without emoji.
 */
function stripEmoji(text) {
  return text
    .replace(
      /(?:\p{Extended_Pictographic}|\p{Emoji_Presentation})(?:[\u200D\uFE0E\uFE0F]|\p{Extended_Pictographic}|\p{Emoji_Presentation})*/gu,
      '',
    )
    .replace(/[\uFE0E\uFE0F\u20E3]/g, '');
}

/**
 * Cleans text for speech: strips emoji and simple Markdown emphasis/code
 * markers, turns en/em dashes into commas and `…` into `...`, and tidies
 * whitespace around punctuation.
 *
 * A missing space after `, . ; ! ?` is inserted only when the next character
 * is a letter or digit, and never when the mark sits between two digits.
 * This keeps `...`, `3.14`, `1,000`, and `!"` intact, where inserting a space
 * before any non-space character would break them apart.
 *
 * @param {string} text - Input text.
 * @returns {string} Normalized, trimmed text.
 */
function normalizeSpeechText(text) {
  return stripEmoji(text)
    .replace(/\*\*(.*?)\*\*/g, '$1')
    .replace(/__(.*?)__/g, '$1')
    .replace(/`([^`]+)`/g, '$1')
    .replace(/[–—]/g, ', ')
    .replace(/…/g, '...')
    .replace(/\s+/g, ' ')
    .replace(/\s+([,.;!?])/g, '$1')
    .replace(/([,.;!?])(?=[\p{L}\p{N}])/gu, (match, mark, offset, whole) => {
      // Decimal points and thousands separators must stay glued to their digits.
      const joinsDigits = /\d/.test(whole.charAt(offset - 1)) && /\d/.test(whole.charAt(offset + 1));
      return joinsDigits ? mark : `${mark} `;
    })
    .replace(/,\s*,+/g, ', ')
    .replace(/\s+/g, ' ')
    .trim();
}
|
|
78
|
+
|
|
79
|
+
/**
 * Breaks text into at most five speakable segments.
 *
 * The text is normalized with `normalizeSpeechText`, then split after
 * sentence-ending punctuation (`.`, `!`, `?`). A sentence longer than 96
 * characters that contains `, ` separators is further split at those commas,
 * and each non-final clause gets its comma back unless it already ends in
 * sentence punctuation. If more than five segments result, the fifth and
 * everything after it are joined with spaces into one final segment.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Segment strings; empty when the normalized text is empty.
 */
function splitSpeechSegments(text) {
  const cleaned = normalizeSpeechText(text);
  if (!cleaned) return [];

  const sentences = cleaned
    .split(/(?<=[.!?])\s+/)
    .map((sentence) => sentence.trim())
    .filter(Boolean);

  const pieces = sentences.flatMap((sentence) => {
    if (sentence.length <= 96) return [sentence];

    const clauses = sentence
      .split(/,\s+/)
      .map((clause) => clause.trim())
      .filter(Boolean);
    if (clauses.length <= 1) return [sentence];

    const lastPosition = clauses.length - 1;
    return clauses.map((clause, position) => {
      const keepsAsIs = position === lastPosition || /[.!?]$/.test(clause);
      return keepsAsIs ? clause : `${clause},`;
    });
  });

  if (pieces.length <= 5) return pieces;

  const head = pieces.slice(0, 4);
  const tail = pieces.slice(4).join(' ').trim();
  return [...head, tail];
}
|
|
116
|
+
|
|
117
|
+
/**
 * Chooses delivery hints for one segment from its wording.
 *
 * Precedence is: question (contains `?`), then exclamatory (contains `!` or
 * an enthusiastic keyword), then reflective (first-person/reflective phrase
 * or longer than 60 characters), then a neutral default. Every style except
 * the question style adds a pause after the segment unless it is the last
 * one; the question style always uses a pause of 0.
 *
 * @param {string} segmentText - Segment wording.
 * @param {number} index - Zero-based position of the segment.
 * @param {number} totalSegments - Number of segments in the reply.
 * @returns {{ pace: string, pitch: string, energy: string, volume: string, pause_after_ms: number }}
 */
function inferSegmentStyle(segmentText, index, totalSegments) {
  const lowered = segmentText.toLowerCase();
  const isLast = !(index < totalSegments - 1);
  const pauseUnlessLast = (ms) => (isLast ? 0 : ms);

  if (/\?/.test(segmentText)) {
    return {
      pace: 'medium',
      pitch: 'slightly_high',
      energy: 'warm',
      volume: 'normal',
      pause_after_ms: 0,
    };
  }

  const soundsExcited =
    /!/.test(segmentText) || /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(lowered);
  if (soundsExcited) {
    return {
      pace: 'medium_fast',
      pitch: 'slightly_high',
      energy: 'bright',
      volume: 'normal',
      pause_after_ms: pauseUnlessLast(220),
    };
  }

  const soundsReflective =
    /\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(lowered) ||
    segmentText.length > 60;

  // Reflective and default segments share a voice; only the trailing pause differs.
  return {
    pace: 'medium',
    pitch: 'neutral',
    energy: 'warm',
    volume: 'normal',
    pause_after_ms: pauseUnlessLast(soundsReflective ? 260 : 180),
  };
}
|
|
163
|
+
|
|
164
|
+
/**
 * Builds a language label and styled segments for a piece of text.
 *
 * The text is split with `splitSpeechSegments`, and each piece is paired with
 * the hints from `inferSegmentStyle`.
 *
 * @param {string} text - Text to be spoken.
 * @returns {{ language: string, segments: object[] }|null}
 *   `null` when splitting produces no segments.
 */
function synthesizeSpokenSegments(text) {
  const language = inferSpokenLanguage(text);
  const pieces = splitSpeechSegments(text);
  const total = pieces.length;
  if (total === 0) return null;

  const segments = [];
  pieces.forEach((piece, position) => {
    segments.push({ text: piece, ...inferSegmentStyle(piece, position, total) });
  });

  return { language, segments };
}
|
|
179
|
+
|
|
180
|
+
/**
 * Validates caller-supplied `spoken` metadata and fills in what is missing.
 *
 * Requires a non-array object with non-empty trimmed `text`. `language` is
 * kept only if it is in `BOUNDED_LANGUAGE_TYPES`. `segments` keeps the
 * entries that survive `normalizeSpokenSegment`. `instructions` is kept when
 * non-empty, and `style` when it is a non-array object (stored by reference,
 * not copied). A missing language or segment list is then filled from
 * `synthesizeSpokenSegments(text)`.
 *
 * @param {*} spoken - Candidate metadata.
 * @returns {object|null} Normalized metadata, or `null` when unusable.
 */
function normalizeSpokenMetadata(spoken) {
  if (!spoken || typeof spoken !== 'object' || Array.isArray(spoken)) return null;

  const text = trimString(spoken.text);
  if (!text) return null;

  const result = { text };

  const declaredLanguage = trimString(spoken.language);
  if (BOUNDED_LANGUAGE_TYPES.has(declaredLanguage)) result.language = declaredLanguage;

  if (Array.isArray(spoken.segments)) {
    const usable = [];
    for (const candidate of spoken.segments) {
      const cleaned = normalizeSpokenSegment(candidate);
      if (cleaned) usable.push(cleaned);
    }
    if (usable.length > 0) result.segments = usable;
  }

  const instructions = trimString(spoken.instructions);
  if (instructions) result.instructions = instructions;

  const { style } = spoken;
  if (style && typeof style === 'object' && !Array.isArray(style)) result.style = style;

  // Fill gaps left by the explicit values with synthesized ones.
  const synthesized = synthesizeSpokenSegments(text);
  if (!result.language && synthesized?.language) result.language = synthesized.language;
  if (!result.segments && synthesized?.segments?.length) result.segments = synthesized.segments;

  return result;
}
|
|
216
|
+
|
|
217
|
+
/**
 * Synthesizes `spoken` metadata from plain message content.
 *
 * The content is trimmed and normalized for speech, then split into styled
 * segments. One overall delivery profile is chosen with the precedence
 * upbeat (contains `!` or an enthusiastic keyword) > gentle (reassuring
 * keyword) > curious (contains `?`) > neutral.
 *
 * @param {*} content - Message text; non-strings are treated as empty.
 * @returns {{ text: string, language: string, segments: (object[]|undefined), instructions: string, style: { emotion: string, energy: string } }|null}
 *   `null` when there is no speakable text.
 */
function inferSpokenMetadataFromContent(content) {
  const text = normalizeSpeechText(trimString(content));
  if (!text) return null;

  const synthesized = synthesizeSpokenSegments(text);
  const lowered = text.toLowerCase();

  let delivery;
  if (
    /!/.test(text) ||
    /\b(hell yeah|awesome|amazing|great|stoked|love|glad|perfect|nice|cool)\b/.test(lowered)
  ) {
    delivery = {
      instructions: 'Speak with warm, upbeat conversational energy and natural pacing.',
      style: { emotion: 'upbeat', energy: 'medium' },
    };
  } else if (/\b(sorry|gentle|softly|careful|reassuring|calm|okay|it'?s okay|i know)\b/.test(lowered)) {
    delivery = {
      instructions: 'Speak gently and reassuringly, with a calm pace and soft emphasis.',
      style: { emotion: 'gentle', energy: 'low' },
    };
  } else if (/\?/.test(text)) {
    delivery = {
      instructions: 'Speak naturally with curious, engaged intonation and a conversational pace.',
      style: { emotion: 'curious', energy: 'medium' },
    };
  } else {
    delivery = {
      instructions: 'Speak naturally with light warmth and conversational pacing.',
      style: { emotion: 'neutral', energy: 'medium' },
    };
  }

  // Key order is text, language, segments, instructions, style for every profile.
  return {
    text,
    language: synthesized?.language || 'English',
    segments: synthesized?.segments,
    instructions: delivery.instructions,
    style: delivery.style,
  };
}
|
|
268
|
+
|
|
269
|
+
export {
|
|
270
|
+
inferSpokenMetadataFromContent,
|
|
271
|
+
normalizeSpokenMetadata,
|
|
272
|
+
normalizeSpeechText,
|
|
273
|
+
};
|
package/openclaw.plugin.json
CHANGED