oomi-ai 0.2.19 → 0.2.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -4
- package/agent_instructions.md +14 -14
- package/bin/oomi-ai.js +98 -17
- package/lib/spokenMetadata.js +273 -0
- package/openclaw.extension.js +3 -257
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/skills/oomi/SKILL.md +10 -10
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@ OpenClaw channel plugin and bridge tooling for Oomi managed chat and voice.
|
|
|
4
4
|
|
|
5
5
|
## Current Focus
|
|
6
6
|
|
|
7
|
-
`0.2.
|
|
7
|
+
`0.2.21` adds the first live persona automation lane:
|
|
8
8
|
- WebSpatial-based persona scaffolding for generated Oomi apps
|
|
9
9
|
- a high-level `oomi personas create-managed` command for agent-driven persona creation
|
|
10
10
|
- device-authenticated persona runtime registration and job callbacks
|
|
@@ -141,8 +141,12 @@ That bridge:
|
|
|
141
141
|
|
|
142
142
|
This is the part of the package most likely to matter when debugging voice turn failures.
|
|
143
143
|
|
|
144
|
-
For managed voice replies, the
|
|
145
|
-
|
|
144
|
+
For managed cloned-voice replies, the canonical contract is:
|
|
145
|
+
- visible assistant `content` stays user-facing
|
|
146
|
+
- hidden `metadata.spoken` carries the backend TTS payload
|
|
147
|
+
- the shared helper in `lib/spokenMetadata.js` is used by both the extension and the local bridge to preserve or normalize that sidecar before it reaches the backend
|
|
148
|
+
|
|
149
|
+
The backend cloned-voice path is intentionally strict. If `metadata.spoken` does not reach Oomi, backend TTS fails instead of speaking a flat fallback voice.
|
|
146
150
|
|
|
147
151
|
## Persona Scaffolding
|
|
148
152
|
|
|
@@ -242,7 +246,7 @@ If you are inspecting this package on npm, the main architectural points are:
|
|
|
242
246
|
- `idempotencyKey` handling
|
|
243
247
|
- bridge status that does not report `connected` before managed subscription is ready
|
|
244
248
|
- runtime fault isolation so local session failures are less likely to crash the whole provider
|
|
245
|
-
- hidden managed-voice speech metadata
|
|
249
|
+
- one shared hidden managed-voice speech metadata helper used by both the extension and the local bridge
|
|
246
250
|
|
|
247
251
|
If you are developing the plugin, test the packaged surface with:
|
|
248
252
|
|
package/agent_instructions.md
CHANGED
|
@@ -160,20 +160,20 @@ When the runtime supports it, voice turns may include a hidden speech sidecar on
|
|
|
160
160
|
}
|
|
161
161
|
```
|
|
162
162
|
|
|
163
|
-
Rules:
|
|
164
|
-
- visible `content` remains the source of truth for Oomi chat rendering
|
|
165
|
-
- for managed voice replies, include `metadata.spoken`
|
|
166
|
-
- `metadata.spoken.text` is for backend TTS only
|
|
167
|
-
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
168
|
-
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, volume, and pause timing
|
|
169
|
-
- `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
|
|
170
|
-
- `metadata.spoken.style` is optional metadata for debugging/future mapping
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
- if you
|
|
175
|
-
-
|
|
176
|
-
-
|
|
163
|
+
Rules:
|
|
164
|
+
- visible `content` remains the source of truth for Oomi chat rendering
|
|
165
|
+
- for managed cloned-voice replies, include `metadata.spoken` whenever backend TTS should speak the turn
|
|
166
|
+
- `metadata.spoken.text` is for backend TTS only
|
|
167
|
+
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
168
|
+
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, volume, and pause timing
|
|
169
|
+
- `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
|
|
170
|
+
- `metadata.spoken.style` is optional metadata for debugging/future mapping
|
|
171
|
+
|
|
172
|
+
Current package behavior:
|
|
173
|
+
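- if you provide `metadata.spoken`, the package keeps it but normalizes it: `text`, `instructions`, and `style` are kept, unsupported `language` or prosody values are dropped, and a missing `language` or `segments` is filled in from `text`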
|
|
174
|
+
- if you omit `metadata.spoken`, the shared package helper may synthesize it as a compatibility guardrail before backend TTS
|
|
175
|
+
- visible chat text is never rewritten by the package
|
|
176
|
+
- backend cloned voice is strict: if `metadata.spoken` does not reach Oomi, playback fails instead of falling back to flat speech
|
|
177
177
|
|
|
178
178
|
## Avatar Commands
|
|
179
179
|
|
package/bin/oomi-ai.js
CHANGED
|
@@ -12,6 +12,7 @@ import { scaffoldPersonaApp } from '../lib/scaffold.js';
|
|
|
12
12
|
import { createPersonaApiClient } from '../lib/personaApiClient.js';
|
|
13
13
|
import { startPersonaJobPoller } from '../lib/personaJobPoller.js';
|
|
14
14
|
import { executePersonaJob } from '../lib/personaJobExecutor.js';
|
|
15
|
+
import { inferSpokenMetadataFromContent, normalizeSpokenMetadata } from '../lib/spokenMetadata.js';
|
|
15
16
|
import {
|
|
16
17
|
buildLocalPersonaRuntime,
|
|
17
18
|
defaultPersonaWorkspaceRoot,
|
|
@@ -1648,13 +1649,85 @@ function prepareGatewayFrameForLocalGateway(frameText, gatewayAuth, options = {}
|
|
|
1648
1649
|
}
|
|
1649
1650
|
}
|
|
1650
1651
|
|
|
1651
|
-
function parseJsonPayload(raw) {
|
|
1652
|
-
try {
|
|
1653
|
-
return JSON.parse(raw);
|
|
1654
|
-
} catch {
|
|
1655
|
-
return null;
|
|
1656
|
-
}
|
|
1657
|
-
}
|
|
1652
|
+
/**
 * Parse a raw JSON string without ever throwing.
 *
 * @param {string} raw - Text expected to contain a JSON document.
 * @returns {*} The parsed value, or `null` when `raw` is not valid JSON.
 */
function parseJsonPayload(raw) {
  let parsed = null;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Malformed input is reported to the caller as null instead of an exception.
    parsed = null;
  }
  return parsed;
}
|
|
1659
|
+
|
|
1660
|
+
/**
 * Pull the visible text out of a gateway chat message.
 *
 * `message.content` may be a plain string or an array of content blocks.
 * For arrays, only blocks shaped like `{ type: 'text', text: string }`
 * contribute; their trimmed, non-empty texts are joined with single spaces.
 *
 * @param {*} message - Gateway message object (anything else yields '').
 * @returns {string} Trimmed text, or '' when no text can be extracted.
 */
function extractTextFromGatewayMessage(message) {
  if (message === null || typeof message !== 'object') return '';

  const { content } = message;

  if (typeof content === 'string') {
    const trimmed = content.trim();
    if (trimmed) return trimmed;
  }

  if (!Array.isArray(content)) return '';

  const pieces = [];
  for (const block of content) {
    if (!block || typeof block !== 'object') continue;
    if (block.type !== 'text' || typeof block.text !== 'string') continue;

    const piece = block.text.trim();
    if (piece) pieces.push(piece);
  }
  return pieces.join(' ');
}
|
|
1675
|
+
|
|
1676
|
+
/**
 * Make sure a final assistant chat frame carries `metadata.spoken`.
 *
 * Only frames shaped like `{ type: 'event', event: 'chat', payload: { state:
 * 'final', message } }` are considered, and only when the message role is
 * 'assistant' or missing. An explicit `metadata.spoken` is passed through
 * `normalizeSpokenMetadata`; when the key is absent the sidecar is inferred
 * from the message text. An explicit but unusable `spoken` value is left as-is.
 *
 * @param {string} frameText - Raw JSON text of a gateway frame.
 * @returns {{frameText: string, changed: boolean, reason: string}}
 *   The (possibly rewritten) frame text, whether it differs from the input,
 *   and one of '', 'normalized', 'synthesized', 'synthesized_missing_role'.
 */
function ensureVoiceAssistantSpokenMetadata(frameText) {
  const untouched = { frameText, changed: false, reason: '' };

  const frame = parseJsonPayload(frameText);
  const isChatEvent =
    Boolean(frame) && typeof frame === 'object' && frame.type === 'event' && frame.event === 'chat';
  if (!isChatEvent) return untouched;

  const payload = frame.payload && typeof frame.payload === 'object' ? frame.payload : null;
  if (payload?.state !== 'final') return untouched;

  const message = payload.message && typeof payload.message === 'object' ? payload.message : null;
  if (!message) return untouched;

  // A missing role is tolerated (and reported in the reason); any other role is skipped.
  const messageRole = typeof message.role === 'string' ? message.role.trim() : '';
  if (messageRole !== '' && messageRole !== 'assistant') return untouched;

  const hasPlainMetadata =
    message.metadata && typeof message.metadata === 'object' && !Array.isArray(message.metadata);
  const originalMetadata = hasPlainMetadata ? message.metadata : {};
  const explicitSpokenPresent = Object.prototype.hasOwnProperty.call(originalMetadata, 'spoken');

  let spoken = normalizeSpokenMetadata(originalMetadata.spoken);
  if (!spoken && !explicitSpokenPresent) {
    // Nothing was supplied at all: derive a sidecar from the visible text.
    spoken = inferSpokenMetadataFromContent(extractTextFromGatewayMessage(message));
  }
  if (!spoken) return untouched;

  let reason = 'normalized';
  if (!explicitSpokenPresent) {
    reason = messageRole ? 'synthesized' : 'synthesized_missing_role';
  }

  const nextFrame = JSON.stringify({
    ...frame,
    payload: {
      ...payload,
      message: {
        ...message,
        metadata: { ...originalMetadata, spoken },
      },
    },
  });

  return {
    frameText: nextFrame,
    changed: nextFrame !== frameText,
    reason,
  };
}
|
|
1658
1731
|
|
|
1659
1732
|
function extractCorrelationId(params) {
|
|
1660
1733
|
if (!params || typeof params !== 'object') return '';
|
|
@@ -2882,11 +2955,18 @@ async function startOpenclawBridge(flags) {
|
|
|
2882
2955
|
flushSessionQueue(sessionBridge);
|
|
2883
2956
|
});
|
|
2884
2957
|
|
|
2885
|
-
gatewaySocket.on('message', runBridgeCallbackSafely((gatewayRaw) => {
|
|
2886
|
-
|
|
2887
|
-
|
|
2888
|
-
|
|
2889
|
-
|
|
2958
|
+
gatewaySocket.on('message', runBridgeCallbackSafely((gatewayRaw) => {
|
|
2959
|
+
let frame = typeof gatewayRaw === 'string' ? gatewayRaw : gatewayRaw.toString();
|
|
2960
|
+
if (classifyBridgeSessionScope(sessionId) === 'voice') {
|
|
2961
|
+
const spokenNormalized = ensureVoiceAssistantSpokenMetadata(frame);
|
|
2962
|
+
if (spokenNormalized.changed) {
|
|
2963
|
+
frame = spokenNormalized.frameText;
|
|
2964
|
+
console.log(`[bridge] voice.spoken_metadata.${spokenNormalized.reason} ${sessionId}`);
|
|
2965
|
+
}
|
|
2966
|
+
}
|
|
2967
|
+
const gatewayPayload = parseJsonPayload(frame);
|
|
2968
|
+
if (gatewayPayload?.event === 'connect.challenge') {
|
|
2969
|
+
console.log(`[bridge] gateway.connect.challenge ${sessionId}`);
|
|
2890
2970
|
const nonce =
|
|
2891
2971
|
gatewayPayload.payload && typeof gatewayPayload.payload.nonce === 'string'
|
|
2892
2972
|
? gatewayPayload.payload.nonce.trim()
|
|
@@ -4119,11 +4199,12 @@ if (__isDirectExecution) {
|
|
|
4119
4199
|
});
|
|
4120
4200
|
}
|
|
4121
4201
|
|
|
4122
|
-
export {
|
|
4123
|
-
prepareGatewayFrameForLocalGateway,
|
|
4124
|
-
|
|
4125
|
-
|
|
4126
|
-
|
|
4202
|
+
export {
|
|
4203
|
+
prepareGatewayFrameForLocalGateway,
|
|
4204
|
+
ensureVoiceAssistantSpokenMetadata,
|
|
4205
|
+
classifyBridgeFailure,
|
|
4206
|
+
classifyBridgeSessionScope,
|
|
4207
|
+
createBridgeProcessFaultHandler,
|
|
4127
4208
|
computeReconnectDelayMs,
|
|
4128
4209
|
resolveBridgeStatusForBrokerOpen,
|
|
4129
4210
|
resolveBridgeStatusForRuntimeFault,
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
/**
 * Return a trimmed string, or a fallback for non-strings and blank strings.
 *
 * @param {*} value - Candidate value.
 * @param {*} [fallback=''] - Returned when `value` is not a string or trims to ''.
 * @returns {*} The trimmed string or `fallback`.
 */
function trimString(value, fallback = '') {
  if (typeof value !== 'string') return fallback;

  const trimmed = value.trim();
  return trimmed === '' ? fallback : trimmed;
}
|
|
4
|
+
|
|
5
|
+
/**
 * Floor a finite number and bound it from above.
 *
 * Values that are not finite numbers, or that floor to less than `min`,
 * yield `fallback` (they are not raised to `min`). Values above `max` are
 * capped at `max`.
 *
 * @param {*} value - Candidate number.
 * @param {*} fallback - Returned for unusable or below-minimum input.
 * @param {{min?: number, max?: number}} [bounds] - Defaults: min 1, max Number.MAX_SAFE_INTEGER.
 * @returns {*} The floored, capped integer or `fallback`.
 */
function clampInteger(value, fallback, { min = 1, max = Number.MAX_SAFE_INTEGER } = {}) {
  const usable = typeof value === 'number' && Number.isFinite(value);
  if (!usable) return fallback;

  const floored = Math.floor(value);
  if (floored < min) return fallback;

  return floored > max ? max : floored;
}
|
|
12
|
+
|
|
13
|
+
// Language labels accepted for `metadata.spoken.language`; anything else is dropped.
const BOUNDED_LANGUAGE_TYPES = new Set(
  ['Auto', 'Chinese', 'English', 'German', 'Italian', 'Portuguese', 'Spanish', 'Japanese', 'Korean', 'French', 'Russian'],
);

// Allowed per-segment prosody values. Segment fields outside these sets are omitted.
const BOUNDED_PACE_VALUES = new Set([
  'very_slow',
  'slow',
  'medium',
  'medium_fast',
  'fast',
]);
const BOUNDED_PITCH_VALUES = new Set([
  'low',
  'slightly_low',
  'neutral',
  'slightly_high',
  'high',
]);
const BOUNDED_ENERGY_VALUES = new Set([
  'soft',
  'calm',
  'warm',
  'bright',
  'intense',
]);
const BOUNDED_VOLUME_VALUES = new Set([
  'soft',
  'normal',
  'projected',
]);
|
|
31
|
+
|
|
32
|
+
/**
 * Choose the language label used when spoken metadata has to be synthesized.
 *
 * No language detection is performed: every input maps to 'English'. The
 * `text` parameter is kept so callers have a stable signature if detection
 * is added later.
 *
 * @param {*} text - Text the label is meant to describe (currently unused).
 * @returns {string} Always 'English'.
 */
function inferSpokenLanguage(text) {
  return 'English';
}
|
|
37
|
+
|
|
38
|
+
/**
 * Validate one entry of `metadata.spoken.segments`.
 *
 * A segment must be a plain object with non-blank `text`. The prosody fields
 * (`pace`, `pitch`, `energy`, `volume`) are copied only when they match the
 * corresponding bounded set; `pause_after_ms` is always present, floored and
 * limited to 0..1200 with 0 as the fallback.
 *
 * @param {*} segment - Candidate segment.
 * @returns {object|null} The cleaned segment, or null when it is unusable.
 */
function normalizeSpokenSegment(segment) {
  const isPlainObject = Boolean(segment) && typeof segment === 'object' && !Array.isArray(segment);
  if (!isPlainObject) return null;

  const text = trimString(segment.text);
  if (text === '') return null;

  // Field order here fixes the key order of the emitted object.
  const boundedFields = [
    ['pace', BOUNDED_PACE_VALUES],
    ['pitch', BOUNDED_PITCH_VALUES],
    ['energy', BOUNDED_ENERGY_VALUES],
    ['volume', BOUNDED_VOLUME_VALUES],
  ];

  const normalized = { text };
  for (const [field, allowed] of boundedFields) {
    const candidate = trimString(segment[field]);
    if (allowed.has(candidate)) normalized[field] = candidate;
  }
  normalized.pause_after_ms = clampInteger(segment.pause_after_ms, 0, { min: 0, max: 1200 });

  return normalized;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Remove emoji from text.
 *
 * Variation selectors and the combining keycap mark are dropped first, then
 * whole emoji sequences are removed, including the zero-width joiners that
 * glue multi-person / gendered emoji together, so no invisible characters
 * are left behind in the speech text.
 *
 * @param {string} text - Input text.
 * @returns {string} Text without emoji characters.
 */
function stripEmoji(text) {
  return text
    .replace(/[\uFE0E\uFE0F\u20E3]/g, '')
    .replace(
      /(?:\p{Extended_Pictographic}|\p{Emoji_Presentation})(?:\u200D(?:\p{Extended_Pictographic}|\p{Emoji_Presentation}))*/gu,
      '',
    );
}

/**
 * Turn visible chat text into plain text suitable for TTS.
 *
 * Strips emoji and simple markdown emphasis/code markers, maps en/em dashes
 * to commas and the ellipsis character to '...', and tidies whitespace around
 * punctuation.
 *
 * @param {string} text - Visible message text.
 * @returns {string} Cleaned, single-spaced, trimmed text.
 */
function normalizeSpeechText(text) {
  return stripEmoji(text)
    .replace(/\*\*(.*?)\*\*/g, '$1')
    .replace(/__(.*?)__/g, '$1')
    .replace(/`([^`]+)`/g, '$1')
    .replace(/[\u2013\u2014]/g, ', ')
    .replace(/\u2026/g, '...')
    .replace(/\s+/g, ' ')
    .replace(/\s+([,.;!?])/g, '$1')
    // Add the missing space after a punctuation run that is glued to the next
    // word. The run is handled as a unit so '...' and '!!' are not split, a
    // closing quote/bracket stays attached, and digit separators such as
    // '3.14' or '1,000' are left intact.
    .replace(/[,.;!?]+(?=[^\s,.;!?"')\]}\u2019\u201D])/g, (run, offset, whole) => {
      const digitBefore = /\d/.test(whole[offset - 1] ?? '');
      const digitAfter = /\d/.test(whole[offset + run.length] ?? '');
      return digitBefore && digitAfter ? run : `${run} `;
    })
    .replace(/,\s*,+/g, ', ')
    .replace(/\s+/g, ' ')
    .trim();
}
|
|
78
|
+
|
|
79
|
+
/**
 * Break text into at most five speakable segments.
 *
 * The text is normalized, split after sentence-ending punctuation, and any
 * sentence longer than 96 characters is further split at ', ' boundaries
 * (re-attaching the comma to every clause but the last). If more than five
 * pieces result, the fifth and later pieces are merged into one.
 *
 * @param {string} text - Text to segment.
 * @returns {string[]} Ordered segments; empty when the text normalizes to ''.
 */
function splitSpeechSegments(text) {
  const cleaned = normalizeSpeechText(text);
  if (!cleaned) return [];

  const sentences = cleaned
    .split(/(?<=[.!?])\s+/)
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence !== '');

  const pieces = sentences.flatMap((sentence) => {
    if (sentence.length <= 96) return [sentence];

    const clauses = sentence
      .split(/,\s+/)
      .map((clause) => clause.trim())
      .filter((clause) => clause.length > 0);
    // A long sentence without usable comma breaks is kept whole.
    if (clauses.length <= 1) return [sentence];

    const lastPosition = clauses.length - 1;
    return clauses.map((clause, position) => {
      const keepAsIs = position === lastPosition || /[.!?]$/.test(clause);
      return keepAsIs ? clause : `${clause},`;
    });
  });

  if (pieces.length > 5) {
    const head = pieces.slice(0, 4);
    const tail = pieces.slice(4).join(' ').trim();
    return [...head, tail];
  }
  return pieces;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Guess prosody for one segment from its punctuation and wording.
 *
 * Precedence: a question mark wins, then exclamatory cues, then reflective
 * cues (certain phrases or more than 60 characters), then a neutral default.
 * Except for questions, a pause is added only when another segment follows.
 *
 * @param {string} segmentText - The segment's text.
 * @param {number} index - Zero-based position of the segment.
 * @param {number} totalSegments - Number of segments in the utterance.
 * @returns {{pace: string, pitch: string, energy: string, volume: string, pause_after_ms: number}}
 */
function inferSegmentStyle(segmentText, index, totalSegments) {
  const lowered = segmentText.toLowerCase();
  const hasFollowingSegment = index < totalSegments - 1;
  const pauseWhenFollowed = (ms) => (hasFollowingSegment ? ms : 0);

  if (segmentText.includes('?')) {
    // Questions never carry a trailing pause, regardless of position.
    return { pace: 'medium', pitch: 'slightly_high', energy: 'warm', volume: 'normal', pause_after_ms: 0 };
  }

  const soundsExcited =
    segmentText.includes('!') ||
    /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(lowered);
  if (soundsExcited) {
    return {
      pace: 'medium_fast',
      pitch: 'slightly_high',
      energy: 'bright',
      volume: 'normal',
      pause_after_ms: pauseWhenFollowed(220),
    };
  }

  const soundsReflective =
    /\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(lowered) ||
    segmentText.length > 60;

  // Reflective and default delivery differ only in how long the pause is.
  return {
    pace: 'medium',
    pitch: 'neutral',
    energy: 'warm',
    volume: 'normal',
    pause_after_ms: pauseWhenFollowed(soundsReflective ? 260 : 180),
  };
}
|
|
163
|
+
|
|
164
|
+
/**
 * Build a language label and styled segments for a piece of text.
 *
 * @param {string} text - Text to be spoken.
 * @returns {{language: string, segments: object[]}|null}
 *   The synthesized pair, or null when the text yields no segments.
 */
function synthesizeSpokenSegments(text) {
  const pieces = splitSpeechSegments(text);
  if (pieces.length === 0) return null;

  const total = pieces.length;
  const segments = [];
  pieces.forEach((piece, position) => {
    segments.push({ text: piece, ...inferSegmentStyle(piece, position, total) });
  });

  return { language: inferSpokenLanguage(text), segments };
}
|
|
179
|
+
|
|
180
|
+
/**
 * Validate a caller-supplied `metadata.spoken` object.
 *
 * Requires a plain object with non-blank `text`. `language` is kept only if
 * it is one of the bounded language labels, `segments` only for entries that
 * pass `normalizeSpokenSegment`, `instructions` only when non-blank, and
 * `style` only when it is a plain object (kept by reference). A missing
 * language or segment list is then filled in from the text.
 *
 * @param {*} spoken - Candidate spoken metadata.
 * @returns {object|null} The normalized metadata, or null when unusable.
 */
function normalizeSpokenMetadata(spoken) {
  if (spoken === null || typeof spoken !== 'object' || Array.isArray(spoken)) return null;

  const text = trimString(spoken.text);
  if (text === '') return null;

  // Properties are assigned in a fixed sequence; it determines the key order
  // of the object that is later serialized.
  const result = { text };

  const requestedLanguage = trimString(spoken.language);
  if (BOUNDED_LANGUAGE_TYPES.has(requestedLanguage)) result.language = requestedLanguage;

  const validSegments = [];
  if (Array.isArray(spoken.segments)) {
    for (const candidate of spoken.segments) {
      const cleaned = normalizeSpokenSegment(candidate);
      if (cleaned) validSegments.push(cleaned);
    }
  }
  if (validSegments.length > 0) result.segments = validSegments;

  const instructions = trimString(spoken.instructions);
  if (instructions !== '') result.instructions = instructions;

  const { style } = spoken;
  if (style && typeof style === 'object' && !Array.isArray(style)) result.style = style;

  // Fill whatever the caller left out from the text itself.
  const synthesized = synthesizeSpokenSegments(text);
  if (synthesized) {
    if (!result.language && synthesized.language) result.language = synthesized.language;
    if (!result.segments && synthesized.segments?.length) result.segments = synthesized.segments;
  }

  return result;
}
|
|
216
|
+
|
|
217
|
+
/**
 * Synthesize `metadata.spoken` from visible message content.
 *
 * The content is trimmed and normalized for speech, segmented, and tagged
 * with delivery instructions chosen by keyword/punctuation cues. Precedence:
 * upbeat, then gentle, then curious, then a neutral default.
 *
 * @param {*} content - Visible message text (non-strings are treated as '').
 * @returns {{text: string, language: string, segments: (object[]|undefined), instructions: string, style: object}|null}
 *   The synthesized metadata, or null when no speakable text remains.
 */
function inferSpokenMetadataFromContent(content) {
  const text = normalizeSpeechText(trimString(content));
  if (!text) return null;

  const synthesized = synthesizeSpokenSegments(text);
  const lowered = text.toLowerCase();

  let instructions = 'Speak naturally with light warmth and conversational pacing.';
  let style = { emotion: 'neutral', energy: 'medium' };

  if (
    text.includes('!') ||
    /\b(hell yeah|awesome|amazing|great|stoked|love|glad|perfect|nice|cool)\b/.test(lowered)
  ) {
    instructions = 'Speak with warm, upbeat conversational energy and natural pacing.';
    style = { emotion: 'upbeat', energy: 'medium' };
  } else if (/\b(sorry|gentle|softly|careful|reassuring|calm|okay|it'?s okay|i know)\b/.test(lowered)) {
    instructions = 'Speak gently and reassuringly, with a calm pace and soft emphasis.';
    style = { emotion: 'gentle', energy: 'low' };
  } else if (text.includes('?')) {
    instructions = 'Speak naturally with curious, engaged intonation and a conversational pace.';
    style = { emotion: 'curious', energy: 'medium' };
  }

  return {
    text,
    language: synthesized?.language || 'English',
    segments: synthesized?.segments,
    instructions,
    style,
  };
}
|
|
268
|
+
|
|
269
|
+
export {
|
|
270
|
+
inferSpokenMetadataFromContent,
|
|
271
|
+
normalizeSpokenMetadata,
|
|
272
|
+
normalizeSpeechText,
|
|
273
|
+
};
|
package/openclaw.extension.js
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
import { inferSpokenMetadataFromContent, normalizeSpokenMetadata } from './lib/spokenMetadata.js';
|
|
2
|
+
|
|
3
|
+
const CHANNEL_ID = 'oomi';
|
|
2
4
|
const DEFAULT_SESSION_KEY = 'agent:main:webchat:channel:oomi';
|
|
3
5
|
const DEFAULT_TIMEOUT_MS = 15000;
|
|
4
6
|
|
|
@@ -178,262 +180,6 @@ function extractCorrelationId(payload) {
|
|
|
178
180
|
return '';
|
|
179
181
|
}
|
|
180
182
|
|
|
181
|
-
const BOUNDED_LANGUAGE_TYPES = new Set([
|
|
182
|
-
'Auto',
|
|
183
|
-
'Chinese',
|
|
184
|
-
'English',
|
|
185
|
-
'German',
|
|
186
|
-
'Italian',
|
|
187
|
-
'Portuguese',
|
|
188
|
-
'Spanish',
|
|
189
|
-
'Japanese',
|
|
190
|
-
'Korean',
|
|
191
|
-
'French',
|
|
192
|
-
'Russian',
|
|
193
|
-
]);
|
|
194
|
-
|
|
195
|
-
const BOUNDED_PACE_VALUES = new Set(['very_slow', 'slow', 'medium', 'medium_fast', 'fast']);
|
|
196
|
-
const BOUNDED_PITCH_VALUES = new Set(['low', 'slightly_low', 'neutral', 'slightly_high', 'high']);
|
|
197
|
-
const BOUNDED_ENERGY_VALUES = new Set(['soft', 'calm', 'warm', 'bright', 'intense']);
|
|
198
|
-
const BOUNDED_VOLUME_VALUES = new Set(['soft', 'normal', 'projected']);
|
|
199
|
-
|
|
200
|
-
function inferSpokenLanguage(text) {
|
|
201
|
-
const normalized = toString(text);
|
|
202
|
-
if (!normalized) return 'English';
|
|
203
|
-
return 'English';
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
function normalizeSpokenSegment(segment) {
|
|
207
|
-
if (!segment || typeof segment !== 'object' || Array.isArray(segment)) return null;
|
|
208
|
-
|
|
209
|
-
const text = toString(segment.text);
|
|
210
|
-
if (!text) return null;
|
|
211
|
-
|
|
212
|
-
const normalized = { text };
|
|
213
|
-
const pace = toString(segment.pace);
|
|
214
|
-
const pitch = toString(segment.pitch);
|
|
215
|
-
const energy = toString(segment.energy);
|
|
216
|
-
const volume = toString(segment.volume);
|
|
217
|
-
const pauseAfterMs = toNumber(segment.pause_after_ms, 0, { min: 0, max: 1200 });
|
|
218
|
-
|
|
219
|
-
if (BOUNDED_PACE_VALUES.has(pace)) normalized.pace = pace;
|
|
220
|
-
if (BOUNDED_PITCH_VALUES.has(pitch)) normalized.pitch = pitch;
|
|
221
|
-
if (BOUNDED_ENERGY_VALUES.has(energy)) normalized.energy = energy;
|
|
222
|
-
if (BOUNDED_VOLUME_VALUES.has(volume)) normalized.volume = volume;
|
|
223
|
-
normalized.pause_after_ms = pauseAfterMs;
|
|
224
|
-
|
|
225
|
-
return normalized;
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
function splitSpeechSegments(text) {
|
|
229
|
-
const normalized = normalizeSpeechText(text);
|
|
230
|
-
if (!normalized) return [];
|
|
231
|
-
|
|
232
|
-
const baseSegments = normalized
|
|
233
|
-
.split(/(?<=[.!?])\s+/)
|
|
234
|
-
.map((segment) => segment.trim())
|
|
235
|
-
.filter(Boolean);
|
|
236
|
-
|
|
237
|
-
const segments = [];
|
|
238
|
-
for (const segment of baseSegments) {
|
|
239
|
-
if (segment.length <= 96) {
|
|
240
|
-
segments.push(segment);
|
|
241
|
-
continue;
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
const clauseParts = segment
|
|
245
|
-
.split(/,\s+/)
|
|
246
|
-
.map((part) => part.trim())
|
|
247
|
-
.filter(Boolean);
|
|
248
|
-
|
|
249
|
-
if (clauseParts.length > 1) {
|
|
250
|
-
for (let index = 0; index < clauseParts.length; index += 1) {
|
|
251
|
-
const part = clauseParts[index];
|
|
252
|
-
const needsComma = index < clauseParts.length - 1 && !/[.!?]$/.test(part);
|
|
253
|
-
segments.push(needsComma ? `${part},` : part);
|
|
254
|
-
}
|
|
255
|
-
continue;
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
segments.push(segment);
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
if (segments.length <= 5) return segments;
|
|
262
|
-
|
|
263
|
-
return [...segments.slice(0, 4), segments.slice(4).join(' ').trim()];
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
function inferSegmentStyle(segmentText, index, totalSegments) {
|
|
267
|
-
const normalized = segmentText.toLowerCase();
|
|
268
|
-
const exclamatory = /!/.test(segmentText) || /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(normalized);
|
|
269
|
-
const curious = /\?/.test(segmentText);
|
|
270
|
-
const reflective =
|
|
271
|
-
/\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(normalized) ||
|
|
272
|
-
segmentText.length > 60;
|
|
273
|
-
|
|
274
|
-
if (curious) {
|
|
275
|
-
return {
|
|
276
|
-
pace: 'medium',
|
|
277
|
-
pitch: 'slightly_high',
|
|
278
|
-
energy: 'warm',
|
|
279
|
-
volume: 'normal',
|
|
280
|
-
pause_after_ms: 0,
|
|
281
|
-
};
|
|
282
|
-
}
|
|
283
|
-
|
|
284
|
-
if (exclamatory) {
|
|
285
|
-
return {
|
|
286
|
-
pace: 'medium_fast',
|
|
287
|
-
pitch: 'slightly_high',
|
|
288
|
-
energy: 'bright',
|
|
289
|
-
volume: 'normal',
|
|
290
|
-
pause_after_ms: index < totalSegments - 1 ? 220 : 0,
|
|
291
|
-
};
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
if (reflective) {
|
|
295
|
-
return {
|
|
296
|
-
pace: 'medium',
|
|
297
|
-
pitch: 'neutral',
|
|
298
|
-
energy: 'warm',
|
|
299
|
-
volume: 'normal',
|
|
300
|
-
pause_after_ms: index < totalSegments - 1 ? 260 : 0,
|
|
301
|
-
};
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
return {
|
|
305
|
-
pace: 'medium',
|
|
306
|
-
pitch: 'neutral',
|
|
307
|
-
energy: 'warm',
|
|
308
|
-
volume: 'normal',
|
|
309
|
-
pause_after_ms: index < totalSegments - 1 ? 180 : 0,
|
|
310
|
-
};
|
|
311
|
-
}
|
|
312
|
-
|
|
313
|
-
function synthesizeSpokenSegments(text) {
|
|
314
|
-
const language = inferSpokenLanguage(text);
|
|
315
|
-
const rawSegments = splitSpeechSegments(text);
|
|
316
|
-
if (rawSegments.length === 0) return null;
|
|
317
|
-
|
|
318
|
-
const segments = rawSegments.map((segmentText, index) => ({
|
|
319
|
-
text: segmentText,
|
|
320
|
-
...inferSegmentStyle(segmentText, index, rawSegments.length),
|
|
321
|
-
}));
|
|
322
|
-
|
|
323
|
-
return {
|
|
324
|
-
language,
|
|
325
|
-
segments,
|
|
326
|
-
};
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
function normalizeSpokenMetadata(spoken) {
|
|
330
|
-
if (!spoken || typeof spoken !== 'object' || Array.isArray(spoken)) return null;
|
|
331
|
-
|
|
332
|
-
const text = toString(spoken.text);
|
|
333
|
-
if (!text) return null;
|
|
334
|
-
|
|
335
|
-
const normalized = { text };
|
|
336
|
-
const language = toString(spoken.language);
|
|
337
|
-
if (BOUNDED_LANGUAGE_TYPES.has(language)) {
|
|
338
|
-
normalized.language = language;
|
|
339
|
-
}
|
|
340
|
-
|
|
341
|
-
const explicitSegments =
|
|
342
|
-
Array.isArray(spoken.segments)
|
|
343
|
-
? spoken.segments.map((segment) => normalizeSpokenSegment(segment)).filter(Boolean)
|
|
344
|
-
: [];
|
|
345
|
-
if (explicitSegments.length > 0) {
|
|
346
|
-
normalized.segments = explicitSegments;
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
const instructions = toString(spoken.instructions);
|
|
350
|
-
if (instructions) normalized.instructions = instructions;
|
|
351
|
-
if (spoken.style && typeof spoken.style === 'object' && !Array.isArray(spoken.style)) {
|
|
352
|
-
normalized.style = spoken.style;
|
|
353
|
-
}
|
|
354
|
-
|
|
355
|
-
const fallbackSegments = synthesizeSpokenSegments(text);
|
|
356
|
-
if (!normalized.language && fallbackSegments?.language) {
|
|
357
|
-
normalized.language = fallbackSegments.language;
|
|
358
|
-
}
|
|
359
|
-
if (!normalized.segments && fallbackSegments?.segments?.length) {
|
|
360
|
-
normalized.segments = fallbackSegments.segments;
|
|
361
|
-
}
|
|
362
|
-
|
|
363
|
-
return normalized;
|
|
364
|
-
}
|
|
365
|
-
|
|
366
|
-
function stripEmoji(text) {
|
|
367
|
-
return text.replace(/[\uFE0E\uFE0F]/g, '').replace(/\p{Extended_Pictographic}|\p{Emoji_Presentation}/gu, '');
|
|
368
|
-
}
|
|
369
|
-
|
|
370
|
-
function normalizeSpeechText(text) {
|
|
371
|
-
return stripEmoji(text)
|
|
372
|
-
.replace(/\*\*(.*?)\*\*/g, '$1')
|
|
373
|
-
.replace(/__(.*?)__/g, '$1')
|
|
374
|
-
.replace(/`([^`]+)`/g, '$1')
|
|
375
|
-
.replace(/[–—]/g, ', ')
|
|
376
|
-
.replace(/…/g, '...')
|
|
377
|
-
.replace(/\s+/g, ' ')
|
|
378
|
-
.replace(/\s+([,.;!?])/g, '$1')
|
|
379
|
-
.replace(/([,.;!?])(?=[^\s])/g, '$1 ')
|
|
380
|
-
.replace(/,\s*,+/g, ', ')
|
|
381
|
-
.replace(/\s+/g, ' ')
|
|
382
|
-
.trim();
|
|
383
|
-
}
|
|
384
|
-
|
|
385
|
-
function inferSpokenMetadataFromContent(content) {
|
|
386
|
-
const text = normalizeSpeechText(toString(content));
|
|
387
|
-
if (!text) return null;
|
|
388
|
-
const synthesized = synthesizeSpokenSegments(text);
|
|
389
|
-
|
|
390
|
-
const normalized = text.toLowerCase();
|
|
391
|
-
const upbeat =
|
|
392
|
-
/!/.test(text) ||
|
|
393
|
-
/\b(hell yeah|awesome|amazing|great|stoked|love|glad|perfect|nice|cool)\b/.test(normalized);
|
|
394
|
-
const gentle =
|
|
395
|
-
/\b(sorry|gentle|softly|careful|reassuring|calm|okay|it'?s okay|i know)\b/.test(normalized);
|
|
396
|
-
const curious = /\?/.test(text);
|
|
397
|
-
|
|
398
|
-
if (upbeat) {
|
|
399
|
-
return {
|
|
400
|
-
text,
|
|
401
|
-
language: synthesized?.language || 'English',
|
|
402
|
-
segments: synthesized?.segments,
|
|
403
|
-
instructions: 'Speak with warm, upbeat conversational energy and natural pacing.',
|
|
404
|
-
style: { emotion: 'upbeat', energy: 'medium' },
|
|
405
|
-
};
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
if (gentle) {
|
|
409
|
-
return {
|
|
410
|
-
text,
|
|
411
|
-
language: synthesized?.language || 'English',
|
|
412
|
-
segments: synthesized?.segments,
|
|
413
|
-
instructions: 'Speak gently and reassuringly, with a calm pace and soft emphasis.',
|
|
414
|
-
style: { emotion: 'gentle', energy: 'low' },
|
|
415
|
-
};
|
|
416
|
-
}
|
|
417
|
-
|
|
418
|
-
if (curious) {
|
|
419
|
-
return {
|
|
420
|
-
text,
|
|
421
|
-
language: synthesized?.language || 'English',
|
|
422
|
-
segments: synthesized?.segments,
|
|
423
|
-
instructions: 'Speak naturally with curious, engaged intonation and a conversational pace.',
|
|
424
|
-
style: { emotion: 'curious', energy: 'medium' },
|
|
425
|
-
};
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
return {
|
|
429
|
-
text,
|
|
430
|
-
language: synthesized?.language || 'English',
|
|
431
|
-
segments: synthesized?.segments,
|
|
432
|
-
instructions: 'Speak naturally with light warmth and conversational pacing.',
|
|
433
|
-
style: { emotion: 'neutral', energy: 'medium' },
|
|
434
|
-
};
|
|
435
|
-
}
|
|
436
|
-
|
|
437
183
|
function normalizeOutgoingMetadata(payloadMetadata, { accountId, correlationId, content }) {
|
|
438
184
|
const metadata =
|
|
439
185
|
payloadMetadata && typeof payloadMetadata === 'object' && !Array.isArray(payloadMetadata)
|
package/openclaw.plugin.json
CHANGED
package/package.json
CHANGED
package/skills/oomi/SKILL.md
CHANGED
|
@@ -168,16 +168,16 @@ Use this shape when a voice turn needs more natural delivery without changing vi
|
|
|
168
168
|
}
|
|
169
169
|
```
|
|
170
170
|
|
|
171
|
-
Rules:
|
|
172
|
-
- keep visible assistant `content` clean and user-facing
|
|
173
|
-
- do not place raw intonation tags in visible chat
|
|
174
|
-
- for managed voice replies, include `metadata.spoken` when
|
|
175
|
-
- `metadata.spoken.text` is backend TTS input only
|
|
176
|
-
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
177
|
-
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, volume, and pause timing
|
|
178
|
-
- `metadata.spoken.instructions` should use natural-language speaking guidance
|
|
179
|
-
- if the
|
|
180
|
-
-
|
|
171
|
+
Rules:
|
|
172
|
+
- keep visible assistant `content` clean and user-facing
|
|
173
|
+
- do not place raw intonation tags in visible chat
|
|
174
|
+
- for managed cloned-voice replies, include `metadata.spoken` when backend TTS should speak the turn
|
|
175
|
+
- `metadata.spoken.text` is backend TTS input only
|
|
176
|
+
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
177
|
+
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, volume, and pause timing
|
|
178
|
+
- `metadata.spoken.instructions` should use natural-language speaking guidance
|
|
179
|
+
- if you omit `metadata.spoken`, the shared package helper may synthesize it as a compatibility guardrail before backend TTS
|
|
180
|
+
- backend cloned voice is strict: if `metadata.spoken` does not reach Oomi, playback fails instead of falling back to flat speech
|
|
181
181
|
|
|
182
182
|
## Avatar Control
|
|
183
183
|
|