oomi-ai 0.2.20 → 0.2.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -4
- package/agent_instructions.md +14 -14
- package/bin/oomi-ai.js +53 -13
- package/lib/spokenMetadata.js +273 -273
- package/openclaw.extension.js +6 -261
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/skills/oomi/SKILL.md +10 -10
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@ OpenClaw channel plugin and bridge tooling for Oomi managed chat and voice.
|
|
|
4
4
|
|
|
5
5
|
## Current Focus
|
|
6
6
|
|
|
7
|
-
`0.2.
|
|
7
|
+
`0.2.21` adds the first live persona automation lane:
|
|
8
8
|
- WebSpatial-based persona scaffolding for generated Oomi apps
|
|
9
9
|
- a high-level `oomi personas create-managed` command for agent-driven persona creation
|
|
10
10
|
- device-authenticated persona runtime registration and job callbacks
|
|
@@ -141,8 +141,12 @@ That bridge:
|
|
|
141
141
|
|
|
142
142
|
This is the part of the package most likely to matter when debugging voice turn failures.
|
|
143
143
|
|
|
144
|
-
For managed voice replies, the
|
|
145
|
-
|
|
144
|
+
For managed cloned-voice replies, the canonical contract is:
|
|
145
|
+
- visible assistant `content` stays user-facing
|
|
146
|
+
- hidden `metadata.spoken` carries the backend TTS payload
|
|
147
|
+
- the shared helper in `lib/spokenMetadata.js` is used by both the extension and the local bridge to preserve or normalize that sidecar before it reaches the backend
|
|
148
|
+
|
|
149
|
+
The backend cloned-voice path is intentionally strict. If `metadata.spoken` does not reach Oomi, backend TTS fails instead of speaking a flat fallback voice.
|
|
146
150
|
|
|
147
151
|
## Persona Scaffolding
|
|
148
152
|
|
|
@@ -242,7 +246,7 @@ If you are inspecting this package on npm, the main architectural points are:
|
|
|
242
246
|
- `idempotencyKey` handling
|
|
243
247
|
- bridge status that does not report `connected` before managed subscription is ready
|
|
244
248
|
- runtime fault isolation so local session failures are less likely to crash the whole provider
|
|
245
|
-
- hidden managed-voice speech metadata
|
|
249
|
+
- one shared hidden managed-voice speech metadata helper used by both the extension and the local bridge
|
|
246
250
|
|
|
247
251
|
If you are developing the plugin, test the packaged surface with:
|
|
248
252
|
|
package/agent_instructions.md
CHANGED
|
@@ -160,20 +160,20 @@ When the runtime supports it, voice turns may include a hidden speech sidecar on
|
|
|
160
160
|
}
|
|
161
161
|
```
|
|
162
162
|
|
|
163
|
-
Rules:
|
|
164
|
-
- visible `content` remains the source of truth for Oomi chat rendering
|
|
165
|
-
- for managed voice replies, include `metadata.spoken`
|
|
166
|
-
- `metadata.spoken.text` is for backend TTS only
|
|
167
|
-
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
168
|
-
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, volume, and pause timing
|
|
169
|
-
- `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
|
|
170
|
-
- `metadata.spoken.style` is optional metadata for debugging/future mapping
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
- if you
|
|
175
|
-
-
|
|
176
|
-
-
|
|
163
|
+
Rules:
|
|
164
|
+
- visible `content` remains the source of truth for Oomi chat rendering
|
|
165
|
+
- for managed cloned-voice replies, include `metadata.spoken` whenever backend TTS should speak the turn
|
|
166
|
+
- `metadata.spoken.text` is for backend TTS only
|
|
167
|
+
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
168
|
+
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, volume, and pause timing
|
|
169
|
+
- `metadata.spoken.instructions` should be natural-language guidance, not raw bracket tags
|
|
170
|
+
- `metadata.spoken.style` is optional metadata for debugging/future mapping
|
|
171
|
+
|
|
172
|
+
Current package behavior:
|
|
173
|
+
- if you provide `metadata.spoken`, the package preserves it unchanged
|
|
174
|
+
- if you omit `metadata.spoken`, the shared package helper may synthesize it as a compatibility guardrail before backend TTS
|
|
175
|
+
- visible chat text is never rewritten by the package
|
|
176
|
+
- backend cloned voice is strict: if `metadata.spoken` does not reach Oomi, playback fails instead of falling back to flat speech
|
|
177
177
|
|
|
178
178
|
## Avatar Commands
|
|
179
179
|
|
package/bin/oomi-ai.js
CHANGED
|
@@ -1673,6 +1673,31 @@ function extractTextFromGatewayMessage(message) {
|
|
|
1673
1673
|
.join(' ');
|
|
1674
1674
|
}
|
|
1675
1675
|
|
|
1676
|
+
/**
 * Build a small, log-friendly summary of a voice gateway frame.
 *
 * The summary reports only structural facts (event, state, role, visible
 * content length, and whether `metadata.spoken` is present and normalizable);
 * it does not include the message text itself.
 *
 * @param {string} frameText - Raw frame text handed to `parseJsonPayload`.
 * @returns {object} `{ parseable: false }` when the frame does not parse to an
 *   object, otherwise the summary fields listed in the return statement.
 */
function summarizeVoiceFrameContract(frameText) {
  const asObject = (value) => (value && typeof value === 'object' ? value : {});
  const asString = (value) => (typeof value === 'string' ? value : '');

  const parsedFrame = parseJsonPayload(frameText);
  if (!parsedFrame || typeof parsedFrame !== 'object') {
    return { parseable: false };
  }

  const framePayload = asObject(parsedFrame.payload);
  const gatewayMessage = asObject(framePayload.message);
  const messageMetadata = asObject(gatewayMessage.metadata);

  // Distinguish "key absent" from "key present with an unusable value".
  const spokenKeyPresent = Object.prototype.hasOwnProperty.call(messageMetadata, 'spoken');
  const rawSpoken = spokenKeyPresent ? messageMetadata.spoken : undefined;
  const normalizedSpoken = normalizeSpokenMetadata(rawSpoken);
  const visibleText = extractTextFromGatewayMessage(gatewayMessage);

  let rawSpokenType;
  if (rawSpoken === undefined) {
    rawSpokenType = 'missing';
  } else if (Array.isArray(rawSpoken)) {
    rawSpokenType = 'array';
  } else {
    rawSpokenType = typeof rawSpoken;
  }

  const normalizedSegments = normalizedSpoken?.segments;

  return {
    parseable: true,
    event: asString(parsedFrame.event),
    state: asString(framePayload.state),
    role: asString(gatewayMessage.role),
    contentLength: visibleText.length,
    hasMetadata: Object.keys(messageMetadata).length > 0,
    hasSpokenKey: spokenKeyPresent,
    spokenRawType: rawSpokenType,
    spokenNormalized: Boolean(normalizedSpoken),
    spokenSegmentCount: Array.isArray(normalizedSegments) ? normalizedSegments.length : 0,
  };
}
|
|
1700
|
+
|
|
1676
1701
|
function ensureVoiceAssistantSpokenMetadata(frameText) {
|
|
1677
1702
|
const frame = parseJsonPayload(frameText);
|
|
1678
1703
|
if (!frame || typeof frame !== 'object') {
|
|
@@ -1688,7 +1713,12 @@ function ensureVoiceAssistantSpokenMetadata(frameText) {
|
|
|
1688
1713
|
}
|
|
1689
1714
|
|
|
1690
1715
|
const message = payload.message && typeof payload.message === 'object' ? payload.message : null;
|
|
1691
|
-
if (!message
|
|
1716
|
+
if (!message) {
|
|
1717
|
+
return { frameText, changed: false, reason: '' };
|
|
1718
|
+
}
|
|
1719
|
+
|
|
1720
|
+
const messageRole = typeof message.role === 'string' ? message.role.trim() : '';
|
|
1721
|
+
if (messageRole && messageRole !== 'assistant') {
|
|
1692
1722
|
return { frameText, changed: false, reason: '' };
|
|
1693
1723
|
}
|
|
1694
1724
|
|
|
@@ -1697,10 +1727,10 @@ function ensureVoiceAssistantSpokenMetadata(frameText) {
|
|
|
1697
1727
|
? message.metadata
|
|
1698
1728
|
: {};
|
|
1699
1729
|
const metadata = { ...originalMetadata };
|
|
1700
|
-
const
|
|
1730
|
+
const normalizedExplicitSpoken = normalizeSpokenMetadata(originalMetadata.spoken);
|
|
1701
1731
|
const spoken =
|
|
1702
|
-
|
|
1703
|
-
|
|
1732
|
+
normalizedExplicitSpoken ||
|
|
1733
|
+
inferSpokenMetadataFromContent(extractTextFromGatewayMessage(message));
|
|
1704
1734
|
if (!spoken) {
|
|
1705
1735
|
return { frameText, changed: false, reason: '' };
|
|
1706
1736
|
}
|
|
@@ -1720,7 +1750,7 @@ function ensureVoiceAssistantSpokenMetadata(frameText) {
|
|
|
1720
1750
|
return {
|
|
1721
1751
|
frameText: nextFrame,
|
|
1722
1752
|
changed: nextFrame !== frameText,
|
|
1723
|
-
reason:
|
|
1753
|
+
reason: normalizedExplicitSpoken ? 'normalized' : (messageRole ? 'synthesized' : 'synthesized_missing_role'),
|
|
1724
1754
|
};
|
|
1725
1755
|
}
|
|
1726
1756
|
|
|
@@ -2953,10 +2983,16 @@ async function startOpenclawBridge(flags) {
|
|
|
2953
2983
|
gatewaySocket.on('message', runBridgeCallbackSafely((gatewayRaw) => {
|
|
2954
2984
|
let frame = typeof gatewayRaw === 'string' ? gatewayRaw : gatewayRaw.toString();
|
|
2955
2985
|
if (classifyBridgeSessionScope(sessionId) === 'voice') {
|
|
2986
|
+
const beforeSummary = summarizeVoiceFrameContract(frame);
|
|
2956
2987
|
const spokenNormalized = ensureVoiceAssistantSpokenMetadata(frame);
|
|
2957
2988
|
if (spokenNormalized.changed) {
|
|
2958
2989
|
frame = spokenNormalized.frameText;
|
|
2959
|
-
console.log(`[bridge] voice.spoken_metadata.${spokenNormalized.reason} ${sessionId}
|
|
2990
|
+
console.log(`[bridge] voice.spoken_metadata.${spokenNormalized.reason} ${sessionId} ${JSON.stringify({
|
|
2991
|
+
before: beforeSummary,
|
|
2992
|
+
after: summarizeVoiceFrameContract(frame),
|
|
2993
|
+
})}`);
|
|
2994
|
+
} else if (beforeSummary.event === 'chat' && beforeSummary.state === 'final') {
|
|
2995
|
+
console.log(`[bridge] voice.chat.final ${sessionId} ${JSON.stringify(beforeSummary)}`);
|
|
2960
2996
|
}
|
|
2961
2997
|
}
|
|
2962
2998
|
const gatewayPayload = parseJsonPayload(frame);
|
|
@@ -3313,13 +3349,17 @@ async function startOpenclawBridge(flags) {
|
|
|
3313
3349
|
return;
|
|
3314
3350
|
}
|
|
3315
3351
|
|
|
3316
|
-
if (payload.type === 'client.frame') {
|
|
3317
|
-
const sessionId = String(payload.sessionId || '').trim();
|
|
3318
|
-
const frame = typeof payload.frame === 'string' ? payload.frame : '';
|
|
3319
|
-
if (!sessionId || !frame) return;
|
|
3320
|
-
|
|
3321
|
-
|
|
3322
|
-
|
|
3352
|
+
if (payload.type === 'client.frame') {
|
|
3353
|
+
const sessionId = String(payload.sessionId || '').trim();
|
|
3354
|
+
const frame = typeof payload.frame === 'string' ? payload.frame : '';
|
|
3355
|
+
if (!sessionId || !frame) return;
|
|
3356
|
+
if (classifyBridgeSessionScope(sessionId) === 'voice') {
|
|
3357
|
+
console.log(`[bridge] client.frame ${sessionId} ${JSON.stringify(summarizeVoiceFrameContract(frame))}`);
|
|
3358
|
+
} else {
|
|
3359
|
+
console.log(`[bridge] client.frame ${sessionId}`);
|
|
3360
|
+
}
|
|
3361
|
+
const sessionBridge = getOrCreateGatewaySession(sessionId);
|
|
3362
|
+
if (!sessionBridge) return;
|
|
3323
3363
|
const requestMeta = extractGatewayRequestMeta(frame);
|
|
3324
3364
|
if (requestMeta) {
|
|
3325
3365
|
if (!(sessionBridge.pendingRequests instanceof Map)) {
|
package/lib/spokenMetadata.js
CHANGED
|
@@ -1,273 +1,273 @@
|
|
|
1
|
-
function trimString(value, fallback = '') {
|
|
2
|
-
return typeof value === 'string' && value.trim() ? value.trim() : fallback;
|
|
3
|
-
}
|
|
4
|
-
|
|
5
|
-
function clampInteger(value, fallback, { min = 1, max = Number.MAX_SAFE_INTEGER } = {}) {
|
|
6
|
-
if (typeof value !== 'number' || !Number.isFinite(value)) return fallback;
|
|
7
|
-
const normalized = Math.floor(value);
|
|
8
|
-
if (normalized < min) return fallback;
|
|
9
|
-
if (normalized > max) return max;
|
|
10
|
-
return normalized;
|
|
11
|
-
}
|
|
12
|
-
|
|
13
|
-
const BOUNDED_LANGUAGE_TYPES = new Set([
|
|
14
|
-
'Auto',
|
|
15
|
-
'Chinese',
|
|
16
|
-
'English',
|
|
17
|
-
'German',
|
|
18
|
-
'Italian',
|
|
19
|
-
'Portuguese',
|
|
20
|
-
'Spanish',
|
|
21
|
-
'Japanese',
|
|
22
|
-
'Korean',
|
|
23
|
-
'French',
|
|
24
|
-
'Russian',
|
|
25
|
-
]);
|
|
26
|
-
|
|
27
|
-
const BOUNDED_PACE_VALUES = new Set(['very_slow', 'slow', 'medium', 'medium_fast', 'fast']);
|
|
28
|
-
const BOUNDED_PITCH_VALUES = new Set(['low', 'slightly_low', 'neutral', 'slightly_high', 'high']);
|
|
29
|
-
const BOUNDED_ENERGY_VALUES = new Set(['soft', 'calm', 'warm', 'bright', 'intense']);
|
|
30
|
-
const BOUNDED_VOLUME_VALUES = new Set(['soft', 'normal', 'projected']);
|
|
31
|
-
|
|
32
|
-
function inferSpokenLanguage(text) {
|
|
33
|
-
const normalized = trimString(text);
|
|
34
|
-
if (!normalized) return 'English';
|
|
35
|
-
return 'English';
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
function normalizeSpokenSegment(segment) {
|
|
39
|
-
if (!segment || typeof segment !== 'object' || Array.isArray(segment)) return null;
|
|
40
|
-
|
|
41
|
-
const text = trimString(segment.text);
|
|
42
|
-
if (!text) return null;
|
|
43
|
-
|
|
44
|
-
const normalized = { text };
|
|
45
|
-
const pace = trimString(segment.pace);
|
|
46
|
-
const pitch = trimString(segment.pitch);
|
|
47
|
-
const energy = trimString(segment.energy);
|
|
48
|
-
const volume = trimString(segment.volume);
|
|
49
|
-
const pauseAfterMs = clampInteger(segment.pause_after_ms, 0, { min: 0, max: 1200 });
|
|
50
|
-
|
|
51
|
-
if (BOUNDED_PACE_VALUES.has(pace)) normalized.pace = pace;
|
|
52
|
-
if (BOUNDED_PITCH_VALUES.has(pitch)) normalized.pitch = pitch;
|
|
53
|
-
if (BOUNDED_ENERGY_VALUES.has(energy)) normalized.energy = energy;
|
|
54
|
-
if (BOUNDED_VOLUME_VALUES.has(volume)) normalized.volume = volume;
|
|
55
|
-
normalized.pause_after_ms = pauseAfterMs;
|
|
56
|
-
|
|
57
|
-
return normalized;
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
function stripEmoji(text) {
|
|
61
|
-
return text.replace(/[\uFE0E\uFE0F]/g, '').replace(/\p{Extended_Pictographic}|\p{Emoji_Presentation}/gu, '');
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
function normalizeSpeechText(text) {
|
|
65
|
-
return stripEmoji(text)
|
|
66
|
-
.replace(/\*\*(.*?)\*\*/g, '$1')
|
|
67
|
-
.replace(/__(.*?)__/g, '$1')
|
|
68
|
-
.replace(/`([^`]+)`/g, '$1')
|
|
69
|
-
.replace(/[
|
|
70
|
-
.replace(
|
|
71
|
-
.replace(/\s+/g, ' ')
|
|
72
|
-
.replace(/\s+([,.;!?])/g, '$1')
|
|
73
|
-
.replace(/([,.;!?])(?=[^\s])/g, '$1 ')
|
|
74
|
-
.replace(/,\s*,+/g, ', ')
|
|
75
|
-
.replace(/\s+/g, ' ')
|
|
76
|
-
.trim();
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
function splitSpeechSegments(text) {
|
|
80
|
-
const normalized = normalizeSpeechText(text);
|
|
81
|
-
if (!normalized) return [];
|
|
82
|
-
|
|
83
|
-
const baseSegments = normalized
|
|
84
|
-
.split(/(?<=[.!?])\s+/)
|
|
85
|
-
.map((segment) => segment.trim())
|
|
86
|
-
.filter(Boolean);
|
|
87
|
-
|
|
88
|
-
const segments = [];
|
|
89
|
-
for (const segment of baseSegments) {
|
|
90
|
-
if (segment.length <= 96) {
|
|
91
|
-
segments.push(segment);
|
|
92
|
-
continue;
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
const clauseParts = segment
|
|
96
|
-
.split(/,\s+/)
|
|
97
|
-
.map((part) => part.trim())
|
|
98
|
-
.filter(Boolean);
|
|
99
|
-
|
|
100
|
-
if (clauseParts.length > 1) {
|
|
101
|
-
for (let index = 0; index < clauseParts.length; index += 1) {
|
|
102
|
-
const part = clauseParts[index];
|
|
103
|
-
const needsComma = index < clauseParts.length - 1 && !/[.!?]$/.test(part);
|
|
104
|
-
segments.push(needsComma ? `${part},` : part);
|
|
105
|
-
}
|
|
106
|
-
continue;
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
segments.push(segment);
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
if (segments.length <= 5) return segments;
|
|
113
|
-
|
|
114
|
-
return [...segments.slice(0, 4), segments.slice(4).join(' ').trim()];
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
function inferSegmentStyle(segmentText, index, totalSegments) {
|
|
118
|
-
const normalized = segmentText.toLowerCase();
|
|
119
|
-
const exclamatory = /!/.test(segmentText) || /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(normalized);
|
|
120
|
-
const curious = /\?/.test(segmentText);
|
|
121
|
-
const reflective =
|
|
122
|
-
/\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(normalized) ||
|
|
123
|
-
segmentText.length > 60;
|
|
124
|
-
|
|
125
|
-
if (curious) {
|
|
126
|
-
return {
|
|
127
|
-
pace: 'medium',
|
|
128
|
-
pitch: 'slightly_high',
|
|
129
|
-
energy: 'warm',
|
|
130
|
-
volume: 'normal',
|
|
131
|
-
pause_after_ms: 0,
|
|
132
|
-
};
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
if (exclamatory) {
|
|
136
|
-
return {
|
|
137
|
-
pace: 'medium_fast',
|
|
138
|
-
pitch: 'slightly_high',
|
|
139
|
-
energy: 'bright',
|
|
140
|
-
volume: 'normal',
|
|
141
|
-
pause_after_ms: index < totalSegments - 1 ? 220 : 0,
|
|
142
|
-
};
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
if (reflective) {
|
|
146
|
-
return {
|
|
147
|
-
pace: 'medium',
|
|
148
|
-
pitch: 'neutral',
|
|
149
|
-
energy: 'warm',
|
|
150
|
-
volume: 'normal',
|
|
151
|
-
pause_after_ms: index < totalSegments - 1 ? 260 : 0,
|
|
152
|
-
};
|
|
153
|
-
}
|
|
154
|
-
|
|
155
|
-
return {
|
|
156
|
-
pace: 'medium',
|
|
157
|
-
pitch: 'neutral',
|
|
158
|
-
energy: 'warm',
|
|
159
|
-
volume: 'normal',
|
|
160
|
-
pause_after_ms: index < totalSegments - 1 ? 180 : 0,
|
|
161
|
-
};
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
function synthesizeSpokenSegments(text) {
|
|
165
|
-
const language = inferSpokenLanguage(text);
|
|
166
|
-
const rawSegments = splitSpeechSegments(text);
|
|
167
|
-
if (rawSegments.length === 0) return null;
|
|
168
|
-
|
|
169
|
-
const segments = rawSegments.map((segmentText, index) => ({
|
|
170
|
-
text: segmentText,
|
|
171
|
-
...inferSegmentStyle(segmentText, index, rawSegments.length),
|
|
172
|
-
}));
|
|
173
|
-
|
|
174
|
-
return {
|
|
175
|
-
language,
|
|
176
|
-
segments,
|
|
177
|
-
};
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
function normalizeSpokenMetadata(spoken) {
|
|
181
|
-
if (!spoken || typeof spoken !== 'object' || Array.isArray(spoken)) return null;
|
|
182
|
-
|
|
183
|
-
const text = trimString(spoken.text);
|
|
184
|
-
if (!text) return null;
|
|
185
|
-
|
|
186
|
-
const normalized = { text };
|
|
187
|
-
const language = trimString(spoken.language);
|
|
188
|
-
if (BOUNDED_LANGUAGE_TYPES.has(language)) {
|
|
189
|
-
normalized.language = language;
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
const explicitSegments =
|
|
193
|
-
Array.isArray(spoken.segments)
|
|
194
|
-
? spoken.segments.map((segment) => normalizeSpokenSegment(segment)).filter(Boolean)
|
|
195
|
-
: [];
|
|
196
|
-
if (explicitSegments.length > 0) {
|
|
197
|
-
normalized.segments = explicitSegments;
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
const instructions = trimString(spoken.instructions);
|
|
201
|
-
if (instructions) normalized.instructions = instructions;
|
|
202
|
-
if (spoken.style && typeof spoken.style === 'object' && !Array.isArray(spoken.style)) {
|
|
203
|
-
normalized.style = spoken.style;
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
const fallbackSegments = synthesizeSpokenSegments(text);
|
|
207
|
-
if (!normalized.language && fallbackSegments?.language) {
|
|
208
|
-
normalized.language = fallbackSegments.language;
|
|
209
|
-
}
|
|
210
|
-
if (!normalized.segments && fallbackSegments?.segments?.length) {
|
|
211
|
-
normalized.segments = fallbackSegments.segments;
|
|
212
|
-
}
|
|
213
|
-
|
|
214
|
-
return normalized;
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
function inferSpokenMetadataFromContent(content) {
|
|
218
|
-
const text = normalizeSpeechText(trimString(content));
|
|
219
|
-
if (!text) return null;
|
|
220
|
-
const synthesized = synthesizeSpokenSegments(text);
|
|
221
|
-
|
|
222
|
-
const normalized = text.toLowerCase();
|
|
223
|
-
const upbeat =
|
|
224
|
-
/!/.test(text) ||
|
|
225
|
-
/\b(hell yeah|awesome|amazing|great|stoked|love|glad|perfect|nice|cool)\b/.test(normalized);
|
|
226
|
-
const gentle =
|
|
227
|
-
/\b(sorry|gentle|softly|careful|reassuring|calm|okay|it'?s okay|i know)\b/.test(normalized);
|
|
228
|
-
const curious = /\?/.test(text);
|
|
229
|
-
|
|
230
|
-
if (upbeat) {
|
|
231
|
-
return {
|
|
232
|
-
text,
|
|
233
|
-
language: synthesized?.language || 'English',
|
|
234
|
-
segments: synthesized?.segments,
|
|
235
|
-
instructions: 'Speak with warm, upbeat conversational energy and natural pacing.',
|
|
236
|
-
style: { emotion: 'upbeat', energy: 'medium' },
|
|
237
|
-
};
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
if (gentle) {
|
|
241
|
-
return {
|
|
242
|
-
text,
|
|
243
|
-
language: synthesized?.language || 'English',
|
|
244
|
-
segments: synthesized?.segments,
|
|
245
|
-
instructions: 'Speak gently and reassuringly, with a calm pace and soft emphasis.',
|
|
246
|
-
style: { emotion: 'gentle', energy: 'low' },
|
|
247
|
-
};
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
if (curious) {
|
|
251
|
-
return {
|
|
252
|
-
text,
|
|
253
|
-
language: synthesized?.language || 'English',
|
|
254
|
-
segments: synthesized?.segments,
|
|
255
|
-
instructions: 'Speak naturally with curious, engaged intonation and a conversational pace.',
|
|
256
|
-
style: { emotion: 'curious', energy: 'medium' },
|
|
257
|
-
};
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
return {
|
|
261
|
-
text,
|
|
262
|
-
language: synthesized?.language || 'English',
|
|
263
|
-
segments: synthesized?.segments,
|
|
264
|
-
instructions: 'Speak naturally with light warmth and conversational pacing.',
|
|
265
|
-
style: { emotion: 'neutral', energy: 'medium' },
|
|
266
|
-
};
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
export {
|
|
270
|
-
inferSpokenMetadataFromContent,
|
|
271
|
-
normalizeSpokenMetadata,
|
|
272
|
-
normalizeSpeechText,
|
|
273
|
-
};
|
|
1
|
+
/**
 * Shared helpers for the hidden `metadata.spoken` speech sidecar.
 *
 * `normalizeSpokenMetadata` validates a caller-supplied sidecar,
 * `inferSpokenMetadataFromContent` synthesizes one from visible text, and
 * `normalizeSpeechText` turns chat-formatted text into plain speakable text.
 */

/**
 * Return the trimmed string, or `fallback` when `value` is not a string or is
 * blank after trimming.
 *
 * @param {unknown} value
 * @param {string} [fallback='']
 * @returns {string}
 */
function trimString(value, fallback = '') {
  if (typeof value !== 'string') return fallback;
  const trimmed = value.trim();
  return trimmed || fallback;
}

/**
 * Floor a finite number and bound it from above.
 *
 * Note the asymmetry: values above `max` are clamped to `max`, but values
 * below `min` (and non-numbers) yield `fallback`, not `min`.
 *
 * @param {unknown} value
 * @param {number} fallback
 * @param {{ min?: number, max?: number }} [bounds]
 * @returns {number}
 */
function clampInteger(value, fallback, { min = 1, max = Number.MAX_SAFE_INTEGER } = {}) {
  if (typeof value !== 'number' || !Number.isFinite(value)) return fallback;
  const normalized = Math.floor(value);
  if (normalized < min) return fallback;
  if (normalized > max) return max;
  return normalized;
}

// Accepted values for `spoken.language`; anything else is dropped.
const BOUNDED_LANGUAGE_TYPES = new Set([
  'Auto',
  'Chinese',
  'English',
  'German',
  'Italian',
  'Portuguese',
  'Spanish',
  'Japanese',
  'Korean',
  'French',
  'Russian',
]);

// Accepted per-segment prosody values; anything else is dropped.
const BOUNDED_PACE_VALUES = new Set(['very_slow', 'slow', 'medium', 'medium_fast', 'fast']);
const BOUNDED_PITCH_VALUES = new Set(['low', 'slightly_low', 'neutral', 'slightly_high', 'high']);
const BOUNDED_ENERGY_VALUES = new Set(['soft', 'calm', 'warm', 'bright', 'intense']);
const BOUNDED_VOLUME_VALUES = new Set(['soft', 'normal', 'projected']);

// One emoji, optionally chained to further emoji through zero-width joiners
// (family / profession sequences). Matching the joiner together with the emoji
// removes it only where it belonged to an emoji, leaving joiners in ordinary
// text untouched.
const EMOJI_SEQUENCE_PATTERN =
  /(?:\p{Extended_Pictographic}|\p{Emoji_Presentation})(?:\u200D(?:\p{Extended_Pictographic}|\p{Emoji_Presentation}))*/gu;

// Speaking presets used when the sidecar is synthesized from visible text.
const TONE_PRESETS = Object.freeze({
  upbeat: {
    instructions: 'Speak with warm, upbeat conversational energy and natural pacing.',
    style: { emotion: 'upbeat', energy: 'medium' },
  },
  gentle: {
    instructions: 'Speak gently and reassuringly, with a calm pace and soft emphasis.',
    style: { emotion: 'gentle', energy: 'low' },
  },
  curious: {
    instructions: 'Speak naturally with curious, engaged intonation and a conversational pace.',
    style: { emotion: 'curious', energy: 'medium' },
  },
  neutral: {
    instructions: 'Speak naturally with light warmth and conversational pacing.',
    style: { emotion: 'neutral', energy: 'medium' },
  },
});

/**
 * Pick the language label for synthesized speech.
 *
 * No detection is implemented: the result is always 'English', whatever the
 * text is. The parameter is kept so call sites do not change if detection is
 * added later.
 *
 * @param {string} text - Currently unused.
 * @returns {string}
 */
function inferSpokenLanguage(text) {
  return 'English';
}

/**
 * Validate one caller-supplied segment.
 *
 * @param {unknown} segment
 * @returns {object|null} `{ text, pace?, pitch?, energy?, volume?, pause_after_ms }`
 *   with out-of-vocabulary prosody values omitted, or null when the segment is
 *   not a plain object or has no text.
 */
function normalizeSpokenSegment(segment) {
  if (!segment || typeof segment !== 'object' || Array.isArray(segment)) return null;

  const text = trimString(segment.text);
  if (!text) return null;

  const normalized = { text };
  const pace = trimString(segment.pace);
  const pitch = trimString(segment.pitch);
  const energy = trimString(segment.energy);
  const volume = trimString(segment.volume);
  const pauseAfterMs = clampInteger(segment.pause_after_ms, 0, { min: 0, max: 1200 });

  if (BOUNDED_PACE_VALUES.has(pace)) normalized.pace = pace;
  if (BOUNDED_PITCH_VALUES.has(pitch)) normalized.pitch = pitch;
  if (BOUNDED_ENERGY_VALUES.has(energy)) normalized.energy = energy;
  if (BOUNDED_VOLUME_VALUES.has(volume)) normalized.volume = volume;
  normalized.pause_after_ms = pauseAfterMs;

  return normalized;
}

/**
 * Remove emoji from text, including the invisible pieces of emoji sequences.
 *
 * Variation selectors and the combining keycap mark are dropped first, then
 * whole emoji sequences (emoji plus the joiners between them) are removed, so
 * no stray U+200D or U+20E3 is left in the spoken text.
 *
 * @param {string} text
 * @returns {string}
 */
function stripEmoji(text) {
  return text
    .replace(/[\uFE0E\uFE0F\u20E3]/g, '')
    .replace(EMOJI_SEQUENCE_PATTERN, '');
}

/**
 * Convert chat-formatted text into plain text suitable for speech.
 *
 * Strips emoji and simple markdown emphasis/code markers, maps dashes and the
 * ellipsis character to ASCII punctuation, and tidies whitespace around
 * punctuation.
 *
 * @param {unknown} text - Non-string input yields an empty string.
 * @returns {string}
 */
function normalizeSpeechText(text) {
  if (typeof text !== 'string') return '';

  return stripEmoji(text)
    .replace(/\*\*(.*?)\*\*/g, '$1')
    .replace(/__(.*?)__/g, '$1')
    .replace(/`([^`]+)`/g, '$1')
    .replace(/[\u2013\u2014]/g, ', ')
    .replace(/\u2026/g, '...')
    .replace(/\s+/g, ' ')
    .replace(/\s+([,.;!?])/g, '$1')
    // Add the missing space after punctuation ("Hi,there" -> "Hi, there"), but
    // leave punctuation runs ("...", "!?") and digit groups ("3.14", "1,000")
    // intact; splitting those would create bogus one-character segments.
    .replace(/([,.;!?])(?=[^\s,.;!?])(?!(?<=\d[.,])\d)/g, '$1 ')
    .replace(/,\s*,+/g, ', ')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Split text into at most five speakable segments.
 *
 * Sentences are the base unit; a sentence longer than 96 characters is further
 * split at commas. If more than five segments result, the tail is merged into
 * the fifth.
 *
 * @param {string} text
 * @returns {string[]}
 */
function splitSpeechSegments(text) {
  const normalized = normalizeSpeechText(text);
  if (!normalized) return [];

  const baseSegments = normalized
    .split(/(?<=[.!?])\s+/)
    .map((segment) => segment.trim())
    .filter(Boolean);

  const segments = [];
  for (const segment of baseSegments) {
    if (segment.length <= 96) {
      segments.push(segment);
      continue;
    }

    const clauseParts = segment
      .split(/,\s+/)
      .map((part) => part.trim())
      .filter(Boolean);

    if (clauseParts.length > 1) {
      for (let index = 0; index < clauseParts.length; index += 1) {
        const part = clauseParts[index];
        // The split consumed the comma; restore it on every clause but the last.
        const needsComma = index < clauseParts.length - 1 && !/[.!?]$/.test(part);
        segments.push(needsComma ? `${part},` : part);
      }
      continue;
    }

    segments.push(segment);
  }

  if (segments.length <= 5) return segments;

  return [...segments.slice(0, 4), segments.slice(4).join(' ').trim()];
}

/**
 * Choose prosody for one synthesized segment from surface cues in its text.
 *
 * Precedence is question > exclamation/enthusiasm keyword > reflective
 * phrasing or length > default. The final segment never gets a trailing pause.
 *
 * @param {string} segmentText
 * @param {number} index - Position of the segment.
 * @param {number} totalSegments
 * @returns {{ pace: string, pitch: string, energy: string, volume: string, pause_after_ms: number }}
 */
function inferSegmentStyle(segmentText, index, totalSegments) {
  const normalized = segmentText.toLowerCase();
  const exclamatory = /!/.test(segmentText) || /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(normalized);
  const curious = /\?/.test(segmentText);
  const reflective =
    /\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(normalized) ||
    segmentText.length > 60;
  const isLast = index >= totalSegments - 1;

  if (curious) {
    return {
      pace: 'medium',
      pitch: 'slightly_high',
      energy: 'warm',
      volume: 'normal',
      pause_after_ms: 0,
    };
  }

  if (exclamatory) {
    return {
      pace: 'medium_fast',
      pitch: 'slightly_high',
      energy: 'bright',
      volume: 'normal',
      pause_after_ms: isLast ? 0 : 220,
    };
  }

  if (reflective) {
    return {
      pace: 'medium',
      pitch: 'neutral',
      energy: 'warm',
      volume: 'normal',
      pause_after_ms: isLast ? 0 : 260,
    };
  }

  return {
    pace: 'medium',
    pitch: 'neutral',
    energy: 'warm',
    volume: 'normal',
    pause_after_ms: isLast ? 0 : 180,
  };
}

/**
 * Build language and styled segments for a piece of text.
 *
 * @param {string} text
 * @returns {{ language: string, segments: object[] }|null} null when the text
 *   yields no segments.
 */
function synthesizeSpokenSegments(text) {
  const language = inferSpokenLanguage(text);
  const rawSegments = splitSpeechSegments(text);
  if (rawSegments.length === 0) return null;

  const segments = rawSegments.map((segmentText, index) => ({
    text: segmentText,
    ...inferSegmentStyle(segmentText, index, rawSegments.length),
  }));

  return {
    language,
    segments,
  };
}

/**
 * Validate a caller-supplied `metadata.spoken` object.
 *
 * Unknown languages and prosody values are dropped; a missing language or
 * missing/empty segment list is filled in from the spoken text.
 *
 * @param {unknown} spoken
 * @returns {object|null} The normalized sidecar, or null when `spoken` is not
 *   a plain object or carries no text.
 */
function normalizeSpokenMetadata(spoken) {
  if (!spoken || typeof spoken !== 'object' || Array.isArray(spoken)) return null;

  const text = trimString(spoken.text);
  if (!text) return null;

  const normalized = { text };
  const language = trimString(spoken.language);
  if (BOUNDED_LANGUAGE_TYPES.has(language)) {
    normalized.language = language;
  }

  const explicitSegments =
    Array.isArray(spoken.segments)
      ? spoken.segments.map((segment) => normalizeSpokenSegment(segment)).filter(Boolean)
      : [];
  if (explicitSegments.length > 0) {
    normalized.segments = explicitSegments;
  }

  const instructions = trimString(spoken.instructions);
  if (instructions) normalized.instructions = instructions;
  if (spoken.style && typeof spoken.style === 'object' && !Array.isArray(spoken.style)) {
    normalized.style = spoken.style;
  }

  // Fill whatever the caller left out from the spoken text itself.
  const fallbackSegments = synthesizeSpokenSegments(text);
  if (!normalized.language && fallbackSegments?.language) {
    normalized.language = fallbackSegments.language;
  }
  if (!normalized.segments && fallbackSegments?.segments?.length) {
    normalized.segments = fallbackSegments.segments;
  }

  return normalized;
}

/**
 * Classify the overall tone of normalized text.
 * Precedence: upbeat > gentle > curious > neutral.
 *
 * @param {string} text - Already-normalized speech text.
 * @returns {'upbeat'|'gentle'|'curious'|'neutral'}
 */
function detectTone(text) {
  const lowered = text.toLowerCase();
  if (/!/.test(text) || /\b(hell yeah|awesome|amazing|great|stoked|love|glad|perfect|nice|cool)\b/.test(lowered)) {
    return 'upbeat';
  }
  if (/\b(sorry|gentle|softly|careful|reassuring|calm|okay|it'?s okay|i know)\b/.test(lowered)) {
    return 'gentle';
  }
  if (/\?/.test(text)) {
    return 'curious';
  }
  return 'neutral';
}

/**
 * Synthesize a `metadata.spoken` sidecar from visible message content.
 *
 * @param {unknown} content - Visible message text; non-strings and blank
 *   strings yield null.
 * @returns {object|null} `{ text, language, segments, instructions, style }`,
 *   or null when nothing speakable remains after normalization.
 */
function inferSpokenMetadataFromContent(content) {
  const text = normalizeSpeechText(trimString(content));
  if (!text) return null;

  const synthesized = synthesizeSpokenSegments(text);
  const preset = TONE_PRESETS[detectTone(text)];

  return {
    text,
    language: synthesized?.language || 'English',
    segments: synthesized?.segments,
    instructions: preset.instructions,
    // Copy so callers can never mutate the shared preset.
    style: { ...preset.style },
  };
}

export {
  inferSpokenMetadataFromContent,
  normalizeSpokenMetadata,
  normalizeSpeechText,
};
|
package/openclaw.extension.js
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
|
|
1
|
+
import { inferSpokenMetadataFromContent, normalizeSpokenMetadata } from './lib/spokenMetadata.js';
|
|
2
|
+
|
|
3
|
+
const CHANNEL_ID = 'oomi';
|
|
2
4
|
const DEFAULT_SESSION_KEY = 'agent:main:webchat:channel:oomi';
|
|
3
5
|
const DEFAULT_TIMEOUT_MS = 15000;
|
|
4
6
|
|
|
@@ -178,272 +180,15 @@ function extractCorrelationId(payload) {
|
|
|
178
180
|
return '';
|
|
179
181
|
}
|
|
180
182
|
|
|
181
|
-
const BOUNDED_LANGUAGE_TYPES = new Set([
|
|
182
|
-
'Auto',
|
|
183
|
-
'Chinese',
|
|
184
|
-
'English',
|
|
185
|
-
'German',
|
|
186
|
-
'Italian',
|
|
187
|
-
'Portuguese',
|
|
188
|
-
'Spanish',
|
|
189
|
-
'Japanese',
|
|
190
|
-
'Korean',
|
|
191
|
-
'French',
|
|
192
|
-
'Russian',
|
|
193
|
-
]);
|
|
194
|
-
|
|
195
|
-
const BOUNDED_PACE_VALUES = new Set(['very_slow', 'slow', 'medium', 'medium_fast', 'fast']);
|
|
196
|
-
const BOUNDED_PITCH_VALUES = new Set(['low', 'slightly_low', 'neutral', 'slightly_high', 'high']);
|
|
197
|
-
const BOUNDED_ENERGY_VALUES = new Set(['soft', 'calm', 'warm', 'bright', 'intense']);
|
|
198
|
-
const BOUNDED_VOLUME_VALUES = new Set(['soft', 'normal', 'projected']);
|
|
199
|
-
|
|
200
|
-
function inferSpokenLanguage(text) {
|
|
201
|
-
const normalized = toString(text);
|
|
202
|
-
if (!normalized) return 'English';
|
|
203
|
-
return 'English';
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
function normalizeSpokenSegment(segment) {
|
|
207
|
-
if (!segment || typeof segment !== 'object' || Array.isArray(segment)) return null;
|
|
208
|
-
|
|
209
|
-
const text = toString(segment.text);
|
|
210
|
-
if (!text) return null;
|
|
211
|
-
|
|
212
|
-
const normalized = { text };
|
|
213
|
-
const pace = toString(segment.pace);
|
|
214
|
-
const pitch = toString(segment.pitch);
|
|
215
|
-
const energy = toString(segment.energy);
|
|
216
|
-
const volume = toString(segment.volume);
|
|
217
|
-
const pauseAfterMs = toNumber(segment.pause_after_ms, 0, { min: 0, max: 1200 });
|
|
218
|
-
|
|
219
|
-
if (BOUNDED_PACE_VALUES.has(pace)) normalized.pace = pace;
|
|
220
|
-
if (BOUNDED_PITCH_VALUES.has(pitch)) normalized.pitch = pitch;
|
|
221
|
-
if (BOUNDED_ENERGY_VALUES.has(energy)) normalized.energy = energy;
|
|
222
|
-
if (BOUNDED_VOLUME_VALUES.has(volume)) normalized.volume = volume;
|
|
223
|
-
normalized.pause_after_ms = pauseAfterMs;
|
|
224
|
-
|
|
225
|
-
return normalized;
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
function splitSpeechSegments(text) {
|
|
229
|
-
const normalized = normalizeSpeechText(text);
|
|
230
|
-
if (!normalized) return [];
|
|
231
|
-
|
|
232
|
-
const baseSegments = normalized
|
|
233
|
-
.split(/(?<=[.!?])\s+/)
|
|
234
|
-
.map((segment) => segment.trim())
|
|
235
|
-
.filter(Boolean);
|
|
236
|
-
|
|
237
|
-
const segments = [];
|
|
238
|
-
for (const segment of baseSegments) {
|
|
239
|
-
if (segment.length <= 96) {
|
|
240
|
-
segments.push(segment);
|
|
241
|
-
continue;
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
const clauseParts = segment
|
|
245
|
-
.split(/,\s+/)
|
|
246
|
-
.map((part) => part.trim())
|
|
247
|
-
.filter(Boolean);
|
|
248
|
-
|
|
249
|
-
if (clauseParts.length > 1) {
|
|
250
|
-
for (let index = 0; index < clauseParts.length; index += 1) {
|
|
251
|
-
const part = clauseParts[index];
|
|
252
|
-
const needsComma = index < clauseParts.length - 1 && !/[.!?]$/.test(part);
|
|
253
|
-
segments.push(needsComma ? `${part},` : part);
|
|
254
|
-
}
|
|
255
|
-
continue;
|
|
256
|
-
}
|
|
257
|
-
|
|
258
|
-
segments.push(segment);
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
if (segments.length <= 5) return segments;
|
|
262
|
-
|
|
263
|
-
return [...segments.slice(0, 4), segments.slice(4).join(' ').trim()];
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
function inferSegmentStyle(segmentText, index, totalSegments) {
|
|
267
|
-
const normalized = segmentText.toLowerCase();
|
|
268
|
-
const exclamatory = /!/.test(segmentText) || /\b(hell yeah|awesome|amazing|stoked|love|perfect|great)\b/.test(normalized);
|
|
269
|
-
const curious = /\?/.test(segmentText);
|
|
270
|
-
const reflective =
|
|
271
|
-
/\b(i think|i'm|i am|i've|i have|lately|right now|before this|each time|understand|it feels like)\b/.test(normalized) ||
|
|
272
|
-
segmentText.length > 60;
|
|
273
|
-
|
|
274
|
-
if (curious) {
|
|
275
|
-
return {
|
|
276
|
-
pace: 'medium',
|
|
277
|
-
pitch: 'slightly_high',
|
|
278
|
-
energy: 'warm',
|
|
279
|
-
volume: 'normal',
|
|
280
|
-
pause_after_ms: 0,
|
|
281
|
-
};
|
|
282
|
-
}
|
|
283
|
-
|
|
284
|
-
if (exclamatory) {
|
|
285
|
-
return {
|
|
286
|
-
pace: 'medium_fast',
|
|
287
|
-
pitch: 'slightly_high',
|
|
288
|
-
energy: 'bright',
|
|
289
|
-
volume: 'normal',
|
|
290
|
-
pause_after_ms: index < totalSegments - 1 ? 220 : 0,
|
|
291
|
-
};
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
if (reflective) {
|
|
295
|
-
return {
|
|
296
|
-
pace: 'medium',
|
|
297
|
-
pitch: 'neutral',
|
|
298
|
-
energy: 'warm',
|
|
299
|
-
volume: 'normal',
|
|
300
|
-
pause_after_ms: index < totalSegments - 1 ? 260 : 0,
|
|
301
|
-
};
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
return {
|
|
305
|
-
pace: 'medium',
|
|
306
|
-
pitch: 'neutral',
|
|
307
|
-
energy: 'warm',
|
|
308
|
-
volume: 'normal',
|
|
309
|
-
pause_after_ms: index < totalSegments - 1 ? 180 : 0,
|
|
310
|
-
};
|
|
311
|
-
}
|
|
312
|
-
|
|
313
|
-
function synthesizeSpokenSegments(text) {
|
|
314
|
-
const language = inferSpokenLanguage(text);
|
|
315
|
-
const rawSegments = splitSpeechSegments(text);
|
|
316
|
-
if (rawSegments.length === 0) return null;
|
|
317
|
-
|
|
318
|
-
const segments = rawSegments.map((segmentText, index) => ({
|
|
319
|
-
text: segmentText,
|
|
320
|
-
...inferSegmentStyle(segmentText, index, rawSegments.length),
|
|
321
|
-
}));
|
|
322
|
-
|
|
323
|
-
return {
|
|
324
|
-
language,
|
|
325
|
-
segments,
|
|
326
|
-
};
|
|
327
|
-
}
|
|
328
|
-
|
|
329
|
-
function normalizeSpokenMetadata(spoken) {
|
|
330
|
-
if (!spoken || typeof spoken !== 'object' || Array.isArray(spoken)) return null;
|
|
331
|
-
|
|
332
|
-
const text = toString(spoken.text);
|
|
333
|
-
if (!text) return null;
|
|
334
|
-
|
|
335
|
-
const normalized = { text };
|
|
336
|
-
const language = toString(spoken.language);
|
|
337
|
-
if (BOUNDED_LANGUAGE_TYPES.has(language)) {
|
|
338
|
-
normalized.language = language;
|
|
339
|
-
}
|
|
340
|
-
|
|
341
|
-
const explicitSegments =
|
|
342
|
-
Array.isArray(spoken.segments)
|
|
343
|
-
? spoken.segments.map((segment) => normalizeSpokenSegment(segment)).filter(Boolean)
|
|
344
|
-
: [];
|
|
345
|
-
if (explicitSegments.length > 0) {
|
|
346
|
-
normalized.segments = explicitSegments;
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
const instructions = toString(spoken.instructions);
|
|
350
|
-
if (instructions) normalized.instructions = instructions;
|
|
351
|
-
if (spoken.style && typeof spoken.style === 'object' && !Array.isArray(spoken.style)) {
|
|
352
|
-
normalized.style = spoken.style;
|
|
353
|
-
}
|
|
354
|
-
|
|
355
|
-
const fallbackSegments = synthesizeSpokenSegments(text);
|
|
356
|
-
if (!normalized.language && fallbackSegments?.language) {
|
|
357
|
-
normalized.language = fallbackSegments.language;
|
|
358
|
-
}
|
|
359
|
-
if (!normalized.segments && fallbackSegments?.segments?.length) {
|
|
360
|
-
normalized.segments = fallbackSegments.segments;
|
|
361
|
-
}
|
|
362
|
-
|
|
363
|
-
return normalized;
|
|
364
|
-
}
|
|
365
|
-
|
|
366
|
-
function stripEmoji(text) {
|
|
367
|
-
return text.replace(/[\uFE0E\uFE0F]/g, '').replace(/\p{Extended_Pictographic}|\p{Emoji_Presentation}/gu, '');
|
|
368
|
-
}
|
|
369
|
-
|
|
370
|
-
function normalizeSpeechText(text) {
|
|
371
|
-
return stripEmoji(text)
|
|
372
|
-
.replace(/\*\*(.*?)\*\*/g, '$1')
|
|
373
|
-
.replace(/__(.*?)__/g, '$1')
|
|
374
|
-
.replace(/`([^`]+)`/g, '$1')
|
|
375
|
-
.replace(/[–—]/g, ', ')
|
|
376
|
-
.replace(/…/g, '...')
|
|
377
|
-
.replace(/\s+/g, ' ')
|
|
378
|
-
.replace(/\s+([,.;!?])/g, '$1')
|
|
379
|
-
.replace(/([,.;!?])(?=[^\s])/g, '$1 ')
|
|
380
|
-
.replace(/,\s*,+/g, ', ')
|
|
381
|
-
.replace(/\s+/g, ' ')
|
|
382
|
-
.trim();
|
|
383
|
-
}
|
|
384
|
-
|
|
385
|
-
function inferSpokenMetadataFromContent(content) {
|
|
386
|
-
const text = normalizeSpeechText(toString(content));
|
|
387
|
-
if (!text) return null;
|
|
388
|
-
const synthesized = synthesizeSpokenSegments(text);
|
|
389
|
-
|
|
390
|
-
const normalized = text.toLowerCase();
|
|
391
|
-
const upbeat =
|
|
392
|
-
/!/.test(text) ||
|
|
393
|
-
/\b(hell yeah|awesome|amazing|great|stoked|love|glad|perfect|nice|cool)\b/.test(normalized);
|
|
394
|
-
const gentle =
|
|
395
|
-
/\b(sorry|gentle|softly|careful|reassuring|calm|okay|it'?s okay|i know)\b/.test(normalized);
|
|
396
|
-
const curious = /\?/.test(text);
|
|
397
|
-
|
|
398
|
-
if (upbeat) {
|
|
399
|
-
return {
|
|
400
|
-
text,
|
|
401
|
-
language: synthesized?.language || 'English',
|
|
402
|
-
segments: synthesized?.segments,
|
|
403
|
-
instructions: 'Speak with warm, upbeat conversational energy and natural pacing.',
|
|
404
|
-
style: { emotion: 'upbeat', energy: 'medium' },
|
|
405
|
-
};
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
if (gentle) {
|
|
409
|
-
return {
|
|
410
|
-
text,
|
|
411
|
-
language: synthesized?.language || 'English',
|
|
412
|
-
segments: synthesized?.segments,
|
|
413
|
-
instructions: 'Speak gently and reassuringly, with a calm pace and soft emphasis.',
|
|
414
|
-
style: { emotion: 'gentle', energy: 'low' },
|
|
415
|
-
};
|
|
416
|
-
}
|
|
417
|
-
|
|
418
|
-
if (curious) {
|
|
419
|
-
return {
|
|
420
|
-
text,
|
|
421
|
-
language: synthesized?.language || 'English',
|
|
422
|
-
segments: synthesized?.segments,
|
|
423
|
-
instructions: 'Speak naturally with curious, engaged intonation and a conversational pace.',
|
|
424
|
-
style: { emotion: 'curious', energy: 'medium' },
|
|
425
|
-
};
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
return {
|
|
429
|
-
text,
|
|
430
|
-
language: synthesized?.language || 'English',
|
|
431
|
-
segments: synthesized?.segments,
|
|
432
|
-
instructions: 'Speak naturally with light warmth and conversational pacing.',
|
|
433
|
-
style: { emotion: 'neutral', energy: 'medium' },
|
|
434
|
-
};
|
|
435
|
-
}
|
|
436
|
-
|
|
437
183
|
function normalizeOutgoingMetadata(payloadMetadata, { accountId, correlationId, content }) {
|
|
438
184
|
const metadata =
|
|
439
185
|
payloadMetadata && typeof payloadMetadata === 'object' && !Array.isArray(payloadMetadata)
|
|
440
186
|
? { ...payloadMetadata }
|
|
441
187
|
: {};
|
|
442
188
|
|
|
443
|
-
const
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
(!explicitSpokenPresent ? inferSpokenMetadataFromContent(content) : null);
|
|
189
|
+
const spoken =
|
|
190
|
+
normalizeSpokenMetadata(metadata.spoken) ||
|
|
191
|
+
inferSpokenMetadataFromContent(content);
|
|
447
192
|
if (spoken) {
|
|
448
193
|
metadata.spoken = spoken;
|
|
449
194
|
} else {
|
package/openclaw.plugin.json
CHANGED
package/package.json
CHANGED
package/skills/oomi/SKILL.md
CHANGED
|
@@ -168,16 +168,16 @@ Use this shape when a voice turn needs more natural delivery without changing vi
|
|
|
168
168
|
}
|
|
169
169
|
```
|
|
170
170
|
|
|
171
|
-
Rules:
|
|
172
|
-
- keep visible assistant `content` clean and user-facing
|
|
173
|
-
- do not place raw intonation tags in visible chat
|
|
174
|
-
- for managed voice replies, include `metadata.spoken` when
|
|
175
|
-
- `metadata.spoken.text` is backend TTS input only
|
|
176
|
-
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
177
|
-
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, volume, and pause timing
|
|
178
|
-
- `metadata.spoken.instructions` should use natural-language speaking guidance
|
|
179
|
-
- if the
|
|
180
|
-
-
|
|
171
|
+
Rules:
|
|
172
|
+
- keep visible assistant `content` clean and user-facing
|
|
173
|
+
- do not place raw intonation tags in visible chat
|
|
174
|
+
- for managed cloned-voice replies, include `metadata.spoken` when backend TTS should speak the turn
|
|
175
|
+
- `metadata.spoken.text` is backend TTS input only
|
|
176
|
+
- `metadata.spoken.language` should be one of the supported Qwen language values such as `English`
|
|
177
|
+
- `metadata.spoken.segments` can carry bounded per-segment prosody for pace, pitch, volume, and pause timing
|
|
178
|
+
- `metadata.spoken.instructions` should use natural-language speaking guidance
|
|
179
|
+
- if you omit `metadata.spoken`, the shared package helper may synthesize it as a compatibility guardrail before backend TTS
|
|
180
|
+
- backend cloned voice is strict: if `metadata.spoken` does not reach Oomi, playback fails instead of falling back to flat speech
|
|
181
181
|
|
|
182
182
|
## Avatar Control
|
|
183
183
|
|