@omote/core 0.5.2 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +222 -443
- package/dist/index.d.mts +79 -828
- package/dist/index.d.ts +79 -828
- package/dist/index.js +180 -1314
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +180 -1314
- package/dist/index.mjs.map +1 -1
- package/package.json +7 -3
package/README.md
CHANGED
|
@@ -1,19 +1,21 @@
|
|
|
1
1
|
# @omote/core
|
|
2
2
|
|
|
3
|
-
>
|
|
3
|
+
> Client-side AI inference for real-time lip sync, speech recognition, and avatar animation — runs entirely in the browser via WebGPU and WASM.
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
|
-
- **Lip Sync
|
|
8
|
-
- **
|
|
9
|
-
- **
|
|
10
|
-
- **Voice Activity Detection**
|
|
11
|
-
- **
|
|
12
|
-
- **
|
|
13
|
-
- **
|
|
14
|
-
- **
|
|
15
|
-
- **
|
|
16
|
-
- **
|
|
7
|
+
- **Lip Sync (A2E)** — Audio to 52 ARKit blendshapes via Wav2Vec2, with automatic GPU/CPU platform detection
|
|
8
|
+
- **Full-Face Pipeline** — TTS audio playback to lip sync with ExpressionProfile scaling, gapless scheduling
|
|
9
|
+
- **Speech Recognition** — SenseVoice ASR (ONNX), 15x faster than Whisper, progressive transcription
|
|
10
|
+
- **Voice Activity Detection** — Silero VAD with Worker and main-thread modes
|
|
11
|
+
- **Text-to-Speech** — ChatterboxTurbo (experimental, use server-side TTS for production)
|
|
12
|
+
- **Animation Graph** — State machine (idle/listening/thinking/speaking) with emotion blending
|
|
13
|
+
- **Emotion Controller** — Preset-based emotion system with smooth transitions
|
|
14
|
+
- **Model Caching** — IndexedDB with versioning, LRU eviction, and quota monitoring
|
|
15
|
+
- **Microphone Capture** — Browser noise suppression, echo cancellation, AGC
|
|
16
|
+
- **Logging & Telemetry** — Structured logging (6 levels) and OpenTelemetry-compatible tracing
|
|
17
|
+
- **Offline Ready** — No cloud dependencies, works entirely without internet
|
|
18
|
+
- **WebGPU + WASM** — WebGPU-first with automatic WASM fallback
|
|
17
19
|
|
|
18
20
|
## Installation
|
|
19
21
|
|
|
@@ -21,563 +23,340 @@
|
|
|
21
23
|
npm install @omote/core
|
|
22
24
|
```
|
|
23
25
|
|
|
26
|
+
Peer dependency: `onnxruntime-web` is included — no additional installs needed.
|
|
27
|
+
|
|
24
28
|
## Quick Start
|
|
25
29
|
|
|
26
|
-
### Lip Sync
|
|
30
|
+
### FullFacePipeline (TTS Lip Sync)
|
|
31
|
+
|
|
32
|
+
The most common use case: feed TTS audio chunks and get back frames of 52 ARKit blendshape weights at render rate.
|
|
27
33
|
|
|
28
34
|
```typescript
|
|
29
|
-
import {
|
|
35
|
+
import { FullFacePipeline, createA2E } from '@omote/core';
|
|
30
36
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
37
|
+
// 1. Create A2E backend (auto-detects GPU vs CPU)
|
|
38
|
+
const lam = createA2E({
|
|
39
|
+
gpuModelUrl: '/models/lam-wav2vec2.onnx',
|
|
40
|
+
cpuModelUrl: '/models/wav2arkit_cpu.onnx',
|
|
41
|
+
mode: 'auto',
|
|
34
42
|
});
|
|
35
|
-
|
|
36
43
|
await lam.load();
|
|
37
44
|
|
|
38
|
-
//
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
}
|
|
46
|
-
```
|
|
47
|
-
|
|
48
|
-
### Speech-to-Text (WhisperInference)
|
|
49
|
-
|
|
50
|
-
```typescript
|
|
51
|
-
import { WhisperInference } from '@omote/core';
|
|
45
|
+
// 2. Create pipeline with expression profile
|
|
46
|
+
const pipeline = new FullFacePipeline({
|
|
47
|
+
lam,
|
|
48
|
+
sampleRate: 16000,
|
|
49
|
+
profile: { mouth: 1.0, jaw: 1.0, brows: 0.6, eyes: 0.0, cheeks: 0.5, nose: 0.3, tongue: 0.5 },
|
|
50
|
+
});
|
|
51
|
+
await pipeline.initialize();
|
|
52
52
|
|
|
53
|
-
//
|
|
54
|
-
|
|
55
|
-
|
|
53
|
+
// 3. Listen for blendshape frames
|
|
54
|
+
pipeline.on('full_frame_ready', (frame) => {
|
|
55
|
+
applyToAvatar(frame.blendshapes); // ExpressionProfile-scaled, 52 ARKit weights
|
|
56
|
+
});
|
|
56
57
|
|
|
57
|
-
|
|
58
|
-
|
|
58
|
+
// 4. Feed TTS audio and play
|
|
59
|
+
pipeline.start();
|
|
60
|
+
await pipeline.onAudioChunk(ttsAudioChunk); // Uint8Array PCM16
|
|
61
|
+
await pipeline.end(); // Flush remaining audio
|
|
59
62
|
```
|
|
60
63
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
```typescript
|
|
64
|
-
import { DistilHuBERTEmotionInference, EMOTION_LABELS } from '@omote/core';
|
|
64
|
+
## API Reference
|
|
65
65
|
|
|
66
|
-
|
|
67
|
-
modelUrl: '/models/distilhubert-emotion.onnx'
|
|
68
|
-
});
|
|
69
|
-
await emotion.load();
|
|
66
|
+
### A2E (Audio to Expression)
|
|
70
67
|
|
|
71
|
-
|
|
72
|
-
console.log(detected); // 'happy', 'sad', 'angry', etc.
|
|
73
|
-
```
|
|
68
|
+
#### Factory API (Recommended)
|
|
74
69
|
|
|
75
|
-
|
|
70
|
+
Auto-detects the platform: Chrome, Edge, and Android use WebGPU; Safari and iOS use the WASM CPU fallback.
|
|
76
71
|
|
|
77
72
|
```typescript
|
|
78
|
-
import {
|
|
73
|
+
import { createA2E } from '@omote/core';
|
|
79
74
|
|
|
80
|
-
const
|
|
81
|
-
|
|
75
|
+
const a2e = createA2E({
|
|
76
|
+
gpuModelUrl: '/models/lam-wav2vec2.onnx', // 384MB, WebGPU
|
|
77
|
+
cpuModelUrl: '/models/wav2arkit_cpu.onnx', // 404MB, WASM
|
|
78
|
+
mode: 'auto', // 'auto' | 'gpu' | 'cpu'
|
|
79
|
+
fallbackOnError: true, // GPU failure → auto-switch to CPU
|
|
82
80
|
});
|
|
83
|
-
await
|
|
81
|
+
await a2e.load();
|
|
84
82
|
|
|
85
|
-
const {
|
|
83
|
+
const { blendshapes } = await a2e.infer(audioSamples); // Float32Array (16kHz)
|
|
84
|
+
// → 52 ARKit blendshape weights
|
|
86
85
|
```
|
|
87
86
|
|
|
88
|
-
|
|
87
|
+
#### Direct API
|
|
89
88
|
|
|
90
89
|
```typescript
|
|
91
|
-
import {
|
|
92
|
-
EmotionController,
|
|
93
|
-
createEmotionVector,
|
|
94
|
-
EmotionPresets,
|
|
95
|
-
} from '@omote/core';
|
|
96
|
-
|
|
97
|
-
// Create emotion vectors
|
|
98
|
-
const emotion = createEmotionVector({ joy: 0.8, amazement: 0.2 });
|
|
90
|
+
import { Wav2Vec2Inference, LAM_BLENDSHAPES } from '@omote/core';
|
|
99
91
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
controller.setPreset('happy');
|
|
103
|
-
controller.transitionTo({ sadness: 0.7 }, 500); // 500ms transition
|
|
92
|
+
const lam = new Wav2Vec2Inference({ modelUrl: '/models/lam-wav2vec2.onnx' });
|
|
93
|
+
await lam.load();
|
|
104
94
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
const currentEmotion = controller.emotion; // Float32Array(26)
|
|
95
|
+
const { blendshapes } = await lam.infer(audioSamples);
|
|
96
|
+
const jawOpen = blendshapes[LAM_BLENDSHAPES.indexOf('jawOpen')];
|
|
108
97
|
```
|
|
109
98
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
**Presets:** `neutral`, `happy`, `sad`, `angry`, `surprised`, `scared`, `disgusted`, `excited`, `tired`, `playful`, `pained`, `contemplative`
|
|
99
|
+
### FullFacePipeline
|
|
113
100
|
|
|
114
|
-
|
|
101
|
+
End-to-end TTS playback with lip sync inference, audio scheduling, and ExpressionProfile scaling.
|
|
115
102
|
|
|
116
103
|
```typescript
|
|
117
|
-
import {
|
|
104
|
+
import { FullFacePipeline } from '@omote/core';
|
|
118
105
|
|
|
119
|
-
const
|
|
106
|
+
const pipeline = new FullFacePipeline({
|
|
107
|
+
lam, // A2E backend from createA2E()
|
|
120
108
|
sampleRate: 16000,
|
|
121
|
-
|
|
109
|
+
profile: { mouth: 1.0, jaw: 1.0, brows: 0.6, eyes: 0.0, cheeks: 0.5, nose: 0.3, tongue: 0.5 },
|
|
122
110
|
});
|
|
111
|
+
await pipeline.initialize();
|
|
123
112
|
|
|
124
|
-
|
|
125
|
-
//
|
|
126
|
-
|
|
113
|
+
pipeline.on('full_frame_ready', (frame) => {
|
|
114
|
+
// frame.blendshapes — ExpressionProfile-scaled
|
|
115
|
+
// frame.rawBlendshapes — unscaled original values
|
|
116
|
+
applyToAvatar(frame.blendshapes);
|
|
127
117
|
});
|
|
128
118
|
|
|
129
|
-
|
|
119
|
+
pipeline.start();
|
|
120
|
+
await pipeline.onAudioChunk(chunk); // feed TTS audio (Uint8Array PCM16)
|
|
121
|
+
await pipeline.end(); // flush final partial chunk
|
|
130
122
|
```
|
|
131
123
|
|
|
132
|
-
###
|
|
124
|
+
### A2EProcessor
|
|
133
125
|
|
|
134
|
-
|
|
135
|
-
import { configureLogging, createLogger } from '@omote/core';
|
|
126
|
+
Engine-agnostic audio-to-blendshapes processor for custom integrations. Supports pull mode (timestamped frames for TTS) and push mode (drip-feed for live mic).
|
|
136
127
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
level: 'debug', // 'error' | 'warn' | 'info' | 'debug' | 'trace' | 'verbose'
|
|
140
|
-
format: 'pretty', // 'json' | 'pretty'
|
|
141
|
-
enabled: true,
|
|
142
|
-
});
|
|
143
|
-
|
|
144
|
-
// Create module-specific loggers
|
|
145
|
-
const logger = createLogger('MyComponent');
|
|
146
|
-
|
|
147
|
-
logger.info('Model loaded', { backend: 'webgpu', loadTimeMs: 1234 });
|
|
148
|
-
logger.debug('Processing audio', { samples: 16000 });
|
|
149
|
-
logger.error('Failed to load', { error: err.message });
|
|
128
|
+
```typescript
|
|
129
|
+
import { A2EProcessor } from '@omote/core';
|
|
150
130
|
|
|
151
|
-
|
|
152
|
-
// {"timestamp":1704672000000,"level":"info","module":"MyComponent","message":"Model loaded","data":{"backend":"webgpu"}}
|
|
131
|
+
const processor = new A2EProcessor({ backend: lam, chunkSize: 16000 });
|
|
153
132
|
|
|
154
|
-
//
|
|
155
|
-
|
|
133
|
+
// Pull mode: timestamp audio for later retrieval
|
|
134
|
+
processor.pushAudio(samples, audioContext.currentTime + delay);
|
|
135
|
+
const frame = processor.getFrameForTime(audioContext.currentTime);
|
|
156
136
|
```
|
|
157
137
|
|
|
158
|
-
###
|
|
138
|
+
### Speech Recognition (SenseVoice)
|
|
159
139
|
|
|
160
|
-
|
|
140
|
+
SenseVoice ASR — 15x faster than Whisper, with progressive transcription and emotion detection.
|
|
161
141
|
|
|
162
142
|
```typescript
|
|
163
|
-
import {
|
|
143
|
+
import { SenseVoiceInference } from '@omote/core';
|
|
164
144
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
enabled: true,
|
|
168
|
-
serviceName: 'my-app',
|
|
169
|
-
exporter: 'console',
|
|
145
|
+
const asr = new SenseVoiceInference({
|
|
146
|
+
modelUrl: '/models/sensevoice/model.int8.onnx',
|
|
170
147
|
});
|
|
148
|
+
await asr.load();
|
|
171
149
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
enabled: true,
|
|
175
|
-
serviceName: 'my-app',
|
|
176
|
-
serviceVersion: '1.0.0',
|
|
177
|
-
exporter: 'otlp',
|
|
178
|
-
exporterConfig: {
|
|
179
|
-
endpoint: 'https://tempo.example.com',
|
|
180
|
-
headers: { 'Authorization': 'Bearer token' },
|
|
181
|
-
},
|
|
182
|
-
sampling: {
|
|
183
|
-
ratio: 0.1, // Sample 10% of traces
|
|
184
|
-
alwaysSampleErrors: true, // Always capture errors
|
|
185
|
-
},
|
|
186
|
-
});
|
|
150
|
+
const { text, emotion, language } = await asr.transcribe(audioSamples);
|
|
151
|
+
```
|
|
187
152
|
|
|
188
|
-
|
|
189
|
-
const telemetry = getTelemetry();
|
|
153
|
+
#### Platform-Aware ASR
|
|
190
154
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
span.setStatus('ok');
|
|
198
|
-
} catch (error) {
|
|
199
|
-
span.setStatus('error', error);
|
|
200
|
-
} finally {
|
|
201
|
-
span.end();
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
// Record metrics
|
|
205
|
-
telemetry.recordMetric('custom_counter', 1, 'counter', { label: 'value' });
|
|
206
|
-
telemetry.recordMetric('custom_gauge', 42.5, 'gauge');
|
|
155
|
+
```typescript
|
|
156
|
+
import { shouldUseNativeASR, SafariSpeechRecognition, SenseVoiceInference } from '@omote/core';
|
|
157
|
+
|
|
158
|
+
const asr = shouldUseNativeASR()
|
|
159
|
+
? new SafariSpeechRecognition({ language: 'en-US' })
|
|
160
|
+
: new SenseVoiceInference({ modelUrl: '/models/sensevoice/model.int8.onnx' });
|
|
207
161
|
```
|
|
208
162
|
|
|
209
|
-
###
|
|
163
|
+
### Voice Activity Detection (Silero VAD)
|
|
210
164
|
|
|
211
|
-
|
|
165
|
+
#### Factory API (Recommended)
|
|
212
166
|
|
|
213
167
|
```typescript
|
|
214
|
-
import {
|
|
215
|
-
|
|
216
|
-
// Fetch with automatic caching (used internally by inference classes)
|
|
217
|
-
const modelData = await fetchWithCache('/models/lam-wav2vec2.onnx', (loaded, total) => {
|
|
218
|
-
console.log(`Loading: ${formatBytes(loaded)} / ${formatBytes(total)}`);
|
|
219
|
-
});
|
|
168
|
+
import { createSileroVAD } from '@omote/core';
|
|
220
169
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
console.log(`Preloaded ${completed}/${total}: ${url}`);
|
|
170
|
+
const vad = createSileroVAD({
|
|
171
|
+
modelUrl: '/models/silero-vad.onnx',
|
|
172
|
+
threshold: 0.5,
|
|
173
|
+
// useWorker: true // Force off-main-thread
|
|
174
|
+
// useWorker: false // Force main thread
|
|
227
175
|
});
|
|
176
|
+
await vad.load();
|
|
228
177
|
|
|
229
|
-
|
|
230
|
-
const cache = getModelCache();
|
|
231
|
-
const stats = await cache.getStats();
|
|
232
|
-
console.log(`Cached: ${stats.modelCount} models, ${formatBytes(stats.totalSize)}`);
|
|
233
|
-
|
|
234
|
-
await cache.delete('/models/old-model.onnx');
|
|
235
|
-
await cache.clear(); // Clear all cached models
|
|
178
|
+
const { isSpeech, probability } = await vad.process(audioSamples);
|
|
236
179
|
```
|
|
237
180
|
|
|
238
|
-
|
|
181
|
+
#### Direct API
|
|
239
182
|
|
|
240
|
-
|
|
183
|
+
```typescript
|
|
184
|
+
import { SileroVADInference, SileroVADWorker } from '@omote/core';
|
|
241
185
|
|
|
242
|
-
|
|
186
|
+
// Main thread (mobile-friendly)
|
|
187
|
+
const vad = new SileroVADInference({ modelUrl: '/models/silero-vad.onnx' });
|
|
243
188
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
models/
|
|
247
|
-
lam-wav2vec2.onnx # LAM lip sync model
|
|
248
|
-
silero-vad.onnx # Voice activity detection
|
|
249
|
-
distilhubert-emotion.onnx # Emotion detection
|
|
189
|
+
// Web Worker (desktop, off-main-thread)
|
|
190
|
+
const vadWorker = new SileroVADWorker({ modelUrl: '/models/silero-vad.onnx' });
|
|
250
191
|
```
|
|
251
192
|
|
|
252
|
-
###
|
|
193
|
+
### Animation Graph
|
|
253
194
|
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
| Model | Size | Quality | Speed |
|
|
257
|
-
|-------|------|---------|-------|
|
|
258
|
-
| `tiny` | ~75MB | Good | Fastest |
|
|
259
|
-
| `base` | ~150MB | Better | Medium |
|
|
195
|
+
State machine for avatar animation states with emotion blending and audio energy.
|
|
260
196
|
|
|
261
197
|
```typescript
|
|
262
|
-
|
|
263
|
-
```
|
|
198
|
+
import { AnimationGraph, AudioEnergyAnalyzer, EmphasisDetector } from '@omote/core';
|
|
264
199
|
|
|
265
|
-
|
|
200
|
+
const graph = new AnimationGraph();
|
|
266
201
|
|
|
267
|
-
|
|
202
|
+
graph.on('state.change', ({ from, to, trigger }) => {
|
|
203
|
+
console.log(`${from} → ${to}`);
|
|
204
|
+
});
|
|
268
205
|
|
|
269
|
-
|
|
206
|
+
graph.on('output.update', (output) => applyToAvatar(output));
|
|
270
207
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
| `dispose()` | Release resources |
|
|
208
|
+
// State transitions
|
|
209
|
+
graph.trigger('user_speech_start'); // idle → listening
|
|
210
|
+
graph.trigger('transcript_ready'); // listening → thinking
|
|
211
|
+
graph.trigger('ai_audio_start'); // thinking → speaking
|
|
212
|
+
graph.trigger('ai_audio_end'); // speaking → idle
|
|
277
213
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
backend: 'webgpu' | 'wasm';
|
|
283
|
-
}
|
|
214
|
+
// Blend emotion and audio energy into output
|
|
215
|
+
graph.setEmotion('happy', 0.8);
|
|
216
|
+
graph.setAudioEnergy(0.7);
|
|
217
|
+
graph.update(deltaTime); // call each frame
|
|
284
218
|
```
|
|
285
219
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
Whisper speech-to-text.
|
|
220
|
+
**States:** `idle` → `listening` → `thinking` → `speaking` → `idle`
|
|
289
221
|
|
|
290
|
-
|
|
291
|
-
|--------|-------------|
|
|
292
|
-
| `new WhisperInference(config)` | Create with `{ model, modelUrl? }` |
|
|
293
|
-
| `load()` | Load encoder + decoder |
|
|
294
|
-
| `transcribe(audio)` | Transcribe audio |
|
|
295
|
-
| `dispose()` | Release resources |
|
|
222
|
+
### Emotion Controller
|
|
296
223
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
Speech emotion recognition (7 emotions).
|
|
300
|
-
|
|
301
|
-
| Method | Description |
|
|
302
|
-
|--------|-------------|
|
|
303
|
-
| `new DistilHuBERTEmotionInference(config)` | Create with `{ modelUrl }` |
|
|
304
|
-
| `load()` | Load ONNX model |
|
|
305
|
-
| `infer(audio)` | Detect emotion |
|
|
306
|
-
| `dispose()` | Release resources |
|
|
307
|
-
|
|
308
|
-
**Emotion Labels:** `angry`, `disgusted`, `fearful`, `happy`, `neutral`, `sad`, `surprised`
|
|
309
|
-
|
|
310
|
-
### SileroVADInference
|
|
311
|
-
|
|
312
|
-
Voice activity detection.
|
|
313
|
-
|
|
314
|
-
| Method | Description |
|
|
315
|
-
|--------|-------------|
|
|
316
|
-
| `new SileroVADInference(config)` | Create with `{ modelUrl }` |
|
|
317
|
-
| `load()` | Load ONNX model |
|
|
318
|
-
| `infer(audio)` | Detect speech |
|
|
319
|
-
| `dispose()` | Release resources |
|
|
320
|
-
|
|
321
|
-
### EmotionController
|
|
322
|
-
|
|
323
|
-
Emotion state with smooth transitions.
|
|
324
|
-
|
|
325
|
-
| Method | Description |
|
|
326
|
-
|--------|-------------|
|
|
327
|
-
| `set(weights)` | Set emotion immediately |
|
|
328
|
-
| `setPreset(name)` | Set preset immediately |
|
|
329
|
-
| `transitionTo(weights, ms)` | Smooth transition |
|
|
330
|
-
| `transitionToPreset(name, ms)` | Transition to preset |
|
|
331
|
-
| `update()` | Update transition (call each frame) |
|
|
332
|
-
| `reset()` | Reset to neutral |
|
|
333
|
-
|
|
334
|
-
| Property | Type | Description |
|
|
335
|
-
|----------|------|-------------|
|
|
336
|
-
| `emotion` | `Float32Array` | Current 26-element vector |
|
|
337
|
-
| `isTransitioning` | `boolean` | Transition in progress |
|
|
338
|
-
|
|
339
|
-
### Logger
|
|
340
|
-
|
|
341
|
-
Structured logging with multiple output formats.
|
|
342
|
-
|
|
343
|
-
| Function | Description |
|
|
344
|
-
|----------|-------------|
|
|
345
|
-
| `configureLogging(config)` | Set global logging configuration |
|
|
346
|
-
| `createLogger(module)` | Create a module-specific logger |
|
|
347
|
-
| `getGlobalLogger()` | Get the global logger instance |
|
|
348
|
-
|
|
349
|
-
**Logger Methods:**
|
|
350
|
-
|
|
351
|
-
| Method | Description |
|
|
352
|
-
|--------|-------------|
|
|
353
|
-
| `error(message, data?)` | Log error (always shown) |
|
|
354
|
-
| `warn(message, data?)` | Log warning |
|
|
355
|
-
| `info(message, data?)` | Log info |
|
|
356
|
-
| `debug(message, data?)` | Log debug |
|
|
357
|
-
| `trace(message, data?)` | Log trace |
|
|
358
|
-
| `verbose(message, data?)` | Log verbose (most detailed) |
|
|
359
|
-
| `child(subModule)` | Create child logger with prefixed module |
|
|
224
|
+
```typescript
|
|
225
|
+
import { EmotionController, EmotionPresets } from '@omote/core';
|
|
360
226
|
|
|
361
|
-
|
|
227
|
+
const controller = new EmotionController();
|
|
228
|
+
controller.setPreset('happy');
|
|
229
|
+
controller.transitionTo({ joy: 0.8 }, 500); // 500ms smooth transition
|
|
362
230
|
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
enabled?: boolean;
|
|
367
|
-
format?: 'json' | 'pretty';
|
|
368
|
-
sink?: (entry: LogEntry) => void; // Custom output handler
|
|
369
|
-
}
|
|
231
|
+
// In animation loop
|
|
232
|
+
controller.update();
|
|
233
|
+
const current = controller.emotion;
|
|
370
234
|
```
|
|
371
235
|
|
|
372
|
-
|
|
236
|
+
**Presets:** `neutral`, `happy`, `sad`, `angry`, `surprised`, `scared`, `disgusted`, `excited`, `tired`, `playful`, `pained`, `contemplative`
|
|
373
237
|
|
|
374
|
-
|
|
238
|
+
### Model Caching
|
|
375
239
|
|
|
376
|
-
|
|
377
|
-
|----------|-------------|
|
|
378
|
-
| `configureTelemetry(config)` | Initialize telemetry system |
|
|
379
|
-
| `getTelemetry()` | Get global telemetry instance |
|
|
240
|
+
IndexedDB-based caching with versioning, LRU eviction, and storage quota monitoring.
|
|
380
241
|
|
|
381
|
-
|
|
242
|
+
```typescript
|
|
243
|
+
import { getModelCache, fetchWithCache, preloadModels } from '@omote/core';
|
|
382
244
|
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
| `startSpan(name, attributes?)` | Start a new trace span |
|
|
386
|
-
| `recordMetric(name, value, type, attributes?)` | Record a metric |
|
|
387
|
-
| `flush()` | Force flush all pending data |
|
|
388
|
-
| `shutdown()` | Shutdown telemetry system |
|
|
245
|
+
// Fetch with automatic caching
|
|
246
|
+
const data = await fetchWithCache('/models/model.onnx');
|
|
389
247
|
|
|
390
|
-
|
|
248
|
+
// Versioned caching for model updates
|
|
249
|
+
const versionedData = await fetchWithCache('/models/model.onnx', {
|
|
250
|
+
version: '1.0.0',
|
|
251
|
+
validateStale: true,
|
|
252
|
+
});
|
|
391
253
|
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
| `setAttribute(key, value)` | Add attribute to span |
|
|
395
|
-
| `setStatus(status, error?)` | Set span status ('ok' or 'error') |
|
|
396
|
-
| `end()` | End the span |
|
|
254
|
+
// Cache quota monitoring
|
|
255
|
+
import { configureCacheLimit, getQuotaInfo } from '@omote/core';
|
|
397
256
|
|
|
398
|
-
|
|
257
|
+
configureCacheLimit({
|
|
258
|
+
maxSizeBytes: 500 * 1024 * 1024, // 500MB limit
|
|
259
|
+
onQuotaWarning: (info) => console.warn(`Storage ${info.percentUsed}% used`),
|
|
260
|
+
});
|
|
399
261
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
serviceName: string;
|
|
404
|
-
serviceVersion?: string;
|
|
405
|
-
exporter: 'console' | 'otlp' | 'none';
|
|
406
|
-
exporterConfig?: {
|
|
407
|
-
endpoint: string;
|
|
408
|
-
headers?: Record<string, string>;
|
|
409
|
-
timeoutMs?: number;
|
|
410
|
-
};
|
|
411
|
-
sampling?: {
|
|
412
|
-
ratio?: number; // 0.0 to 1.0
|
|
413
|
-
alwaysSampleErrors?: boolean;
|
|
414
|
-
};
|
|
415
|
-
}
|
|
262
|
+
// Cache stats
|
|
263
|
+
const cache = getModelCache();
|
|
264
|
+
const stats = await cache.getStats(); // { totalSize, modelCount, models }
|
|
416
265
|
```
|
|
417
266
|
|
|
418
|
-
###
|
|
267
|
+
### Microphone Capture
|
|
419
268
|
|
|
420
|
-
|
|
269
|
+
```typescript
|
|
270
|
+
import { MicrophoneCapture } from '@omote/core';
|
|
421
271
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
| `preloadModels(urls, onProgress?)` | Preload multiple models |
|
|
427
|
-
| `formatBytes(bytes)` | Format bytes as human-readable |
|
|
272
|
+
const mic = new MicrophoneCapture({
|
|
273
|
+
sampleRate: 16000,
|
|
274
|
+
bufferSize: 4096,
|
|
275
|
+
});
|
|
428
276
|
|
|
429
|
-
|
|
277
|
+
mic.on('audio', ({ samples }) => {
|
|
278
|
+
// Process 16kHz Float32Array samples
|
|
279
|
+
});
|
|
430
280
|
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
| `has(url)` | Check if model is cached |
|
|
434
|
-
| `get(url)` | Get cached model data |
|
|
435
|
-
| `set(url, data, etag?)` | Store model in cache |
|
|
436
|
-
| `delete(url)` | Remove model from cache |
|
|
437
|
-
| `clear()` | Clear all cached models |
|
|
438
|
-
| `getStats()` | Get cache statistics |
|
|
281
|
+
await mic.start();
|
|
282
|
+
```
|
|
439
283
|
|
|
440
|
-
###
|
|
284
|
+
### Logging
|
|
441
285
|
|
|
442
286
|
```typescript
|
|
443
|
-
|
|
444
|
-
createEmotionVector({ joy: 0.8, amazement: 0.2 }): Float32Array
|
|
445
|
-
|
|
446
|
-
// Blend multiple emotions
|
|
447
|
-
blendEmotions([
|
|
448
|
-
{ vector: preset1, weight: 0.7 },
|
|
449
|
-
{ vector: preset2, weight: 0.3 },
|
|
450
|
-
]): Float32Array
|
|
287
|
+
import { configureLogging, createLogger } from '@omote/core';
|
|
451
288
|
|
|
452
|
-
|
|
453
|
-
lerpEmotion(from, to, t): Float32Array
|
|
289
|
+
configureLogging({ level: 'debug', format: 'pretty' });
|
|
454
290
|
|
|
455
|
-
|
|
456
|
-
|
|
291
|
+
const logger = createLogger('MyModule');
|
|
292
|
+
logger.info('Model loaded', { backend: 'webgpu', loadTimeMs: 1234 });
|
|
457
293
|
```
|
|
458
294
|
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
52 output blendshapes compatible with ARKit:
|
|
462
|
-
|
|
463
|
-
```
|
|
464
|
-
eyeBlinkLeft, eyeLookDownLeft, eyeLookInLeft, eyeLookOutLeft, eyeLookUpLeft,
|
|
465
|
-
eyeSquintLeft, eyeWideLeft, eyeBlinkRight, eyeLookDownRight, eyeLookInRight,
|
|
466
|
-
eyeLookOutRight, eyeLookUpRight, eyeSquintRight, eyeWideRight,
|
|
467
|
-
jawForward, jawLeft, jawRight, jawOpen,
|
|
468
|
-
mouthClose, mouthFunnel, mouthPucker, mouthLeft, mouthRight,
|
|
469
|
-
mouthSmileLeft, mouthSmileRight, mouthFrownLeft, mouthFrownRight,
|
|
470
|
-
mouthDimpleLeft, mouthDimpleRight, mouthStretchLeft, mouthStretchRight,
|
|
471
|
-
mouthRollLower, mouthRollUpper, mouthShrugLower, mouthShrugUpper,
|
|
472
|
-
mouthPressLeft, mouthPressRight, mouthLowerDownLeft, mouthLowerDownRight,
|
|
473
|
-
mouthUpperUpLeft, mouthUpperUpRight,
|
|
474
|
-
browDownLeft, browDownRight, browInnerUp, browOuterUpLeft, browOuterUpRight,
|
|
475
|
-
cheekPuff, cheekSquintLeft, cheekSquintRight,
|
|
476
|
-
noseSneerLeft, noseSneerRight, tongueOut
|
|
477
|
-
```
|
|
295
|
+
### Telemetry
|
|
478
296
|
|
|
479
|
-
|
|
297
|
+
OpenTelemetry-compatible tracing and metrics.
|
|
480
298
|
|
|
481
|
-
|
|
299
|
+
```typescript
|
|
300
|
+
import { configureTelemetry, getTelemetry } from '@omote/core';
|
|
482
301
|
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
302
|
+
configureTelemetry({
|
|
303
|
+
enabled: true,
|
|
304
|
+
serviceName: 'my-app',
|
|
305
|
+
exporter: 'console', // or 'otlp' for production
|
|
306
|
+
});
|
|
487
307
|
|
|
488
|
-
|
|
308
|
+
const telemetry = getTelemetry();
|
|
309
|
+
const span = telemetry.startSpan('custom-operation');
|
|
310
|
+
// ... do work
|
|
311
|
+
span.end();
|
|
312
|
+
```
|
|
489
313
|
|
|
490
|
-
|
|
491
|
-
|-----------|-------|
|
|
492
|
-
| Input | 16kHz audio samples |
|
|
493
|
-
| Output | 52 ARKit blendshapes per frame |
|
|
494
|
-
| Frame Rate | 30fps |
|
|
495
|
-
| Backend | WebGPU / WASM |
|
|
314
|
+
## Models
|
|
496
315
|
|
|
497
|
-
|
|
316
|
+
Place models in your public assets directory:
|
|
498
317
|
|
|
499
318
|
```
|
|
500
|
-
|
|
319
|
+
public/models/
|
|
320
|
+
lam-wav2vec2.onnx # A2E lip sync — WebGPU (384MB)
|
|
321
|
+
wav2arkit_cpu.onnx # A2E lip sync — WASM fallback (1.86MB graph)
|
|
322
|
+
wav2arkit_cpu.onnx.data # A2E lip sync — WASM fallback (402MB weights)
|
|
323
|
+
sensevoice/model.int8.onnx # SenseVoice ASR (239MB)
|
|
324
|
+
silero-vad.onnx # Voice activity detection (~2MB)
|
|
501
325
|
```
|
|
502
326
|
|
|
503
|
-
##
|
|
327
|
+
## Browser Compatibility
|
|
504
328
|
|
|
505
|
-
|
|
329
|
+
WebGPU-first with automatic WASM fallback.
|
|
506
330
|
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
331
|
+
| Browser | WebGPU | WASM | Recommended |
|
|
332
|
+
|---------|--------|------|-------------|
|
|
333
|
+
| Chrome 113+ (Desktop) | Yes | Yes | WebGPU |
|
|
334
|
+
| Chrome 113+ (Android) | Yes | Yes | WebGPU |
|
|
335
|
+
| Edge 113+ | Yes | Yes | WebGPU |
|
|
336
|
+
| Firefox 130+ | Flag only | Yes | WASM |
|
|
337
|
+
| Safari 18+ (macOS) | Limited | Yes | WASM |
|
|
338
|
+
| Safari (iOS) | No | Yes | WASM |
|
|
511
339
|
|
|
512
340
|
```typescript
|
|
513
|
-
import {
|
|
514
|
-
|
|
515
|
-
const orchestrator = new ConversationOrchestrator({
|
|
516
|
-
adapter: {
|
|
517
|
-
endpoint: 'wss://your-agentcore-endpoint.com/ws',
|
|
518
|
-
models: {
|
|
519
|
-
lamUrl: '/models/lam-wav2vec2.onnx',
|
|
520
|
-
},
|
|
521
|
-
},
|
|
522
|
-
});
|
|
523
|
-
|
|
524
|
-
// Register tenant
|
|
525
|
-
orchestrator.registerTenant({
|
|
526
|
-
tenantId: 'tenant-123',
|
|
527
|
-
characterId: 'character-abc',
|
|
528
|
-
credentials: { authToken: 'jwt-token' },
|
|
529
|
-
});
|
|
530
|
-
|
|
531
|
-
// Create session
|
|
532
|
-
const session = await orchestrator.createSession('tenant-123', {
|
|
533
|
-
systemPrompt: 'You are a helpful assistant.',
|
|
534
|
-
});
|
|
535
|
-
|
|
536
|
-
// Listen for animation events
|
|
537
|
-
orchestrator.on('animation', ({ blendshapes }) => {
|
|
538
|
-
applyToAvatar(blendshapes);
|
|
539
|
-
});
|
|
540
|
-
|
|
541
|
-
// Push audio from microphone
|
|
542
|
-
session.pushAudio(audioSamples);
|
|
341
|
+
import { isWebGPUAvailable } from '@omote/core';
|
|
342
|
+
const webgpu = await isWebGPUAvailable();
|
|
543
343
|
```
|
|
544
344
|
|
|
545
|
-
##
|
|
546
|
-
|
|
547
|
-
| Browser | WebGPU | WASM Fallback |
|
|
548
|
-
|---------|--------|---------------|
|
|
549
|
-
| Chrome 113+ | Yes | Yes |
|
|
550
|
-
| Edge 113+ | Yes | Yes |
|
|
551
|
-
| Firefox | No | Yes |
|
|
552
|
-
| Safari 18+ | Yes | Yes |
|
|
345
|
+
## iOS Notes
|
|
553
346
|
|
|
554
|
-
|
|
347
|
+
All iOS browsers use WebKit under the hood. The SDK handles three platform constraints automatically:
|
|
555
348
|
|
|
556
|
-
|
|
349
|
+
1. **WASM binary selection** — iOS crashes with the default JSEP/ASYNCIFY WASM binary. The SDK imports `onnxruntime-web/wasm` (non-JSEP) on iOS/Safari.
|
|
350
|
+
2. **A2E model fallback** — The Wav2Vec2 GPU model exceeds iOS memory limits. `createA2E({ mode: 'auto' })` automatically selects the `wav2arkit_cpu` model on iOS.
|
|
351
|
+
3. **Worker memory** — Multiple Workers each load their own ORT WASM runtime, exceeding iOS tab memory (~1.5GB). The SDK defaults to main-thread inference on iOS.
|
|
557
352
|
|
|
558
|
-
iOS
|
|
559
|
-
|
|
560
|
-
| Feature | iOS Status | Alternative |
|
|
561
|
-
|---------|------------|-------------|
|
|
562
|
-
| **VAD** | Works (0.9ms) | Use as-is |
|
|
563
|
-
| **ASR** | Slow (1.3s) | `SafariSpeechRecognition` |
|
|
564
|
-
| **Lip Sync** | Slow (332ms) | Lambda LAM (server-side) |
|
|
565
|
-
|
|
566
|
-
```typescript
|
|
567
|
-
import { shouldUseNativeASR, SafariSpeechRecognition } from '@omote/core';
|
|
568
|
-
|
|
569
|
-
// Platform-aware ASR
|
|
570
|
-
if (shouldUseNativeASR()) {
|
|
571
|
-
const speech = new SafariSpeechRecognition({ language: 'en-US' });
|
|
572
|
-
speech.onResult((result) => console.log(result.text));
|
|
573
|
-
await speech.start();
|
|
574
|
-
} else {
|
|
575
|
-
const whisper = new WhisperInference({ model: 'tiny' });
|
|
576
|
-
await whisper.load();
|
|
577
|
-
}
|
|
578
|
-
```
|
|
353
|
+
**Consumer requirement:** Do not send COEP/COOP headers to iOS clients: they enable SharedArrayBuffer, which forces threaded WASM with 4GB of shared memory and crashes iOS. Desktop should keep COEP/COOP for multi-threaded performance.
|
|
579
354
|
|
|
580
|
-
|
|
355
|
+
| Feature | iOS Status | Notes |
|
|
356
|
+
|---------|------------|-------|
|
|
357
|
+
| Silero VAD | Works | 0.9ms latency |
|
|
358
|
+
| SenseVoice ASR | Works | WASM, ~200ms |
|
|
359
|
+
| A2E Lip Sync | Works | wav2arkit_cpu via createA2E auto-detect, ~45ms |
|
|
581
360
|
|
|
582
361
|
## License
|
|
583
362
|
|