@omote/core 0.1.0

package/README.md ADDED
@@ -0,0 +1,584 @@
+ # @omote/core
+
+ > WebGPU-accelerated inference for real-time lip sync, speech recognition, emotion detection, and avatar animation - runs entirely in the browser.
+
+ ## Features
+
+ - **Lip Sync** - LAM (Wav2Vec2) inference mapping audio directly to 52 ARKit blendshapes
+ - **Speech-to-Text** - Whisper ASR (tiny/base models)
+ - **Emotion Detection** - DistilHuBERT speech emotion recognition (7 emotions)
+ - **Voice Activity Detection** - Silero VAD for speech detection
+ - **Emotion Control** - 10-channel emotion system with presets and transitions
+ - **Model Caching** - IndexedDB-based caching for fast subsequent loads
+ - **Structured Logging** - 6 log levels, JSON/pretty formats, module-scoped
+ - **Telemetry** - OpenTelemetry-compatible tracing and metrics
+ - **Offline Ready** - Works entirely without internet
+ - **WebGPU + WASM** - Auto-fallback for broad browser support
+
+ ## Installation
+
+ ```bash
+ npm install @omote/core
+ ```
+
+ ## Quick Start
+
+ ### Lip Sync (Wav2Vec2Inference)
+
+ ```typescript
+ import { Wav2Vec2Inference, LAM_BLENDSHAPES } from '@omote/core';
+
+ const lam = new Wav2Vec2Inference({
+   modelUrl: '/models/lam-wav2vec2.onnx',
+   backend: 'auto', // 'webgpu' | 'wasm' | 'auto'
+ });
+
+ await lam.load();
+
+ // Process audio (16kHz Float32Array)
+ const result = await lam.infer(audioSamples);
+
+ // result.blendshapes is an array of frames, each with 52 ARKit weights
+ for (const frame of result.blendshapes) {
+   const jawOpen = frame[LAM_BLENDSHAPES.indexOf('jawOpen')];
+   applyToAvatar(frame);
+ }
+ ```
+
+ ### Speech-to-Text (WhisperInference)
+
+ ```typescript
+ import { WhisperInference } from '@omote/core';
+
+ // Models auto-download from HuggingFace
+ const whisper = new WhisperInference({ model: 'tiny' });
+ await whisper.load();
+
+ const { text, inferenceTimeMs } = await whisper.transcribe(audioSamples);
+ console.log(text); // "Hello world"
+ ```
+
+ ### Emotion Detection (DistilHuBERTEmotionInference)
+
+ ```typescript
+ import { DistilHuBERTEmotionInference, EMOTION_LABELS } from '@omote/core';
+
+ const emotion = new DistilHuBERTEmotionInference({
+   modelUrl: '/models/distilhubert-emotion.onnx'
+ });
+ await emotion.load();
+
+ const { emotion: detected, probabilities } = await emotion.infer(audioSamples);
+ console.log(detected); // 'happy', 'sad', 'angry', etc.
+ ```
+
+ ### Voice Activity Detection (SileroVADInference)
+
+ ```typescript
+ import { SileroVADInference } from '@omote/core';
+
+ const vad = new SileroVADInference({
+   modelUrl: '/models/silero-vad.onnx'
+ });
+ await vad.load();
+
+ const { isSpeech, probability } = await vad.infer(audioSamples);
+ ```
+
+ ### Emotion Control
+
+ ```typescript
+ import {
+   EmotionController,
+   createEmotionVector,
+   EmotionPresets,
+ } from '@omote/core';
+
+ // Create emotion vectors
+ const emotion = createEmotionVector({ joy: 0.8, amazement: 0.2 });
+
+ // Or use controller for smooth transitions
+ const controller = new EmotionController();
+ controller.setPreset('happy');
+ controller.transitionTo({ sadness: 0.7 }, 500); // 500ms transition
+
+ // In animation loop
+ controller.update();
+ const currentEmotion = controller.emotion; // Float32Array(26)
+ ```
+
+ **Available Emotions:** `amazement`, `anger`, `cheekiness`, `disgust`, `fear`, `grief`, `joy`, `outofbreath`, `pain`, `sadness`
+
+ **Presets:** `neutral`, `happy`, `sad`, `angry`, `surprised`, `scared`, `disgusted`, `excited`, `tired`, `playful`, `pained`, `contemplative`
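+
+ A minimal sketch tying presets to transitions (preset names as listed above; the one-second duration is arbitrary):
+
+ ```typescript
+ import { EmotionController, getEmotionPreset } from '@omote/core';
+
+ const controller = new EmotionController();
+
+ // Jump straight to a preset, then ease into another over one second
+ controller.setPreset('excited');
+ controller.transitionToPreset('contemplative', 1000);
+
+ // getEmotionPreset returns a copy, so it is safe to modify
+ const happyBase = getEmotionPreset('happy');
+ ```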
+
+ ### Microphone Capture
+
+ ```typescript
+ import { MicrophoneCapture } from '@omote/core';
+
+ const mic = new MicrophoneCapture({
+   sampleRate: 16000,
+   bufferSize: 4096,
+ });
+
+ mic.on('audio', async ({ samples }) => {
+   // Process audio samples (lam from the Lip Sync example above)
+   const result = await lam.infer(samples);
+ });
+
+ await mic.start();
+ ```
+
+ ### Logging
+
+ ```typescript
+ import { configureLogging, createLogger } from '@omote/core';
+
+ // Configure globally (once at app startup)
+ configureLogging({
+   level: 'debug', // 'error' | 'warn' | 'info' | 'debug' | 'trace' | 'verbose'
+   format: 'pretty', // 'json' | 'pretty'
+   enabled: true,
+ });
+
+ // Create module-specific loggers
+ const logger = createLogger('MyComponent');
+
+ logger.info('Model loaded', { backend: 'webgpu', loadTimeMs: 1234 });
+ logger.debug('Processing audio', { samples: 16000 });
+ logger.error('Failed to load', { error: err.message });
+
+ // JSON output (production):
+ // {"timestamp":1704672000000,"level":"info","module":"MyComponent","message":"Model loaded","data":{"backend":"webgpu"}}
+
+ // Pretty output (development):
+ // [12:00:00.000] INFO [MyComponent] Model loaded { backend: 'webgpu' }
+ ```
+
+ ### Telemetry
+
+ OpenTelemetry-compatible observability for inference operations.
+
+ ```typescript
+ import { configureTelemetry, getTelemetry } from '@omote/core';
+
+ // Development: Console output
+ configureTelemetry({
+   enabled: true,
+   serviceName: 'my-app',
+   exporter: 'console',
+ });
+
+ // Production: OTLP export to Jaeger/Tempo/etc
+ configureTelemetry({
+   enabled: true,
+   serviceName: 'my-app',
+   serviceVersion: '1.0.0',
+   exporter: 'otlp',
+   exporterConfig: {
+     endpoint: 'https://tempo.example.com',
+     headers: { 'Authorization': 'Bearer token' },
+   },
+   sampling: {
+     ratio: 0.1, // Sample 10% of traces
+     alwaysSampleErrors: true, // Always capture errors
+   },
+ });
+
+ // Manual instrumentation
+ const telemetry = getTelemetry();
+
+ // Create spans for custom operations
+ const span = telemetry.startSpan('custom-operation', {
+   'custom.attribute': 'value',
+ });
+ try {
+   // ... do work
+   span.setStatus('ok');
+ } catch (error) {
+   span.setStatus('error', error);
+ } finally {
+   span.end();
+ }
+
+ // Record metrics
+ telemetry.recordMetric('custom_counter', 1, 'counter', { label: 'value' });
+ telemetry.recordMetric('custom_gauge', 42.5, 'gauge');
+ ```
+
+ ### Model Caching
+
+ Automatic IndexedDB caching for ONNX models.
+
+ ```typescript
+ import { getModelCache, fetchWithCache, preloadModels, formatBytes } from '@omote/core';
+
+ // Fetch with automatic caching (used internally by inference classes)
+ const modelData = await fetchWithCache('/models/lam-wav2vec2.onnx', (loaded, total) => {
+   console.log(`Loading: ${formatBytes(loaded)} / ${formatBytes(total)}`);
+ });
+
+ // Preload multiple models
+ await preloadModels([
+   '/models/lam-wav2vec2.onnx',
+   '/models/silero-vad.onnx',
+ ], (completed, total, url) => {
+   console.log(`Preloaded ${completed}/${total}: ${url}`);
+ });
+
+ // Manual cache management
+ const cache = getModelCache();
+ const stats = await cache.getStats();
+ console.log(`Cached: ${stats.modelCount} models, ${formatBytes(stats.totalSize)}`);
+
+ await cache.delete('/models/old-model.onnx');
+ await cache.clear(); // Clear all cached models
+ ```
+
+ ## Models
+
+ ### Required Files
+
+ Place models in your public assets folder:
+
+ ```
+ public/
+   models/
+     lam-wav2vec2.onnx          # LAM lip sync model
+     silero-vad.onnx            # Voice activity detection
+     distilhubert-emotion.onnx  # Emotion detection
+ ```
+
+ ### Whisper Models (Auto-Download)
+
+ Whisper models download automatically from HuggingFace on first use:
+
+ | Model | Size | Quality | Speed |
+ |-------|------|---------|-------|
+ | `tiny` | ~75MB | Good | Fastest |
+ | `base` | ~150MB | Better | Medium |
+
+ ```typescript
+ const whisper = new WhisperInference({ model: 'tiny' }); // Recommended for real-time
+ ```
+
+ ## API Reference
+
+ ### Wav2Vec2Inference (LAM)
+
+ LAM lip sync: maps audio directly to 52 ARKit blendshapes.
+
+ | Method | Description |
+ |--------|-------------|
+ | `new Wav2Vec2Inference(config)` | Create with `{ modelUrl, backend? }` |
+ | `load()` | Load ONNX model |
+ | `infer(audio)` | Run inference |
+ | `dispose()` | Release resources |
+
+ ```typescript
+ interface Wav2Vec2Result {
+   blendshapes: Float32Array[]; // Array of frames, each 52 ARKit weights
+   inferenceTimeMs: number;
+   backend: 'webgpu' | 'wasm';
+ }
+ ```
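+
+ All inference classes share the same load/infer/dispose lifecycle. A minimal sketch (the silent one-second buffer stands in for real 16kHz audio):
+
+ ```typescript
+ import { Wav2Vec2Inference } from '@omote/core';
+
+ const audioSamples = new Float32Array(16000); // Placeholder: 1s of silence at 16kHz
+
+ const lam = new Wav2Vec2Inference({ modelUrl: '/models/lam-wav2vec2.onnx' });
+ await lam.load();
+ try {
+   const { blendshapes, inferenceTimeMs, backend } = await lam.infer(audioSamples);
+   console.log(`${blendshapes.length} frames in ${inferenceTimeMs}ms via ${backend}`);
+ } finally {
+   lam.dispose(); // Release GPU/WASM resources when finished
+ }
+ ```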
+
+ ### WhisperInference
+
+ Whisper speech-to-text.
+
+ | Method | Description |
+ |--------|-------------|
+ | `new WhisperInference(config)` | Create with `{ model, modelUrl? }` |
+ | `load()` | Load encoder + decoder |
+ | `transcribe(audio)` | Transcribe audio |
+ | `dispose()` | Release resources |
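+
+ The optional `modelUrl` allows self-hosting instead of the HuggingFace auto-download; the exact file layout it expects is an assumption here, so verify against your deployment:
+
+ ```typescript
+ import { WhisperInference } from '@omote/core';
+
+ // Default: weights are fetched from HuggingFace on first load()
+ const hosted = new WhisperInference({ model: 'base' });
+
+ // Assumed usage: point modelUrl at self-hosted weights for offline use
+ const selfHosted = new WhisperInference({
+   model: 'tiny',
+   modelUrl: '/models/whisper-tiny',
+ });
+ await selfHosted.load();
+ ```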
+
+ ### DistilHuBERTEmotionInference
+
+ Speech emotion recognition (7 emotions).
+
+ | Method | Description |
+ |--------|-------------|
+ | `new DistilHuBERTEmotionInference(config)` | Create with `{ modelUrl }` |
+ | `load()` | Load ONNX model |
+ | `infer(audio)` | Detect emotion |
+ | `dispose()` | Release resources |
+
+ **Emotion Labels:** `angry`, `disgusted`, `fearful`, `happy`, `neutral`, `sad`, `surprised`
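+
+ The detector's labels map naturally onto `EmotionController` presets; a sketch where the label-to-preset table is my own choice, not part of the SDK:
+
+ ```typescript
+ import { DistilHuBERTEmotionInference, EmotionController } from '@omote/core';
+
+ // Hypothetical mapping from detected labels to controller presets
+ const presetFor: Record<string, string> = {
+   angry: 'angry', disgusted: 'disgusted', fearful: 'scared', happy: 'happy',
+   neutral: 'neutral', sad: 'sad', surprised: 'surprised',
+ };
+
+ const detector = new DistilHuBERTEmotionInference({
+   modelUrl: '/models/distilhubert-emotion.onnx',
+ });
+ await detector.load();
+
+ const controller = new EmotionController();
+ const audioSamples = new Float32Array(16000); // Placeholder audio
+ const { emotion } = await detector.infer(audioSamples);
+ controller.transitionToPreset(presetFor[emotion] ?? 'neutral', 500);
+ ```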
+
+ ### SileroVADInference
+
+ Voice activity detection.
+
+ | Method | Description |
+ |--------|-------------|
+ | `new SileroVADInference(config)` | Create with `{ modelUrl }` |
+ | `load()` | Load ONNX model |
+ | `infer(audio)` | Detect speech |
+ | `dispose()` | Release resources |
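+
+ A common pattern is gating heavier inference on VAD; a sketch assuming per-chunk processing via `MicrophoneCapture` fits your latency budget:
+
+ ```typescript
+ import { MicrophoneCapture, SileroVADInference } from '@omote/core';
+
+ const vad = new SileroVADInference({ modelUrl: '/models/silero-vad.onnx' });
+ await vad.load();
+
+ const mic = new MicrophoneCapture({ sampleRate: 16000, bufferSize: 4096 });
+ mic.on('audio', async ({ samples }) => {
+   const { isSpeech, probability } = await vad.infer(samples);
+   if (isSpeech) {
+     // Hand speech chunks to ASR / lip sync here
+     console.log(`Speech detected (p=${probability.toFixed(2)})`);
+   }
+ });
+ await mic.start();
+ ```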
+
+ ### EmotionController
+
+ Emotion state with smooth transitions.
+
+ | Method | Description |
+ |--------|-------------|
+ | `set(weights)` | Set emotion immediately |
+ | `setPreset(name)` | Set preset immediately |
+ | `transitionTo(weights, ms)` | Smooth transition |
+ | `transitionToPreset(name, ms)` | Transition to preset |
+ | `update()` | Update transition (call each frame) |
+ | `reset()` | Reset to neutral |
+
+ | Property | Type | Description |
+ |----------|------|-------------|
+ | `emotion` | `Float32Array` | Current 26-element vector |
+ | `isTransitioning` | `boolean` | Transition in progress |
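+
+ `update()` is meant to be called once per frame; a minimal render-loop sketch (`applyToAvatar` is a placeholder for your renderer binding):
+
+ ```typescript
+ import { EmotionController } from '@omote/core';
+
+ declare function applyToAvatar(weights: Float32Array): void; // Placeholder
+
+ const controller = new EmotionController();
+ controller.transitionToPreset('happy', 750);
+
+ function tick() {
+   controller.update(); // Advances any in-flight transition
+   applyToAvatar(controller.emotion); // Current 26-element vector
+   requestAnimationFrame(tick);
+ }
+ requestAnimationFrame(tick);
+ ```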
+
+ ### Logger
+
+ Structured logging with multiple output formats.
+
+ | Function | Description |
+ |----------|-------------|
+ | `configureLogging(config)` | Set global logging configuration |
+ | `createLogger(module)` | Create a module-specific logger |
+ | `getGlobalLogger()` | Get the global logger instance |
+
+ **Logger Methods:**
+
+ | Method | Description |
+ |--------|-------------|
+ | `error(message, data?)` | Log error (always shown) |
+ | `warn(message, data?)` | Log warning |
+ | `info(message, data?)` | Log info |
+ | `debug(message, data?)` | Log debug |
+ | `trace(message, data?)` | Log trace |
+ | `verbose(message, data?)` | Log verbose (most detailed) |
+ | `child(subModule)` | Create child logger with prefixed module |
+
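+ `child()` keeps module names hierarchical without reconfiguring; a small sketch (the exact separator in the emitted module name is an assumption):
+
+ ```typescript
+ import { createLogger } from '@omote/core';
+
+ const logger = createLogger('Pipeline');
+ const asrLog = logger.child('ASR'); // Module shown as e.g. "Pipeline/ASR"
+
+ asrLog.info('Transcription started', { model: 'tiny' });
+ ```
+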
+ **Configuration:**
+
+ ```typescript
+ interface LoggerConfig {
+   level?: 'error' | 'warn' | 'info' | 'debug' | 'trace' | 'verbose';
+   enabled?: boolean;
+   format?: 'json' | 'pretty';
+   sink?: (entry: LogEntry) => void; // Custom output handler
+ }
+ ```
+
+ ### OmoteTelemetry
+
+ OpenTelemetry-compatible telemetry for tracing and metrics.
+
+ | Function | Description |
+ |----------|-------------|
+ | `configureTelemetry(config)` | Initialize telemetry system |
+ | `getTelemetry()` | Get global telemetry instance |
+
+ **OmoteTelemetry Methods:**
+
+ | Method | Description |
+ |--------|-------------|
+ | `startSpan(name, attributes?)` | Start a new trace span |
+ | `recordMetric(name, value, type, attributes?)` | Record a metric |
+ | `flush()` | Force flush all pending data |
+ | `shutdown()` | Shutdown telemetry system |
+
+ **Span Methods:**
+
+ | Method | Description |
+ |--------|-------------|
+ | `setAttribute(key, value)` | Add attribute to span |
+ | `setStatus(status, error?)` | Set span status ('ok' or 'error') |
+ | `end()` | End the span |
+
+ **Configuration:**
+
+ ```typescript
+ interface TelemetryConfig {
+   enabled: boolean;
+   serviceName: string;
+   serviceVersion?: string;
+   exporter: 'console' | 'otlp' | 'none';
+   exporterConfig?: {
+     endpoint: string;
+     headers?: Record<string, string>;
+     timeoutMs?: number;
+   };
+   sampling?: {
+     ratio?: number; // 0.0 to 1.0
+     alwaysSampleErrors?: boolean;
+   };
+ }
+ ```
+
+ ### ModelCache
+
+ IndexedDB-based model caching.
+
+ | Function | Description |
+ |----------|-------------|
+ | `getModelCache()` | Get singleton cache instance |
+ | `fetchWithCache(url, onProgress?)` | Fetch with automatic caching |
+ | `preloadModels(urls, onProgress?)` | Preload multiple models |
+ | `formatBytes(bytes)` | Format bytes as human-readable |
+
+ **ModelCache Methods:**
+
+ | Method | Description |
+ |--------|-------------|
+ | `has(url)` | Check if model is cached |
+ | `get(url)` | Get cached model data |
+ | `set(url, data, etag?)` | Store model in cache |
+ | `delete(url)` | Remove model from cache |
+ | `clear()` | Clear all cached models |
+ | `getStats()` | Get cache statistics |
+
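+ A sketch of the lower-level methods (assuming `get` resolves to the stored bytes and `set` accepts an ArrayBuffer; `fetchWithCache` covers this path for you):
+
+ ```typescript
+ import { getModelCache } from '@omote/core';
+
+ const cache = getModelCache();
+ const url = '/models/lam-wav2vec2.onnx';
+
+ if (!(await cache.has(url))) {
+   const res = await fetch(url);
+   // Assumption: set() takes raw bytes plus an optional ETag for validation
+   await cache.set(url, await res.arrayBuffer(), res.headers.get('etag') ?? undefined);
+ }
+
+ const bytes = await cache.get(url); // Assumed to resolve to the cached bytes
+ ```
+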
+ ### Utility Functions
+
+ ```typescript
+ // Create emotion vector from named weights
+ createEmotionVector({ joy: 0.8, amazement: 0.2 }): Float32Array
+
+ // Blend multiple emotions
+ blendEmotions([
+   { vector: preset1, weight: 0.7 },
+   { vector: preset2, weight: 0.3 },
+ ]): Float32Array
+
+ // Linear interpolation
+ lerpEmotion(from, to, t): Float32Array
+
+ // Get preset copy
+ getEmotionPreset('happy'): Float32Array
+ ```
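+
+ A runnable sketch combining these helpers (the weights are illustrative):
+
+ ```typescript
+ import {
+   createEmotionVector,
+   blendEmotions,
+   lerpEmotion,
+   getEmotionPreset,
+ } from '@omote/core';
+
+ const joyful = createEmotionVector({ joy: 0.8, amazement: 0.2 });
+ const sad = getEmotionPreset('sad');
+
+ // 70/30 weighted blend of the two vectors
+ const blended = blendEmotions([
+   { vector: joyful, weight: 0.7 },
+   { vector: sad, weight: 0.3 },
+ ]);
+
+ // Halfway point of a linear interpolation
+ const midway = lerpEmotion(joyful, sad, 0.5);
+ ```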
+
+ ## ARKit Blendshapes
+
+ 52 output blendshapes compatible with ARKit:
+
+ ```
+ eyeBlinkLeft, eyeLookDownLeft, eyeLookInLeft, eyeLookOutLeft, eyeLookUpLeft,
+ eyeSquintLeft, eyeWideLeft, eyeBlinkRight, eyeLookDownRight, eyeLookInRight,
+ eyeLookOutRight, eyeLookUpRight, eyeSquintRight, eyeWideRight,
+ jawForward, jawLeft, jawRight, jawOpen,
+ mouthClose, mouthFunnel, mouthPucker, mouthLeft, mouthRight,
+ mouthSmileLeft, mouthSmileRight, mouthFrownLeft, mouthFrownRight,
+ mouthDimpleLeft, mouthDimpleRight, mouthStretchLeft, mouthStretchRight,
+ mouthRollLower, mouthRollUpper, mouthShrugLower, mouthShrugUpper,
+ mouthPressLeft, mouthPressRight, mouthLowerDownLeft, mouthLowerDownRight,
+ mouthUpperUpLeft, mouthUpperUpRight,
+ browDownLeft, browDownRight, browInnerUp, browOuterUpLeft, browOuterUpRight,
+ cheekPuff, cheekSquintLeft, cheekSquintRight,
+ noseSneerLeft, noseSneerRight, tongueOut
+ ```
+
+ ## Technical Specifications
+
+ ### Audio Input
+
+ | Parameter | Value |
+ |-----------|-------|
+ | Sample Rate | 16kHz |
+ | Format | Float32Array or Int16Array |
+
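+ If your capture path produces Int16Array PCM, the usual normalization to the [-1, 1) Float32Array range looks like this (plain TypeScript, not an SDK API):
+
+ ```typescript
+ // Convert 16-bit PCM to normalized float samples
+ function int16ToFloat32(pcm: Int16Array): Float32Array {
+   const out = new Float32Array(pcm.length);
+   for (let i = 0; i < pcm.length; i++) {
+     out[i] = pcm[i] / 32768; // Scale to [-1, 1)
+   }
+   return out;
+ }
+ ```
+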
+ ### Wav2Vec2 (LAM) Model
+
+ | Parameter | Value |
+ |-----------|-------|
+ | Input | 16kHz audio samples |
+ | Output | 52 ARKit blendshapes per frame |
+ | Frame Rate | 30fps |
+ | Backend | WebGPU / WASM |
+
+ ### DistilHuBERT Emotion Labels
+
+ ```
+ angry, disgusted, fearful, happy, neutral, sad, surprised
+ ```
+
+ ## AI Conversation (Platform Integration)
+
+ For production deployments with the Omote Platform, use the `AgentCoreAdapter`, which handles:
+
+ - WebSocket connection to AgentCore backend
+ - Local Whisper ASR for speech-to-text
+ - TTS audio playback from the backend (ElevenLabs handled server-side)
+ - Local LAM inference for lip sync animation
+
+ ```typescript
+ import { AgentCoreAdapter, ConversationOrchestrator } from '@omote/core';
+
+ const orchestrator = new ConversationOrchestrator({
+   adapter: {
+     endpoint: 'wss://your-agentcore-endpoint.com/ws',
+     models: {
+       lamUrl: '/models/lam-wav2vec2.onnx',
+     },
+   },
+ });
+
+ // Register tenant
+ orchestrator.registerTenant({
+   tenantId: 'tenant-123',
+   characterId: 'character-abc',
+   credentials: { authToken: 'jwt-token' },
+ });
+
+ // Create session
+ const session = await orchestrator.createSession('tenant-123', {
+   systemPrompt: 'You are a helpful assistant.',
+ });
+
+ // Listen for animation events
+ orchestrator.on('animation', ({ blendshapes }) => {
+   applyToAvatar(blendshapes);
+ });
+
+ // Push audio from microphone
+ session.pushAudio(audioSamples);
+ ```
+
+ ## Browser Support
+
+ | Browser | WebGPU | WASM Fallback |
+ |---------|--------|---------------|
+ | Chrome 113+ | Yes | Yes |
+ | Edge 113+ | Yes | Yes |
+ | Firefox | No | Yes |
+ | Safari 18+ | Yes | Yes |
+
+ The SDK auto-detects WebGPU support and falls back to WASM when unavailable.
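+
+ With `backend: 'auto'` the check happens internally; to inspect it yourself, a plain feature-detection sketch:
+
+ ```typescript
+ import { Wav2Vec2Inference } from '@omote/core';
+
+ // navigator.gpu is only defined where WebGPU is available
+ const backend = 'gpu' in navigator ? 'webgpu' : 'wasm';
+ const lam = new Wav2Vec2Inference({ modelUrl: '/models/lam-wav2vec2.onnx', backend });
+ ```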
+
+ ## iOS Support
+
+ iOS Safari exposes the WebGPU API, but ONNX Runtime faces memory and threading limitations there. The SDK provides automatic detection and optimized fallbacks:
+
+ | Feature | iOS Status | Alternative |
+ |---------|------------|-------------|
+ | **VAD** | Works (0.9ms) | Use as-is |
+ | **ASR** | Slow (1.3s) | `SafariSpeechRecognition` |
+ | **Lip Sync** | Slow (332ms) | Lambda LAM (server-side) |
+
+ ```typescript
+ import { shouldUseNativeASR, SafariSpeechRecognition, WhisperInference } from '@omote/core';
+
+ // Platform-aware ASR
+ if (shouldUseNativeASR()) {
+   const speech = new SafariSpeechRecognition({ language: 'en-US' });
+   speech.onResult((result) => console.log(result.text));
+   await speech.start();
+ } else {
+   const whisper = new WhisperInference({ model: 'tiny' });
+   await whisper.load();
+ }
+ ```
+
+ See the [iOS Integration Guide](../../docs/ios-integration.md) for complete setup including Lambda LAM deployment.
+
+ ## License
+
+ MIT