@omote/core 0.10.5 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -15,7 +15,7 @@
15
15
  - **SpeechListener** — Mic → VAD → ASR orchestration with adaptive silence detection
16
16
  - **createTTSPlayer()** — Factory composing Kokoro TTS + TTSSpeaker for zero-config playback
17
17
  - **VoiceOrchestrator** — Full conversational agent loop with local TTS support (cloud or offline)
18
- - **configureOrtCdn()** — Enterprise CDN override for ORT WASM/WebGPU binaries
18
+ - **configureModelUrls()** — Self-host model files from your own CDN
19
19
  - **Animation Graph** — State machine (idle/listening/thinking/speaking) with emotion blending
20
20
  - **Emotion Controller** — Preset-based emotion system with smooth transitions
21
21
  - **Model Caching** — IndexedDB with versioning, LRU eviction, and quota monitoring
@@ -81,15 +81,15 @@ const { blendshapes } = await a2e.infer(audioSamples); // Float32Array (16kHz)
81
81
  // → 52 ARKit blendshape weights
82
82
  ```
83
83
 
84
- #### Direct API
84
+ #### Custom Configuration
85
85
 
86
86
  ```typescript
87
- import { A2EInference, ARKIT_BLENDSHAPES } from '@omote/core';
87
+ import { createA2E, ARKIT_BLENDSHAPES } from '@omote/core';
88
88
 
89
- const lam = new A2EInference({ modelUrl: '/models/model_fp16.onnx' });
90
- await lam.load();
89
+ const a2e = createA2E({ backend: 'wasm' }); // Force WASM for testing
90
+ await a2e.load();
91
91
 
92
- const { blendshapes } = await lam.infer(audioSamples);
92
+ const { blendshapes } = await a2e.infer(audioSamples);
93
93
  const jawOpen = blendshapes[ARKIT_BLENDSHAPES.indexOf('jawOpen')];
94
94
  ```
95
95
 
@@ -136,11 +136,9 @@ const frame = processor.getFrameForTime(audioContext.currentTime);
136
136
  SenseVoice ASR — 15x faster than Whisper, with progressive transcription and emotion detection.
137
137
 
138
138
  ```typescript
139
- import { SenseVoiceInference } from '@omote/core';
139
+ import { createSenseVoice } from '@omote/core';
140
140
 
141
- const asr = new SenseVoiceInference({
142
- modelUrl: '/models/sensevoice/model.int8.onnx',
143
- });
141
+ const asr = createSenseVoice(); // Auto-detects platform, fetches from HF CDN
144
142
  await asr.load();
145
143
 
146
144
  const { text, emotion, language } = await asr.transcribe(audioSamples);
@@ -149,22 +147,19 @@ const { text, emotion, language } = await asr.transcribe(audioSamples);
149
147
  #### Platform-Aware ASR
150
148
 
151
149
  ```typescript
152
- import { shouldUseNativeASR, SafariSpeechRecognition, SenseVoiceInference } from '@omote/core';
150
+ import { shouldUseNativeASR, SafariSpeechRecognition, createSenseVoice } from '@omote/core';
153
151
 
154
152
  const asr = shouldUseNativeASR()
155
153
  ? new SafariSpeechRecognition({ language: 'en-US' })
156
- : new SenseVoiceInference({ modelUrl: '/models/sensevoice/model.int8.onnx' });
154
+ : createSenseVoice();
157
155
  ```
158
156
 
159
157
  ### Voice Activity Detection (Silero VAD)
160
158
 
161
- #### Factory API (Recommended)
162
-
163
159
  ```typescript
164
160
  import { createSileroVAD } from '@omote/core';
165
161
 
166
162
  const vad = createSileroVAD({
167
- modelUrl: '/models/silero-vad.onnx',
168
163
  threshold: 0.5,
169
164
  // useWorker: true // Force off-main-thread
170
165
  // useWorker: false // Force main thread
@@ -174,18 +169,6 @@ await vad.load();
174
169
  const { isSpeech, probability } = await vad.process(audioSamples);
175
170
  ```
176
171
 
177
- #### Direct API
178
-
179
- ```typescript
180
- import { SileroVADInference, SileroVADWorker } from '@omote/core';
181
-
182
- // Main thread (mobile-friendly)
183
- const vad = new SileroVADInference({ modelUrl: '/models/silero-vad.onnx' });
184
-
185
- // Web Worker (desktop, off-main-thread)
186
- const vadWorker = new SileroVADWorker({ modelUrl: '/models/silero-vad.onnx' });
187
- ```
188
-
189
172
  ### Animation Graph
190
173
 
191
174
  State machine for avatar animation states with emotion blending and audio energy.
@@ -248,7 +231,7 @@ const data = await fetchWithCache('/models/model.onnx', {
248
231
  });
249
232
 
250
233
  // Cache quota monitoring
251
- import { configureCacheLimit, getQuotaInfo } from '@omote/core';
234
+ import { configureCacheLimit } from '@omote/core';
252
235
 
253
236
  configureCacheLimit({
254
237
  maxSizeBytes: 500 * 1024 * 1024, // 500MB limit
@@ -307,17 +290,76 @@ const span = telemetry.startSpan('custom-operation');
307
290
  span.end();
308
291
  ```
309
292
 
310
- ## Models
293
+ ### Text-to-Speech (Kokoro TTS)
294
+
295
+ ```typescript
296
+ import { createKokoroTTS } from '@omote/core';
311
297
 
312
- Place models in your public assets directory:
298
+ const tts = createKokoroTTS({ defaultVoice: 'af_heart' });
299
+ await tts.load();
313
300
 
301
+ const audio = await tts.synthesize('Hello world!');
302
+ // audio: Float32Array @ 24kHz
314
303
  ```
315
- public/models/
316
- model_fp16.onnx # A2E lip sync WebGPU (192MB fp16, from omote-ai/lam-a2e)
317
- sensevoice/model.int8.onnx # SenseVoice ASR (239MB)
318
- silero-vad.onnx # Voice activity detection (~2MB)
304
+
305
+ Kokoro auto-detects the platform: the mixed-fp16 WebGPU model (156MB) on Chrome/Edge, and the q8 WASM model (92MB) on Safari/iOS/Firefox.
306
+
307
+ ### Eager Load & Warmup
308
+
309
+ Use `eagerLoad` to preload models at construction time:
310
+
311
+ ```typescript
312
+ const tts = createKokoroTTS({ eagerLoad: true }); // Starts loading immediately
319
313
  ```
320
314
 
315
+ Use `warmup()` to prime the AudioContext and satisfy the iOS/Safari autoplay policy. Call it from a user gesture handler:
316
+
317
+ ```typescript
318
+ button.onclick = async () => {
319
+ await avatar.warmup(); // Primes AudioContext
320
+ await avatar.connectVoice({ ... });
321
+ };
322
+ ```
323
+
324
+ ### Observability
325
+
326
+ The SDK includes built-in OpenTelemetry-compatible tracing and metrics:
327
+
328
+ ```typescript
329
+ import { configureTelemetry, getTelemetry, MetricNames } from '@omote/core';
330
+
331
+ configureTelemetry({
332
+ enabled: true,
333
+ serviceName: 'my-app',
334
+ exporter: 'console', // or OTLPExporter for production
335
+ });
336
+ ```
337
+
338
+ All inference calls, model loads, cache operations, and voice turns are automatically instrumented.
339
+
340
+ ## Models
341
+
342
+ All models are fetched from the HuggingFace CDN by default and auto-downloaded on first use. To self-host them, use `configureModelUrls()`:
343
+
344
+ ```typescript
345
+ import { configureModelUrls } from '@omote/core';
346
+
347
+ configureModelUrls({
348
+ lam: 'https://your-cdn.com/models/lam.onnx',
349
+ lamData: 'https://your-cdn.com/models/lam.onnx.data',
350
+ senseVoice: 'https://your-cdn.com/models/sensevoice.onnx',
351
+ sileroVad: 'https://your-cdn.com/models/silero_vad.onnx',
352
+ });
353
+ ```
354
+
355
+ | Model | HuggingFace Repo | Size |
356
+ |-------|-------------------|------|
357
+ | LAM A2E | `omote-ai/lam-a2e` | `lam.onnx` (230KB) + `lam.onnx.data` (192MB) |
358
+ | SenseVoice | `omote-ai/sensevoice-asr` | 228MB |
359
+ | Silero VAD | `deepghs/silero-vad-onnx` | ~2MB |
360
+ | Kokoro TTS (WASM) | `onnx-community/Kokoro-82M-v1.0-ONNX` | 92MB q8 |
361
+ | Kokoro TTS (WebGPU) | `omote-ai/kokoro-tts` | 156MB mixed-fp16 |
362
+
321
363
  ## Browser Compatibility
322
364
 
323
365
  WebGPU-first with automatic WASM fallback.