@elizaos/plugin-local-inference 2.0.0-beta.1 → 2.0.3-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +157 -0
- package/dist/actions/generate-media.d.ts +59 -0
- package/dist/actions/generate-media.d.ts.map +1 -0
- package/dist/actions/identify-speaker.d.ts +23 -0
- package/dist/actions/identify-speaker.d.ts.map +1 -0
- package/dist/actions/transcription-control.d.ts +29 -0
- package/dist/actions/transcription-control.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/environment.d.ts +12 -0
- package/dist/adapters/capacitor-llama/environment.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/index.browser.d.ts +9 -0
- package/dist/adapters/capacitor-llama/index.browser.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/index.d.ts +18 -0
- package/dist/adapters/capacitor-llama/index.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/loader.d.ts +35 -0
- package/dist/adapters/capacitor-llama/loader.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/native-voice-capture.d.ts +70 -0
- package/dist/adapters/capacitor-llama/native-voice-capture.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/structured-output.d.ts +62 -0
- package/dist/adapters/capacitor-llama/structured-output.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/text-streaming.d.ts +24 -0
- package/dist/adapters/capacitor-llama/text-streaming.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/types.d.ts +338 -0
- package/dist/adapters/capacitor-llama/types.d.ts.map +1 -0
- package/dist/adapters/capacitor-llama/voice-turn.d.ts +86 -0
- package/dist/adapters/capacitor-llama/voice-turn.d.ts.map +1 -0
- package/dist/backends/apple-foundation.d.ts +56 -0
- package/dist/backends/apple-foundation.d.ts.map +1 -0
- package/dist/index.d.ts +8 -37
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +38979 -430
- package/dist/index.js.map +217 -0
- package/dist/local-inference-routes.d.ts +47 -0
- package/dist/local-inference-routes.d.ts.map +1 -0
- package/dist/provider.d.ts +21 -0
- package/dist/provider.d.ts.map +1 -0
- package/dist/routes/compat-helpers.d.ts +18 -0
- package/dist/routes/compat-helpers.d.ts.map +1 -0
- package/dist/routes/family-member-route.d.ts +62 -0
- package/dist/routes/family-member-route.d.ts.map +1 -0
- package/dist/routes/index.d.ts +20 -0
- package/dist/routes/index.d.ts.map +1 -0
- package/dist/routes/index.js +42040 -0
- package/dist/routes/index.js.map +236 -0
- package/dist/routes/live-diarization-route.d.ts +33 -0
- package/dist/routes/live-diarization-route.d.ts.map +1 -0
- package/dist/routes/local-inference-asr-route.d.ts +4 -0
- package/dist/routes/local-inference-asr-route.d.ts.map +1 -0
- package/dist/routes/local-inference-asr-transcribe.d.ts +20 -0
- package/dist/routes/local-inference-asr-transcribe.d.ts.map +1 -0
- package/dist/routes/local-inference-compat-routes.d.ts +16 -0
- package/dist/routes/local-inference-compat-routes.d.ts.map +1 -0
- package/dist/routes/local-inference-tts-route.d.ts +7 -0
- package/dist/routes/local-inference-tts-route.d.ts.map +1 -0
- package/dist/routes/native-pcm-turn-route.d.ts +3 -0
- package/dist/routes/native-pcm-turn-route.d.ts.map +1 -0
- package/dist/routes/transcript-audio-store.d.ts +15 -0
- package/dist/routes/transcript-audio-store.d.ts.map +1 -0
- package/dist/routes/transcripts-routes.d.ts +44 -0
- package/dist/routes/transcripts-routes.d.ts.map +1 -0
- package/dist/routes/voice-first-run-routes.d.ts +62 -0
- package/dist/routes/voice-first-run-routes.d.ts.map +1 -0
- package/dist/routes/voice-models-routes.d.ts +62 -0
- package/dist/routes/voice-models-routes.d.ts.map +1 -0
- package/dist/routes/voice-profile-plugin-routes.d.ts +19 -0
- package/dist/routes/voice-profile-plugin-routes.d.ts.map +1 -0
- package/dist/routes/voice-profiles-management-routes.d.ts +52 -0
- package/dist/routes/voice-profiles-management-routes.d.ts.map +1 -0
- package/dist/routes/voice-speaker-profile-routes.d.ts +57 -0
- package/dist/routes/voice-speaker-profile-routes.d.ts.map +1 -0
- package/dist/runtime/embedding-manager-support.d.ts +77 -0
- package/dist/runtime/embedding-manager-support.d.ts.map +1 -0
- package/dist/runtime/embedding-presets.d.ts +16 -0
- package/dist/runtime/embedding-presets.d.ts.map +1 -0
- package/dist/runtime/embedding-warmup-policy.d.ts +14 -0
- package/dist/runtime/embedding-warmup-policy.d.ts.map +1 -0
- package/dist/runtime/ensure-local-inference-handler.d.ts +70 -0
- package/dist/runtime/ensure-local-inference-handler.d.ts.map +1 -0
- package/dist/runtime/index.d.ts +15 -0
- package/dist/runtime/index.d.ts.map +1 -0
- package/dist/runtime/index.js +38768 -0
- package/dist/runtime/index.js.map +217 -0
- package/dist/runtime/mobile-local-inference-gate.d.ts +63 -0
- package/dist/runtime/mobile-local-inference-gate.d.ts.map +1 -0
- package/dist/runtime/voice-entity-binding.d.ts +113 -0
- package/dist/runtime/voice-entity-binding.d.ts.map +1 -0
- package/dist/services/active-model.d.ts +310 -0
- package/dist/services/active-model.d.ts.map +1 -0
- package/dist/services/asr-provenance.d.ts +5 -0
- package/dist/services/asr-provenance.d.ts.map +1 -0
- package/dist/services/assignments.d.ts +84 -0
- package/dist/services/assignments.d.ts.map +1 -0
- package/dist/services/backend-selector.d.ts +55 -0
- package/dist/services/backend-selector.d.ts.map +1 -0
- package/dist/services/backend.d.ts +440 -0
- package/dist/services/backend.d.ts.map +1 -0
- package/dist/services/bionic-host-loader.d.ts +67 -0
- package/dist/services/bionic-host-loader.d.ts.map +1 -0
- package/dist/services/bundled-models.d.ts +34 -0
- package/dist/services/bundled-models.d.ts.map +1 -0
- package/dist/services/cache-bridge.d.ts +206 -0
- package/dist/services/cache-bridge.d.ts.map +1 -0
- package/dist/services/catalog.d.ts +10 -0
- package/dist/services/catalog.d.ts.map +1 -0
- package/dist/services/checkpoint-client.d.ts +109 -0
- package/dist/services/checkpoint-client.d.ts.map +1 -0
- package/dist/services/checkpoint-manager.d.ts +217 -0
- package/dist/services/checkpoint-manager.d.ts.map +1 -0
- package/dist/services/cloud-fallback.d.ts +102 -0
- package/dist/services/cloud-fallback.d.ts.map +1 -0
- package/dist/services/context-fit.d.ts +36 -0
- package/dist/services/context-fit.d.ts.map +1 -0
- package/dist/services/conversation-registry.d.ts +142 -0
- package/dist/services/conversation-registry.d.ts.map +1 -0
- package/dist/services/desktop-fused-ffi-backend-runtime.d.ts +111 -0
- package/dist/services/desktop-fused-ffi-backend-runtime.d.ts.map +1 -0
- package/dist/services/device-bridge.d.ts +188 -0
- package/dist/services/device-bridge.d.ts.map +1 -0
- package/dist/services/device-resource-metrics.d.ts +149 -0
- package/dist/services/device-resource-metrics.d.ts.map +1 -0
- package/dist/services/device-tier.d.ts +133 -0
- package/dist/services/device-tier.d.ts.map +1 -0
- package/dist/services/downloader.d.ts +94 -0
- package/dist/services/downloader.d.ts.map +1 -0
- package/dist/services/engine.d.ts +579 -0
- package/dist/services/engine.d.ts.map +1 -0
- package/dist/services/ensure-local-artifacts.d.ts +82 -0
- package/dist/services/ensure-local-artifacts.d.ts.map +1 -0
- package/dist/services/external-scanner.d.ts +17 -0
- package/dist/services/external-scanner.d.ts.map +1 -0
- package/dist/services/ffi-llm-mock.d.ts +90 -0
- package/dist/services/ffi-llm-mock.d.ts.map +1 -0
- package/dist/services/ffi-llm-streaming-abi.d.ts +318 -0
- package/dist/services/ffi-llm-streaming-abi.d.ts.map +1 -0
- package/dist/services/ffi-streaming-backend.d.ts +201 -0
- package/dist/services/ffi-streaming-backend.d.ts.map +1 -0
- package/dist/services/ffi-streaming-runner.d.ts +146 -0
- package/dist/services/ffi-streaming-runner.d.ts.map +1 -0
- package/dist/services/gpu-autotune.d.ts +150 -0
- package/dist/services/gpu-autotune.d.ts.map +1 -0
- package/dist/services/gpu-detect.d.ts +56 -0
- package/dist/services/gpu-detect.d.ts.map +1 -0
- package/dist/services/handler-registry.d.ts +72 -0
- package/dist/services/handler-registry.d.ts.map +1 -0
- package/dist/services/hardware.d.ts +63 -0
- package/dist/services/hardware.d.ts.map +1 -0
- package/dist/services/image-description-runtime.d.ts +14 -0
- package/dist/services/image-description-runtime.d.ts.map +1 -0
- package/dist/services/imagegen/aosp-unavailable.d.ts +134 -0
- package/dist/services/imagegen/aosp-unavailable.d.ts.map +1 -0
- package/dist/services/imagegen/backend-selector.d.ts +118 -0
- package/dist/services/imagegen/backend-selector.d.ts.map +1 -0
- package/dist/services/imagegen/coreml-unavailable.d.ts +105 -0
- package/dist/services/imagegen/coreml-unavailable.d.ts.map +1 -0
- package/dist/services/imagegen/errors.d.ts +16 -0
- package/dist/services/imagegen/errors.d.ts.map +1 -0
- package/dist/services/imagegen/index.d.ts +58 -0
- package/dist/services/imagegen/index.d.ts.map +1 -0
- package/dist/services/imagegen/mflux.d.ts +74 -0
- package/dist/services/imagegen/mflux.d.ts.map +1 -0
- package/dist/services/imagegen/sd-cpp.d.ts +181 -0
- package/dist/services/imagegen/sd-cpp.d.ts.map +1 -0
- package/dist/services/imagegen/tensorrt-unavailable.d.ts +83 -0
- package/dist/services/imagegen/tensorrt-unavailable.d.ts.map +1 -0
- package/dist/services/imagegen/types.d.ts +181 -0
- package/dist/services/imagegen/types.d.ts.map +1 -0
- package/dist/services/index.d.ts +31 -0
- package/dist/services/index.d.ts.map +1 -0
- package/dist/services/index.js +39453 -0
- package/dist/services/index.js.map +227 -0
- package/dist/services/inference-capabilities.d.ts +132 -0
- package/dist/services/inference-capabilities.d.ts.map +1 -0
- package/dist/services/inference-telemetry.d.ts +59 -0
- package/dist/services/inference-telemetry.d.ts.map +1 -0
- package/dist/services/ios-llama-streaming.d.ts +119 -0
- package/dist/services/ios-llama-streaming.d.ts.map +1 -0
- package/dist/services/kv-spill.d.ts +189 -0
- package/dist/services/kv-spill.d.ts.map +1 -0
- package/dist/services/latency-trace.d.ts +346 -0
- package/dist/services/latency-trace.d.ts.map +1 -0
- package/dist/services/lib-target.d.ts +55 -0
- package/dist/services/lib-target.d.ts.map +1 -0
- package/dist/services/live-signals.d.ts +86 -0
- package/dist/services/live-signals.d.ts.map +1 -0
- package/dist/services/llama-server-metrics.d.ts +114 -0
- package/dist/services/llama-server-metrics.d.ts.map +1 -0
- package/dist/services/llm-streaming-binding.d.ts +96 -0
- package/dist/services/llm-streaming-binding.d.ts.map +1 -0
- package/dist/services/load-args.d.ts +82 -0
- package/dist/services/load-args.d.ts.map +1 -0
- package/dist/services/manifest/index.d.ts +4 -0
- package/dist/services/manifest/index.d.ts.map +1 -0
- package/dist/services/manifest/schema.d.ts +903 -0
- package/dist/services/manifest/schema.d.ts.map +1 -0
- package/dist/services/manifest/types.d.ts +32 -0
- package/dist/services/manifest/types.d.ts.map +1 -0
- package/dist/services/manifest/validator.d.ts +66 -0
- package/dist/services/manifest/validator.d.ts.map +1 -0
- package/dist/services/memory-arbiter.d.ts +348 -0
- package/dist/services/memory-arbiter.d.ts.map +1 -0
- package/dist/services/memory-benchmark.d.ts +76 -0
- package/dist/services/memory-benchmark.d.ts.map +1 -0
- package/dist/services/memory-monitor.d.ts +128 -0
- package/dist/services/memory-monitor.d.ts.map +1 -0
- package/dist/services/memory-pressure.d.ts +130 -0
- package/dist/services/memory-pressure.d.ts.map +1 -0
- package/dist/services/mtp-doctor.d.ts +13 -0
- package/dist/services/mtp-doctor.d.ts.map +1 -0
- package/dist/services/network-policy.d.ts +127 -0
- package/dist/services/network-policy.d.ts.map +1 -0
- package/dist/services/paths.d.ts +6 -0
- package/dist/services/paths.d.ts.map +1 -0
- package/dist/services/planner-skeleton.d.ts +124 -0
- package/dist/services/planner-skeleton.d.ts.map +1 -0
- package/dist/services/providers.d.ts +38 -0
- package/dist/services/providers.d.ts.map +1 -0
- package/dist/services/ram-budget.d.ts +110 -0
- package/dist/services/ram-budget.d.ts.map +1 -0
- package/dist/services/readiness.d.ts +9 -0
- package/dist/services/readiness.d.ts.map +1 -0
- package/dist/services/recommendation.d.ts +111 -0
- package/dist/services/recommendation.d.ts.map +1 -0
- package/dist/services/registry.d.ts +33 -0
- package/dist/services/registry.d.ts.map +1 -0
- package/dist/services/router-handler.d.ts +92 -0
- package/dist/services/router-handler.d.ts.map +1 -0
- package/dist/services/routing-policy.d.ts +92 -0
- package/dist/services/routing-policy.d.ts.map +1 -0
- package/dist/services/routing-preferences.d.ts +8 -0
- package/dist/services/routing-preferences.d.ts.map +1 -0
- package/dist/services/runtime-target.d.ts +98 -0
- package/dist/services/runtime-target.d.ts.map +1 -0
- package/dist/services/service.d.ts +128 -0
- package/dist/services/service.d.ts.map +1 -0
- package/dist/services/session-pool.d.ts +72 -0
- package/dist/services/session-pool.d.ts.map +1 -0
- package/dist/services/structured-output/deterministic-repair.d.ts +23 -0
- package/dist/services/structured-output/deterministic-repair.d.ts.map +1 -0
- package/dist/services/structured-output/index.d.ts +2 -0
- package/dist/services/structured-output/index.d.ts.map +1 -0
- package/dist/services/structured-output.d.ts +311 -0
- package/dist/services/structured-output.d.ts.map +1 -0
- package/dist/services/system-memory.d.ts +33 -0
- package/dist/services/system-memory.d.ts.map +1 -0
- package/dist/services/types.d.ts +19 -0
- package/dist/services/types.d.ts.map +1 -0
- package/dist/services/verify-on-device.d.ts +34 -0
- package/dist/services/verify-on-device.d.ts.map +1 -0
- package/dist/services/verify.d.ts +8 -0
- package/dist/services/verify.d.ts.map +1 -0
- package/dist/services/vision/aosp-unavailable.d.ts +115 -0
- package/dist/services/vision/aosp-unavailable.d.ts.map +1 -0
- package/dist/services/vision/capacitor-llama.d.ts +99 -0
- package/dist/services/vision/capacitor-llama.d.ts.map +1 -0
- package/dist/services/vision/cloud-fallback.d.ts +47 -0
- package/dist/services/vision/cloud-fallback.d.ts.map +1 -0
- package/dist/services/vision/hash.d.ts +71 -0
- package/dist/services/vision/hash.d.ts.map +1 -0
- package/dist/services/vision/index.d.ts +95 -0
- package/dist/services/vision/index.d.ts.map +1 -0
- package/dist/services/vision/llama-server.d.ts +73 -0
- package/dist/services/vision/llama-server.d.ts.map +1 -0
- package/dist/services/vision/types.d.ts +162 -0
- package/dist/services/vision/types.d.ts.map +1 -0
- package/dist/services/vision/vast-fallback.d.ts +18 -0
- package/dist/services/vision/vast-fallback.d.ts.map +1 -0
- package/dist/services/vision-embedding-cache.d.ts +98 -0
- package/dist/services/vision-embedding-cache.d.ts.map +1 -0
- package/dist/services/voice/__test-helpers__/fake-ffi.d.ts +27 -0
- package/dist/services/voice/__test-helpers__/fake-ffi.d.ts.map +1 -0
- package/dist/services/voice/__test-helpers__/synthetic-speech.d.ts +66 -0
- package/dist/services/voice/__test-helpers__/synthetic-speech.d.ts.map +1 -0
- package/dist/services/voice/acoustic-speaker-attribution.d.ts +61 -0
- package/dist/services/voice/acoustic-speaker-attribution.d.ts.map +1 -0
- package/dist/services/voice/audio-frame-consumer.d.ts +294 -0
- package/dist/services/voice/audio-frame-consumer.d.ts.map +1 -0
- package/dist/services/voice/barge-in.d.ts +112 -0
- package/dist/services/voice/barge-in.d.ts.map +1 -0
- package/dist/services/voice/cancellation-coordinator.d.ts +127 -0
- package/dist/services/voice/cancellation-coordinator.d.ts.map +1 -0
- package/dist/services/voice/checkpoint-manager.d.ts +199 -0
- package/dist/services/voice/checkpoint-manager.d.ts.map +1 -0
- package/dist/services/voice/checkpoint-policy.d.ts +178 -0
- package/dist/services/voice/checkpoint-policy.d.ts.map +1 -0
- package/dist/services/voice/corpus-augment.d.ts +111 -0
- package/dist/services/voice/corpus-augment.d.ts.map +1 -0
- package/dist/services/voice/corpus-generator.d.ts +134 -0
- package/dist/services/voice/corpus-generator.d.ts.map +1 -0
- package/dist/services/voice/diarization-error-rate.d.ts +40 -0
- package/dist/services/voice/diarization-error-rate.d.ts.map +1 -0
- package/dist/services/voice/e2e-harness.d.ts +297 -0
- package/dist/services/voice/e2e-harness.d.ts.map +1 -0
- package/dist/services/voice/eager-context-builder.d.ts +170 -0
- package/dist/services/voice/eager-context-builder.d.ts.map +1 -0
- package/dist/services/voice/echo-delay.d.ts +67 -0
- package/dist/services/voice/echo-delay.d.ts.map +1 -0
- package/dist/services/voice/echo-metrics.d.ts +7 -0
- package/dist/services/voice/echo-metrics.d.ts.map +1 -0
- package/dist/services/voice/echo-reference-buffer.d.ts +65 -0
- package/dist/services/voice/echo-reference-buffer.d.ts.map +1 -0
- package/dist/services/voice/eliza1-eot-scorer.d.ts +124 -0
- package/dist/services/voice/eliza1-eot-scorer.d.ts.map +1 -0
- package/dist/services/voice/embedding-server.d.ts +37 -0
- package/dist/services/voice/embedding-server.d.ts.map +1 -0
- package/dist/services/voice/embedding.d.ts +132 -0
- package/dist/services/voice/embedding.d.ts.map +1 -0
- package/dist/services/voice/emotion-attribution.d.ts +68 -0
- package/dist/services/voice/emotion-attribution.d.ts.map +1 -0
- package/dist/services/voice/engine-bridge.d.ts +762 -0
- package/dist/services/voice/engine-bridge.d.ts.map +1 -0
- package/dist/services/voice/eot-classifier-ggml.d.ts +179 -0
- package/dist/services/voice/eot-classifier-ggml.d.ts.map +1 -0
- package/dist/services/voice/eot-classifier.d.ts +211 -0
- package/dist/services/voice/eot-classifier.d.ts.map +1 -0
- package/dist/services/voice/errors.d.ts +20 -0
- package/dist/services/voice/errors.d.ts.map +1 -0
- package/dist/services/voice/expressive-tags.d.ts +158 -0
- package/dist/services/voice/expressive-tags.d.ts.map +1 -0
- package/dist/services/voice/ffi-bindings.d.ts +696 -0
- package/dist/services/voice/ffi-bindings.d.ts.map +1 -0
- package/dist/services/voice/first-line-cache.d.ts +181 -0
- package/dist/services/voice/first-line-cache.d.ts.map +1 -0
- package/dist/services/voice/fused-eot-scorer.d.ts +51 -0
- package/dist/services/voice/fused-eot-scorer.d.ts.map +1 -0
- package/dist/services/voice/index.d.ts +96 -0
- package/dist/services/voice/index.d.ts.map +1 -0
- package/dist/services/voice/kokoro/index.d.ts +24 -0
- package/dist/services/voice/kokoro/index.d.ts.map +1 -0
- package/dist/services/voice/kokoro/kokoro-backend.d.ts +87 -0
- package/dist/services/voice/kokoro/kokoro-backend.d.ts.map +1 -0
- package/dist/services/voice/kokoro/kokoro-engine-discovery.d.ts +58 -0
- package/dist/services/voice/kokoro/kokoro-engine-discovery.d.ts.map +1 -0
- package/dist/services/voice/kokoro/kokoro-ffi-runtime.d.ts +75 -0
- package/dist/services/voice/kokoro/kokoro-ffi-runtime.d.ts.map +1 -0
- package/dist/services/voice/kokoro/kokoro-runtime.d.ts +100 -0
- package/dist/services/voice/kokoro/kokoro-runtime.d.ts.map +1 -0
- package/dist/services/voice/kokoro/phoneme-stream.d.ts +51 -0
- package/dist/services/voice/kokoro/phoneme-stream.d.ts.map +1 -0
- package/dist/services/voice/kokoro/phonemizer.d.ts +50 -0
- package/dist/services/voice/kokoro/phonemizer.d.ts.map +1 -0
- package/dist/services/voice/kokoro/pick-runtime.d.ts +61 -0
- package/dist/services/voice/kokoro/pick-runtime.d.ts.map +1 -0
- package/dist/services/voice/kokoro/runtime-selection.d.ts +31 -0
- package/dist/services/voice/kokoro/runtime-selection.d.ts.map +1 -0
- package/dist/services/voice/kokoro/types.d.ts +82 -0
- package/dist/services/voice/kokoro/types.d.ts.map +1 -0
- package/dist/services/voice/kokoro/voice-presets.d.ts +23 -0
- package/dist/services/voice/kokoro/voice-presets.d.ts.map +1 -0
- package/dist/services/voice/kokoro/voices.d.ts +30 -0
- package/dist/services/voice/kokoro/voices.d.ts.map +1 -0
- package/dist/services/voice/lifecycle.d.ts +135 -0
- package/dist/services/voice/lifecycle.d.ts.map +1 -0
- package/dist/services/voice/live-diarization-session.d.ts +196 -0
- package/dist/services/voice/live-diarization-session.d.ts.map +1 -0
- package/dist/services/voice/metric-math.d.ts +10 -0
- package/dist/services/voice/metric-math.d.ts.map +1 -0
- package/dist/services/voice/mic-source.d.ts +136 -0
- package/dist/services/voice/mic-source.d.ts.map +1 -0
- package/dist/services/voice/nlms-echo-canceller.d.ts +137 -0
- package/dist/services/voice/nlms-echo-canceller.d.ts.map +1 -0
- package/dist/services/voice/optimistic-policy.d.ts +109 -0
- package/dist/services/voice/optimistic-policy.d.ts.map +1 -0
- package/dist/services/voice/optimistic-rollback.d.ts +151 -0
- package/dist/services/voice/optimistic-rollback.d.ts.map +1 -0
- package/dist/services/voice/partial-stabilizer.d.ts +73 -0
- package/dist/services/voice/partial-stabilizer.d.ts.map +1 -0
- package/dist/services/voice/phoneme-tokenizer.d.ts +49 -0
- package/dist/services/voice/phoneme-tokenizer.d.ts.map +1 -0
- package/dist/services/voice/phrase-cache.d.ts +76 -0
- package/dist/services/voice/phrase-cache.d.ts.map +1 -0
- package/dist/services/voice/phrase-chunker.d.ts +62 -0
- package/dist/services/voice/phrase-chunker.d.ts.map +1 -0
- package/dist/services/voice/pipeline-impls.d.ts +151 -0
- package/dist/services/voice/pipeline-impls.d.ts.map +1 -0
- package/dist/services/voice/pipeline.d.ts +216 -0
- package/dist/services/voice/pipeline.d.ts.map +1 -0
- package/dist/services/voice/prefill-client.d.ts +123 -0
- package/dist/services/voice/prefill-client.d.ts.map +1 -0
- package/dist/services/voice/prefix-preserving-queue.d.ts +113 -0
- package/dist/services/voice/prefix-preserving-queue.d.ts.map +1 -0
- package/dist/services/voice/profile-store.d.ts +248 -0
- package/dist/services/voice/profile-store.d.ts.map +1 -0
- package/dist/services/voice/ring-buffer.d.ts +40 -0
- package/dist/services/voice/ring-buffer.d.ts.map +1 -0
- package/dist/services/voice/rollback-queue.d.ts +24 -0
- package/dist/services/voice/rollback-queue.d.ts.map +1 -0
- package/dist/services/voice/samantha-preset-placeholder.d.ts +67 -0
- package/dist/services/voice/samantha-preset-placeholder.d.ts.map +1 -0
- package/dist/services/voice/samantha-preset-regenerator.d.ts +87 -0
- package/dist/services/voice/samantha-preset-regenerator.d.ts.map +1 -0
- package/dist/services/voice/scheduler.d.ts +146 -0
- package/dist/services/voice/scheduler.d.ts.map +1 -0
- package/dist/services/voice/self-voice-imprint.d.ts +33 -0
- package/dist/services/voice/self-voice-imprint.d.ts.map +1 -0
- package/dist/services/voice/shared-resources.d.ts +204 -0
- package/dist/services/voice/shared-resources.d.ts.map +1 -0
- package/dist/services/voice/speaker/attribution-pipeline.d.ts +74 -0
- package/dist/services/voice/speaker/attribution-pipeline.d.ts.map +1 -0
- package/dist/services/voice/speaker/diarizer-fused.d.ts +59 -0
- package/dist/services/voice/speaker/diarizer-fused.d.ts.map +1 -0
- package/dist/services/voice/speaker/diarizer.d.ts +75 -0
- package/dist/services/voice/speaker/diarizer.d.ts.map +1 -0
- package/dist/services/voice/speaker/encoder-fused.d.ts +60 -0
- package/dist/services/voice/speaker/encoder-fused.d.ts.map +1 -0
- package/dist/services/voice/speaker/encoder-ggml.d.ts +33 -0
- package/dist/services/voice/speaker/encoder-ggml.d.ts.map +1 -0
- package/dist/services/voice/speaker/encoder.d.ts +37 -0
- package/dist/services/voice/speaker/encoder.d.ts.map +1 -0
- package/dist/services/voice/speaker-imprint.d.ts +83 -0
- package/dist/services/voice/speaker-imprint.d.ts.map +1 -0
- package/dist/services/voice/speaker-preset-cache.d.ts +77 -0
- package/dist/services/voice/speaker-preset-cache.d.ts.map +1 -0
- package/dist/services/voice/streaming-asr/streaming-pipeline-adapter.d.ts +160 -0
- package/dist/services/voice/streaming-asr/streaming-pipeline-adapter.d.ts.map +1 -0
- package/dist/services/voice/system-audio-sink.d.ts +73 -0
- package/dist/services/voice/system-audio-sink.d.ts.map +1 -0
- package/dist/services/voice/transcriber.d.ts +244 -0
- package/dist/services/voice/transcriber.d.ts.map +1 -0
- package/dist/services/voice/transcript-knowledge.d.ts +37 -0
- package/dist/services/voice/transcript-knowledge.d.ts.map +1 -0
- package/dist/services/voice/transcript-service.d.ts +60 -0
- package/dist/services/voice/transcript-service.d.ts.map +1 -0
- package/dist/services/voice/transcript-store.d.ts +64 -0
- package/dist/services/voice/transcript-store.d.ts.map +1 -0
- package/dist/services/voice/turn-controller.d.ts +183 -0
- package/dist/services/voice/turn-controller.d.ts.map +1 -0
- package/dist/services/voice/types.d.ts +643 -0
- package/dist/services/voice/types.d.ts.map +1 -0
- package/dist/services/voice/vad.d.ts +283 -0
- package/dist/services/voice/vad.d.ts.map +1 -0
- package/dist/services/voice/voice-budget.d.ts +241 -0
- package/dist/services/voice/voice-budget.d.ts.map +1 -0
- package/dist/services/voice/voice-emotion-classifier.d.ts +95 -0
- package/dist/services/voice/voice-emotion-classifier.d.ts.map +1 -0
- package/dist/services/voice/voice-preload-predictor.d.ts +76 -0
- package/dist/services/voice/voice-preload-predictor.d.ts.map +1 -0
- package/dist/services/voice/voice-preset-format.d.ts +158 -0
- package/dist/services/voice/voice-preset-format.d.ts.map +1 -0
- package/dist/services/voice/voice-profile-artifact.d.ts +116 -0
- package/dist/services/voice/voice-profile-artifact.d.ts.map +1 -0
- package/dist/services/voice/voice-profile-routes.d.ts +83 -0
- package/dist/services/voice/voice-profile-routes.d.ts.map +1 -0
- package/dist/services/voice/voice-scenario.d.ts +131 -0
- package/dist/services/voice/voice-scenario.d.ts.map +1 -0
- package/dist/services/voice/voice-state-machine.d.ts +364 -0
- package/dist/services/voice/voice-state-machine.d.ts.map +1 -0
- package/dist/services/voice/voice-workbench-report.d.ts +117 -0
- package/dist/services/voice/voice-workbench-report.d.ts.map +1 -0
- package/dist/services/voice/wake-word-ggml.d.ts +100 -0
- package/dist/services/voice/wake-word-ggml.d.ts.map +1 -0
- package/dist/services/voice/wake-word.d.ts +255 -0
- package/dist/services/voice/wake-word.d.ts.map +1 -0
- package/dist/services/voice/wav-codec.d.ts +11 -0
- package/dist/services/voice/wav-codec.d.ts.map +1 -0
- package/dist/services/voice/workbench-entrypoint.d.ts +42 -0
- package/dist/services/voice/workbench-entrypoint.d.ts.map +1 -0
- package/dist/services/voice/workbench-headless-runner.d.ts +102 -0
- package/dist/services/voice/workbench-headless-runner.d.ts.map +1 -0
- package/dist/services/voice/workbench-logic-services.d.ts +36 -0
- package/dist/services/voice/workbench-logic-services.d.ts.map +1 -0
- package/dist/services/voice/workbench-real-services.d.ts +17 -0
- package/dist/services/voice/workbench-real-services.d.ts.map +1 -0
- package/dist/services/voice/workbench-scenarios.d.ts +24 -0
- package/dist/services/voice/workbench-scenarios.d.ts.map +1 -0
- package/dist/services/voice/wrap-with-first-line-cache.d.ts +70 -0
- package/dist/services/voice/wrap-with-first-line-cache.d.ts.map +1 -0
- package/dist/services/voice-model-updater.d.ts +240 -0
- package/dist/services/voice-model-updater.d.ts.map +1 -0
- package/dist/services/voice-prewarm.d.ts +3 -0
- package/dist/services/voice-prewarm.d.ts.map +1 -0
- package/dist/voice-workbench.d.ts +18 -0
- package/dist/voice-workbench.d.ts.map +1 -0
- package/dist/voice-workbench.js +5259 -0
- package/dist/voice-workbench.js.map +34 -0
- package/package.json +101 -15
- package/registry-entry.json +137 -0
- package/src/actions/generate-media.ts +647 -0
- package/src/actions/identify-speaker.ts +171 -0
- package/src/actions/transcription-control.test.ts +100 -0
- package/src/actions/transcription-control.ts +127 -0
- package/src/adapters/capacitor-llama/__tests__/compat-behavior.test.ts +218 -0
- package/src/adapters/capacitor-llama/__tests__/index.test.ts +68 -0
- package/src/adapters/capacitor-llama/__tests__/structured-output.test.ts +215 -0
- package/src/adapters/capacitor-llama/__tests__/text-streaming.test.ts +174 -0
- package/src/adapters/capacitor-llama/__tests__/voice-turn.test.ts +293 -0
- package/src/adapters/capacitor-llama/environment.ts +71 -0
- package/src/adapters/capacitor-llama/index.browser.ts +83 -0
- package/src/adapters/capacitor-llama/index.ts +831 -0
- package/src/adapters/capacitor-llama/loader.ts +109 -0
- package/src/adapters/capacitor-llama/native-voice-capture.ts +140 -0
- package/src/adapters/capacitor-llama/structured-output.ts +165 -0
- package/src/adapters/capacitor-llama/text-streaming.ts +227 -0
- package/src/adapters/capacitor-llama/types.ts +374 -0
- package/src/adapters/capacitor-llama/voice-turn.ts +178 -0
- package/src/backends/apple-foundation.ts +127 -0
- package/src/index.ts +62 -0
- package/src/local-inference-routes.test.ts +390 -0
- package/src/local-inference-routes.ts +1625 -0
- package/src/provider.ts +1111 -0
- package/src/routes/compat-helpers.ts +275 -0
- package/src/routes/family-member-route.ts +353 -0
- package/src/routes/index.ts +61 -0
- package/src/routes/live-diarization-route.test.ts +347 -0
- package/src/routes/live-diarization-route.ts +198 -0
- package/src/routes/local-inference-asr-route.test.ts +246 -0
- package/src/routes/local-inference-asr-route.ts +166 -0
- package/src/routes/local-inference-asr-transcribe.test.ts +118 -0
- package/src/routes/local-inference-asr-transcribe.ts +97 -0
- package/src/routes/local-inference-compat-routes.test.ts +485 -0
- package/src/routes/local-inference-compat-routes.ts +775 -0
- package/src/routes/local-inference-tts-route.test.ts +179 -0
- package/src/routes/local-inference-tts-route.ts +230 -0
- package/src/routes/native-pcm-turn-route.test.ts +136 -0
- package/src/routes/native-pcm-turn-route.ts +121 -0
- package/src/routes/transcript-audio-store.ts +27 -0
- package/src/routes/transcripts-routes.test.ts +195 -0
- package/src/routes/transcripts-routes.ts +191 -0
- package/src/routes/voice-first-run-routes.ts +524 -0
- package/src/routes/voice-models-routes.ts +554 -0
- package/src/routes/voice-profile-plugin-routes.ts +138 -0
- package/src/routes/voice-profiles-management-routes.ts +476 -0
- package/src/routes/voice-speaker-profile-routes.ts +199 -0
- package/src/runtime/aosp-llama-loader-selection.test.ts +80 -0
- package/src/runtime/bionic-wire-encoding.test.ts +147 -0
- package/src/runtime/capacitor-llama.d.ts +25 -0
- package/src/runtime/embedding-manager-support.ts +497 -0
- package/src/runtime/embedding-presets.ts +81 -0
- package/src/runtime/embedding-warmup-policy.test.ts +53 -0
- package/src/runtime/embedding-warmup-policy.ts +48 -0
- package/src/runtime/ensure-local-inference-handler.test.ts +726 -0
- package/src/runtime/ensure-local-inference-handler.ts +1640 -0
- package/src/runtime/index.ts +36 -0
- package/src/runtime/mobile-local-inference-gate.test.ts +152 -0
- package/src/runtime/mobile-local-inference-gate.ts +99 -0
- package/src/runtime/voice-entity-binding.transcript.test.ts +98 -0
- package/src/runtime/voice-entity-binding.ts +368 -0
- package/src/runtime/voice-speaker-entity-contract.test.ts +149 -0
- package/src/services/README.md +71 -0
- package/src/services/__tests__/backend-selector.precedence.test.ts +333 -0
- package/src/services/__tests__/backend-selector.test.ts +101 -0
- package/src/services/__tests__/checkpoint-manager.test.ts +376 -0
- package/src/services/__tests__/gpu-autotune.test.ts +400 -0
- package/src/services/__tests__/llm-streaming-binding.test.ts +85 -0
- package/src/services/__tests__/planner-grammar.test.ts +372 -0
- package/src/services/__tests__/runtime-target.test.ts +176 -0
- package/src/services/active-model-context-fit.test.ts +125 -0
- package/src/services/active-model-switch-rollback.test.ts +183 -0
- package/src/services/active-model.ts +1416 -0
- package/src/services/asr-provenance.ts +68 -0
- package/src/services/assignment-validation.test.ts +118 -0
- package/src/services/assignments.test.ts +106 -0
- package/src/services/assignments.ts +278 -0
- package/src/services/backend-selector.ts +95 -0
- package/src/services/backend.test.ts +84 -0
- package/src/services/backend.ts +791 -0
- package/src/services/bionic-host-loader.test.ts +226 -0
- package/src/services/bionic-host-loader.ts +252 -0
- package/src/services/bundled-models.ts +129 -0
- package/src/services/cache-bridge.test.ts +516 -0
- package/src/services/cache-bridge.ts +423 -0
- package/src/services/catalog.test.ts +259 -0
- package/src/services/catalog.ts +33 -0
- package/src/services/checkpoint-client.ts +258 -0
- package/src/services/checkpoint-manager.ts +474 -0
- package/src/services/cloud-fallback.ts +230 -0
- package/src/services/context-fit.test.ts +121 -0
- package/src/services/context-fit.ts +113 -0
- package/src/services/conversation-registry.test.ts +235 -0
- package/src/services/conversation-registry.ts +264 -0
- package/src/services/desktop-fused-ffi-backend-runtime.ts +431 -0
- package/src/services/device-bridge.ts +1237 -0
- package/src/services/device-resource-metrics.test.ts +98 -0
- package/src/services/device-resource-metrics.ts +346 -0
- package/src/services/device-tier.test.ts +458 -0
- package/src/services/device-tier.ts +502 -0
- package/src/services/downloader.test.ts +888 -0
- package/src/services/downloader.ts +1039 -0
- package/src/services/engine-direct-bundle.test.ts +90 -0
- package/src/services/engine-streaming.test.ts +80 -0
- package/src/services/engine.ts +2096 -0
- package/src/services/ensure-local-artifacts.integration.test.ts +273 -0
- package/src/services/ensure-local-artifacts.test.ts +368 -0
- package/src/services/ensure-local-artifacts.ts +351 -0
- package/src/services/external-scanner.ts +312 -0
- package/src/services/ffi-llm-mock.ts +354 -0
- package/src/services/ffi-llm-streaming-abi.ts +445 -0
- package/src/services/ffi-streaming-backend.ts +418 -0
- package/src/services/ffi-streaming-runner.test.ts +220 -0
- package/src/services/ffi-streaming-runner.ts +407 -0
- package/src/services/ffi-unload-ordering.test.ts +166 -0
- package/src/services/fused-eliza1-no-regression.test.ts +144 -0
- package/src/services/gpu-autotune.ts +534 -0
- package/src/services/gpu-detect.ts +139 -0
- package/src/services/handler-registry.ts +240 -0
- package/src/services/hardware.test.ts +236 -0
- package/src/services/hardware.ts +438 -0
- package/src/services/image-description-runtime.test.ts +61 -0
- package/src/services/image-description-runtime.ts +118 -0
- package/src/services/imagegen/aosp-unavailable.ts +229 -0
- package/src/services/imagegen/backend-selector.test.ts +190 -0
- package/src/services/imagegen/backend-selector.ts +277 -0
- package/src/services/imagegen/coreml-unavailable.ts +237 -0
- package/src/services/imagegen/errors.ts +40 -0
- package/src/services/imagegen/index.ts +144 -0
- package/src/services/imagegen/mflux.ts +313 -0
- package/src/services/imagegen/sd-cpp.ts +715 -0
- package/src/services/imagegen/tensorrt-unavailable.ts +295 -0
- package/src/services/imagegen/types.ts +193 -0
- package/src/services/index.ts +229 -0
- package/src/services/inference-capabilities.test.ts +75 -0
- package/src/services/inference-capabilities.ts +204 -0
- package/src/services/inference-telemetry.ts +143 -0
- package/src/services/ios-llama-streaming.ts +248 -0
- package/src/services/kv-spill.test.ts +222 -0
- package/src/services/kv-spill.ts +357 -0
- package/src/services/latency-trace.test.ts +266 -0
- package/src/services/latency-trace.ts +844 -0
- package/src/services/lib-target.test.ts +145 -0
- package/src/services/lib-target.ts +102 -0
- package/src/services/live-signals.test.ts +132 -0
- package/src/services/live-signals.ts +177 -0
- package/src/services/llama-server-metrics.test.ts +168 -0
- package/src/services/llama-server-metrics.ts +304 -0
- package/src/services/llm-streaming-binding.ts +136 -0
- package/src/services/load-args.ts +81 -0
- package/src/services/manifest/eliza-1.manifest.v1.json +790 -0
- package/src/services/manifest/index.ts +72 -0
- package/src/services/manifest/manifest.test.ts +791 -0
- package/src/services/manifest/schema.ts +761 -0
- package/src/services/manifest/types.ts +61 -0
- package/src/services/manifest/validator.ts +633 -0
- package/src/services/memory-arbiter.test.ts +558 -0
- package/src/services/memory-arbiter.ts +991 -0
- package/src/services/memory-benchmark.test.ts +91 -0
- package/src/services/memory-benchmark.ts +354 -0
- package/src/services/memory-monitor.test.ts +232 -0
- package/src/services/memory-monitor.ts +309 -0
- package/src/services/memory-pressure.ts +414 -0
- package/src/services/mtp-doctor.ts +86 -0
- package/src/services/network-policy.ts +346 -0
- package/src/services/paths.ts +25 -0
- package/src/services/planner-skeleton.ts +175 -0
- package/src/services/providers.ts +507 -0
- package/src/services/ram-budget-cache.test.ts +164 -0
- package/src/services/ram-budget.ts +309 -0
- package/src/services/readiness.test.ts +87 -0
- package/src/services/readiness.ts +238 -0
- package/src/services/recommendation.test.ts +216 -0
- package/src/services/recommendation.ts +671 -0
- package/src/services/registry.ts +157 -0
- package/src/services/required-kernels-gate.test.ts +64 -0
- package/src/services/router-handler.test.ts +45 -0
- package/src/services/router-handler.ts +426 -0
- package/src/services/routing-policy.test.ts +352 -0
- package/src/services/routing-policy.ts +367 -0
- package/src/services/routing-preferences.ts +17 -0
- package/src/services/runtime-target.ts +154 -0
- package/src/services/service.test.ts +223 -0
- package/src/services/service.ts +750 -0
- package/src/services/session-pool.ts +153 -0
- package/src/services/structured-output/deterministic-repair.test.ts +169 -0
- package/src/services/structured-output/deterministic-repair.ts +443 -0
- package/src/services/structured-output/index.ts +4 -0
- package/src/services/structured-output.test.ts +483 -0
- package/src/services/structured-output.ts +712 -0
- package/src/services/system-memory.test.ts +47 -0
- package/src/services/system-memory.ts +67 -0
- package/src/services/transcription-priority.test.ts +211 -0
- package/src/services/types.ts +59 -0
- package/src/services/verify-on-device.test.ts +87 -0
- package/src/services/verify-on-device.ts +127 -0
- package/src/services/verify.ts +13 -0
- package/src/services/vision/aosp-unavailable.ts +163 -0
- package/src/services/vision/capacitor-llama.ts +255 -0
- package/src/services/vision/cloud-fallback.test.ts +243 -0
- package/src/services/vision/cloud-fallback.ts +268 -0
- package/src/services/vision/fallback-chain.test.ts +86 -0
- package/src/services/vision/hash.ts +157 -0
- package/src/services/vision/index.ts +251 -0
- package/src/services/vision/llama-server.ts +177 -0
- package/src/services/vision/types.ts +163 -0
- package/src/services/vision/vast-fallback.ts +127 -0
- package/src/services/vision-embedding-cache.ts +189 -0
- package/src/services/voice/VOICE_WORKBENCH.md +133 -0
- package/src/services/voice/__fixtures__/voice-workbench-logic-baseline.json +180 -0
- package/src/services/voice/__test-helpers__/fake-ffi.ts +94 -0
- package/src/services/voice/__test-helpers__/synthetic-speech.ts +194 -0
- package/src/services/voice/__tests__/checkpoint-manager.test.ts +241 -0
- package/src/services/voice/__tests__/checkpoint-policy.test.ts +270 -0
- package/src/services/voice/__tests__/eager-context-builder.test.ts +257 -0
- package/src/services/voice/__tests__/eliza1-eot-scorer.test.ts +288 -0
- package/src/services/voice/__tests__/eot-classifier.test.ts +431 -0
- package/src/services/voice/__tests__/optimistic-rollback.test.ts +312 -0
- package/src/services/voice/__tests__/prefill-client.test.ts +266 -0
- package/src/services/voice/__tests__/prefix-preserving-queue.test.ts +208 -0
- package/src/services/voice/__tests__/streaming-asr.test.ts +450 -0
- package/src/services/voice/__tests__/streaming-transcriber.test.ts +339 -0
- package/src/services/voice/__tests__/turn-detector-resolver.test.ts +195 -0
- package/src/services/voice/__tests__/voice-state-machine-prefill.test.ts +275 -0
- package/src/services/voice/__tests__/voice-state-machine.test.ts +354 -0
- package/src/services/voice/acoustic-speaker-attribution.test.ts +165 -0
- package/src/services/voice/acoustic-speaker-attribution.ts +336 -0
- package/src/services/voice/asr-timed.real.test.ts +139 -0
- package/src/services/voice/audio-frame-consumer.test.ts +669 -0
- package/src/services/voice/audio-frame-consumer.ts +651 -0
- package/src/services/voice/barge-in.test.ts +244 -0
- package/src/services/voice/barge-in.ts +335 -0
- package/src/services/voice/cancellation-coordinator.test.ts +196 -0
- package/src/services/voice/cancellation-coordinator.ts +269 -0
- package/src/services/voice/checkpoint-manager.ts +401 -0
- package/src/services/voice/checkpoint-policy.ts +336 -0
- package/src/services/voice/composite-eot-classifier.test.ts +59 -0
- package/src/services/voice/corpus-augment.test.ts +276 -0
- package/src/services/voice/corpus-augment.ts +451 -0
- package/src/services/voice/corpus-generator.test.ts +201 -0
- package/src/services/voice/corpus-generator.ts +413 -0
- package/src/services/voice/diarization-error-rate.greedy.test.ts +140 -0
- package/src/services/voice/diarization-error-rate.test.ts +100 -0
- package/src/services/voice/diarization-error-rate.ts +249 -0
- package/src/services/voice/e2e-harness.der.test.ts +94 -0
- package/src/services/voice/e2e-harness.respond-eot-entity.test.ts +277 -0
- package/src/services/voice/e2e-harness.security-echo.test.ts +103 -0
- package/src/services/voice/e2e-harness.test.ts +182 -0
- package/src/services/voice/e2e-harness.ts +902 -0
- package/src/services/voice/eager-context-builder.ts +262 -0
- package/src/services/voice/echo-delay.test.ts +118 -0
- package/src/services/voice/echo-delay.ts +135 -0
- package/src/services/voice/echo-metrics.test.ts +17 -0
- package/src/services/voice/echo-metrics.ts +20 -0
- package/src/services/voice/echo-reference-buffer.test.ts +86 -0
- package/src/services/voice/echo-reference-buffer.ts +165 -0
- package/src/services/voice/eliza1-eot-scorer.ts +242 -0
- package/src/services/voice/embedding-server.ts +200 -0
- package/src/services/voice/embedding.test.ts +131 -0
- package/src/services/voice/embedding.ts +242 -0
- package/src/services/voice/emotion-attribution.test.ts +129 -0
- package/src/services/voice/emotion-attribution.ts +361 -0
- package/src/services/voice/engine-bridge-cancellation.test.ts +422 -0
- package/src/services/voice/engine-bridge-transcript-join.test.ts +278 -0
- package/src/services/voice/engine-bridge.test.ts +384 -0
- package/src/services/voice/engine-bridge.ts +2343 -0
- package/src/services/voice/eot-classifier-ggml.ts +569 -0
- package/src/services/voice/eot-classifier.test.ts +98 -0
- package/src/services/voice/eot-classifier.ts +422 -0
- package/src/services/voice/errors.ts +34 -0
- package/src/services/voice/expressive-tags.asr.test.ts +77 -0
- package/src/services/voice/expressive-tags.test.ts +102 -0
- package/src/services/voice/expressive-tags.ts +405 -0
- package/src/services/voice/ffi-bindings.test.ts +735 -0
- package/src/services/voice/ffi-bindings.ts +3387 -0
- package/src/services/voice/first-line-cache.ts +725 -0
- package/src/services/voice/fused-eot-scorer.ts +139 -0
- package/src/services/voice/index.ts +502 -0
- package/src/services/voice/kokoro/__tests__/kokoro-backend.test.ts +262 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.real.test.ts +236 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.test.ts +60 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-discovery.test.ts +277 -0
- package/src/services/voice/kokoro/__tests__/kokoro-ffi-runtime.test.ts +235 -0
- package/src/services/voice/kokoro/__tests__/kokoro-runtime.test.ts +95 -0
- package/src/services/voice/kokoro/__tests__/phonemizer.test.ts +53 -0
- package/src/services/voice/kokoro/__tests__/runtime-selection.test.ts +67 -0
- package/src/services/voice/kokoro/__tests__/voices.test.ts +57 -0
- package/src/services/voice/kokoro/index.ts +79 -0
- package/src/services/voice/kokoro/kokoro-backend.ts +223 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.ts +177 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.ts +233 -0
- package/src/services/voice/kokoro/kokoro-runtime.ts +170 -0
- package/src/services/voice/kokoro/phoneme-stream.ts +123 -0
- package/src/services/voice/kokoro/phonemizer.ts +344 -0
- package/src/services/voice/kokoro/pick-runtime.test.ts +91 -0
- package/src/services/voice/kokoro/pick-runtime.ts +130 -0
- package/src/services/voice/kokoro/runtime-selection.ts +64 -0
- package/src/services/voice/kokoro/types.ts +95 -0
- package/src/services/voice/kokoro/voice-presets.ts +129 -0
- package/src/services/voice/kokoro/voices.ts +64 -0
- package/src/services/voice/lifecycle.test.ts +315 -0
- package/src/services/voice/lifecycle.ts +301 -0
- package/src/services/voice/live-diarization-session.echo.test.ts +232 -0
- package/src/services/voice/live-diarization-session.ts +622 -0
- package/src/services/voice/metric-math.test.ts +61 -0
- package/src/services/voice/metric-math.ts +25 -0
- package/src/services/voice/mic-source.test.ts +210 -0
- package/src/services/voice/mic-source.ts +503 -0
- package/src/services/voice/nlms-echo-canceller.test.ts +244 -0
- package/src/services/voice/nlms-echo-canceller.ts +317 -0
- package/src/services/voice/optimistic-policy.power-source.test.ts +36 -0
- package/src/services/voice/optimistic-policy.test.ts +101 -0
- package/src/services/voice/optimistic-policy.ts +192 -0
- package/src/services/voice/optimistic-rollback.ts +343 -0
- package/src/services/voice/partial-stabilizer.test.ts +68 -0
- package/src/services/voice/partial-stabilizer.ts +140 -0
- package/src/services/voice/phoneme-tokenizer.ts +158 -0
- package/src/services/voice/phrase-cache.test.ts +242 -0
- package/src/services/voice/phrase-cache.ts +186 -0
- package/src/services/voice/phrase-chunker.test.ts +239 -0
- package/src/services/voice/phrase-chunker.ts +281 -0
- package/src/services/voice/pipeline-impls.l6.test.ts +110 -0
- package/src/services/voice/pipeline-impls.test.ts +292 -0
- package/src/services/voice/pipeline-impls.ts +315 -0
- package/src/services/voice/pipeline.ts +504 -0
- package/src/services/voice/prefill-client.ts +316 -0
- package/src/services/voice/prefix-preserving-queue.ts +162 -0
- package/src/services/voice/profile-store.ts +887 -0
- package/src/services/voice/real-audio-decode.test.ts +148 -0
- package/src/services/voice/research/VOICE_8785_ASSESSMENT.md +141 -0
- package/src/services/voice/research/VOICE_PIPELINE_RESEARCH_2026.md +117 -0
- package/src/services/voice/research/VOICE_VALIDATION_RUNBOOK.md +135 -0
- package/src/services/voice/ring-buffer.test.ts +129 -0
- package/src/services/voice/ring-buffer.ts +123 -0
- package/src/services/voice/rollback-queue.ts +74 -0
- package/src/services/voice/samantha-preset-placeholder.test.ts +97 -0
- package/src/services/voice/samantha-preset-placeholder.ts +148 -0
- package/src/services/voice/samantha-preset-regenerator.ts +393 -0
- package/src/services/voice/samantha-preset-regenerator.wav.test.ts +90 -0
- package/src/services/voice/scheduler.t2.test.ts +141 -0
- package/src/services/voice/scheduler.ts +927 -0
- package/src/services/voice/self-voice-imprint.test.ts +59 -0
- package/src/services/voice/self-voice-imprint.ts +102 -0
- package/src/services/voice/shared-resources.ts +343 -0
- package/src/services/voice/speaker/attribution-pipeline.test.ts +221 -0
- package/src/services/voice/speaker/attribution-pipeline.ts +449 -0
- package/src/services/voice/speaker/diarizer-fused.real.test.ts +100 -0
- package/src/services/voice/speaker/diarizer-fused.ts +154 -0
- package/src/services/voice/speaker/diarizer.ts +218 -0
- package/src/services/voice/speaker/encoder-fused.real.test.ts +113 -0
- package/src/services/voice/speaker/encoder-fused.ts +138 -0
- package/src/services/voice/speaker/encoder-ggml.test.ts +59 -0
- package/src/services/voice/speaker/encoder-ggml.ts +79 -0
- package/src/services/voice/speaker/encoder.ts +105 -0
- package/src/services/voice/speaker-imprint.test.ts +185 -0
- package/src/services/voice/speaker-imprint.ts +312 -0
- package/src/services/voice/speaker-preset-cache.test.ts +154 -0
- package/src/services/voice/speaker-preset-cache.ts +195 -0
- package/src/services/voice/streaming-asr/streaming-pipeline-adapter.ts +292 -0
- package/src/services/voice/system-audio-sink.test.ts +29 -0
- package/src/services/voice/system-audio-sink.ts +366 -0
- package/src/services/voice/transcriber.asr-backend.test.ts +76 -0
- package/src/services/voice/transcriber.test.ts +392 -0
- package/src/services/voice/transcriber.ts +704 -0
- package/src/services/voice/transcript-knowledge.test.ts +68 -0
- package/src/services/voice/transcript-knowledge.ts +75 -0
- package/src/services/voice/transcript-service.test.ts +195 -0
- package/src/services/voice/transcript-service.ts +205 -0
- package/src/services/voice/transcript-store.test.ts +189 -0
- package/src/services/voice/transcript-store.ts +164 -0
- package/src/services/voice/turn-controller.test.ts +575 -0
- package/src/services/voice/turn-controller.ts +596 -0
- package/src/services/voice/types.ts +699 -0
- package/src/services/voice/vad.test.ts +498 -0
- package/src/services/voice/vad.ts +832 -0
- package/src/services/voice/vad.v1-v4.test.ts +222 -0
- package/src/services/voice/voice-budget.test.ts +415 -0
- package/src/services/voice/voice-budget.ts +635 -0
- package/src/services/voice/voice-duet.test.ts +375 -0
- package/src/services/voice/voice-emotion-classifier.test.ts +210 -0
- package/src/services/voice/voice-emotion-classifier.ts +273 -0
- package/src/services/voice/voice-hardening.fuzz.test.ts +116 -0
- package/src/services/voice/voice-preload-predictor.test.ts +130 -0
- package/src/services/voice/voice-preload-predictor.ts +113 -0
- package/src/services/voice/voice-preset-format.fuzz.test.ts +89 -0
- package/src/services/voice/voice-preset-format.test.ts +75 -0
- package/src/services/voice/voice-preset-format.ts +713 -0
- package/src/services/voice/voice-preset-generator.test.ts +89 -0
- package/src/services/voice/voice-profile-artifact.test.ts +138 -0
- package/src/services/voice/voice-profile-artifact.ts +518 -0
- package/src/services/voice/voice-profile-routes.test.ts +429 -0
- package/src/services/voice/voice-profile-routes.ts +425 -0
- package/src/services/voice/voice-scenario.test.ts +159 -0
- package/src/services/voice/voice-scenario.ts +280 -0
- package/src/services/voice/voice-scenario.turn-helpers.test.ts +77 -0
- package/src/services/voice/voice-state-machine.ts +727 -0
- package/src/services/voice/voice-workbench-report.test.ts +168 -0
- package/src/services/voice/voice-workbench-report.ts +367 -0
- package/src/services/voice/voice-workbench.test.ts +158 -0
- package/src/services/voice/voice.test.ts +1070 -0
- package/src/services/voice/wake-word-ggml.ts +319 -0
- package/src/services/voice/wake-word.test.ts +298 -0
- package/src/services/voice/wake-word.ts +554 -0
- package/src/services/voice/wav-codec.fuzz.test.ts +59 -0
- package/src/services/voice/wav-codec.test.ts +32 -0
- package/src/services/voice/wav-codec.ts +101 -0
- package/src/services/voice/workbench-entrypoint.test.ts +55 -0
- package/src/services/voice/workbench-entrypoint.ts +88 -0
- package/src/services/voice/workbench-headless-runner.test.ts +162 -0
- package/src/services/voice/workbench-headless-runner.ts +396 -0
- package/src/services/voice/workbench-logic-services.test.ts +225 -0
- package/src/services/voice/workbench-logic-services.ts +184 -0
- package/src/services/voice/workbench-real-services.ts +629 -0
- package/src/services/voice/workbench-scenarios.ts +407 -0
- package/src/services/voice/wrap-with-first-line-cache.ts +267 -0
- package/src/services/voice-model-updater.ts +724 -0
- package/src/services/voice-prewarm.ts +51 -0
- package/src/voice-workbench.ts +71 -0
|
@@ -0,0 +1,2343 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Engine ↔ voice scheduler bridge.
|
|
3
|
+
*
|
|
4
|
+
* Adapts the live `LocalInferenceEngine` (`engine.ts`) plus the MTP
|
|
5
|
+
* llama-server (`ffi-streaming-backend.ts`) onto the voice scaffold's
|
|
6
|
+
* `VoiceScheduler`. See `packages/inference/AGENTS.md` §4 for the
|
|
7
|
+
* streaming graph this implements:
|
|
8
|
+
*
|
|
9
|
+
* ASR → text tokens → MTP drafter ↔ target verifier (text model)
|
|
10
|
+
* → phrase chunker → speaker preset cache + phrase cache
|
|
11
|
+
* → OmniVoice TTS → PCM ring buffer → audio out
|
|
12
|
+
*
|
|
13
|
+
* Plus rollback queue (MTP rejection → cancel pending TTS chunks)
|
|
14
|
+
* and barge-in cancellation (mic VAD → drain ring buffer + cancel TTS).
|
|
15
|
+
*
|
|
16
|
+
* Two TTS backends are exposed:
|
|
17
|
+
* - `StubOmniVoiceBackend`: deterministic synthetic PCM. Used by tests
|
|
18
|
+
* and any path that wants the streaming graph without real audio.
|
|
19
|
+
* - `FfiOmniVoiceBackend`: forwards through the fused
|
|
20
|
+
* `libelizainference.{dylib,so,dll}` ABI. The bridge creates the
|
|
21
|
+
* context lazily when voice is armed or first used, so voice-off
|
|
22
|
+
* does not keep OmniVoice weights resident.
|
|
23
|
+
*
|
|
24
|
+
* Per AGENTS.md §3 + §9 (no defensive code, no log-and-continue), every
|
|
25
|
+
* startup precondition surfaces as a thrown `VoiceStartupError`. There
|
|
26
|
+
* is no silent fallback to text-only.
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
import { existsSync, readdirSync, statSync } from "node:fs";
|
|
30
|
+
import os from "node:os";
|
|
31
|
+
import path from "node:path";
|
|
32
|
+
import type { IAgentRuntime } from "@elizaos/core";
|
|
33
|
+
import { logger } from "@elizaos/core";
|
|
34
|
+
import type { VoiceCancellationReason } from "@elizaos/shared";
|
|
35
|
+
import { localInferenceRoot } from "../paths";
|
|
36
|
+
import {
|
|
37
|
+
type CoordinatorRuntime,
|
|
38
|
+
VoiceCancellationCoordinator,
|
|
39
|
+
} from "./cancellation-coordinator";
|
|
40
|
+
import { VoiceStartupError } from "./errors";
|
|
41
|
+
import type {
|
|
42
|
+
AsrWordTiming,
|
|
43
|
+
ElizaInferenceContextHandle,
|
|
44
|
+
ElizaInferenceFfi,
|
|
45
|
+
NativeVerifierEvent,
|
|
46
|
+
} from "./ffi-bindings";
|
|
47
|
+
import { loadElizaInferenceFfi } from "./ffi-bindings";
|
|
48
|
+
import { KokoroTtsBackend } from "./kokoro/kokoro-backend";
|
|
49
|
+
import type { KokoroEngineDiscoveryResult } from "./kokoro/kokoro-engine-discovery";
|
|
50
|
+
import { pickKokoroRuntimeBackend } from "./kokoro/pick-runtime";
|
|
51
|
+
import {
|
|
52
|
+
VoiceLifecycle,
|
|
53
|
+
VoiceLifecycleError,
|
|
54
|
+
type VoiceLifecycleLoaders,
|
|
55
|
+
} from "./lifecycle";
|
|
56
|
+
import {
|
|
57
|
+
OptimisticGenerationPolicy,
|
|
58
|
+
type OptimisticPolicyOptions,
|
|
59
|
+
resolvePowerSourceState,
|
|
60
|
+
} from "./optimistic-policy";
|
|
61
|
+
import {
|
|
62
|
+
type CachedPhraseAudio,
|
|
63
|
+
DEFAULT_PHRASE_CACHE_SEED,
|
|
64
|
+
FIRST_AUDIO_FILLERS,
|
|
65
|
+
PhraseCache,
|
|
66
|
+
} from "./phrase-cache";
|
|
67
|
+
import {
|
|
68
|
+
VoicePipeline,
|
|
69
|
+
type VoicePipelineConfig,
|
|
70
|
+
type VoicePipelineDeps,
|
|
71
|
+
type VoicePipelineEvents,
|
|
72
|
+
} from "./pipeline";
|
|
73
|
+
import {
|
|
74
|
+
MissingAsrTranscriber,
|
|
75
|
+
MtpDraftProposer,
|
|
76
|
+
MtpTargetVerifier,
|
|
77
|
+
type MtpTextRunner,
|
|
78
|
+
} from "./pipeline-impls";
|
|
79
|
+
import type { VoiceProfileStore } from "./profile-store";
|
|
80
|
+
import { type SchedulerEvents, VoiceScheduler } from "./scheduler";
|
|
81
|
+
import { AgentSelfVoiceImprint } from "./self-voice-imprint";
|
|
82
|
+
import {
|
|
83
|
+
type MmapRegionHandle,
|
|
84
|
+
SharedResourceRegistry,
|
|
85
|
+
} from "./shared-resources";
|
|
86
|
+
import {
|
|
87
|
+
type VoiceAttributionOutput,
|
|
88
|
+
VoiceAttributionPipeline,
|
|
89
|
+
} from "./speaker/attribution-pipeline";
|
|
90
|
+
import {
|
|
91
|
+
type Diarizer,
|
|
92
|
+
PYANNOTE_SEGMENTATION_3_INT8_MODEL_ID,
|
|
93
|
+
} from "./speaker/diarizer";
|
|
94
|
+
import { FusedDiarizer } from "./speaker/diarizer-fused";
|
|
95
|
+
import type { SpeakerEncoder } from "./speaker/encoder";
|
|
96
|
+
import { FusedSpeakerEncoder } from "./speaker/encoder-fused";
|
|
97
|
+
import {
|
|
98
|
+
SPEAKER_GGML_EMBEDDING_DIM,
|
|
99
|
+
SPEAKER_GGML_SAMPLE_RATE,
|
|
100
|
+
} from "./speaker/encoder-ggml";
|
|
101
|
+
import {
|
|
102
|
+
DEFAULT_VOICE_PRESET_REL_PATH,
|
|
103
|
+
SpeakerPresetCache,
|
|
104
|
+
} from "./speaker-preset-cache";
|
|
105
|
+
import {
|
|
106
|
+
ASR_SAMPLE_RATE,
|
|
107
|
+
AsrUnavailableError,
|
|
108
|
+
createStreamingTranscriber,
|
|
109
|
+
resampleLinear,
|
|
110
|
+
} from "./transcriber";
|
|
111
|
+
import type {
|
|
112
|
+
AudioChunk,
|
|
113
|
+
AudioSink,
|
|
114
|
+
OmniVoiceBackend,
|
|
115
|
+
Phrase,
|
|
116
|
+
RejectedTokenRange,
|
|
117
|
+
SchedulerConfig,
|
|
118
|
+
SpeakerPreset,
|
|
119
|
+
StreamingTranscriber,
|
|
120
|
+
TextToken,
|
|
121
|
+
TranscriptionAudio,
|
|
122
|
+
VadEventSource,
|
|
123
|
+
} from "./types";
|
|
124
|
+
import { decodeMonoPcm16Wav, encodeMonoPcm16Wav } from "./wav-codec";
|
|
125
|
+
|
|
126
|
+
/** Sample rate in Hz used when a caller does not pass one explicitly. */
const SAMPLE_RATE_DEFAULT = 24000;
/** Default ring-buffer capacity in samples: four seconds at the default rate. */
const RING_BUFFER_CAPACITY_DEFAULT = 4 * SAMPLE_RATE_DEFAULT;
/**
 * Default upper bound on tokens per phrase when no punctuation arrives
 * (`PhraseChunker.maxTokensPerPhrase`).
 *
 * Punctuation (`, . ! ?`) remains the primary phrase boundary; this cap
 * only limits a run-on token stream. It is deliberately small and matches
 * the MTP draft window (`DEFAULT_VOICE_MAX_DRAFT_TOKENS` in `engine.ts`):
 *   - first-audio latency stays bounded, because a phrase is roughly one
 *     draft round of audio rather than 30 words;
 *   - an MTP-reject rollback discards at most one un-spoken chunk
 *     (AGENTS.md §4 — "small chunk = low latency cost on rollback").
 *
 * A bridge can override it through `maxTokensPerPhrase` or the
 * `ELIZA_VOICE_MAX_TOKENS_PER_PHRASE` environment variable. The
 * `PhraseChunker` primitive itself keeps the AGENTS-spec 30-word default
 * for non-runtime callers.
 */
const PHRASE_MAX_TOKENS_DEFAULT = 8;
/** Length, in milliseconds, of the synthetic PCM the stub backend returns per phrase. */
const STUB_PCM_MS_PER_PHRASE = 100;
/**
 * Chunk count for the stub backend's streaming output.
 * NOTE(review): the consumer of this constant is outside this excerpt —
 * confirm against `StubOmniVoiceBackend.synthesizeStream`.
 */
const STUB_PCM_STREAM_CHUNKS = 4;
|
|
143
|
+
|
|
144
|
+
/**
 * Choose the `speaker_preset_id` that is passed across the FFI boundary.
 *
 * Background: the default voice used to map to `null`, which the C side
 * interpreted as "auto-voice mode", skipping any preset file at
 * `cache/voice-preset-default.bin`. That was correct while the default
 * preset was a 256-fp32-zero placeholder, but not now that the default can
 * be a real (v2) OmniVoice sam freeze. With ABI v4 the FFI bridge resolves
 * `<bundle>/cache/voice-preset-<id>.bin` whenever an id is supplied and
 * feeds the `(instruct, ref_audio_tokens, ref_text)` triple into
 * `ov_tts_params`, so the id is passed in every non-degenerate case.
 *
 * A preset is degenerate when it carries no instruct text, no reference
 * text, no reference audio tokens and an empty embedding. Only lengths are
 * inspected: an embedding with at least one element counts as present
 * whatever its values are.
 *
 * @param preset Speaker preset about to be used for synthesis.
 * @returns `preset.voiceId`, or `null` for a degenerate preset — the FFI
 *   side honours `null` by running OmniVoice's intrinsic auto-voice path.
 */
function ffiSpeakerPresetId(preset: SpeakerPreset): string | null {
  const { instruct, refText, refAudioTokens, embedding } = preset;

  // An optional text field counts only when it is set and non-empty.
  const isFilled = (field: { length: number } | undefined): boolean =>
    field !== undefined && field.length > 0;

  const carriesV2Payload =
    isFilled(instruct) ||
    isFilled(refText) ||
    (refAudioTokens !== undefined && refAudioTokens.tokens.length > 0);
  const carriesEmbedding = embedding.length > 0;

  if (carriesV2Payload || carriesEmbedding) {
    return preset.voiceId;
  }
  // Nothing usable for the C side: fall back to OmniVoice's own voice
  // selection via the auto-voice path.
  return null;
}
|
|
177
|
+
|
|
178
|
+
/** Re-exported from `./errors` so existing `engine-bridge` importers don't churn. */
|
|
179
|
+
export { VoiceStartupError };
|
|
180
|
+
|
|
181
|
+
/**
 * Convert a native verifier rejection into a scheduler rollback range.
 *
 * Native verifier callbacks report rejected token ranges as half-open
 * `[from, to)` intervals, while the scheduler rollback queue uses inclusive
 * token indexes. This function is the single place where that conversion
 * happens.
 *
 * @param event The `rejectedFrom` / `rejectedTo` bounds of a native
 *   verifier event.
 * @returns The inclusive `{ fromIndex, toIndex }` range, or `null` when the
 *   event does not describe a usable range: a negative start, an empty or
 *   inverted interval, or a bound that is not an integer.
 */
export function nativeRejectedRangeToRollbackRange(
  event: Pick<NativeVerifierEvent, "rejectedFrom" | "rejectedTo">,
): RejectedTokenRange | null {
  const { rejectedFrom, rejectedTo } = event;
  // Token indexes are integers. NaN, fractional and infinite bounds pass
  // the ordering checks below (every comparison with NaN is false), so
  // reject them explicitly rather than emit a range with invalid indexes.
  if (!Number.isInteger(rejectedFrom) || !Number.isInteger(rejectedTo)) {
    return null;
  }
  if (rejectedFrom < 0 || rejectedTo <= rejectedFrom) {
    return null;
  }
  return {
    fromIndex: rejectedFrom,
    // Half-open upper bound -> inclusive last index.
    toIndex: rejectedTo - 1,
  };
}
|
|
197
|
+
|
|
198
|
+
/**
 * A single slice of synthesized audio handed to the consumer of
 * `StreamingTtsBackend.synthesizeStream` (W9's scheduler) while TTS is
 * still decoding the phrase.
 */
export interface TtsPcmChunk {
  /** PCM samples for this slice; may be zero-length on the closing chunk. */
  pcm: Float32Array;
  /** Sample rate of `pcm`, in Hz. */
  sampleRate: number;
  /** `true` on the tail chunk that closes the phrase. */
  isFinal: boolean;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Contract for TTS backends that can hand audio to W9's voice scheduler
 * incrementally; it is the seam between the fused `libelizainference`
 * runtime and that scheduler.
 *
 * For each phrase the scheduler calls `synthesizeStream(...)` and writes
 * every delivered `pcm` segment into the `PcmRingBuffer` within the same
 * scheduler tick (AGENTS.md §4 — phrase-chunk → TTS within one scheduler
 * tick). Two things hard-cancel the in-flight forward pass at the next
 * kernel boundary (barge-in / MTP-rejected tail):
 *   - `onChunk` returning `true`;
 *   - `cancelSignal.cancelled` being flipped.
 *
 * Both `OmniVoiceBackend` implementations in this module satisfy it:
 *   - `FfiOmniVoiceBackend` forwards to
 *     `eliza_inference_tts_synthesize_stream` when the loaded build
 *     advertises streaming TTS (`tts_stream_supported() == 1`). Otherwise it
 *     synthesizes the whole phrase and emits one body chunk plus a final
 *     tail — there is no silent "streaming" lie, the chunk count simply
 *     collapses to one on a non-streaming build.
 *   - `StubOmniVoiceBackend` emits deterministic synthetic PCM split into a
 *     fixed number of chunks, so scheduler tests can observe the
 *     incremental handoff without a real model.
 */
export interface StreamingTtsBackend {
  /**
   * Synthesize `phrase` using `preset`, delivering PCM chunk by chunk.
   *
   * The ring-buffer write belongs to the scheduler and happens inside
   * `onChunk`. The last `onChunk` invocation always carries
   * `isFinal: true` (its `pcm` may be zero-length) so the consumer can
   * settle per-phrase state.
   *
   * @returns A promise resolving to `{ cancelled: true }` when `onChunk`
   *   asked to stop or `cancelSignal` was set, and `{ cancelled: false }`
   *   on a clean finish.
   */
  synthesizeStream(args: {
    phrase: Phrase;
    preset: SpeakerPreset;
    cancelSignal: { cancelled: boolean };
    onChunk: (chunk: TtsPcmChunk) => boolean | undefined;
    onKernelTick?: () => void;
  }): Promise<{ cancelled: boolean }>;
}
|
|
247
|
+
|
|
248
|
+
/**
 * Type guard for the `StreamingTtsBackend` seam.
 *
 * @param backend Any `OmniVoiceBackend`.
 * @returns `true` when `backend` exposes a callable `synthesizeStream`
 *   member, narrowing it to `OmniVoiceBackend & StreamingTtsBackend`.
 */
export function isStreamingTtsBackend(
  backend: OmniVoiceBackend,
): backend is OmniVoiceBackend & StreamingTtsBackend {
  // Only the presence of a function-valued member is checked; its
  // signature cannot be verified at runtime.
  const { synthesizeStream } = backend as Partial<StreamingTtsBackend>;
  return typeof synthesizeStream === "function";
}
|
|
257
|
+
|
|
258
|
+
/**
 * Deterministic TTS backend for tests. Every phrase produces
 * `STUB_PCM_MS_PER_PHRASE` ms of silence (all-zero samples) at the configured
 * sample rate, so scheduler and barge-in tests run without a real model.
 *
 * Only `synthesizeStream()` consults the cancel signal: it checks it after
 * each kernel tick and again after each chunk callback. `synthesize()` always
 * returns the full silent phrase.
 */
export class StubOmniVoiceBackend
  implements OmniVoiceBackend, StreamingTtsBackend
{
  readonly id = "stub" as const;
  private readonly sampleRate: number;
  /** How many times `synthesize()` has been invoked. */
  calls = 0;
  /** How many times `synthesizeStream()` has been invoked. */
  streamCalls = 0;

  constructor(sampleRate = SAMPLE_RATE_DEFAULT) {
    this.sampleRate = sampleRate;
  }

  /**
   * Whole-phrase synthesis: fires `onKernelTick` once and returns a silent
   * `AudioChunk` tagged with the phrase id and index range.
   */
  async synthesize(args: {
    phrase: Phrase;
    preset: SpeakerPreset;
    cancelSignal: { cancelled: boolean };
    onKernelTick?: () => void;
  }): Promise<AudioChunk> {
    this.calls += 1;
    args.onKernelTick?.();
    const silence = new Float32Array(
      Math.floor((this.sampleRate * STUB_PCM_MS_PER_PHRASE) / 1000),
    );
    return {
      phraseId: args.phrase.id,
      fromIndex: args.phrase.fromIndex,
      toIndex: args.phrase.toIndex,
      pcm: silence,
      sampleRate: this.sampleRate,
    };
  }

  /**
   * Chunked synthesis: the silent phrase is cut into at most
   * `STUB_PCM_STREAM_CHUNKS` body chunks, each preceded by a kernel tick.
   * Streaming stops early when the cancel signal is set or `onChunk` returns
   * `true`. A zero-length `isFinal: true` chunk is always delivered last,
   * cancelled or not.
   */
  async synthesizeStream(args: {
    phrase: Phrase;
    preset: SpeakerPreset;
    cancelSignal: { cancelled: boolean };
    onChunk: (chunk: TtsPcmChunk) => boolean | undefined;
    onKernelTick?: () => void;
  }): Promise<{ cancelled: boolean }> {
    this.streamCalls += 1;
    const totalSamples = Math.floor(
      (this.sampleRate * STUB_PCM_MS_PER_PHRASE) / 1000,
    );
    // At least one sample per chunk so the loop always advances.
    const chunkSize = Math.max(
      1,
      Math.ceil(totalSamples / STUB_PCM_STREAM_CHUNKS),
    );

    let cancelled = false;
    let offset = 0;
    while (offset < totalSamples) {
      args.onKernelTick?.();
      if (args.cancelSignal.cancelled) {
        cancelled = true;
        break;
      }
      const stopRequested = args.onChunk({
        pcm: new Float32Array(Math.min(chunkSize, totalSamples - offset)),
        sampleRate: this.sampleRate,
        isFinal: false,
      });
      // The callback itself may have flipped the signal, so read it again.
      if (stopRequested === true || args.cancelSignal.cancelled) {
        cancelled = true;
        break;
      }
      offset += chunkSize;
    }

    args.onChunk({
      pcm: new Float32Array(0),
      sampleRate: this.sampleRate,
      isFinal: true,
    });
    return { cancelled };
  }
}
|
|
338
|
+
|
|
339
|
+
/**
 * FFI-backed TTS/ASR backend. Each call is forwarded through the fused
 * `libelizainference` ABI declared in
 * `packages/app-core/scripts/omnivoice-fuse/ffi.h`. The bridge owns the
 * library handle and the per-engine context and passes them in at
 * construction, so this class stays a thin adapter.
 *
 * Until the real fused build ships, the binding is exercised against the
 * compatibility C library at `scripts/omnivoice-fuse/ffi-stub.c`, which
 * returns `ELIZA_ERR_NOT_IMPLEMENTED` for `tts_synthesize`; the binding then
 * raises `VoiceLifecycleError({code:"kernel-missing"})`. No silent fallback
 * (AGENTS.md §3 + §9).
 *
 * NOTE(review): earlier documentation said the adapter re-wraps that error as
 * `VoiceStartupError("missing-fused-build", ...)`. No such wrapping is visible
 * in this class — binding errors propagate unchanged from every method here —
 * so confirm where the re-wrap happens.
 */
export class FfiOmniVoiceBackend
  implements OmniVoiceBackend, StreamingTtsBackend
{
  readonly id = "ffi" as const;
  private readonly ffi: ElizaInferenceFfi;
  private readonly getContext: () => ElizaInferenceContextHandle;
  private readonly sampleRate: number;
  private readonly maxSecondsPerPhrase: number;

  /**
   * @param args.ffi Loaded binding to the fused library.
   * @param args.ctx Fixed context handle, used when `getContext` is absent.
   * @param args.getContext Context provider; takes precedence over `ctx`.
   * @param args.sampleRate Output sample rate; `SAMPLE_RATE_DEFAULT` if unset.
   * @param args.maxSecondsPerPhrase Size of the batch output buffer in
   *   seconds of audio; 6 if unset.
   */
  constructor(args: {
    ffi: ElizaInferenceFfi;
    ctx?: ElizaInferenceContextHandle;
    getContext?: () => ElizaInferenceContextHandle;
    sampleRate?: number;
    maxSecondsPerPhrase?: number;
  }) {
    this.ffi = args.ffi;
    this.getContext =
      args.getContext ??
      (() => {
        // Checked on every call, not at construction: a backend built with
        // neither `ctx` nor `getContext` only fails once a context is needed.
        if (args.ctx === undefined) {
          throw new VoiceStartupError(
            "missing-fused-build",
            "[voice] FFI backend has no context provider",
          );
        }
        return args.ctx;
      });
    this.sampleRate = args.sampleRate ?? SAMPLE_RATE_DEFAULT;
    this.maxSecondsPerPhrase = args.maxSecondsPerPhrase ?? 6;
  }

  /** True when the loaded `libelizainference` advertises streaming TTS. */
  supportsStreamingTts(): boolean {
    return this.ffi.ttsStreamSupported();
  }

  /**
   * One-shot synthesis returning the whole phrase as an `AudioChunk`.
   *
   * When the loaded build advertises streaming TTS this routes through
   * `eliza_inference_tts_synthesize_stream` and concatenates the delivered
   * chunks, so the chunk-aware native path is exercised even for whole-phrase
   * callers. Otherwise it uses the batch `eliza_inference_tts_synthesize`
   * symbol. `cancelSignal` is honoured at chunk boundaries: a cancelled
   * stream returns whatever was synthesized so far.
   */
  async synthesize(args: {
    phrase: Phrase;
    preset: SpeakerPreset;
    cancelSignal: { cancelled: boolean };
    onKernelTick?: () => void;
  }): Promise<AudioChunk> {
    args.onKernelTick?.();
    const ctx = this.getContext();

    if (!this.ffi.ttsStreamSupported()) {
      const { buffer, samples } = this.runBatchSynthesis(
        ctx,
        args.phrase,
        args.preset,
      );
      return this.toAudioChunk(args.phrase, buffer.subarray(0, samples));
    }

    const parts: Float32Array[] = [];
    let total = 0;
    this.ffi.ttsSynthesizeStream({
      ctx,
      text: args.phrase.text,
      speakerPresetId: ffiSpeakerPresetId(args.preset),
      onChunk: ({ pcm, isFinal }) => {
        args.onKernelTick?.();
        // NOTE(review): PCM carried by the final chunk is not collected here,
        // while `synthesizeStream` forwards it. Harmless if the native final
        // chunk is always empty — confirm against the binding.
        if (!isFinal && pcm.length > 0) {
          parts.push(pcm);
          total += pcm.length;
        }
        return args.cancelSignal.cancelled === true;
      },
    });

    const merged = new Float32Array(total);
    let offset = 0;
    for (const part of parts) {
      merged.set(part, offset);
      offset += part.length;
    }
    return this.toAudioChunk(args.phrase, merged);
  }

  /**
   * Streaming synthesis.
   *
   * Streaming build (`tts_stream_supported() == 1`): forwards to
   * `eliza_inference_tts_synthesize_stream` and relays each native chunk to
   * `onChunk`. The native side checks `ctx->tts_cancel` (set via
   * `eliza_inference_cancel_tts`) on top of the `onChunk` return value.
   *
   * Non-streaming build: still satisfies the seam, but with exactly one body
   * chunk plus one final tail (the batch result), so the chunk count is the
   * honest signal that the build is not streaming. Such a build cannot be
   * interrupted while the native batch forward pass is inside
   * `ttsSynthesize`; cancellation is only observed before the body chunk is
   * emitted. Barge-in-critical paths should require `supportsStreamingTts()`.
   *
   * In both modes the consumer always receives exactly one `isFinal: true`
   * chunk as its last callback, including after cancellation.
   *
   * @returns `{ cancelled: true }` when synthesis was stopped early.
   */
  async synthesizeStream(args: {
    phrase: Phrase;
    preset: SpeakerPreset;
    cancelSignal: { cancelled: boolean };
    onChunk: (chunk: TtsPcmChunk) => boolean | undefined;
    onKernelTick?: () => void;
  }): Promise<{ cancelled: boolean }> {
    const ctx = this.getContext();

    if (this.ffi.ttsStreamSupported()) {
      let finalDelivered = false;
      const { cancelled } = this.ffi.ttsSynthesizeStream({
        ctx,
        text: args.phrase.text,
        speakerPresetId: ffiSpeakerPresetId(args.preset),
        onChunk: ({ pcm, isFinal }) => {
          args.onKernelTick?.();
          // Once cancelled, nothing more is forwarded — not even a native
          // final chunk. The guaranteed final is emitted after the call.
          if (args.cancelSignal.cancelled) return true;
          if (isFinal) finalDelivered = true;
          const want = args.onChunk({
            pcm,
            sampleRate: this.sampleRate,
            isFinal,
          });
          // Re-read the (mutable) cancel flag — the chunk callback or a
          // concurrent barge-in may have flipped it.
          return want === true || args.cancelSignal.cancelled;
        },
      });
      if (!finalDelivered) {
        // Honour the seam contract: the consumer settles per-phrase state on
        // the final chunk, so a cancelled stream must still deliver one.
        args.onChunk({
          pcm: new Float32Array(0),
          sampleRate: this.sampleRate,
          isFinal: true,
        });
      }
      return { cancelled };
    }

    // Non-streaming build: one batch forward pass, surfaced as a single
    // body chunk + final tail.
    args.onKernelTick?.();
    const { buffer, samples } = this.runBatchSynthesis(
      ctx,
      args.phrase,
      args.preset,
    );
    let cancelled = args.cancelSignal.cancelled === true;
    if (!cancelled && samples > 0) {
      const want = args.onChunk({
        pcm: buffer.subarray(0, samples),
        sampleRate: this.sampleRate,
        isFinal: false,
      });
      cancelled = want === true || args.cancelSignal.cancelled === true;
    }
    args.onChunk({
      pcm: new Float32Array(0),
      sampleRate: this.sampleRate,
      isFinal: true,
    });
    return { cancelled };
  }

  /** Hard-cancel any in-flight TTS forward pass on this backend's context. */
  cancelTts(): void {
    this.ffi.cancelTts(this.getContext());
  }

  /**
   * Batch transcription. One-shot callers should use the fused batch ABI
   * directly so the native side receives the original sample-rate metadata
   * and can apply its own audio preprocessing. Live mic streaming remains
   * available through `EngineVoiceBridge.createStreamingTranscriber()`.
   */
  async transcribe(args: TranscriptionAudio): Promise<string> {
    return this.ffi.asrTranscribe({
      ctx: this.getContext(),
      pcm: args.pcm,
      sampleRateHz: args.sampleRate,
    });
  }

  /**
   * Transcribe with per-word timings when the fused build supports timed ASR
   * (ABI v12+). Otherwise returns the trimmed text with empty `words`, and the
   * caller degrades to segment-level highlight.
   */
  async transcribeTimed(
    args: TranscriptionAudio,
  ): Promise<{ text: string; words: AsrWordTiming[] }> {
    if (this.ffi.timedAsrSupported()) {
      const res = this.ffi.asrTranscribeTimed({
        ctx: this.getContext(),
        pcm: args.pcm,
        sampleRateHz: args.sampleRate,
      });
      return { text: res.text.trim(), words: res.words };
    }
    logger.debug(
      "[FfiOmniVoiceBackend] timedAsrSupported()===false on the active fused build — per-word timings dropped, transcript player degrades to segment-level highlight",
    );
    return { text: (await this.transcribe(args)).trim(), words: [] };
  }

  /**
   * Run one batch forward pass into a freshly allocated buffer sized for
   * `maxSecondsPerPhrase` seconds; returns the buffer and the sample count
   * reported by the binding.
   */
  private runBatchSynthesis(
    ctx: ElizaInferenceContextHandle,
    phrase: Phrase,
    preset: SpeakerPreset,
  ): { buffer: Float32Array; samples: number } {
    const buffer = new Float32Array(this.sampleRate * this.maxSecondsPerPhrase);
    const samples = this.ffi.ttsSynthesize({
      ctx,
      text: phrase.text,
      speakerPresetId: ffiSpeakerPresetId(preset),
      out: buffer,
    });
    return { buffer, samples };
  }

  /** Tag `pcm` with the phrase id, index range and this backend's rate. */
  private toAudioChunk(phrase: Phrase, pcm: Float32Array): AudioChunk {
    return {
      phraseId: phrase.id,
      fromIndex: phrase.fromIndex,
      toIndex: phrase.toIndex,
      pcm,
      sampleRate: this.sampleRate,
    };
  }
}
|
|
562
|
+
|
|
563
|
+
export interface EngineVoiceBridgeOptions {
  /**
   * On-disk bundle root. It has to hold `cache/voice-preset-default.bin`,
   * plus the FFI library (`lib/libelizainference.{dylib,so}`) whenever
   * `useFfiBackend === true`.
   */
  bundleRoot: string;
  /**
   * `true` selects `FfiOmniVoiceBackend`. `false` selects the deterministic
   * test backend, which is meant for lifecycle/unit tests only: live sessions
   * and direct synthesis reject it before any user-visible audio can be
   * emitted.
   */
  useFfiBackend: boolean;
  /** Sample-rate override; 24 kHz when omitted. */
  sampleRate?: number;
  /** Ring-buffer capacity override, in samples; 4 s @ 24 kHz when omitted. */
  ringBufferCapacity?: number;
  /**
   * `maxTokensPerPhrase` for the phrase chunker (the cap on a run-on with no
   * punctuation). Falls back to `ELIZA_VOICE_MAX_TOKENS_PER_PHRASE`, else 8
   * (one MTP draft round).
   */
  maxTokensPerPhrase?: number;
  /**
   * Upper bound on concurrent TTS phrase dispatches. Falls back to the env
   * value or the scheduler default.
   */
  maxInFlightPhrases?: number;
  /**
   * Phrase-cache entries to pre-warm. AGENTS.md §4 makes a precomputed phrase
   * cache for common assistant utterances mandatory for the first-byte-latency
   * win. Empty by default; callers wire in real entries from the bundle when
   * they are available.
   */
  prewarmedPhrases?: readonly CachedPhraseAudio[];
  /**
   * Sink override, e.g. for tests or to route PCM to a platform-specific
   * audio device. When omitted the scheduler creates an in-memory sink.
   */
  sink?: AudioSink;
  /** Scheduler event listeners (rollback, audio, cancel). */
  events?: SchedulerEvents;
  /**
   * TTS backend override that supersedes `useFfiBackend`. Tests use it to
   * inject a controllable backend — for instance one that holds synthesis
   * open until a deferred resolves — so rollback timing can be observed
   * deterministically.
   */
  backendOverride?: OmniVoiceBackend;
  /**
   * Replaces only the TTS backend while the fused bundle lifecycle and the
   * ASR FFI stay loaded. Used when a bundle falls back from OmniVoice speech
   * to Kokoro speech but still needs the bundled Gemma ASR for mic input.
   */
  ttsBackendOverride?: OmniVoiceBackend;
  /** Speaker preset that goes with `ttsBackendOverride`. */
  speakerPresetOverride?: SpeakerPreset;
  /**
   * Shared resource registry. An engine that already owns one (text and
   * voice on the same tokenizer / mmap regions) passes it in so voice
   * ref-counts against the same canonical resources. Tests can leave it
   * unset to get a private registry.
   */
  sharedResources?: SharedResourceRegistry;
  /**
   * Lifecycle loaders override. Production wires real `madvise`-backed mmap
   * handles via the FFI; tests inject mocks so the disarm path can assert
   * eviction without a real file mapping. When unset, default loaders are
   * derived from the bundle root.
   */
  lifecycleLoaders?: VoiceLifecycleLoaders;
  /**
   * Builds a `KokoroTtsBackend` directly, skipping the bundle-root,
   * speaker-preset and FFI checks the fused omnivoice path requires. Kokoro
   * voices are chosen by id (`KOKORO_VOICE_PACKS`), so the bundle's per-user
   * speaker preset is not used. Mutually exclusive with `useFfiBackend: true`
   * and with `backendOverride`. Lifecycle loaders default to empty lifecycle
   * handles, because ORT owns the model memory and there is nothing to
   * mmap-evict.
   */
  kokoroOnly?: KokoroEngineDiscoveryResult;
  /**
   * Already-loaded fused inference handle for the `kokoroOnly` path. When
   * given, the Kokoro FFI runtime reuses it instead of dlopen-ing a second
   * copy of `libelizainference` (tests inject a stub; production may share
   * the engine's handle).
   */
  kokoroFfi?: ElizaInferenceFfi;
  /**
   * Voice-profile store for speaker attribution. When set, the bridge builds
   * a `VoiceAttributionPipeline` and runs attribution in parallel with ASR on
   * every turn via `runVoiceTurn`; the resolved `VoiceAttributionOutput`
   * reaches callers through `onAttribution` in the turn events passed to
   * `runVoiceTurn`.
   *
   * When absent, attribution is skipped and the pipeline behaves exactly as
   * before (no diarizer / encoder overhead).
   */
  profileStore?: VoiceProfileStore;
  /**
   * W3-9 / F1 — the agent runtime. When supplied, the bridge builds a
   * `VoiceCancellationCoordinator` and an `OptimisticGenerationPolicy` scoped
   * to this voice session. The coordinator owns one cancellation token per
   * `roomId` and fans an abort out to:
   *   1. `runtime.turnControllers.abortTurn(roomId, reason)` — the
   *      planner-loop / action handlers / streaming `useModel` see the abort
   *      within one tick.
   *   2. The slot-abort callback (`slotAbort`), when the LM slot id is
   *      registered with the turn.
   *   3. The TTS hard-stop callback (`ttsStop`), which the bridge wires to
   *      its existing `triggerBargeIn()` (audio sink drain + FFI/HTTP
   *      synthesis cancel).
   *   4. The standard `AbortSignal` honoured by every fetch / `useModel` /
   *      FFI call that took `token.signal`.
   *
   * The reverse direction (runtime → voice) is wired symmetrically through
   * the coordinator's `runtime.turnControllers.onEvent` subscription.
   *
   * Omit it to keep the prior behaviour: the bridge then exposes no
   * coordinator / policy and callers fall back to the legacy
   * `BargeInController` + `triggerBargeIn()` surface.
   *
   * The type is structural. `CoordinatorRuntime` is the minimum surface the
   * coordinator needs (`turnControllers.{abortTurn, onEvent}`); production
   * passes a full `IAgentRuntime`, and tests can pass a fake with the same
   * shape.
   */
  runtime?: IAgentRuntime | CoordinatorRuntime;
  /**
   * W3-9 / F1 — overrides for the `OptimisticGenerationPolicy`. With
   * `runtime` set and this omitted, the bridge builds a default policy gated
   * on the resolved power source (plugged-in / battery / unknown) and the
   * canonical EOT threshold.
   */
  optimisticPolicyOptions?: OptimisticPolicyOptions;
  /**
   * W3-9 / F1 — LM slot-abort callback for the cancellation coordinator.
   * Production wires it to `MtpLlamaServer.abortSlot` once a slot id is known
   * per turn. The bridge hands it straight to the coordinator and does not
   * own slot ids itself.
   *
   * Ignored when `runtime` is unset, since no coordinator is built.
   */
  slotAbort?: (slotId: number, reason: VoiceCancellationReason) => void;
  /**
   * Live speaker-attribution gating. When set together with a `profileStore`
   * AND a full `runtime` (one with `emitEvent`), `runVoiceTurn`
   * automatically:
   *   1. emits `VOICE_TURN_OBSERVED` for every attributed turn, and
   *   2. folds the diarization decision into the turn's `voiceTurnSignal`
   *      (stamped onto `output.turn.metadata`) so the server gate
   *      `core.voice_turn_signal` can suppress confident-bystander
   *      cross-talk.
   *
   * `knownSpeakerEntityIds` / `ownerEntityId` may be functions, so the caller
   * can resolve the enrolled-speaker set lazily per turn (the household
   * roster changes as people are named). When omitted, attribution still
   * emits `VOICE_TURN_OBSERVED` and produces a fail-open signal: no bystander
   * suppression, every attribution is treated as potentially addressed to us.
   */
  liveAttribution?: LiveAttributionConfig;
}
|
|
719
|
+
|
|
720
|
+
/**
 * Inputs that gate the automatic live-attribution → voiceTurnSignal seam.
 * Every field accepts either a plain value or a zero-argument function that
 * produces it.
 */
export interface LiveAttributionConfig {
  /** Entity id of the owner / primary enrolled speaker (always allowed to speak). */
  ownerEntityId?: string | (() => string | null | undefined);
  /** Entity ids (owner + enrolled) the agent answers without a wake word. */
  knownSpeakerEntityIds?:
    | readonly string[]
    | (() => readonly string[] | undefined);
  /** Whether a wake word fired within the recent listen window. */
  wakeWordActive?: boolean | (() => boolean);
}
|
|
731
|
+
|
|
732
|
+
/**
 * Build a `KokoroTtsBackend` for the discovered Kokoro engine.
 *
 * In-process FFI is the sole Kokoro synthesis path on every platform: it runs
 * inside the fused libelizainference handle, the only path that ships on
 * iOS / Google Play (no local TCP socket). The legacy HTTP `fork`
 * (llama-server /v1/audio/speech) runtime was removed.
 *
 * @param kokoro Discovery result supplying the layout and default voice id.
 * @param opts.bundleRoot Bundle root forwarded to the runtime picker.
 * @param opts.ffi Already-loaded fused handle; when given, Kokoro reuses it
 *   instead of dlopen-ing a second copy of the lib.
 */
export function createKokoroTtsBackend(
  kokoro: KokoroEngineDiscoveryResult,
  opts: { bundleRoot?: string; ffi?: ElizaInferenceFfi } = {},
): KokoroTtsBackend {
  // The `ffi` key is only present when a handle was actually injected.
  const injectedHandle = opts.ffi ? { ffi: opts.ffi } : {};
  const decision = pickKokoroRuntimeBackend({
    defaultBackend: "ffi",
    ffi: {
      layout: kokoro.layout,
      bundleRoot: opts.bundleRoot,
      ...injectedHandle,
    },
  });

  logger.info(
    `[voice/kokoro] runtime backend=${decision.backend} reason="${decision.reason}"`,
  );

  return new KokoroTtsBackend({
    layout: kokoro.layout,
    runtime: decision.runtime,
    defaultVoiceId: kokoro.defaultVoiceId,
  });
}
|
|
759
|
+
|
|
760
|
+
/**
 * Build the `SpeakerPreset` used with a Kokoro backend: it carries only the
 * engine's default voice id, with an empty embedding and empty raw bytes.
 */
export function createKokoroSpeakerPreset(
  kokoro: KokoroEngineDiscoveryResult,
): SpeakerPreset {
  const preset: SpeakerPreset = {
    voiceId: kokoro.defaultVoiceId,
    embedding: new Float32Array(0),
    bytes: new Uint8Array(0),
  };
  return preset;
}
|
|
769
|
+
|
|
770
|
+
/**
 * Per-turn events: the existing `VoicePipelineEvents` plus the optional
 * attribution result. Attribution runs in parallel with ASR and resolves some
 * time after `onAsrComplete` and before `onComplete`.
 */
export interface VoiceTurnEvents extends VoicePipelineEvents {
  /**
   * Invoked once per turn when the `VoiceAttributionPipeline` resolves
   * (diarizer + encoder + profile-store match). Fires only when the bridge
   * was constructed with a `profileStore`. It may arrive after
   * `onAsrComplete` but before `onComplete`. From the bridge's perspective it
   * is fire-and-forget: callers attach the metadata to the turn's transcript
   * asynchronously.
   */
  onAttribution?(output: VoiceAttributionOutput): void;
}
|
|
787
|
+
|
|
788
|
+
/**
 * What the internal W3-9 wiring helper (`buildCancellationWiring`) produces:
 * the cancellation coordinator and the optimistic-generation policy for one
 * session, built from the bridge options. The helper returns `null` when no
 * runtime was supplied; the bridge then operates without the W3-9 surface
 * (back-compat for callers that have not adopted the canonical cancellation
 * token yet).
 *
 * The helper lives outside the class so both `start()` and
 * `startKokoroOnly()` can share it without duplicating the construction
 * order. The coordinator's `ttsStop` callback has to close over a bridge that
 * does not exist yet, so the stop handler is attached afterwards through
 * `bindTtsStop`.
 */
interface PendingCancellationWiring {
  coordinator: VoiceCancellationCoordinator;
  policy: OptimisticGenerationPolicy;
  /** Install the bridge's `triggerBargeIn` as the `ttsStop` callback. */
  bindTtsStop(stop: () => void): void;
}
|
|
806
|
+
|
|
807
|
+
/**
 * True when `runtime` is a full `IAgentRuntime` (exposes a callable
 * `emitEvent`) rather than the structural `CoordinatorRuntime` a test may
 * pass. Only an event-capable runtime can drive the automatic
 * `VOICE_TURN_OBSERVED` emit.
 *
 * @param runtime Runtime from the bridge options; may be absent.
 * @returns `false` for a missing runtime or one without `emitEvent`.
 */
function isEventRuntime(
  runtime: IAgentRuntime | CoordinatorRuntime | undefined,
): runtime is IAgentRuntime {
  // `== null` rejects `null` as well as `undefined`: a `null` sentinel from an
  // untyped caller must yield `false`, not a TypeError on property access.
  if (runtime == null) return false;
  return typeof (runtime as { emitEvent?: unknown }).emitEvent === "function";
}
|
|
820
|
+
|
|
821
|
+
/**
 * Turn the (possibly lazy) `LiveAttributionConfig` into the plain options the
 * runtime helper consumes. Function-valued fields are invoked on every call,
 * so a changing household roster is picked up per turn without re-arming
 * voice. Fields that resolve to `undefined` are left out of the result.
 *
 * `transcript` is the turn's joined ASR text. The in-process engine owns ASR,
 * so it threads the real transcript through to `handleLiveVoiceAttribution` —
 * the merge engine's live name/partner extraction (`VoiceObserver.ingestTurn`)
 * needs *what* was said, not just *who* said it (#8786). An empty transcript
 * is omitted, so the helper falls back to "" exactly as before and
 * diarization-only callers are unaffected.
 *
 * @param cfg Live-attribution config, or `null` when none was supplied.
 * @param transcript Joined ASR text for the turn; defaults to "".
 */
function resolveLiveAttributionOptions(
  cfg: LiveAttributionConfig | null,
  transcript = "",
): {
  ownerEntityId?: string | null;
  knownSpeakerEntityIds?: readonly string[];
  wakeWordActive?: boolean;
  transcript?: string;
} {
  const resolved: ReturnType<typeof resolveLiveAttributionOptions> = {};

  if (cfg) {
    // Resolve all three inputs first, in declaration order, then record the
    // ones that produced a value.
    const owner =
      typeof cfg.ownerEntityId === "function"
        ? cfg.ownerEntityId()
        : cfg.ownerEntityId;
    const knownSpeakers =
      typeof cfg.knownSpeakerEntityIds === "function"
        ? cfg.knownSpeakerEntityIds()
        : cfg.knownSpeakerEntityIds;
    const wakeWord =
      typeof cfg.wakeWordActive === "function"
        ? cfg.wakeWordActive()
        : cfg.wakeWordActive;

    // `null` is a meaningful owner value and is kept; only `undefined` drops.
    if (owner !== undefined) resolved.ownerEntityId = owner;
    if (knownSpeakers !== undefined) {
      resolved.knownSpeakerEntityIds = knownSpeakers;
    }
    if (wakeWord !== undefined) resolved.wakeWordActive = wakeWord;
  }

  if (transcript !== "") resolved.transcript = transcript;
  return resolved;
}
|
|
863
|
+
|
|
864
|
+
/**
 * Construct the cancellation coordinator and optimistic-generation policy
 * for a session from the bridge options.
 *
 * Returns `null` when `opts.runtime` is not set. Otherwise the coordinator is
 * created with the runtime, the optional `slotAbort` callback, and a
 * `ttsStop` callback that delegates to whatever handler was last installed
 * through `bindTtsStop` (a no-op until one is installed). The policy is
 * created from `opts.optimisticPolicyOptions` (or `{}`) and seeded with the
 * result of `resolvePowerSourceState()`.
 */
function buildCancellationWiring(
  opts: EngineVoiceBridgeOptions,
): PendingCancellationWiring | null {
  const { runtime, slotAbort, optimisticPolicyOptions } = opts;
  if (!runtime) return null;

  // Late-bound: the stop handler is supplied after construction via
  // `bindTtsStop`.
  let boundStop: (() => void) | null = null;

  const coordinator = new VoiceCancellationCoordinator({
    runtime,
    ...(slotAbort ? { slotAbort } : {}),
    ttsStop: () => {
      boundStop?.();
    },
  });

  const policy = new OptimisticGenerationPolicy(optimisticPolicyOptions ?? {});
  policy.setPowerSource(resolvePowerSourceState());

  return {
    coordinator,
    policy,
    bindTtsStop(stop) {
      boundStop = stop;
    },
  };
}
|
|
890
|
+
|
|
891
|
+
/**
|
|
892
|
+
* Wires the voice scaffold (`VoiceScheduler` + helpers) onto the engine.
|
|
893
|
+
* One bridge per active voice session — created in
|
|
894
|
+
* `LocalInferenceEngine.startVoice()` and disposed when the engine
|
|
895
|
+
* unloads or `stopVoice()` is called.
|
|
896
|
+
*/
|
|
897
|
+
export class EngineVoiceBridge {
|
|
898
|
+
readonly scheduler: VoiceScheduler;
|
|
899
|
+
readonly backend: OmniVoiceBackend;
|
|
900
|
+
readonly lifecycle: VoiceLifecycle;
|
|
901
|
+
/** Loaded FFI handle when running against the fused build (else null). */
|
|
902
|
+
readonly ffi: ElizaInferenceFfi | null;
|
|
903
|
+
/** Lazily-created FFI context this bridge owns; destroyed in `dispose()`. */
|
|
904
|
+
private readonly ffiContextRef: FfiContextRef | null;
|
|
905
|
+
readonly asrAvailable: boolean;
|
|
906
|
+
private readonly bundleRoot: string;
|
|
907
|
+
/** The phrase cache the scheduler dispatches against — held so the bridge
|
|
908
|
+
* can answer "is phrase X cached" for the first-audio filler and seed the
|
|
909
|
+
* idle-time auto-prewarm. */
|
|
910
|
+
private readonly phraseCache: PhraseCache;
|
|
911
|
+
/** In-flight fused turn (`runVoiceTurn`), if any — cancelled on barge-in. */
|
|
912
|
+
private activePipeline: VoicePipeline | null = null;
|
|
913
|
+
/**
|
|
914
|
+
* Optional attribution pipeline. Populated when the bridge was created
|
|
915
|
+
* with a `profileStore` option. When present, `runVoiceTurn` fires
|
|
916
|
+
* attribution in parallel with ASR and delivers the result via
|
|
917
|
+
* `VoiceTurnEvents.onAttribution`.
|
|
918
|
+
*/
|
|
919
|
+
private readonly attributionPipeline: VoiceAttributionPipeline | null;
|
|
920
|
+
/**
|
|
921
|
+
* Full agent runtime, retained only when `opts.runtime` supports
|
|
922
|
+
* `emitEvent` (i.e. it is a real `IAgentRuntime`, not the structural
|
|
923
|
+
* `CoordinatorRuntime` a test may pass). Used by the automatic
|
|
924
|
+
* live-attribution seam in `runVoiceTurn` to emit `VOICE_TURN_OBSERVED`.
|
|
925
|
+
* Null when no event-capable runtime was supplied.
|
|
926
|
+
*/
|
|
927
|
+
private readonly eventRuntime: IAgentRuntime | null;
|
|
928
|
+
/** Gating inputs for the live-attribution → voiceTurnSignal seam. */
|
|
929
|
+
private readonly liveAttribution: LiveAttributionConfig | null;
|
|
930
|
+
/**
|
|
931
|
+
* W3-9 / F1 — voice cancellation coordinator. Populated when the bridge
|
|
932
|
+
* was created with a `runtime` option. Owns one
|
|
933
|
+
* `VoiceCancellationToken` per active `roomId` and fans abort out to
|
|
934
|
+
* the runtime turn controller, the LM slot, the TTS pipeline, and the
|
|
935
|
+
* standard `AbortSignal`. See `cancellation-coordinator.ts` for the
|
|
936
|
+
* full contract.
|
|
937
|
+
*/
|
|
938
|
+
private readonly cancellationCoordinator: VoiceCancellationCoordinator | null;
|
|
939
|
+
/**
|
|
940
|
+
* W3-9 / F1 — optimistic-generation policy. Constructed once per
|
|
941
|
+
* session when `runtime` is supplied. Gates the speculative LM prefill
|
|
942
|
+
* at the `firePrefill` site (see `voice-state-machine.ts`). Hot-swappable
|
|
943
|
+
* via `setPowerSource()` / `setOverride()` from Settings or a device-
|
|
944
|
+
* event listener.
|
|
945
|
+
*/
|
|
946
|
+
private readonly optimisticGenerationPolicy: OptimisticGenerationPolicy | null;
|
|
947
|
+
/**
|
|
948
|
+
* W3-9 / F1 — per-room `BargeInController` bindings the bridge owns.
|
|
949
|
+
* Holds the unsubscribe handle returned by
|
|
950
|
+
* `coordinator.bindBargeInController` so `dispose()` can tear them down.
|
|
951
|
+
*/
|
|
952
|
+
private readonly bargeInBindings = new Map<string, () => void>();
|
|
953
|
+
|
|
954
|
+
private constructor(
  scheduler: VoiceScheduler,
  backend: OmniVoiceBackend,
  bundleRoot: string,
  lifecycle: VoiceLifecycle,
  ffi: ElizaInferenceFfi | null,
  ffiContextRef: FfiContextRef | null,
  asrAvailable: boolean,
  phraseCache: PhraseCache,
  attributionPipeline: VoiceAttributionPipeline | null = null,
  private readonly selfVoiceImprint: AgentSelfVoiceImprint | null = null,
  cancellationCoordinator: VoiceCancellationCoordinator | null = null,
  optimisticGenerationPolicy: OptimisticGenerationPolicy | null = null,
  eventRuntime: IAgentRuntime | null = null,
  liveAttribution: LiveAttributionConfig | null = null,
) {
  // Private: instances are only built by the static factories, which do
  // all validation. This constructor just stores what it is handed.
  // (`selfVoiceImprint` is a parameter property, so it needs no assignment.)

  // Fused-runtime handles (both null when no FFI library was loaded).
  this.ffi = ffi;
  this.ffiContextRef = ffiContextRef;

  // Audio path: scheduler, TTS backend, phrase cache and lifecycle.
  this.scheduler = scheduler;
  this.backend = backend;
  this.phraseCache = phraseCache;
  this.lifecycle = lifecycle;

  // Bundle facts resolved by the factory.
  this.bundleRoot = bundleRoot;
  this.asrAvailable = asrAvailable;

  // Optional collaborators; each defaults to null when not supplied.
  this.attributionPipeline = attributionPipeline;
  this.liveAttribution = liveAttribution;
  this.eventRuntime = eventRuntime;
  this.cancellationCoordinator = cancellationCoordinator;
  this.optimisticGenerationPolicy = optimisticGenerationPolicy;
}
|
|
984
|
+
|
|
985
|
+
get ffiCtx(): ElizaInferenceContextHandle | null {
  // Current FFI context handle, or null when the bridge has no context
  // ref or the ref's `current` slot is empty. Never creates a context.
  const ref = this.ffiContextRef;
  if (!ref) return null;
  return ref.current ?? null;
}
|
|
988
|
+
|
|
989
|
+
/**
|
|
990
|
+
* Tear down the FFI context the bridge owns. Idempotent; safe to call
|
|
991
|
+
* multiple times. Callers should `disarm()` first to drop voice
|
|
992
|
+
* resources, then `dispose()` to close the FFI handle.
|
|
993
|
+
*/
|
|
994
|
+
dispose(): void {
  // Teardown order: barge-in bindings, then the cancellation coordinator,
  // then the FFI context and library handle. The first two are
  // best-effort so a throwing listener cannot block the native teardown.
  Array.from(this.bargeInBindings.values()).forEach((unsubscribe) => {
    try {
      unsubscribe();
    } catch {
      // Best-effort teardown.
    }
  });
  this.bargeInBindings.clear();

  const coordinator = this.cancellationCoordinator;
  if (coordinator) {
    try {
      coordinator.dispose();
    } catch {
      // Coordinator dispose must not block FFI teardown.
    }
  }

  const ffi = this.ffi;
  if (!ffi) return;

  // Destroy the context only if one exists, then clear the slot so
  // `ffiCtx` reports null afterwards.
  const contextRef = this.ffiContextRef;
  const liveContext = contextRef?.current ?? null;
  if (liveContext !== null) {
    ffi.destroy(liveContext);
    if (contextRef) contextRef.current = null;
  }
  ffi.close();
}
|
|
1022
|
+
|
|
1023
|
+
/**
|
|
1024
|
+
* Start the voice session for a bundle. Validates the bundle layout
|
|
1025
|
+
* up-front (per AGENTS.md §3 + §7 — required artifacts checked before
|
|
1026
|
+
* activation) and throws `VoiceStartupError` for any missing piece.
|
|
1027
|
+
* No partial activation: either the scheduler exists and is wired or
|
|
1028
|
+
* the call throws.
|
|
1029
|
+
*/
|
|
1030
|
+
static start(opts: EngineVoiceBridgeOptions): EngineVoiceBridge {
  // Builds a fully wired bridge for a bundle, or throws `VoiceStartupError`.
  //
  // Steps, in order:
  //   1. Kokoro-only sessions are delegated to `startKokoroOnly`.
  //   2. The bundle root and default speaker preset must exist on disk.
  //   3. Preset + phrase cache are loaded and seeded.
  //   4. A TTS backend is chosen: `backendOverride`, the fused FFI backend
  //      (`useFfiBackend`), or `ttsBackendOverride` / the stub backend.
  //   5. Scheduler, lifecycle, optional speaker attribution and optional
  //      cancellation wiring are constructed and handed to the bridge.
  //
  // If anything throws after the fused library has been loaded, the
  // library handle is released before the error propagates, so a failed
  // start does not leave a native handle open.
  if (opts.kokoroOnly) {
    if (opts.useFfiBackend || opts.backendOverride) {
      throw new VoiceStartupError(
        "invalid-options",
        "[voice] kokoroOnly cannot be combined with useFfiBackend or backendOverride. Caller must pick exactly one backend path.",
      );
    }
    return EngineVoiceBridge.startKokoroOnly(opts);
  }
  if (!opts.bundleRoot || !existsSync(opts.bundleRoot)) {
    throw new VoiceStartupError(
      "missing-bundle-root",
      `[voice] Bundle root does not exist: ${opts.bundleRoot}`,
    );
  }

  const presetPath = path.join(
    opts.bundleRoot,
    DEFAULT_VOICE_PRESET_REL_PATH,
  );
  if (!existsSync(presetPath)) {
    throw new VoiceStartupError(
      "missing-speaker-preset",
      `[voice] Bundle is missing required speaker preset at ${presetPath}. The default voice MUST ship as a precomputed embedding (AGENTS.md §4).`,
    );
  }

  const sampleRate = opts.sampleRate ?? SAMPLE_RATE_DEFAULT;
  const presetCache = new SpeakerPresetCache();
  const { preset, phrases: seedPhrases } = presetCache.loadFromBundle({
    bundleRoot: opts.bundleRoot,
  });
  const schedulerPreset = opts.speakerPresetOverride ?? preset;

  // Bundle-shipped phrases first, then any caller-supplied prewarmed ones.
  const phraseCache = new PhraseCache();
  phraseCache.seed(seedPhrases);
  for (const entry of opts.prewarmedPhrases ?? []) {
    phraseCache.put(entry);
  }

  // FFI binding + per-bridge context. When the bridge runs against the
  // fused build, the same `ffi`/context pair is shared by the TTS backend
  // and the default lifecycle loaders. Tests can opt out via
  // `lifecycleLoaders`, `backendOverride`, or `useFfiBackend: false`.
  let ffiHandle: ElizaInferenceFfi | null = null;
  let ffiContextRef: FfiContextRef | null = null;
  const asrAvailable = bundleHasRegularFile(
    path.join(opts.bundleRoot, "asr"),
  );
  if (opts.backendOverride && opts.ttsBackendOverride) {
    throw new VoiceStartupError(
      "invalid-options",
      "[voice] backendOverride and ttsBackendOverride are mutually exclusive.",
    );
  }
  if (opts.backendOverride && opts.useFfiBackend) {
    throw new VoiceStartupError(
      "missing-fused-build",
      "[voice] backendOverride cannot be combined with useFfiBackend=true. Voice-on production paths must load libelizainference and verify its ABI instead of bypassing the fused runtime.",
    );
  }

  // Failure-path cleanup, mirroring the FFI part of `dispose()`: destroy a
  // context if one was created, then close the library. Errors raised
  // here are swallowed so the original startup error is what surfaces.
  const releaseFfiAfterFailure = (): void => {
    if (!ffiHandle) return;
    try {
      const ctx = ffiContextRef?.current ?? null;
      if (ctx !== null) {
        ffiHandle.destroy(ctx);
        if (ffiContextRef) ffiContextRef.current = null;
      }
      ffiHandle.close();
    } catch {
      // Best-effort cleanup.
    }
  };

  try {
    let backend: OmniVoiceBackend;
    if (opts.backendOverride) {
      backend = opts.backendOverride;
    } else if (opts.useFfiBackend) {
      const libPath = locateBundleLibrary(opts.bundleRoot);
      if (!existsSync(libPath)) {
        throw new VoiceStartupError(
          "missing-ffi",
          `[voice] Fused omnivoice library not found under ${path.join(opts.bundleRoot, "lib")} (tried ${libraryFilenames().join(", ")}). Build via packages/app-core/scripts/build-llama-cpp-mtp.mjs (omnivoice-fuse target).`,
        );
      }
      ffiHandle = loadElizaInferenceFfi(libPath);
      // The context is created on first `ensure()` and cached in `current`.
      const contextRef: FfiContextRef = {
        current: null,
        ensure: () => {
          if (!ffiHandle) {
            throw new VoiceStartupError(
              "missing-ffi",
              "[voice] FFI context requested without a loaded libelizainference handle",
            );
          }
          if (contextRef.current === null) {
            contextRef.current = ffiHandle.create(opts.bundleRoot);
          }
          return contextRef.current;
        },
      };
      ffiContextRef = contextRef;
      backend =
        opts.ttsBackendOverride ??
        new FfiOmniVoiceBackend({
          ffi: ffiHandle,
          getContext: contextRef.ensure,
          sampleRate,
        });
    } else {
      backend = opts.ttsBackendOverride ?? new StubOmniVoiceBackend(sampleRate);
    }

    const config: SchedulerConfig = {
      chunkerConfig: {
        maxTokensPerPhrase:
          opts.maxTokensPerPhrase ??
          readPositiveIntEnv("ELIZA_VOICE_MAX_TOKENS_PER_PHRASE") ??
          PHRASE_MAX_TOKENS_DEFAULT,
      },
      preset: schedulerPreset,
      ringBufferCapacity:
        opts.ringBufferCapacity ?? RING_BUFFER_CAPACITY_DEFAULT,
      sampleRate,
      maxInFlightPhrases:
        opts.maxInFlightPhrases ??
        readPositiveIntEnv("ELIZA_VOICE_MAX_IN_FLIGHT_PHRASES"),
    };

    const sinkOverride = opts.sink;
    // Assigned further down when a profile store is supplied; `onAudio`
    // reads it at call time, so it sees the imprint once it exists.
    let selfVoiceImprint: AgentSelfVoiceImprint | null = null;
    const schedulerEvents: SchedulerEvents = {
      ...(opts.events ?? {}),
      onAudio(chunk) {
        opts.events?.onAudio?.(chunk);
        if (!selfVoiceImprint) return;
        // Fire-and-forget: a failed imprint update is logged, not thrown.
        void selfVoiceImprint
          .observeAudio(chunk.pcm, chunk.sampleRate)
          .catch((err: unknown) => {
            logger.warn(
              {
                error: err instanceof Error ? err.message : String(err),
              },
              "[voice-bridge] agent self-voice imprint update failed",
            );
          });
      },
    };
    const scheduler = new VoiceScheduler(
      config,
      sinkOverride
        ? { backend, sink: sinkOverride, phraseCache }
        : { backend, phraseCache },
      schedulerEvents,
    );

    // Voice lifecycle: caller-supplied loaders win; otherwise the defaults
    // derived from the bundle and the (possibly null) FFI pair are used.
    const registry = opts.sharedResources ?? new SharedResourceRegistry();
    const loaders =
      opts.lifecycleLoaders ??
      defaultLifecycleLoaders(opts.bundleRoot, ffiHandle, ffiContextRef, {
        skipTtsRegion: Boolean(opts.ttsBackendOverride),
      });
    const lifecycle = new VoiceLifecycle({ registry, loaders });

    // Speaker attribution is wired only when a profile store is provided.
    // It requires the fused handle and a build that advertises the speaker
    // ABI; both are checked here so a missing capability fails at startup.
    let attributionPipeline: VoiceAttributionPipeline | null = null;
    if (opts.profileStore) {
      const fusedFfi = ffiHandle;
      const fusedCtx = ffiContextRef;
      if (!fusedFfi || !fusedCtx) {
        throw new VoiceStartupError(
          "missing-fused-build",
          "[voice] Speaker-attribution requires the fused libelizainference handle (useFfiBackend). No standalone speaker runtime exists.",
        );
      }
      if (!FusedSpeakerEncoder.isSupported(fusedFfi)) {
        throw new VoiceStartupError(
          "missing-fused-build",
          "[voice] The loaded libelizainference build lacks the speaker ABI (eliza_inference_speaker_supported() == 0). Rebuild with the WeSpeaker forward graph linked in (eliza_inference_speaker_* symbols).",
        );
      }

      // Lazy fused encoder. The load promise itself is memoized, so
      // overlapping `encode()` calls (e.g. back-to-back `onAudio` chunks)
      // share ONE native session instead of each starting their own load.
      // A failed load stays failed: the same rejected promise is reused.
      let encoderLoad: Promise<SpeakerEncoder> | null = null;
      const loadEncoderOnce = (): Promise<SpeakerEncoder> => {
        if (!encoderLoad) {
          encoderLoad = (async (): Promise<SpeakerEncoder> => {
            try {
              return await FusedSpeakerEncoder.load({
                ffi: fusedFfi,
                ctx: () => fusedCtx.ensure(),
              });
            } catch (err) {
              throw err instanceof Error ? err : new Error(String(err));
            }
          })();
        }
        return encoderLoad;
      };
      const lazyEncoder: SpeakerEncoder = {
        embeddingDim: SPEAKER_GGML_EMBEDDING_DIM,
        sampleRate: SPEAKER_GGML_SAMPLE_RATE,
        async encode(pcm: Float32Array): Promise<Float32Array> {
          const encoder = await loadEncoderOnce();
          return encoder.encode(pcm);
        },
        async dispose(): Promise<void> {
          // Nothing to release if no load was ever started or it failed.
          if (!encoderLoad) return;
          let loaded: SpeakerEncoder;
          try {
            loaded = await encoderLoad;
          } catch {
            return;
          }
          await loaded.dispose();
        },
      };
      selfVoiceImprint = new AgentSelfVoiceImprint({
        encoder: lazyEncoder,
      });

      // Fused diarizer (optional). Unlike the encoder it is not a
      // fail-fast gate: when the build does not advertise it, the
      // attribution pipeline is simply constructed without one.
      let lazyDiarizer: Diarizer | undefined;
      if (FusedDiarizer.isSupported(fusedFfi)) {
        // Same memoized-load scheme as the encoder above.
        let diarizerLoad: Promise<Diarizer> | null = null;
        const loadDiarizerOnce = (): Promise<Diarizer> => {
          if (!diarizerLoad) {
            diarizerLoad = (async (): Promise<Diarizer> => {
              try {
                return await FusedDiarizer.load({
                  ffi: fusedFfi,
                  ctx: () => fusedCtx.ensure(),
                });
              } catch (err) {
                throw err instanceof Error ? err : new Error(String(err));
              }
            })();
          }
          return diarizerLoad;
        };
        lazyDiarizer = {
          modelId: PYANNOTE_SEGMENTATION_3_INT8_MODEL_ID,
          sampleRate: SPEAKER_GGML_SAMPLE_RATE,
          async diarizeWindow(pcm: Float32Array) {
            const diarizer = await loadDiarizerOnce();
            return diarizer.diarizeWindow(pcm);
          },
          async dispose(): Promise<void> {
            if (!diarizerLoad) return;
            let loaded: Diarizer;
            try {
              loaded = await diarizerLoad;
            } catch {
              return;
            }
            await loaded.dispose();
          },
        };
      }
      attributionPipeline = new VoiceAttributionPipeline({
        encoder: lazyEncoder,
        ...(lazyDiarizer ? { diarizer: lazyDiarizer } : {}),
        profileStore: opts.profileStore,
      });
    }

    // Cancellation coordinator + optimistic policy. The TTS-stop callback
    // needs the bridge instance, so it is bound after construction.
    const wiring = buildCancellationWiring(opts);

    const bridge = new EngineVoiceBridge(
      scheduler,
      backend,
      opts.bundleRoot,
      lifecycle,
      ffiHandle,
      ffiContextRef,
      asrAvailable,
      phraseCache,
      attributionPipeline,
      selfVoiceImprint,
      wiring?.coordinator ?? null,
      wiring?.policy ?? null,
      isEventRuntime(opts.runtime) ? opts.runtime : null,
      opts.liveAttribution ?? null,
    );
    if (wiring) wiring.bindTtsStop(() => bridge.triggerBargeIn());
    return bridge;
  } catch (err) {
    // No partial activation: drop the native library before rethrowing.
    releaseFfiAfterFailure();
    throw err;
  }
}
|
|
1316
|
+
|
|
1317
|
+
/**
|
|
1318
|
+
* Kokoro-only path. Skips bundle-root / speaker-preset / FFI checks
|
|
1319
|
+
* (Kokoro picks voices by id against `KOKORO_VOICE_PACKS`) and
|
|
1320
|
+
* synthesizes a minimal `SpeakerPreset` keyed to the discovered voice
|
|
1321
|
+
* id. Defaults lifecycle loaders to empty handles since ORT owns the
|
|
1322
|
+
* model memory. `asrAvailable` is `false`: callers needing ASR
|
|
1323
|
+
* construct `createStreamingTranscriber` directly.
|
|
1324
|
+
*/
|
|
1325
|
+
private static startKokoroOnly(
  opts: EngineVoiceBridgeOptions,
): EngineVoiceBridge {
  // Kokoro-only startup: no speaker-preset file, no fused FFI handle, no
  // ASR, no speaker attribution. Throws `VoiceStartupError` if called
  // without `opts.kokoroOnly`.
  if (!opts.kokoroOnly) {
    throw new VoiceStartupError(
      "invalid-options",
      "[voice] startKokoroOnly called without `kokoroOnly` config — this is an internal error.",
    );
  }
  const kokoro = opts.kokoroOnly;
  const sampleRate = opts.sampleRate ?? kokoro.layout.sampleRate;

  // Probe the bundle root exactly once. Both the bridge's working
  // directory and the Kokoro backend anchor derive from this single
  // answer, so they cannot disagree if the path changes mid-startup.
  const existingBundleRoot =
    opts.bundleRoot && existsSync(opts.bundleRoot)
      ? opts.bundleRoot
      : undefined;
  const workDir = existingBundleRoot ?? localInferenceRoot();

  // Minimal preset derived from the Kokoro config.
  const preset = createKokoroSpeakerPreset(kokoro);

  // The backend is anchored at the bundle root when one exists; an
  // injected `kokoroFfi` is forwarded only when provided.
  const backend = createKokoroTtsBackend(kokoro, {
    bundleRoot: existingBundleRoot,
    ...(opts.kokoroFfi ? { ffi: opts.kokoroFfi } : {}),
  });

  // No bundle-seeded phrases on this path; only caller-prewarmed entries.
  const phraseCache = new PhraseCache();
  for (const entry of opts.prewarmedPhrases ?? []) {
    phraseCache.put(entry);
  }

  const config: SchedulerConfig = {
    chunkerConfig: {
      maxTokensPerPhrase:
        opts.maxTokensPerPhrase ??
        readPositiveIntEnv("ELIZA_VOICE_MAX_TOKENS_PER_PHRASE") ??
        PHRASE_MAX_TOKENS_DEFAULT,
    },
    preset,
    ringBufferCapacity:
      opts.ringBufferCapacity ?? RING_BUFFER_CAPACITY_DEFAULT,
    sampleRate,
    maxInFlightPhrases:
      opts.maxInFlightPhrases ??
      readPositiveIntEnv("ELIZA_VOICE_MAX_IN_FLIGHT_PHRASES"),
  };

  const sinkOverride = opts.sink;
  const scheduler = new VoiceScheduler(
    config,
    sinkOverride
      ? { backend, sink: sinkOverride, phraseCache }
      : { backend, phraseCache },
    opts.events ?? {},
  );

  const registry = opts.sharedResources ?? new SharedResourceRegistry();
  const loaders = opts.lifecycleLoaders ?? kokoroOnlyLifecycleLoaders();
  const lifecycle = new VoiceLifecycle({ registry, loaders });

  const wiring = buildCancellationWiring(opts);

  const bridge = new EngineVoiceBridge(
    scheduler,
    backend,
    workDir,
    lifecycle,
    null, // no FFI handle on Kokoro-only
    null, // no FFI context on Kokoro-only
    false, // ASR is not served from this path
    phraseCache,
    null, // no profile store on Kokoro-only
    null, // no self-voice imprint without live attribution
    wiring?.coordinator ?? null,
    wiring?.policy ?? null,
  );
  // The TTS-stop callback needs the bridge, so it is bound last.
  if (wiring) wiring.bindTtsStop(() => bridge.triggerBargeIn());
  return bridge;
}
|
|
1409
|
+
|
|
1410
|
+
/**
|
|
1411
|
+
* True when this bridge runs against a TTS backend that produces real
|
|
1412
|
+
* audio — i.e. anything but the `StubOmniVoiceBackend` (which yields
|
|
1413
|
+
* zeros and is tests-only). The prewarm + first-audio-filler paths gate
|
|
1414
|
+
* on this so the cache never holds silence (AGENTS.md §3 — no fake data).
|
|
1415
|
+
*/
|
|
1416
|
+
hasRealTtsBackend(): boolean {
  // "Real" means any backend that is not an instance of the stub backend.
  const isStub = this.backend instanceof StubOmniVoiceBackend;
  return !isStub;
}
|
|
1419
|
+
|
|
1420
|
+
/**
|
|
1421
|
+
* Lazy-load the TTS mmap region, optional ASR region, and the voice
|
|
1422
|
+
* scheduler nodes via the lifecycle state machine. Idempotent for
|
|
1423
|
+
* repeated calls in `voice-on` (returns the existing armed resources).
|
|
1424
|
+
* Surfaces RAM pressure / mmap-fail / kernel-missing as `VoiceLifecycleError` —
|
|
1425
|
+
* see `lifecycle.ts` for the full error taxonomy.
|
|
1426
|
+
*/
|
|
1427
|
+
async arm(): Promise<void> {
  // No-op when the lifecycle already reports `voice-on`; otherwise
  // delegate to the lifecycle and wait for it to finish arming.
  const alreadyArmed = this.lifecycle.current().kind === "voice-on";
  if (!alreadyArmed) {
    await this.lifecycle.arm();
  }
}
|
|
1431
|
+
|
|
1432
|
+
/**
|
|
1433
|
+
* Drain in-flight TTS, settle the scheduler, then disarm the
|
|
1434
|
+
* lifecycle. Disarm calls `evictPages()` (madvise / VirtualUnlock
|
|
1435
|
+
* equivalent) on the TTS + optional ASR mmap regions and releases every
|
|
1436
|
+
* voice-only ref. Speaker preset + phrase cache survive in the
|
|
1437
|
+
* registry as small LRU entries (KB-scale; not worth evicting).
|
|
1438
|
+
*/
|
|
1439
|
+
async disarm(): Promise<void> {
  // Only a `voice-on` lifecycle can be disarmed. The scheduler is settled
  // first so no phrase is still in flight when the lifecycle disarms.
  const { kind } = this.lifecycle.current();
  if (kind === "voice-on") {
    await this.settle();
    await this.lifecycle.disarm();
  }
}
|
|
1444
|
+
|
|
1445
|
+
/**
|
|
1446
|
+
* Forward an accepted text token from the verifier into the scheduler.
|
|
1447
|
+
* Tokens that fill a phrase trigger TTS dispatch on the same scheduler
|
|
1448
|
+
* tick (AGENTS.md §4 — no buffering past phrase boundaries).
|
|
1449
|
+
*/
|
|
1450
|
+
async pushAcceptedToken(
  token: TextToken,
  acceptedAt = Date.now(),
): Promise<void> {
  // Hands the token to the scheduler; `acceptedAt` defaults to "now" at
  // call time when the caller supplies no timestamp.
  const { scheduler } = this;
  await scheduler.accept(token, acceptedAt);
}
|
|
1456
|
+
|
|
1457
|
+
/**
|
|
1458
|
+
* MTP rejection → rollback queue. The scheduler cancels any
|
|
1459
|
+
* in-flight TTS forward pass for phrases that overlap the rejected
|
|
1460
|
+
* token range and emits an `onRollback` event for observability.
|
|
1461
|
+
* Already-played audio cannot be unplayed; the chunker is sized so
|
|
1462
|
+
* rollback is rare and cheap.
|
|
1463
|
+
*/
|
|
1464
|
+
async pushRejectedRange(range: RejectedTokenRange): Promise<void> {
  // Forwards the rejected range to the scheduler and waits for it.
  const { scheduler } = this;
  await scheduler.reject(range);
}
|
|
1467
|
+
|
|
1468
|
+
/**
|
|
1469
|
+
* Voice activity detected on the mic input → cancel everything.
|
|
1470
|
+
* Drains the ring buffer immediately, flushes the chunker queue, and
|
|
1471
|
+
* marks every in-flight cancel signal so synthesise loops exit at the
|
|
1472
|
+
* next kernel boundary (AGENTS.md §4 — barge-in cancellation MUST be
|
|
1473
|
+
* within one kernel tick).
|
|
1474
|
+
*/
|
|
1475
|
+
triggerBargeIn(): void {
  // Text side first: cancel the in-flight fused turn, if there is one.
  const inFlightTurn = this.activePipeline;
  if (inFlightTurn) {
    inFlightTurn.cancel();
  }
  // Then the audio side: notify the scheduler's barge-in controller that
  // the mic is active.
  this.scheduler.bargeIn.onMicActive();
}
|
|
1485
|
+
|
|
1486
|
+
/**
|
|
1487
|
+
* W3-9 / F1 — the canonical voice cancellation coordinator for this
|
|
1488
|
+
* session, or `null` when the bridge was constructed without a
|
|
1489
|
+
* `runtime` option. Callers (turn controller, mic VAD source, UI cancel
|
|
1490
|
+
* route) use this to arm per-turn tokens, fire `bargeIn(roomId)` on
|
|
1491
|
+
* VAD speech-start, fire `revokeEot(roomId)` when the turn detector
|
|
1492
|
+
* revokes a tentative EOT, etc. See
|
|
1493
|
+
* `plugins/plugin-local-inference/docs/voice-cancellation-contract.md`.
|
|
1494
|
+
*/
|
|
1495
|
+
cancellationCoordinatorOrNull(): VoiceCancellationCoordinator | null {
  // Plain accessor: returns the stored coordinator, which may be null.
  const { cancellationCoordinator } = this;
  return cancellationCoordinator;
}
|
|
1498
|
+
|
|
1499
|
+
/**
|
|
1500
|
+
* W3-9 / F1 — the optimistic-generation policy for this session, or
|
|
1501
|
+
* `null` when the bridge was constructed without a `runtime` option.
|
|
1502
|
+
* The bridge primes it with the resolved power source at construction
|
|
1503
|
+
* time; callers can mutate it via `setPowerSource()` / `setOverride()`
|
|
1504
|
+
* to respond to Settings toggles or battery-state events.
|
|
1505
|
+
*/
|
|
1506
|
+
optimisticPolicyOrNull(): OptimisticGenerationPolicy | null {
  // Plain accessor: returns the stored policy, which may be null.
  const { optimisticGenerationPolicy } = this;
  return optimisticGenerationPolicy;
}
|
|
1509
|
+
|
|
1510
|
+
/**
|
|
1511
|
+
* W3-9 / F1 — bind the scheduler's `BargeInController` into the
|
|
1512
|
+
* cancellation coordinator for `roomId`. Subsequent
|
|
1513
|
+
* `BargeInController.hardStop()` calls (typically fired by the
|
|
1514
|
+
* ASR-confirmed barge-in words ladder) translate into
|
|
1515
|
+
* `coordinator.bargeIn(roomId)` so the canonical token (and every
|
|
1516
|
+
* downstream consumer: runtime turn abort, LM slot abort, TTS stop,
|
|
1517
|
+
* AbortSignal) sees the abort.
|
|
1518
|
+
*
|
|
1519
|
+
* Safe to call repeatedly per `roomId` — the prior binding for that room
|
|
1520
|
+
* is torn down first and a fresh unsubscribe handle is returned.
|
|
1521
|
+
*
|
|
1522
|
+
* When the bridge was constructed without a `runtime` option, this returns
|
|
1523
|
+
* an empty unsubscribe. Callers should still call it
|
|
1524
|
+
* unconditionally — back-compat for the legacy path is automatic.
|
|
1525
|
+
*/
|
|
1526
|
+
bindBargeInControllerForRoom(roomId: string): () => void {
  // Without a coordinator there is nothing to bind; hand back a no-op.
  const coordinator = this.cancellationCoordinator;
  if (!coordinator) {
    return () => undefined;
  }

  // Tear down any earlier binding for this room before creating a new one.
  const bindings = this.bargeInBindings;
  bindings.get(roomId)?.();

  const unbind = coordinator.bindBargeInController(
    roomId,
    this.scheduler.bargeIn,
  );
  bindings.set(roomId, unbind);

  // The returned handle unbinds, then removes the map entry only if it
  // still belongs to this binding (a later rebind may have replaced it).
  return () => {
    unbind();
    if (bindings.get(roomId) !== unbind) return;
    bindings.delete(roomId);
  };
}
|
|
1544
|
+
|
|
1545
|
+
/**
|
|
1546
|
+
* Drain pending phrase data and wait for in-flight TTS to settle.
|
|
1547
|
+
* Used at the end of a turn so callers can synchronise on a quiescent
|
|
1548
|
+
* scheduler before they tear it down.
|
|
1549
|
+
*/
|
|
1550
|
+
async settle(): Promise<void> {
  // Strictly sequential: flush pending phrase data first, then wait for
  // the scheduler to report idle.
  const { scheduler } = this;
  await scheduler.flushPending();
  await scheduler.waitIdle();
}
|
|
1554
|
+
|
|
1555
|
+
async synthesizeTextToWav(
  text: string,
  signal?: AbortSignal,
): Promise<Uint8Array> {
  // Synthesizes `text` through the scheduler and returns the audio as
  // WAV bytes. Requires the voice-on state (enforced by `assertVoiceOn`)
  // and a non-stub TTS backend; throws `VoiceStartupError` for the latter.
  this.assertVoiceOn("synthesize speech");
  const backendIsStub = !this.hasRealTtsBackend();
  if (backendIsStub) {
    throw new VoiceStartupError(
      "missing-fused-build",
      "[voice] Direct speech synthesis requires a fused OmniVoice backend. The deterministic test backend is only allowed in scheduler/unit tests.",
    );
  }
  const { pcm, sampleRate } = await this.scheduler.synthesizeText(
    text,
    signal,
  );
  return encodeMonoPcm16Wav(pcm, sampleRate);
}
|
|
1569
|
+
|
|
1570
|
+
/**
|
|
1571
|
+
* The streaming-TTS seam W9's scheduler drives: returns the active
|
|
1572
|
+
* backend as a `StreamingTtsBackend` (`FfiOmniVoiceBackend` against the
|
|
1573
|
+
* fused build, `StubOmniVoiceBackend` for tests). The scheduler calls
|
|
1574
|
+
* `synthesizeStream(...)` for each phrase and writes the delivered PCM
|
|
1575
|
+
* segments into its `PcmRingBuffer` on the same scheduler tick. Returns
|
|
1576
|
+
* null when an injected `backendOverride` does not implement the seam.
|
|
1577
|
+
*/
|
|
1578
|
+
streamingTtsBackend(): StreamingTtsBackend | null {
  // Returns the active backend when it passes the streaming type guard,
  // otherwise null.
  const candidate = this.backend;
  if (isStreamingTtsBackend(candidate)) {
    return candidate;
  }
  return null;
}
|
|
1581
|
+
|
|
1582
|
+
/**
|
|
1583
|
+
* True when the loaded fused `libelizainference` runs the MTP
|
|
1584
|
+
* speculative loop in-process and can emit native accept/reject
|
|
1585
|
+
* verifier events. When true, callers (W9's turn controller /
|
|
1586
|
+
* `ffi-streaming-backend.ts` wiring) should subscribe via
|
|
1587
|
+
* `subscribeNativeVerifier()` and SKIP the `llama-server` SSE
|
|
1588
|
+
* `{"verifier":{"rejected":[a,b]}}` side-channel — the SSE path stays
|
|
1589
|
+
* only as the non-fused desktop text fallback. False whenever there is
|
|
1590
|
+
* no FFI handle or the build pre-dates the verifier callback.
|
|
1591
|
+
*/
|
|
1592
|
+
hasNativeVerifier(): boolean {
  // Hard-wired to false for now. ABI v3 exports
  // `eliza_inference_set_verifier_callback`, but the current generated
  // adapter returns ELIZA_ERR_NOT_IMPLEMENTED until the native MTP
  // speculative loop is ported into libelizainference. The mere presence
  // of the symbol must not let callers skip the SSE verifier fallback.
  const nativeVerifierReady = false;
  return nativeVerifierReady;
}
|
|
1600
|
+
|
|
1601
|
+
/**
 * Install the native MTP verifier callback on the fused runtime and bridge
 * each `NativeVerifierEvent` into the rollback-queue domain.
 *
 * For every event the optional `onEvent` observer is invoked first with the
 * raw event (W9's phrase-chunker can commit accepted draft tokens directly
 * off it instead of round-tripping SSE deltas). The event is then passed
 * through `nativeRejectedRangeToRollbackRange`; when that yields a range it
 * is handed to `pushRejectedRange` (fire-and-forget).
 *
 * @param onEvent optional observer for the raw verifier events.
 * @returns the handle produced by `ffi.setVerifierCallback`; it MUST be
 *   `close()`d (clears the native callback + frees the bun:ffi `JSCallback`).
 * @throws VoiceStartupError (`missing-ffi`) when no fused runtime handle or
 *   no FFI context provider is available.
 */
subscribeNativeVerifier(onEvent?: (event: NativeVerifierEvent) => void): {
  close(): void;
} {
  const ffi = this.ffi;
  if (!ffi) {
    throw new VoiceStartupError(
      "missing-ffi",
      "[voice] subscribeNativeVerifier requires a loaded fused libelizainference handle",
    );
  }
  const contextRef = this.ffiContextRef;
  if (!contextRef) {
    throw new VoiceStartupError(
      "missing-ffi",
      "[voice] subscribeNativeVerifier: no FFI context provider",
    );
  }
  const ctx = contextRef.ensure();
  const forward = (event: NativeVerifierEvent): void => {
    if (onEvent) {
      onEvent(event);
    }
    const rollback = nativeRejectedRangeToRollbackRange(event);
    if (!rollback) return;
    void this.pushRejectedRange(rollback);
  };
  return ffi.setVerifierCallback(ctx, forward);
}
|
|
1639
|
+
|
|
1640
|
+
/**
 * Pre-synthesize the given phrases through the scheduler.
 *
 * @param texts phrases to warm.
 * @param opts optional `concurrency`, forwarded untouched to the scheduler.
 * @returns the scheduler's `{ warmed, cached }` counters.
 * @throws whatever `assertVoiceOn` raises when the lifecycle is not `voice-on`.
 */
async prewarmPhrases(
  texts: ReadonlyArray<string>,
  opts: { concurrency?: number } = {},
): Promise<{ warmed: number; cached: number }> {
  this.assertVoiceOn("prewarm voice phrases");
  const outcome = await this.scheduler.prewarmPhrases(texts, opts);
  return outcome;
}
|
|
1647
|
+
|
|
1648
|
+
/**
 * Idle-time auto-prewarm hook. Synthesizes the canonical phrase-cache seed
 * (`DEFAULT_PHRASE_CACHE_SEED`) so common openers/acks are cached before
 * the next turn; the voice bridge / connector calls this while the loop is
 * idle.
 *
 * Does nothing and reports `{ warmed: 0, cached: 0 }` unless a real TTS
 * backend is present AND the lifecycle is `voice-on` — the test backend's
 * zeros are never cached (AGENTS.md §3).
 *
 * @param opts optional `concurrency`, forwarded to the scheduler.
 */
async prewarmIdlePhrases(
  opts: { concurrency?: number } = {},
): Promise<{ warmed: number; cached: number }> {
  const ready =
    this.hasRealTtsBackend() &&
    this.lifecycle.current().kind === "voice-on";
  if (!ready) {
    return { warmed: 0, cached: 0 };
  }
  return this.scheduler.prewarmPhrases(DEFAULT_PHRASE_CACHE_SEED, opts);
}
|
|
1665
|
+
|
|
1666
|
+
/**
 * First-audio filler (AGENTS.md §4 / H4). The instant W1's VAD fires
 * `speech-start`, a short cached acknowledgement ("one sec", "okay", …) is
 * pushed into the audio sink to mask first-token latency. W9's turn
 * controller owns the call site (it receives `speech-start` and performs
 * the cutover to real `replyText` audio); this method is only the seam.
 *
 * Nothing is synthesized here: the method walks `FIRST_AUDIO_FILLERS` in
 * order and plays the first entry that is already in the phrase cache with
 * non-empty PCM. When real reply audio is ready, W9 cuts over by writing it
 * through the scheduler as usual (a `triggerBargeIn()` or a direct
 * `ringBuffer.drain()` truncates any still-playing filler first).
 *
 * @returns the text of the filler that was played, or `null` when none was
 *   (no real TTS backend, voice not armed, or no filler phrase cached).
 */
playFirstAudioFiller(): string | null {
  const mayPlay =
    this.hasRealTtsBackend() &&
    this.lifecycle.current().kind === "voice-on";
  if (!mayPlay) return null;

  for (const phrase of FIRST_AUDIO_FILLERS) {
    const entry = this.phraseCache.get(phrase);
    if (entry && entry.pcm.length > 0) {
      this.scheduler.ringBuffer.write(entry.pcm);
      const delivered = this.scheduler.ringBuffer.flushToSink();
      this.scheduler.markAgentSpeakingForAudio(delivered, entry.sampleRate);
      return entry.text;
    }
  }
  return null;
}
|
|
1693
|
+
|
|
1694
|
+
/**
 * Build a `StreamingTranscriber` for live ASR — the contract the voice turn
 * controller (W9) feeds mic frames into and the barge-in word-confirm gate
 * (W1) listens to.
 *
 * Adapter chain: fused `libelizainference` streaming ASR (final path, gated
 * on a working decoder AND a bundled ASR model) → fused batch ASR over the
 * same bundled model → `AsrUnavailableError`. The Eliza-1 bridge runs only
 * the fused path; the whisper.cpp interim fallback has been removed.
 *
 * @param opts optionally carries W1's `vad` event stream, which gates
 *   decoding to active speech windows.
 * @returns the transcriber; the caller owns its lifecycle (`dispose()`).
 * @throws whatever `assertVoiceOn` raises when the lifecycle is not `voice-on`.
 */
createStreamingTranscriber(opts?: {
  vad?: VadEventSource;
}): StreamingTranscriber {
  this.assertVoiceOn("create streaming transcriber");
  const ref = this.ffiContextRef;
  // Only offer a context getter when a context provider actually exists.
  const getContext = ref ? () => ref.ensure() : undefined;
  return createStreamingTranscriber({
    ffi: this.ffi,
    getContext,
    asrBundlePresent: this.asrAvailable,
    vad: opts?.vad,
  });
}
|
|
1718
|
+
|
|
1719
|
+
/**
|
|
1720
|
+
* Batch transcription (`transcribePcm`, defined below): one-shot over a whole PCM buffer. When the active
|
|
1721
|
+
* backend exposes the fused batch ASR ABI, use it directly so the native
|
|
1722
|
+
* side receives the original sample rate and can apply its own resampling.
|
|
1723
|
+
* Otherwise drive a `StreamingTranscriber` (fused streaming ASR →
|
|
1724
|
+
* fused-batch interim) by feeding the buffer as a single frame and
|
|
1725
|
+
* `flush()`ing. Throws `AsrUnavailableError` when no ASR backend is
|
|
1726
|
+
* available — never a silent empty string.
|
|
1727
|
+
*/
|
|
1728
|
+
/**
 * Transcribe a PCM buffer and return per-word timings through the fused
 * ASR (v12).
 *
 * Resolution order:
 *  1. the active backend's own `transcribeTimed`, when it exposes one;
 *  2. the FFI `asrTranscribeTimed` call, when an FFI handle, a context
 *     provider and the ASR bundle are present and `timedAsrSupported()` is
 *     true — the audio is resampled to `ASR_SAMPLE_RATE` first if needed;
 *  3. the plain `transcribePcm` transcript with an empty `words` array.
 *
 * @param args audio to transcribe (PCM plus its sample rate).
 * @param signal optional abort signal, checked on entry and after each
 *   transcription step.
 * @returns the transcript text and its word timings (possibly empty).
 * @throws the signal's `reason` when it is an `Error`, otherwise a
 *   `DOMException("Aborted", "AbortError")`, when the signal is aborted.
 */
async transcribePcmTimed(
  args: TranscriptionAudio,
  signal?: AbortSignal,
): Promise<{ text: string; words: AsrWordTiming[] }> {
  this.assertVoiceOn("transcribe audio");
  const bailIfAborted = (): void => {
    if (!signal?.aborted) return;
    throw signal.reason instanceof Error
      ? signal.reason
      : new DOMException("Aborted", "AbortError");
  };
  bailIfAborted();

  // 1. Backend-provided timed path.
  const timedBackend = this.backend as OmniVoiceBackend & {
    transcribeTimed?: (
      args: TranscriptionAudio,
    ) => Promise<{ text: string; words: AsrWordTiming[] }>;
  };
  if (typeof timedBackend.transcribeTimed === "function") {
    const timed = await timedBackend.transcribeTimed(args);
    bailIfAborted();
    return timed;
  }

  // 2. Direct FFI timed path.
  const ffi = this.ffi;
  const contextRef = this.ffiContextRef;
  if (ffi && contextRef && this.asrAvailable && ffi.timedAsrSupported()) {
    let pcm = args.pcm;
    if (args.sampleRate !== ASR_SAMPLE_RATE) {
      pcm = resampleLinear(args.pcm, args.sampleRate, ASR_SAMPLE_RATE);
    }
    const native = ffi.asrTranscribeTimed({
      ctx: contextRef.ensure(),
      pcm,
      sampleRateHz: ASR_SAMPLE_RATE,
    });
    bailIfAborted();
    return { text: native.text.trim(), words: native.words };
  }

  // 3. No timed path available — degrade to the text-only transcript.
  logger.debug(
    "[EngineVoiceBridge] timedAsrSupported()===false on the active fused build — per-word timings dropped, transcript player degrades to segment-level highlight",
  );
  const text = await this.transcribePcm(args, signal);
  return { text, words: [] };
}
|
|
1783
|
+
|
|
1784
|
+
/**
 * Transcribe a whole PCM buffer and return the transcript text.
 *
 * Paths, in order:
 *  - `onPartial` supplied: drive a streaming transcriber, feeding the audio
 *    in windows of `max(1600, round(sampleRate))` samples, and forward each
 *    growth of the running transcript to `onPartial` as a delta;
 *  - the active backend's own `transcribe`, when it exposes one;
 *  - the FFI `asrTranscribe` call (audio resampled to `ASR_SAMPLE_RATE`
 *    when needed), when an FFI handle, a context provider and the ASR
 *    bundle are present;
 *  - otherwise a streaming transcriber fed the buffer as a single frame and
 *    `flush()`ed.
 *
 * @param args audio to transcribe (PCM plus its sample rate).
 * @param signal optional abort signal; aborting disposes the in-flight
 *   transcriber and is checked on entry and after each transcription step.
 * @param onPartial optional sink for incremental transcript deltas.
 * @returns the final transcript.
 * @throws the signal's `reason` when it is an `Error`, otherwise a
 *   `DOMException("Aborted", "AbortError")`, when the signal is aborted.
 */
async transcribePcm(
  args: TranscriptionAudio,
  signal?: AbortSignal,
  onPartial?: (delta: string) => void,
): Promise<string> {
  this.assertVoiceOn("transcribe audio");
  const bailIfAborted = (): void => {
    if (!signal?.aborted) return;
    throw signal.reason instanceof Error
      ? signal.reason
      : new DOMException("Aborted", "AbortError");
  };
  bailIfAborted();

  // Streaming path: the caller wants partial transcripts (the TRANSCRIPTION
  // model handler forwards the runtime's onStreamChunk here). Drive the
  // fused streaming-ASR session and surface every running partial as a
  // delta — the same per-token pipe as chat text. Feeding ~1s windows lets
  // the decode surface partials progressively. When the fused build's
  // streaming decoder is a stub, createStreamingTranscriber resolves the
  // fused batch adapter and the final transcript is emitted once.
  if (onPartial) {
    const streaming = this.createStreamingTranscriber();
    let emittedChars = 0;
    const emitDelta = (running: string): void => {
      if (typeof running !== "string" || running.length <= emittedChars) {
        return;
      }
      const delta = running.slice(emittedChars);
      emittedChars = running.length;
      onPartial(delta);
    };
    const unsubscribe = streaming.on((ev) => {
      if (ev.kind === "partial" || ev.kind === "final") {
        emitDelta(ev.update.partial);
      }
    });
    const onAbort = () => streaming.dispose();
    try {
      signal?.addEventListener("abort", onAbort, { once: true });
      const windowSamples = Math.max(1600, Math.round(args.sampleRate));
      let offset = 0;
      while (offset < args.pcm.length) {
        if (signal?.aborted) break;
        const end = Math.min(offset + windowSamples, args.pcm.length);
        streaming.feed({
          pcm: args.pcm.subarray(offset, end),
          sampleRate: args.sampleRate,
          timestampMs: Math.round((offset / args.sampleRate) * 1000),
        });
        offset += windowSamples;
      }
      const flushed = await streaming.flush();
      emitDelta(flushed.partial);
      bailIfAborted();
      return flushed.partial;
    } finally {
      unsubscribe();
      signal?.removeEventListener("abort", onAbort);
      streaming.dispose();
    }
  }

  // Backend-provided batch path.
  const batchBackend = this.backend as OmniVoiceBackend & {
    transcribe?: (args: TranscriptionAudio) => Promise<string>;
  };
  if (typeof batchBackend.transcribe === "function") {
    const text = await batchBackend.transcribe(args);
    bailIfAborted();
    return text;
  }

  // Direct FFI batch path.
  const ffi = this.ffi;
  const contextRef = this.ffiContextRef;
  if (
    ffi &&
    contextRef &&
    this.asrAvailable &&
    typeof ffi.asrTranscribe === "function"
  ) {
    let pcm = args.pcm;
    if (args.sampleRate !== ASR_SAMPLE_RATE) {
      pcm = resampleLinear(args.pcm, args.sampleRate, ASR_SAMPLE_RATE);
    }
    const raw = ffi.asrTranscribe({
      ctx: contextRef.ensure(),
      pcm,
      sampleRateHz: ASR_SAMPLE_RATE,
    });
    const text = raw.trim();
    bailIfAborted();
    return text;
  }

  // Last resort: a streaming transcriber fed the whole buffer as one frame.
  const oneShot = this.createStreamingTranscriber();
  const onAbort = () => oneShot.dispose();
  try {
    signal?.addEventListener("abort", onAbort, { once: true });
    oneShot.feed({
      pcm: args.pcm,
      sampleRate: args.sampleRate,
      timestampMs: 0,
    });
    const flushed = await oneShot.flush();
    bailIfAborted();
    return flushed.partial;
  } finally {
    signal?.removeEventListener("abort", onAbort);
    oneShot.dispose();
  }
}
|
|
1900
|
+
|
|
1901
|
+
/**
 * Run one fused mic→speech turn through the overlapped `VoicePipeline`
 * (AGENTS.md §4): ASR streams; the instant its last token lands the
 * MTP drafter and the target verifier kick off concurrently, accepted
 * tokens flow into this bridge's phrase chunker → TTS → ring buffer on
 * the same tick, rejected draft tails roll back not-yet-spoken audio, and
 * a mic-VAD barge-in cancels everything at the next kernel boundary.
 *
 * The drafter + verifier are wired against the running MTP llama-server
 * (`textRunner`); the transcriber is the fused ABI's ASR when this bridge
 * was started with the FFI backend and the bundle ships an `asr/` region.
 * In voice mode a missing ASR region is a hard `VoiceStartupError` — no
 * silent cloud fallback (AGENTS.md §3 + §7).
 *
 * When an attribution pipeline is wired and either `events.onAttribution`
 * or an event runtime exists, speaker attribution is started in parallel on
 * the same PCM buffer. Its failures are logged and never fail the turn.
 *
 * The created pipeline is held in `activePipeline` until the turn ends so
 * `bargeIn()` can cancel it.
 * NOTE(review): earlier docs said this throws when a turn is already in
 * flight; no such check is visible in this method — confirm it is enforced
 * elsewhere.
 *
 * @param audio the utterance PCM for this turn.
 * @param textRunner MTP text runner backing the drafter and verifier.
 * @param config pipeline configuration.
 * @param events optional turn callbacks; `onAsrComplete` is wrapped so the
 *   transcript can be correlated with attribution before being forwarded.
 * @returns the turn's exit reason.
 * @throws whatever `assertVoiceOn`, `buildPipeline` or `pipeline.run` raise.
 */
async runVoiceTurn(
  audio: TranscriptionAudio,
  textRunner: MtpTextRunner,
  config: VoicePipelineConfig,
  events?: VoiceTurnEvents,
): Promise<"done" | "token-cap" | "cancelled"> {
  this.assertVoiceOn("run a voice turn");
  // The turn's ASR transcript materializes inside `pipeline.run` (the
  // `onAsrComplete` event) while attribution runs in parallel, so the two
  // have to be correlated. `transcriptReady` resolves with the joined ASR
  // text the instant ASR finalizes; the attribution `.then` awaits it before
  // emitting `VOICE_TURN_OBSERVED` so the merge engine sees *what* was said,
  // not just *who* said it (#8786). The `finally` below resolves it with the
  // captured text (or "") so a cancelled/no-ASR turn never hangs the await.
  let asrTranscript = "";
  let resolveTranscript: (text: string) => void = () => {};
  const transcriptReady = new Promise<string>((resolve) => {
    resolveTranscript = resolve;
  });
  const turnEvents: VoiceTurnEvents = {
    ...events,
    onAsrComplete(tokens) {
      asrTranscript = tokens.map((t) => t.text).join("");
      resolveTranscript(asrTranscript);
      events?.onAsrComplete?.(tokens);
    },
  };
  // If a profileStore was wired, kick off speaker-attribution in parallel
  // with ASR. The attribution uses the same PCM buffer as the transcriber
  // but runs through the diarizer + encoder + profile-store independently.
  // It is fire-and-forget from the pipeline's perspective: the result
  // arrives via `onAttribution` asynchronously (possibly after onComplete).
  if (
    this.attributionPipeline &&
    (turnEvents.onAttribution || this.eventRuntime)
  ) {
    const onAttribution = turnEvents.onAttribution;
    const attribution = this.attributionPipeline;
    const eventRuntime = this.eventRuntime;
    const liveAttribution = this.liveAttribution;
    const turnId = `turn-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`;
    void attribution
      .attribute({
        turnId,
        pcm: audio.pcm,
      })
      .then(async (output) => {
        // Automatic seam: when a full runtime is wired, emit
        // VOICE_TURN_OBSERVED and fold the speaker decision into the
        // turn's voiceTurnSignal BEFORE handing the (now-stamped)
        // output to the caller. Any caller with a profileStore +
        // runtime gets diarization-driven gating for free.
        if (eventRuntime) {
          const transcript = await transcriptReady;
          const { handleLiveVoiceAttribution } = await import(
            "../../runtime/voice-entity-binding.js"
          );
          const selfVoiceSimilarity =
            output.observation?.embedding && this.selfVoiceImprint
              ? await this.selfVoiceImprint.similarity(
                  output.observation.embedding,
                )
              : null;
          await handleLiveVoiceAttribution(eventRuntime, output, {
            ...resolveLiveAttributionOptions(liveAttribution, transcript),
            agentSpeaking: this.scheduler.bargeIn.isAgentSpeaking,
            ...(typeof selfVoiceSimilarity === "number"
              ? { selfVoiceSimilarity }
              : {}),
          });
        }
        onAttribution?.(output);
      })
      .catch((err: unknown) => {
        // Attribution failures must not crash the turn. Log and continue.
        logger.warn(
          {
            turnId,
            error: err instanceof Error ? err.message : String(err),
          },
          "[voice-bridge] speaker attribution failed",
        );
      });
  }
  // Pipeline construction happens INSIDE the try: attribution may already
  // be in flight and awaiting `transcriptReady`, so a throwing
  // `buildPipeline` must still reach the `finally` that settles it —
  // otherwise the attribution continuation would stay pending forever.
  let pipeline: VoicePipeline | null = null;
  try {
    pipeline = this.buildPipeline(textRunner, config, turnEvents);
    this.activePipeline = pipeline;
    return await pipeline.run(audio);
  } finally {
    // Settle the transcript promise so a cancelled/no-ASR/failed-to-build
    // turn (where `onAsrComplete` never fired) cannot leave the attribution
    // await pending. Resolving an already-resolved promise is a no-op.
    resolveTranscript(asrTranscript);
    if (pipeline !== null && this.activePipeline === pipeline) {
      this.activePipeline = null;
    }
  }
}
|
|
2014
|
+
|
|
2015
|
+
/**
 * Assemble (but do not run) the `VoicePipeline` for this bridge. Exposed
 * for tests.
 *
 * The pipeline gets this bridge's scheduler, the transcriber chosen by
 * `resolveTranscriber()`, and a drafter/verifier pair both built on
 * `textRunner`.
 *
 * @param textRunner MTP text runner shared by drafter and verifier.
 * @param config pipeline configuration, passed through.
 * @param events optional pipeline callbacks, passed through.
 */
buildPipeline(
  textRunner: MtpTextRunner,
  config: VoicePipelineConfig,
  events?: VoicePipelineEvents,
): VoicePipeline {
  const transcriber = this.resolveTranscriber();
  const drafter = new MtpDraftProposer(textRunner);
  const verifier = new MtpTargetVerifier(textRunner);
  const deps: VoicePipelineDeps = {
    scheduler: this.scheduler,
    transcriber,
    drafter,
    verifier,
  };
  return new VoicePipeline(deps, config, events);
}
|
|
2030
|
+
|
|
2031
|
+
/**
 * Pick the pipeline's ASR backend: a live `StreamingTranscriber` — the
 * fused `eliza_inference_asr_stream_*` decoder when the loaded build
 * advertises one and the bundle ships an `asr/` region, else the fused
 * batch ASR adapter. The `VoicePipeline` drives it as a batch (feed the
 * whole utterance, `flush()`, split the transcript into tokens).
 *
 * An `AsrUnavailableError` from the factory is not thrown here; it is
 * converted into a `MissingAsrTranscriber` carrying the same message, which
 * throws on first use (AGENTS.md §3, no silent cloud fallback). Any other
 * error propagates unchanged.
 */
private resolveTranscriber(): StreamingTranscriber {
  const contextRef = this.ffiContextRef;
  const getContext = contextRef ? () => contextRef.ensure() : undefined;
  try {
    return createStreamingTranscriber({
      ffi: this.ffi,
      getContext,
      asrBundlePresent: this.asrAvailable,
    });
  } catch (cause) {
    if (!(cause instanceof AsrUnavailableError)) {
      throw cause;
    }
    return new MissingAsrTranscriber(cause.message);
  }
}
|
|
2056
|
+
|
|
2057
|
+
/**
 * Diagnostic accessor.
 *
 * @returns the bundle root this bridge is wired against.
 */
bundlePath(): string {
  const root = this.bundleRoot;
  return root;
}
|
|
2061
|
+
|
|
2062
|
+
/**
 * Guard used by every voice entry point: returns normally only while the
 * lifecycle is `voice-on`.
 *
 * @param action human-readable verb phrase spliced into the error message.
 * @throws the stored lifecycle error when the state is `voice-error`;
 *   otherwise a `VoiceLifecycleError` (`illegal-transition`) naming the
 *   current state.
 */
private assertVoiceOn(action: string): void {
  const state = this.lifecycle.current();
  switch (state.kind) {
    case "voice-on":
      return;
    case "voice-error":
      throw state.error;
    default:
      throw new VoiceLifecycleError(
        "illegal-transition",
        `[voice] Cannot ${action} while lifecycle is ${state.kind}. Call armVoice() and wait for voice-on first.`,
      );
  }
}
|
|
2073
|
+
}
|
|
2074
|
+
|
|
2075
|
+
// The mono PCM16 WAV codec lives in the dependency-light `wav-codec.ts` so
|
|
2076
|
+
// corpus / fixture / test code can encode + decode WAV without dragging in this
|
|
2077
|
+
// heavy module. Re-exported here for the existing callers.
|
|
2078
|
+
export { decodeMonoPcm16Wav, encodeMonoPcm16Wav };
|
|
2079
|
+
|
|
2080
|
+
/**
 * Read an environment variable as a strictly positive base-10 integer.
 *
 * The trimmed value must consist solely of digits (an optional leading `+`
 * is tolerated). `Number.parseInt` alone stops at the first non-digit, so
 * values such as `"1e3"`, `"1.5"` or `"12abc"` would silently be read as
 * `1`, `1` and `12`; those are now rejected so the caller falls back to its
 * default instead of using a number the operator never meant.
 *
 * @param name environment variable name.
 * @returns the parsed integer, or `undefined` when the variable is unset,
 *   blank, malformed, non-finite, or not greater than zero.
 */
function readPositiveIntEnv(name: string): number | undefined {
  const raw = process.env[name]?.trim();
  if (!raw) return undefined;
  // Whole-string match: no sign other than '+', no decimals, no exponent,
  // no trailing units.
  if (!/^\+?\d+$/.test(raw)) return undefined;
  const value = Number.parseInt(raw, 10);
  return Number.isFinite(value) && value > 0 ? value : undefined;
}
|
|
2086
|
+
|
|
2087
|
+
/**
|
|
2088
|
+
* Default lifecycle loaders (`defaultLifecycleLoaders`, defined below) derived from the bundle layout (per
|
|
2089
|
+
* AGENTS.md §2: `tts/omnivoice-<size>.gguf` + `asr/...`).
|
|
2090
|
+
*
|
|
2091
|
+
* When a live `ffi`/`ctx` pair is passed in, arming calls
|
|
2092
|
+
* `ffi.mmapAcquire(ctx, "tts" | "asr")` before the lifecycle can enter
|
|
2093
|
+
* `voice-on`, and the returned handles' `evictPages()` calls forward
|
|
2094
|
+
* to `ffi.mmapEvict(ctx, "tts" | "asr")`. The C ABI is declared in
|
|
2095
|
+
* `scripts/omnivoice-fuse/ffi.h`. Production builds may implement this
|
|
2096
|
+
* as page eviction or as a full voice-runtime unload for mobile RAM
|
|
2097
|
+
* pressure; callers must reacquire before using the region again. The
|
|
2098
|
+
* compatibility library returns `ELIZA_ERR_NOT_IMPLEMENTED`, which the binding raises as
|
|
2099
|
+
* `VoiceLifecycleError({code:"kernel-missing"})`.
|
|
2100
|
+
*
|
|
2101
|
+
* When `ffi` is null, acquire/evict are documented empty transitions — used by the
|
|
2102
|
+
* development TTS path in tests + dev (no real mmap exists). Directory and
|
|
2103
|
+
* "contains at least one file" checks still run for both TTS and ASR.
|
|
2104
|
+
* ASR never gets a virtual fallback: voice-on requires a real bundled ASR
|
|
2105
|
+
* model file so the FFI path can acquire the `"asr"` region and surface
|
|
2106
|
+
* the fused ABI's diagnostic if the runtime lacks the required region support.
|
|
2107
|
+
*/
|
|
2108
|
+
/**
 * Lazily-resolved FFI context: `ensure()` yields a usable context handle,
 * while `current` holds a handle or `null`.
 * NOTE(review): presumably `current` is the last handle `ensure()` produced —
 * confirm against the implementation.
 */
interface FfiContextRef {
  current: ElizaInferenceContextHandle | null;
  ensure(): ElizaInferenceContextHandle;
}

/**
 * Normalize the three accepted "context" shapes to a plain handle.
 *
 * @param ref `null`, an already-resolved handle, or an `FfiContextRef`.
 * @returns `null` for `null`; the result of `ref.ensure()` for an object
 *   exposing an `ensure` member; otherwise `ref` itself.
 */
function ensureContext(
  ref: ElizaInferenceContextHandle | FfiContextRef | null,
): ElizaInferenceContextHandle | null {
  if (ref === null) {
    return null;
  }
  // Anything that is not an object carrying `ensure` is already a handle.
  if (typeof ref !== "object" || !("ensure" in ref)) {
    return ref;
  }
  return ref.ensure();
}
|
|
2120
|
+
|
|
2121
|
+
/**
|
|
2122
|
+
* No-op lifecycle loaders for the Kokoro-only bridge (`kokoroOnlyLifecycleLoaders`, defined below). ORT owns the
|
|
2123
|
+
* model memory; nothing to mmap-acquire or evict. ASR is not served
|
|
2124
|
+
* from this path — callers that need ASR construct
|
|
2125
|
+
* `createStreamingTranscriber` directly (the fused-only chain in
|
|
2126
|
+
* `transcriber.ts`: fused streaming → fused batch → AsrUnavailableError).
|
|
2127
|
+
*/
|
|
2128
|
+
/**
 * Build an inert `MmapRegionHandle`: empty path, zero size, and
 * `evictPages` / `release` that resolve without doing anything.
 *
 * @param id identifier stamped on the returned handle.
 */
function noopMmapRegion(id: string): MmapRegionHandle {
  const region: MmapRegionHandle = {
    id,
    path: "",
    sizeBytes: 0,
    // Nothing to evict — ORT owns the model bytes.
    evictPages: async () => {},
    // No mmap region to release.
    release: async () => {},
  };
  return region;
}
|
|
2141
|
+
|
|
2142
|
+
/**
 * Lifecycle loaders for the Kokoro-only bridge. Every loader resolves to an
 * inert resource: the TTS/ASR regions are `noopMmapRegion`s and the
 * cache / scheduler-node entries only carry an id and a do-nothing
 * `release()`.
 */
function kokoroOnlyLifecycleLoaders(): VoiceLifecycleLoaders {
  const inert = (id: string) => ({
    id,
    async release() {},
  });
  return {
    loadTtsRegion: async () => noopMmapRegion("kokoro:tts"),
    loadAsrRegion: async () => noopMmapRegion("kokoro:asr"),
    loadVoiceCaches: async () => inert("kokoro:voice-caches"),
    loadVoiceSchedulerNodes: async () =>
      inert("kokoro:voice-scheduler-nodes"),
  };
}
|
|
2156
|
+
|
|
2157
|
+
/**
 * Default lifecycle loaders for a bundle rooted at `bundleRoot`.
 *
 * - TTS region: `bundleMmapRegion` over `<bundleRoot>/tts`, or an inert
 *   `tts-override:` region when `options.skipTtsRegion` is exactly `true`
 *   (evaluated each time the loader runs).
 * - ASR region: always `bundleMmapRegion` over `<bundleRoot>/asr`.
 * - Voice caches / scheduler nodes: id-only entries whose `release()` does
 *   nothing beyond letting the registry refcount drop.
 *
 * @param bundleRoot bundle directory.
 * @param ffi fused-runtime binding, or `null` when none is loaded.
 * @param ctx context handle, lazy context ref, or `null`.
 * @param options `skipTtsRegion` bypasses the on-disk TTS region.
 */
function defaultLifecycleLoaders(
  bundleRoot: string,
  ffi: ElizaInferenceFfi | null,
  ctx: ElizaInferenceContextHandle | FfiContextRef | null,
  options: { skipTtsRegion?: boolean } = {},
): VoiceLifecycleLoaders {
  const loadTtsRegion = async () => {
    if (options.skipTtsRegion === true) {
      return noopMmapRegion(`tts-override:${bundleRoot}`);
    }
    return bundleMmapRegion(path.join(bundleRoot, "tts"), "tts", ffi, ctx);
  };
  const loadAsrRegion = async () =>
    bundleMmapRegion(path.join(bundleRoot, "asr"), "asr", ffi, ctx);
  const loadVoiceCaches = async () => ({
    id: `voice-caches:${bundleRoot}`,
    async release() {
      // Caches stay live in the SpeakerPresetCache + PhraseCache
      // singletons; only the registry refcount drops on disarm.
    },
  });
  const loadVoiceSchedulerNodes = async () => ({
    id: `voice-scheduler-nodes:${bundleRoot}`,
    async release() {
      // Scheduler nodes (chunker, rollback, ring buffer, barge-in) are
      // owned by the bridge's `scheduler` field — no extra teardown
      // beyond the refcount drop.
    },
  });
  return {
    loadTtsRegion,
    loadAsrRegion,
    loadVoiceCaches,
    loadVoiceSchedulerNodes,
  };
}
|
|
2188
|
+
|
|
2189
|
+
/**
 * Produce the `MmapRegionHandle` for one bundle subdirectory.
 *
 * A region is never fabricated: a missing directory, or one without any
 * regular file, throws — which the lifecycle surfaces as
 * `VoiceLifecycleError` through its `arm-failed`/`mmap-fail` mapping (no
 * silent fallback to a smaller voice model — AGENTS.md §3).
 *
 * With an FFI binding and a resolvable context, `ffi.mmapAcquire` is called
 * before the handle is returned and `evictPages()` forwards to
 * `ffi.mmapEvict`. Without them (test mode) neither call happens because no
 * real mmap was made; the lifecycle test still asserts the call shape via
 * injected mocks.
 *
 * NOTE(review): `sizeBytes` is the `size` from `statSync` on the directory
 * itself, not the summed size of the model files — confirm that is what
 * consumers expect.
 *
 * @param dir bundle subdirectory holding the region's files.
 * @param kind region name passed to the FFI calls.
 * @param ffi fused-runtime binding, or `null`.
 * @param ctx context handle, lazy context ref, or `null`.
 * @throws Error when `dir` is missing or holds no regular file; anything
 *   `ffi.mmapAcquire` raises.
 */
function bundleMmapRegion(
  dir: string,
  kind: "tts" | "asr",
  ffi: ElizaInferenceFfi | null,
  ctx: ElizaInferenceContextHandle | FfiContextRef | null,
): MmapRegionHandle {
  if (!existsSync(dir)) {
    throw new Error(
      `[voice] mmap MAP_FAILED: ${kind} directory missing at ${dir}`,
    );
  }
  if (!directoryHasRegularFile(dir)) {
    throw new Error(
      `[voice] mmap MAP_FAILED: ${kind} directory has no model files at ${dir}`,
    );
  }
  // The directory's inode gives a stable id. Real FFI mmaps each weight
  // file independently; this default loader collapses them into a single
  // region per kind for refcount purposes.
  const dirStat = statSync(dir);
  if (ffi) {
    const handle = ensureContext(ctx);
    if (handle !== null) {
      // Real fused build: load or re-page the heavy voice region now. A
      // compatibility runtime without region support returns
      // ELIZA_ERR_NOT_IMPLEMENTED, which surfaces as
      // VoiceLifecycleError({code:"kernel-missing"}) before the lifecycle
      // can enter voice-on.
      ffi.mmapAcquire(handle, kind);
    }
  }
  return {
    id: `mmap:${kind}:${dirStat.ino}`,
    path: dir,
    sizeBytes: dirStat.size,
    async evictPages() {
      // No FFI handle (test TTS / no fused build) means nothing to evict.
      if (!ffi) return;
      const handle = ensureContext(ctx);
      if (handle === null) return;
      // Real fused build: madvise / VirtualUnlock through the C ABI.
      // Throws VoiceLifecycleError on a negative return — the lifecycle
      // catches and re-classifies via `disarm-failed`.
      ffi.mmapEvict(handle, kind);
    },
    async release() {
      // The FFI owns the actual mmap; release is only a refcount drop on
      // the JS side. The fused build's destroy path flushes any remaining
      // pages when the context is destroyed.
    },
  };
}
|
|
2250
|
+
|
|
2251
|
+
/** Re-export for the engine and tests that want the default loader. */
|
|
2252
|
+
export { defaultLifecycleLoaders };
|
|
2253
|
+
|
|
2254
|
+
/**
 * Candidate file names of the fused omnivoice shared library on the current
 * platform, in probe order.
 *
 * macOS uses a `.dylib`, Windows a `.dll`, everything else (Linux/Android)
 * a `.so`. Windows artifacts have shipped as both `elizainference.dll` and
 * `libelizainference.dll` across cross-build toolchains, so both are
 * returned there.
 */
function libraryFilenames(): string[] {
  switch (process.platform) {
    case "darwin":
      return ["libelizainference.dylib"];
    case "win32":
      return ["elizainference.dll", "libelizainference.dll"];
    default:
      return ["libelizainference.so"];
  }
}
|
|
2267
|
+
|
|
2268
|
+
/**
 * Resolve the path of the fused inference library for a bundle.
 *
 * Search order:
 *  1. `ELIZA_INFERENCE_LIBRARY`, verbatim, when set and existing;
 *  2. every `libraryFilenames()` name inside, in turn: `<bundleRoot>/lib`,
 *     the directory of `ELIZA_INFERENCE_LIBRARY` (when set), the
 *     `ELIZA_INFERENCE_LIB_DIR` directory (when set), and each non-empty
 *     entry of `managedFusedRuntimeDirs()`.
 *
 * @returns the first existing candidate; when none exists, the first
 *   library name under the first search directory (a path that may not
 *   exist — callers check with `existsSync`).
 */
function locateBundleLibrary(bundleRoot: string): string {
  const override = process.env.ELIZA_INFERENCE_LIBRARY?.trim();
  if (override && existsSync(override)) return override;

  const bundleLibDir = path.join(bundleRoot, "lib");
  const searchDirs: string[] = [bundleLibDir];
  if (override) {
    searchDirs.push(path.dirname(override));
  }
  const envDir = process.env.ELIZA_INFERENCE_LIB_DIR?.trim();
  if (envDir) {
    searchDirs.push(envDir);
  }
  for (const managed of managedFusedRuntimeDirs()) {
    if (managed) searchDirs.push(managed);
  }

  const names = libraryFilenames();
  for (const dir of searchDirs) {
    const hit = names
      .map((name) => path.join(dir, name))
      .find((candidate) => existsSync(candidate));
    if (hit !== undefined) return hit;
  }
  return path.join(
    searchDirs[0] ?? bundleLibDir,
    names[0] ?? "libelizainference.so",
  );
}
|
|
2290
|
+
|
|
2291
|
+
/**
 * Report whether `<bundleRoot>/tts` contains at least one regular file
 * named `omnivoice-<something>.gguf` (matched case-insensitively).
 *
 * A missing `tts` directory, or any error while listing it, yields
 * `false` rather than an exception.
 *
 * @param bundleRoot Root directory of the bundle to inspect.
 * @returns `true` when a matching weights file is present.
 */
function bundleHasOmniVoiceWeights(bundleRoot: string): boolean {
  const weightsPattern = /^omnivoice-.+\.gguf$/i;
  const ttsDir = path.join(bundleRoot, "tts");
  if (!existsSync(ttsDir)) {
    return false;
  }
  try {
    for (const entry of readdirSync(ttsDir, { withFileTypes: true })) {
      // Directories and other non-regular entries never count as weights.
      if (!entry.isFile()) continue;
      if (weightsPattern.test(entry.name)) return true;
    }
    return false;
  } catch {
    // Unreadable directory: treat as "no weights".
    return false;
  }
}
|
|
2302
|
+
|
|
2303
|
+
/**
 * Check whether a bundle directory has everything the OmniVoice path
 * looks for on disk.
 *
 * The checks run in order and stop at the first failure:
 *  1. `bundleRoot` is non-empty and exists.
 *  2. The file at `DEFAULT_VOICE_PRESET_REL_PATH` (relative to the bundle
 *     root) exists.
 *  3. `bundleHasOmniVoiceWeights(bundleRoot)` is true.
 *  4. The path returned by `locateBundleLibrary(bundleRoot)` exists.
 *
 * @param bundleRoot Root directory of the bundle to inspect.
 * @returns `true` only when every check above passes.
 */
export function isOmniVoiceBundleAvailable(bundleRoot: string): boolean {
  if (!bundleRoot) {
    return false;
  }
  if (!existsSync(bundleRoot)) {
    return false;
  }
  if (!existsSync(path.join(bundleRoot, DEFAULT_VOICE_PRESET_REL_PATH))) {
    return false;
  }
  if (!bundleHasOmniVoiceWeights(bundleRoot)) {
    return false;
  }
  return existsSync(locateBundleLibrary(bundleRoot));
}
|
|
2312
|
+
|
|
2313
|
+
/**
 * Report whether `dir` directly contains at least one regular file.
 *
 * Only immediate entries are inspected; subdirectories are not
 * descended into. Errors from listing the directory (for example when
 * it does not exist) are not caught here and propagate to the caller.
 *
 * @param dir Directory to list.
 * @returns `true` when any immediate entry is a regular file.
 */
function directoryHasRegularFile(dir: string): boolean {
  return readdirSync(dir, { withFileTypes: true }).some((entry) =>
    entry.isFile(),
  );
}
|
|
2319
|
+
|
|
2320
|
+
/**
 * Non-throwing variant of `directoryHasRegularFile`.
 *
 * @param dir Directory to inspect.
 * @returns `true` when `dir` exists and directly contains a regular
 *   file; `false` when it is missing or listing it fails.
 */
function bundleHasRegularFile(dir: string): boolean {
  try {
    return existsSync(dir) && directoryHasRegularFile(dir);
  } catch {
    // Listing failed (permissions, race with deletion, ...): report absence.
    return false;
  }
}
|
|
2328
|
+
|
|
2329
|
+
/**
 * Candidate directories under the local-inference root that may hold a
 * managed fused runtime for this platform and architecture.
 *
 * Setting `ELIZA_INFERENCE_MANAGED_LOOKUP` to `0` (after trimming)
 * disables the lookup and yields an empty list. Otherwise one directory
 * is returned per backend suffix, in the order metal, vulkan, cuda, cpu,
 * each shaped as
 * `<root>/bin/mtp/<platform>-<arch>-<backend>-fused`.
 * The directories are not checked for existence.
 *
 * @returns Directory paths in probe order, or `[]` when disabled.
 */
function managedFusedRuntimeDirs(): string[] {
  const lookupFlag = process.env.ELIZA_INFERENCE_MANAGED_LOOKUP?.trim();
  if (lookupFlag === "0") {
    return [];
  }

  const root = localInferenceRoot();
  const platform = process.platform;
  const arch = os.arch();
  const backendSuffixes = ["metal-fused", "vulkan-fused", "cuda-fused", "cpu-fused"];

  return backendSuffixes.map((suffix) =>
    path.join(root, "bin", "mtp", `${platform}-${arch}-${suffix}`),
  );
}
|