@elizaos/plugin-local-inference 2.0.0-beta.1 → 2.0.11-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +83 -0
- package/package.json +81 -15
- package/src/actions/generate-media.d.ts +59 -0
- package/src/actions/generate-media.d.ts.map +1 -0
- package/src/actions/generate-media.ts +647 -0
- package/src/actions/identify-speaker.d.ts +23 -0
- package/src/actions/identify-speaker.d.ts.map +1 -0
- package/src/actions/identify-speaker.ts +171 -0
- package/src/adapters/capacitor-llama/__tests__/compat-behavior.test.ts +218 -0
- package/src/adapters/capacitor-llama/__tests__/index.test.ts +68 -0
- package/src/adapters/capacitor-llama/__tests__/structured-output.test.ts +215 -0
- package/src/adapters/capacitor-llama/__tests__/text-streaming.test.ts +174 -0
- package/src/adapters/capacitor-llama/environment.ts +71 -0
- package/src/adapters/capacitor-llama/index.browser.ts +83 -0
- package/src/adapters/capacitor-llama/index.ts +807 -0
- package/src/adapters/capacitor-llama/loader.ts +109 -0
- package/src/adapters/capacitor-llama/structured-output.ts +165 -0
- package/src/adapters/capacitor-llama/text-streaming.ts +227 -0
- package/src/adapters/capacitor-llama/types.ts +374 -0
- package/src/backends/apple-foundation.ts +127 -0
- package/src/index.d.ts +7 -0
- package/src/index.d.ts.map +1 -0
- package/src/index.ts +54 -0
- package/src/local-inference-routes.d.ts +38 -0
- package/src/local-inference-routes.d.ts.map +1 -0
- package/src/local-inference-routes.test.ts +344 -0
- package/src/local-inference-routes.ts +1543 -0
- package/src/provider.d.ts +21 -0
- package/src/provider.d.ts.map +1 -0
- package/src/provider.ts +1171 -0
- package/src/routes/compat-helpers.d.ts +18 -0
- package/src/routes/compat-helpers.d.ts.map +1 -0
- package/src/routes/compat-helpers.ts +274 -0
- package/src/routes/family-member-route.d.ts +62 -0
- package/src/routes/family-member-route.d.ts.map +1 -0
- package/src/routes/family-member-route.ts +353 -0
- package/src/routes/index.d.ts +19 -0
- package/src/routes/index.d.ts.map +1 -0
- package/src/routes/index.ts +60 -0
- package/src/routes/live-diarization-route.d.ts +26 -0
- package/src/routes/live-diarization-route.d.ts.map +1 -0
- package/src/routes/live-diarization-route.test.ts +213 -0
- package/src/routes/live-diarization-route.ts +122 -0
- package/src/routes/local-inference-asr-route.d.ts +4 -0
- package/src/routes/local-inference-asr-route.d.ts.map +1 -0
- package/src/routes/local-inference-asr-route.test.ts +190 -0
- package/src/routes/local-inference-asr-route.ts +213 -0
- package/src/routes/local-inference-compat-routes.d.ts +16 -0
- package/src/routes/local-inference-compat-routes.d.ts.map +1 -0
- package/src/routes/local-inference-compat-routes.test.ts +423 -0
- package/src/routes/local-inference-compat-routes.ts +782 -0
- package/src/routes/local-inference-tts-route.d.ts +7 -0
- package/src/routes/local-inference-tts-route.d.ts.map +1 -0
- package/src/routes/local-inference-tts-route.test.ts +179 -0
- package/src/routes/local-inference-tts-route.ts +230 -0
- package/src/routes/voice-first-run-routes.d.ts +62 -0
- package/src/routes/voice-first-run-routes.d.ts.map +1 -0
- package/src/routes/voice-first-run-routes.ts +524 -0
- package/src/routes/voice-models-routes.d.ts +62 -0
- package/src/routes/voice-models-routes.d.ts.map +1 -0
- package/src/routes/voice-models-routes.ts +554 -0
- package/src/routes/voice-profile-plugin-routes.d.ts +19 -0
- package/src/routes/voice-profile-plugin-routes.d.ts.map +1 -0
- package/src/routes/voice-profile-plugin-routes.ts +138 -0
- package/src/routes/voice-profiles-management-routes.d.ts +52 -0
- package/src/routes/voice-profiles-management-routes.d.ts.map +1 -0
- package/src/routes/voice-profiles-management-routes.ts +476 -0
- package/src/routes/voice-speaker-profile-routes.d.ts +57 -0
- package/src/routes/voice-speaker-profile-routes.d.ts.map +1 -0
- package/src/routes/voice-speaker-profile-routes.ts +199 -0
- package/src/runtime/aosp-llama-loader-selection.test.ts +80 -0
- package/src/runtime/capacitor-llama.d.ts +25 -0
- package/src/runtime/embedding-manager-support.d.ts +77 -0
- package/src/runtime/embedding-manager-support.d.ts.map +1 -0
- package/src/runtime/embedding-manager-support.ts +497 -0
- package/src/runtime/embedding-presets.d.ts +16 -0
- package/src/runtime/embedding-presets.d.ts.map +1 -0
- package/src/runtime/embedding-presets.ts +81 -0
- package/src/runtime/embedding-warmup-policy.d.ts +14 -0
- package/src/runtime/embedding-warmup-policy.d.ts.map +1 -0
- package/src/runtime/embedding-warmup-policy.test.ts +53 -0
- package/src/runtime/embedding-warmup-policy.ts +48 -0
- package/src/runtime/ensure-local-inference-handler.d.ts +53 -0
- package/src/runtime/ensure-local-inference-handler.d.ts.map +1 -0
- package/src/runtime/ensure-local-inference-handler.test.ts +528 -0
- package/src/runtime/ensure-local-inference-handler.ts +1398 -0
- package/src/runtime/index.d.ts +14 -0
- package/src/runtime/index.d.ts.map +1 -0
- package/src/runtime/index.ts +27 -0
- package/src/runtime/mobile-local-inference-gate.d.ts +31 -0
- package/src/runtime/mobile-local-inference-gate.d.ts.map +1 -0
- package/src/runtime/mobile-local-inference-gate.test.ts +69 -0
- package/src/runtime/mobile-local-inference-gate.ts +44 -0
- package/src/runtime/voice-entity-binding.d.ts +103 -0
- package/src/runtime/voice-entity-binding.d.ts.map +1 -0
- package/src/runtime/voice-entity-binding.transcript.test.ts +69 -0
- package/src/runtime/voice-entity-binding.ts +328 -0
- package/src/services/README.md +71 -0
- package/src/services/__tests__/backend-selector.test.ts +101 -0
- package/src/services/__tests__/checkpoint-manager.test.ts +376 -0
- package/src/services/__tests__/gpu-autotune.test.ts +400 -0
- package/src/services/__tests__/llm-streaming-binding.test.ts +85 -0
- package/src/services/__tests__/planner-grammar.test.ts +372 -0
- package/src/services/__tests__/runtime-target.test.ts +176 -0
- package/src/services/active-model-switch-rollback.test.ts +183 -0
- package/src/services/active-model.d.ts +282 -0
- package/src/services/active-model.d.ts.map +1 -0
- package/src/services/active-model.ts +1213 -0
- package/src/services/asr/errors.d.ts +21 -0
- package/src/services/asr/errors.d.ts.map +1 -0
- package/src/services/asr/errors.ts +50 -0
- package/src/services/asr/hash.d.ts +28 -0
- package/src/services/asr/hash.d.ts.map +1 -0
- package/src/services/asr/hash.ts +49 -0
- package/src/services/asr/index.d.ts +76 -0
- package/src/services/asr/index.d.ts.map +1 -0
- package/src/services/asr/index.ts +178 -0
- package/src/services/asr/types.d.ts +91 -0
- package/src/services/asr/types.d.ts.map +1 -0
- package/src/services/asr/types.ts +95 -0
- package/src/services/assignments.d.ts +71 -0
- package/src/services/assignments.d.ts.map +1 -0
- package/src/services/assignments.test.ts +80 -0
- package/src/services/assignments.ts +230 -0
- package/src/services/backend-selector.ts +95 -0
- package/src/services/backend.d.ts +346 -0
- package/src/services/backend.d.ts.map +1 -0
- package/src/services/backend.ts +612 -0
- package/src/services/bundled-models.d.ts +34 -0
- package/src/services/bundled-models.d.ts.map +1 -0
- package/src/services/bundled-models.ts +129 -0
- package/src/services/cache-bridge.d.ts +206 -0
- package/src/services/cache-bridge.d.ts.map +1 -0
- package/src/services/cache-bridge.test.ts +516 -0
- package/src/services/cache-bridge.ts +423 -0
- package/src/services/catalog.d.ts +10 -0
- package/src/services/catalog.d.ts.map +1 -0
- package/src/services/catalog.test.ts +240 -0
- package/src/services/catalog.ts +27 -0
- package/src/services/checkpoint-client.d.ts +109 -0
- package/src/services/checkpoint-client.d.ts.map +1 -0
- package/src/services/checkpoint-client.ts +258 -0
- package/src/services/checkpoint-manager.ts +474 -0
- package/src/services/cloud-fallback.d.ts +102 -0
- package/src/services/cloud-fallback.d.ts.map +1 -0
- package/src/services/cloud-fallback.ts +230 -0
- package/src/services/conversation-registry.d.ts +142 -0
- package/src/services/conversation-registry.d.ts.map +1 -0
- package/src/services/conversation-registry.test.ts +235 -0
- package/src/services/conversation-registry.ts +264 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts +92 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts.map +1 -0
- package/src/services/desktop-fused-ffi-backend-runtime.ts +333 -0
- package/src/services/device-bridge.d.ts +188 -0
- package/src/services/device-bridge.d.ts.map +1 -0
- package/src/services/device-bridge.ts +1237 -0
- package/src/services/device-resource-metrics.d.ts +149 -0
- package/src/services/device-resource-metrics.d.ts.map +1 -0
- package/src/services/device-resource-metrics.test.ts +98 -0
- package/src/services/device-resource-metrics.ts +346 -0
- package/src/services/device-tier.d.ts +115 -0
- package/src/services/device-tier.d.ts.map +1 -0
- package/src/services/device-tier.test.ts +371 -0
- package/src/services/device-tier.ts +410 -0
- package/src/services/downloader.d.ts +82 -0
- package/src/services/downloader.d.ts.map +1 -0
- package/src/services/downloader.test.ts +724 -0
- package/src/services/downloader.ts +899 -0
- package/src/services/engine-direct-bundle.test.ts +58 -0
- package/src/services/engine-streaming.test.ts +80 -0
- package/src/services/engine.d.ts +534 -0
- package/src/services/engine.d.ts.map +1 -0
- package/src/services/engine.ts +1891 -0
- package/src/services/ensure-local-artifacts.integration.test.ts +273 -0
- package/src/services/ensure-local-artifacts.test.ts +368 -0
- package/src/services/ensure-local-artifacts.ts +351 -0
- package/src/services/external-scanner.d.ts +17 -0
- package/src/services/external-scanner.d.ts.map +1 -0
- package/src/services/external-scanner.ts +312 -0
- package/src/services/ffi-llm-mock.ts +354 -0
- package/src/services/ffi-llm-streaming-abi.ts +442 -0
- package/src/services/ffi-streaming-backend.d.ts +180 -0
- package/src/services/ffi-streaming-backend.d.ts.map +1 -0
- package/src/services/ffi-streaming-backend.ts +382 -0
- package/src/services/ffi-streaming-runner.d.ts +122 -0
- package/src/services/ffi-streaming-runner.d.ts.map +1 -0
- package/src/services/ffi-streaming-runner.test.ts +60 -0
- package/src/services/ffi-streaming-runner.ts +354 -0
- package/src/services/ffi-unload-ordering.test.ts +162 -0
- package/src/services/gpu-autotune.ts +534 -0
- package/src/services/gpu-detect.ts +139 -0
- package/src/services/handler-registry.d.ts +72 -0
- package/src/services/handler-registry.d.ts.map +1 -0
- package/src/services/handler-registry.ts +240 -0
- package/src/services/hardware.d.ts +63 -0
- package/src/services/hardware.d.ts.map +1 -0
- package/src/services/hardware.test.ts +183 -0
- package/src/services/hardware.ts +404 -0
- package/src/services/hf-search.d.ts +26 -0
- package/src/services/hf-search.d.ts.map +1 -0
- package/src/services/hf-search.test.ts +69 -0
- package/src/services/hf-search.ts +420 -0
- package/src/services/image-description-runtime.d.ts +14 -0
- package/src/services/image-description-runtime.d.ts.map +1 -0
- package/src/services/image-description-runtime.test.ts +61 -0
- package/src/services/image-description-runtime.ts +118 -0
- package/src/services/imagegen/aosp-unavailable.d.ts +134 -0
- package/src/services/imagegen/aosp-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/aosp-unavailable.ts +229 -0
- package/src/services/imagegen/backend-selector.d.ts +118 -0
- package/src/services/imagegen/backend-selector.d.ts.map +1 -0
- package/src/services/imagegen/backend-selector.ts +281 -0
- package/src/services/imagegen/coreml-unavailable.d.ts +105 -0
- package/src/services/imagegen/coreml-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/coreml-unavailable.ts +237 -0
- package/src/services/imagegen/errors.d.ts +16 -0
- package/src/services/imagegen/errors.d.ts.map +1 -0
- package/src/services/imagegen/errors.ts +40 -0
- package/src/services/imagegen/index.d.ts +58 -0
- package/src/services/imagegen/index.d.ts.map +1 -0
- package/src/services/imagegen/index.ts +144 -0
- package/src/services/imagegen/mflux.d.ts +74 -0
- package/src/services/imagegen/mflux.d.ts.map +1 -0
- package/src/services/imagegen/mflux.ts +313 -0
- package/src/services/imagegen/sd-cpp.d.ts +180 -0
- package/src/services/imagegen/sd-cpp.d.ts.map +1 -0
- package/src/services/imagegen/sd-cpp.ts +718 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts +83 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/tensorrt-unavailable.ts +295 -0
- package/src/services/imagegen/types.d.ts +181 -0
- package/src/services/imagegen/types.d.ts.map +1 -0
- package/src/services/imagegen/types.ts +193 -0
- package/src/services/index.d.ts +30 -0
- package/src/services/index.d.ts.map +1 -0
- package/src/services/index.ts +225 -0
- package/src/services/inference-capabilities.d.ts +132 -0
- package/src/services/inference-capabilities.d.ts.map +1 -0
- package/src/services/inference-capabilities.test.ts +75 -0
- package/src/services/inference-capabilities.ts +204 -0
- package/src/services/inference-telemetry.d.ts +59 -0
- package/src/services/inference-telemetry.d.ts.map +1 -0
- package/src/services/inference-telemetry.ts +143 -0
- package/src/services/ios-llama-streaming.ts +248 -0
- package/src/services/kv-spill.d.ts +189 -0
- package/src/services/kv-spill.d.ts.map +1 -0
- package/src/services/kv-spill.test.ts +222 -0
- package/src/services/kv-spill.ts +356 -0
- package/src/services/latency-trace.d.ts +346 -0
- package/src/services/latency-trace.d.ts.map +1 -0
- package/src/services/latency-trace.test.ts +266 -0
- package/src/services/latency-trace.ts +844 -0
- package/src/services/llama-server-metrics.ts +304 -0
- package/src/services/llm-streaming-binding.d.ts +96 -0
- package/src/services/llm-streaming-binding.d.ts.map +1 -0
- package/src/services/llm-streaming-binding.ts +136 -0
- package/src/services/load-args.d.ts +82 -0
- package/src/services/load-args.d.ts.map +1 -0
- package/src/services/load-args.ts +81 -0
- package/src/services/manifest/eliza-1.manifest.v1.json +708 -0
- package/src/services/manifest/index.d.ts +4 -0
- package/src/services/manifest/index.d.ts.map +1 -0
- package/src/services/manifest/index.ts +66 -0
- package/src/services/manifest/manifest.test.ts +693 -0
- package/src/services/manifest/schema.d.ts +715 -0
- package/src/services/manifest/schema.d.ts.map +1 -0
- package/src/services/manifest/schema.ts +655 -0
- package/src/services/manifest/types.d.ts +30 -0
- package/src/services/manifest/types.d.ts.map +1 -0
- package/src/services/manifest/types.ts +55 -0
- package/src/services/manifest/validator.d.ts +66 -0
- package/src/services/manifest/validator.d.ts.map +1 -0
- package/src/services/manifest/validator.ts +569 -0
- package/src/services/memory-arbiter.d.ts +343 -0
- package/src/services/memory-arbiter.d.ts.map +1 -0
- package/src/services/memory-arbiter.test.ts +419 -0
- package/src/services/memory-arbiter.ts +1000 -0
- package/src/services/memory-monitor.d.ts +119 -0
- package/src/services/memory-monitor.d.ts.map +1 -0
- package/src/services/memory-monitor.test.ts +208 -0
- package/src/services/memory-monitor.ts +296 -0
- package/src/services/memory-pressure.d.ts +127 -0
- package/src/services/memory-pressure.d.ts.map +1 -0
- package/src/services/memory-pressure.ts +413 -0
- package/src/services/mtp-doctor.d.ts +13 -0
- package/src/services/mtp-doctor.d.ts.map +1 -0
- package/src/services/mtp-doctor.ts +78 -0
- package/src/services/network-policy.d.ts +127 -0
- package/src/services/network-policy.d.ts.map +1 -0
- package/src/services/network-policy.ts +346 -0
- package/src/services/paths.d.ts +6 -0
- package/src/services/paths.d.ts.map +1 -0
- package/src/services/paths.ts +25 -0
- package/src/services/planner-skeleton.d.ts +124 -0
- package/src/services/planner-skeleton.d.ts.map +1 -0
- package/src/services/planner-skeleton.ts +175 -0
- package/src/services/providers.d.ts +38 -0
- package/src/services/providers.d.ts.map +1 -0
- package/src/services/providers.ts +507 -0
- package/src/services/ram-budget-cache.test.ts +163 -0
- package/src/services/ram-budget.d.ts +110 -0
- package/src/services/ram-budget.d.ts.map +1 -0
- package/src/services/ram-budget.ts +0 -0
- package/src/services/readiness.d.ts +9 -0
- package/src/services/readiness.d.ts.map +1 -0
- package/src/services/readiness.test.ts +87 -0
- package/src/services/readiness.ts +238 -0
- package/src/services/recommendation.d.ts +111 -0
- package/src/services/recommendation.d.ts.map +1 -0
- package/src/services/recommendation.ts +672 -0
- package/src/services/registry.d.ts +35 -0
- package/src/services/registry.d.ts.map +1 -0
- package/src/services/registry.ts +151 -0
- package/src/services/router-handler.d.ts +92 -0
- package/src/services/router-handler.d.ts.map +1 -0
- package/src/services/router-handler.test.ts +45 -0
- package/src/services/router-handler.ts +376 -0
- package/src/services/routing-policy.d.ts +55 -0
- package/src/services/routing-policy.d.ts.map +1 -0
- package/src/services/routing-policy.ts +228 -0
- package/src/services/routing-preferences.d.ts +8 -0
- package/src/services/routing-preferences.d.ts.map +1 -0
- package/src/services/routing-preferences.ts +15 -0
- package/src/services/runtime-target.d.ts +98 -0
- package/src/services/runtime-target.d.ts.map +1 -0
- package/src/services/runtime-target.ts +154 -0
- package/src/services/service.d.ts +128 -0
- package/src/services/service.d.ts.map +1 -0
- package/src/services/service.test.ts +223 -0
- package/src/services/service.ts +735 -0
- package/src/services/session-pool.d.ts +72 -0
- package/src/services/session-pool.d.ts.map +1 -0
- package/src/services/session-pool.ts +153 -0
- package/src/services/structured-output/deterministic-repair.d.ts +23 -0
- package/src/services/structured-output/deterministic-repair.d.ts.map +1 -0
- package/src/services/structured-output/deterministic-repair.test.ts +169 -0
- package/src/services/structured-output/deterministic-repair.ts +443 -0
- package/src/services/structured-output/index.ts +4 -0
- package/src/services/structured-output.d.ts +311 -0
- package/src/services/structured-output.d.ts.map +1 -0
- package/src/services/structured-output.test.ts +483 -0
- package/src/services/structured-output.ts +712 -0
- package/src/services/transcription-priority.test.ts +211 -0
- package/src/services/tts/errors.ts +46 -0
- package/src/services/tts/index.ts +214 -0
- package/src/services/tts/tts-audio-cache.ts +235 -0
- package/src/services/tts/types.ts +157 -0
- package/src/services/types.d.ts +19 -0
- package/src/services/types.d.ts.map +1 -0
- package/src/services/types.ts +55 -0
- package/src/services/verify-on-device.d.ts +34 -0
- package/src/services/verify-on-device.d.ts.map +1 -0
- package/src/services/verify-on-device.test.ts +87 -0
- package/src/services/verify-on-device.ts +127 -0
- package/src/services/verify.d.ts +8 -0
- package/src/services/verify.d.ts.map +1 -0
- package/src/services/verify.ts +13 -0
- package/src/services/vision/aosp-unavailable.d.ts +115 -0
- package/src/services/vision/aosp-unavailable.d.ts.map +1 -0
- package/src/services/vision/aosp-unavailable.ts +163 -0
- package/src/services/vision/capacitor-llama.d.ts +99 -0
- package/src/services/vision/capacitor-llama.d.ts.map +1 -0
- package/src/services/vision/capacitor-llama.ts +255 -0
- package/src/services/vision/cloud-fallback.d.ts +47 -0
- package/src/services/vision/cloud-fallback.d.ts.map +1 -0
- package/src/services/vision/cloud-fallback.test.ts +243 -0
- package/src/services/vision/cloud-fallback.ts +268 -0
- package/src/services/vision/fallback-chain.test.ts +86 -0
- package/src/services/vision/hash.d.ts +71 -0
- package/src/services/vision/hash.d.ts.map +1 -0
- package/src/services/vision/hash.ts +157 -0
- package/src/services/vision/index.d.ts +95 -0
- package/src/services/vision/index.d.ts.map +1 -0
- package/src/services/vision/index.ts +251 -0
- package/src/services/vision/llama-server.d.ts +73 -0
- package/src/services/vision/llama-server.d.ts.map +1 -0
- package/src/services/vision/llama-server.ts +177 -0
- package/src/services/vision/types.d.ts +153 -0
- package/src/services/vision/types.d.ts.map +1 -0
- package/src/services/vision/types.ts +154 -0
- package/src/services/vision/vast-fallback.d.ts +18 -0
- package/src/services/vision/vast-fallback.d.ts.map +1 -0
- package/src/services/vision/vast-fallback.ts +127 -0
- package/src/services/vision-embedding-cache.d.ts +98 -0
- package/src/services/vision-embedding-cache.d.ts.map +1 -0
- package/src/services/vision-embedding-cache.ts +189 -0
- package/src/services/voice/VOICE_WORKBENCH.md +88 -0
- package/src/services/voice/__test-helpers__/fake-ffi.ts +92 -0
- package/src/services/voice/__test-helpers__/synthetic-speech.ts +124 -0
- package/src/services/voice/__tests__/checkpoint-manager.test.ts +241 -0
- package/src/services/voice/__tests__/checkpoint-policy.test.ts +270 -0
- package/src/services/voice/__tests__/eager-context-builder.test.ts +257 -0
- package/src/services/voice/__tests__/eliza1-eot-scorer.test.ts +288 -0
- package/src/services/voice/__tests__/eot-classifier.test.ts +431 -0
- package/src/services/voice/__tests__/optimistic-rollback.test.ts +312 -0
- package/src/services/voice/__tests__/prefill-client.test.ts +266 -0
- package/src/services/voice/__tests__/prefix-preserving-queue.test.ts +208 -0
- package/src/services/voice/__tests__/streaming-asr.test.ts +450 -0
- package/src/services/voice/__tests__/streaming-transcriber.test.ts +339 -0
- package/src/services/voice/__tests__/turn-detector-resolver.test.ts +197 -0
- package/src/services/voice/__tests__/voice-state-machine-prefill.test.ts +275 -0
- package/src/services/voice/__tests__/voice-state-machine.test.ts +354 -0
- package/src/services/voice/audio-frame-consumer.d.ts +212 -0
- package/src/services/voice/audio-frame-consumer.d.ts.map +1 -0
- package/src/services/voice/audio-frame-consumer.test.ts +343 -0
- package/src/services/voice/audio-frame-consumer.ts +491 -0
- package/src/services/voice/barge-in.d.ts +112 -0
- package/src/services/voice/barge-in.d.ts.map +1 -0
- package/src/services/voice/barge-in.test.ts +244 -0
- package/src/services/voice/barge-in.ts +336 -0
- package/src/services/voice/cancellation-coordinator.d.ts +127 -0
- package/src/services/voice/cancellation-coordinator.d.ts.map +1 -0
- package/src/services/voice/cancellation-coordinator.test.ts +196 -0
- package/src/services/voice/cancellation-coordinator.ts +269 -0
- package/src/services/voice/checkpoint-manager.d.ts +199 -0
- package/src/services/voice/checkpoint-manager.d.ts.map +1 -0
- package/src/services/voice/checkpoint-manager.ts +401 -0
- package/src/services/voice/checkpoint-policy.ts +336 -0
- package/src/services/voice/composite-eot-classifier.test.ts +59 -0
- package/src/services/voice/e2e-harness.test.ts +182 -0
- package/src/services/voice/e2e-harness.ts +743 -0
- package/src/services/voice/eager-context-builder.d.ts +170 -0
- package/src/services/voice/eager-context-builder.d.ts.map +1 -0
- package/src/services/voice/eager-context-builder.ts +262 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts +124 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/eliza1-eot-scorer.ts +242 -0
- package/src/services/voice/embedding-server.ts +200 -0
- package/src/services/voice/embedding.d.ts +133 -0
- package/src/services/voice/embedding.d.ts.map +1 -0
- package/src/services/voice/embedding.test.ts +148 -0
- package/src/services/voice/embedding.ts +244 -0
- package/src/services/voice/emotion-attribution.d.ts +68 -0
- package/src/services/voice/emotion-attribution.d.ts.map +1 -0
- package/src/services/voice/emotion-attribution.test.ts +129 -0
- package/src/services/voice/emotion-attribution.ts +361 -0
- package/src/services/voice/engine-bridge-cancellation.test.ts +422 -0
- package/src/services/voice/engine-bridge.d.ts +746 -0
- package/src/services/voice/engine-bridge.d.ts.map +1 -0
- package/src/services/voice/engine-bridge.test.ts +384 -0
- package/src/services/voice/engine-bridge.ts +2226 -0
- package/src/services/voice/eot-classifier-ggml.d.ts +179 -0
- package/src/services/voice/eot-classifier-ggml.d.ts.map +1 -0
- package/src/services/voice/eot-classifier-ggml.ts +566 -0
- package/src/services/voice/eot-classifier.d.ts +214 -0
- package/src/services/voice/eot-classifier.d.ts.map +1 -0
- package/src/services/voice/eot-classifier.ts +533 -0
- package/src/services/voice/errors.d.ts +20 -0
- package/src/services/voice/errors.d.ts.map +1 -0
- package/src/services/voice/errors.ts +32 -0
- package/src/services/voice/expressive-tags.d.ts +158 -0
- package/src/services/voice/expressive-tags.d.ts.map +1 -0
- package/src/services/voice/expressive-tags.ts +405 -0
- package/src/services/voice/ffi-bindings.d.ts +636 -0
- package/src/services/voice/ffi-bindings.d.ts.map +1 -0
- package/src/services/voice/ffi-bindings.test.ts +671 -0
- package/src/services/voice/ffi-bindings.ts +3050 -0
- package/src/services/voice/first-line-cache.d.ts +181 -0
- package/src/services/voice/first-line-cache.d.ts.map +1 -0
- package/src/services/voice/first-line-cache.ts +725 -0
- package/src/services/voice/fused-eot-scorer.d.ts +51 -0
- package/src/services/voice/fused-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/fused-eot-scorer.ts +135 -0
- package/src/services/voice/index.d.ts +91 -0
- package/src/services/voice/index.d.ts.map +1 -0
- package/src/services/voice/index.ts +481 -0
- package/src/services/voice/kokoro/__tests__/kokoro-backend.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.real.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.test.ts +60 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-discovery.test.ts +277 -0
- package/src/services/voice/kokoro/__tests__/kokoro-ffi-runtime.test.ts +235 -0
- package/src/services/voice/kokoro/__tests__/kokoro-runtime.test.ts +95 -0
- package/src/services/voice/kokoro/__tests__/phonemizer.test.ts +53 -0
- package/src/services/voice/kokoro/__tests__/runtime-selection.test.ts +231 -0
- package/src/services/voice/kokoro/__tests__/voices.test.ts +57 -0
- package/src/services/voice/kokoro/index.ts +79 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts +72 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-backend.ts +207 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts +58 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.ts +177 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts +75 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.ts +233 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts +100 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-runtime.ts +170 -0
- package/src/services/voice/kokoro/phoneme-stream.ts +123 -0
- package/src/services/voice/kokoro/phonemizer.d.ts +50 -0
- package/src/services/voice/kokoro/phonemizer.d.ts.map +1 -0
- package/src/services/voice/kokoro/phonemizer.ts +344 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts +61 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/pick-runtime.test.ts +91 -0
- package/src/services/voice/kokoro/pick-runtime.ts +130 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts +92 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts.map +1 -0
- package/src/services/voice/kokoro/runtime-selection.ts +237 -0
- package/src/services/voice/kokoro/types.d.ts +82 -0
- package/src/services/voice/kokoro/types.d.ts.map +1 -0
- package/src/services/voice/kokoro/types.ts +95 -0
- package/src/services/voice/kokoro/voice-presets.d.ts +23 -0
- package/src/services/voice/kokoro/voice-presets.d.ts.map +1 -0
- package/src/services/voice/kokoro/voice-presets.ts +129 -0
- package/src/services/voice/kokoro/voices.d.ts +30 -0
- package/src/services/voice/kokoro/voices.d.ts.map +1 -0
- package/src/services/voice/kokoro/voices.ts +64 -0
- package/src/services/voice/lifecycle.d.ts +135 -0
- package/src/services/voice/lifecycle.d.ts.map +1 -0
- package/src/services/voice/lifecycle.test.ts +315 -0
- package/src/services/voice/lifecycle.ts +301 -0
- package/src/services/voice/live-diarization-session.d.ts +96 -0
- package/src/services/voice/live-diarization-session.d.ts.map +1 -0
- package/src/services/voice/live-diarization-session.ts +289 -0
- package/src/services/voice/mic-source.d.ts +136 -0
- package/src/services/voice/mic-source.d.ts.map +1 -0
- package/src/services/voice/mic-source.test.ts +210 -0
- package/src/services/voice/mic-source.ts +503 -0
- package/src/services/voice/optimistic-policy.d.ts +109 -0
- package/src/services/voice/optimistic-policy.d.ts.map +1 -0
- package/src/services/voice/optimistic-policy.test.ts +101 -0
- package/src/services/voice/optimistic-policy.ts +192 -0
- package/src/services/voice/optimistic-rollback.ts +343 -0
- package/src/services/voice/partial-stabilizer.d.ts +73 -0
- package/src/services/voice/partial-stabilizer.d.ts.map +1 -0
- package/src/services/voice/partial-stabilizer.test.ts +68 -0
- package/src/services/voice/partial-stabilizer.ts +140 -0
- package/src/services/voice/phoneme-tokenizer.d.ts +49 -0
- package/src/services/voice/phoneme-tokenizer.d.ts.map +1 -0
- package/src/services/voice/phoneme-tokenizer.ts +158 -0
- package/src/services/voice/phrase-cache.d.ts +76 -0
- package/src/services/voice/phrase-cache.d.ts.map +1 -0
- package/src/services/voice/phrase-cache.test.ts +242 -0
- package/src/services/voice/phrase-cache.ts +186 -0
- package/src/services/voice/phrase-chunker.d.ts +62 -0
- package/src/services/voice/phrase-chunker.d.ts.map +1 -0
- package/src/services/voice/phrase-chunker.test.ts +239 -0
- package/src/services/voice/phrase-chunker.ts +281 -0
- package/src/services/voice/pipeline-impls.d.ts +151 -0
- package/src/services/voice/pipeline-impls.d.ts.map +1 -0
- package/src/services/voice/pipeline-impls.l6.test.ts +110 -0
- package/src/services/voice/pipeline-impls.test.ts +292 -0
- package/src/services/voice/pipeline-impls.ts +315 -0
- package/src/services/voice/pipeline.d.ts +216 -0
- package/src/services/voice/pipeline.d.ts.map +1 -0
- package/src/services/voice/pipeline.ts +505 -0
- package/src/services/voice/prefill-client.d.ts +123 -0
- package/src/services/voice/prefill-client.d.ts.map +1 -0
- package/src/services/voice/prefill-client.ts +316 -0
- package/src/services/voice/prefix-preserving-queue.d.ts +113 -0
- package/src/services/voice/prefix-preserving-queue.d.ts.map +1 -0
- package/src/services/voice/prefix-preserving-queue.ts +162 -0
- package/src/services/voice/profile-store.d.ts +248 -0
- package/src/services/voice/profile-store.d.ts.map +1 -0
- package/src/services/voice/profile-store.ts +887 -0
- package/src/services/voice/ring-buffer.d.ts +40 -0
- package/src/services/voice/ring-buffer.d.ts.map +1 -0
- package/src/services/voice/ring-buffer.ts +105 -0
- package/src/services/voice/rollback-queue.d.ts +24 -0
- package/src/services/voice/rollback-queue.d.ts.map +1 -0
- package/src/services/voice/rollback-queue.ts +74 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts +67 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-placeholder.test.ts +97 -0
- package/src/services/voice/samantha-preset-placeholder.ts +148 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts +87 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-regenerator.ts +393 -0
- package/src/services/voice/scheduler.d.ts +146 -0
- package/src/services/voice/scheduler.d.ts.map +1 -0
- package/src/services/voice/scheduler.t2.test.ts +141 -0
- package/src/services/voice/scheduler.ts +927 -0
- package/src/services/voice/shared-resources.d.ts +190 -0
- package/src/services/voice/shared-resources.d.ts.map +1 -0
- package/src/services/voice/shared-resources.ts +320 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts +74 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts.map +1 -0
- package/src/services/voice/speaker/attribution-pipeline.ts +386 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts +59 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer-fused.real.test.ts +100 -0
- package/src/services/voice/speaker/diarizer-fused.ts +154 -0
- package/src/services/voice/speaker/diarizer.d.ts +75 -0
- package/src/services/voice/speaker/diarizer.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer.ts +218 -0
- package/src/services/voice/speaker/encoder-fused.d.ts +60 -0
- package/src/services/voice/speaker/encoder-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-fused.real.test.ts +113 -0
- package/src/services/voice/speaker/encoder-fused.ts +138 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts +33 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-ggml.ts +79 -0
- package/src/services/voice/speaker/encoder.d.ts +37 -0
- package/src/services/voice/speaker/encoder.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder.ts +105 -0
- package/src/services/voice/speaker-imprint.d.ts +83 -0
- package/src/services/voice/speaker-imprint.d.ts.map +1 -0
- package/src/services/voice/speaker-imprint.test.ts +185 -0
- package/src/services/voice/speaker-imprint.ts +312 -0
- package/src/services/voice/speaker-preset-cache.d.ts +77 -0
- package/src/services/voice/speaker-preset-cache.d.ts.map +1 -0
- package/src/services/voice/speaker-preset-cache.test.ts +154 -0
- package/src/services/voice/speaker-preset-cache.ts +195 -0
- package/src/services/voice/streaming-asr/streaming-pipeline-adapter.ts +292 -0
- package/src/services/voice/system-audio-sink.d.ts +73 -0
- package/src/services/voice/system-audio-sink.d.ts.map +1 -0
- package/src/services/voice/system-audio-sink.test.ts +29 -0
- package/src/services/voice/system-audio-sink.ts +366 -0
- package/src/services/voice/transcriber.d.ts +244 -0
- package/src/services/voice/transcriber.d.ts.map +1 -0
- package/src/services/voice/transcriber.test.ts +392 -0
- package/src/services/voice/transcriber.ts +704 -0
- package/src/services/voice/turn-controller.d.ts +183 -0
- package/src/services/voice/turn-controller.d.ts.map +1 -0
- package/src/services/voice/turn-controller.test.ts +575 -0
- package/src/services/voice/turn-controller.ts +596 -0
- package/src/services/voice/types.d.ts +643 -0
- package/src/services/voice/types.d.ts.map +1 -0
- package/src/services/voice/types.ts +699 -0
- package/src/services/voice/vad.d.ts +282 -0
- package/src/services/voice/vad.d.ts.map +1 -0
- package/src/services/voice/vad.test.ts +480 -0
- package/src/services/voice/vad.ts +827 -0
- package/src/services/voice/vad.v1-v4.test.ts +222 -0
- package/src/services/voice/voice-budget.d.ts +241 -0
- package/src/services/voice/voice-budget.d.ts.map +1 -0
- package/src/services/voice/voice-budget.test.ts +420 -0
- package/src/services/voice/voice-budget.ts +656 -0
- package/src/services/voice/voice-duet.test.ts +375 -0
- package/src/services/voice/voice-emotion-classifier.d.ts +95 -0
- package/src/services/voice/voice-emotion-classifier.d.ts.map +1 -0
- package/src/services/voice/voice-emotion-classifier.test.ts +210 -0
- package/src/services/voice/voice-emotion-classifier.ts +273 -0
- package/src/services/voice/voice-preset-format.d.ts +158 -0
- package/src/services/voice/voice-preset-format.d.ts.map +1 -0
- package/src/services/voice/voice-preset-format.ts +700 -0
- package/src/services/voice/voice-preset-generator.test.ts +89 -0
- package/src/services/voice/voice-profile-artifact.d.ts +116 -0
- package/src/services/voice/voice-profile-artifact.d.ts.map +1 -0
- package/src/services/voice/voice-profile-artifact.test.ts +138 -0
- package/src/services/voice/voice-profile-artifact.ts +518 -0
- package/src/services/voice/voice-profile-routes.d.ts +83 -0
- package/src/services/voice/voice-profile-routes.d.ts.map +1 -0
- package/src/services/voice/voice-profile-routes.test.ts +429 -0
- package/src/services/voice/voice-profile-routes.ts +425 -0
- package/src/services/voice/voice-scenario.ts +154 -0
- package/src/services/voice/voice-settings.d.ts +82 -0
- package/src/services/voice/voice-settings.d.ts.map +1 -0
- package/src/services/voice/voice-settings.ts +172 -0
- package/src/services/voice/voice-state-machine.d.ts +364 -0
- package/src/services/voice/voice-state-machine.d.ts.map +1 -0
- package/src/services/voice/voice-state-machine.ts +727 -0
- package/src/services/voice/voice-workbench-report.test.ts +168 -0
- package/src/services/voice/voice-workbench-report.ts +326 -0
- package/src/services/voice/voice-workbench.test.ts +158 -0
- package/src/services/voice/voice.test.ts +1070 -0
- package/src/services/voice/wake-word-ggml.d.ts +101 -0
- package/src/services/voice/wake-word-ggml.d.ts.map +1 -0
- package/src/services/voice/wake-word-ggml.ts +320 -0
- package/src/services/voice/wake-word.d.ts +255 -0
- package/src/services/voice/wake-word.d.ts.map +1 -0
- package/src/services/voice/wake-word.test.ts +298 -0
- package/src/services/voice/wake-word.ts +554 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts +70 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts.map +1 -0
- package/src/services/voice/wrap-with-first-line-cache.ts +267 -0
- package/src/services/voice-model-updater.d.ts +240 -0
- package/src/services/voice-model-updater.d.ts.map +1 -0
- package/src/services/voice-model-updater.ts +724 -0
- package/src/services/voice-prewarm.d.ts +3 -0
- package/src/services/voice-prewarm.d.ts.map +1 -0
- package/src/services/voice-prewarm.ts +51 -0
- package/dist/index.d.ts +0 -37
- package/dist/index.js +0 -1098
|
@@ -0,0 +1,827 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Voice activity detection — the audio front-end's two-tier gate.
|
|
3
|
+
*
|
|
4
|
+
* Tier 1 — `RmsEnergyGate`. A frame-level RMS threshold with hysteresis.
|
|
5
|
+
* Sub-frame latency, no model. Its rising edge is the "wake the
|
|
6
|
+
* response pipeline" signal (KV-prefill the response prompt,
|
|
7
|
+
* preload the drafter, pre-generate the first filler). It NEVER
|
|
8
|
+
* substitutes for the model VAD — it only decides "is there
|
|
9
|
+
* acoustic activity right now".
|
|
10
|
+
*
|
|
11
|
+
* Tier 2 — a model VAD provider. Resolver order is an optional injected
|
|
12
|
+
* Qwen toolkit adapter when supplied, otherwise the fused
|
|
13
|
+
* `libelizainference` Silero v5 VAD ABI (`eliza_inference_vad_*`,
|
|
14
|
+
* backend id `silero-ggml`). 512-sample windows at 16 kHz (32 ms
|
|
15
|
+
* hop), one speech probability per window. This is the
|
|
16
|
+
* *authoritative* speech/no-speech signal — it gates ASR and drives
|
|
17
|
+
* turn-taking. The fused engine is the sole on-device VAD runtime;
|
|
18
|
+
* there is no standalone VAD library.
|
|
19
|
+
*
|
|
20
|
+
* `VadDetector` wires both together and emits the `VadEvent` stream
|
|
21
|
+
* (`speech-start` / `speech-active` / `speech-pause` / `speech-end` /
|
|
22
|
+
* `blip`) plus the raw `EnergyGateEvent` stream.
|
|
23
|
+
*
|
|
24
|
+
* No fallback sludge: if the fused VAD ABI is unavailable (and no injected
|
|
25
|
+
* adapter is supplied), `createVadDetector()` throws `VadUnavailableError`. The
|
|
26
|
+
* caller surfaces "VAD unavailable — voice features degrade" — there is no
|
|
27
|
+
* silent downgrade to the RMS gate, and no standalone-library fallback
|
|
28
|
+
* (AGENTS.md §3).
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import { existsSync } from "node:fs";
|
|
32
|
+
import path from "node:path";
|
|
33
|
+
import { localInferenceRoot } from "../paths";
|
|
34
|
+
import type {
|
|
35
|
+
ElizaInferenceContextHandle,
|
|
36
|
+
ElizaInferenceFfi,
|
|
37
|
+
NativeVadHandle,
|
|
38
|
+
} from "./ffi-bindings";
|
|
39
|
+
import type {
|
|
40
|
+
EnergyGateEvent,
|
|
41
|
+
EnergyGateListener,
|
|
42
|
+
PcmFrame,
|
|
43
|
+
VadEvent,
|
|
44
|
+
VadEventListener,
|
|
45
|
+
} from "./types";
|
|
46
|
+
|
|
47
|
+
/**
 * Error raised when no usable VAD backend can be brought up: the native VAD
 * FFI is absent or ABI-only, the model file is not on disk, the model fails
 * to load, or no provider could be resolved at all. Nothing falls back
 * silently — voice features that need VAD are expected to surface this.
 */
export class VadUnavailableError extends Error {
  /** Machine-readable reason the VAD could not be provided. */
  readonly code:
    | "ffi-missing"
    | "model-missing"
    | "model-load-failed"
    | "provider-missing";

  /**
   * @param code    Failure category, exposed as `code`.
   * @param message Human-readable description forwarded to `Error`.
   */
  constructor(code: VadUnavailableError["code"], message: string) {
    super(message);
    this.code = code;
    this.name = "VadUnavailableError";
  }
}
|
|
63
|
+
|
|
64
|
+
/** Relative path of the fused Silero v5 GGML VAD model inside an Eliza-1
|
|
65
|
+
* bundle. The file is read by `libelizainference`'s native VAD ABI. */
|
|
66
|
+
const SILERO_VAD_GGML_REL_PATH = path.join("vad", "silero-vad-v5.1.2.ggml.bin");
|
|
67
|
+
|
|
68
|
+
/**
 * Locate the fused-libelizainference Silero GGML VAD model file.
 *
 * When `modelPath` is supplied it is the only location considered: the
 * resolved absolute path is returned if the file exists, otherwise `null`
 * (a different model is never substituted). Without `modelPath`, the first
 * existing file among the following wins:
 *   1. `<bundleRoot>/vad/silero-vad-v5.1.2.ggml.bin` (only if `bundleRoot` is set)
 *   2. `<state-dir>/local-inference/vad/silero-vad-v5.1.2.ggml.bin`
 *   3. `$ELIZA_VAD_MODEL_PATH` (trimmed; ignored when empty)
 *
 * @returns The absolute path of the model, or `null` when nothing was found.
 */
export function resolveSileroVadPath(opts: {
  modelPath?: string;
  bundleRoot?: string;
}): string | null {
  if (opts.modelPath) {
    // Explicit path: honor it exactly, or report "not found".
    if (!existsSync(opts.modelPath)) return null;
    return path.resolve(opts.modelPath);
  }

  const bundleCandidate = opts.bundleRoot
    ? path.join(opts.bundleRoot, SILERO_VAD_GGML_REL_PATH)
    : undefined;
  const stateDirCandidate = path.join(
    localInferenceRoot(),
    SILERO_VAD_GGML_REL_PATH,
  );
  const envCandidate = process.env.ELIZA_VAD_MODEL_PATH?.trim() || undefined;

  const found = [bundleCandidate, stateDirCandidate, envCandidate].find(
    (candidate) => candidate !== undefined && existsSync(candidate),
  );
  return found === undefined ? null : path.resolve(found);
}
|
|
97
|
+
|
|
98
|
+
const SILERO_WINDOW_16K = 512; // samples per inference window @ 16 kHz
|
|
99
|
+
|
|
100
|
+
/**
 * Guard for the Silero sample rate: anything other than 16 000 Hz is rejected.
 *
 * @throws VadUnavailableError with code `model-load-failed` for any other rate.
 */
function validateSileroSampleRate(sampleRate: number): void {
  if (sampleRate === 16_000) return;
  throw new VadUnavailableError(
    "model-load-failed",
    `[voice] Silero VAD v5 only supports 16 kHz; got ${sampleRate}. Resample the mic stream to 16 kHz before the VAD.`,
  );
}
|
|
108
|
+
|
|
109
|
+
/**
 * Silero v5 GGML VAD backed by the fused `libelizainference` VAD ABI.
 *
 * Instances are created through `GgmlSileroVad.load()`, which opens a native
 * VAD session via `ffi.vadOpen`. `process()` pushes one fixed-size window
 * through `ffi.vadProcess` and returns its result (the speech probability per
 * the ABI), `reset()` forwards to `ffi.vadReset`, and `close()` releases the
 * session through `ffi.vadClose`.
 *
 * Every FFI entry point is invoked as a method on the `ffi` object (never as
 * a detached function reference) so bindings that rely on their `this`
 * receiver keep working.
 */
export class GgmlSileroVad {
  /** Sample rate (Hz) the session was opened with. */
  readonly sampleRate: number;
  /** Number of samples `process()` requires per call. */
  readonly windowSamples = SILERO_WINDOW_16K;
  /** Set once `close()` has run; later calls are rejected or ignored. */
  private closed = false;

  private constructor(
    private readonly ffi: ElizaInferenceFfi,
    private readonly handle: NativeVadHandle,
    sampleRate: number,
  ) {
    this.sampleRate = sampleRate;
  }

  /**
   * True when `ffi` exposes a `vadSupported` probe and that probe reports
   * support. A missing FFI object or a missing probe counts as unsupported.
   */
  static isSupported(ffi: ElizaInferenceFfi | null | undefined): boolean {
    if (!ffi || typeof ffi.vadSupported !== "function") return false;
    return ffi.vadSupported();
  }

  /**
   * Open a native VAD session.
   *
   * @param opts.ffi        The libelizainference bindings.
   * @param opts.ctx        Inference context handle, or a thunk producing one
   *                        (called only after all capability checks pass).
   * @param opts.sampleRate Defaults to 16 000; any other value is rejected.
   * @throws VadUnavailableError `model-load-failed` for an unsupported sample
   *         rate or when required VAD FFI methods are absent; `ffi-missing`
   *         when the build does not advertise VAD support.
   */
  static async load(opts: {
    ffi: ElizaInferenceFfi;
    ctx: ElizaInferenceContextHandle | (() => ElizaInferenceContextHandle);
    sampleRate?: number;
  }): Promise<GgmlSileroVad> {
    const sampleRate = opts.sampleRate ?? 16_000;
    validateSileroSampleRate(sampleRate);
    if (!GgmlSileroVad.isSupported(opts.ffi)) {
      throw new VadUnavailableError(
        "ffi-missing",
        "[voice] Native GGML Silero VAD is not supported by this libelizainference build. Rebuild with the GGML VAD runtime linked in (eliza_inference_vad_* symbols).",
      );
    }
    if (
      !opts.ffi.vadOpen ||
      !opts.ffi.vadProcess ||
      !opts.ffi.vadReset ||
      !opts.ffi.vadClose
    ) {
      throw new VadUnavailableError(
        "model-load-failed",
        "[voice] Native GGML Silero VAD support probe succeeded, but the required VAD FFI methods are missing.",
      );
    }
    const ctx = typeof opts.ctx === "function" ? opts.ctx() : opts.ctx;
    const handle = opts.ffi.vadOpen({ ctx, sampleRateHz: sampleRate });
    return new GgmlSileroVad(opts.ffi, handle, sampleRate);
  }

  /**
   * Run one window through the native VAD.
   *
   * @param window Exactly `windowSamples` samples.
   * @returns Whatever `ffi.vadProcess` yields for this window.
   * @throws Error if called after `close()`, if the window has the wrong
   *         length, or if the FFI method is missing.
   */
  async process(window: Float32Array): Promise<number> {
    if (this.closed) {
      throw new Error("[voice] GgmlSileroVad.process called after close()");
    }
    if (window.length !== SILERO_WINDOW_16K) {
      throw new Error(
        `[voice] GgmlSileroVad.process expects a ${SILERO_WINDOW_16K}-sample window; got ${window.length}`,
      );
    }
    if (!this.ffi.vadProcess) {
      throw new Error("[voice] GgmlSileroVad.process missing FFI method");
    }
    // Call through `this.ffi` so the binding keeps its receiver.
    return this.ffi.vadProcess({ vad: this.handle, pcm: window });
  }

  /** Forward a state reset to the native session. No-op after `close()`. */
  reset(): void {
    if (this.closed) return;
    if (!this.ffi.vadReset) {
      throw new Error("[voice] GgmlSileroVad.reset missing FFI method");
    }
    this.ffi.vadReset(this.handle);
  }

  /** Release the native session. Idempotent: only the first call does work. */
  close(): void {
    if (this.closed) return;
    // Mark closed first so a failing close is never retried on the handle.
    this.closed = true;
    if (!this.ffi.vadClose) {
      throw new Error("[voice] GgmlSileroVad.close missing FFI method");
    }
    this.ffi.vadClose(this.handle);
  }
}
|
|
201
|
+
|
|
202
|
+
/** @deprecated Use `GgmlSileroVad`. Kept as an alias while callers migrate
|
|
203
|
+
* off the legacy ONNX-era name. */
|
|
204
|
+
export const NativeSileroVad = GgmlSileroVad;
|
|
205
|
+
export type NativeSileroVad = GgmlSileroVad;
|
|
206
|
+
|
|
207
|
+
// ---------------------------------------------------------------------------
|
|
208
|
+
// Tier 1: cheap always-on RMS energy gate.
|
|
209
|
+
// ---------------------------------------------------------------------------
|
|
210
|
+
|
|
211
|
+
export interface RmsEnergyGateConfig {
|
|
212
|
+
/** RMS above this counts as activity. Default 0.012 — between the 0.01 the
|
|
213
|
+
* vision capture stream uses and the 0.05 Discord uses for speaking. */
|
|
214
|
+
riseThreshold?: number;
|
|
215
|
+
/** RMS must drop below this to count as quiet (hysteresis). Default
|
|
216
|
+
* `0.6 * riseThreshold`. */
|
|
217
|
+
fallThreshold?: number;
|
|
218
|
+
/** Consecutive ms below `fallThreshold` before emitting `energy-fall`.
|
|
219
|
+
* Default 200 ms. */
|
|
220
|
+
fallHoldMs?: number;
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
/**
 * Root-mean-square level of a PCM buffer.
 *
 * @param pcm Samples to measure.
 * @returns `sqrt(mean(sample²))`, or 0 for an empty buffer.
 */
export function rms(pcm: Float32Array): number {
  const count = pcm.length;
  if (count === 0) return 0;
  let sumOfSquares = 0;
  for (const sample of pcm) {
    sumOfSquares += sample * sample;
  }
  return Math.sqrt(sumOfSquares / count);
}
|
|
229
|
+
|
|
230
|
+
/**
 * RMS gate with hysteresis. Each pushed `PcmFrame` is measured; the gate
 * emits `energy-rise` on the first frame at or above `riseThreshold` and
 * `energy-fall` once the level has stayed below `fallThreshold` for at least
 * `fallHoldMs`. It is a fast pre-warm trigger, not a speech detector.
 */
export class RmsEnergyGate {
  private readonly riseThreshold: number;
  private readonly fallThreshold: number;
  private readonly fallHoldMs: number;
  private readonly listeners = new Set<EnergyGateListener>();
  private active = false;
  /** Timestamp of the first frame of the current quiet run, if any. */
  private quietSinceMs: number | null = null;

  /**
   * @param config Optional overrides. Defaults: rise 0.012, fall
   *               `0.6 * riseThreshold`, hold 200 ms.
   */
  constructor(config: RmsEnergyGateConfig = {}) {
    const rise = config.riseThreshold ?? 0.012;
    this.riseThreshold = rise;
    this.fallThreshold = config.fallThreshold ?? rise * 0.6;
    this.fallHoldMs = config.fallHoldMs ?? 200;
  }

  /** Whether the gate is currently open (between a rise and its fall). */
  get isActive(): boolean {
    return this.active;
  }

  /** Subscribe to gate events; the returned function unsubscribes. */
  onEvent(listener: EnergyGateListener): () => void {
    this.listeners.add(listener);
    return () => this.listeners.delete(listener);
  }

  /**
   * Feed one frame through the gate.
   *
   * @returns The frame's RMS, so callers can reuse it.
   */
  push(frame: PcmFrame): number {
    const level = rms(frame.pcm);
    if (this.active) {
      this.trackQuiet(level, frame.timestampMs);
    } else if (level >= this.riseThreshold) {
      // State is updated before emitting so listeners observe the open gate.
      this.active = true;
      this.quietSinceMs = null;
      this.emit({
        type: "energy-rise",
        timestampMs: frame.timestampMs,
        rms: level,
      });
    }
    return level;
  }

  /** Close the gate and forget any quiet run in progress. Emits nothing. */
  reset(): void {
    this.active = false;
    this.quietSinceMs = null;
  }

  /** While open: extend or cancel the quiet run, closing the gate when due. */
  private trackQuiet(level: number, nowMs: number): void {
    // Written as a negated `<` so a NaN level cancels the quiet run.
    if (!(level < this.fallThreshold)) {
      this.quietSinceMs = null;
      return;
    }
    const quietStartMs = this.quietSinceMs === null ? nowMs : this.quietSinceMs;
    this.quietSinceMs = quietStartMs;
    const quietMs = nowMs - quietStartMs;
    if (quietMs < this.fallHoldMs) return;
    this.active = false;
    this.quietSinceMs = null;
    this.emit({
      type: "energy-fall",
      timestampMs: nowMs,
      quietMs,
    });
  }

  /** Deliver an event to every registered listener, in insertion order. */
  private emit(event: EnergyGateEvent): void {
    this.listeners.forEach((listener) => listener(event));
  }
}
|
|
302
|
+
|
|
303
|
+
// ---------------------------------------------------------------------------
|
|
304
|
+
// Tier 2 driver: VadDetector — the Silero speech state machine.
|
|
305
|
+
// ---------------------------------------------------------------------------
|
|
306
|
+
|
|
307
|
+
export interface VadDetectorConfig {
|
|
308
|
+
/** Mic sample rate (Hz). MUST be 16 000 — Silero v5 is 16 kHz only. */
|
|
309
|
+
sampleRate?: number;
|
|
310
|
+
/** Speech probability above this opens a speech segment. Default 0.5. */
|
|
311
|
+
onsetThreshold?: number;
|
|
312
|
+
/** Speech probability must drop below this to count toward end-of-speech.
|
|
313
|
+
* Default `onsetThreshold - 0.15`. Below the onset to avoid flapping. */
|
|
314
|
+
offsetThreshold?: number;
|
|
315
|
+
/** Consecutive ms of speech-prob below `offsetThreshold` before the
|
|
316
|
+
* segment is considered *paused* (kick speculative response). Default
|
|
317
|
+
* 100 ms (lowered from 220ms; further reduction gated on semantic EOT
|
|
318
|
+
* classifier V2). Override via `ELIZA_PAUSE_HANGOVER_MS`. */
|
|
319
|
+
pauseHangoverMs?: number;
|
|
320
|
+
/**
|
|
321
|
+
* V1 — "fast endpoint" pause hangover, used when `fastEndpointEnabled`
|
|
322
|
+
* is true. Default 100 ms — short enough that a clean trailing-off
|
|
323
|
+
* end-of-utterance hits the speculative path quickly, but long enough
|
|
324
|
+
* to ride out mid-sentence micro-pauses. Gated by the flag so callers
|
|
325
|
+
* can opt in once they've validated the false-positive rate on their
|
|
326
|
+
* hardware.
|
|
327
|
+
*/
|
|
328
|
+
fastPauseHangoverMs?: number;
|
|
329
|
+
/**
|
|
330
|
+
* V1 — when true, use `fastPauseHangoverMs` instead of `pauseHangoverMs`.
|
|
331
|
+
* Default false until the streaming-ASR fast path (V2) ships.
|
|
332
|
+
*/
|
|
333
|
+
fastEndpointEnabled?: boolean;
|
|
334
|
+
/** Consecutive ms paused before the segment *ends* (finalize the turn).
|
|
335
|
+
* Default 700 ms. Must be ≥ `pauseHangoverMs`. */
|
|
336
|
+
endHangoverMs?: number;
|
|
337
|
+
/** A segment shorter than this (from onset to end) is reclassified as a
|
|
338
|
+
* `blip` rather than `speech-end`. Default 250 ms. */
|
|
339
|
+
minSpeechMs?: number;
|
|
340
|
+
/** Interval between `speech-active` heartbeats while speaking. Default
|
|
341
|
+
* 200 ms. */
|
|
342
|
+
activeHeartbeatMs?: number;
|
|
343
|
+
/**
|
|
344
|
+
* V4 — adaptive pause hangover. When the windowed RMS is in a sharp
|
|
345
|
+
* downward trend across the last few frames (the user audibly trailed
|
|
346
|
+
* off rather than stopping mid-thought), the hangover used to detect a
|
|
347
|
+
* pause is scaled by this factor (clamped to a minimum). Default 0.5
|
|
348
|
+
* (halve the hangover); set to 1.0 to disable.
|
|
349
|
+
*/
|
|
350
|
+
adaptiveHangoverScaleOnDrop?: number;
|
|
351
|
+
/**
|
|
352
|
+
* V4 — minimum hangover the adaptive scale is allowed to produce, ms.
|
|
353
|
+
* Default 50 ms. Prevents a steep drop from collapsing the hangover to
|
|
354
|
+
* zero and emitting a pause on a single quiet frame.
|
|
355
|
+
*/
|
|
356
|
+
adaptiveHangoverFloorMs?: number;
|
|
357
|
+
/**
|
|
358
|
+
* V4 — energy derivative (ΔRMS over the V4 history window) below this
|
|
359
|
+
* value, combined with RMS below `offsetThreshold`, counts as "audibly
|
|
360
|
+
* trailed off". Default -0.02 (negative slope: RMS dropping at least
|
|
361
|
+
* 0.02 / window).
|
|
362
|
+
*/
|
|
363
|
+
adaptiveHangoverDropThreshold?: number;
|
|
364
|
+
/** RMS gate config (tier 1). */
|
|
365
|
+
energyGate?: RmsEnergyGateConfig;
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
type SegmentPhase = "idle" | "speaking" | "paused";
|
|
369
|
+
|
|
370
|
+
export type { VadLike } from "./types.js";
|
|
371
|
+
|
|
372
|
+
import type { VadLike } from "./types.js";
|
|
373
|
+
|
|
374
|
+
export type VadProviderId = "qwen-toolkit" | "silero-ggml";
|
|
375
|
+
export type VadProviderPreference = "auto" | VadProviderId;
|
|
376
|
+
|
|
377
|
+
export interface QwenToolkitVadAdapter {
|
|
378
|
+
isAvailable?(): boolean | Promise<boolean>;
|
|
379
|
+
loadVad(opts: { sampleRate: number }): Promise<VadLike>;
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
export interface ResolvedVadProvider {
|
|
383
|
+
id: VadProviderId;
|
|
384
|
+
vad: VadLike;
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
export interface CreateVadDetectorOptions {
|
|
388
|
+
modelPath?: string;
|
|
389
|
+
bundleRoot?: string;
|
|
390
|
+
ffi?: ElizaInferenceFfi | null;
|
|
391
|
+
ctx?: ElizaInferenceContextHandle | (() => ElizaInferenceContextHandle);
|
|
392
|
+
qwenToolkitVad?: QwenToolkitVadAdapter | null;
|
|
393
|
+
config?: VadDetectorConfig;
|
|
394
|
+
prefer?: VadProviderPreference;
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
/**
 * Provider ids to try, in order, for a given preference.
 *
 * An explicit preference yields just that provider. `"auto"` yields
 * `qwen-toolkit` first (it is only usable when a caller injects an adapter)
 * followed by `silero-ggml`, the fused `libelizainference` VAD ABI.
 */
export function vadProviderOrder(
  prefer: VadProviderPreference = "auto",
): VadProviderId[] {
  return prefer === "auto" ? ["qwen-toolkit", "silero-ggml"] : [prefer];
}
|
|
407
|
+
|
|
408
|
+
/**
 * Walk the provider order for `opts.prefer` and return the first VAD that can
 * be loaded.
 *
 * - `qwen-toolkit` is used when an adapter is supplied and does not report
 *   itself unavailable (an adapter without `isAvailable` counts as available).
 * - `silero-ggml` needs both `ffi` and `ctx`, a build that advertises VAD
 *   support, and a model file that `resolveSileroVadPath` can find.
 *
 * A provider that cannot be used records a reason and the next one is tried.
 *
 * @throws VadUnavailableError `model-missing` when the fused VAD is supported
 *         but no model file is found; `provider-missing` when every provider
 *         was skipped (the message lists what was tried and why).
 */
export async function resolveVadProvider(
  opts: CreateVadDetectorOptions = {},
): Promise<ResolvedVadProvider> {
  const sampleRate = opts.config?.sampleRate ?? 16_000;
  const tried: string[] = [];
  const reasons: string[] = [];

  for (const provider of vadProviderOrder(opts.prefer)) {
    if (provider === "qwen-toolkit") {
      tried.push(provider);
      const adapter = opts.qwenToolkitVad;
      if (!adapter) {
        reasons.push("qwen-toolkit: no adapter supplied");
        continue;
      }
      const available = (await adapter.isAvailable?.()) ?? true;
      if (!available) {
        reasons.push("qwen-toolkit: adapter reported unavailable");
        continue;
      }
      const vad = await adapter.loadVad({ sampleRate });
      return { id: provider, vad };
    } else if (provider === "silero-ggml") {
      tried.push(provider);
      const { ffi, ctx } = opts;
      if (!ffi || !ctx) {
        reasons.push(
          "silero-ggml: libelizainference FFI / context not supplied",
        );
        continue;
      }
      if (!GgmlSileroVad.isSupported(ffi)) {
        reasons.push(
          "silero-ggml: libelizainference build does not export the VAD ABI (eliza_inference_vad_supported() == 0)",
        );
        continue;
      }
      // Check for the model file before opening the native session so that
      // "no model file" stays distinct from "ABI-only build".
      const modelOnDisk = resolveSileroVadPath({
        modelPath: opts.modelPath,
        bundleRoot: opts.bundleRoot,
      });
      if (!modelOnDisk) {
        throw new VadUnavailableError(
          "model-missing",
          `[voice] Fused Silero v5 GGML VAD model not found. Looked for ${SILERO_VAD_GGML_REL_PATH} in the Eliza-1 bundle and under ${localInferenceRoot()}, or set ELIZA_VAD_MODEL_PATH.`,
        );
      }
      const vad = await GgmlSileroVad.load({ ffi, ctx, sampleRate });
      return { id: provider, vad };
    }
  }

  throw new VadUnavailableError(
    "provider-missing",
    `[voice] No VAD provider available. Tried: ${tried.join(", ")}. Reasons: ${reasons.join("; ") || "none reported"}.`,
  );
}
|
|
477
|
+
|
|
478
|
+
/**
|
|
479
|
+
* The authoritative VAD. Owns a model VAD provider (or any `VadLike` for tests),
|
|
480
|
+
* an `RmsEnergyGate`, and the speech state machine. `pushFrame()` accepts
|
|
481
|
+
* mic frames of any length ≥ 1 sample; internally it re-windows to the
|
|
482
|
+
* provider's fixed sample window. Emits `VadEvent`s on the VAD timeline and
|
|
483
|
+
* `EnergyGateEvent`s on the fast timeline.
|
|
484
|
+
*
|
|
485
|
+
* Frame ingestion is serialized (`pushFrame` awaits the model forward pass)
|
|
486
|
+
* so events stay in order; callers that can't await may fire-and-forget — a
|
|
487
|
+
* dropped-frame counter (`droppedFrames`) records overruns.
|
|
488
|
+
*/
|
|
489
|
+
export class VadDetector {
|
|
490
|
+
readonly silero: VadLike;
|
|
491
|
+
readonly energyGate: RmsEnergyGate;
|
|
492
|
+
private readonly sampleRate: number;
|
|
493
|
+
private readonly onsetThreshold: number;
|
|
494
|
+
private readonly offsetThreshold: number;
|
|
495
|
+
private readonly pauseHangoverMs: number;
|
|
496
|
+
private readonly fastPauseHangoverMs: number;
|
|
497
|
+
private readonly fastEndpointEnabled: boolean;
|
|
498
|
+
private readonly endHangoverMs: number;
|
|
499
|
+
private readonly minSpeechMs: number;
|
|
500
|
+
private readonly activeHeartbeatMs: number;
|
|
501
|
+
// V4 — adaptive hangover state.
|
|
502
|
+
private readonly adaptiveHangoverScaleOnDrop: number;
|
|
503
|
+
private readonly adaptiveHangoverFloorMs: number;
|
|
504
|
+
private readonly adaptiveHangoverDropThreshold: number;
|
|
505
|
+
// Rolling RMS history (last 3 windows ≈ 96 ms @ 16 kHz / 512). The
|
|
506
|
+
// rate-of-drop (RMS slope) check reads from this once per window.
|
|
507
|
+
private readonly recentRms: number[] = [];
|
|
508
|
+
private static readonly RECENT_RMS_HISTORY = 3;
|
|
509
|
+
|
|
510
|
+
private readonly vadListeners = new Set<VadEventListener>();
|
|
511
|
+
|
|
512
|
+
private pending: Float32Array = new Float32Array(0);
|
|
513
|
+
private windowDurationMs: number;
|
|
514
|
+
private clockMs = 0; // timestamp of the *next* unconsumed sample
|
|
515
|
+
private busy: Promise<void> = Promise.resolve();
|
|
516
|
+
droppedFrames = 0;
|
|
517
|
+
|
|
518
|
+
private phase: SegmentPhase = "idle";
|
|
519
|
+
private speechStartMs = 0;
|
|
520
|
+
private lastSpeechMs = 0; // last window whose prob ≥ offsetThreshold
|
|
521
|
+
private pauseStartedMs = 0;
|
|
522
|
+
private lastHeartbeatMs = 0;
|
|
523
|
+
private peakRmsInSegment = 0;
|
|
524
|
+
|
|
525
|
+
constructor(silero: VadLike, config: VadDetectorConfig = {}) {
|
|
526
|
+
this.silero = silero;
|
|
527
|
+
this.sampleRate = config.sampleRate ?? silero.sampleRate;
|
|
528
|
+
if (this.sampleRate !== silero.sampleRate) {
|
|
529
|
+
throw new Error(
|
|
530
|
+
`[voice] VadDetector sample rate ${this.sampleRate} != Silero model rate ${silero.sampleRate}`,
|
|
531
|
+
);
|
|
532
|
+
}
|
|
533
|
+
this.onsetThreshold = config.onsetThreshold ?? 0.5;
|
|
534
|
+
this.offsetThreshold =
|
|
535
|
+
config.offsetThreshold ?? Math.max(0.1, this.onsetThreshold - 0.15);
|
|
536
|
+
// Lowered from 220ms; further reduction gated on semantic EOT classifier (V2).
|
|
537
|
+
// Override via ELIZA_PAUSE_HANGOVER_MS env var.
|
|
538
|
+
this.pauseHangoverMs =
|
|
539
|
+
config.pauseHangoverMs ?? readPauseHangoverMsEnv() ?? 100;
|
|
540
|
+
this.fastPauseHangoverMs = config.fastPauseHangoverMs ?? 100;
|
|
541
|
+
this.fastEndpointEnabled = config.fastEndpointEnabled ?? false;
|
|
542
|
+
this.endHangoverMs = Math.max(
|
|
543
|
+
this.fastEndpointEnabled
|
|
544
|
+
? this.fastPauseHangoverMs
|
|
545
|
+
: this.pauseHangoverMs,
|
|
546
|
+
config.endHangoverMs ?? 700,
|
|
547
|
+
);
|
|
548
|
+
this.minSpeechMs = config.minSpeechMs ?? 250;
|
|
549
|
+
this.activeHeartbeatMs = config.activeHeartbeatMs ?? 200;
|
|
550
|
+
this.adaptiveHangoverScaleOnDrop = Math.max(
|
|
551
|
+
0.1,
|
|
552
|
+
Math.min(1, config.adaptiveHangoverScaleOnDrop ?? 0.5),
|
|
553
|
+
);
|
|
554
|
+
this.adaptiveHangoverFloorMs = Math.max(
|
|
555
|
+
0,
|
|
556
|
+
config.adaptiveHangoverFloorMs ?? 50,
|
|
557
|
+
);
|
|
558
|
+
this.adaptiveHangoverDropThreshold =
|
|
559
|
+
config.adaptiveHangoverDropThreshold ?? -0.02;
|
|
560
|
+
this.energyGate = new RmsEnergyGate(config.energyGate);
|
|
561
|
+
this.windowDurationMs = (silero.windowSamples / this.sampleRate) * 1000;
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
/**
|
|
565
|
+
* Effective pause hangover for this window. Starts from
|
|
566
|
+
* `fastPauseHangoverMs` or `pauseHangoverMs` (V1: gated on
|
|
567
|
+
* `fastEndpointEnabled`), then optionally scales it down when the RMS
|
|
568
|
+
* trajectory shows an audible trail-off (V4).
|
|
569
|
+
*/
|
|
570
|
+
private effectivePauseHangoverMs(): number {
  // V1: the baseline is chosen by the fast-endpoint switch.
  const baseline = this.fastEndpointEnabled
    ? this.fastPauseHangoverMs
    : this.pauseHangoverMs;

  const history = this.recentRms;
  const sampleCount = history.length;

  // A scale of 1 (or more) means "never shorten"; with fewer than two RMS
  // samples there is no slope to measure. Either way the baseline stands.
  if (this.adaptiveHangoverScaleOnDrop >= 1 || sampleCount < 2) {
    return baseline;
  }

  // V4: average RMS change per window across the retained history.
  // A negative value means the energy is trailing off.
  const oldest = history[0];
  const newest = history[sampleCount - 1];
  const slopePerWindow = (newest - oldest) / (sampleCount - 1);

  // NOTE(review): `newest` is an RMS value while `offsetThreshold` is also
  // compared against the model probability elsewhere — confirm the two are
  // meant to share a scale.
  const trailingOff =
    slopePerWindow <= this.adaptiveHangoverDropThreshold &&
    newest < this.offsetThreshold;
  if (!trailingOff) {
    return baseline;
  }

  // Shorten the hangover, but never below the configured floor.
  return Math.max(
    this.adaptiveHangoverFloorMs,
    baseline * this.adaptiveHangoverScaleOnDrop,
  );
}
|
|
590
|
+
|
|
591
|
+
/**
 * Register a listener for VAD events.
 *
 * @param listener Callback invoked with every event passed to `emit`.
 * @returns A function that removes the listener again.
 */
onVadEvent(listener: VadEventListener): () => void {
  const unsubscribe = () => this.vadListeners.delete(listener);
  this.vadListeners.add(listener);
  return unsubscribe;
}
|
|
595
|
+
|
|
596
|
+
/**
 * Register a listener on the RMS energy gate owned by this detector.
 *
 * @param listener Callback forwarded to the energy gate's `onEvent`.
 * @returns Whatever unsubscribe function the energy gate hands back.
 */
onEnergyEvent(listener: EnergyGateListener): () => void {
  const unsubscribe = this.energyGate.onEvent(listener);
  return unsubscribe;
}
|
|
599
|
+
|
|
600
|
+
/** True while a speech segment (incl. its pause hangover) is open. */
|
|
601
|
+
get inSpeech(): boolean {
  // Every phase other than "idle" counts as an open segment.
  const isIdle = this.phase === "idle";
  return !isIdle;
}
|
|
604
|
+
|
|
605
|
+
/**
|
|
606
|
+
* Feed a mic frame. Returns a promise that resolves once every full
|
|
607
|
+
* Silero window contained in (the accumulated buffer up to) this frame
|
|
608
|
+
* has been processed and its events emitted. The fast RMS gate fires
|
|
609
|
+
* synchronously before the await.
|
|
610
|
+
*/
|
|
611
|
+
pushFrame(frame: PcmFrame): Promise<void> {
  // Refuse frames at the wrong rate before touching any state.
  if (frame.sampleRate !== this.sampleRate) {
    const mismatch = new Error(
      `[voice] VadDetector expects ${this.sampleRate} Hz frames; got ${frame.sampleRate}. Resample upstream of the VAD.`,
    );
    return Promise.reject(mismatch);
  }

  // Tier 1: the energy gate sees the frame synchronously, no model involved.
  this.energyGate.push(frame);

  // Snapshot what the queued task needs; it runs later, after `busy` settles.
  const samples = frame.pcm.slice();
  const frameTimestampMs = frame.timestampMs;

  const task = this.busy.then(async () => {
    // First frame on a fresh clock: adopt its timestamp so emitted times
    // are in the mic's time domain.
    const clockUnset = this.pending.length === 0 && this.clockMs === 0;
    if (clockUnset) {
      this.clockMs = frameTimestampMs;
    }

    // Grow the re-windowing buffer from inside the serialized chain, so the
    // shared buffer advances one frame at a time even when callers do not
    // await the returned promise.
    const carried = this.pending;
    const combined = new Float32Array(carried.length + samples.length);
    combined.set(carried, 0);
    combined.set(samples, carried.length);
    this.pending = combined;

    await this.drainWindows();
  });

  // The chain itself must never reject, otherwise later frames would be
  // skipped; the failure is still reported through the returned `task`.
  this.busy = task.catch(() => {
    this.droppedFrames++;
  });
  return task;
}
|
|
645
|
+
|
|
646
|
+
/** Flush any partial trailing samples (zero-padded to a full window) and
|
|
647
|
+
* finalize an open segment. Call at end-of-stream. */
|
|
648
|
+
flush(): Promise<void> {
  const run = this.busy.then(async () => {
    // A window failure inside `drainWindows` can leave one or more complete
    // windows behind in `pending`. Process those first so they are not
    // thrown away by the zero-padding step below, which only keeps a single
    // window's worth of samples.
    await this.drainWindows();

    // Whatever is left is strictly shorter than one window: zero-pad it to
    // full size and run it through the model as the final window.
    if (this.pending.length > 0) {
      const padded = new Float32Array(this.silero.windowSamples);
      padded.set(this.pending.subarray(0, this.silero.windowSamples));
      this.pending = new Float32Array(0);
      await this.processWindow(padded);
    }

    // End-of-stream: close a segment that is still open (speaking or paused).
    if (this.phase !== "idle") {
      this.endSegment(this.clockMs);
    }
  });

  // Keep the serialized chain usable after a failure; the rejection is still
  // visible to the caller through the returned promise.
  this.busy = run.catch(() => {
    this.droppedFrames++;
  });
  return run;
}
|
|
665
|
+
|
|
666
|
+
/**
 * Return the detector to its initial state: empty sample buffer, zeroed
 * clock, "idle" phase, cleared RMS tracking, and a reset model and energy
 * gate. Registered listeners are left untouched.
 */
reset(): void {
  // Segment / phase bookkeeping.
  this.phase = "idle";
  this.peakRmsInSegment = 0;
  this.recentRms.splice(0, this.recentRms.length);

  // Buffered audio and the running clock.
  this.pending = new Float32Array(0);
  this.clockMs = 0;

  // Collaborators, in the same order as before.
  this.silero.reset();
  this.energyGate.reset();
}
|
|
675
|
+
|
|
676
|
+
/**
 * Consume `pending` one model window at a time, awaiting `processWindow`
 * for each, until fewer than `silero.windowSamples` samples remain.
 */
private async drainWindows(): Promise<void> {
  const windowSize = this.silero.windowSamples;
  while (this.pending.length >= windowSize) {
    const buffered = this.pending;
    // Take a private copy of the head so it cannot change during the await.
    const chunk = buffered.slice(0, windowSize);
    // Advance the buffer before awaiting, so the consumed samples are gone
    // even if processing this window fails.
    this.pending = buffered.subarray(windowSize);
    await this.processWindow(chunk);
  }
}
|
|
686
|
+
|
|
687
|
+
/**
 * Run one full window through the model, advance the clock by one window,
 * and step the idle → speaking → paused state machine, emitting the
 * matching VAD events.
 *
 * @param window Exactly one model window of samples.
 */
private async processWindow(window: Float32Array): Promise<void> {
  const speechProb = await this.silero.process(window);
  const energy = rms(window);

  // V4 — rolling RMS history feeding the adaptive pause hangover; capped at
  // RECENT_RMS_HISTORY entries, oldest dropped first.
  this.recentRms.push(energy);
  if (this.recentRms.length > VadDetector.RECENT_RMS_HISTORY) {
    this.recentRms.shift();
  }

  // `now` is the time at the *end* of the window just processed.
  this.clockMs += this.windowDurationMs;
  const now = this.clockMs;

  const onsetReached = speechProb >= this.onsetThreshold;
  const offsetReached = speechProb >= this.offsetThreshold;
  const phase = this.phase;

  if (phase === "idle") {
    if (!onsetReached) {
      return;
    }
    // Segment opens at the start of this window.
    this.phase = "speaking";
    this.speechStartMs = now - this.windowDurationMs;
    this.lastSpeechMs = now;
    this.lastHeartbeatMs = now;
    this.peakRmsInSegment = energy;
    this.emit({
      type: "speech-start",
      timestampMs: this.speechStartMs,
      probability: speechProb,
    });
    return;
  }

  if (phase === "speaking") {
    this.peakRmsInSegment = Math.max(this.peakRmsInSegment, energy);
    // Staying at or above the (lower) offset threshold keeps speech alive.
    if (offsetReached) {
      this.lastSpeechMs = now;
    }
    const quietMs = now - this.lastSpeechMs;
    if (quietMs >= this.effectivePauseHangoverMs()) {
      // Quiet for long enough: enter the paused phase, dated from the last
      // window that still counted as speech.
      this.phase = "paused";
      this.pauseStartedMs = this.lastSpeechMs;
      this.emit({
        type: "speech-pause",
        timestampMs: now,
        pauseDurationMs: quietMs,
      });
      return;
    }
    if (now - this.lastHeartbeatMs >= this.activeHeartbeatMs) {
      // Periodic heartbeat while speech continues.
      this.lastHeartbeatMs = now;
      this.emit({
        type: "speech-active",
        timestampMs: now,
        probability: speechProb,
        speechDurationMs: now - this.speechStartMs,
      });
    }
    return;
  }

  if (phase === "paused") {
    this.peakRmsInSegment = Math.max(this.peakRmsInSegment, energy);
    if (onsetReached) {
      // The speaker picked up again before the utterance was closed.
      this.phase = "speaking";
      this.lastSpeechMs = now;
      this.lastHeartbeatMs = now;
      this.emit({
        type: "speech-active",
        timestampMs: now,
        probability: speechProb,
        speechDurationMs: now - this.speechStartMs,
      });
      return;
    }
    const pauseMs = now - this.pauseStartedMs;
    if (pauseMs >= this.endHangoverMs) {
      // Pause outlasted the end hangover: the segment is over.
      this.endSegment(now);
      return;
    }
    // Still waiting: report the growing pause.
    this.emit({
      type: "speech-pause",
      timestampMs: now,
      pauseDurationMs: pauseMs,
    });
  }
}
|
|
772
|
+
|
|
773
|
+
/**
 * Close the current segment: go back to "idle", clear the per-segment peak,
 * reset the model, and emit either a "blip" (speech shorter than
 * `minSpeechMs`) or a "speech-end" event.
 *
 * @param now Timestamp, in ms, stamped on the emitted event.
 */
private endSegment(now: number): void {
  // Capture segment figures before the state below is wiped.
  const spokenMs = this.lastSpeechMs - this.speechStartMs;
  const segmentPeakRms = this.peakRmsInSegment;

  this.phase = "idle";
  this.peakRmsInSegment = 0;
  this.silero.reset();

  const closing: VadEvent =
    spokenMs < this.minSpeechMs
      ? {
          type: "blip",
          timestampMs: now,
          // Never report a negative duration.
          durationMs: Math.max(0, spokenMs),
          peakRms: segmentPeakRms,
        }
      : { type: "speech-end", timestampMs: now, speechDurationMs: spokenMs };
  this.emit(closing);
}
|
|
790
|
+
|
|
791
|
+
/**
 * Deliver an event to every registered VAD listener, synchronously and in
 * registration order. A listener that throws stops delivery and the error
 * propagates to the caller.
 */
private emit(event: VadEvent): void {
  this.vadListeners.forEach((listener) => {
    listener(event);
  });
}
|
|
794
|
+
}
|
|
795
|
+
|
|
796
|
+
/**
|
|
797
|
+
* Back-compat wrapper for callers that still use the legacy
|
|
798
|
+
* `createSileroVadDetector` name. It now goes through the full provider
|
|
799
|
+
* resolver — same as `createVadDetector`.
|
|
800
|
+
*/
|
|
801
|
+
export async function createSileroVadDetector(
  opts: CreateVadDetectorOptions = {},
): Promise<VadDetector> {
  // Legacy alias: all the work is done by `createVadDetector`.
  const detector = await createVadDetector(opts);
  return detector;
}
|
|
806
|
+
|
|
807
|
+
/**
|
|
808
|
+
* Convenience: resolve the best available model VAD provider and wrap it in a
|
|
809
|
+
* `VadDetector`.
|
|
810
|
+
*/
|
|
811
|
+
export async function createVadDetector(
  opts: CreateVadDetectorOptions = {},
): Promise<VadDetector> {
  // Resolve a provider first, then hand its model plus the caller's config
  // to the detector.
  const resolved = await resolveVadProvider(opts);
  return new VadDetector(resolved.vad, opts.config);
}
|
|
817
|
+
|
|
818
|
+
/**
|
|
819
|
+
* Read `ELIZA_PAUSE_HANGOVER_MS` from the environment. Returns a positive
|
|
820
|
+
* integer when the variable is set and valid, otherwise `undefined`.
|
|
821
|
+
*/
|
|
822
|
+
function readPauseHangoverMsEnv(): number | undefined {
  const raw = process.env.ELIZA_PAUSE_HANGOVER_MS?.trim();
  // Unset, empty, or whitespace-only: no override.
  if (!raw) return undefined;

  // The whole value must be a plain decimal integer. `Number.parseInt` on
  // its own stops at the first non-digit, so "100abc", "1.5" and "1e3"
  // would be silently accepted as 100, 1 and 1 instead of being rejected.
  if (!/^\+?\d+$/.test(raw)) return undefined;

  const value = Number.parseInt(raw, 10);
  // Reject zero and values too large to be represented exactly.
  return Number.isSafeInteger(value) && value > 0 ? value : undefined;
}
|