@elizaos/plugin-local-inference 2.0.0-beta.1 → 2.0.11-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +83 -0
- package/package.json +81 -15
- package/src/actions/generate-media.d.ts +59 -0
- package/src/actions/generate-media.d.ts.map +1 -0
- package/src/actions/generate-media.ts +647 -0
- package/src/actions/identify-speaker.d.ts +23 -0
- package/src/actions/identify-speaker.d.ts.map +1 -0
- package/src/actions/identify-speaker.ts +171 -0
- package/src/adapters/capacitor-llama/__tests__/compat-behavior.test.ts +218 -0
- package/src/adapters/capacitor-llama/__tests__/index.test.ts +68 -0
- package/src/adapters/capacitor-llama/__tests__/structured-output.test.ts +215 -0
- package/src/adapters/capacitor-llama/__tests__/text-streaming.test.ts +174 -0
- package/src/adapters/capacitor-llama/environment.ts +71 -0
- package/src/adapters/capacitor-llama/index.browser.ts +83 -0
- package/src/adapters/capacitor-llama/index.ts +807 -0
- package/src/adapters/capacitor-llama/loader.ts +109 -0
- package/src/adapters/capacitor-llama/structured-output.ts +165 -0
- package/src/adapters/capacitor-llama/text-streaming.ts +227 -0
- package/src/adapters/capacitor-llama/types.ts +374 -0
- package/src/backends/apple-foundation.ts +127 -0
- package/src/index.d.ts +7 -0
- package/src/index.d.ts.map +1 -0
- package/src/index.ts +54 -0
- package/src/local-inference-routes.d.ts +38 -0
- package/src/local-inference-routes.d.ts.map +1 -0
- package/src/local-inference-routes.test.ts +344 -0
- package/src/local-inference-routes.ts +1543 -0
- package/src/provider.d.ts +21 -0
- package/src/provider.d.ts.map +1 -0
- package/src/provider.ts +1171 -0
- package/src/routes/compat-helpers.d.ts +18 -0
- package/src/routes/compat-helpers.d.ts.map +1 -0
- package/src/routes/compat-helpers.ts +274 -0
- package/src/routes/family-member-route.d.ts +62 -0
- package/src/routes/family-member-route.d.ts.map +1 -0
- package/src/routes/family-member-route.ts +353 -0
- package/src/routes/index.d.ts +19 -0
- package/src/routes/index.d.ts.map +1 -0
- package/src/routes/index.ts +60 -0
- package/src/routes/live-diarization-route.d.ts +26 -0
- package/src/routes/live-diarization-route.d.ts.map +1 -0
- package/src/routes/live-diarization-route.test.ts +213 -0
- package/src/routes/live-diarization-route.ts +122 -0
- package/src/routes/local-inference-asr-route.d.ts +4 -0
- package/src/routes/local-inference-asr-route.d.ts.map +1 -0
- package/src/routes/local-inference-asr-route.test.ts +190 -0
- package/src/routes/local-inference-asr-route.ts +213 -0
- package/src/routes/local-inference-compat-routes.d.ts +16 -0
- package/src/routes/local-inference-compat-routes.d.ts.map +1 -0
- package/src/routes/local-inference-compat-routes.test.ts +423 -0
- package/src/routes/local-inference-compat-routes.ts +782 -0
- package/src/routes/local-inference-tts-route.d.ts +7 -0
- package/src/routes/local-inference-tts-route.d.ts.map +1 -0
- package/src/routes/local-inference-tts-route.test.ts +179 -0
- package/src/routes/local-inference-tts-route.ts +230 -0
- package/src/routes/voice-first-run-routes.d.ts +62 -0
- package/src/routes/voice-first-run-routes.d.ts.map +1 -0
- package/src/routes/voice-first-run-routes.ts +524 -0
- package/src/routes/voice-models-routes.d.ts +62 -0
- package/src/routes/voice-models-routes.d.ts.map +1 -0
- package/src/routes/voice-models-routes.ts +554 -0
- package/src/routes/voice-profile-plugin-routes.d.ts +19 -0
- package/src/routes/voice-profile-plugin-routes.d.ts.map +1 -0
- package/src/routes/voice-profile-plugin-routes.ts +138 -0
- package/src/routes/voice-profiles-management-routes.d.ts +52 -0
- package/src/routes/voice-profiles-management-routes.d.ts.map +1 -0
- package/src/routes/voice-profiles-management-routes.ts +476 -0
- package/src/routes/voice-speaker-profile-routes.d.ts +57 -0
- package/src/routes/voice-speaker-profile-routes.d.ts.map +1 -0
- package/src/routes/voice-speaker-profile-routes.ts +199 -0
- package/src/runtime/aosp-llama-loader-selection.test.ts +80 -0
- package/src/runtime/capacitor-llama.d.ts +25 -0
- package/src/runtime/embedding-manager-support.d.ts +77 -0
- package/src/runtime/embedding-manager-support.d.ts.map +1 -0
- package/src/runtime/embedding-manager-support.ts +497 -0
- package/src/runtime/embedding-presets.d.ts +16 -0
- package/src/runtime/embedding-presets.d.ts.map +1 -0
- package/src/runtime/embedding-presets.ts +81 -0
- package/src/runtime/embedding-warmup-policy.d.ts +14 -0
- package/src/runtime/embedding-warmup-policy.d.ts.map +1 -0
- package/src/runtime/embedding-warmup-policy.test.ts +53 -0
- package/src/runtime/embedding-warmup-policy.ts +48 -0
- package/src/runtime/ensure-local-inference-handler.d.ts +53 -0
- package/src/runtime/ensure-local-inference-handler.d.ts.map +1 -0
- package/src/runtime/ensure-local-inference-handler.test.ts +528 -0
- package/src/runtime/ensure-local-inference-handler.ts +1398 -0
- package/src/runtime/index.d.ts +14 -0
- package/src/runtime/index.d.ts.map +1 -0
- package/src/runtime/index.ts +27 -0
- package/src/runtime/mobile-local-inference-gate.d.ts +31 -0
- package/src/runtime/mobile-local-inference-gate.d.ts.map +1 -0
- package/src/runtime/mobile-local-inference-gate.test.ts +69 -0
- package/src/runtime/mobile-local-inference-gate.ts +44 -0
- package/src/runtime/voice-entity-binding.d.ts +103 -0
- package/src/runtime/voice-entity-binding.d.ts.map +1 -0
- package/src/runtime/voice-entity-binding.transcript.test.ts +69 -0
- package/src/runtime/voice-entity-binding.ts +328 -0
- package/src/services/README.md +71 -0
- package/src/services/__tests__/backend-selector.test.ts +101 -0
- package/src/services/__tests__/checkpoint-manager.test.ts +376 -0
- package/src/services/__tests__/gpu-autotune.test.ts +400 -0
- package/src/services/__tests__/llm-streaming-binding.test.ts +85 -0
- package/src/services/__tests__/planner-grammar.test.ts +372 -0
- package/src/services/__tests__/runtime-target.test.ts +176 -0
- package/src/services/active-model-switch-rollback.test.ts +183 -0
- package/src/services/active-model.d.ts +282 -0
- package/src/services/active-model.d.ts.map +1 -0
- package/src/services/active-model.ts +1213 -0
- package/src/services/asr/errors.d.ts +21 -0
- package/src/services/asr/errors.d.ts.map +1 -0
- package/src/services/asr/errors.ts +50 -0
- package/src/services/asr/hash.d.ts +28 -0
- package/src/services/asr/hash.d.ts.map +1 -0
- package/src/services/asr/hash.ts +49 -0
- package/src/services/asr/index.d.ts +76 -0
- package/src/services/asr/index.d.ts.map +1 -0
- package/src/services/asr/index.ts +178 -0
- package/src/services/asr/types.d.ts +91 -0
- package/src/services/asr/types.d.ts.map +1 -0
- package/src/services/asr/types.ts +95 -0
- package/src/services/assignments.d.ts +71 -0
- package/src/services/assignments.d.ts.map +1 -0
- package/src/services/assignments.test.ts +80 -0
- package/src/services/assignments.ts +230 -0
- package/src/services/backend-selector.ts +95 -0
- package/src/services/backend.d.ts +346 -0
- package/src/services/backend.d.ts.map +1 -0
- package/src/services/backend.ts +612 -0
- package/src/services/bundled-models.d.ts +34 -0
- package/src/services/bundled-models.d.ts.map +1 -0
- package/src/services/bundled-models.ts +129 -0
- package/src/services/cache-bridge.d.ts +206 -0
- package/src/services/cache-bridge.d.ts.map +1 -0
- package/src/services/cache-bridge.test.ts +516 -0
- package/src/services/cache-bridge.ts +423 -0
- package/src/services/catalog.d.ts +10 -0
- package/src/services/catalog.d.ts.map +1 -0
- package/src/services/catalog.test.ts +240 -0
- package/src/services/catalog.ts +27 -0
- package/src/services/checkpoint-client.d.ts +109 -0
- package/src/services/checkpoint-client.d.ts.map +1 -0
- package/src/services/checkpoint-client.ts +258 -0
- package/src/services/checkpoint-manager.ts +474 -0
- package/src/services/cloud-fallback.d.ts +102 -0
- package/src/services/cloud-fallback.d.ts.map +1 -0
- package/src/services/cloud-fallback.ts +230 -0
- package/src/services/conversation-registry.d.ts +142 -0
- package/src/services/conversation-registry.d.ts.map +1 -0
- package/src/services/conversation-registry.test.ts +235 -0
- package/src/services/conversation-registry.ts +264 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts +92 -0
- package/src/services/desktop-fused-ffi-backend-runtime.d.ts.map +1 -0
- package/src/services/desktop-fused-ffi-backend-runtime.ts +333 -0
- package/src/services/device-bridge.d.ts +188 -0
- package/src/services/device-bridge.d.ts.map +1 -0
- package/src/services/device-bridge.ts +1237 -0
- package/src/services/device-resource-metrics.d.ts +149 -0
- package/src/services/device-resource-metrics.d.ts.map +1 -0
- package/src/services/device-resource-metrics.test.ts +98 -0
- package/src/services/device-resource-metrics.ts +346 -0
- package/src/services/device-tier.d.ts +115 -0
- package/src/services/device-tier.d.ts.map +1 -0
- package/src/services/device-tier.test.ts +371 -0
- package/src/services/device-tier.ts +410 -0
- package/src/services/downloader.d.ts +82 -0
- package/src/services/downloader.d.ts.map +1 -0
- package/src/services/downloader.test.ts +724 -0
- package/src/services/downloader.ts +899 -0
- package/src/services/engine-direct-bundle.test.ts +58 -0
- package/src/services/engine-streaming.test.ts +80 -0
- package/src/services/engine.d.ts +534 -0
- package/src/services/engine.d.ts.map +1 -0
- package/src/services/engine.ts +1891 -0
- package/src/services/ensure-local-artifacts.integration.test.ts +273 -0
- package/src/services/ensure-local-artifacts.test.ts +368 -0
- package/src/services/ensure-local-artifacts.ts +351 -0
- package/src/services/external-scanner.d.ts +17 -0
- package/src/services/external-scanner.d.ts.map +1 -0
- package/src/services/external-scanner.ts +312 -0
- package/src/services/ffi-llm-mock.ts +354 -0
- package/src/services/ffi-llm-streaming-abi.ts +442 -0
- package/src/services/ffi-streaming-backend.d.ts +180 -0
- package/src/services/ffi-streaming-backend.d.ts.map +1 -0
- package/src/services/ffi-streaming-backend.ts +382 -0
- package/src/services/ffi-streaming-runner.d.ts +122 -0
- package/src/services/ffi-streaming-runner.d.ts.map +1 -0
- package/src/services/ffi-streaming-runner.test.ts +60 -0
- package/src/services/ffi-streaming-runner.ts +354 -0
- package/src/services/ffi-unload-ordering.test.ts +162 -0
- package/src/services/gpu-autotune.ts +534 -0
- package/src/services/gpu-detect.ts +139 -0
- package/src/services/handler-registry.d.ts +72 -0
- package/src/services/handler-registry.d.ts.map +1 -0
- package/src/services/handler-registry.ts +240 -0
- package/src/services/hardware.d.ts +63 -0
- package/src/services/hardware.d.ts.map +1 -0
- package/src/services/hardware.test.ts +183 -0
- package/src/services/hardware.ts +404 -0
- package/src/services/hf-search.d.ts +26 -0
- package/src/services/hf-search.d.ts.map +1 -0
- package/src/services/hf-search.test.ts +69 -0
- package/src/services/hf-search.ts +420 -0
- package/src/services/image-description-runtime.d.ts +14 -0
- package/src/services/image-description-runtime.d.ts.map +1 -0
- package/src/services/image-description-runtime.test.ts +61 -0
- package/src/services/image-description-runtime.ts +118 -0
- package/src/services/imagegen/aosp-unavailable.d.ts +134 -0
- package/src/services/imagegen/aosp-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/aosp-unavailable.ts +229 -0
- package/src/services/imagegen/backend-selector.d.ts +118 -0
- package/src/services/imagegen/backend-selector.d.ts.map +1 -0
- package/src/services/imagegen/backend-selector.ts +281 -0
- package/src/services/imagegen/coreml-unavailable.d.ts +105 -0
- package/src/services/imagegen/coreml-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/coreml-unavailable.ts +237 -0
- package/src/services/imagegen/errors.d.ts +16 -0
- package/src/services/imagegen/errors.d.ts.map +1 -0
- package/src/services/imagegen/errors.ts +40 -0
- package/src/services/imagegen/index.d.ts +58 -0
- package/src/services/imagegen/index.d.ts.map +1 -0
- package/src/services/imagegen/index.ts +144 -0
- package/src/services/imagegen/mflux.d.ts +74 -0
- package/src/services/imagegen/mflux.d.ts.map +1 -0
- package/src/services/imagegen/mflux.ts +313 -0
- package/src/services/imagegen/sd-cpp.d.ts +180 -0
- package/src/services/imagegen/sd-cpp.d.ts.map +1 -0
- package/src/services/imagegen/sd-cpp.ts +718 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts +83 -0
- package/src/services/imagegen/tensorrt-unavailable.d.ts.map +1 -0
- package/src/services/imagegen/tensorrt-unavailable.ts +295 -0
- package/src/services/imagegen/types.d.ts +181 -0
- package/src/services/imagegen/types.d.ts.map +1 -0
- package/src/services/imagegen/types.ts +193 -0
- package/src/services/index.d.ts +30 -0
- package/src/services/index.d.ts.map +1 -0
- package/src/services/index.ts +225 -0
- package/src/services/inference-capabilities.d.ts +132 -0
- package/src/services/inference-capabilities.d.ts.map +1 -0
- package/src/services/inference-capabilities.test.ts +75 -0
- package/src/services/inference-capabilities.ts +204 -0
- package/src/services/inference-telemetry.d.ts +59 -0
- package/src/services/inference-telemetry.d.ts.map +1 -0
- package/src/services/inference-telemetry.ts +143 -0
- package/src/services/ios-llama-streaming.ts +248 -0
- package/src/services/kv-spill.d.ts +189 -0
- package/src/services/kv-spill.d.ts.map +1 -0
- package/src/services/kv-spill.test.ts +222 -0
- package/src/services/kv-spill.ts +356 -0
- package/src/services/latency-trace.d.ts +346 -0
- package/src/services/latency-trace.d.ts.map +1 -0
- package/src/services/latency-trace.test.ts +266 -0
- package/src/services/latency-trace.ts +844 -0
- package/src/services/llama-server-metrics.ts +304 -0
- package/src/services/llm-streaming-binding.d.ts +96 -0
- package/src/services/llm-streaming-binding.d.ts.map +1 -0
- package/src/services/llm-streaming-binding.ts +136 -0
- package/src/services/load-args.d.ts +82 -0
- package/src/services/load-args.d.ts.map +1 -0
- package/src/services/load-args.ts +81 -0
- package/src/services/manifest/eliza-1.manifest.v1.json +708 -0
- package/src/services/manifest/index.d.ts +4 -0
- package/src/services/manifest/index.d.ts.map +1 -0
- package/src/services/manifest/index.ts +66 -0
- package/src/services/manifest/manifest.test.ts +693 -0
- package/src/services/manifest/schema.d.ts +715 -0
- package/src/services/manifest/schema.d.ts.map +1 -0
- package/src/services/manifest/schema.ts +655 -0
- package/src/services/manifest/types.d.ts +30 -0
- package/src/services/manifest/types.d.ts.map +1 -0
- package/src/services/manifest/types.ts +55 -0
- package/src/services/manifest/validator.d.ts +66 -0
- package/src/services/manifest/validator.d.ts.map +1 -0
- package/src/services/manifest/validator.ts +569 -0
- package/src/services/memory-arbiter.d.ts +343 -0
- package/src/services/memory-arbiter.d.ts.map +1 -0
- package/src/services/memory-arbiter.test.ts +419 -0
- package/src/services/memory-arbiter.ts +1000 -0
- package/src/services/memory-monitor.d.ts +119 -0
- package/src/services/memory-monitor.d.ts.map +1 -0
- package/src/services/memory-monitor.test.ts +208 -0
- package/src/services/memory-monitor.ts +296 -0
- package/src/services/memory-pressure.d.ts +127 -0
- package/src/services/memory-pressure.d.ts.map +1 -0
- package/src/services/memory-pressure.ts +413 -0
- package/src/services/mtp-doctor.d.ts +13 -0
- package/src/services/mtp-doctor.d.ts.map +1 -0
- package/src/services/mtp-doctor.ts +78 -0
- package/src/services/network-policy.d.ts +127 -0
- package/src/services/network-policy.d.ts.map +1 -0
- package/src/services/network-policy.ts +346 -0
- package/src/services/paths.d.ts +6 -0
- package/src/services/paths.d.ts.map +1 -0
- package/src/services/paths.ts +25 -0
- package/src/services/planner-skeleton.d.ts +124 -0
- package/src/services/planner-skeleton.d.ts.map +1 -0
- package/src/services/planner-skeleton.ts +175 -0
- package/src/services/providers.d.ts +38 -0
- package/src/services/providers.d.ts.map +1 -0
- package/src/services/providers.ts +507 -0
- package/src/services/ram-budget-cache.test.ts +163 -0
- package/src/services/ram-budget.d.ts +110 -0
- package/src/services/ram-budget.d.ts.map +1 -0
- package/src/services/ram-budget.ts +0 -0
- package/src/services/readiness.d.ts +9 -0
- package/src/services/readiness.d.ts.map +1 -0
- package/src/services/readiness.test.ts +87 -0
- package/src/services/readiness.ts +238 -0
- package/src/services/recommendation.d.ts +111 -0
- package/src/services/recommendation.d.ts.map +1 -0
- package/src/services/recommendation.ts +672 -0
- package/src/services/registry.d.ts +35 -0
- package/src/services/registry.d.ts.map +1 -0
- package/src/services/registry.ts +151 -0
- package/src/services/router-handler.d.ts +92 -0
- package/src/services/router-handler.d.ts.map +1 -0
- package/src/services/router-handler.test.ts +45 -0
- package/src/services/router-handler.ts +376 -0
- package/src/services/routing-policy.d.ts +55 -0
- package/src/services/routing-policy.d.ts.map +1 -0
- package/src/services/routing-policy.ts +228 -0
- package/src/services/routing-preferences.d.ts +8 -0
- package/src/services/routing-preferences.d.ts.map +1 -0
- package/src/services/routing-preferences.ts +15 -0
- package/src/services/runtime-target.d.ts +98 -0
- package/src/services/runtime-target.d.ts.map +1 -0
- package/src/services/runtime-target.ts +154 -0
- package/src/services/service.d.ts +128 -0
- package/src/services/service.d.ts.map +1 -0
- package/src/services/service.test.ts +223 -0
- package/src/services/service.ts +735 -0
- package/src/services/session-pool.d.ts +72 -0
- package/src/services/session-pool.d.ts.map +1 -0
- package/src/services/session-pool.ts +153 -0
- package/src/services/structured-output/deterministic-repair.d.ts +23 -0
- package/src/services/structured-output/deterministic-repair.d.ts.map +1 -0
- package/src/services/structured-output/deterministic-repair.test.ts +169 -0
- package/src/services/structured-output/deterministic-repair.ts +443 -0
- package/src/services/structured-output/index.ts +4 -0
- package/src/services/structured-output.d.ts +311 -0
- package/src/services/structured-output.d.ts.map +1 -0
- package/src/services/structured-output.test.ts +483 -0
- package/src/services/structured-output.ts +712 -0
- package/src/services/transcription-priority.test.ts +211 -0
- package/src/services/tts/errors.ts +46 -0
- package/src/services/tts/index.ts +214 -0
- package/src/services/tts/tts-audio-cache.ts +235 -0
- package/src/services/tts/types.ts +157 -0
- package/src/services/types.d.ts +19 -0
- package/src/services/types.d.ts.map +1 -0
- package/src/services/types.ts +55 -0
- package/src/services/verify-on-device.d.ts +34 -0
- package/src/services/verify-on-device.d.ts.map +1 -0
- package/src/services/verify-on-device.test.ts +87 -0
- package/src/services/verify-on-device.ts +127 -0
- package/src/services/verify.d.ts +8 -0
- package/src/services/verify.d.ts.map +1 -0
- package/src/services/verify.ts +13 -0
- package/src/services/vision/aosp-unavailable.d.ts +115 -0
- package/src/services/vision/aosp-unavailable.d.ts.map +1 -0
- package/src/services/vision/aosp-unavailable.ts +163 -0
- package/src/services/vision/capacitor-llama.d.ts +99 -0
- package/src/services/vision/capacitor-llama.d.ts.map +1 -0
- package/src/services/vision/capacitor-llama.ts +255 -0
- package/src/services/vision/cloud-fallback.d.ts +47 -0
- package/src/services/vision/cloud-fallback.d.ts.map +1 -0
- package/src/services/vision/cloud-fallback.test.ts +243 -0
- package/src/services/vision/cloud-fallback.ts +268 -0
- package/src/services/vision/fallback-chain.test.ts +86 -0
- package/src/services/vision/hash.d.ts +71 -0
- package/src/services/vision/hash.d.ts.map +1 -0
- package/src/services/vision/hash.ts +157 -0
- package/src/services/vision/index.d.ts +95 -0
- package/src/services/vision/index.d.ts.map +1 -0
- package/src/services/vision/index.ts +251 -0
- package/src/services/vision/llama-server.d.ts +73 -0
- package/src/services/vision/llama-server.d.ts.map +1 -0
- package/src/services/vision/llama-server.ts +177 -0
- package/src/services/vision/types.d.ts +153 -0
- package/src/services/vision/types.d.ts.map +1 -0
- package/src/services/vision/types.ts +154 -0
- package/src/services/vision/vast-fallback.d.ts +18 -0
- package/src/services/vision/vast-fallback.d.ts.map +1 -0
- package/src/services/vision/vast-fallback.ts +127 -0
- package/src/services/vision-embedding-cache.d.ts +98 -0
- package/src/services/vision-embedding-cache.d.ts.map +1 -0
- package/src/services/vision-embedding-cache.ts +189 -0
- package/src/services/voice/VOICE_WORKBENCH.md +88 -0
- package/src/services/voice/__test-helpers__/fake-ffi.ts +92 -0
- package/src/services/voice/__test-helpers__/synthetic-speech.ts +124 -0
- package/src/services/voice/__tests__/checkpoint-manager.test.ts +241 -0
- package/src/services/voice/__tests__/checkpoint-policy.test.ts +270 -0
- package/src/services/voice/__tests__/eager-context-builder.test.ts +257 -0
- package/src/services/voice/__tests__/eliza1-eot-scorer.test.ts +288 -0
- package/src/services/voice/__tests__/eot-classifier.test.ts +431 -0
- package/src/services/voice/__tests__/optimistic-rollback.test.ts +312 -0
- package/src/services/voice/__tests__/prefill-client.test.ts +266 -0
- package/src/services/voice/__tests__/prefix-preserving-queue.test.ts +208 -0
- package/src/services/voice/__tests__/streaming-asr.test.ts +450 -0
- package/src/services/voice/__tests__/streaming-transcriber.test.ts +339 -0
- package/src/services/voice/__tests__/turn-detector-resolver.test.ts +197 -0
- package/src/services/voice/__tests__/voice-state-machine-prefill.test.ts +275 -0
- package/src/services/voice/__tests__/voice-state-machine.test.ts +354 -0
- package/src/services/voice/audio-frame-consumer.d.ts +212 -0
- package/src/services/voice/audio-frame-consumer.d.ts.map +1 -0
- package/src/services/voice/audio-frame-consumer.test.ts +343 -0
- package/src/services/voice/audio-frame-consumer.ts +491 -0
- package/src/services/voice/barge-in.d.ts +112 -0
- package/src/services/voice/barge-in.d.ts.map +1 -0
- package/src/services/voice/barge-in.test.ts +244 -0
- package/src/services/voice/barge-in.ts +336 -0
- package/src/services/voice/cancellation-coordinator.d.ts +127 -0
- package/src/services/voice/cancellation-coordinator.d.ts.map +1 -0
- package/src/services/voice/cancellation-coordinator.test.ts +196 -0
- package/src/services/voice/cancellation-coordinator.ts +269 -0
- package/src/services/voice/checkpoint-manager.d.ts +199 -0
- package/src/services/voice/checkpoint-manager.d.ts.map +1 -0
- package/src/services/voice/checkpoint-manager.ts +401 -0
- package/src/services/voice/checkpoint-policy.ts +336 -0
- package/src/services/voice/composite-eot-classifier.test.ts +59 -0
- package/src/services/voice/e2e-harness.test.ts +182 -0
- package/src/services/voice/e2e-harness.ts +743 -0
- package/src/services/voice/eager-context-builder.d.ts +170 -0
- package/src/services/voice/eager-context-builder.d.ts.map +1 -0
- package/src/services/voice/eager-context-builder.ts +262 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts +124 -0
- package/src/services/voice/eliza1-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/eliza1-eot-scorer.ts +242 -0
- package/src/services/voice/embedding-server.ts +200 -0
- package/src/services/voice/embedding.d.ts +133 -0
- package/src/services/voice/embedding.d.ts.map +1 -0
- package/src/services/voice/embedding.test.ts +148 -0
- package/src/services/voice/embedding.ts +244 -0
- package/src/services/voice/emotion-attribution.d.ts +68 -0
- package/src/services/voice/emotion-attribution.d.ts.map +1 -0
- package/src/services/voice/emotion-attribution.test.ts +129 -0
- package/src/services/voice/emotion-attribution.ts +361 -0
- package/src/services/voice/engine-bridge-cancellation.test.ts +422 -0
- package/src/services/voice/engine-bridge.d.ts +746 -0
- package/src/services/voice/engine-bridge.d.ts.map +1 -0
- package/src/services/voice/engine-bridge.test.ts +384 -0
- package/src/services/voice/engine-bridge.ts +2226 -0
- package/src/services/voice/eot-classifier-ggml.d.ts +179 -0
- package/src/services/voice/eot-classifier-ggml.d.ts.map +1 -0
- package/src/services/voice/eot-classifier-ggml.ts +566 -0
- package/src/services/voice/eot-classifier.d.ts +214 -0
- package/src/services/voice/eot-classifier.d.ts.map +1 -0
- package/src/services/voice/eot-classifier.ts +533 -0
- package/src/services/voice/errors.d.ts +20 -0
- package/src/services/voice/errors.d.ts.map +1 -0
- package/src/services/voice/errors.ts +32 -0
- package/src/services/voice/expressive-tags.d.ts +158 -0
- package/src/services/voice/expressive-tags.d.ts.map +1 -0
- package/src/services/voice/expressive-tags.ts +405 -0
- package/src/services/voice/ffi-bindings.d.ts +636 -0
- package/src/services/voice/ffi-bindings.d.ts.map +1 -0
- package/src/services/voice/ffi-bindings.test.ts +671 -0
- package/src/services/voice/ffi-bindings.ts +3050 -0
- package/src/services/voice/first-line-cache.d.ts +181 -0
- package/src/services/voice/first-line-cache.d.ts.map +1 -0
- package/src/services/voice/first-line-cache.ts +725 -0
- package/src/services/voice/fused-eot-scorer.d.ts +51 -0
- package/src/services/voice/fused-eot-scorer.d.ts.map +1 -0
- package/src/services/voice/fused-eot-scorer.ts +135 -0
- package/src/services/voice/index.d.ts +91 -0
- package/src/services/voice/index.d.ts.map +1 -0
- package/src/services/voice/index.ts +481 -0
- package/src/services/voice/kokoro/__tests__/kokoro-backend.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.real.test.ts +151 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-bridge.test.ts +60 -0
- package/src/services/voice/kokoro/__tests__/kokoro-engine-discovery.test.ts +277 -0
- package/src/services/voice/kokoro/__tests__/kokoro-ffi-runtime.test.ts +235 -0
- package/src/services/voice/kokoro/__tests__/kokoro-runtime.test.ts +95 -0
- package/src/services/voice/kokoro/__tests__/phonemizer.test.ts +53 -0
- package/src/services/voice/kokoro/__tests__/runtime-selection.test.ts +231 -0
- package/src/services/voice/kokoro/__tests__/voices.test.ts +57 -0
- package/src/services/voice/kokoro/index.ts +79 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts +72 -0
- package/src/services/voice/kokoro/kokoro-backend.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-backend.ts +207 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts +58 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-engine-discovery.ts +177 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts +75 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-ffi-runtime.ts +233 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts +100 -0
- package/src/services/voice/kokoro/kokoro-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/kokoro-runtime.ts +170 -0
- package/src/services/voice/kokoro/phoneme-stream.ts +123 -0
- package/src/services/voice/kokoro/phonemizer.d.ts +50 -0
- package/src/services/voice/kokoro/phonemizer.d.ts.map +1 -0
- package/src/services/voice/kokoro/phonemizer.ts +344 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts +61 -0
- package/src/services/voice/kokoro/pick-runtime.d.ts.map +1 -0
- package/src/services/voice/kokoro/pick-runtime.test.ts +91 -0
- package/src/services/voice/kokoro/pick-runtime.ts +130 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts +92 -0
- package/src/services/voice/kokoro/runtime-selection.d.ts.map +1 -0
- package/src/services/voice/kokoro/runtime-selection.ts +237 -0
- package/src/services/voice/kokoro/types.d.ts +82 -0
- package/src/services/voice/kokoro/types.d.ts.map +1 -0
- package/src/services/voice/kokoro/types.ts +95 -0
- package/src/services/voice/kokoro/voice-presets.d.ts +23 -0
- package/src/services/voice/kokoro/voice-presets.d.ts.map +1 -0
- package/src/services/voice/kokoro/voice-presets.ts +129 -0
- package/src/services/voice/kokoro/voices.d.ts +30 -0
- package/src/services/voice/kokoro/voices.d.ts.map +1 -0
- package/src/services/voice/kokoro/voices.ts +64 -0
- package/src/services/voice/lifecycle.d.ts +135 -0
- package/src/services/voice/lifecycle.d.ts.map +1 -0
- package/src/services/voice/lifecycle.test.ts +315 -0
- package/src/services/voice/lifecycle.ts +301 -0
- package/src/services/voice/live-diarization-session.d.ts +96 -0
- package/src/services/voice/live-diarization-session.d.ts.map +1 -0
- package/src/services/voice/live-diarization-session.ts +289 -0
- package/src/services/voice/mic-source.d.ts +136 -0
- package/src/services/voice/mic-source.d.ts.map +1 -0
- package/src/services/voice/mic-source.test.ts +210 -0
- package/src/services/voice/mic-source.ts +503 -0
- package/src/services/voice/optimistic-policy.d.ts +109 -0
- package/src/services/voice/optimistic-policy.d.ts.map +1 -0
- package/src/services/voice/optimistic-policy.test.ts +101 -0
- package/src/services/voice/optimistic-policy.ts +192 -0
- package/src/services/voice/optimistic-rollback.ts +343 -0
- package/src/services/voice/partial-stabilizer.d.ts +73 -0
- package/src/services/voice/partial-stabilizer.d.ts.map +1 -0
- package/src/services/voice/partial-stabilizer.test.ts +68 -0
- package/src/services/voice/partial-stabilizer.ts +140 -0
- package/src/services/voice/phoneme-tokenizer.d.ts +49 -0
- package/src/services/voice/phoneme-tokenizer.d.ts.map +1 -0
- package/src/services/voice/phoneme-tokenizer.ts +158 -0
- package/src/services/voice/phrase-cache.d.ts +76 -0
- package/src/services/voice/phrase-cache.d.ts.map +1 -0
- package/src/services/voice/phrase-cache.test.ts +242 -0
- package/src/services/voice/phrase-cache.ts +186 -0
- package/src/services/voice/phrase-chunker.d.ts +62 -0
- package/src/services/voice/phrase-chunker.d.ts.map +1 -0
- package/src/services/voice/phrase-chunker.test.ts +239 -0
- package/src/services/voice/phrase-chunker.ts +281 -0
- package/src/services/voice/pipeline-impls.d.ts +151 -0
- package/src/services/voice/pipeline-impls.d.ts.map +1 -0
- package/src/services/voice/pipeline-impls.l6.test.ts +110 -0
- package/src/services/voice/pipeline-impls.test.ts +292 -0
- package/src/services/voice/pipeline-impls.ts +315 -0
- package/src/services/voice/pipeline.d.ts +216 -0
- package/src/services/voice/pipeline.d.ts.map +1 -0
- package/src/services/voice/pipeline.ts +505 -0
- package/src/services/voice/prefill-client.d.ts +123 -0
- package/src/services/voice/prefill-client.d.ts.map +1 -0
- package/src/services/voice/prefill-client.ts +316 -0
- package/src/services/voice/prefix-preserving-queue.d.ts +113 -0
- package/src/services/voice/prefix-preserving-queue.d.ts.map +1 -0
- package/src/services/voice/prefix-preserving-queue.ts +162 -0
- package/src/services/voice/profile-store.d.ts +248 -0
- package/src/services/voice/profile-store.d.ts.map +1 -0
- package/src/services/voice/profile-store.ts +887 -0
- package/src/services/voice/ring-buffer.d.ts +40 -0
- package/src/services/voice/ring-buffer.d.ts.map +1 -0
- package/src/services/voice/ring-buffer.ts +105 -0
- package/src/services/voice/rollback-queue.d.ts +24 -0
- package/src/services/voice/rollback-queue.d.ts.map +1 -0
- package/src/services/voice/rollback-queue.ts +74 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts +67 -0
- package/src/services/voice/samantha-preset-placeholder.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-placeholder.test.ts +97 -0
- package/src/services/voice/samantha-preset-placeholder.ts +148 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts +87 -0
- package/src/services/voice/samantha-preset-regenerator.d.ts.map +1 -0
- package/src/services/voice/samantha-preset-regenerator.ts +393 -0
- package/src/services/voice/scheduler.d.ts +146 -0
- package/src/services/voice/scheduler.d.ts.map +1 -0
- package/src/services/voice/scheduler.t2.test.ts +141 -0
- package/src/services/voice/scheduler.ts +927 -0
- package/src/services/voice/shared-resources.d.ts +190 -0
- package/src/services/voice/shared-resources.d.ts.map +1 -0
- package/src/services/voice/shared-resources.ts +320 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts +74 -0
- package/src/services/voice/speaker/attribution-pipeline.d.ts.map +1 -0
- package/src/services/voice/speaker/attribution-pipeline.ts +386 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts +59 -0
- package/src/services/voice/speaker/diarizer-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer-fused.real.test.ts +100 -0
- package/src/services/voice/speaker/diarizer-fused.ts +154 -0
- package/src/services/voice/speaker/diarizer.d.ts +75 -0
- package/src/services/voice/speaker/diarizer.d.ts.map +1 -0
- package/src/services/voice/speaker/diarizer.ts +218 -0
- package/src/services/voice/speaker/encoder-fused.d.ts +60 -0
- package/src/services/voice/speaker/encoder-fused.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-fused.real.test.ts +113 -0
- package/src/services/voice/speaker/encoder-fused.ts +138 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts +33 -0
- package/src/services/voice/speaker/encoder-ggml.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder-ggml.ts +79 -0
- package/src/services/voice/speaker/encoder.d.ts +37 -0
- package/src/services/voice/speaker/encoder.d.ts.map +1 -0
- package/src/services/voice/speaker/encoder.ts +105 -0
- package/src/services/voice/speaker-imprint.d.ts +83 -0
- package/src/services/voice/speaker-imprint.d.ts.map +1 -0
- package/src/services/voice/speaker-imprint.test.ts +185 -0
- package/src/services/voice/speaker-imprint.ts +312 -0
- package/src/services/voice/speaker-preset-cache.d.ts +77 -0
- package/src/services/voice/speaker-preset-cache.d.ts.map +1 -0
- package/src/services/voice/speaker-preset-cache.test.ts +154 -0
- package/src/services/voice/speaker-preset-cache.ts +195 -0
- package/src/services/voice/streaming-asr/streaming-pipeline-adapter.ts +292 -0
- package/src/services/voice/system-audio-sink.d.ts +73 -0
- package/src/services/voice/system-audio-sink.d.ts.map +1 -0
- package/src/services/voice/system-audio-sink.test.ts +29 -0
- package/src/services/voice/system-audio-sink.ts +366 -0
- package/src/services/voice/transcriber.d.ts +244 -0
- package/src/services/voice/transcriber.d.ts.map +1 -0
- package/src/services/voice/transcriber.test.ts +392 -0
- package/src/services/voice/transcriber.ts +704 -0
- package/src/services/voice/turn-controller.d.ts +183 -0
- package/src/services/voice/turn-controller.d.ts.map +1 -0
- package/src/services/voice/turn-controller.test.ts +575 -0
- package/src/services/voice/turn-controller.ts +596 -0
- package/src/services/voice/types.d.ts +643 -0
- package/src/services/voice/types.d.ts.map +1 -0
- package/src/services/voice/types.ts +699 -0
- package/src/services/voice/vad.d.ts +282 -0
- package/src/services/voice/vad.d.ts.map +1 -0
- package/src/services/voice/vad.test.ts +480 -0
- package/src/services/voice/vad.ts +827 -0
- package/src/services/voice/vad.v1-v4.test.ts +222 -0
- package/src/services/voice/voice-budget.d.ts +241 -0
- package/src/services/voice/voice-budget.d.ts.map +1 -0
- package/src/services/voice/voice-budget.test.ts +420 -0
- package/src/services/voice/voice-budget.ts +656 -0
- package/src/services/voice/voice-duet.test.ts +375 -0
- package/src/services/voice/voice-emotion-classifier.d.ts +95 -0
- package/src/services/voice/voice-emotion-classifier.d.ts.map +1 -0
- package/src/services/voice/voice-emotion-classifier.test.ts +210 -0
- package/src/services/voice/voice-emotion-classifier.ts +273 -0
- package/src/services/voice/voice-preset-format.d.ts +158 -0
- package/src/services/voice/voice-preset-format.d.ts.map +1 -0
- package/src/services/voice/voice-preset-format.ts +700 -0
- package/src/services/voice/voice-preset-generator.test.ts +89 -0
- package/src/services/voice/voice-profile-artifact.d.ts +116 -0
- package/src/services/voice/voice-profile-artifact.d.ts.map +1 -0
- package/src/services/voice/voice-profile-artifact.test.ts +138 -0
- package/src/services/voice/voice-profile-artifact.ts +518 -0
- package/src/services/voice/voice-profile-routes.d.ts +83 -0
- package/src/services/voice/voice-profile-routes.d.ts.map +1 -0
- package/src/services/voice/voice-profile-routes.test.ts +429 -0
- package/src/services/voice/voice-profile-routes.ts +425 -0
- package/src/services/voice/voice-scenario.ts +154 -0
- package/src/services/voice/voice-settings.d.ts +82 -0
- package/src/services/voice/voice-settings.d.ts.map +1 -0
- package/src/services/voice/voice-settings.ts +172 -0
- package/src/services/voice/voice-state-machine.d.ts +364 -0
- package/src/services/voice/voice-state-machine.d.ts.map +1 -0
- package/src/services/voice/voice-state-machine.ts +727 -0
- package/src/services/voice/voice-workbench-report.test.ts +168 -0
- package/src/services/voice/voice-workbench-report.ts +326 -0
- package/src/services/voice/voice-workbench.test.ts +158 -0
- package/src/services/voice/voice.test.ts +1070 -0
- package/src/services/voice/wake-word-ggml.d.ts +101 -0
- package/src/services/voice/wake-word-ggml.d.ts.map +1 -0
- package/src/services/voice/wake-word-ggml.ts +320 -0
- package/src/services/voice/wake-word.d.ts +255 -0
- package/src/services/voice/wake-word.d.ts.map +1 -0
- package/src/services/voice/wake-word.test.ts +298 -0
- package/src/services/voice/wake-word.ts +554 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts +70 -0
- package/src/services/voice/wrap-with-first-line-cache.d.ts.map +1 -0
- package/src/services/voice/wrap-with-first-line-cache.ts +267 -0
- package/src/services/voice-model-updater.d.ts +240 -0
- package/src/services/voice-model-updater.d.ts.map +1 -0
- package/src/services/voice-model-updater.ts +724 -0
- package/src/services/voice-prewarm.d.ts +3 -0
- package/src/services/voice-prewarm.d.ts.map +1 -0
- package/src/services/voice-prewarm.ts +51 -0
- package/dist/index.d.ts +0 -37
- package/dist/index.js +0 -1098
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pyannote-segmentation-3.0 shared types and pure segmentation logic.
|
|
3
|
+
*
|
|
4
|
+
* Diarization runs EXCLUSIVELY through the fused `libelizainference`
|
|
5
|
+
* `eliza_inference_diariz_*` ABI (`FusedDiarizer` in `diarizer-fused.ts`).
|
|
6
|
+
* The standalone `libvoice_classifier` binding has been removed — there is one
|
|
7
|
+
* on-device voice runtime.
|
|
8
|
+
*
|
|
9
|
+
* This file holds the shared types (`Diarizer`, `LocalSpeakerSegment`,
|
|
10
|
+
* `DiarizerOutput`), the model-id / window constants, the structured
|
|
11
|
+
* `DiarizerUnavailableError`, and the pure `classifyFramesToSegments` reducer
|
|
12
|
+
* the fused diarizer feeds its per-frame labels through.
|
|
13
|
+
*/
|
|
14
|
+
export declare const PYANNOTE_SEGMENTATION_3_INT8_MODEL_ID: "pyannote-segmentation-3.0-int8";
|
|
15
|
+
export declare const PYANNOTE_SEGMENTATION_3_FP32_MODEL_ID: "pyannote-segmentation-3.0-fp32";
|
|
16
|
+
export type PyannoteDiarizerModelId = typeof PYANNOTE_SEGMENTATION_3_INT8_MODEL_ID | typeof PYANNOTE_SEGMENTATION_3_FP32_MODEL_ID;
|
|
17
|
+
/** pyannote 3.0 segmentation window length (seconds) — model-fixed. */
|
|
18
|
+
export declare const PYANNOTE_WINDOW_SECONDS = 5;
|
|
19
|
+
/** Required mono sample rate (matches upstream training config). */
|
|
20
|
+
export declare const PYANNOTE_SAMPLE_RATE = 16000;
|
|
21
|
+
/** Number of output frames per 5 s window (= 293 in the upstream export). */
|
|
22
|
+
export declare const PYANNOTE_FRAMES_PER_WINDOW = 293;
|
|
23
|
+
/** Per-frame stride in milliseconds (5_000ms / 293 frames ≈ 17.06 ms). */
|
|
24
|
+
export declare const PYANNOTE_FRAME_STRIDE_MS: number;
|
|
25
|
+
/** Output class count — 3 single + 3 overlap + 1 silence = 7. */
|
|
26
|
+
export declare const PYANNOTE_CLASS_COUNT = 7;
|
|
27
|
+
/**
|
|
28
|
+
* Powerset mapping of pyannote-3 segmentation classes. Each class is
|
|
29
|
+
* the set of local speaker indices active in that frame. Class 0 is the
|
|
30
|
+
* silence/no-speaker frame. This matches the upstream `Powerset` head
|
|
31
|
+
* with `max_speakers_per_chunk=3, max_speakers_per_frame=2`.
|
|
32
|
+
*/
|
|
33
|
+
export declare const PYANNOTE_CLASS_TO_SPEAKERS: ReadonlyArray<ReadonlyArray<number>>;
|
|
34
|
+
/** Structured diarizer failure; `code` identifies the cause (runtime, library or model unavailable, model shape mismatch, invalid input). */
|
|
35
|
+
export declare class DiarizerUnavailableError extends Error {
|
|
36
|
+
readonly code: "ort-missing" | "native-missing" | "library-missing" | "model-missing" | "model-unavailable" | "model-load-failed" | "model-shape-mismatch" | "forward-not-implemented" | "invalid-input";
|
|
37
|
+
constructor(code: DiarizerUnavailableError["code"], message: string);
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* One speaker-tagged span within a diarized window. `localSpeakerId` is
|
|
41
|
+
* **window-local** (0..2): the same physical speaker gets different
|
|
42
|
+
* local ids in different windows. The profile store re-clusters local
|
|
43
|
+
* ids into stable identities via the WeSpeaker embedding cosine.
|
|
44
|
+
*/
|
|
45
|
+
export interface LocalSpeakerSegment {
|
|
46
|
+
startMs: number;
|
|
47
|
+
endMs: number;
|
|
48
|
+
localSpeakerId: number;
|
|
49
|
+
/** Mean winning-class softmax probability over the span's frames. */
|
|
50
|
+
confidence: number;
|
|
51
|
+
/** True if the span contains any overlap-class frames. */
|
|
52
|
+
hasOverlap: boolean;
|
|
53
|
+
}
|
|
54
|
+
export interface DiarizerOutput {
|
|
55
|
+
segments: LocalSpeakerSegment[];
|
|
56
|
+
/** Number of distinct local speakers observed in the window. */
|
|
57
|
+
localSpeakerCount: number;
|
|
58
|
+
/** Total speech (any-speaker) duration in milliseconds. */
|
|
59
|
+
speechMs: number;
|
|
60
|
+
}
|
|
61
|
+
export interface Diarizer {
|
|
62
|
+
readonly modelId: PyannoteDiarizerModelId;
|
|
63
|
+
readonly sampleRate: number;
|
|
64
|
+
/** Process one ~5 s window of PCM. */
|
|
65
|
+
diarizeWindow(pcm: Float32Array): Promise<DiarizerOutput>;
|
|
66
|
+
dispose(): Promise<void>;
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Reduce a per-frame class score tensor (softmax-normalised per frame) into one segment per
|
|
70
|
+
* (local speaker × contiguous frame run). Frames where the silence
|
|
71
|
+
* class wins are excluded; frames in overlap classes contribute to
|
|
72
|
+
* **all** speakers in that class.
|
|
73
|
+
*/
|
|
74
|
+
export declare function classifyFramesToSegments(classProbs: Float32Array, frames: number, classCount: number, startMs: number, frameStrideMs: number): DiarizerOutput;
|
|
75
|
+
//# sourceMappingURL=diarizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"diarizer.d.ts","sourceRoot":"","sources":["diarizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAEH,eAAO,MAAM,qCAAqC,EACjD,gCAAyC,CAAC;AAC3C,eAAO,MAAM,qCAAqC,EACjD,gCAAyC,CAAC;AAC3C,MAAM,MAAM,uBAAuB,GAChC,OAAO,qCAAqC,GAC5C,OAAO,qCAAqC,CAAC;AAEhD,uEAAuE;AACvE,eAAO,MAAM,uBAAuB,IAAI,CAAC;AACzC,oEAAoE;AACpE,eAAO,MAAM,oBAAoB,QAAS,CAAC;AAC3C,6EAA6E;AAC7E,eAAO,MAAM,0BAA0B,MAAM,CAAC;AAC9C,0EAA0E;AAC1E,eAAO,MAAM,wBAAwB,QAC0B,CAAC;AAChE,iEAAiE;AACjE,eAAO,MAAM,oBAAoB,IAAI,CAAC;AAEtC;;;;;GAKG;AACH,eAAO,MAAM,0BAA0B,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAS1E,CAAC;AAEH,sDAAsD;AACtD,qBAAa,wBAAyB,SAAQ,KAAK;IAClD,QAAQ,CAAC,IAAI,EACV,aAAa,GACb,gBAAgB,GAChB,iBAAiB,GACjB,eAAe,GACf,mBAAmB,GACnB,mBAAmB,GACnB,sBAAsB,GACtB,yBAAyB,GACzB,eAAe,CAAC;gBACP,IAAI,EAAE,wBAAwB,CAAC,MAAM,CAAC,EAAE,OAAO,EAAE,MAAM;CAKnE;AAED;;;;;GAKG;AACH,MAAM,WAAW,mBAAmB;IACnC,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,cAAc,EAAE,MAAM,CAAC;IACvB,yDAAyD;IACzD,UAAU,EAAE,MAAM,CAAC;IACnB,0DAA0D;IAC1D,UAAU,EAAE,OAAO,CAAC;CACpB;AAED,MAAM,WAAW,cAAc;IAC9B,QAAQ,EAAE,mBAAmB,EAAE,CAAC;IAChC,gEAAgE;IAChE,iBAAiB,EAAE,MAAM,CAAC;IAC1B,2DAA2D;IAC3D,QAAQ,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,QAAQ;IACxB,QAAQ,CAAC,OAAO,EAAE,uBAAuB,CAAC;IAC1C,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;IAC5B,sCAAsC;IACtC,aAAa,CAAC,GAAG,EAAE,YAAY,GAAG,OAAO,CAAC,cAAc,CAAC,CAAC;IAC1D,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;CACzB;AAmBD;;;;;GAKG;AACH,wBAAgB,wBAAwB,CACvC,UAAU,EAAE,YAAY,EACxB,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,EAClB,OAAO,EAAE,MAAM,EACf,aAAa,EAAE,MAAM,GACnB,cAAc,CAsFhB"}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pyannote-segmentation-3.0 shared types and pure segmentation logic.
|
|
3
|
+
*
|
|
4
|
+
* Diarization runs EXCLUSIVELY through the fused `libelizainference`
|
|
5
|
+
* `eliza_inference_diariz_*` ABI (`FusedDiarizer` in `diarizer-fused.ts`).
|
|
6
|
+
* The standalone `libvoice_classifier` binding has been removed — there is one
|
|
7
|
+
* on-device voice runtime.
|
|
8
|
+
*
|
|
9
|
+
* This file holds the shared types (`Diarizer`, `LocalSpeakerSegment`,
|
|
10
|
+
* `DiarizerOutput`), the model-id / window constants, the structured
|
|
11
|
+
* `DiarizerUnavailableError`, and the pure `classifyFramesToSegments` reducer
|
|
12
|
+
* the fused diarizer feeds its per-frame labels through.
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
export const PYANNOTE_SEGMENTATION_3_INT8_MODEL_ID =
|
|
16
|
+
"pyannote-segmentation-3.0-int8" as const;
|
|
17
|
+
export const PYANNOTE_SEGMENTATION_3_FP32_MODEL_ID =
|
|
18
|
+
"pyannote-segmentation-3.0-fp32" as const;
|
|
19
|
+
export type PyannoteDiarizerModelId =
|
|
20
|
+
| typeof PYANNOTE_SEGMENTATION_3_INT8_MODEL_ID
|
|
21
|
+
| typeof PYANNOTE_SEGMENTATION_3_FP32_MODEL_ID;
|
|
22
|
+
|
|
23
|
+
/** pyannote 3.0 segmentation window length (seconds) — model-fixed. */
|
|
24
|
+
export const PYANNOTE_WINDOW_SECONDS = 5;
|
|
25
|
+
/** Required mono sample rate (matches upstream training config). */
|
|
26
|
+
export const PYANNOTE_SAMPLE_RATE = 16_000;
|
|
27
|
+
/** Number of output frames per 5 s window (= 293 in the upstream export). */
|
|
28
|
+
export const PYANNOTE_FRAMES_PER_WINDOW = 293;
|
|
29
|
+
/** Per-frame stride in milliseconds (5_000ms / 293 frames ≈ 17.06 ms). */
|
|
30
|
+
export const PYANNOTE_FRAME_STRIDE_MS =
|
|
31
|
+
(1_000 * PYANNOTE_WINDOW_SECONDS) / PYANNOTE_FRAMES_PER_WINDOW;
|
|
32
|
+
/** Output class count — 3 single + 3 overlap + 1 silence = 7. */
|
|
33
|
+
export const PYANNOTE_CLASS_COUNT = 7;
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Powerset mapping of pyannote-3 segmentation classes. Each class is
|
|
37
|
+
* the set of local speaker indices active in that frame. Class 0 is the
|
|
38
|
+
* silence/no-speaker frame. This matches the upstream `Powerset` head
|
|
39
|
+
* with `max_speakers_per_chunk=3, max_speakers_per_frame=2`.
|
|
40
|
+
*/
|
|
41
|
+
export const PYANNOTE_CLASS_TO_SPEAKERS: ReadonlyArray<ReadonlyArray<number>> =
|
|
42
|
+
[
|
|
43
|
+
[], // 0: silence
|
|
44
|
+
[0], // 1: speaker 0 only
|
|
45
|
+
[1], // 2: speaker 1 only
|
|
46
|
+
[2], // 3: speaker 2 only
|
|
47
|
+
[0, 1], // 4: speakers 0+1 overlap
|
|
48
|
+
[0, 2], // 5: speakers 0+2 overlap
|
|
49
|
+
[1, 2], // 6: speakers 1+2 overlap
|
|
50
|
+
];
|
|
51
|
+
|
|
52
|
+
/**
 * Structured error raised by the diarizer code paths.
 *
 * `code` is a machine-readable cause that callers can branch on; `message`
 * carries the human-readable detail. Besides construction failures (missing
 * runtime, library or model), `classifyFramesToSegments` in this file throws
 * it when the tensor it is given has the wrong size.
 */
export class DiarizerUnavailableError extends Error {
	/** Machine-readable failure cause. */
	readonly code:
		| "ort-missing"
		| "native-missing"
		| "library-missing"
		| "model-missing"
		| "model-unavailable"
		| "model-load-failed"
		| "model-shape-mismatch"
		| "forward-not-implemented"
		| "invalid-input";

	/**
	 * @param code Machine-readable failure cause.
	 * @param message Human-readable description, forwarded to `Error`.
	 */
	constructor(code: DiarizerUnavailableError["code"], message: string) {
		super(message);
		// Override the default "Error" name so logs and matchers can tell this
		// error apart by name as well as by `instanceof`.
		this.name = "DiarizerUnavailableError";
		this.code = code;
	}
}
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* One speaker-tagged span within a diarized window. `localSpeakerId` is
|
|
73
|
+
* **window-local** (0..2): the same physical speaker gets different
|
|
74
|
+
* local ids in different windows. The profile store re-clusters local
|
|
75
|
+
* ids into stable identities via the WeSpeaker embedding cosine.
|
|
76
|
+
*/
|
|
77
|
+
export interface LocalSpeakerSegment {
|
|
78
|
+
startMs: number;
|
|
79
|
+
endMs: number;
|
|
80
|
+
localSpeakerId: number;
|
|
81
|
+
/** Mean winning-class softmax probability over the span's frames. */
|
|
82
|
+
confidence: number;
|
|
83
|
+
/** True if the span contains any overlap-class frames. */
|
|
84
|
+
hasOverlap: boolean;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
export interface DiarizerOutput {
|
|
88
|
+
segments: LocalSpeakerSegment[];
|
|
89
|
+
/** Number of distinct local speakers observed in the window. */
|
|
90
|
+
localSpeakerCount: number;
|
|
91
|
+
/** Total speech (any-speaker) duration in milliseconds. */
|
|
92
|
+
speechMs: number;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
export interface Diarizer {
|
|
96
|
+
readonly modelId: PyannoteDiarizerModelId;
|
|
97
|
+
readonly sampleRate: number;
|
|
98
|
+
/** Process one ~5 s window of PCM. */
|
|
99
|
+
diarizeWindow(pcm: Float32Array): Promise<DiarizerOutput>;
|
|
100
|
+
dispose(): Promise<void>;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
 * Softmax of a single row of scores.
 *
 * The row maximum is subtracted before exponentiating so large scores cannot
 * overflow. If the exponentials sum to zero (which is the case for an empty
 * row) the un-normalised array is returned as-is.
 *
 * @param row Scores for one frame.
 * @returns A new `Float32Array` of the same length holding the probabilities.
 */
function softmax(row: Float32Array): Float32Array {
	const size = row.length;

	let peak = -Infinity;
	for (let k = 0; k < size; k += 1) {
		const value = row[k];
		if (value > peak) peak = value;
	}

	const result = new Float32Array(size);
	let total = 0;
	for (let k = 0; k < size; k += 1) {
		result[k] = Math.exp(row[k] - peak);
		// Read the value back from the array so the sum uses the float32-rounded
		// exponential, not the float64 intermediate.
		total += result[k];
	}

	if (total !== 0) {
		for (let k = 0; k < size; k += 1) {
			result[k] = result[k] / total;
		}
	}
	return result;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Collapse per-frame class scores into speaker segments.
 *
 * Each frame's `classCount` scores are normalised with `softmax`, the
 * highest-probability class is selected (the lowest index wins a tie), and
 * that class is expanded to its set of local speakers through
 * `PYANNOTE_CLASS_TO_SPEAKERS`. Consecutive frames in which a speaker stays
 * active are merged into one run. A frame whose winning class maps to no
 * speakers (silence, or an index outside the table) closes every open run.
 * Overlap classes extend the run of each speaker they contain.
 *
 * @param classProbs Row-major `frames × classCount` scores.
 * @param frames Number of frames (rows) in `classProbs`.
 * @param classCount Number of classes (columns) per frame.
 * @param startMs Timestamp of frame 0; added to every segment boundary.
 * @param frameStrideMs Duration attributed to one frame, in milliseconds.
 * @returns Segments ordered by `startMs` then `endMs`, the number of distinct
 *   local speakers seen, and the total duration of non-silent frames. A
 *   segment's `confidence` is the mean winning-class probability of its frames.
 * @throws DiarizerUnavailableError with code `"model-load-failed"` when
 *   `classProbs.length !== frames * classCount`.
 */
export function classifyFramesToSegments(
	classProbs: Float32Array,
	frames: number,
	classCount: number,
	startMs: number,
	frameStrideMs: number,
): DiarizerOutput {
	if (classProbs.length !== frames * classCount) {
		throw new DiarizerUnavailableError(
			"model-load-failed",
			`[pyannote] frame×class tensor mismatch: have ${classProbs.length}, expected ${frames * classCount}`,
		);
	}

	/** Bookkeeping for one speaker's currently open run of frames. */
	interface Run {
		firstFrame: number;
		/** Exclusive end frame. */
		endFrame: number;
		probSum: number;
		frameCount: number;
		overlapped: boolean;
	}

	// Keyed by local speaker id. Insertion order decides the order in which
	// simultaneously closed runs are emitted.
	const running = new Map<number, Run>();
	const segments: LocalSpeakerSegment[] = [];
	const speakersSeen = new Set<number>();
	let voicedFrames = 0;

	// Turn a finished run into an output segment.
	const finish = (speaker: number, run: Run): void => {
		speakersSeen.add(speaker);
		segments.push({
			startMs: Math.round(startMs + run.firstFrame * frameStrideMs),
			endMs: Math.round(startMs + run.endFrame * frameStrideMs),
			localSpeakerId: speaker,
			confidence: run.frameCount > 0 ? run.probSum / run.frameCount : 0,
			hasOverlap: run.overlapped,
		});
	};

	for (let frame = 0; frame < frames; frame += 1) {
		const base = frame * classCount;
		const probs = softmax(classProbs.subarray(base, base + classCount));

		// Argmax; the strict comparison keeps the lowest index on ties.
		let best = 0;
		for (let cls = 1; cls < classCount; cls += 1) {
			if (probs[cls] > probs[best]) best = cls;
		}
		const bestProb = probs[best];

		// A class index outside the table is treated like silence.
		const speakers = PYANNOTE_CLASS_TO_SPEAKERS[best] ?? [];
		const overlapping = speakers.length > 1;
		if (speakers.length !== 0) voicedFrames += 1;

		// Close the runs of speakers that went quiet on this frame. Iterate a
		// snapshot so entries can be removed while looping.
		for (const [speaker, run] of [...running]) {
			if (speakers.includes(speaker)) continue;
			finish(speaker, run);
			running.delete(speaker);
		}

		// Start or extend a run for every speaker active on this frame.
		for (const speaker of speakers) {
			const run = running.get(speaker);
			if (run === undefined) {
				running.set(speaker, {
					firstFrame: frame,
					endFrame: frame + 1,
					probSum: bestProb,
					frameCount: 1,
					overlapped: overlapping,
				});
				continue;
			}
			run.endFrame = frame + 1;
			run.probSum += bestProb;
			run.frameCount += 1;
			if (overlapping) run.overlapped = true;
		}
	}

	// Runs still open at the end of the window are closed at the last frame.
	running.forEach((run, speaker) => finish(speaker, run));

	segments.sort((a, b) => {
		if (a.startMs !== b.startMs) return a.startMs - b.startMs;
		return a.endMs - b.endMs;
	});

	return {
		segments,
		localSpeakerCount: speakersSeen.size,
		speechMs: Math.round(voicedFrames * frameStrideMs),
	};
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Speaker-embedding encoder — fused `libelizainference` binding (ABI v6).
|
|
3
|
+
*
|
|
4
|
+
* The strategic on-device voice engine is the single fused-FFI
|
|
5
|
+
* `libelizainference` library (the merged llama.cpp fork — see
|
|
6
|
+
* `plugins/plugin-local-inference/native/CLAUDE.md` §1). This class drives the
|
|
7
|
+
* WeSpeaker ResNet34-LM speaker encoder through that one native handle via the
|
|
8
|
+
* `eliza_inference_speaker_*` ABI. This is the SOLE on-device speaker-encoder
|
|
9
|
+
* runtime — the same `ffi`/`ctx` pair powers VAD / wake-word / TTS / ASR, so the
|
|
10
|
+
* whole voice pipeline runs through one library.
|
|
11
|
+
*
|
|
12
|
+
* Shape mirrors the legacy `encoder.ts::SpeakerEncoder` contract exactly:
|
|
13
|
+
* - 16 kHz mono fp32 PCM in,
|
|
14
|
+
* - one L2-normalized 256-d embedding out,
|
|
15
|
+
* - `encode(pcm)` / `dispose()`.
|
|
16
|
+
*
|
|
17
|
+
* No silent fallback: when the fused build does not export the speaker ABI
|
|
18
|
+
* (`eliza_inference_speaker_supported() == 0`) `load()` throws a structured
|
|
19
|
+
* `SpeakerEncoderGgmlUnavailableError` (AGENTS.md §3 — no synthetic
|
|
20
|
+
* embeddings, no standalone-lib fallback).
|
|
21
|
+
*/
|
|
22
|
+
import type { ElizaInferenceContextHandle, ElizaInferenceFfi } from "../ffi-bindings";
|
|
23
|
+
import type { SpeakerEncoder } from "./encoder";
|
|
24
|
+
export interface FusedSpeakerEncoderOptions {
|
|
25
|
+
ffi: ElizaInferenceFfi;
|
|
26
|
+
ctx: ElizaInferenceContextHandle | (() => ElizaInferenceContextHandle);
|
|
27
|
+
/**
|
|
28
|
+
* Optional explicit WeSpeaker GGUF path. `null` lets the native runtime
|
|
29
|
+
* resolve the bundle's `speaker/` dir (the default).
|
|
30
|
+
*/
|
|
31
|
+
ggufPath?: string | null;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Fused-`libelizainference` WeSpeaker speaker encoder. Owns one
|
|
35
|
+
* `eliza_inference_speaker_*` session; `encode()` runs one forward pass over
|
|
36
|
+
* the supplied 16 kHz PCM and returns the normalized 256-d embedding. The
|
|
37
|
+
* native side owns the model graph; this class is a thin handle.
|
|
38
|
+
*/
|
|
39
|
+
export declare class FusedSpeakerEncoder implements SpeakerEncoder {
|
|
40
|
+
private readonly ffi;
|
|
41
|
+
private readonly handle;
|
|
42
|
+
readonly embeddingDim = 256;
|
|
43
|
+
readonly sampleRate = 16000;
|
|
44
|
+
readonly modelId: "wespeaker-resnet34-lm-int8";
|
|
45
|
+
private disposed;
|
|
46
|
+
private constructor();
|
|
47
|
+
/**
|
|
48
|
+
* True only when the fused `libelizainference` build exports the speaker
|
|
49
|
+
* ABI and advertises support at runtime.
|
|
50
|
+
*/
|
|
51
|
+
static isSupported(ffi: ElizaInferenceFfi | null | undefined): boolean;
|
|
52
|
+
/**
|
|
53
|
+
* Open a native speaker-encoder session. Throws
|
|
54
|
+
* `SpeakerEncoderGgmlUnavailableError` when the runtime is not present.
|
|
55
|
+
*/
|
|
56
|
+
static load(opts: FusedSpeakerEncoderOptions): Promise<FusedSpeakerEncoder>;
|
|
57
|
+
encode(pcm: Float32Array): Promise<Float32Array>;
|
|
58
|
+
dispose(): Promise<void>;
|
|
59
|
+
}
|
|
60
|
+
//# sourceMappingURL=encoder-fused.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"encoder-fused.d.ts","sourceRoot":"","sources":["encoder-fused.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,OAAO,KAAK,EACX,2BAA2B,EAC3B,iBAAiB,EAEjB,MAAM,iBAAiB,CAAC;AACzB,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,WAAW,CAAC;AAShD,MAAM,WAAW,0BAA0B;IAC1C,GAAG,EAAE,iBAAiB,CAAC;IACvB,GAAG,EAAE,2BAA2B,GAAG,CAAC,MAAM,2BAA2B,CAAC,CAAC;IACvE;;;OAGG;IACH,QAAQ,CAAC,EAAE,MAAM,GAAG,IAAI,CAAC;CACzB;AAED;;;;;GAKG;AACH,qBAAa,mBAAoB,YAAW,cAAc;IAOxD,OAAO,CAAC,QAAQ,CAAC,GAAG;IACpB,OAAO,CAAC,QAAQ,CAAC,MAAM;IAPxB,QAAQ,CAAC,YAAY,OAA8B;IACnD,QAAQ,CAAC,UAAU,SAA4B;IAC/C,QAAQ,CAAC,OAAO,+BAAuC;IACvD,OAAO,CAAC,QAAQ,CAAS;IAEzB,OAAO;IAKP;;;OAGG;IACH,MAAM,CAAC,WAAW,CAAC,GAAG,EAAE,iBAAiB,GAAG,IAAI,GAAG,SAAS,GAAG,OAAO;IAKtE;;;OAGG;WACU,IAAI,CAChB,IAAI,EAAE,0BAA0B,GAC9B,OAAO,CAAC,mBAAmB,CAAC;IAyBzB,MAAM,CAAC,GAAG,EAAE,YAAY,GAAG,OAAO,CAAC,YAAY,CAAC;IA6BhD,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAK9B"}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Real-FFI tests for `FusedSpeakerEncoder`: run against the ACTUAL fused
|
|
3
|
+
* `libelizainference` — loaded, `create`d, and probed for `speakerSupported()`
|
|
4
|
+
* — never a stub. The speaker encoder is the SOLE on-device speaker runtime
|
|
5
|
+
* (the `eliza_inference_speaker_*` ABI off the one fused handle).
|
|
6
|
+
*
|
|
7
|
+
* Skipped (not faked) when the fused lib is not resolvable, or when it does not
|
|
8
|
+
* link the WeSpeaker speaker graph. To run them, point `ELIZA_INFERENCE_LIBRARY`
|
|
9
|
+
* (or `ELIZA_INFERENCE_LIB_DIR`) at a built `libelizainference` with the speaker
|
|
10
|
+
* ABI, or build one via `packages/app-core/scripts/build-llama-cpp-mtp.mjs`.
|
|
11
|
+
* Runs in the post-merge `bun test` lane (`*.real.test.ts` is excluded from the
|
|
12
|
+
* default lane in `vitest.config.ts`).
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { existsSync, mkdtempSync, rmSync } from "node:fs";
|
|
16
|
+
import os from "node:os";
|
|
17
|
+
import path from "node:path";
|
|
18
|
+
import {
|
|
19
|
+
afterAll,
|
|
20
|
+
afterEach,
|
|
21
|
+
beforeAll,
|
|
22
|
+
beforeEach,
|
|
23
|
+
describe,
|
|
24
|
+
expect,
|
|
25
|
+
it,
|
|
26
|
+
} from "vitest";
|
|
27
|
+
|
|
28
|
+
import { resolveFusedLibraryPath } from "../../desktop-fused-ffi-backend-runtime";
|
|
29
|
+
import {
|
|
30
|
+
type ElizaInferenceContextHandle,
|
|
31
|
+
type ElizaInferenceFfi,
|
|
32
|
+
loadElizaInferenceFfi,
|
|
33
|
+
} from "../ffi-bindings";
|
|
34
|
+
import { FusedSpeakerEncoder } from "./encoder-fused";
|
|
35
|
+
|
|
36
|
+
const EMB_DIM = 256;
|
|
37
|
+
const MIN_SAMPLES = 16_000;
|
|
38
|
+
|
|
39
|
+
const isBun = typeof (globalThis as { Bun?: unknown }).Bun !== "undefined";
|
|
40
|
+
const LIB_PATH = resolveFusedLibraryPath(null, process.env);
|
|
41
|
+
// The native speaker_open needs a WeSpeaker GGUF. Provide one via
|
|
42
|
+
// ELIZA_TEST_SPEAKER_GGUF (e.g. wespeaker-resnet34-lm.gguf); the encode
|
|
43
|
+
// assertions skip honestly when it isn't supplied — they are never faked.
|
|
44
|
+
const SPEAKER_GGUF = process.env.ELIZA_TEST_SPEAKER_GGUF?.trim();
|
|
45
|
+
const HAVE_MODEL = !!SPEAKER_GGUF && existsSync(SPEAKER_GGUF);
|
|
46
|
+
|
|
47
|
+
// Real-FFI suite: skipped entirely unless running under Bun with a resolvable
// fused library. Each test gets a fresh native context rooted in its own
// temporary directory.
describe.skipIf(!isBun || !LIB_PATH)("FusedSpeakerEncoder — real FFI", () => {
	let ffi: ElizaInferenceFfi;
	let ctx: ElizaInferenceContextHandle;
	let tmp: string;

	beforeAll(() => {
		// LIB_PATH is non-null inside the skipIf-guarded block.
		ffi = loadElizaInferenceFfi(LIB_PATH as string);
	});
	afterAll(() => {
		ffi?.close();
	});
	beforeEach(() => {
		tmp = mkdtempSync(path.join(os.tmpdir(), "speaker-fused-real-"));
		ctx = ffi.create(tmp);
	});
	afterEach(() => {
		// Always remove the temp dir, even if tearing down the context throws.
		try {
			ffi.destroy(ctx);
		} finally {
			rmSync(tmp, { recursive: true, force: true });
		}
	});

	it("isSupported() reflects the loaded build's speaker ABI", () => {
		expect(typeof FusedSpeakerEncoder.isSupported(ffi)).toBe("boolean");
	});

	it.skipIf(!HAVE_MODEL)(
		"encode() returns a finite 256-d embedding off the real WeSpeaker graph",
		async () => {
			const enc = await FusedSpeakerEncoder.load({
				ffi,
				ctx,
				ggufPath: SPEAKER_GGUF,
			});
			// Dispose in `finally` so a failed assertion cannot leave the speaker
			// session open when `afterEach` destroys its context.
			try {
				expect(enc.embeddingDim).toBe(EMB_DIM);
				// MIN_SAMPLES (16 000) is one second at the encoder's sample rate, so
				// the same constant doubles as the expected rate here and below.
				expect(enc.sampleRate).toBe(MIN_SAMPLES);
				// 1 s of a 220 Hz tone — a real, finite input the native graph accepts.
				const pcm = new Float32Array(MIN_SAMPLES);
				for (let i = 0; i < pcm.length; i += 1) {
					pcm[i] = 0.2 * Math.sin((2 * Math.PI * 220 * i) / MIN_SAMPLES);
				}
				const emb = await enc.encode(pcm);
				expect(emb.length).toBe(EMB_DIM);
				expect(emb.every((v) => Number.isFinite(v))).toBe(true);
				// A non-degenerate embedding has real magnitude.
				let norm = 0;
				for (const v of emb) norm += v * v;
				expect(Math.sqrt(norm)).toBeGreaterThan(0);
			} finally {
				await enc.dispose();
			}
		},
	);

	it.skipIf(!HAVE_MODEL)(
		"rejects pcm shorter than the minimum window before hitting the native graph",
		async () => {
			const enc = await FusedSpeakerEncoder.load({
				ffi,
				ctx,
				ggufPath: SPEAKER_GGUF,
			});
			try {
				await expect(enc.encode(new Float32Array(100))).rejects.toMatchObject({
					name: "SpeakerEncoderGgmlUnavailableError",
					code: "invalid-input",
				});
			} finally {
				await enc.dispose();
			}
		},
	);
});
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Speaker-embedding encoder — fused `libelizainference` binding (ABI v6).
|
|
3
|
+
*
|
|
4
|
+
* The strategic on-device voice engine is the single fused-FFI
|
|
5
|
+
* `libelizainference` library (the merged llama.cpp fork — see
|
|
6
|
+
* `plugins/plugin-local-inference/native/CLAUDE.md` §1). This class drives the
|
|
7
|
+
* WeSpeaker ResNet34-LM speaker encoder through that one native handle via the
|
|
8
|
+
* `eliza_inference_speaker_*` ABI. This is the SOLE on-device speaker-encoder
|
|
9
|
+
* runtime — the same `ffi`/`ctx` pair powers VAD / wake-word / TTS / ASR, so the
|
|
10
|
+
* whole voice pipeline runs through one library.
|
|
11
|
+
*
|
|
12
|
+
* Shape mirrors the legacy `encoder.ts::SpeakerEncoder` contract exactly:
|
|
13
|
+
* - 16 kHz mono fp32 PCM in,
|
|
14
|
+
* - one L2-normalized 256-d embedding out,
|
|
15
|
+
* - `encode(pcm)` / `dispose()`.
|
|
16
|
+
*
|
|
17
|
+
* No silent fallback: when the fused build does not export the speaker ABI
|
|
18
|
+
* (`eliza_inference_speaker_supported() == 0`) `load()` throws a structured
|
|
19
|
+
* `SpeakerEncoderGgmlUnavailableError` (AGENTS.md §3 — no synthetic
|
|
20
|
+
* embeddings, no standalone-lib fallback).
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import type {
|
|
24
|
+
ElizaInferenceContextHandle,
|
|
25
|
+
ElizaInferenceFfi,
|
|
26
|
+
NativeSpeakerHandle,
|
|
27
|
+
} from "../ffi-bindings";
|
|
28
|
+
import type { SpeakerEncoder } from "./encoder";
|
|
29
|
+
import { WESPEAKER_RESNET34_LM_INT8_MODEL_ID } from "./encoder";
|
|
30
|
+
import {
|
|
31
|
+
SPEAKER_GGML_EMBEDDING_DIM,
|
|
32
|
+
SPEAKER_GGML_MIN_SAMPLES,
|
|
33
|
+
SPEAKER_GGML_SAMPLE_RATE,
|
|
34
|
+
SpeakerEncoderGgmlUnavailableError,
|
|
35
|
+
} from "./encoder-ggml";
|
|
36
|
+
|
|
37
|
+
/** Inputs accepted by `FusedSpeakerEncoder.load()`. */
export interface FusedSpeakerEncoderOptions {
	/** FFI binding whose `speaker*` methods drive the native speaker encoder. */
	ffi: ElizaInferenceFfi;
	/**
	 * Native context handle, or a zero-argument function returning one. When a
	 * function is given, `load()` calls it once, just before opening the session.
	 */
	ctx: ElizaInferenceContextHandle | (() => ElizaInferenceContextHandle);
	/**
	 * Explicit path to the WeSpeaker GGUF file. Leaving it unset or passing
	 * `null` (the default) lets the native runtime resolve the bundle's
	 * `speaker/` dir itself.
	 */
	ggufPath?: string | null;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Fused-`libelizainference` WeSpeaker speaker encoder. Each instance owns one
 * `eliza_inference_speaker_*` session, opened by `load()` and released by
 * `dispose()`. `encode()` hands 16 kHz PCM to the native side and returns the
 * embedding it produces. The model graph lives in the native library; this
 * class only validates input and forwards calls through the FFI binding.
 */
export class FusedSpeakerEncoder implements SpeakerEncoder {
	readonly embeddingDim = SPEAKER_GGML_EMBEDDING_DIM;
	readonly sampleRate = SPEAKER_GGML_SAMPLE_RATE;
	readonly modelId = WESPEAKER_RESNET34_LM_INT8_MODEL_ID;
	/** Set by `dispose()`; once true, `encode()` refuses to run. */
	private disposed = false;

	/** Instances are created through `load()` only. */
	private constructor(
		private readonly ffi: ElizaInferenceFfi,
		private readonly handle: NativeSpeakerHandle,
	) {}

	/**
	 * True only when the fused `libelizainference` build exports the speaker
	 * ABI and advertises support at runtime.
	 *
	 * @param ffi Binding to probe; `null`/`undefined` or a binding without
	 *   `speakerSupported` yields `false`.
	 */
	static isSupported(ffi: ElizaInferenceFfi | null | undefined): boolean {
		if (!ffi || typeof ffi.speakerSupported !== "function") return false;
		return ffi.speakerSupported();
	}

	/**
	 * Open a native speaker-encoder session.
	 *
	 * @param opts Binding, context (handle or thunk) and optional GGUF path.
	 * @returns An encoder wrapping the freshly opened native handle.
	 * @throws SpeakerEncoderGgmlUnavailableError with code `native-missing`
	 *   when `isSupported()` is false, or `model-load-failed` when the binding
	 *   lacks `speakerOpen` / `speakerEmbed` / `speakerClose`.
	 */
	static async load(
		opts: FusedSpeakerEncoderOptions,
	): Promise<FusedSpeakerEncoder> {
		if (!FusedSpeakerEncoder.isSupported(opts.ffi)) {
			throw new SpeakerEncoderGgmlUnavailableError(
				"native-missing",
				"[speaker-fused] The native speaker encoder is not present in this libelizainference build. Rebuild with the WeSpeaker forward graph linked in (eliza_inference_speaker_* symbols).",
			);
		}
		// All three entry points are verified up front so a partially wired
		// binding fails here rather than on the first encode()/dispose().
		if (
			!opts.ffi.speakerOpen ||
			!opts.ffi.speakerEmbed ||
			!opts.ffi.speakerClose
		) {
			throw new SpeakerEncoderGgmlUnavailableError(
				"model-load-failed",
				"[speaker-fused] Speaker support probe succeeded, but the required FFI methods are missing on the binding.",
			);
		}
		// A thunk is resolved only after the checks above have passed.
		const ctx = typeof opts.ctx === "function" ? opts.ctx() : opts.ctx;
		const handle = opts.ffi.speakerOpen({
			ctx,
			ggufPath: opts.ggufPath ?? null,
		});
		return new FusedSpeakerEncoder(opts.ffi, handle);
	}

	/**
	 * Compute the speaker embedding for one PCM window.
	 *
	 * @param pcm Mono fp32 samples; must hold at least
	 *   `SPEAKER_GGML_MIN_SAMPLES` samples.
	 * @returns The embedding produced by the binding's `speakerEmbed`.
	 * @throws SpeakerEncoderGgmlUnavailableError with code `model-load-failed`
	 *   after `dispose()` or when the binding has no `speakerEmbed`, and
	 *   `invalid-input` when `pcm` is not a `Float32Array` or is too short.
	 */
	async encode(pcm: Float32Array): Promise<Float32Array> {
		if (this.disposed) {
			throw new SpeakerEncoderGgmlUnavailableError(
				"model-load-failed",
				"[speaker-fused] encode called after dispose()",
			);
		}
		// Runtime guard for untyped callers; the static type already requires it.
		if (!(pcm instanceof Float32Array)) {
			throw new SpeakerEncoderGgmlUnavailableError(
				"invalid-input",
				"[speaker-fused] pcm must be a Float32Array",
			);
		}
		// Reject undersized input here so it never reaches the native graph.
		if (pcm.length < SPEAKER_GGML_MIN_SAMPLES) {
			throw new SpeakerEncoderGgmlUnavailableError(
				"invalid-input",
				`[speaker-fused] pcm too short: ${pcm.length} samples < ${SPEAKER_GGML_MIN_SAMPLES}`,
			);
		}
		if (!this.ffi.speakerEmbed) {
			throw new SpeakerEncoderGgmlUnavailableError(
				"model-load-failed",
				"[speaker-fused] encode missing FFI method",
			);
		}
		// Call through the binding so `this` stays bound to `ffi`, exactly like
		// the `speakerOpen` / `speakerClose` calls. A detached reference would
		// break any binding whose methods read their own state.
		return this.ffi.speakerEmbed({ speaker: this.handle, pcm });
	}

	/**
	 * Close the native session. Safe to call more than once: only the first
	 * call reaches the binding. The flag is set before closing, so a throwing
	 * `speakerClose` is not retried by a later call.
	 */
	async dispose(): Promise<void> {
		if (this.disposed) return;
		this.disposed = true;
		this.ffi.speakerClose?.(this.handle);
	}
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Speaker-embedding encoder — shared constants, error class, and the
|
|
3
|
+
* embedding-distance helper.
|
|
4
|
+
*
|
|
5
|
+
* The speaker encoder runs EXCLUSIVELY through the fused `libelizainference`
|
|
6
|
+
* `eliza_inference_speaker_*` ABI (`FusedSpeakerEncoder` in `encoder-fused.ts`).
|
|
7
|
+
* The standalone `libvoice_classifier` binding that previously lived here has
|
|
8
|
+
* been removed — there is one on-device voice runtime.
|
|
9
|
+
*
|
|
10
|
+
* This module retains the pieces the fused path shares:
|
|
11
|
+
* - the canonical constants (`SPEAKER_GGML_*`), with the embedding dim pinned at 256 to match the C-side
|
|
12
|
+
* `VOICE_SPEAKER_EMBEDDING_DIM` and the WeSpeaker ResNet34-LM head,
|
|
13
|
+
* - the structured `SpeakerEncoderGgmlUnavailableError` the fused encoder
|
|
14
|
+
* throws (no synthetic embedding fallback),
|
|
15
|
+
* - the pure `voiceSpeakerDistance` cosine-distance helper.
|
|
16
|
+
*/
|
|
17
|
+
/** Length of the embedding vector the encoder emits. Matches the C-side `VOICE_SPEAKER_EMBEDDING_DIM`. */
export declare const SPEAKER_GGML_EMBEDDING_DIM = 256;
/** Sample rate, in Hz, that input PCM is required to use. */
export declare const SPEAKER_GGML_SAMPLE_RATE = 16000;
/** Minimum useful input length in samples: 1.0 s at `SPEAKER_GGML_SAMPLE_RATE`. */
export declare const SPEAKER_GGML_MIN_SAMPLES = 16000;
|
|
23
|
+
/**
 * Structured error thrown by the speaker encoder instead of falling back to a
 * synthetic embedding. `code` names the failure category.
 */
export declare class SpeakerEncoderGgmlUnavailableError extends Error {
	/** Machine-readable failure category. */
	readonly code:
		| "native-missing"
		| "library-missing"
		| "model-missing"
		| "model-load-failed"
		| "model-shape-mismatch"
		| "forward-not-implemented"
		| "invalid-input";
	/**
	 * @param code Failure category, one of the `code` literals.
	 * @param message Human-readable description of the failure.
	 */
	constructor(
		code: SpeakerEncoderGgmlUnavailableError["code"],
		message: string,
	);
}
|
|
27
|
+
/**
 * Cosine distance between two 256-dim speaker embeddings, computed as
 * `1 - cos_similarity(a, b)` and therefore in the range [0, 2]. Mirrors the
 * C-side `voice_speaker_distance` helper exactly.
 *
 * @param a First speaker embedding.
 * @param b Second speaker embedding.
 * @returns The cosine distance between `a` and `b`.
 */
export declare function voiceSpeakerDistance(
	a: Float32Array,
	b: Float32Array,
): number;
|
|
33
|
+
//# sourceMappingURL=encoder-ggml.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"encoder-ggml.d.ts","sourceRoot":"","sources":["encoder-ggml.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,mEAAmE;AACnE,eAAO,MAAM,0BAA0B,MAAM,CAAC;AAE9C,kCAAkC;AAClC,eAAO,MAAM,wBAAwB,QAAS,CAAC;AAE/C,4CAA4C;AAC5C,eAAO,MAAM,wBAAwB,QAAS,CAAC;AAE/C,qBAAa,kCAAmC,SAAQ,KAAK;IAC5D,QAAQ,CAAC,IAAI,EACV,gBAAgB,GAChB,iBAAiB,GACjB,eAAe,GACf,mBAAmB,GACnB,sBAAsB,GACtB,yBAAyB,GACzB,eAAe,CAAC;gBAElB,IAAI,EAAE,kCAAkC,CAAC,MAAM,CAAC,EAChD,OAAO,EAAE,MAAM;CAMhB;AAED;;;;GAIG;AACH,wBAAgB,oBAAoB,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CA4B7E"}
|